// librostlab 1.0.20
00001 /* 00002 Copyright (C) 2011 Laszlo Kajan, Technical University of Munich, Germany 00003 00004 This file is part of librostlab. 00005 00006 librostlab is free software: you can redistribute it and/or modify 00007 it under the terms of the GNU Lesser General Public License as published by 00008 the Free Software Foundation, either version 3 of the License, or 00009 (at your option) any later version. 00010 00011 This program is distributed in the hope that it will be useful, 00012 but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 GNU Lesser General Public License for more details. 00015 00016 You should have received a copy of the GNU Lesser General Public License 00017 along with this program. If not, see <http://www.gnu.org/licenses/>. 00018 */ 00019 #ifndef ROSTLAB_READFASTA 00020 #define ROSTLAB_READFASTA 1 00021 00022 #include <boost/regex.hpp> 00023 #include <iostream> 00024 #include <fstream> 00025 #include "rostlab/rostlab_stdexcept.h" 00026 00027 namespace bo = boost; 00028 00029 namespace rostlab { 00030 namespace bio { 00031 00032 namespace fmt { 00033 class fasta{}; // fasta format class 00034 }; 00035 00036 template<typename _FmtT> 00037 class seq { 00038 private: 00039 std::string _desc; 00040 std::string _display_id; 00041 std::string _seqstr; 00042 public: 00043 seq(){}; 00044 seq( const std::string& __desc, const std::string& __display_id, const std::string& __seqstr ) : _desc(__desc), _display_id(__display_id), _seqstr(__seqstr) {}; 00045 virtual ~seq(){}; 00046 00047 std::string& seqstr(){ return _seqstr; }; 00048 }; 00049 00050 /*template<> // could specialize it... 
00051 class seq<bio::fmt::fasta> 00052 { 00053 private: 00054 public: 00055 };*/ 00056 00057 /*template<typename _FmtT> 00058 std::istream& operator>>( std::istream& __is, bio::seq<_FmtT>& __n ) 00059 { 00060 return __is; 00061 }*/ 00062 00063 inline std::istream& operator>>( std::istream& __is, bio::seq<bio::fmt::fasta>& __seq ) 00064 { 00065 // based on Bio/SeqIO/fasta.pm 00066 std::string rec; rec.reserve(1024); 00067 while( __is.peek() != std::istream::traits_type::eof() ) 00068 { 00069 if(rec.capacity() == rec.size()) rec.reserve(rec.capacity() * 2); 00070 if( rec.size() && __is.peek() == '>' && *rec.rbegin() == '\n' ) break; 00071 else rec += __is.get(); 00072 } 00073 00074 if( !rec.size() || *rec.begin() != '>' ) throw runtime_error( std::string("FASTA syntax error in record '") + rec + "': no leading '>'" ); 00075 00076 rec = bo::regex_replace( rec, bo::regex("^>"), "" ); // $entry =~ s/^>//; 00077 00078 bo::sregex_token_iterator i(rec.begin(), rec.end(), bo::regex("\n"), -1); // split(/\n/,$entry,2); 00079 00080 if( i == boost::sregex_token_iterator() ) throw runtime_error( std::string("FASTA syntax error in record '") + rec + "': only one line" ); 00081 00082 std::string top = *i++; 00083 std::string sequence( i->first, static_cast<std::string::const_iterator>( rec.end() ) ); 00084 00085 sequence = bo::regex_replace( sequence, bo::regex(">"), "" ); // $sequence =~ s/>//g; 00086 00087 bo::match_results<std::string::const_iterator> what; 00088 std::string id, fulldesc; 00089 if( bo::regex_search( top, what, bo::regex("^[[:space:]]*([^[:space:]]+)[:space:]*(.*)") ) ) 00090 { id = std::string( what[1].first, what[1].second ); fulldesc = std::string( what[2].first, what[2].second ); } 00091 00092 if( id.empty() ) id = fulldesc; 00093 00094 sequence = bo::regex_replace( sequence, bo::regex("[ \t\n\r]"), "" ); 00095 00096 // alphabet? 
would be good to have this 00097 00098 __seq = bio::seq<bio::fmt::fasta>( fulldesc, id, sequence ); 00099 00100 return __is; 00101 } 00102 00103 }; // namespace bio 00104 }; // namespace rostlab 00105 00106 #endif /* ROSTLAB_READFASTA */ 00107 // vim:et:ts=2:ai: