librostlab  1.0.20
rostlab/readFasta.h
Go to the documentation of this file.
00001 /*
00002     Copyright (C) 2011 Laszlo Kajan, Technical University of Munich, Germany
00003 
00004     This file is part of librostlab.
00005 
00006     librostlab is free software: you can redistribute it and/or modify
00007     it under the terms of the GNU Lesser General Public License as published by
00008     the Free Software Foundation, either version 3 of the License, or
00009     (at your option) any later version.
00010 
00011     This program is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014     GNU Lesser General Public License for more details.
00015 
00016     You should have received a copy of the GNU Lesser General Public License
00017     along with this program.  If not, see <http://www.gnu.org/licenses/>.
00018 */
00019 #ifndef ROSTLAB_READFASTA
00020 #define ROSTLAB_READFASTA 1
00021 
00022 #include <boost/regex.hpp>
00023 #include <iostream>
00024 #include <fstream>
00025 #include "rostlab/rostlab_stdexcept.h"
00026 
00027 namespace bo = boost;
00028 
00029 namespace rostlab {
00030 namespace bio {
00031 
00032   namespace fmt {
00033     class fasta{}; // fasta format class
00034   };
00035 
/// Sequence record: a description, a display identifier and the sequence
/// string, tagged with the format it was read from.
///
/// @tparam FmtT  format tag (e.g. fmt::fasta). The class body does not use it;
///               it only makes records of different formats distinct types so
///               that format-specific functions can be overloaded on them.
template<typename FmtT>
class seq {
  private:
    std::string        _desc;        // free-text description
    std::string        _display_id;  // identifier
    std::string        _seqstr;      // sequence string
  public:
    /// Creates a record whose three strings are all empty.
                  seq() {}

    /// Creates a record from its three parts; each argument is copied.
                  seq( const std::string& description, const std::string& identifier, const std::string& sequence )
                    : _desc(description), _display_id(identifier), _seqstr(sequence) {}

    /// Virtual so that the class can be deleted through a pointer to seq when
    /// it is derived from.
    virtual       ~seq() {}

    /// Mutable access to the sequence string.
    std::string&       seqstr() { return _seqstr; }

    /// Read-only access to the sequence string, usable on const records.
    const std::string& seqstr() const { return _seqstr; }

    /// Read-only access to the description (previously stored but unreadable).
    const std::string& desc() const { return _desc; }

    /// Read-only access to the display identifier (previously stored but
    /// unreadable).
    const std::string& display_id() const { return _display_id; }
};
00049 
00050 /*template<> // could specialize it...
00051 class seq<bio::fmt::fasta>
00052 {
00053   private:
00054   public:
00055 };*/
00056 
00057 /*template<typename _FmtT>
00058 std::istream&          operator>>( std::istream& __is, bio::seq<_FmtT>& __n )
00059 {
00060   return __is;
00061 }*/
00062 
00063 inline std::istream&   operator>>( std::istream& __is, bio::seq<bio::fmt::fasta>& __seq )
00064 {
00065   // based on Bio/SeqIO/fasta.pm
00066   std::string rec; rec.reserve(1024);
00067   while( __is.peek() != std::istream::traits_type::eof() )
00068   {
00069     if(rec.capacity() == rec.size()) rec.reserve(rec.capacity() * 2);
00070     if( rec.size() && __is.peek() == '>' && *rec.rbegin() == '\n' ) break;
00071     else rec += __is.get();
00072   }
00073 
00074   if( !rec.size() || *rec.begin() != '>' ) throw runtime_error( std::string("FASTA syntax error in record '") + rec + "': no leading '>'" );
00075 
00076   rec = bo::regex_replace( rec, bo::regex("^>"), "" ); // $entry =~ s/^>//;
00077 
00078   bo::sregex_token_iterator i(rec.begin(), rec.end(), bo::regex("\n"), -1); // split(/\n/,$entry,2);
00079 
00080   if( i == boost::sregex_token_iterator() ) throw runtime_error( std::string("FASTA syntax error in record '") + rec + "': only one line" );
00081 
00082   std::string top = *i++;
00083   std::string sequence( i->first, static_cast<std::string::const_iterator>( rec.end() ) );
00084 
00085   sequence = bo::regex_replace( sequence, bo::regex(">"), "" ); // $sequence =~ s/>//g;
00086 
00087   bo::match_results<std::string::const_iterator> what;
00088   std::string id, fulldesc;
00089   if( bo::regex_search( top, what, bo::regex("^[[:space:]]*([^[:space:]]+)[:space:]*(.*)") ) )
00090   { id = std::string( what[1].first, what[1].second ); fulldesc = std::string( what[2].first, what[2].second ); }
00091 
00092   if( id.empty() ) id = fulldesc;
00093 
00094   sequence = bo::regex_replace( sequence, bo::regex("[ \t\n\r]"), "" );
00095 
00096   // alphabet? would be good to have this
00097 
00098   __seq = bio::seq<bio::fmt::fasta>( fulldesc, id, sequence );
00099 
00100   return __is;
00101 }
00102 
00103 }; // namespace bio
00104 }; // namespace rostlab
00105 
00106 #endif /* ROSTLAB_READFASTA */
00107 // vim:et:ts=2:ai:
 All Classes Namespaces Files Functions Variables Typedefs