2 # = lib/evo/io/parser/ncbi_tseq_parser - NcbiTSeqParser class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 # $Id: ncbi_tseq_parser.rb,v 1.5 2009/01/07 02:48:20 cmzmasek Exp $
10 require 'lib/evo/io/parser/msa_parser'
11 require 'lib/evo/taxonomy/taxonomy'
12 require 'lib/evo/msa/msa'
# Line-oriented parser for NCBI TSeq XML records; subclass of MsaParser.
# Each recognised element must sit on a single line, as in the sample
# record reproduced in the comments below.
# NOTE(review): this listing appears to have lines elided (the embedded
# line numbers skip), so the comments here describe only what is visible.
18 class NcbiTSeqParser < MsaParser
# XML element names, interpolated into the line-matching regexes below.
# NOTE(review): TSEQ_GI and TSEQ_SID are referenced by the matching code
# but their definitions are not visible here - presumably on elided
# lines; confirm.
20 TSEQ_SEQ = "TSeq_sequence"
21 TSEQ_DEFLINE = "TSeq_defline"
22 TSEQ_ORGNAME = "TSeq_orgname"
23 TSEQ_TAXID = "TSeq_taxid"
25 TSEQ_ACCVER = "TSeq_accver"
27 TSEQ_TYPE = "TSeq_seqtype"
28 TSEQ_LENGTH = "TSeq_length"
# Sample of the input format (one element per line):
36 # <TSeq_seqtype value="protein"/>
37 # <TSeq_gi>29341016</TSeq_gi>
38 # <TSeq_accver>AAO78806.1</TSeq_accver>
39 # <TSeq_sid>gnl|mbpwusl|BT3701</TSeq_sid>
40 # <TSeq_taxid>226186</TSeq_taxid>
41 # <TSeq_orgname>Bacteroides thetaiotaomicron VPI-5482</TSeq_orgname>
42 # <TSeq_defline>SusD [Bacteroides thetaiotaomicron VPI-5482]</TSeq_defline>
43 # <TSeq_length>551</TSeq_length>
44 # <TSeq_sequence>MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
# Body of the parsing method (its def line is not visible in this listing).
# The Util helper is called on the path before the file is opened -
# presumably it raises when the file cannot be read; confirm against Util.
48 Util.check_file_for_readability( path )
# UTF-8 to UTF-8 conversion with //IGNORE, apparently meant to drop byte
# sequences that are not valid UTF-8 before the regexes run.
# NOTE(review): no require for iconv is visible, and Iconv was removed from
# the Ruby standard library in 2.0 - confirm how it is loaded.
60 ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
# Read the file line by line; the block form closes the file on exit.
61 File.open( path ) do | file |
62 while line = file.gets
63 line = ic.iconv( line )
# Whitespace-only lines are skipped (see can_ignore? below).
65 if can_ignore?( line )
# Opening tag of a record.
67 elsif line =~ /^\s*<TSeq>/
# Closing tag of a record: the collected fields are turned into a Sequence.
72 if line =~ /^\s*<\/TSeq>/
# A Taxonomy is built only when a taxid or an organism name was seen.
75 if taxid != nil || orgname != nil
80 taxonomy = Taxonomy.new( orgname, taxid , id_source )
# Remaining Sequence.new arguments are on lines not visible here.
104 sequence = Sequence.new( defline,
111 seqs.add_sequence( sequence )
# Single-line elements: the capture group holds the element text.
# gi and taxid accept digits only; the others accept any non-empty text.
119 elsif line =~ /^\s*<#{TSEQ_GI}>(\d+)<\/#{TSEQ_GI}>/
121 elsif line =~ /^\s*<#{TSEQ_ACCVER}>(.+)<\/#{TSEQ_ACCVER}>/
123 elsif line =~ /^\s*<#{TSEQ_SID}>(.+)<\/#{TSEQ_SID}>/
125 elsif line =~ /^\s*<#{TSEQ_TAXID}>(\d+)<\/#{TSEQ_TAXID}>/
127 elsif line =~ /^\s*<#{TSEQ_ORGNAME}>(.+)<\/#{TSEQ_ORGNAME}>/
129 elsif line =~ /^\s*<#{TSEQ_DEFLINE}>(.+)<\/#{TSEQ_DEFLINE}>/
131 elsif line =~ /^\s*<#{TSEQ_SEQ}>(.+)<\/#{TSEQ_SEQ}>/
# seqtype and length lines are recognised by prefix only; no handling of
# their values is visible here, so they appear to be accepted and ignored.
133 elsif line =~ /^\s*<#{TSEQ_TYPE}/
134 elsif line =~ /^\s*<#{TSEQ_LENGTH}/
# Fallback: report the line number and the offending line, then raise IOError.
136 error_msg = "unexpected line format at line #{line_counter}: " + line
137 raise IOError, error_msg
# Returns true when the line contains no non-whitespace character
# (empty or blank lines), false otherwise.
147 def can_ignore?( line )
148 return ( line !~ /\S/ )
151 end # class NcbiTSeqParser