in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / ncbi_tseq_parser.rb
1 #
2 # = lib/evo/io/parser/ncbi_tseq_parser - NcbiTSeqParser class
3 #
4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: ncbi_tseq_parser.rb,v 1.5 2009/01/07 02:48:20 cmzmasek Exp $
8
9
10 require 'lib/evo/io/parser/msa_parser'
11 require 'lib/evo/taxonomy/taxonomy'
12 require 'lib/evo/msa/msa'
13
14 require 'iconv'
15
16 module Evoruby
17
class NcbiTSeqParser < MsaParser

    # Names of the XML elements found inside one <TSeq> record.
    TSEQ_SEQ = "TSeq_sequence"
    TSEQ_DEFLINE = "TSeq_defline"
    TSEQ_ORGNAME = "TSeq_orgname"
    TSEQ_TAXID = "TSeq_taxid"
    TSEQ_SID = "TSeq_sid"
    TSEQ_ACCVER = "TSeq_accver"
    TSEQ_GI = "TSeq_gi"
    TSEQ_TYPE = "TSeq_seqtype"
    TSEQ_LENGTH = "TSeq_length"

    def initialize
    end


    # Example of the input this parser reads. Every element is expected
    # to sit on its own line, with opening and closing tag on that line:
    #
    #  <TSeqSet>
    #<TSeq>
    #  <TSeq_seqtype value="protein"/>
    #  <TSeq_gi>29341016</TSeq_gi>
    #  <TSeq_accver>AAO78806.1</TSeq_accver>
    #  <TSeq_sid>gnl|mbpwusl|BT3701</TSeq_sid>
    #  <TSeq_taxid>226186</TSeq_taxid>
    #  <TSeq_orgname>Bacteroides thetaiotaomicron VPI-5482</TSeq_orgname>
    #  <TSeq_defline>SusD [Bacteroides thetaiotaomicron VPI-5482]</TSeq_defline>
    #  <TSeq_length>551</TSeq_length>
    #  <TSeq_sequence>MKTKYIKQLFSAALIAVLSSGVTSCINDLDISPIDPQTGGSFDQQGVFVKGYAMLGVTGQKGIDGSPDLDGQDEGESGFYRTTFNCNELPTDECLWAWQENQDIPQLTSISWSPSSQRTEWVYVRLGYDITQYNFFLDQTEGMTDAETLRQRAEIRFLRALHYWYFLDLFGKAPFKEHFSNDLPVEKKGTELYTYIQNELNEIEADMYEPRQAPFGRADKAANWLLRARLYLNAGVYTGQTDYAKAEEYASKVIGSAYKLCTNYSELFMADNDENENAMQEIILPIRQDGVKTRNYGGSTYLVCGTRVAGMPRMGTTNGWSCIFARAAMVQKFFSNLEDVPMLPADVEIPTKGLDTDEQIDAFDAEHGIRTEDMIKAAGDDRALLYSGVGGGRRKIQTDAISGFTDGLSIVKWQNYRSDGKPVSHATYPDTDIPLFRLAEAYLTRAEAIFRQGGDATGDINELRKRANCTRKVQTVTEQELIDEWAREFYLEGRRRSDLVRFGMFTTNKYLWDWKGGAMNGTSVASYYNKYPIPVSDINNNRNMSQNEGYK</TSeq_sequence>
    #</TSeq>

    # Reads the file at +path+ line by line and returns a Msa holding one
    # Sequence per <TSeq>...</TSeq> record.
    #
    # Blank lines are skipped. Non-blank lines outside of a record (other
    # than the opening <TSeq> tag) match no branch and are silently
    # ignored. Inside a record, TSeq_seqtype and TSeq_length lines are
    # recognized but not stored.
    #
    # Raises IOError if a line inside a record is none of the known
    # elements; the message contains the 1-based number of that line.
    # Util.check_file_for_readability is called first (presumably it
    # raises for unreadable files -- confirm against Util).
    #
    # NOTE(review): a record that is still open at end of file is dropped
    # without any error.
    def parse( path )
        Util.check_file_for_readability( path )
        seqs = Msa.new

        in_seq  = false
        gi      = nil
        accver  = nil
        sid     = nil
        taxid   = nil
        orgname = nil
        defline = nil
        seq_str = nil
        # Starts at 0 because it is incremented before a line is examined;
        # this way error messages report the real, 1-based line number.
        line_counter = 0
        # UTF-8 -> UTF-8 with //IGNORE drops bytes that are not valid UTF-8.
        ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
        File.open( path ) do | file |
            while line = file.gets
                line = ic.iconv( line )
                line_counter += 1
                if can_ignore?( line )
                    # nothing to do for blank lines
                elsif line =~ /^\s*<TSeq>/
                    in_seq = true
                elsif in_seq
                    if line =~ /^\s*<\/TSeq>/
                        # End of record: emit it and reset all fields so
                        # nothing leaks into the next record.
                        in_seq = false
                        seqs.add_sequence( build_sequence( gi, accver, sid,
                                taxid, orgname, defline, seq_str ) )
                        gi      = nil
                        accver  = nil
                        sid     = nil
                        taxid   = nil
                        orgname = nil
                        defline = nil
                        seq_str = nil
                    elsif line =~ /^\s*<#{TSEQ_GI}>(\d+)<\/#{TSEQ_GI}>/
                        gi = $1
                    elsif line =~ /^\s*<#{TSEQ_ACCVER}>(.+)<\/#{TSEQ_ACCVER}>/
                        accver = $1
                    elsif line =~ /^\s*<#{TSEQ_SID}>(.+)<\/#{TSEQ_SID}>/
                        sid = $1
                    elsif line =~ /^\s*<#{TSEQ_TAXID}>(\d+)<\/#{TSEQ_TAXID}>/
                        taxid = $1
                    elsif line =~ /^\s*<#{TSEQ_ORGNAME}>(.+)<\/#{TSEQ_ORGNAME}>/
                        orgname = $1
                    elsif line =~ /^\s*<#{TSEQ_DEFLINE}>(.+)<\/#{TSEQ_DEFLINE}>/
                        defline = $1
                    elsif line =~ /^\s*<#{TSEQ_SEQ}>(.+)<\/#{TSEQ_SEQ}>/
                        seq_str = $1
                    elsif line =~ /^\s*<#{TSEQ_TYPE}/
                        # sequence type is recognized but not used
                    elsif line =~ /^\s*<#{TSEQ_LENGTH}/
                        # sequence length is recognized but not used
                    else
                        error_msg = "unexpected line format at line #{line_counter}: " + line
                        raise IOError, error_msg
                    end
                end
            end
        end
        return seqs
    end

    private

    # Returns true if +line+ contains no non-whitespace character.
    def can_ignore?( line )
        return ( line !~ /\S/ )
    end

    # Builds the Sequence for one finished <TSeq> record from the raw
    # field values (each a String, or nil if the element was absent).
    #
    # A Taxonomy is attached only if a taxid or an organism name is
    # present; its id source is "ncbi" when a taxid is present.
    #
    # The sequence id is chosen by priority gi > sid > accver:
    #   gi present     -> id = gi (source "gi"), symbol = sid, else accver
    #   else sid       -> id = sid (source "ncbi"), symbol = accver
    #   else accver    -> id = accver (source "ncbi"), no symbol
    def build_sequence( gi, accver, sid, taxid, orgname, defline, seq_str )
        taxonomy = nil
        if taxid != nil || orgname != nil
            taxonomy_id_source = nil
            if taxid != nil
                taxonomy_id_source = "ncbi"
            end
            taxonomy = Taxonomy.new( orgname, taxid , taxonomy_id_source )
        end

        id = nil
        id_source = nil
        symbol = nil
        if gi != nil
            id = gi
            id_source = "gi"
            if sid != nil
                symbol = sid
            elsif accver != nil
                symbol = accver
            end
        elsif sid != nil
            id = sid
            id_source = "ncbi"
            if accver != nil
                symbol = accver
            end
        elsif accver != nil
            id = accver
            id_source = "ncbi"
        end

        Sequence.new( defline,
            seq_str,
            id,
            id_source,
            taxonomy,
            symbol )
    end

end # class NcbiTSeqParser
152
153 end # module Evoruby