in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / general_msa_parser.rb
1 #
2 # = lib/evo/io/parser/general_msa_parser - GeneralMsaParser class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6 #
7 # Last modified: 2017/02/07
8
9 require 'lib/evo/io/parser/msa_parser'
10 require 'lib/evo/msa/msa'
11
12 #require 'iconv'
13
14 module Evoruby
15
16     class GeneralMsaParser < MsaParser
17
18         def initialize
19         end
20
21         def parse( path )
22             Util.check_file_for_readability( path )
23             block                       = -1
24             current_seq_index_per_block = -1
25             current_name                = nil
26             saw_ignorable = true
27             is_first      = true
28             msa = Msa.new
29             ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
30             File.open( path ) do | file |
31                 while line = file.gets
32                     line = ic.iconv( line )
33                     if can_ignore?( line )
34                         saw_ignorable = true
35                     elsif ( is_first && is_program_name_line?( line ) ) 
36                     elsif( line =~ /^\S+\s+.+\s*$/ || line =~ /^\s+.+\s*$/ || line =~ /^\S+\s*$/ )
37                         if ( saw_ignorable )
38                             block += 1
39                             current_seq_index_per_block = -1
40                             saw_ignorable = false
41                         end
42                         current_seq_index_per_block += 1
43                         if ( line =~ /^(\S+)\s+(.+?)\s*$/ )
44                             name = $1
45                             seq  = $2.gsub( /\s/, '.' )
46                             a = msa.find_by_name( name, false, false )
47                             if ( a.length < 1 )
48                                 msa.add( name, seq )
49                             elsif ( a.length == 1 )
50                                 msa.get_sequence( a[ 0 ] ).append!( seq )
51                             else
52                                 error_msg = "Unexpected error at line: " + line
53                                 raise IOError, error_msg
54                             end
55                             current_name = name
56                         elsif ( line =~ /^\s+(.+?)\s*$/ )
57                             seq = $1.gsub( /\s/, '.' )
58                             a = msa.find_by_name( current_name, false, false )
59                             if ( a.length != 1  )
60                                 error_msg = "Unexpected error at line: " + line
61                                 raise IOError, error_msg
62                             else
63                                 msa.get_sequence( a[ 0 ] ).append!( seq )
64                             end
65
66                         elsif ( line =~ /^(\S+)\s*$/ )
67                             seq = $1
68                             if block == 0
69                                 error_msg = "First block cannot contain unnamed sequences"
70                                 raise IOError, error_msg
71                             else
72                                 msa.get_sequence( current_seq_index_per_block ).append!( seq )
73                             end
74                             current_name = nil
75                         end
76                     else
77                         error_msg = "Unexpected line: " + line
78                         raise IOError, error_msg
79                     end
80                     if ( is_first )
81                         is_first = false
82                     end
83                 end
84             end
85             return msa
86         end # def parse( path )
87
88         private
89
90         def can_ignore?( line )
91             return ( line !~ /[A-Za-z\-?\*_\.]/ ||
92                      line =~ /^\s+[*\.:]/ ||
93                      line =~ /^\s*#/ ||
94                      line =~ /^\s*%/ ||
95                      line =~ /^\s*\/\// ||
96                      line =~ /^\s*!!/  )
97         end
98         
99         def is_program_name_line?( line )
100             return ( line =~ /^CLUSTAL\s/ ||
101                      line =~ /^MUSCLE\s\(/ ||
102                      line =~ /^PROBCONS\s/ )             
103         end  
104     end # class GeneralMsaParser
105
106 end # module Evoruby