in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / general_msa_parser.rb
1 #
2 # = lib/evo/io/parser/general_msa_parser - GeneralMsaParser class
3 #
4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: general_msa_parser.rb,v 1.8 2009/10/08 22:44:54 cmzmasek Exp $
8 #
9 # last modified: 2009/10/08
10
11 require 'lib/evo/io/parser/msa_parser'
12 require 'lib/evo/msa/msa'
13
14 require 'iconv'
15
module Evoruby

    # Reads a multiple sequence alignment from a text file laid out in
    # blocks of lines (each data line is either "name sequence", an indented
    # continuation, or a bare sequence) and collects it into an Msa.
    class GeneralMsaParser < MsaParser

        def initialize
        end

        # Parses the alignment file at +path+ and returns the resulting Msa.
        #
        # Line handling, in order of precedence:
        # * lines accepted by can_ignore? only mark that a separator was seen;
        #   the next data line then opens a new block
        # * a program header (see is_program_name_line?) is skipped, but only
        #   when it is the very first line of the file
        # * "name sequence" lines add a new sequence, or append to the one
        #   already stored under that name
        # * lines starting with whitespace are appended to the sequence of the
        #   most recently seen name
        # * lines holding a single token are appended to the sequence whose
        #   index equals the line's position within the current block; this
        #   is rejected in the first block
        #
        # Whitespace inside the sequence part of a line is replaced by '.'.
        #
        # Raises IOError for lines that cannot be placed. Readability of
        # +path+ is delegated to Util.check_file_for_readability.
        def parse( path )
            Util.check_file_for_readability( path )
            msa             = Msa.new
            block_index     = -1
            row_in_block    = -1
            last_name       = nil
            after_separator = true
            first_line      = true
            # Same-encoding conversion with //IGNORE drops bytes that are
            # not valid UTF-8 before any pattern matching is attempted.
            converter = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )
            File.open( path ) do | file |
                file.each_line do | raw_line |
                    line = converter.iconv( raw_line )

                    # The "first line" status is consumed by every line,
                    # including ignorable ones.
                    on_first_line = first_line
                    first_line    = false

                    if can_ignore?( line )
                        after_separator = true
                        next
                    end

                    next if on_first_line && is_program_name_line?( line )

                    unless line =~ /^\S+\s+.+\s*$/ || line =~ /^\s+.+\s*$/ || line =~ /^\S+\s*$/
                        raise IOError, "Unexpected line: " + line
                    end

                    # First data line after a separator starts a new block.
                    if after_separator
                        block_index    += 1
                        row_in_block    = -1
                        after_separator = false
                    end
                    row_in_block += 1

                    case line
                    when /^(\S+)\s+(.+?)\s*$/
                        # "name sequence"
                        name     = $1
                        residues = $2.gsub( /\s/, '.' )
                        # presumably the indices of sequences called +name+;
                        # verify against Msa#find_by_name
                        hits = msa.find_by_name( name, false, false )
                        case hits.length
                        when 0
                            msa.add( name, residues )
                        when 1
                            msa.get_sequence( hits[ 0 ] ).append!( residues )
                        else
                            raise IOError, "Unexpected error at line: " + line
                        end
                        last_name = name
                    when /^\s+(.+?)\s*$/
                        # Indented continuation of the last named sequence.
                        residues = $1.gsub( /\s/, '.' )
                        hits     = msa.find_by_name( last_name, false, false )
                        if hits.length != 1
                            raise IOError, "Unexpected error at line: " + line
                        end
                        msa.get_sequence( hits[ 0 ] ).append!( residues )
                    when /^(\S+)\s*$/
                        # Unnamed sequence line: matched up by position
                        # within the block, which needs an earlier block.
                        residues = $1
                        if block_index == 0
                            raise IOError, "First block cannot contain unnamed sequences"
                        end
                        msa.get_sequence( row_in_block ).append!( residues )
                        last_name = nil
                    end
                end
            end
            msa
        end # def parse( path )

        private

        # Truthy when +line+ carries no alignment data: it contains none of
        # the characters letters, '-', '?', '*', '_', '.', or it is an
        # indented line starting with '*', '.' or ':', or it starts
        # (after optional whitespace) with '#', '%', '//' or '!!'.
        def can_ignore?( line )
            return true unless line =~ /[A-Za-z\-?\*_\.]/
            line =~ /^\s+[*\.:]/ ||
                line =~ /^\s*#/ ||
                line =~ /^\s*%/ ||
                line =~ /^\s*\/\// ||
                line =~ /^\s*!!/
        end

        # Truthy when +line+ begins with "CLUSTAL" or "PROBCONS" followed by
        # whitespace, or with "MUSCLE" followed by whitespace and '('.
        def is_program_name_line?( line )
            line =~ /^CLUSTAL\s/ ||
                line =~ /^MUSCLE\s\(/ ||
                line =~ /^PROBCONS\s/
        end
    end # class GeneralMsaParser

end # module Evoruby