in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / domains_to_forester.rb
1 #
2 # = lib/evo/apps/domains_to_forester - DomainsToForester class
3 #
4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: domains_to_forester.rb,v 1.11 2010/12/13 19:00:11 cmzmasek Exp $
8 #
9 # last modified: 06/11/2007
10
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/util/command_line_arguments'
14 require 'lib/evo/msa/msa_factory'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/sequence/protein_domain'
17 require 'lib/evo/sequence/domain_structure'
18
19 module Evoruby
20
21     class DomainsToForester
22
23         PRG_NAME       = "d2f"
24         PRG_DESC       = "parsed hmmpfam output to forester format"
25         PRG_VERSION    = "1.0.0"
26         PRG_DATE       = "2007.12.18"
27         COPYRIGHT      = "2007 Christian M Zmasek"
28         CONTACT        = "phylosoft@gmail.com"
29         WWW            = "www.phylosoft.org"
30
31         E_VALUE_THRESHOLD_OPTION         = "e"
32         OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
33         HELP_OPTION_1                    = "help"
34         HELP_OPTION_2                    = "h"
35
36         def parse( domains_list_file,
37                 original_seqs_file,
38                 outfile,
39                 column_delimiter,
40                 e_value_threshold,
41                 overwrite_if_same_from_to )
42             Util.check_file_for_readability( domains_list_file )
43             Util.check_file_for_readability( original_seqs_file )
44             Util.check_file_for_writability( outfile )
45
46             domain_structures = Hash.new() # protein name is key, domain structure is value
47
48             f = MsaFactory.new
49
50             original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
51             if ( original_seqs.get_number_of_seqs < 1 )
52                 error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
53                 raise ArgumentError, error_msg
54             end
55
56             File.open( domains_list_file ) do | file |
57                 while line = file.gets
58                     if ( !is_ignorable?( line ) )
59                         a = line.split( column_delimiter )
60                         l = a.length
61                         if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
62                             error_msg = "unexpected format at line: " + line
63                             raise IOError, error_msg
64                         end
65                         protein_name = a[ 0 ]
66                         domain_name  = a[ 1 ]
67                         seq_from     = -1
68                         seq_to       = -1
69                         begin
70                             seq_from = a[ 2 ].to_i
71                         rescue Exception
72                             error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]"
73                             raise IOError, error_msg
74                         end
75                         begin
76                             seq_to = a[ 3 ].to_i
77                         rescue Exception
78                             error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]"
79                             raise IOError, error_msg
80                         end
81
82                         e_value = -1
83                         if ( l > 4 )
84                             begin
85                                 e_value = a[ 4 ].to_f
86                             rescue Exception
87                                 error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]"
88                                 raise IOError, error_msg
89                             end
90                         end
91
92                         seq = original_seqs.get_by_name( protein_name, true, false )
93
94                         total_length = seq.get_length
95
96                         if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) )  )
97                             pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
98                             ds = nil
99                             if ( domain_structures.has_key?( protein_name ) )
100                                 ds = domain_structures[ protein_name ]
101                             else
102                                 ds = DomainStructure.new( total_length )
103                                 domain_structures[ protein_name ] = ds
104                             end
105                             ds.add_domain( pd, overwrite_if_same_from_to )
106                         end
107
108                     end
109                 end
110             end
111
112             out = File.open( outfile, "a" )
113             ds = domain_structures.sort
114             for d in ds
115                 protein_name     = d[ 0 ]
116                 domain_structure = d[ 1 ]
117                 out.print( protein_name.to_s )
118                 out.print( ":" )
119                 out.print( domain_structure.to_NHX )
120                 out.print( Constants::LINE_DELIMITER  )
121             end
122
123             out.flush()
124             out.close()
125
126         end # parse
127
128
129
130
131         def run()
132
133             Util.print_program_information( PRG_NAME,
134                 PRG_VERSION,
135                 PRG_DESC,
136                 PRG_DATE,
137                 COPYRIGHT,
138                 CONTACT,
139                 WWW,
140                 STDOUT )
141
142             begin
143                 cla = CommandLineArguments.new( ARGV )
144             rescue ArgumentError => e
145                 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
146             end
147
148             if ( cla.is_option_set?( HELP_OPTION_1 ) ||
149                      cla.is_option_set?( HELP_OPTION_2 ) )
150                 print_help
151                 exit( 0 )
152             end
153
154             if ( cla.get_number_of_files != 3 )
155                 print_help
156                 exit( -1 )
157             end
158
159             allowed_opts = Array.new
160             allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
161             allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION )
162
163             disallowed = cla.validate_allowed_options_as_str( allowed_opts )
164             if ( disallowed.length > 0 )
165                 Util.fatal_error( PRG_NAME,
166                     "unknown option(s): " + disallowed,
167                     STDOUT )
168             end
169
170             domains_list_file       = cla.get_file_name( 0 )
171             original_sequences_file = cla.get_file_name( 1 )
172             outfile                 = cla.get_file_name( 2 )
173
174
175             e_value_threshold = -1.0
176             if ( cla.is_option_set?( E_VALUE_THRESHOLD_OPTION ) )
177                 begin
178                     e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
179                 rescue ArgumentError => e
180                     Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
181                 end
182                 if ( e_value_threshold < 0.0 )
183                     Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
184                 end
185             end
186             overwrite_if_same_from_to = false
187             if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
188                 overwrite_if_same_from_to = true
189             end
190
191             puts()
192             puts( "Domains list file                      : " + domains_list_file )
193             puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
194             puts( "Outputfile                             : " + outfile )
195             if ( e_value_threshold >= 0.0 )
196                 puts( "E-value threshold                      : " + e_value_threshold.to_s )
197             else
198                 puts( "E-value threshold                      : no threshold" )
199             end
200             if ( overwrite_if_same_from_to )
201                 puts( "Overwrite if same from and to          : true" )
202             else
203                 puts( "Overwrite if same from and to          : false" )
204             end
205
206             puts
207
208             begin
209                 parse( domains_list_file,
210                     original_sequences_file,
211                     outfile,
212                     " ",
213                     e_value_threshold,
214                     overwrite_if_same_from_to )
215
216             rescue ArgumentError, IOError, StandardError => e
217                 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
218             rescue Exception => e
219                 Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
220             end
221
222
223             puts
224             Util.print_message( PRG_NAME, 'OK' )
225             puts
226
227         end
228
229         private
230
231         def print_help()
232             puts()
233             puts( "Usage:" )
234             puts()
235             puts( "  " + PRG_NAME + ".rb [options] <domains list file (parsed hmmpfam output)> <file containing complete sequences in fasta format> <outputfile>" )
236             puts()
237             puts( "  options: -" + E_VALUE_THRESHOLD_OPTION  + "=<f> : E-value threshold, default is no threshold" )
238             puts( "               -" + OVERWRITE_IF_SAME_FROM_TO_OPTION  + " : overwrite domain with same start and end with domain with better E-value" )
239             puts()
240         end
241
242
243
244         def is_ignorable?( line )
245             return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
246         end
247
248
249     end # class DomainsToForester
250
251
252 end # module Evoruby