2 # = lib/evo/apps/domains_to_forester - DomainsToForester class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
9 # last modified: 06/11/2007
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/util/command_line_arguments'
14 require 'lib/evo/msa/msa_factory'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/sequence/protein_domain'
17 require 'lib/evo/sequence/domain_structure'
21 class DomainsToForester
24 PRG_DESC = "parsed hmmpfam output to forester format"
27 COPYRIGHT = "2012 Christian M Zmasek"
28 CONTACT = "phylosoft@gmail.com"
29 WWW = "www.phylosoft.org"
31 E_VALUE_THRESHOLD_OPTION = "e"
32 OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
33 HELP_OPTION_1 = "help"
36 def parse( domains_list_file,
41 overwrite_if_same_from_to )
42 Util.check_file_for_readability( domains_list_file )
43 Util.check_file_for_readability( original_seqs_file )
44 Util.check_file_for_writability( outfile )
46 domain_structures = Hash.new() # protein name is key, domain structure is value
50 original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
51 if ( original_seqs.get_number_of_seqs < 1 )
52 error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
53 raise ArgumentError, error_msg
56 File.open( domains_list_file ) do | file |
57 while line = file.gets
58 if !is_ignorable?( line )
59 a = line.split( column_delimiter )
61 if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
62 error_msg = "unexpected format at line: " + line
63 raise IOError, error_msg
70 seq_from = a[ 2 ].to_i
72 error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]"
73 raise IOError, error_msg
78 error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]"
79 raise IOError, error_msg
87 error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]"
88 raise IOError, error_msg
92 seq = original_seqs.get_by_name_start( protein_name )
94 total_length = seq.get_length
96 if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) ) )
97 pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
99 if ( domain_structures.has_key?( protein_name ) )
100 ds = domain_structures[ protein_name ]
102 ds = DomainStructure.new( total_length )
103 domain_structures[ protein_name ] = ds
105 ds.add_domain( pd, overwrite_if_same_from_to )
112 out = File.open( outfile, "a" )
113 ds = domain_structures.sort
115 protein_name = d[ 0 ]
116 domain_structure = d[ 1 ]
117 out.print( protein_name.to_s )
119 out.print( domain_structure.to_NHX )
120 out.print( Constants::LINE_DELIMITER )
133 Util.print_program_information( PRG_NAME,
143 cla = CommandLineArguments.new( ARGV )
144 rescue ArgumentError => e
145 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
148 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
149 cla.is_option_set?( HELP_OPTION_2 ) )
154 if cla.get_number_of_files != 3
159 allowed_opts = Array.new
160 allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
161 allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION )
163 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
164 if ( disallowed.length > 0 )
165 Util.fatal_error( PRG_NAME,
166 "unknown option(s): " + disallowed,
170 domains_list_file = cla.get_file_name( 0 )
171 original_sequences_file = cla.get_file_name( 1 )
172 outfile = cla.get_file_name( 2 )
175 e_value_threshold = -1.0
176 if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION )
178 e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
179 rescue ArgumentError => e
180 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
182 if ( e_value_threshold < 0.0 )
183 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
186 overwrite_if_same_from_to = false
187 if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
188 overwrite_if_same_from_to = true
192 puts( "Domains list file : " + domains_list_file )
193 puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
194 puts( "Outputfile : " + outfile )
195 if ( e_value_threshold >= 0.0 )
196 puts( "E-value threshold : " + e_value_threshold.to_s )
198 puts( "E-value threshold : no threshold" )
200 if ( overwrite_if_same_from_to )
201 puts( "Overwrite if same from and to : true" )
203 puts( "Overwrite if same from and to : false" )
209 parse( domains_list_file,
210 original_sequences_file,
214 overwrite_if_same_from_to )
216 rescue ArgumentError, IOError, StandardError => e
217 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
218 rescue Exception => e
219 Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
224 Util.print_message( PRG_NAME, 'OK' )
235 puts( " " + PRG_NAME + ".rb [options] <domains list file (parsed hmmpfam output)> <file containing complete sequences in fasta format> <outputfile>" )
237 puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : E-value threshold, default is no threshold" )
238 puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" )
244 def is_ignorable?( line )
245 return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
249 end # class DomainsToForester