2 # = lib/evo/apps/domains_to_forester - DomainsToForester class
4 # Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
9 # last modified: 06/11/2007
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/util'
13 require 'lib/evo/util/command_line_arguments'
14 require 'lib/evo/msa/msa_factory'
15 require 'lib/evo/io/parser/fasta_parser'
16 require 'lib/evo/sequence/protein_domain'
17 require 'lib/evo/sequence/domain_structure'
21 class DomainsToForester
24 PRG_DESC = "parsed hmmpfam output to forester format"
27 COPYRIGHT = "2012 Christian M Zmasek"
28 CONTACT = "phylosoft@gmail.com"
29 WWW = "www.phylosoft.org"
31 E_VALUE_THRESHOLD_OPTION = "e"
32 OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
33 HELP_OPTION_1 = "help"
36 def parse( domains_list_file,
41 overwrite_if_same_from_to )
42 Util.check_file_for_readability( domains_list_file )
43 Util.check_file_for_readability( original_seqs_file )
44 Util.check_file_for_writability( outfile )
46 domain_structures = Hash.new() # protein name is key, domain structure is value
50 original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
51 if ( original_seqs.get_number_of_seqs < 1 )
52 error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
53 raise ArgumentError, error_msg
56 File.open( domains_list_file ) do | file |
57 while line = file.gets
58 if !is_ignorable?( line )
60 a = line.split( column_delimiter )
62 if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
63 error_msg = "unexpected format at line: " + line
64 raise IOError, error_msg
70 ##########################################
71 if domain_name =~ /RRM_\d/
72 puts "ignoring " + line
75 ##########################################
79 seq_from = a[ 2 ].to_i
81 error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]"
82 raise IOError, error_msg
87 error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]"
88 raise IOError, error_msg
96 error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]"
97 raise IOError, error_msg
101 seq = original_seqs.get_by_name_start( protein_name )
103 total_length = seq.get_length
105 if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) ) )
106 pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
108 if ( domain_structures.has_key?( protein_name ) )
109 ds = domain_structures[ protein_name ]
111 ds = DomainStructure.new( total_length )
112 domain_structures[ protein_name ] = ds
114 ds.add_domain( pd, overwrite_if_same_from_to )
121 out = File.open( outfile, "a" )
122 ds = domain_structures.sort
124 protein_name = d[ 0 ]
125 domain_structure = d[ 1 ]
126 out.print( protein_name.to_s )
128 out.print( domain_structure.to_NHX )
129 out.print( Constants::LINE_DELIMITER )
142 Util.print_program_information( PRG_NAME,
152 cla = CommandLineArguments.new( ARGV )
153 rescue ArgumentError => e
154 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
157 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
158 cla.is_option_set?( HELP_OPTION_2 ) )
163 if cla.get_number_of_files != 3
168 allowed_opts = Array.new
169 allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
170 allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION )
172 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
173 if ( disallowed.length > 0 )
174 Util.fatal_error( PRG_NAME,
175 "unknown option(s): " + disallowed,
179 domains_list_file = cla.get_file_name( 0 )
180 original_sequences_file = cla.get_file_name( 1 )
181 outfile = cla.get_file_name( 2 )
184 e_value_threshold = -1.0
185 if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION )
187 e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
188 rescue ArgumentError => e
189 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
191 if ( e_value_threshold < 0.0 )
192 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
195 overwrite_if_same_from_to = false
196 if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
197 overwrite_if_same_from_to = true
201 puts( "Domains list file : " + domains_list_file )
202 puts( "Fasta sequencefile (complete sequences): " + original_sequences_file )
203 puts( "Outputfile : " + outfile )
204 if ( e_value_threshold >= 0.0 )
205 puts( "E-value threshold : " + e_value_threshold.to_s )
207 puts( "E-value threshold : no threshold" )
209 if ( overwrite_if_same_from_to )
210 puts( "Overwrite if same from and to : true" )
212 puts( "Overwrite if same from and to : false" )
218 parse( domains_list_file,
219 original_sequences_file,
223 overwrite_if_same_from_to )
225 rescue ArgumentError, IOError, StandardError => e
226 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
227 rescue Exception => e
228 Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
233 Util.print_message( PRG_NAME, 'OK' )
244 puts( " " + PRG_NAME + ".rb [options] <domains list file (parsed hmmpfam output)> <file containing complete sequences in fasta format> <outputfile>" )
246 puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f> : E-value threshold, default is no threshold" )
247 puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" )
253 def is_ignorable?( line )
254 return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
258 end # class DomainsToForester