2 # = lib/evo/apps/domains_to_forester - DomainsToForester class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 require 'lib/evo/util/constants'
8 require 'lib/evo/util/util'
9 require 'lib/evo/util/command_line_arguments'
10 require 'lib/evo/msa/msa_factory'
11 require 'lib/evo/io/parser/fasta_parser'
12 require 'lib/evo/sequence/protein_domain'
13 require 'lib/evo/sequence/domain_structure'
16 class DomainsToForester
19 PRG_DESC = "converting of parsed hmmpfam output to forester format"
22 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
24 E_VALUE_THRESHOLD_OPTION = "e"
25 OVERWRITE_IF_SAME_FROM_TO_OPTION = "o"
26 HELP_OPTION_1 = "help"
28 def parse( domains_list_file,
33 overwrite_if_same_from_to )
34 Util.check_file_for_readability( domains_list_file )
35 Util.check_file_for_readability( original_seqs_file )
36 Util.check_file_for_writability( outfile )
38 domain_structures = Hash.new() # protein name is key, domain structure is value
42 original_seqs = f.create_msa_from_file( original_seqs_file, FastaParser.new )
43 if ( original_seqs.get_number_of_seqs < 1 )
44 error_msg = "\"" + original_seqs_file + "\" appears devoid of sequences in fasta-format"
45 raise ArgumentError, error_msg
48 File.open( domains_list_file ) do | file |
49 while line = file.gets
50 if !is_ignorable?( line )
52 a = line.split( column_delimiter )
54 if ( ( l < 4 ) || ( e_value_threshold >= 0.0 && l < 5 ) )
55 error_msg = "unexpected format at line: " + line
56 raise IOError, error_msg
64 seq_from = a[ 2 ].to_i
66 error_msg = "failed to parse seq from from \"" + a[ 2 ] + "\" [line: " + line + "]"
67 raise IOError, error_msg
72 error_msg = "failed to parse seq to from \"" + a[ 3 ] + "\" [line: " + line + "]"
73 raise IOError, error_msg
81 error_msg = "failed to parse E-value from \"" + a[ 4 ] + "\" [line: " + line + "]"
82 raise IOError, error_msg
86 seq = original_seqs.get_by_name_start( protein_name )
88 total_length = seq.get_length
90 if ( ( ( e_value_threshold < 0.0 ) || ( e_value <= e_value_threshold ) ) )
91 pd = ProteinDomain.new( domain_name, seq_from, seq_to, "", e_value )
93 if ( domain_structures.has_key?( protein_name ) )
94 ds = domain_structures[ protein_name ]
96 ds = DomainStructure.new( total_length )
97 domain_structures[ protein_name ] = ds
99 ds.add_domain( pd, overwrite_if_same_from_to )
106 out = File.open( outfile, "a" )
107 ds = domain_structures.sort
109 protein_name = d[ 0 ]
110 domain_structure = d[ 1 ]
111 out.print( protein_name.to_s )
113 out.print( domain_structure.to_NHX )
114 out.print( Constants::LINE_DELIMITER )
124 Util.print_program_information( PRG_NAME,
131 if ( ARGV == nil || ( ARGV.length < 1 ) )
137 cla = CommandLineArguments.new( ARGV )
138 rescue ArgumentError => e
139 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
142 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
143 cla.is_option_set?( HELP_OPTION_2 ) )
148 unless ( cla.get_number_of_files == 1 || cla.get_number_of_files == 2 || cla.get_number_of_files == 3 )
153 allowed_opts = Array.new
154 allowed_opts.push( E_VALUE_THRESHOLD_OPTION )
155 allowed_opts.push( OVERWRITE_IF_SAME_FROM_TO_OPTION )
157 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
158 if ( disallowed.length > 0 )
159 Util.fatal_error( PRG_NAME,
160 "unknown option(s): " + disallowed,
164 e_value_threshold = -1.0
165 if cla.is_option_set?( E_VALUE_THRESHOLD_OPTION )
167 e_value_threshold = cla.get_option_value_as_float( E_VALUE_THRESHOLD_OPTION )
168 rescue ArgumentError => e
169 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
171 if ( e_value_threshold < 0.0 )
172 Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT )
176 domains_list_file = cla.get_file_name( 0 )
177 original_sequences_file = ""
179 if (cla.get_number_of_files == 3)
180 original_sequences_file = cla.get_file_name( 1 )
181 outfile = cla.get_file_name( 2 )
182 elsif (cla.get_number_of_files == 1 || cla.get_number_of_files == 2 )
183 if ( cla.get_number_of_files == 2 )
184 original_sequences_file = cla.get_file_name( 1 )
186 hmmscan_index = domains_list_file.index(Constants::HMMSCAN)
187 if ( hmmscan_index != nil )
188 prefix = domains_list_file[0 .. hmmscan_index-1 ]
189 suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
190 files = Dir.entries( "." )
191 matching_files = Util.get_matching_files( files, prefix, suffix)
192 if matching_files.length < 1
193 Util.fatal_error( PRG_NAME, 'no file matching [' + prefix +
194 '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
196 if matching_files.length > 1
197 Util.fatal_error( PRG_NAME, 'more than one file matching [' +
198 prefix + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
200 original_sequences_file = matching_files[ 0 ]
202 Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
206 outfile = domains_list_file
207 if (outfile.end_with?(Constants::DOMAIN_TABLE_SUFFIX) )
208 outfile = outfile.chomp(Constants::DOMAIN_TABLE_SUFFIX)
210 if ( e_value_threshold >= 0.0 )
211 e = e_value_threshold >= 1 ? e_value_threshold.to_i : e_value_threshold.to_s.sub!('.','_')
212 outfile = outfile + Constants::DOMAINS_TO_FORESTER_EVALUE_CUTOFF_SUFFIX + e.to_s
214 outfile = outfile + Constants::DOMAINS_TO_FORESTER_OUTFILE_SUFFIX
217 overwrite_if_same_from_to = false
218 if ( cla.is_option_set?( OVERWRITE_IF_SAME_FROM_TO_OPTION ) )
219 overwrite_if_same_from_to = true
223 puts( "Domain table : " + domains_list_file )
224 puts( "Fasta sequence file (complete sequences): " + original_sequences_file )
225 puts( "Outputfile : " + outfile )
226 if ( e_value_threshold >= 0.0 )
227 puts( "E-value threshold : " + e_value_threshold.to_s )
229 puts( "E-value threshold : no threshold" )
231 if ( overwrite_if_same_from_to )
232 puts( "Overwrite if same from and to : true" )
234 puts( "Overwrite if same from and to : false" )
240 parse( domains_list_file,
241 original_sequences_file,
245 overwrite_if_same_from_to )
247 rescue ArgumentError, IOError, StandardError => e
248 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
249 rescue Exception => e
250 Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
254 Util.print_message( PRG_NAME, "wrote: " + outfile )
255 Util.print_message( PRG_NAME, "next step in standard analysis pipeline: dsx.rb")
256 Util.print_message( PRG_NAME, 'OK' )
267 puts( " " + PRG_NAME + ".rb [options] <domain table (parsed hmmpfam output)> [file containing complete sequences in fasta format] [outputfile]" )
269 puts( " options: -" + E_VALUE_THRESHOLD_OPTION + "=<f>: E-value threshold, default is no threshold" )
270 puts( " -" + OVERWRITE_IF_SAME_FROM_TO_OPTION + " : overwrite domain with same start and end with domain with better E-value" )
272 puts( " [next step in standard analysis pipeline: dsx.rb]")
276 puts( " " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10.dff" )
278 puts( " " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table P53_ni.fasta" )
280 puts( " " + PRG_NAME + ".rb -o P53_hmmscan_#{Constants::PFAM_V_FOR_EX}_10_domain_table" )
284 def is_ignorable?( line )
285 return ( line !~ /[A-Za-z0-9-]/ || line =~ /^\s*#/)
288 end # class DomainsToForester