2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
4 # Copyright:: Copyright (C) 2012 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
10 require 'lib/evo/util/constants'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/io/msa_io'
13 require 'lib/evo/io/writer/fasta_writer'
14 require 'lib/evo/io/parser/fasta_parser'
19 class HmmscanDomainExtractor
26 def process_hmmscan_datas( hmmscan_datas,
38 actual_out_of = hmmscan_datas.size
40 domain_pass_counter = 0;
42 hmmscan_datas.each_with_index do |hmmscan_data, index|
43 if hmmscan_data.number < ( index + 1 )
44 error_msg = "hmmscan_data.number < ( index + 1 ) (this should not have happened)"
45 raise StandardError, error_msg
48 extract_domain( hmmscan_data.seq_name,
51 hmmscan_data.env_from,
59 domain_pass_counter += 1
61 if passed_seqs.find_by_name_start( hmmscan_data.seq_name, true ).length < 1
62 add_sequence( hmmscan_data.seq_name, in_msa, passed_seqs )
66 extract_domain( hmmscan_data.seq_name,
69 hmmscan_data.env_from,
79 last = index == hmmscan_datas.length - 1
80 next_env_from = hmmscan_datas[ index + 1 ].env_from
81 env_to = hmmscan_data.env_to
82 env_from = hmmscan_data.env_from
83 prev_env_to = hmmscan_datas[ index - 1 ].env_to
86 if (( first && ( ( next_env_from - env_to ) > min_linker) )
88 ( last && ( ( env_from - prev_env_to ) > min_linker ) )
90 ( !first && !last && ( next_env_from - env_to > min_linker ) && ( last && env_from - prev_env_to > min_linker ) ))
95 extract_domain( hmmscan_data.seq_name,
96 index.to_s + "+" + (index + 1).to_s,
98 hmmscan_datas[ index - 1 ].env_from,
112 extract_domain( sequence,
125 singlets_counter += 1
127 if sequence != prev_sequence
130 if ( env_from - prev_env_to ) <= min_linker
131 extract_domain( sequence,
132 prev_number.to_s + "+" + number.to_s,
145 close_pairs_counter += 2
148 extract_domain( sequence,
154 out_msa_distant_partners,
161 distant_pairs_counter += 1
164 extract_domain( sequence,
170 out_msa_distant_partners,
177 distant_pairs_counter += 1
181 end # sequence != prev_sequence else
183 prev_sequence = sequence
185 prev_env_from = env_from
187 #prev_i_e_value = i_e_value
193 end # def process_hmmscan_data
196 # raises ArgumentError, IOError, StandardError
197 def parse( domain_id,
207 add_domain_number_as_digit,
208 add_domain_number_as_letter,
214 Util.check_file_for_readability( hmmscan_output )
215 Util.check_file_for_readability( fasta_sequence_file )
216 Util.check_file_for_writability( outfile )
217 Util.check_file_for_writability( passed_seqs_outfile )
218 Util.check_file_for_writability( failed_seqs_outfile )
221 factory = MsaFactory.new()
222 in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
224 if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
225 error_msg = "could not find fasta sequences in " + fasta_sequence_file
226 raise IOError, error_msg
231 failed_seqs = Msa.new
232 passed_seqs = Msa.new
234 out_msa_distant_partners = nil
235 out_msa_singles = nil
237 out_msa_pairs = Msa.new
238 out_msa_distant_partners = Msa.new
239 out_msa_singles = Msa.new
242 ld = Constants::LINE_DELIMITER
244 domain_pass_counter = 0
245 domain_fail_counter = 0
247 distant_pairs_counter = 0
248 close_pairs_counter = 0
249 proteins_with_failing_domains = 0
250 max_domain_copy_number_per_protein = -1
251 max_domain_copy_number_sequence = ""
257 # prev_i_e_value = nil
261 hmmscan_datas = Array.new
264 File.open( hmmscan_output ) do | file |
265 while line = file.gets
266 if !is_ignorable?( line ) && line =~ /^\S+\s+/
268 # tn acc tlen query acc qlen Evalue score bias # of c-E i-E score bias hf ht af at ef et acc desc
269 # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
270 line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
283 if ( ( ( e_value_threshold < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
284 ( ( length_threshold <= 0 ) || ( env_to - env_from + 1 ) >= length_threshold.to_f ) )
285 hmmscan_datas << HmmsearchData.new( sequence, number, out_of, env_from, env_to, i_e_value )
286 if ( number > max_domain_copy_number_per_protein )
287 max_domain_copy_number_sequence = sequence
288 max_domain_copy_number_per_protein = number
291 print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
292 log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
293 if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
294 print( " iE=" + i_e_value.to_s )
295 log << " iE=" + i_e_value.to_s
297 if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
298 le = env_to - env_from + 1
299 print( " l=" + le.to_s )
300 log << " l=" + le.to_s
304 domain_fail_counter += 1
306 if failed_seqs.find_by_name_start( sequence, true ).length < 1
307 add_sequence( sequence, in_msa, failed_seqs )
308 proteins_with_failing_domains += 1
313 error_msg = "number > out_of ! (this should not have happened)"
314 raise StandardError, error_msg
317 if number == out_of && !hmmscan_datas.empty?
318 domain_pass_counter += process_hmmscan_datas( hmmscan_datas,
332 end # while line = file.gets
333 end # File.open( hmmsearch_output ) do | file |
337 if domain_pass_counter < 1
338 error_msg = "no domain sequences were extracted"
339 raise IOError, error_msg
343 puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
344 log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
347 if ( max_domain_copy_number_per_protein > 1 )
348 puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
349 log << "First protein with this copy number: " + max_domain_copy_number_sequence
353 write_msa( out_msa, outfile )
354 write_msa( passed_seqs, passed_seqs_outfile )
355 write_msa( failed_seqs, failed_seqs_outfile )
358 write_msa( out_msa_pairs, outfile +"_" + min_linker.to_s )
362 write_msa( out_msa_singles, outfile +"_singles" )
365 if out_msa_distant_partners
366 write_msa( out_msa_distant_partners, outfile + "_" + min_linker.to_s + "_isolated" );
371 log << "passing domains : " + domain_pass_counter.to_s + ld
373 log << "single domains : " + singlets_counter.to_s + ld
374 log << "domains in close pairs : " + close_pairs_counter.to_s + ld
375 log << "isolated domains : " + distant_pairs_counter.to_s + ld
377 log << "failing domains : " + domain_fail_counter.to_s + ld
378 log << "proteins with passing domains: " + passed_seqs.length.to_s + ld
379 log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
382 return domain_pass_counter
389 def write_msa( msa, filename )
391 w = FastaWriter.new()
392 w.set_line_width( 60 )
395 io.write_to_file( msa, filename, w )
397 error_msg = "could not write to \"" + filename + "\""
398 raise IOError, error_msg
403 def add_sequence( sequence_name, in_msa, add_to_msa )
404 seqs = in_msa.find_by_name_start( sequence_name, true )
405 if ( seqs.length < 1 )
406 error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
407 raise StandardError, error_msg
409 if ( seqs.length > 1 )
410 error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
411 raise StandardError, error_msg
413 seq = in_msa.get_sequence( seqs[ 0 ] )
414 add_to_msa.add_sequence( seq )
418 def extract_domain( sequence,
429 if number.is_a?( Fixnum ) && ( number < 1 || out_of < 1 || number > out_of )
430 error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
431 raise StandardError, error_msg
433 if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
434 error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
435 raise StandardError, error_msg
437 seqs = in_msa.find_by_name_start( sequence, true )
439 error_msg = "sequence \"" + sequence + "\" not found in sequence file"
440 raise IOError, error_msg
443 error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
444 raise IOError, error_msg
446 # hmmsearch is 1 based, wheres sequences are 0 bases in this package.
447 seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
449 orig_name = seq.get_name
451 seq.set_name( orig_name.split[ 0 ] )
454 seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
458 seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
461 if out_of != 1 && add_domain_number
462 seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
466 a = orig_name.rindex "["
467 b = orig_name.rindex "]"
469 error_msg = "species not found in " + orig_name
470 raise StandardError, error_msg
472 species = orig_name[ a .. b ]
473 seq.set_name( seq.get_name + " " + species )
475 out_msa.add_sequence( seq )
478 def is_ignorable?( line )
479 return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
482 end # class HmmscanDomainExtractor
485 def initialize( seq_name, number, out_of, env_from, env_to, i_e_value )
491 @i_e_value = i_e_value
493 attr_reader :seq_name, :number, :out_of, :env_from, :env_to, :i_e_value