+ # raises ArgumentError, IOError
+ def parse( inpath,
+ outpath,
+ column_delimiter,
+ i_e_value_threshold,
+ ignore_dufs,
+ get_descriptions,
+ fs_e_value_threshold,
+ hmm_for_protein_output,
+ uniprot,
+ species )
+
+ Util.check_file_for_readability( inpath )
+ Util.check_file_for_writability( outpath )
+
+ hmmscan_parser = HmmscanParser.new( inpath )
+ results = hmmscan_parser.parse
+
+ outfile = File.open( outpath, "a" )
+
+ query = ""
+ desc = ""
+ model = ""
+ env_from = ""
+ env_to = ""
+ i_e_value = ""
+
+ hmmscan_results_per_protein = []
+
+ prev_query = ""
+
+ results.each do | r |
+ model = r.model
+ query = r.query
+ i_e_value = r.i_e_value
+ env_from = r.env_from
+ env_to = r.env_to
+
+ if ( ( i_e_value_threshold < 0.0 ) || ( i_e_value <= i_e_value_threshold ) ) &&
+ ( !ignore_dufs || ( model !~ /^DUF\d+/ ) )
+ count_model( model )
+ outfile.print( query +
+ column_delimiter )
+ if ( get_descriptions )
+ outfile.print( desc +
+ column_delimiter )
+ end
+ outfile.print( model +
+ column_delimiter +
+ env_from.to_s +
+ column_delimiter +
+ env_to.to_s +
+ column_delimiter +
+ i_e_value.to_s )
+ outfile.print( Constants::LINE_DELIMITER )
+ end
+
+ if !hmm_for_protein_output.empty?
+ if !prev_query.empty? && prev_query != query
+ if !hmmscan_results_per_protein.empty?
+ process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+ fs_e_value_threshold,
+ hmm_for_protein_output,
+ i_e_value_threshold,
+ uniprot,
+ species )
+ end
+ hmmscan_results_per_protein.clear
+ end
+ prev_query = query
+
+ if USE_AVOID_HMMS
+ if !AVOID_HHMS.include? r.model
+ hmmscan_results_per_protein << r
+ end
+ else
+ hmmscan_results_per_protein << r
+ end
+ end
+ end
+
+ if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty?
+ process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+ fs_e_value_threshold,
+ hmm_for_protein_output,
+ i_e_value_threshold,
+ uniprot,
+ species )
+ end
+
+ outfile.flush()
+ outfile.close()
+ end # def parse
+
+ def process_id( id )
+ if id =~ /(sp|tr)\|\S+\|(\S+)/
+ id = $2
+ end
+ id
+ end
+
+ def count_model( model )
+ if ( @domain_counts.has_key?( model ) )
+ count = @domain_counts[ model ].to_i
+ count += 1
+ @domain_counts[ model ] = count
+ else
+ @domain_counts[ model ] = 1
+ end
+ end
+
+ def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
+ fs_e_value_threshold,
+ hmm_for_protein_output,
+ i_e_value_threshold,
+ uniprotkb,
+ species )
+
+ dc = 0
+ # filter according to i-Evalue threshold
+ # abort if fs Evalue too high
+ hmmscan_results_per_protein_filtered = []
+
+ hmmscan_results_per_protein.each do | r |
+
+
+ if r.model == hmm_for_protein_output
+ if fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold
+ return
+ end
+ end
+ if i_e_value_threshold <= 0 || r.i_e_value <= i_e_value_threshold
+ hmmscan_results_per_protein_filtered << r
+ if r.model == hmm_for_protein_output
+ dc += 1
+ end
+ end
+ end
+
+ if dc == 0
+ # passed on protein E-value, failed in per domain E-values
+ return
+ end
+
+ hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from }
+
+ own = nil
+ hmmscan_results_per_protein_filtered.each do | r |
+ if r.model == hmm_for_protein_output
+ own = r
+ end
+ end
+
+ s = ""
+ s << own.query + "\t"
+ s << species + "\t"
+ s << own.fs_e_value.to_s + "\t"
+ s << own.qlen.to_s + "\t"
+ s << dc.to_s + "\t"
+ s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
+ hmmscan_results_per_protein_filtered.each do | r |
+ s << r.model + " "
+ end
+ s << "\t"
+
+ if !uniprotkb.empty?
+ #e = UniprotKB::get_entry_by_id( process_id( own.query ) )
+
+ #if e != nil
+ # s << uniprot_annotation( e )
+ # # s << "\uniprot_annotationt"
+ #end
+ end
+
+ overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
+
+ s << overview + "\t"
+
+ s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t"
+
+ prev_r = nil
+ hmmscan_results_per_protein_filtered.each do | r |
+
+ if prev_r != nil
+ s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
+ else
+ s << make_interdomain_sequence( r.env_from, false )
+ end
+ s << r.model
+ s << "["
+ s << r.env_from.to_s << "-" << r.env_to.to_s
+ s << "|ie=" << r.i_e_value.to_s
+ s << "|ce=" << r.c_e_value.to_s
+ s << "]"
+ prev_r = r
+ end
+ s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
+ puts s
+ end
+
+ def uniprot_annotation( e )
+ s = ""
+ pdb_ids = e.get_pdb_ids
+ if !pdb_ids.empty?
+ pdb_ids.each do | pdb |
+ s << pdb << ", "
+ end
+ else
+ s << "-"
+ end
+ s
+ end
+
+ def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
+ linkers = ""
+ prev_r = nil
+ hmmscan_results_per_protein_filtered.each do | r |
+ if r.model == hmm_for_protein_output
+ if prev_r != nil
+ linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
+ end
+ prev_r = r
+ end
+ end
+ linkers
+ end
+
+ def get_domain_counts()
+ return @domain_counts
+ end
+
+ def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
+ overview = ""
+ prev_r = nil
+ hmmscan_results_per_protein_filtered.each do | r |
+ if r.model == hmm_for_protein_output
+ if prev_r == nil
+ overview << hmm_for_protein_output
+ else
+ if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
+ overview << "~" << hmm_for_protein_output
+ else
+ overview << "----" << hmm_for_protein_output
+ end
+ end
+ prev_r = r
+ end
+ end
+ overview
+ end
+
+ def make_interdomain_sequence( d, mark_short = true )
+ s = ""
+ d /= 20
+ if d >= 10
+ s << "----//----"
+ elsif d >= 1
+ d.times do
+ s << "-"
+ end
+ elsif mark_short
+ s << "~"
+ end
+ s
+ end