X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Fhmmscan_summary.rb;h=bb32da53ffc2b38470d5347f45c2ea97ca9d5935;hb=bae81cb152ccfbf17cfcb2aa4a35f49818ab25a3;hp=a8f15907c5641fa7860b50d7dec2e29ba348aac7;hpb=d03733a9998b73a5880b9dab0f688e6a69661019;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb index a8f1590..bb32da5 100644 --- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb @@ -6,7 +6,6 @@ # # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $ # -# last modified: 121003 require 'set' @@ -14,7 +13,6 @@ require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/util/command_line_arguments' require 'lib/evo/io/parser/hmmscan_parser' -require 'lib/evo/io/parser/uniprot_parser' require 'lib/evo/io/web/uniprotkb' module Evoruby @@ -22,14 +20,15 @@ module Evoruby class HmmscanSummary PRG_NAME = "hsp" - PRG_VERSION = "2.000" + PRG_VERSION = "2.001" PRG_DESC = "hmmscan summary" - PRG_DATE = "2012.10.23" - COPYRIGHT = "2012 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" + PRG_DATE = "2013.10.23" + COPYRIGHT = "2013 Christian M Zmasek" + CONTACT = "phyloxml@gmail.com" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" DELIMITER_OPTION = "d" + SPECIES_OPTION = "s" I_E_VALUE_THRESHOLD_OPTION = "ie" FS_E_VALUE_THRESHOLD_OPTION = "pe" HMM_FOR_PROTEIN_OUTPUT = "m" @@ -49,17 +48,14 @@ module Evoruby def run - ukb = UniprotKB.new - ukb.get - - Util.print_program_information( PRG_NAME, - PRG_VERSION, - PRG_DESC, - PRG_DATE, - COPYRIGHT, - CONTACT, - WWW, - STDOUT ) + # Util.print_program_information( PRG_NAME, + # PRG_VERSION, + # PRG_DESC, + # PRG_DATE, + # COPYRIGHT, + # CONTACT, + # WWW, + # STDOUT ) begin cla = CommandLineArguments.new( ARGV ) @@ -86,6 +82,7 @@ module Evoruby allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION ) allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT ) allowed_opts.push( UNIPROT ) + allowed_opts.push( SPECIES_OPTION ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -118,6 +115,8 @@ module Evoruby end end + + fs_e_value_threshold = -1.0 if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) ) begin @@ -148,6 +147,15 @@ module Evoruby end end + species = "HUMAN" + if ( cla.is_option_set?( SPECIES_OPTION ) ) + begin + species = cla.get_option_value( SPECIES_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + ignore_dufs = false if ( cla.is_option_set?( IGNORE_DUF_OPTION ) ) ignore_dufs = true @@ -158,41 +166,42 @@ module Evoruby parse_descriptions = true end - puts() - puts( "hmmpfam outputfile : " + inpath ) - puts( "outputfile : " + outpath ) - if ( i_e_value_threshold >= 0.0 ) - puts( "i-E-value threshold : " + i_e_value_threshold.to_s ) - else - puts( "i-E-value threshold : no threshold" ) - end - if ( parse_descriptions ) - puts( "parse descriptions : true" ) - else - puts( "parse descriptions : false" ) - end - if ( ignore_dufs ) - puts( "ignore DUFs : true" ) - else - puts( "ignore DUFs : false" ) - end - if ( column_delimiter == "\t" ) - puts( "column delimiter : TAB" ) - else - puts( "column delimiter : " + column_delimiter ) - end - if fs_e_value_threshold >= 0.0 - puts( "E-value threshold : " + fs_e_value_threshold.to_s ) - else - puts( "E-value threshold : no threshold" ) - end - if !hmm_for_protein_output.empty? - puts( "HMM for proteins : " + hmm_for_protein_output ) - end - if !uniprot.empty? - puts( "Uniprot : " + uniprot ) - end - puts() +# puts() +# puts( "hmmpfam outputfile : " + inpath ) +# puts( "outputfile : " + outpath ) +# puts( "species : " + species ) +# if ( i_e_value_threshold >= 0.0 ) +# puts( "i-E-value threshold : " + i_e_value_threshold.to_s ) +# else +# puts( "i-E-value threshold : no threshold" ) +# end +# if ( parse_descriptions ) +# puts( "parse descriptions : true" ) +# else +# puts( "parse descriptions : false" ) +# end +# if ( ignore_dufs ) +# puts( "ignore DUFs : true" ) +# else +# puts( "ignore DUFs : false" ) +# end +# if ( column_delimiter == "\t" ) +# puts( "column delimiter : TAB" ) +# else +# puts( "column delimiter : " + column_delimiter ) +# end +# if fs_e_value_threshold >= 0.0 +# puts( "E-value threshold : " + fs_e_value_threshold.to_s ) +# else +# puts( "E-value threshold : no threshold" ) +# end +# if !hmm_for_protein_output.empty? +# puts( "HMM for proteins : " + hmm_for_protein_output ) +# end +# if !uniprot.empty? +# puts( "Uniprot : " + uniprot ) +# end +# puts() begin parse( inpath, @@ -203,21 +212,21 @@ module Evoruby parse_descriptions, fs_e_value_threshold, hmm_for_protein_output, - uniprot ) + uniprot, + species ) rescue IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end domain_counts = get_domain_counts() - - puts - puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" ) - puts( "(number of different domains: " + domain_counts.length.to_s + ")" ) - puts - puts( Util.draw_histogram( domain_counts, "#" ) ) - puts - Util.print_message( PRG_NAME, 'OK' ) - puts +# puts +# puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" ) +# puts( "(number of different domains: " + domain_counts.length.to_s + ")" ) +# puts +# puts( Util.draw_histogram( domain_counts, "#" ) ) +# puts +# Util.print_message( PRG_NAME, 'OK' ) +# puts end # def run @@ -232,9 +241,8 @@ module Evoruby get_descriptions, fs_e_value_threshold, hmm_for_protein_output, - uniprot ) - - + uniprot, + species ) Util.check_file_for_readability( inpath ) Util.check_file_for_writability( outpath ) @@ -242,11 +250,6 @@ module Evoruby hmmscan_parser = HmmscanParser.new( inpath ) results = hmmscan_parser.parse - uniprot_entries = nil - if !uniprot.empty? && !hmm_for_protein_output.empty? - uniprot_entries = read_uniprot( results, uniprot ) - end - outfile = File.open( outpath, "a" ) query = "" @@ -258,8 +261,6 @@ module Evoruby hmmscan_results_per_protein = [] - - prev_query = "" results.each do | r | @@ -295,7 +296,8 @@ module Evoruby fs_e_value_threshold, hmm_for_protein_output, i_e_value_threshold, - uniprot_entries ) + uniprot, + species ) end hmmscan_results_per_protein.clear end @@ -310,17 +312,18 @@ module Evoruby end end end + if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty? process_hmmscan_results_per_protein( hmmscan_results_per_protein, fs_e_value_threshold, hmm_for_protein_output, i_e_value_threshold, - uniprot_entries ) + uniprot, + species ) end outfile.flush() outfile.close() - end # def parse def process_id( id ) @@ -330,17 +333,6 @@ module Evoruby id end - def read_uniprot( hmmscan_results, uniprot ) - ids = Set.new - hmmscan_results.each do | r | - - ids << process_id( r.query ) - end - uniprot_parser = UniprotParser.new uniprot - uniprot_entries = uniprot_parser.parse ids - uniprot_entries - end - def count_model( model ) if ( @domain_counts.has_key?( model ) ) count = @domain_counts[ model ].to_i @@ -355,7 +347,8 @@ module Evoruby fs_e_value_threshold, hmm_for_protein_output, i_e_value_threshold, - uniprot_entries ) + uniprotkb, + species ) dc = 0 # filter according to i-Evalue threshold @@ -363,12 +356,14 @@ module Evoruby hmmscan_results_per_protein_filtered = [] hmmscan_results_per_protein.each do | r | + + if r.model == hmm_for_protein_output - if r.fs_e_value > fs_e_value_threshold + if fs_e_value_threshold > 0.0 && r.fs_e_value > fs_e_value_threshold return end end - if r.i_e_value <= i_e_value_threshold + if i_e_value_threshold <= 0 || r.i_e_value <= i_e_value_threshold hmmscan_results_per_protein_filtered << r if r.model == hmm_for_protein_output dc += 1 @@ -392,7 +387,7 @@ module Evoruby s = "" s << own.query + "\t" - s << "HUMAN" + "\t" + s << species + "\t" s << own.fs_e_value.to_s + "\t" s << own.qlen.to_s + "\t" s << dc.to_s + "\t" @@ -401,26 +396,19 @@ module Evoruby s << r.model + " " end s << "\t" - e = uniprot_entries[ process_id( own.query ) ] - if e != nil && e.de != nil - e.de.each { |i| s << i + " " } - else - s << "-" - end - s << "\t" - - if e != nil && e.gn != nil - e.gn.each { |i| s << i + " " } - else - s << "-" - end + if !uniprotkb.empty? + #e = UniprotKB::get_entry_by_id( process_id( own.query ) ) + #if e != nil + # s << uniprot_annotation( e ) + # # s << "\uniprot_annotationt" + #end + end - s << "\t" overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) - s << overview + "\t" + s << overview + "\t" s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t" @@ -444,6 +432,18 @@ module Evoruby puts s end + def uniprot_annotation( e ) + s = "" + pdb_ids = e.get_pdb_ids + if !pdb_ids.empty? + pdb_ids.each do | pdb | + s << pdb << ", " + end + else + s << "-" + end + s + end def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) linkers = "" @@ -499,7 +499,6 @@ module Evoruby end - def print_help() puts( "Usage:" ) puts() @@ -511,6 +510,7 @@ module Evoruby puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" ) puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" ) puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" ) + puts( " -" + SPECIES_OPTION + ": species for protein summary" ) puts() end