X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Fhmmscan_analysis.rb;h=0aaab8465f38f347167cba6c420d9cb711e96efe;hb=156517428746b21481eb815bf860ffb6cb0314ef;hp=08d74c12c851441bdb7b8165478e1ac41aade492;hpb=a8682990994642d167ec26e60973e1fc7ef94b72;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb index 08d74c1..0aaab84 100644 --- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb @@ -7,8 +7,6 @@ # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $ # -require 'set' - require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/util/command_line_arguments' @@ -17,48 +15,41 @@ require 'lib/evo/msa/msa' require 'lib/evo/msa/msa_factory' require 'lib/evo/io/msa_io' require 'lib/evo/io/parser/fasta_parser' -require 'lib/evo/io/writer/fasta_writer' module Evoruby class HmmscanAnalysis - PRG_NAME = "hsp" - PRG_VERSION = "2.001" - PRG_DESC = "hmmscan summary" - PRG_DATE = "2013.10.23" + PRG_NAME = "hsa" + PRG_VERSION = "1.000" + PRG_DESC = "hmmscan analysis" + PRG_DATE = "130319" COPYRIGHT = "2013 Christian M Zmasek" CONTACT = "phyloxml@gmail.com" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" - DELIMITER_OPTION = "d" - SPECIES_OPTION = "s" I_E_VALUE_THRESHOLD_OPTION = "ie" FS_E_VALUE_THRESHOLD_OPTION = "pe" - HMM_FOR_PROTEIN_OUTPUT = "m" - IGNORE_DUF_OPTION = "i" - PARSE_OUT_DESCRIPITION_OPTION = "a" + TARGET_MODELS = "m" + EXTRACTION = "x" HELP_OPTION_1 = "help" HELP_OPTION_2 = "h" - USE_AVOID_HMMS = false + USE_AVOID_HMMS = true AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ] LIMIT_FOR_CLOSE_DOMAINS = 20 - def initialize - @domain_counts = Hash.new - end def run - # Util.print_program_information( PRG_NAME, - # PRG_VERSION, - # PRG_DESC, - # PRG_DATE, - # COPYRIGHT, - # CONTACT, - # WWW, - # STDOUT ) + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) begin cla = CommandLineArguments.new( ARGV ) @@ -72,19 +63,16 @@ module Evoruby exit( 0 ) end - if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 2 ) + if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 3 ) print_help exit( -1 ) end allowed_opts = Array.new - allowed_opts.push( DELIMITER_OPTION ) allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION ) allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION ) - allowed_opts.push( IGNORE_DUF_OPTION ) - allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION ) - allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT ) - allowed_opts.push( SPECIES_OPTION ) + allowed_opts.push( TARGET_MODELS ) + allowed_opts.push( EXTRACTION ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) if ( disallowed.length > 0 ) @@ -94,24 +82,23 @@ module Evoruby end inpath = cla.get_file_name( 0 ) + Util.check_file_for_readability( inpath ) seq_file_path = nil + extraction_output = nil - if ( cla.get_number_of_files == 2 ) + if ( cla.get_number_of_files == 3 ) seq_file_path = cla.get_file_name( 1 ) + Util.check_file_for_readability( seq_file_path ) + extraction_output = cla.get_file_name( 2 ) + if File.exist?( extraction_output ) + Util.fatal_error( PRG_NAME, "error: [#{extraction_output}] already exists" ) + end + end msa = nil if seq_file_path != nil - msa = read_fasta_file(seq_file_path ) - end - - column_delimiter = "\t" - if ( cla.is_option_set?( DELIMITER_OPTION ) ) - begin - column_delimiter = cla.get_option_value( DELIMITER_OPTION ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end + msa = read_fasta_file( seq_file_path ) end i_e_value_threshold = -1 @@ -138,52 +125,41 @@ module Evoruby end end - hmm_for_protein_outputs = [] - if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) ) + target_models = [] + if ( cla.is_option_set?( TARGET_MODELS ) ) begin - hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT ) - hmm_for_protein_outputs = hmm_for_protein_output.split( "~" ); + hmm_for_protein_output = cla.get_option_value( TARGET_MODELS ) + target_models = hmm_for_protein_output.split( "/" ); rescue ArgumentError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end end - species = "HUMAN" - if ( cla.is_option_set?( SPECIES_OPTION ) ) + x_models = [] + if ( cla.is_option_set?( EXTRACTION ) ) begin - species = cla.get_option_value( SPECIES_OPTION ) + hmm_for_protein_output = cla.get_option_value( EXTRACTION ) + x_models = hmm_for_protein_output.split( "~" ); rescue ArgumentError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end end - ignore_dufs = false - if ( cla.is_option_set?( IGNORE_DUF_OPTION ) ) - ignore_dufs = true - end - - parse_descriptions = false - if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) ) - parse_descriptions = true - end - - begin parse( inpath, - column_delimiter, i_e_value_threshold, - ignore_dufs, - parse_descriptions, fs_e_value_threshold, - hmm_for_protein_outputs, - species, - msa ) + target_models, + x_models, + msa, + extraction_output ) rescue IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end end # def run + private def read_fasta_file( input ) @@ -200,48 +176,37 @@ module Evoruby # raises ArgumentError, IOError def parse( inpath, - column_delimiter, i_e_value_threshold, - ignore_dufs, - get_descriptions, fs_e_value_threshold, - hmm_for_protein_outputs, - species, - msa ) + target_models, + x_models, + msa, + extraction_output ) - Util.check_file_for_readability( inpath ) + extraction_output_file = nil + if extraction_output != nil + extraction_output_file = File.open( extraction_output, "a" ) + end hmmscan_parser = HmmscanParser.new( inpath ) - results = hmmscan_parser.parse - - query = "" - desc = "" - model = "" - env_from = "" - env_to = "" - i_e_value = "" + results = hmmscan_parser.parse hmmscan_results_per_protein = [] + query = "" prev_query = "" - results.each do | r | - model = r.model - query = r.query - i_e_value = r.i_e_value - env_from = r.env_from - env_to = r.env_to - - + query = r.query if !prev_query.empty? && prev_query != query if !hmmscan_results_per_protein.empty? process_hmmscan_results_per_protein( hmmscan_results_per_protein, fs_e_value_threshold, - hmm_for_protein_outputs, + target_models, + x_models, i_e_value_threshold, - species, - msa ) + msa, + extraction_output_file ) end hmmscan_results_per_protein.clear end @@ -257,33 +222,28 @@ module Evoruby end - if !hmm_for_protein_outputs.empty? && !hmmscan_results_per_protein.empty? + if !hmmscan_results_per_protein.empty? process_hmmscan_results_per_protein( hmmscan_results_per_protein, fs_e_value_threshold, - hmm_for_protein_outputs, + target_models, + x_models, i_e_value_threshold, - species, - msa ) + msa, + extraction_output_file ) end - - - end # def parse - - def process_id( id ) - if id =~ /(sp|tr)\|\S+\|(\S+)/ - id = $2 + if extraction_output_file != nil + extraction_output_file.close end - id - end - + end # def parse def process_hmmscan_results_per_protein( hmmscan_results_per_protein, fs_e_value_threshold, target_hmms, + x_models, i_e_value_threshold, - species, - msa ) + msa, + extraction_output_file ) raise StandardError, "target hmms is empty" if target_hmms.length < 1 raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1 @@ -291,7 +251,6 @@ module Evoruby # filter according to i-Evalue threshold # abort if fs Evalue too high - if fs_e_value_threshold >= 0.0 hmmscan_results_per_protein.each do | r | target_hmms.each do | hmm | @@ -302,7 +261,6 @@ module Evoruby end end - # dcs = [] hmmscan_results_per_protein_filtered = [] matched = Set.new @@ -311,7 +269,6 @@ module Evoruby hmmscan_results_per_protein_filtered << r target_hmms.each do | hmm | if r.model == hmm - matched << hmm break end @@ -339,112 +296,147 @@ module Evoruby end end - s = "" query = nil + qlen = nil owns.each do | own | - s << own.query + "\t" - query = own.query + if query == nil + query = own.query + qlen = own.qlen + else + raise StandardError, "failed sanity check" if query != own.query || qlen != own.qlen + raise StandardError, "failed sanity check: qlen != own.qlen" if qlen != own.qlen + end + end + + species = nil + ll = nil + if msa != nil + seq = get_sequence( query, msa ) + species = get_species( seq ) + raise StandardError, "could not get species" if species == nil || species.empty? + if x_models != nil && x_models.length > 0 + ll = extract_linker( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file ) + end end + + s = query + "\t" s << species + "\t" owns.each do | own | s << own.fs_e_value.to_s + "\t" end - owns.each do | own | - s << own.qlen.to_s + "\t" #TODO ! - end - # dcs.each do | dc | - # s << dc.to_s + "\t" - # end + s << qlen.to_s + "\t" + s << hmmscan_results_per_protein_filtered.length.to_s + "\t" hmmscan_results_per_protein_filtered.each do | r | s << r.model + " " end s << "\t" + if msa != nil + if ll != nil + s << ll.to_s + end + s << "\t" + end + s << make_overview_da( hmmscan_results_per_protein_filtered ) + s << "\t" + s << make_detailed_da( hmmscan_results_per_protein_filtered, qlen ) + puts s + end - # overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) - - # s << overview + "\t" - - # s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t" - + def extract_linker( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file ) + raise StandardError, "extraction output file is nil" if extraction_output_file == nil prev_r = nil hmmscan_results_per_protein_filtered.each do | r | - if prev_r != nil - s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 ) - - if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] ) - puts "xxx" - linker( prev_r.env_to, r.env_from, query, msa ) + if ( x_models.length == 2 && prev_r.model == x_models[ 0 ] && r.model == x_models[ 1 ] ) + ll = output_linker( prev_r.env_to, r.env_from, seq, extraction_output_file ) + return ll end + end + prev_r = r + end + return nil + end - else - s << make_interdomain_sequence( r.env_from, false ) - + def get_sequence( query, msa ) + seq = nil + indices = msa.find_by_name_start( query , true ) + if indices.length != 1 + if query[ -1, 1 ] == "|" + query.chop! end - s << r.model - s << "[" - s << r.env_from.to_s << "-" << r.env_to.to_s - s << " " << r.i_e_value.to_s - s << "]" - prev_r = r + seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ ) + else + seq = msa.get_sequence( indices[ 0 ] ) end - # s << make_interdomain_sequence( own.qlen - prev_r.env_from, false ) - puts s + seq end - def linker( first, last , query , msa) - puts first.to_s + "-" + last.to_s + def get_species( seq ) + species = nil + if seq.get_name =~ /\[([A-Z0-9]{3,5})\]/ + species = $1 + end + species + end + + + def output_linker( first, last , seq, output_file ) if ( last - first >= 1 ) - seq = msa.get_by_name( query, true, false ) - linker = seq.get_subsequence( first -1 , last - 1 ) - puts linker.get_sequence_as_string + output_file.print( ">" + seq.get_name + " [" + first.to_s + "-" + last.to_s + "]" + "\n") + output_file.print( seq.get_subsequence( first - 1 , last - 1 ).get_sequence_as_string + "\n" ) end + last - first + 1 end - def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) - linkers = "" + def make_detailed_da( hmmscan_results_per_protein_filtered , qlen ) + s = "" prev_r = nil hmmscan_results_per_protein_filtered.each do | r | - if r.model == hmm_for_protein_output - if prev_r != nil - linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " " - end - prev_r = r + if prev_r != nil + s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 ) + else + s << make_interdomain_sequence( r.env_from, false ) end + s << r.model + s << "[" + s << r.env_from.to_s << "-" << r.env_to.to_s + s << " " << r.i_e_value.to_s + s << "]" + prev_r = r end - linkers + s << make_interdomain_sequence( qlen - prev_r.env_to, false ) + s end - - def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + def make_overview_da( hmmscan_results_per_protein_filtered ) overview = "" prev_r = nil hmmscan_results_per_protein_filtered.each do | r | - if r.model == hmm_for_protein_output - if prev_r == nil - overview << hmm_for_protein_output + if prev_r == nil + overview << r.model + else + if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS + overview << "~" << r.model else - if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS - overview << "~" << hmm_for_protein_output - else - overview << "----" << hmm_for_protein_output - end + overview << "----" << r.model end - prev_r = r end + prev_r = r + end overview end + def make_interdomain_sequence( d, mark_short = true ) s = "" d /= 20 @@ -466,13 +458,9 @@ module Evoruby puts() puts( " " + PRG_NAME + ".rb [options] " ) puts() - puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" ) - puts( " -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" ) - puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" ) - puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" ) + puts( " options: -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" ) puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" ) - puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" ) - puts( " -" + SPECIES_OPTION + ": species for protein summary" ) + puts( " -" + TARGET_MODELS + ": target HMMs" ) puts() end