From: cmzmasek@gmail.com Date: Tue, 19 Mar 2013 03:01:10 +0000 (+0000) Subject: inprogress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=a8682990994642d167ec26e60973e1fc7ef94b72;p=jalview.git inprogress --- diff --git a/forester/ruby/evoruby/exe/hsa.rb b/forester/ruby/evoruby/exe/hsa.rb new file mode 100755 index 0000000..a79da24 --- /dev/null +++ b/forester/ruby/evoruby/exe/hsa.rb @@ -0,0 +1,20 @@ +#!/usr/local/bin/ruby -w +# +# = exe/hsp +# +# Copyright:: Copyright (C) 2006-2007 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: hsp.rb,v 1.1 2009/11/25 05:42:04 cmzmasek Exp $ +# +# last modified: 11/24/2009 + +require 'lib/evo/tool/hmmscan_analysis' + +module Evoruby + + hsa = HmmscanAnalysis.new() + + hsa.run() + +end # module Evoruby diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb new file mode 100644 index 0000000..08d74c1 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb @@ -0,0 +1,481 @@ +# +# = lib/evo/tool/hmmscan_summary.rb - HmmscanSummary class +# +# Copyright:: Copyright (C) 2012 Christian M. Zmasek +# License:: GNU Lesser General Public License (LGPL) +# +# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $ +# + +require 'set' + +require 'lib/evo/util/constants' +require 'lib/evo/util/util' +require 'lib/evo/util/command_line_arguments' +require 'lib/evo/io/parser/hmmscan_parser' +require 'lib/evo/msa/msa' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/io/msa_io' +require 'lib/evo/io/parser/fasta_parser' +require 'lib/evo/io/writer/fasta_writer' + +module Evoruby + + class HmmscanAnalysis + + PRG_NAME = "hsp" + PRG_VERSION = "2.001" + PRG_DESC = "hmmscan summary" + PRG_DATE = "2013.10.23" + COPYRIGHT = "2013 Christian M Zmasek" + CONTACT = "phyloxml@gmail.com" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" + + DELIMITER_OPTION = "d" + SPECIES_OPTION = "s" + I_E_VALUE_THRESHOLD_OPTION = "ie" + FS_E_VALUE_THRESHOLD_OPTION = "pe" + HMM_FOR_PROTEIN_OUTPUT = "m" + IGNORE_DUF_OPTION = "i" + PARSE_OUT_DESCRIPITION_OPTION = "a" + HELP_OPTION_1 = "help" + HELP_OPTION_2 = "h" + + USE_AVOID_HMMS = false + AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ] + LIMIT_FOR_CLOSE_DOMAINS = 20 + + def initialize + @domain_counts = Hash.new + end + + def run + + # Util.print_program_information( PRG_NAME, + # PRG_VERSION, + # PRG_DESC, + # PRG_DATE, + # COPYRIGHT, + # CONTACT, + # WWW, + # STDOUT ) + + begin + cla = CommandLineArguments.new( ARGV ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + if ( cla.is_option_set?( HELP_OPTION_1 ) || + cla.is_option_set?( HELP_OPTION_2 ) ) + print_help + exit( 0 ) + end + + if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 2 ) + print_help + exit( -1 ) + end + + allowed_opts = Array.new + allowed_opts.push( DELIMITER_OPTION ) + allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION ) + allowed_opts.push( IGNORE_DUF_OPTION ) + allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION ) + allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT ) + allowed_opts.push( SPECIES_OPTION ) + + disallowed = cla.validate_allowed_options_as_str( allowed_opts ) + if ( disallowed.length > 0 ) + Util.fatal_error( PRG_NAME, + "unknown option(s): " + disallowed, + STDOUT ) + end + + inpath = cla.get_file_name( 0 ) + seq_file_path = nil + + if ( cla.get_number_of_files == 2 ) + seq_file_path = cla.get_file_name( 1 ) + end + + msa = nil + if seq_file_path != nil + msa = read_fasta_file(seq_file_path ) + end + + column_delimiter = "\t" + if ( cla.is_option_set?( DELIMITER_OPTION ) ) + begin + column_delimiter = cla.get_option_value( DELIMITER_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + + i_e_value_threshold = -1 + if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) ) + begin + i_e_value_threshold = cla.get_option_value_as_float( I_E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( i_e_value_threshold < 0.0 ) + Util.fatal_error( PRG_NAME, "attempt to use a negative i-E-value threshold", STDOUT ) + end + end + + fs_e_value_threshold = -1 + if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) ) + begin + fs_e_value_threshold = cla.get_option_value_as_float( FS_E_VALUE_THRESHOLD_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + if ( fs_e_value_threshold < 0.0 ) + Util.fatal_error( PRG_NAME, "attempt to use a negative E-value threshold", STDOUT ) + end + end + + hmm_for_protein_outputs = [] + if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) ) + begin + hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT ) + hmm_for_protein_outputs = hmm_for_protein_output.split( "~" ); + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + + species = "HUMAN" + if ( cla.is_option_set?( SPECIES_OPTION ) ) + begin + species = cla.get_option_value( SPECIES_OPTION ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + end + + ignore_dufs = false + if ( cla.is_option_set?( IGNORE_DUF_OPTION ) ) + ignore_dufs = true + end + + parse_descriptions = false + if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) ) + parse_descriptions = true + end + + + begin + parse( inpath, + column_delimiter, + i_e_value_threshold, + ignore_dufs, + parse_descriptions, + fs_e_value_threshold, + hmm_for_protein_outputs, + species, + msa ) + rescue IOError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) + end + + end # def run + + private + + def read_fasta_file( input ) + f = MsaFactory.new() + msa = nil + begin + msa = f.create_msa_from_file( input, FastaParser.new() ) + rescue Exception => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) + end + msa + end + + + # raises ArgumentError, IOError + def parse( inpath, + column_delimiter, + i_e_value_threshold, + ignore_dufs, + get_descriptions, + fs_e_value_threshold, + hmm_for_protein_outputs, + species, + msa ) + + Util.check_file_for_readability( inpath ) + + hmmscan_parser = HmmscanParser.new( inpath ) + results = hmmscan_parser.parse + + + query = "" + desc = "" + model = "" + env_from = "" + env_to = "" + i_e_value = "" + + hmmscan_results_per_protein = [] + + prev_query = "" + + results.each do | r | + model = r.model + query = r.query + i_e_value = r.i_e_value + env_from = r.env_from + env_to = r.env_to + + + if !prev_query.empty? && prev_query != query + if !hmmscan_results_per_protein.empty? + process_hmmscan_results_per_protein( hmmscan_results_per_protein, + fs_e_value_threshold, + hmm_for_protein_outputs, + i_e_value_threshold, + species, + msa ) + end + hmmscan_results_per_protein.clear + end + prev_query = query + + if USE_AVOID_HMMS + if !AVOID_HHMS.include? r.model + hmmscan_results_per_protein << r + end + else + hmmscan_results_per_protein << r + end + + end + + if !hmm_for_protein_outputs.empty? && !hmmscan_results_per_protein.empty? + process_hmmscan_results_per_protein( hmmscan_results_per_protein, + fs_e_value_threshold, + hmm_for_protein_outputs, + i_e_value_threshold, + species, + msa ) + end + + + end # def parse + + def process_id( id ) + if id =~ /(sp|tr)\|\S+\|(\S+)/ + id = $2 + end + id + end + + + + def process_hmmscan_results_per_protein( hmmscan_results_per_protein, + fs_e_value_threshold, + target_hmms, + i_e_value_threshold, + species, + msa ) + + raise StandardError, "target hmms is empty" if target_hmms.length < 1 + raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1 + + # filter according to i-Evalue threshold + # abort if fs Evalue too high + + + if fs_e_value_threshold >= 0.0 + hmmscan_results_per_protein.each do | r | + target_hmms.each do | hmm | + if r.model == hmm && r.fs_e_value > fs_e_value_threshold + return + end + end + end + end + + # dcs = [] + hmmscan_results_per_protein_filtered = [] + matched = Set.new + + hmmscan_results_per_protein.each do | r | + if i_e_value_threshold < 0 || r.i_e_value <= i_e_value_threshold + hmmscan_results_per_protein_filtered << r + target_hmms.each do | hmm | + if r.model == hmm + + matched << hmm + break + end + end + + end + end + + if matched.length < target_hmms.length + return + end + if hmmscan_results_per_protein_filtered.length < 1 + return + end + + hmmscan_results_per_protein_filtered.sort! { |r1,r2| r1.env_from <=> r2.env_from } + + owns = [] + target_hmms.each do | hmm | + hmmscan_results_per_protein_filtered.each do | r | + if r.model == hmm + owns << r + break + end + end + end + + s = "" + query = nil + owns.each do | own | + s << own.query + "\t" + query = own.query + end + s << species + "\t" + owns.each do | own | + s << own.fs_e_value.to_s + "\t" + end + owns.each do | own | + s << own.qlen.to_s + "\t" #TODO ! + end + + # dcs.each do | dc | + # s << dc.to_s + "\t" + # end + s << hmmscan_results_per_protein_filtered.length.to_s + "\t" + hmmscan_results_per_protein_filtered.each do | r | + s << r.model + " " + end + s << "\t" + + + + # overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + + # s << overview + "\t" + + # s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t" + + prev_r = nil + hmmscan_results_per_protein_filtered.each do | r | + + if prev_r != nil + s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 ) + + if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] ) + puts "xxx" + linker( prev_r.env_to, r.env_from, query, msa ) + end + + + else + s << make_interdomain_sequence( r.env_from, false ) + + end + s << r.model + s << "[" + s << r.env_from.to_s << "-" << r.env_to.to_s + s << " " << r.i_e_value.to_s + s << "]" + prev_r = r + end + # s << make_interdomain_sequence( own.qlen - prev_r.env_from, false ) + puts s + end + + + def linker( first, last , query , msa) + puts first.to_s + "-" + last.to_s + if ( last - first >= 1 ) + seq = msa.get_by_name( query, true, false ) + linker = seq.get_subsequence( first -1 , last - 1 ) + puts linker.get_sequence_as_string + end + + end + + + def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + linkers = "" + prev_r = nil + hmmscan_results_per_protein_filtered.each do | r | + if r.model == hmm_for_protein_output + if prev_r != nil + linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " " + end + prev_r = r + end + end + linkers + end + + + + def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + overview = "" + prev_r = nil + hmmscan_results_per_protein_filtered.each do | r | + if r.model == hmm_for_protein_output + if prev_r == nil + overview << hmm_for_protein_output + else + if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS + overview << "~" << hmm_for_protein_output + else + overview << "----" << hmm_for_protein_output + end + end + prev_r = r + end + end + overview + end + + def make_interdomain_sequence( d, mark_short = true ) + s = "" + d /= 20 + if d >= 10 + s << "----//----" + elsif d >= 1 + d.times do + s << "-" + end + elsif mark_short + s << "~" + end + s + end + + + def print_help() + puts( "Usage:" ) + puts() + puts( " " + PRG_NAME + ".rb [options] " ) + puts() + puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" ) + puts( " -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" ) + puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" ) + puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" ) + puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" ) + puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" ) + puts( " -" + SPECIES_OPTION + ": species for protein summary" ) + puts() + end + + end # class + +end # module Evoruby \ No newline at end of file