From d5a0815ac385d5bbc9c585d1c3c230c8aac5016f Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Tue, 19 Mar 2013 21:09:48 +0000 Subject: [PATCH] inprogress --- forester/ruby/evoruby/lib/evo/msa/msa.rb | 33 +++- forester/ruby/evoruby/lib/evo/sequence/sequence.rb | 4 +- .../ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb | 192 +++++++++----------- .../ruby/evoruby/lib/evo/tool/hmmscan_summary.rb | 149 ++++++--------- 4 files changed, 168 insertions(+), 210 deletions(-) diff --git a/forester/ruby/evoruby/lib/evo/msa/msa.rb b/forester/ruby/evoruby/lib/evo/msa/msa.rb index f338093..1681827 100644 --- a/forester/ruby/evoruby/lib/evo/msa/msa.rb +++ b/forester/ruby/evoruby/lib/evo/msa/msa.rb @@ -85,8 +85,31 @@ module Evoruby indices end + def find_by_name_pattern( name_re ) + indices = [] + for i in 0 ... get_number_of_seqs() + if name_re.match( get_sequence( i ).get_name() ) + indices.push( i ) + end + end + indices + end + + # throws ArgumentError + def get_by_name_pattern( name_re ) + indices = find_by_name_pattern( name_re ) + if ( indices.length > 1 ) + error_msg = "pattern \"" + name_re.to_s + "\" not unique" + raise ArgumentError, error_msg + elsif ( indices.length < 1 ) + error_msg = "pattern \"" + name_re.to_s + "\" not found" + raise ArgumentError, error_msg + end + get_sequence( indices[ 0 ] ) + end + def find_by_name_start( name, case_sensitive ) - indices = Array.new() + indices = [] for i in 0 ... get_number_of_seqs() get_sequence( i ).get_name() =~ /^\s*(\S+)/ current_name = $1 @@ -168,7 +191,7 @@ module Evoruby end def to_str - to_fasta + to_fasta end def to_fasta @@ -178,8 +201,8 @@ module Evoruby end s end - - + + def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 ) if ( !is_aligned() ) error_msg = "attempt to get overlap diagram of unaligned msa" @@ -328,7 +351,7 @@ module Evoruby end end end - + to_be_removed_ary = to_be_removed.to_a.sort.reverse to_be_removed_ary.each { | index | diff --git a/forester/ruby/evoruby/lib/evo/sequence/sequence.rb b/forester/ruby/evoruby/lib/evo/sequence/sequence.rb index 09f8428..a5c2a98 100644 --- a/forester/ruby/evoruby/lib/evo/sequence/sequence.rb +++ b/forester/ruby/evoruby/lib/evo/sequence/sequence.rb @@ -159,11 +159,11 @@ module Evoruby def to_str return "[" + @name + "] " + @molecular_sequence end - + def to_fasta return ">" + @name + Constants::LINE_DELIMITER + @molecular_sequence end - + end # class Sequence diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb index 08d74c1..3b48317 100644 --- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_analysis.rb @@ -23,21 +23,18 @@ module Evoruby class HmmscanAnalysis - PRG_NAME = "hsp" - PRG_VERSION = "2.001" + PRG_NAME = "hsa" + PRG_VERSION = "1.000" PRG_DESC = "hmmscan summary" - PRG_DATE = "2013.10.23" + PRG_DATE = "130319" COPYRIGHT = "2013 Christian M Zmasek" CONTACT = "phyloxml@gmail.com" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" - DELIMITER_OPTION = "d" SPECIES_OPTION = "s" I_E_VALUE_THRESHOLD_OPTION = "ie" FS_E_VALUE_THRESHOLD_OPTION = "pe" HMM_FOR_PROTEIN_OUTPUT = "m" - IGNORE_DUF_OPTION = "i" - PARSE_OUT_DESCRIPITION_OPTION = "a" HELP_OPTION_1 = "help" HELP_OPTION_2 = "h" @@ -78,11 +75,8 @@ module Evoruby end allowed_opts = Array.new - allowed_opts.push( DELIMITER_OPTION ) allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION ) allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION ) - allowed_opts.push( IGNORE_DUF_OPTION ) - allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION ) allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT ) allowed_opts.push( SPECIES_OPTION ) @@ -105,15 +99,6 @@ module Evoruby msa = read_fasta_file(seq_file_path ) end - column_delimiter = "\t" - if ( cla.is_option_set?( DELIMITER_OPTION ) ) - begin - column_delimiter = cla.get_option_value( DELIMITER_OPTION ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - i_e_value_threshold = -1 if ( cla.is_option_set?( I_E_VALUE_THRESHOLD_OPTION ) ) begin @@ -142,7 +127,7 @@ module Evoruby if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) ) begin hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT ) - hmm_for_protein_outputs = hmm_for_protein_output.split( "~" ); + hmm_for_protein_outputs = hmm_for_protein_output.split( "/" ); rescue ArgumentError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end @@ -157,27 +142,13 @@ module Evoruby end end - ignore_dufs = false - if ( cla.is_option_set?( IGNORE_DUF_OPTION ) ) - ignore_dufs = true - end - - parse_descriptions = false - if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) ) - parse_descriptions = true - end - - begin parse( inpath, - column_delimiter, i_e_value_threshold, - ignore_dufs, - parse_descriptions, fs_e_value_threshold, hmm_for_protein_outputs, species, - msa ) + msa ) rescue IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end @@ -200,10 +171,7 @@ module Evoruby # raises ArgumentError, IOError def parse( inpath, - column_delimiter, i_e_value_threshold, - ignore_dufs, - get_descriptions, fs_e_value_threshold, hmm_for_protein_outputs, species, @@ -214,25 +182,14 @@ module Evoruby hmmscan_parser = HmmscanParser.new( inpath ) results = hmmscan_parser.parse - - query = "" - desc = "" - model = "" - env_from = "" - env_to = "" - i_e_value = "" - hmmscan_results_per_protein = [] + query = "" prev_query = "" results.each do | r | - model = r.model - query = r.query - i_e_value = r.i_e_value - env_from = r.env_from - env_to = r.env_to + query = r.query if !prev_query.empty? && prev_query != query if !hmmscan_results_per_protein.empty? @@ -241,7 +198,7 @@ module Evoruby hmm_for_protein_outputs, i_e_value_threshold, species, - msa ) + msa ) end hmmscan_results_per_protein.clear end @@ -263,7 +220,7 @@ module Evoruby hmm_for_protein_outputs, i_e_value_threshold, species, - msa ) + msa ) end @@ -339,52 +296,46 @@ module Evoruby end end - s = "" + query = nil + qlen = nil owns.each do | own | - s << own.query + "\t" - query = own.query + + if query == nil + query = own.query + + qlen = own.qlen + else + # sanity checks + raise StandardError, "failed sanity check" if query != own.query || qlen != own.qlen + raise StandardError, "failed sanity check: qlen != own.qlen" if qlen != own.qlen + end end + + s = query + "\t" s << species + "\t" owns.each do | own | s << own.fs_e_value.to_s + "\t" end - owns.each do | own | - s << own.qlen.to_s + "\t" #TODO ! - end - # dcs.each do | dc | - # s << dc.to_s + "\t" - # end + s << qlen.to_s + "\t" + s << hmmscan_results_per_protein_filtered.length.to_s + "\t" hmmscan_results_per_protein_filtered.each do | r | s << r.model + " " end s << "\t" - - - - # overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) - - # s << overview + "\t" - - # s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t" - + s << make_overview_da( hmmscan_results_per_protein_filtered ) + s << "\t" prev_r = nil hmmscan_results_per_protein_filtered.each do | r | - if prev_r != nil s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 ) - if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] ) - puts "xxx" - linker( prev_r.env_to, r.env_from, query, msa ) + extract_linker( prev_r.env_to, r.env_from, query, msa ) end - - else s << make_interdomain_sequence( r.env_from, false ) - end s << r.model s << "[" @@ -393,58 +344,90 @@ module Evoruby s << "]" prev_r = r end - # s << make_interdomain_sequence( own.qlen - prev_r.env_from, false ) + s << make_interdomain_sequence( qlen - prev_r.env_to, false ) + puts s end + def extract_linkers( hmmscan_results_per_protein_filtered, target_hmms, query, msa ) + prev_r = nil + hmmscan_results_per_protein_filtered.each do | r | + if prev_r != nil + if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] ) + extract_linker( prev_r.env_to, r.env_from, query, msa ) + end + end + prev_r = r + end + end - def linker( first, last , query , msa) - puts first.to_s + "-" + last.to_s + def extract_linker( first, last , query , msa) if ( last - first >= 1 ) - seq = msa.get_by_name( query, true, false ) - linker = seq.get_subsequence( first -1 , last - 1 ) - puts linker.get_sequence_as_string + seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ ) + linker = seq.get_subsequence( first - 1 , last - 1 ) + linker.get_sequence_as_string end - end + def make_detailed_da( hmmscan_results_per_protein_filtered ) + s = "" + prev_r = nil + hmmscan_results_per_protein_filtered.each do | r | + if prev_r != nil + s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 ) + else + s << make_interdomain_sequence( r.env_from, false ) + end + s << r.model + s << "[" + s << r.env_from.to_s << "-" << r.env_to.to_s + s << " " << r.i_e_value.to_s + s << "]" + prev_r = r + end + s << make_interdomain_sequence( qlen - prev_r.env_to, false ) + s + end - def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) - linkers = "" + def make_overview_da( hmmscan_results_per_protein_filtered ) + overview = "" prev_r = nil hmmscan_results_per_protein_filtered.each do | r | - if r.model == hmm_for_protein_output - if prev_r != nil - linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " " + + if prev_r == nil + overview << r.model + else + if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS + overview << "~" << r.model + else + overview << "----" << r.model end - prev_r = r end + prev_r = r + end - linkers + overview end - def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) - overview = "" + + + def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + linkers = "" prev_r = nil hmmscan_results_per_protein_filtered.each do | r | if r.model == hmm_for_protein_output - if prev_r == nil - overview << hmm_for_protein_output - else - if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS - overview << "~" << hmm_for_protein_output - else - overview << "----" << hmm_for_protein_output - end + if prev_r != nil + linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " " end prev_r = r end end - overview + linkers end + def make_interdomain_sequence( d, mark_short = true ) s = "" d /= 20 @@ -466,10 +449,7 @@ module Evoruby puts() puts( " " + PRG_NAME + ".rb [options] " ) puts() - puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" ) - puts( " -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" ) - puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" ) - puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" ) + puts( " options: -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" ) puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" ) puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" ) puts( " -" + SPECIES_OPTION + ": species for protein summary" ) diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb index bb32da5..08de9c1 100644 --- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb @@ -13,16 +13,15 @@ require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/util/command_line_arguments' require 'lib/evo/io/parser/hmmscan_parser' -require 'lib/evo/io/web/uniprotkb' module Evoruby class HmmscanSummary PRG_NAME = "hsp" - PRG_VERSION = "2.001" + PRG_VERSION = "2.002" PRG_DESC = "hmmscan summary" - PRG_DATE = "2013.10.23" + PRG_DATE = "130319" COPYRIGHT = "2013 Christian M Zmasek" CONTACT = "phyloxml@gmail.com" WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" @@ -34,7 +33,6 @@ module Evoruby HMM_FOR_PROTEIN_OUTPUT = "m" IGNORE_DUF_OPTION = "i" PARSE_OUT_DESCRIPITION_OPTION = "a" - UNIPROT = "u" HELP_OPTION_1 = "help" HELP_OPTION_2 = "h" @@ -48,14 +46,14 @@ module Evoruby def run - # Util.print_program_information( PRG_NAME, - # PRG_VERSION, - # PRG_DESC, - # PRG_DATE, - # COPYRIGHT, - # CONTACT, - # WWW, - # STDOUT ) + Util.print_program_information( PRG_NAME, + PRG_VERSION, + PRG_DESC, + PRG_DATE, + COPYRIGHT, + CONTACT, + WWW, + STDOUT ) begin cla = CommandLineArguments.new( ARGV ) @@ -81,7 +79,6 @@ module Evoruby allowed_opts.push( IGNORE_DUF_OPTION ) allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION ) allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT ) - allowed_opts.push( UNIPROT ) allowed_opts.push( SPECIES_OPTION ) disallowed = cla.validate_allowed_options_as_str( allowed_opts ) @@ -115,8 +112,6 @@ module Evoruby end end - - fs_e_value_threshold = -1.0 if ( cla.is_option_set?( FS_E_VALUE_THRESHOLD_OPTION ) ) begin @@ -138,15 +133,6 @@ module Evoruby end end - uniprot = "" - if ( cla.is_option_set?( UNIPROT ) ) - begin - uniprot = cla.get_option_value( UNIPROT ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) - end - end - species = "HUMAN" if ( cla.is_option_set?( SPECIES_OPTION ) ) begin @@ -166,42 +152,39 @@ module Evoruby parse_descriptions = true end -# puts() -# puts( "hmmpfam outputfile : " + inpath ) -# puts( "outputfile : " + outpath ) -# puts( "species : " + species ) -# if ( i_e_value_threshold >= 0.0 ) -# puts( "i-E-value threshold : " + i_e_value_threshold.to_s ) -# else -# puts( "i-E-value threshold : no threshold" ) -# end -# if ( parse_descriptions ) -# puts( "parse descriptions : true" ) -# else -# puts( "parse descriptions : false" ) -# end -# if ( ignore_dufs ) -# puts( "ignore DUFs : true" ) -# else -# puts( "ignore DUFs : false" ) -# end -# if ( column_delimiter == "\t" ) -# puts( "column delimiter : TAB" ) -# else -# puts( "column delimiter : " + column_delimiter ) -# end -# if fs_e_value_threshold >= 0.0 -# puts( "E-value threshold : " + fs_e_value_threshold.to_s ) -# else -# puts( "E-value threshold : no threshold" ) -# end -# if !hmm_for_protein_output.empty? -# puts( "HMM for proteins : " + hmm_for_protein_output ) -# end -# if !uniprot.empty? -# puts( "Uniprot : " + uniprot ) -# end -# puts() + puts() + puts( "hmmpfam outputfile : " + inpath ) + puts( "outputfile : " + outpath ) + puts( "species : " + species ) + if ( i_e_value_threshold >= 0.0 ) + puts( "i-E-value threshold : " + i_e_value_threshold.to_s ) + else + puts( "i-E-value threshold : no threshold" ) + end + if ( parse_descriptions ) + puts( "parse descriptions : true" ) + else + puts( "parse descriptions : false" ) + end + if ( ignore_dufs ) + puts( "ignore DUFs : true" ) + else + puts( "ignore DUFs : false" ) + end + if ( column_delimiter == "\t" ) + puts( "column delimiter : TAB" ) + else + puts( "column delimiter : " + column_delimiter ) + end + if fs_e_value_threshold >= 0.0 + puts( "E-value threshold : " + fs_e_value_threshold.to_s ) + else + puts( "E-value threshold : no threshold" ) + end + if !hmm_for_protein_output.empty? + puts( "HMM for proteins : " + hmm_for_protein_output ) + end + puts() begin parse( inpath, @@ -212,21 +195,20 @@ module Evoruby parse_descriptions, fs_e_value_threshold, hmm_for_protein_output, - uniprot, species ) rescue IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end domain_counts = get_domain_counts() -# puts -# puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" ) -# puts( "(number of different domains: " + domain_counts.length.to_s + ")" ) -# puts -# puts( Util.draw_histogram( domain_counts, "#" ) ) -# puts -# Util.print_message( PRG_NAME, 'OK' ) -# puts + puts + puts( "domain counts (considering potential i-E-value threshold and ignoring of DUFs):" ) + puts( "(number of different domains: " + domain_counts.length.to_s + ")" ) + puts + puts( Util.draw_histogram( domain_counts, "#" ) ) + puts + Util.print_message( PRG_NAME, 'OK' ) + puts end # def run @@ -241,7 +223,6 @@ module Evoruby get_descriptions, fs_e_value_threshold, hmm_for_protein_output, - uniprot, species ) Util.check_file_for_readability( inpath ) @@ -296,7 +277,6 @@ module Evoruby fs_e_value_threshold, hmm_for_protein_output, i_e_value_threshold, - uniprot, species ) end hmmscan_results_per_protein.clear @@ -318,7 +298,6 @@ module Evoruby fs_e_value_threshold, hmm_for_protein_output, i_e_value_threshold, - uniprot, species ) end @@ -347,7 +326,6 @@ module Evoruby fs_e_value_threshold, hmm_for_protein_output, i_e_value_threshold, - uniprotkb, species ) dc = 0 @@ -397,15 +375,6 @@ module Evoruby end s << "\t" - if !uniprotkb.empty? - #e = UniprotKB::get_entry_by_id( process_id( own.query ) ) - - #if e != nil - # s << uniprot_annotation( e ) - # # s << "\uniprot_annotationt" - #end - end - overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) s << overview + "\t" @@ -414,7 +383,6 @@ module Evoruby prev_r = nil hmmscan_results_per_protein_filtered.each do | r | - if prev_r != nil s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 ) else @@ -428,23 +396,10 @@ module Evoruby s << "]" prev_r = r end - s << make_interdomain_sequence( own.qlen - prev_r.env_from, false ) + s << make_interdomain_sequence( own.qlen - prev_r.env_to, false ) puts s end - def uniprot_annotation( e ) - s = "" - pdb_ids = e.get_pdb_ids - if !pdb_ids.empty? - pdb_ids.each do | pdb | - s << pdb << ", " - end - else - s << "-" - end - s - end - def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) linkers = "" prev_r = nil -- 1.7.10.2