# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
#
-require 'set'
-
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
require 'lib/evo/util/command_line_arguments'
require 'lib/evo/msa/msa_factory'
require 'lib/evo/io/msa_io'
require 'lib/evo/io/parser/fasta_parser'
-require 'lib/evo/io/writer/fasta_writer'
module Evoruby
PRG_NAME = "hsa"
PRG_VERSION = "1.000"
- PRG_DESC = "hmmscan summary"
+ PRG_DESC = "hmmscan analysis"
PRG_DATE = "130319"
COPYRIGHT = "2013 Christian M Zmasek"
CONTACT = "phyloxml@gmail.com"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
- SPECIES_OPTION = "s"
I_E_VALUE_THRESHOLD_OPTION = "ie"
FS_E_VALUE_THRESHOLD_OPTION = "pe"
- HMM_FOR_PROTEIN_OUTPUT = "m"
+ TARGET_MODELS = "m"
+ EXTRACTION = "x"
HELP_OPTION_1 = "help"
HELP_OPTION_2 = "h"
- USE_AVOID_HMMS = false
+ USE_AVOID_HMMS = true
AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
LIMIT_FOR_CLOSE_DOMAINS = 20
- def initialize
- @domain_counts = Hash.new
- end
def run
- # Util.print_program_information( PRG_NAME,
- # PRG_VERSION,
- # PRG_DESC,
- # PRG_DATE,
- # COPYRIGHT,
- # CONTACT,
- # WWW,
- # STDOUT )
+ Util.print_program_information( PRG_NAME,
+ PRG_VERSION,
+ PRG_DESC,
+ PRG_DATE,
+ COPYRIGHT,
+ CONTACT,
+ WWW,
+ STDOUT )
begin
cla = CommandLineArguments.new( ARGV )
exit( 0 )
end
- if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 2 )
+ if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 3 )
print_help
exit( -1 )
end
allowed_opts = Array.new
allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
- allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
- allowed_opts.push( SPECIES_OPTION )
+ allowed_opts.push( TARGET_MODELS )
+ allowed_opts.push( EXTRACTION )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
end
inpath = cla.get_file_name( 0 )
+ Util.check_file_for_readability( inpath )
seq_file_path = nil
+ extraction_output = nil
- if ( cla.get_number_of_files == 2 )
+ if ( cla.get_number_of_files == 3 )
seq_file_path = cla.get_file_name( 1 )
+ Util.check_file_for_readability( seq_file_path )
+ extraction_output = cla.get_file_name( 2 )
+ if File.exist?( extraction_output )
+ Util.fatal_error( PRG_NAME, "error: [#{extraction_output}] already exists" )
+ end
+
end
msa = nil
if seq_file_path != nil
- msa = read_fasta_file(seq_file_path )
+ msa = read_fasta_file( seq_file_path )
end
i_e_value_threshold = -1
end
end
- hmm_for_protein_outputs = []
- if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
+ target_models = []
+ if ( cla.is_option_set?( TARGET_MODELS ) )
begin
- hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
- hmm_for_protein_outputs = hmm_for_protein_output.split( "/" );
+ hmm_for_protein_output = cla.get_option_value( TARGET_MODELS )
+ target_models = hmm_for_protein_output.split( "/" );
rescue ArgumentError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
- species = "HUMAN"
- if ( cla.is_option_set?( SPECIES_OPTION ) )
+ x_models = []
+ if ( cla.is_option_set?( EXTRACTION ) )
begin
- species = cla.get_option_value( SPECIES_OPTION )
+ hmm_for_protein_output = cla.get_option_value( EXTRACTION )
+ x_models = hmm_for_protein_output.split( "~" );
rescue ArgumentError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
parse( inpath,
i_e_value_threshold,
fs_e_value_threshold,
- hmm_for_protein_outputs,
- species,
- msa )
+ target_models,
+ x_models,
+ msa,
+ extraction_output )
rescue IOError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end # def run
+
private
def read_fasta_file( input )
def parse( inpath,
i_e_value_threshold,
fs_e_value_threshold,
- hmm_for_protein_outputs,
- species,
- msa )
-
- Util.check_file_for_readability( inpath )
+ target_models,
+ x_models,
+ msa,
+ extraction_output )
+
+ extraction_output_file = nil
+ if extraction_output != nil
+ extraction_output_file = File.open( extraction_output, "a" )
+ end
hmmscan_parser = HmmscanParser.new( inpath )
+
results = hmmscan_parser.parse
hmmscan_results_per_protein = []
query = ""
prev_query = ""
-
results.each do | r |
-
- query = r.query
-
+ query = r.query
if !prev_query.empty? && prev_query != query
if !hmmscan_results_per_protein.empty?
process_hmmscan_results_per_protein( hmmscan_results_per_protein,
fs_e_value_threshold,
- hmm_for_protein_outputs,
+ target_models,
+ x_models,
i_e_value_threshold,
- species,
- msa )
+ msa,
+ extraction_output_file )
end
hmmscan_results_per_protein.clear
end
end
- if !hmm_for_protein_outputs.empty? && !hmmscan_results_per_protein.empty?
+ if !hmmscan_results_per_protein.empty?
process_hmmscan_results_per_protein( hmmscan_results_per_protein,
fs_e_value_threshold,
- hmm_for_protein_outputs,
+ target_models,
+ x_models,
i_e_value_threshold,
- species,
- msa )
+ msa,
+ extraction_output_file )
end
-
-
- end # def parse
-
- def process_id( id )
- if id =~ /(sp|tr)\|\S+\|(\S+)/
- id = $2
+ if extraction_output_file != nil
+ extraction_output_file.close
end
- id
- end
-
+ end # def parse
def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
fs_e_value_threshold,
target_hmms,
+ x_models,
i_e_value_threshold,
- species,
- msa )
+ msa,
+ extraction_output_file )
raise StandardError, "target hmms is empty" if target_hmms.length < 1
raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1
# filter according to i-Evalue threshold
# abort if fs Evalue too high
-
if fs_e_value_threshold >= 0.0
hmmscan_results_per_protein.each do | r |
target_hmms.each do | hmm |
end
end
- # dcs = []
hmmscan_results_per_protein_filtered = []
matched = Set.new
hmmscan_results_per_protein_filtered << r
target_hmms.each do | hmm |
if r.model == hmm
-
matched << hmm
break
end
end
end
-
query = nil
qlen = nil
owns.each do | own |
-
if query == nil
query = own.query
-
qlen = own.qlen
else
- # sanity checks
raise StandardError, "failed sanity check" if query != own.query || qlen != own.qlen
raise StandardError, "failed sanity check: qlen != own.qlen" if qlen != own.qlen
end
end
+ species = nil
+ ll = nil
+ if msa != nil
+ seq = get_sequence( query, msa )
+ species = get_species( seq )
+ raise StandardError, "could not get species" if species == nil || species.empty?
+ if x_models != nil && x_models.length > 0
+ ll = extract_linker( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
+ end
+ end
+
s = query + "\t"
s << species + "\t"
owns.each do | own |
s << r.model + " "
end
s << "\t"
+ if msa != nil
+ if ll != nil
+ s << ll.to_s
+ end
+ s << "\t"
+ end
s << make_overview_da( hmmscan_results_per_protein_filtered )
s << "\t"
+ s << make_detailed_da( hmmscan_results_per_protein_filtered, qlen )
+ puts s
+
+ end
+
+
+ def extract_linker( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
+ raise StandardError, "extraction output file is nil" if extraction_output_file == nil
prev_r = nil
hmmscan_results_per_protein_filtered.each do | r |
if prev_r != nil
- s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
- if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
- extract_linker( prev_r.env_to, r.env_from, query, msa )
+ if ( x_models.length == 2 && prev_r.model == x_models[ 0 ] && r.model == x_models[ 1 ] )
+ ll = output_linker( prev_r.env_to, r.env_from, seq, extraction_output_file )
+ return ll
end
- else
- s << make_interdomain_sequence( r.env_from, false )
end
- s << r.model
- s << "["
- s << r.env_from.to_s << "-" << r.env_to.to_s
- s << " " << r.i_e_value.to_s
- s << "]"
prev_r = r
end
- s << make_interdomain_sequence( qlen - prev_r.env_to, false )
-
- puts s
+ return nil
end
- def extract_linkers( hmmscan_results_per_protein_filtered, target_hmms, query, msa )
- prev_r = nil
- hmmscan_results_per_protein_filtered.each do | r |
- if prev_r != nil
- if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
- extract_linker( prev_r.env_to, r.env_from, query, msa )
- end
+
+ def get_sequence( query, msa )
+ seq = nil
+ indices = msa.find_by_name_start( query , true )
+ if indices.length != 1
+ if query[ -1, 1 ] == "|"
+ query.chop!
end
- prev_r = r
+ seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
+ else
+ seq = msa.get_sequence( indices[ 0 ] )
end
+ seq
end
- def extract_linker( first, last , query , msa)
+
+ def get_species( seq )
+ species = nil
+ if seq.get_name =~ /\[([A-Z0-9]{3,5})\]/
+ species = $1
+ end
+ species
+ end
+
+
+ def output_linker( first, last , seq, output_file )
if ( last - first >= 1 )
- seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
- linker = seq.get_subsequence( first - 1 , last - 1 )
- linker.get_sequence_as_string
+ output_file.print( ">" + seq.get_name + " [" + first.to_s + "-" + last.to_s + "]" + "\n")
+ output_file.print( seq.get_subsequence( first - 1 , last - 1 ).get_sequence_as_string + "\n" )
end
+ last - first + 1
+
end
- def make_detailed_da( hmmscan_results_per_protein_filtered )
+
+ def make_detailed_da( hmmscan_results_per_protein_filtered , qlen )
s = ""
prev_r = nil
hmmscan_results_per_protein_filtered.each do | r |
s
end
+
def make_overview_da( hmmscan_results_per_protein_filtered )
overview = ""
prev_r = nil
hmmscan_results_per_protein_filtered.each do | r |
-
if prev_r == nil
overview << r.model
else
end
-
-
-
- def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
- linkers = ""
- prev_r = nil
- hmmscan_results_per_protein_filtered.each do | r |
- if r.model == hmm_for_protein_output
- if prev_r != nil
- linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
- end
- prev_r = r
- end
- end
- linkers
- end
-
-
def make_interdomain_sequence( d, mark_short = true )
s = ""
d /= 20
puts()
puts( " options: -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" )
puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" )
- puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
- puts( " -" + SPECIES_OPTION + ": species for protein summary" )
+ puts( " -" + TARGET_MODELS + ": target HMMs" )
puts()
end