# $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
#
-require 'set'
-
require 'lib/evo/util/constants'
require 'lib/evo/util/util'
require 'lib/evo/util/command_line_arguments'
require 'lib/evo/msa/msa_factory'
require 'lib/evo/io/msa_io'
require 'lib/evo/io/parser/fasta_parser'
-require 'lib/evo/io/writer/fasta_writer'
module Evoruby
class HmmscanAnalysis
- PRG_NAME = "hsp"
- PRG_VERSION = "2.001"
- PRG_DESC = "hmmscan summary"
- PRG_DATE = "2013.10.23"
+ PRG_NAME = "hsa"
+ PRG_VERSION = "1.000"
+ PRG_DESC = "hmmscan analysis"
+ PRG_DATE = "130319"
COPYRIGHT = "2013 Christian M Zmasek"
CONTACT = "phyloxml@gmail.com"
WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
- DELIMITER_OPTION = "d"
- SPECIES_OPTION = "s"
I_E_VALUE_THRESHOLD_OPTION = "ie"
FS_E_VALUE_THRESHOLD_OPTION = "pe"
- HMM_FOR_PROTEIN_OUTPUT = "m"
- IGNORE_DUF_OPTION = "i"
- PARSE_OUT_DESCRIPITION_OPTION = "a"
+ TARGET_MODELS = "m"
+ EXTRACTION = "x"
HELP_OPTION_1 = "help"
HELP_OPTION_2 = "h"
- USE_AVOID_HMMS = false
+ USE_AVOID_HMMS = true
AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
LIMIT_FOR_CLOSE_DOMAINS = 20
- def initialize
- @domain_counts = Hash.new
- end
def run
- # Util.print_program_information( PRG_NAME,
- # PRG_VERSION,
- # PRG_DESC,
- # PRG_DATE,
- # COPYRIGHT,
- # CONTACT,
- # WWW,
- # STDOUT )
+ Util.print_program_information( PRG_NAME,
+ PRG_VERSION,
+ PRG_DESC,
+ PRG_DATE,
+ COPYRIGHT,
+ CONTACT,
+ WWW,
+ STDOUT )
begin
cla = CommandLineArguments.new( ARGV )
exit( 0 )
end
- if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 2 )
+ if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 3 )
print_help
exit( -1 )
end
allowed_opts = Array.new
- allowed_opts.push( DELIMITER_OPTION )
allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
- allowed_opts.push( IGNORE_DUF_OPTION )
- allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
- allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
- allowed_opts.push( SPECIES_OPTION )
+ allowed_opts.push( TARGET_MODELS )
+ allowed_opts.push( EXTRACTION )
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
end
inpath = cla.get_file_name( 0 )
+ Util.check_file_for_readability( inpath )
seq_file_path = nil
+ extraction_output = nil
- if ( cla.get_number_of_files == 2 )
+ if ( cla.get_number_of_files == 3 )
seq_file_path = cla.get_file_name( 1 )
+ Util.check_file_for_readability( seq_file_path )
+ extraction_output = cla.get_file_name( 2 )
+ if File.exist?( extraction_output )
+ Util.fatal_error( PRG_NAME, "error: [#{extraction_output}] already exists" )
+ end
+
end
msa = nil
if seq_file_path != nil
- msa = read_fasta_file(seq_file_path )
- end
-
- column_delimiter = "\t"
- if ( cla.is_option_set?( DELIMITER_OPTION ) )
- begin
- column_delimiter = cla.get_option_value( DELIMITER_OPTION )
- rescue ArgumentError => e
- Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
- end
+ msa = read_fasta_file( seq_file_path )
end
i_e_value_threshold = -1
end
end
- hmm_for_protein_outputs = []
- if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
+ target_models = []
+ if ( cla.is_option_set?( TARGET_MODELS ) )
begin
- hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
- hmm_for_protein_outputs = hmm_for_protein_output.split( "~" );
+ hmm_for_protein_output = cla.get_option_value( TARGET_MODELS )
+ target_models = hmm_for_protein_output.split( "/" );
rescue ArgumentError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
- species = "HUMAN"
- if ( cla.is_option_set?( SPECIES_OPTION ) )
+ x_models = []
+ if ( cla.is_option_set?( EXTRACTION ) )
begin
- species = cla.get_option_value( SPECIES_OPTION )
+ hmm_for_protein_output = cla.get_option_value( EXTRACTION )
+ x_models = hmm_for_protein_output.split( "~" );
rescue ArgumentError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end
- ignore_dufs = false
- if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
- ignore_dufs = true
- end
-
- parse_descriptions = false
- if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
- parse_descriptions = true
- end
-
-
begin
parse( inpath,
- column_delimiter,
i_e_value_threshold,
- ignore_dufs,
- parse_descriptions,
fs_e_value_threshold,
- hmm_for_protein_outputs,
- species,
- msa )
+ target_models,
+ x_models,
+ msa,
+ extraction_output )
rescue IOError => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
end
end # def run
+
private
def read_fasta_file( input )
# raises ArgumentError, IOError
def parse( inpath,
- column_delimiter,
i_e_value_threshold,
- ignore_dufs,
- get_descriptions,
fs_e_value_threshold,
- hmm_for_protein_outputs,
- species,
- msa )
+ target_models,
+ x_models,
+ msa,
+ extraction_output )
- Util.check_file_for_readability( inpath )
+ extraction_output_file = nil
+ if extraction_output != nil
+ extraction_output_file = File.open( extraction_output, "a" )
+ end
hmmscan_parser = HmmscanParser.new( inpath )
- results = hmmscan_parser.parse
-
- query = ""
- desc = ""
- model = ""
- env_from = ""
- env_to = ""
- i_e_value = ""
+ results = hmmscan_parser.parse
hmmscan_results_per_protein = []
+ query = ""
prev_query = ""
-
results.each do | r |
- model = r.model
- query = r.query
- i_e_value = r.i_e_value
- env_from = r.env_from
- env_to = r.env_to
-
-
+ query = r.query
if !prev_query.empty? && prev_query != query
if !hmmscan_results_per_protein.empty?
process_hmmscan_results_per_protein( hmmscan_results_per_protein,
fs_e_value_threshold,
- hmm_for_protein_outputs,
+ target_models,
+ x_models,
i_e_value_threshold,
- species,
- msa )
+ msa,
+ extraction_output_file )
end
hmmscan_results_per_protein.clear
end
end
- if !hmm_for_protein_outputs.empty? && !hmmscan_results_per_protein.empty?
+ if !hmmscan_results_per_protein.empty?
process_hmmscan_results_per_protein( hmmscan_results_per_protein,
fs_e_value_threshold,
- hmm_for_protein_outputs,
+ target_models,
+ x_models,
i_e_value_threshold,
- species,
- msa )
+ msa,
+ extraction_output_file )
end
-
-
- end # def parse
-
- def process_id( id )
- if id =~ /(sp|tr)\|\S+\|(\S+)/
- id = $2
+ if extraction_output_file != nil
+ extraction_output_file.close
end
- id
- end
-
+ end # def parse
def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
fs_e_value_threshold,
target_hmms,
+ x_models,
i_e_value_threshold,
- species,
- msa )
+ msa,
+ extraction_output_file )
raise StandardError, "target hmms is empty" if target_hmms.length < 1
raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1
# filter according to i-Evalue threshold
# abort if fs Evalue too high
-
if fs_e_value_threshold >= 0.0
hmmscan_results_per_protein.each do | r |
target_hmms.each do | hmm |
end
end
- # dcs = []
hmmscan_results_per_protein_filtered = []
matched = Set.new
hmmscan_results_per_protein_filtered << r
target_hmms.each do | hmm |
if r.model == hmm
-
matched << hmm
break
end
end
end
- s = ""
query = nil
+ qlen = nil
owns.each do | own |
- s << own.query + "\t"
- query = own.query
+ if query == nil
+ query = own.query
+ qlen = own.qlen
+ else
+ raise StandardError, "failed sanity check" if query != own.query || qlen != own.qlen
+ raise StandardError, "failed sanity check: qlen != own.qlen" if qlen != own.qlen
+ end
+ end
+
+ species = nil
+ ll = nil
+ if msa != nil
+ seq = get_sequence( query, msa )
+ species = get_species( seq )
+ raise StandardError, "could not get species" if species == nil || species.empty?
+ if x_models != nil && x_models.length > 0
+ ll = extract_linker( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
+ end
end
+
+ s = query + "\t"
s << species + "\t"
owns.each do | own |
s << own.fs_e_value.to_s + "\t"
end
- owns.each do | own |
- s << own.qlen.to_s + "\t" #TODO !
- end
- # dcs.each do | dc |
- # s << dc.to_s + "\t"
- # end
+ s << qlen.to_s + "\t"
+
s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
hmmscan_results_per_protein_filtered.each do | r |
s << r.model + " "
end
s << "\t"
+ if msa != nil
+ if ll != nil
+ s << ll.to_s
+ end
+ s << "\t"
+ end
+ s << make_overview_da( hmmscan_results_per_protein_filtered )
+ s << "\t"
+ s << make_detailed_da( hmmscan_results_per_protein_filtered, qlen )
+ puts s
+ end
- # overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
-
- # s << overview + "\t"
-
- # s << calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) + "\t"
-
+ def extract_linker( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
+ raise StandardError, "extraction output file is nil" if extraction_output_file == nil
prev_r = nil
hmmscan_results_per_protein_filtered.each do | r |
-
if prev_r != nil
- s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
-
- if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
- puts "xxx"
- linker( prev_r.env_to, r.env_from, query, msa )
+ if ( x_models.length == 2 && prev_r.model == x_models[ 0 ] && r.model == x_models[ 1 ] )
+ ll = output_linker( prev_r.env_to, r.env_from, seq, extraction_output_file )
+ return ll
end
+ end
+ prev_r = r
+ end
+ return nil
+ end
- else
- s << make_interdomain_sequence( r.env_from, false )
-
+ def get_sequence( query, msa )
+ seq = nil
+ indices = msa.find_by_name_start( query , true )
+ if indices.length != 1
+ if query[ -1, 1 ] == "|"
+ query.chop!
end
- s << r.model
- s << "["
- s << r.env_from.to_s << "-" << r.env_to.to_s
- s << " " << r.i_e_value.to_s
- s << "]"
- prev_r = r
+ seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
+ else
+ seq = msa.get_sequence( indices[ 0 ] )
end
- # s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
- puts s
+ seq
end
- def linker( first, last , query , msa)
- puts first.to_s + "-" + last.to_s
+ def get_species( seq )
+ species = nil
+ if seq.get_name =~ /\[([A-Z0-9]{3,5})\]/
+ species = $1
+ end
+ species
+ end
+
+
+ def output_linker( first, last , seq, output_file )
if ( last - first >= 1 )
- seq = msa.get_by_name( query, true, false )
- linker = seq.get_subsequence( first -1 , last - 1 )
- puts linker.get_sequence_as_string
+ output_file.print( ">" + seq.get_name + " [" + first.to_s + "-" + last.to_s + "]" + "\n")
+ output_file.print( seq.get_subsequence( first - 1 , last - 1 ).get_sequence_as_string + "\n" )
end
+ last - first + 1
end
- def calc_linkers( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
- linkers = ""
+ def make_detailed_da( hmmscan_results_per_protein_filtered , qlen )
+ s = ""
prev_r = nil
hmmscan_results_per_protein_filtered.each do | r |
- if r.model == hmm_for_protein_output
- if prev_r != nil
- linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
- end
- prev_r = r
+ if prev_r != nil
+ s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
+ else
+ s << make_interdomain_sequence( r.env_from, false )
end
+ s << r.model
+ s << "["
+ s << r.env_from.to_s << "-" << r.env_to.to_s
+ s << " " << r.i_e_value.to_s
+ s << "]"
+ prev_r = r
end
- linkers
+ s << make_interdomain_sequence( qlen - prev_r.env_to, false )
+ s
end
-
- def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
+ def make_overview_da( hmmscan_results_per_protein_filtered )
overview = ""
prev_r = nil
hmmscan_results_per_protein_filtered.each do | r |
- if r.model == hmm_for_protein_output
- if prev_r == nil
- overview << hmm_for_protein_output
+ if prev_r == nil
+ overview << r.model
+ else
+ if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
+ overview << "~" << r.model
else
- if ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
- overview << "~" << hmm_for_protein_output
- else
- overview << "----" << hmm_for_protein_output
- end
+ overview << "----" << r.model
end
- prev_r = r
end
+ prev_r = r
+
end
overview
end
+
def make_interdomain_sequence( d, mark_short = true )
s = ""
d /= 20
puts()
puts( " " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
puts()
- puts( " options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
- puts( " -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" )
- puts( " -" + PARSE_OUT_DESCRIPITION_OPTION + ": parse query description (in addition to query name)" )
- puts( " -" + IGNORE_DUF_OPTION + ": ignore DUFs" )
+ puts( " options: -" + I_E_VALUE_THRESHOLD_OPTION + ": i-E-value threshold, default is no threshold" )
puts( " -" + FS_E_VALUE_THRESHOLD_OPTION + ": E-value threshold for full protein sequences, only for protein summary" )
- puts( " -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
- puts( " -" + SPECIES_OPTION + ": species for protein summary" )
+ puts( " -" + TARGET_MODELS + ": target HMMs" )
puts()
end