inprogress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / hmmscan_analysis.rb
index 08d74c1..0aaab84 100644 (file)
@@ -7,8 +7,6 @@
 # $Id: hmmscan_parser.rb,v 1.5 2010/12/13 19:00:11 cmzmasek Exp $
 #
 
-require 'set'
-
 require 'lib/evo/util/constants'
 require 'lib/evo/util/util'
 require 'lib/evo/util/command_line_arguments'
@@ -17,48 +15,41 @@ require 'lib/evo/msa/msa'
 require 'lib/evo/msa/msa_factory'
 require 'lib/evo/io/msa_io'
 require 'lib/evo/io/parser/fasta_parser'
-require 'lib/evo/io/writer/fasta_writer'
 
 module Evoruby
 
   class HmmscanAnalysis
 
-    PRG_NAME       = "hsp"
-    PRG_VERSION    = "2.001"
-    PRG_DESC       = "hmmscan summary"
-    PRG_DATE       = "2013.10.23"
+    PRG_NAME       = "hsa"
+    PRG_VERSION    = "1.000"
+    PRG_DESC       = "hmmscan analysis"
+    PRG_DATE       = "130319"
     COPYRIGHT      = "2013 Christian M Zmasek"
     CONTACT        = "phyloxml@gmail.com"
     WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"
 
-    DELIMITER_OPTION              = "d"
-    SPECIES_OPTION                = "s"
     I_E_VALUE_THRESHOLD_OPTION    = "ie"
     FS_E_VALUE_THRESHOLD_OPTION   = "pe"
-    HMM_FOR_PROTEIN_OUTPUT        = "m"
-    IGNORE_DUF_OPTION             = "i"
-    PARSE_OUT_DESCRIPITION_OPTION = "a"
+    TARGET_MODELS                 = "m"
+    EXTRACTION                    = "x"
     HELP_OPTION_1                 = "help"
     HELP_OPTION_2                 = "h"
 
-    USE_AVOID_HMMS = false
+    USE_AVOID_HMMS = true
     AVOID_HHMS = [ "RRM_1", "RRM_2", "RRM_3", "RRM_4", "RRM_5", "RRM_6" ]
     LIMIT_FOR_CLOSE_DOMAINS = 20
 
-    def initialize
-      @domain_counts = Hash.new
-    end
 
     def run
 
-      #   Util.print_program_information( PRG_NAME,
-      #     PRG_VERSION,
-      #     PRG_DESC,
-      #     PRG_DATE,
-      #     COPYRIGHT,
-      #     CONTACT,
-      #     WWW,
-      #     STDOUT )
+      Util.print_program_information( PRG_NAME,
+        PRG_VERSION,
+        PRG_DESC,
+        PRG_DATE,
+        COPYRIGHT,
+        CONTACT,
+        WWW,
+        STDOUT )
 
       begin
         cla = CommandLineArguments.new( ARGV )
@@ -72,19 +63,16 @@ module Evoruby
         exit( 0 )
       end
 
-      if ( cla.get_number_of_files != 1 &&  cla.get_number_of_files != 2 )
+      if ( cla.get_number_of_files != 1 && cla.get_number_of_files != 3 )
         print_help
         exit( -1 )
       end
 
       allowed_opts = Array.new
-      allowed_opts.push( DELIMITER_OPTION )
       allowed_opts.push( I_E_VALUE_THRESHOLD_OPTION )
       allowed_opts.push( FS_E_VALUE_THRESHOLD_OPTION )
-      allowed_opts.push( IGNORE_DUF_OPTION )
-      allowed_opts.push( PARSE_OUT_DESCRIPITION_OPTION )
-      allowed_opts.push( HMM_FOR_PROTEIN_OUTPUT )
-      allowed_opts.push( SPECIES_OPTION )
+      allowed_opts.push( TARGET_MODELS )
+      allowed_opts.push( EXTRACTION )
 
       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
       if ( disallowed.length > 0 )
@@ -94,24 +82,23 @@ module Evoruby
       end
 
       inpath = cla.get_file_name( 0 )
+      Util.check_file_for_readability( inpath )
       seq_file_path = nil
+      extraction_output = nil
 
-      if ( cla.get_number_of_files == 2 )
+      if ( cla.get_number_of_files == 3 )
         seq_file_path = cla.get_file_name( 1 )
+        Util.check_file_for_readability(  seq_file_path  )
+        extraction_output = cla.get_file_name( 2 )
+        if File.exist?( extraction_output )
+          Util.fatal_error( PRG_NAME, "error: [#{extraction_output}] already exists" )
+        end
+
       end
 
       msa = nil
       if seq_file_path != nil
-        msa = read_fasta_file(seq_file_path )
-      end
-
-      column_delimiter = "\t"
-      if ( cla.is_option_set?( DELIMITER_OPTION ) )
-        begin
-          column_delimiter = cla.get_option_value( DELIMITER_OPTION )
-        rescue ArgumentError => e
-          Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
-        end
+        msa = read_fasta_file( seq_file_path )
       end
 
       i_e_value_threshold = -1
@@ -138,52 +125,41 @@ module Evoruby
         end
       end
 
-      hmm_for_protein_outputs = []
-      if ( cla.is_option_set?( HMM_FOR_PROTEIN_OUTPUT ) )
+      target_models = []
+      if ( cla.is_option_set?( TARGET_MODELS ) )
         begin
-          hmm_for_protein_output = cla.get_option_value( HMM_FOR_PROTEIN_OUTPUT )
-          hmm_for_protein_outputs = hmm_for_protein_output.split( "~" );
+          hmm_for_protein_output = cla.get_option_value( TARGET_MODELS )
+          target_models = hmm_for_protein_output.split( "/" );
         rescue ArgumentError => e
           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
       end
 
-      species = "HUMAN"
-      if ( cla.is_option_set?( SPECIES_OPTION ) )
+      x_models = []
+      if ( cla.is_option_set?( EXTRACTION ) )
         begin
-          species = cla.get_option_value( SPECIES_OPTION )
+          hmm_for_protein_output = cla.get_option_value( EXTRACTION )
+          x_models = hmm_for_protein_output.split( "~" );
         rescue ArgumentError => e
           Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
         end
       end
 
-      ignore_dufs = false
-      if ( cla.is_option_set?( IGNORE_DUF_OPTION ) )
-        ignore_dufs = true
-      end
-
-      parse_descriptions = false
-      if ( cla.is_option_set?( PARSE_OUT_DESCRIPITION_OPTION ) )
-        parse_descriptions = true
-      end
-
-
       begin
         parse( inpath,
-          column_delimiter,
           i_e_value_threshold,
-          ignore_dufs,
-          parse_descriptions,
           fs_e_value_threshold,
-          hmm_for_protein_outputs,
-          species,
-          msa        )
+          target_models,
+          x_models,
+          msa,
+          extraction_output )
       rescue IOError => e
         Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
       end
 
     end # def run
 
+
     private
 
     def read_fasta_file( input )
@@ -200,48 +176,37 @@ module Evoruby
 
     # raises ArgumentError, IOError
     def parse( inpath,
-        column_delimiter,
         i_e_value_threshold,
-        ignore_dufs,
-        get_descriptions,
         fs_e_value_threshold,
-        hmm_for_protein_outputs,
-        species,
-        msa )
+        target_models,
+        x_models,
+        msa,
+        extraction_output )
 
-      Util.check_file_for_readability( inpath )
+      extraction_output_file = nil
+      if extraction_output != nil
+        extraction_output_file = File.open( extraction_output, "a" )
+      end
 
       hmmscan_parser = HmmscanParser.new( inpath )
-      results = hmmscan_parser.parse
-
 
-      query     = ""
-      desc      = ""
-      model     = ""
-      env_from  = ""
-      env_to    = ""
-      i_e_value = ""
+      results = hmmscan_parser.parse
 
       hmmscan_results_per_protein = []
 
+      query     = ""
       prev_query = ""
-
       results.each do | r |
-        model     = r.model
-        query     = r.query
-        i_e_value = r.i_e_value
-        env_from  = r.env_from
-        env_to    = r.env_to
-
-
+        query = r.query
         if !prev_query.empty? && prev_query != query
           if !hmmscan_results_per_protein.empty?
             process_hmmscan_results_per_protein( hmmscan_results_per_protein,
               fs_e_value_threshold,
-              hmm_for_protein_outputs,
+              target_models,
+              x_models,
               i_e_value_threshold,
-              species,
-              msa            )
+              msa,
+              extraction_output_file )
           end
           hmmscan_results_per_protein.clear
         end
@@ -257,33 +222,28 @@ module Evoruby
 
       end
 
-      if !hmm_for_protein_outputs.empty? && !hmmscan_results_per_protein.empty?
+      if !hmmscan_results_per_protein.empty?
         process_hmmscan_results_per_protein( hmmscan_results_per_protein,
           fs_e_value_threshold,
-          hmm_for_protein_outputs,
+          target_models,
+          x_models,
           i_e_value_threshold,
-          species,
-          msa        )
+          msa,
+          extraction_output_file )
       end
-
-
-    end # def parse
-
-    def process_id( id )
-      if  id =~ /(sp|tr)\|\S+\|(\S+)/
-        id = $2
+      if extraction_output_file != nil
+        extraction_output_file.close
       end
-      id
-    end
-
+    end # def parse
 
 
     def process_hmmscan_results_per_protein( hmmscan_results_per_protein,
         fs_e_value_threshold,
         target_hmms,
+        x_models,
         i_e_value_threshold,
-        species,
-        msa )
+        msa,
+        extraction_output_file )
 
       raise StandardError, "target hmms is empty" if target_hmms.length < 1
       raise StandardError, "results is empty" if hmmscan_results_per_protein.length < 1
@@ -291,7 +251,6 @@ module Evoruby
       # filter according to i-Evalue threshold
       # abort if fs Evalue too high
 
-
       if fs_e_value_threshold >= 0.0
         hmmscan_results_per_protein.each do | r |
           target_hmms.each do | hmm |
@@ -302,7 +261,6 @@ module Evoruby
         end
       end
 
-      #  dcs = []
       hmmscan_results_per_protein_filtered = []
       matched = Set.new
 
@@ -311,7 +269,6 @@ module Evoruby
           hmmscan_results_per_protein_filtered << r
           target_hmms.each do | hmm |
             if r.model == hmm
-
               matched << hmm
               break
             end
@@ -339,112 +296,147 @@ module Evoruby
         end
       end
 
-      s = ""
       query = nil
+      qlen = nil
       owns.each do | own |
-        s << own.query + "\t"
-        query = own.query
+        if query == nil
+          query = own.query
+          qlen = own.qlen
+        else
+          raise StandardError, "failed sanity check" if query != own.query || qlen != own.qlen
+          raise StandardError, "failed sanity check: qlen != own.qlen" if qlen != own.qlen
+        end
+      end
+
+      species = nil
+      ll = nil
+      if msa != nil
+        seq = get_sequence( query, msa  )
+        species = get_species( seq )
+        raise StandardError, "could not get species" if species == nil || species.empty?
+        if x_models != nil &&  x_models.length > 0
+          ll = extract_linker( hmmscan_results_per_protein_filtered, x_models, seq,  extraction_output_file )
+        end
       end
+
+      s = query + "\t"
       s << species + "\t"
       owns.each do | own |
         s << own.fs_e_value.to_s + "\t"
       end
-      owns.each do | own |
-        s << own.qlen.to_s + "\t" #TODO !
-      end
 
-      #   dcs.each do | dc |
-      #     s << dc.to_s + "\t"
-      #   end
+      s << qlen.to_s + "\t"
+
       s << hmmscan_results_per_protein_filtered.length.to_s + "\t"
       hmmscan_results_per_protein_filtered.each do | r |
         s << r.model + " "
       end
       s << "\t"
+      if msa != nil
+        if ll != nil
+          s << ll.to_s
+        end
+        s << "\t"
+      end
+      s <<  make_overview_da( hmmscan_results_per_protein_filtered )
+      s << "\t"
+      s << make_detailed_da( hmmscan_results_per_protein_filtered, qlen )
+      puts s
 
+    end
 
 
-      # overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
-
-      #   s << overview  + "\t"
-
-      #  s << calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )  + "\t"
-
+    def extract_linker( hmmscan_results_per_protein_filtered, x_models, seq, extraction_output_file )
+      raise StandardError, "extraction output file is nil" if extraction_output_file == nil
       prev_r = nil
       hmmscan_results_per_protein_filtered.each do | r |
-
         if  prev_r != nil
-          s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
-
-          if ( target_hmms.length == 2 && prev_r.model == target_hmms[ 0 ] && r.model == target_hmms[ 1 ] )
-            puts "xxx"
-            linker( prev_r.env_to, r.env_from, query, msa )
+          if ( x_models.length == 2 && prev_r.model == x_models[ 0 ] && r.model == x_models[ 1 ] )
+            ll = output_linker( prev_r.env_to, r.env_from, seq, extraction_output_file )
+            return ll
           end
+        end
+        prev_r = r
+      end
+      return nil
+    end
 
 
-        else
-          s << make_interdomain_sequence( r.env_from, false )
-
+    def get_sequence( query, msa  )
+      seq = nil
+      indices = msa.find_by_name_start( query , true )
+      if indices.length != 1
+        if query[ -1, 1 ] == "|"
+          query.chop!
         end
-        s << r.model
-        s << "["
-        s << r.env_from.to_s << "-" << r.env_to.to_s
-        s << " " << r.i_e_value.to_s
-        s << "]"
-        prev_r = r
+        seq = msa.get_by_name_pattern( /\b#{Regexp.quote(query)}\b/ )
+      else
+        seq = msa.get_sequence( indices[ 0 ] )
       end
-      # s << make_interdomain_sequence( own.qlen - prev_r.env_from, false )
-      puts s
+      seq
     end
 
 
-    def linker( first, last , query , msa)
-      puts first.to_s + "-" + last.to_s
+    def get_species( seq )
+      species = nil
+      if seq.get_name =~ /\[([A-Z0-9]{3,5})\]/
+        species = $1
+      end
+      species
+    end
+
+
+    def output_linker( first, last , seq,  output_file )
       if ( last - first >= 1 )
-        seq = msa.get_by_name( query, true, false )
-        linker = seq.get_subsequence( first -1 , last - 1 )
-        puts linker.get_sequence_as_string
+        output_file.print( ">" + seq.get_name + " [" +  first.to_s + "-" + last.to_s +  "]" + "\n")
+        output_file.print( seq.get_subsequence( first - 1 , last - 1 ).get_sequence_as_string + "\n" )
       end
+      last - first + 1
 
     end
 
 
-    def calc_linkers(  hmmscan_results_per_protein_filtered, hmm_for_protein_output )
-      linkers = ""
+    def make_detailed_da( hmmscan_results_per_protein_filtered , qlen )
+      s = ""
       prev_r = nil
       hmmscan_results_per_protein_filtered.each do | r |
-        if r.model == hmm_for_protein_output
-          if  prev_r != nil
-            linkers << ( r.env_from - prev_r.env_to - 1 ).to_s + " "
-          end
-          prev_r = r
+        if  prev_r != nil
+          s << make_interdomain_sequence( r.env_from - prev_r.env_to - 1 )
+        else
+          s << make_interdomain_sequence( r.env_from, false )
         end
+        s << r.model
+        s << "["
+        s << r.env_from.to_s << "-" << r.env_to.to_s
+        s << " " << r.i_e_value.to_s
+        s << "]"
+        prev_r = r
       end
-      linkers
+      s << make_interdomain_sequence( qlen - prev_r.env_to, false )
+      s
     end
 
 
-
-    def make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output )
+    def make_overview_da( hmmscan_results_per_protein_filtered )
       overview = ""
       prev_r = nil
       hmmscan_results_per_protein_filtered.each do | r |
-        if r.model == hmm_for_protein_output
-          if prev_r == nil
-            overview << hmm_for_protein_output
+        if prev_r == nil
+          overview << r.model
+        else
+          if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
+            overview << "~" << r.model
           else
-            if  ( r.env_from - prev_r.env_to - 1 ) <= LIMIT_FOR_CLOSE_DOMAINS
-              overview << "~" << hmm_for_protein_output
-            else
-              overview << "----" << hmm_for_protein_output
-            end
+            overview << "----" << r.model
           end
-          prev_r = r
         end
+        prev_r = r
+
       end
       overview
     end
 
+
     def make_interdomain_sequence( d, mark_short = true )
       s = ""
       d /= 20
@@ -466,13 +458,9 @@ module Evoruby
       puts()
       puts( "  " + PRG_NAME + ".rb [options] <hmmscan outputfile> <outputfile>" )
       puts()
-      puts( "  options: -" + DELIMITER_OPTION + ": column delimiter for outputfile, default is TAB" )
-      puts( "           -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
-      puts( "           -" + PARSE_OUT_DESCRIPITION_OPTION  + ": parse query description (in addition to query name)" )
-      puts( "           -" + IGNORE_DUF_OPTION  + ": ignore DUFs" )
+      puts( "  options: -" + I_E_VALUE_THRESHOLD_OPTION  + ": i-E-value threshold, default is no threshold" )
       puts( "           -" + FS_E_VALUE_THRESHOLD_OPTION  + ": E-value threshold for full protein sequences, only for protein summary" )
-      puts( "           -" + HMM_FOR_PROTEIN_OUTPUT + ": HMM for protein summary" )
-      puts( "           -" + SPECIES_OPTION + ": species for protein summary" )
+      puts( "           -" + TARGET_MODELS + ": target HMMs" )
       puts()
     end