forester/ruby/evoruby/lib/evo/io/parser/hmmscan_domain_extractor.rb

   1 #
   2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
   3 #
   4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
   5 # License::    GNU Lesser General Public License (LGPL)
   6 #
   7 # $Id:  $
   8
   9
  10 require 'lib/evo/util/constants'
  11 require 'lib/evo/msa/msa_factory'
  12 require 'lib/evo/io/msa_io'
  13 require 'lib/evo/io/writer/fasta_writer'
  14 require 'lib/evo/io/parser/fasta_parser'
  15
  16
  17 module Evoruby
  18
  # Extracts the regions matched by one domain model from a per-domain hit
  # table and hands them, together with the full-length proteins whose
  # domains passed or failed the thresholds, to MsaIO/FastaWriter for output.
  class HmmscanDomainExtractor

    # Number of trailing characters removed from a sequence name when
    # trim_name is requested.
    TRIM_BY = 2

    # One data row of the per-domain table. Capture groups, by position:
    #   tn  acc tlen query acc qlen Evalue score bias #  of c-E i-E score bias hf ht af at ef et acc desc
    #   1   2   3    4     5   6    7      8     9    10 11 12  13  14    15   16 17 18 19 20 21 22  23
    DOMAIN_LINE = /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/

    def initialize
    end

    # Reads the domain table in hmmsearch_output and, for every row whose
    # first column equals domain_id, cuts the region given by columns 20-21
    # out of the sequence (column 4) looked up in fasta_sequence_file.
    #
    # A domain passes when its i-E value (column 13) is <= e_value_threshold
    # and its length is >= length_threshold. A negative e_value_threshold
    # disables the first test, a length_threshold <= 0 disables the second.
    #
    # Extracted domains are written to outfile; the full-length sequences with
    # at least one passing / failing domain are written to passed_seqs_outfile
    # and failed_seqs_outfile. Progress and a summary are appended to log,
    # which only needs to respond to <<. Failing domains are also echoed to
    # standard output.
    #
    # The naming flags (add_position, add_domain_number,
    # add_domain_number_as_digit, add_domain_number_as_letter, trim_name,
    # add_species) are forwarded unchanged to extract_domain.
    #
    # Returns the number of passing domains.
    #
    # raises ArgumentError, IOError, StandardError
    def parse( domain_id,
        hmmsearch_output,
        fasta_sequence_file,
        outfile,
        passed_seqs_outfile,
        failed_seqs_outfile,
        e_value_threshold,
        length_threshold,
        add_position,
        add_domain_number,
        add_domain_number_as_digit,
        add_domain_number_as_letter,
        trim_name,
        add_species,
        log )

      Util.check_file_for_readability( hmmsearch_output )
      Util.check_file_for_readability( fasta_sequence_file )
      Util.check_file_for_writability( outfile )
      Util.check_file_for_writability( passed_seqs_outfile )
      Util.check_file_for_writability( failed_seqs_outfile )

      in_msa = MsaFactory.new.create_msa_from_file( fasta_sequence_file, FastaParser.new )
      if in_msa.nil? || in_msa.get_number_of_seqs < 1
        raise IOError, "could not find fasta sequences in " + fasta_sequence_file
      end

      out_msa     = Msa.new
      failed_seqs = Msa.new
      passed_seqs = Msa.new

      ld = Constants::LINE_DELIMITER

      # Convert once, so that thresholds handed in as strings compare
      # numerically instead of raising in the Float comparison.
      max_i_e_value = e_value_threshold.to_f
      min_length    = length_threshold.to_f

      domain_pass_counter                = 0
      domain_fail_counter                = 0
      proteins_with_passing_domains      = 0
      proteins_with_failing_domains      = 0
      max_domain_copy_number_per_protein = -1
      max_domain_copy_number_sequence    = ''
      failed_species_counts              = {}
      passed_species_counts              = {}

      File.open( hmmsearch_output ) do | file |
        file.each_line do | line |
          next if is_ignorable?( line )

          fields = DOMAIN_LINE.match( line )
          # Rows that are not complete table rows, or that belong to another
          # domain model, are skipped.
          next if fields.nil? || fields[ 1 ] != domain_id

          sequence  = fields[ 4 ]
          number    = fields[ 10 ].to_i
          out_of    = fields[ 11 ].to_i
          i_e_value = fields[ 13 ].to_f
          env_from  = fields[ 20 ].to_i
          env_to    = fields[ 21 ].to_i

          if number > max_domain_copy_number_per_protein
            max_domain_copy_number_sequence    = sequence
            max_domain_copy_number_per_protein = number
          end

          domain_length = env_to - env_from + 1
          e_value_ok    = ( max_i_e_value < 0.0 ) || ( i_e_value <= max_i_e_value )
          length_ok     = ( min_length <= 0 ) || ( domain_length >= min_length )

          if e_value_ok && length_ok
            extract_domain( sequence,
              number,
              out_of,
              env_from,
              env_to,
              in_msa,
              out_msa,
              add_position,
              add_domain_number,
              add_domain_number_as_digit,
              add_domain_number_as_letter,
              trim_name,
              add_species )
            domain_pass_counter += 1
            count_species( sequence, passed_species_counts )
            # Each protein is stored (and counted) only once, however many
            # of its domains pass.
            if passed_seqs.find_by_name_start( sequence, true ).length < 1
              add_sequence( sequence, in_msa, passed_seqs )
              proteins_with_passing_domains += 1
            end
          else
            # The running index is printed before it is incremented, so the
            # first failing domain is reported as 0.
            failure_msg = domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
            failure_msg << " iE=" + i_e_value.to_s unless e_value_ok
            failure_msg << " l=" + domain_length.to_s unless length_ok
            failure_msg << ld
            print( failure_msg )
            log << failure_msg
            domain_fail_counter += 1
            count_species( sequence, failed_species_counts )
            if failed_seqs.find_by_name_start( sequence, true ).length < 1
              add_sequence( sequence, in_msa, failed_seqs )
              proteins_with_failing_domains += 1
            end
          end
        end
      end

      if domain_pass_counter < 1
        raise StandardError, "no domain sequences were extracted"
      end

      max_copy_msg = "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
      log << ld
      puts( max_copy_msg )
      log << max_copy_msg << ld

      if max_domain_copy_number_per_protein > 1
        first_protein_msg = "First protein with this copy number: " + max_domain_copy_number_sequence
        puts( first_protein_msg )
        log << first_protein_msg << ld
      end

      io = MsaIO.new
      w = FastaWriter.new
      w.set_line_width( 60 )
      w.clean( true )

      write_msa( io, out_msa, outfile, w )
      write_msa( io, passed_seqs, passed_seqs_outfile, w )
      write_msa( io, failed_seqs, failed_seqs_outfile, w )

      log << ld
      log << "passing domains              : " + domain_pass_counter.to_s + ld
      log << "failing domains              : " + domain_fail_counter.to_s + ld
      log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld
      log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
      log << ld
      log << 'passing domains counts per species: ' << ld
      passed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
      log << ld
      log << 'failing domains counts per species: ' << ld
      failed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
      log << ld

      domain_pass_counter
    end # parse

    private

    # Writes msa to path using writer. Any StandardError raised while writing
    # is reported as an IOError naming the file; non-standard exceptions
    # (interrupts, out of memory, ...) are deliberately left untouched.
    #
    # raises IOError
    def write_msa( io, msa, path, writer )
      io.write_to_file( msa, path, writer )
    rescue StandardError
      raise IOError, "could not write to \"" + path + "\""
    end

    # Returns the index in in_msa of the one sequence that
    # find_by_name_start reports for sequence_name.
    #
    # raises StandardError if there is no such sequence or more than one
    def find_unique_sequence_index( in_msa, sequence_name )
      seqs = in_msa.find_by_name_start( sequence_name, true )
      if seqs.length < 1
        raise StandardError, "sequence \"" + sequence_name + "\" not found in sequence file"
      end
      if seqs.length > 1
        raise StandardError, "sequence \"" + sequence_name + "\" not unique in sequence file"
      end
      seqs[ 0 ]
    end

    # Adds the full-length sequence named sequence_name from in_msa to
    # add_to_msa.
    #
    # raises StandardError if the name is missing from or ambiguous in in_msa
    def add_sequence( sequence_name, in_msa, add_to_msa )
      seq = in_msa.get_sequence( find_unique_sequence_index( in_msa, sequence_name ) )
      add_to_msa.add_sequence( seq )
    end

    # Cuts positions seq_from..seq_to (1-based, as found in the table) out of
    # the sequence named sequence in in_msa, names the fragment and adds it
    # to out_msa.
    #
    # The fragment name starts as the first whitespace-separated word of the
    # original name and is then modified in this order:
    # * add_position: "_<from>-<to>" is appended
    # * trim_name: the last TRIM_BY characters are removed
    # * only if out_of != 1, the first of these that is set:
    #   add_domain_number_as_digit (append number),
    #   add_domain_number_as_letter (append a letter, 1 -> 'a'),
    #   add_domain_number (append "~<number>-<out_of>")
    # * add_species: the last "[...]" of the original name is appended after
    #   a single space
    #
    # raises ArgumentError, StandardError
    def extract_domain( sequence,
        number,
        out_of,
        seq_from,
        seq_to,
        in_msa,
        out_msa,
        add_position,
        add_domain_number,
        add_domain_number_as_digit,
        add_domain_number_as_letter,
        trim_name,
        add_species )
      if number < 1 || out_of < 1 || number > out_of
        raise ArgumentError, "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
      end
      if seq_from < 1 || seq_to < 1 || seq_from >= seq_to
        raise ArgumentError, "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
      end

      index = find_unique_sequence_index( in_msa, sequence )

      # The table is 1-based, whereas sequences are 0-based in this package.
      seq = in_msa.get_sequence( index ).get_subsequence( seq_from - 1, seq_to - 1 )

      orig_name = seq.get_name

      seq.set_name( orig_name.split[ 0 ] )

      if add_position
        seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
      end

      # NOTE(review): trimming happens after the position suffix is added, so
      # with both flags set it shortens that suffix - confirm this is intended.
      if trim_name
        seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
      end

      if out_of != 1
        if add_domain_number_as_digit
          seq.set_name( seq.get_name + number.to_s )
        elsif add_domain_number_as_letter
          # NOTE(review): 26 would still map to 'z' below; the limit of 25 is
          # kept as found.
          if number > 25
            raise StandardError, 'too many identical domains per sequence, cannot use letters to distinguish them'
          end
          seq.set_name( seq.get_name + ( number + 96 ).chr )
        elsif add_domain_number
          seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
        end
      end

      # The species tag is independent of the domain count, so it is added
      # for single-domain proteins as well.
      if add_species
        a = orig_name.rindex( "[" )
        b = orig_name.rindex( "]" )
        unless a && b && a < b
          raise StandardError, "species not found in " + orig_name
        end
        # orig_name[ a .. b ] already carries its own brackets.
        seq.set_name( seq.get_name + " " + orig_name[ a .. b ] )
      end

      # if ( seq.get_name.length > 10 )
      #   error_msg = "sequence name [" + seq.get_name + "] is longer than 10 characters"
      #   raise StandardError, error_msg
      # end

      out_msa.add_sequence( seq )
    end

    # Increments the counter stored in species_counts_map for the species
    # suffix of sequence; names without such a suffix are not counted.
    def count_species( sequence, species_counts_map )
      species = get_species( sequence )
      return if species.nil?
      species_counts_map[ species ] = species_counts_map.fetch( species, 0 ) + 1
    end

    # Returns the text after the last underscore of sequence_name, or nil if
    # the name does not have the form "<something>_<something>".
    def get_species( sequence_name )
      found = /^.+_(.+)$/.match( sequence_name )
      found.nil? ? nil : found[ 1 ]
    end

    # True for lines that carry no letter, digit or '-' at all and for
    # comment lines starting with '#'.
    def is_ignorable?( line )
      return true if line !~ /[A-Za-z0-9-]/
      line.start_with?( '#' )
    end

  end # class HmmscanDomainExtractor
 321
 322 end # module Evoruby
 323