bd3ae8bd83b3160ca2bd1dff1748abf0ccbc4efe
[jalview.git] / forester / ruby / evoruby / lib / evo / io / parser / hmmscan_domain_extractor.rb
1 #
2 # = lib/evo/io/parser/hmmscan_domain_extractor.rb - HmmscanDomainExtractor class
3 #
4 # Copyright::  Copyright (C) 2012 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id:  $
8
9
10 require 'lib/evo/util/constants'
11 require 'lib/evo/msa/msa_factory'
12 require 'lib/evo/io/msa_io'
13 require 'lib/evo/io/writer/fasta_writer'
14 require 'lib/evo/io/parser/fasta_parser'
15
16
17 module Evoruby
18
19   class HmmscanDomainExtractor
20
21     TRIM_BY = 2
22
23     def initialize
24     end
25
26     # raises ArgumentError, IOError, StandardError
27     def parse( domain_id,
28         hmmsearch_output,
29         fasta_sequence_file,
30         outfile,
31         passed_seqs_outfile,
32         failed_seqs_outfile,
33         e_value_threshold,
34         length_threshold,
35         add_position,
36         add_domain_number,
37         add_domain_number_as_digit,
38         add_domain_number_as_letter,
39         trim_name,
40         log )
41
42       Util.check_file_for_readability( hmmsearch_output )
43       Util.check_file_for_readability( fasta_sequence_file )
44       Util.check_file_for_writability( outfile )
45       Util.check_file_for_writability( passed_seqs_outfile )
46       Util.check_file_for_writability( failed_seqs_outfile )
47
48       in_msa = nil
49       factory = MsaFactory.new()
50       in_msa = factory.create_msa_from_file( fasta_sequence_file, FastaParser.new() )
51
52       if ( in_msa == nil || in_msa.get_number_of_seqs() < 1 )
53         error_msg = "could not find fasta sequences in " + fasta_sequence_file
54         raise IOError, error_msg
55       end
56
57       out_msa = Msa.new
58       failed_seqs = Msa.new
59       passed_seqs = Msa.new
60
61       ld = Constants::LINE_DELIMITER
62
63       domain_pass_counter     = 0
64       domain_fail_counter     = 0
65       proteins_with_passing_domains = 0
66       proteins_with_failing_domains = 0
67       max_domain_copy_number_per_protein = -1
68       max_domain_copy_number_sequence    = ''
69       failed_species_counts         = Hash.new
70       passed_species_counts         = Hash.new
71
72       File.open( hmmsearch_output ) do | file |
73         while line = file.gets
74           if !is_ignorable?( line ) && line =~ /^\S+\s+/
75
76             #         tn      acc     tlen    query   acc     qlen    Evalue  score   bias    #       of      c-E     i-E     score   bias    hf      ht      af      at      ef      et      acc     desc
77             #         1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23
78             line =~ /^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(.*)/
79
80             target_name = $1
81             if domain_id != target_name
82               next
83             end
84             
85             
86             sequence = $4
87             number   = $10.to_i
88             out_of   = $11.to_i
89             env_from = $20.to_i
90             env_to   = $21.to_i
91             i_e_value  = $13.to_f
92             if ( number > max_domain_copy_number_per_protein )
93               max_domain_copy_number_sequence    = sequence
94               max_domain_copy_number_per_protein = number
95             end
96             if ( ( ( e_value_threshold.to_f < 0.0 ) || ( i_e_value <= e_value_threshold ) ) &&
97                  ( ( length_threshold.to_f <= 0 )   || ( env_to - env_from + 1 ) >= length_threshold.to_f )  )
98               extract_domain( sequence,
99                 number,
100                 out_of,
101                 env_from,
102                 env_to,
103                 in_msa,
104                 out_msa,
105                 add_position,
106                 add_domain_number,
107                 add_domain_number_as_digit,
108                 add_domain_number_as_letter,
109                 trim_name )
110               domain_pass_counter += 1
111               count_species( sequence, passed_species_counts )
112               if passed_seqs.find_by_name_start( sequence, true ).length < 1
113                 add_sequence( sequence, in_msa, passed_seqs )
114                 proteins_with_passing_domains += 1
115               end
116             else
117               print( domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)" )
118               log << domain_fail_counter.to_s + ": " + sequence.to_s + " did not meet threshold(s)"
119               if ( ( e_value_threshold.to_f >= 0.0 ) && ( i_e_value > e_value_threshold ) )
120                 print( " iE=" + i_e_value.to_s )
121                 log << " iE=" + i_e_value.to_s
122               end
123               if ( ( length_threshold.to_f > 0 ) && ( env_to - env_from + 1 ) < length_threshold.to_f )
124                 le = env_to - env_from + 1
125                 print( " l=" + le.to_s )
126                 log << " l=" + le.to_s
127               end
128               print( Constants::LINE_DELIMITER )
129               log << Constants::LINE_DELIMITER
130               domain_fail_counter  += 1
131               count_species( sequence, failed_species_counts )
132               if failed_seqs.find_by_name_start( sequence, true ).length < 1
133                 add_sequence( sequence, in_msa, failed_seqs )
134                 proteins_with_failing_domains += 1
135               end
136             end
137           end
138         end
139       end
140
141       if domain_pass_counter < 1
142         error_msg = "no domain sequences were extracted"
143         raise StandardError, error_msg
144       end
145
146       log << Constants::LINE_DELIMITER
147       puts( "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s )
148       log << "Max domain copy number per protein : " + max_domain_copy_number_per_protein.to_s
149       log << Constants::LINE_DELIMITER
150
151       if ( max_domain_copy_number_per_protein > 1 )
152         puts( "First protein with this copy number: " + max_domain_copy_number_sequence )
153         log << "First protein with this copy number: " + max_domain_copy_number_sequence
154         log << Constants::LINE_DELIMITER
155       end
156
157       io = MsaIO.new()
158       w = FastaWriter.new()
159       w.set_line_width( 60 )
160       w.clean( true )
161
162       begin
163         io.write_to_file( out_msa, outfile, w )
164       rescue Exception
165         error_msg = "could not write to \"" + outfile + "\""
166         raise IOError, error_msg
167       end
168
169       begin
170         io.write_to_file( passed_seqs, passed_seqs_outfile, w )
171       rescue Exception
172         error_msg = "could not write to \"" + passed_seqs_outfile + "\""
173         raise IOError, error_msg
174       end
175
176       begin
177         io.write_to_file( failed_seqs, failed_seqs_outfile, w )
178       rescue Exception
179         error_msg = "could not write to \"" + failed_seqs_outfile + "\""
180         raise IOError, error_msg
181       end
182
183       log << ld
184       log << "passing domains              : " + domain_pass_counter.to_s + ld
185       log << "failing domains              : " + domain_fail_counter.to_s + ld
186       log << "proteins with passing domains: " + proteins_with_passing_domains.to_s + ld
187       log << "proteins with failing domains: " + proteins_with_failing_domains.to_s + ld
188       log << ld
189       log << 'passing domains counts per species: ' << ld
190       passed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
191       log << ld
192       log << 'failing domains counts per species: ' << ld
193       failed_species_counts.each_pair { | species, count | log << "#{species}: #{count}" << ld }
194       log << ld
195       return domain_pass_counter
196
197     end # parse
198
199     private
200
201
202     def add_sequence( sequence_name, in_msa, add_to_msa )
203       seqs = in_msa.find_by_name_start( sequence_name, true )
204       if ( seqs.length < 1 )
205         error_msg = "sequence \"" + sequence_name + "\" not found in sequence file"
206         raise StandardError, error_msg
207       end
208       if ( seqs.length > 1 )
209         error_msg = "sequence \"" + sequence_name + "\" not unique in sequence file"
210         raise StandardError, error_msg
211       end
212       seq = in_msa.get_sequence( seqs[ 0 ] )
213       add_to_msa.add_sequence( seq )
214     end
215
216     # raises ArgumentError, StandardError
217     def extract_domain( sequence,
218         number,
219         out_of,
220         seq_from,
221         seq_to,
222         in_msa,
223         out_msa,
224         add_position,
225         add_domain_number,
226         add_domain_number_as_digit,
227         add_domain_number_as_letter,
228         trim_name )
229       if ( number < 1 || out_of < 1 || number > out_of )
230         error_msg = "impossible: number=" + number.to_s + ", out of=" + out_of.to_s
231         raise ArgumentError, error_msg
232       end
233       if ( seq_from < 1 || seq_to < 1 || seq_from >= seq_to )
234         error_msg = "impossible: seq-f=" + seq_from.to_s + ", seq-t=" + seq_to.to_s
235         raise ArgumentError, error_msg
236       end
237       seqs = in_msa.find_by_name_start( sequence, true )
238       if seqs.length < 1
239         error_msg = "sequence \"" + sequence + "\" not found in sequence file"
240         raise StandardError, error_msg
241       end
242       if seqs.length > 1
243         error_msg = "sequence \"" + sequence + "\" not unique in sequence file"
244         raise StandardError, error_msg
245       end
246       # hmmsearch is 1 based, wheres sequences are 0 bases in this package.
247       seq = in_msa.get_sequence( seqs[ 0 ] ).get_subsequence( seq_from - 1, seq_to - 1 )
248       
249       seq.set_name( seq.get_name.split[ 0 ] )
250       
251       if add_position
252         seq.set_name( seq.get_name + "_" + seq_from.to_s + "-" + seq_to.to_s )
253       end
254
255       if trim_name
256         seq.set_name( seq.get_name[ 0, seq.get_name.length - TRIM_BY ] )
257       end
258
259       if out_of != 1
260         if ( add_domain_number_as_digit )
261           seq.set_name( seq.get_name + number.to_s )
262         elsif ( add_domain_number_as_letter )
263           if number > 25
264             error_msg = 'too many identical domains per sequence, cannot use letters to distinguish them'
265             raise StandardError, error_msg
266           end
267           seq.set_name( seq.get_name + ( number + 96 ).chr )
268         elsif ( add_domain_number )
269           seq.set_name( seq.get_name + "~" + number.to_s + "-" + out_of.to_s )
270         end
271       end
272
273       # if ( seq.get_name.length > 10 )
274       #   error_msg = "sequence name [" + seq.get_name + "] is longer than 10 characters"
275       #   raise StandardError, error_msg
276       # end
277
278       out_msa.add_sequence( seq )
279     end
280
281     def count_species( sequence, species_counts_map )
282       species = get_species( sequence )
283       if species != nil
284         if !species_counts_map.has_key?( species )
285           species_counts_map[ species ] = 1
286         else
287           species_counts_map[ species ] = species_counts_map[ species ] + 1
288         end
289       end
290     end
291
292     def get_species( sequence_name )
293       if sequence_name =~ /^.+_(.+)$/
294         return $1
295       else
296         return nil
297       end
298     end
299
300     def is_ignorable?( line )
301       return ( line !~ /[A-Za-z0-9-]/ || line =~/^#/ )
302     end
303
304   end # class HmmscanDomainExtractor
305
306 end # module Evoruby
307