c75e52191640f46c91311a6c6701e9f7e7d6d057
[jalview.git] / forester / java / src / org / forester / io / parsers / HmmscanPerDomainTableParser.java
1 // $Id:
2 // $
3 //
4 // FORESTER -- software libraries and applications
5 // for evolutionary biology research and applications.
6 //
7 // Copyright (C) 2008-2009 Christian M. Zmasek
8 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27
28 package org.forester.io.parsers;
29
30 import java.io.BufferedReader;
31 import java.io.File;
32 import java.io.FileReader;
33 import java.io.IOException;
34 import java.util.ArrayList;
35 import java.util.Date;
36 import java.util.HashSet;
37 import java.util.List;
38 import java.util.Map;
39 import java.util.Set;
40 import java.util.SortedSet;
41 import java.util.TreeMap;
42 import java.util.TreeSet;
43
44 import org.forester.protein.BasicDomain;
45 import org.forester.protein.BasicProtein;
46 import org.forester.protein.Domain;
47 import org.forester.protein.Protein;
48 import org.forester.util.ForesterUtil;
49
50 public final class HmmscanPerDomainTableParser {
51
52     private static final String           RETRO                       = "RETRO";
53     private static final String           PHAGE                       = "PHAGE";
54     private static final String           VIR                         = "VIR";
55     private static final String           TRANSPOS                    = "TRANSPOS";
56     private static final String           RV                          = "RV";
57     private static final String           GAG                         = "GAG_";
58     private static final String           HCV                         = "HCV_";
59     private static final String           HERPES                      = "HERPES_";
60     private static final String           BACULO                      = "BACULO_";
61     private static final int              E_VALUE_MAXIMUM_DEFAULT     = -1;
62     private static final int              LENGTH_RATIO_CUTOFF_DEFAULT = -1;
63     private static final ReturnType       RETURN_TYPE_DEFAULT         = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
64     private static final boolean          IGNORE_DUFS_DEFAULT         = false;
65     private static final int              MAX_ALLOWED_OVERLAP_DEFAULT = -1;
66     private final Set<String>             _filter;
67     private final FilterType              _filter_type;
68     private final File                    _input_file;
69     private final String                  _species;
70     private double                        _fs_e_value_maximum;
71     private double                        _i_e_value_maximum;
72     private double                        _rel_env_length_ratio_cutoff;
73     private Map<String, Double>           _individual_score_cutoffs;
74     private boolean                       _ignore_dufs;
75     private boolean                       _ignore_virus_like_ids;
76     private int                           _max_allowed_overlap;
77     private boolean                       _ignore_engulfed_domains;
78     private ReturnType                    _return_type;
79     private int                           _proteins_encountered;
80     private int                           _proteins_ignored_due_to_filter;
81     private int                           _proteins_stored;
82     private int                           _domains_encountered;
83     private int                           _domains_ignored_due_to_duf;
84     private int                           _domains_ignored_due_to_overlap;
85     private int                           _domains_ignored_due_to_fs_e_value;
86     private int                           _domains_ignored_due_to_i_e_value;
87     private int                           _domains_ignored_due_to_rel_env_length_ratio_cutoff;
88     private int                           _domains_ignored_due_to_individual_score_cutoff;
89     private int                           _domains_stored;
90     private SortedSet<String>             _domains_stored_set;
91     private long                          _time;
92     private int                           _domains_ignored_due_to_negative_domain_filter;
93     private Map<String, Integer>          _domains_ignored_due_to_negative_domain_filter_counts_map;
94     private int                           _domains_ignored_due_to_virus_like_id;
95     private Map<String, Integer>          _domains_ignored_due_to_virus_like_id_counts_map;
96     private final INDIVIDUAL_SCORE_CUTOFF _ind_cutoff;
97     private final boolean                 _allow_proteins_with_same_name;
98
99     public HmmscanPerDomainTableParser( final File input_file,
100                                         final String species,
101                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) {
102         _input_file = input_file;
103         _species = species;
104         _filter = null;
105         _filter_type = FilterType.NONE;
106         _ind_cutoff = individual_cutoff_applies_to;
107         _allow_proteins_with_same_name = false;
108         init();
109     }
110
111     public HmmscanPerDomainTableParser( final File input_file,
112                                         final String species,
113                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
114                                         final boolean allow_proteins_with_same_name ) {
115         _input_file = input_file;
116         _species = species;
117         _filter = null;
118         _filter_type = FilterType.NONE;
119         _ind_cutoff = individual_cutoff_applies_to;
120         _allow_proteins_with_same_name = allow_proteins_with_same_name;
121         init();
122     }
123
124     public HmmscanPerDomainTableParser( final File input_file,
125                                         final String species,
126                                         final Set<String> filter,
127                                         final FilterType filter_type,
128                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) {
129         _input_file = input_file;
130         _species = species;
131         _filter = filter;
132         _filter_type = filter_type;
133         _ind_cutoff = individual_cutoff_applies_to;
134         _allow_proteins_with_same_name = false;
135         init();
136     }
137
138     public HmmscanPerDomainTableParser( final File input_file,
139                                         final String species,
140                                         final Set<String> filter,
141                                         final FilterType filter_type,
142                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
143                                         final boolean allow_proteins_with_same_name ) {
144         _input_file = input_file;
145         _species = species;
146         _filter = filter;
147         _filter_type = filter_type;
148         _ind_cutoff = individual_cutoff_applies_to;
149         _allow_proteins_with_same_name = allow_proteins_with_same_name;
150         init();
151     }
152
153     public boolean isAllowProteinsWithSameName() {
154         return _allow_proteins_with_same_name;
155     }
156
157     private void actuallyAddProtein( final List<Protein> proteins, final Protein current_protein ) {
158         final List<Domain> l = current_protein.getProteinDomains();
159         for( final Domain d : l ) {
160             getDomainsStoredSet().add( d.getDomainId() );
161         }
162         proteins.add( current_protein );
163         ++_proteins_stored;
164     }
165
166     private void addProtein( final List<Protein> proteins, Protein current_protein ) {
167         if ( ( getMaxAllowedOverlap() != HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT )
168                 || isIgnoreEngulfedDomains() ) {
169             final int domains_count = current_protein.getNumberOfProteinDomains();
170             current_protein = ForesterUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
171                                                                      isIgnoreEngulfedDomains(),
172                                                                      current_protein );
173             final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains();
174             _domains_stored -= domains_removed;
175             _domains_ignored_due_to_overlap += domains_removed;
176         }
177         if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN )
178                 || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
179             final Set<String> domain_ids_in_protein = new HashSet<String>();
180             for( final Domain d : current_protein.getProteinDomains() ) {
181                 domain_ids_in_protein.add( d.getDomainId() );
182             }
183             domain_ids_in_protein.retainAll( getFilter() );
184             if ( getFilterType() == FilterType.POSITIVE_PROTEIN ) {
185                 if ( domain_ids_in_protein.size() > 0 ) {
186                     actuallyAddProtein( proteins, current_protein );
187                 }
188                 else {
189                     ++_proteins_ignored_due_to_filter;
190                 }
191             }
192             else {
193                 if ( domain_ids_in_protein.size() < 1 ) {
194                     actuallyAddProtein( proteins, current_protein );
195                 }
196                 else {
197                     ++_proteins_ignored_due_to_filter;
198                 }
199             }
200         }
201         else {
202             actuallyAddProtein( proteins, current_protein );
203         }
204     }
205
206     public int getDomainsEncountered() {
207         return _domains_encountered;
208     }
209
210     public int getDomainsIgnoredDueToDuf() {
211         return _domains_ignored_due_to_duf;
212     }
213
214     public int getDomainsIgnoredDueToIEval() {
215         return _domains_ignored_due_to_i_e_value;
216     }
217     
218     public int getDomainsIgnoredDueToRelEnvLengthRatioCutoff() {
219         return _domains_ignored_due_to_rel_env_length_ratio_cutoff;
220     }
221     
222     
223
224     public int getDomainsIgnoredDueToFsEval() {
225         return _domains_ignored_due_to_fs_e_value;
226     }
227
228     public int getDomainsIgnoredDueToIndividualScoreCutoff() {
229         return _domains_ignored_due_to_individual_score_cutoff;
230     }
231
232     public int getDomainsIgnoredDueToNegativeDomainFilter() {
233         return _domains_ignored_due_to_negative_domain_filter;
234     }
235
236     public Map<String, Integer> getDomainsIgnoredDueToNegativeDomainFilterCountsMap() {
237         return _domains_ignored_due_to_negative_domain_filter_counts_map;
238     }
239
240     public int getDomainsIgnoredDueToOverlap() {
241         return _domains_ignored_due_to_overlap;
242     }
243
244     public Map<String, Integer> getDomainsIgnoredDueToVirusLikeIdCountsMap() {
245         return _domains_ignored_due_to_virus_like_id_counts_map;
246     }
247
248     public int getDomainsIgnoredDueToVirusLikeIds() {
249         return _domains_ignored_due_to_virus_like_id;
250     }
251
252     public int getDomainsStored() {
253         return _domains_stored;
254     }
255
256     public SortedSet<String> getDomainsStoredSet() {
257         return _domains_stored_set;
258     }
259
260     private double getFsEValueMaximum() {
261         return _fs_e_value_maximum;
262     }
263
264     private double getIEValueMaximum() {
265         return _i_e_value_maximum;
266     }
267     
268     private double getRelEnvLengthRatioCutoff() {
269         return _rel_env_length_ratio_cutoff;
270     }
271
272     private Set<String> getFilter() {
273         return _filter;
274     }
275
276     private FilterType getFilterType() {
277         return _filter_type;
278     }
279
280     public INDIVIDUAL_SCORE_CUTOFF getIndividualCutoffAppliesTo() {
281         return _ind_cutoff;
282     }
283
284     private Map<String, Double> getIndividualScoreCutoffs() {
285         return _individual_score_cutoffs;
286     }
287
288     private File getInputFile() {
289         return _input_file;
290     }
291
292     private int getMaxAllowedOverlap() {
293         return _max_allowed_overlap;
294     }
295
296     public int getProteinsEncountered() {
297         return _proteins_encountered;
298     }
299
300     public int getProteinsIgnoredDueToFilter() {
301         return _proteins_ignored_due_to_filter;
302     }
303
304     public int getProteinsStored() {
305         return _proteins_stored;
306     }
307
308     private ReturnType getReturnType() {
309         return _return_type;
310     }
311
312     private String getSpecies() {
313         return _species;
314     }
315
316     public long getTime() {
317         return _time;
318     }
319
320     private void init() {
321         _fs_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT;
322         _i_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT;
323         _rel_env_length_ratio_cutoff = LENGTH_RATIO_CUTOFF_DEFAULT;
324         setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT );
325         setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT );
326         _max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT;
327         setIndividualScoreCutoffs( null );
328         setIgnoreEngulfedDomains( false );
329         setIgnoreVirusLikeIds( false );
330         intitCounts();
331     }
332
333     private void intitCounts() {
334         setDomainsStoredSet( new TreeSet<String>() );
335         setDomainsEncountered( 0 );
336         setProteinsEncountered( 0 );
337         setProteinsIgnoredDueToFilter( 0 );
338         setDomainsIgnoredDueToNegativeFilter( 0 );
339         setDomainsIgnoredDueToDuf( 0 );
340         setDomainsIgnoredDueToFsEval( 0 );
341         setDomainsIgnoredDueToIEval( 0 );
342         setDomainsIgnoredDueToRelEnvLengthRatioCutoff( 0 );
343         setDomainsIgnoredDueToIndividualScoreCutoff( 0 );
344         setDomainsIgnoredDueToVirusLikeId( 0 );
345         setDomainsIgnoredDueToOverlap( 0 );
346         setDomainsStored( 0 );
347         setProteinsStored( 0 );
348         setTime( 0 );
349         setDomainsIgnoredDueToVirusLikeIdCountsMap( new TreeMap<String, Integer>() );
350         setDomainsIgnoredDueToNegativeDomainFilterCountsMap( new TreeMap<String, Integer>() );
351     }
352
353     private boolean isIgnoreDufs() {
354         return _ignore_dufs;
355     }
356
357     private boolean isIgnoreEngulfedDomains() {
358         return _ignore_engulfed_domains;
359     }
360
361     private boolean isIgnoreVirusLikeIds() {
362         return _ignore_virus_like_ids;
363     }
364
365     public List<Protein> parse() throws IOException {
366         if ( ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.NONE )
367                 && ( ( getIndividualScoreCutoffs() == null ) || ( getIndividualScoreCutoffs().size() < 1 ) ) ) {
368             throw new RuntimeException( "attempt to use individual cuttoffs with having set them" );
369         }
370         intitCounts();
371         final Set<String> prev_queries = new HashSet<String>();
372         final String error = ForesterUtil.isReadableFile( getInputFile() );
373         if ( !ForesterUtil.isEmpty( error ) ) {
374             throw new IOException( error );
375         }
376         final BufferedReader br = new BufferedReader( new FileReader( getInputFile() ) );
377         String line;
378         final List<Protein> proteins = new ArrayList<Protein>();
379         Protein current_protein = null;
380         int line_number = 0;
381         final long start_time = new Date().getTime();
382         String prev_query = "";
383         int prev_qlen = -1;
384         while ( ( line = br.readLine() ) != null ) {
385             line_number++;
386             if ( ForesterUtil.isEmpty( line ) || line.startsWith( "#" ) ) {
387                 continue;
388             }
389             // 0                    1           2    3                      4           5      6        7      8      9  10  11        12        13     14    15      16  17      18  19      20  21  22
390             // #                                                                              --- full sequence --- -------------- this domain -------------   hmm coord   ali coord   env coord
391             // # target name        accession   tlen query name             accession   qlen   E-value  score  bias   #  of  c-Evalue  i-Evalue  score  bias  from    to  from    to  from    to  acc description of target
392             // #------------------- ---------- -----   -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------
393             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   1   4   1.5e-41     3e-38  130.8  11.1     3   171   140   307   139   346 0.81 Ion transport protein
394             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   2   4   9.1e-45   1.8e-41  141.3  13.1     4   200   479   664   476   665 0.97 Ion transport protein
395             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   3   4   5.2e-45     1e-41  142.1  14.0     1   201   900  1117   900  1117 0.96 Ion transport protein
396             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   4   4   9.2e-51   1.8e-47  160.9  11.3     1   201  1217  1423  1217  1423 0.97 Ion transport protein
397             // PKD_channel          PF08016.5    426 jgi|Nemve1|7|gw.28.1.1 -           1604   5.9e-19   67.4  70.5   1   8   0.00053       1.1    7.3   0.4   220   264   142   191   134   200 0.73 Polycystin cation channel
398             final String tokens[] = line.split( "\\s+" );
399             final String target_id = tokens[ 0 ];
400             final String target_acc = tokens[ 1 ];
401             final int tlen = parseInt( tokens[ 2 ], line_number, "tlen" );
402             final String query = tokens[ 3 ];
403             final String query_acc = tokens[ 4 ];
404             final int qlen = parseInt( tokens[ 5 ], line_number, "qlen" );
405             final double fs_e_value = parseDouble( tokens[ 6 ], line_number, "E-value" );
406             final double fs_score = parseDouble( tokens[ 7 ], line_number, "score" );
407             final int domain_number = parseInt( tokens[ 9 ], line_number, "count" );
408             final int total_domains = parseInt( tokens[ 10 ], line_number, "total" );
409             final double c_e_value = parseDouble( tokens[ 11 ], line_number, "c-Evalue" );
410             final double i_e_value = parseDouble( tokens[ 12 ], line_number, "i-Evalue" );
411             final double domain_score = parseDouble( tokens[ 13 ], line_number, "score" );
412             final int hmm_from = parseInt( tokens[ 15 ], line_number, "hmm from" );
413             final int hmm_to = parseInt( tokens[ 16 ], line_number, "hmm to" );
414             final int ali_from = parseInt( tokens[ 17 ], line_number, "ali from" );
415             final int ali_to = parseInt( tokens[ 18 ], line_number, "ali to" );
416             final int env_from = parseInt( tokens[ 19 ], line_number, "env from" );
417             final int env_to = parseInt( tokens[ 20 ], line_number, "env to" );
418             ++_domains_encountered;
419             if ( !query.equals( prev_query ) || ( qlen != prev_qlen ) ) {
420                 if ( !isAllowProteinsWithSameName() ) {
421                     if ( query.equals( prev_query ) ) {
422                         throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen
423                                 + ", " + prev_qlen );
424                     }
425                     if ( prev_queries.contains( query ) ) {
426                         throw new IOException( "more than one protein named [" + query + "]" );
427                     }
428                 }
429                 prev_query = query;
430                 prev_qlen = qlen;
431                 prev_queries.add( query );
432                 if ( ( current_protein != null ) && ( current_protein.getProteinDomains().size() > 0 ) ) {
433                     addProtein( proteins, current_protein );
434                 }
435                 if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
436                     current_protein = new BasicProtein( query, getSpecies(), qlen );
437                 }
438                 else {
439                     throw new IllegalArgumentException( "unknown return type" );
440                 }
441             }
442             boolean failed_cutoff = false;
443             if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.NONE ) {
444                 if ( getIndividualScoreCutoffs().containsKey( target_id ) ) {
445                     final double cutoff = getIndividualScoreCutoffs().get( target_id );
446                     if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE ) {
447                         if ( fs_score < cutoff ) {
448                             failed_cutoff = true;
449                         }
450                     }
451                     else if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.DOMAIN ) {
452                         if ( domain_score < cutoff ) {
453                             failed_cutoff = true;
454                         }
455                     }
456                 }
457                 else {
458                     throw new IOException( "could not find a score cutoff value for domain id \"" + target_id
459                             + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
460                 }
461             }
462             final String uc_id = target_id.toUpperCase();
463             final int env_length = 1 + env_to - env_from;
464             if ( failed_cutoff ) {
465                 ++_domains_ignored_due_to_individual_score_cutoff;
466             }
467             else if ( ali_from == ali_to ) {
468                 //Ignore
469             }
470             else if ( ( getFsEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
471                     && ( fs_e_value > getFsEValueMaximum() ) ) {
472                 ++_domains_ignored_due_to_fs_e_value;
473             }
474             else if ( ( getIEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
475                     && ( i_e_value > getIEValueMaximum() ) ) {
476                 ++_domains_ignored_due_to_i_e_value;
477             }
478             //
479             else if ( ( getRelEnvLengthRatioCutoff() > 0.0 )
480                     && (  env_length < ( getRelEnvLengthRatioCutoff() * tlen)   ) ) {
481                 ++_domains_ignored_due_to_rel_env_length_ratio_cutoff;
482             }
483             //
484             else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
485                 ++_domains_ignored_due_to_duf;
486             }
487             else if ( isIgnoreVirusLikeIds()
488                     && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
489                             || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )
490                             || uc_id.startsWith( HCV ) || uc_id.startsWith( HERPES ) || uc_id.startsWith( BACULO ) ) ) {
491                 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), target_id );
492                 ++_domains_ignored_due_to_virus_like_id;
493             }
494             else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) && getFilter().contains( target_id ) ) {
495                 ++_domains_ignored_due_to_negative_domain_filter;
496                 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), target_id );
497             }
498             else {
499                 try {
500                     final Domain pd = new BasicDomain( target_id,
501                                                        ali_from,
502                                                        ali_to,
503                                                        ( short ) domain_number,
504                                                        ( short ) total_domains,
505                                                        i_e_value,
506                                                        domain_score,
507                                                        ( short ) tlen,
508                                                        ( short ) hmm_from,
509                                                        ( short ) hmm_to );
510                     current_protein.addProteinDomain( pd );
511                 }
512                 catch ( final IllegalArgumentException e ) {
513                     throw new IOException( "problem with domain parsing at line " + line_number + "[" + line + "]: "
514                             + e.getMessage() );
515                 }
516                 ++_domains_stored;
517             }
518         } // while ( ( line = br.readLine() ) != null )
519         if ( ( current_protein != null ) && ( current_protein.getProteinDomains().size() > 0 ) ) {
520             addProtein( proteins, current_protein );
521         }
522         setProteinsEncountered( prev_queries.size() );
523         setTime( new Date().getTime() - start_time );
524         return proteins;
525     }
526
527     private double parseDouble( final String double_str, final int line_number, final String label )
528             throws IOException {
529         double d = -1;
530         try {
531             d = Double.valueOf( double_str ).doubleValue();
532         }
533         catch ( final NumberFormatException e ) {
534             throw new IOException( "could not parse \" +label + \" from \"" + double_str + "\" [line " + line_number
535                     + "] in [" + getInputFile().getCanonicalPath() + "]" );
536         }
537         return d;
538     }
539
540     private int parseInt( final String double_str, final int line_number, final String label ) throws IOException {
541         int i = -1;
542         try {
543             i = Integer.valueOf( double_str ).intValue();
544         }
545         catch ( final NumberFormatException e ) {
546             throw new IOException( "could not parse \"" + label + "\" from \"" + double_str + "\" [line " + line_number
547                     + "] in [" + getInputFile().getCanonicalPath() + "]" );
548         }
549         return i;
550     }
551
552     private void setDomainsEncountered( final int domains_encountered ) {
553         _domains_encountered = domains_encountered;
554     }
555
556     private void setDomainsIgnoredDueToDuf( final int domains_ignored_due_to_duf ) {
557         _domains_ignored_due_to_duf = domains_ignored_due_to_duf;
558     }
559
560     private void setDomainsIgnoredDueToFsEval( final int domains_ignored_due_to_fs_e_value ) {
561         _domains_ignored_due_to_fs_e_value = domains_ignored_due_to_fs_e_value;
562     }
563
564     private void setDomainsIgnoredDueToIEval( final int domains_ignored_due_to_i_e_value ) {
565         _domains_ignored_due_to_i_e_value = domains_ignored_due_to_i_e_value;
566     }
567     
568     private void setDomainsIgnoredDueToRelEnvLengthRatioCutoff( final int domains_ignored_due_to_rel_env_length_ratio_cutoff ) {
569         _domains_ignored_due_to_rel_env_length_ratio_cutoff = domains_ignored_due_to_rel_env_length_ratio_cutoff;
570     }
571
572     
573     
574     private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
575         _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
576     }
577
578     private void setDomainsIgnoredDueToNegativeDomainFilterCountsMap( final Map<String, Integer> domains_ignored_due_to_negative_domain_filter_counts_map ) {
579         _domains_ignored_due_to_negative_domain_filter_counts_map = domains_ignored_due_to_negative_domain_filter_counts_map;
580     }
581
582     private void setDomainsIgnoredDueToNegativeFilter( final int domains_ignored_due_to_negative_domain_filter ) {
583         _domains_ignored_due_to_negative_domain_filter = domains_ignored_due_to_negative_domain_filter;
584     }
585
586     private void setDomainsIgnoredDueToOverlap( final int domains_ignored_due_to_overlap ) {
587         _domains_ignored_due_to_overlap = domains_ignored_due_to_overlap;
588     }
589
590     private void setDomainsIgnoredDueToVirusLikeId( final int i ) {
591         _domains_ignored_due_to_virus_like_id = i;
592     }
593
594     private void setDomainsIgnoredDueToVirusLikeIdCountsMap( final Map<String, Integer> domains_ignored_due_to_virus_like_id_counts_map ) {
595         _domains_ignored_due_to_virus_like_id_counts_map = domains_ignored_due_to_virus_like_id_counts_map;
596     }
597
598     private void setDomainsStored( final int domains_stored ) {
599         _domains_stored = domains_stored;
600     }
601
602     private void setDomainsStoredSet( final SortedSet<String> _storeddomains_stored ) {
603         _domains_stored_set = _storeddomains_stored;
604     }
605
606     public void setFsEValueMaximum( final double fs_e_value_maximum ) {
607         if ( fs_e_value_maximum < 0.0 ) {
608             throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
609         }
610         _fs_e_value_maximum = fs_e_value_maximum;
611     }
612
613     public void setIEValueMaximum( final double i_e_value_maximum ) {
614         if ( i_e_value_maximum < 0.0 ) {
615             throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
616         }
617         _i_e_value_maximum = i_e_value_maximum;
618     }
619     
620     public void setRelEnvLengthRatioCutoff( final double rel_env_length_ratio_cutoff ) {
621         if ( rel_env_length_ratio_cutoff <= 0.0 ) {
622             throw new IllegalArgumentException( "attempt to set rel env length ratio cutoff to zero or a negative value" );
623         }
624         _rel_env_length_ratio_cutoff = rel_env_length_ratio_cutoff;
625     }
626
627     public void setIgnoreDufs( final boolean ignore_dufs ) {
628         _ignore_dufs = ignore_dufs;
629     }
630
631     /**
632      * To ignore domains which are completely engulfed by domains (individual
633      * ones or stretches of overlapping ones) with better support values.
634      *
635      *
636      * @param ignored_engulfed_domains
637      */
638     public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) {
639         _ignore_engulfed_domains = ignore_engulfed_domains;
640     }
641
642     public void setIgnoreVirusLikeIds( final boolean ignore_virus_like_ids ) {
643         _ignore_virus_like_ids = ignore_virus_like_ids;
644     }
645
646     /**
647      * Sets the individual  score cutoff values (for example, gathering
648      * thresholds from Pfam). Domain ids are the keys, cutoffs the values.
649      *
650      * @param individual_score_cutoffs
651      */
652     public void setIndividualScoreCutoffs( final Map<String, Double> individual_score_cutoffs ) {
653         _individual_score_cutoffs = individual_score_cutoffs;
654     }
655
656     public void setMaxAllowedOverlap( final int max_allowed_overlap ) {
657         if ( max_allowed_overlap < 0 ) {
658             throw new IllegalArgumentException( "Attempt to set max allowed overlap to less than zero." );
659         }
660         _max_allowed_overlap = max_allowed_overlap;
661     }
662
663     private void setProteinsEncountered( final int proteins_encountered ) {
664         _proteins_encountered = proteins_encountered;
665     }
666
667     private void setProteinsIgnoredDueToFilter( final int proteins_ignored_due_to_filter ) {
668         _proteins_ignored_due_to_filter = proteins_ignored_due_to_filter;
669     }
670
671     private void setProteinsStored( final int proteins_stored ) {
672         _proteins_stored = proteins_stored;
673     }
674
675     public void setReturnType( final ReturnType return_type ) {
676         _return_type = return_type;
677     }
678
679     private void setTime( final long time ) {
680         _time = time;
681     }
682
683     public static enum FilterType {
684                                    NONE,
685                                    POSITIVE_PROTEIN,
686                                    NEGATIVE_PROTEIN,
687                                    NEGATIVE_DOMAIN
688     }
689
690     static public enum INDIVIDUAL_SCORE_CUTOFF {
691                                                 FULL_SEQUENCE,
692                                                 DOMAIN,
693                                                 NONE;
694     }
695
696     public static enum ReturnType {
697                                    UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
698     }
699 }