inprogress
[jalview.git] / forester / java / src / org / forester / io / parsers / HmmscanPerDomainTableParser.java
1 // $Id:
2 // $
3 //
4 // FORESTER -- software libraries and applications
5 // for evolutionary biology research and applications.
6 //
7 // Copyright (C) 2008-2009 Christian M. Zmasek
8 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27
28 package org.forester.io.parsers;
29
30 import java.io.BufferedReader;
31 import java.io.File;
32 import java.io.FileReader;
33 import java.io.IOException;
34 import java.util.ArrayList;
35 import java.util.Date;
36 import java.util.HashSet;
37 import java.util.List;
38 import java.util.Map;
39 import java.util.Set;
40 import java.util.SortedSet;
41 import java.util.TreeMap;
42 import java.util.TreeSet;
43
44 import org.forester.protein.BasicDomain;
45 import org.forester.protein.BasicProtein;
46 import org.forester.protein.Domain;
47 import org.forester.protein.DomainId;
48 import org.forester.protein.Protein;
49 import org.forester.surfacing.SurfacingUtil;
50 import org.forester.util.ForesterUtil;
51
52 public final class HmmscanPerDomainTableParser {
53
54     private static final String           RETRO                       = "RETRO";
55     private static final String           PHAGE                       = "PHAGE";
56     private static final String           VIR                         = "VIR";
57     private static final String           TRANSPOS                    = "TRANSPOS";
58     private static final String           RV                          = "RV";
59     private static final String           GAG                         = "GAG_";
60     private static final String           HCV                         = "HCV_";
61     private static final String           HERPES                      = "HERPES_";
62     private static final String           BACULO                      = "BACULO_";
63     private static final int              E_VALUE_MAXIMUM_DEFAULT     = -1;
64     private static final ReturnType       RETURN_TYPE_DEFAULT         = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
65     private static final boolean          IGNORE_DUFS_DEFAULT         = false;
66     private static final int              MAX_ALLOWED_OVERLAP_DEFAULT = -1;
67     private static final boolean          IGNORE_REPLACED_RRMS        = false;
68     private final Set<DomainId>           _filter;
69     private final FilterType              _filter_type;
70     private final File                    _input_file;
71     private final String                  _species;
72     private double                        _e_value_maximum;
73     private Map<String, Double>           _individual_score_cutoffs;
74     private boolean                       _ignore_dufs;
75     private boolean                       _ignore_virus_like_ids;
76     private int                           _max_allowed_overlap;
77     private boolean                       _ignore_engulfed_domains;
78     private ReturnType                    _return_type;
79     private int                           _proteins_encountered;
80     private int                           _proteins_ignored_due_to_filter;
81     private int                           _proteins_stored;
82     private int                           _domains_encountered;
83     private int                           _domains_ignored_due_to_duf;
84     private int                           _domains_ignored_due_to_overlap;
85     private int                           _domains_ignored_due_to_e_value;
86     private int                           _domains_ignored_due_to_individual_score_cutoff;
87     private int                           _domains_stored;
88     private SortedSet<DomainId>           _domains_stored_set;
89     private long                          _time;
90     private int                           _domains_ignored_due_to_negative_domain_filter;
91     private Map<String, Integer>          _domains_ignored_due_to_negative_domain_filter_counts_map;
92     private int                           _domains_ignored_due_to_virus_like_id;
93     private Map<String, Integer>          _domains_ignored_due_to_virus_like_id_counts_map;
94     private final INDIVIDUAL_SCORE_CUTOFF _ind_cutoff;
95     private final boolean                 _allow_proteins_with_same_name;
96
97     public HmmscanPerDomainTableParser( final File input_file,
98                                         final String species,
99                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) {
100         _input_file = input_file;
101         _species = species;
102         _filter = null;
103         _filter_type = FilterType.NONE;
104         _ind_cutoff = individual_cutoff_applies_to;
105         _allow_proteins_with_same_name = false;
106         init();
107     }
108
109     public HmmscanPerDomainTableParser( final File input_file,
110                                         final String species,
111                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
112                                         final boolean allow_proteins_with_same_name ) {
113         _input_file = input_file;
114         _species = species;
115         _filter = null;
116         _filter_type = FilterType.NONE;
117         _ind_cutoff = individual_cutoff_applies_to;
118         _allow_proteins_with_same_name = allow_proteins_with_same_name;
119         init();
120     }
121
122     public HmmscanPerDomainTableParser( final File input_file,
123                                         final String species,
124                                         final Set<DomainId> filter,
125                                         final FilterType filter_type,
126                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) {
127         _input_file = input_file;
128         _species = species;
129         _filter = filter;
130         _filter_type = filter_type;
131         _ind_cutoff = individual_cutoff_applies_to;
132         _allow_proteins_with_same_name = false;
133         init();
134     }
135
136     public HmmscanPerDomainTableParser( final File input_file,
137                                         final String species,
138                                         final Set<DomainId> filter,
139                                         final FilterType filter_type,
140                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
141                                         final boolean allow_proteins_with_same_name ) {
142         _input_file = input_file;
143         _species = species;
144         _filter = filter;
145         _filter_type = filter_type;
146         _ind_cutoff = individual_cutoff_applies_to;
147         _allow_proteins_with_same_name = allow_proteins_with_same_name;
148         init();
149     }
150
151     public boolean isAllowProteinsWithSameName() {
152         return _allow_proteins_with_same_name;
153     }
154
155     private void actuallyAddProtein( final List<Protein> proteins, final Protein current_protein ) {
156         final List<Domain> l = current_protein.getProteinDomains();
157         for( final Domain d : l ) {
158             getDomainsStoredSet().add( d.getDomainId() );
159         }
160         proteins.add( current_protein );
161         ++_proteins_stored;
162     }
163
164     private void addProtein( final List<Protein> proteins, Protein current_protein ) {
165         if ( ( getMaxAllowedOverlap() != HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT )
166                 || isIgnoreEngulfedDomains() ) {
167             final int domains_count = current_protein.getNumberOfProteinDomains();
168             current_protein = SurfacingUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
169                                                                       isIgnoreEngulfedDomains(),
170                                                                       current_protein );
171             final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains();
172             _domains_stored -= domains_removed;
173             _domains_ignored_due_to_overlap += domains_removed;
174         }
175         if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
176             final Set<DomainId> domain_ids_in_protein = new HashSet<DomainId>();
177             for( final Domain d : current_protein.getProteinDomains() ) {
178                 domain_ids_in_protein.add( d.getDomainId() );
179             }
180             domain_ids_in_protein.retainAll( getFilter() );
181             if ( getFilterType() == FilterType.POSITIVE_PROTEIN ) {
182                 if ( domain_ids_in_protein.size() > 0 ) {
183                     actuallyAddProtein( proteins, current_protein );
184                 }
185                 else {
186                     ++_proteins_ignored_due_to_filter;
187                 }
188             }
189             else {
190                 if ( domain_ids_in_protein.size() < 1 ) {
191                     actuallyAddProtein( proteins, current_protein );
192                 }
193                 else {
194                     ++_proteins_ignored_due_to_filter;
195                 }
196             }
197         }
198         else {
199             actuallyAddProtein( proteins, current_protein );
200         }
201     }
202
203     public int getDomainsEncountered() {
204         return _domains_encountered;
205     }
206
207     public int getDomainsIgnoredDueToDuf() {
208         return _domains_ignored_due_to_duf;
209     }
210
211     public int getDomainsIgnoredDueToEval() {
212         return _domains_ignored_due_to_e_value;
213     }
214
215     public int getDomainsIgnoredDueToIndividualScoreCutoff() {
216         return _domains_ignored_due_to_individual_score_cutoff;
217     }
218
219     public int getDomainsIgnoredDueToNegativeDomainFilter() {
220         return _domains_ignored_due_to_negative_domain_filter;
221     }
222
223     public Map<String, Integer> getDomainsIgnoredDueToNegativeDomainFilterCountsMap() {
224         return _domains_ignored_due_to_negative_domain_filter_counts_map;
225     }
226
227     public int getDomainsIgnoredDueToOverlap() {
228         return _domains_ignored_due_to_overlap;
229     }
230
231     public Map<String, Integer> getDomainsIgnoredDueToVirusLikeIdCountsMap() {
232         return _domains_ignored_due_to_virus_like_id_counts_map;
233     }
234
235     public int getDomainsIgnoredDueToVirusLikeIds() {
236         return _domains_ignored_due_to_virus_like_id;
237     }
238
239     public int getDomainsStored() {
240         return _domains_stored;
241     }
242
243     public SortedSet<DomainId> getDomainsStoredSet() {
244         return _domains_stored_set;
245     }
246
247     private double getEValueMaximum() {
248         return _e_value_maximum;
249     }
250
251     private Set<DomainId> getFilter() {
252         return _filter;
253     }
254
255     private FilterType getFilterType() {
256         return _filter_type;
257     }
258
259     public INDIVIDUAL_SCORE_CUTOFF getIndividualCutoffAppliesTo() {
260         return _ind_cutoff;
261     }
262
263     private Map<String, Double> getIndividualScoreCutoffs() {
264         return _individual_score_cutoffs;
265     }
266
267     private File getInputFile() {
268         return _input_file;
269     }
270
271     private int getMaxAllowedOverlap() {
272         return _max_allowed_overlap;
273     }
274
275     public int getProteinsEncountered() {
276         return _proteins_encountered;
277     }
278
279     public int getProteinsIgnoredDueToFilter() {
280         return _proteins_ignored_due_to_filter;
281     }
282
283     public int getProteinsStored() {
284         return _proteins_stored;
285     }
286
287     private ReturnType getReturnType() {
288         return _return_type;
289     }
290
291     private String getSpecies() {
292         return _species;
293     }
294
295     public long getTime() {
296         return _time;
297     }
298
299     private void init() {
300         _e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT;
301         setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT );
302         setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT );
303         _max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT;
304         setIndividualScoreCutoffs( null );
305         setIgnoreEngulfedDomains( false );
306         setIgnoreVirusLikeIds( false );
307         intitCounts();
308     }
309
310     private void intitCounts() {
311         setDomainsStoredSet( new TreeSet<DomainId>() );
312         setDomainsEncountered( 0 );
313         setProteinsEncountered( 0 );
314         setProteinsIgnoredDueToFilter( 0 );
315         setDomainsIgnoredDueToNegativeFilter( 0 );
316         setDomainsIgnoredDueToDuf( 0 );
317         setDomainsIgnoredDueToEval( 0 );
318         setDomainsIgnoredDueToIndividualScoreCutoff( 0 );
319         setDomainsIgnoredDueToVirusLikeId( 0 );
320         setDomainsIgnoredDueToOverlap( 0 );
321         setDomainsStored( 0 );
322         setProteinsStored( 0 );
323         setTime( 0 );
324         setDomainsIgnoredDueToVirusLikeIdCountsMap( new TreeMap<String, Integer>() );
325         setDomainsIgnoredDueToNegativeDomainFilterCountsMap( new TreeMap<String, Integer>() );
326     }
327
328     private boolean isIgnoreDufs() {
329         return _ignore_dufs;
330     }
331
332     private boolean isIgnoreEngulfedDomains() {
333         return _ignore_engulfed_domains;
334     }
335
336     private boolean isIgnoreVirusLikeIds() {
337         return _ignore_virus_like_ids;
338     }
339
340     public List<Protein> parse() throws IOException {
341         if ( ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.NONE )
342                 && ( ( getIndividualScoreCutoffs() == null ) || ( getIndividualScoreCutoffs().size() < 1 ) ) ) {
343             throw new RuntimeException( "attempt to use individual cuttoffs with having set them" );
344         }
345         intitCounts();
346         final Set<String> prev_queries = new HashSet<String>();
347         final String error = ForesterUtil.isReadableFile( getInputFile() );
348         if ( !ForesterUtil.isEmpty( error ) ) {
349             throw new IOException( error );
350         }
351         final BufferedReader br = new BufferedReader( new FileReader( getInputFile() ) );
352         String line;
353         final List<Protein> proteins = new ArrayList<Protein>();
354         Protein current_protein = null;
355         int line_number = 0;
356         final long start_time = new Date().getTime();
357         String prev_query = "";
358         int prev_qlen = -1;
359         while ( ( line = br.readLine() ) != null ) {
360             line_number++;
361             if ( ForesterUtil.isEmpty( line ) || line.startsWith( "#" ) ) {
362                 continue;
363             }
364             // 0                    1           2    3                      4           5      6        7      8      9  10  11        12        13     14    15      16  17      18  19      20  21  22      
365             // #                                                                              --- full sequence --- -------------- this domain -------------   hmm coord   ali coord   env coord
366             // # target name        accession   tlen query name             accession   qlen   E-value  score  bias   #  of  c-Evalue  i-Evalue  score  bias  from    to  from    to  from    to  acc description of target
367             // #------------------- ---------- -----   -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------
368             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   1   4   1.5e-41     3e-38  130.8  11.1     3   171   140   307   139   346 0.81 Ion transport protein
369             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   2   4   9.1e-45   1.8e-41  141.3  13.1     4   200   479   664   476   665 0.97 Ion transport protein
370             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   3   4   5.2e-45     1e-41  142.1  14.0     1   201   900  1117   900  1117 0.96 Ion transport protein
371             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   4   4   9.2e-51   1.8e-47  160.9  11.3     1   201  1217  1423  1217  1423 0.97 Ion transport protein
372             // PKD_channel          PF08016.5    426 jgi|Nemve1|7|gw.28.1.1 -           1604   5.9e-19   67.4  70.5   1   8   0.00053       1.1    7.3   0.4   220   264   142   191   134   200 0.73 Polycystin cation channel
373             final String tokens[] = line.split( "\\s+" );
374             final String target_id = tokens[ 0 ];
375             final String target_acc = tokens[ 1 ];
376             final int tlen = parseInt( tokens[ 2 ], line_number, "tlen" );
377             final String query = tokens[ 3 ];
378             final String query_acc = tokens[ 4 ];
379             final int qlen = parseInt( tokens[ 5 ], line_number, "qlen" );
380             final double fs_e_value = parseDouble( tokens[ 6 ], line_number, "E-value" );
381             final double fs_score = parseDouble( tokens[ 7 ], line_number, "score" );
382             final int domain_number = parseInt( tokens[ 9 ], line_number, "count" );
383             final int total_domains = parseInt( tokens[ 10 ], line_number, "total" );
384             final double c_e_value = parseDouble( tokens[ 11 ], line_number, "c-Evalue" );
385             final double i_e_value = parseDouble( tokens[ 12 ], line_number, "i-Evalue" );
386             final double domain_score = parseDouble( tokens[ 13 ], line_number, "score" );
387             final int hmm_from = parseInt( tokens[ 15 ], line_number, "hmm from" );
388             final int hmm_to = parseInt( tokens[ 16 ], line_number, "hmm to" );
389             final int ali_from = parseInt( tokens[ 17 ], line_number, "ali from" );
390             final int ali_to = parseInt( tokens[ 18 ], line_number, "ali to" );
391             final int env_from = parseInt( tokens[ 19 ], line_number, "env from" );
392             final int env_to = parseInt( tokens[ 20 ], line_number, "env to" );
393             ++_domains_encountered;
394             if ( !query.equals( prev_query ) || ( qlen != prev_qlen ) ) {
395                 if ( !isAllowProteinsWithSameName() ) {
396                     if ( query.equals( prev_query ) ) {
397                         throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen
398                                 + ", " + prev_qlen );
399                     }
400                     if ( prev_queries.contains( query ) ) {
401                         throw new IOException( "more than one protein named [" + query + "]" );
402                     }
403                 }
404                 prev_query = query;
405                 prev_qlen = qlen;
406                 prev_queries.add( query );
407                 if ( ( current_protein != null ) && ( current_protein.getProteinDomains().size() > 0 ) ) {
408                     addProtein( proteins, current_protein );
409                 }
410                 if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
411                     current_protein = new BasicProtein( query, getSpecies(), qlen );
412                 }
413                 else {
414                     throw new IllegalArgumentException( "unknown return type" );
415                 }
416             }
417             boolean failed_cutoff = false;
418             if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.NONE ) {
419                 if ( getIndividualScoreCutoffs().containsKey( target_id ) ) {
420                     final double cutoff = getIndividualScoreCutoffs().get( target_id );
421                     if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE ) {
422                         if ( fs_score < cutoff ) {
423                             failed_cutoff = true;
424                         }
425                     }
426                     else if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.DOMAIN ) {
427                         if ( domain_score < cutoff ) {
428                             failed_cutoff = true;
429                         }
430                     }
431                 }
432                 else {
433                     throw new IOException( "could not find a score cutoff value for domain id \"" + target_id
434                             + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
435                 }
436             }
437             final String uc_id = target_id.toUpperCase();
438             if ( failed_cutoff ) {
439                 ++_domains_ignored_due_to_individual_score_cutoff;
440             }
441             else if ( ali_from == ali_to ) {
442                 //Ignore
443             }
444             else if ( ( getEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
445                     && ( fs_e_value > getEValueMaximum() ) ) {
446                 ++_domains_ignored_due_to_e_value;
447             }
448             else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
449                 ++_domains_ignored_due_to_duf;
450             }
451             else if ( IGNORE_REPLACED_RRMS
452                     && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id
453                             .contains( "RRM_6" ) ) ) {
454             }
455             else if ( isIgnoreVirusLikeIds()
456                     && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
457                             || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )
458                             || uc_id.startsWith( HCV ) || uc_id.startsWith( HERPES ) || uc_id.startsWith( BACULO ) ) ) {
459                 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), target_id );
460                 ++_domains_ignored_due_to_virus_like_id;
461             }
462             else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN )
463                     && getFilter().contains( new DomainId( target_id ) ) ) {
464                 ++_domains_ignored_due_to_negative_domain_filter;
465                 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), target_id );
466             }
467             else {
468                 try {
469                     final Domain pd = new BasicDomain( target_id,
470                                                        ali_from,
471                                                        ali_to,
472                                                        ( short ) domain_number,
473                                                        ( short ) total_domains,
474                                                        fs_e_value,
475                                                        fs_score,
476                                                        i_e_value,
477                                                        domain_score );
478                     current_protein.addProteinDomain( pd );
479                 }
480                 catch ( final IllegalArgumentException e ) {
481                     throw new IOException( "problem with domain parsing at line " + line_number + "[" + line + "]: "
482                             + e.getMessage() );
483                 }
484                 ++_domains_stored;
485             }
486         } // while ( ( line = br.readLine() ) != null )
487         if ( ( current_protein != null ) && ( current_protein.getProteinDomains().size() > 0 ) ) {
488             addProtein( proteins, current_protein );
489         }
490         setProteinsEncountered( prev_queries.size() );
491         setTime( new Date().getTime() - start_time );
492         return proteins;
493     }
494
495     private double parseDouble( final String double_str, final int line_number, final String label ) throws IOException {
496         double d = -1;
497         try {
498             d = Double.valueOf( double_str ).doubleValue();
499         }
500         catch ( final NumberFormatException e ) {
501             throw new IOException( "could not parse \" +label + \" from \"" + double_str + "\" [line " + line_number
502                     + "] in [" + getInputFile().getCanonicalPath() + "]" );
503         }
504         return d;
505     }
506
507     private int parseInt( final String double_str, final int line_number, final String label ) throws IOException {
508         int i = -1;
509         try {
510             i = Integer.valueOf( double_str ).intValue();
511         }
512         catch ( final NumberFormatException e ) {
513             throw new IOException( "could not parse \"" + label + "\" from \"" + double_str + "\" [line " + line_number
514                     + "] in [" + getInputFile().getCanonicalPath() + "]" );
515         }
516         return i;
517     }
518
519     private void setDomainsEncountered( final int domains_encountered ) {
520         _domains_encountered = domains_encountered;
521     }
522
523     private void setDomainsIgnoredDueToDuf( final int domains_ignored_due_to_duf ) {
524         _domains_ignored_due_to_duf = domains_ignored_due_to_duf;
525     }
526
527     private void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) {
528         _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value;
529     }
530
531     private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
532         _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
533     }
534
535     private void setDomainsIgnoredDueToNegativeDomainFilterCountsMap( final Map<String, Integer> domains_ignored_due_to_negative_domain_filter_counts_map ) {
536         _domains_ignored_due_to_negative_domain_filter_counts_map = domains_ignored_due_to_negative_domain_filter_counts_map;
537     }
538
539     private void setDomainsIgnoredDueToNegativeFilter( final int domains_ignored_due_to_negative_domain_filter ) {
540         _domains_ignored_due_to_negative_domain_filter = domains_ignored_due_to_negative_domain_filter;
541     }
542
543     private void setDomainsIgnoredDueToOverlap( final int domains_ignored_due_to_overlap ) {
544         _domains_ignored_due_to_overlap = domains_ignored_due_to_overlap;
545     }
546
547     private void setDomainsIgnoredDueToVirusLikeId( final int i ) {
548         _domains_ignored_due_to_virus_like_id = i;
549     }
550
551     private void setDomainsIgnoredDueToVirusLikeIdCountsMap( final Map<String, Integer> domains_ignored_due_to_virus_like_id_counts_map ) {
552         _domains_ignored_due_to_virus_like_id_counts_map = domains_ignored_due_to_virus_like_id_counts_map;
553     }
554
555     private void setDomainsStored( final int domains_stored ) {
556         _domains_stored = domains_stored;
557     }
558
559     private void setDomainsStoredSet( final SortedSet<DomainId> _storeddomains_stored ) {
560         _domains_stored_set = _storeddomains_stored;
561     }
562
563     public void setEValueMaximum( final double e_value_maximum ) {
564         if ( e_value_maximum < 0.0 ) {
565             throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
566         }
567         _e_value_maximum = e_value_maximum;
568     }
569
570     public void setIgnoreDufs( final boolean ignore_dufs ) {
571         _ignore_dufs = ignore_dufs;
572     }
573
574     /**
575      * To ignore domains which are completely engulfed by domains (individual
576      * ones or stretches of overlapping ones) with better support values.
577      * 
578      * 
579      * @param ignored_engulfed_domains
580      */
581     public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) {
582         _ignore_engulfed_domains = ignore_engulfed_domains;
583     }
584
585     public void setIgnoreVirusLikeIds( final boolean ignore_virus_like_ids ) {
586         _ignore_virus_like_ids = ignore_virus_like_ids;
587     }
588
589     /**
590      * Sets the individual  score cutoff values (for example, gathering
591      * thresholds from Pfam). Domain ids are the keys, cutoffs the values.
592      * 
593      * @param individual_score_cutoffs
594      */
595     public void setIndividualScoreCutoffs( final Map<String, Double> individual_score_cutoffs ) {
596         _individual_score_cutoffs = individual_score_cutoffs;
597     }
598
599     public void setMaxAllowedOverlap( final int max_allowed_overlap ) {
600         if ( max_allowed_overlap < 0 ) {
601             throw new IllegalArgumentException( "Attempt to set max allowed overlap to less than zero." );
602         }
603         _max_allowed_overlap = max_allowed_overlap;
604     }
605
606     private void setProteinsEncountered( final int proteins_encountered ) {
607         _proteins_encountered = proteins_encountered;
608     }
609
610     private void setProteinsIgnoredDueToFilter( final int proteins_ignored_due_to_filter ) {
611         _proteins_ignored_due_to_filter = proteins_ignored_due_to_filter;
612     }
613
614     private void setProteinsStored( final int proteins_stored ) {
615         _proteins_stored = proteins_stored;
616     }
617
618     public void setReturnType( final ReturnType return_type ) {
619         _return_type = return_type;
620     }
621
622     private void setTime( final long time ) {
623         _time = time;
624     }
625
626     public static enum FilterType {
627         NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN
628     }
629
630     static public enum INDIVIDUAL_SCORE_CUTOFF {
631         FULL_SEQUENCE, DOMAIN, NONE;
632     }
633
634     public static enum ReturnType {
635         UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
636     }
637 }