mb parsing
[jalview.git] / forester / java / src / org / forester / io / parsers / HmmscanPerDomainTableParser.java
1 // $Id:
2 // $
3 //
4 // FORESTER -- software libraries and applications
5 // for evolutionary biology research and applications.
6 //
7 // Copyright (C) 2008-2009 Christian M. Zmasek
8 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
27
28 package org.forester.io.parsers;
29
30 import java.io.BufferedReader;
31 import java.io.File;
32 import java.io.FileReader;
33 import java.io.IOException;
34 import java.util.ArrayList;
35 import java.util.Date;
36 import java.util.HashSet;
37 import java.util.List;
38 import java.util.Map;
39 import java.util.Set;
40 import java.util.SortedSet;
41 import java.util.TreeMap;
42 import java.util.TreeSet;
43
44 import org.forester.surfacing.BasicDomain;
45 import org.forester.surfacing.BasicProtein;
46 import org.forester.surfacing.Domain;
47 import org.forester.surfacing.DomainId;
48 import org.forester.surfacing.Protein;
49 import org.forester.surfacing.SurfacingUtil;
50 import org.forester.util.ForesterUtil;
51
52 public final class HmmscanPerDomainTableParser {
53
54     private static final String           RETRO                       = "RETRO";
55     private static final String           PHAGE                       = "PHAGE";
56     private static final String           VIR                         = "VIR";
57     private static final String           TRANSPOS                    = "TRANSPOS";
58     private static final String           RV                          = "RV";
59     private static final String           GAG                         = "GAG_";
60     private static final String           HCV                         = "HCV_";
61     private static final String           HERPES                      = "HERPES_";
62     private static final String           BACULO                      = "BACULO_";
63     private static final int              E_VALUE_MAXIMUM_DEFAULT     = -1;
64     private static final ReturnType       RETURN_TYPE_DEFAULT         = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
65     private static final boolean          IGNORE_DUFS_DEFAULT         = false;
66     private static final int              MAX_ALLOWED_OVERLAP_DEFAULT = -1;
67     private final Set<DomainId>           _filter;
68     private final FilterType              _filter_type;
69     private final File                    _input_file;
70     private final String                  _species;
71     private double                        _e_value_maximum;
72     private Map<String, Double>           _individual_score_cutoffs;
73     private boolean                       _ignore_dufs;
74     private boolean                       _ignore_virus_like_ids;
75     private int                           _max_allowed_overlap;
76     private boolean                       _ignore_engulfed_domains;
77     private ReturnType                    _return_type;
78     private int                           _proteins_encountered;
79     private int                           _proteins_ignored_due_to_filter;
80     private int                           _proteins_stored;
81     private int                           _domains_encountered;
82     private int                           _domains_ignored_due_to_duf;
83     private int                           _domains_ignored_due_to_overlap;
84     private int                           _domains_ignored_due_to_e_value;
85     private int                           _domains_ignored_due_to_individual_score_cutoff;
86     private int                           _domains_stored;
87     private SortedSet<DomainId>           _domains_stored_set;
88     private long                          _time;
89     private int                           _domains_ignored_due_to_negative_domain_filter;
90     private Map<String, Integer>          _domains_ignored_due_to_negative_domain_filter_counts_map;
91     private int                           _domains_ignored_due_to_virus_like_id;
92     private Map<String, Integer>          _domains_ignored_due_to_virus_like_id_counts_map;
93     private final INDIVIDUAL_SCORE_CUTOFF _ind_cutoff;
94     private final boolean                 _allow_proteins_with_same_name;
95
96     public HmmscanPerDomainTableParser( final File input_file,
97                                         final String species,
98                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) {
99         _input_file = input_file;
100         _species = species;
101         _filter = null;
102         _filter_type = FilterType.NONE;
103         _ind_cutoff = individual_cutoff_applies_to;
104         _allow_proteins_with_same_name = false;
105         init();
106     }
107
108     public HmmscanPerDomainTableParser( final File input_file,
109                                         final String species,
110                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
111                                         final boolean allow_proteins_with_same_name ) {
112         _input_file = input_file;
113         _species = species;
114         _filter = null;
115         _filter_type = FilterType.NONE;
116         _ind_cutoff = individual_cutoff_applies_to;
117         _allow_proteins_with_same_name = allow_proteins_with_same_name;
118         init();
119     }
120
121     public HmmscanPerDomainTableParser( final File input_file,
122                                         final String species,
123                                         final Set<DomainId> filter,
124                                         final FilterType filter_type,
125                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) {
126         _input_file = input_file;
127         _species = species;
128         _filter = filter;
129         _filter_type = filter_type;
130         _ind_cutoff = individual_cutoff_applies_to;
131         _allow_proteins_with_same_name = false;
132         init();
133     }
134
135     public HmmscanPerDomainTableParser( final File input_file,
136                                         final String species,
137                                         final Set<DomainId> filter,
138                                         final FilterType filter_type,
139                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
140                                         final boolean allow_proteins_with_same_name ) {
141         _input_file = input_file;
142         _species = species;
143         _filter = filter;
144         _filter_type = filter_type;
145         _ind_cutoff = individual_cutoff_applies_to;
146         _allow_proteins_with_same_name = allow_proteins_with_same_name;
147         init();
148     }
149
150     public boolean isAllowProteinsWithSameName() {
151         return _allow_proteins_with_same_name;
152     }
153
154     private void actuallyAddProtein( final List<Protein> proteins, final Protein current_protein ) {
155         final List<Domain> l = current_protein.getProteinDomains();
156         for( final Domain d : l ) {
157             getDomainsStoredSet().add( d.getDomainId() );
158         }
159         proteins.add( current_protein );
160         ++_proteins_stored;
161     }
162
163     private void addProtein( final List<Protein> proteins, Protein current_protein ) {
164         if ( ( getMaxAllowedOverlap() != HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT )
165                 || isIgnoreEngulfedDomains() ) {
166             final int domains_count = current_protein.getNumberOfProteinDomains();
167             current_protein = SurfacingUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
168                                                                       isIgnoreEngulfedDomains(),
169                                                                       current_protein );
170             final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains();
171             _domains_stored -= domains_removed;
172             _domains_ignored_due_to_overlap += domains_removed;
173         }
174         if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
175             final Set<DomainId> domain_ids_in_protein = new HashSet<DomainId>();
176             for( final Domain d : current_protein.getProteinDomains() ) {
177                 domain_ids_in_protein.add( d.getDomainId() );
178             }
179             domain_ids_in_protein.retainAll( getFilter() );
180             if ( getFilterType() == FilterType.POSITIVE_PROTEIN ) {
181                 if ( domain_ids_in_protein.size() > 0 ) {
182                     actuallyAddProtein( proteins, current_protein );
183                 }
184                 else {
185                     ++_proteins_ignored_due_to_filter;
186                 }
187             }
188             else {
189                 if ( domain_ids_in_protein.size() < 1 ) {
190                     actuallyAddProtein( proteins, current_protein );
191                 }
192                 else {
193                     ++_proteins_ignored_due_to_filter;
194                 }
195             }
196         }
197         else {
198             actuallyAddProtein( proteins, current_protein );
199         }
200     }
201
202     public int getDomainsEncountered() {
203         return _domains_encountered;
204     }
205
206     public int getDomainsIgnoredDueToDuf() {
207         return _domains_ignored_due_to_duf;
208     }
209
210     public int getDomainsIgnoredDueToEval() {
211         return _domains_ignored_due_to_e_value;
212     }
213
214     public int getDomainsIgnoredDueToIndividualScoreCutoff() {
215         return _domains_ignored_due_to_individual_score_cutoff;
216     }
217
218     public int getDomainsIgnoredDueToNegativeDomainFilter() {
219         return _domains_ignored_due_to_negative_domain_filter;
220     }
221
222     public Map<String, Integer> getDomainsIgnoredDueToNegativeDomainFilterCountsMap() {
223         return _domains_ignored_due_to_negative_domain_filter_counts_map;
224     }
225
226     public int getDomainsIgnoredDueToOverlap() {
227         return _domains_ignored_due_to_overlap;
228     }
229
230     public Map<String, Integer> getDomainsIgnoredDueToVirusLikeIdCountsMap() {
231         return _domains_ignored_due_to_virus_like_id_counts_map;
232     }
233
234     public int getDomainsIgnoredDueToVirusLikeIds() {
235         return _domains_ignored_due_to_virus_like_id;
236     }
237
238     public int getDomainsStored() {
239         return _domains_stored;
240     }
241
242     public SortedSet<DomainId> getDomainsStoredSet() {
243         return _domains_stored_set;
244     }
245
246     private double getEValueMaximum() {
247         return _e_value_maximum;
248     }
249
250     private Set<DomainId> getFilter() {
251         return _filter;
252     }
253
254     private FilterType getFilterType() {
255         return _filter_type;
256     }
257
258     public INDIVIDUAL_SCORE_CUTOFF getIndividualCutoffAppliesTo() {
259         return _ind_cutoff;
260     }
261
262     private Map<String, Double> getIndividualScoreCutoffs() {
263         return _individual_score_cutoffs;
264     }
265
266     private File getInputFile() {
267         return _input_file;
268     }
269
270     private int getMaxAllowedOverlap() {
271         return _max_allowed_overlap;
272     }
273
274     public int getProteinsEncountered() {
275         return _proteins_encountered;
276     }
277
278     public int getProteinsIgnoredDueToFilter() {
279         return _proteins_ignored_due_to_filter;
280     }
281
282     public int getProteinsStored() {
283         return _proteins_stored;
284     }
285
286     private ReturnType getReturnType() {
287         return _return_type;
288     }
289
290     private String getSpecies() {
291         return _species;
292     }
293
294     public long getTime() {
295         return _time;
296     }
297
298     private void init() {
299         _e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT;
300         setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT );
301         setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT );
302         _max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT;
303         setIndividualScoreCutoffs( null );
304         setIgnoreEngulfedDomains( false );
305         setIgnoreVirusLikeIds( false );
306         intitCounts();
307     }
308
309     private void intitCounts() {
310         setDomainsStoredSet( new TreeSet<DomainId>() );
311         setDomainsEncountered( 0 );
312         setProteinsEncountered( 0 );
313         setProteinsIgnoredDueToFilter( 0 );
314         setDomainsIgnoredDueToNegativeFilter( 0 );
315         setDomainsIgnoredDueToDuf( 0 );
316         setDomainsIgnoredDueToEval( 0 );
317         setDomainsIgnoredDueToIndividualScoreCutoff( 0 );
318         setDomainsIgnoredDueToVirusLikeId( 0 );
319         setDomainsIgnoredDueToOverlap( 0 );
320         setDomainsStored( 0 );
321         setProteinsStored( 0 );
322         setTime( 0 );
323         setDomainsIgnoredDueToVirusLikeIdCountsMap( new TreeMap<String, Integer>() );
324         setDomainsIgnoredDueToNegativeDomainFilterCountsMap( new TreeMap<String, Integer>() );
325     }
326
327     private boolean isIgnoreDufs() {
328         return _ignore_dufs;
329     }
330
331     private boolean isIgnoreEngulfedDomains() {
332         return _ignore_engulfed_domains;
333     }
334
335     private boolean isIgnoreVirusLikeIds() {
336         return _ignore_virus_like_ids;
337     }
338
339     public List<Protein> parse() throws IOException {
340         if ( ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.NONE )
341                 && ( ( getIndividualScoreCutoffs() == null ) || ( getIndividualScoreCutoffs().size() < 1 ) ) ) {
342             throw new RuntimeException( "attempt to use individual cuttoffs with having set them" );
343         }
344         intitCounts();
345         final Set<String> prev_queries = new HashSet<String>();
346         final String error = ForesterUtil.isReadableFile( getInputFile() );
347         if ( !ForesterUtil.isEmpty( error ) ) {
348             throw new IOException( error );
349         }
350         final BufferedReader br = new BufferedReader( new FileReader( getInputFile() ) );
351         String line;
352         final List<Protein> proteins = new ArrayList<Protein>();
353         Protein current_protein = null;
354         int line_number = 0;
355         final long start_time = new Date().getTime();
356         String prev_query = "";
357         int prev_qlen = -1;
358         while ( ( line = br.readLine() ) != null ) {
359             line_number++;
360             if ( ForesterUtil.isEmpty( line ) || line.startsWith( "#" ) ) {
361                 continue;
362             }
363             // 0                    1           2    3                      4           5      6        7      8      9  10  11        12        13     14    15      16  17      18  19      20  21  22      
364             // #                                                                              --- full sequence --- -------------- this domain -------------   hmm coord   ali coord   env coord
365             // # target name        accession   tlen query name             accession   qlen   E-value  score  bias   #  of  c-Evalue  i-Evalue  score  bias  from    to  from    to  from    to  acc description of target
366             // #------------------- ---------- -----   -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------
367             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   1   4   1.5e-41     3e-38  130.8  11.1     3   171   140   307   139   346 0.81 Ion transport protein
368             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   2   4   9.1e-45   1.8e-41  141.3  13.1     4   200   479   664   476   665 0.97 Ion transport protein
369             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   3   4   5.2e-45     1e-41  142.1  14.0     1   201   900  1117   900  1117 0.96 Ion transport protein
370             // Ion_trans            PF00520.24   201 jgi|Nemve1|7|gw.28.1.1 -           1604  6.3e-169  557.4  95.3   4   4   9.2e-51   1.8e-47  160.9  11.3     1   201  1217  1423  1217  1423 0.97 Ion transport protein
371             // PKD_channel          PF08016.5    426 jgi|Nemve1|7|gw.28.1.1 -           1604   5.9e-19   67.4  70.5   1   8   0.00053       1.1    7.3   0.4   220   264   142   191   134   200 0.73 Polycystin cation channel
372             final String tokens[] = line.split( "\\s+" );
373             final String target_id = tokens[ 0 ];
374             final String target_acc = tokens[ 1 ];
375             final int tlen = parseInt( tokens[ 2 ], line_number, "tlen" );
376             final String query = tokens[ 3 ];
377             final String query_acc = tokens[ 4 ];
378             final int qlen = parseInt( tokens[ 5 ], line_number, "qlen" );
379             final double fs_e_value = parseDouble( tokens[ 6 ], line_number, "E-value" );
380             final double fs_score = parseDouble( tokens[ 7 ], line_number, "score" );
381             final int domain_number = parseInt( tokens[ 9 ], line_number, "count" );
382             final int total_domains = parseInt( tokens[ 10 ], line_number, "total" );
383             final double c_e_value = parseDouble( tokens[ 11 ], line_number, "c-Evalue" );
384             final double i_e_value = parseDouble( tokens[ 12 ], line_number, "i-Evalue" );
385             final double domain_score = parseDouble( tokens[ 13 ], line_number, "score" );
386             final int hmm_from = parseInt( tokens[ 15 ], line_number, "hmm from" );
387             final int hmm_to = parseInt( tokens[ 16 ], line_number, "hmm to" );
388             final int ali_from = parseInt( tokens[ 17 ], line_number, "ali from" );
389             final int ali_to = parseInt( tokens[ 18 ], line_number, "ali to" );
390             final int env_from = parseInt( tokens[ 19 ], line_number, "env from" );
391             final int env_to = parseInt( tokens[ 20 ], line_number, "env to" );
392             ++_domains_encountered;
393             if ( !query.equals( prev_query ) || ( qlen != prev_qlen ) ) {
394                 if ( !isAllowProteinsWithSameName() ) {
395                     if ( query.equals( prev_query ) ) {
396                         throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen
397                                 + ", " + prev_qlen );
398                     }
399                     if ( prev_queries.contains( query ) ) {
400                         throw new IOException( "more than one protein named [" + query + "]" );
401                     }
402                 }
403                 prev_query = query;
404                 prev_qlen = qlen;
405                 prev_queries.add( query );
406                 if ( ( current_protein != null ) && ( current_protein.getProteinDomains().size() > 0 ) ) {
407                     addProtein( proteins, current_protein );
408                 }
409                 if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
410                     current_protein = new BasicProtein( query, getSpecies() );
411                 }
412                 else {
413                     throw new IllegalArgumentException( "unknown return type" );
414                 }
415             }
416             boolean failed_cutoff = false;
417             if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.NONE ) {
418                 if ( getIndividualScoreCutoffs().containsKey( target_id ) ) {
419                     final double cutoff = getIndividualScoreCutoffs().get( target_id );
420                     if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE ) {
421                         if ( fs_score < cutoff ) {
422                             failed_cutoff = true;
423                         }
424                     }
425                     else if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.DOMAIN ) {
426                         if ( domain_score < cutoff ) {
427                             failed_cutoff = true;
428                         }
429                     }
430                 }
431                 else {
432                     throw new IOException( "could not find a score cutoff value for domain id \"" + target_id
433                             + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
434                 }
435             }
436             final String uc_id = target_id.toUpperCase();
437             if ( failed_cutoff ) {
438                 ++_domains_ignored_due_to_individual_score_cutoff;
439             }
440             else if ( ali_from == ali_to ) {
441                 //Ignore
442             }
443             else if ( ( getEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
444                     && ( fs_e_value > getEValueMaximum() ) ) {
445                 ++_domains_ignored_due_to_e_value;
446             }
447             else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
448                 ++_domains_ignored_due_to_duf;
449             }
450             else if ( isIgnoreVirusLikeIds()
451                     && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
452                             || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )
453                             || uc_id.startsWith( HCV ) || uc_id.startsWith( HERPES ) || uc_id.startsWith( BACULO ) ) ) {
454                 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), target_id );
455                 ++_domains_ignored_due_to_virus_like_id;
456             }
457             else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN )
458                     && getFilter().contains( new DomainId( target_id ) ) ) {
459                 ++_domains_ignored_due_to_negative_domain_filter;
460                 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), target_id );
461             }
462             else {
463                 try {
464                     final Domain pd = new BasicDomain( target_id,
465                                                        ali_from,
466                                                        ali_to,
467                                                        ( short ) domain_number,
468                                                        ( short ) total_domains,
469                                                        fs_e_value,
470                                                        fs_score,
471                                                        i_e_value,
472                                                        domain_score );
473                     current_protein.addProteinDomain( pd );
474                 }
475                 catch ( final IllegalArgumentException e ) {
476                     throw new IOException( "problem with domain parsing at line " + line_number + "[" + line + "]: "
477                             + e.getMessage() );
478                 }
479                 ++_domains_stored;
480             }
481         } // while ( ( line = br.readLine() ) != null )
482         if ( ( current_protein != null ) && ( current_protein.getProteinDomains().size() > 0 ) ) {
483             addProtein( proteins, current_protein );
484         }
485         setProteinsEncountered( prev_queries.size() );
486         setTime( new Date().getTime() - start_time );
487         return proteins;
488     }
489
490     private double parseDouble( final String double_str, final int line_number, final String label ) throws IOException {
491         double d = -1;
492         try {
493             d = Double.valueOf( double_str ).doubleValue();
494         }
495         catch ( final NumberFormatException e ) {
496             throw new IOException( "could not parse \" +label + \" from \"" + double_str + "\" [line " + line_number
497                     + "] in [" + getInputFile().getCanonicalPath() + "]" );
498         }
499         return d;
500     }
501
502     private int parseInt( final String double_str, final int line_number, final String label ) throws IOException {
503         int i = -1;
504         try {
505             i = Integer.valueOf( double_str ).intValue();
506         }
507         catch ( final NumberFormatException e ) {
508             throw new IOException( "could not parse \"" + label + "\" from \"" + double_str + "\" [line " + line_number
509                     + "] in [" + getInputFile().getCanonicalPath() + "]" );
510         }
511         return i;
512     }
513
514     private void setDomainsEncountered( final int domains_encountered ) {
515         _domains_encountered = domains_encountered;
516     }
517
518     private void setDomainsIgnoredDueToDuf( final int domains_ignored_due_to_duf ) {
519         _domains_ignored_due_to_duf = domains_ignored_due_to_duf;
520     }
521
522     private void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) {
523         _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value;
524     }
525
526     private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
527         _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
528     }
529
530     private void setDomainsIgnoredDueToNegativeDomainFilterCountsMap( final Map<String, Integer> domains_ignored_due_to_negative_domain_filter_counts_map ) {
531         _domains_ignored_due_to_negative_domain_filter_counts_map = domains_ignored_due_to_negative_domain_filter_counts_map;
532     }
533
534     private void setDomainsIgnoredDueToNegativeFilter( final int domains_ignored_due_to_negative_domain_filter ) {
535         _domains_ignored_due_to_negative_domain_filter = domains_ignored_due_to_negative_domain_filter;
536     }
537
538     private void setDomainsIgnoredDueToOverlap( final int domains_ignored_due_to_overlap ) {
539         _domains_ignored_due_to_overlap = domains_ignored_due_to_overlap;
540     }
541
542     private void setDomainsIgnoredDueToVirusLikeId( final int i ) {
543         _domains_ignored_due_to_virus_like_id = i;
544     }
545
546     private void setDomainsIgnoredDueToVirusLikeIdCountsMap( final Map<String, Integer> domains_ignored_due_to_virus_like_id_counts_map ) {
547         _domains_ignored_due_to_virus_like_id_counts_map = domains_ignored_due_to_virus_like_id_counts_map;
548     }
549
550     private void setDomainsStored( final int domains_stored ) {
551         _domains_stored = domains_stored;
552     }
553
554     private void setDomainsStoredSet( final SortedSet<DomainId> _storeddomains_stored ) {
555         _domains_stored_set = _storeddomains_stored;
556     }
557
558     public void setEValueMaximum( final double e_value_maximum ) {
559         if ( e_value_maximum < 0.0 ) {
560             throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
561         }
562         _e_value_maximum = e_value_maximum;
563     }
564
565     public void setIgnoreDufs( final boolean ignore_dufs ) {
566         _ignore_dufs = ignore_dufs;
567     }
568
569     /**
570      * To ignore domains which are completely engulfed by domains (individual
571      * ones or stretches of overlapping ones) with better support values.
572      * 
573      * 
574      * @param ignored_engulfed_domains
575      */
576     public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) {
577         _ignore_engulfed_domains = ignore_engulfed_domains;
578     }
579
580     public void setIgnoreVirusLikeIds( final boolean ignore_virus_like_ids ) {
581         _ignore_virus_like_ids = ignore_virus_like_ids;
582     }
583
584     /**
585      * Sets the individual  score cutoff values (for example, gathering
586      * thresholds from Pfam). Domain ids are the keys, cutoffs the values.
587      * 
588      * @param individual_score_cutoffs
589      */
590     public void setIndividualScoreCutoffs( final Map<String, Double> individual_score_cutoffs ) {
591         _individual_score_cutoffs = individual_score_cutoffs;
592     }
593
594     public void setMaxAllowedOverlap( final int max_allowed_overlap ) {
595         if ( max_allowed_overlap < 0 ) {
596             throw new IllegalArgumentException( "Attempt to set max allowed overlap to less than zero." );
597         }
598         _max_allowed_overlap = max_allowed_overlap;
599     }
600
601     private void setProteinsEncountered( final int proteins_encountered ) {
602         _proteins_encountered = proteins_encountered;
603     }
604
605     private void setProteinsIgnoredDueToFilter( final int proteins_ignored_due_to_filter ) {
606         _proteins_ignored_due_to_filter = proteins_ignored_due_to_filter;
607     }
608
609     private void setProteinsStored( final int proteins_stored ) {
610         _proteins_stored = proteins_stored;
611     }
612
613     public void setReturnType( final ReturnType return_type ) {
614         _return_type = return_type;
615     }
616
617     private void setTime( final long time ) {
618         _time = time;
619     }
620
621     public static enum FilterType {
622         NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN
623     }
624
625     static public enum INDIVIDUAL_SCORE_CUTOFF {
626         FULL_SEQUENCE, DOMAIN, NONE;
627     }
628
629     public static enum ReturnType {
630         UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
631     }
632 }