inprogress
[jalview.git] / forester / java / src / org / forester / io / parsers / HmmPfamOutputParser.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26
27 package org.forester.io.parsers;
28
29 import java.io.BufferedReader;
30 import java.io.File;
31 import java.io.FileReader;
32 import java.io.IOException;
33 import java.util.ArrayList;
34 import java.util.Date;
35 import java.util.HashSet;
36 import java.util.List;
37 import java.util.Map;
38 import java.util.Set;
39 import java.util.SortedSet;
40 import java.util.TreeMap;
41 import java.util.TreeSet;
42
43 import org.forester.protein.BasicDomain;
44 import org.forester.protein.BasicProtein;
45 import org.forester.protein.Domain;
46 import org.forester.protein.Protein;
47 import org.forester.util.ForesterUtil;
48
49 public final class HmmPfamOutputParser {
50
51     private static final String     RETRO                       = "RETRO";
52     private static final String     PHAGE                       = "PHAGE";
53     private static final String     VIR                         = "VIR";
54     private static final String     TRANSPOS                    = "TRANSPOS";
55     private static final String     RV                          = "RV";
56     private static final String     GAG                         = "GAG_";
57     private static final String     HCV                         = "HCV_";                                                    // New. Added on Jun 11, after 1st submission.
58     private static final String     HERPES                      = "Herpes_";                                                 // New. Added on Jun 11, after 1st submission.
59     private static final int        E_VALUE_MAXIMUM_DEFAULT     = -1;
60     private static final ReturnType RETURN_TYPE_DEFAULT         = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
61     private static final boolean    IGNORE_DUFS_DEFAULT         = false;
62     private static final int        MAX_ALLOWED_OVERLAP_DEFAULT = -1;
63     private final Set<String>       _filter;
64     private final FilterType        _filter_type;
65     private final File              _input_file;
66     private final String            _species;
67     private double                  _e_value_maximum;
68     private Map<String, String>     _individual_domain_score_cutoffs;
69     private boolean                 _ignore_dufs;
70     private boolean                 _ignore_virus_like_ids;
71     private boolean                 _allow_non_unique_query;
72     private boolean                 _verbose;
73     private int                     _max_allowed_overlap;
74     private boolean                 _ignore_engulfed_domains;
75     private ReturnType              _return_type;
76     private int                     _proteins_encountered;
77     private int                     _proteins_ignored_due_to_filter;
78     private int                     _proteins_stored;
79     private int                     _domains_encountered;
80     private int                     _domains_ignored_due_to_duf;
81     private int                     _domains_ignored_due_to_overlap;
82     private int                     _domains_ignored_due_to_e_value;
83     private int                     _domains_ignored_due_to_individual_score_cutoff;
84     private int                     _domains_stored;
85     private SortedSet<String>       _domains_stored_set;
86     private long                    _time;
87     private int                     _domains_ignored_due_to_negative_domain_filter;
88     private Map<String, Integer>    _domains_ignored_due_to_negative_domain_filter_counts_map;
89     private int                     _domains_ignored_due_to_virus_like_id;
90     private Map<String, Integer>    _domains_ignored_due_to_virus_like_id_counts_map;
91
92     public HmmPfamOutputParser( final File input_file, final String species, final String model_type ) {
93         _input_file = input_file;
94         _species = species;
95         _filter = null;
96         _filter_type = FilterType.NONE;
97         init();
98     }
99
100     public HmmPfamOutputParser( final File input_file,
101                                 final String species,
102                                 final Set<String> filter,
103                                 final FilterType filter_type ) {
104         _input_file = input_file;
105         _species = species;
106         _filter = filter;
107         _filter_type = filter_type;
108         init();
109     }
110
111     private void actuallyAddProtein( final List<Protein> proteins, final Protein current_protein ) {
112         final List<Domain> l = current_protein.getProteinDomains();
113         for( final Domain d : l ) {
114             getDomainsStoredSet().add( d.getDomainId() );
115         }
116         proteins.add( current_protein );
117         ++_proteins_stored;
118     }
119
120     private void addProtein( final List<Protein> proteins, final Protein current_protein ) {
121         if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
122             final Set<String> domain_ids_in_protein = new HashSet<String>();
123             for( final Domain d : current_protein.getProteinDomains() ) {
124                 domain_ids_in_protein.add( d.getDomainId() );
125             }
126             domain_ids_in_protein.retainAll( getFilter() );
127             if ( getFilterType() == FilterType.POSITIVE_PROTEIN ) {
128                 if ( domain_ids_in_protein.size() > 0 ) {
129                     actuallyAddProtein( proteins, current_protein );
130                 }
131                 else {
132                     ++_proteins_ignored_due_to_filter;
133                 }
134             }
135             else {
136                 if ( domain_ids_in_protein.size() < 1 ) {
137                     actuallyAddProtein( proteins, current_protein );
138                 }
139                 else {
140                     ++_proteins_ignored_due_to_filter;
141                 }
142             }
143         }
144         else {
145             actuallyAddProtein( proteins, current_protein );
146         }
147     }
148
149     public int getDomainsEncountered() {
150         return _domains_encountered;
151     }
152
153     public int getDomainsIgnoredDueToDuf() {
154         return _domains_ignored_due_to_duf;
155     }
156
157     public int getDomainsIgnoredDueToEval() {
158         return _domains_ignored_due_to_e_value;
159     }
160
161     public int getDomainsIgnoredDueToIndividualScoreCutoff() {
162         return _domains_ignored_due_to_individual_score_cutoff;
163     }
164
165     public int getDomainsIgnoredDueToNegativeDomainFilter() {
166         return _domains_ignored_due_to_negative_domain_filter;
167     }
168
169     public Map<String, Integer> getDomainsIgnoredDueToNegativeDomainFilterCountsMap() {
170         return _domains_ignored_due_to_negative_domain_filter_counts_map;
171     }
172
173     public int getDomainsIgnoredDueToOverlap() {
174         return _domains_ignored_due_to_overlap;
175     }
176
177     public Map<String, Integer> getDomainsIgnoredDueToVirusLikeIdCountsMap() {
178         return _domains_ignored_due_to_virus_like_id_counts_map;
179     }
180
181     public int getDomainsIgnoredDueToVirusLikeIds() {
182         return _domains_ignored_due_to_virus_like_id;
183     }
184
185     public int getDomainsStored() {
186         return _domains_stored;
187     }
188
189     public SortedSet<String> getDomainsStoredSet() {
190         return _domains_stored_set;
191     }
192
193     private double getEValueMaximum() {
194         return _e_value_maximum;
195     }
196
197     private Set<String> getFilter() {
198         return _filter;
199     }
200
201     private FilterType getFilterType() {
202         return _filter_type;
203     }
204
205     private Map<String, String> getIndividualDomainScoreCutoffs() {
206         return _individual_domain_score_cutoffs;
207     }
208
209     private File getInputFile() {
210         return _input_file;
211     }
212
213     private int getMaxAllowedOverlap() {
214         return _max_allowed_overlap;
215     }
216
217     public int getProteinsEncountered() {
218         return _proteins_encountered;
219     }
220
221     public int getProteinsIgnoredDueToFilter() {
222         return _proteins_ignored_due_to_filter;
223     }
224
225     public int getProteinsStored() {
226         return _proteins_stored;
227     }
228
229     private ReturnType getReturnType() {
230         return _return_type;
231     }
232
233     private String getSpecies() {
234         return _species;
235     }
236
237     public long getTime() {
238         return _time;
239     }
240
241     private void init() {
242         _e_value_maximum = HmmPfamOutputParser.E_VALUE_MAXIMUM_DEFAULT;
243         setIgnoreDufs( HmmPfamOutputParser.IGNORE_DUFS_DEFAULT );
244         setReturnType( HmmPfamOutputParser.RETURN_TYPE_DEFAULT );
245         _max_allowed_overlap = HmmPfamOutputParser.MAX_ALLOWED_OVERLAP_DEFAULT;
246         setIndividualDomainScoreCutoffs( null );
247         setIgnoreEngulfedDomains( false );
248         setIgnoreVirusLikeIds( false );
249         setAllowNonUniqueQuery( false );
250         setVerbose( false );
251         intitCounts();
252     }
253
254     private void intitCounts() {
255         setDomainsStoredSet( new TreeSet<String>() );
256         setDomainsEncountered( 0 );
257         setProteinsEncountered( 0 );
258         setProteinsIgnoredDueToFilter( 0 );
259         setDomainsIgnoredDueToNegativeFilter( 0 );
260         setDomainsIgnoredDueToDuf( 0 );
261         setDomainsIgnoredDueToEval( 0 );
262         setDomainsIgnoredDueToIndividualScoreCutoff( 0 );
263         setDomainsIgnoredDueToVirusLikeId( 0 );
264         setDomainsIgnoredDueToOverlap( 0 );
265         setDomainsStored( 0 );
266         setProteinsStored( 0 );
267         setTime( 0 );
268         setDomainsIgnoredDueToVirusLikeIdCountsMap( new TreeMap<String, Integer>() );
269         setDomainsIgnoredDueToNegativeDomainFilterCountsMap( new TreeMap<String, Integer>() );
270     }
271
272     private boolean isAllowNonUniqueQuery() {
273         return _allow_non_unique_query;
274     }
275
276     private boolean isIgnoreDufs() {
277         return _ignore_dufs;
278     }
279
280     private boolean isIgnoreEngulfedDomains() {
281         return _ignore_engulfed_domains;
282     }
283
284     private boolean isIgnoreVirusLikeIds() {
285         return _ignore_virus_like_ids;
286     }
287
288     private boolean isVerbose() {
289         return _verbose;
290     }
291
292     public List<Protein> parse() throws IOException {
293         intitCounts();
294         final Set<String> queries = new HashSet<String>();
295         final String error = ForesterUtil.isReadableFile( getInputFile() );
296         if ( !ForesterUtil.isEmpty( error ) ) {
297             throw new IOException( error );
298         }
299         final BufferedReader br = new BufferedReader( new FileReader( getInputFile() ) );
300         String line;
301         final List<Protein> proteins = new ArrayList<Protein>();
302         Protein current_protein = null;
303         int line_number = 0;
304         boolean saw_double_slash = true;
305         boolean can_parse_domains = false;
306         boolean saw_parsed_for_domains = false;
307         boolean saw_query_sequence = false;
308         boolean was_not_unique = false;
309         final long start_time = new Date().getTime();
310         while ( ( line = br.readLine() ) != null ) {
311             line_number++;
312             if ( line.length() < 1 ) {
313                 continue;
314             }
315             else if ( line.startsWith( "Query sequence:" ) ) {
316                 ++_proteins_encountered;
317                 if ( !saw_double_slash ) {
318                     throw new IOException( "unexpected format [line " + line_number + "] in ["
319                             + getInputFile().getCanonicalPath() + "]" );
320                 }
321                 saw_double_slash = false;
322                 saw_query_sequence = true;
323                 was_not_unique = false;
324                 final String query = line.substring( 16 ).trim();
325                 if ( ForesterUtil.isEmpty( query ) ) {
326                     throw new IOException( "query sequence cannot be empty [line " + line_number + "] in ["
327                             + getInputFile().getCanonicalPath() + "]" );
328                 }
329                 if ( queries.contains( query ) ) {
330                     if ( !isAllowNonUniqueQuery() ) {
331                         throw new IOException( "query \"" + query + "\" is not unique [line " + line_number + "] in ["
332                                 + getInputFile().getCanonicalPath() + "]" );
333                     }
334                     else if ( isVerbose() ) {
335                         ForesterUtil.printWarningMessage( getClass().getName(), "query \"" + query
336                                 + "\" is not unique [line " + line_number + "] in ["
337                                 + getInputFile().getCanonicalPath() + "]" );
338                     }
339                 }
340                 else {
341                     queries.add( query );
342                 }
343                 if ( current_protein != null ) {
344                     throw new IOException( "unexpected format [line " + line_number + "] in ["
345                             + getInputFile().getCanonicalPath() + "]" );
346                 }
347                 if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
348                     current_protein = new BasicProtein( query, getSpecies(), 0 );
349                 }
350                 else {
351                     throw new IllegalArgumentException( "unknown return type" );
352                 }
353             }
354             else if ( line.startsWith( "Accession:" ) ) {
355                 if ( !saw_query_sequence || ( current_protein == null ) ) {
356                     throw new IOException( "unexpected format [line " + line_number + "] in ["
357                             + getInputFile().getCanonicalPath() + "]" );
358                 }
359                 ( ( BasicProtein ) current_protein ).setAccession( line.substring( 11 ).trim() );
360             }
361             else if ( line.startsWith( "Description:" ) ) {
362                 if ( !saw_query_sequence || ( current_protein == null ) ) {
363                     throw new IOException( "unexpected format [line " + line_number + "] in ["
364                             + getInputFile().getCanonicalPath() + "]" );
365                 }
366                 if ( was_not_unique ) {
367                     if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
368                         current_protein = new BasicProtein( current_protein.getProteinId() + " "
369                                 + line.substring( 13 ).trim(), getSpecies(), 0 );
370                     }
371                 }
372                 else {
373                     ( ( BasicProtein ) current_protein ).setDescription( line.substring( 13 ).trim() );
374                 }
375             }
376             else if ( line.startsWith( "Parsed for domains:" ) ) {
377                 if ( !saw_query_sequence ) {
378                     throw new IOException( "unexpected format [line " + line_number + "] in ["
379                             + getInputFile().getCanonicalPath() + "]" );
380                 }
381                 saw_query_sequence = false;
382                 saw_parsed_for_domains = true;
383             }
384             else if ( saw_parsed_for_domains && line.startsWith( "--------" ) ) {
385                 can_parse_domains = true;
386                 saw_parsed_for_domains = false;
387             }
388             else if ( line.startsWith( "Alignments of top-scoring domains:" ) ) {
389                 if ( !can_parse_domains ) {
390                     throw new IOException( "unexpected format [line " + line_number + "] in ["
391                             + getInputFile().getCanonicalPath() + "]" );
392                 }
393                 can_parse_domains = false;
394             }
395             else if ( line.startsWith( "//" ) ) {
396                 can_parse_domains = false;
397                 saw_double_slash = true;
398                 if ( current_protein.getProteinDomains().size() > 0 ) {
399                     if ( ( getMaxAllowedOverlap() != HmmPfamOutputParser.MAX_ALLOWED_OVERLAP_DEFAULT )
400                             || isIgnoreEngulfedDomains() ) {
401                         final int domains_count = current_protein.getNumberOfProteinDomains();
402                         current_protein = ForesterUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
403                                                                                  isIgnoreEngulfedDomains(),
404                                                                                  current_protein );
405                         final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains();
406                         _domains_stored -= domains_removed;
407                         _domains_ignored_due_to_overlap += domains_removed;
408                     }
409                     addProtein( proteins, current_protein );
410                 }
411                 current_protein = null;
412             }
413             else if ( can_parse_domains && ( line.indexOf( "[no hits above thresholds]" ) == -1 ) ) {
414                 final String[] s = line.split( "\\s+" );
415                 if ( s.length != 10 ) {
416                     throw new IOException( "unexpected format in hmmpfam output:  \"" + line + "\" [line "
417                             + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
418                 }
419                 final String id = s[ 0 ];
420                 final String domain_count_str = s[ 1 ];
421                 final String from_str = s[ 2 ];
422                 final String to_str = s[ 3 ];
423                 final String query_match_str = s[ 4 ];
424                 final String hmm_match_str = s[ 7 ];
425                 final String score_str = s[ 8 ];
426                 final String e_value_str = s[ 9 ];
427                 int from = -1;
428                 int to = -1;
429                 double e_value = -1;
430                 double score = -1;
431                 try {
432                     from = Integer.valueOf( from_str ).intValue();
433                 }
434                 catch ( final NumberFormatException e ) {
435                     throw new IOException( "could not parse seq-f from \"" + line + "\" [line " + line_number
436                             + "] in [" + getInputFile().getCanonicalPath() + "]" );
437                 }
438                 try {
439                     to = Integer.valueOf( to_str ).intValue();
440                 }
441                 catch ( final NumberFormatException e ) {
442                     throw new IOException( "could not parse seq-t from \"" + line + "\" [line " + line_number
443                             + "] in [" + getInputFile().getCanonicalPath() + "]" );
444                 }
445                 try {
446                     score = Double.valueOf( score_str ).doubleValue();
447                 }
448                 catch ( final NumberFormatException e ) {
449                     throw new IOException( "could not parse score from \"" + line + "\" [line " + line_number
450                             + "] in [" + getInputFile().getCanonicalPath() + "]" );
451                 }
452                 try {
453                     e_value = Double.valueOf( e_value_str ).doubleValue();
454                 }
455                 catch ( final NumberFormatException e ) {
456                     throw new IOException( "could not parse E-value from \"" + line + "\" [line " + line_number
457                             + "] in [" + getInputFile().getCanonicalPath() + "]" );
458                 }
459                 if ( hmm_match_str.equals( "[]" ) ) {
460                     //is_complete_hmm_match = true;
461                 }
462                 else if ( !( hmm_match_str.equals( ".]" ) || hmm_match_str.equals( "[." ) || hmm_match_str
463                         .equals( ".." ) ) ) {
464                     throw new IOException( "unexpected format in hmmpfam output:  \"" + line + "\" [line "
465                             + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
466                 }
467                 if ( query_match_str.equals( ".." ) ) {
468                     // is_complete_query_match = true;
469                 }
470                 else if ( !( query_match_str.equals( ".]" ) || query_match_str.equals( "[." ) || query_match_str
471                         .equals( "[]" ) ) ) {
472                     throw new IOException( "unexpected format in hmmpfam output:  \"" + line + "\" [line "
473                             + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
474                 }
475                 final String[] ct = domain_count_str.split( "/" );
476                 if ( ct.length != 2 ) {
477                     throw new IOException( "unexpected format in hmmpfam output:  \"" + line + "\" [line "
478                             + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
479                 }
480                 final String number_str = ct[ 0 ];
481                 final String total_str = ct[ 1 ];
482                 int number = -1;
483                 int total = -1;
484                 try {
485                     number = Integer.valueOf( ( number_str ) ).intValue();
486                 }
487                 catch ( final NumberFormatException e ) {
488                     throw new IOException( "could not parse domain number from \"" + line + "\" [line " + line_number
489                             + "] in [" + getInputFile().getCanonicalPath() + "]" );
490                 }
491                 try {
492                     total = Integer.valueOf( ( total_str ) ).intValue();
493                 }
494                 catch ( final NumberFormatException e ) {
495                     throw new IOException( "could not parse domain count from \"" + line + "\" [line " + line_number
496                             + "] in [" + getInputFile().getCanonicalPath() + "]" );
497                 }
498                 ++_domains_encountered;
499                 boolean failed_cutoff = false;
500                 if ( getIndividualDomainScoreCutoffs() != null ) {
501                     if ( getIndividualDomainScoreCutoffs().containsKey( id ) ) {
502                         final double cutoff = Double.parseDouble( getIndividualDomainScoreCutoffs().get( id ) );
503                         if ( score < cutoff ) {
504                             failed_cutoff = true;
505                         }
506                     }
507                     else {
508                         throw new IOException( "could not find a score cutoff value for domain id \"" + id
509                                 + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
510                     }
511                 }
512                 final String uc_id = id.toUpperCase();
513                 if ( failed_cutoff ) {
514                     ++_domains_ignored_due_to_individual_score_cutoff;
515                 }
516                 else if ( ( getEValueMaximum() != HmmPfamOutputParser.E_VALUE_MAXIMUM_DEFAULT )
517                         && ( e_value > getEValueMaximum() ) ) {
518                     ++_domains_ignored_due_to_e_value;
519                 }
520                 else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
521                     ++_domains_ignored_due_to_duf;
522                 }
523                 else if ( isIgnoreVirusLikeIds()
524                         && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
525                                 || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )
526                                 || uc_id.startsWith( HCV ) || uc_id.startsWith( HERPES ) ) ) {
527                     ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), id );
528                     ++_domains_ignored_due_to_virus_like_id;
529                 }
530                 else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) && getFilter().contains( id ) ) {
531                     ++_domains_ignored_due_to_negative_domain_filter;
532                     ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), id );
533                 }
534                 else {
535                     final BasicDomain pd = new BasicDomain( id,
536                                                             from,
537                                                             to,
538                                                             ( short ) number,
539                                                             ( short ) total,
540                                                             e_value,
541                                                             score );
542                     current_protein.addProteinDomain( pd );
543                     ++_domains_stored;
544                 }
545             }
546         } // while ( ( line = br.readLine() ) != null )
547         setTime( new Date().getTime() - start_time );
548         if ( !saw_double_slash ) {
549             throw new IOException( "file ends unexpectedly [line " + line_number + "]" );
550         }
551         return proteins;
552     }
553
554     public void setAllowNonUniqueQuery( final boolean allow_non_unique_query ) {
555         _allow_non_unique_query = allow_non_unique_query;
556     }
557
558     private void setDomainsEncountered( final int domains_encountered ) {
559         _domains_encountered = domains_encountered;
560     }
561
562     private void setDomainsIgnoredDueToDuf( final int domains_ignored_due_to_duf ) {
563         _domains_ignored_due_to_duf = domains_ignored_due_to_duf;
564     }
565
566     public void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) {
567         _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value;
568     }
569
570     public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
571         _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
572     }
573
574     private void setDomainsIgnoredDueToNegativeDomainFilterCountsMap( final Map<String, Integer> domains_ignored_due_to_negative_domain_filter_counts_map ) {
575         _domains_ignored_due_to_negative_domain_filter_counts_map = domains_ignored_due_to_negative_domain_filter_counts_map;
576     }
577
578     private void setDomainsIgnoredDueToNegativeFilter( final int domains_ignored_due_to_negative_domain_filter ) {
579         _domains_ignored_due_to_negative_domain_filter = domains_ignored_due_to_negative_domain_filter;
580     }
581
582     private void setDomainsIgnoredDueToOverlap( final int domains_ignored_due_to_overlap ) {
583         _domains_ignored_due_to_overlap = domains_ignored_due_to_overlap;
584     }
585
586     private void setDomainsIgnoredDueToVirusLikeId( final int i ) {
587         _domains_ignored_due_to_virus_like_id = i;
588     }
589
590     private void setDomainsIgnoredDueToVirusLikeIdCountsMap( final Map<String, Integer> domains_ignored_due_to_virus_like_id_counts_map ) {
591         _domains_ignored_due_to_virus_like_id_counts_map = domains_ignored_due_to_virus_like_id_counts_map;
592     }
593
594     private void setDomainsStored( final int domains_stored ) {
595         _domains_stored = domains_stored;
596     }
597
598     private void setDomainsStoredSet( final SortedSet<String> _storeddomains_stored ) {
599         _domains_stored_set = _storeddomains_stored;
600     }
601
602     public void setEValueMaximum( final double e_value_maximum ) {
603         if ( e_value_maximum < 0.0 ) {
604             throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
605         }
606         _e_value_maximum = e_value_maximum;
607     }
608
609     public void setIgnoreDufs( final boolean ignore_dufs ) {
610         _ignore_dufs = ignore_dufs;
611     }
612
613     /**
614      * To ignore domains which are completely engulfed by domains (individual
615      * ones or stretches of overlapping ones) with better support values.
616      * 
617      * 
618      * @param ignored_engulfed_domains
619      */
620     public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) {
621         _ignore_engulfed_domains = ignore_engulfed_domains;
622     }
623
624     public void setIgnoreVirusLikeIds( final boolean ignore_virus_like_ids ) {
625         _ignore_virus_like_ids = ignore_virus_like_ids;
626     }
627
628     /**
629      * Sets the individual domain score cutoff values (for example, gathering
630      * thresholds from Pfam). Domain ids are the keys, cutoffs the values.
631      * 
632      * @param individual_domain_score_cutoffs
633      */
634     public void setIndividualDomainScoreCutoffs( final Map<String, String> individual_domain_score_cutoffs ) {
635         _individual_domain_score_cutoffs = individual_domain_score_cutoffs;
636     }
637
638     public void setMaxAllowedOverlap( final int max_allowed_overlap ) {
639         if ( max_allowed_overlap < 0 ) {
640             throw new IllegalArgumentException( "Attempt to set max allowed overlap to less than zero." );
641         }
642         _max_allowed_overlap = max_allowed_overlap;
643     }
644
645     private void setProteinsEncountered( final int proteins_encountered ) {
646         _proteins_encountered = proteins_encountered;
647     }
648
649     private void setProteinsIgnoredDueToFilter( final int proteins_ignored_due_to_filter ) {
650         _proteins_ignored_due_to_filter = proteins_ignored_due_to_filter;
651     }
652
653     private void setProteinsStored( final int proteins_stored ) {
654         _proteins_stored = proteins_stored;
655     }
656
657     public void setReturnType( final ReturnType return_type ) {
658         _return_type = return_type;
659     }
660
661     private void setTime( final long time ) {
662         _time = time;
663     }
664
665     public void setVerbose( final boolean verbose ) {
666         _verbose = verbose;
667     }
668
669     public static enum FilterType {
670         NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN
671     }
672
673     public static enum ReturnType {
674         UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
675     }
676 }