// FORESTER -- software libraries and applications
// for evolutionary biology research and applications.
//
// Copyright (C) 2008-2009 Christian M. Zmasek
// Copyright (C) 2008-2009 Burnham Institute for Medical Research
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
package org.forester.io.parsers;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import org.forester.protein.BasicDomain;
import org.forester.protein.BasicProtein;
import org.forester.protein.Domain;
import org.forester.protein.Protein;
import org.forester.util.ForesterUtil;
49 public final class HmmPfamOutputParser {
51 private static final String RETRO = "RETRO";
52 private static final String PHAGE = "PHAGE";
53 private static final String VIR = "VIR";
54 private static final String TRANSPOS = "TRANSPOS";
55 private static final String RV = "RV";
56 private static final String GAG = "GAG_";
57 private static final String HCV = "HCV_"; // New. Added on Jun 11, after 1st submission.
58 private static final String HERPES = "Herpes_"; // New. Added on Jun 11, after 1st submission.
59 private static final int E_VALUE_MAXIMUM_DEFAULT = -1;
60 private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
61 private static final boolean IGNORE_DUFS_DEFAULT = false;
62 private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1;
63 private final Set<String> _filter;
64 private final FilterType _filter_type;
65 private final File _input_file;
66 private final String _species;
67 private double _e_value_maximum;
68 private Map<String, String> _individual_domain_score_cutoffs;
69 private boolean _ignore_dufs;
70 private boolean _ignore_virus_like_ids;
71 private boolean _allow_non_unique_query;
72 private boolean _verbose;
73 private int _max_allowed_overlap;
74 private boolean _ignore_engulfed_domains;
75 private ReturnType _return_type;
76 private int _proteins_encountered;
77 private int _proteins_ignored_due_to_filter;
78 private int _proteins_stored;
79 private int _domains_encountered;
80 private int _domains_ignored_due_to_duf;
81 private int _domains_ignored_due_to_overlap;
82 private int _domains_ignored_due_to_e_value;
83 private int _domains_ignored_due_to_individual_score_cutoff;
84 private int _domains_stored;
85 private SortedSet<String> _domains_stored_set;
87 private int _domains_ignored_due_to_negative_domain_filter;
88 private Map<String, Integer> _domains_ignored_due_to_negative_domain_filter_counts_map;
89 private int _domains_ignored_due_to_virus_like_id;
90 private Map<String, Integer> _domains_ignored_due_to_virus_like_id_counts_map;
92 public HmmPfamOutputParser( final File input_file, final String species, final String model_type ) {
93 _input_file = input_file;
96 _filter_type = FilterType.NONE;
100 public HmmPfamOutputParser( final File input_file,
101 final String species,
102 final Set<String> filter,
103 final FilterType filter_type ) {
104 _input_file = input_file;
107 _filter_type = filter_type;
111 private void actuallyAddProtein( final List<Protein> proteins, final Protein current_protein ) {
112 final List<Domain> l = current_protein.getProteinDomains();
113 for( final Domain d : l ) {
114 getDomainsStoredSet().add( d.getDomainId() );
116 proteins.add( current_protein );
120 private void addProtein( final List<Protein> proteins, final Protein current_protein ) {
121 if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
122 final Set<String> domain_ids_in_protein = new HashSet<String>();
123 for( final Domain d : current_protein.getProteinDomains() ) {
124 domain_ids_in_protein.add( d.getDomainId() );
126 domain_ids_in_protein.retainAll( getFilter() );
127 if ( getFilterType() == FilterType.POSITIVE_PROTEIN ) {
128 if ( domain_ids_in_protein.size() > 0 ) {
129 actuallyAddProtein( proteins, current_protein );
132 ++_proteins_ignored_due_to_filter;
136 if ( domain_ids_in_protein.size() < 1 ) {
137 actuallyAddProtein( proteins, current_protein );
140 ++_proteins_ignored_due_to_filter;
145 actuallyAddProtein( proteins, current_protein );
149 public int getDomainsEncountered() {
150 return _domains_encountered;
153 public int getDomainsIgnoredDueToDuf() {
154 return _domains_ignored_due_to_duf;
157 public int getDomainsIgnoredDueToEval() {
158 return _domains_ignored_due_to_e_value;
161 public int getDomainsIgnoredDueToIndividualScoreCutoff() {
162 return _domains_ignored_due_to_individual_score_cutoff;
165 public int getDomainsIgnoredDueToNegativeDomainFilter() {
166 return _domains_ignored_due_to_negative_domain_filter;
169 public Map<String, Integer> getDomainsIgnoredDueToNegativeDomainFilterCountsMap() {
170 return _domains_ignored_due_to_negative_domain_filter_counts_map;
173 public int getDomainsIgnoredDueToOverlap() {
174 return _domains_ignored_due_to_overlap;
177 public Map<String, Integer> getDomainsIgnoredDueToVirusLikeIdCountsMap() {
178 return _domains_ignored_due_to_virus_like_id_counts_map;
181 public int getDomainsIgnoredDueToVirusLikeIds() {
182 return _domains_ignored_due_to_virus_like_id;
185 public int getDomainsStored() {
186 return _domains_stored;
189 public SortedSet<String> getDomainsStoredSet() {
190 return _domains_stored_set;
193 private double getEValueMaximum() {
194 return _e_value_maximum;
197 private Set<String> getFilter() {
201 private FilterType getFilterType() {
205 private Map<String, String> getIndividualDomainScoreCutoffs() {
206 return _individual_domain_score_cutoffs;
209 private File getInputFile() {
213 private int getMaxAllowedOverlap() {
214 return _max_allowed_overlap;
217 public int getProteinsEncountered() {
218 return _proteins_encountered;
221 public int getProteinsIgnoredDueToFilter() {
222 return _proteins_ignored_due_to_filter;
225 public int getProteinsStored() {
226 return _proteins_stored;
229 private ReturnType getReturnType() {
233 private String getSpecies() {
237 public long getTime() {
241 private void init() {
242 _e_value_maximum = HmmPfamOutputParser.E_VALUE_MAXIMUM_DEFAULT;
243 setIgnoreDufs( HmmPfamOutputParser.IGNORE_DUFS_DEFAULT );
244 setReturnType( HmmPfamOutputParser.RETURN_TYPE_DEFAULT );
245 _max_allowed_overlap = HmmPfamOutputParser.MAX_ALLOWED_OVERLAP_DEFAULT;
246 setIndividualDomainScoreCutoffs( null );
247 setIgnoreEngulfedDomains( false );
248 setIgnoreVirusLikeIds( false );
249 setAllowNonUniqueQuery( false );
254 private void intitCounts() {
255 setDomainsStoredSet( new TreeSet<String>() );
256 setDomainsEncountered( 0 );
257 setProteinsEncountered( 0 );
258 setProteinsIgnoredDueToFilter( 0 );
259 setDomainsIgnoredDueToNegativeFilter( 0 );
260 setDomainsIgnoredDueToDuf( 0 );
261 setDomainsIgnoredDueToEval( 0 );
262 setDomainsIgnoredDueToIndividualScoreCutoff( 0 );
263 setDomainsIgnoredDueToVirusLikeId( 0 );
264 setDomainsIgnoredDueToOverlap( 0 );
265 setDomainsStored( 0 );
266 setProteinsStored( 0 );
268 setDomainsIgnoredDueToVirusLikeIdCountsMap( new TreeMap<String, Integer>() );
269 setDomainsIgnoredDueToNegativeDomainFilterCountsMap( new TreeMap<String, Integer>() );
272 private boolean isAllowNonUniqueQuery() {
273 return _allow_non_unique_query;
276 private boolean isIgnoreDufs() {
280 private boolean isIgnoreEngulfedDomains() {
281 return _ignore_engulfed_domains;
284 private boolean isIgnoreVirusLikeIds() {
285 return _ignore_virus_like_ids;
288 private boolean isVerbose() {
292 public List<Protein> parse() throws IOException {
294 final Set<String> queries = new HashSet<String>();
295 final String error = ForesterUtil.isReadableFile( getInputFile() );
296 if ( !ForesterUtil.isEmpty( error ) ) {
297 throw new IOException( error );
299 final BufferedReader br = new BufferedReader( new FileReader( getInputFile() ) );
301 final List<Protein> proteins = new ArrayList<Protein>();
302 Protein current_protein = null;
304 boolean saw_double_slash = true;
305 boolean can_parse_domains = false;
306 boolean saw_parsed_for_domains = false;
307 boolean saw_query_sequence = false;
308 boolean was_not_unique = false;
309 final long start_time = new Date().getTime();
310 while ( ( line = br.readLine() ) != null ) {
312 if ( line.length() < 1 ) {
315 else if ( line.startsWith( "Query sequence:" ) ) {
316 ++_proteins_encountered;
317 if ( !saw_double_slash ) {
318 throw new IOException( "unexpected format [line " + line_number + "] in ["
319 + getInputFile().getCanonicalPath() + "]" );
321 saw_double_slash = false;
322 saw_query_sequence = true;
323 was_not_unique = false;
324 final String query = line.substring( 16 ).trim();
325 if ( ForesterUtil.isEmpty( query ) ) {
326 throw new IOException( "query sequence cannot be empty [line " + line_number + "] in ["
327 + getInputFile().getCanonicalPath() + "]" );
329 if ( queries.contains( query ) ) {
330 if ( !isAllowNonUniqueQuery() ) {
331 throw new IOException( "query \"" + query + "\" is not unique [line " + line_number + "] in ["
332 + getInputFile().getCanonicalPath() + "]" );
334 else if ( isVerbose() ) {
335 ForesterUtil.printWarningMessage( getClass().getName(), "query \"" + query
336 + "\" is not unique [line " + line_number + "] in ["
337 + getInputFile().getCanonicalPath() + "]" );
341 queries.add( query );
343 if ( current_protein != null ) {
344 throw new IOException( "unexpected format [line " + line_number + "] in ["
345 + getInputFile().getCanonicalPath() + "]" );
347 if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
348 current_protein = new BasicProtein( query, getSpecies(), 0 );
351 throw new IllegalArgumentException( "unknown return type" );
354 else if ( line.startsWith( "Accession:" ) ) {
355 if ( !saw_query_sequence || ( current_protein == null ) ) {
356 throw new IOException( "unexpected format [line " + line_number + "] in ["
357 + getInputFile().getCanonicalPath() + "]" );
359 ( ( BasicProtein ) current_protein ).setAccession( line.substring( 11 ).trim() );
361 else if ( line.startsWith( "Description:" ) ) {
362 if ( !saw_query_sequence || ( current_protein == null ) ) {
363 throw new IOException( "unexpected format [line " + line_number + "] in ["
364 + getInputFile().getCanonicalPath() + "]" );
366 if ( was_not_unique ) {
367 if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
368 current_protein = new BasicProtein( current_protein.getProteinId() + " "
369 + line.substring( 13 ).trim(), getSpecies(), 0 );
373 ( ( BasicProtein ) current_protein ).setDescription( line.substring( 13 ).trim() );
376 else if ( line.startsWith( "Parsed for domains:" ) ) {
377 if ( !saw_query_sequence ) {
378 throw new IOException( "unexpected format [line " + line_number + "] in ["
379 + getInputFile().getCanonicalPath() + "]" );
381 saw_query_sequence = false;
382 saw_parsed_for_domains = true;
384 else if ( saw_parsed_for_domains && line.startsWith( "--------" ) ) {
385 can_parse_domains = true;
386 saw_parsed_for_domains = false;
388 else if ( line.startsWith( "Alignments of top-scoring domains:" ) ) {
389 if ( !can_parse_domains ) {
390 throw new IOException( "unexpected format [line " + line_number + "] in ["
391 + getInputFile().getCanonicalPath() + "]" );
393 can_parse_domains = false;
395 else if ( line.startsWith( "//" ) ) {
396 can_parse_domains = false;
397 saw_double_slash = true;
398 if ( current_protein.getProteinDomains().size() > 0 ) {
399 if ( ( getMaxAllowedOverlap() != HmmPfamOutputParser.MAX_ALLOWED_OVERLAP_DEFAULT )
400 || isIgnoreEngulfedDomains() ) {
401 final int domains_count = current_protein.getNumberOfProteinDomains();
402 current_protein = ForesterUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
403 isIgnoreEngulfedDomains(),
405 final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains();
406 _domains_stored -= domains_removed;
407 _domains_ignored_due_to_overlap += domains_removed;
409 addProtein( proteins, current_protein );
411 current_protein = null;
413 else if ( can_parse_domains && ( line.indexOf( "[no hits above thresholds]" ) == -1 ) ) {
414 final String[] s = line.split( "\\s+" );
415 if ( s.length != 10 ) {
416 throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line "
417 + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
419 final String id = s[ 0 ];
420 final String domain_count_str = s[ 1 ];
421 final String from_str = s[ 2 ];
422 final String to_str = s[ 3 ];
423 final String query_match_str = s[ 4 ];
424 final String hmm_match_str = s[ 7 ];
425 final String score_str = s[ 8 ];
426 final String e_value_str = s[ 9 ];
432 from = Integer.valueOf( from_str ).intValue();
434 catch ( final NumberFormatException e ) {
435 throw new IOException( "could not parse seq-f from \"" + line + "\" [line " + line_number
436 + "] in [" + getInputFile().getCanonicalPath() + "]" );
439 to = Integer.valueOf( to_str ).intValue();
441 catch ( final NumberFormatException e ) {
442 throw new IOException( "could not parse seq-t from \"" + line + "\" [line " + line_number
443 + "] in [" + getInputFile().getCanonicalPath() + "]" );
446 score = Double.valueOf( score_str ).doubleValue();
448 catch ( final NumberFormatException e ) {
449 throw new IOException( "could not parse score from \"" + line + "\" [line " + line_number
450 + "] in [" + getInputFile().getCanonicalPath() + "]" );
453 e_value = Double.valueOf( e_value_str ).doubleValue();
455 catch ( final NumberFormatException e ) {
456 throw new IOException( "could not parse E-value from \"" + line + "\" [line " + line_number
457 + "] in [" + getInputFile().getCanonicalPath() + "]" );
459 if ( hmm_match_str.equals( "[]" ) ) {
460 //is_complete_hmm_match = true;
462 else if ( !( hmm_match_str.equals( ".]" ) || hmm_match_str.equals( "[." ) || hmm_match_str
463 .equals( ".." ) ) ) {
464 throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line "
465 + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
467 if ( query_match_str.equals( ".." ) ) {
468 // is_complete_query_match = true;
470 else if ( !( query_match_str.equals( ".]" ) || query_match_str.equals( "[." ) || query_match_str
471 .equals( "[]" ) ) ) {
472 throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line "
473 + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
475 final String[] ct = domain_count_str.split( "/" );
476 if ( ct.length != 2 ) {
477 throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line "
478 + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
480 final String number_str = ct[ 0 ];
481 final String total_str = ct[ 1 ];
485 number = Integer.valueOf( ( number_str ) ).intValue();
487 catch ( final NumberFormatException e ) {
488 throw new IOException( "could not parse domain number from \"" + line + "\" [line " + line_number
489 + "] in [" + getInputFile().getCanonicalPath() + "]" );
492 total = Integer.valueOf( ( total_str ) ).intValue();
494 catch ( final NumberFormatException e ) {
495 throw new IOException( "could not parse domain count from \"" + line + "\" [line " + line_number
496 + "] in [" + getInputFile().getCanonicalPath() + "]" );
498 ++_domains_encountered;
499 boolean failed_cutoff = false;
500 if ( getIndividualDomainScoreCutoffs() != null ) {
501 if ( getIndividualDomainScoreCutoffs().containsKey( id ) ) {
502 final double cutoff = Double.parseDouble( getIndividualDomainScoreCutoffs().get( id ) );
503 if ( score < cutoff ) {
504 failed_cutoff = true;
508 throw new IOException( "could not find a score cutoff value for domain id \"" + id
509 + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
512 final String uc_id = id.toUpperCase();
513 if ( failed_cutoff ) {
514 ++_domains_ignored_due_to_individual_score_cutoff;
516 else if ( ( getEValueMaximum() != HmmPfamOutputParser.E_VALUE_MAXIMUM_DEFAULT )
517 && ( e_value > getEValueMaximum() ) ) {
518 ++_domains_ignored_due_to_e_value;
520 else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
521 ++_domains_ignored_due_to_duf;
523 else if ( isIgnoreVirusLikeIds()
524 && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
525 || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )
526 || uc_id.startsWith( HCV ) || uc_id.startsWith( HERPES ) ) ) {
527 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), id );
528 ++_domains_ignored_due_to_virus_like_id;
530 else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) && getFilter().contains( id ) ) {
531 ++_domains_ignored_due_to_negative_domain_filter;
532 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), id );
535 final BasicDomain pd = new BasicDomain( id,
542 current_protein.addProteinDomain( pd );
546 } // while ( ( line = br.readLine() ) != null )
547 setTime( new Date().getTime() - start_time );
548 if ( !saw_double_slash ) {
549 throw new IOException( "file ends unexpectedly [line " + line_number + "]" );
554 public void setAllowNonUniqueQuery( final boolean allow_non_unique_query ) {
555 _allow_non_unique_query = allow_non_unique_query;
558 private void setDomainsEncountered( final int domains_encountered ) {
559 _domains_encountered = domains_encountered;
562 private void setDomainsIgnoredDueToDuf( final int domains_ignored_due_to_duf ) {
563 _domains_ignored_due_to_duf = domains_ignored_due_to_duf;
566 public void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) {
567 _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value;
570 public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
571 _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
574 private void setDomainsIgnoredDueToNegativeDomainFilterCountsMap( final Map<String, Integer> domains_ignored_due_to_negative_domain_filter_counts_map ) {
575 _domains_ignored_due_to_negative_domain_filter_counts_map = domains_ignored_due_to_negative_domain_filter_counts_map;
578 private void setDomainsIgnoredDueToNegativeFilter( final int domains_ignored_due_to_negative_domain_filter ) {
579 _domains_ignored_due_to_negative_domain_filter = domains_ignored_due_to_negative_domain_filter;
582 private void setDomainsIgnoredDueToOverlap( final int domains_ignored_due_to_overlap ) {
583 _domains_ignored_due_to_overlap = domains_ignored_due_to_overlap;
586 private void setDomainsIgnoredDueToVirusLikeId( final int i ) {
587 _domains_ignored_due_to_virus_like_id = i;
590 private void setDomainsIgnoredDueToVirusLikeIdCountsMap( final Map<String, Integer> domains_ignored_due_to_virus_like_id_counts_map ) {
591 _domains_ignored_due_to_virus_like_id_counts_map = domains_ignored_due_to_virus_like_id_counts_map;
594 private void setDomainsStored( final int domains_stored ) {
595 _domains_stored = domains_stored;
598 private void setDomainsStoredSet( final SortedSet<String> _storeddomains_stored ) {
599 _domains_stored_set = _storeddomains_stored;
602 public void setEValueMaximum( final double e_value_maximum ) {
603 if ( e_value_maximum < 0.0 ) {
604 throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
606 _e_value_maximum = e_value_maximum;
609 public void setIgnoreDufs( final boolean ignore_dufs ) {
610 _ignore_dufs = ignore_dufs;
614 * To ignore domains which are completely engulfed by domains (individual
615 * ones or stretches of overlapping ones) with better support values.
618 * @param ignored_engulfed_domains
620 public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) {
621 _ignore_engulfed_domains = ignore_engulfed_domains;
624 public void setIgnoreVirusLikeIds( final boolean ignore_virus_like_ids ) {
625 _ignore_virus_like_ids = ignore_virus_like_ids;
629 * Sets the individual domain score cutoff values (for example, gathering
630 * thresholds from Pfam). Domain ids are the keys, cutoffs the values.
632 * @param individual_domain_score_cutoffs
634 public void setIndividualDomainScoreCutoffs( final Map<String, String> individual_domain_score_cutoffs ) {
635 _individual_domain_score_cutoffs = individual_domain_score_cutoffs;
638 public void setMaxAllowedOverlap( final int max_allowed_overlap ) {
639 if ( max_allowed_overlap < 0 ) {
640 throw new IllegalArgumentException( "Attempt to set max allowed overlap to less than zero." );
642 _max_allowed_overlap = max_allowed_overlap;
645 private void setProteinsEncountered( final int proteins_encountered ) {
646 _proteins_encountered = proteins_encountered;
649 private void setProteinsIgnoredDueToFilter( final int proteins_ignored_due_to_filter ) {
650 _proteins_ignored_due_to_filter = proteins_ignored_due_to_filter;
653 private void setProteinsStored( final int proteins_stored ) {
654 _proteins_stored = proteins_stored;
657 public void setReturnType( final ReturnType return_type ) {
658 _return_type = return_type;
661 private void setTime( final long time ) {
665 public void setVerbose( final boolean verbose ) {
669 public static enum FilterType {
670 NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN
673 public static enum ReturnType {
674 UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN