(no commit message)
[jalview.git] / forester / java / src / org / forester / io / parsers / HmmPfamOutputParser.java
index 800606c..96729a4 100644 (file)
@@ -22,7 +22,7 @@
 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 //
 // Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
 
 package org.forester.io.parsers;
 
@@ -40,12 +40,10 @@ import java.util.SortedSet;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
+import org.forester.protein.BasicDomain;
 import org.forester.protein.BasicProtein;
 import org.forester.protein.Domain;
-import org.forester.protein.DomainId;
 import org.forester.protein.Protein;
-import org.forester.surfacing.BasicDomain;
-import org.forester.surfacing.SurfacingUtil;
 import org.forester.util.ForesterUtil;
 
 public final class HmmPfamOutputParser {
@@ -62,11 +60,10 @@ public final class HmmPfamOutputParser {
     private static final ReturnType RETURN_TYPE_DEFAULT         = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
     private static final boolean    IGNORE_DUFS_DEFAULT         = false;
     private static final int        MAX_ALLOWED_OVERLAP_DEFAULT = -1;
-    private final Set<DomainId>     _filter;
+    private final Set<String>       _filter;
     private final FilterType        _filter_type;
     private final File              _input_file;
     private final String            _species;
-    private final String            _model_type;
     private double                  _e_value_maximum;
     private Map<String, String>     _individual_domain_score_cutoffs;
     private boolean                 _ignore_dufs;
@@ -85,7 +82,7 @@ public final class HmmPfamOutputParser {
     private int                     _domains_ignored_due_to_e_value;
     private int                     _domains_ignored_due_to_individual_score_cutoff;
     private int                     _domains_stored;
-    private SortedSet<DomainId>     _domains_stored_set;
+    private SortedSet<String>       _domains_stored_set;
     private long                    _time;
     private int                     _domains_ignored_due_to_negative_domain_filter;
     private Map<String, Integer>    _domains_ignored_due_to_negative_domain_filter_counts_map;
@@ -95,7 +92,6 @@ public final class HmmPfamOutputParser {
     public HmmPfamOutputParser( final File input_file, final String species, final String model_type ) {
         _input_file = input_file;
         _species = species;
-        _model_type = model_type;
         _filter = null;
         _filter_type = FilterType.NONE;
         init();
@@ -103,12 +99,10 @@ public final class HmmPfamOutputParser {
 
     public HmmPfamOutputParser( final File input_file,
                                 final String species,
-                                final String model_type,
-                                final Set<DomainId> filter,
+                                final Set<String> filter,
                                 final FilterType filter_type ) {
         _input_file = input_file;
         _species = species;
-        _model_type = model_type;
         _filter = filter;
         _filter_type = filter_type;
         init();
@@ -125,7 +119,7 @@ public final class HmmPfamOutputParser {
 
     private void addProtein( final List<Protein> proteins, final Protein current_protein ) {
         if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
-            final Set<DomainId> domain_ids_in_protein = new HashSet<DomainId>();
+            final Set<String> domain_ids_in_protein = new HashSet<String>();
             for( final Domain d : current_protein.getProteinDomains() ) {
                 domain_ids_in_protein.add( d.getDomainId() );
             }
@@ -192,7 +186,7 @@ public final class HmmPfamOutputParser {
         return _domains_stored;
     }
 
-    public SortedSet<DomainId> getDomainsStoredSet() {
+    public SortedSet<String> getDomainsStoredSet() {
         return _domains_stored_set;
     }
 
@@ -200,7 +194,7 @@ public final class HmmPfamOutputParser {
         return _e_value_maximum;
     }
 
-    private Set<DomainId> getFilter() {
+    private Set<String> getFilter() {
         return _filter;
     }
 
@@ -220,10 +214,6 @@ public final class HmmPfamOutputParser {
         return _max_allowed_overlap;
     }
 
-    private String getModelType() {
-        return _model_type;
-    }
-
     public int getProteinsEncountered() {
         return _proteins_encountered;
     }
@@ -262,7 +252,7 @@ public final class HmmPfamOutputParser {
     }
 
     private void intitCounts() {
-        setDomainsStoredSet( new TreeSet<DomainId>() );
+        setDomainsStoredSet( new TreeSet<String>() );
         setDomainsEncountered( 0 );
         setProteinsEncountered( 0 );
         setProteinsIgnoredDueToFilter( 0 );
@@ -343,8 +333,8 @@ public final class HmmPfamOutputParser {
                     }
                     else if ( isVerbose() ) {
                         ForesterUtil.printWarningMessage( getClass().getName(), "query \"" + query
-                                + "\" is not unique [line " + line_number + "] in ["
-                                + getInputFile().getCanonicalPath() + "]" );
+                                                          + "\" is not unique [line " + line_number + "] in ["
+                                                          + getInputFile().getCanonicalPath() + "]" );
                     }
                 }
                 else {
@@ -409,9 +399,9 @@ public final class HmmPfamOutputParser {
                     if ( ( getMaxAllowedOverlap() != HmmPfamOutputParser.MAX_ALLOWED_OVERLAP_DEFAULT )
                             || isIgnoreEngulfedDomains() ) {
                         final int domains_count = current_protein.getNumberOfProteinDomains();
-                        current_protein = SurfacingUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
-                                                                                  isIgnoreEngulfedDomains(),
-                                                                                  current_protein );
+                        current_protein = ForesterUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
+                                                                                 isIgnoreEngulfedDomains(),
+                                                                                 current_protein );
                         final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains();
                         _domains_stored -= domains_removed;
                         _domains_ignored_due_to_overlap += domains_removed;
@@ -438,38 +428,36 @@ public final class HmmPfamOutputParser {
                 int to = -1;
                 double e_value = -1;
                 double score = -1;
-                boolean is_complete_hmm_match = false;
-                boolean is_complete_query_match = false;
                 try {
                     from = Integer.valueOf( from_str ).intValue();
                 }
                 catch ( final NumberFormatException e ) {
                     throw new IOException( "could not parse seq-f from \"" + line + "\" [line " + line_number
-                            + "] in [" + getInputFile().getCanonicalPath() + "]" );
+                                           + "] in [" + getInputFile().getCanonicalPath() + "]" );
                 }
                 try {
                     to = Integer.valueOf( to_str ).intValue();
                 }
                 catch ( final NumberFormatException e ) {
                     throw new IOException( "could not parse seq-t from \"" + line + "\" [line " + line_number
-                            + "] in [" + getInputFile().getCanonicalPath() + "]" );
+                                           + "] in [" + getInputFile().getCanonicalPath() + "]" );
                 }
                 try {
                     score = Double.valueOf( score_str ).doubleValue();
                 }
                 catch ( final NumberFormatException e ) {
                     throw new IOException( "could not parse score from \"" + line + "\" [line " + line_number
-                            + "] in [" + getInputFile().getCanonicalPath() + "]" );
+                                           + "] in [" + getInputFile().getCanonicalPath() + "]" );
                 }
                 try {
                     e_value = Double.valueOf( e_value_str ).doubleValue();
                 }
                 catch ( final NumberFormatException e ) {
                     throw new IOException( "could not parse E-value from \"" + line + "\" [line " + line_number
-                            + "] in [" + getInputFile().getCanonicalPath() + "]" );
+                                           + "] in [" + getInputFile().getCanonicalPath() + "]" );
                 }
                 if ( hmm_match_str.equals( "[]" ) ) {
-                    is_complete_hmm_match = true;
+                    //is_complete_hmm_match = true;
                 }
                 else if ( !( hmm_match_str.equals( ".]" ) || hmm_match_str.equals( "[." ) || hmm_match_str
                         .equals( ".." ) ) ) {
@@ -477,7 +465,7 @@ public final class HmmPfamOutputParser {
                             + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
                 }
                 if ( query_match_str.equals( ".." ) ) {
-                    is_complete_query_match = true;
+                    // is_complete_query_match = true;
                 }
                 else if ( !( query_match_str.equals( ".]" ) || query_match_str.equals( "[." ) || query_match_str
                         .equals( "[]" ) ) ) {
@@ -498,14 +486,14 @@ public final class HmmPfamOutputParser {
                 }
                 catch ( final NumberFormatException e ) {
                     throw new IOException( "could not parse domain number from \"" + line + "\" [line " + line_number
-                            + "] in [" + getInputFile().getCanonicalPath() + "]" );
+                                           + "] in [" + getInputFile().getCanonicalPath() + "]" );
                 }
                 try {
                     total = Integer.valueOf( ( total_str ) ).intValue();
                 }
                 catch ( final NumberFormatException e ) {
                     throw new IOException( "could not parse domain count from \"" + line + "\" [line " + line_number
-                            + "] in [" + getInputFile().getCanonicalPath() + "]" );
+                                           + "] in [" + getInputFile().getCanonicalPath() + "]" );
                 }
                 ++_domains_encountered;
                 boolean failed_cutoff = false;
@@ -518,7 +506,7 @@ public final class HmmPfamOutputParser {
                     }
                     else {
                         throw new IOException( "could not find a score cutoff value for domain id \"" + id
-                                + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
+                                               + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
                     }
                 }
                 final String uc_id = id.toUpperCase();
@@ -539,8 +527,7 @@ public final class HmmPfamOutputParser {
                     ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), id );
                     ++_domains_ignored_due_to_virus_like_id;
                 }
-                else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN )
-                        && getFilter().contains( new DomainId( id ) ) ) {
+                else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) && getFilter().contains( id ) ) {
                     ++_domains_ignored_due_to_negative_domain_filter;
                     ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), id );
                 }
@@ -608,7 +595,7 @@ public final class HmmPfamOutputParser {
         _domains_stored = domains_stored;
     }
 
-    private void setDomainsStoredSet( final SortedSet<DomainId> _storeddomains_stored ) {
+    private void setDomainsStoredSet( final SortedSet<String> _storeddomains_stored ) {
         _domains_stored_set = _storeddomains_stored;
     }
 
@@ -626,8 +613,8 @@ public final class HmmPfamOutputParser {
     /**
      * To ignore domains which are completely engulfed by domains (individual
      * ones or stretches of overlapping ones) with better support values.
-     * 
-     * 
+     *
+     *
      * @param ignored_engulfed_domains
      */
     public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) {
@@ -641,7 +628,7 @@ public final class HmmPfamOutputParser {
     /**
      * Sets the individual domain score cutoff values (for example, gathering
      * thresholds from Pfam). Domain ids are the keys, cutoffs the values.
-     * 
+     *
      * @param individual_domain_score_cutoffs
      */
     public void setIndividualDomainScoreCutoffs( final Map<String, String> individual_domain_score_cutoffs ) {