inprogress
[jalview.git] / forester / java / src / org / forester / io / parsers / HmmscanPerDomainTableParser.java
index ac6a2bc..f063866 100644 (file)
@@ -23,7 +23,7 @@
 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 //
 // Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
 
 package org.forester.io.parsers;
 
@@ -41,12 +41,10 @@ import java.util.SortedSet;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
-import org.forester.surfacing.BasicDomain;
-import org.forester.surfacing.BasicProtein;
-import org.forester.surfacing.Domain;
-import org.forester.surfacing.DomainId;
-import org.forester.surfacing.Protein;
-import org.forester.surfacing.SurfacingUtil;
+import org.forester.protein.BasicDomain;
+import org.forester.protein.BasicProtein;
+import org.forester.protein.Domain;
+import org.forester.protein.Protein;
 import org.forester.util.ForesterUtil;
 
 public final class HmmscanPerDomainTableParser {
@@ -64,11 +62,14 @@ public final class HmmscanPerDomainTableParser {
     private static final ReturnType       RETURN_TYPE_DEFAULT         = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
     private static final boolean          IGNORE_DUFS_DEFAULT         = false;
     private static final int              MAX_ALLOWED_OVERLAP_DEFAULT = -1;
-    private final Set<DomainId>           _filter;
+    private static final boolean          IGNORE_REPLACED_RRMS        = false;
+    private static final boolean          IGNORE_hGDE_amylase         = true;                                                      //TODO eventually remove me, added 10/22/13
+    private final Set<String>             _filter;
     private final FilterType              _filter_type;
     private final File                    _input_file;
     private final String                  _species;
-    private double                        _e_value_maximum;
+    private double                        _fs_e_value_maximum;
+    private double                        _i_e_value_maximum;
     private Map<String, Double>           _individual_score_cutoffs;
     private boolean                       _ignore_dufs;
     private boolean                       _ignore_virus_like_ids;
@@ -81,16 +82,18 @@ public final class HmmscanPerDomainTableParser {
     private int                           _domains_encountered;
     private int                           _domains_ignored_due_to_duf;
     private int                           _domains_ignored_due_to_overlap;
-    private int                           _domains_ignored_due_to_e_value;
+    private int                           _domains_ignored_due_to_fs_e_value;
+    private int                           _domains_ignored_due_to_i_e_value;
     private int                           _domains_ignored_due_to_individual_score_cutoff;
     private int                           _domains_stored;
-    private SortedSet<DomainId>           _domains_stored_set;
+    private SortedSet<String>             _domains_stored_set;
     private long                          _time;
     private int                           _domains_ignored_due_to_negative_domain_filter;
     private Map<String, Integer>          _domains_ignored_due_to_negative_domain_filter_counts_map;
     private int                           _domains_ignored_due_to_virus_like_id;
     private Map<String, Integer>          _domains_ignored_due_to_virus_like_id_counts_map;
     private final INDIVIDUAL_SCORE_CUTOFF _ind_cutoff;
+    private final boolean                 _allow_proteins_with_same_name;
 
     public HmmscanPerDomainTableParser( final File input_file,
                                         final String species,
@@ -100,12 +103,26 @@ public final class HmmscanPerDomainTableParser {
         _filter = null;
         _filter_type = FilterType.NONE;
         _ind_cutoff = individual_cutoff_applies_to;
+        _allow_proteins_with_same_name = false;
         init();
     }
 
     public HmmscanPerDomainTableParser( final File input_file,
                                         final String species,
-                                        final Set<DomainId> filter,
+                                        final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
+                                        final boolean allow_proteins_with_same_name ) {
+        _input_file = input_file;
+        _species = species;
+        _filter = null;
+        _filter_type = FilterType.NONE;
+        _ind_cutoff = individual_cutoff_applies_to;
+        _allow_proteins_with_same_name = allow_proteins_with_same_name;
+        init();
+    }
+
+    public HmmscanPerDomainTableParser( final File input_file,
+                                        final String species,
+                                        final Set<String> filter,
                                         final FilterType filter_type,
                                         final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) {
         _input_file = input_file;
@@ -113,9 +130,29 @@ public final class HmmscanPerDomainTableParser {
         _filter = filter;
         _filter_type = filter_type;
         _ind_cutoff = individual_cutoff_applies_to;
+        _allow_proteins_with_same_name = false;
         init();
     }
 
+    public HmmscanPerDomainTableParser( final File input_file,
+                                        final String species,
+                                        final Set<String> filter,
+                                        final FilterType filter_type,
+                                        final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
+                                        final boolean allow_proteins_with_same_name ) {
+        _input_file = input_file;
+        _species = species;
+        _filter = filter;
+        _filter_type = filter_type;
+        _ind_cutoff = individual_cutoff_applies_to;
+        _allow_proteins_with_same_name = allow_proteins_with_same_name;
+        init();
+    }
+
+    public boolean isAllowProteinsWithSameName() {
+        return _allow_proteins_with_same_name;
+    }
+
     private void actuallyAddProtein( final List<Protein> proteins, final Protein current_protein ) {
         final List<Domain> l = current_protein.getProteinDomains();
         for( final Domain d : l ) {
@@ -129,15 +166,15 @@ public final class HmmscanPerDomainTableParser {
         if ( ( getMaxAllowedOverlap() != HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT )
                 || isIgnoreEngulfedDomains() ) {
             final int domains_count = current_protein.getNumberOfProteinDomains();
-            current_protein = SurfacingUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
-                                                                      isIgnoreEngulfedDomains(),
-                                                                      current_protein );
+            current_protein = ForesterUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
+                                                                     isIgnoreEngulfedDomains(),
+                                                                     current_protein );
             final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains();
             _domains_stored -= domains_removed;
             _domains_ignored_due_to_overlap += domains_removed;
         }
         if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
-            final Set<DomainId> domain_ids_in_protein = new HashSet<DomainId>();
+            final Set<String> domain_ids_in_protein = new HashSet<String>();
             for( final Domain d : current_protein.getProteinDomains() ) {
                 domain_ids_in_protein.add( d.getDomainId() );
             }
@@ -172,8 +209,12 @@ public final class HmmscanPerDomainTableParser {
         return _domains_ignored_due_to_duf;
     }
 
-    public int getDomainsIgnoredDueToEval() {
-        return _domains_ignored_due_to_e_value;
+    public int getDomainsIgnoredDueToIEval() {
+        return _domains_ignored_due_to_i_e_value;
+    }
+
+    public int getDomainsIgnoredDueToFsEval() {
+        return _domains_ignored_due_to_fs_e_value;
     }
 
     public int getDomainsIgnoredDueToIndividualScoreCutoff() {
@@ -204,15 +245,19 @@ public final class HmmscanPerDomainTableParser {
         return _domains_stored;
     }
 
-    public SortedSet<DomainId> getDomainsStoredSet() {
+    public SortedSet<String> getDomainsStoredSet() {
         return _domains_stored_set;
     }
 
-    private double getEValueMaximum() {
-        return _e_value_maximum;
+    private double getFsEValueMaximum() {
+        return _fs_e_value_maximum;
     }
 
-    private Set<DomainId> getFilter() {
+    private double getIEValueMaximum() {
+        return _i_e_value_maximum;
+    }
+
+    private Set<String> getFilter() {
         return _filter;
     }
 
@@ -261,7 +306,8 @@ public final class HmmscanPerDomainTableParser {
     }
 
     private void init() {
-        _e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT;
+        _fs_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT;
+        _i_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT;
         setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT );
         setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT );
         _max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT;
@@ -272,13 +318,14 @@ public final class HmmscanPerDomainTableParser {
     }
 
     private void intitCounts() {
-        setDomainsStoredSet( new TreeSet<DomainId>() );
+        setDomainsStoredSet( new TreeSet<String>() );
         setDomainsEncountered( 0 );
         setProteinsEncountered( 0 );
         setProteinsIgnoredDueToFilter( 0 );
         setDomainsIgnoredDueToNegativeFilter( 0 );
         setDomainsIgnoredDueToDuf( 0 );
-        setDomainsIgnoredDueToEval( 0 );
+        setDomainsIgnoredDueToFsEval( 0 );
+        setDomainsIgnoredDueToIEval( 0 );
         setDomainsIgnoredDueToIndividualScoreCutoff( 0 );
         setDomainsIgnoredDueToVirusLikeId( 0 );
         setDomainsIgnoredDueToOverlap( 0 );
@@ -356,12 +403,14 @@ public final class HmmscanPerDomainTableParser {
             final int env_to = parseInt( tokens[ 20 ], line_number, "env to" );
             ++_domains_encountered;
             if ( !query.equals( prev_query ) || ( qlen != prev_qlen ) ) {
-                if ( query.equals( prev_query ) ) {
-                    throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen + ", "
-                            + prev_qlen );
-                }
-                if ( prev_queries.contains( query ) ) {
-                    throw new IOException( "more than one protein named [" + query + "]" );
+                if ( !isAllowProteinsWithSameName() ) {
+                    if ( query.equals( prev_query ) ) {
+                        throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen
+                                + ", " + prev_qlen );
+                    }
+                    if ( prev_queries.contains( query ) ) {
+                        throw new IOException( "more than one protein named [" + query + "]" );
+                    }
                 }
                 prev_query = query;
                 prev_qlen = qlen;
@@ -370,7 +419,7 @@ public final class HmmscanPerDomainTableParser {
                     addProtein( proteins, current_protein );
                 }
                 if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
-                    current_protein = new BasicProtein( query, getSpecies() );
+                    current_protein = new BasicProtein( query, getSpecies(), qlen );
                 }
                 else {
                     throw new IllegalArgumentException( "unknown return type" );
@@ -403,13 +452,23 @@ public final class HmmscanPerDomainTableParser {
             else if ( ali_from == ali_to ) {
                 //Ignore
             }
-            else if ( ( getEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
-                    && ( fs_e_value > getEValueMaximum() ) ) {
-                ++_domains_ignored_due_to_e_value;
+            else if ( ( getFsEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
+                    && ( fs_e_value > getFsEValueMaximum() ) ) {
+                ++_domains_ignored_due_to_fs_e_value;
+            }
+            else if ( ( getIEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
+                    && ( i_e_value > getIEValueMaximum() ) ) {
+                ++_domains_ignored_due_to_i_e_value;
             }
             else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
                 ++_domains_ignored_due_to_duf;
             }
+            else if ( IGNORE_REPLACED_RRMS
+                    && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id
+                            .contains( "RRM_6" ) ) ) {
+            }
+            else if ( IGNORE_hGDE_amylase && ( uc_id.equals( "hGDE_amylase" ) ) ) {
+            }
             else if ( isIgnoreVirusLikeIds()
                     && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
                             || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )
@@ -417,8 +476,7 @@ public final class HmmscanPerDomainTableParser {
                 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), target_id );
                 ++_domains_ignored_due_to_virus_like_id;
             }
-            else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN )
-                    && getFilter().contains( new DomainId( target_id ) ) ) {
+            else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) && getFilter().contains( target_id ) ) {
                 ++_domains_ignored_due_to_negative_domain_filter;
                 ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), target_id );
             }
@@ -429,8 +487,6 @@ public final class HmmscanPerDomainTableParser {
                                                        ali_to,
                                                        ( short ) domain_number,
                                                        ( short ) total_domains,
-                                                       fs_e_value,
-                                                       fs_score,
                                                        i_e_value,
                                                        domain_score );
                     current_protein.addProteinDomain( pd );
@@ -482,11 +538,15 @@ public final class HmmscanPerDomainTableParser {
         _domains_ignored_due_to_duf = domains_ignored_due_to_duf;
     }
 
-    public void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) {
-        _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value;
+    private void setDomainsIgnoredDueToFsEval( final int domains_ignored_due_to_fs_e_value ) {
+        _domains_ignored_due_to_fs_e_value = domains_ignored_due_to_fs_e_value;
+    }
+
+    private void setDomainsIgnoredDueToIEval( final int domains_ignored_due_to_i_e_value ) {
+        _domains_ignored_due_to_i_e_value = domains_ignored_due_to_i_e_value;
     }
 
-    public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
+    private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
         _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
     }
 
@@ -514,15 +574,22 @@ public final class HmmscanPerDomainTableParser {
         _domains_stored = domains_stored;
     }
 
-    private void setDomainsStoredSet( final SortedSet<DomainId> _storeddomains_stored ) {
+    private void setDomainsStoredSet( final SortedSet<String> _storeddomains_stored ) {
         _domains_stored_set = _storeddomains_stored;
     }
 
-    public void setEValueMaximum( final double e_value_maximum ) {
-        if ( e_value_maximum < 0.0 ) {
+    public void setFsEValueMaximum( final double fs_e_value_maximum ) {
+        if ( fs_e_value_maximum < 0.0 ) {
+            throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
+        }
+        _fs_e_value_maximum = fs_e_value_maximum;
+    }
+
+    public void setIEValueMaximum( final double i_e_value_maximum ) {
+        if ( i_e_value_maximum < 0.0 ) {
             throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
         }
-        _e_value_maximum = e_value_maximum;
+        _i_e_value_maximum = i_e_value_maximum;
     }
 
     public void setIgnoreDufs( final boolean ignore_dufs ) {