rio - gsdir work...
[jalview.git] / forester / java / src / org / forester / io / parsers / HmmscanPerDomainTableParser.java
index 186ecc8..b5d1bc1 100644 (file)
@@ -41,11 +41,11 @@ import java.util.SortedSet;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
-import org.forester.surfacing.BasicDomain;
-import org.forester.surfacing.BasicProtein;
-import org.forester.surfacing.Domain;
-import org.forester.surfacing.DomainId;
-import org.forester.surfacing.Protein;
+import org.forester.protein.BasicDomain;
+import org.forester.protein.BasicProtein;
+import org.forester.protein.Domain;
+import org.forester.protein.DomainId;
+import org.forester.protein.Protein;
 import org.forester.surfacing.SurfacingUtil;
 import org.forester.util.ForesterUtil;
 
@@ -64,6 +64,7 @@ public final class HmmscanPerDomainTableParser {
     private static final ReturnType       RETURN_TYPE_DEFAULT         = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
     private static final boolean          IGNORE_DUFS_DEFAULT         = false;
     private static final int              MAX_ALLOWED_OVERLAP_DEFAULT = -1;
+    private static final boolean          IGNORE_REPLACED_RRMS        = false; // while false, the RRM_1/RRM_3/RRM_5/RRM_6 else-if branch can never be taken
     private final Set<DomainId>           _filter;
     private final FilterType              _filter_type;
     private final File                    _input_file;
@@ -91,6 +92,7 @@ public final class HmmscanPerDomainTableParser {
     private int                           _domains_ignored_due_to_virus_like_id;
     private Map<String, Integer>          _domains_ignored_due_to_virus_like_id_counts_map;
     private final INDIVIDUAL_SCORE_CUTOFF _ind_cutoff;
+    private final boolean                 _allow_proteins_with_same_name;
 
     public HmmscanPerDomainTableParser( final File input_file,
                                         final String species,
@@ -100,6 +102,20 @@ public final class HmmscanPerDomainTableParser {
         _filter = null;
         _filter_type = FilterType.NONE;
         _ind_cutoff = individual_cutoff_applies_to;
+        _allow_proteins_with_same_name = false;
+        init();
+    }
+
+    public HmmscanPerDomainTableParser( final File input_file, // variant without a domain-id filter, with an explicit same-name policy
+                                        final String species,
+                                        final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
+                                        final boolean allow_proteins_with_same_name ) { // true: the "more than one protein named" IOException checks are skipped
+        _input_file = input_file;
+        _species = species;
+        _filter = null; // this variant takes no filter set
+        _filter_type = FilterType.NONE;
+        _ind_cutoff = individual_cutoff_applies_to;
+        _allow_proteins_with_same_name = allow_proteins_with_same_name; // exposed through isAllowProteinsWithSameName()
         init();
     }
 
@@ -113,9 +129,29 @@ public final class HmmscanPerDomainTableParser {
         _filter = filter;
         _filter_type = filter_type;
         _ind_cutoff = individual_cutoff_applies_to;
+        _allow_proteins_with_same_name = false;
+        init();
+    }
+
+    public HmmscanPerDomainTableParser( final File input_file, // filtered variant with an explicit same-name policy
+                                        final String species,
+                                        final Set<DomainId> filter,
+                                        final FilterType filter_type,
+                                        final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
+                                        final boolean allow_proteins_with_same_name ) { // true: the "more than one protein named" IOException checks are skipped
+        _input_file = input_file;
+        _species = species;
+        _filter = filter; // stored as passed, no copy is made here
+        _filter_type = filter_type;
+        _ind_cutoff = individual_cutoff_applies_to;
+        _allow_proteins_with_same_name = allow_proteins_with_same_name; // exposed through isAllowProteinsWithSameName()
         init();
     }
 
+    public boolean isAllowProteinsWithSameName() { // returns the flag fixed at construction; when true, duplicate query names do not raise an IOException
+        return _allow_proteins_with_same_name;
+    }
+
     private void actuallyAddProtein( final List<Protein> proteins, final Protein current_protein ) {
         final List<Domain> l = current_protein.getProteinDomains();
         for( final Domain d : l ) {
@@ -356,12 +392,14 @@ public final class HmmscanPerDomainTableParser {
             final int env_to = parseInt( tokens[ 20 ], line_number, "env to" );
             ++_domains_encountered;
             if ( !query.equals( prev_query ) || ( qlen != prev_qlen ) ) {
-                if ( query.equals( prev_query ) ) {
-                    throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen + ", "
-                            + prev_qlen );
-                }
-                if ( prev_queries.contains( query ) ) {
-                    throw new IOException( "more than one protein named [" + query + "]" );
+                if ( !isAllowProteinsWithSameName() ) {
+                    if ( query.equals( prev_query ) ) {
+                        throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen
+                                + ", " + prev_qlen );
+                    }
+                    if ( prev_queries.contains( query ) ) {
+                        throw new IOException( "more than one protein named [" + query + "]" );
+                    }
                 }
                 prev_query = query;
                 prev_qlen = qlen;
@@ -370,7 +408,7 @@ public final class HmmscanPerDomainTableParser {
                     addProtein( proteins, current_protein );
                 }
                 if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
-                    current_protein = new BasicProtein( query, getSpecies() );
+                    current_protein = new BasicProtein( query, getSpecies(), qlen );
                 }
                 else {
                     throw new IllegalArgumentException( "unknown return type" );
@@ -410,6 +448,10 @@ public final class HmmscanPerDomainTableParser {
             else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
                 ++_domains_ignored_due_to_duf;
             }
+            else if ( IGNORE_REPLACED_RRMS
+                    && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id
+                            .contains( "RRM_6" ) ) ) {
+            }
             else if ( isIgnoreVirusLikeIds()
                     && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
                             || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )