X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2FHmmscanPerDomainTableParser.java;h=b5d1bc13d36c74b8c2fb3b114771faf5963bf66a;hb=f0e9864f2a7aa03931cc4c243ca9819bd9bdb9a9;hp=ac6a2bcc0bb9638ceb8e7fa7468b756cb9451f7d;hpb=eee996a6476a1e3d84c07f8f690dcde3ff4b2ef5;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java index ac6a2bc..b5d1bc1 100644 --- a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java +++ b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java @@ -41,11 +41,11 @@ import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; -import org.forester.surfacing.BasicDomain; -import org.forester.surfacing.BasicProtein; -import org.forester.surfacing.Domain; -import org.forester.surfacing.DomainId; -import org.forester.surfacing.Protein; +import org.forester.protein.BasicDomain; +import org.forester.protein.BasicProtein; +import org.forester.protein.Domain; +import org.forester.protein.DomainId; +import org.forester.protein.Protein; import org.forester.surfacing.SurfacingUtil; import org.forester.util.ForesterUtil; @@ -64,6 +64,7 @@ public final class HmmscanPerDomainTableParser { private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN; private static final boolean IGNORE_DUFS_DEFAULT = false; private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1; + private static final boolean IGNORE_REPLACED_RRMS = false; private final Set _filter; private final FilterType _filter_type; private final File _input_file; @@ -91,6 +92,7 @@ public final class HmmscanPerDomainTableParser { private int _domains_ignored_due_to_virus_like_id; private Map _domains_ignored_due_to_virus_like_id_counts_map; private final INDIVIDUAL_SCORE_CUTOFF _ind_cutoff; + private final boolean _allow_proteins_with_same_name; public HmmscanPerDomainTableParser( final File input_file, final String species, @@ -100,6 +102,20 @@ public final class HmmscanPerDomainTableParser { _filter = null; _filter_type = FilterType.NONE; _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = false; + init(); + } + + public HmmscanPerDomainTableParser( final File input_file, + final String species, + final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to, + final boolean allow_proteins_with_same_name ) { + _input_file = input_file; + _species = species; + _filter = null; + _filter_type = FilterType.NONE; + _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = allow_proteins_with_same_name; init(); } @@ -113,9 +129,29 @@ public final class HmmscanPerDomainTableParser { _filter = filter; _filter_type = filter_type; _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = false; + init(); + } + + public HmmscanPerDomainTableParser( final File input_file, + final String species, + final Set filter, + final FilterType filter_type, + final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to, + final boolean allow_proteins_with_same_name ) { + _input_file = input_file; + _species = species; + _filter = filter; + _filter_type = filter_type; + _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = allow_proteins_with_same_name; init(); } + public boolean isAllowProteinsWithSameName() { + return _allow_proteins_with_same_name; + } + private void actuallyAddProtein( final List proteins, final Protein current_protein ) { final List l = current_protein.getProteinDomains(); for( final Domain d : l ) { @@ -356,12 +392,14 @@ public final class HmmscanPerDomainTableParser { final int env_to = parseInt( tokens[ 20 ], line_number, "env to" ); ++_domains_encountered; if ( !query.equals( prev_query ) || ( qlen != prev_qlen ) ) { - if ( query.equals( prev_query ) ) { - throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen + ", " - + prev_qlen ); - } - if ( prev_queries.contains( query ) ) { - throw new IOException( "more than one protein named [" + query + "]" ); + if ( !isAllowProteinsWithSameName() ) { + if ( query.equals( prev_query ) ) { + throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen + + ", " + prev_qlen ); + } + if ( prev_queries.contains( query ) ) { + throw new IOException( "more than one protein named [" + query + "]" ); + } } prev_query = query; prev_qlen = qlen; @@ -370,7 +408,7 @@ public final class HmmscanPerDomainTableParser { addProtein( proteins, current_protein ); } if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) { - current_protein = new BasicProtein( query, getSpecies() ); + current_protein = new BasicProtein( query, getSpecies(), qlen ); } else { throw new IllegalArgumentException( "unknown return type" ); @@ -410,6 +448,10 @@ public final class HmmscanPerDomainTableParser { else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) { ++_domains_ignored_due_to_duf; } + else if ( IGNORE_REPLACED_RRMS + && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id + .contains( "RRM_6" ) ) ) { + } else if ( isIgnoreVirusLikeIds() && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO ) || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG ) @@ -482,11 +524,11 @@ public final class HmmscanPerDomainTableParser { _domains_ignored_due_to_duf = domains_ignored_due_to_duf; } - public void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) { + private void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) { _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value; } - public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { + private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff; }