X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2FHmmscanPerDomainTableParser.java;h=f063866e2e44d17870f873085c4ca91f659eabeb;hb=64731196184662d30d794bc339a5ecd567cd5e86;hp=b5d1bc13d36c74b8c2fb3b114771faf5963bf66a;hpb=e9ca0dc1764303d53fc6b9b087f33cdee53726ea;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java index b5d1bc1..f063866 100644 --- a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java +++ b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java @@ -23,7 +23,7 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.io.parsers; @@ -44,9 +44,7 @@ import java.util.TreeSet; import org.forester.protein.BasicDomain; import org.forester.protein.BasicProtein; import org.forester.protein.Domain; -import org.forester.protein.DomainId; import org.forester.protein.Protein; -import org.forester.surfacing.SurfacingUtil; import org.forester.util.ForesterUtil; public final class HmmscanPerDomainTableParser { @@ -65,11 +63,13 @@ public final class HmmscanPerDomainTableParser { private static final boolean IGNORE_DUFS_DEFAULT = false; private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1; private static final boolean IGNORE_REPLACED_RRMS = false; - private final Set _filter; + private static final boolean IGNORE_hGDE_amylase = true; //TODO eventually remove me, added 10/22/13 + private final Set _filter; private final FilterType _filter_type; private final File _input_file; private final String _species; - private double _e_value_maximum; + private double _fs_e_value_maximum; + private double _i_e_value_maximum; private Map _individual_score_cutoffs; private boolean _ignore_dufs; private boolean _ignore_virus_like_ids; @@ -82,10 +82,11 @@ public final class HmmscanPerDomainTableParser { private int _domains_encountered; private int _domains_ignored_due_to_duf; private int _domains_ignored_due_to_overlap; - private int _domains_ignored_due_to_e_value; + private int _domains_ignored_due_to_fs_e_value; + private int _domains_ignored_due_to_i_e_value; private int _domains_ignored_due_to_individual_score_cutoff; private int _domains_stored; - private SortedSet _domains_stored_set; + private SortedSet _domains_stored_set; private long _time; private int _domains_ignored_due_to_negative_domain_filter; private Map _domains_ignored_due_to_negative_domain_filter_counts_map; @@ -121,7 +122,7 @@ public final class HmmscanPerDomainTableParser { public HmmscanPerDomainTableParser( final File input_file, final String species, - final Set filter, + final Set filter, final FilterType filter_type, final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) { _input_file = input_file; @@ -135,7 +136,7 @@ public final class HmmscanPerDomainTableParser { public HmmscanPerDomainTableParser( final File input_file, final String species, - final Set filter, + final Set filter, final FilterType filter_type, final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to, final boolean allow_proteins_with_same_name ) { @@ -165,15 +166,15 @@ public final class HmmscanPerDomainTableParser { if ( ( getMaxAllowedOverlap() != HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT ) || isIgnoreEngulfedDomains() ) { final int domains_count = current_protein.getNumberOfProteinDomains(); - current_protein = SurfacingUtil.removeOverlappingDomains( getMaxAllowedOverlap(), - isIgnoreEngulfedDomains(), - current_protein ); + current_protein = ForesterUtil.removeOverlappingDomains( getMaxAllowedOverlap(), + isIgnoreEngulfedDomains(), + current_protein ); final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains(); _domains_stored -= domains_removed; _domains_ignored_due_to_overlap += domains_removed; } if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) { - final Set domain_ids_in_protein = new HashSet(); + final Set domain_ids_in_protein = new HashSet(); for( final Domain d : current_protein.getProteinDomains() ) { domain_ids_in_protein.add( d.getDomainId() ); } @@ -208,8 +209,12 @@ public final class HmmscanPerDomainTableParser { return _domains_ignored_due_to_duf; } - public int getDomainsIgnoredDueToEval() { - return _domains_ignored_due_to_e_value; + public int getDomainsIgnoredDueToIEval() { + return _domains_ignored_due_to_i_e_value; + } + + public int getDomainsIgnoredDueToFsEval() { + return _domains_ignored_due_to_fs_e_value; } public int getDomainsIgnoredDueToIndividualScoreCutoff() { @@ -240,15 +245,19 @@ public final class HmmscanPerDomainTableParser { return _domains_stored; } - public SortedSet getDomainsStoredSet() { + public SortedSet getDomainsStoredSet() { return _domains_stored_set; } - private double getEValueMaximum() { - return _e_value_maximum; + private double getFsEValueMaximum() { + return _fs_e_value_maximum; + } + + private double getIEValueMaximum() { + return _i_e_value_maximum; } - private Set getFilter() { + private Set getFilter() { return _filter; } @@ -297,7 +306,8 @@ public final class HmmscanPerDomainTableParser { } private void init() { - _e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT; + _fs_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT; + _i_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT; setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT ); setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT ); _max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT; @@ -308,13 +318,14 @@ public final class HmmscanPerDomainTableParser { } private void intitCounts() { - setDomainsStoredSet( new TreeSet() ); + setDomainsStoredSet( new TreeSet() ); setDomainsEncountered( 0 ); setProteinsEncountered( 0 ); setProteinsIgnoredDueToFilter( 0 ); setDomainsIgnoredDueToNegativeFilter( 0 ); setDomainsIgnoredDueToDuf( 0 ); - setDomainsIgnoredDueToEval( 0 ); + setDomainsIgnoredDueToFsEval( 0 ); + setDomainsIgnoredDueToIEval( 0 ); setDomainsIgnoredDueToIndividualScoreCutoff( 0 ); setDomainsIgnoredDueToVirusLikeId( 0 ); setDomainsIgnoredDueToOverlap( 0 ); @@ -441,9 +452,13 @@ public final class HmmscanPerDomainTableParser { else if ( ali_from == ali_to ) { //Ignore } - else if ( ( getEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT ) - && ( fs_e_value > getEValueMaximum() ) ) { - ++_domains_ignored_due_to_e_value; + else if ( ( getFsEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT ) + && ( fs_e_value > getFsEValueMaximum() ) ) { + ++_domains_ignored_due_to_fs_e_value; + } + else if ( ( getIEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT ) + && ( i_e_value > getIEValueMaximum() ) ) { + ++_domains_ignored_due_to_i_e_value; } else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) { ++_domains_ignored_due_to_duf; @@ -452,6 +467,8 @@ public final class HmmscanPerDomainTableParser { && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id .contains( "RRM_6" ) ) ) { } + else if ( IGNORE_hGDE_amylase && ( uc_id.equals( "hGDE_amylase" ) ) ) { + } else if ( isIgnoreVirusLikeIds() && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO ) || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG ) @@ -459,8 +476,7 @@ public final class HmmscanPerDomainTableParser { ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), target_id ); ++_domains_ignored_due_to_virus_like_id; } - else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) - && getFilter().contains( new DomainId( target_id ) ) ) { + else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) && getFilter().contains( target_id ) ) { ++_domains_ignored_due_to_negative_domain_filter; ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), target_id ); } @@ -471,8 +487,6 @@ public final class HmmscanPerDomainTableParser { ali_to, ( short ) domain_number, ( short ) total_domains, - fs_e_value, - fs_score, i_e_value, domain_score ); current_protein.addProteinDomain( pd ); @@ -524,8 +538,12 @@ public final class HmmscanPerDomainTableParser { _domains_ignored_due_to_duf = domains_ignored_due_to_duf; } - private void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) { - _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value; + private void setDomainsIgnoredDueToFsEval( final int domains_ignored_due_to_fs_e_value ) { + _domains_ignored_due_to_fs_e_value = domains_ignored_due_to_fs_e_value; + } + + private void setDomainsIgnoredDueToIEval( final int domains_ignored_due_to_i_e_value ) { + _domains_ignored_due_to_i_e_value = domains_ignored_due_to_i_e_value; } private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { @@ -556,15 +574,22 @@ public final class HmmscanPerDomainTableParser { _domains_stored = domains_stored; } - private void setDomainsStoredSet( final SortedSet _storeddomains_stored ) { + private void setDomainsStoredSet( final SortedSet _storeddomains_stored ) { _domains_stored_set = _storeddomains_stored; } - public void setEValueMaximum( final double e_value_maximum ) { - if ( e_value_maximum < 0.0 ) { + public void setFsEValueMaximum( final double fs_e_value_maximum ) { + if ( fs_e_value_maximum < 0.0 ) { + throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" ); + } + _fs_e_value_maximum = fs_e_value_maximum; + } + + public void setIEValueMaximum( final double i_e_value_maximum ) { + if ( i_e_value_maximum < 0.0 ) { throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" ); } - _e_value_maximum = e_value_maximum; + _i_e_value_maximum = i_e_value_maximum; } public void setIgnoreDufs( final boolean ignore_dufs ) {