X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2FHmmscanPerDomainTableParser.java;h=5e0ba9cfa6d7311eb3e780e4fa37c5d1967fa31d;hb=665e671efec73fcb36a9aac45f119330f290fa81;hp=ac6a2bcc0bb9638ceb8e7fa7468b756cb9451f7d;hpb=eee996a6476a1e3d84c07f8f690dcde3ff4b2ef5;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java index ac6a2bc..5e0ba9c 100644 --- a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java +++ b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java @@ -23,7 +23,7 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.io.parsers; @@ -41,12 +41,10 @@ import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; -import org.forester.surfacing.BasicDomain; -import org.forester.surfacing.BasicProtein; -import org.forester.surfacing.Domain; -import org.forester.surfacing.DomainId; -import org.forester.surfacing.Protein; -import org.forester.surfacing.SurfacingUtil; +import org.forester.protein.BasicDomain; +import org.forester.protein.BasicProtein; +import org.forester.protein.Domain; +import org.forester.protein.Protein; import org.forester.util.ForesterUtil; public final class HmmscanPerDomainTableParser { @@ -64,11 +62,14 @@ public final class HmmscanPerDomainTableParser { private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN; private static final boolean IGNORE_DUFS_DEFAULT = false; private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1; - private final Set _filter; + private static final boolean IGNORE_REPLACED_RRMS = false; + private static final boolean IGNORE_hGDE_amylase = true; //TODO eventually remove me, added 10/22/13 + private final Set _filter; private final FilterType _filter_type; private final File _input_file; private final String _species; - private double _e_value_maximum; + private double _fs_e_value_maximum; + private double _i_e_value_maximum; private Map _individual_score_cutoffs; private boolean _ignore_dufs; private boolean _ignore_virus_like_ids; @@ -81,16 +82,18 @@ public final class HmmscanPerDomainTableParser { private int _domains_encountered; private int _domains_ignored_due_to_duf; private int _domains_ignored_due_to_overlap; - private int _domains_ignored_due_to_e_value; + private int _domains_ignored_due_to_fs_e_value; + private int _domains_ignored_due_to_i_e_value; private int _domains_ignored_due_to_individual_score_cutoff; private int _domains_stored; - private SortedSet _domains_stored_set; + private SortedSet _domains_stored_set; private long _time; private int _domains_ignored_due_to_negative_domain_filter; private Map _domains_ignored_due_to_negative_domain_filter_counts_map; private int _domains_ignored_due_to_virus_like_id; private Map _domains_ignored_due_to_virus_like_id_counts_map; private final INDIVIDUAL_SCORE_CUTOFF _ind_cutoff; + private final boolean _allow_proteins_with_same_name; public HmmscanPerDomainTableParser( final File input_file, final String species, @@ -100,12 +103,26 @@ public final class HmmscanPerDomainTableParser { _filter = null; _filter_type = FilterType.NONE; _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = false; init(); } public HmmscanPerDomainTableParser( final File input_file, final String species, - final Set filter, + final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to, + final boolean allow_proteins_with_same_name ) { + _input_file = input_file; + _species = species; + _filter = null; + _filter_type = FilterType.NONE; + _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = allow_proteins_with_same_name; + init(); + } + + public HmmscanPerDomainTableParser( final File input_file, + final String species, + final Set filter, final FilterType filter_type, final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) { _input_file = input_file; @@ -113,9 +130,29 @@ public final class HmmscanPerDomainTableParser { _filter = filter; _filter_type = filter_type; _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = false; init(); } + public HmmscanPerDomainTableParser( final File input_file, + final String species, + final Set filter, + final FilterType filter_type, + final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to, + final boolean allow_proteins_with_same_name ) { + _input_file = input_file; + _species = species; + _filter = filter; + _filter_type = filter_type; + _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = allow_proteins_with_same_name; + init(); + } + + public boolean isAllowProteinsWithSameName() { + return _allow_proteins_with_same_name; + } + private void actuallyAddProtein( final List proteins, final Protein current_protein ) { final List l = current_protein.getProteinDomains(); for( final Domain d : l ) { @@ -129,15 +166,15 @@ public final class HmmscanPerDomainTableParser { if ( ( getMaxAllowedOverlap() != HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT ) || isIgnoreEngulfedDomains() ) { final int domains_count = current_protein.getNumberOfProteinDomains(); - current_protein = SurfacingUtil.removeOverlappingDomains( getMaxAllowedOverlap(), - isIgnoreEngulfedDomains(), - current_protein ); + current_protein = ForesterUtil.removeOverlappingDomains( getMaxAllowedOverlap(), + isIgnoreEngulfedDomains(), + current_protein ); final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains(); _domains_stored -= domains_removed; _domains_ignored_due_to_overlap += domains_removed; } if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) { - final Set domain_ids_in_protein = new HashSet(); + final Set domain_ids_in_protein = new HashSet(); for( final Domain d : current_protein.getProteinDomains() ) { domain_ids_in_protein.add( d.getDomainId() ); } @@ -172,8 +209,12 @@ public final class HmmscanPerDomainTableParser { return _domains_ignored_due_to_duf; } - public int getDomainsIgnoredDueToEval() { - return _domains_ignored_due_to_e_value; + public int getDomainsIgnoredDueToIEval() { + return _domains_ignored_due_to_i_e_value; + } + + public int getDomainsIgnoredDueToFsEval() { + return _domains_ignored_due_to_fs_e_value; } public int getDomainsIgnoredDueToIndividualScoreCutoff() { @@ -204,15 +245,19 @@ public final class HmmscanPerDomainTableParser { return _domains_stored; } - public SortedSet getDomainsStoredSet() { + public SortedSet getDomainsStoredSet() { return _domains_stored_set; } - private double getEValueMaximum() { - return _e_value_maximum; + private double getFsEValueMaximum() { + return _fs_e_value_maximum; } - private Set getFilter() { + private double getIEValueMaximum() { + return _i_e_value_maximum; + } + + private Set getFilter() { return _filter; } @@ -261,7 +306,8 @@ public final class HmmscanPerDomainTableParser { } private void init() { - _e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT; + _fs_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT; + _i_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT; setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT ); setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT ); _max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT; @@ -272,13 +318,14 @@ public final class HmmscanPerDomainTableParser { } private void intitCounts() { - setDomainsStoredSet( new TreeSet() ); + setDomainsStoredSet( new TreeSet() ); setDomainsEncountered( 0 ); setProteinsEncountered( 0 ); setProteinsIgnoredDueToFilter( 0 ); setDomainsIgnoredDueToNegativeFilter( 0 ); setDomainsIgnoredDueToDuf( 0 ); - setDomainsIgnoredDueToEval( 0 ); + setDomainsIgnoredDueToFsEval( 0 ); + setDomainsIgnoredDueToIEval( 0 ); setDomainsIgnoredDueToIndividualScoreCutoff( 0 ); setDomainsIgnoredDueToVirusLikeId( 0 ); setDomainsIgnoredDueToOverlap( 0 ); @@ -325,7 +372,7 @@ public final class HmmscanPerDomainTableParser { if ( ForesterUtil.isEmpty( line ) || line.startsWith( "#" ) ) { continue; } - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 // # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord // # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target // #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- --------------------- @@ -356,12 +403,14 @@ public final class HmmscanPerDomainTableParser { final int env_to = parseInt( tokens[ 20 ], line_number, "env to" ); ++_domains_encountered; if ( !query.equals( prev_query ) || ( qlen != prev_qlen ) ) { - if ( query.equals( prev_query ) ) { - throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen + ", " - + prev_qlen ); - } - if ( prev_queries.contains( query ) ) { - throw new IOException( "more than one protein named [" + query + "]" ); + if ( !isAllowProteinsWithSameName() ) { + if ( query.equals( prev_query ) ) { + throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen + + ", " + prev_qlen ); + } + if ( prev_queries.contains( query ) ) { + throw new IOException( "more than one protein named [" + query + "]" ); + } } prev_query = query; prev_qlen = qlen; @@ -370,7 +419,7 @@ public final class HmmscanPerDomainTableParser { addProtein( proteins, current_protein ); } if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) { - current_protein = new BasicProtein( query, getSpecies() ); + current_protein = new BasicProtein( query, getSpecies(), qlen ); } else { throw new IllegalArgumentException( "unknown return type" ); @@ -393,7 +442,7 @@ public final class HmmscanPerDomainTableParser { } else { throw new IOException( "could not find a score cutoff value for domain id \"" + target_id - + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" ); + + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" ); } } final String uc_id = target_id.toUpperCase(); @@ -403,13 +452,23 @@ public final class HmmscanPerDomainTableParser { else if ( ali_from == ali_to ) { //Ignore } - else if ( ( getEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT ) - && ( fs_e_value > getEValueMaximum() ) ) { - ++_domains_ignored_due_to_e_value; + else if ( ( getFsEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT ) + && ( fs_e_value > getFsEValueMaximum() ) ) { + ++_domains_ignored_due_to_fs_e_value; + } + else if ( ( getIEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT ) + && ( i_e_value > getIEValueMaximum() ) ) { + ++_domains_ignored_due_to_i_e_value; } else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) { ++_domains_ignored_due_to_duf; } + else if ( IGNORE_REPLACED_RRMS + && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id + .contains( "RRM_6" ) ) ) { + } + else if ( IGNORE_hGDE_amylase && ( uc_id.equals( "hGDE_amylase" ) ) ) { + } else if ( isIgnoreVirusLikeIds() && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO ) || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG ) @@ -417,8 +476,7 @@ public final class HmmscanPerDomainTableParser { ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), target_id ); ++_domains_ignored_due_to_virus_like_id; } - else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) - && getFilter().contains( new DomainId( target_id ) ) ) { + else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) && getFilter().contains( target_id ) ) { ++_domains_ignored_due_to_negative_domain_filter; ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), target_id ); } @@ -429,8 +487,6 @@ public final class HmmscanPerDomainTableParser { ali_to, ( short ) domain_number, ( short ) total_domains, - fs_e_value, - fs_score, i_e_value, domain_score ); current_protein.addProteinDomain( pd ); @@ -457,7 +513,7 @@ public final class HmmscanPerDomainTableParser { } catch ( final NumberFormatException e ) { throw new IOException( "could not parse \" +label + \" from \"" + double_str + "\" [line " + line_number - + "] in [" + getInputFile().getCanonicalPath() + "]" ); + + "] in [" + getInputFile().getCanonicalPath() + "]" ); } return d; } @@ -469,7 +525,7 @@ public final class HmmscanPerDomainTableParser { } catch ( final NumberFormatException e ) { throw new IOException( "could not parse \"" + label + "\" from \"" + double_str + "\" [line " + line_number - + "] in [" + getInputFile().getCanonicalPath() + "]" ); + + "] in [" + getInputFile().getCanonicalPath() + "]" ); } return i; } @@ -482,11 +538,15 @@ public final class HmmscanPerDomainTableParser { _domains_ignored_due_to_duf = domains_ignored_due_to_duf; } - public void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) { - _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value; + private void setDomainsIgnoredDueToFsEval( final int domains_ignored_due_to_fs_e_value ) { + _domains_ignored_due_to_fs_e_value = domains_ignored_due_to_fs_e_value; + } + + private void setDomainsIgnoredDueToIEval( final int domains_ignored_due_to_i_e_value ) { + _domains_ignored_due_to_i_e_value = domains_ignored_due_to_i_e_value; } - public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { + private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff; } @@ -514,15 +574,22 @@ public final class HmmscanPerDomainTableParser { _domains_stored = domains_stored; } - private void setDomainsStoredSet( final SortedSet _storeddomains_stored ) { + private void setDomainsStoredSet( final SortedSet _storeddomains_stored ) { _domains_stored_set = _storeddomains_stored; } - public void setEValueMaximum( final double e_value_maximum ) { - if ( e_value_maximum < 0.0 ) { + public void setFsEValueMaximum( final double fs_e_value_maximum ) { + if ( fs_e_value_maximum < 0.0 ) { + throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" ); + } + _fs_e_value_maximum = fs_e_value_maximum; + } + + public void setIEValueMaximum( final double i_e_value_maximum ) { + if ( i_e_value_maximum < 0.0 ) { throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" ); } - _e_value_maximum = e_value_maximum; + _i_e_value_maximum = i_e_value_maximum; } public void setIgnoreDufs( final boolean ignore_dufs ) { @@ -532,8 +599,8 @@ public final class HmmscanPerDomainTableParser { /** * To ignore domains which are completely engulfed by domains (individual * ones or stretches of overlapping ones) with better support values. - * - * + * + * * @param ignored_engulfed_domains */ public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) { @@ -547,7 +614,7 @@ public final class HmmscanPerDomainTableParser { /** * Sets the individual score cutoff values (for example, gathering * thresholds from Pfam). Domain ids are the keys, cutoffs the values. - * + * * @param individual_score_cutoffs */ public void setIndividualScoreCutoffs( final Map individual_score_cutoffs ) {