X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2FHmmscanPerDomainTableParser.java;h=fbaf157f7318653061683235567ac65d51e1ab91;hb=c1fe7fa5cf07889a74cb6de92cd9a4f39e5687db;hp=ac6a2bcc0bb9638ceb8e7fa7468b756cb9451f7d;hpb=eee996a6476a1e3d84c07f8f690dcde3ff4b2ef5;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java index ac6a2bc..fbaf157 100644 --- a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java +++ b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java @@ -23,7 +23,7 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.io.parsers; @@ -41,12 +41,10 @@ import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; -import org.forester.surfacing.BasicDomain; -import org.forester.surfacing.BasicProtein; -import org.forester.surfacing.Domain; -import org.forester.surfacing.DomainId; -import org.forester.surfacing.Protein; -import org.forester.surfacing.SurfacingUtil; +import org.forester.protein.BasicDomain; +import org.forester.protein.BasicProtein; +import org.forester.protein.Domain; +import org.forester.protein.Protein; import org.forester.util.ForesterUtil; public final class HmmscanPerDomainTableParser { @@ -61,14 +59,17 @@ public final class HmmscanPerDomainTableParser { private static final String HERPES = "HERPES_"; private static final String BACULO = "BACULO_"; private static final int E_VALUE_MAXIMUM_DEFAULT = -1; + private static final int LENGTH_RATIO_CUTOFF_DEFAULT = -1; private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN; private static final boolean IGNORE_DUFS_DEFAULT = false; private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1; - private final Set _filter; + private final Set _filter; private final FilterType _filter_type; private final File _input_file; private final String _species; - private double _e_value_maximum; + private double _fs_e_value_maximum; + private double _i_e_value_maximum; + private double _rel_env_length_ratio_cutoff; private Map _individual_score_cutoffs; private boolean _ignore_dufs; private boolean _ignore_virus_like_ids; @@ -81,16 +82,19 @@ public final class HmmscanPerDomainTableParser { private int _domains_encountered; private int _domains_ignored_due_to_duf; private int _domains_ignored_due_to_overlap; - private int _domains_ignored_due_to_e_value; + private int _domains_ignored_due_to_fs_e_value; + private int _domains_ignored_due_to_i_e_value; + private int _domains_ignored_due_to_rel_env_length_ratio_cutoff; private int _domains_ignored_due_to_individual_score_cutoff; private int _domains_stored; - private SortedSet _domains_stored_set; + private SortedSet _domains_stored_set; private long _time; private int _domains_ignored_due_to_negative_domain_filter; private Map _domains_ignored_due_to_negative_domain_filter_counts_map; private int _domains_ignored_due_to_virus_like_id; private Map _domains_ignored_due_to_virus_like_id_counts_map; private final INDIVIDUAL_SCORE_CUTOFF _ind_cutoff; + private final boolean _allow_proteins_with_same_name; public HmmscanPerDomainTableParser( final File input_file, final String species, @@ -100,12 +104,26 @@ public final class HmmscanPerDomainTableParser { _filter = null; _filter_type = FilterType.NONE; _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = false; init(); } public HmmscanPerDomainTableParser( final File input_file, final String species, - final Set filter, + final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to, + final boolean allow_proteins_with_same_name ) { + _input_file = input_file; + _species = species; + _filter = null; + _filter_type = FilterType.NONE; + _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = allow_proteins_with_same_name; + init(); + } + + public HmmscanPerDomainTableParser( final File input_file, + final String species, + final Set filter, final FilterType filter_type, final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) { _input_file = input_file; @@ -113,9 +131,29 @@ public final class HmmscanPerDomainTableParser { _filter = filter; _filter_type = filter_type; _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = false; + init(); + } + + public HmmscanPerDomainTableParser( final File input_file, + final String species, + final Set filter, + final FilterType filter_type, + final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to, + final boolean allow_proteins_with_same_name ) { + _input_file = input_file; + _species = species; + _filter = filter; + _filter_type = filter_type; + _ind_cutoff = individual_cutoff_applies_to; + _allow_proteins_with_same_name = allow_proteins_with_same_name; init(); } + public boolean isAllowProteinsWithSameName() { + return _allow_proteins_with_same_name; + } + private void actuallyAddProtein( final List proteins, final Protein current_protein ) { final List l = current_protein.getProteinDomains(); for( final Domain d : l ) { @@ -129,15 +167,16 @@ public final class HmmscanPerDomainTableParser { if ( ( getMaxAllowedOverlap() != HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT ) || isIgnoreEngulfedDomains() ) { final int domains_count = current_protein.getNumberOfProteinDomains(); - current_protein = SurfacingUtil.removeOverlappingDomains( getMaxAllowedOverlap(), - isIgnoreEngulfedDomains(), - current_protein ); + current_protein = ForesterUtil.removeOverlappingDomains( getMaxAllowedOverlap(), + isIgnoreEngulfedDomains(), + current_protein ); final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains(); _domains_stored -= domains_removed; _domains_ignored_due_to_overlap += domains_removed; } - if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) { - final Set domain_ids_in_protein = new HashSet(); + if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) + || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) { + final Set domain_ids_in_protein = new HashSet(); for( final Domain d : current_protein.getProteinDomains() ) { domain_ids_in_protein.add( d.getDomainId() ); } @@ -172,8 +211,18 @@ public final class HmmscanPerDomainTableParser { return _domains_ignored_due_to_duf; } - public int getDomainsIgnoredDueToEval() { - return _domains_ignored_due_to_e_value; + public int getDomainsIgnoredDueToIEval() { + return _domains_ignored_due_to_i_e_value; + } + + public int getDomainsIgnoredDueToRelEnvLengthRatioCutoff() { + return _domains_ignored_due_to_rel_env_length_ratio_cutoff; + } + + + + public int getDomainsIgnoredDueToFsEval() { + return _domains_ignored_due_to_fs_e_value; } public int getDomainsIgnoredDueToIndividualScoreCutoff() { @@ -204,15 +253,23 @@ public final class HmmscanPerDomainTableParser { return _domains_stored; } - public SortedSet getDomainsStoredSet() { + public SortedSet getDomainsStoredSet() { return _domains_stored_set; } - private double getEValueMaximum() { - return _e_value_maximum; + private double getFsEValueMaximum() { + return _fs_e_value_maximum; } - private Set getFilter() { + private double getIEValueMaximum() { + return _i_e_value_maximum; + } + + private double getRelEnvLengthRatioCutoff() { + return _rel_env_length_ratio_cutoff; + } + + private Set getFilter() { return _filter; } @@ -261,7 +318,9 @@ public final class HmmscanPerDomainTableParser { } private void init() { - _e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT; + _fs_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT; + _i_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT; + _rel_env_length_ratio_cutoff = LENGTH_RATIO_CUTOFF_DEFAULT; setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT ); setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT ); _max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT; @@ -272,13 +331,15 @@ public final class HmmscanPerDomainTableParser { } private void intitCounts() { - setDomainsStoredSet( new TreeSet() ); + setDomainsStoredSet( new TreeSet() ); setDomainsEncountered( 0 ); setProteinsEncountered( 0 ); setProteinsIgnoredDueToFilter( 0 ); setDomainsIgnoredDueToNegativeFilter( 0 ); setDomainsIgnoredDueToDuf( 0 ); - setDomainsIgnoredDueToEval( 0 ); + setDomainsIgnoredDueToFsEval( 0 ); + setDomainsIgnoredDueToIEval( 0 ); + setDomainsIgnoredDueToRelEnvLengthRatioCutoff( 0 ); setDomainsIgnoredDueToIndividualScoreCutoff( 0 ); setDomainsIgnoredDueToVirusLikeId( 0 ); setDomainsIgnoredDueToOverlap( 0 ); @@ -325,7 +386,7 @@ public final class HmmscanPerDomainTableParser { if ( ForesterUtil.isEmpty( line ) || line.startsWith( "#" ) ) { continue; } - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 // # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord // # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target // #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- --------------------- @@ -356,21 +417,27 @@ public final class HmmscanPerDomainTableParser { final int env_to = parseInt( tokens[ 20 ], line_number, "env to" ); ++_domains_encountered; if ( !query.equals( prev_query ) || ( qlen != prev_qlen ) ) { - if ( query.equals( prev_query ) ) { - throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen + ", " - + prev_qlen ); - } - if ( prev_queries.contains( query ) ) { - throw new IOException( "more than one protein named [" + query + "]" ); + if ( !isAllowProteinsWithSameName() ) { + if ( query.equals( prev_query ) ) { + throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen + + ", " + prev_qlen ); + } + if ( prev_queries.contains( query ) ) { + throw new IOException( "more than one protein named [" + query + "]" ); + } } + final String fail_query = prev_query; //TODO prev_query = query; prev_qlen = qlen; prev_queries.add( query ); if ( ( current_protein != null ) && ( current_protein.getProteinDomains().size() > 0 ) ) { addProtein( proteins, current_protein ); } + else { + System.out.println(fail_query ); //TODO + } if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) { - current_protein = new BasicProtein( query, getSpecies() ); + current_protein = new BasicProtein( query, getSpecies(), qlen ); } else { throw new IllegalArgumentException( "unknown return type" ); @@ -397,16 +464,27 @@ public final class HmmscanPerDomainTableParser { } } final String uc_id = target_id.toUpperCase(); + final int env_length = 1 + env_to - env_from; if ( failed_cutoff ) { ++_domains_ignored_due_to_individual_score_cutoff; } else if ( ali_from == ali_to ) { //Ignore } - else if ( ( getEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT ) - && ( fs_e_value > getEValueMaximum() ) ) { - ++_domains_ignored_due_to_e_value; + else if ( ( getFsEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT ) + && ( fs_e_value > getFsEValueMaximum() ) ) { + ++_domains_ignored_due_to_fs_e_value; } + else if ( ( getIEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT ) + && ( i_e_value > getIEValueMaximum() ) ) { + ++_domains_ignored_due_to_i_e_value; + } + // + else if ( ( getRelEnvLengthRatioCutoff() > 0.0 ) + && ( env_length < ( getRelEnvLengthRatioCutoff() * tlen) ) ) { + ++_domains_ignored_due_to_rel_env_length_ratio_cutoff; + } + // else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) { ++_domains_ignored_due_to_duf; } @@ -417,8 +495,7 @@ public final class HmmscanPerDomainTableParser { ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), target_id ); ++_domains_ignored_due_to_virus_like_id; } - else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) - && getFilter().contains( new DomainId( target_id ) ) ) { + else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) && getFilter().contains( target_id ) ) { ++_domains_ignored_due_to_negative_domain_filter; ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), target_id ); } @@ -429,10 +506,11 @@ public final class HmmscanPerDomainTableParser { ali_to, ( short ) domain_number, ( short ) total_domains, - fs_e_value, - fs_score, i_e_value, - domain_score ); + domain_score, + ( short ) tlen, + ( short ) hmm_from, + ( short ) hmm_to ); current_protein.addProteinDomain( pd ); } catch ( final IllegalArgumentException e ) { @@ -450,7 +528,8 @@ public final class HmmscanPerDomainTableParser { return proteins; } - private double parseDouble( final String double_str, final int line_number, final String label ) throws IOException { + private double parseDouble( final String double_str, final int line_number, final String label ) + throws IOException { double d = -1; try { d = Double.valueOf( double_str ).doubleValue(); @@ -482,11 +561,21 @@ public final class HmmscanPerDomainTableParser { _domains_ignored_due_to_duf = domains_ignored_due_to_duf; } - public void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) { - _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value; + private void setDomainsIgnoredDueToFsEval( final int domains_ignored_due_to_fs_e_value ) { + _domains_ignored_due_to_fs_e_value = domains_ignored_due_to_fs_e_value; + } + + private void setDomainsIgnoredDueToIEval( final int domains_ignored_due_to_i_e_value ) { + _domains_ignored_due_to_i_e_value = domains_ignored_due_to_i_e_value; + } + + private void setDomainsIgnoredDueToRelEnvLengthRatioCutoff( final int domains_ignored_due_to_rel_env_length_ratio_cutoff ) { + _domains_ignored_due_to_rel_env_length_ratio_cutoff = domains_ignored_due_to_rel_env_length_ratio_cutoff; } - public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { + + + private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff; } @@ -514,15 +603,29 @@ public final class HmmscanPerDomainTableParser { _domains_stored = domains_stored; } - private void setDomainsStoredSet( final SortedSet _storeddomains_stored ) { + private void setDomainsStoredSet( final SortedSet _storeddomains_stored ) { _domains_stored_set = _storeddomains_stored; } - public void setEValueMaximum( final double e_value_maximum ) { - if ( e_value_maximum < 0.0 ) { + public void setFsEValueMaximum( final double fs_e_value_maximum ) { + if ( fs_e_value_maximum < 0.0 ) { throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" ); } - _e_value_maximum = e_value_maximum; + _fs_e_value_maximum = fs_e_value_maximum; + } + + public void setIEValueMaximum( final double i_e_value_maximum ) { + if ( i_e_value_maximum < 0.0 ) { + throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" ); + } + _i_e_value_maximum = i_e_value_maximum; + } + + public void setRelEnvLengthRatioCutoff( final double rel_env_length_ratio_cutoff ) { + if ( rel_env_length_ratio_cutoff <= 0.0 ) { + throw new IllegalArgumentException( "attempt to set rel env length ratio cutoff to zero or a negative value" ); + } + _rel_env_length_ratio_cutoff = rel_env_length_ratio_cutoff; } public void setIgnoreDufs( final boolean ignore_dufs ) { @@ -532,8 +635,8 @@ public final class HmmscanPerDomainTableParser { /** * To ignore domains which are completely engulfed by domains (individual * ones or stretches of overlapping ones) with better support values. - * - * + * + * * @param ignored_engulfed_domains */ public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) { @@ -547,7 +650,7 @@ public final class HmmscanPerDomainTableParser { /** * Sets the individual score cutoff values (for example, gathering * thresholds from Pfam). Domain ids are the keys, cutoffs the values. - * + * * @param individual_score_cutoffs */ public void setIndividualScoreCutoffs( final Map individual_score_cutoffs ) { @@ -582,14 +685,19 @@ public final class HmmscanPerDomainTableParser { } public static enum FilterType { - NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN + NONE, + POSITIVE_PROTEIN, + NEGATIVE_PROTEIN, + NEGATIVE_DOMAIN } static public enum INDIVIDUAL_SCORE_CUTOFF { - FULL_SEQUENCE, DOMAIN, NONE; + FULL_SEQUENCE, + DOMAIN, + NONE; } public static enum ReturnType { - UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN + UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN } }