// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
package org.forester.io.parsers;
import org.forester.protein.BasicDomain;
import org.forester.protein.BasicProtein;
import org.forester.protein.Domain;
-import org.forester.protein.DomainId;
import org.forester.protein.Protein;
-import org.forester.surfacing.SurfacingUtil;
import org.forester.util.ForesterUtil;
public final class HmmscanPerDomainTableParser {
private static final String HERPES = "HERPES_";
private static final String BACULO = "BACULO_";
private static final int E_VALUE_MAXIMUM_DEFAULT = -1;
+ private static final int LENGTH_RATIO_CUTOFF_DEFAULT = -1;
private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
private static final boolean IGNORE_DUFS_DEFAULT = false;
private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1;
- private static final boolean IGNORE_REPLACED_RRMS = true;
- private final Set<DomainId> _filter;
+ private final Set<String> _filter;
private final FilterType _filter_type;
private final File _input_file;
private final String _species;
- private double _e_value_maximum;
+ private double _fs_e_value_maximum;
+ private double _i_e_value_maximum;
+ private double _rel_env_length_ratio_cutoff;
private Map<String, Double> _individual_score_cutoffs;
private boolean _ignore_dufs;
private boolean _ignore_virus_like_ids;
private int _domains_encountered;
private int _domains_ignored_due_to_duf;
private int _domains_ignored_due_to_overlap;
- private int _domains_ignored_due_to_e_value;
+ private int _domains_ignored_due_to_fs_e_value;
+ private int _domains_ignored_due_to_i_e_value;
+ private int _domains_ignored_due_to_rel_env_length_ratio_cutoff;
private int _domains_ignored_due_to_individual_score_cutoff;
private int _domains_stored;
- private SortedSet<DomainId> _domains_stored_set;
+ private SortedSet<String> _domains_stored_set;
private long _time;
private int _domains_ignored_due_to_negative_domain_filter;
private Map<String, Integer> _domains_ignored_due_to_negative_domain_filter_counts_map;
public HmmscanPerDomainTableParser( final File input_file,
final String species,
- final Set<DomainId> filter,
+ final Set<String> filter,
final FilterType filter_type,
final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) {
_input_file = input_file;
public HmmscanPerDomainTableParser( final File input_file,
final String species,
- final Set<DomainId> filter,
+ final Set<String> filter,
final FilterType filter_type,
final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
final boolean allow_proteins_with_same_name ) {
if ( ( getMaxAllowedOverlap() != HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT )
|| isIgnoreEngulfedDomains() ) {
final int domains_count = current_protein.getNumberOfProteinDomains();
- current_protein = SurfacingUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
- isIgnoreEngulfedDomains(),
- current_protein );
+ current_protein = ForesterUtil.removeOverlappingDomains( getMaxAllowedOverlap(),
+ isIgnoreEngulfedDomains(),
+ current_protein );
final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains();
_domains_stored -= domains_removed;
_domains_ignored_due_to_overlap += domains_removed;
}
- if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
- final Set<DomainId> domain_ids_in_protein = new HashSet<DomainId>();
+ if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN )
+ || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
+ final Set<String> domain_ids_in_protein = new HashSet<String>();
for( final Domain d : current_protein.getProteinDomains() ) {
domain_ids_in_protein.add( d.getDomainId() );
}
return _domains_ignored_due_to_duf;
}
- public int getDomainsIgnoredDueToEval() {
- return _domains_ignored_due_to_e_value;
+ public int getDomainsIgnoredDueToIEval() {
+ return _domains_ignored_due_to_i_e_value;
+ }
+
+ public int getDomainsIgnoredDueToRelEnvLengthRatioCutoff() {
+ return _domains_ignored_due_to_rel_env_length_ratio_cutoff;
+ }
+
+
+
+ public int getDomainsIgnoredDueToFsEval() {
+ return _domains_ignored_due_to_fs_e_value;
}
public int getDomainsIgnoredDueToIndividualScoreCutoff() {
return _domains_stored;
}
- public SortedSet<DomainId> getDomainsStoredSet() {
+ public SortedSet<String> getDomainsStoredSet() {
return _domains_stored_set;
}
- private double getEValueMaximum() {
- return _e_value_maximum;
+ private double getFsEValueMaximum() {
+ return _fs_e_value_maximum;
+ }
+
+ private double getIEValueMaximum() {
+ return _i_e_value_maximum;
+ }
+
+ private double getRelEnvLengthRatioCutoff() {
+ return _rel_env_length_ratio_cutoff;
}
- private Set<DomainId> getFilter() {
+ private Set<String> getFilter() {
return _filter;
}
}
private void init() {
- _e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT;
+ _fs_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT;
+ _i_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT;
+ _rel_env_length_ratio_cutoff = LENGTH_RATIO_CUTOFF_DEFAULT;
setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT );
setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT );
_max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT;
}
private void intitCounts() {
- setDomainsStoredSet( new TreeSet<DomainId>() );
+ setDomainsStoredSet( new TreeSet<String>() );
setDomainsEncountered( 0 );
setProteinsEncountered( 0 );
setProteinsIgnoredDueToFilter( 0 );
setDomainsIgnoredDueToNegativeFilter( 0 );
setDomainsIgnoredDueToDuf( 0 );
- setDomainsIgnoredDueToEval( 0 );
+ setDomainsIgnoredDueToFsEval( 0 );
+ setDomainsIgnoredDueToIEval( 0 );
+ setDomainsIgnoredDueToRelEnvLengthRatioCutoff( 0 );
setDomainsIgnoredDueToIndividualScoreCutoff( 0 );
setDomainsIgnoredDueToVirusLikeId( 0 );
setDomainsIgnoredDueToOverlap( 0 );
if ( ForesterUtil.isEmpty( line ) || line.startsWith( "#" ) ) {
continue;
}
- // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
// # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord
// # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target
// #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------
throw new IOException( "more than one protein named [" + query + "]" );
}
}
+ final String fail_query = prev_query; //TODO
prev_query = query;
prev_qlen = qlen;
prev_queries.add( query );
if ( ( current_protein != null ) && ( current_protein.getProteinDomains().size() > 0 ) ) {
addProtein( proteins, current_protein );
}
+ else {
+ System.out.println(fail_query ); //TODO
+ }
if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
current_protein = new BasicProtein( query, getSpecies(), qlen );
}
}
}
final String uc_id = target_id.toUpperCase();
+ final int env_length = 1 + env_to - env_from;
if ( failed_cutoff ) {
++_domains_ignored_due_to_individual_score_cutoff;
}
else if ( ali_from == ali_to ) {
//Ignore
}
- else if ( ( getEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
- && ( fs_e_value > getEValueMaximum() ) ) {
- ++_domains_ignored_due_to_e_value;
+ else if ( ( getFsEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
+ && ( fs_e_value > getFsEValueMaximum() ) ) {
+ ++_domains_ignored_due_to_fs_e_value;
+ }
+ else if ( ( getIEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
+ && ( i_e_value > getIEValueMaximum() ) ) {
+ ++_domains_ignored_due_to_i_e_value;
}
+ //
+ else if ( ( getRelEnvLengthRatioCutoff() > 0.0 )
+ && ( env_length < ( getRelEnvLengthRatioCutoff() * tlen) ) ) {
+ ++_domains_ignored_due_to_rel_env_length_ratio_cutoff;
+ }
+ //
else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
++_domains_ignored_due_to_duf;
}
- else if ( IGNORE_REPLACED_RRMS
- && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id
- .contains( "RRM_6" ) ) ) {
- }
else if ( isIgnoreVirusLikeIds()
&& ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
|| uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )
ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), target_id );
++_domains_ignored_due_to_virus_like_id;
}
- else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN )
- && getFilter().contains( new DomainId( target_id ) ) ) {
+ else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) && getFilter().contains( target_id ) ) {
++_domains_ignored_due_to_negative_domain_filter;
ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), target_id );
}
ali_to,
( short ) domain_number,
( short ) total_domains,
- fs_e_value,
- fs_score,
i_e_value,
- domain_score );
+ domain_score,
+ ( short ) tlen,
+ ( short ) hmm_from,
+ ( short ) hmm_to );
current_protein.addProteinDomain( pd );
}
catch ( final IllegalArgumentException e ) {
return proteins;
}
- private double parseDouble( final String double_str, final int line_number, final String label ) throws IOException {
+ private double parseDouble( final String double_str, final int line_number, final String label )
+ throws IOException {
double d = -1;
try {
d = Double.valueOf( double_str ).doubleValue();
_domains_ignored_due_to_duf = domains_ignored_due_to_duf;
}
- private void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) {
- _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value;
+ private void setDomainsIgnoredDueToFsEval( final int domains_ignored_due_to_fs_e_value ) {
+ _domains_ignored_due_to_fs_e_value = domains_ignored_due_to_fs_e_value;
}
+ private void setDomainsIgnoredDueToIEval( final int domains_ignored_due_to_i_e_value ) {
+ _domains_ignored_due_to_i_e_value = domains_ignored_due_to_i_e_value;
+ }
+
+ private void setDomainsIgnoredDueToRelEnvLengthRatioCutoff( final int domains_ignored_due_to_rel_env_length_ratio_cutoff ) {
+ _domains_ignored_due_to_rel_env_length_ratio_cutoff = domains_ignored_due_to_rel_env_length_ratio_cutoff;
+ }
+
+
+
private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
_domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
}
_domains_stored = domains_stored;
}
- private void setDomainsStoredSet( final SortedSet<DomainId> _storeddomains_stored ) {
+ private void setDomainsStoredSet( final SortedSet<String> _storeddomains_stored ) {
_domains_stored_set = _storeddomains_stored;
}
- public void setEValueMaximum( final double e_value_maximum ) {
- if ( e_value_maximum < 0.0 ) {
+ public void setFsEValueMaximum( final double fs_e_value_maximum ) {
+ if ( fs_e_value_maximum < 0.0 ) {
throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
}
- _e_value_maximum = e_value_maximum;
+ _fs_e_value_maximum = fs_e_value_maximum;
+ }
+
+ public void setIEValueMaximum( final double i_e_value_maximum ) {
+ if ( i_e_value_maximum < 0.0 ) {
+ throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
+ }
+ _i_e_value_maximum = i_e_value_maximum;
+ }
+
+ public void setRelEnvLengthRatioCutoff( final double rel_env_length_ratio_cutoff ) {
+ if ( rel_env_length_ratio_cutoff <= 0.0 ) {
+ throw new IllegalArgumentException( "attempt to set rel env length ratio cutoff to zero or a negative value" );
+ }
+ _rel_env_length_ratio_cutoff = rel_env_length_ratio_cutoff;
}
public void setIgnoreDufs( final boolean ignore_dufs ) {
/**
* To ignore domains which are completely engulfed by domains (individual
* ones or stretches of overlapping ones) with better support values.
- *
- *
+ *
+ *
* @param ignored_engulfed_domains
*/
public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) {
/**
* Sets the individual score cutoff values (for example, gathering
* thresholds from Pfam). Domain ids are the keys, cutoffs the values.
- *
+ *
* @param individual_score_cutoffs
*/
public void setIndividualScoreCutoffs( final Map<String, Double> individual_score_cutoffs ) {
}
public static enum FilterType {
- NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN
+ NONE,
+ POSITIVE_PROTEIN,
+ NEGATIVE_PROTEIN,
+ NEGATIVE_DOMAIN
}
static public enum INDIVIDUAL_SCORE_CUTOFF {
- FULL_SEQUENCE, DOMAIN, NONE;
+ FULL_SEQUENCE,
+ DOMAIN,
+ NONE;
}
public static enum ReturnType {
- UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
+ UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
}
}