// Id fragments and default settings of the parser; init() assigns the *_DEFAULT values below.
// NOTE(review): HERPES and BACULO are presumably virus-like id markers; their use is not visible in this excerpt.
private static final String HERPES = "HERPES_";
private static final String BACULO = "BACULO_";
// Default for both E-value maxima. Parsing applies an E-value filter only when the
// configured maximum differs from this value, so the default leaves both filters off.
private static final int E_VALUE_MAXIMUM_DEFAULT = -1;
// Default for the relative envelope-length ratio cutoff. The length filter only runs
// for cutoffs greater than 0.0, so the default leaves it off.
+ private static final int LENGTH_RATIO_CUTOFF_DEFAULT = -1;
private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
private static final boolean IGNORE_DUFS_DEFAULT = false;
private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1;
- private static final boolean IGNORE_REPLACED_RRMS = false;
- private static final boolean IGNORE_hGDE_amylase = true; //TODO eventually remove me, added 10/22/13
// Parser configuration and running statistics.
private final Set<String> _filter;
private final FilterType _filter_type;
private final File _input_file;
private final String _species;
- private double _e_value_maximum;
// Upper bound for a domain's fs_e_value (presumably the full-sequence E-value column - confirm);
// only enforced when it differs from E_VALUE_MAXIMUM_DEFAULT.
+ private double _fs_e_value_maximum;
// Upper bound for a domain's i_e_value; only enforced when it differs from E_VALUE_MAXIMUM_DEFAULT.
+ private double _i_e_value_maximum;
// When greater than 0.0, domains whose envelope length (1 + env_to - env_from) is below
// this fraction of tlen are ignored.
+ private double _rel_env_length_ratio_cutoff;
private Map<String, Double> _individual_score_cutoffs;
private boolean _ignore_dufs;
private boolean _ignore_virus_like_ids;
private int _domains_encountered;
private int _domains_ignored_due_to_duf;
private int _domains_ignored_due_to_overlap;
- private int _domains_ignored_due_to_e_value;
// Number of domains skipped by the fs E-value filter.
+ private int _domains_ignored_due_to_fs_e_value;
// Number of domains skipped by the i-Evalue filter.
+ private int _domains_ignored_due_to_i_e_value;
// Number of domains skipped by the relative envelope-length filter.
+ private int _domains_ignored_due_to_rel_env_length_ratio_cutoff;
private int _domains_ignored_due_to_individual_score_cutoff;
private int _domains_stored;
private SortedSet<String> _domains_stored_set;
_domains_stored -= domains_removed;
_domains_ignored_due_to_overlap += domains_removed;
}
- if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
+ if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN )
+ || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
final Set<String> domain_ids_in_protein = new HashSet<String>();
for( final Domain d : current_protein.getProteinDomains() ) {
domain_ids_in_protein.add( d.getDomainId() );
return _domains_ignored_due_to_duf;
}
- public int getDomainsIgnoredDueToEval() {
- return _domains_ignored_due_to_e_value;
/**
 * Returns the number of domains skipped because their i_e_value was greater
 * than the configured i-Evalue maximum. A domain is only counted here if it
 * was not already rejected by an earlier check in the parser's filter chain.
 *
 * @return the i-Evalue rejection counter
 */
+ public int getDomainsIgnoredDueToIEval() {
+ return _domains_ignored_due_to_i_e_value;
+ }
+
/**
 * Returns the number of domains skipped because their envelope length
 * (1 + env_to - env_from) was smaller than the relative length ratio cutoff
 * multiplied by tlen.
 *
 * @return the relative envelope-length rejection counter
 */
+ public int getDomainsIgnoredDueToRelEnvLengthRatioCutoff() {
+ return _domains_ignored_due_to_rel_env_length_ratio_cutoff;
+ }
+
+
+
/**
 * Returns the number of domains skipped because their fs_e_value was greater
 * than the configured fs E-value maximum.
 *
 * @return the fs E-value rejection counter
 */
+ public int getDomainsIgnoredDueToFsEval() {
+ return _domains_ignored_due_to_fs_e_value;
}
public int getDomainsIgnoredDueToIndividualScoreCutoff() {
return _domains_stored_set;
}
- private double getEValueMaximum() {
- return _e_value_maximum;
/**
 * Returns the fs E-value maximum. The parser skips the fs E-value filter
 * while this still equals E_VALUE_MAXIMUM_DEFAULT.
 */
+ private double getFsEValueMaximum() {
+ return _fs_e_value_maximum;
+ }
+
/**
 * Returns the i-Evalue maximum. The parser skips the i-Evalue filter
 * while this still equals E_VALUE_MAXIMUM_DEFAULT.
 */
+ private double getIEValueMaximum() {
+ return _i_e_value_maximum;
+ }
+
/**
 * Returns the relative envelope-length ratio cutoff. The parser applies the
 * length filter only when this value is greater than 0.0.
 */
+ private double getRelEnvLengthRatioCutoff() {
+ return _rel_env_length_ratio_cutoff;
}
private Set<String> getFilter() {
}
private void init() {
- _e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT;
+ _fs_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT;
+ _i_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT;
+ _rel_env_length_ratio_cutoff = LENGTH_RATIO_CUTOFF_DEFAULT;
setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT );
setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT );
_max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT;
setProteinsIgnoredDueToFilter( 0 );
setDomainsIgnoredDueToNegativeFilter( 0 );
setDomainsIgnoredDueToDuf( 0 );
- setDomainsIgnoredDueToEval( 0 );
+ setDomainsIgnoredDueToFsEval( 0 );
+ setDomainsIgnoredDueToIEval( 0 );
+ setDomainsIgnoredDueToRelEnvLengthRatioCutoff( 0 );
setDomainsIgnoredDueToIndividualScoreCutoff( 0 );
setDomainsIgnoredDueToVirusLikeId( 0 );
setDomainsIgnoredDueToOverlap( 0 );
if ( ForesterUtil.isEmpty( line ) || line.startsWith( "#" ) ) {
continue;
}
- // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
// # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord
// # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target
// #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------
throw new IOException( "more than one protein named [" + query + "]" );
}
}
+ final String fail_query = prev_query; //TODO
prev_query = query;
prev_qlen = qlen;
prev_queries.add( query );
if ( ( current_protein != null ) && ( current_protein.getProteinDomains().size() > 0 ) ) {
addProtein( proteins, current_protein );
}
+ else {
+ System.out.println(fail_query ); //TODO
+ }
if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
current_protein = new BasicProtein( query, getSpecies(), qlen );
}
}
}
final String uc_id = target_id.toUpperCase();
+ final int env_length = 1 + env_to - env_from;
if ( failed_cutoff ) {
++_domains_ignored_due_to_individual_score_cutoff;
}
else if ( ali_from == ali_to ) {
//Ignore
}
- else if ( ( getEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
- && ( fs_e_value > getEValueMaximum() ) ) {
- ++_domains_ignored_due_to_e_value;
+ else if ( ( getFsEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
+ && ( fs_e_value > getFsEValueMaximum() ) ) {
+ ++_domains_ignored_due_to_fs_e_value;
}
- else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
- ++_domains_ignored_due_to_duf;
+ else if ( ( getIEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT )
+ && ( i_e_value > getIEValueMaximum() ) ) {
+ ++_domains_ignored_due_to_i_e_value;
}
- else if ( IGNORE_REPLACED_RRMS
- && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id
- .contains( "RRM_6" ) ) ) {
+ //
+ else if ( ( getRelEnvLengthRatioCutoff() > 0.0 )
+ && ( env_length < ( getRelEnvLengthRatioCutoff() * tlen) ) ) {
+ ++_domains_ignored_due_to_rel_env_length_ratio_cutoff;
}
- else if ( IGNORE_hGDE_amylase && ( uc_id.equals( "hGDE_amylase" ) ) ) {
+ //
+ else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
+ ++_domains_ignored_due_to_duf;
}
else if ( isIgnoreVirusLikeIds()
&& ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
ali_to,
( short ) domain_number,
( short ) total_domains,
- fs_e_value,
- fs_score,
i_e_value,
- domain_score );
+ domain_score,
+ ( short ) tlen,
+ ( short ) hmm_from,
+ ( short ) hmm_to );
current_protein.addProteinDomain( pd );
}
catch ( final IllegalArgumentException e ) {
return proteins;
}
- private double parseDouble( final String double_str, final int line_number, final String label ) throws IOException {
+ private double parseDouble( final String double_str, final int line_number, final String label )
+ throws IOException {
double d = -1;
try {
d = Double.valueOf( double_str ).doubleValue();
_domains_ignored_due_to_duf = domains_ignored_due_to_duf;
}
- private void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) {
- _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value;
/**
 * Overwrites the fs E-value rejection counter; init() uses this to reset it to 0.
 *
 * @param domains_ignored_due_to_fs_e_value the new counter value
 */
+ private void setDomainsIgnoredDueToFsEval( final int domains_ignored_due_to_fs_e_value ) {
+ _domains_ignored_due_to_fs_e_value = domains_ignored_due_to_fs_e_value;
}
/**
 * Overwrites the i-Evalue rejection counter; init() uses this to reset it to 0.
 *
 * @param domains_ignored_due_to_i_e_value the new counter value
 */
+ private void setDomainsIgnoredDueToIEval( final int domains_ignored_due_to_i_e_value ) {
+ _domains_ignored_due_to_i_e_value = domains_ignored_due_to_i_e_value;
+ }
+
/**
 * Overwrites the relative envelope-length rejection counter; init() uses this to reset it to 0.
 *
 * @param domains_ignored_due_to_rel_env_length_ratio_cutoff the new counter value
 */
+ private void setDomainsIgnoredDueToRelEnvLengthRatioCutoff( final int domains_ignored_due_to_rel_env_length_ratio_cutoff ) {
+ _domains_ignored_due_to_rel_env_length_ratio_cutoff = domains_ignored_due_to_rel_env_length_ratio_cutoff;
+ }
+
+
+
/**
 * Overwrites the individual-score-cutoff rejection counter; init() uses this to reset it to 0.
 *
 * @param domains_ignored_due_to_individual_score_cutoff the new counter value
 */
private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
_domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
}
_domains_stored_set = _storeddomains_stored;
}
- public void setEValueMaximum( final double e_value_maximum ) {
- if ( e_value_maximum < 0.0 ) {
+ /**
+  * Sets the largest fs_e_value a domain may have and still be kept; domains
+  * above it are counted as ignored due to fs E-value during parsing.
+  *
+  * @param fs_e_value_maximum the cutoff, a number greater than or equal to 0.0
+  * @throws IllegalArgumentException if the value is negative or NaN
+  */
+ public void setFsEValueMaximum( final double fs_e_value_maximum ) {
+ // NaN slips past a plain "< 0.0" test and would then silently switch the filter
+ // off, because no E-value ever compares greater than NaN.
+ if ( Double.isNaN( fs_e_value_maximum ) ) {
+ throw new IllegalArgumentException( "attempt to set the maximum E-value to NaN" );
+ }
+ if ( fs_e_value_maximum < 0.0 ) {
+ throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
+ }
+ _fs_e_value_maximum = fs_e_value_maximum;
+ }
+
+ /**
+  * Sets the largest i_e_value a domain may have and still be kept; domains
+  * above it are counted as ignored due to i-Evalue during parsing.
+  *
+  * @param i_e_value_maximum the cutoff, a number greater than or equal to 0.0
+  * @throws IllegalArgumentException if the value is negative or NaN
+  */
+ public void setIEValueMaximum( final double i_e_value_maximum ) {
+ // NaN slips past a plain "< 0.0" test and would then silently switch the filter
+ // off, because no E-value ever compares greater than NaN.
+ if ( Double.isNaN( i_e_value_maximum ) ) {
+ throw new IllegalArgumentException( "attempt to set the maximum E-value to NaN" );
+ }
+ if ( i_e_value_maximum < 0.0 ) {
throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" );
}
- _e_value_maximum = e_value_maximum;
+ _i_e_value_maximum = i_e_value_maximum;
+ }
+
+ /**
+  * Sets the relative envelope-length ratio cutoff. During parsing a domain is
+  * ignored when its envelope length (1 + env_to - env_from) is smaller than
+  * this ratio multiplied by tlen.
+  *
+  * @param rel_env_length_ratio_cutoff the ratio, a number greater than 0.0
+  * @throws IllegalArgumentException if the value is zero, negative or NaN
+  */
+ public void setRelEnvLengthRatioCutoff( final double rel_env_length_ratio_cutoff ) {
+ // NaN slips past a plain "<= 0.0" test and would then silently switch the filter
+ // off, because the parser only applies cutoffs that compare greater than 0.0.
+ if ( Double.isNaN( rel_env_length_ratio_cutoff ) ) {
+ throw new IllegalArgumentException( "attempt to set rel env length ratio cutoff to NaN" );
+ }
+ if ( rel_env_length_ratio_cutoff <= 0.0 ) {
+ throw new IllegalArgumentException( "attempt to set rel env length ratio cutoff to zero or a negative value" );
+ }
+ _rel_env_length_ratio_cutoff = rel_env_length_ratio_cutoff;
}
public void setIgnoreDufs( final boolean ignore_dufs ) {
/**
* To ignore domains which are completely engulfed by domains (individual
* ones or stretches of overlapping ones) with better support values.
- *
- *
+ *
+ *
* @param ignore_engulfed_domains true to ignore such engulfed domains
*/
public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) {
/**
* Sets the individual score cutoff values (for example, gathering
* thresholds from Pfam). Domain ids are the keys, cutoffs the values.
- *
+ *
* @param individual_score_cutoffs
*/
public void setIndividualScoreCutoffs( final Map<String, Double> individual_score_cutoffs ) {
}
/**
 * Type of the filter held by the parser. Parsing handles POSITIVE_PROTEIN and
 * NEGATIVE_PROTEIN together as protein-level filters, collecting the domain
 * ids of the current protein for them.
 * NOTE(review): the exact keep/drop semantics of each value are not visible
 * in this excerpt - confirm against the parse logic.
 */
public static enum FilterType {
- NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN
+ NONE,
+ POSITIVE_PROTEIN,
+ NEGATIVE_PROTEIN,
+ NEGATIVE_DOMAIN
}
/**
 * NOTE(review): presumably selects which score (full-sequence or per-domain)
 * is compared against the individual score cutoffs, with NONE disabling the
 * comparison - its use is not visible in this excerpt, confirm before relying on it.
 */
static public enum INDIVIDUAL_SCORE_CUTOFF {
- FULL_SEQUENCE, DOMAIN, NONE;
+ FULL_SEQUENCE,
+ DOMAIN,
+ NONE;
}
/**
 * Shape of the parse result. Only one value exists; when it is selected the
 * parser starts a new BasicProtein for each new query name.
 */
public static enum ReturnType {
- UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
+ UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
}
}