import java.util.TreeMap;
import java.util.TreeSet;
-import org.forester.surfacing.BasicDomain;
-import org.forester.surfacing.BasicProtein;
-import org.forester.surfacing.Domain;
-import org.forester.surfacing.DomainId;
-import org.forester.surfacing.Protein;
+import org.forester.protein.BasicDomain;
+import org.forester.protein.BasicProtein;
+import org.forester.protein.Domain;
+import org.forester.protein.DomainId;
+import org.forester.protein.Protein;
import org.forester.surfacing.SurfacingUtil;
import org.forester.util.ForesterUtil;
private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN; // default return type; the only one the visible parse code accepts (anything else throws "unknown return type")
private static final boolean IGNORE_DUFS_DEFAULT = false; // presumably the initial value behind isIgnoreDufs() -- initialisation not visible here, confirm
private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1; // NOTE(review): -1 looks like a "no limit" sentinel -- confirm where it is consumed
+ private static final boolean IGNORE_REPLACED_RRMS = false; // compile-time switch for the parse branch that skips ids containing RRM_1, RRM_3, RRM_5 or RRM_6; false leaves that branch disabled
private final Set<DomainId> _filter; // domain-id filter; null in the constructors that take no filter
private final FilterType _filter_type; // FilterType.NONE in the constructors that take no filter
private final File _input_file; // file handed to the constructor
private int _domains_ignored_due_to_virus_like_id; // presumably a running count of domains skipped by the virus-like-id check -- update site not visible here
private Map<String, Integer> _domains_ignored_due_to_virus_like_id_counts_map; // presumably per-id counts for the same check -- TODO confirm key meaning
private final INDIVIDUAL_SCORE_CUTOFF _ind_cutoff; // copied from the constructor argument individual_cutoff_applies_to
+ private final boolean _allow_proteins_with_same_name; // when true, parsing skips the "more than one protein named [...]" checks
public HmmscanPerDomainTableParser( final File input_file,
final String species,
_filter = null;
_filter_type = FilterType.NONE;
_ind_cutoff = individual_cutoff_applies_to;
+ _allow_proteins_with_same_name = false;
+ init();
+ }
+
+ public HmmscanPerDomainTableParser( final File input_file, // constructor without a domain filter, with a caller-chosen duplicate-name policy
+ final String species,
+ final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
+ final boolean allow_proteins_with_same_name ) { // true: parsing will not throw on repeated protein names
+ _input_file = input_file;
+ _species = species;
+ _filter = null; // no filter set; paired with FilterType.NONE below
+ _filter_type = FilterType.NONE;
+ _ind_cutoff = individual_cutoff_applies_to;
+ _allow_proteins_with_same_name = allow_proteins_with_same_name;
init(); // presumably resets the remaining parser state -- body not visible here
}
_filter = filter;
_filter_type = filter_type;
_ind_cutoff = individual_cutoff_applies_to;
+ _allow_proteins_with_same_name = false;
+ init();
+ }
+
+ public HmmscanPerDomainTableParser( final File input_file, // constructor taking a domain filter and a caller-chosen duplicate-name policy
+ final String species,
+ final Set<DomainId> filter, // stored as-is, no defensive copy is made
+ final FilterType filter_type,
+ final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to,
+ final boolean allow_proteins_with_same_name ) { // true: parsing will not throw on repeated protein names
+ _input_file = input_file;
+ _species = species;
+ _filter = filter;
+ _filter_type = filter_type;
+ _ind_cutoff = individual_cutoff_applies_to;
+ _allow_proteins_with_same_name = allow_proteins_with_same_name;
init(); // presumably resets the remaining parser state -- body not visible here
}
+ public boolean isAllowProteinsWithSameName() { // reports the duplicate-name policy fixed at construction time
+ return _allow_proteins_with_same_name; // true means the parse loop skips its "more than one protein named [...]" checks
+ }
+
private void actuallyAddProtein( final List<Protein> proteins, final Protein current_protein ) {
final List<Domain> l = current_protein.getProteinDomains();
for( final Domain d : l ) {
final int env_to = parseInt( tokens[ 20 ], line_number, "env to" );
++_domains_encountered;
if ( !query.equals( prev_query ) || ( qlen != prev_qlen ) ) {
- if ( query.equals( prev_query ) ) {
- throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen + ", "
- + prev_qlen );
- }
- if ( prev_queries.contains( query ) ) {
- throw new IOException( "more than one protein named [" + query + "]" );
+ if ( !isAllowProteinsWithSameName() ) {
+ if ( query.equals( prev_query ) ) {
+ throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen
+ + ", " + prev_qlen );
+ }
+ if ( prev_queries.contains( query ) ) {
+ throw new IOException( "more than one protein named [" + query + "]" );
+ }
}
prev_query = query;
prev_qlen = qlen;
addProtein( proteins, current_protein );
}
if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) {
- current_protein = new BasicProtein( query, getSpecies() );
+ current_protein = new BasicProtein( query, getSpecies(), qlen );
}
else {
throw new IllegalArgumentException( "unknown return type" );
else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
++_domains_ignored_due_to_duf;
}
+ else if ( IGNORE_REPLACED_RRMS
+ && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id
+ .contains( "RRM_6" ) ) ) {
+ }
else if ( isIgnoreVirusLikeIds()
&& ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
|| uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )
_domains_ignored_due_to_duf = domains_ignored_due_to_duf;
}
- public void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) {
+ private void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) { // visibility narrowed from public to private by this change
_domains_ignored_due_to_e_value = domains_ignored_due_to_e_value; // overwrites the stored count with the given value
}
- public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
+ private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { // visibility narrowed from public to private by this change
_domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff; // overwrites the stored count with the given value
}