package org.forester.surfacing;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
+import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import org.forester.protein.BinaryDomainCombination;
import org.forester.species.Species;
-import org.forester.util.DescriptiveStatistics;
+import org.forester.util.ForesterUtil;
public class BasicCombinableDomains implements CombinableDomains {
final private String _key_domain;
private int _key_domain_count;
- private int _key_domain_proteins_count;
final private Species _species;
final private TreeMap<String, Integer> _combining_domains;
- private DescriptiveStatistics _key_domain_confidence_statistics;
+ final private Set<String> _proteins_with_key_domain;
public BasicCombinableDomains( final String key_domain, final Species species ) {
_key_domain = key_domain;
_species = species;
_combining_domains = new TreeMap<String, Integer>();
- init();
+ _proteins_with_key_domain = new HashSet<String>();
+ _key_domain_count = 0;
}
@Override
}
@Override
+ public void addKeyDomainProtein( final String protein ) {
+     // Identifiers rejected by ForesterUtil.isEmpty are refused with an
+     // IllegalArgumentException. Everything else goes into the Set returned
+     // by getKeyDomainProteins(), so repeating an identifier has no effect.
+     if ( !ForesterUtil.isEmpty( protein ) ) {
+         getKeyDomainProteins().add( protein );
+         return;
+     }
+     throw new IllegalArgumentException( "attempt to add null or empty protein" );
+ }
+
+ @Override
public List<String> getAllDomains() {
final List<String> domains = getCombinableDomains();
if ( !domains.contains( getKeyDomain() ) ) {
}
@Override
- public DescriptiveStatistics getKeyDomainConfidenceDescriptiveStatistics() {
- return _key_domain_confidence_statistics;
- }
-
- @Override
public int getKeyDomainCount() {
return _key_domain_count;
}
@Override
public int getKeyDomainProteinsCount() {
- return _key_domain_proteins_count;
+ return getKeyDomainProteins().size();
}
@Override
return _species;
}
- private void init() {
- _key_domain_count = 0;
- _key_domain_proteins_count = 0;
- _key_domain_confidence_statistics = null;
- }
-
@Override
public boolean isCombinable( final String protein_domain ) {
return getCombiningDomains().containsKey( protein_domain );
}
@Override
- public void setKeyDomainConfidenceDescriptiveStatistics( final DescriptiveStatistics key_domain_confidence_statistics ) {
- _key_domain_confidence_statistics = key_domain_confidence_statistics;
- }
-
- @Override
public void setKeyDomainCount( final int key_domain_count ) {
_key_domain_count = key_domain_count;
}
@Override
- public void setKeyDomainProteinsCount( final int key_domain_proteins_count ) {
- _key_domain_proteins_count = key_domain_proteins_count;
- }
-
- @Override
public List<BinaryDomainCombination> toBinaryDomainCombinations() {
final List<BinaryDomainCombination> binary_combinations = new ArrayList<BinaryDomainCombination>( getNumberOfCombinableDomains() );
for( final String domain : getCombiningDomains().keySet() ) {
sb.append( getCombiningDomainIdsAsStringBuilder() );
return sb.toString();
}
+
+ /**
+  * Returns the identifiers of the proteins that were registered through
+  * addKeyDomainProtein for this key domain.
+  * The returned set is the internal, mutable set itself (not a copy);
+  * addKeyDomainProtein writes to it through this accessor.
+  *
+  * @return the set of protein identifiers, never null
+  */
+ @Override
+ public Set<String> getKeyDomainProteins() {
+ return _proteins_with_key_domain;
+ }
}
// ~~~OLD:
//throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinable domains" );
// ~~~new:
- final SortedMap<Species, SpeciesSpecificDomainSimilariyData> species_data = new TreeMap<Species, SpeciesSpecificDomainSimilariyData>();
+ final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
species_data.put( domains_list.get( 0 ).getSpecies(),
createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
return new PrintableDomainSimilarity( domains_list.get( 0 ),
isTreatAsBinaryComparison() );
}
final DescriptiveStatistics stat = new BasicDescriptiveStatistics();
- final SortedMap<Species, SpeciesSpecificDomainSimilariyData> species_data = new TreeMap<Species, SpeciesSpecificDomainSimilariyData>();
+ final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
species_data.put( domains_list.get( 0 ).getSpecies(),
createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
int max_difference_in_counts = 0;
return _treat_as_binary_comparison;
}
- private static SpeciesSpecificDomainSimilariyData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) {
- final SpeciesSpecificDomainSimilariyData sd = new PrintableSpeciesSpecificDomainSimilariyData( cd.getKeyDomainProteinsCount(),
- cd.getKeyDomainCount(),
- cd.getNumberOfCombinableDomains() );
+ private static SpeciesSpecificDcData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) {
+ final SpeciesSpecificDcData sd = new PrintableSpeciesSpecificDcData( cd.getKeyDomainCount(),
+ cd.getNumberOfCombinableDomains() );
+ for( final String prot : cd.getKeyDomainProteins() ) {
+ sd.addKeyDomainProtein( prot );
+ }
for( final String domain : cd.getCombinableDomains() ) {
sd.addProteinsExhibitingCombinationCount( domain, cd.getNumberOfProteinsExhibitingCombination( domain ) );
}
package org.forester.surfacing;
-import java.text.DecimalFormat;
-import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDomains {
- private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" );
private static final Comparator<CombinableDomains> DESCENDING_KEY_DOMAIN_COUNT_ORDER = new Comparator<CombinableDomains>() {
@Override
}
// Produces something like:
- // 2-oxoacid_dh 5 5 2 4.8E-67 Biotin_lipoyl [4], E3_binding [3]
+ // 2-oxoacid_dh 5 5 2 Biotin_lipoyl [4], E3_binding [3]
@Override
public StringBuilder toStringBuilder( final GenomeWideCombinableDomainsSortOrder sort_order ) {
final StringBuilder sb = new StringBuilder();
sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainCount() ), 8, ' ', false ) );
sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainProteinsCount() ), 8, ' ', false ) );
sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getNumberOfCombinableDomains() ), 8, ' ', false ) );
- sb.append( ForesterUtil.pad( new StringBuffer( ""
- + FORMATTER.format( cb.getKeyDomainConfidenceDescriptiveStatistics()
- .median() ) ),
- 10,
- ' ',
- false ) );
sb.append( cb.getCombiningDomainIdsAsStringBuilder() );
sb.append( ForesterUtil.getLineSeparator() );
}
}
private static void countDomains( final Map<String, Integer> domain_counts,
- final Map<String, Integer> domain_protein_counts,
- final Map<String, DescriptiveStatistics> stats,
final Set<String> saw_c,
- final String id_i,
- final double support ) {
+ final String id_i ) {
if ( domain_counts.containsKey( id_i ) ) {
domain_counts.put( id_i, 1 + domain_counts.get( ( id_i ) ) );
- if ( !saw_c.contains( id_i ) ) {
- domain_protein_counts.put( id_i, 1 + domain_protein_counts.get( ( id_i ) ) );
- }
}
else {
- stats.put( id_i, new BasicDescriptiveStatistics() );
domain_counts.put( id_i, 1 );
- domain_protein_counts.put( id_i, 1 );
}
- stats.get( id_i ).addValue( support );
saw_c.add( id_i );
}
final Map<String, DescriptiveStatistics> domain_number_stats_by_dc ) {
final BasicGenomeWideCombinableDomains instance = new BasicGenomeWideCombinableDomains( species, dc_type );
final Map<String, Integer> domain_counts = new HashMap<String, Integer>();
- final Map<String, Integer> domain_protein_counts = new HashMap<String, Integer>();
- final Map<String, DescriptiveStatistics> stats = new HashMap<String, DescriptiveStatistics>();
for( final Protein protein : protein_list ) {
if ( !protein.getSpecies().equals( species ) ) {
throw new IllegalArgumentException( "species (" + protein.getSpecies()
final Domain pd_i = protein.getProteinDomain( i );
final String id_i = pd_i.getDomainId();
final int current_start = pd_i.getFrom();
- BasicGenomeWideCombinableDomains.countDomains( domain_counts,
- domain_protein_counts,
- stats,
- saw_c,
- id_i,
- pd_i.getPerSequenceEvalue() );
+ BasicGenomeWideCombinableDomains.countDomains( domain_counts, saw_c, id_i );
if ( !saw_i.contains( id_i ) ) {
if ( dc_type == DomainCombinationType.BASIC ) {
saw_i.add( id_i );
}
instance.add( id_i, domain_combination );
}
+ domain_combination.addKeyDomainProtein( protein.getProteinId().getId() );//^^^^^^^^^^^^^^
final Set<String> saw_j = new HashSet<String>();
if ( ignore_combination_with_same_domain ) {
saw_j.add( id_i );
domain_number_stats_by_dc.get( dc_str ).addValue( protein.getNumberOfProteinDomains() );
}
}
- //
}
}
}
for( final String key_id : domain_counts.keySet() ) {
instance.get( key_id ).setKeyDomainCount( domain_counts.get( key_id ) );
- instance.get( key_id ).setKeyDomainProteinsCount( domain_protein_counts.get( key_id ) );
- instance.get( key_id ).setKeyDomainConfidenceDescriptiveStatistics( stats.get( key_id ) );
}
return instance;
}
package org.forester.surfacing;
import java.util.List;
+import java.util.Set;
import java.util.SortedMap;
import org.forester.protein.BinaryDomainCombination;
import org.forester.species.Species;
-import org.forester.util.DescriptiveStatistics;
public interface CombinableDomains {
public String getKeyDomain();
/**
- * Gets descriptive statistics for the confidence (i.e. E-values) of the key
- * domain.
- *
- *
- * @return descriptive statistics for the confidence of the key domain
- */
- public DescriptiveStatistics getKeyDomainConfidenceDescriptiveStatistics();
-
- /**
* Returns how many times the key domain is present in a given species
* genome.
*
*/
public int getKeyDomainProteinsCount();
+ public Set<String> getKeyDomainProteins();
+
public int getNumberOfCombinableDomains();
public int getNumberOfProteinsExhibitingCombination( final String protein_domain );
public boolean isCombinable( final String protein_domain );
/**
- * This is to set descriptive statistics for the confidence (i.e. E-values)
- * of the key domain.
- *
- *
- * @param statistics
- */
- void setKeyDomainConfidenceDescriptiveStatistics( final DescriptiveStatistics statistics );
-
- /**
* Sets how many times the key domain is present in a given species genome.
*
* @param key_domain_count
*/
void setKeyDomainCount( final int key_domain_count );
- /**
- * Sets how many proteins with the key domain are present in a given species
- * genome.
- *
- * @param key_domain_proteins_count
- * key domain protein count in species
- */
- void setKeyDomainProteinsCount( final int key_domain_proteins_count );
-
public List<BinaryDomainCombination> toBinaryDomainCombinations();
+
+ void addKeyDomainProtein( String protein );
}
\ No newline at end of file
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
-import java.text.DecimalFormat;
-import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
*/
public final class DomainCountsDifferenceUtil {
- private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" );
private static final COPY_CALCULATION_MODE COPY_CALC_MODE_FOR_HIGH_COPY_TARGET_SPECIES = COPY_CALCULATION_MODE.MIN;
private static final COPY_CALCULATION_MODE COPY_CALC_MODE_FOR_HIGH_COPY_BASE_SPECIES = COPY_CALCULATION_MODE.MIN;
private static final COPY_CALCULATION_MODE COPY_CALC_MODE_FOR_LOW_COPY_SPECIES = COPY_CALCULATION_MODE.MAX;
private static String combinableDomaindToString( final CombinableDomains cd ) {
final StringBuilder sb = new StringBuilder();
sb.append( cd.getKeyDomainProteinsCount() );
- sb.append( "\t[" );
- sb.append( FORMATTER.format( cd.getKeyDomainConfidenceDescriptiveStatistics().median() ) );
- sb.append( "]" );
return sb.toString();
}
sb.append( cd.getKeyDomainProteinsCount() );
sb.append( "</b>, " );
sb.append( cd.getNumberOfCombinableDomains() );
- sb.append( "]</td><td>[" );
- sb.append( FORMATTER.format( cd.getKeyDomainConfidenceDescriptiveStatistics().median() ) );
sb.append( "]</td><td>" );
sb.append( cd.getCombiningDomainIdsAsStringBuilder() );
return sb.toString();
*
- * @return SortedMap<String, SpeciesSpecificDomainSimilariyData>
+ * @return SortedMap<Species, SpeciesSpecificDcData>
*/
- public SortedMap<Species, SpeciesSpecificDomainSimilariyData> getSpeciesData();
+ public SortedMap<Species, SpeciesSpecificDcData> getSpeciesData();
public double getStandardDeviationOfSimilarityScore();
public class PrintableDomainSimilarity implements DomainSimilarity {
- final public static String SPECIES_SEPARATOR = " ";
- final private static int EQUAL = 0;
- final private static String NO_SPECIES = " ";
- private static final boolean PRINT_MORE_INFO = false;
- final private double _min;
- final private double _max;
- final private double _mean;
- final private double _sd;
- final private int _n;
- private final int _max_difference_in_counts;
- private final int _max_difference;
- final private CombinableDomains _combinable_domains;
- final private SortedMap<Species, SpeciesSpecificDomainSimilariyData> _species_data;
- private List<Species> _species_order;
- private DomainSimilarityCalculator.Detailedness _detailedness;
- private final boolean _treat_as_binary_comparison;
+ final public static String SPECIES_SEPARATOR = " ";
+ final private static int EQUAL = 0;
+ final private static String NO_SPECIES = " ";
+ private static final boolean PRINT_MORE_INFO = false;
+ final private double _min;
+ final private double _max;
+ final private double _mean;
+ final private double _sd;
+ final private int _n;
+ private final int _max_difference_in_counts;
+ private final int _max_difference;
+ final private CombinableDomains _combinable_domains;
+ final private SortedMap<Species, SpeciesSpecificDcData> _species_data;
+ private List<Species> _species_order;
+ private DomainSimilarityCalculator.Detailedness _detailedness;
+ private final boolean _treat_as_binary_comparison;
public PrintableDomainSimilarity( final CombinableDomains combinable_domains,
final double min,
final int n,
final int max_difference_in_counts,
final int max_difference,
- final SortedMap<Species, SpeciesSpecificDomainSimilariyData> species_data,
+ final SortedMap<Species, SpeciesSpecificDcData> species_data,
final boolean sort_by_species_count_first,
final boolean treat_as_binary_comparison ) {
if ( combinable_domains == null ) {
}
@Override
- public SortedMap<Species, SpeciesSpecificDomainSimilariyData> getSpeciesData() {
+ public SortedMap<Species, SpeciesSpecificDcData> getSpeciesData() {
return _species_data;
}
import java.util.Set;
import java.util.SortedMap;
+import java.util.SortedSet;
import java.util.TreeMap;
+import java.util.TreeSet;
-class PrintableSpeciesSpecificDomainSimilariyData implements SpeciesSpecificDomainSimilariyData {
+import org.forester.util.ForesterUtil;
+
+class PrintableSpeciesSpecificDcData implements SpeciesSpecificDcData {
final SortedMap<String, Integer> _combinable_domain_id_to_count_map;
- final private int _key_domain_proteins_count;
+ final SortedSet<String> _key_domain_proteins;
final private int _key_domain_domains_count;
final private int _combinable_domains_count;
- public PrintableSpeciesSpecificDomainSimilariyData( final int key_domain_proteins_count,
- final int key_domain_domains_count,
- final int combinable_domains ) {
- _key_domain_proteins_count = key_domain_proteins_count;
+ public PrintableSpeciesSpecificDcData( final int key_domain_domains_count, final int combinable_domains ) {
+ _key_domain_proteins = new TreeSet<String>();
_key_domain_domains_count = key_domain_domains_count;
_combinable_domains_count = combinable_domains;
_combinable_domain_id_to_count_map = new TreeMap<String, Integer>();
}
private int getKeyDomainProteinsCount() {
- return _key_domain_proteins_count;
+ return _key_domain_proteins.size();
}
@Override
}
@Override
+ public void addKeyDomainProtein( final String protein ) {
+     // An identifier rejected by ForesterUtil.isEmpty is refused before the
+     // set is touched.
+     if ( ForesterUtil.isEmpty( protein ) ) {
+         throw new IllegalArgumentException( "attempt to add null or empty protein" );
+     }
+     // Set.add returns false when the element is already present and leaves
+     // the set unchanged, which is exactly the duplicate case to report.
+     final boolean newly_added = getKeyDomainProteins().add( protein );
+     if ( !newly_added ) {
+         throw new IllegalArgumentException( "protein \"" + protein + "\" is not unique" );
+     }
+ }
+
+ /**
+  * Returns the identifiers of the proteins registered through
+  * addKeyDomainProtein, in the natural String order of the backing TreeSet.
+  * The returned set is the internal, mutable set itself (not a copy).
+  *
+  * @return the sorted set of protein identifiers, never null
+  */
+ @Override
+ public SortedSet<String> getKeyDomainProteins() {
+ return _key_domain_proteins;
+ }
+
+ @Override
public String toString() {
return toStringBuffer( DomainSimilarityCalculator.Detailedness.LIST_COMBINING_DOMAIN_FOR_EACH_SPECIES, false )
.toString();
sb.append( getCombinableDomainIdToCountsMap().get( domain_id ) );
}
}
+ // Append the key-domain protein identifiers as a comma-separated list
+ // enclosed in " [" ... "]".
+ sb.append( " [" );
+ boolean first = true;
+ for( final String p : getKeyDomainProteins() ) {
+ // NOTE(review): "link" is assigned in both branches below but never read;
+ // the loop appends the raw identifier p at the end instead. Presumably the
+ // anchor was meant to be emitted for HTML output -- confirm the intent,
+ // then either append link (for HTML only) or remove this dead computation.
+ String link = null;
+ final String up_id = ForesterUtil.extractUniProtKbProteinSeqIdentifier( p );
+ if ( !ForesterUtil.isEmpty( up_id ) ) {
+ link = "<a href=\"" + ForesterUtil.UNIPROT_KB + up_id + "\" target=\"_up_window\">" + up_id + "</a>";
+ }
+ else {
+ // NOTE(review): p is concatenated into the query string and the anchor
+ // text without URL-encoding or HTML-escaping -- verify that identifiers
+ // cannot contain reserved characters before this link is ever emitted.
+ link = "<a href=\"" + "http://www.google.com/search?q=" + p + "\" target=\"_g_window\">" + p + "</a>";
+ }
+ // Emit ", " before every identifier except the first one.
+ if ( first ) {
+ first = false;
+ }
+ else {
+ sb.append( ", " );
+ }
+ sb.append( p );
+ }
+ sb.append( "]" );
return sb;
}
}
package org.forester.surfacing;
import java.util.SortedMap;
+import java.util.SortedSet;
/*
* A helper class for PrintableDomainSimilarity.
*/
-interface SpeciesSpecificDomainSimilariyData {
+interface SpeciesSpecificDcData {
public void addProteinsExhibitingCombinationCount( final String domain_id, final int count );
public int getNumberOfProteinsExhibitingCombinationWith( final String domain_id );
public StringBuffer toStringBuffer( final DomainSimilarityCalculator.Detailedness detailedness, boolean html );
+
+ public SortedSet<String> getKeyDomainProteins();
+
+ void addKeyDomainProtein( String protein );
}
if ( !sa3.getDomainId().equals( "A" ) ) {
return false;
}
- final SpeciesSpecificDomainSimilariyData ssdsd = sa3.getSpeciesData().get( new BasicSpecies( "ciona" ) );
+ final SpeciesSpecificDcData ssdsd = sa3.getSpeciesData().get( new BasicSpecies( "ciona" ) );
if ( ssdsd.getCombinableDomainIdToCountsMap().size() != 4 ) {
return false;
}
if ( !sa4.getDomainId().equals( "A" ) ) {
return false;
}
- final SpeciesSpecificDomainSimilariyData ssdsd4 = sa4.getSpeciesData().get( new BasicSpecies( "ciona" ) );
+ final SpeciesSpecificDcData ssdsd4 = sa4.getSpeciesData().get( new BasicSpecies( "ciona" ) );
if ( ssdsd4.getCombinableDomainIdToCountsMap().size() != 5 ) {
return false;
}
if ( !sa5_d.getSpecies().last().equals( new BasicSpecies( "rabbit" ) ) ) {
return false;
}
- final SpeciesSpecificDomainSimilariyData ssdsd5 = sa5_d.getSpeciesData().get( new BasicSpecies( "ciona" ) );
+ final SpeciesSpecificDcData ssdsd5 = sa5_d.getSpeciesData().get( new BasicSpecies( "ciona" ) );
if ( ssdsd5.getCombinableDomainIdToCountsMap().size() != 4 ) {
return false;
}
if ( !sa6_d.getSpecies().last().equals( new BasicSpecies( "rabbit" ) ) ) {
return false;
}
- final SpeciesSpecificDomainSimilariyData ssdsd6 = sa6_d.getSpeciesData().get( new BasicSpecies( "ciona" ) );
+ final SpeciesSpecificDcData ssdsd6 = sa6_d.getSpeciesData().get( new BasicSpecies( "ciona" ) );
if ( ssdsd6.getCombinableDomainIdToCountsMap().size() != 5 ) {
return false;
}
return v;
}
+ public static String extractUniProtKbProteinSeqIdentifier( final String str ) {
+ String upkb = null;
+ Matcher m = UNIPROT_KB_PATTERN_1.matcher( str );
+ if ( m.find() ) {
+ upkb = m.group( 1 );
+ }
+ else {
+ m = UNIPROT_KB_PATTERN_2.matcher( str );
+ if ( m.find() ) {
+ upkb = m.group();
+ }
+ }
+ return upkb;
+ }
+
public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) {
String upkb = null;
if ( node.getNodeData().isHasSequence() ) {