X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsurfacing%2FDomainSimilarity.java;h=72d03f1fa15a759e6e19981583254f1bd6d235ec;hb=2af0eeb66b17ef94432816233efce4f976f82517;hp=07874397dc5ea4450e5663fdc1393855a99e108a;hpb=87d34f343d0262cd0c009c6f1da058a5a217bc64;p=jalview.git diff --git a/forester/java/src/org/forester/surfacing/DomainSimilarity.java b/forester/java/src/org/forester/surfacing/DomainSimilarity.java index 0787439..72d03f1 100644 --- a/forester/java/src/org/forester/surfacing/DomainSimilarity.java +++ b/forester/java/src/org/forester/surfacing/DomainSimilarity.java @@ -26,30 +26,167 @@ package org.forester.surfacing; +import java.awt.Color; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; +import org.forester.phylogeny.Phylogeny; import org.forester.species.Species; -import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION; +import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; +import org.forester.util.ForesterUtil; -/* - * This is to represent a measure of similarity between two or more domains from - * different genomes. - */ -public interface DomainSimilarity extends Comparable { +public class DomainSimilarity implements Comparable { - static public enum DomainSimilarityScoring { - DOMAINS, PROTEINS, COMBINATIONS; + final public static String SPECIES_SEPARATOR = " "; + final private static int EQUAL = 0; + final private static String NO_SPECIES = " "; + private static final boolean OUTPUT_TAXCODES_PER_DOMAIN = false; + final private CombinableDomains _combinable_domains; + private DomainSimilarityCalculator.Detailedness _detailedness; + final private double _max; + private final int _max_difference; + private final int _max_difference_in_counts; + final private double _mean; + final private double _min; + final private int _n; + final private double _sd; + final private SortedMap _species_data; + private List _species_order; + private final boolean _treat_as_binary_comparison; + + public DomainSimilarity( final CombinableDomains combinable_domains, + final double min, + final double max, + final double mean, + final double median, + final double sd, + final int n, + final int max_difference_in_counts, + final int max_difference, + final SortedMap species_data, + final boolean sort_by_species_count_first, + final boolean treat_as_binary_comparison ) { + if ( combinable_domains == null ) { + throw new IllegalArgumentException( "attempt to use null combinable domains" ); + } + if ( species_data == null ) { + throw new IllegalArgumentException( "attempt to use null species data" ); + } + if ( species_data.size() < 1 ) { + throw new IllegalArgumentException( "attempt to use empty species data" ); + } + if ( n < 0 ) { + throw new IllegalArgumentException( "attempt to use N less than 0" ); + } + if ( ( species_data.size() > 1 ) && ( n < 1 ) ) { + throw new IllegalArgumentException( "attempt to use N less than 1" ); + } + if ( sd < 0.0 ) { + throw new IllegalArgumentException( "attempt to use negative SD" ); + } + if ( max < min ) { + throw new IllegalArgumentException( "attempt to use max smaller than min" ); + } + init(); + _combinable_domains = combinable_domains; + _min = min; + _max = max; + _mean = mean; + _sd = sd; + _n = n; + _max_difference_in_counts = max_difference_in_counts; + _max_difference = max_difference; + _species_data = species_data; + _treat_as_binary_comparison = treat_as_binary_comparison; + final int s = species_data.size(); + if ( ( ( s * s ) - s ) != ( getN() * 2 ) ) { + throw new IllegalArgumentException( "illegal species count and n: species count:" + s + ", n:" + _n + + " for domain " + combinable_domains.getKeyDomain() ); + } + if ( s > 2 ) { + if ( getMaximalDifferenceInCounts() < 0 ) { + throw new IllegalArgumentException( "attempt to use negative max difference in counts with more than two species" ); + } + if ( getMaximalDifference() < 0 ) { + throw new IllegalArgumentException( "attempt to use negative max difference with more than two species" ); + } + } } - public static enum DomainSimilaritySortField { - MIN, MAX, SD, MEAN, ABS_MAX_COUNTS_DIFFERENCE, MAX_COUNTS_DIFFERENCE, MAX_DIFFERENCE, SPECIES_COUNT, DOMAIN_ID, + public DomainSimilarity( final CombinableDomains combinable_domains, + final int max_difference_in_counts, + final int max_difference, + final SortedMap species_data, + final boolean sort_by_species_count_first, + final boolean treat_as_binary_comparison ) { + if ( combinable_domains == null ) { + throw new IllegalArgumentException( "attempt to use null combinable domains" ); + } + if ( species_data == null ) { + throw new IllegalArgumentException( "attempt to use null species data" ); + } + if ( species_data.size() < 1 ) { + throw new IllegalArgumentException( "attempt to use empty species data" ); + } + init(); + _combinable_domains = combinable_domains; + _min = -1; + _max = -1; + _mean = -1; + _sd = -1; + _n = -1; + _max_difference_in_counts = max_difference_in_counts; + _max_difference = max_difference; + _species_data = species_data; + _treat_as_binary_comparison = treat_as_binary_comparison; + final int s = species_data.size(); + if ( s > 2 ) { + if ( getMaximalDifferenceInCounts() < 0 ) { + throw new IllegalArgumentException( "attempt to use negative max difference in counts with more than two species" ); + } + if ( getMaximalDifference() < 0 ) { + throw new IllegalArgumentException( "attempt to use negative max difference with more than two species" ); + } + } } - public SortedSet getCombinableDomainIds( final Species species_of_combinable_domain );; + public int compareTo( final DomainSimilarity domain_similarity ) { + if ( this == domain_similarity ) { + return EQUAL; + } + else if ( domain_similarity == null ) { + throw new IllegalArgumentException( "attempt to compare " + this.getClass() + " to null" ); + } + else if ( domain_similarity.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to compare " + this.getClass() + " to " + + domain_similarity.getClass() ); + } + return compareByDomainId( domain_similarity ); + } - public String getDomainId(); + public SortedSet getCombinableDomainIds( final Species species_of_combinable_domain ) { + final SortedSet sorted_ids = new TreeSet(); + if ( getSpeciesData().containsKey( species_of_combinable_domain ) ) { + for( final String id : getSpeciesData().get( species_of_combinable_domain ) + .getCombinableDomainIdToCountsMap().keySet() ) { + sorted_ids.add( id ); + } + } + return sorted_ids; + } + + public String getDomainId() { + return getCombinableDomains().getKeyDomain(); + } /** * For pairwise similarities, this should return the "difference"; for example the difference in counts @@ -58,11 +195,10 @@ public interface DomainSimilarity extends Comparable { * For pairwise similarities, this should return the difference, * while for comparisons of more than two domains, this should return the maximal difference * - * - * - * @return */ - public int getMaximalDifference(); + public int getMaximalDifference() { + return _max_difference; + } /** * For pairwise similarities, this should return the difference in counts, @@ -72,13 +208,21 @@ public interface DomainSimilarity extends Comparable { * * @return the (maximal) difference in counts */ - public int getMaximalDifferenceInCounts(); + public int getMaximalDifferenceInCounts() { + return _max_difference_in_counts; + } - public double getMaximalSimilarityScore(); + public double getMaximalSimilarityScore() { + return _max; + } - public double getMeanSimilarityScore(); + public double getMeanSimilarityScore() { + return _mean; + } - public double getMinimalSimilarityScore(); + public double getMinimalSimilarityScore() { + return _min; + } /** * This should return the number of pairwise distances used to calculate @@ -86,9 +230,21 @@ public interface DomainSimilarity extends Comparable { * * @return the number of pairwise distances */ - public int getN(); + public int getN() { + return _n; + } - public SortedSet getSpecies(); + public SortedSet getSpecies() { + final SortedSet species = new TreeSet(); + for( final Species s : getSpeciesData().keySet() ) { + species.add( s ); + } + return species; + } + + public List getSpeciesCustomOrder() { + return _species_order; + } /** * This should return a map, which maps species names to @@ -97,9 +253,389 @@ public interface DomainSimilarity extends Comparable { * * @return SortedMap */ - public SortedMap getSpeciesData(); + public SortedMap getSpeciesData() { + return _species_data; + } + + public double getStandardDeviationOfSimilarityScore() { + return _sd; + } + + public void setDetailedness( final Detailedness detailedness ) { + _detailedness = detailedness; + } + + public void setSpeciesOrder( final List species_order ) { + if ( !species_order.containsAll( getSpeciesData().keySet() ) ) { + throw new IllegalArgumentException( "list to order species must contain all species of multiple combinable domains similarity" ); + } + _species_order = species_order; + } - public double getStandardDeviationOfSimilarityScore(); + public StringBuffer toStringBuffer( final DomainSimilarity.PRINT_OPTION print_option, + final Map tax_code_to_id_map, + final Phylogeny phy ) { + switch ( print_option ) { + case SIMPLE_TAB_DELIMITED: + return toStringBufferSimpleTabDelimited(); + case HTML: + return toStringBufferDetailedHTML( tax_code_to_id_map, phy, OUTPUT_TAXCODES_PER_DOMAIN ); + default: + throw new AssertionError( "Unknown print option: " + print_option ); + } + } + + private void addSpeciesSpecificDomainData( final StringBuffer sb, + final Species species, + final boolean html, + final Map tax_code_to_id_map, + final Phylogeny phy ) { + if ( html ) { + sb.append( "" ); + sb.append( "" ); + addTaxWithLink( sb, species.getSpeciesId(), tax_code_to_id_map, phy ); + sb.append( "" ); + } + else { + sb.append( species.getSpeciesId() ); + } + if ( getDetaildness() != DomainSimilarityCalculator.Detailedness.BASIC ) { + if ( html ) { + //sb.append( ":" ); + } + else { + sb.append( "\t" ); + } + sb.append( getSpeciesData().get( species ).toStringBuffer( getDetaildness(), html ) ); + } + if ( html ) { + //sb.append( "
" ); + sb.append( "" ); + } + else { + sb.append( "\n\t" ); + } + } + + private void addTaxWithLink( final StringBuffer sb, + final String tax_code, + final Map tax_code_to_id_map, + final Phylogeny phy ) { + String hex = null; + if ( ( phy != null ) && !phy.isEmpty() ) { + hex = SurfacingUtil.obtainHexColorStringDependingOnTaxonomyGroup( tax_code, phy ); + } + sb.append( "" ); + if ( !ForesterUtil.isEmpty( tax_code ) + && ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( tax_code ) ) ) { + if ( !ForesterUtil.isEmpty( hex ) ) { + sb.append( "" ); + sb.append( tax_code ); + sb.append( "" ); + } + else { + sb.append( "" ); + sb.append( tax_code ); + sb.append( "" ); + } + } + else { + sb.append( tax_code ); + } + sb.append( "" ); + } + + private int compareByDomainId( final DomainSimilarity other ) { + return getDomainId().compareToIgnoreCase( other.getDomainId() ); + } - public StringBuffer toStringBuffer( PRINT_OPTION print_option, Map tax_code_to_id_map ); + private CombinableDomains getCombinableDomains() { + return _combinable_domains; + } + + private DomainSimilarityCalculator.Detailedness getDetaildness() { + return _detailedness; + } + + private StringBuffer getDomainDataInAlphabeticalOrder() { + final SortedMap> m = new TreeMap>(); + final StringBuffer sb = new StringBuffer(); + for( final Species species : getSpeciesData().keySet() ) { + for( final String combable_dom : getCombinableDomainIds( species ) ) { + if ( !m.containsKey( combable_dom ) ) { + m.put( combable_dom, new TreeSet() ); + } + m.get( combable_dom ).add( species.getSpeciesId() ); + } + } + for( final Map.Entry> e : m.entrySet() ) { + sb.append( "" + e.getKey() + "" ); + sb.append( " " ); + sb.append( "" ); + for( final String tax : e.getValue() ) { + final String hex = SurfacingUtil.obtainHexColorStringDependingOnTaxonomyGroup( tax, null ); + if ( !ForesterUtil.isEmpty( hex ) ) { + sb.append( "" ); + sb.append( tax ); + sb.append( "" ); + } + else { + sb.append( tax ); + } + sb.append( " " ); + } + sb.append( "" ); + sb.append( "
\n" ); + } + return sb; + } + + private StringBuffer getSpeciesDataInAlphabeticalOrder( final boolean html, + final Map tax_code_to_id_map, + final Phylogeny phy ) { + final StringBuffer sb = new StringBuffer(); + sb.append( "" ); + for( final Species species : getSpeciesData().keySet() ) { + addSpeciesSpecificDomainData( sb, species, html, tax_code_to_id_map, phy ); + } + sb.append( "
" ); + return sb; + } + + private StringBuffer getSpeciesDataInCustomOrder( final boolean html, + final Map tax_code_to_id_map, + final Phylogeny phy ) { + final StringBuffer sb = new StringBuffer(); + for( final Species order_species : getSpeciesCustomOrder() ) { + if ( getSpeciesData().keySet().contains( order_species ) ) { + addSpeciesSpecificDomainData( sb, order_species, html, tax_code_to_id_map, phy ); + } + else { + sb.append( DomainSimilarity.NO_SPECIES ); + sb.append( DomainSimilarity.SPECIES_SEPARATOR ); + } + } + return sb; + } + + private StringBuffer getTaxonomyGroupDistribution( final Phylogeny tol ) { + final SortedMap> domain_to_species_set_map = new TreeMap>(); + for( final Species species : getSpeciesData().keySet() ) { + for( final String combable_dom : getCombinableDomainIds( species ) ) { + if ( !domain_to_species_set_map.containsKey( combable_dom ) ) { + domain_to_species_set_map.put( combable_dom, new HashSet() ); + } + domain_to_species_set_map.get( combable_dom ).add( species.getSpeciesId() ); + } + } + final StringBuffer sb = new StringBuffer(); + sb.append( "" ); + for( final Map.Entry> domain_to_species_set : domain_to_species_set_map.entrySet() ) { + final Map counts = new HashMap(); + for( final String tax_code : domain_to_species_set.getValue() ) { + final String group = SurfacingUtil.obtainTaxonomyGroup( tax_code, tol ); + if ( !ForesterUtil.isEmpty( group ) ) { + if ( !counts.containsKey( group ) ) { + counts.put( group, 1 ); + } + else { + counts.put( group, counts.get( group ) + 1 ); + } + } + else { + return null; + } + } + final SortedMap> counts_to_groups = new TreeMap>( new Comparator() { + + @Override + public int compare( final Integer first, final Integer second ) { + return second.compareTo( first ); + } + } ); + for( final Map.Entry group_to_counts : counts.entrySet() ) { + final int c = group_to_counts.getValue(); + if ( !counts_to_groups.containsKey( c ) ) { + counts_to_groups.put( c, new TreeSet() ); + } + counts_to_groups.get( c ).add( group_to_counts.getKey() ); + } + sb.append( "" ); + sb.append( "" ); + boolean first = true; + for( final Entry> count_to_groups : counts_to_groups.entrySet() ) { + if ( first ) { + first = false; + } + else { + sb.append( "" ); + sb.append( "" ); + } + sb.append( "" ); + sb.append( "" ); + } + sb.append( ForesterUtil.getLineSeparator() ); + } + sb.append( "
" ); + sb.append( "" + + domain_to_species_set.getKey() + "" ); + sb.append( " " ); + sb.append( "
" ); + sb.append( "" ); + final SortedSet groups = count_to_groups.getValue(); + sb.append( count_to_groups.getKey() ); + sb.append( " " ); + for( final String group : groups ) { + final Color color = ForesterUtil.obtainColorDependingOnTaxonomyGroup( group ); + if ( color == null ) { + throw new IllegalArgumentException( "no color found for taxonomy group\"" + group + "\"" ); + } + final String hex = String.format( "#%02x%02x%02x", + color.getRed(), + color.getGreen(), + color.getBlue() ); + sb.append( "" ); + sb.append( " " ); + sb.append( group ); + sb.append( "" ); + } + sb.append( "
" ); + return sb; + } + + private void init() { + _detailedness = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS; + } + + private boolean isTreatAsBinaryComparison() { + return _treat_as_binary_comparison; + } + + private StringBuffer toStringBufferDetailedHTML( final Map tax_code_to_id_map, + final Phylogeny phy, + final boolean output_tax_codes_per_domain ) { + final StringBuffer sb = new StringBuffer(); + sb.append( "" ); + sb.append( "" ); + sb.append( "" ); + sb.append( "" + + getDomainId() + "" ); + sb.append( "" ); + sb.append( "" ); + sb.append( "" ); + sb.append( "" ); + sb.append( "gs" ); + sb.append( "" ); + if ( getMaximalSimilarityScore() > 0 ) { + sb.append( "" ); + sb.append( ForesterUtil.round( getMeanSimilarityScore(), 3 ) ); + sb.append( "" ); + if ( SurfacingConstants.PRINT_MORE_DOM_SIMILARITY_INFO ) { + if ( !isTreatAsBinaryComparison() ) { + sb.append( "" ); + sb.append( "(" ); + sb.append( ForesterUtil.round( getStandardDeviationOfSimilarityScore(), 3 ) ); + sb.append( ")" ); + sb.append( "" ); + sb.append( "" ); + sb.append( "[" ); + sb.append( ForesterUtil.round( getMinimalSimilarityScore(), 3 ) ); + sb.append( "-" ); + sb.append( ForesterUtil.round( getMaximalSimilarityScore(), 3 ) ); + sb.append( "]" ); + sb.append( "" ); + } + } + } + sb.append( "" ); + sb.append( getMaximalDifference() ); + sb.append( "" ); + sb.append( "" ); + if ( isTreatAsBinaryComparison() ) { + sb.append( getMaximalDifferenceInCounts() ); + } + else { + sb.append( Math.abs( getMaximalDifferenceInCounts() ) ); + } + sb.append( "" ); + if ( !isTreatAsBinaryComparison() ) { + sb.append( "" ); + sb.append( "" ); + sb.append( getSpeciesData().size() ); + sb.append( "" ); + sb.append( "" ); + } + if ( ( getSpeciesCustomOrder() == null ) || getSpeciesCustomOrder().isEmpty() ) { + sb.append( "" ); + sb.append( getSpeciesDataInAlphabeticalOrder( true, tax_code_to_id_map, phy ) ); + if ( output_tax_codes_per_domain ) { + sb.append( getDomainDataInAlphabeticalOrder() ); + } + sb.append( getTaxonomyGroupDistribution( phy ) ); + sb.append( "" ); + } + else { + sb.append( "" ); + sb.append( getSpeciesDataInCustomOrder( true, tax_code_to_id_map, phy ) ); + if ( output_tax_codes_per_domain ) { + sb.append( getDomainDataInAlphabeticalOrder() ); + } + sb.append( getTaxonomyGroupDistribution( phy ) ); + sb.append( "" ); + } + sb.append( "" ); + return sb; + } + + private StringBuffer toStringBufferSimpleTabDelimited() { + final StringBuffer sb = new StringBuffer(); + sb.append( getDomainId() ); + sb.append( "\t" ); + sb.append( getSpeciesDataInAlphabeticalOrder( false, null, null ) ); + sb.append( "\n" ); + return sb; + } + + static public enum DomainSimilarityScoring { + COMBINATIONS, DOMAINS, PROTEINS; + } + + public static enum DomainSimilaritySortField { + ABS_MAX_COUNTS_DIFFERENCE, DOMAIN_ID, MAX, MAX_COUNTS_DIFFERENCE, MAX_DIFFERENCE, MEAN, MIN, SD, SPECIES_COUNT, + } + + public static enum PRINT_OPTION { + HTML, SIMPLE_TAB_DELIMITED; + } + + class ValueComparator implements Comparator { + + final private Map _base; + + public ValueComparator( final Map base ) { + _base = base; + } + + public int compare( final String a, final String b ) { + if ( _base.get( a ) >= _base.get( b ) ) { + return -1; + } + else { + return 1; + } // returning 0 would merge keys + } + } }