// $Id:
//
// FORESTER -- software libraries and applications
// for evolutionary biology research and applications.
//
// Copyright (C) 2008-2009 Christian M. Zmasek
// Copyright (C) 2008-2009 Burnham Institute for Medical Research
// All rights reserved
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
// WWW: https://sites.google.com/site/cmzmasek/home/software/forester

package org.forester.surfacing;

import java.awt.Color;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import org.forester.phylogeny.Phylogeny;
import org.forester.species.Species;
import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
import org.forester.util.ForesterUtil;

/**
 * Holds the similarity statistics (min, max, mean, standard deviation, N and
 * the two "difference" values) together with the per-species data for one key
 * domain, and renders them either as tab-delimited text or as HTML.
 * <p>
 * Instances order themselves by domain id, compared case-insensitively
 * (see {@link #compareTo}).
 * <p>
 * NOTE(review): this copy of the file looks damaged, apparently by a
 * markup-stripping step -- restore it from the upstream source before relying
 * on it:
 * <ul>
 * <li>generic type arguments are missing (for example the declaration of
 * {@code m} in {@code getDomainDataInAlphabeticalOrder} has unbalanced angle
 * brackets, and {@code Comparable}/{@code Comparator} are used raw);</li>
 * <li>most string literals appended while building HTML are empty, and some
 * contain a raw line break, which is not valid Java;</li>
 * <li>in {@code getTaxonomyGroupDistribution} several statements appear after
 * the loops whose variables they use.</li>
 * </ul>
 * The code tokens below are left exactly as found; only layout and comments
 * were changed.
 */
public class DomainSimilarity implements Comparable {

    /** Appended after {@link #NO_SPECIES} for species without data when printing in custom order. */
    final public static String               SPECIES_SEPARATOR          = " ";
    /** Value returned by {@link #compareTo} when an instance is compared with itself. */
    final private static int                 EQUAL                      = 0;
    /** Placeholder appended for a species of the custom order that has no data here. */
    final private static String              NO_SPECIES                 = " ";
    /** Passed to the HTML renderer; when true the per-domain tax code listing is appended as well. */
    private static final boolean             OUTPUT_TAXCODES_PER_DOMAIN = false;
    final private CombinableDomains          _combinable_domains;
    // Not final: set to PUNCTILIOUS by init() and replaceable through setDetailedness().
    private DomainSimilarityCalculator.Detailedness _detailedness;
    final private double                     _max;
    private final int                        _max_difference;
    private final int                        _max_difference_in_counts;
    final private double                     _mean;
    final private double                     _min;
    final private int                        _n;
    final private double                     _sd;
    // Iterated with Species keys throughout this class; the value type is not visible in this copy.
    final private SortedMap                  _species_data;
    // Optional custom species order; null until setSpeciesOrder() is called.
    private List                             _species_order;
    private final boolean                    _treat_as_binary_comparison;

    /**
     * Creates a similarity carrying explicit score statistics.
     * <p>
     * The number of species {@code s} (size of {@code species_data}) and
     * {@code n} must satisfy {@code s * s - s == 2 * n}.
     * <p>
     * NOTE(review): {@code median} and {@code sort_by_species_count_first}
     * are accepted but never read in this constructor.
     *
     * @param combinable_domains the combinable domains of the key domain; must not be null
     * @param min minimal similarity score
     * @param max maximal similarity score; must not be smaller than {@code min}
     * @param mean mean similarity score
     * @param median unused
     * @param sd standard deviation of the similarity score; must not be negative
     * @param n number of pairwise distances; must not be negative, and must be
     *        at least 1 when there is more than one species
     * @param max_difference_in_counts (maximal) difference in counts; must not
     *        be negative when there are more than two species
     * @param max_difference (maximal) difference; must not be negative when
     *        there are more than two species
     * @param species_data per-species data; must not be null or empty
     * @param sort_by_species_count_first unused
     * @param treat_as_binary_comparison stored and later used by the HTML output
     * @throws IllegalArgumentException if any of the constraints above is violated
     */
    public DomainSimilarity( final CombinableDomains combinable_domains,
                             final double min,
                             final double max,
                             final double mean,
                             final double median,
                             final double sd,
                             final int n,
                             final int max_difference_in_counts,
                             final int max_difference,
                             final SortedMap species_data,
                             final boolean sort_by_species_count_first,
                             final boolean treat_as_binary_comparison ) {
        if ( combinable_domains == null ) {
            throw new IllegalArgumentException( "attempt to use null combinable domains" );
        }
        if ( species_data == null ) {
            throw new IllegalArgumentException( "attempt to use null species data" );
        }
        if ( species_data.size() < 1 ) {
            throw new IllegalArgumentException( "attempt to use empty species data" );
        }
        if ( n < 0 ) {
            throw new IllegalArgumentException( "attempt to use N less than 0" );
        }
        if ( ( species_data.size() > 1 ) && ( n < 1 ) ) {
            throw new IllegalArgumentException( "attempt to use N less than 1" );
        }
        if ( sd < 0.0 ) {
            throw new IllegalArgumentException( "attempt to use negative SD" );
        }
        if ( max < min ) {
            throw new IllegalArgumentException( "attempt to use max smaller than min" );
        }
        init();
        _combinable_domains = combinable_domains;
        _min = min;
        _max = max;
        _mean = mean;
        _sd = sd;
        _n = n;
        _max_difference_in_counts = max_difference_in_counts;
        _max_difference = max_difference;
        _species_data = species_data;
        _treat_as_binary_comparison = treat_as_binary_comparison;
        final int s = species_data.size();
        // s species give s * (s - 1) / 2 unordered pairs, so n must equal that number.
        if ( ( ( s * s ) - s ) != ( getN() * 2 ) ) {
            throw new IllegalArgumentException( "illegal species count and n: species count:" + s + ", n:" + _n
                    + " for domain " + combinable_domains.getKeyDomain() );
        }
        if ( s > 2 ) {
            if ( getMaximalDifferenceInCounts() < 0 ) {
                throw new IllegalArgumentException( "attempt to use negative max difference in counts with more than two species" );
            }
            if ( getMaximalDifference() < 0 ) {
                throw new IllegalArgumentException( "attempt to use negative max difference with more than two species" );
            }
        }
    }

    /**
     * Creates a similarity without score statistics: min, max, mean, SD and N
     * are all set to -1.
     * <p>
     * NOTE(review): {@code sort_by_species_count_first} is accepted but never
     * read in this constructor.
     *
     * @param combinable_domains the combinable domains of the key domain; must not be null
     * @param max_difference_in_counts (maximal) difference in counts; must not
     *        be negative when there are more than two species
     * @param max_difference (maximal) difference; must not be negative when
     *        there are more than two species
     * @param species_data per-species data; must not be null or empty
     * @param sort_by_species_count_first unused
     * @param treat_as_binary_comparison stored and later used by the HTML output
     * @throws IllegalArgumentException if any of the constraints above is violated
     */
    public DomainSimilarity( final CombinableDomains combinable_domains,
                             final int max_difference_in_counts,
                             final int max_difference,
                             final SortedMap species_data,
                             final boolean sort_by_species_count_first,
                             final boolean treat_as_binary_comparison ) {
        if ( combinable_domains == null ) {
            throw new IllegalArgumentException( "attempt to use null combinable domains" );
        }
        if ( species_data == null ) {
            throw new IllegalArgumentException( "attempt to use null species data" );
        }
        if ( species_data.size() < 1 ) {
            throw new IllegalArgumentException( "attempt to use empty species data" );
        }
        init();
        _combinable_domains = combinable_domains;
        _min = -1;
        _max = -1;
        _mean = -1;
        _sd = -1;
        _n = -1;
        _max_difference_in_counts = max_difference_in_counts;
        _max_difference = max_difference;
        _species_data = species_data;
        _treat_as_binary_comparison = treat_as_binary_comparison;
        final int s = species_data.size();
        if ( s > 2 ) {
            if ( getMaximalDifferenceInCounts() < 0 ) {
                throw new IllegalArgumentException( "attempt to use negative max difference in counts with more than two species" );
            }
            if ( getMaximalDifference() < 0 ) {
                throw new IllegalArgumentException( "attempt to use negative max difference with more than two species" );
            }
        }
    }

    /**
     * Compares by domain id, ignoring case.
     * <p>
     * NOTE(review): the class declares raw {@code Comparable} while this
     * method takes a {@code DomainSimilarity}; the type argument was
     * presumably lost in this copy -- confirm against upstream.
     *
     * @param domain_similarity the instance to compare with
     * @return 0 for the same instance, otherwise the result of the
     *         case-insensitive comparison of the two domain ids
     * @throws IllegalArgumentException if the argument is null or of a different class
     */
    public int compareTo( final DomainSimilarity domain_similarity ) {
        if ( this == domain_similarity ) {
            return EQUAL;
        }
        else if ( domain_similarity == null ) {
            throw new IllegalArgumentException( "attempt to compare " + this.getClass() + " to null" );
        }
        else if ( domain_similarity.getClass() != this.getClass() ) {
            throw new IllegalArgumentException( "attempt to compare " + this.getClass() + " to "
                    + domain_similarity.getClass() );
        }
        return compareByDomainId( domain_similarity );
    }

    /**
     * Returns the ids of the domains combinable with the key domain in the
     * given species, sorted.
     *
     * @param species_of_combinable_domain the species to look up
     * @return a new sorted set of ids; empty if the species is not a key of the species data
     */
    public SortedSet getCombinableDomainIds( final Species species_of_combinable_domain ) {
        final SortedSet sorted_ids = new TreeSet();
        if ( getSpeciesData().containsKey( species_of_combinable_domain ) ) {
            for( final String id : getSpeciesData().get( species_of_combinable_domain )
                    .getCombinableDomainIdToCountsMap().keySet() ) {
                sorted_ids.add( id );
            }
        }
        return sorted_ids;
    }

    /**
     * @return the key domain of the combinable domains this similarity was built for
     */
    public String getDomainId() {
        return getCombinableDomains().getKeyDomain();
    }

    /**
     * For pairwise similarities, this should return the "difference"; for
     * example the difference in counts for copy number based features (the
     * same as getMaximalDifferenceInCounts()), or the number of actually
     * different domain combinations.
     * For pairwise similarities, this should return the difference,
     * while for comparisons of more than two domains, this should return the
     * maximal difference.
     *
     * @return the (maximal) difference as passed to the constructor
     */
    public int getMaximalDifference() {
        return _max_difference;
    }

    /**
     * For pairwise similarities, this should return the difference in counts,
     * while for comparisons of more than two domains, this should return the
     * maximal difference in counts.
     *
     * @return the (maximal) difference in counts
     */
    public int getMaximalDifferenceInCounts() {
        return _max_difference_in_counts;
    }

    /**
     * @return the maximal similarity score, or -1 if constructed without statistics
     */
    public double getMaximalSimilarityScore() {
        return _max;
    }

    /**
     * @return the mean similarity score, or -1 if constructed without statistics
     */
    public double getMeanSimilarityScore() {
        return _mean;
    }

    /**
     * @return the minimal similarity score, or -1 if constructed without statistics
     */
    public double getMinimalSimilarityScore() {
        return _min;
    }

    /**
     * This should return the number of pairwise distances used to calculate
     * this similarity score.
     *
     * @return the number of pairwise distances, or -1 if constructed without statistics
     */
    public int getN() {
        return _n;
    }

    /**
     * @return a new sorted set holding every key of the species data
     */
    public SortedSet getSpecies() {
        final SortedSet species = new TreeSet();
        for( final Species s : getSpeciesData().keySet() ) {
            species.add( s );
        }
        return species;
    }

    /**
     * @return the list given to {@link #setSpeciesOrder}, or null if none was set
     */
    public List getSpeciesCustomOrder() {
        return _species_order;
    }

    /**
     * This should return a map, which maps species to
     * SpeciesSpecificDomainSimilariyData.
     * <p>
     * The map handed to the constructor is returned as is, not a copy.
     *
     * @return SortedMap
     */
    public SortedMap getSpeciesData() {
        return _species_data;
    }

    /**
     * @return the standard deviation of the similarity score, or -1 if constructed without statistics
     */
    public double getStandardDeviationOfSimilarityScore() {
        return _sd;
    }

    /**
     * Sets the level of detail used when per-species data is printed.
     *
     * @param detailedness the new level of detail
     */
    public void setDetailedness( final Detailedness detailedness ) {
        _detailedness = detailedness;
    }

    /**
     * Sets a custom order in which species are printed.
     * <p>
     * NOTE(review): a null argument is dereferenced here and would raise a
     * NullPointerException rather than an IllegalArgumentException.
     *
     * @param species_order the order; must contain every species of the species data
     * @throws IllegalArgumentException if a species of the species data is missing from the list
     */
    public void setSpeciesOrder( final List species_order ) {
        if ( !species_order.containsAll( getSpeciesData().keySet() ) ) {
            throw new IllegalArgumentException( "list to order species must contain all species of multiple combinable domains similarity" );
        }
        _species_order = species_order;
    }

    /**
     * Renders this similarity in the requested format.
     *
     * @param print_option SIMPLE_TAB_DELIMITED or HTML
     * @param tax_code_to_id_map passed on to the HTML renderer; not used for tab-delimited output
     * @param phy passed on to the HTML renderer; not used for tab-delimited output
     * @return the rendered text
     * @throws AssertionError for any other print option
     */
    public StringBuffer toStringBuffer( final DomainSimilarity.PRINT_OPTION print_option,
                                        final Map tax_code_to_id_map,
                                        final Phylogeny phy ) {
        switch ( print_option ) {
            case SIMPLE_TAB_DELIMITED:
                return toStringBufferSimpleTabDelimited();
            case HTML:
                return toStringBufferDetailedHTML( tax_code_to_id_map, phy, OUTPUT_TAXCODES_PER_DOMAIN );
            default:
                throw new AssertionError( "Unknown print option: " + print_option );
        }
    }

    /**
     * Appends the entry for one species to {@code sb}: the species id (through
     * {@link #addTaxWithLink} in HTML mode), followed -- unless the
     * detailedness is BASIC -- by that species' data, followed by a terminator.
     *
     * @param sb the buffer to append to
     * @param species the species to print; looked up in the species data
     * @param html true for HTML output, false for tab-delimited text
     * @param tax_code_to_id_map passed on to addTaxWithLink in HTML mode
     * @param phy passed on to addTaxWithLink in HTML mode
     */
    private void addSpeciesSpecificDomainData( final StringBuffer sb,
                                               final Species species,
                                               final boolean html,
                                               final Map tax_code_to_id_map,
                                               final Phylogeny phy ) {
        if ( html ) {
            sb.append( "" );
            sb.append( "" );
            addTaxWithLink( sb, species.getSpeciesId(), tax_code_to_id_map, phy );
            sb.append( "" );
        }
        else {
            sb.append( species.getSpeciesId() );
        }
        if ( getDetaildness() != DomainSimilarityCalculator.Detailedness.BASIC ) {
            if ( html ) {
                //sb.append( ":" );
            }
            else {
                sb.append( "\t" );
            }
            sb.append( getSpeciesData().get( species ).toStringBuffer( getDetaildness(), html ) );
        }
        if ( html ) {
            // A disabled sb.append( ... ) call stood here; its literal is split
            // across a line break in this copy.
            sb.append( "" );
        }
        else {
            sb.append( "\n\t" );
        }
    }

    /**
     * Appends {@code tax_code} to {@code sb}, surrounded by further literals
     * when the code is non-empty and a key of {@code tax_code_to_id_map}.
     * <p>
     * NOTE(review): {@code hex} is obtained and tested but never appended, and
     * both branches of the inner if are identical in this copy; the
     * surrounding literals are empty. Presumably link/colour markup was lost
     * -- confirm against upstream.
     *
     * @param sb the buffer to append to
     * @param tax_code the taxonomy code to print
     * @param tax_code_to_id_map may be null
     * @param phy used to look up a colour when it is non-null and not empty
     */
    private void addTaxWithLink( final StringBuffer sb,
                                 final String tax_code,
                                 final Map tax_code_to_id_map,
                                 final Phylogeny phy ) {
        String hex = null;
        if ( ( phy != null ) && !phy.isEmpty() ) {
            hex = SurfacingUtil.obtainHexColorStringDependingOnTaxonomyGroup( tax_code, phy );
        }
        sb.append( "" );
        if ( !ForesterUtil.isEmpty( tax_code )
                && ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( tax_code ) ) ) {
            if ( !ForesterUtil.isEmpty( hex ) ) {
                sb.append( "" );
                sb.append( tax_code );
                sb.append( "" );
            }
            else {
                sb.append( "" );
                sb.append( tax_code );
                sb.append( "" );
            }
        }
        else {
            sb.append( tax_code );
        }
        sb.append( "" );
    }

    /**
     * @param other the instance to compare with
     * @return the case-insensitive comparison of this domain id with the other's
     */
    private int compareByDomainId( final DomainSimilarity other ) {
        return getDomainId().compareToIgnoreCase( other.getDomainId() );
    }

    private CombinableDomains getCombinableDomains() {
        return _combinable_domains;
    }

    // The method name is spelled "Detaildness" throughout this class.
    private DomainSimilarityCalculator.Detailedness getDetaildness() {
        return _detailedness;
    }

    /**
     * Builds, for every combinable domain id (sorted), the sorted list of
     * species ids in which it occurs.
     * <p>
     * NOTE(review): the colour lookup is called with a null phylogeny here;
     * whether the helper accepts null is not visible in this file. The
     * declaration of {@code m} and the loop over its entries are missing their
     * type arguments, and the last appended literal contains a raw line break.
     *
     * @return the listing
     */
    private StringBuffer getDomainDataInAlphabeticalOrder() {
        final SortedMap> m = new TreeMap>();
        final StringBuffer sb = new StringBuffer();
        // Invert species -> domain ids into domain id -> species ids.
        for( final Species species : getSpeciesData().keySet() ) {
            for( final String combable_dom : getCombinableDomainIds( species ) ) {
                if ( !m.containsKey( combable_dom ) ) {
                    m.put( combable_dom, new TreeSet() );
                }
                m.get( combable_dom ).add( species.getSpeciesId() );
            }
        }
        for( final Map.Entry> e : m.entrySet() ) {
            sb.append( "" + e.getKey() + "" );
            sb.append( " " );
            sb.append( "" );
            for( final String tax : e.getValue() ) {
                final String hex = SurfacingUtil.obtainHexColorStringDependingOnTaxonomyGroup( tax, null );
                if ( !ForesterUtil.isEmpty( hex ) ) {
                    sb.append( "" );
                    sb.append( tax );
                    sb.append( "" );
                }
                else {
                    sb.append( tax );
                }
                sb.append( " " );
            }
            sb.append( "" );
            // NOTE(review): raw line break inside the literal below, left as found.
            sb.append( "
\n" );
        }
        return sb;
    }

    /**
     * Appends the entry of every species of the species data, in the key order
     * of that sorted map.
     *
     * @param html true for HTML output, false for tab-delimited text
     * @param tax_code_to_id_map passed on to addSpeciesSpecificDomainData
     * @param phy passed on to addSpeciesSpecificDomainData
     * @return the listing
     */
    private StringBuffer getSpeciesDataInAlphabeticalOrder( final boolean html,
                                                            final Map tax_code_to_id_map,
                                                            final Phylogeny phy ) {
        final StringBuffer sb = new StringBuffer();
        sb.append( "" );
        for( final Species species : getSpeciesData().keySet() ) {
            addSpeciesSpecificDomainData( sb, species, html, tax_code_to_id_map, phy );
        }
        // NOTE(review): raw line break inside the literal below, left as found.
        sb.append( "
" );
        return sb;
    }

    /**
     * Appends the entries in the custom species order. A species of that order
     * without data here contributes NO_SPECIES followed by SPECIES_SEPARATOR.
     *
     * @param html true for HTML output, false for tab-delimited text
     * @param tax_code_to_id_map passed on to addSpeciesSpecificDomainData
     * @param phy passed on to addSpeciesSpecificDomainData
     * @return the listing
     */
    private StringBuffer getSpeciesDataInCustomOrder( final boolean html,
                                                      final Map tax_code_to_id_map,
                                                      final Phylogeny phy ) {
        final StringBuffer sb = new StringBuffer();
        for( final Species order_species : getSpeciesCustomOrder() ) {
            if ( getSpeciesData().keySet().contains( order_species ) ) {
                addSpeciesSpecificDomainData( sb, order_species, html, tax_code_to_id_map, phy );
            }
            else {
                sb.append( DomainSimilarity.NO_SPECIES );
                sb.append( DomainSimilarity.SPECIES_SEPARATOR );
            }
        }
        return sb;
    }

    /**
     * For every combinable domain id, counts how many of its species fall into
     * each taxonomy group and groups the taxonomy groups by that count, with
     * larger counts first (the comparator reverses the natural Integer order).
     * <p>
     * Returns null as soon as the taxonomy group lookup yields an empty result
     * for any tax code.
     * <p>
     * NOTE(review): the statements after the outer loop use
     * {@code domain_to_species_set} and {@code count_to_groups}, which are
     * declared by the loops above and are out of scope there, and {@code hex}
     * is computed but never appended. The statement order in this copy is
     * presumably displaced -- restore from upstream before changing anything.
     *
     * @param tol the phylogeny passed to the taxonomy group lookup
     * @return the distribution, or null (see above)
     */
    private StringBuffer getTaxonomyGroupDistribution( final Phylogeny tol ) {
        final SortedMap> domain_to_species_set_map = new TreeMap>();
        for( final Species species : getSpeciesData().keySet() ) {
            for( final String combable_dom : getCombinableDomainIds( species ) ) {
                if ( !domain_to_species_set_map.containsKey( combable_dom ) ) {
                    domain_to_species_set_map.put( combable_dom, new HashSet() );
                }
                domain_to_species_set_map.get( combable_dom ).add( species.getSpeciesId() );
            }
        }
        final StringBuffer sb = new StringBuffer();
        sb.append( "" );
        for( final Map.Entry> domain_to_species_set : domain_to_species_set_map.entrySet() ) {
            // taxonomy group -> number of species of this domain in that group
            final Map counts = new HashMap();
            for( final String tax_code : domain_to_species_set.getValue() ) {
                final String group = SurfacingUtil.obtainTaxonomyGroup( tax_code, tol );
                if ( !ForesterUtil.isEmpty( group ) ) {
                    if ( !counts.containsKey( group ) ) {
                        counts.put( group, 1 );
                    }
                    else {
                        counts.put( group, counts.get( group ) + 1 );
                    }
                }
                else {
                    return null;
                }
            }
            // count -> taxonomy groups with that count, largest count first
            final SortedMap> counts_to_groups = new TreeMap>( new Comparator() {

                @Override
                public int compare( final Integer first, final Integer second ) {
                    return second.compareTo( first );
                }
            } );
            for( final Map.Entry group_to_counts : counts.entrySet() ) {
                final int c = group_to_counts.getValue();
                if ( !counts_to_groups.containsKey( c ) ) {
                    counts_to_groups.put( c, new TreeSet() );
                }
                counts_to_groups.get( c ).add( group_to_counts.getKey() );
            }
            sb.append( "" );
            sb.append( "" );
            boolean first = true;
            for( final Entry> count_to_groups : counts_to_groups.entrySet() ) {
                if ( first ) {
                    first = false;
                }
                else {
                    sb.append( "" );
                    sb.append( "" );
                }
                sb.append( "" );
                sb.append( "" );
            }
            sb.append( ForesterUtil.getLineSeparator() );
        }
        // NOTE(review): everything from here to the return looks displaced (see
        // the method comment); three literals contain raw line breaks, left as found.
        sb.append( "
" );
        sb.append( "" + domain_to_species_set.getKey() + "" );
        sb.append( " " );
        sb.append( "
" );
        sb.append( "" );
        final SortedSet groups = count_to_groups.getValue();
        sb.append( count_to_groups.getKey() );
        sb.append( " " );
        for( final String group : groups ) {
            final Color color = ForesterUtil.obtainColorDependingOnTaxonomyGroup( group );
            if ( color == null ) {
                throw new IllegalArgumentException( "no color found for taxonomy group\"" + group + "\"" );
            }
            // Two lowercase hex digits per colour channel, prefixed with '#'.
            final String hex = String.format( "#%02x%02x%02x",
                                              color.getRed(),
                                              color.getGreen(),
                                              color.getBlue() );
            sb.append( "" );
            sb.append( " " );
            sb.append( group );
            sb.append( "" );
        }
        sb.append( "
" );
        return sb;
    }

    // Called first by both constructors; sets the default level of detail.
    private void init() {
        _detailedness = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS;
    }

    private boolean isTreatAsBinaryComparison() {
        return _treat_as_binary_comparison;
    }

    /**
     * Renders this similarity as HTML: the domain id, the mean score (only if
     * the maximal score is greater than 0; SD and the min-max range are added
     * when SurfacingConstants.PRINT_MORE_DOM_SIMILARITY_INFO is set and this
     * is not a binary comparison), the maximal difference, the maximal
     * difference in counts (its absolute value unless this is a binary
     * comparison), the number of species (not for binary comparisons), the
     * per-species data in alphabetical or custom order, and the taxonomy group
     * distribution.
     * <p>
     * NOTE(review): getTaxonomyGroupDistribution can return null, and
     * StringBuffer.append(StringBuffer) appends the four characters "null" for
     * a null argument -- confirm that this is intended.
     *
     * @param tax_code_to_id_map passed on to the per-species renderers
     * @param phy passed on to the per-species renderers and to getTaxonomyGroupDistribution
     * @param output_tax_codes_per_domain if true, the per-domain tax code listing is appended as well
     * @return the rendered HTML
     */
    private StringBuffer toStringBufferDetailedHTML( final Map tax_code_to_id_map,
                                                     final Phylogeny phy,
                                                     final boolean output_tax_codes_per_domain ) {
        final StringBuffer sb = new StringBuffer();
        sb.append( "" );
        sb.append( "" );
        sb.append( "" );
        sb.append( "" + getDomainId() + "" );
        sb.append( "" );
        sb.append( "" );
        sb.append( "" );
        sb.append( "" );
        sb.append( "gs" );
        sb.append( "" );
        if ( getMaximalSimilarityScore() > 0 ) {
            sb.append( "" );
            sb.append( ForesterUtil.round( getMeanSimilarityScore(), 3 ) );
            sb.append( "" );
            if ( SurfacingConstants.PRINT_MORE_DOM_SIMILARITY_INFO ) {
                if ( !isTreatAsBinaryComparison() ) {
                    sb.append( "" );
                    sb.append( "(" );
                    sb.append( ForesterUtil.round( getStandardDeviationOfSimilarityScore(), 3 ) );
                    sb.append( ")" );
                    sb.append( "" );
                    sb.append( "" );
                    sb.append( "[" );
                    sb.append( ForesterUtil.round( getMinimalSimilarityScore(), 3 ) );
                    sb.append( "-" );
                    sb.append( ForesterUtil.round( getMaximalSimilarityScore(), 3 ) );
                    sb.append( "]" );
                    sb.append( "" );
                }
            }
        }
        sb.append( "" );
        sb.append( getMaximalDifference() );
        sb.append( "" );
        sb.append( "" );
        if ( isTreatAsBinaryComparison() ) {
            sb.append( getMaximalDifferenceInCounts() );
        }
        else {
            sb.append( Math.abs( getMaximalDifferenceInCounts() ) );
        }
        sb.append( "" );
        if ( !isTreatAsBinaryComparison() ) {
            sb.append( "" );
            sb.append( "" );
            sb.append( getSpeciesData().size() );
            sb.append( "" );
            sb.append( "" );
        }
        if ( ( getSpeciesCustomOrder() == null ) || getSpeciesCustomOrder().isEmpty() ) {
            sb.append( "" );
            sb.append( getSpeciesDataInAlphabeticalOrder( true, tax_code_to_id_map, phy ) );
            if ( output_tax_codes_per_domain ) {
                sb.append( getDomainDataInAlphabeticalOrder() );
            }
            sb.append( getTaxonomyGroupDistribution( phy ) );
            sb.append( "" );
        }
        else {
            sb.append( "" );
            sb.append( getSpeciesDataInCustomOrder( true, tax_code_to_id_map, phy ) );
            if ( output_tax_codes_per_domain ) {
                sb.append( getDomainDataInAlphabeticalOrder() );
            }
            sb.append( getTaxonomyGroupDistribution( phy ) );
            sb.append( "" );
        }
        sb.append( "" );
        return sb;
    }

    /**
     * Renders this similarity as text: the domain id, a tab, the per-species
     * data in alphabetical order and a newline.
     *
     * @return the rendered text
     */
    private StringBuffer toStringBufferSimpleTabDelimited() {
        final StringBuffer sb = new StringBuffer();
        sb.append( getDomainId() );
        sb.append( "\t" );
        sb.append( getSpeciesDataInAlphabeticalOrder( false, null, null ) );
        sb.append( "\n" );
        return sb;
    }

    // Not referenced anywhere else in this file.
    static public enum DomainSimilarityScoring {
        COMBINATIONS, DOMAINS, PROTEINS;
    }

    // Not referenced anywhere else in this file.
    public static enum DomainSimilaritySortField {
        ABS_MAX_COUNTS_DIFFERENCE, DOMAIN_ID, MAX, MAX_COUNTS_DIFFERENCE, MAX_DIFFERENCE, MEAN, MIN, SD, SPECIES_COUNT,
    }

    /** Output formats accepted by {@link #toStringBuffer}. */
    public static enum PRINT_OPTION {
        HTML, SIMPLE_TAB_DELIMITED;
    }

    /**
     * Orders String keys by the value they map to in a base map, larger values
     * first.
     * <p>
     * NOTE(review): compare never returns 0 (see the inline comment), so it is
     * not consistent with equals and compare( a, a ) is negative. This is a
     * non-static inner class, and it is not used anywhere in this file.
     */
    class ValueComparator implements Comparator {

        final private Map _base;

        public ValueComparator( final Map base ) {
            _base = base;
        }

        public int compare( final String a, final String b ) {
            if ( _base.get( a ) >= _base.get( b ) ) {
                return -1;
            }
            else {
                return 1;
            } // returning 0 would merge keys
        }
    }
}