3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27 package org.forester.surfacing;
29 import java.awt.Color;
30 import java.util.Comparator;
31 import java.util.HashMap;
32 import java.util.HashSet;
33 import java.util.List;
35 import java.util.Map.Entry;
37 import java.util.SortedMap;
38 import java.util.SortedSet;
39 import java.util.TreeMap;
40 import java.util.TreeSet;
42 import org.forester.phylogeny.Phylogeny;
43 import org.forester.species.Species;
44 import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
45 import org.forester.util.ForesterUtil;
// Similarity summary for one key domain: wraps a CombinableDomains together with
// per-species data and summary values (min, max, mean, sd and two "maximal
// difference" values). Natural ordering is by domain id, ignoring case (see compareTo).
47 public class DomainSimilarity implements Comparable<DomainSimilarity> {
// Appended by getSpeciesDataInCustomOrder() directly after NO_SPECIES.
49 final public static String SPECIES_SEPARATOR = " ";
50 final private static int EQUAL = 0;
// Appended by getSpeciesDataInCustomOrder().
51 final private static String NO_SPECIES = " ";
// Passed by toStringBuffer() to the detailed HTML writer as output_tax_codes_per_domain.
52 private static final boolean OUTPUT_TAXCODES_PER_DOMAIN = false;
53 final private CombinableDomains _combinable_domains;
// Not final: assigned by setDetailedness() and init().
54 private DomainSimilarityCalculator.Detailedness _detailedness;
55 final private double _max;
56 private final int _max_difference;
57 private final int _max_difference_in_counts;
58 final private double _mean;
59 final private double _min;
// NOTE(review): a field _n is read in the first constructor's error message, but its
// declaration is not visible in this extract.
61 final private double _sd;
// Stored by reference from the constructor argument (no copy is made there).
62 final private SortedMap<Species, SpeciesSpecificDcData> _species_data;
// Display order assigned by setSpeciesOrder(); the HTML writer treats null or empty as "no custom order".
63 private List<Species> _species_order;
64 private final boolean _treat_as_binary_comparison;
// Constructor that validates its arguments and stores them in the final fields.
// NOTE(review): several parameter lines, guard conditions and field assignments of
// this constructor are not visible in this extract; only the visible checks are described.
// Throws IllegalArgumentException for every failed check below.
66 public DomainSimilarity( final CombinableDomains combinable_domains,
73 final int max_difference_in_counts,
74 final int max_difference,
75 final SortedMap<Species, SpeciesSpecificDcData> species_data,
// sort_by_species_count_first is not referenced in any visible line of this constructor.
76 final boolean sort_by_species_count_first,
77 final boolean treat_as_binary_comparison ) {
78 if ( combinable_domains == null ) {
79 throw new IllegalArgumentException( "attempt to use null combinable domains" );
81 if ( species_data == null ) {
82 throw new IllegalArgumentException( "attempt to use null species data" );
84 if ( species_data.size() < 1 ) {
85 throw new IllegalArgumentException( "attempt to use empty species data" );
// The condition guarding the next throw is not visible here.
88 throw new IllegalArgumentException( "attempt to use N less than 0" );
// With more than one species, n must be at least 1.
90 if ( ( species_data.size() > 1 ) && ( n < 1 ) ) {
91 throw new IllegalArgumentException( "attempt to use N less than 1" );
// The conditions guarding the next two throws are not visible here.
94 throw new IllegalArgumentException( "attempt to use negative SD" );
97 throw new IllegalArgumentException( "attempt to use max smaller than min" );
100 _combinable_domains = combinable_domains;
106 _max_difference_in_counts = max_difference_in_counts;
107 _max_difference = max_difference;
// The caller's map is kept by reference, not copied.
108 _species_data = species_data;
109 _treat_as_binary_comparison = treat_as_binary_comparison;
110 final int s = species_data.size();
// Requires s*s - s == 2*n, i.e. n must equal s*(s-1)/2, the number of unordered species pairs.
111 if ( ( ( s * s ) - s ) != ( getN() * 2 ) ) {
112 throw new IllegalArgumentException( "illegal species count and n: species count:" + s + ", n:" + _n
113 + " for domain " + combinable_domains.getKeyDomain() );
// NOTE(review): the messages below say "with more than two species"; a guard on s, if any,
// is not visible in this extract.
116 if ( getMaximalDifferenceInCounts() < 0 ) {
117 throw new IllegalArgumentException( "attempt to use negative max difference in counts with more than two species" );
119 if ( getMaximalDifference() < 0 ) {
120 throw new IllegalArgumentException( "attempt to use negative max difference with more than two species" );
// Constructor variant whose visible parameter list carries no score statistics.
// Rejects null combinable domains and null or empty species data with
// IllegalArgumentException, then stores the arguments in the final fields.
// NOTE(review): some assignment lines and at least one guard line are not visible in
// this extract (the remaining final fields must be assigned somewhere in the missing lines).
125 public DomainSimilarity( final CombinableDomains combinable_domains,
126 final int max_difference_in_counts,
127 final int max_difference,
128 final SortedMap<Species, SpeciesSpecificDcData> species_data,
// sort_by_species_count_first is not referenced in any visible line of this constructor.
129 final boolean sort_by_species_count_first,
130 final boolean treat_as_binary_comparison ) {
131 if ( combinable_domains == null ) {
132 throw new IllegalArgumentException( "attempt to use null combinable domains" );
134 if ( species_data == null ) {
135 throw new IllegalArgumentException( "attempt to use null species data" );
137 if ( species_data.size() < 1 ) {
138 throw new IllegalArgumentException( "attempt to use empty species data" );
141 _combinable_domains = combinable_domains;
147 _max_difference_in_counts = max_difference_in_counts;
148 _max_difference = max_difference;
// The caller's map is kept by reference, not copied.
149 _species_data = species_data;
150 _treat_as_binary_comparison = treat_as_binary_comparison;
// NOTE(review): the use of s is not visible here; presumably it guards the two checks
// below ("more than two species") -- confirm against the full file.
151 final int s = species_data.size();
153 if ( getMaximalDifferenceInCounts() < 0 ) {
154 throw new IllegalArgumentException( "attempt to use negative max difference in counts with more than two species" );
156 if ( getMaximalDifference() < 0 ) {
157 throw new IllegalArgumentException( "attempt to use negative max difference with more than two species" );
163 public int compareTo( final DomainSimilarity domain_similarity ) {
164 if ( this == domain_similarity ) {
167 else if ( domain_similarity == null ) {
168 throw new IllegalArgumentException( "attempt to compare " + this.getClass() + " to null" );
170 else if ( domain_similarity.getClass() != this.getClass() ) {
171 throw new IllegalArgumentException( "attempt to compare " + this.getClass() + " to "
172 + domain_similarity.getClass() );
174 return compareByDomainId( domain_similarity );
177 public SortedSet<String> getCombinableDomainIds( final Species species_of_combinable_domain ) {
178 final SortedSet<String> sorted_ids = new TreeSet<String>();
179 if ( getSpeciesData().containsKey( species_of_combinable_domain ) ) {
180 for( final String id : getSpeciesData().get( species_of_combinable_domain )
181 .getCombinableDomainIdToCountsMap().keySet() ) {
182 sorted_ids.add( id );
// Returns the key domain of the wrapped combinable domains; this is the id used for ordering and in output.
188 public String getDomainId() {
189 return getCombinableDomains().getKeyDomain();
193 * For pairwise similarities, this should return the "difference"; for example, the difference in counts
194 * for copy-number-based features (the same as getMaximalDifferenceInCounts()), or the number
195 * of actually different domain combinations.
196 * For pairwise similarities, this should return the difference,
197 * while for comparisons of more than two domains, this should return the maximal difference.
// Returns the max_difference value passed to the constructor, unchanged.
200 public int getMaximalDifference() {
201 return _max_difference;
205 * For pairwise similarities, this should return the difference in counts,
206 * while for comparisons of more than two domains, this should return the maximal difference
210 * @return the (maximal) difference in counts
// Returns the max_difference_in_counts value passed to the constructor, unchanged
// (the HTML writer prints its absolute value for non-binary comparisons).
212 public int getMaximalDifferenceInCounts() {
213 return _max_difference_in_counts;
// Accessor for the maximal similarity score.
// NOTE(review): the body is not visible in this extract; presumably it returns _max -- confirm.
216 public double getMaximalSimilarityScore() {
// Accessor for the mean similarity score.
// NOTE(review): the body is not visible in this extract; presumably it returns _mean -- confirm.
220 public double getMeanSimilarityScore() {
// Accessor for the minimal similarity score.
// NOTE(review): the body is not visible in this extract; presumably it returns _min -- confirm.
224 public double getMinimalSimilarityScore() {
229 * This should return the number of pairwise distances used to calculate
230 * this similarity score
232 * @return the number of pairwise distances
// Builds a new sorted set while iterating over the species keys of the species data map.
// NOTE(review): the loop body and the return statement are not visible in this extract;
// presumably each key is added and the set is returned -- confirm.
238 public SortedSet<Species> getSpecies() {
239 final SortedSet<Species> species = new TreeSet<Species>();
240 for( final Species s : getSpeciesData().keySet() ) {
// Returns the list stored by setSpeciesOrder() itself (not a copy); the field has no
// visible initializer, and the HTML writer checks the result for null.
246 public List<Species> getSpeciesCustomOrder() {
247 return _species_order;
251 * This should return a map, which maps species to
252 * SpeciesSpecificDcData
255 * @return SortedMap<Species, SpeciesSpecificDcData>
// Returns the internal species-to-data map itself (the same instance passed to the constructor, not a copy).
257 public SortedMap<Species, SpeciesSpecificDcData> getSpeciesData() {
258 return _species_data;
// Accessor for the standard deviation of the similarity score.
// NOTE(review): the body is not visible in this extract; presumably it returns _sd -- confirm.
261 public double getStandardDeviationOfSimilarityScore() {
// Sets the level of detail used when species-specific data is rendered
// (read back through getDetaildness() in addSpeciesSpecificDomainData).
265 public void setDetailedness( final Detailedness detailedness ) {
266 _detailedness = detailedness;
// Sets the custom display order of species. The list must contain every species
// present in the species data, otherwise IllegalArgumentException is thrown.
// A null list fails with NullPointerException on the containsAll call.
// The list is stored by reference, not copied.
269 public void setSpeciesOrder( final List<Species> species_order ) {
270 if ( !species_order.containsAll( getSpeciesData().keySet() ) ) {
271 throw new IllegalArgumentException( "list to order species must contain all species of multiple combinable domains similarity" );
273 _species_order = species_order;
// Renders this similarity in the requested format by dispatching on print_option.
// SIMPLE_TAB_DELIMITED ignores tax_code_to_id_map and phy; the detailed HTML path
// receives both plus the OUTPUT_TAXCODES_PER_DOMAIN constant.
// NOTE(review): the case label for the HTML branch and the label guarding the
// AssertionError are not visible in this extract.
276 public StringBuffer toStringBuffer( final DomainSimilarity.PRINT_OPTION print_option,
277 final Map<String, Integer> tax_code_to_id_map,
278 final Phylogeny phy ) {
279 switch ( print_option ) {
280 case SIMPLE_TAB_DELIMITED:
281 return toStringBufferSimpleTabDelimited();
283 return toStringBufferDetailedHTML( tax_code_to_id_map, phy, OUTPUT_TAXCODES_PER_DOMAIN );
285 throw new AssertionError( "Unknown print option: " + print_option );
// Appends the entry for one species to sb: the species id (in one visible path via
// addTaxWithLink, in another as plain text) and, when the detailedness is not BASIC,
// the species-specific data rendered by SpeciesSpecificDcData.toStringBuffer().
// NOTE(review): the declaration of the html parameter, the branch choosing between
// the two id paths and several appends are not visible in this extract.
289 private void addSpeciesSpecificDomainData( final StringBuffer sb,
290 final Species species,
292 final Map<String, Integer> tax_code_to_id_map,
293 final Phylogeny phy ) {
297 addTaxWithLink( sb, species.getSpeciesId(), tax_code_to_id_map, phy );
298 sb.append( "</td>" );
301 sb.append( species.getSpeciesId() );
303 if ( getDetaildness() != DomainSimilarityCalculator.Detailedness.BASIC ) {
// Looks up the data for this species; assumes the species is a key of the species data map.
310 sb.append( getSpeciesData().get( species ).toStringBuffer( getDetaildness(), html ) );
313 //sb.append( "<br>" );
314 sb.append( "</tr>" );
// Appends tax_code to sb. When tax_code is non-empty and present in tax_code_to_id_map,
// it is wrapped in a link built from SurfacingConstants.UNIPROT_TAXONOMY_ID_LINK plus the
// mapped id; when a hex color is also available the code is additionally wrapped in a
// colored span. One visible path appends the bare tax code.
// NOTE(review): the declaration of hex, the else branches and some appends (color value,
// closing tags) are not visible in this extract.
321 private void addTaxWithLink( final StringBuffer sb,
322 final String tax_code,
323 final Map<String, Integer> tax_code_to_id_map,
324 final Phylogeny phy ) {
// A color is only looked up when a non-empty phylogeny is supplied.
326 if ( ( phy != null ) && !phy.isEmpty() ) {
327 hex = SurfacingUtil.obtainHexColorStringDependingOnTaxonomyGroup( tax_code, phy );
330 if ( !ForesterUtil.isEmpty( tax_code )
331 && ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( tax_code ) ) ) {
// Colored link variant.
332 if ( !ForesterUtil.isEmpty( hex ) ) {
333 sb.append( "<a href=\"" );
334 sb.append( SurfacingConstants.UNIPROT_TAXONOMY_ID_LINK );
335 sb.append( tax_code_to_id_map.get( tax_code ) );
336 sb.append( "\" target=\"tw\"><span style=\"color:" );
339 sb.append( tax_code );
340 sb.append( "</span></a>" );
// Link variant without a color span.
343 sb.append( "<a href=\"" );
344 sb.append( SurfacingConstants.UNIPROT_TAXONOMY_ID_LINK );
345 sb.append( tax_code_to_id_map.get( tax_code ) );
346 sb.append( "\" target=\"tw\">" );
347 sb.append( tax_code );
// Bare tax code, without a link.
352 sb.append( tax_code );
// Compares the two domain ids with String.compareToIgnoreCase.
357 private int compareByDomainId( final DomainSimilarity other ) {
358 return getDomainId().compareToIgnoreCase( other.getDomainId() );
// Returns the combinable domains passed to the constructor.
361 private CombinableDomains getCombinableDomains() {
362 return _combinable_domains;
// Returns the current detailedness setting (assigned by setDetailedness() or init()).
365 private DomainSimilarityCalculator.Detailedness getDetaildness() {
366 return _detailedness;
// Builds an HTML fragment listing every combinable domain id (sorted, as a link built from
// SurfacingConstants.PFAM_FAMILY_ID_LINK) followed by the sorted species ids that have
// it, each entry terminated by "<br>\n".
// NOTE(review): several appends inside the species loop and the return statement are not
// visible in this extract.
369 private StringBuffer getDomainDataInAlphabeticalOrder() {
// combinable domain id -> sorted species ids having that domain
370 final SortedMap<String, SortedSet<String>> m = new TreeMap<String, SortedSet<String>>();
371 final StringBuffer sb = new StringBuffer();
372 for( final Species species : getSpeciesData().keySet() ) {
373 for( final String combable_dom : getCombinableDomainIds( species ) ) {
// Create the species set on first sight of a domain id.
374 if ( !m.containsKey( combable_dom ) ) {
375 m.put( combable_dom, new TreeSet<String>() );
377 m.get( combable_dom ).add( species.getSpeciesId() );
380 for( final Map.Entry<String, SortedSet<String>> e : m.entrySet() ) {
381 sb.append( "<a href=\"" + SurfacingConstants.PFAM_FAMILY_ID_LINK + e.getKey() + "\">" + e.getKey() + "</a>" );
383 sb.append( "<span style=\"font-size:7px\">" );
384 for( final String tax : e.getValue() ) {
// The color lookup is called with a null phylogeny here.
385 final String hex = SurfacingUtil.obtainHexColorStringDependingOnTaxonomyGroup( tax, null );
386 if ( !ForesterUtil.isEmpty( hex ) ) {
387 sb.append( "<span style=\"color:" );
391 sb.append( "</span>" );
398 sb.append( "</span>" );
399 sb.append( "<br>\n" );
// Renders the data of every species, in the key order of the species data map, by
// calling addSpeciesSpecificDomainData for each. The output is always wrapped in
// "<table>" ... "</table>", regardless of the html flag, which is only forwarded.
// NOTE(review): the return statement is not visible in this extract.
404 private StringBuffer getSpeciesDataInAlphabeticalOrder( final boolean html,
405 final Map<String, Integer> tax_code_to_id_map,
406 final Phylogeny phy ) {
407 final StringBuffer sb = new StringBuffer();
408 sb.append( "<table>" );
409 for( final Species species : getSpeciesData().keySet() ) {
410 addSpeciesSpecificDomainData( sb, species, html, tax_code_to_id_map, phy );
412 sb.append( "</table>" );
416 private StringBuffer getSpeciesDataInCustomOrder( final boolean html,
417 final Map<String, Integer> tax_code_to_id_map,
418 final Phylogeny phy ) {
419 final StringBuffer sb = new StringBuffer();
420 for( final Species order_species : getSpeciesCustomOrder() ) {
421 if ( getSpeciesData().keySet().contains( order_species ) ) {
422 addSpeciesSpecificDomainData( sb, order_species, html, tax_code_to_id_map, phy );
425 sb.append( DomainSimilarity.NO_SPECIES );
426 sb.append( DomainSimilarity.SPECIES_SEPARATOR );
// Builds an HTML table with one row per combinable domain id (linked through
// SurfacingConstants.PFAM_FAMILY_ID_LINK). For each domain, the species having it are
// mapped to taxonomy groups (SurfacingUtil.obtainTaxonomyGroup with tol), counted per
// group, and the groups are listed by descending count, each group name in a colored span.
// Throws IllegalArgumentException when ForesterUtil has no color for a group.
// NOTE(review): many appends (cell/row openers, color values, group names), the handling
// of the "first" flag and the return statement are not visible in this extract.
432 private StringBuffer getTaxonomyGroupDistribution( final Phylogeny tol ) {
// combinable domain id (sorted) -> ids of the species that have it
433 final SortedMap<String, Set<String>> domain_to_species_set_map = new TreeMap<String, Set<String>>();
434 for( final Species species : getSpeciesData().keySet() ) {
435 for( final String combable_dom : getCombinableDomainIds( species ) ) {
436 if ( !domain_to_species_set_map.containsKey( combable_dom ) ) {
437 domain_to_species_set_map.put( combable_dom, new HashSet<String>() );
439 domain_to_species_set_map.get( combable_dom ).add( species.getSpeciesId() );
442 final StringBuffer sb = new StringBuffer();
443 sb.append( "<table>" );
444 for( final Map.Entry<String, Set<String>> domain_to_species_set : domain_to_species_set_map.entrySet() ) {
// taxonomy group -> number of species of that group having this domain
445 final Map<String, Integer> counts = new HashMap<String, Integer>();
446 for( final String tax_code : domain_to_species_set.getValue() ) {
447 final String group = SurfacingUtil.obtainTaxonomyGroup( tax_code, tol );
// Species without a taxonomy group are not counted.
448 if ( !ForesterUtil.isEmpty( group ) ) {
449 if ( !counts.containsKey( group ) ) {
450 counts.put( group, 1 );
453 counts.put( group, counts.get( group ) + 1 );
// count -> groups with that count; the comparator reverses natural order (largest count first)
460 final SortedMap<Integer, SortedSet<String>> counts_to_groups = new TreeMap<Integer, SortedSet<String>>( new Comparator<Integer>() {
463 public int compare( final Integer first, final Integer second ) {
464 return second.compareTo( first );
467 for( final Map.Entry<String, Integer> group_to_counts : counts.entrySet() ) {
468 final int c = group_to_counts.getValue();
469 if ( !counts_to_groups.containsKey( c ) ) {
470 counts_to_groups.put( c, new TreeSet<String>() );
472 counts_to_groups.get( c ).add( group_to_counts.getKey() );
476 sb.append( "<a href=\"" + SurfacingConstants.PFAM_FAMILY_ID_LINK + domain_to_species_set.getKey() + "\">"
477 + domain_to_species_set.getKey() + "</a>" );
479 sb.append( "</td>" );
480 boolean first = true;
481 for( final Entry<Integer, SortedSet<String>> count_to_groups : counts_to_groups.entrySet() ) {
488 sb.append( "</td>" );
491 final SortedSet<String> groups = count_to_groups.getValue();
492 sb.append( count_to_groups.getKey() );
494 for( final String group : groups ) {
495 final Color color = ForesterUtil.obtainColorDependingOnTaxonomyGroup( group );
496 if ( color == null ) {
497 throw new IllegalArgumentException( "no color found for taxonomy group\"" + group + "\"" );
// Formats three values as two-digit lowercase hex ("#rrggbb" shape); the arguments are not visible here.
499 final String hex = String.format( "#%02x%02x%02x",
503 sb.append( "<span style=\"color:" );
508 sb.append( "</span>" );
510 sb.append( "</td>" );
511 sb.append( "</tr>" );
513 sb.append( ForesterUtil.getLineSeparator() );
515 sb.append( "</table>" );
// Sets the detailedness to PUNCTILIOUS; setDetailedness() can change it afterwards.
519 private void init() {
520 _detailedness = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS;
// Returns the treat_as_binary_comparison flag passed to the constructor; the HTML writer
// uses it to decide which cells to print.
523 private boolean isTreatAsBinaryComparison() {
524 return _treat_as_binary_comparison;
// Renders this similarity as one HTML table row: a Pfam link and a named anchor for the
// domain id, a Google Scholar link, score cells (only when the maximal score is > 0),
// the maximal difference, the (absolute) maximal difference in counts, the species count
// for non-binary comparisons, and finally the per-species data in alphabetical or custom
// order followed by the taxonomy group distribution.
// NOTE(review): the phy parameter declaration, the cell/row opening tags, several else
// branches and the return statement are not visible in this extract.
527 private StringBuffer toStringBufferDetailedHTML( final Map<String, Integer> tax_code_to_id_map,
529 final boolean output_tax_codes_per_domain ) {
530 final StringBuffer sb = new StringBuffer();
534 sb.append( "<a href=\"" + SurfacingConstants.PFAM_FAMILY_ID_LINK + getDomainId() + "\" target=\"pfam_window\">"
535 + getDomainId() + "</a>" );
// NOTE(review): this named anchor is opened but no closing "</a>" is visible before the "</td>".
537 sb.append( "<a name=\"" + getDomainId() + "\">" );
538 sb.append( "</td>" );
540 sb.append( "<a href=\"" + SurfacingConstants.GOOGLE_SCHOLAR_SEARCH + getDomainId()
541 + "\" target=\"gs_window\">gs</a>" );
542 sb.append( "</td>" );
// Score cells are only written when the maximal score is positive; scores are rounded to 3 decimals.
543 if ( getMaximalSimilarityScore() > 0 ) {
545 sb.append( ForesterUtil.round( getMeanSimilarityScore(), 3 ) );
546 sb.append( "</td>" );
547 if ( SurfacingConstants.PRINT_MORE_DOM_SIMILARITY_INFO ) {
548 if ( !isTreatAsBinaryComparison() ) {
551 sb.append( ForesterUtil.round( getStandardDeviationOfSimilarityScore(), 3 ) );
553 sb.append( "</td>" );
556 sb.append( ForesterUtil.round( getMinimalSimilarityScore(), 3 ) );
558 sb.append( ForesterUtil.round( getMaximalSimilarityScore(), 3 ) );
560 sb.append( "</td>" );
565 sb.append( getMaximalDifference() );
566 sb.append( "</td>" );
// Binary comparisons print the signed difference in counts; a second visible path prints the absolute value.
568 if ( isTreatAsBinaryComparison() ) {
569 sb.append( getMaximalDifferenceInCounts() );
572 sb.append( Math.abs( getMaximalDifferenceInCounts() ) );
574 sb.append( "</td>" );
// The species count is only printed for non-binary comparisons.
575 if ( !isTreatAsBinaryComparison() ) {
578 sb.append( getSpeciesData().size() );
580 sb.append( "</td>" );
// Without a custom species order the species data is written in map (alphabetical) order.
582 if ( ( getSpeciesCustomOrder() == null ) || getSpeciesCustomOrder().isEmpty() ) {
584 sb.append( getSpeciesDataInAlphabeticalOrder( true, tax_code_to_id_map, phy ) );
585 if ( output_tax_codes_per_domain ) {
586 sb.append( getDomainDataInAlphabeticalOrder() );
588 sb.append( getTaxonomyGroupDistribution( phy ) );
589 sb.append( "</td>" );
// Custom-order variant of the same three sections.
593 sb.append( getSpeciesDataInCustomOrder( true, tax_code_to_id_map, phy ) );
594 if ( output_tax_codes_per_domain ) {
595 sb.append( getDomainDataInAlphabeticalOrder() );
597 sb.append( getTaxonomyGroupDistribution( phy ) );
598 sb.append( "</td>" );
600 sb.append( "</tr>" );
// Renders the domain id followed by the species data in alphabetical order, with the html
// flag off and no taxonomy id map or phylogeny.
// NOTE(review): one statement between the two appends (presumably the delimiter) and the
// return statement are not visible in this extract.
604 private StringBuffer toStringBufferSimpleTabDelimited() {
605 final StringBuffer sb = new StringBuffer();
606 sb.append( getDomainId() );
608 sb.append( getSpeciesDataInAlphabeticalOrder( false, null, null ) );
// Scoring modes; not referenced by any visible line of this file.
613 static public enum DomainSimilarityScoring {
614 COMBINATIONS, DOMAINS, PROTEINS;
// Names of fields a list of similarities can be sorted by; not referenced by any visible
// line of this file. NOTE(review): the list ends with a trailing comma here, so further
// constants may follow in lines not visible in this extract.
617 public static enum DomainSimilaritySortField {
618 ABS_MAX_COUNTS_DIFFERENCE, DOMAIN_ID, MAX, MAX_COUNTS_DIFFERENCE, MAX_DIFFERENCE, MEAN, MIN, SD, SPECIES_COUNT,
// Output formats accepted by toStringBuffer(): detailed HTML or simple tab-delimited text.
621 public static enum PRINT_OPTION {
622 HTML, SIMPLE_TAB_DELIMITED;
/**
 * Orders map keys by the integer value a backing map assigns to them, so that
 * a sorted map built with this comparator iterates its keys by count.
 * <p>
 * Keys with different counts are ordered by count, highest first. Keys with
 * the same count are ordered by their natural String order, so distinct keys
 * never compare as equal (they are not merged by a sorted map), while a key
 * compared with itself yields 0. The previous two-way branch never returned 0,
 * not even for {@code compare(x, x)}, which violates the Comparator contract
 * and makes lookups in a TreeMap using it fail.
 * <p>
 * NOTE(review): the results of the original two branches were not visible in
 * the reviewed extract; highest-count-first is assumed -- confirm.
 * <p>
 * Both compared keys must be present in the backing map with non-null
 * values; otherwise a NullPointerException is thrown.
 */
class ValueComparator implements Comparator<String> {

    final private Map<String, Integer> _base;

    /**
     * @param base the map supplying the count for each key; kept by reference
     */
    public ValueComparator( final Map<String, Integer> base ) {
        _base = base;
    }

    /**
     * Compares two keys by their counts in the backing map (descending),
     * breaking ties by the keys themselves.
     *
     * @param a first key
     * @param b second key
     * @return a negative value if a sorts before b, a positive value if it
     *         sorts after b, and 0 only if both keys are equal
     */
    @Override
    public int compare( final String a, final String b ) {
        // Reversed operands: a larger count sorts first.
        final int by_count = _base.get( b ).compareTo( _base.get( a ) );
        if ( by_count != 0 ) {
            return by_count;
        }
        // Equal counts: fall back to the keys so that only identical keys give 0.
        return a.compareTo( b );
    }
}