4 // FORESTER -- software libraries and applications
5 // for evolutionary biology research and applications.
7 // Copyright (C) 2008-2009 Christian M. Zmasek
8 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
28 package org.forester.surfacing;
30 import java.util.ArrayList;
31 import java.util.List;
32 import java.util.SortedMap;
33 import java.util.SortedSet;
34 import java.util.TreeMap;
35 import java.util.TreeSet;
37 import org.forester.protein.DomainId;
38 import org.forester.species.Species;
39 import org.forester.util.BasicDescriptiveStatistics;
40 import org.forester.util.DescriptiveStatistics;
42 public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculator {
// Sort field setting of this calculator.
// NOTE(review): neither its assignment nor its use is visible in this view -- presumably set by the
// constructor and read via getSort(); confirm.
// NOTE(review): declared without an access modifier (package-private), unlike the two flags below.
44 final DomainSimilarity.DomainSimilaritySortField _sort;
// Set once by the constructor; passed (via isSortBySpeciesCountFirst()) to every
// PrintableDomainSimilarity created in calculateSimilarity().
45 private final boolean _sort_by_species_count_first;
// Set once by the constructor. When true, calculateSimilarity() throws if the number of pairwise
// scores is not exactly one, and a negative maximal count difference is not converted to its absolute value.
46 private final boolean _treat_as_binary_comparison;
/**
 * Creates a calculator with the given sorting and comparison settings.
 *
 * @param sort sort field setting (NOTE(review): its assignment to {@code _sort} is not visible
 *            in this view -- presumably done here, since the field is final; confirm)
 * @param sort_by_species_count_first stored in {@code _sort_by_species_count_first}
 * @param treat_as_binary_comparison stored in {@code _treat_as_binary_comparison}
 */
48 public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort,
49 final boolean sort_by_species_count_first,
50 final boolean treat_as_binary_comparison ) {
52 _sort_by_species_count_first = sort_by_species_count_first;
53 _treat_as_binary_comparison = treat_as_binary_comparison;
/**
 * Computes domain similarities across a list of genomes, one per domain id.
 * <p>
 * The domain ids of all genomes in {@code cdc_list} are pooled into one sorted set. For each id,
 * the CombinableDomains of every genome containing that id are collected and handed to
 * {@code calculateSimilarity}; the result is added to a sorted set of similarities.
 *
 * @param pairwise_calculator calculator passed on to {@code calculateSimilarity}
 * @param cdc_list the genomes to compare; must contain at least two entries
 * @param ignore_domains_without_combinations_in_any_genome if true, each id is first checked for
 *            whether at least one genome reports more than zero combinable domains for it
 *            (NOTE(review): what happens when none does is not visible in this view --
 *            presumably the id is skipped; confirm)
 * @param ignore_domains_specific_to_one_genome if true, an id found in only one genome is not
 *            passed to {@code calculateSimilarity}
 * @return NOTE(review): the return statement is not visible in this view -- presumably the
 *         sorted set of similarities built here; confirm
 * @throws IllegalArgumentException if {@code cdc_list} has fewer than two entries
 * @throws RuntimeException on the internal consistency failures flagged below
 */
57 public SortedSet<DomainSimilarity> calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator,
58 final List<GenomeWideCombinableDomains> cdc_list,
59 final boolean ignore_domains_without_combinations_in_any_genome,
60 final boolean ignore_domains_specific_to_one_genome ) {
61 if ( cdc_list.size() < 2 ) {
62 throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinale domains collections" );
64 final SortedSet<DomainSimilarity> similarities = new TreeSet<DomainSimilarity>();
// Pool the domain ids of all genomes into one sorted, duplicate-free set.
65 final SortedSet<DomainId> keys = new TreeSet<DomainId>();
66 for( final GenomeWideCombinableDomains cdc : cdc_list ) {
67 keys.addAll( ( cdc ).getAllCombinableDomainsIds().keySet() );
// For each id, gather the CombinableDomains (and species) of every genome that contains it.
69 for( final DomainId key : keys ) {
70 final List<CombinableDomains> same_id_cd_list = new ArrayList<CombinableDomains>( cdc_list.size() );
// NOTE(review): in the lines visible here this list is filled but only referenced by the
// commented-out code near the end of the method.
71 final List<Species> species_with_key_id_domain = new ArrayList<Species>();
72 for( final GenomeWideCombinableDomains cdc : cdc_list ) {
73 if ( cdc.contains( key ) ) {
74 same_id_cd_list.add( cdc.get( key ) );
75 species_with_key_id_domain.add( cdc.getSpecies() );
// Optional filter: find out whether any genome has at least one combinable domain for this id.
78 if ( ignore_domains_without_combinations_in_any_genome ) { //TODO: test me..........................................<<<<<<<<<<<<<
79 boolean without_combinations = true;
80 for( final CombinableDomains cd : same_id_cd_list ) {
81 if ( cd.getNumberOfCombinableDomains() > 0 ) {
82 without_combinations = false;
// NOTE(review): the body of this branch is not visible in this view -- presumably it skips the id; confirm.
86 if ( without_combinations ) {
90 // BIG CHANGE IN LOGIC: Tuesday July 08, 0;55
91 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
92 // OLD: if ( same_id_cd_list.size() > 1 ) {
93 if ( same_id_cd_list.size() > 0 ) {
// Ids found in a single genome are processed too, unless the caller asked to ignore them.
94 if ( !ignore_domains_specific_to_one_genome || ( same_id_cd_list.size() > 1 ) ) {
95 final DomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list );
97 similarities.add( s );
// NOTE(review): the condition guarding this throw is not visible in this view -- presumably s being null; confirm.
100 throw new RuntimeException( "similarity is null: this should not have happened" );
// NOTE(review): the condition guarding this throw is not visible in this view -- presumably the
// else branch of the size() > 0 check above; confirm.
106 throw new RuntimeException( "this should not have happened" );
109 // else if ( same_id_cd_list.size() == 1 ) {
110 // TODO need to go in file
111 // System.out.println( "only in one species [" +
112 // species_with_key_id_domain.get( 0 ) + "]: " + key_id );
115 // throw new RuntimeException( "this should not have happened" );
/**
 * Builds the DomainSimilarity for one domain id from the CombinableDomains collected for it.
 * <p>
 * A single-element list yields a PrintableDomainSimilarity directly. Otherwise each element is
 * compared with every earlier element using {@code pairwise_calculator}; the similarity scores
 * are collected in a DescriptiveStatistics and the differences with the largest absolute value
 * are tracked.
 *
 * @param pairwise_calculator scores each pair of elements; if it is a
 *            CombinationsBasedPairwiseDomainSimilarityCalculator, the tracked difference is the
 *            number of different domains, otherwise it is the difference in counts
 * @param domains_list the CombinableDomains to compare; element 0 is passed as the first
 *            constructor argument of the resulting PrintableDomainSimilarity
 * @return NOTE(review): the final return statement is not visible in this view -- presumably
 *         the similarity object built at the end; confirm
 * @throws AssertionError if no pairwise score was collected
 * @throws IllegalArgumentException if binary comparison is enabled and the number of pairwise
 *             scores is not one
 */
121 private DomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator,
122 final List<CombinableDomains> domains_list ) {
123 if ( domains_list.size() == 1 ) {
124 // BIG CHANGE IN LOGIC: Tuesday July 08, 0;55
125 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
127 //throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinable domains" );
// Single element: record its species data and return a similarity right away.
// NOTE(review): most constructor arguments of this call are not visible in this view.
129 final SortedMap<Species, SpeciesSpecificDomainSimilariyData> species_data = new TreeMap<Species, SpeciesSpecificDomainSimilariyData>();
130 species_data.put( domains_list.get( 0 ).getSpecies(),
131 createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
132 return new PrintableDomainSimilarity( domains_list.get( 0 ),
143 isSortBySpeciesCountFirst(),
144 isTreatAsBinaryComparison() );
// Two or more elements: collect the pairwise similarity scores in stat.
146 final DescriptiveStatistics stat = new BasicDescriptiveStatistics();
147 final SortedMap<Species, SpeciesSpecificDomainSimilariyData> species_data = new TreeMap<Species, SpeciesSpecificDomainSimilariyData>();
148 species_data.put( domains_list.get( 0 ).getSpecies(),
149 createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
// Both maxima hold the signed value whose absolute value is the largest seen so far.
150 int max_difference_in_counts = 0;
151 int max_difference = 0;
152 final boolean is_domain_combination_based = pairwise_calculator instanceof CombinationsBasedPairwiseDomainSimilarityCalculator;
153 for( int i = 1; i < domains_list.size(); ++i ) {
154 species_data.put( domains_list.get( i ).getSpecies(),
155 createSpeciesSpecificDomainSimilariyData( domains_list.get( i ) ) );
156 final CombinableDomains domains_i = domains_list.get( i );
// Compare element i with every earlier element, so each unordered pair is visited once.
157 for( int j = 0; j < i; ++j ) {
158 final PairwiseDomainSimilarity pairwise_similarity = pairwise_calculator
159 .calculateSimilarity( domains_i, domains_list.get( j ) );
160 final int difference_in_counts = pairwise_similarity.getDifferenceInCounts();
// NOTE(review): the declaration of the local variable difference is not visible in this view.
162 if ( is_domain_combination_based ) {
163 difference = ( ( CombinationsBasedPairwiseDomainSimilarity ) pairwise_similarity )
164 .getNumberOfDifferentDomains();
167 difference = difference_in_counts;
// Compare by absolute value but keep the sign of the stored value.
169 if ( Math.abs( difference_in_counts ) > Math.abs( max_difference_in_counts ) ) {
170 max_difference_in_counts = difference_in_counts;
172 if ( Math.abs( difference ) > Math.abs( max_difference ) ) {
173 max_difference = difference;
175 stat.addValue( pairwise_similarity.getSimilarityScore() );
178 if ( stat.getN() < 1 ) {
179 throw new AssertionError( "empty descriptive statistics: this should not have happened" );
181 if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) {
182 throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" );
// When not treated as a binary comparison, a negative maximal count difference is replaced by
// its absolute value.
// NOTE(review): closing braces are not visible in this view; the check on
// is_domain_combination_based below appears to be nested inside this one -- confirm.
184 if ( ( /*stat.getN() != 1 ||*/!isTreatAsBinaryComparison() ) && ( max_difference_in_counts < 0 ) ) {
185 max_difference_in_counts = Math.abs( max_difference_in_counts );
186 if ( !is_domain_combination_based ) {
187 max_difference = Math.abs( max_difference ); //=max_difference_in_counts for !is_domain_combination_based.
190 DomainSimilarity similarity = null;
// Exactly one pairwise score: of the visible arguments, this call takes the mean but no standard deviation.
// NOTE(review): several constructor arguments of both calls below are not visible in this view.
191 if ( stat.getN() == 1 ) {
192 similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
195 stat.arithmeticMean(),
199 max_difference_in_counts,
203 isSortBySpeciesCountFirst(),
204 isTreatAsBinaryComparison() );
// More than one pairwise score: this call additionally passes the sample standard deviation.
207 similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
210 stat.arithmeticMean(),
212 stat.sampleStandardDeviation(),
214 max_difference_in_counts,
218 isSortBySpeciesCountFirst(),
219 isTreatAsBinaryComparison() );
// Accessor for the sort field setting of this calculator.
// NOTE(review): the body is not visible in this view -- presumably it returns _sort; confirm.
224 private DomainSimilarity.DomainSimilaritySortField getSort() {
/**
 * Returns the {@code _sort_by_species_count_first} flag that was given to the constructor.
 *
 * @return the value of {@code _sort_by_species_count_first}
 */
228 private boolean isSortBySpeciesCountFirst() {
229 return _sort_by_species_count_first;
/**
 * Returns the {@code _treat_as_binary_comparison} flag that was given to the constructor.
 *
 * @return the value of {@code _treat_as_binary_comparison}
 */
232 private boolean isTreatAsBinaryComparison() {
233 return _treat_as_binary_comparison;
236 private static SpeciesSpecificDomainSimilariyData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) {
237 final SpeciesSpecificDomainSimilariyData sd = new PrintableSpeciesSpecificDomainSimilariyData( cd.getKeyDomainProteinsCount(),
238 cd.getKeyDomainCount(),
239 cd.getNumberOfCombinableDomains(),
240 cd.getKeyDomainConfidenceDescriptiveStatistics() );
241 for( final DomainId domain : cd.getCombinableDomains() ) {
242 sd.addProteinsExhibitingCombinationCount( domain, cd.getNumberOfProteinsExhibitingCombination( domain ) );