// FORESTER -- software libraries and applications
// for evolutionary biology research and applications.
//
// Copyright (C) 2008-2009 Christian M. Zmasek
// Copyright (C) 2008-2009 Burnham Institute for Medical Research
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
28 package org.forester.surfacing;
30 import java.text.DecimalFormat;
31 import java.util.ArrayList;
32 import java.util.List;
33 import java.util.SortedMap;
34 import java.util.SortedSet;
35 import java.util.TreeMap;
36 import java.util.TreeSet;
38 import org.forester.species.Species;
39 import org.forester.util.BasicDescriptiveStatistics;
40 import org.forester.util.DescriptiveStatistics;
41 import org.forester.util.ForesterUtil;
43 public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculator {
45 final DomainSimilarity.DomainSimilaritySortField _sort;
46 private final boolean _calc_similarity_score;
47 private final boolean _sort_by_species_count_first;
48 private final boolean _treat_as_binary_comparison;
49 private final boolean _verbose;
51 public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort,
52 final boolean sort_by_species_count_first,
53 final boolean treat_as_binary_comparison,
54 final boolean calc_similarity_score,
55 final boolean verbose ) {
57 _sort_by_species_count_first = sort_by_species_count_first;
58 _treat_as_binary_comparison = treat_as_binary_comparison;
59 _calc_similarity_score = calc_similarity_score;
63 public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort,
64 final boolean sort_by_species_count_first,
65 final boolean treat_as_binary_comparison,
66 final boolean calc_similarity_score ) {
68 _sort_by_species_count_first = sort_by_species_count_first;
69 _treat_as_binary_comparison = treat_as_binary_comparison;
70 _calc_similarity_score = calc_similarity_score;
75 public SortedSet<DomainSimilarity> calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator,
76 final List<GenomeWideCombinableDomains> cdc_list,
77 final boolean ignore_domains_without_combinations_in_any_genome,
78 final boolean ignore_domains_specific_to_one_genome ) {
79 if ( cdc_list.size() < 2 ) {
80 throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinale domains collections" );
82 final SortedSet<DomainSimilarity> similarities = new TreeSet<DomainSimilarity>();
83 final SortedSet<String> keys = new TreeSet<String>();
84 for( final GenomeWideCombinableDomains cdc : cdc_list ) {
85 keys.addAll( ( cdc ).getAllCombinableDomainsIds().keySet() );
87 final DecimalFormat pf = new java.text.DecimalFormat( "000000" );
90 System.out.println( keys.size() );
92 for( final String key : keys ) {
94 ForesterUtil.updateProgress( counter, pf );
97 final List<CombinableDomains> same_id_cd_list = new ArrayList<CombinableDomains>( cdc_list.size() );
98 final List<Species> species_with_key_id_domain = new ArrayList<Species>();
99 for( final GenomeWideCombinableDomains cdc : cdc_list ) {
100 if ( cdc.contains( key ) ) {
101 same_id_cd_list.add( cdc.get( key ) );
102 species_with_key_id_domain.add( cdc.getSpecies() );
105 if ( ignore_domains_without_combinations_in_any_genome ) { //TODO: test me..........................................<<<<<<<<<<<<<
106 boolean without_combinations = true;
107 for( final CombinableDomains cd : same_id_cd_list ) {
108 if ( cd.getNumberOfCombinableDomains() > 0 ) {
109 without_combinations = false;
113 if ( without_combinations ) {
117 if ( same_id_cd_list.size() > 0 ) {
118 if ( !ignore_domains_specific_to_one_genome || ( same_id_cd_list.size() > 1 ) ) {
119 final DomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list );
121 similarities.add( s );
124 throw new RuntimeException( "similarity is null: this should not have happened" );
129 throw new RuntimeException( "this should not have happened" );
133 System.out.println();
138 public boolean isCalcSimilarityScore() {
139 return _calc_similarity_score;
142 private DomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator,
143 final List<CombinableDomains> domains_list ) {
144 if ( domains_list.size() == 1 ) {
145 final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
146 species_data.put( domains_list.get( 0 ).getSpecies(),
147 createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
148 if ( !isCalcSimilarityScore() ) {
149 return new DomainSimilarity( domains_list.get( 0 ),
153 isSortBySpeciesCountFirst(),
154 isTreatAsBinaryComparison() );
157 return new DomainSimilarity( domains_list.get( 0 ),
167 isSortBySpeciesCountFirst(),
168 isTreatAsBinaryComparison() );
171 DescriptiveStatistics stat = null;
172 if ( isCalcSimilarityScore() ) {
173 stat = new BasicDescriptiveStatistics();
175 final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
176 species_data.put( domains_list.get( 0 ).getSpecies(),
177 createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
178 int max_difference_in_counts = 0;
179 int max_difference = 0;
180 final boolean is_domain_combination_based = pairwise_calculator instanceof CombinationsBasedPairwiseDomainSimilarityCalculator;
181 for( int i = 1; i < domains_list.size(); ++i ) {
182 species_data.put( domains_list.get( i ).getSpecies(),
183 createSpeciesSpecificDomainSimilariyData( domains_list.get( i ) ) );
184 final CombinableDomains domains_i = domains_list.get( i );
185 for( int j = 0; j < i; ++j ) {
186 final PairwiseDomainSimilarity pairwise_similarity = pairwise_calculator
187 .calculateSimilarity( domains_i, domains_list.get( j ) );
188 final int difference_in_counts = pairwise_similarity.getDifferenceInCounts();
190 if ( is_domain_combination_based ) {
191 difference = ( ( CombinationsBasedPairwiseDomainSimilarity ) pairwise_similarity )
192 .getNumberOfDifferentDomains();
195 difference = difference_in_counts;
197 if ( Math.abs( difference_in_counts ) > Math.abs( max_difference_in_counts ) ) {
198 max_difference_in_counts = difference_in_counts;
200 if ( Math.abs( difference ) > Math.abs( max_difference ) ) {
201 max_difference = difference;
203 if ( isCalcSimilarityScore() ) {
204 stat.addValue( pairwise_similarity.getSimilarityScore() );
208 if ( isCalcSimilarityScore() ) {
209 if ( stat.getN() < 1 ) {
210 throw new RuntimeException( "empty descriptive statistics: this should not have happened" );
212 if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) {
213 throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" );
216 if ( !isTreatAsBinaryComparison() && ( max_difference_in_counts < 0 ) ) {
217 max_difference_in_counts = Math.abs( max_difference_in_counts );
218 if ( !is_domain_combination_based ) {
219 max_difference = Math.abs( max_difference );
222 DomainSimilarity similarity = null;
223 if ( !isCalcSimilarityScore() ) {
224 similarity = new DomainSimilarity( domains_list.get( 0 ),
225 max_difference_in_counts,
228 isSortBySpeciesCountFirst(),
229 isTreatAsBinaryComparison() );
232 if ( stat.getN() == 1 ) {
233 similarity = new DomainSimilarity( domains_list.get( 0 ),
236 stat.arithmeticMean(),
240 max_difference_in_counts,
243 isSortBySpeciesCountFirst(),
244 isTreatAsBinaryComparison() );
247 similarity = new DomainSimilarity( domains_list.get( 0 ),
250 stat.arithmeticMean(),
252 stat.sampleStandardDeviation(),
254 max_difference_in_counts,
257 isSortBySpeciesCountFirst(),
258 isTreatAsBinaryComparison() );
264 private boolean isSortBySpeciesCountFirst() {
265 return _sort_by_species_count_first;
268 private boolean isTreatAsBinaryComparison() {
269 return _treat_as_binary_comparison;
272 private static SpeciesSpecificDcData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) {
273 final SpeciesSpecificDcData sd = new PrintableSpeciesSpecificDcData( cd.getKeyDomainCount(),
274 cd.getNumberOfCombinableDomains() );
275 for( final String prot : cd.getKeyDomainProteins() ) {
276 sd.addKeyDomainProtein( prot );
278 for( final String domain : cd.getCombinableDomains() ) {
279 sd.addProteinsExhibitingCombinationCount( domain, cd.getNumberOfProteinsExhibitingCombination( domain ) );