4 // FORESTER -- software libraries and applications
5 // for evolutionary biology research and applications.
7 // Copyright (C) 2008-2009 Christian M. Zmasek
8 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
28 package org.forester.surfacing;
30 import java.text.DecimalFormat;
31 import java.util.ArrayList;
32 import java.util.List;
33 import java.util.SortedMap;
34 import java.util.SortedSet;
35 import java.util.TreeMap;
36 import java.util.TreeSet;
38 import org.forester.species.Species;
39 import org.forester.util.BasicDescriptiveStatistics;
40 import org.forester.util.DescriptiveStatistics;
41 import org.forester.util.ForesterUtil;
/**
 * {@link DomainSimilarityCalculator} implementation that, for each domain id found in a
 * list of {@code GenomeWideCombinableDomains}, combines the pairwise similarities between
 * the genomes containing that domain into a single {@code PrintableDomainSimilarity}.
 */
43 public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculator {
// Sort field; declared without an access modifier (package-private), unlike the flags below.
// NOTE(review): presumably set from the constructor's "sort" argument -- confirm.
45 final PrintableDomainSimilarity.DomainSimilaritySortField _sort;
// When true, pairwise similarity scores are collected into descriptive statistics.
46 private final boolean _calc_similarity_score;
// Passed on unchanged to the PrintableDomainSimilarity objects built by this class.
47 private final boolean _sort_by_species_count_first;
// Passed on to the PrintableDomainSimilarity objects built by this class; when true, negative
// maximal differences are kept as they are instead of being replaced by their absolute value.
48 private final boolean _treat_as_binary_comparison;
/**
 * Creates a calculator configured with the given sort field and flags.
 *
 * @param sort sort field (NOTE(review): its assignment to _sort is not among the
 *            visible statements -- confirm)
 * @param sort_by_species_count_first stored and later handed to every
 *            PrintableDomainSimilarity created by this calculator
 * @param treat_as_binary_comparison stored and later handed to every
 *            PrintableDomainSimilarity created by this calculator
 * @param calc_similarity_score whether similarity scores (and statistics over
 *            them) are to be calculated
 */
50 public BasicDomainSimilarityCalculator( final PrintableDomainSimilarity.DomainSimilaritySortField sort,
51 final boolean sort_by_species_count_first,
52 final boolean treat_as_binary_comparison,
53 final boolean calc_similarity_score ) {
55 _sort_by_species_count_first = sort_by_species_count_first;
56 _treat_as_binary_comparison = treat_as_binary_comparison;
57 _calc_similarity_score = calc_similarity_score;
/**
 * Calculates one domain similarity per domain id occurring in the given genomes.
 * The ids of all genomes are merged into one sorted set; for each id the
 * CombinableDomains of every genome containing it are collected and handed to
 * calculateSimilarity(). The number of ids and an empty line are written to
 * System.out.
 *
 * @param pairwise_calculator calculator used for each pair of CombinableDomains
 * @param cdc_list the genomes to compare; must contain at least two elements
 * @param ignore_domains_without_combinations_in_any_genome if true, it is checked
 *            whether the domain has combinable domains in at least one genome
 *            (NOTE(review): presumably domains without any are skipped -- the
 *            statement doing so is not visible here)
 * @param ignore_domains_specific_to_one_genome if true, domains present in only
 *            one genome are not turned into a similarity
 * @return NOTE(review): presumably the sorted set of calculated similarities --
 *         the return statement is not visible here
 * @throws IllegalArgumentException if cdc_list has fewer than two elements
 * @throws RuntimeException on states the code marks as "should not have happened"
 */
61 public SortedSet<PrintableDomainSimilarity> calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator,
62 final List<GenomeWideCombinableDomains> cdc_list,
63 final boolean ignore_domains_without_combinations_in_any_genome,
64 final boolean ignore_domains_specific_to_one_genome ) {
// A comparison needs at least two genomes.
65 if ( cdc_list.size() < 2 ) {
66 throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinale domains collections" );
68 final SortedSet<PrintableDomainSimilarity> similarities = new TreeSet<PrintableDomainSimilarity>();
// Sorted union of the domain ids of all genomes.
69 final SortedSet<String> keys = new TreeSet<String>();
70 for( final GenomeWideCombinableDomains cdc : cdc_list ) {
71 keys.addAll( ( cdc ).getAllCombinableDomainsIds().keySet() );
// Six-digit, zero-padded number format handed to the progress helper below.
73 final DecimalFormat pf = new java.text.DecimalFormat( "000000" );
// Writes the number of distinct domain ids to standard output.
75 System.out.println( keys.size() );
76 for( final String key : keys ) {
// NOTE(review): presumably displays progress; "counter" is declared outside the visible lines.
77 ForesterUtil.updateProgress( counter, pf );
// Collect the CombinableDomains and the species of every genome that contains this domain id.
79 final List<CombinableDomains> same_id_cd_list = new ArrayList<CombinableDomains>( cdc_list.size() );
80 final List<Species> species_with_key_id_domain = new ArrayList<Species>();
81 for( final GenomeWideCombinableDomains cdc : cdc_list ) {
82 if ( cdc.contains( key ) ) {
83 same_id_cd_list.add( cdc.get( key ) );
84 species_with_key_id_domain.add( cdc.getSpecies() );
// Optional filter: find out whether the domain has combinable domains in any genome at all.
87 if ( ignore_domains_without_combinations_in_any_genome ) { //TODO: test me..........................................<<<<<<<<<<<<<
88 boolean without_combinations = true;
89 for( final CombinableDomains cd : same_id_cd_list ) {
90 if ( cd.getNumberOfCombinableDomains() > 0 ) {
91 without_combinations = false;
// NOTE(review): the action taken for a domain without combinations is not visible here.
95 if ( without_combinations ) {
99 if ( same_id_cd_list.size() > 0 ) {
// Domains found in a single genome are only processed when they are not to be ignored.
100 if ( !ignore_domains_specific_to_one_genome || ( same_id_cd_list.size() > 1 ) ) {
101 final PrintableDomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list );
103 similarities.add( s );
// NOTE(review): presumably reached when calculateSimilarity() returned null -- the condition is not visible.
106 throw new RuntimeException( "similarity is null: this should not have happened" );
// NOTE(review): presumably reached when no genome contains the id -- the condition is not visible.
111 throw new RuntimeException( "this should not have happened" );
// Writes a line break to standard output.
114 System.out.println();
/**
 * Tells whether this calculator computes similarity scores.
 *
 * @return the value given as calc_similarity_score to the constructor
 */
118 public boolean isCalcSimilarityScore() {
119 return _calc_similarity_score;
/**
 * Builds the similarity for one domain from the CombinableDomains of all genomes
 * containing it. With a single element no pairwise comparison is made. Otherwise
 * every pair of elements is compared with the given pairwise calculator, the
 * differences of largest absolute value are tracked and, if scores are enabled,
 * the pairwise similarity scores are collected into descriptive statistics.
 *
 * @param pairwise_calculator calculator applied to each pair of elements
 * @param domains_list the CombinableDomains of one domain id, one per genome;
 *            element 0 is used as the domain handed to the result
 * @return NOTE(review): presumably the PrintableDomainSimilarity built here --
 *         the final return statement is not visible here
 * @throws RuntimeException if scores are enabled but no score was collected
 * @throws IllegalArgumentException if scores are enabled, the result is to be
 *             treated as binary comparison and the number of scores is not one
 */
122 private PrintableDomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator,
123 final List<CombinableDomains> domains_list ) {
// Single genome: only the species specific data of that genome is recorded.
124 if ( domains_list.size() == 1 ) {
125 final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
126 species_data.put( domains_list.get( 0 ).getSpecies(),
127 createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
// NOTE(review): some constructor arguments of the two calls below are not visible here.
128 if ( !isCalcSimilarityScore() ) {
129 return new PrintableDomainSimilarity( domains_list.get( 0 ),
133 isSortBySpeciesCountFirst(),
134 isTreatAsBinaryComparison() );
137 return new PrintableDomainSimilarity( domains_list.get( 0 ),
147 isSortBySpeciesCountFirst(),
148 isTreatAsBinaryComparison() );
// Statistics are only created when scores are wanted; stat stays null otherwise.
151 DescriptiveStatistics stat = null;
152 if ( isCalcSimilarityScore() ) {
153 stat = new BasicDescriptiveStatistics();
155 final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
156 species_data.put( domains_list.get( 0 ).getSpecies(),
157 createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
// Signed differences with the largest absolute value seen so far.
158 int max_difference_in_counts = 0;
159 int max_difference = 0;
160 final boolean is_domain_combination_based = pairwise_calculator instanceof CombinationsBasedPairwiseDomainSimilarityCalculator;
// Element i is compared with every element j < i, so each unordered pair is visited once.
161 for( int i = 1; i < domains_list.size(); ++i ) {
162 species_data.put( domains_list.get( i ).getSpecies(),
163 createSpeciesSpecificDomainSimilariyData( domains_list.get( i ) ) );
164 final CombinableDomains domains_i = domains_list.get( i );
165 for( int j = 0; j < i; ++j ) {
166 final PairwiseDomainSimilarity pairwise_similarity = pairwise_calculator
167 .calculateSimilarity( domains_i, domains_list.get( j ) );
168 final int difference_in_counts = pairwise_similarity.getDifferenceInCounts();
// "difference" is the number of different domains for a combinations based calculator.
// NOTE(review): the declaration of "difference" and the "else" between the two
// assignments are not visible here; presumably the count difference is the fallback.
170 if ( is_domain_combination_based ) {
171 difference = ( ( CombinationsBasedPairwiseDomainSimilarity ) pairwise_similarity )
172 .getNumberOfDifferentDomains();
175 difference = difference_in_counts;
// Keep the value of largest magnitude, sign included.
177 if ( Math.abs( difference_in_counts ) > Math.abs( max_difference_in_counts ) ) {
178 max_difference_in_counts = difference_in_counts;
180 if ( Math.abs( difference ) > Math.abs( max_difference ) ) {
181 max_difference = difference;
183 if ( isCalcSimilarityScore() ) {
184 stat.addValue( pairwise_similarity.getSimilarityScore() );
// Sanity checks on the collected scores.
188 if ( isCalcSimilarityScore() ) {
189 if ( stat.getN() < 1 ) {
190 throw new RuntimeException( "empty descriptive statistics: this should not have happened" );
192 if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) {
193 throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" );
// Outside of binary comparisons a negative maximal count difference is replaced by its
// absolute value; max_difference is treated likewise unless the calculator is combinations based.
196 if ( !isTreatAsBinaryComparison() && ( max_difference_in_counts < 0 ) ) {
197 max_difference_in_counts = Math.abs( max_difference_in_counts );
198 if ( !is_domain_combination_based ) {
199 max_difference = Math.abs( max_difference );
// Build the result. NOTE(review): several constructor arguments and the "else"
// lines separating the three calls below are not visible here.
202 PrintableDomainSimilarity similarity = null;
// Without scores no statistics are passed.
203 if ( !isCalcSimilarityScore() ) {
204 similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
205 max_difference_in_counts,
208 isSortBySpeciesCountFirst(),
209 isTreatAsBinaryComparison() );
// Exactly one score: this call passes the mean; no standard deviation is visible among its arguments.
212 if ( stat.getN() == 1 ) {
213 similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
216 stat.arithmeticMean(),
220 max_difference_in_counts,
223 isSortBySpeciesCountFirst(),
224 isTreatAsBinaryComparison() );
// This call passes both the mean and the sample standard deviation.
227 similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
230 stat.arithmeticMean(),
232 stat.sampleStandardDeviation(),
234 max_difference_in_counts,
237 isSortBySpeciesCountFirst(),
238 isTreatAsBinaryComparison() );
/**
 * Returns the flag given as sort_by_species_count_first to the constructor; it is
 * handed to each PrintableDomainSimilarity this class creates.
 *
 * @return the stored sort-by-species-count-first flag
 */
244 private boolean isSortBySpeciesCountFirst() {
245 return _sort_by_species_count_first;
/**
 * Returns the flag given as treat_as_binary_comparison to the constructor; it is
 * handed to each PrintableDomainSimilarity this class creates.
 *
 * @return the stored treat-as-binary-comparison flag
 */
248 private boolean isTreatAsBinaryComparison() {
249 return _treat_as_binary_comparison;
/**
 * Creates the species specific data for one CombinableDomains object: a
 * PrintableSpeciesSpecificDcData initialised with the key domain count and the
 * number of combinable domains, to which every key domain protein is added and,
 * for every combinable domain, the number of proteins exhibiting that combination.
 *
 * @param cd the combinable domains of one domain in one genome
 * @return NOTE(review): presumably the populated data object -- the return
 *         statement is not visible here
 */
252 private static SpeciesSpecificDcData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) {
253 final SpeciesSpecificDcData sd = new PrintableSpeciesSpecificDcData( cd.getKeyDomainCount(),
254 cd.getNumberOfCombinableDomains() );
// Register every protein containing the key domain.
255 for( final String prot : cd.getKeyDomainProteins() ) {
256 sd.addKeyDomainProtein( prot );
// Record, per combinable domain, how many proteins exhibit the combination.
258 for( final String domain : cd.getCombinableDomains() ) {
259 sd.addProteinsExhibitingCombinationCount( domain, cd.getNumberOfProteinsExhibitingCombination( domain ) );