in progress...
[jalview.git] / forester / java / src / org / forester / surfacing / BasicDomainSimilarityCalculator.java
1 // $Id:
2 // Exp $
3 //
4 // FORESTER -- software libraries and applications
5 // for evolutionary biology research and applications.
6 //
7 // Copyright (C) 2008-2009 Christian M. Zmasek
8 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27
28 package org.forester.surfacing;
29
30 import java.text.DecimalFormat;
31 import java.util.ArrayList;
32 import java.util.List;
33 import java.util.SortedMap;
34 import java.util.SortedSet;
35 import java.util.TreeMap;
36 import java.util.TreeSet;
37
38 import org.forester.species.Species;
39 import org.forester.util.BasicDescriptiveStatistics;
40 import org.forester.util.DescriptiveStatistics;
41 import org.forester.util.ForesterUtil;
42
43 public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculator {
44
45     final DomainSimilarity.DomainSimilaritySortField _sort;
46     private final boolean                            _calc_similarity_score;
47     private final boolean                            _sort_by_species_count_first;
48     private final boolean                            _treat_as_binary_comparison;
49     private final boolean                            _verbose;
50
51     public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort,
52                                             final boolean sort_by_species_count_first,
53                                             final boolean treat_as_binary_comparison,
54                                             final boolean calc_similarity_score,
55                                             final boolean verbose ) {
56         _sort = sort;
57         _sort_by_species_count_first = sort_by_species_count_first;
58         _treat_as_binary_comparison = treat_as_binary_comparison;
59         _calc_similarity_score = calc_similarity_score;
60         _verbose = verbose;
61     }
62
63     public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort,
64                                             final boolean sort_by_species_count_first,
65                                             final boolean treat_as_binary_comparison,
66                                             final boolean calc_similarity_score ) {
67         _sort = sort;
68         _sort_by_species_count_first = sort_by_species_count_first;
69         _treat_as_binary_comparison = treat_as_binary_comparison;
70         _calc_similarity_score = calc_similarity_score;
71         _verbose = false;
72     }
73
74     @Override
75     public SortedSet<DomainSimilarity> calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator,
76                                                               final List<GenomeWideCombinableDomains> cdc_list,
77                                                               final boolean ignore_domains_without_combinations_in_any_genome,
78                                                               final boolean ignore_domains_specific_to_one_genome ) {
79         if ( cdc_list.size() < 2 ) {
80             throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinale domains collections" );
81         }
82         final SortedSet<DomainSimilarity> similarities = new TreeSet<DomainSimilarity>();
83         final SortedSet<String> keys = new TreeSet<String>();
84         for( final GenomeWideCombinableDomains cdc : cdc_list ) {
85             keys.addAll( ( cdc ).getAllCombinableDomainsIds().keySet() );
86         }
87         final DecimalFormat pf = new java.text.DecimalFormat( "000000" );
88         int counter = 1;
89         if ( _verbose ) {
90             System.out.println( keys.size() );
91         }
92         for( final String key : keys ) {
93             if ( _verbose ) {
94                 ForesterUtil.updateProgress( counter, pf );
95             }
96             counter++;
97             final List<CombinableDomains> same_id_cd_list = new ArrayList<CombinableDomains>( cdc_list.size() );
98             final List<Species> species_with_key_id_domain = new ArrayList<Species>();
99             for( final GenomeWideCombinableDomains cdc : cdc_list ) {
100                 if ( cdc.contains( key ) ) {
101                     same_id_cd_list.add( cdc.get( key ) );
102                     species_with_key_id_domain.add( cdc.getSpecies() );
103                 }
104             }
105             if ( ignore_domains_without_combinations_in_any_genome ) { //TODO: test me..........................................<<<<<<<<<<<<<
106                 boolean without_combinations = true;
107                 for( final CombinableDomains cd : same_id_cd_list ) {
108                     if ( cd.getNumberOfCombinableDomains() > 0 ) {
109                         without_combinations = false;
110                         break;
111                     }
112                 }
113                 if ( without_combinations ) {
114                     continue;
115                 }
116             }
117             if ( same_id_cd_list.size() > 0 ) {
118                 if ( !ignore_domains_specific_to_one_genome || ( same_id_cd_list.size() > 1 ) ) {
119                     final DomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list );
120                     if ( s != null ) {
121                         similarities.add( s );
122                     }
123                     else {
124                         throw new RuntimeException( "similarity is null: this should not have happened" );
125                     }
126                 }
127             }
128             else {
129                 throw new RuntimeException( "this should not have happened" );
130             }
131         }
132         if ( _verbose ) {
133             System.out.println();
134         }
135         return similarities;
136     }
137
138     public boolean isCalcSimilarityScore() {
139         return _calc_similarity_score;
140     }
141
142     private DomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator,
143                                                   final List<CombinableDomains> domains_list ) {
144         if ( domains_list.size() == 1 ) {
145             final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
146             species_data.put( domains_list.get( 0 ).getSpecies(),
147                               createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
148             if ( !isCalcSimilarityScore() ) {
149                 return new DomainSimilarity( domains_list.get( 0 ),
150                                              0,
151                                              0,
152                                              species_data,
153                                              isSortBySpeciesCountFirst(),
154                                              isTreatAsBinaryComparison() );
155             }
156             else {
157                 return new DomainSimilarity( domains_list.get( 0 ),
158                                              1.0,
159                                              1.0,
160                                              1.0,
161                                              1.0,
162                                              0.0,
163                                              0,
164                                              0,
165                                              0,
166                                              species_data,
167                                              isSortBySpeciesCountFirst(),
168                                              isTreatAsBinaryComparison() );
169             }
170         }
171         DescriptiveStatistics stat = null;
172         if ( isCalcSimilarityScore() ) {
173             stat = new BasicDescriptiveStatistics();
174         }
175         final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
176         species_data.put( domains_list.get( 0 ).getSpecies(),
177                           createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
178         int max_difference_in_counts = 0;
179         int max_difference = 0;
180         final boolean is_domain_combination_based = pairwise_calculator instanceof CombinationsBasedPairwiseDomainSimilarityCalculator;
181         for( int i = 1; i < domains_list.size(); ++i ) {
182             species_data.put( domains_list.get( i ).getSpecies(),
183                               createSpeciesSpecificDomainSimilariyData( domains_list.get( i ) ) );
184             final CombinableDomains domains_i = domains_list.get( i );
185             for( int j = 0; j < i; ++j ) {
186                 final PairwiseDomainSimilarity pairwise_similarity = pairwise_calculator
187                         .calculateSimilarity( domains_i, domains_list.get( j ) );
188                 final int difference_in_counts = pairwise_similarity.getDifferenceInCounts();
189                 int difference = 0;
190                 if ( is_domain_combination_based ) {
191                     difference = ( ( CombinationsBasedPairwiseDomainSimilarity ) pairwise_similarity )
192                             .getNumberOfDifferentDomains();
193                 }
194                 else {
195                     difference = difference_in_counts;
196                 }
197                 if ( Math.abs( difference_in_counts ) > Math.abs( max_difference_in_counts ) ) {
198                     max_difference_in_counts = difference_in_counts;
199                 }
200                 if ( Math.abs( difference ) > Math.abs( max_difference ) ) {
201                     max_difference = difference;
202                 }
203                 if ( isCalcSimilarityScore() ) {
204                     stat.addValue( pairwise_similarity.getSimilarityScore() );
205                 }
206             }
207         }
208         if ( isCalcSimilarityScore() ) {
209             if ( stat.getN() < 1 ) {
210                 throw new RuntimeException( "empty descriptive statistics: this should not have happened" );
211             }
212             if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) {
213                 throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" );
214             }
215         }
216         if ( !isTreatAsBinaryComparison() && ( max_difference_in_counts < 0 ) ) {
217             max_difference_in_counts = Math.abs( max_difference_in_counts );
218             if ( !is_domain_combination_based ) {
219                 max_difference = Math.abs( max_difference );
220             }
221         }
222         DomainSimilarity similarity = null;
223         if ( !isCalcSimilarityScore() ) {
224             similarity = new DomainSimilarity( domains_list.get( 0 ),
225                                                max_difference_in_counts,
226                                                max_difference,
227                                                species_data,
228                                                isSortBySpeciesCountFirst(),
229                                                isTreatAsBinaryComparison() );
230         }
231         else {
232             if ( stat.getN() == 1 ) {
233                 similarity = new DomainSimilarity( domains_list.get( 0 ),
234                                                    stat.getMin(),
235                                                    stat.getMax(),
236                                                    stat.arithmeticMean(),
237                                                    stat.median(),
238                                                    0.0,
239                                                    stat.getN(),
240                                                    max_difference_in_counts,
241                                                    max_difference,
242                                                    species_data,
243                                                    isSortBySpeciesCountFirst(),
244                                                    isTreatAsBinaryComparison() );
245             }
246             else {
247                 similarity = new DomainSimilarity( domains_list.get( 0 ),
248                                                    stat.getMin(),
249                                                    stat.getMax(),
250                                                    stat.arithmeticMean(),
251                                                    stat.median(),
252                                                    stat.sampleStandardDeviation(),
253                                                    stat.getN(),
254                                                    max_difference_in_counts,
255                                                    max_difference,
256                                                    species_data,
257                                                    isSortBySpeciesCountFirst(),
258                                                    isTreatAsBinaryComparison() );
259             }
260         }
261         return similarity;
262     }
263
264     private boolean isSortBySpeciesCountFirst() {
265         return _sort_by_species_count_first;
266     }
267
268     private boolean isTreatAsBinaryComparison() {
269         return _treat_as_binary_comparison;
270     }
271
272     private static SpeciesSpecificDcData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) {
273         final SpeciesSpecificDcData sd = new PrintableSpeciesSpecificDcData( cd.getKeyDomainCount(),
274                                                                              cd.getNumberOfCombinableDomains() );
275         for( final String prot : cd.getKeyDomainProteins() ) {
276             sd.addKeyDomainProtein( prot );
277         }
278         for( final String domain : cd.getCombinableDomains() ) {
279             sd.addProteinsExhibitingCombinationCount( domain, cd.getNumberOfProteinsExhibitingCombination( domain ) );
280         }
281         return sd;
282     }
283 }