inprogress
[jalview.git] / forester / java / src / org / forester / surfacing / BasicDomainSimilarityCalculator.java
1 // $Id:
2 // Exp $
3 //
4 // FORESTER -- software libraries and applications
5 // for evolutionary biology research and applications.
6 //
7 // Copyright (C) 2008-2009 Christian M. Zmasek
8 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27
28 package org.forester.surfacing;
29
30 import java.text.DecimalFormat;
31 import java.util.ArrayList;
32 import java.util.List;
33 import java.util.SortedMap;
34 import java.util.SortedSet;
35 import java.util.TreeMap;
36 import java.util.TreeSet;
37
38 import org.forester.species.Species;
39 import org.forester.util.BasicDescriptiveStatistics;
40 import org.forester.util.DescriptiveStatistics;
41 import org.forester.util.ForesterUtil;
42
43 public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculator {
44
45     final DomainSimilarity.DomainSimilaritySortField _sort;
46     private final boolean                            _sort_by_species_count_first;
47     private final boolean                            _treat_as_binary_comparison;
48     private final boolean                            _calc_similarity_score;
49
50     public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort,
51                                             final boolean sort_by_species_count_first,
52                                             final boolean treat_as_binary_comparison,
53                                             final boolean calc_similarity_score ) {
54         _sort = sort;
55         _sort_by_species_count_first = sort_by_species_count_first;
56         _treat_as_binary_comparison = treat_as_binary_comparison;
57         _calc_similarity_score = calc_similarity_score;
58     }
59
60     public boolean isCalcSimilarityScore() {
61         return _calc_similarity_score;
62     }
63
64     @Override
65     public SortedSet<DomainSimilarity> calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator,
66                                                               final List<GenomeWideCombinableDomains> cdc_list,
67                                                               final boolean ignore_domains_without_combinations_in_any_genome,
68                                                               final boolean ignore_domains_specific_to_one_genome ) {
69         if ( cdc_list.size() < 2 ) {
70             throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinale domains collections" );
71         }
72         final SortedSet<DomainSimilarity> similarities = new TreeSet<DomainSimilarity>();
73         final SortedSet<String> keys = new TreeSet<String>();
74         for( final GenomeWideCombinableDomains cdc : cdc_list ) {
75             keys.addAll( ( cdc ).getAllCombinableDomainsIds().keySet() );
76         }
77         final DecimalFormat pf = new java.text.DecimalFormat( "000000" );
78         int counter = 1;
79         System.out.println( keys.size() );
80         for( final String key : keys ) {
81             ForesterUtil.updateProgress( counter, pf );
82             counter++;
83             final List<CombinableDomains> same_id_cd_list = new ArrayList<CombinableDomains>( cdc_list.size() );
84             final List<Species> species_with_key_id_domain = new ArrayList<Species>();
85             for( final GenomeWideCombinableDomains cdc : cdc_list ) {
86                 if ( cdc.contains( key ) ) {
87                     same_id_cd_list.add( cdc.get( key ) );
88                     species_with_key_id_domain.add( cdc.getSpecies() );
89                 }
90             }
91             if ( ignore_domains_without_combinations_in_any_genome ) { //TODO: test me..........................................<<<<<<<<<<<<<
92                 boolean without_combinations = true;
93                 for( final CombinableDomains cd : same_id_cd_list ) {
94                     if ( cd.getNumberOfCombinableDomains() > 0 ) {
95                         without_combinations = false;
96                         break;
97                     }
98                 }
99                 if ( without_combinations ) {
100                     continue;
101                 }
102             }
103             if ( same_id_cd_list.size() > 0 ) {
104                 if ( !ignore_domains_specific_to_one_genome || ( same_id_cd_list.size() > 1 ) ) {
105                     final DomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list );
106                     if ( s != null ) {
107                         similarities.add( s );
108                     }
109                     else {
110                         throw new RuntimeException( "similarity is null: this should not have happened" );
111                     }
112                 }
113             }
114             else {
115                 throw new RuntimeException( "this should not have happened" );
116             }
117         }
118         System.out.println();
119         return similarities;
120     }
121
122     private DomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator,
123                                                   final List<CombinableDomains> domains_list ) {
124         if ( domains_list.size() == 1 ) {
125             final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
126             species_data.put( domains_list.get( 0 ).getSpecies(),
127                               createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
128             if ( !isCalcSimilarityScore() ) {
129                 return new PrintableDomainSimilarity( domains_list.get( 0 ),
130                                                       0,
131                                                       0,
132                                                       species_data,
133                                                       isSortBySpeciesCountFirst(),
134                                                       isTreatAsBinaryComparison() );
135             }
136             else {
137                 return new PrintableDomainSimilarity( domains_list.get( 0 ),
138                                                       1.0,
139                                                       1.0,
140                                                       1.0,
141                                                       1.0,
142                                                       0.0,
143                                                       0,
144                                                       0,
145                                                       0,
146                                                       species_data,
147                                                       isSortBySpeciesCountFirst(),
148                                                       isTreatAsBinaryComparison() );
149             }
150         }
151         DescriptiveStatistics stat = null;
152         if ( isCalcSimilarityScore() ) {
153             stat = new BasicDescriptiveStatistics();
154         }
155         final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
156         species_data.put( domains_list.get( 0 ).getSpecies(),
157                           createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
158         int max_difference_in_counts = 0;
159         int max_difference = 0;
160         final boolean is_domain_combination_based = pairwise_calculator instanceof CombinationsBasedPairwiseDomainSimilarityCalculator;
161         for( int i = 1; i < domains_list.size(); ++i ) {
162             species_data.put( domains_list.get( i ).getSpecies(),
163                               createSpeciesSpecificDomainSimilariyData( domains_list.get( i ) ) );
164             final CombinableDomains domains_i = domains_list.get( i );
165             for( int j = 0; j < i; ++j ) {
166                 final PairwiseDomainSimilarity pairwise_similarity = pairwise_calculator
167                         .calculateSimilarity( domains_i, domains_list.get( j ) );
168                 final int difference_in_counts = pairwise_similarity.getDifferenceInCounts();
169                 int difference = 0;
170                 if ( is_domain_combination_based ) {
171                     difference = ( ( CombinationsBasedPairwiseDomainSimilarity ) pairwise_similarity )
172                             .getNumberOfDifferentDomains();
173                 }
174                 else {
175                     difference = difference_in_counts;
176                 }
177                 if ( Math.abs( difference_in_counts ) > Math.abs( max_difference_in_counts ) ) {
178                     max_difference_in_counts = difference_in_counts;
179                 }
180                 if ( Math.abs( difference ) > Math.abs( max_difference ) ) {
181                     max_difference = difference;
182                 }
183                 if ( isCalcSimilarityScore() ) {
184                     stat.addValue( pairwise_similarity.getSimilarityScore() );
185                 }
186             }
187         }
188         if ( isCalcSimilarityScore() ) {
189             if ( stat.getN() < 1 ) {
190                 throw new RuntimeException( "empty descriptive statistics: this should not have happened" );
191             }
192             if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) {
193                 throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" );
194             }
195         }
196         if ( !isTreatAsBinaryComparison() && ( max_difference_in_counts < 0 ) ) {
197             max_difference_in_counts = Math.abs( max_difference_in_counts );
198             if ( !is_domain_combination_based ) {
199                 max_difference = Math.abs( max_difference );
200             }
201         }
202         DomainSimilarity similarity = null;
203         if ( !isCalcSimilarityScore() ) {
204             similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
205                                                         max_difference_in_counts,
206                                                         max_difference,
207                                                         species_data,
208                                                         isSortBySpeciesCountFirst(),
209                                                         isTreatAsBinaryComparison() );
210         }
211         else {
212             if ( stat.getN() == 1 ) {
213                 similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
214                                                             stat.getMin(),
215                                                             stat.getMax(),
216                                                             stat.arithmeticMean(),
217                                                             stat.median(),
218                                                             0.0,
219                                                             stat.getN(),
220                                                             max_difference_in_counts,
221                                                             max_difference,
222                                                             species_data,
223                                                             isSortBySpeciesCountFirst(),
224                                                             isTreatAsBinaryComparison() );
225             }
226             else {
227                 similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
228                                                             stat.getMin(),
229                                                             stat.getMax(),
230                                                             stat.arithmeticMean(),
231                                                             stat.median(),
232                                                             stat.sampleStandardDeviation(),
233                                                             stat.getN(),
234                                                             max_difference_in_counts,
235                                                             max_difference,
236                                                             species_data,
237                                                             isSortBySpeciesCountFirst(),
238                                                             isTreatAsBinaryComparison() );
239             }
240         }
241         return similarity;
242     }
243
244     private boolean isSortBySpeciesCountFirst() {
245         return _sort_by_species_count_first;
246     }
247
248     private boolean isTreatAsBinaryComparison() {
249         return _treat_as_binary_comparison;
250     }
251
252     private static SpeciesSpecificDcData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) {
253         final SpeciesSpecificDcData sd = new PrintableSpeciesSpecificDcData( cd.getKeyDomainCount(),
254                                                                              cd.getNumberOfCombinableDomains() );
255         for( final String prot : cd.getKeyDomainProteins() ) {
256             sd.addKeyDomainProtein( prot );
257         }
258         for( final String domain : cd.getCombinableDomains() ) {
259             sd.addProteinsExhibitingCombinationCount( domain, cd.getNumberOfProteinsExhibitingCombination( domain ) );
260         }
261         return sd;
262     }
263 }