inprogress
[jalview.git] / forester / java / src / org / forester / surfacing / BasicDomainSimilarityCalculator.java
1 // $Id:
2 // Exp $
3 //
4 // FORESTER -- software libraries and applications
5 // for evolutionary biology research and applications.
6 //
7 // Copyright (C) 2008-2009 Christian M. Zmasek
8 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27
28 package org.forester.surfacing;
29
30 import java.text.DecimalFormat;
31 import java.util.ArrayList;
32 import java.util.List;
33 import java.util.SortedMap;
34 import java.util.SortedSet;
35 import java.util.TreeMap;
36 import java.util.TreeSet;
37
38 import org.forester.species.Species;
39 import org.forester.util.BasicDescriptiveStatistics;
40 import org.forester.util.DescriptiveStatistics;
41 import org.forester.util.ForesterUtil;
42
43 public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculator {
44
45     final DomainSimilarity.DomainSimilaritySortField _sort;
46     private final boolean                            _calc_similarity_score;
47     private final boolean                            _sort_by_species_count_first;
48     private final boolean                            _treat_as_binary_comparison;
49
50     public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort,
51                                             final boolean sort_by_species_count_first,
52                                             final boolean treat_as_binary_comparison,
53                                             final boolean calc_similarity_score ) {
54         _sort = sort;
55         _sort_by_species_count_first = sort_by_species_count_first;
56         _treat_as_binary_comparison = treat_as_binary_comparison;
57         _calc_similarity_score = calc_similarity_score;
58     }
59
60     @Override
61     public SortedSet<DomainSimilarity> calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator,
62                                                               final List<GenomeWideCombinableDomains> cdc_list,
63                                                               final boolean ignore_domains_without_combinations_in_any_genome,
64                                                               final boolean ignore_domains_specific_to_one_genome ) {
65         if ( cdc_list.size() < 2 ) {
66             throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinale domains collections" );
67         }
68         final SortedSet<DomainSimilarity> similarities = new TreeSet<DomainSimilarity>();
69         final SortedSet<String> keys = new TreeSet<String>();
70         for( final GenomeWideCombinableDomains cdc : cdc_list ) {
71             keys.addAll( ( cdc ).getAllCombinableDomainsIds().keySet() );
72         }
73         final DecimalFormat pf = new java.text.DecimalFormat( "000000" );
74         int counter = 1;
75         System.out.println( keys.size() );
76         for( final String key : keys ) {
77             ForesterUtil.updateProgress( counter, pf );
78             counter++;
79             final List<CombinableDomains> same_id_cd_list = new ArrayList<CombinableDomains>( cdc_list.size() );
80             final List<Species> species_with_key_id_domain = new ArrayList<Species>();
81             for( final GenomeWideCombinableDomains cdc : cdc_list ) {
82                 if ( cdc.contains( key ) ) {
83                     same_id_cd_list.add( cdc.get( key ) );
84                     species_with_key_id_domain.add( cdc.getSpecies() );
85                 }
86             }
87             if ( ignore_domains_without_combinations_in_any_genome ) { //TODO: test me..........................................<<<<<<<<<<<<<
88                 boolean without_combinations = true;
89                 for( final CombinableDomains cd : same_id_cd_list ) {
90                     if ( cd.getNumberOfCombinableDomains() > 0 ) {
91                         without_combinations = false;
92                         break;
93                     }
94                 }
95                 if ( without_combinations ) {
96                     continue;
97                 }
98             }
99             if ( same_id_cd_list.size() > 0 ) {
100                 if ( !ignore_domains_specific_to_one_genome || ( same_id_cd_list.size() > 1 ) ) {
101                     final DomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list );
102                     if ( s != null ) {
103                         similarities.add( s );
104                     }
105                     else {
106                         throw new RuntimeException( "similarity is null: this should not have happened" );
107                     }
108                 }
109             }
110             else {
111                 throw new RuntimeException( "this should not have happened" );
112             }
113         }
114         System.out.println();
115         return similarities;
116     }
117
118     public boolean isCalcSimilarityScore() {
119         return _calc_similarity_score;
120     }
121
122     private DomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator,
123                                                   final List<CombinableDomains> domains_list ) {
124         if ( domains_list.size() == 1 ) {
125             final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
126             species_data.put( domains_list.get( 0 ).getSpecies(),
127                               createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
128             if ( !isCalcSimilarityScore() ) {
129                 return new DomainSimilarity( domains_list.get( 0 ),
130                                              0,
131                                              0,
132                                              species_data,
133                                              isSortBySpeciesCountFirst(),
134                                              isTreatAsBinaryComparison() );
135             }
136             else {
137                 return new DomainSimilarity( domains_list.get( 0 ),
138                                              1.0,
139                                              1.0,
140                                              1.0,
141                                              1.0,
142                                              0.0,
143                                              0,
144                                              0,
145                                              0,
146                                              species_data,
147                                              isSortBySpeciesCountFirst(),
148                                              isTreatAsBinaryComparison() );
149             }
150         }
151         DescriptiveStatistics stat = null;
152         if ( isCalcSimilarityScore() ) {
153             stat = new BasicDescriptiveStatistics();
154         }
155         final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
156         species_data.put( domains_list.get( 0 ).getSpecies(),
157                           createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
158         int max_difference_in_counts = 0;
159         int max_difference = 0;
160         final boolean is_domain_combination_based = pairwise_calculator instanceof CombinationsBasedPairwiseDomainSimilarityCalculator;
161         for( int i = 1; i < domains_list.size(); ++i ) {
162             species_data.put( domains_list.get( i ).getSpecies(),
163                               createSpeciesSpecificDomainSimilariyData( domains_list.get( i ) ) );
164             final CombinableDomains domains_i = domains_list.get( i );
165             for( int j = 0; j < i; ++j ) {
166                 final PairwiseDomainSimilarity pairwise_similarity = pairwise_calculator
167                         .calculateSimilarity( domains_i, domains_list.get( j ) );
168                 final int difference_in_counts = pairwise_similarity.getDifferenceInCounts();
169                 int difference = 0;
170                 if ( is_domain_combination_based ) {
171                     difference = ( ( CombinationsBasedPairwiseDomainSimilarity ) pairwise_similarity )
172                             .getNumberOfDifferentDomains();
173                 }
174                 else {
175                     difference = difference_in_counts;
176                 }
177                 if ( Math.abs( difference_in_counts ) > Math.abs( max_difference_in_counts ) ) {
178                     max_difference_in_counts = difference_in_counts;
179                 }
180                 if ( Math.abs( difference ) > Math.abs( max_difference ) ) {
181                     max_difference = difference;
182                 }
183                 if ( isCalcSimilarityScore() ) {
184                     stat.addValue( pairwise_similarity.getSimilarityScore() );
185                 }
186             }
187         }
188         if ( isCalcSimilarityScore() ) {
189             if ( stat.getN() < 1 ) {
190                 throw new RuntimeException( "empty descriptive statistics: this should not have happened" );
191             }
192             if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) {
193                 throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" );
194             }
195         }
196         if ( !isTreatAsBinaryComparison() && ( max_difference_in_counts < 0 ) ) {
197             max_difference_in_counts = Math.abs( max_difference_in_counts );
198             if ( !is_domain_combination_based ) {
199                 max_difference = Math.abs( max_difference );
200             }
201         }
202         DomainSimilarity similarity = null;
203         if ( !isCalcSimilarityScore() ) {
204             similarity = new DomainSimilarity( domains_list.get( 0 ),
205                                                max_difference_in_counts,
206                                                max_difference,
207                                                species_data,
208                                                isSortBySpeciesCountFirst(),
209                                                isTreatAsBinaryComparison() );
210         }
211         else {
212             if ( stat.getN() == 1 ) {
213                 similarity = new DomainSimilarity( domains_list.get( 0 ),
214                                                    stat.getMin(),
215                                                    stat.getMax(),
216                                                    stat.arithmeticMean(),
217                                                    stat.median(),
218                                                    0.0,
219                                                    stat.getN(),
220                                                    max_difference_in_counts,
221                                                    max_difference,
222                                                    species_data,
223                                                    isSortBySpeciesCountFirst(),
224                                                    isTreatAsBinaryComparison() );
225             }
226             else {
227                 similarity = new DomainSimilarity( domains_list.get( 0 ),
228                                                    stat.getMin(),
229                                                    stat.getMax(),
230                                                    stat.arithmeticMean(),
231                                                    stat.median(),
232                                                    stat.sampleStandardDeviation(),
233                                                    stat.getN(),
234                                                    max_difference_in_counts,
235                                                    max_difference,
236                                                    species_data,
237                                                    isSortBySpeciesCountFirst(),
238                                                    isTreatAsBinaryComparison() );
239             }
240         }
241         return similarity;
242     }
243
244     private boolean isSortBySpeciesCountFirst() {
245         return _sort_by_species_count_first;
246     }
247
248     private boolean isTreatAsBinaryComparison() {
249         return _treat_as_binary_comparison;
250     }
251
252     private static SpeciesSpecificDcData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) {
253         final SpeciesSpecificDcData sd = new PrintableSpeciesSpecificDcData( cd.getKeyDomainCount(),
254                                                                              cd.getNumberOfCombinableDomains() );
255         for( final String prot : cd.getKeyDomainProteins() ) {
256             sd.addKeyDomainProtein( prot );
257         }
258         for( final String domain : cd.getCombinableDomains() ) {
259             sd.addProteinsExhibitingCombinationCount( domain, cd.getNumberOfProteinsExhibitingCombination( domain ) );
260         }
261         return sd;
262     }
263 }