in progress
[jalview.git] / forester / java / src / org / forester / surfacing / BasicDomainSimilarityCalculator.java
1 // $Id:
2 // Exp $
3 //
4 // FORESTER -- software libraries and applications
5 // for evolutionary biology research and applications.
6 //
7 // Copyright (C) 2008-2009 Christian M. Zmasek
8 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
27
28 package org.forester.surfacing;
29
30 import java.util.ArrayList;
31 import java.util.List;
32 import java.util.SortedMap;
33 import java.util.SortedSet;
34 import java.util.TreeMap;
35 import java.util.TreeSet;
36
37 import org.forester.protein.DomainId;
38 import org.forester.species.Species;
39 import org.forester.util.BasicDescriptiveStatistics;
40 import org.forester.util.DescriptiveStatistics;
41
42 public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculator {
43
44     final DomainSimilarity.DomainSimilaritySortField _sort;
45     private final boolean                            _sort_by_species_count_first;
46     private final boolean                            _treat_as_binary_comparison;
47
48     public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort,
49                                             final boolean sort_by_species_count_first,
50                                             final boolean treat_as_binary_comparison ) {
51         _sort = sort;
52         _sort_by_species_count_first = sort_by_species_count_first;
53         _treat_as_binary_comparison = treat_as_binary_comparison;
54     }
55
56     @Override
57     public SortedSet<DomainSimilarity> calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator,
58                                                               final List<GenomeWideCombinableDomains> cdc_list,
59                                                               final boolean ignore_domains_without_combinations_in_any_genome,
60                                                               final boolean ignore_domains_specific_to_one_genome ) {
61         if ( cdc_list.size() < 2 ) {
62             throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinale domains collections" );
63         }
64         final SortedSet<DomainSimilarity> similarities = new TreeSet<DomainSimilarity>();
65         final SortedSet<DomainId> keys = new TreeSet<DomainId>();
66         for( final GenomeWideCombinableDomains cdc : cdc_list ) {
67             keys.addAll( ( cdc ).getAllCombinableDomainsIds().keySet() );
68         }
69         for( final DomainId key : keys ) {
70             final List<CombinableDomains> same_id_cd_list = new ArrayList<CombinableDomains>( cdc_list.size() );
71             final List<Species> species_with_key_id_domain = new ArrayList<Species>();
72             for( final GenomeWideCombinableDomains cdc : cdc_list ) {
73                 if ( cdc.contains( key ) ) {
74                     same_id_cd_list.add( cdc.get( key ) );
75                     species_with_key_id_domain.add( cdc.getSpecies() );
76                 }
77             }
78             if ( ignore_domains_without_combinations_in_any_genome ) { //TODO: test me..........................................<<<<<<<<<<<<<
79                 boolean without_combinations = true;
80                 for( final CombinableDomains cd : same_id_cd_list ) {
81                     if ( cd.getNumberOfCombinableDomains() > 0 ) {
82                         without_combinations = false;
83                         break;
84                     }
85                 }
86                 if ( without_combinations ) {
87                     continue;
88                 }
89             }
90             // BIG CHANGE IN LOGIC: Tuesday July 08, 0;55
91             // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
92             // OLD: if ( same_id_cd_list.size() > 1 ) {
93             if ( same_id_cd_list.size() > 0 ) {
94                 if ( !ignore_domains_specific_to_one_genome || ( same_id_cd_list.size() > 1 ) ) {
95                     final DomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list );
96                     if ( s != null ) {
97                         similarities.add( s );
98                     }
99                     else {
100                         throw new RuntimeException( "similarity is null: this should not have happened" );
101                     }
102                 }
103             }
104             // ~~~ NEW:
105             else {
106                 throw new RuntimeException( "this should not have happened" );
107             }
108             // ~~~ OLD:
109             // else if ( same_id_cd_list.size() == 1 ) {
110             // TODO need to go in file
111             // System.out.println( "only in one species [" +
112             // species_with_key_id_domain.get( 0 ) + "]: " + key_id );
113             //}
114             //else {
115             //    throw new RuntimeException( "this should not have happened" );
116             // }
117         }
118         return similarities;
119     }
120
121     private DomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator,
122                                                   final List<CombinableDomains> domains_list ) {
123         if ( domains_list.size() == 1 ) {
124             // BIG CHANGE IN LOGIC: Tuesday July 08, 0;55
125             // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
126             // ~~~OLD:
127             //throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinable domains" );
128             // ~~~new: 
129             final SortedMap<Species, SpeciesSpecificDomainSimilariyData> species_data = new TreeMap<Species, SpeciesSpecificDomainSimilariyData>();
130             species_data.put( domains_list.get( 0 ).getSpecies(),
131                               createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
132             return new PrintableDomainSimilarity( domains_list.get( 0 ),
133                                                   1.0,
134                                                   1.0,
135                                                   1.0,
136                                                   1.0,
137                                                   0.0,
138                                                   0,
139                                                   0,
140                                                   0,
141                                                   species_data,
142                                                   getSort(),
143                                                   isSortBySpeciesCountFirst(),
144                                                   isTreatAsBinaryComparison() );
145         }
146         final DescriptiveStatistics stat = new BasicDescriptiveStatistics();
147         final SortedMap<Species, SpeciesSpecificDomainSimilariyData> species_data = new TreeMap<Species, SpeciesSpecificDomainSimilariyData>();
148         species_data.put( domains_list.get( 0 ).getSpecies(),
149                           createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
150         int max_difference_in_counts = 0;
151         int max_difference = 0;
152         final boolean is_domain_combination_based = pairwise_calculator instanceof CombinationsBasedPairwiseDomainSimilarityCalculator;
153         for( int i = 1; i < domains_list.size(); ++i ) {
154             species_data.put( domains_list.get( i ).getSpecies(),
155                               createSpeciesSpecificDomainSimilariyData( domains_list.get( i ) ) );
156             final CombinableDomains domains_i = domains_list.get( i );
157             for( int j = 0; j < i; ++j ) {
158                 final PairwiseDomainSimilarity pairwise_similarity = pairwise_calculator
159                         .calculateSimilarity( domains_i, domains_list.get( j ) );
160                 final int difference_in_counts = pairwise_similarity.getDifferenceInCounts();
161                 int difference = 0;
162                 if ( is_domain_combination_based ) {
163                     difference = ( ( CombinationsBasedPairwiseDomainSimilarity ) pairwise_similarity )
164                             .getNumberOfDifferentDomains();
165                 }
166                 else {
167                     difference = difference_in_counts;
168                 }
169                 if ( Math.abs( difference_in_counts ) > Math.abs( max_difference_in_counts ) ) {
170                     max_difference_in_counts = difference_in_counts;
171                 }
172                 if ( Math.abs( difference ) > Math.abs( max_difference ) ) {
173                     max_difference = difference;
174                 }
175                 stat.addValue( pairwise_similarity.getSimilarityScore() );
176             }
177         }
178         if ( stat.getN() < 1 ) {
179             throw new AssertionError( "empty descriptive statistics: this should not have happened" );
180         }
181         if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) {
182             throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" );
183         }
184         if ( ( /*stat.getN() != 1 ||*/!isTreatAsBinaryComparison() ) && ( max_difference_in_counts < 0 ) ) {
185             max_difference_in_counts = Math.abs( max_difference_in_counts );
186             if ( !is_domain_combination_based ) {
187                 max_difference = Math.abs( max_difference ); //=max_difference_in_counts for !is_domain_combination_based.
188             }
189         }
190         DomainSimilarity similarity = null;
191         if ( stat.getN() == 1 ) {
192             similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
193                                                         stat.getMin(),
194                                                         stat.getMax(),
195                                                         stat.arithmeticMean(),
196                                                         stat.median(),
197                                                         0.0,
198                                                         stat.getN(),
199                                                         max_difference_in_counts,
200                                                         max_difference,
201                                                         species_data,
202                                                         getSort(),
203                                                         isSortBySpeciesCountFirst(),
204                                                         isTreatAsBinaryComparison() );
205         }
206         else {
207             similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
208                                                         stat.getMin(),
209                                                         stat.getMax(),
210                                                         stat.arithmeticMean(),
211                                                         stat.median(),
212                                                         stat.sampleStandardDeviation(),
213                                                         stat.getN(),
214                                                         max_difference_in_counts,
215                                                         max_difference,
216                                                         species_data,
217                                                         getSort(),
218                                                         isSortBySpeciesCountFirst(),
219                                                         isTreatAsBinaryComparison() );
220         }
221         return similarity;
222     }
223
224     private DomainSimilarity.DomainSimilaritySortField getSort() {
225         return _sort;
226     }
227
228     private boolean isSortBySpeciesCountFirst() {
229         return _sort_by_species_count_first;
230     }
231
232     private boolean isTreatAsBinaryComparison() {
233         return _treat_as_binary_comparison;
234     }
235
236     private static SpeciesSpecificDomainSimilariyData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) {
237         final SpeciesSpecificDomainSimilariyData sd = new PrintableSpeciesSpecificDomainSimilariyData( cd.getKeyDomainProteinsCount(),
238                                                                                                        cd.getKeyDomainCount(),
239                                                                                                        cd.getNumberOfCombinableDomains(),
240                                                                                                        cd.getKeyDomainConfidenceDescriptiveStatistics() );
241         for( final DomainId domain : cd.getCombinableDomains() ) {
242             sd.addProteinsExhibitingCombinationCount( domain, cd.getNumberOfProteinsExhibitingCombination( domain ) );
243         }
244         return sd;
245     }
246 }