initial commit
[jalview.git] / forester / java / src / org / forester / surfacing / PairwiseGenomeComparator.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 // 
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 // 
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: www.phylosoft.org/forester
26
27 package org.forester.surfacing;
28
29 import java.io.BufferedWriter;
30 import java.io.File;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.Writer;
34 import java.util.ArrayList;
35 import java.util.List;
36 import java.util.Map;
37 import java.util.Random;
38 import java.util.SortedSet;
39 import java.util.TreeSet;
40
41 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
42 import org.forester.evoinference.matrix.distance.DistanceMatrix;
43 import org.forester.go.GoId;
44 import org.forester.go.GoNameSpace;
45 import org.forester.go.GoTerm;
46 import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
47 import org.forester.util.DescriptiveStatistics;
48 import org.forester.util.ForesterUtil;
49
50 public class PairwiseGenomeComparator {
51
52     private List<DistanceMatrix> _domain_distance_scores_means;
53     private List<DistanceMatrix> _shared_domains_based_distances;
54     private List<DistanceMatrix> _shared_binary_combinations_based_distances;
55
56     //private List<HistogramData>  _histogram_datas;
57     public PairwiseGenomeComparator() {
58         init();
59     }
60
61     public List<DistanceMatrix> getDomainDistanceScoresMeans() {
62         return _domain_distance_scores_means;
63     }
64
65     //public List<HistogramData> getHistogramDatas() {
66     //    return _histogram_datas;
67     //}
68     public List<DistanceMatrix> getSharedBinaryCombinationsBasedDistances() {
69         return _shared_binary_combinations_based_distances;
70     }
71
72     public List<DistanceMatrix> getSharedDomainsBasedDistances() {
73         return _shared_domains_based_distances;
74     }
75
76     private void init() {
77         //_histogram_datas = new ArrayList<HistogramData>();
78         _domain_distance_scores_means = new ArrayList<DistanceMatrix>();
79         _shared_domains_based_distances = new ArrayList<DistanceMatrix>();
80         _shared_binary_combinations_based_distances = new ArrayList<DistanceMatrix>();
81     }
82
83     public void performPairwiseComparisons( final StringBuilder html_desc,
84                                             final boolean sort_by_species_count_first,
85                                             final Detailedness detailedness,
86                                             final boolean ignore_domains_without_combs_in_all_spec,
87                                             final boolean ignore_domains_specific_to_one_species,
88                                             final DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field,
89                                             final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option,
90                                             final DomainSimilarity.DomainSimilarityScoring scoring,
91                                             final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
92                                             final Map<GoId, GoTerm> go_id_to_term_map,
93                                             final GoNameSpace go_namespace_limit,
94                                             final Species[] species,
95                                             final int number_of_genomes,
96                                             final List<GenomeWideCombinableDomains> list_of_genome_wide_combinable_domains,
97                                             final PairwiseDomainSimilarityCalculator pw_calc,
98                                             final String automated_pairwise_comparison_suffix,
99                                             final boolean verbose,
100                                             final String automated_pairwise_comparison_prefix,
101                                             final String command_line_prg_name,
102                                             final boolean display_histograms,
103                                             final File out_dir,
104                                             final boolean write_pairwise_comparisons ) {
105         init();
106         final BasicSymmetricalDistanceMatrix domain_distance_scores_means = new BasicSymmetricalDistanceMatrix( number_of_genomes );
107         final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
108         final BasicSymmetricalDistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
109         if ( verbose ) {
110             System.out.println();
111             System.out.println( "Pairwise genome distances:" );
112             System.out.print( "[species-i - species-j:" );
113             System.out.print( " mean-score-based" );
114             System.out.print( " (sd)" );
115             System.out.print( " [N]" );
116             System.out.print( " | shared-domains-based" );
117             System.out.println( " | shared-binary-combinations-based]" );
118             System.out.println();
119         }
120         for( int i = 0; i < number_of_genomes; ++i ) {
121             final String species_i = species[ i ].getSpeciesId();
122             domain_distance_scores_means.setIdentifier( i, species_i );
123             shared_domains_based_distances.setIdentifier( i, species_i );
124             shared_binary_combinations_based_distances.setIdentifier( i, species_i );
125             if ( verbose ) {
126                 System.out.println( ( i + 1 ) + "/" + number_of_genomes );
127             }
128             for( int j = 0; j < i; ++j ) {
129                 if ( ( list_of_genome_wide_combinable_domains.get( i ).getSize() < 1 )
130                         || ( list_of_genome_wide_combinable_domains.get( j ).getSize() < 1 ) ) {
131                     domain_distance_scores_means
132                             .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE );
133                     shared_domains_based_distances
134                             .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE );
135                     shared_binary_combinations_based_distances
136                             .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE );
137                     continue;
138                 }
139                 final List<GenomeWideCombinableDomains> genome_pair = new ArrayList<GenomeWideCombinableDomains>( 2 );
140                 genome_pair.add( list_of_genome_wide_combinable_domains.get( i ) );
141                 genome_pair.add( list_of_genome_wide_combinable_domains.get( j ) );
142                 DomainSimilarityCalculator.GoAnnotationOutput go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.NONE;
143                 if ( domain_id_to_go_ids_map != null ) {
144                     go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.ALL;
145                 }
146                 final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field,
147                                                                                              sort_by_species_count_first,
148                                                                                              true );
149                 final SortedSet<DomainSimilarity> similarities = calc
150                         .calculateSimilarities( pw_calc,
151                                                 genome_pair,
152                                                 ignore_domains_without_combs_in_all_spec,
153                                                 ignore_domains_specific_to_one_species );
154                 SurfacingUtil.decoratePrintableDomainSimilarities( similarities,
155                                                                    detailedness,
156                                                                    go_annotation_output,
157                                                                    go_id_to_term_map,
158                                                                    go_namespace_limit );
159                 final DescriptiveStatistics stats = SurfacingUtil
160                         .calculateDescriptiveStatisticsForMeanValues( similarities );
161                 final String species_j = species[ j ].getSpeciesId();
162                 final DomainArchitectureBasedGenomeSimilarityCalculator genome_similarity_calculator = new DomainArchitectureBasedGenomeSimilarityCalculator( list_of_genome_wide_combinable_domains
163                                                                                                                                                                       .get( i ),
164                                                                                                                                                               list_of_genome_wide_combinable_domains
165                                                                                                                                                                       .get( j ) );
166                 genome_similarity_calculator.setAllowDomainsToBeIgnored( false );
167                 // TODO make histos for these 5 values
168                 double dissimilarity_score_mean;
169                 if ( stats.getN() < 1 ) {
170                     // No domains in common
171                     dissimilarity_score_mean = 1.0;
172                 }
173                 else {
174                     dissimilarity_score_mean = 1.0 - stats.arithmeticMean();
175                 }
176                 final double shared_domains_based_genome_distance = 1.0 - genome_similarity_calculator
177                         .calculateSharedDomainsBasedGenomeSimilarityScore();
178                 final double shared_binary_combinations_based_genome_distance = 1.0 - genome_similarity_calculator
179                         .calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore();
180                 domain_distance_scores_means.setValue( i, j, dissimilarity_score_mean );
181                 shared_domains_based_distances.setValue( i, j, shared_domains_based_genome_distance );
182                 shared_binary_combinations_based_distances.setValue( i,
183                                                                      j,
184                                                                      shared_binary_combinations_based_genome_distance );
185                 if ( verbose ) {
186                     System.out.print( species_i + "-" );
187                     System.out.print( species_j + ": " );
188                     System.out.print( ForesterUtil.round( dissimilarity_score_mean, 2 ) );
189                     if ( stats.getN() > 1 ) {
190                         System.out.print( " (" + ForesterUtil.round( stats.sampleStandardDeviation(), 2 ) + ")" );
191                     }
192                     else {
193                         System.out.print( " (n/a)" );
194                     }
195                     System.out.print( " [" + stats.getN() + "]" );
196                     System.out.print( " | " );
197                     System.out.print( ForesterUtil.round( shared_domains_based_genome_distance, 2 ) );
198                     System.out.print( " | " );
199                     System.out.println( ForesterUtil.round( shared_binary_combinations_based_genome_distance, 2 ) );
200                 }
201                 String pairwise_similarities_output_file_str = automated_pairwise_comparison_prefix + species_i + "_"
202                         + species_j + automated_pairwise_comparison_suffix;
203                 switch ( domain_similarity_print_option ) {
204                     case HTML:
205                         if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) {
206                             pairwise_similarities_output_file_str += ".html";
207                         }
208                         break;
209                 }
210                 DescriptiveStatistics pw_stats = null;
211                 if ( write_pairwise_comparisons ) {
212                     try {
213                         final Writer writer = new BufferedWriter( new FileWriter( out_dir == null ? pairwise_similarities_output_file_str
214                                 : out_dir + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) );
215                         pw_stats = SurfacingUtil.writeDomainSimilaritiesToFile( html_desc,
216                                                                                 new StringBuilder( species_i + "-"
217                                                                                         + species_j ),
218                                                                                 writer,
219                                                                                 similarities,
220                                                                                 true,
221                                                                                 null,
222                                                                                 domain_similarity_print_option,
223                                                                                 domain_similarity_sort_field,
224                                                                                 scoring,
225                                                                                 false );
226                     }
227                     catch ( final IOException e ) {
228                         ForesterUtil.fatalError( command_line_prg_name, "Failed to write similarites to: \""
229                                 + pairwise_similarities_output_file_str + "\" [" + e.getMessage() + "]" );
230                     }
231                 }
232                 // pairwise_matrix.setValue( i, j, cdc_list.get( cdc_list.size()
233                 // - 1 ) );
234                 if ( pw_stats != null ) {
235                     if ( pw_stats.getMin() >= pw_stats.getMax() ) {
236                         ForesterUtil.printWarningMessage( command_line_prg_name, "for [" + species_i + "-" + species_j
237                                 + "] score minimum is [" + pw_stats.getMin() + "] while score maximum is ["
238                                 + pw_stats.getMax() + "], possibly indicating that a genome is compared to itself" );
239                     }
240                     if ( display_histograms && ( pw_stats.getMin() < pw_stats.getMax() ) ) {
241                         //final double[] values = pw_stats.getDataAsDoubleArray();
242                         // List<HistogramDataItem> data_items = new
243                         // ArrayList<HistogramDataItem>( values.length );
244                         // for( int n = 0; n < values.length; i++ ) {
245                         // data_items.add( new BasicHistogramDataItem( "", values[ n ] )
246                         // );
247                         // }
248                         //~   _histogram_datas.add( new HistogramData( species_i + "-" + species_j, values, null, 20 ) );
249                     }
250                 }
251             }
252         }
253         getDomainDistanceScoresMeans().add( domain_distance_scores_means );
254         getSharedDomainsBasedDistances().add( shared_domains_based_distances );
255         getSharedBinaryCombinationsBasedDistances().add( shared_binary_combinations_based_distances );
256         if ( verbose ) {
257             System.out.println();
258         }
259     }
260
261     public void performPairwiseComparisonsJacknifed( final Species[] species,
262                                                      final int number_of_genomes,
263                                                      final List<GenomeWideCombinableDomains> list_of_genome_wide_combinable_domains,
264                                                      final boolean verbose,
265                                                      final int number_of_resamplings,
266                                                      final double jacknife_ratio,
267                                                      final long random_seed ) {
268         init();
269         if ( number_of_resamplings < 2 ) {
270             throw new IllegalArgumentException( "attempt to perform jacknife resampling with less than 2 resamplings" );
271         }
272         if ( jacknife_ratio <= 0.0 ) {
273             throw new IllegalArgumentException( "attempt to perform jacknife resampling with jacknife ratio of 0.0 or less" );
274         }
275         else if ( jacknife_ratio >= 1.0 ) {
276             throw new IllegalArgumentException( "attempt to perform jacknife resampling with jacknife ratio 1.0 or more" );
277         }
278         final DomainId[] all_unique_domain_ids = getAllUniqueDomainIdAsArray( list_of_genome_wide_combinable_domains );
279         if ( verbose ) {
280             System.out.println();
281             System.out.println( "Jacknife: total of domains: " + all_unique_domain_ids.length );
282         }
283         if ( verbose ) {
284             System.out.print( "resampling " );
285         }
286         final Random generator = new Random( random_seed );
287         for( int r = 0; r < number_of_resamplings; ++r ) {
288             if ( verbose ) {
289                 System.out.print( " " + r );
290             }
291             final SortedSet<DomainId> domain_ids_to_ignore = randomlyPickDomainIds( all_unique_domain_ids,
292                                                                                     jacknife_ratio,
293                                                                                     generator );
294             final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
295             final BasicSymmetricalDistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
296             for( int i = 0; i < number_of_genomes; ++i ) {
297                 final String species_i = species[ i ].getSpeciesId();
298                 shared_domains_based_distances.setIdentifier( i, species_i );
299                 shared_binary_combinations_based_distances.setIdentifier( i, species_i );
300                 for( int j = 0; j < i; ++j ) {
301                     final List<GenomeWideCombinableDomains> genome_pair = new ArrayList<GenomeWideCombinableDomains>( 2 );
302                     genome_pair.add( list_of_genome_wide_combinable_domains.get( i ) );
303                     genome_pair.add( list_of_genome_wide_combinable_domains.get( j ) );
304                     final DomainArchitectureBasedGenomeSimilarityCalculator genome_simiarity_calculator = new DomainArchitectureBasedGenomeSimilarityCalculator( list_of_genome_wide_combinable_domains
305                                                                                                                                                                          .get( i ),
306                                                                                                                                                                  list_of_genome_wide_combinable_domains
307                                                                                                                                                                          .get( j ) );
308                     genome_simiarity_calculator.setAllowDomainsToBeIgnored( true );
309                     genome_simiarity_calculator.setDomainIdsToIgnore( domain_ids_to_ignore );
310                     shared_domains_based_distances.setValue( i, j, 1.0 - genome_simiarity_calculator
311                             .calculateSharedDomainsBasedGenomeSimilarityScore() );
312                     shared_binary_combinations_based_distances.setValue( i, j, 1.0 - genome_simiarity_calculator
313                             .calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore() );
314                 }
315             }
316             getSharedDomainsBasedDistances().add( shared_domains_based_distances );
317             getSharedBinaryCombinationsBasedDistances().add( shared_binary_combinations_based_distances );
318         }
319         if ( verbose ) {
320             System.out.println();
321         }
322     }
323
324     static private DomainId[] getAllUniqueDomainIdAsArray( final List<GenomeWideCombinableDomains> list_of_genome_wide_combinable_domains ) {
325         DomainId[] all_domain_ids_array;
326         final SortedSet<DomainId> all_domain_ids = new TreeSet<DomainId>();
327         for( final GenomeWideCombinableDomains genome_wide_combinable_domains : list_of_genome_wide_combinable_domains ) {
328             final SortedSet<DomainId> all_domains = genome_wide_combinable_domains.getAllDomainIds();
329             for( final DomainId domain : all_domains ) {
330                 all_domain_ids.add( domain );
331             }
332         }
333         all_domain_ids_array = new DomainId[ all_domain_ids.size() ];
334         int n = 0;
335         for( final DomainId domain_id : all_domain_ids ) {
336             all_domain_ids_array[ n++ ] = domain_id;
337         }
338         return all_domain_ids_array;
339     }
340
341     static private SortedSet<DomainId> randomlyPickDomainIds( final DomainId[] all_domain_ids_array,
342                                                               final double jacknife_ratio,
343                                                               final Random generator ) {
344         final int size = all_domain_ids_array.length;
345         final SortedSet<DomainId> random_domain_ids = new TreeSet<DomainId>();
346         final int number_of_ids_pick = ForesterUtil.roundToInt( jacknife_ratio * size );
347         while ( random_domain_ids.size() < number_of_ids_pick ) {
348             final int r = generator.nextInt( size );
349             random_domain_ids.add( all_domain_ids_array[ r ] );
350         }
351         return random_domain_ids;
352     }
353 }