(no commit message)
[jalview.git] / forester / java / src / org / forester / surfacing / PairwiseGenomeComparator.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26
27 package org.forester.surfacing;
28
29 import java.io.BufferedWriter;
30 import java.io.File;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.Writer;
34 import java.util.ArrayList;
35 import java.util.List;
36 import java.util.Map;
37 import java.util.Random;
38 import java.util.SortedSet;
39 import java.util.TreeSet;
40
41 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
42 import org.forester.evoinference.matrix.distance.DistanceMatrix;
43 import org.forester.go.GoId;
44 import org.forester.go.GoNameSpace;
45 import org.forester.go.GoTerm;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.species.Species;
48 import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
49 import org.forester.util.DescriptiveStatistics;
50 import org.forester.util.ForesterUtil;
51
52 public class PairwiseGenomeComparator {
53
54     private List<DistanceMatrix> _domain_distance_scores_means;
55     private List<DistanceMatrix> _shared_binary_combinations_based_distances;
56     private List<DistanceMatrix> _shared_domains_based_distances;
57
58     public PairwiseGenomeComparator() {
59         init();
60     }
61
62     public List<DistanceMatrix> getDomainDistanceScoresMeans() {
63         return _domain_distance_scores_means;
64     }
65
66     public List<DistanceMatrix> getSharedBinaryCombinationsBasedDistances() {
67         return _shared_binary_combinations_based_distances;
68     }
69
70     public List<DistanceMatrix> getSharedDomainsBasedDistances() {
71         return _shared_domains_based_distances;
72     }
73
74     public void performPairwiseComparisons( final StringBuilder html_desc,
75                                             final boolean sort_by_species_count_first,
76                                             final Detailedness detailedness,
77                                             final boolean ignore_domains_without_combs_in_all_spec,
78                                             final boolean ignore_domains_specific_to_one_species,
79                                             final DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field,
80                                             final DomainSimilarity.PRINT_OPTION domain_similarity_print_option,
81                                             final DomainSimilarity.DomainSimilarityScoring scoring,
82                                             final Map<String, List<GoId>> domain_id_to_go_ids_map,
83                                             final Map<GoId, GoTerm> go_id_to_term_map,
84                                             final GoNameSpace go_namespace_limit,
85                                             final Species[] species,
86                                             final int number_of_genomes,
87                                             final List<GenomeWideCombinableDomains> list_of_genome_wide_combinable_domains,
88                                             final PairwiseDomainSimilarityCalculator pw_calc,
89                                             final String automated_pairwise_comparison_suffix,
90                                             final boolean verbose,
91                                             final String automated_pairwise_comparison_prefix,
92                                             final String command_line_prg_name,
93                                             final File out_dir,
94                                             final boolean write_pairwise_comparisons,
95                                             final Map<String, Integer> tax_code_to_id_map,
96                                             final boolean calc_similarity_scores,
97                                             final Phylogeny phy ) {
98         init();
99         final BasicSymmetricalDistanceMatrix domain_distance_scores_means = new BasicSymmetricalDistanceMatrix( number_of_genomes );
100         final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
101         final BasicSymmetricalDistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
102         if ( verbose ) {
103             System.out.println();
104             System.out.println( "Pairwise genome distances:" );
105             System.out.print( "[species-i - species-j:" );
106             System.out.print( " mean-score-based" );
107             System.out.print( " (sd)" );
108             System.out.print( " [N]" );
109             System.out.print( " | shared-domains-based" );
110             System.out.println( " | shared-binary-combinations-based]" );
111             System.out.println();
112         }
113         for( int i = 0; i < number_of_genomes; ++i ) {
114             final String species_i = species[ i ].getSpeciesId();
115             domain_distance_scores_means.setIdentifier( i, species_i );
116             shared_domains_based_distances.setIdentifier( i, species_i );
117             shared_binary_combinations_based_distances.setIdentifier( i, species_i );
118             if ( verbose ) {
119                 System.out.println( ( i + 1 ) + "/" + number_of_genomes );
120             }
121             for( int j = 0; j < i; ++j ) {
122                 if ( ( list_of_genome_wide_combinable_domains.get( i ).getSize() < 1 )
123                         || ( list_of_genome_wide_combinable_domains.get( j ).getSize() < 1 ) ) {
124                     domain_distance_scores_means
125                     .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE );
126                     shared_domains_based_distances
127                     .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE );
128                     shared_binary_combinations_based_distances
129                     .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE );
130                     continue;
131                 }
132                 final List<GenomeWideCombinableDomains> genome_pair = new ArrayList<GenomeWideCombinableDomains>( 2 );
133                 genome_pair.add( list_of_genome_wide_combinable_domains.get( i ) );
134                 genome_pair.add( list_of_genome_wide_combinable_domains.get( j ) );
135                 DomainSimilarityCalculator.GoAnnotationOutput go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.NONE;
136                 if ( domain_id_to_go_ids_map != null ) {
137                     go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.ALL;
138                 }
139                 final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field,
140                                                                                              sort_by_species_count_first,
141                                                                                              true,
142                                                                                              calc_similarity_scores,
143                                                                                              true );
144                 final SortedSet<DomainSimilarity> similarities = calc
145                         .calculateSimilarities( pw_calc,
146                                                 genome_pair,
147                                                 ignore_domains_without_combs_in_all_spec,
148                                                 ignore_domains_specific_to_one_species );
149                 SurfacingUtil.decoratePrintableDomainSimilarities( similarities, detailedness );
150                 final DescriptiveStatistics stats = SurfacingUtil
151                         .calculateDescriptiveStatisticsForMeanValues( similarities );
152                 final String species_j = species[ j ].getSpeciesId();
153                 final DomainArchitectureBasedGenomeSimilarityCalculator genome_similarity_calculator = new DomainArchitectureBasedGenomeSimilarityCalculator( list_of_genome_wide_combinable_domains
154                                                                                                                                                               .get( i ),
155                                                                                                                                                               list_of_genome_wide_combinable_domains
156                                                                                                                                                               .get( j ) );
157                 genome_similarity_calculator.setAllowDomainsToBeIgnored( false );
158                 double dissimilarity_score_mean;
159                 if ( stats.getN() < 1 ) {
160                     // No domains in common
161                     dissimilarity_score_mean = 1.0;
162                 }
163                 else {
164                     dissimilarity_score_mean = 1.0 - stats.arithmeticMean();
165                 }
166                 final double shared_domains_based_genome_distance = 1.0 - genome_similarity_calculator
167                         .calculateSharedDomainsBasedGenomeSimilarityScore();
168                 final double shared_binary_combinations_based_genome_distance = 1.0 - genome_similarity_calculator
169                         .calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore();
170                 domain_distance_scores_means.setValue( i, j, dissimilarity_score_mean );
171                 shared_domains_based_distances.setValue( i, j, shared_domains_based_genome_distance );
172                 shared_binary_combinations_based_distances.setValue( i,
173                                                                      j,
174                                                                      shared_binary_combinations_based_genome_distance );
175                 if ( verbose ) {
176                     System.out.print( species_i + "-" );
177                     System.out.print( species_j + ": " );
178                     System.out.print( ForesterUtil.round( dissimilarity_score_mean, 2 ) );
179                     if ( stats.getN() > 1 ) {
180                         System.out.print( " (" + ForesterUtil.round( stats.sampleStandardDeviation(), 2 ) + ")" );
181                     }
182                     else {
183                         System.out.print( " (n/a)" );
184                     }
185                     System.out.print( " [" + stats.getN() + "]" );
186                     System.out.print( " | " );
187                     System.out.print( ForesterUtil.round( shared_domains_based_genome_distance, 2 ) );
188                     System.out.print( " | " );
189                     System.out.println( ForesterUtil.round( shared_binary_combinations_based_genome_distance, 2 ) );
190                 }
191                 String pairwise_similarities_output_file_str = automated_pairwise_comparison_prefix + species_i + "_"
192                         + species_j + automated_pairwise_comparison_suffix;
193                 switch ( domain_similarity_print_option ) {
194                     case HTML:
195                         if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) {
196                             pairwise_similarities_output_file_str += ".html";
197                         }
198                         break;
199                 }
200                 if ( write_pairwise_comparisons ) {
201                     try {
202                         final Writer writer = new BufferedWriter( new FileWriter( out_dir == null ? pairwise_similarities_output_file_str
203                                 : out_dir + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) );
204                         SurfacingUtil.writeDomainSimilaritiesToFile( html_desc,
205                                                                      new StringBuilder( species_i + "-" + species_j ),
206                                                                      null,
207                                                                      writer,
208                                                                      null,
209                                                                      similarities,
210                                                                      true,
211                                                                      null,
212                                                                      domain_similarity_print_option,
213                                                                      scoring,
214                                                                      false,
215                                                                      tax_code_to_id_map,
216                                                                      phy,
217                                                                      null );
218                     }
219                     catch ( final IOException e ) {
220                         ForesterUtil.fatalError( command_line_prg_name, "Failed to write similarites to: \""
221                                 + pairwise_similarities_output_file_str + "\" [" + e.getMessage() + "]" );
222                     }
223                 }
224             }
225         }
226         getDomainDistanceScoresMeans().add( domain_distance_scores_means );
227         getSharedDomainsBasedDistances().add( shared_domains_based_distances );
228         getSharedBinaryCombinationsBasedDistances().add( shared_binary_combinations_based_distances );
229         if ( verbose ) {
230             System.out.println();
231         }
232     }
233
234     public void performPairwiseComparisonsJacknifed( final Species[] species,
235                                                      final int number_of_genomes,
236                                                      final List<GenomeWideCombinableDomains> list_of_genome_wide_combinable_domains,
237                                                      final boolean verbose,
238                                                      final int number_of_resamplings,
239                                                      final double jacknife_ratio,
240                                                      final long random_seed ) {
241         init();
242         if ( number_of_resamplings < 2 ) {
243             throw new IllegalArgumentException( "attempt to perform jacknife resampling with less than 2 resamplings" );
244         }
245         if ( jacknife_ratio <= 0.0 ) {
246             throw new IllegalArgumentException( "attempt to perform jacknife resampling with jacknife ratio of 0.0 or less" );
247         }
248         else if ( jacknife_ratio >= 1.0 ) {
249             throw new IllegalArgumentException( "attempt to perform jacknife resampling with jacknife ratio 1.0 or more" );
250         }
251         final String[] all_unique_domain_ids = getAllUniqueDomainIdAsArray( list_of_genome_wide_combinable_domains );
252         if ( verbose ) {
253             System.out.println();
254             System.out.println( "Jacknife: total of domains: " + all_unique_domain_ids.length );
255         }
256         if ( verbose ) {
257             System.out.print( "resampling " );
258         }
259         final Random generator = new Random( random_seed );
260         for( int r = 0; r < number_of_resamplings; ++r ) {
261             if ( verbose ) {
262                 System.out.print( " " + r );
263             }
264             final SortedSet<String> domain_ids_to_ignore = randomlyPickDomainIds( all_unique_domain_ids,
265                                                                                   jacknife_ratio,
266                                                                                   generator );
267             final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
268             final BasicSymmetricalDistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
269             for( int i = 0; i < number_of_genomes; ++i ) {
270                 final String species_i = species[ i ].getSpeciesId();
271                 shared_domains_based_distances.setIdentifier( i, species_i );
272                 shared_binary_combinations_based_distances.setIdentifier( i, species_i );
273                 for( int j = 0; j < i; ++j ) {
274                     final List<GenomeWideCombinableDomains> genome_pair = new ArrayList<GenomeWideCombinableDomains>( 2 );
275                     genome_pair.add( list_of_genome_wide_combinable_domains.get( i ) );
276                     genome_pair.add( list_of_genome_wide_combinable_domains.get( j ) );
277                     final DomainArchitectureBasedGenomeSimilarityCalculator genome_simiarity_calculator = new DomainArchitectureBasedGenomeSimilarityCalculator( list_of_genome_wide_combinable_domains
278                                                                                                                                                                  .get( i ),
279                                                                                                                                                                  list_of_genome_wide_combinable_domains
280                                                                                                                                                                  .get( j ) );
281                     genome_simiarity_calculator.setAllowDomainsToBeIgnored( true );
282                     genome_simiarity_calculator.setDomainIdsToIgnore( domain_ids_to_ignore );
283                     shared_domains_based_distances.setValue( i, j, 1.0 - genome_simiarity_calculator
284                                                              .calculateSharedDomainsBasedGenomeSimilarityScore() );
285                     shared_binary_combinations_based_distances.setValue( i, j, 1.0 - genome_simiarity_calculator
286                                                                          .calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore() );
287                 }
288             }
289             getSharedDomainsBasedDistances().add( shared_domains_based_distances );
290             getSharedBinaryCombinationsBasedDistances().add( shared_binary_combinations_based_distances );
291         }
292         if ( verbose ) {
293             System.out.println();
294         }
295     }
296
297     private void init() {
298         _domain_distance_scores_means = new ArrayList<DistanceMatrix>();
299         _shared_domains_based_distances = new ArrayList<DistanceMatrix>();
300         _shared_binary_combinations_based_distances = new ArrayList<DistanceMatrix>();
301     }
302
303     static private String[] getAllUniqueDomainIdAsArray( final List<GenomeWideCombinableDomains> list_of_genome_wide_combinable_domains ) {
304         String[] all_domain_ids_array;
305         final SortedSet<String> all_domain_ids = new TreeSet<String>();
306         for( final GenomeWideCombinableDomains genome_wide_combinable_domains : list_of_genome_wide_combinable_domains ) {
307             final SortedSet<String> all_domains = genome_wide_combinable_domains.getAllDomainIds();
308             for( final String domain : all_domains ) {
309                 all_domain_ids.add( domain );
310             }
311         }
312         all_domain_ids_array = new String[ all_domain_ids.size() ];
313         int n = 0;
314         for( final String domain_id : all_domain_ids ) {
315             all_domain_ids_array[ n++ ] = domain_id;
316         }
317         return all_domain_ids_array;
318     }
319
320     static private SortedSet<String> randomlyPickDomainIds( final String[] all_domain_ids_array,
321                                                             final double jacknife_ratio,
322                                                             final Random generator ) {
323         final int size = all_domain_ids_array.length;
324         final SortedSet<String> random_domain_ids = new TreeSet<String>();
325         final int number_of_ids_pick = ForesterUtil.roundToInt( jacknife_ratio * size );
326         while ( random_domain_ids.size() < number_of_ids_pick ) {
327             final int r = generator.nextInt( size );
328             random_domain_ids.add( all_domain_ids_array[ r ] );
329         }
330         return random_domain_ids;
331     }
332 }