3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: www.phylosoft.org/forester
27 package org.forester.surfacing;
29 import java.io.BufferedWriter;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.Writer;
34 import java.util.ArrayList;
35 import java.util.List;
37 import java.util.Random;
38 import java.util.SortedSet;
39 import java.util.TreeSet;
41 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
42 import org.forester.evoinference.matrix.distance.DistanceMatrix;
43 import org.forester.go.GoId;
44 import org.forester.go.GoNameSpace;
45 import org.forester.go.GoTerm;
46 import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
47 import org.forester.util.DescriptiveStatistics;
48 import org.forester.util.ForesterUtil;
50 public class PairwiseGenomeComparator {
52 private List<DistanceMatrix> _domain_distance_scores_means;
53 private List<DistanceMatrix> _shared_domains_based_distances;
54 private List<DistanceMatrix> _shared_binary_combinations_based_distances;
56 //private List<HistogramData> _histogram_datas;
57 public PairwiseGenomeComparator() {
61 public List<DistanceMatrix> getDomainDistanceScoresMeans() {
62 return _domain_distance_scores_means;
65 //public List<HistogramData> getHistogramDatas() {
66 // return _histogram_datas;
68 public List<DistanceMatrix> getSharedBinaryCombinationsBasedDistances() {
69 return _shared_binary_combinations_based_distances;
72 public List<DistanceMatrix> getSharedDomainsBasedDistances() {
73 return _shared_domains_based_distances;
77 //_histogram_datas = new ArrayList<HistogramData>();
78 _domain_distance_scores_means = new ArrayList<DistanceMatrix>();
79 _shared_domains_based_distances = new ArrayList<DistanceMatrix>();
80 _shared_binary_combinations_based_distances = new ArrayList<DistanceMatrix>();
83 public void performPairwiseComparisons( final StringBuilder html_desc,
84 final boolean sort_by_species_count_first,
85 final Detailedness detailedness,
86 final boolean ignore_domains_without_combs_in_all_spec,
87 final boolean ignore_domains_specific_to_one_species,
88 final DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field,
89 final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option,
90 final DomainSimilarity.DomainSimilarityScoring scoring,
91 final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
92 final Map<GoId, GoTerm> go_id_to_term_map,
93 final GoNameSpace go_namespace_limit,
94 final Species[] species,
95 final int number_of_genomes,
96 final List<GenomeWideCombinableDomains> list_of_genome_wide_combinable_domains,
97 final PairwiseDomainSimilarityCalculator pw_calc,
98 final String automated_pairwise_comparison_suffix,
99 final boolean verbose,
100 final String automated_pairwise_comparison_prefix,
101 final String command_line_prg_name,
102 final boolean display_histograms,
104 final boolean write_pairwise_comparisons ) {
106 final BasicSymmetricalDistanceMatrix domain_distance_scores_means = new BasicSymmetricalDistanceMatrix( number_of_genomes );
107 final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
108 final BasicSymmetricalDistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
110 System.out.println();
111 System.out.println( "Pairwise genome distances:" );
112 System.out.print( "[species-i - species-j:" );
113 System.out.print( " mean-score-based" );
114 System.out.print( " (sd)" );
115 System.out.print( " [N]" );
116 System.out.print( " | shared-domains-based" );
117 System.out.println( " | shared-binary-combinations-based]" );
118 System.out.println();
120 for( int i = 0; i < number_of_genomes; ++i ) {
121 final String species_i = species[ i ].getSpeciesId();
122 domain_distance_scores_means.setIdentifier( i, species_i );
123 shared_domains_based_distances.setIdentifier( i, species_i );
124 shared_binary_combinations_based_distances.setIdentifier( i, species_i );
126 System.out.println( ( i + 1 ) + "/" + number_of_genomes );
128 for( int j = 0; j < i; ++j ) {
129 if ( ( list_of_genome_wide_combinable_domains.get( i ).getSize() < 1 )
130 || ( list_of_genome_wide_combinable_domains.get( j ).getSize() < 1 ) ) {
131 domain_distance_scores_means
132 .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE );
133 shared_domains_based_distances
134 .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE );
135 shared_binary_combinations_based_distances
136 .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE );
139 final List<GenomeWideCombinableDomains> genome_pair = new ArrayList<GenomeWideCombinableDomains>( 2 );
140 genome_pair.add( list_of_genome_wide_combinable_domains.get( i ) );
141 genome_pair.add( list_of_genome_wide_combinable_domains.get( j ) );
142 DomainSimilarityCalculator.GoAnnotationOutput go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.NONE;
143 if ( domain_id_to_go_ids_map != null ) {
144 go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.ALL;
146 final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field,
147 sort_by_species_count_first,
149 final SortedSet<DomainSimilarity> similarities = calc
150 .calculateSimilarities( pw_calc,
152 ignore_domains_without_combs_in_all_spec,
153 ignore_domains_specific_to_one_species );
154 SurfacingUtil.decoratePrintableDomainSimilarities( similarities,
156 go_annotation_output,
158 go_namespace_limit );
159 final DescriptiveStatistics stats = SurfacingUtil
160 .calculateDescriptiveStatisticsForMeanValues( similarities );
161 final String species_j = species[ j ].getSpeciesId();
162 final DomainArchitectureBasedGenomeSimilarityCalculator genome_similarity_calculator = new DomainArchitectureBasedGenomeSimilarityCalculator( list_of_genome_wide_combinable_domains
164 list_of_genome_wide_combinable_domains
166 genome_similarity_calculator.setAllowDomainsToBeIgnored( false );
167 // TODO make histos for these 5 values
168 double dissimilarity_score_mean;
169 if ( stats.getN() < 1 ) {
170 // No domains in common
171 dissimilarity_score_mean = 1.0;
174 dissimilarity_score_mean = 1.0 - stats.arithmeticMean();
176 final double shared_domains_based_genome_distance = 1.0 - genome_similarity_calculator
177 .calculateSharedDomainsBasedGenomeSimilarityScore();
178 final double shared_binary_combinations_based_genome_distance = 1.0 - genome_similarity_calculator
179 .calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore();
180 domain_distance_scores_means.setValue( i, j, dissimilarity_score_mean );
181 shared_domains_based_distances.setValue( i, j, shared_domains_based_genome_distance );
182 shared_binary_combinations_based_distances.setValue( i,
184 shared_binary_combinations_based_genome_distance );
186 System.out.print( species_i + "-" );
187 System.out.print( species_j + ": " );
188 System.out.print( ForesterUtil.round( dissimilarity_score_mean, 2 ) );
189 if ( stats.getN() > 1 ) {
190 System.out.print( " (" + ForesterUtil.round( stats.sampleStandardDeviation(), 2 ) + ")" );
193 System.out.print( " (n/a)" );
195 System.out.print( " [" + stats.getN() + "]" );
196 System.out.print( " | " );
197 System.out.print( ForesterUtil.round( shared_domains_based_genome_distance, 2 ) );
198 System.out.print( " | " );
199 System.out.println( ForesterUtil.round( shared_binary_combinations_based_genome_distance, 2 ) );
201 String pairwise_similarities_output_file_str = automated_pairwise_comparison_prefix + species_i + "_"
202 + species_j + automated_pairwise_comparison_suffix;
203 switch ( domain_similarity_print_option ) {
205 if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) {
206 pairwise_similarities_output_file_str += ".html";
210 DescriptiveStatistics pw_stats = null;
211 if ( write_pairwise_comparisons ) {
213 final Writer writer = new BufferedWriter( new FileWriter( out_dir == null ? pairwise_similarities_output_file_str
214 : out_dir + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) );
215 pw_stats = SurfacingUtil.writeDomainSimilaritiesToFile( html_desc,
216 new StringBuilder( species_i + "-"
222 domain_similarity_print_option,
223 domain_similarity_sort_field,
227 catch ( final IOException e ) {
228 ForesterUtil.fatalError( command_line_prg_name, "Failed to write similarites to: \""
229 + pairwise_similarities_output_file_str + "\" [" + e.getMessage() + "]" );
232 // pairwise_matrix.setValue( i, j, cdc_list.get( cdc_list.size()
234 if ( pw_stats != null ) {
235 if ( pw_stats.getMin() >= pw_stats.getMax() ) {
237 .printWarningMessage( command_line_prg_name, "for [" + species_i + "-" + species_j
238 + "] score minimum is [" + pw_stats.getMin() + "] while score maximum is ["
240 + "], possibly indicating that a genome is compared to itself" );
242 if ( display_histograms && ( pw_stats.getMin() < pw_stats.getMax() ) ) {
243 //final double[] values = pw_stats.getDataAsDoubleArray();
244 // List<HistogramDataItem> data_items = new
245 // ArrayList<HistogramDataItem>( values.length );
246 // for( int n = 0; n < values.length; i++ ) {
247 // data_items.add( new BasicHistogramDataItem( "", values[ n ] )
250 //~ _histogram_datas.add( new HistogramData( species_i + "-" + species_j, values, null, 20 ) );
255 getDomainDistanceScoresMeans().add( domain_distance_scores_means );
256 getSharedDomainsBasedDistances().add( shared_domains_based_distances );
257 getSharedBinaryCombinationsBasedDistances().add( shared_binary_combinations_based_distances );
259 System.out.println();
263 public void performPairwiseComparisonsJacknifed( final Species[] species,
264 final int number_of_genomes,
265 final List<GenomeWideCombinableDomains> list_of_genome_wide_combinable_domains,
266 final boolean verbose,
267 final int number_of_resamplings,
268 final double jacknife_ratio,
269 final long random_seed ) {
271 if ( number_of_resamplings < 2 ) {
272 throw new IllegalArgumentException( "attempt to perform jacknife resampling with less than 2 resamplings" );
274 if ( jacknife_ratio <= 0.0 ) {
275 throw new IllegalArgumentException( "attempt to perform jacknife resampling with jacknife ratio of 0.0 or less" );
277 else if ( jacknife_ratio >= 1.0 ) {
278 throw new IllegalArgumentException( "attempt to perform jacknife resampling with jacknife ratio 1.0 or more" );
280 final DomainId[] all_unique_domain_ids = getAllUniqueDomainIdAsArray( list_of_genome_wide_combinable_domains );
282 System.out.println();
283 System.out.println( "Jacknife: total of domains: " + all_unique_domain_ids.length );
286 System.out.print( "resampling " );
288 final Random generator = new Random( random_seed );
289 for( int r = 0; r < number_of_resamplings; ++r ) {
291 System.out.print( " " + r );
293 final SortedSet<DomainId> domain_ids_to_ignore = randomlyPickDomainIds( all_unique_domain_ids,
296 final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
297 final BasicSymmetricalDistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes );
298 for( int i = 0; i < number_of_genomes; ++i ) {
299 final String species_i = species[ i ].getSpeciesId();
300 shared_domains_based_distances.setIdentifier( i, species_i );
301 shared_binary_combinations_based_distances.setIdentifier( i, species_i );
302 for( int j = 0; j < i; ++j ) {
303 final List<GenomeWideCombinableDomains> genome_pair = new ArrayList<GenomeWideCombinableDomains>( 2 );
304 genome_pair.add( list_of_genome_wide_combinable_domains.get( i ) );
305 genome_pair.add( list_of_genome_wide_combinable_domains.get( j ) );
306 final DomainArchitectureBasedGenomeSimilarityCalculator genome_simiarity_calculator = new DomainArchitectureBasedGenomeSimilarityCalculator( list_of_genome_wide_combinable_domains
308 list_of_genome_wide_combinable_domains
310 genome_simiarity_calculator.setAllowDomainsToBeIgnored( true );
311 genome_simiarity_calculator.setDomainIdsToIgnore( domain_ids_to_ignore );
312 shared_domains_based_distances.setValue( i, j, 1.0 - genome_simiarity_calculator
313 .calculateSharedDomainsBasedGenomeSimilarityScore() );
314 shared_binary_combinations_based_distances.setValue( i, j, 1.0 - genome_simiarity_calculator
315 .calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore() );
318 getSharedDomainsBasedDistances().add( shared_domains_based_distances );
319 getSharedBinaryCombinationsBasedDistances().add( shared_binary_combinations_based_distances );
322 System.out.println();
326 static private DomainId[] getAllUniqueDomainIdAsArray( final List<GenomeWideCombinableDomains> list_of_genome_wide_combinable_domains ) {
327 DomainId[] all_domain_ids_array;
328 final SortedSet<DomainId> all_domain_ids = new TreeSet<DomainId>();
329 for( final GenomeWideCombinableDomains genome_wide_combinable_domains : list_of_genome_wide_combinable_domains ) {
330 final SortedSet<DomainId> all_domains = genome_wide_combinable_domains.getAllDomainIds();
331 for( final DomainId domain : all_domains ) {
332 all_domain_ids.add( domain );
335 all_domain_ids_array = new DomainId[ all_domain_ids.size() ];
337 for( final DomainId domain_id : all_domain_ids ) {
338 all_domain_ids_array[ n++ ] = domain_id;
340 return all_domain_ids_array;
343 static private SortedSet<DomainId> randomlyPickDomainIds( final DomainId[] all_domain_ids_array,
344 final double jacknife_ratio,
345 final Random generator ) {
346 final int size = all_domain_ids_array.length;
347 final SortedSet<DomainId> random_domain_ids = new TreeSet<DomainId>();
348 final int number_of_ids_pick = ForesterUtil.roundToInt( jacknife_ratio * size );
349 while ( random_domain_ids.size() < number_of_ids_pick ) {
350 final int r = generator.nextInt( size );
351 random_domain_ids.add( all_domain_ids_array[ r ] );
353 return random_domain_ids;