X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsurfacing%2FPairwiseGenomeComparator.java;h=ed4eae73769feba8241f994b8aff130554b6ab49;hb=3d5864a39739960c126f2ab5585162fd52d1f47d;hp=ae94f8174570d9840abe7d9a46b31e7c577fe5af;hpb=48f7a89be9d34f1930a1f863e608235cc27184c5;p=jalview.git diff --git a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java index ae94f81..ed4eae7 100644 --- a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java +++ b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java @@ -6,7 +6,7 @@ // Copyright (C) 2008-2009 Christian M. Zmasek // Copyright (C) 2008-2009 Burnham Institute for Medical Research // All rights reserved -// +// // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either @@ -16,13 +16,13 @@ // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. -// +// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.surfacing; @@ -43,6 +43,8 @@ import org.forester.evoinference.matrix.distance.DistanceMatrix; import org.forester.go.GoId; import org.forester.go.GoNameSpace; import org.forester.go.GoTerm; +import org.forester.phylogeny.Phylogeny; +import org.forester.species.Species; import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterUtil; @@ -50,10 +52,9 @@ import org.forester.util.ForesterUtil; public class PairwiseGenomeComparator { private List _domain_distance_scores_means; - private List _shared_domains_based_distances; private List _shared_binary_combinations_based_distances; + private List _shared_domains_based_distances; - //private List _histogram_datas; public PairwiseGenomeComparator() { init(); } @@ -62,9 +63,6 @@ public class PairwiseGenomeComparator { return _domain_distance_scores_means; } - //public List getHistogramDatas() { - // return _histogram_datas; - //} public List getSharedBinaryCombinationsBasedDistances() { return _shared_binary_combinations_based_distances; } @@ -73,22 +71,15 @@ public class PairwiseGenomeComparator { return _shared_domains_based_distances; } - private void init() { - //_histogram_datas = new ArrayList(); - _domain_distance_scores_means = new ArrayList(); - _shared_domains_based_distances = new ArrayList(); - _shared_binary_combinations_based_distances = new ArrayList(); - } - public void performPairwiseComparisons( final StringBuilder html_desc, final boolean sort_by_species_count_first, final Detailedness detailedness, final boolean ignore_domains_without_combs_in_all_spec, final boolean ignore_domains_specific_to_one_species, final DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field, - final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, + final DomainSimilarity.PRINT_OPTION domain_similarity_print_option, final DomainSimilarity.DomainSimilarityScoring scoring, - final Map> domain_id_to_go_ids_map, + final Map> domain_id_to_go_ids_map, final Map go_id_to_term_map, final GoNameSpace go_namespace_limit, final Species[] species, @@ -99,13 +90,15 @@ public class PairwiseGenomeComparator { final boolean verbose, final String automated_pairwise_comparison_prefix, final String command_line_prg_name, - final boolean display_histograms, final File out_dir, - final boolean write_pairwise_comparisons ) { + final boolean write_pairwise_comparisons, + final Map tax_code_to_id_map, + final boolean calc_similarity_scores, + final Phylogeny phy ) { init(); - final BasicSymmetricalDistanceMatrix domain_distance_scores_means = new BasicSymmetricalDistanceMatrix( number_of_genomes ); - final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); - final BasicSymmetricalDistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); + final DistanceMatrix domain_distance_scores_means = new BasicSymmetricalDistanceMatrix( number_of_genomes ); + final DistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); + final DistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); if ( verbose ) { System.out.println(); System.out.println( "Pairwise genome distances:" ); @@ -129,11 +122,11 @@ public class PairwiseGenomeComparator { if ( ( list_of_genome_wide_combinable_domains.get( i ).getSize() < 1 ) || ( list_of_genome_wide_combinable_domains.get( j ).getSize() < 1 ) ) { domain_distance_scores_means - .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE ); + .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE ); shared_domains_based_distances - .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE ); + .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE ); shared_binary_combinations_based_distances - .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE ); + .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE ); continue; } final List genome_pair = new ArrayList( 2 ); @@ -145,26 +138,23 @@ public class PairwiseGenomeComparator { } final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field, sort_by_species_count_first, + true, + calc_similarity_scores, true ); final SortedSet similarities = calc .calculateSimilarities( pw_calc, genome_pair, ignore_domains_without_combs_in_all_spec, ignore_domains_specific_to_one_species ); - SurfacingUtil.decoratePrintableDomainSimilarities( similarities, - detailedness, - go_annotation_output, - go_id_to_term_map, - go_namespace_limit ); + SurfacingUtil.decoratePrintableDomainSimilarities( similarities, detailedness ); final DescriptiveStatistics stats = SurfacingUtil .calculateDescriptiveStatisticsForMeanValues( similarities ); final String species_j = species[ j ].getSpeciesId(); final DomainArchitectureBasedGenomeSimilarityCalculator genome_similarity_calculator = new DomainArchitectureBasedGenomeSimilarityCalculator( list_of_genome_wide_combinable_domains - .get( i ), + .get( i ), list_of_genome_wide_combinable_domains - .get( j ) ); + .get( j ) ); genome_similarity_calculator.setAllowDomainsToBeIgnored( false ); - // TODO make histos for these 5 values double dissimilarity_score_mean; if ( stats.getN() < 1 ) { // No domains in common @@ -207,47 +197,30 @@ public class PairwiseGenomeComparator { } break; } - DescriptiveStatistics pw_stats = null; if ( write_pairwise_comparisons ) { try { final Writer writer = new BufferedWriter( new FileWriter( out_dir == null ? pairwise_similarities_output_file_str : out_dir + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); - pw_stats = SurfacingUtil.writeDomainSimilaritiesToFile( html_desc, - new StringBuilder( species_i + "-" - + species_j ), - writer, - similarities, - true, - null, - domain_similarity_print_option, - domain_similarity_sort_field, - scoring, - false ); + SurfacingUtil.writeDomainSimilaritiesToFile( html_desc, + new StringBuilder( species_i + "-" + species_j ), + null, + writer, + null, + similarities, + true, + null, + domain_similarity_print_option, + scoring, + false, + tax_code_to_id_map, + phy, + null ); } catch ( final IOException e ) { ForesterUtil.fatalError( command_line_prg_name, "Failed to write similarites to: \"" + pairwise_similarities_output_file_str + "\" [" + e.getMessage() + "]" ); } } - // pairwise_matrix.setValue( i, j, cdc_list.get( cdc_list.size() - // - 1 ) ); - if ( pw_stats != null ) { - if ( pw_stats.getMin() >= pw_stats.getMax() ) { - ForesterUtil.printWarningMessage( command_line_prg_name, "for [" + species_i + "-" + species_j - + "] score minimum is [" + pw_stats.getMin() + "] while score maximum is [" - + pw_stats.getMax() + "], possibly indicating that a genome is compared to itself" ); - } - if ( display_histograms && ( pw_stats.getMin() < pw_stats.getMax() ) ) { - //final double[] values = pw_stats.getDataAsDoubleArray(); - // List data_items = new - // ArrayList( values.length ); - // for( int n = 0; n < values.length; i++ ) { - // data_items.add( new BasicHistogramDataItem( "", values[ n ] ) - // ); - // } - //~ _histogram_datas.add( new HistogramData( species_i + "-" + species_j, values, null, 20 ) ); - } - } } } getDomainDistanceScoresMeans().add( domain_distance_scores_means ); @@ -275,7 +248,7 @@ public class PairwiseGenomeComparator { else if ( jacknife_ratio >= 1.0 ) { throw new IllegalArgumentException( "attempt to perform jacknife resampling with jacknife ratio 1.0 or more" ); } - final DomainId[] all_unique_domain_ids = getAllUniqueDomainIdAsArray( list_of_genome_wide_combinable_domains ); + final String[] all_unique_domain_ids = getAllUniqueDomainIdAsArray( list_of_genome_wide_combinable_domains ); if ( verbose ) { System.out.println(); System.out.println( "Jacknife: total of domains: " + all_unique_domain_ids.length ); @@ -288,11 +261,11 @@ public class PairwiseGenomeComparator { if ( verbose ) { System.out.print( " " + r ); } - final SortedSet domain_ids_to_ignore = randomlyPickDomainIds( all_unique_domain_ids, - jacknife_ratio, - generator ); - final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); - final BasicSymmetricalDistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); + final SortedSet domain_ids_to_ignore = randomlyPickDomainIds( all_unique_domain_ids, + jacknife_ratio, + generator ); + final DistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); + final DistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); for( int i = 0; i < number_of_genomes; ++i ) { final String species_i = species[ i ].getSpeciesId(); shared_domains_based_distances.setIdentifier( i, species_i ); @@ -302,15 +275,15 @@ public class PairwiseGenomeComparator { genome_pair.add( list_of_genome_wide_combinable_domains.get( i ) ); genome_pair.add( list_of_genome_wide_combinable_domains.get( j ) ); final DomainArchitectureBasedGenomeSimilarityCalculator genome_simiarity_calculator = new DomainArchitectureBasedGenomeSimilarityCalculator( list_of_genome_wide_combinable_domains - .get( i ), + .get( i ), list_of_genome_wide_combinable_domains - .get( j ) ); + .get( j ) ); genome_simiarity_calculator.setAllowDomainsToBeIgnored( true ); genome_simiarity_calculator.setDomainIdsToIgnore( domain_ids_to_ignore ); shared_domains_based_distances.setValue( i, j, 1.0 - genome_simiarity_calculator - .calculateSharedDomainsBasedGenomeSimilarityScore() ); + .calculateSharedDomainsBasedGenomeSimilarityScore() ); shared_binary_combinations_based_distances.setValue( i, j, 1.0 - genome_simiarity_calculator - .calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore() ); + .calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore() ); } } getSharedDomainsBasedDistances().add( shared_domains_based_distances ); @@ -321,28 +294,34 @@ public class PairwiseGenomeComparator { } } - static private DomainId[] getAllUniqueDomainIdAsArray( final List list_of_genome_wide_combinable_domains ) { - DomainId[] all_domain_ids_array; - final SortedSet all_domain_ids = new TreeSet(); + private void init() { + _domain_distance_scores_means = new ArrayList(); + _shared_domains_based_distances = new ArrayList(); + _shared_binary_combinations_based_distances = new ArrayList(); + } + + static private String[] getAllUniqueDomainIdAsArray( final List list_of_genome_wide_combinable_domains ) { + String[] all_domain_ids_array; + final SortedSet all_domain_ids = new TreeSet(); for( final GenomeWideCombinableDomains genome_wide_combinable_domains : list_of_genome_wide_combinable_domains ) { - final SortedSet all_domains = genome_wide_combinable_domains.getAllDomainIds(); - for( final DomainId domain : all_domains ) { + final SortedSet all_domains = genome_wide_combinable_domains.getAllDomainIds(); + for( final String domain : all_domains ) { all_domain_ids.add( domain ); } } - all_domain_ids_array = new DomainId[ all_domain_ids.size() ]; + all_domain_ids_array = new String[ all_domain_ids.size() ]; int n = 0; - for( final DomainId domain_id : all_domain_ids ) { + for( final String domain_id : all_domain_ids ) { all_domain_ids_array[ n++ ] = domain_id; } return all_domain_ids_array; } - static private SortedSet randomlyPickDomainIds( final DomainId[] all_domain_ids_array, - final double jacknife_ratio, - final Random generator ) { + static private SortedSet randomlyPickDomainIds( final String[] all_domain_ids_array, + final double jacknife_ratio, + final Random generator ) { final int size = all_domain_ids_array.length; - final SortedSet random_domain_ids = new TreeSet(); + final SortedSet random_domain_ids = new TreeSet(); final int number_of_ids_pick = ForesterUtil.roundToInt( jacknife_ratio * size ); while ( random_domain_ids.size() < number_of_ids_pick ) { final int r = generator.nextInt( size );