From 1cc7085032221eb6dea3ceeacc34ed710cf18318 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Fri, 12 Jul 2013 23:53:00 +0000 Subject: [PATCH] inprogress --- .../src/org/forester/application/surfacing.java | 39 +++--- .../surfacing/BasicDomainSimilarityCalculator.java | 142 ++++++++++++-------- .../CountsBasedPairwiseDomainSimilarity.java | 15 +-- ...ainCountsBasedPairwiseSimilarityCalculator.java | 10 +- .../surfacing/PairwiseGenomeComparator.java | 45 +++---- .../surfacing/PrintableDomainSimilarity.java | 74 +++++++--- ...ntsBasedPairwiseDomainSimilarityCalculator.java | 11 +- .../org/forester/surfacing/SurfacingConstants.java | 23 ++-- .../src/org/forester/surfacing/SurfacingUtil.java | 44 ++---- .../src/org/forester/surfacing/TestSurfacing.java | 18 ++- 10 files changed, 231 insertions(+), 190 deletions(-) diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index 71020c5..6f22a3a 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -225,8 +225,8 @@ public class surfacing { final static private String INPUT_GENOMES_FILE_OPTION = "genomes"; final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; final static private String SEQ_EXTRACT_OPTION = "prot_extract"; - final static private String PRG_VERSION = "2.300"; - final static private String PRG_DATE = "130711"; + final static private String PRG_VERSION = "2.301"; + final static private String PRG_DATE = "130712"; final static private String E_MAIL = "czmasek@burnham.org"; final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing"; final static private boolean IGNORE_DUFS_DEFAULT = true; @@ -278,6 +278,7 @@ public class surfacing { public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_MAPPED.txt"; public static final 
String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt"; + private static final boolean CALC_SIMILARITY_SCORES = false; private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, final String[][] input_file_properties, @@ -1631,7 +1632,7 @@ public class surfacing { all_bin_domain_combinations_gained_fitch = new ArrayList(); all_bin_domain_combinations_lost_fitch = new ArrayList(); } - final DomainLengthsTable domain_lengths_table = new DomainLengthsTable(); + DomainLengthsTable domain_lengths_table = new DomainLengthsTable(); final File per_genome_domain_promiscuity_statistics_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + D_PROMISCUITY_FILE_SUFFIX ); BufferedWriter per_genome_domain_promiscuity_statistics_writer = null; @@ -2009,12 +2010,13 @@ public class surfacing { ForesterUtil.programMessage( PRG_NAME, "Wrote domain length data to: " + domain_lengths_analysis_outfile ); System.out.println(); } + domain_lengths_table = null; final long analysis_start_time = new Date().getTime(); PairwiseDomainSimilarityCalculator pw_calc = null; - // double[] values_for_all_scores_histogram = null; final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field, sort_by_species_count_first, - number_of_genomes == 2 ); + number_of_genomes == 2, + CALC_SIMILARITY_SCORES ); switch ( scoring ) { case COMBINATIONS: pw_calc = new CombinationsBasedPairwiseDomainSimilarityCalculator(); @@ -2069,19 +2071,17 @@ public class surfacing { + new java.text.SimpleDateFormat( "yyyy.MM.dd HH:mm:ss" ).format( new java.util.Date() ) + "" + nl ); html_desc.append( "" + nl ); - final DescriptiveStatistics 
pw_stats = SurfacingUtil - .writeDomainSimilaritiesToFile( html_desc, - new StringBuilder( number_of_genomes + " genomes" ), - writer, - split_writers, - similarities, - number_of_genomes == 2, - species_order, - domain_similarity_print_option, - scoring, - true, - tax_code_to_id_map, - false ); + SurfacingUtil.writeDomainSimilaritiesToFile( html_desc, + new StringBuilder( number_of_genomes + " genomes" ), + writer, + split_writers, + similarities, + number_of_genomes == 2, + species_order, + domain_similarity_print_option, + scoring, + true, + tax_code_to_id_map ); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote main output (includes domain similarities) to: \"" + ( out_dir == null ? my_outfile : out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile ) + "\"" ); } @@ -2118,7 +2118,8 @@ public class surfacing { surfacing.PRG_NAME, out_dir, write_pwc_files, - tax_code_to_id_map ); + tax_code_to_id_map, + CALC_SIMILARITY_SCORES ); String matrix_output_file = new String( output_file.toString() ); if ( matrix_output_file.indexOf( '.' ) > 1 ) { matrix_output_file = matrix_output_file.substring( 0, matrix_output_file.indexOf( '.' 
) ); diff --git a/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java index 042b785..81cef33 100644 --- a/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java +++ b/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java @@ -27,6 +27,7 @@ package org.forester.surfacing; +import java.text.DecimalFormat; import java.util.ArrayList; import java.util.List; import java.util.SortedMap; @@ -37,19 +38,27 @@ import java.util.TreeSet; import org.forester.species.Species; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculator { final DomainSimilarity.DomainSimilaritySortField _sort; private final boolean _sort_by_species_count_first; private final boolean _treat_as_binary_comparison; + private final boolean _calc_similarity_score; public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort, final boolean sort_by_species_count_first, - final boolean treat_as_binary_comparison ) { + final boolean treat_as_binary_comparison, + final boolean calc_similarity_score ) { _sort = sort; _sort_by_species_count_first = sort_by_species_count_first; _treat_as_binary_comparison = treat_as_binary_comparison; + _calc_similarity_score = calc_similarity_score; + } + + public boolean isCalcSimilarityScore() { + return _calc_similarity_score; } @Override @@ -65,7 +74,12 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat for( final GenomeWideCombinableDomains cdc : cdc_list ) { keys.addAll( ( cdc ).getAllCombinableDomainsIds().keySet() ); } + final DecimalFormat pf = new java.text.DecimalFormat( "000000" ); + int counter = 1; + System.out.println( keys.size() ); for( final String key : keys ) { + 
ForesterUtil.updateProgress( counter, pf ); + counter++; final List same_id_cd_list = new ArrayList( cdc_list.size() ); final List species_with_key_id_domain = new ArrayList(); for( final GenomeWideCombinableDomains cdc : cdc_list ) { @@ -86,9 +100,6 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat continue; } } - // BIG CHANGE IN LOGIC: Tuesday July 08, 0;55 - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // OLD: if ( same_id_cd_list.size() > 1 ) { if ( same_id_cd_list.size() > 0 ) { if ( !ignore_domains_specific_to_one_genome || ( same_id_cd_list.size() > 1 ) ) { final DomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list ); @@ -100,48 +111,47 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat } } } - // ~~~ NEW: else { throw new RuntimeException( "this should not have happened" ); } - // ~~~ OLD: - // else if ( same_id_cd_list.size() == 1 ) { - // TODO need to go in file - // System.out.println( "only in one species [" + - // species_with_key_id_domain.get( 0 ) + "]: " + key_id ); - //} - //else { - // throw new RuntimeException( "this should not have happened" ); - // } } + System.out.println(); return similarities; } private DomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator, final List domains_list ) { if ( domains_list.size() == 1 ) { - // BIG CHANGE IN LOGIC: Tuesday July 08, 0;55 - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // ~~~OLD: - //throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinable domains" ); - // ~~~new: final SortedMap species_data = new TreeMap(); species_data.put( domains_list.get( 0 ).getSpecies(), createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) ); - return new PrintableDomainSimilarity( domains_list.get( 0 ), - 1.0, - 1.0, - 1.0, - 1.0, - 0.0, - 0, - 0, - 0, - species_data, - isSortBySpeciesCountFirst(), 
- isTreatAsBinaryComparison() ); + if ( !isCalcSimilarityScore() ) { + return new PrintableDomainSimilarity( domains_list.get( 0 ), + 0, + 0, + species_data, + isSortBySpeciesCountFirst(), + isTreatAsBinaryComparison() ); + } + else { + return new PrintableDomainSimilarity( domains_list.get( 0 ), + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + 0, + 0, + 0, + species_data, + isSortBySpeciesCountFirst(), + isTreatAsBinaryComparison() ); + } + } + DescriptiveStatistics stat = null; + if ( isCalcSimilarityScore() ) { + stat = new BasicDescriptiveStatistics(); } - final DescriptiveStatistics stat = new BasicDescriptiveStatistics(); final SortedMap species_data = new TreeMap(); species_data.put( domains_list.get( 0 ).getSpecies(), createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) ); @@ -170,30 +180,28 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat if ( Math.abs( difference ) > Math.abs( max_difference ) ) { max_difference = difference; } - stat.addValue( pairwise_similarity.getSimilarityScore() ); + if ( isCalcSimilarityScore() ) { + stat.addValue( pairwise_similarity.getSimilarityScore() ); + } } } - if ( stat.getN() < 1 ) { - throw new AssertionError( "empty descriptive statistics: this should not have happened" ); - } - if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) { - throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" ); + if ( isCalcSimilarityScore() ) { + if ( stat.getN() < 1 ) { + throw new RuntimeException( "empty descriptive statistics: this should not have happened" ); + } + if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) { + throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" ); + } } - if ( ( /*stat.getN() != 1 ||*/!isTreatAsBinaryComparison() ) && ( max_difference_in_counts < 0 ) ) { + if ( !isTreatAsBinaryComparison() && ( max_difference_in_counts < 0 ) ) { 
max_difference_in_counts = Math.abs( max_difference_in_counts ); if ( !is_domain_combination_based ) { - max_difference = Math.abs( max_difference ); //=max_difference_in_counts for !is_domain_combination_based. + max_difference = Math.abs( max_difference ); } } DomainSimilarity similarity = null; - if ( stat.getN() == 1 ) { + if ( !isCalcSimilarityScore() ) { similarity = new PrintableDomainSimilarity( domains_list.get( 0 ), - stat.getMin(), - stat.getMax(), - stat.arithmeticMean(), - stat.median(), - 0.0, - stat.getN(), max_difference_in_counts, max_difference, species_data, @@ -201,18 +209,34 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat isTreatAsBinaryComparison() ); } else { - similarity = new PrintableDomainSimilarity( domains_list.get( 0 ), - stat.getMin(), - stat.getMax(), - stat.arithmeticMean(), - stat.median(), - stat.sampleStandardDeviation(), - stat.getN(), - max_difference_in_counts, - max_difference, - species_data, - isSortBySpeciesCountFirst(), - isTreatAsBinaryComparison() ); + if ( stat.getN() == 1 ) { + similarity = new PrintableDomainSimilarity( domains_list.get( 0 ), + stat.getMin(), + stat.getMax(), + stat.arithmeticMean(), + stat.median(), + 0.0, + stat.getN(), + max_difference_in_counts, + max_difference, + species_data, + isSortBySpeciesCountFirst(), + isTreatAsBinaryComparison() ); + } + else { + similarity = new PrintableDomainSimilarity( domains_list.get( 0 ), + stat.getMin(), + stat.getMax(), + stat.arithmeticMean(), + stat.median(), + stat.sampleStandardDeviation(), + stat.getN(), + max_difference_in_counts, + max_difference, + species_data, + isSortBySpeciesCountFirst(), + isTreatAsBinaryComparison() ); + } } return similarity; } diff --git a/forester/java/src/org/forester/surfacing/CountsBasedPairwiseDomainSimilarity.java b/forester/java/src/org/forester/surfacing/CountsBasedPairwiseDomainSimilarity.java index 81fdce7..8439f4c 100644 --- 
a/forester/java/src/org/forester/surfacing/CountsBasedPairwiseDomainSimilarity.java +++ b/forester/java/src/org/forester/surfacing/CountsBasedPairwiseDomainSimilarity.java @@ -29,8 +29,8 @@ package org.forester.surfacing; public class CountsBasedPairwiseDomainSimilarity implements PairwiseDomainSimilarity { - private final double _score; - private final int _copy_number_difference; + private final short _copy_number_difference; + private final short _counts_sum; /** * counts_difference: (counts for domain 1) minus (counts for domain 2). @@ -39,16 +39,15 @@ public class CountsBasedPairwiseDomainSimilarity implements PairwiseDomainSimila * @param counts_difference value of domain_1 minus value of domain_2 * @param counts_sum */ - public CountsBasedPairwiseDomainSimilarity( final int counts_difference, final int counts_sum ) { + public CountsBasedPairwiseDomainSimilarity( final short counts_difference, final short counts_sum ) { if ( counts_sum <= 0 ) { throw new IllegalArgumentException( "attempt to use copy sum of less than or equal to 0: " + counts_sum ); } - _copy_number_difference = counts_difference; - final int abs_copy_number_difference = Math.abs( counts_difference ); - if ( abs_copy_number_difference > counts_sum ) { + if ( Math.abs( counts_difference ) > counts_sum ) { throw new IllegalArgumentException( "attempt to use absolute copy number difference larger than copy number sum" ); } - _score = 1.0 - ( ( double ) abs_copy_number_difference / counts_sum ); + _copy_number_difference = counts_difference; + _counts_sum = counts_sum; } /** @@ -62,6 +61,6 @@ public class CountsBasedPairwiseDomainSimilarity implements PairwiseDomainSimila @Override public double getSimilarityScore() { - return _score; + return ( 1.0 - ( ( double ) Math.abs( _copy_number_difference ) / _counts_sum ) ); } } diff --git a/forester/java/src/org/forester/surfacing/DomainCountsBasedPairwiseSimilarityCalculator.java 
b/forester/java/src/org/forester/surfacing/DomainCountsBasedPairwiseSimilarityCalculator.java index 6077ffc..904b642 100644 --- a/forester/java/src/org/forester/surfacing/DomainCountsBasedPairwiseSimilarityCalculator.java +++ b/forester/java/src/org/forester/surfacing/DomainCountsBasedPairwiseSimilarityCalculator.java @@ -35,8 +35,12 @@ public class DomainCountsBasedPairwiseSimilarityCalculator implements PairwiseDo if ( !domains_1.getKeyDomain().equals( domains_2.getKeyDomain() ) ) { throw new IllegalArgumentException( "attempt to calculate similarity between domain collection with different keys" ); } - final int dc1 = domains_1.getKeyDomainCount(); - final int dc2 = domains_2.getKeyDomainCount(); - return new CountsBasedPairwiseDomainSimilarity( dc1 - dc2, dc1 + dc2 ); + if ( ( domains_1.getKeyDomainCount() > Short.MAX_VALUE ) || ( domains_2.getKeyDomainCount() > Short.MAX_VALUE ) + || ( ( domains_1.getKeyDomainCount() + domains_2.getKeyDomainCount() ) > Short.MAX_VALUE ) ) { + throw new IllegalArgumentException( "too large for short!" 
); + } + final short dc1 = ( short ) domains_1.getKeyDomainCount(); + final short dc2 = ( short ) domains_2.getKeyDomainCount(); + return new CountsBasedPairwiseDomainSimilarity( ( short ) ( dc1 - dc2 ), ( short ) ( dc1 + dc2 ) ); } } diff --git a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java index e9ee8b3..c020d82 100644 --- a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java +++ b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java @@ -54,7 +54,6 @@ public class PairwiseGenomeComparator { private List _shared_domains_based_distances; private List _shared_binary_combinations_based_distances; - //private List _histogram_datas; public PairwiseGenomeComparator() { init(); } @@ -63,9 +62,6 @@ public class PairwiseGenomeComparator { return _domain_distance_scores_means; } - //public List getHistogramDatas() { - // return _histogram_datas; - //} public List getSharedBinaryCombinationsBasedDistances() { return _shared_binary_combinations_based_distances; } @@ -75,7 +71,6 @@ public class PairwiseGenomeComparator { } private void init() { - //_histogram_datas = new ArrayList(); _domain_distance_scores_means = new ArrayList(); _shared_domains_based_distances = new ArrayList(); _shared_binary_combinations_based_distances = new ArrayList(); @@ -102,7 +97,8 @@ public class PairwiseGenomeComparator { final String command_line_prg_name, final File out_dir, final boolean write_pairwise_comparisons, - final Map tax_code_to_id_map ) { + final Map tax_code_to_id_map, + final boolean calc_similarity_scores ) { init(); final BasicSymmetricalDistanceMatrix domain_distance_scores_means = new BasicSymmetricalDistanceMatrix( number_of_genomes ); final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); @@ -146,7 +142,8 @@ public class PairwiseGenomeComparator { } final 
DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field, sort_by_species_count_first, - true ); + true, + calc_similarity_scores ); final SortedSet similarities = calc .calculateSimilarities( pw_calc, genome_pair, @@ -203,39 +200,27 @@ public class PairwiseGenomeComparator { } break; } - DescriptiveStatistics pw_stats = null; if ( write_pairwise_comparisons ) { try { final Writer writer = new BufferedWriter( new FileWriter( out_dir == null ? pairwise_similarities_output_file_str : out_dir + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); - pw_stats = SurfacingUtil.writeDomainSimilaritiesToFile( html_desc, - new StringBuilder( species_i + "-" - + species_j ), - writer, - null, - similarities, - true, - null, - domain_similarity_print_option, - scoring, - false, - tax_code_to_id_map, - false ); + SurfacingUtil.writeDomainSimilaritiesToFile( html_desc, + new StringBuilder( species_i + "-" + species_j ), + writer, + null, + similarities, + true, + null, + domain_similarity_print_option, + scoring, + false, + tax_code_to_id_map ); } catch ( final IOException e ) { ForesterUtil.fatalError( command_line_prg_name, "Failed to write similarites to: \"" + pairwise_similarities_output_file_str + "\" [" + e.getMessage() + "]" ); } } - if ( pw_stats != null ) { - if ( pw_stats.getMin() >= pw_stats.getMax() ) { - ForesterUtil - .printWarningMessage( command_line_prg_name, "for [" + species_i + "-" + species_j - + "] score minimum is [" + pw_stats.getMin() + "] while score maximum is [" - + pw_stats.getMax() - + "], possibly indicating that a genome is compared to itself" ); - } - } } } getDomainDistanceScoresMeans().add( domain_distance_scores_means ); diff --git a/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java b/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java index fc7d8b5..f9a13ef 100644 --- 
a/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java +++ b/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java @@ -42,7 +42,6 @@ public class PrintableDomainSimilarity implements DomainSimilarity { final public static String SPECIES_SEPARATOR = " "; final private static int EQUAL = 0; final private static String NO_SPECIES = " "; - private static final boolean PRINT_MORE_INFO = false; final private double _min; final private double _max; final private double _mean; @@ -115,6 +114,43 @@ public class PrintableDomainSimilarity implements DomainSimilarity { } } + public PrintableDomainSimilarity( final CombinableDomains combinable_domains, + final int max_difference_in_counts, + final int max_difference, + final SortedMap species_data, + final boolean sort_by_species_count_first, + final boolean treat_as_binary_comparison ) { + if ( combinable_domains == null ) { + throw new IllegalArgumentException( "attempt to use null combinable domains" ); + } + if ( species_data == null ) { + throw new IllegalArgumentException( "attempt to use null species data" ); + } + if ( species_data.size() < 1 ) { + throw new IllegalArgumentException( "attempt to use empty species data" ); + } + init(); + _combinable_domains = combinable_domains; + _min = -1; + _max = -1; + _mean = -1; + _sd = -1; + _n = -1; + _max_difference_in_counts = max_difference_in_counts; + _max_difference = max_difference; + _species_data = species_data; + _treat_as_binary_comparison = treat_as_binary_comparison; + final int s = species_data.size(); + if ( s > 2 ) { + if ( getMaximalDifferenceInCounts() < 0 ) { + throw new IllegalArgumentException( "attempt to use negative max difference in counts with more than two species" ); + } + if ( getMaximalDifference() < 0 ) { + throw new IllegalArgumentException( "attempt to use negative max difference with more than two species" ); + } + } + } + private void addSpeciesSpecificDomainData( final StringBuffer sb, final Species 
species, final boolean html, @@ -341,23 +377,25 @@ public class PrintableDomainSimilarity implements DomainSimilarity { sb.append( "gs" ); sb.append( "" ); - sb.append( "" ); - sb.append( ForesterUtil.round( getMeanSimilarityScore(), 3 ) ); - sb.append( "" ); - if ( PRINT_MORE_INFO ) { - if ( !isTreatAsBinaryComparison() ) { - sb.append( "" ); - sb.append( "(" ); - sb.append( ForesterUtil.round( getStandardDeviationOfSimilarityScore(), 3 ) ); - sb.append( ")" ); - sb.append( "" ); - sb.append( "" ); - sb.append( "[" ); - sb.append( ForesterUtil.round( getMinimalSimilarityScore(), 3 ) ); - sb.append( "-" ); - sb.append( ForesterUtil.round( getMaximalSimilarityScore(), 3 ) ); - sb.append( "]" ); - sb.append( "" ); + if ( getMaximalSimilarityScore() > 0 ) { + sb.append( "" ); + sb.append( ForesterUtil.round( getMeanSimilarityScore(), 3 ) ); + sb.append( "" ); + if ( SurfacingConstants.PRINT_MORE_DOM_SIMILARITY_INFO ) { + if ( !isTreatAsBinaryComparison() ) { + sb.append( "" ); + sb.append( "(" ); + sb.append( ForesterUtil.round( getStandardDeviationOfSimilarityScore(), 3 ) ); + sb.append( ")" ); + sb.append( "" ); + sb.append( "" ); + sb.append( "[" ); + sb.append( ForesterUtil.round( getMinimalSimilarityScore(), 3 ) ); + sb.append( "-" ); + sb.append( ForesterUtil.round( getMaximalSimilarityScore(), 3 ) ); + sb.append( "]" ); + sb.append( "" ); + } } } sb.append( "" ); diff --git a/forester/java/src/org/forester/surfacing/ProteinCountsBasedPairwiseDomainSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/ProteinCountsBasedPairwiseDomainSimilarityCalculator.java index 309d321..8c42787 100644 --- a/forester/java/src/org/forester/surfacing/ProteinCountsBasedPairwiseDomainSimilarityCalculator.java +++ b/forester/java/src/org/forester/surfacing/ProteinCountsBasedPairwiseDomainSimilarityCalculator.java @@ -35,8 +35,13 @@ public class ProteinCountsBasedPairwiseDomainSimilarityCalculator implements Pai if ( !domains_1.getKeyDomain().equals( 
domains_2.getKeyDomain() ) ) { throw new IllegalArgumentException( "attempt to calculate similarity between domain collection with different keys" ); } - final int pc1 = domains_1.getKeyDomainProteinsCount(); - final int pc2 = domains_2.getKeyDomainProteinsCount(); - return new CountsBasedPairwiseDomainSimilarity( pc1 - pc2, pc1 + pc2 ); + if ( ( domains_1.getKeyDomainProteinsCount() > Short.MAX_VALUE ) + || ( domains_2.getKeyDomainProteinsCount() > Short.MAX_VALUE ) + || ( ( domains_1.getKeyDomainProteinsCount() + domains_2.getKeyDomainProteinsCount() ) > Short.MAX_VALUE ) ) { /* both protein counts and their sum (pc1 + pc2 below) must fit into a short before the narrowing casts */ + throw new IllegalArgumentException( "too large for short!" ); + } + final short pc1 = ( short ) domains_1.getKeyDomainProteinsCount(); + final short pc2 = ( short ) domains_2.getKeyDomainProteinsCount(); + return new CountsBasedPairwiseDomainSimilarity( ( short ) ( pc1 - pc2 ), ( short ) ( pc1 + pc2 ) ); } } diff --git a/forester/java/src/org/forester/surfacing/SurfacingConstants.java b/forester/java/src/org/forester/surfacing/SurfacingConstants.java index 9709508..aa588a0 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingConstants.java +++ b/forester/java/src/org/forester/surfacing/SurfacingConstants.java @@ -30,15 +30,16 @@ import org.forester.util.ForesterUtil; public class SurfacingConstants { - public static final String AMIGO_LINK = "http://amigo.geneontology.org/cgi-bin/amigo/go.cgi?view=details&search_constraint=terms&query="; - public static final String EOL_LINK = "http://www.eol.org/search?q="; - public static final String GO_LINK = "http://amigo.geneontology.org/cgi-bin/amigo/go.cgi?view=details&search_constraint=terms&query="; - public static final String GOOGLE_SCHOLAR_SEARCH = "http://scholar.google.com/scholar?q="; - public static final String GOOGLE_WEB_SEARCH_LINK = "http://www.google.com/search?q="; - public static final String NL = ForesterUtil.LINE_SEPARATOR; - public static final String NONE = "[none]"; - public static final String PFAM_FAMILY_ID_LINK = 
"http://pfam.janelia.org/family/"; - public static final String UNIPROT_TAXONOMY_ID_LINK = "http://www.uniprot.org/taxonomy/"; - static final boolean SECONDARY_FEATURES_ARE_SCOP = true; - static final String SECONDARY_FEATURES_SCOP_LINK = "http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi?key="; + public static final String AMIGO_LINK = "http://amigo.geneontology.org/cgi-bin/amigo/go.cgi?view=details&search_constraint=terms&query="; + public static final String EOL_LINK = "http://www.eol.org/search?q="; + public static final String GO_LINK = "http://amigo.geneontology.org/cgi-bin/amigo/go.cgi?view=details&search_constraint=terms&query="; + public static final String GOOGLE_SCHOLAR_SEARCH = "http://scholar.google.com/scholar?q="; + public static final String GOOGLE_WEB_SEARCH_LINK = "http://www.google.com/search?q="; + public static final String NL = ForesterUtil.LINE_SEPARATOR; + public static final String NONE = "[none]"; + public static final String PFAM_FAMILY_ID_LINK = "http://pfam.janelia.org/family/"; + public static final String UNIPROT_TAXONOMY_ID_LINK = "http://www.uniprot.org/taxonomy/"; + static final boolean SECONDARY_FEATURES_ARE_SCOP = true; + static final String SECONDARY_FEATURES_SCOP_LINK = "http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi?key="; + static final boolean PRINT_MORE_DOM_SIMILARITY_INFO = false; } diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index 70d5654..8c0c8b4 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -1665,36 +1665,18 @@ public final class SurfacingUtil { } } - public static DescriptiveStatistics writeDomainSimilaritiesToFile( final StringBuilder html_desc, - final StringBuilder html_title, - final Writer single_writer, - Map split_writers, - final SortedSet similarities, - final boolean treat_as_binary, - final List species_order, - final 
PrintableDomainSimilarity.PRINT_OPTION print_option, - final DomainSimilarity.DomainSimilarityScoring scoring, - final boolean verbose, - final Map tax_code_to_id_map, - final boolean print_some_stats ) + public static void writeDomainSimilaritiesToFile( final StringBuilder html_desc, + final StringBuilder html_title, + final Writer single_writer, + Map split_writers, + final SortedSet similarities, + final boolean treat_as_binary, + final List species_order, + final PrintableDomainSimilarity.PRINT_OPTION print_option, + final DomainSimilarity.DomainSimilarityScoring scoring, + final boolean verbose, + final Map tax_code_to_id_map ) throws IOException { - DescriptiveStatistics stats = null; - AsciiHistogram histo = null; - if ( print_some_stats ) { - stats = new BasicDescriptiveStatistics(); - final String histogram_title = "score mean distribution:"; - for( final DomainSimilarity similarity : similarities ) { - stats.addValue( similarity.getMeanSimilarityScore() ); - } - try { - if ( stats.getMin() < stats.getMax() ) { - histo = new AsciiHistogram( stats, histogram_title ); - } - } - catch ( final Exception e ) { - histo = null; - } - } if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) { split_writers = new HashMap(); split_writers.put( '_', single_writer ); @@ -1718,9 +1700,6 @@ public final class SurfacingUtil { w.write( SurfacingConstants.NL ); w.write( html_desc.toString() ); w.write( SurfacingConstants.NL ); - if ( print_some_stats ) { - printSomeStats( stats, histo, w ); - } w.write( "
" ); w.write( SurfacingConstants.NL ); w.write( "
" ); @@ -1798,7 +1777,6 @@ public final class SurfacingUtil { for( final Writer w : split_writers.values() ) { w.close(); } - return stats; } private static void printSomeStats( final DescriptiveStatistics stats, final AsciiHistogram histo, final Writer w ) diff --git a/forester/java/src/org/forester/surfacing/TestSurfacing.java b/forester/java/src/org/forester/surfacing/TestSurfacing.java index 3a79d7b..c2b857c 100644 --- a/forester/java/src/org/forester/surfacing/TestSurfacing.java +++ b/forester/java/src/org/forester/surfacing/TestSurfacing.java @@ -327,7 +327,8 @@ public class TestSurfacing { new BasicSpecies( "nemve" ) ) ); final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, false, - false ); + false, + true ); final SortedSet sims = calc .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list, @@ -477,7 +478,8 @@ public class TestSurfacing { new BasicSpecies( "nemve" ) ) ); final DomainSimilarityCalculator calc2 = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, false, - false ); + false, + true ); final SortedSet sims2 = calc2 .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list2, @@ -564,7 +566,8 @@ public class TestSurfacing { new BasicSpecies( "nemve" ) ) ); final DomainSimilarityCalculator calc3 = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, false, - false ); + false, + true ); final SortedSet sims3 = calc3 .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list3, @@ -607,7 +610,8 @@ public class TestSurfacing { new BasicSpecies( "nemve" ) ) ); final DomainSimilarityCalculator calc4 = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, true, - false ); + false, + true ); final SortedSet sims4 = calc4 .calculateSimilarities( new 
CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list4, @@ -1038,7 +1042,8 @@ public class TestSurfacing { new BasicSpecies( "nemve" ) ) ); final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, false, - false ); + false, + true ); final SortedSet sims = calc .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list, @@ -1146,7 +1151,8 @@ public class TestSurfacing { new BasicSpecies( "nemve" ) ) ); final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, false, - false ); + false, + true ); final SortedSet sims = calc .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list, -- 1.7.10.2