X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsurfacing%2FSurfacingUtil.java;h=ea51d190da7ae4d2cc19980eabf460ac71157233;hb=0b49b8e750b34d28a5989facdd8a7959870de996;hp=ebce1ccbead6a18b3d0de0e79286ee0fdbc2d534;hpb=6ec28d1a3a137c14c9647aec7b9e40b77fe968bf;p=jalview.git diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index ebce1cc..ea51d19 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -26,6 +26,7 @@ package org.forester.surfacing; +import java.awt.Color; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; @@ -66,6 +67,8 @@ import org.forester.go.GoNameSpace; import org.forester.go.GoTerm; import org.forester.go.PfamToGoMapping; import org.forester.io.parsers.nexus.NexusConstants; +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; +import org.forester.io.parsers.util.ParserUtils; import org.forester.io.writers.PhylogenyWriter; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; @@ -74,48 +77,50 @@ import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE; import org.forester.phylogeny.data.BinaryCharacters; import org.forester.phylogeny.data.Confidence; import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.protein.BasicDomain; import org.forester.protein.BasicProtein; import org.forester.protein.BinaryDomainCombination; import org.forester.protein.Domain; -import org.forester.protein.DomainId; import org.forester.protein.Protein; import org.forester.species.Species; +import org.forester.surfacing.DomainSimilarity.PRINT_OPTION; import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; -import org.forester.surfacing.DomainSimilarityCalculator.GoAnnotationOutput; import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder; import org.forester.util.AsciiHistogram; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.BasicTable; import org.forester.util.BasicTableParser; +import org.forester.util.CommandLineArguments; import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterUtil; +import org.forester.util.TaxonomyColors; public final class SurfacingUtil { - private final static NumberFormat FORMATTER_3 = new DecimalFormat( "0.000" ); - private static final Comparator ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator() { - - @Override - public int compare( final Domain d1, - final Domain d2 ) { - if ( d1.getPerSequenceEvalue() < d2 - .getPerSequenceEvalue() ) { - return -1; - } - else if ( d1 - .getPerSequenceEvalue() > d2 - .getPerSequenceEvalue() ) { - return 1; - } - else { - return d1.compareTo( d2 ); - } - } - }; - public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" ); - private static final boolean USE_LAST = true; + public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" ); + private final static Map _TAXCODE_HEXCOLORSTRING_MAP = new HashMap(); + private final static Map _TAXCODE_TAXGROUP_MAP = new HashMap(); + private static final Comparator ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator() { + + @Override + public int compare( final Domain d1, + final Domain d2 ) { + if ( d1.getPerDomainEvalue() < d2 + .getPerDomainEvalue() ) { + return -1; + } + else if ( d1.getPerDomainEvalue() > d2 + .getPerDomainEvalue() ) { + return 1; + } + else { + return d1.compareTo( d2 ); + } + } + }; + private final static NumberFormat FORMATTER_3 = new DecimalFormat( "0.000" ); private SurfacingUtil() { // Hidden constructor. @@ -123,49 +128,20 @@ public final class SurfacingUtil { public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome, final SortedSet binary_domain_combinations ) { - final SortedMap all_cd = genome.getAllCombinableDomainsIds(); - for( final DomainId domain_id : all_cd.keySet() ) { + final SortedMap all_cd = genome.getAllCombinableDomainsIds(); + for( final String domain_id : all_cd.keySet() ) { binary_domain_combinations.addAll( all_cd.get( domain_id ).toBinaryDomainCombinations() ); } } public static void addAllDomainIdsToSet( final GenomeWideCombinableDomains genome, - final SortedSet domain_ids ) { - final SortedSet domains = genome.getAllDomainIds(); - for( final DomainId domain : domains ) { + final SortedSet domain_ids ) { + final SortedSet domains = genome.getAllDomainIds(); + for( final String domain : domains ) { domain_ids.add( domain ); } } - public static void addHtmlHead( final Writer w, final String title ) throws IOException { - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( "" ); - w.write( title ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - } - public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set similarities ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( final DomainSimilarity similarity : similarities ) { @@ -174,16 +150,6 @@ public final class SurfacingUtil { return stats; } - public static int calculateOverlap( final Domain domain, final List covered_positions ) { - int overlap_count = 0; - for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { - if ( ( i < covered_positions.size() ) && ( covered_positions.get( i ) == true ) ) { - ++overlap_count; - } - } - return overlap_count; - } - public static void checkForOutputFileWriteability( final File outfile ) { final String error = ForesterUtil.isWritableFile( outfile ); if ( !ForesterUtil.isEmpty( error ) ) { @@ -191,6 +157,33 @@ public final class SurfacingUtil { } } + public static void checkWriteabilityForPairwiseComparisons( final DomainSimilarity.PRINT_OPTION domain_similarity_print_option, + final String[][] input_file_properties, + final String automated_pairwise_comparison_suffix, + final File outdir ) { + for( int i = 0; i < input_file_properties.length; ++i ) { + for( int j = 0; j < i; ++j ) { + final String species_i = input_file_properties[ i ][ 1 ]; + final String species_j = input_file_properties[ j ][ 1 ]; + String pairwise_similarities_output_file_str = surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX + species_i + + "_" + species_j + automated_pairwise_comparison_suffix; + switch ( domain_similarity_print_option ) { + case HTML: + if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) { + pairwise_similarities_output_file_str += ".html"; + } + break; + } + final String error = ForesterUtil + .isWritableFile( new File( outdir == null ? pairwise_similarities_output_file_str : outdir + + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + } + } + } + } + public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, final BinaryDomainCombination.DomainCombinationType dc_type, final List all_binary_domains_combination_gained, @@ -205,24 +198,23 @@ public final class SurfacingUtil { || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) { if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) { all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination - .createInstance( matrix.getCharacter( c ) ) ); + .obtainInstance( matrix.getCharacter( c ) ) ); } else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) { all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination - .createInstance( matrix.getCharacter( c ) ) ); + .obtainInstance( matrix.getCharacter( c ) ) ); } else { - all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix - .getCharacter( c ) ) ); + all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.obtainInstance( matrix + .getCharacter( c ) ) ); } } } } } - public static Map> createDomainIdToGoIdMap( final List pfam_to_go_mappings ) { - final Map> domain_id_to_go_ids_map = new HashMap>( pfam_to_go_mappings - .size() ); + public static Map> createDomainIdToGoIdMap( final List pfam_to_go_mappings ) { + final Map> domain_id_to_go_ids_map = new HashMap>( pfam_to_go_mappings.size() ); for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) { if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) { domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList() ); @@ -232,12 +224,12 @@ public final class SurfacingUtil { return domain_id_to_go_ids_map; } - public static Map> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file ) + public static Map> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file ) throws IOException { final BasicTable primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' ); - final Map> map = new TreeMap>(); + final Map> map = new TreeMap>(); for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) { - final DomainId domain_id = new DomainId( primary_table.getValue( 0, r ) ); + final String domain_id = primary_table.getValue( 0, r ); if ( !map.containsKey( domain_id ) ) { map.put( domain_id, new HashSet() ); } @@ -255,6 +247,103 @@ public final class SurfacingUtil { return phylogeny; } + public static StringBuilder createParametersAsString( final boolean ignore_dufs, + final double ie_value_max, + final double fs_e_value_max, + final int max_allowed_overlap, + final boolean no_engulfing_overlaps, + final File cutoff_scores_file, + final BinaryDomainCombination.DomainCombinationType dc_type ) { + final StringBuilder parameters_sb = new StringBuilder(); + parameters_sb.append( "iE-value: " + ie_value_max ); + parameters_sb.append( ", FS E-value: " + fs_e_value_max ); + if ( cutoff_scores_file != null ) { + parameters_sb.append( ", Cutoff-scores-file: " + cutoff_scores_file ); + } + else { + parameters_sb.append( ", Cutoff-scores-file: not-set" ); + } + if ( max_allowed_overlap != surfacing.MAX_ALLOWED_OVERLAP_DEFAULT ) { + parameters_sb.append( ", Max-overlap: " + max_allowed_overlap ); + } + else { + parameters_sb.append( ", Max-overlap: not-set" ); + } + if ( no_engulfing_overlaps ) { + parameters_sb.append( ", Engulfing-overlaps: not-allowed" ); + } + else { + parameters_sb.append( ", Engulfing-overlaps: allowed" ); + } + if ( ignore_dufs ) { + parameters_sb.append( ", Ignore-dufs: true" ); + } + else { + parameters_sb.append( ", Ignore-dufs: false" ); + } + parameters_sb.append( ", DC type (if applicable): " + dc_type ); + return parameters_sb; + } + + public static void createSplitWriters( final File out_dir, + final String my_outfile, + final Map split_writers ) throws IOException { + split_writers.put( 'a', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_A.html" ) ) ); + split_writers.put( 'b', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_B.html" ) ) ); + split_writers.put( 'c', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_C.html" ) ) ); + split_writers.put( 'd', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_D.html" ) ) ); + split_writers.put( 'e', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_E.html" ) ) ); + split_writers.put( 'f', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_F.html" ) ) ); + split_writers.put( 'g', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_G.html" ) ) ); + split_writers.put( 'h', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_H.html" ) ) ); + split_writers.put( 'i', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_I.html" ) ) ); + split_writers.put( 'j', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_J.html" ) ) ); + split_writers.put( 'k', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_K.html" ) ) ); + split_writers.put( 'l', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_L.html" ) ) ); + split_writers.put( 'm', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_M.html" ) ) ); + split_writers.put( 'n', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_N.html" ) ) ); + split_writers.put( 'o', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_O.html" ) ) ); + split_writers.put( 'p', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_P.html" ) ) ); + split_writers.put( 'q', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_Q.html" ) ) ); + split_writers.put( 'r', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_R.html" ) ) ); + split_writers.put( 's', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_S.html" ) ) ); + split_writers.put( 't', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_T.html" ) ) ); + split_writers.put( 'u', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_U.html" ) ) ); + split_writers.put( 'v', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_V.html" ) ) ); + split_writers.put( 'w', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_W.html" ) ) ); + split_writers.put( 'x', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_X.html" ) ) ); + split_writers.put( 'y', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_Y.html" ) ) ); + split_writers.put( 'z', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_Z.html" ) ) ); + split_writers.put( '0', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_0.html" ) ) ); + } + public static Map createTaxCodeToIdMap( final Phylogeny phy ) { final Map m = new HashMap(); for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { @@ -288,26 +377,17 @@ public final class SurfacingUtil { } public static void decoratePrintableDomainSimilarities( final SortedSet domain_similarities, - final Detailedness detailedness, - final GoAnnotationOutput go_annotation_output, - final Map go_id_to_term_map, - final GoNameSpace go_namespace_limit ) { - if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) ) { - throw new IllegalArgumentException( "attempt to use a GO namespace limit without a GO id to term map" ); - } + final Detailedness detailedness ) { for( final DomainSimilarity domain_similarity : domain_similarities ) { - if ( domain_similarity instanceof PrintableDomainSimilarity ) { - final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity; + if ( domain_similarity instanceof DomainSimilarity ) { + final DomainSimilarity printable_domain_similarity = domain_similarity; printable_domain_similarity.setDetailedness( detailedness ); - printable_domain_similarity.setGoAnnotationOutput( go_annotation_output ); - printable_domain_similarity.setGoIdToTermMap( go_id_to_term_map ); - printable_domain_similarity.setGoNamespaceLimit( go_namespace_limit ); } } } public static void doit( final List proteins, - final List query_domain_ids_nc_order, + final List query_domain_ids_nc_order, final Writer out, final String separator, final String limit_to_species, @@ -321,7 +401,7 @@ public final class SurfacingUtil { out.write( protein.getProteinId().getId() ); out.write( separator ); out.write( "[" ); - final Set visited_domain_ids = new HashSet(); + final Set visited_domain_ids = new HashSet(); boolean first = true; for( final Domain domain : protein.getProteinDomains() ) { if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { @@ -332,7 +412,7 @@ public final class SurfacingUtil { else { out.write( " " ); } - out.write( domain.getDomainId().getId() ); + out.write( domain.getDomainId() ); out.write( " {" ); out.write( "" + domain.getTotalCount() ); out.write( "}" ); @@ -378,7 +458,7 @@ public final class SurfacingUtil { 1 + all_genomes_domains_per_potein_histo.get( domains ) ); } if ( domains == 1 ) { - final String domain = protein.getProteinDomain( 0 ).getDomainId().getId(); + final String domain = protein.getProteinDomain( 0 ).getDomainId(); if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { if ( domains_which_never_single.contains( domain ) ) { domains_which_never_single.remove( domain ); @@ -391,7 +471,7 @@ public final class SurfacingUtil { } else if ( domains > 1 ) { for( final Domain d : protein.getProteinDomains() ) { - final String domain = d.getDomainId().getId(); + final String domain = d.getDomainId(); // System.out.println( domain ); if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { if ( domains_which_are_always_single.contains( domain ) ) { @@ -464,10 +544,6 @@ public final class SurfacingUtil { out.write( species + "\t" ); } out.write( ForesterUtil.LINE_SEPARATOR ); - // DescriptiveStatistics stats_for_domain = domain_lengths - // .calculateMeanBasedStatistics(); - //AsciiHistogram histo = new AsciiHistogram( stats_for_domain ); - //System.out.println( histo.toStringBuffer( 40, '=', 60, 4 ).toString() ); } } out.write( ForesterUtil.LINE_SEPARATOR ); @@ -501,23 +577,99 @@ public final class SurfacingUtil { } } out.close(); - // final List histogram_datas = new ArrayList(); - // for( int i = 0; i < number_of_genomes; ++i ) { - // final Species species = new BasicSpecies( input_file_properties[ i ][ 0 ] ); - // histogram_datas - // .add( new HistogramData( species.toString(), domain_lengths_table - // .calculateMeanBasedStatisticsForSpecies( species ) - // .getDataAsDoubleArray(), 5, 600, null, 60 ) ); - // } - // final HistogramsFrame hf = new HistogramsFrame( histogram_datas ); - // hf.setVisible( true ); System.gc(); } /** - * - * @param all_binary_domains_combination_lost_fitch - * @param consider_directedness_and_adjacency_for_bin_combinations + * Warning: This side-effects 'all_bin_domain_combinations_encountered'! + * + * + * @param output_file + * @param all_bin_domain_combinations_changed + * @param sum_of_all_domains_encountered + * @param all_bin_domain_combinations_encountered + * @param is_gains_analysis + * @param protein_length_stats_by_dc + * @throws IOException + */ + public static void executeFitchGainsAnalysis( final File output_file, + final List all_bin_domain_combinations_changed, + final int sum_of_all_domains_encountered, + final SortedSet all_bin_domain_combinations_encountered, + final boolean is_gains_analysis ) throws IOException { + checkForOutputFileWriteability( output_file ); + final Writer out = ForesterUtil.createBufferedWriter( output_file ); + final SortedMap bdc_to_counts = ForesterUtil + .listToSortedCountsMap( all_bin_domain_combinations_changed ); + final SortedSet all_domains_in_combination_changed_more_than_once = new TreeSet(); + final SortedSet all_domains_in_combination_changed_only_once = new TreeSet(); + int above_one = 0; + int one = 0; + for( final Object bdc_object : bdc_to_counts.keySet() ) { + final BinaryDomainCombination bdc = ( BinaryDomainCombination ) bdc_object; + final int count = bdc_to_counts.get( bdc_object ); + if ( count < 1 ) { + ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "count < 1 " ); + } + out.write( bdc + "\t" + count + ForesterUtil.LINE_SEPARATOR ); + if ( count > 1 ) { + all_domains_in_combination_changed_more_than_once.add( bdc.getId0() ); + all_domains_in_combination_changed_more_than_once.add( bdc.getId1() ); + above_one++; + } + else if ( count == 1 ) { + all_domains_in_combination_changed_only_once.add( bdc.getId0() ); + all_domains_in_combination_changed_only_once.add( bdc.getId1() ); + one++; + } + } + final int all = all_bin_domain_combinations_encountered.size(); + int never_lost = -1; + if ( !is_gains_analysis ) { + all_bin_domain_combinations_encountered.removeAll( all_bin_domain_combinations_changed ); + never_lost = all_bin_domain_combinations_encountered.size(); + for( final BinaryDomainCombination bdc : all_bin_domain_combinations_encountered ) { + out.write( bdc + "\t" + "0" + ForesterUtil.LINE_SEPARATOR ); + } + } + if ( is_gains_analysis ) { + out.write( "Sum of all distinct domain combinations appearing once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations appearing more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + else { + out.write( "Sum of all distinct domain combinations never lost : " + never_lost + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + out.write( "All binary combinations : " + all + + ForesterUtil.LINE_SEPARATOR ); + out.write( "All domains : " + + sum_of_all_domains_encountered ); + out.close(); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote fitch domain combination dynamics counts analysis to \"" + output_file + + "\"" ); + } + + /** + * + * @param all_binary_domains_combination_lost_fitch + * @param use_last_in_fitch_parsimony + * @param perform_dc_fich + * @param consider_directedness_and_adjacency_for_bin_combinations * @param all_binary_domains_combination_gained if null ignored, otherwise this is to list all binary domain combinations * which were gained under unweighted (Fitch) parsimony. */ @@ -526,12 +678,12 @@ public final class SurfacingUtil { final String outfile_name, final DomainParsimonyCalculator domain_parsimony, final Phylogeny phylogeny, - final Map> domain_id_to_go_ids_map, + final Map> domain_id_to_go_ids_map, final Map go_id_to_term_map, final GoNameSpace go_namespace_limit, final String parameters_str, - final Map>[] domain_id_to_secondary_features_maps, - final SortedSet positive_filter, + final Map>[] domain_id_to_secondary_features_maps, + final SortedSet positive_filter, final boolean output_binary_domain_combinations_for_graphs, final List all_binary_domains_combination_gained_fitch, final List all_binary_domains_combination_lost_fitch, @@ -540,7 +692,9 @@ public final class SurfacingUtil { final Map domain_number_stats_by_dc, final Map domain_length_stats_by_domain, final Map tax_code_to_id_map, - final boolean write_to_nexus ) { + final boolean write_to_nexus, + final boolean use_last_in_fitch_parsimony, + final boolean perform_dc_fich ) { final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; final String date_time = ForesterUtil.getCurrentDateTime(); final SortedSet all_pfams_encountered = new TreeSet(); @@ -561,9 +715,9 @@ public final class SurfacingUtil { domain_parsimony.executeDolloParsimonyOnDomainPresence(); } SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), CharacterStateMatrix.GainLossStates.GAIN, outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_D, @@ -577,7 +731,7 @@ public final class SurfacingUtil { ForesterUtil.LINE_SEPARATOR, null ); SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name - + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null ); + + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null ); //HTML: writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, go_id_to_term_map, @@ -611,22 +765,22 @@ public final class SurfacingUtil { all_pfams_lost_as_domains, "_dollo_losses_d", tax_code_to_id_map ); - writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, - go_id_to_term_map, - go_namespace_limit, - false, - domain_parsimony.getGainLossMatrix(), - null, - outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D, - sep, - ForesterUtil.LINE_SEPARATOR, - "Dollo Parsimony | Present | Domains", - "", - domain_id_to_secondary_features_maps, - all_pfams_encountered, - null, - "_dollo_present_d", - tax_code_to_id_map ); + // writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, + // go_id_to_term_map, + // go_namespace_limit, + // false, + // domain_parsimony.getGainLossMatrix(), + // null, + // outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D, + // sep, + // ForesterUtil.LINE_SEPARATOR, + // "Dollo Parsimony | Present | Domains", + // "", + // domain_id_to_secondary_features_maps, + // all_pfams_encountered, + // null, + // "_dollo_present_d", + // tax_code_to_id_map ); preparePhylogeny( local_phylogeny_l, domain_parsimony, date_time, @@ -634,7 +788,7 @@ public final class SurfacingUtil { "dollo_on_domains_" + outfile_name, parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name - + surfacing.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); + + surfacing.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); try { writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, true, outfile_name, "_dollo_all_gains_d" ); writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, false, outfile_name, "_dollo_all_losses_d" ); @@ -643,7 +797,7 @@ public final class SurfacingUtil { e.printStackTrace(); ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); } - if ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) { + if ( perform_dc_fich && ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) ) { // FITCH DOMAIN COMBINATIONS // ------------------------- local_phylogeny_l = phylogeny.copy(); @@ -653,28 +807,28 @@ public final class SurfacingUtil { randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony; } else { - domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( USE_LAST ); + domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( use_last_in_fitch_parsimony ); } SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); SurfacingUtil - .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), - CharacterStateMatrix.GainLossStates.GAIN, - outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_BC, - sep, - ForesterUtil.LINE_SEPARATOR, - null ); + .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.GAIN, + outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_BC, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), CharacterStateMatrix.GainLossStates.LOSS, outfile_name - + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_BC, + + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_BC, sep, ForesterUtil.LINE_SEPARATOR, null ); SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name - + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null ); + + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null ); if ( all_binary_domains_combination_gained_fitch != null ) { collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), dc_type, @@ -689,14 +843,14 @@ public final class SurfacingUtil { } if ( output_binary_domain_combinations_for_graphs ) { SurfacingUtil - .writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( domain_parsimony - .getGainLossMatrix(), - null, - outfile_name - + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS, - sep, - ForesterUtil.LINE_SEPARATOR, - BinaryDomainCombination.OutputFormat.DOT ); + .writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( domain_parsimony + .getGainLossMatrix(), + null, + outfile_name + + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS, + sep, + ForesterUtil.LINE_SEPARATOR, + BinaryDomainCombination.OutputFormat.DOT ); } // HTML: writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, @@ -731,22 +885,22 @@ public final class SurfacingUtil { all_pfams_lost_as_dom_combinations, "_fitch_losses_dc", tax_code_to_id_map ); - writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, - go_id_to_term_map, - go_namespace_limit, - true, - domain_parsimony.getGainLossMatrix(), - null, - outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC, - sep, - ForesterUtil.LINE_SEPARATOR, - "Fitch Parsimony | Present | Domain Combinations", - "", - null, - all_pfams_encountered, - null, - "_fitch_present_dc", - tax_code_to_id_map ); + // writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, + // go_id_to_term_map, + // go_namespace_limit, + // true, + // domain_parsimony.getGainLossMatrix(), + // null, + // outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC, + // sep, + // ForesterUtil.LINE_SEPARATOR, + // "Fitch Parsimony | Present | Domain Combinations", + // "", + // null, + // all_pfams_encountered, + // null, + // "_fitch_present_dc", + // tax_code_to_id_map ); writeAllEncounteredPfamsToFile( domain_id_to_go_ids_map, go_id_to_term_map, outfile_name, @@ -761,19 +915,19 @@ public final class SurfacingUtil { date_time, "Fitch parsimony on binary domain combination presence/absence randomization: " + randomization, - "fitch_on_binary_domain_combinations_" + outfile_name, - parameters_str ); + "fitch_on_binary_domain_combinations_" + outfile_name, + parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name - + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH ); + + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH ); calculateIndependentDomainCombinationGains( local_phylogeny_l, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt", outfile_name + "_indep_dc_gains_fitch_protein_statistics.txt", @@ -787,7 +941,8 @@ public final class SurfacingUtil { final DomainParsimonyCalculator secondary_features_parsimony, final Phylogeny phylogeny, final String parameters_str, - final Map mapping_results_map ) { + final Map mapping_results_map, + final boolean use_last_in_fitch_parsimony ) { final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; final String date_time = ForesterUtil.getCurrentDateTime(); System.out.println(); @@ -797,33 +952,33 @@ public final class SurfacingUtil { Phylogeny local_phylogeny_copy = phylogeny.copy(); secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map ); SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossCountsMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); SurfacingUtil - .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), - CharacterStateMatrix.GainLossStates.GAIN, - outfile_name - + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES, - sep, - ForesterUtil.LINE_SEPARATOR, - null ); + .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.GAIN, + outfile_name + + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); SurfacingUtil - .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), - CharacterStateMatrix.GainLossStates.LOSS, - outfile_name - + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES, - sep, - ForesterUtil.LINE_SEPARATOR, - null ); + .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.LOSS, + outfile_name + + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); SurfacingUtil - .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), - null, - outfile_name - + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES, - sep, - ForesterUtil.LINE_SEPARATOR, - null ); + .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), + null, + outfile_name + + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); preparePhylogeny( local_phylogeny_copy, secondary_features_parsimony, date_time, @@ -831,32 +986,87 @@ public final class SurfacingUtil { "dollo_on_secondary_features_" + outfile_name, parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name - + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); + + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); // FITCH DOMAIN COMBINATIONS // ------------------------- local_phylogeny_copy = phylogeny.copy(); final String randomization = "no"; - secondary_features_parsimony.executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( USE_LAST ); + secondary_features_parsimony + .executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( use_last_in_fitch_parsimony ); preparePhylogeny( local_phylogeny_copy, secondary_features_parsimony, date_time, "Fitch parsimony on secondary binary domain combination presence/absence randomization: " + randomization, - "fitch_on_binary_domain_combinations_" + outfile_name, - parameters_str ); + "fitch_on_binary_domain_combinations_" + outfile_name, + parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name - + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED ); + + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED ); calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name - + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name - + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null ); + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name + + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null ); + } + + public static void executePlusMinusAnalysis( final File output_file, + final List plus_minus_analysis_high_copy_base, + final List plus_minus_analysis_high_copy_target, + final List plus_minus_analysis_low_copy, + final List gwcd_list, + final SortedMap> protein_lists_per_species, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final List plus_minus_analysis_numbers ) { + final Set all_spec = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + all_spec.add( gwcd.getSpecies().getSpeciesId() ); + } + final File html_out_dom = new File( output_file + surfacing.PLUS_MINUS_DOM_SUFFIX_HTML ); + final File plain_out_dom = new File( output_file + surfacing.PLUS_MINUS_DOM_SUFFIX ); + final File html_out_dc = new File( output_file + surfacing.PLUS_MINUS_DC_SUFFIX_HTML ); + final File all_domains_go_ids_out_dom = new File( output_file + surfacing.PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX ); + final File passing_domains_go_ids_out_dom = new File( output_file + + surfacing.PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX ); + final File proteins_file_base = new File( output_file + "" ); + final int min_diff = ( ( Integer ) plus_minus_analysis_numbers.get( 0 ) ).intValue(); + final double factor = ( ( Double ) plus_minus_analysis_numbers.get( 1 ) ).doubleValue(); + try { + DomainCountsDifferenceUtil.calculateCopyNumberDifferences( gwcd_list, + protein_lists_per_species, + plus_minus_analysis_high_copy_base, + plus_minus_analysis_high_copy_target, + plus_minus_analysis_low_copy, + min_diff, + factor, + plain_out_dom, + html_out_dom, + html_out_dc, + domain_id_to_go_ids_map, + go_id_to_term_map, + all_domains_go_ids_out_dom, + passing_domains_go_ids_out_dom, + proteins_file_base ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + html_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + plain_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + html_out_dc + + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based passing GO ids to \"" + + passing_domains_go_ids_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based all GO ids to \"" + + all_domains_go_ids_out_dom + "\"" ); } public static void extractProteinNames( final List proteins, - final List query_domain_ids_nc_order, + final List query_domain_ids_nc_order, final Writer out, final String separator, final String limit_to_species ) throws IOException { @@ -869,7 +1079,7 @@ public final class SurfacingUtil { out.write( protein.getProteinId().getId() ); out.write( separator ); out.write( "[" ); - final Set visited_domain_ids = new HashSet(); + final Set visited_domain_ids = new HashSet(); boolean first = true; for( final Domain domain : protein.getProteinDomains() ) { if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { @@ -880,7 +1090,7 @@ public final class SurfacingUtil { else { out.write( " " ); } - out.write( domain.getDomainId().getId() ); + out.write( domain.getDomainId() ); out.write( " {" ); out.write( "" + domain.getTotalCount() ); out.write( "}" ); @@ -905,14 +1115,14 @@ public final class SurfacingUtil { } public static void extractProteinNames( final SortedMap> protein_lists_per_species, - final DomainId domain_id, + final String domain_id, final Writer out, final String separator, final String limit_to_species, final double domain_e_cutoff ) throws IOException { - System.out.println( "Per domain E-value: " + domain_e_cutoff ); + //System.out.println( "Per domain E-value: " + domain_e_cutoff ); for( final Species species : protein_lists_per_species.keySet() ) { - System.out.println( species + ":" ); + //System.out.println( species + ":" ); for( final Protein protein : protein_lists_per_species.get( species ) ) { if ( ForesterUtil.isEmpty( limit_to_species ) || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { @@ -931,7 +1141,7 @@ public final class SurfacingUtil { out.write( domain.getFrom() + "-" + domain.getTo() ); if ( prev_to >= 0 ) { final int l = domain.getFrom() - prev_to; - System.out.println( l ); + // System.out.println( l ); } prev_to = domain.getTo(); } @@ -980,10 +1190,10 @@ public final class SurfacingUtil { out.flush(); } - public static SortedSet getAllDomainIds( final List gwcd_list ) { - final SortedSet all_domains_ids = new TreeSet(); + public static SortedSet getAllDomainIds( final List gwcd_list ) { + final SortedSet all_domains_ids = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - final Set all_domains = gwcd.getAllDomainIds(); + final Set all_domains = gwcd.getAllDomainIds(); // for( final Domain domain : all_domains ) { all_domains_ids.addAll( all_domains ); // } @@ -996,7 +1206,7 @@ public final class SurfacingUtil { for( final Protein protein_domain_collection : protein_domain_collections ) { for( final Object name : protein_domain_collection.getProteinDomains() ) { final BasicDomain protein_domain = ( BasicDomain ) name; - final String id = protein_domain.getDomainId().getId(); + final String id = protein_domain.getDomainId(); if ( map.containsKey( id ) ) { map.put( id, map.get( id ) + 1 ); } @@ -1015,9 +1225,9 @@ public final class SurfacingUtil { final PhylogenyNode n = it.next(); if ( ForesterUtil.isEmpty( n.getName() ) && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy() - .getScientificName() ) ) - && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy() - .getCommonName() ) ) ) { + .getScientificName() ) ) + && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy() + .getCommonName() ) ) ) { if ( n.getParent() != null ) { names.append( " " ); names.append( n.getParent().getName() ); @@ -1032,21 +1242,188 @@ public final class SurfacingUtil { return c; } - /** - * Returns true is Domain domain falls in an uninterrupted stretch of - * covered positions. - * - * @param domain - * @param covered_positions - * @return - */ - public static boolean isEngulfed( final Domain domain, final List covered_positions ) { - for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { - if ( ( i >= covered_positions.size() ) || ( covered_positions.get( i ) != true ) ) { - return false; + public static void log( final String msg, final Writer w ) { + try { + w.write( msg ); + w.write( ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + } + + public static Phylogeny[] obtainAndPreProcessIntrees( final File[] intree_files, + final int number_of_genomes, + final String[][] input_file_properties ) { + final Phylogeny[] intrees = new Phylogeny[ intree_files.length ]; + int i = 0; + for( final File intree_file : intree_files ) { + Phylogeny intree = null; + final String error = ForesterUtil.isReadableFile( intree_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read input tree file [" + intree_file + "]: " + + error ); + } + try { + final Phylogeny[] p_array = ParserBasedPhylogenyFactory.getInstance() + .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) ); + if ( p_array.length < 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file + + "] does not contain any phylogeny in phyloXML format" ); + } + else if ( p_array.length > 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file + + "] contains more than one phylogeny in phyloXML format" ); + } + intree = p_array[ 0 ]; + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read input tree from file [" + intree_file + + "]: " + error ); + } + if ( ( intree == null ) || intree.isEmpty() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is empty" ); + } + if ( !intree.isRooted() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is not rooted" ); + } + if ( intree.getNumberOfExternalNodes() < number_of_genomes ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "number of external nodes [" + intree.getNumberOfExternalNodes() + + "] of input tree [" + intree_file + + "] is smaller than the number of genomes the be analyzed [" + + number_of_genomes + "]" ); + } + final StringBuilder parent_names = new StringBuilder(); + final int nodes_lacking_name = getNumberOfNodesLackingName( intree, parent_names ); + if ( nodes_lacking_name > 0 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] has " + + nodes_lacking_name + " node(s) lacking a name [parent names:" + parent_names + "]" ); + } + preparePhylogenyForParsimonyAnalyses( intree, input_file_properties ); + if ( !intree.isCompletelyBinary() ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "input tree [" + intree_file + + "] is not completely binary" ); + } + intrees[ i++ ] = intree; + } + return intrees; + } + + public static Phylogeny obtainFirstIntree( final File intree_file ) { + Phylogeny intree = null; + final String error = ForesterUtil.isReadableFile( intree_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read input tree file [" + intree_file + "]: " + error ); + } + try { + final Phylogeny[] phys = ParserBasedPhylogenyFactory.getInstance() + .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) ); + if ( phys.length < 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file + + "] does not contain any phylogeny in phyloXML format" ); + } + else if ( phys.length > 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file + + "] contains more than one phylogeny in phyloXML format" ); } + intree = phys[ 0 ]; } - return true; + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read input tree from file [" + intree_file + "]: " + + error ); + } + if ( ( intree == null ) || intree.isEmpty() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is empty" ); + } + if ( !intree.isRooted() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is not rooted" ); + } + return intree; + } + + public static String obtainHexColorStringDependingOnTaxonomyGroup( final String tax_code, final Phylogeny phy ) + throws IllegalArgumentException { + if ( !_TAXCODE_HEXCOLORSTRING_MAP.containsKey( tax_code ) ) { + if ( ( phy != null ) && !phy.isEmpty() ) { + // final List nodes = phy.getNodesViaTaxonomyCode( tax_code ); + // Color c = null; + // if ( ( nodes == null ) || nodes.isEmpty() ) { + // throw new IllegalArgumentException( "code " + tax_code + " is not found" ); + // } + // if ( nodes.size() != 1 ) { + // throw new IllegalArgumentException( "code " + tax_code + " is not unique" ); + // } + // PhylogenyNode n = nodes.get( 0 ); + // while ( n != null ) { + // if ( n.getNodeData().isHasTaxonomy() + // && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { + // c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( n.getNodeData().getTaxonomy() + // .getScientificName(), tax_code ); + // } + // if ( ( c == null ) && !ForesterUtil.isEmpty( n.getName() ) ) { + // c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( n.getName(), tax_code ); + // } + // if ( c != null ) { + // break; + // } + // n = n.getParent(); + // } + final String group = obtainTaxonomyGroup( tax_code, phy ); + final Color c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( group ); + if ( c == null ) { + throw new IllegalArgumentException( "no color found for taxonomy group \"" + group + + "\" for code \"" + tax_code + "\"" ); + } + final String hex = String.format( "#%02x%02x%02x", c.getRed(), c.getGreen(), c.getBlue() ); + _TAXCODE_HEXCOLORSTRING_MAP.put( tax_code, hex ); + } + else { + throw new IllegalArgumentException( "unable to obtain color for code " + tax_code + + " (tree is null or empty and code is not in map)" ); + } + } + return _TAXCODE_HEXCOLORSTRING_MAP.get( tax_code ); + } + + public static String obtainTaxonomyGroup( final String tax_code, final Phylogeny species_tree ) + throws IllegalArgumentException { + if ( !_TAXCODE_TAXGROUP_MAP.containsKey( tax_code ) ) { + if ( ( species_tree != null ) && !species_tree.isEmpty() ) { + final List nodes = species_tree.getNodesViaTaxonomyCode( tax_code ); + if ( ( nodes == null ) || nodes.isEmpty() ) { + throw new IllegalArgumentException( "code " + tax_code + " is not found" ); + } + if ( nodes.size() != 1 ) { + throw new IllegalArgumentException( "code " + tax_code + " is not unique" ); + } + PhylogenyNode n = nodes.get( 0 ); + String group = null; + while ( n != null ) { + if ( n.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { + group = ForesterUtil.obtainNormalizedTaxonomyGroup( n.getNodeData().getTaxonomy() + .getScientificName() ); + } + if ( ForesterUtil.isEmpty( group ) && !ForesterUtil.isEmpty( n.getName() ) ) { + group = ForesterUtil.obtainNormalizedTaxonomyGroup( n.getName() ); + } + if ( !ForesterUtil.isEmpty( group ) ) { + break; + } + n = n.getParent(); + } + if ( ForesterUtil.isEmpty( group ) ) { + throw new IllegalArgumentException( "no group found for taxonomy code \"" + tax_code + "\"" ); + } + _TAXCODE_TAXGROUP_MAP.put( tax_code, group ); + } + else { + throw new IllegalArgumentException( "unable to obtain group for code " + tax_code + + " (tree is null or empty and code is not in map)" ); + } + } + return _TAXCODE_TAXGROUP_MAP.get( tax_code ); } public static void performDomainArchitectureAnalysis( final SortedMap> domain_architecutures, @@ -1117,10 +1494,229 @@ public final class SurfacingUtil { p.setRooted( true ); } + public static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree, + final String[][] input_file_properties ) { + final String[] genomes = new String[ input_file_properties.length ]; + for( int i = 0; i < input_file_properties.length; ++i ) { + if ( intree.getNodes( input_file_properties[ i ][ 1 ] ).size() > 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_properties[ i ][ 1 ] + + "] is not unique in input tree " + intree.getName() ); + } + genomes[ i ] = input_file_properties[ i ][ 1 ]; + } + // + final PhylogenyNodeIterator it = intree.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( ForesterUtil.isEmpty( n.getName() ) ) { + if ( n.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getTaxonomyCode() ) ) { + n.setName( n.getNodeData().getTaxonomy().getTaxonomyCode() ); + } + else if ( n.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { + n.setName( n.getNodeData().getTaxonomy().getScientificName() ); + } + else if ( n.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getCommonName() ) ) { + n.setName( n.getNodeData().getTaxonomy().getCommonName() ); + } + else { + ForesterUtil + .fatalError( surfacing.PRG_NAME, + "node with no name, scientific name, common name, or taxonomy code present" ); + } + } + } + // + final List igns = PhylogenyMethods.deleteExternalNodesPositiveSelection( genomes, intree ); + if ( igns.size() > 0 ) { + System.out.println( "Not using the following " + igns.size() + " nodes:" ); + for( int i = 0; i < igns.size(); ++i ) { + System.out.println( " " + i + ": " + igns.get( i ) ); + } + System.out.println( "--" ); + } + for( final String[] input_file_propertie : input_file_properties ) { + try { + intree.getNode( input_file_propertie[ 1 ] ); + } + catch ( final IllegalArgumentException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_propertie[ 1 ] + + "] not present/not unique in input tree" ); + } + } + } + + public static void printOutPercentageOfMultidomainProteins( final SortedMap all_genomes_domains_per_potein_histo, + final Writer log_writer ) { + int sum = 0; + for( final Entry entry : all_genomes_domains_per_potein_histo.entrySet() ) { + sum += entry.getValue(); + } + final double percentage = ( 100.0 * ( sum - all_genomes_domains_per_potein_histo.get( 1 ) ) ) / sum; + ForesterUtil.programMessage( surfacing.PRG_NAME, "Percentage of multidomain proteins: " + percentage + "%" ); + log( "Percentage of multidomain proteins: : " + percentage + "%", log_writer ); + } + + public static void processFilter( final File filter_file, final SortedSet filter ) { + SortedSet filter_str = null; + try { + filter_str = ForesterUtil.file2set( filter_file ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + if ( filter_str != null ) { + for( final String string : filter_str ) { + filter.add( string ); + } + } + if ( surfacing.VERBOSE ) { + System.out.println( "Filter:" ); + for( final String domainId : filter ) { + System.out.println( domainId ); + } + } + } + + public static String[][] processInputGenomesFile( final File input_genomes ) { + String[][] input_file_properties = null; + try { + input_file_properties = ForesterUtil.file22dArray( input_genomes ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "genomes files is to be in the following format \" \": " + + e.getLocalizedMessage() ); + } + final Set specs = new HashSet(); + final Set paths = new HashSet(); + for( int i = 0; i < input_file_properties.length; ++i ) { + if ( !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( input_file_properties[ i ][ 1 ] ).matches() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for species code: " + + input_file_properties[ i ][ 1 ] ); + } + if ( specs.contains( input_file_properties[ i ][ 1 ] ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "species code " + input_file_properties[ i ][ 1 ] + + " is not unique" ); + } + specs.add( input_file_properties[ i ][ 1 ] ); + if ( paths.contains( input_file_properties[ i ][ 0 ] ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "path " + input_file_properties[ i ][ 0 ] + + " is not unique" ); + } + paths.add( input_file_properties[ i ][ 0 ] ); + final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + } + } + return input_file_properties; + } + + public static void processPlusMinusAnalysisOption( final CommandLineArguments cla, + final List high_copy_base, + final List high_copy_target, + final List low_copy, + final List numbers ) { + if ( cla.isOptionSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for 'plus-minus' file: -" + + surfacing.PLUS_MINUS_ANALYSIS_OPTION + "=" ); + } + final File plus_minus_file = new File( cla.getOptionValue( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( plus_minus_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + plus_minus_file + "\": " + msg ); + } + processPlusMinusFile( plus_minus_file, high_copy_base, high_copy_target, low_copy, numbers ); + } + } + + // First numbers is minimal difference, second is factor. + public static void processPlusMinusFile( final File plus_minus_file, + final List high_copy_base, + final List high_copy_target, + final List low_copy, + final List numbers ) { + Set species_set = null; + int min_diff = surfacing.PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT; + double factor = surfacing.PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT; + try { + species_set = ForesterUtil.file2set( plus_minus_file ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + if ( species_set != null ) { + for( final String species : species_set ) { + final String species_trimmed = species.substring( 1 ); + if ( species.startsWith( "+" ) ) { + if ( low_copy.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "species/genome names can not appear with both '+' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + high_copy_base.add( species_trimmed ); + } + else if ( species.startsWith( "*" ) ) { + if ( low_copy.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "species/genome names can not appear with both '*' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + high_copy_target.add( species_trimmed ); + } + else if ( species.startsWith( "-" ) ) { + if ( high_copy_base.contains( species_trimmed ) || high_copy_target.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "species/genome names can not appear with both '+' or '*' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + low_copy.add( species_trimmed ); + } + else if ( species.startsWith( "$D" ) ) { + try { + min_diff = Integer.parseInt( species.substring( 3 ) ); + } + catch ( final NumberFormatException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "could not parse integer value for minimal difference from: \"" + + species.substring( 3 ) + "\"" ); + } + } + else if ( species.startsWith( "$F" ) ) { + try { + factor = Double.parseDouble( species.substring( 3 ) ); + } + catch ( final NumberFormatException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "could not parse double value for factor from: \"" + + species.substring( 3 ) + "\"" ); + } + } + else if ( species.startsWith( "#" ) ) { + // Comment, ignore. + } + else { + ForesterUtil + .fatalError( surfacing.PRG_NAME, + "species/genome names in 'plus minus' file must begin with '*' (high copy target genome), '+' (high copy base genomes), '-' (low copy genomes), '$D=' minimal Difference (default is 1), '$F=' factor (default is 1.0), double), or '#' (ignore) suffix, encountered: \"" + + species + "\"" ); + } + numbers.add( new Integer( min_diff + "" ) ); + numbers.add( new Double( factor + "" ) ); + } + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "'plus minus' file [" + plus_minus_file + "] appears empty" ); + } + } + /* * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value - * - * + * + * */ static public StringBuffer proteinToDomainCombinations( final Protein protein, final String protein_id, @@ -1136,7 +1732,7 @@ public final class SurfacingUtil { if ( domains.size() > 1 ) { final Map counts = new HashMap(); for( final Domain domain : domains ) { - final String id = domain.getDomainId().getId(); + final String id = domain.getDomainId(); if ( counts.containsKey( id ) ) { counts.put( id, counts.get( id ) + 1 ); } @@ -1153,24 +1749,24 @@ public final class SurfacingUtil { domain_n = domains.get( j ); domain_c = domains.get( i ); } - final String dc = domain_n.getDomainId().getId() + domain_c.getDomainId().getId(); + final String dc = domain_n.getDomainId() + domain_c.getDomainId(); if ( !dcs.contains( dc ) ) { dcs.add( dc ); sb.append( protein.getSpecies() ); sb.append( separator ); sb.append( protein_id ); sb.append( separator ); - sb.append( domain_n.getDomainId().getId() ); + sb.append( domain_n.getDomainId() ); sb.append( separator ); - sb.append( domain_c.getDomainId().getId() ); + sb.append( domain_c.getDomainId() ); sb.append( separator ); sb.append( domain_n.getPerDomainEvalue() ); sb.append( separator ); sb.append( domain_c.getPerDomainEvalue() ); sb.append( separator ); - sb.append( counts.get( domain_n.getDomainId().getId() ) ); + sb.append( counts.get( domain_n.getDomainId() ) ); sb.append( separator ); - sb.append( counts.get( domain_c.getDomainId().getId() ) ); + sb.append( counts.get( domain_c.getDomainId() ) ); sb.append( ForesterUtil.LINE_SEPARATOR ); } } @@ -1181,7 +1777,7 @@ public final class SurfacingUtil { sb.append( separator ); sb.append( protein_id ); sb.append( separator ); - sb.append( domains.get( 0 ).getDomainId().getId() ); + sb.append( domains.get( 0 ).getDomainId() ); sb.append( separator ); sb.append( separator ); sb.append( domains.get( 0 ).getPerDomainEvalue() ); @@ -1206,55 +1802,6 @@ public final class SurfacingUtil { return sb; } - /** - * - * Example regarding engulfment: ------------0.1 ----------0.2 --0.3 => - * domain with 0.3 is ignored - * - * -----------0.1 ----------0.2 --0.3 => domain with 0.3 is ignored - * - * - * ------------0.1 ----------0.3 --0.2 => domains with 0.3 and 0.2 are _not_ - * ignored - * - * @param max_allowed_overlap - * maximal allowed overlap (inclusive) to be still considered not - * overlapping (zero or negative value to allow any overlap) - * @param remove_engulfed_domains - * to remove domains which are completely engulfed by coverage of - * domains with better support - * @param protein - * @return - */ - public static Protein removeOverlappingDomains( final int max_allowed_overlap, - final boolean remove_engulfed_domains, - final Protein protein ) { - final Protein pruned_protein = new BasicProtein( protein.getProteinId().getId(), protein.getSpecies() - .getSpeciesId(), protein.getLength() ); - final List sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein ); - final List covered_positions = new ArrayList(); - for( final Domain domain : sorted ) { - if ( ( ( max_allowed_overlap < 0 ) || ( SurfacingUtil.calculateOverlap( domain, covered_positions ) <= max_allowed_overlap ) ) - && ( !remove_engulfed_domains || !isEngulfed( domain, covered_positions ) ) ) { - final int covered_positions_size = covered_positions.size(); - for( int i = covered_positions_size; i < domain.getFrom(); ++i ) { - covered_positions.add( false ); - } - final int new_covered_positions_size = covered_positions.size(); - for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { - if ( i < new_covered_positions_size ) { - covered_positions.set( i, true ); - } - else { - covered_positions.add( true ); - } - } - pruned_protein.addProteinDomain( domain ); - } - } - return pruned_protein; - } - public static List sortDomainsWithAscendingConfidenceValues( final Protein protein ) { final List domains = new ArrayList(); for( final Domain d : protein.getProteinDomains() ) { @@ -1304,7 +1851,7 @@ public final class SurfacingUtil { final SortedSet domains = collectAllDomainsChangedOnSubtree( node, get_gains ); if ( domains.size() > 0 ) { final Writer writer = ForesterUtil.createBufferedWriter( base_dir + ForesterUtil.FILE_SEPARATOR - + node.getName() + suffix_for_filename ); + + node.getName() + suffix_for_filename ); for( final String domain : domains ) { writer.write( domain ); writer.write( ForesterUtil.LINE_SEPARATOR ); @@ -1331,7 +1878,7 @@ public final class SurfacingUtil { final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) ); for( final BinaryDomainCombination bdc : binary_combinations ) { out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null ) - .toString() ); + .toString() ); out_dot.write( SurfacingConstants.NL ); } out_dot.close(); @@ -1339,9 +1886,16 @@ public final class SurfacingUtil { catch ( final IOException e ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \"" - + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " - + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" ); + if ( input_file_properties[ i ].length == 3 ) { + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " + + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" ); + } + else { + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \"" + + dc_outfile_dot + "\"" ); + } } public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, @@ -1413,7 +1967,7 @@ public final class SurfacingUtil { .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) { BinaryDomainCombination bdc = null; try { - bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) ); + bdc = BasicBinaryDomainCombination.obtainInstance( matrix.getCharacter( c ) ); } catch ( final Exception e ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); @@ -1432,7 +1986,7 @@ public final class SurfacingUtil { ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" ); } - public static void writeBinaryStatesMatrixToList( final Map> domain_id_to_go_ids_map, + public static void writeBinaryStatesMatrixToList( final Map> domain_id_to_go_ids_map, final Map go_id_to_term_map, final GoNameSpace go_namespace_limit, final boolean domain_combinations, @@ -1443,7 +1997,7 @@ public final class SurfacingUtil { final String character_separator, final String title_for_html, final String prefix_for_html, - final Map>[] domain_id_to_secondary_features_maps, + final Map>[] domain_id_to_secondary_features_maps, final SortedSet all_pfams_encountered, final SortedSet pfams_gained_or_lost, final String suffix_for_per_node_events_file, @@ -1474,7 +2028,7 @@ public final class SurfacingUtil { int per_node_counter = 0; out.write( "" ); out.write( SurfacingConstants.NL ); - addHtmlHead( out, title_for_html ); + writeHtmlHead( out, title_for_html ); out.write( SurfacingConstants.NL ); out.write( "" ); out.write( SurfacingConstants.NL ); @@ -1530,7 +2084,7 @@ public final class SurfacingUtil { per_node_counter = 0; if ( matrix.getNumberOfCharacters() > 0 ) { per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file ); + + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file ); SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile ); per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile ); @@ -1599,7 +2153,7 @@ public final class SurfacingUtil { out.write( SurfacingConstants.NL ); out.write( "
" ); out.write( SurfacingConstants.NL ); - } // for( final String id : sorted_ids ) { + } // for( final String id : sorted_ids ) { out.write( "" ); out.write( SurfacingConstants.NL ); out.write( "" ); @@ -1642,15 +2196,15 @@ public final class SurfacingUtil { } else { per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats - .sampleStandardDeviation() ) + "\t" ); + .sampleStandardDeviation() ) + "\t" ); } per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" ); per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" ); per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMax() + "\t" ); per_genome_domain_promiscuity_statistics_writer.write( stats.getN() + "\t" ); - final SortedSet mpds = gwcd.getMostPromiscuosDomain(); - for( final DomainId mpd : mpds ) { - per_genome_domain_promiscuity_statistics_writer.write( mpd.getId() + " " ); + final SortedSet mpds = gwcd.getMostPromiscuosDomain(); + for( final String mpd : mpds ) { + per_genome_domain_promiscuity_statistics_writer.write( mpd + " " ); } per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR ); } @@ -1669,127 +2223,20 @@ public final class SurfacingUtil { } } - public static DescriptiveStatistics writeDomainSimilaritiesToFile( final StringBuilder html_desc, - final StringBuilder html_title, - final Writer single_writer, - Map split_writers, - final SortedSet similarities, - final boolean treat_as_binary, - final List species_order, - final PrintableDomainSimilarity.PRINT_OPTION print_option, - final DomainSimilarity.DomainSimilaritySortField sort_field, - final DomainSimilarity.DomainSimilarityScoring scoring, - final boolean verbose, - final Map tax_code_to_id_map ) - throws IOException { - final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - String histogram_title = null; - switch ( sort_field ) { - case ABS_MAX_COUNTS_DIFFERENCE: - if ( treat_as_binary ) { - histogram_title = "absolute counts difference:"; - } - else { - histogram_title = "absolute (maximal) counts difference:"; - } - break; - case MAX_COUNTS_DIFFERENCE: - if ( treat_as_binary ) { - histogram_title = "counts difference:"; - } - else { - histogram_title = "(maximal) counts difference:"; - } - break; - case DOMAIN_ID: - histogram_title = "score mean:"; - break; - case MIN: - histogram_title = "score minimum:"; - break; - case MAX: - histogram_title = "score maximum:"; - break; - case MAX_DIFFERENCE: - if ( treat_as_binary ) { - histogram_title = "difference:"; - } - else { - histogram_title = "(maximal) difference:"; - } - break; - case MEAN: - histogram_title = "score mean:"; - break; - case SD: - histogram_title = "score standard deviation:"; - break; - case SPECIES_COUNT: - histogram_title = "species number:"; - break; - default: - throw new AssertionError( "Unknown sort field: " + sort_field ); - } - for( final DomainSimilarity similarity : similarities ) { - switch ( sort_field ) { - case ABS_MAX_COUNTS_DIFFERENCE: - stats.addValue( Math.abs( similarity.getMaximalDifferenceInCounts() ) ); - break; - case MAX_COUNTS_DIFFERENCE: - stats.addValue( similarity.getMaximalDifferenceInCounts() ); - break; - case DOMAIN_ID: - stats.addValue( similarity.getMeanSimilarityScore() ); - break; - case MIN: - stats.addValue( similarity.getMinimalSimilarityScore() ); - break; - case MAX: - stats.addValue( similarity.getMaximalSimilarityScore() ); - break; - case MAX_DIFFERENCE: - stats.addValue( similarity.getMaximalDifference() ); - break; - case MEAN: - stats.addValue( similarity.getMeanSimilarityScore() ); - break; - case SD: - stats.addValue( similarity.getStandardDeviationOfSimilarityScore() ); - break; - case SPECIES_COUNT: - stats.addValue( similarity.getSpecies().size() ); - break; - default: - throw new AssertionError( "Unknown sort field: " + sort_field ); - } - } - AsciiHistogram histo = null; - if ( stats.getMin() < stats.getMin() ) { - histo = new AsciiHistogram( stats, histogram_title ); - } - if ( verbose ) { - if ( histo != null ) { - System.out.println( histo.toStringBuffer( 20, '|', 40, 5 ) ); - } - System.out.println(); - System.out.println( "N : " + stats.getN() ); - System.out.println( "Min : " + stats.getMin() ); - System.out.println( "Max : " + stats.getMax() ); - System.out.println( "Mean : " + stats.arithmeticMean() ); - if ( stats.getN() > 1 ) { - System.out.println( "SD : " + stats.sampleStandardDeviation() ); - } - else { - System.out.println( "SD : n/a" ); - } - System.out.println( "Median : " + stats.median() ); - if ( stats.getN() > 1 ) { - System.out.println( "Pearsonian skewness : " + stats.pearsonianSkewness() ); - } - else { - System.out.println( "Pearsonian skewness : n/a" ); - } - } + public static void writeDomainSimilaritiesToFile( final StringBuilder html_desc, + final StringBuilder html_title, + final Writer simple_tab_writer, + final Writer single_writer, + Map split_writers, + final SortedSet similarities, + final boolean treat_as_binary, + final List species_order, + final DomainSimilarity.PRINT_OPTION print_option, + final DomainSimilarity.DomainSimilarityScoring scoring, + final boolean verbose, + final Map tax_code_to_id_map, + final Phylogeny phy, + final Set pos_filter_doms ) throws IOException { if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) { split_writers = new HashMap(); split_writers.put( '_', single_writer ); @@ -1803,10 +2250,10 @@ public final class SurfacingUtil { w.write( "" ); w.write( SurfacingConstants.NL ); if ( key != '_' ) { - addHtmlHead( w, "DCs (" + html_title + ") " + key.toString().toUpperCase() ); + writeHtmlHead( w, "DC analysis (" + html_title + ") " + key.toString().toUpperCase() ); } else { - addHtmlHead( w, "DCs (" + html_title + ")" ); + writeHtmlHead( w, "DC analysis (" + html_title + ")" ); } w.write( SurfacingConstants.NL ); w.write( "" ); @@ -1814,75 +2261,123 @@ public final class SurfacingUtil { w.write( html_desc.toString() ); w.write( SurfacingConstants.NL ); w.write( "
" ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" );
-                    w.write( SurfacingConstants.NL );
-                    if ( histo != null ) {
-                        w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
-                        w.write( SurfacingConstants.NL );
-                    }
-                    w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - if ( stats.getN() > 1 ) { - w.write( "" ); - } - else { - w.write( "" ); - } - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - if ( stats.getN() > 1 ) { - w.write( "" ); - } - else { - w.write( "" ); - } - w.write( SurfacingConstants.NL ); - w.write( "
N: " + stats.getN() + "
Min: " + stats.getMin() + "
Max: " + stats.getMax() + "
Mean: " + stats.arithmeticMean() + "
SD: " + stats.sampleStandardDeviation() + "
SD: n/a
Median: " + stats.median() + "
Pearsonian skewness: " + stats.pearsonianSkewness() + "
Pearsonian skewness: n/a
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); w.write( SurfacingConstants.NL ); w.write( "
" ); w.write( SurfacingConstants.NL ); w.write( "" ); w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); } break; } + // + for( final DomainSimilarity similarity : similarities ) { + if ( ( species_order != null ) && !species_order.isEmpty() ) { + ( similarity ).setSpeciesOrder( species_order ); + } + if ( single_writer != null ) { + if ( !ForesterUtil.isEmpty( pos_filter_doms ) && pos_filter_doms.contains( similarity.getDomainId() ) ) { + single_writer.write( "" ); + } + else { + single_writer.write( "" ); + } + single_writer.write( SurfacingConstants.NL ); + } + else { + Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase() + .charAt( 0 ) ); + if ( local_writer == null ) { + local_writer = split_writers.get( '0' ); + } + if ( !ForesterUtil.isEmpty( pos_filter_doms ) && pos_filter_doms.contains( similarity.getDomainId() ) ) { + local_writer.write( "" ); + } + else { + local_writer.write( "" ); + } + local_writer.write( SurfacingConstants.NL ); + } + } for( final Writer w : split_writers.values() ) { + w.write( "
Domains:
" + similarity.getDomainId() + + "
" + + similarity.getDomainId() + "
" + similarity.getDomainId() + + "
" + + similarity.getDomainId() + "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + // + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + writeColorLabels( "Deuterostomia", TaxonomyColors.DEUTEROSTOMIA_COLOR, w ); + writeColorLabels( "Protostomia", TaxonomyColors.PROTOSTOMIA_COLOR, w ); + writeColorLabels( "Cnidaria", TaxonomyColors.CNIDARIA_COLOR, w ); + writeColorLabels( "Placozoa", TaxonomyColors.PLACOZOA_COLOR, w ); + writeColorLabels( "Ctenophora (comb jellies)", TaxonomyColors.CTENOPHORA_COLOR, w ); + writeColorLabels( "Porifera (sponges)", TaxonomyColors.PORIFERA_COLOR, w ); + writeColorLabels( "Choanoflagellida", TaxonomyColors.CHOANOFLAGELLIDA, w ); + writeColorLabels( "Ichthyosporea & Filasterea", TaxonomyColors.ICHTHYOSPOREA_AND_FILASTEREA, w ); + writeColorLabels( "Dikarya (Ascomycota & Basidiomycota, so-called \"higher fungi\")", + TaxonomyColors.DIKARYA_COLOR, + w ); + writeColorLabels( "other Fungi", TaxonomyColors.OTHER_FUNGI_COLOR, w ); + writeColorLabels( "Nucleariidae and Fonticula group", + TaxonomyColors.NUCLEARIIDAE_AND_FONTICULA_GROUP_COLOR, + w ); + writeColorLabels( "Amoebozoa", TaxonomyColors.AMOEBOZOA_COLOR, w ); + writeColorLabels( "Embryophyta (plants)", TaxonomyColors.EMBRYOPHYTA_COLOR, w ); + writeColorLabels( "Chlorophyta (green algae)", TaxonomyColors.CHLOROPHYTA_COLOR, w ); + writeColorLabels( "Rhodophyta (red algae)", TaxonomyColors.RHODOPHYTA_COLOR, w ); + writeColorLabels( "Glaucocystophyce (Glaucophyta)", TaxonomyColors.GLAUCOPHYTA_COLOR, w ); + writeColorLabels( "Hacrobia (Cryptophyta & Haptophyceae & Centroheliozoa)", + TaxonomyColors.HACROBIA_COLOR, + w ); + writeColorLabels( "Stramenopiles (Chromophyta, heterokonts)", TaxonomyColors.STRAMENOPILES_COLOR, w ); + writeColorLabels( "Alveolata", TaxonomyColors.ALVEOLATA_COLOR, w ); + writeColorLabels( "Rhizaria", TaxonomyColors.RHIZARIA_COLOR, w ); + writeColorLabels( "Excavata", TaxonomyColors.EXCAVATA_COLOR, w ); + writeColorLabels( "Apusozoa", TaxonomyColors.APUSOZOA_COLOR, w ); + writeColorLabels( "Archaea", TaxonomyColors.ARCHAEA_COLOR, w ); + writeColorLabels( "Bacteria", TaxonomyColors.BACTERIA_COLOR, w ); + w.write( "
" ); + w.write( "Species group colors:" ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + // + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); w.write( SurfacingConstants.NL ); } + // for( final DomainSimilarity similarity : similarities ) { if ( ( species_order != null ) && !species_order.isEmpty() ) { - ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); + ( similarity ).setSpeciesOrder( species_order ); + } + if ( simple_tab_writer != null ) { + simple_tab_writer.write( similarity.toStringBuffer( PRINT_OPTION.SIMPLE_TAB_DELIMITED, + tax_code_to_id_map, + null ).toString() ); } if ( single_writer != null ) { - single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map ).toString() ); + single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() ); + single_writer.write( SurfacingConstants.NL ); } else { - Writer local_writer = split_writers.get( ( similarity.getDomainId().getId().charAt( 0 ) + "" ) - .toLowerCase().charAt( 0 ) ); + Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase() + .charAt( 0 ) ); if ( local_writer == null ) { local_writer = split_writers.get( '0' ); } - local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map ).toString() ); - } - for( final Writer w : split_writers.values() ) { - w.write( SurfacingConstants.NL ); + local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() ); + local_writer.write( SurfacingConstants.NL ); } } switch ( print_option ) { @@ -1899,11 +2394,60 @@ public final class SurfacingUtil { w.write( SurfacingConstants.NL ); } break; + default: + break; } for( final Writer w : split_writers.values() ) { w.close(); } - return stats; + } + + public static void writeHtmlHead( final Writer w, final String title ) throws IOException { + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( "" ); + w.write( title ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); } public static void writeMatrixToFile( final CharacterStateMatrix matrix, @@ -1952,6 +2496,57 @@ public final class SurfacingUtil { ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" ); } + public static void writePresentToNexus( final File output_file, + final File positive_filter_file, + final SortedSet filter, + final List gwcd_list ) { + try { + writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfDomainPresenceOrAbsence( gwcd_list, + positive_filter_file == null ? null + : filter ), + output_file + surfacing.DOMAINS_PRESENT_NEXUS, + Format.NEXUS_BINARY ); + writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ), + output_file + surfacing.BDC_PRESENT_NEXUS, + Format.NEXUS_BINARY ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + } + + public static void writeProteinListsForAllSpecies( final File output_dir, + final SortedMap> protein_lists_per_species, + final List gwcd_list, + final double domain_e_cutoff, + final Set pos_filter_doms ) { + final SortedSet all_domains = new TreeSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + all_domains.addAll( gwcd.getAllDomainIds() ); + } + for( final String domain : all_domains ) { + if ( !ForesterUtil.isEmpty( pos_filter_doms ) && !pos_filter_doms.contains( domain ) ) { + continue; + } + final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + surfacing.SEQ_EXTRACT_SUFFIX ); + checkForOutputFileWriteability( out ); + try { + final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) ); + extractProteinNames( protein_lists_per_species, + domain, + proteins_file_writer, + "\t", + surfacing.LIMIT_SPEC_FOR_PROT_EX, + domain_e_cutoff ); + proteins_file_writer.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote proteins list to \"" + out + "\"" ); + } + } + public static void writeTaxonomyLinks( final Writer writer, final String species, final Map tax_code_to_id_map ) throws IOException { @@ -1959,17 +2554,17 @@ public final class SurfacingUtil { writer.write( " [" ); if ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( species ) ) { writer.write( "uniprot" ); + + tax_code_to_id_map.get( species ) + "\" target=\"taxonomy_window\">uniprot" ); } else { writer.write( "eol" ); + + "\" target=\"taxonomy_window\">eol" ); writer.write( "|" ); writer.write( "scholar" ); + + "\" target=\"taxonomy_window\">scholar" ); writer.write( "|" ); writer.write( "google" ); + + "\" target=\"taxonomy_window\">google" ); } writer.write( "]" ); } @@ -2046,9 +2641,9 @@ public final class SurfacingUtil { final SortedMap> domain_lists_go_unique = new TreeMap>(); final Set dcs = dc_gain_counts.keySet(); final SortedSet more_than_once = new TreeSet(); - final DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics(); - final DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics(); - final DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics(); + DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics(); + DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics(); + DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics(); final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics(); long gained_multiple_times_domain_length_sum = 0; long gained_once_domain_length_sum = 0; @@ -2077,14 +2672,14 @@ public final class SurfacingUtil { dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() ); } dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc ) - .arithmeticMean() ); + .arithmeticMean() ); } if ( domain_number_stats_by_dc != null ) { if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) { dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() ); } dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc ) - .arithmeticMean() ); + .arithmeticMean() ); } if ( domain_length_stats_by_domain != null ) { if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) { @@ -2092,9 +2687,9 @@ public final class SurfacingUtil { } final String[] ds = dc.split( "=" ); dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain - .get( ds[ 0 ] ).arithmeticMean() ); + .get( ds[ 0 ] ).arithmeticMean() ); dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain - .get( ds[ 1 ] ).arithmeticMean() ); + .get( ds[ 1 ] ).arithmeticMean() ); } if ( count > 1 ) { more_than_once.add( dc ); @@ -2282,16 +2877,19 @@ public final class SurfacingUtil { w.write( "Gained once, protein lengths:" ); w.write( "\n" ); w.write( gained_once_lengths_stats.toString() ); + gained_once_lengths_stats = null; w.write( "\n" ); w.write( "\n" ); w.write( "Gained once, domain counts:" ); w.write( "\n" ); w.write( gained_once_domain_count_stats.toString() ); + gained_once_domain_count_stats = null; w.write( "\n" ); w.write( "\n" ); w.write( "Gained multiple times, protein lengths:" ); w.write( "\n" ); w.write( gained_multiple_times_lengths_stats.toString() ); + gained_multiple_times_lengths_stats = null; w.write( "\n" ); w.write( "\n" ); w.write( "Gained multiple times, domain counts:" ); @@ -2336,32 +2934,32 @@ public final class SurfacingUtil { final CharacterStateMatrix.GainLossStates state, final String outfile ) { File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent() - + ForesterUtil.FILE_SEPARATOR + base_dir ); + + ForesterUtil.FILE_SEPARATOR + base_dir ); if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); } if ( domain_combinations ) { per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "DC" ); + + ForesterUtil.FILE_SEPARATOR + "DC" ); } else { per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "DOMAINS" ); + + ForesterUtil.FILE_SEPARATOR + "DOMAINS" ); } if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); } if ( state == GainLossStates.GAIN ) { per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "GAINS" ); + + ForesterUtil.FILE_SEPARATOR + "GAINS" ); } else if ( state == GainLossStates.LOSS ) { per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "LOSSES" ); + + ForesterUtil.FILE_SEPARATOR + "LOSSES" ); } else { per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "PRESENT" ); + + ForesterUtil.FILE_SEPARATOR + "PRESENT" ); } if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); @@ -2370,15 +2968,51 @@ public final class SurfacingUtil { } private static SortedSet createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) { - final SortedMap cds = gwcd.getAllCombinableDomainsIds(); + final SortedMap cds = gwcd.getAllCombinableDomainsIds(); final SortedSet binary_combinations = new TreeSet(); - for( final DomainId domain_id : cds.keySet() ) { + for( final String domain_id : cds.keySet() ) { final CombinableDomains cd = cds.get( domain_id ); binary_combinations.addAll( cd.toBinaryDomainCombinations() ); } return binary_combinations; } + private static void printSomeStats( final DescriptiveStatistics stats, final AsciiHistogram histo, final Writer w ) + throws IOException { + w.write( "
" ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" );
+        w.write( SurfacingConstants.NL );
+        if ( histo != null ) {
+            w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
+            w.write( SurfacingConstants.NL );
+        }
+        w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( stats.getN() > 1 ) { + w.write( "" ); + } + else { + w.write( "" ); + } + w.write( SurfacingConstants.NL ); + w.write( "
N: " + stats.getN() + "
Min: " + stats.getMin() + "
Max: " + stats.getMax() + "
Mean: " + stats.arithmeticMean() + "
SD: " + stats.sampleStandardDeviation() + "
SD: n/a
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + } + private static List splitDomainCombination( final String dc ) { final String[] s = dc.split( "=" ); if ( s.length != 2 ) { @@ -2392,13 +3026,13 @@ public final class SurfacingUtil { return l; } - private static void writeAllEncounteredPfamsToFile( final Map> domain_id_to_go_ids_map, + private static void writeAllEncounteredPfamsToFile( final Map> domain_id_to_go_ids_map, final Map go_id_to_term_map, final String outfile_name, final SortedSet all_pfams_encountered ) { final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX ); final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name - + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX ); + + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX ); final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX ); int biological_process_counter = 0; int cellular_component_counter = 0; @@ -2420,7 +3054,7 @@ public final class SurfacingUtil { for( final String pfam : all_pfams_encountered ) { all_pfams_encountered_writer.write( pfam ); all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR ); - final DomainId domain_id = new DomainId( pfam ); + final String domain_id = new String( pfam ); if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { ++pfams_with_mappings_counter; all_pfams_encountered_with_go_annotation_writer.write( pfam ); @@ -2467,10 +3101,10 @@ public final class SurfacingUtil { all_pfams_encountered_writer.close(); all_pfams_encountered_with_go_annotation_writer.close(); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size() - + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" ); + + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" ); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter - + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file - + "\"" ); + + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file + + "\"" ); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all [" + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \"" + encountered_pfams_summary_file + "\"" ); @@ -2501,7 +3135,7 @@ public final class SurfacingUtil { summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() ); summary_writer.write( ForesterUtil.LINE_SEPARATOR ); summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter - + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); summary_writer.write( ForesterUtil.LINE_SEPARATOR ); summary_writer.write( "# Pfams without mapping to proc. or func. : " + pfams_without_mappings_to_bp_or_mf_counter + " [" @@ -2530,7 +3164,16 @@ public final class SurfacingUtil { } } - private static void writeDomainData( final Map> domain_id_to_go_ids_map, + private final static void writeColorLabels( final String l, final Color c, final Writer w ) throws IOException { + w.write( "" ); + w.write( l ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + + private static void writeDomainData( final Map> domain_id_to_go_ids_map, final Map go_id_to_term_map, final GoNameSpace go_namespace_limit, final Writer out, @@ -2538,7 +3181,7 @@ public final class SurfacingUtil { final String domain_1, final String prefix_for_html, final String character_separator_for_non_html_output, - final Map>[] domain_id_to_secondary_features_maps, + final Map>[] domain_id_to_secondary_features_maps, final Set all_go_ids ) throws IOException { boolean any_go_annotation_present = false; boolean first_has_no_go = false; @@ -2546,27 +3189,25 @@ public final class SurfacingUtil { if ( ForesterUtil.isEmpty( domain_1 ) ) { domain_count = 1; } - // The following has a difficult to understand logic. + // The following has a difficult to understand logic. for( int d = 0; d < domain_count; ++d ) { List go_ids = null; boolean go_annotation_present = false; if ( d == 0 ) { - final DomainId domain_id = new DomainId( domain_0 ); - if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { + if ( domain_id_to_go_ids_map.containsKey( domain_0 ) ) { go_annotation_present = true; any_go_annotation_present = true; - go_ids = domain_id_to_go_ids_map.get( domain_id ); + go_ids = domain_id_to_go_ids_map.get( domain_0 ); } else { first_has_no_go = true; } } else { - final DomainId domain_id = new DomainId( domain_1 ); - if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { + if ( domain_id_to_go_ids_map.containsKey( domain_1 ) ) { go_annotation_present = true; any_go_annotation_present = true; - go_ids = domain_id_to_go_ids_map.get( domain_id ); + go_ids = domain_id_to_go_ids_map.get( domain_1 ); } } if ( go_annotation_present ) { @@ -2593,7 +3234,7 @@ public final class SurfacingUtil { final String go_id_str = go_id.getId(); out.write( "" ); out.write( "" + go_id_str + "" ); + + "\" target=\"amigo_window\">" + go_id_str + "" ); out.write( "" ); out.write( go_term.getName() ); if ( domain_count == 2 ) { @@ -2621,7 +3262,7 @@ public final class SurfacingUtil { out.write( SurfacingConstants.NL ); } } - } // for( int d = 0; d < domain_count; ++d ) + } // for( int d = 0; d < domain_count; ++d ) if ( !any_go_annotation_present ) { out.write( "" ); writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps ); @@ -2639,8 +3280,8 @@ public final class SurfacingUtil { final String domain_0, final String domain_1, final String prefix_for_detailed_html, - final Map>[] domain_id_to_secondary_features_maps ) - throws IOException { + final Map>[] domain_id_to_secondary_features_maps ) + throws IOException { out.write( "" ); if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) { out.write( prefix_for_detailed_html ); @@ -2670,7 +3311,7 @@ public final class SurfacingUtil { } writer.close(); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote " + pfams.size() + " pfams to [" + outfile_name - + "]" ); + + "]" ); } catch ( final IOException e ) { ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); @@ -2682,7 +3323,7 @@ public final class SurfacingUtil { final Phylogeny phylogeny ) { if ( !( matrix instanceof BasicCharacterStateMatrix ) ) { throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class - + "] to nexus" ); + + "] to nexus" ); } final BasicCharacterStateMatrix my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix ) matrix; final List phylogenies = new ArrayList( 1 );