" ); - w.write( title );

X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsurfacing%2FSurfacingUtil.java;h=73544b7f96b42024ad4ae955e70d9b5b1d1496c1;hb=c914fef4ec7f50e4cf01375a30207992c84659a3;hp=1cc2c7a0c4da0718edc106d48c4219ba93dbb410;hpb=cec76926e7d634373e238e61b805c723ef4c0ca7;p=jalview.git diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index 1cc2c7a..73544b7 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -22,10 +22,11 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.surfacing; +import java.awt.Color; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; @@ -39,6 +40,7 @@ import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -65,57 +67,60 @@ import org.forester.go.GoNameSpace; import org.forester.go.GoTerm; import org.forester.go.PfamToGoMapping; import org.forester.io.parsers.nexus.NexusConstants; +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; +import org.forester.io.parsers.util.ParserUtils; import org.forester.io.writers.PhylogenyWriter; -import org.forester.msa.MsaCompactor.SORT_BY; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; -import org.forester.phylogeny.PhylogenyNodeI.NH_CONVERSION_SUPPORT_VALUE_STYLE; +import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE; import org.forester.phylogeny.data.BinaryCharacters; import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.protein.BasicDomain; import org.forester.protein.BasicProtein; import org.forester.protein.BinaryDomainCombination; import org.forester.protein.Domain; -import org.forester.protein.DomainId; import org.forester.protein.Protein; import org.forester.species.Species; import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; -import org.forester.surfacing.DomainSimilarityCalculator.GoAnnotationOutput; import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder; +import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION; import org.forester.util.AsciiHistogram; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.BasicTable; import org.forester.util.BasicTableParser; +import org.forester.util.CommandLineArguments; import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterUtil; +import org.forester.util.TaxonomyColors; public final class SurfacingUtil { - private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" ); - private final static NumberFormat FORMATTER_3 = new DecimalFormat( "0.000" ); - private static final Comparator ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator() { - - @Override - public int compare( final Domain d1, - final Domain d2 ) { - if ( d1.getPerSequenceEvalue() < d2 - .getPerSequenceEvalue() ) { - return -1; - } - else if ( d1 - .getPerSequenceEvalue() > d2 - .getPerSequenceEvalue() ) { - return 1; - } - else { - return d1.compareTo( d2 ); - } - } - }; - public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" ); - private static final boolean USE_LAST = true; + public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" ); + private final static Map _TAXCODE_HEXCOLORSTRING_MAP = new HashMap(); + private static final Comparator ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator() { + + @Override + public int compare( final Domain d1, + final Domain d2 ) { + if ( d1.getPerSequenceEvalue() < d2 + .getPerSequenceEvalue() ) { + return -1; + } + else if ( d1 + .getPerSequenceEvalue() > d2 + .getPerSequenceEvalue() ) { + return 1; + } + else { + return d1.compareTo( d2 ); + } + } + }; + private final static NumberFormat FORMATTER_3 = new DecimalFormat( "0.000" ); private SurfacingUtil() { // Hidden constructor. @@ -123,49 +128,20 @@ public final class SurfacingUtil { public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome, final SortedSet binary_domain_combinations ) { - final SortedMap all_cd = genome.getAllCombinableDomainsIds(); - for( final DomainId domain_id : all_cd.keySet() ) { + final SortedMap all_cd = genome.getAllCombinableDomainsIds(); + for( final String domain_id : all_cd.keySet() ) { binary_domain_combinations.addAll( all_cd.get( domain_id ).toBinaryDomainCombinations() ); } } public static void addAllDomainIdsToSet( final GenomeWideCombinableDomains genome, - final SortedSet domain_ids ) { - final SortedSet domains = genome.getAllDomainIds(); - for( final DomainId domain : domains ) { + final SortedSet domain_ids ) { + final SortedSet domains = genome.getAllDomainIds(); + for( final String domain : domains ) { domain_ids.add( domain ); } } - public static void addHtmlHead( final Writer w, final String title ) throws IOException { - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( "" ); - w.write( title ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - } - public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set similarities ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( final DomainSimilarity similarity : similarities ) { @@ -174,508 +150,372 @@ public final class SurfacingUtil { return stats; } - private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l, - final String outfilename_for_counts, - final String outfilename_for_dc, - final String outfilename_for_dc_for_go_mapping, - final String outfilename_for_dc_for_go_mapping_unique, - final String outfilename_for_rank_counts, - final String outfilename_for_ancestor_species_counts, - final String outfilename_for_protein_stats, - final Map protein_length_stats_by_dc, - final Map domain_number_stats_by_dc, - final Map domain_length_stats_by_domain ) { - try { - // - // if ( protein_length_stats_by_dc != null ) { - // for( final Entry entry : protein_length_stats_by_dc.entrySet() ) { - // System.out.print( entry.getKey().toString() ); - // System.out.print( ": " ); - // double[] a = entry.getValue().getDataAsDoubleArray(); - // for( int i = 0; i < a.length; i++ ) { - // System.out.print( a[ i ] + " " ); - // } - // System.out.println(); - // } - // } - // if ( domain_number_stats_by_dc != null ) { - // for( final Entry entry : domain_number_stats_by_dc.entrySet() ) { - // System.out.print( entry.getKey().toString() ); - // System.out.print( ": " ); - // double[] a = entry.getValue().getDataAsDoubleArray(); - // for( int i = 0; i < a.length; i++ ) { - // System.out.print( a[ i ] + " " ); - // } - // System.out.println(); - // } - // } - // - final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); - final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); - final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) ); - final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) ); - final SortedMap dc_gain_counts = new TreeMap(); - for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) { - final PhylogenyNode n = it.next(); - final Set gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters(); - for( final String dc : gained_dc ) { - if ( dc_gain_counts.containsKey( dc ) ) { - dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 ); + public static void checkForOutputFileWriteability( final File outfile ) { + final String error = ForesterUtil.isWritableFile( outfile ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + } + } + + public static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, + final String[][] input_file_properties, + final String automated_pairwise_comparison_suffix, + final File outdir ) { + for( int i = 0; i < input_file_properties.length; ++i ) { + for( int j = 0; j < i; ++j ) { + final String species_i = input_file_properties[ i ][ 1 ]; + final String species_j = input_file_properties[ j ][ 1 ]; + String pairwise_similarities_output_file_str = surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX + species_i + + "_" + species_j + automated_pairwise_comparison_suffix; + switch ( domain_similarity_print_option ) { + case HTML: + if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) { + pairwise_similarities_output_file_str += ".html"; + } + break; + } + final String error = ForesterUtil + .isWritableFile( new File( outdir == null ? pairwise_similarities_output_file_str : outdir + + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + } + } + } + } + + public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, + final BinaryDomainCombination.DomainCombinationType dc_type, + final List all_binary_domains_combination_gained, + final boolean get_gains ) { + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } + for( final String id : sorted_ids ) { + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) + || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) { + if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) { + all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination + .createInstance( matrix.getCharacter( c ) ) ); + } + else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) { + all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination + .createInstance( matrix.getCharacter( c ) ) ); } else { - dc_gain_counts.put( dc, 1 ); + all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix + .getCharacter( c ) ) ); } } } - final SortedMap histogram = new TreeMap(); - final SortedMap domain_lists = new TreeMap(); - final SortedMap dc_reapp_counts_to_protein_length_stats = new TreeMap(); - final SortedMap dc_reapp_counts_to_domain_number_stats = new TreeMap(); - final SortedMap dc_reapp_counts_to_domain_lengths_stats = new TreeMap(); - final SortedMap> domain_lists_go = new TreeMap>(); - final SortedMap> domain_lists_go_unique = new TreeMap>(); - final Set dcs = dc_gain_counts.keySet(); - final SortedSet more_than_once = new TreeSet(); - final DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics(); - final DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics(); - final DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics(); - final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics(); - long gained_multiple_times_domain_length_sum = 0; - long gained_once_domain_length_sum = 0; - long gained_multiple_times_domain_length_count = 0; - long gained_once_domain_length_count = 0; - for( final String dc : dcs ) { - final int count = dc_gain_counts.get( dc ); - if ( histogram.containsKey( count ) ) { - histogram.put( count, histogram.get( count ) + 1 ); - domain_lists.get( count ).append( ", " + dc ); - domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) ); - domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) ); - } - else { - histogram.put( count, 1 ); - domain_lists.put( count, new StringBuilder( dc ) ); - final PriorityQueue q = new PriorityQueue(); - q.addAll( splitDomainCombination( dc ) ); - domain_lists_go.put( count, q ); - final SortedSet set = new TreeSet(); - set.addAll( splitDomainCombination( dc ) ); - domain_lists_go_unique.put( count, set ); - } - if ( protein_length_stats_by_dc != null ) { - if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) { - dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() ); + } + } + + public static Map> createDomainIdToGoIdMap( final List pfam_to_go_mappings ) { + final Map> domain_id_to_go_ids_map = new HashMap>( pfam_to_go_mappings.size() ); + for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) { + if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) { + domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList() ); + } + domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() ); + } + return domain_id_to_go_ids_map; + } + + public static Map> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file ) + throws IOException { + final BasicTable primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' ); + final Map> map = new TreeMap>(); + for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) { + final String domain_id = primary_table.getValue( 0, r ); + if ( !map.containsKey( domain_id ) ) { + map.put( domain_id, new HashSet() ); + } + map.get( domain_id ).add( primary_table.getValue( 1, r ) ); + } + return map; + } + + public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) { + checkForOutputFileWriteability( nj_tree_outfile ); + final NeighborJoining nj = NeighborJoining.createInstance(); + final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance ); + phylogeny.setName( nj_tree_outfile.getName() ); + writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() ); + return phylogeny; + } + + public static StringBuilder createParametersAsString( final boolean ignore_dufs, + final double e_value_max, + final int max_allowed_overlap, + final boolean no_engulfing_overlaps, + final File cutoff_scores_file, + final BinaryDomainCombination.DomainCombinationType dc_type ) { + final StringBuilder parameters_sb = new StringBuilder(); + parameters_sb.append( "E-value: " + e_value_max ); + if ( cutoff_scores_file != null ) { + parameters_sb.append( ", Cutoff-scores-file: " + cutoff_scores_file ); + } + else { + parameters_sb.append( ", Cutoff-scores-file: not-set" ); + } + if ( max_allowed_overlap != surfacing.MAX_ALLOWED_OVERLAP_DEFAULT ) { + parameters_sb.append( ", Max-overlap: " + max_allowed_overlap ); + } + else { + parameters_sb.append( ", Max-overlap: not-set" ); + } + if ( no_engulfing_overlaps ) { + parameters_sb.append( ", Engulfing-overlaps: not-allowed" ); + } + else { + parameters_sb.append( ", Engulfing-overlaps: allowed" ); + } + if ( ignore_dufs ) { + parameters_sb.append( ", Ignore-dufs: true" ); + } + else { + parameters_sb.append( ", Ignore-dufs: false" ); + } + parameters_sb.append( ", DC type (if applicable): " + dc_type ); + return parameters_sb; + } + + public static void createSplitWriters( final File out_dir, + final String my_outfile, + final Map split_writers ) throws IOException { + split_writers.put( 'a', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_A.html" ) ) ); + split_writers.put( 'b', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_B.html" ) ) ); + split_writers.put( 'c', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_C.html" ) ) ); + split_writers.put( 'd', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_D.html" ) ) ); + split_writers.put( 'e', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_E.html" ) ) ); + split_writers.put( 'f', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_F.html" ) ) ); + split_writers.put( 'g', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_G.html" ) ) ); + split_writers.put( 'h', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_H.html" ) ) ); + split_writers.put( 'i', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_I.html" ) ) ); + split_writers.put( 'j', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_J.html" ) ) ); + split_writers.put( 'k', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_K.html" ) ) ); + split_writers.put( 'l', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_L.html" ) ) ); + split_writers.put( 'm', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_M.html" ) ) ); + split_writers.put( 'n', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_N.html" ) ) ); + split_writers.put( 'o', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_O.html" ) ) ); + split_writers.put( 'p', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_P.html" ) ) ); + split_writers.put( 'q', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_Q.html" ) ) ); + split_writers.put( 'r', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_R.html" ) ) ); + split_writers.put( 's', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_S.html" ) ) ); + split_writers.put( 't', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_T.html" ) ) ); + split_writers.put( 'u', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_U.html" ) ) ); + split_writers.put( 'v', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_V.html" ) ) ); + split_writers.put( 'w', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_W.html" ) ) ); + split_writers.put( 'x', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_X.html" ) ) ); + split_writers.put( 'y', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_Y.html" ) ) ); + split_writers.put( 'z', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_Z.html" ) ) ); + split_writers.put( '0', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_0.html" ) ) ); + } + + public static Map createTaxCodeToIdMap( final Phylogeny phy ) { + final Map m = new HashMap(); + for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.getNodeData().isHasTaxonomy() ) { + final Taxonomy t = n.getNodeData().getTaxonomy(); + final String c = t.getTaxonomyCode(); + if ( !ForesterUtil.isEmpty( c ) ) { + if ( n.getNodeData().getTaxonomy() == null ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n ); } - dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc ) - .arithmeticMean() ); - } - if ( domain_number_stats_by_dc != null ) { - if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) { - dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() ); + final String id = n.getNodeData().getTaxonomy().getIdentifier().getValue(); + if ( ForesterUtil.isEmpty( id ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n ); } - dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc ) - .arithmeticMean() ); - } - if ( domain_length_stats_by_domain != null ) { - if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) { - dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() ); + if ( m.containsKey( c ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy code " + c + " is not unique" ); } - final String[] ds = dc.split( "=" ); - dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain - .get( ds[ 0 ] ).arithmeticMean() ); - dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain - .get( ds[ 1 ] ).arithmeticMean() ); + final int iid = Integer.valueOf( id ); + if ( m.containsValue( iid ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy id " + iid + " is not unique" ); + } + m.put( c, iid ); } - if ( count > 1 ) { - more_than_once.add( dc ); - if ( protein_length_stats_by_dc != null ) { - final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_multiple_times_lengths_stats.addValue( element ); + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy for node " + n ); + } + } + return m; + } + + public static void decoratePrintableDomainSimilarities( final SortedSet domain_similarities, + final Detailedness detailedness ) { + for( final DomainSimilarity domain_similarity : domain_similarities ) { + if ( domain_similarity instanceof PrintableDomainSimilarity ) { + final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity; + printable_domain_similarity.setDetailedness( detailedness ); + } + } + } + + public static void doit( final List proteins, + final List query_domain_ids_nc_order, + final Writer out, + final String separator, + final String limit_to_species, + final Map> average_protein_lengths_by_dc ) throws IOException { + for( final Protein protein : proteins ) { + if ( ForesterUtil.isEmpty( limit_to_species ) + || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { + if ( protein.contains( query_domain_ids_nc_order, true ) ) { + out.write( protein.getSpecies().getSpeciesId() ); + out.write( separator ); + out.write( protein.getProteinId().getId() ); + out.write( separator ); + out.write( "[" ); + final Set visited_domain_ids = new HashSet(); + boolean first = true; + for( final Domain domain : protein.getProteinDomains() ) { + if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { + visited_domain_ids.add( domain.getDomainId() ); + if ( first ) { + first = false; + } + else { + out.write( " " ); + } + out.write( domain.getDomainId() ); + out.write( " {" ); + out.write( "" + domain.getTotalCount() ); + out.write( "}" ); } } - if ( domain_number_stats_by_dc != null ) { - final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_multiple_times_domain_count_stats.addValue( element ); - } - } - if ( domain_length_stats_by_domain != null ) { - final String[] ds = dc.split( "=" ); - final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); - final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); - for( final double element : s0.getData() ) { - gained_multiple_times_domain_length_sum += element; - ++gained_multiple_times_domain_length_count; - } - for( final double element : s1.getData() ) { - gained_multiple_times_domain_length_sum += element; - ++gained_multiple_times_domain_length_count; - } - } - } - else { - if ( protein_length_stats_by_dc != null ) { - final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_once_lengths_stats.addValue( element ); - } - } - if ( domain_number_stats_by_dc != null ) { - final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_once_domain_count_stats.addValue( element ); - } + out.write( "]" ); + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getDescription() ); } - if ( domain_length_stats_by_domain != null ) { - final String[] ds = dc.split( "=" ); - final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); - final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); - for( final double element : s0.getData() ) { - gained_once_domain_length_sum += element; - ++gained_once_domain_length_count; - } - for( final double element : s1.getData() ) { - gained_once_domain_length_sum += element; - ++gained_once_domain_length_count; - } + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getAccession() ); } + out.write( SurfacingConstants.NL ); } } - final Set histogram_keys = histogram.keySet(); - for( final Integer histogram_key : histogram_keys ) { - final int count = histogram.get( histogram_key ); - final StringBuilder dc = domain_lists.get( histogram_key ); - out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR ); - out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR ); - out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); - final Object[] sorted = domain_lists_go.get( histogram_key ).toArray(); - Arrays.sort( sorted ); - for( final Object domain : sorted ) { - out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR ); - } - out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); - for( final String domain : domain_lists_go_unique.get( histogram_key ) ) { - out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR ); - } + } + out.flush(); + } + + public static void domainsPerProteinsStatistics( final String genome, + final List protein_list, + final DescriptiveStatistics all_genomes_domains_per_potein_stats, + final SortedMap all_genomes_domains_per_potein_histo, + final SortedSet domains_which_are_always_single, + final SortedSet domains_which_are_sometimes_single_sometimes_not, + final SortedSet domains_which_never_single, + final Writer writer ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final Protein protein : protein_list ) { + final int domains = protein.getNumberOfProteinDomains(); + //System.out.println( domains ); + stats.addValue( domains ); + all_genomes_domains_per_potein_stats.addValue( domains ); + if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) { + all_genomes_domains_per_potein_histo.put( domains, 1 ); } - out_counts.close(); - out_dc.close(); - out_dc_for_go_mapping.close(); - out_dc_for_go_mapping_unique.close(); - final SortedMap lca_rank_counts = new TreeMap(); - final SortedMap lca_ancestor_species_counts = new TreeMap(); - for( final String dc : more_than_once ) { - final List nodes = new ArrayList(); - for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) { - final PhylogenyNode n = it.next(); - if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) { - nodes.add( n ); + else { + all_genomes_domains_per_potein_histo.put( domains, + 1 + all_genomes_domains_per_potein_histo.get( domains ) ); + } + if ( domains == 1 ) { + final String domain = protein.getProteinDomain( 0 ).getDomainId(); + if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { + if ( domains_which_never_single.contains( domain ) ) { + domains_which_never_single.remove( domain ); + domains_which_are_sometimes_single_sometimes_not.add( domain ); + } + else { + domains_which_are_always_single.add( domain ); } } - for( int i = 0; i < nodes.size() - 1; ++i ) { - for( int j = i + 1; j < nodes.size(); ++j ) { - final PhylogenyNode lca = PhylogenyMethods.obtainLCA( nodes.get( i ), - nodes.get( j ) ); - String rank = "unknown"; - if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) { - rank = lca.getNodeData().getTaxonomy().getRank(); - } - addToCountMap( lca_rank_counts, rank ); - String lca_species; - if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) { - lca_species = lca.getNodeData().getTaxonomy().getScientificName(); - } - else if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) { - lca_species = lca.getNodeData().getTaxonomy().getCommonName(); + } + else if ( domains > 1 ) { + for( final Domain d : protein.getProteinDomains() ) { + final String domain = d.getDomainId(); + // System.out.println( domain ); + if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { + if ( domains_which_are_always_single.contains( domain ) ) { + domains_which_are_always_single.remove( domain ); + domains_which_are_sometimes_single_sometimes_not.add( domain ); } else { - lca_species = lca.getName(); + domains_which_never_single.add( domain ); } - addToCountMap( lca_ancestor_species_counts, lca_species ); } } } - final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) ); - final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) ); - ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR ); - ForesterUtil.map2writer( out_for_ancestor_species_counts, - lca_ancestor_species_counts, - "\t", - ForesterUtil.LINE_SEPARATOR ); - out_for_rank_counts.close(); - out_for_ancestor_species_counts.close(); - if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats ) - && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) { - final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) ); - w.write( "Domain Lengths: " ); - w.write( "\n" ); - if ( domain_length_stats_by_domain != null ) { - for( final Entry entry : dc_reapp_counts_to_domain_lengths_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); - } - } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Protein Lengths: " ); - w.write( "\n" ); - if ( protein_length_stats_by_dc != null ) { - for( final Entry entry : dc_reapp_counts_to_protein_length_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); - } + } + try { + writer.write( genome ); + writer.write( "\t" ); + if ( stats.getN() >= 1 ) { + writer.write( stats.arithmeticMean() + "" ); + writer.write( "\t" ); + if ( stats.getN() >= 2 ) { + writer.write( stats.sampleStandardDeviation() + "" ); } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Number of domains: " ); - w.write( "\n" ); - if ( domain_number_stats_by_dc != null ) { - for( final Entry entry : dc_reapp_counts_to_domain_number_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); - } + else { + writer.write( "" ); } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, domain lengths:" ); - w.write( "\n" ); - w.write( "N: " + gained_once_domain_length_count ); - w.write( "\n" ); - w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, domain lengths:" ); - w.write( "\n" ); - w.write( "N: " + gained_multiple_times_domain_length_count ); - w.write( "\n" ); - w.write( "Avg: " - + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, protein lengths:" ); - w.write( "\n" ); - w.write( gained_once_lengths_stats.toString() ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, domain counts:" ); - w.write( "\n" ); - w.write( gained_once_domain_count_stats.toString() ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, protein lengths:" ); - w.write( "\n" ); - w.write( gained_multiple_times_lengths_stats.toString() ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, domain counts:" ); - w.write( "\n" ); - w.write( gained_multiple_times_domain_count_stats.toString() ); - w.flush(); - w.close(); + writer.write( "\t" ); + writer.write( stats.median() + "" ); + writer.write( "\t" ); + writer.write( stats.getN() + "" ); + writer.write( "\t" ); + writer.write( stats.getMin() + "" ); + writer.write( "\t" ); + writer.write( stats.getMax() + "" ); + } + else { + writer.write( "\t" ); + writer.write( "\t" ); + writer.write( "\t" ); + writer.write( "0" ); + writer.write( "\t" ); + writer.write( "\t" ); } + writer.write( "\n" ); } catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to [" - + outfilename_for_counts + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to [" - + outfilename_for_dc + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, - "Wrote independent domain combination gains fitch lists to (for GO mapping) [" - + outfilename_for_dc_for_go_mapping + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, - "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) [" - + outfilename_for_dc_for_go_mapping_unique + "]" ); - } - - private final static void addToCountMap( final Map map, final String s ) { - if ( map.containsKey( s ) ) { - map.put( s, map.get( s ) + 1 ); - } - else { - map.put( s, 1 ); - } - } - - public static int calculateOverlap( final Domain domain, final List covered_positions ) { - int overlap_count = 0; - for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { - if ( ( i < covered_positions.size() ) && ( covered_positions.get( i ) == true ) ) { - ++overlap_count; - } - } - return overlap_count; - } - - public static void checkForOutputFileWriteability( final File outfile ) { - final String error = ForesterUtil.isWritableFile( outfile ); - if ( !ForesterUtil.isEmpty( error ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, error ); - } - } - - private static SortedSet collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root, - final boolean get_gains ) { - final SortedSet domains = new TreeSet(); - for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) { - final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters(); - if ( get_gains ) { - domains.addAll( chars.getGainedCharacters() ); - } - else { - domains.addAll( chars.getLostCharacters() ); - } - } - return domains; - } - - public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, - final BinaryDomainCombination.DomainCombinationType dc_type, - final List all_binary_domains_combination_gained, - final boolean get_gains ) { - final SortedSet sorted_ids = new TreeSet(); - for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { - sorted_ids.add( matrix.getIdentifier( i ) ); - } - for( final String id : sorted_ids ) { - for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { - if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) - || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) { - if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) { - all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination - .createInstance( matrix.getCharacter( c ) ) ); - } - else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) { - all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination - .createInstance( matrix.getCharacter( c ) ) ); - } - else { - all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix - .getCharacter( c ) ) ); - } - } - } - } - } - - private static File createBaseDirForPerNodeDomainFiles( final String base_dir, - final boolean domain_combinations, - final CharacterStateMatrix.GainLossStates state, - final String outfile ) { - File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent() - + ForesterUtil.FILE_SEPARATOR + base_dir ); - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); - } - if ( domain_combinations ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "DC" ); - } - else { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "DOMAINS" ); - } - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); - } - if ( state == GainLossStates.GAIN ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "GAINS" ); - } - else if ( state == GainLossStates.LOSS ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "LOSSES" ); - } - else { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "PRESENT" ); - } - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); - } - return per_node_go_mapped_domain_gain_loss_files_base_dir; - } - - public static Map> createDomainIdToGoIdMap( final List pfam_to_go_mappings ) { - final Map> domain_id_to_go_ids_map = new HashMap>( pfam_to_go_mappings - .size() ); - for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) { - if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) { - domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList() ); - } - domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() ); - } - return domain_id_to_go_ids_map; - } - - public static Map> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file ) - throws IOException { - final BasicTable primary_table = BasicTableParser.parse( secondary_features_map_file, "\t" ); - final Map> map = new TreeMap>(); - for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) { - final DomainId domain_id = new DomainId( primary_table.getValue( 0, r ) ); - if ( !map.containsKey( domain_id ) ) { - map.put( domain_id, new HashSet() ); - } - map.get( domain_id ).add( primary_table.getValue( 1, r ) ); - } - return map; - } - - public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) { - checkForOutputFileWriteability( nj_tree_outfile ); - final NeighborJoining nj = NeighborJoining.createInstance(); - final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance ); - phylogeny.setName( nj_tree_outfile.getName() ); - writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() ); - return phylogeny; - } - - private static SortedSet createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) { - final SortedMap cds = gwcd.getAllCombinableDomainsIds(); - final SortedSet binary_combinations = new TreeSet(); - for( final DomainId domain_id : cds.keySet() ) { - final CombinableDomains cd = cds.get( domain_id ); - binary_combinations.addAll( cd.toBinaryDomainCombinations() ); - } - return binary_combinations; - } - - public static void decoratePrintableDomainSimilarities( final SortedSet domain_similarities, - final Detailedness detailedness, - final GoAnnotationOutput go_annotation_output, - final Map go_id_to_term_map, - final GoNameSpace go_namespace_limit ) { - if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) ) { - throw new IllegalArgumentException( "attempt to use a GO namespace limit without a GO id to term map" ); - } - for( final DomainSimilarity domain_similarity : domain_similarities ) { - if ( domain_similarity instanceof PrintableDomainSimilarity ) { - final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity; - printable_domain_similarity.setDetailedness( detailedness ); - printable_domain_similarity.setGoAnnotationOutput( go_annotation_output ); - printable_domain_similarity.setGoIdToTermMap( go_id_to_term_map ); - printable_domain_similarity.setGoNamespaceLimit( go_namespace_limit ); - } + e.printStackTrace(); } } @@ -702,10 +542,6 @@ public final class SurfacingUtil { out.write( species + "\t" ); } out.write( ForesterUtil.LINE_SEPARATOR ); - // DescriptiveStatistics stats_for_domain = domain_lengths - // .calculateMeanBasedStatistics(); - //AsciiHistogram histo = new AsciiHistogram( stats_for_domain ); - //System.out.println( histo.toStringBuffer( 40, '=', 60, 4 ).toString() ); } } out.write( ForesterUtil.LINE_SEPARATOR ); @@ -739,22 +575,97 @@ public final class SurfacingUtil { } } out.close(); - // final List histogram_datas = new ArrayList(); - // for( int i = 0; i < number_of_genomes; ++i ) { - // final Species species = new BasicSpecies( input_file_properties[ i ][ 0 ] ); - // histogram_datas - // .add( new HistogramData( species.toString(), domain_lengths_table - // .calculateMeanBasedStatisticsForSpecies( species ) - // .getDataAsDoubleArray(), 5, 600, null, 60 ) ); - // } - // final HistogramsFrame hf = new HistogramsFrame( histogram_datas ); - // hf.setVisible( true ); System.gc(); } /** + * Warning: This side-effects 'all_bin_domain_combinations_encountered'! + * + * + * @param output_file + * @param all_bin_domain_combinations_changed + * @param sum_of_all_domains_encountered + * @param all_bin_domain_combinations_encountered + * @param is_gains_analysis + * @param protein_length_stats_by_dc + * @throws IOException + */ + public static void executeFitchGainsAnalysis( final File output_file, + final List all_bin_domain_combinations_changed, + final int sum_of_all_domains_encountered, + final SortedSet all_bin_domain_combinations_encountered, + final boolean is_gains_analysis ) throws IOException { + checkForOutputFileWriteability( output_file ); + final Writer out = ForesterUtil.createBufferedWriter( output_file ); + final SortedMap bdc_to_counts = ForesterUtil + .listToSortedCountsMap( all_bin_domain_combinations_changed ); + final SortedSet all_domains_in_combination_changed_more_than_once = new TreeSet(); + final SortedSet all_domains_in_combination_changed_only_once = new TreeSet(); + int above_one = 0; + int one = 0; + for( final Object bdc_object : bdc_to_counts.keySet() ) { + final BinaryDomainCombination bdc = ( BinaryDomainCombination ) bdc_object; + final int count = bdc_to_counts.get( bdc_object ); + if ( count < 1 ) { + ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "count < 1 " ); + } + out.write( bdc + "\t" + count + ForesterUtil.LINE_SEPARATOR ); + if ( count > 1 ) { + all_domains_in_combination_changed_more_than_once.add( bdc.getId0() ); + all_domains_in_combination_changed_more_than_once.add( bdc.getId1() ); + above_one++; + } + else if ( count == 1 ) { + all_domains_in_combination_changed_only_once.add( bdc.getId0() ); + all_domains_in_combination_changed_only_once.add( bdc.getId1() ); + one++; + } + } + final int all = all_bin_domain_combinations_encountered.size(); + int never_lost = -1; + if ( !is_gains_analysis ) { + all_bin_domain_combinations_encountered.removeAll( all_bin_domain_combinations_changed ); + never_lost = all_bin_domain_combinations_encountered.size(); + for( final BinaryDomainCombination bdc : all_bin_domain_combinations_encountered ) { + out.write( bdc + "\t" + "0" + ForesterUtil.LINE_SEPARATOR ); + } + } + if ( is_gains_analysis ) { + out.write( "Sum of all distinct domain combinations appearing once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations appearing more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + else { + out.write( "Sum of all distinct domain combinations never lost : " + never_lost + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + out.write( "All binary combinations : " + all + + ForesterUtil.LINE_SEPARATOR ); + out.write( "All domains : " + + sum_of_all_domains_encountered ); + out.close(); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote fitch domain combination dynamics counts analysis to \"" + output_file + + "\"" ); + } + + /** * * @param all_binary_domains_combination_lost_fitch + * @param use_last_in_fitch_parsimony * @param consider_directedness_and_adjacency_for_bin_combinations * @param all_binary_domains_combination_gained if null ignored, otherwise this is to list all binary domain combinations * which were gained under unweighted (Fitch) parsimony. @@ -764,19 +675,22 @@ public final class SurfacingUtil { final String outfile_name, final DomainParsimonyCalculator domain_parsimony, final Phylogeny phylogeny, - final Map> domain_id_to_go_ids_map, + final Map> domain_id_to_go_ids_map, final Map go_id_to_term_map, final GoNameSpace go_namespace_limit, final String parameters_str, - final Map>[] domain_id_to_secondary_features_maps, - final SortedSet positive_filter, + final Map>[] domain_id_to_secondary_features_maps, + final SortedSet positive_filter, final boolean output_binary_domain_combinations_for_graphs, final List all_binary_domains_combination_gained_fitch, final List all_binary_domains_combination_lost_fitch, final BinaryDomainCombination.DomainCombinationType dc_type, final Map protein_length_stats_by_dc, final Map domain_number_stats_by_dc, - final Map domain_length_stats_by_domain ) { + final Map domain_length_stats_by_domain, + final Map tax_code_to_id_map, + final boolean write_to_nexus, + final boolean use_last_in_fitch_parsimony ) { final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; final String date_time = ForesterUtil.getCurrentDateTime(); final SortedSet all_pfams_encountered = new TreeSet(); @@ -784,7 +698,9 @@ public final class SurfacingUtil { final SortedSet all_pfams_lost_as_domains = new TreeSet(); final SortedSet all_pfams_gained_as_dom_combinations = new TreeSet(); final SortedSet all_pfams_lost_as_dom_combinations = new TreeSet(); - writeToNexus( outfile_name, domain_parsimony, phylogeny ); + if ( write_to_nexus ) { + writeToNexus( outfile_name, domain_parsimony, phylogeny ); + } // DOLLO DOMAINS // ------------- Phylogeny local_phylogeny_l = phylogeny.copy(); @@ -827,7 +743,8 @@ public final class SurfacingUtil { domain_id_to_secondary_features_maps, all_pfams_encountered, all_pfams_gained_as_domains, - "_dollo_gains_d" ); + "_dollo_gains_d", + tax_code_to_id_map ); writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, go_id_to_term_map, go_namespace_limit, @@ -842,22 +759,24 @@ public final class SurfacingUtil { domain_id_to_secondary_features_maps, all_pfams_encountered, all_pfams_lost_as_domains, - "_dollo_losses_d" ); - writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, - go_id_to_term_map, - go_namespace_limit, - false, - domain_parsimony.getGainLossMatrix(), - null, - outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D, - sep, - ForesterUtil.LINE_SEPARATOR, - "Dollo Parsimony | Present | Domains", - "", - domain_id_to_secondary_features_maps, - all_pfams_encountered, - null, - "_dollo_present_d" ); + "_dollo_losses_d", + tax_code_to_id_map ); + // writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, + // go_id_to_term_map, + // go_namespace_limit, + // false, + // domain_parsimony.getGainLossMatrix(), + // null, + // outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D, + // sep, + // ForesterUtil.LINE_SEPARATOR, + // "Dollo Parsimony | Present | Domains", + // "", + // domain_id_to_secondary_features_maps, + // all_pfams_encountered, + // null, + // "_dollo_present_d", + // tax_code_to_id_map ); preparePhylogeny( local_phylogeny_l, domain_parsimony, date_time, @@ -884,7 +803,7 @@ public final class SurfacingUtil { randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony; } else { - domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( USE_LAST ); + domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( use_last_in_fitch_parsimony ); } SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); @@ -944,7 +863,8 @@ public final class SurfacingUtil { null, all_pfams_encountered, all_pfams_gained_as_dom_combinations, - "_fitch_gains_dc" ); + "_fitch_gains_dc", + tax_code_to_id_map ); writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, go_id_to_term_map, go_namespace_limit, @@ -959,22 +879,24 @@ public final class SurfacingUtil { null, all_pfams_encountered, all_pfams_lost_as_dom_combinations, - "_fitch_losses_dc" ); - writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, - go_id_to_term_map, - go_namespace_limit, - true, - domain_parsimony.getGainLossMatrix(), - null, - outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC, - sep, - ForesterUtil.LINE_SEPARATOR, - "Fitch Parsimony | Present | Domain Combinations", - "", - null, - all_pfams_encountered, - null, - "_fitch_present_dc" ); + "_fitch_losses_dc", + tax_code_to_id_map ); + // writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, + // go_id_to_term_map, + // go_namespace_limit, + // true, + // domain_parsimony.getGainLossMatrix(), + // null, + // outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC, + // sep, + // ForesterUtil.LINE_SEPARATOR, + // "Fitch Parsimony | Present | Domain Combinations", + // "", + // null, + // all_pfams_encountered, + // null, + // "_fitch_present_dc", + // tax_code_to_id_map ); writeAllEncounteredPfamsToFile( domain_id_to_go_ids_map, go_id_to_term_map, outfile_name, @@ -1015,7 +937,8 @@ public final class SurfacingUtil { final DomainParsimonyCalculator secondary_features_parsimony, final Phylogeny phylogeny, final String parameters_str, - final Map mapping_results_map ) { + final Map mapping_results_map, + final boolean use_last_in_fitch_parsimony ) { final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; final String date_time = ForesterUtil.getCurrentDateTime(); System.out.println(); @@ -1064,7 +987,8 @@ public final class SurfacingUtil { // ------------------------- local_phylogeny_copy = phylogeny.copy(); final String randomization = "no"; - secondary_features_parsimony.executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( USE_LAST ); + secondary_features_parsimony + .executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( use_last_in_fitch_parsimony ); preparePhylogeny( local_phylogeny_copy, secondary_features_parsimony, date_time, @@ -1083,58 +1007,62 @@ public final class SurfacingUtil { + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null ); } - public static void doit( final List proteins, - final List query_domain_ids_nc_order, - final Writer out, - final String separator, - final String limit_to_species, - final Map> average_protein_lengths_by_dc ) throws IOException { - for( final Protein protein : proteins ) { - if ( ForesterUtil.isEmpty( limit_to_species ) - || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { - if ( protein.contains( query_domain_ids_nc_order, true ) ) { - out.write( protein.getSpecies().getSpeciesId() ); - out.write( separator ); - out.write( protein.getProteinId().getId() ); - out.write( separator ); - out.write( "[" ); - final Set visited_domain_ids = new HashSet(); - boolean first = true; - for( final Domain domain : protein.getProteinDomains() ) { - if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { - visited_domain_ids.add( domain.getDomainId() ); - if ( first ) { - first = false; - } - else { - out.write( " " ); - } - out.write( domain.getDomainId().getId() ); - out.write( " {" ); - out.write( "" + domain.getTotalCount() ); - out.write( "}" ); - } - } - out.write( "]" ); - out.write( separator ); - if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() - .equals( SurfacingConstants.NONE ) ) ) { - out.write( protein.getDescription() ); - } - out.write( separator ); - if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() - .equals( SurfacingConstants.NONE ) ) ) { - out.write( protein.getAccession() ); - } - out.write( SurfacingConstants.NL ); - } - } + public static void executePlusMinusAnalysis( final File output_file, + final List plus_minus_analysis_high_copy_base, + final List plus_minus_analysis_high_copy_target, + final List plus_minus_analysis_low_copy, + final List gwcd_list, + final SortedMap> protein_lists_per_species, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final List

N:	" + stats.getN() + "
Min:	" + stats.getMin() + "
Max:	" + stats.getMax() + "
Mean:	" + stats.arithmeticMean() + "
SD:	" + stats.sampleStandardDeviation() + "
SD:	n/a

N:	" + stats.getN() + "
Min:	" + stats.getMin() + "
Max:	" + stats.getMax() + "
Mean:	" + stats.arithmeticMean() + "
SD:	" + stats.sampleStandardDeviation() + "
SD:	n/a

Median:	" + stats.median() + "
Pearsonian skewness:	" + stats.pearsonianSkewness() + "
" ); + out.write( "" + go_id_str + "" ); + out.write( "	" ); + out.write( go_term.getName() ); + if ( domain_count == 2 ) { + out.write( " (" + d + ")" ); + } + out.write( "	" ); + // out.write( top ); + // out.write( "	" ); + out.write( "[" ); + out.write( go_term.getGoNameSpace().toShortString() ); + out.write( "]" ); + out.write( "
Pearsonian skewness:	n/a
" ); + out.write( "	" ); + out.write( "	" ); + out.write( "	" ); + out.write( "

" ); out.write( "" + id + "" ); - writeTaxonomyLinks( out, id ); + writeTaxonomyLinks( out, id, tax_code_to_id_map ); out.write( "