X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsurfacing%2FSurfacingUtil.java;h=ab83ebfed2477a6649ed95ad9082077c01d3ea97;hb=fffc26ac5f8cf4eaa5faea6a7e369b94d381d859;hp=f9798976abc7c8eb5ec5512a391c1f241ecb8fc1;hpb=7f4318a3ef37864b5453e3cd56270b8e91e76b9f;p=jalview.git diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index f979897..ab83ebf 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -41,6 +41,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.PriorityQueue; import java.util.Set; import java.util.SortedMap; @@ -57,6 +58,7 @@ import org.forester.evoinference.matrix.character.CharacterStateMatrix; import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates; import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format; import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates; +import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; import org.forester.evoinference.matrix.distance.DistanceMatrix; import org.forester.go.GoId; import org.forester.go.GoNameSpace; @@ -71,6 +73,13 @@ import org.forester.phylogeny.PhylogenyNodeI.NH_CONVERSION_SUPPORT_VALUE_STYLE; import org.forester.phylogeny.data.BinaryCharacters; import org.forester.phylogeny.data.Confidence; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.protein.BasicDomain; +import org.forester.protein.BasicProtein; +import org.forester.protein.BinaryDomainCombination; +import org.forester.protein.Domain; +import org.forester.protein.DomainId; +import org.forester.protein.Protein; +import org.forester.species.Species; import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; import org.forester.surfacing.DomainSimilarityCalculator.GoAnnotationOutput; import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder; @@ -105,6 +114,7 @@ public final class SurfacingUtil { } }; public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" ); + private static final boolean USE_LAST = true; private SurfacingUtil() { // Hidden constructor. @@ -169,8 +179,36 @@ public final class SurfacingUtil { final String outfilename_for_dc_for_go_mapping, final String outfilename_for_dc_for_go_mapping_unique, final String outfilename_for_rank_counts, - final String outfilename_for_ancestor_species_counts ) { + final String outfilename_for_ancestor_species_counts, + final String outfilename_for_protein_stats, + final Map protein_length_stats_by_dc, + final Map domain_number_stats_by_dc, + final Map domain_length_stats_by_domain ) { try { + // + // if ( protein_length_stats_by_dc != null ) { + // for( final Entry entry : protein_length_stats_by_dc.entrySet() ) { + // System.out.print( entry.getKey().toString() ); + // System.out.print( ": " ); + // double[] a = entry.getValue().getDataAsDoubleArray(); + // for( int i = 0; i < a.length; i++ ) { + // System.out.print( a[ i ] + " " ); + // } + // System.out.println(); + // } + // } + // if ( domain_number_stats_by_dc != null ) { + // for( final Entry entry : domain_number_stats_by_dc.entrySet() ) { + // System.out.print( entry.getKey().toString() ); + // System.out.print( ": " ); + // double[] a = entry.getValue().getDataAsDoubleArray(); + // for( int i = 0; i < a.length; i++ ) { + // System.out.print( a[ i ] + " " ); + // } + // System.out.println(); + // } + // } + // final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) ); @@ -190,10 +228,21 @@ public final class SurfacingUtil { } final SortedMap histogram = new TreeMap(); final SortedMap domain_lists = new TreeMap(); + final SortedMap dc_reapp_counts_to_protein_length_stats = new TreeMap(); + final SortedMap dc_reapp_counts_to_domain_number_stats = new TreeMap(); + final SortedMap dc_reapp_counts_to_domain_lengths_stats = new TreeMap(); final SortedMap> domain_lists_go = new TreeMap>(); final SortedMap> domain_lists_go_unique = new TreeMap>(); final Set dcs = dc_gain_counts.keySet(); final SortedSet more_than_once = new TreeSet(); + final DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics(); + long gained_multiple_times_domain_length_sum = 0; + long gained_once_domain_length_sum = 0; + long gained_multiple_times_domain_length_count = 0; + long gained_once_domain_length_count = 0; for( final String dc : dcs ) { final int count = dc_gain_counts.get( dc ); if ( histogram.containsKey( count ) ) { @@ -212,8 +261,84 @@ public final class SurfacingUtil { set.addAll( splitDomainCombination( dc ) ); domain_lists_go_unique.put( count, set ); } + if ( protein_length_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) { + dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() ); + } + dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc ) + .arithmeticMean() ); + } + if ( domain_number_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) { + dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() ); + } + dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc ) + .arithmeticMean() ); + } + if ( domain_length_stats_by_domain != null ) { + if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) { + dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() ); + } + final String[] ds = dc.split( "=" ); + dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain + .get( ds[ 0 ] ).arithmeticMean() ); + dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain + .get( ds[ 1 ] ).arithmeticMean() ); + } if ( count > 1 ) { more_than_once.add( dc ); + if ( protein_length_stats_by_dc != null ) { + final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_multiple_times_lengths_stats.addValue( element ); + } + } + if ( domain_number_stats_by_dc != null ) { + final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_multiple_times_domain_count_stats.addValue( element ); + } + } + if ( domain_length_stats_by_domain != null ) { + final String[] ds = dc.split( "=" ); + final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); + final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); + for( final double element : s0.getData() ) { + gained_multiple_times_domain_length_sum += element; + ++gained_multiple_times_domain_length_count; + } + for( final double element : s1.getData() ) { + gained_multiple_times_domain_length_sum += element; + ++gained_multiple_times_domain_length_count; + } + } + } + else { + if ( protein_length_stats_by_dc != null ) { + final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_once_lengths_stats.addValue( element ); + } + } + if ( domain_number_stats_by_dc != null ) { + final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_once_domain_count_stats.addValue( element ); + } + } + if ( domain_length_stats_by_domain != null ) { + final String[] ds = dc.split( "=" ); + final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); + final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); + for( final double element : s0.getData() ) { + gained_once_domain_length_sum += element; + ++gained_once_domain_length_count; + } + for( final double element : s1.getData() ) { + gained_once_domain_length_sum += element; + ++gained_once_domain_length_count; + } + } } } final Set histogram_keys = histogram.keySet(); @@ -237,7 +362,6 @@ public final class SurfacingUtil { out_dc.close(); out_dc_for_go_mapping.close(); out_dc_for_go_mapping_unique.close(); - // final SortedMap lca_rank_counts = new TreeMap(); final SortedMap lca_ancestor_species_counts = new TreeMap(); for( final String dc : more_than_once ) { @@ -250,7 +374,7 @@ public final class SurfacingUtil { } for( int i = 0; i < nodes.size() - 1; ++i ) { for( int j = i + 1; j < nodes.size(); ++j ) { - final PhylogenyNode lca = PhylogenyMethods.getInstance().obtainLCA( nodes.get( i ), + final PhylogenyNode lca = PhylogenyMethods.obtainLCA( nodes.get( i ), nodes.get( j ) ); String rank = "unknown"; if ( lca.getNodeData().isHasTaxonomy() @@ -283,6 +407,89 @@ public final class SurfacingUtil { ForesterUtil.LINE_SEPARATOR ); out_for_rank_counts.close(); out_for_ancestor_species_counts.close(); + if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats ) + && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) { + final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) ); + w.write( "Domain Lengths: " ); + w.write( "\n" ); + if ( domain_length_stats_by_domain != null ) { + for( final Entry entry : dc_reapp_counts_to_domain_lengths_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Protein Lengths: " ); + w.write( "\n" ); + if ( protein_length_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_protein_length_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Number of domains: " ); + w.write( "\n" ); + if ( domain_number_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_domain_number_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, domain lengths:" ); + w.write( "\n" ); + w.write( "N: " + gained_once_domain_length_count ); + w.write( "\n" ); + w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, domain lengths:" ); + w.write( "\n" ); + w.write( "N: " + gained_multiple_times_domain_length_count ); + w.write( "\n" ); + w.write( "Avg: " + + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, protein lengths:" ); + w.write( "\n" ); + w.write( gained_once_lengths_stats.toString() ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, domain counts:" ); + w.write( "\n" ); + w.write( gained_once_domain_count_stats.toString() ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, protein lengths:" ); + w.write( "\n" ); + w.write( gained_multiple_times_lengths_stats.toString() ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, domain counts:" ); + w.write( "\n" ); + w.write( gained_multiple_times_domain_count_stats.toString() ); + w.flush(); + w.close(); + } } catch ( final IOException e ) { ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); @@ -436,7 +643,7 @@ public final class SurfacingUtil { public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) { checkForOutputFileWriteability( nj_tree_outfile ); final NeighborJoining nj = NeighborJoining.createInstance(); - final Phylogeny phylogeny = nj.execute( distance ); + final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance ); phylogeny.setName( nj_tree_outfile.getName() ); writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() ); return phylogeny; @@ -565,7 +772,10 @@ public final class SurfacingUtil { final boolean output_binary_domain_combinations_for_graphs, final List all_binary_domains_combination_gained_fitch, final List all_binary_domains_combination_lost_fitch, - final BinaryDomainCombination.DomainCombinationType dc_type ) { + final BinaryDomainCombination.DomainCombinationType dc_type, + final Map protein_length_stats_by_dc, + final Map domain_number_stats_by_dc, + final Map domain_length_stats_by_domain ) { final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; final String date_time = ForesterUtil.getCurrentDateTime(); final SortedSet all_pfams_encountered = new TreeSet(); @@ -673,7 +883,7 @@ public final class SurfacingUtil { randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony; } else { - domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( true ); + domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( USE_LAST ); } SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); @@ -782,12 +992,21 @@ public final class SurfacingUtil { parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH ); - calculateIndependentDomainCombinationGains( local_phylogeny_l, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, outfile_name - + "_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt" ); + calculateIndependentDomainCombinationGains( local_phylogeny_l, + outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, + outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, + outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, + outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, + outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt", + outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt", + outfile_name + "_indep_dc_gains_fitch_protein_statistics.txt", + protein_length_stats_by_dc, + domain_number_stats_by_dc, + domain_length_stats_by_domain ); } } @@ -802,7 +1021,7 @@ public final class SurfacingUtil { writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES, secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ), phylogeny ); - final Phylogeny local_phylogeny_copy = phylogeny.copy(); + Phylogeny local_phylogeny_copy = phylogeny.copy(); secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map ); SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); @@ -840,6 +1059,77 @@ public final class SurfacingUtil { parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); + // FITCH DOMAIN COMBINATIONS + // ------------------------- + local_phylogeny_copy = phylogeny.copy(); + final String randomization = "no"; + secondary_features_parsimony.executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( USE_LAST ); + preparePhylogeny( local_phylogeny_copy, + secondary_features_parsimony, + date_time, + "Fitch parsimony on secondary binary domain combination presence/absence randomization: " + + randomization, + "fitch_on_binary_domain_combinations_" + outfile_name, + parameters_str ); + SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name + + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED ); + calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name + + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null ); + } + + public static void doit( final List proteins, + final List query_domain_ids_nc_order, + final Writer out, + final String separator, + final String limit_to_species, + final Map> average_protein_lengths_by_dc ) throws IOException { + for( final Protein protein : proteins ) { + if ( ForesterUtil.isEmpty( limit_to_species ) + || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { + if ( protein.contains( query_domain_ids_nc_order, true ) ) { + out.write( protein.getSpecies().getSpeciesId() ); + out.write( separator ); + out.write( protein.getProteinId().getId() ); + out.write( separator ); + out.write( "[" ); + final Set visited_domain_ids = new HashSet(); + boolean first = true; + for( final Domain domain : protein.getProteinDomains() ) { + if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { + visited_domain_ids.add( domain.getDomainId() ); + if ( first ) { + first = false; + } + else { + out.write( " " ); + } + out.write( domain.getDomainId().getId() ); + out.write( " {" ); + out.write( "" + domain.getTotalCount() ); + out.write( "}" ); + } + } + out.write( "]" ); + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getDescription() ); + } + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getAccession() ); + } + out.write( SurfacingConstants.NL ); + } + } + } + out.flush(); } public static void extractProteinNames( final List proteins, @@ -902,16 +1192,30 @@ public final class SurfacingUtil { || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { final List domains = protein.getProteinDomains( domain_id ); if ( domains.size() > 0 ) { - final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - for( final Domain domain : domains ) { - stats.addValue( domain.getPerSequenceEvalue() ); - } out.write( protein.getSpecies().getSpeciesId() ); out.write( separator ); out.write( protein.getProteinId().getId() ); out.write( separator ); - out.write( "[" + FORMATTER.format( stats.median() ) + "]" ); + out.write( domain_id.toString() ); + out.write( separator ); + for( final Domain domain : domains ) { + out.write( "/" ); + out.write( domain.getFrom() + "-" + domain.getTo() ); + } + out.write( "/" ); out.write( separator ); + out.write( "{" ); + boolean first = true; + for( final Domain domain : protein.getProteinDomains() ) { + if ( first ) { + first = false; + } + else { + out.write( "," ); + } + out.write( domain.getDomainId().toString() ); + } + out.write( "}" ); if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() .equals( SurfacingConstants.NONE ) ) ) { out.write( protein.getDescription() ); @@ -1132,7 +1436,7 @@ public final class SurfacingUtil { final boolean remove_engulfed_domains, final Protein protein ) { final Protein pruned_protein = new BasicProtein( protein.getProteinId().getId(), protein.getSpecies() - .getSpeciesId() ); + .getSpeciesId(), protein.getLength() ); final List sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein ); final List covered_positions = new ArrayList(); for( final Domain domain : sorted ) { @@ -1651,181 +1955,6 @@ public final class SurfacingUtil { ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" ); } - public static void writeBinaryStatesMatrixToListORIGIG( final Map> domain_id_to_go_ids_map, - final Map go_id_to_term_map, - final GoNameSpace go_namespace_limit, - final boolean domain_combinations, - final CharacterStateMatrix matrix, - final CharacterStateMatrix.GainLossStates state, - final String filename, - final String indentifier_characters_separator, - final String character_separator, - final String title_for_html, - final String prefix_for_html, - final Map>[] domain_id_to_secondary_features_maps, - final SortedSet all_pfams_encountered, - final SortedSet pfams_gained_or_lost, - final String suffix_for_per_node_events_file ) { - if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { - throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" ); - } - else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) { - throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" ); - } - else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { - throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" ); - } - final File outfile = new File( filename ); - checkForOutputFileWriteability( outfile ); - final SortedSet sorted_ids = new TreeSet(); - for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { - sorted_ids.add( matrix.getIdentifier( i ) ); - } - try { - final Writer out = new BufferedWriter( new FileWriter( outfile ) ); - final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES, - domain_combinations, - state, - filename ); - Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null; - File per_node_go_mapped_domain_gain_loss_outfile = null; - int per_node_counter = 0; - out.write( "" ); - out.write( SurfacingConstants.NL ); - addHtmlHead( out, title_for_html ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "

" ); - out.write( SurfacingConstants.NL ); - out.write( title_for_html ); - out.write( SurfacingConstants.NL ); - out.write( "

" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - for( final String id : sorted_ids ) { - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - } - out.write( "
" ); - out.write( "" + id + "" ); - writeTaxonomyLinks( out, id ); - out.write( "
" ); - out.write( SurfacingConstants.NL ); - for( final String id : sorted_ids ) { - out.write( SurfacingConstants.NL ); - out.write( "

" ); - out.write( "" + id + "" ); - writeTaxonomyLinks( out, id ); - out.write( "

" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - per_node_counter = 0; - if ( matrix.getNumberOfCharacters() > 0 ) { - per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file ); - SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile ); - per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil - .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile ); - } - else { - per_node_go_mapped_domain_gain_loss_outfile = null; - per_node_go_mapped_domain_gain_loss_outfile_writer = null; - } - for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { - // Not nice: - // using null to indicate either UNCHANGED_PRESENT or GAIN. - if ( ( matrix.getState( id, c ) == state ) - || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix - .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) { - final String character = matrix.getCharacter( c ); - String domain_0 = ""; - String domain_1 = ""; - if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) { - final String[] s = character.split( BinaryDomainCombination.SEPARATOR ); - if ( s.length != 2 ) { - throw new AssertionError( "this should not have happened: unexpected format for domain combination: [" - + character + "]" ); - } - domain_0 = s[ 0 ]; - domain_1 = s[ 1 ]; - } - else { - domain_0 = character; - } - writeDomainData( domain_id_to_go_ids_map, - go_id_to_term_map, - go_namespace_limit, - out, - domain_0, - domain_1, - prefix_for_html, - character_separator, - domain_id_to_secondary_features_maps, - null ); - all_pfams_encountered.add( domain_0 ); - if ( pfams_gained_or_lost != null ) { - pfams_gained_or_lost.add( domain_0 ); - } - if ( !ForesterUtil.isEmpty( domain_1 ) ) { - all_pfams_encountered.add( domain_1 ); - if ( pfams_gained_or_lost != null ) { - pfams_gained_or_lost.add( domain_1 ); - } - } - if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { - writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer, - domain_0, - domain_1 ); - per_node_counter++; - } - } - } - if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { - per_node_go_mapped_domain_gain_loss_outfile_writer.close(); - if ( per_node_counter < 1 ) { - per_node_go_mapped_domain_gain_loss_outfile.delete(); - } - per_node_counter = 0; - } - out.write( "
" ); - out.write( "Pfam domain(s)" ); - out.write( "" ); - out.write( "GO term acc" ); - out.write( "" ); - out.write( "GO term" ); - out.write( "" ); - out.write( "Penultimate GO term" ); - out.write( "" ); - out.write( "GO namespace" ); - out.write( "
" ); - out.write( SurfacingConstants.NL ); - out.write( "
" ); - out.write( SurfacingConstants.NL ); - } // for( final String id : sorted_ids ) { - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.flush(); - out.close(); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" ); - } - public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties, final File output_dir, final Writer per_genome_domain_promiscuity_statistics_writer,