X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsurfacing%2FSurfacingUtil.java;h=d3e8c0a0c36e31fde063518a0ccca34b13dbe9f1;hb=2ae595537adc997e493ff35f5ab5d2e80709bac2;hp=0e200dcb6a4b033bf681990b04444e37b7d4caa5;hpb=e367379ed038c8922bf7d898ba64d432c8ad3aec;p=jalview.git diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index 0e200dc..d3e8c0a 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -34,6 +34,7 @@ import java.io.Writer; import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; @@ -164,11 +165,15 @@ public final class SurfacingUtil { private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l, final String outfilename_for_counts, final String outfilename_for_dc, - final String outfilename_for_dc_for_go_mapping ) { + final String outfilename_for_dc_for_go_mapping, + final String outfilename_for_dc_for_go_mapping_unique, + final String outfilename_for_rank_counts, + final String outfilename_for_ancestor_species_counts ) { try { final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) ); + final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) ); final SortedMap dc_gain_counts = new TreeMap(); for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) { final PhylogenyNode n = it.next(); @@ -185,20 +190,29 @@ public final class SurfacingUtil { final SortedMap histogram = new TreeMap(); final SortedMap domain_lists = new TreeMap(); final SortedMap> domain_lists_go = new TreeMap>(); + final SortedMap> domain_lists_go_unique = new TreeMap>(); final Set dcs = dc_gain_counts.keySet(); + final SortedSet more_than_once = new TreeSet(); for( final String dc : dcs ) { final int count = dc_gain_counts.get( dc ); if ( histogram.containsKey( count ) ) { histogram.put( count, histogram.get( count ) + 1 ); - domain_lists.put( count, domain_lists.get( count ).append( ", " + dc ) ); - domain_lists_go.get( count ).add( dc ); + domain_lists.get( count ).append( ", " + dc ); + domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) ); + domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) ); } else { histogram.put( count, 1 ); domain_lists.put( count, new StringBuilder( dc ) ); final PriorityQueue q = new PriorityQueue(); - q.add( dc ); + q.addAll( splitDomainCombination( dc ) ); domain_lists_go.put( count, q ); + final SortedSet set = new TreeSet(); + set.addAll( splitDomainCombination( dc ) ); + domain_lists_go_unique.put( count, set ); + } + if ( count > 1 ) { + more_than_once.add( dc ); } } final Set histogram_keys = histogram.keySet(); @@ -207,10 +221,67 @@ public final class SurfacingUtil { final StringBuilder dc = domain_lists.get( histogram_key ); out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR ); out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR ); + out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); + final Object[] sorted = domain_lists_go.get( histogram_key ).toArray(); + Arrays.sort( sorted ); + for( final Object domain : sorted ) { + out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR ); + } + out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); + for( final String domain : domain_lists_go_unique.get( histogram_key ) ) { + out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR ); + } } out_counts.close(); out_dc.close(); out_dc_for_go_mapping.close(); + out_dc_for_go_mapping_unique.close(); + // + final SortedMap lca_rank_counts = new TreeMap(); + final SortedMap lca_ancestor_species_counts = new TreeMap(); + for( final String dc : more_than_once ) { + final List nodes = new ArrayList(); + for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) { + nodes.add( n ); + } + } + for( int i = 0; i < nodes.size() - 1; ++i ) { + for( int j = i + 1; j < nodes.size(); ++j ) { + final PhylogenyNode lca = PhylogenyMethods.getInstance().obtainLCA( nodes.get( i ), + nodes.get( j ) ); + String rank = "unknown"; + if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) { + rank = lca.getNodeData().getTaxonomy().getRank(); + } + addToCountMap( lca_rank_counts, rank ); + String lca_species; + if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) { + lca_species = lca.getNodeData().getTaxonomy().getScientificName(); + } + else if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) { + lca_species = lca.getNodeData().getTaxonomy().getCommonName(); + } + else { + lca_species = lca.getName(); + } + addToCountMap( lca_ancestor_species_counts, lca_species ); + } + } + } + final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) ); + final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) ); + ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR ); + ForesterUtil.map2writer( out_for_ancestor_species_counts, + lca_ancestor_species_counts, + "\t", + ForesterUtil.LINE_SEPARATOR ); + out_for_rank_counts.close(); + out_for_ancestor_species_counts.close(); } catch ( final IOException e ) { ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); @@ -222,6 +293,18 @@ public final class SurfacingUtil { ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to (for GO mapping) [" + outfilename_for_dc_for_go_mapping + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) [" + + outfilename_for_dc_for_go_mapping_unique + "]" ); + } + + private final static void addToCountMap( final Map map, final String s ) { + if ( map.containsKey( s ) ) { + map.put( s, map.get( s ) + 1 ); + } + else { + map.put( s, 1 ); + } } public static int calculateOverlap( final Domain domain, final List covered_positions ) { @@ -701,7 +784,9 @@ public final class SurfacingUtil { calculateIndependentDomainCombinationGains( local_phylogeny_l, outfile_name + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX ); + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, outfile_name + + "_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt" ); } } @@ -870,11 +955,17 @@ public final class SurfacingUtil { final PhylogenyNode n = it.next(); if ( ForesterUtil.isEmpty( n.getName() ) && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy() - .getScientificName() ) ) ) { + .getScientificName() ) ) + && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy() + .getCommonName() ) ) ) { if ( n.getParent() != null ) { names.append( " " ); names.append( n.getParent().getName() ); } + final List l = n.getAllExternalDescendants(); + for( final Object object : l ) { + System.out.println( l.toString() ); + } ++c; } } @@ -1066,6 +1157,19 @@ public final class SurfacingUtil { return domains; } + private static List splitDomainCombination( final String dc ) { + final String[] s = dc.split( "=" ); + if ( s.length != 2 ) { + ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: " + + dc ); + System.exit( -1 ); + } + final List l = new ArrayList( 2 ); + l.add( s[ 0 ] ); + l.add( s[ 1 ] ); + return l; + } + public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p, final boolean get_gains, final String outdir, @@ -2249,4 +2353,74 @@ public final class SurfacingUtil { domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence(), phylogeny ); } + + public static void domainsPerProteinsStatistics( final String genome, + final List protein_list, + final DescriptiveStatistics all_genomes_domains_per_potein_stats, + final SortedMap all_genomes_domains_per_potein_histo, + final SortedSet domains_which_are_always_single, + final SortedSet domains_which_are_sometimes_single_sometimes_not, + final SortedSet domains_which_never_single, + final Writer writer ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final Protein protein : protein_list ) { + final int domains = protein.getNumberOfProteinDomains(); + //System.out.println( domains ); + stats.addValue( domains ); + all_genomes_domains_per_potein_stats.addValue( domains ); + if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) { + all_genomes_domains_per_potein_histo.put( domains, 1 ); + } + else { + all_genomes_domains_per_potein_histo.put( domains, + 1 + all_genomes_domains_per_potein_histo.get( domains ) ); + } + if ( domains == 1 ) { + final String domain = protein.getProteinDomain( 0 ).getDomainId().getId(); + if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { + if ( domains_which_never_single.contains( domain ) ) { + domains_which_never_single.remove( domain ); + domains_which_are_sometimes_single_sometimes_not.add( domain ); + } + else { + domains_which_are_always_single.add( domain ); + } + } + } + else if ( domains > 1 ) { + for( final Domain d : protein.getProteinDomains() ) { + final String domain = d.getDomainId().getId(); + // System.out.println( domain ); + if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { + if ( domains_which_are_always_single.contains( domain ) ) { + domains_which_are_always_single.remove( domain ); + domains_which_are_sometimes_single_sometimes_not.add( domain ); + } + else { + domains_which_never_single.add( domain ); + } + } + } + } + } + try { + writer.write( genome ); + writer.write( "\t" ); + writer.write( stats.arithmeticMean() + "" ); + writer.write( "\t" ); + writer.write( stats.sampleStandardDeviation() + "" ); + writer.write( "\t" ); + writer.write( stats.median() + "" ); + writer.write( "\t" ); + writer.write( stats.getN() + "" ); + writer.write( "\t" ); + writer.write( stats.getMin() + "" ); + writer.write( "\t" ); + writer.write( stats.getMax() + "" ); + writer.write( "\n" ); + } + catch ( final IOException e ) { + e.printStackTrace(); + } + } }