in progress
[jalview.git] / forester / java / src / org / forester / surfacing / SurfacingUtil.java
index 0e200dc..862621a 100644 (file)
@@ -34,6 +34,7 @@ import java.io.Writer;
 import java.text.DecimalFormat;
 import java.text.NumberFormat;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
@@ -164,11 +165,15 @@ public final class SurfacingUtil {
     private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l,
                                                                     final String outfilename_for_counts,
                                                                     final String outfilename_for_dc,
-                                                                    final String outfilename_for_dc_for_go_mapping ) {
+                                                                    final String outfilename_for_dc_for_go_mapping,
+                                                                    final String outfilename_for_dc_for_go_mapping_unique,
+                                                                    final String outfilename_for_rank_counts,
+                                                                    final String outfilename_for_ancestor_species_counts ) {
         try {
             final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
             final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
             final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) );
+            final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) );
             final SortedMap<String, Integer> dc_gain_counts = new TreeMap<String, Integer>();
             for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) {
                 final PhylogenyNode n = it.next();
@@ -185,20 +190,29 @@ public final class SurfacingUtil {
             final SortedMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
             final SortedMap<Integer, StringBuilder> domain_lists = new TreeMap<Integer, StringBuilder>();
             final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
+            final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
             final Set<String> dcs = dc_gain_counts.keySet();
+            final SortedSet<String> more_than_once = new TreeSet<String>();
             for( final String dc : dcs ) {
                 final int count = dc_gain_counts.get( dc );
                 if ( histogram.containsKey( count ) ) {
                     histogram.put( count, histogram.get( count ) + 1 );
-                    domain_lists.put( count, domain_lists.get( count ).append( ", " + dc ) );
-                    domain_lists_go.get( count ).add( dc );
+                    domain_lists.get( count ).append( ", " + dc );
+                    domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) );
+                    domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) );
                 }
                 else {
                     histogram.put( count, 1 );
                     domain_lists.put( count, new StringBuilder( dc ) );
                     final PriorityQueue<String> q = new PriorityQueue<String>();
-                    q.add( dc );
+                    q.addAll( splitDomainCombination( dc ) );
                     domain_lists_go.put( count, q );
+                    final SortedSet<String> set = new TreeSet<String>();
+                    set.addAll( splitDomainCombination( dc ) );
+                    domain_lists_go_unique.put( count, set );
+                }
+                if ( count > 1 ) {
+                    more_than_once.add( dc );
                 }
             }
             final Set<Integer> histogram_keys = histogram.keySet();
@@ -207,10 +221,67 @@ public final class SurfacingUtil {
                 final StringBuilder dc = domain_lists.get( histogram_key );
                 out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR );
                 out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR );
+                out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
+                final Object[] sorted = domain_lists_go.get( histogram_key ).toArray();
+                Arrays.sort( sorted );
+                for( final Object domain : sorted ) {
+                    out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR );
+                }
+                out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
+                for( final String domain : domain_lists_go_unique.get( histogram_key ) ) {
+                    out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR );
+                }
             }
             out_counts.close();
             out_dc.close();
             out_dc_for_go_mapping.close();
+            out_dc_for_go_mapping_unique.close();
+            //
+            final SortedMap<String, Integer> lca_rank_counts = new TreeMap<String, Integer>();
+            final SortedMap<String, Integer> lca_ancestor_species_counts = new TreeMap<String, Integer>();
+            for( final String dc : more_than_once ) {
+                final List<PhylogenyNode> nodes = new ArrayList<PhylogenyNode>();
+                for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) {
+                    final PhylogenyNode n = it.next();
+                    if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) {
+                        nodes.add( n );
+                    }
+                }
+                for( int i = 0; i < nodes.size() - 1; ++i ) {
+                    for( int j = i + 1; j < nodes.size(); ++j ) {
+                        final PhylogenyNode lca = PhylogenyMethods.getInstance().obtainLCA( nodes.get( i ),
+                                                                                            nodes.get( j ) );
+                        String rank = "unknown";
+                        if ( lca.getNodeData().isHasTaxonomy()
+                                && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) {
+                            rank = lca.getNodeData().getTaxonomy().getRank();
+                        }
+                        addToCountMap( lca_rank_counts, rank );
+                        String lca_species;
+                        if ( lca.getNodeData().isHasTaxonomy()
+                                && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) {
+                            lca_species = lca.getNodeData().getTaxonomy().getScientificName();
+                        }
+                        else if ( lca.getNodeData().isHasTaxonomy()
+                                && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) {
+                            lca_species = lca.getNodeData().getTaxonomy().getCommonName();
+                        }
+                        else {
+                            lca_species = lca.getName();
+                        }
+                        addToCountMap( lca_ancestor_species_counts, lca_species );
+                    }
+                }
+            }
+            final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) );
+            final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) );
+            ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR );
+            ForesterUtil.map2writer( out_for_ancestor_species_counts,
+                                     lca_ancestor_species_counts,
+                                     "\t",
+                                     ForesterUtil.LINE_SEPARATOR );
+            out_for_rank_counts.close();
+            out_for_ancestor_species_counts.close();
         }
         catch ( final IOException e ) {
             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
@@ -222,6 +293,18 @@ public final class SurfacingUtil {
         ForesterUtil.programMessage( surfacing.PRG_NAME,
                                      "Wrote independent domain combination gains fitch lists to (for GO mapping) ["
                                              + outfilename_for_dc_for_go_mapping + "]" );
+        ForesterUtil.programMessage( surfacing.PRG_NAME,
+                                     "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) ["
+                                             + outfilename_for_dc_for_go_mapping_unique + "]" );
+    }
+
+    private final static void addToCountMap( final Map<String, Integer> map, final String s ) { // Increments the counter stored in map under key s by one; a key not yet present starts at 1.
+        if ( map.containsKey( s ) ) {
+            map.put( s, map.get( s ) + 1 ); // key already counted: store the previous value plus one
+        }
+        else {
+            map.put( s, 1 ); // first occurrence of this key
+        }
     }
 
     public static int calculateOverlap( final Domain domain, final List<Boolean> covered_positions ) {
@@ -701,7 +784,9 @@ public final class SurfacingUtil {
             calculateIndependentDomainCombinationGains( local_phylogeny_l, outfile_name
                     + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name
                     + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name
-                    + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX );
+                    + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, outfile_name
+                    + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, outfile_name
+                    + "_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt" );
         }
     }
 
@@ -870,11 +955,17 @@ public final class SurfacingUtil {
             final PhylogenyNode n = it.next();
             if ( ForesterUtil.isEmpty( n.getName() )
                     && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
-                            .getScientificName() ) ) ) {
+                            .getScientificName() ) )
+                    && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
+                            .getCommonName() ) ) ) {
                 if ( n.getParent() != null ) {
                     names.append( " " );
                     names.append( n.getParent().getName() );
                 }
+                final List l = n.getAllExternalDescendants();
+                for( final Object object : l ) {
+                    System.out.println( l.toString() );
+                }
                 ++c;
             }
         }
@@ -1066,6 +1157,19 @@ public final class SurfacingUtil {
         return domains;
     }
 
+    private static List<String> splitDomainCombination( final String dc ) { // Splits dc at "=" and returns the two resulting parts as a list (first part, then second part).
+        final String[] s = dc.split( "=" ); // String.split drops trailing empty strings, so an input such as "A=" yields a single part
+        if ( s.length != 2 ) { // anything other than exactly two parts is treated as a fatal format error
+            ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: "
+                    + dc );
+            System.exit( -1 ); // terminates the JVM; no exception is thrown to the caller
+        }
+        final List<String> l = new ArrayList<String>( 2 );
+        l.add( s[ 0 ] );
+        l.add( s[ 1 ] );
+        return l;
+    }
+
     public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p,
                                                             final boolean get_gains,
                                                             final String outdir,
@@ -2249,4 +2353,74 @@ public final class SurfacingUtil {
                       domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence(),
                       phylogeny );
     }
+
+    public static void domainsPerProteinsStatistics( final String genome, // Gathers domains-per-protein statistics for protein_list, updates the caller-supplied accumulators, and writes one summary line to writer; genome is the first column of that line.
+                                                     final List<Protein> protein_list, // proteins whose domain counts are evaluated
+                                                     final DescriptiveStatistics all_genomes_domains_per_potein_stats, // updated in place: receives the domain count of every protein in protein_list
+                                                     final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo, // updated in place: domain count -> number of proteins with that count
+                                                     final SortedSet<String> domains_which_are_always_single, // updated in place: domain ids encountered only in proteins with exactly one domain
+                                                     final SortedSet<String> domains_which_are_sometimes_single_sometimes_not, // updated in place: domain ids encountered both in one-domain and in multi-domain proteins
+                                                     final SortedSet<String> domains_which_never_single, // updated in place: domain ids encountered only in proteins with more than one domain
+                                                     final Writer writer ) { // writer receives the summary line; it is neither flushed nor closed here
+        final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); // per-call statistics, used only for the line written at the end
+        for( final Protein protein : protein_list ) {
+            final int domains = protein.getNumberOfProteinDomains();
+            //System.out.println( domains );
+            stats.addValue( domains );
+            all_genomes_domains_per_potein_stats.addValue( domains ); // the same count also goes into the caller-supplied statistics
+            if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) {
+                all_genomes_domains_per_potein_histo.put( domains, 1 ); // first protein seen with this domain count
+            }
+            else {
+                all_genomes_domains_per_potein_histo.put( domains,
+                                                          1 + all_genomes_domains_per_potein_histo.get( domains ) );
+            }
+            if ( domains == 1 ) { // protein with exactly one domain
+                final String domain = protein.getProteinDomain( 0 ).getDomainId().getId();
+                if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { // ids already in the "sometimes" set are never reclassified
+                    if ( domains_which_never_single.contains( domain ) ) { // so far only seen in multi-domain proteins: move to the "sometimes" set
+                        domains_which_never_single.remove( domain );
+                        domains_which_are_sometimes_single_sometimes_not.add( domain );
+                    }
+                    else {
+                        domains_which_are_always_single.add( domain );
+                    }
+                }
+            }
+            else if ( domains > 1 ) { // proteins with 0 domains match neither branch and only affect the statistics and histogram above
+                for( final Domain d : protein.getProteinDomains() ) {
+                    final String domain = d.getDomainId().getId();
+                    // System.out.println( domain );
+                    if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { // ids already in the "sometimes" set are never reclassified
+                        if ( domains_which_are_always_single.contains( domain ) ) { // so far only seen in one-domain proteins: move to the "sometimes" set
+                            domains_which_are_always_single.remove( domain );
+                            domains_which_are_sometimes_single_sometimes_not.add( domain );
+                        }
+                        else {
+                            domains_which_never_single.add( domain );
+                        }
+                    }
+                }
+            }
+        }
+        try { // output columns, tab-separated: genome, then stats.arithmeticMean(), sampleStandardDeviation(), median(), getN(), getMin(), getMax()
+            writer.write( genome );
+            writer.write( "\t" );
+            writer.write( stats.arithmeticMean() + "" );
+            writer.write( "\t" );
+            writer.write( stats.sampleStandardDeviation() + "" );
+            writer.write( "\t" );
+            writer.write( stats.median() + "" );
+            writer.write( "\t" );
+            writer.write( stats.getN() + "" );
+            writer.write( "\t" );
+            writer.write( stats.getMin() + "" );
+            writer.write( "\t" );
+            writer.write( stats.getMax() + "" );
+            writer.write( "\n" ); // the line is terminated with a literal "\n"
+        }
+        catch ( final IOException e ) { // a write failure is only reported via printStackTrace and is not propagated to the caller
+            e.printStackTrace();
+        }
+    }
 }