in progress
[jalview.git] / forester / java / src / org / forester / surfacing / SurfacingUtil.java
index 60cf2f4..862621a 100644 (file)
@@ -166,7 +166,9 @@ public final class SurfacingUtil {
                                                                     final String outfilename_for_counts,
                                                                     final String outfilename_for_dc,
                                                                     final String outfilename_for_dc_for_go_mapping,
-                                                                    final String outfilename_for_dc_for_go_mapping_unique ) {
+                                                                    final String outfilename_for_dc_for_go_mapping_unique,
+                                                                    final String outfilename_for_rank_counts,
+                                                                    final String outfilename_for_ancestor_species_counts ) {
         try {
             final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
             final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
@@ -190,6 +192,7 @@ public final class SurfacingUtil {
             final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
             final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
             final Set<String> dcs = dc_gain_counts.keySet();
+            final SortedSet<String> more_than_once = new TreeSet<String>();
             for( final String dc : dcs ) {
                 final int count = dc_gain_counts.get( dc );
                 if ( histogram.containsKey( count ) ) {
@@ -208,6 +211,9 @@ public final class SurfacingUtil {
                     set.addAll( splitDomainCombination( dc ) );
                     domain_lists_go_unique.put( count, set );
                 }
+                if ( count > 1 ) {
+                    more_than_once.add( dc );
+                }
             }
             final Set<Integer> histogram_keys = histogram.keySet();
             for( final Integer histogram_key : histogram_keys ) {
@@ -230,6 +236,52 @@ public final class SurfacingUtil {
             out_dc.close();
             out_dc_for_go_mapping.close();
             out_dc_for_go_mapping_unique.close();
+            //
+            final SortedMap<String, Integer> lca_rank_counts = new TreeMap<String, Integer>();
+            final SortedMap<String, Integer> lca_ancestor_species_counts = new TreeMap<String, Integer>();
+            for( final String dc : more_than_once ) {
+                final List<PhylogenyNode> nodes = new ArrayList<PhylogenyNode>();
+                for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) {
+                    final PhylogenyNode n = it.next();
+                    if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) {
+                        nodes.add( n );
+                    }
+                }
+                for( int i = 0; i < nodes.size() - 1; ++i ) {
+                    for( int j = i + 1; j < nodes.size(); ++j ) {
+                        final PhylogenyNode lca = PhylogenyMethods.getInstance().obtainLCA( nodes.get( i ),
+                                                                                            nodes.get( j ) );
+                        String rank = "unknown";
+                        if ( lca.getNodeData().isHasTaxonomy()
+                                && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) {
+                            rank = lca.getNodeData().getTaxonomy().getRank();
+                        }
+                        addToCountMap( lca_rank_counts, rank );
+                        String lca_species;
+                        if ( lca.getNodeData().isHasTaxonomy()
+                                && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) {
+                            lca_species = lca.getNodeData().getTaxonomy().getScientificName();
+                        }
+                        else if ( lca.getNodeData().isHasTaxonomy()
+                                && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) {
+                            lca_species = lca.getNodeData().getTaxonomy().getCommonName();
+                        }
+                        else {
+                            lca_species = lca.getName();
+                        }
+                        addToCountMap( lca_ancestor_species_counts, lca_species );
+                    }
+                }
+            }
+            final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) );
+            final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) );
+            ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR );
+            ForesterUtil.map2writer( out_for_ancestor_species_counts,
+                                     lca_ancestor_species_counts,
+                                     "\t",
+                                     ForesterUtil.LINE_SEPARATOR );
+            out_for_rank_counts.close();
+            out_for_ancestor_species_counts.close();
         }
         catch ( final IOException e ) {
             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
@@ -246,6 +298,15 @@ public final class SurfacingUtil {
                                              + outfilename_for_dc_for_go_mapping_unique + "]" );
     }
 
+    /** Adds one to the count stored under s in map; a key not yet present starts at 1. */
+    private final static void addToCountMap( final Map<String, Integer> map, final String s ) {
+        int updated = 1;
+        if ( map.containsKey( s ) ) {
+            updated += map.get( s );
+        }
+        map.put( s, updated );
+    }
+
     public static int calculateOverlap( final Domain domain, final List<Boolean> covered_positions ) {
         int overlap_count = 0;
         for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
@@ -724,7 +785,8 @@ public final class SurfacingUtil {
                     + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name
                     + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name
                     + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, outfile_name
-                    + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX );
+                    + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, outfile_name
+                    + "_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt" );
         }
     }
 
@@ -893,11 +955,17 @@ public final class SurfacingUtil {
             final PhylogenyNode n = it.next();
             if ( ForesterUtil.isEmpty( n.getName() )
                     && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
-                            .getScientificName() ) ) ) {
+                            .getScientificName() ) )
+                    && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
+                            .getCommonName() ) ) ) {
                 if ( n.getParent() != null ) {
                     names.append( " " );
                     names.append( n.getParent().getName() );
                 }
+                final List l = n.getAllExternalDescendants();
+                for( final Object object : l ) {
+                    System.out.println( l.toString() );
+                }
                 ++c;
             }
         }
@@ -2285,4 +2353,74 @@ public final class SurfacingUtil {
                       domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence(),
                       phylogeny );
     }
+
+    /**
+     * Collects domains-per-protein statistics for one genome and writes them as a single
+     * tab-separated row (genome, mean, sample SD, median, N, min, max) terminated by "\n".
+     * An IOException is passed to ForesterUtil.printWarningMessage and is not rethrown.
+     * Every protein's domain count is also added to the cross-genome statistics and
+     * histogram passed in, and the three domain sets are updated in place: a domain seen
+     * only as the sole domain of a protein is "always single", one seen only in
+     * multi-domain proteins is "never single", and one seen in both kinds of protein is
+     * moved to "sometimes single, sometimes not" and stays there.
+     *
+     * @param genome label written in the first column
+     * @param protein_list proteins whose domain counts are tallied
+     * @param all_genomes_domains_per_potein_stats receives every domain count as well (updated)
+     * @param all_genomes_domains_per_potein_histo domain count to number of proteins (updated)
+     * @param domains_which_are_always_single updated in place
+     * @param domains_which_are_sometimes_single_sometimes_not updated in place
+     * @param domains_which_never_single updated in place
+     * @param writer destination of the row; it is neither flushed nor closed here
+     */
+    public static void domainsPerProteinsStatistics( final String genome,
+                                                     final List<Protein> protein_list,
+                                                     final DescriptiveStatistics all_genomes_domains_per_potein_stats,
+                                                     final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
+                                                     final SortedSet<String> domains_which_are_always_single,
+                                                     final SortedSet<String> domains_which_are_sometimes_single_sometimes_not,
+                                                     final SortedSet<String> domains_which_never_single,
+                                                     final Writer writer ) {
+        final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+        for( final Protein protein : protein_list ) {
+            final int domains = protein.getNumberOfProteinDomains();
+            stats.addValue( domains );
+            all_genomes_domains_per_potein_stats.addValue( domains );
+            final Integer seen = all_genomes_domains_per_potein_histo.get( domains );
+            all_genomes_domains_per_potein_histo.put( domains, seen == null ? 1 : seen + 1 );
+            if ( domains == 1 ) {
+                final String domain = protein.getProteinDomain( 0 ).getDomainId().getId();
+                if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
+                    // Set.remove() reports whether the domain was present in the opposite set.
+                    if ( domains_which_never_single.remove( domain ) ) {
+                        domains_which_are_sometimes_single_sometimes_not.add( domain );
+                    }
+                    else {
+                        domains_which_are_always_single.add( domain );
+                    }
+                }
+            }
+            else if ( domains > 1 ) {
+                for( final Domain d : protein.getProteinDomains() ) {
+                    final String domain = d.getDomainId().getId();
+                    if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
+                        if ( domains_which_are_always_single.remove( domain ) ) {
+                            domains_which_are_sometimes_single_sometimes_not.add( domain );
+                        }
+                        else {
+                            domains_which_never_single.add( domain );
+                        }
+                    }
+                }
+            }
+        }
+        try {
+            // Assemble the whole row before writing so a failing statistic cannot leave a partial line.
+            writer.write( genome + "\t" + stats.arithmeticMean() + "\t" + stats.sampleStandardDeviation() + "\t"
+                    + stats.median() + "\t" + stats.getN() + "\t" + stats.getMin() + "\t" + stats.getMax() + "\n" );
+        }
+        catch ( final IOException e ) {
+            ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
+        }
+    }
 }