in progress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Sat, 5 May 2012 04:16:19 +0000 (04:16 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Sat, 5 May 2012 04:16:19 +0000 (04:16 +0000)
forester/java/src/org/forester/application/surfacing.java
forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java
forester/java/src/org/forester/surfacing/SurfacingUtil.java
forester/java/src/org/forester/surfacing/TestSurfacing.java

index fc9567d..08dde5e 100644 (file)
@@ -234,8 +234,8 @@ public class surfacing {
     final static private String                               INPUT_SPECIES_TREE_OPTION                                                     = "species_tree";
     final static private String                               SEQ_EXTRACT_OPTION                                                            = "prot_extract";
     final static private char                                 SEPARATOR_FOR_INPUT_VALUES                                                    = '#';
-    final static private String                               PRG_VERSION                                                                   = "2.230";
-    final static private String                               PRG_DATE                                                                      = "2012.04.22";
+    final static private String                               PRG_VERSION                                                                   = "2.240";
+    final static private String                               PRG_DATE                                                                      = "2012.05.04";
     final static private String                               E_MAIL                                                                        = "czmasek@burnham.org";
     final static private String                               WWW                                                                           = "www.phylosoft.org/forester/applications/surfacing";
     final static private boolean                              IGNORE_DUFS_DEFAULT                                                           = true;
@@ -358,6 +358,7 @@ public class surfacing {
      * @param sum_of_all_domains_encountered
      * @param all_bin_domain_combinations_encountered
      * @param is_gains_analysis
+     * @param protein_length_stats_by_dc 
      * @throws IOException
      */
     private static void executeFitchGainsAnalysis( final File output_file,
@@ -1412,7 +1413,6 @@ public class surfacing {
         System.out.println( "Ignore combination with self: " + ignore_combination_with_same );
         html_desc.append( "<tr><td>Ignore combination with self for domain combination similarity analyses:</td><td>"
                 + ignore_combination_with_same + "</td></tr>" + nl );
-        ;
         System.out.println( "Consider directedness       : "
                 + ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) );
         html_desc.append( "<tr><td>Consider directedness of binary domain combinations:</td><td>"
@@ -1759,6 +1759,8 @@ public class surfacing {
         catch ( final IOException e3 ) {
             e3.printStackTrace();
         }
+        final Map<String, DescriptiveStatistics> protein_length_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
+        final Map<String, DescriptiveStatistics> domain_number_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
         // Main loop:
         for( int i = 0; i < number_of_genomes; ++i ) {
             System.out.println();
@@ -1927,7 +1929,9 @@ public class surfacing {
                                      ignore_combination_with_same,
                                      new BasicSpecies( input_file_properties[ i ][ 1 ] ),
                                      domain_id_to_go_ids_map,
-                                     dc_type ) );
+                                     dc_type,
+                                     protein_length_stats_by_dc,
+                                     domain_number_stats_by_dc ) );
             domain_lengths_table.addLengths( protein_list );
             if ( gwcd_list.get( i ).getSize() > 0 ) {
                 SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties,
@@ -2255,7 +2259,9 @@ public class surfacing {
                                                         output_binary_domain_combinationsfor_graph_analysis,
                                                         all_bin_domain_combinations_gained_fitch,
                                                         all_bin_domain_combinations_lost_fitch,
-                                                        dc_type );
+                                                        dc_type,
+                                                        protein_length_stats_by_dc,
+                                                        domain_number_stats_by_dc );
                 // Listing of all domain combinations gained is only done if only one input tree is used. 
                 if ( ( domain_id_to_secondary_features_maps != null )
                         && ( domain_id_to_secondary_features_maps.length > 0 ) ) {
index 6112bdb..ef98751 100644 (file)
@@ -264,21 +264,25 @@ public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDom
                                ignore_combination_with_same_domain,
                                species,
                                null,
-                               DomainCombinationType.BASIC );
+                               DomainCombinationType.BASIC,
+                               null,
+                               null );
     }
 
     public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
                                                                    final boolean ignore_combination_with_same_domain,
                                                                    final Species species,
                                                                    final DomainCombinationType dc_type ) {
-        return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type );
+        return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type, null, null );
     }
 
     public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
                                                                    final boolean ignore_combination_with_same_domain,
                                                                    final Species species,
                                                                    final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
-                                                                   final DomainCombinationType dc_type ) {
+                                                                   final DomainCombinationType dc_type,
+                                                                   final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
+                                                                   final Map<String, DescriptiveStatistics> domain_number_stats_by_dc ) {
         final BasicGenomeWideCombinableDomains instance = new BasicGenomeWideCombinableDomains( species, dc_type );
         final Map<DomainId, Integer> domain_counts = new HashMap<DomainId, Integer>();
         final Map<DomainId, Integer> domain_protein_counts = new HashMap<DomainId, Integer>();
@@ -361,6 +365,27 @@ public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDom
                     if ( ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) && ( closest != null ) ) {
                         domain_combination.addCombinableDomain( closest.getDomainId() );
                     }
+                    if ( protein_length_stats_by_dc != null ) {
+                        final List<BinaryDomainCombination> dcs = domain_combination.toBinaryDomainCombinations();
+                        for( final BinaryDomainCombination dc : dcs ) {
+                            final String dc_str = dc.toString();
+                            if ( !protein_length_stats_by_dc.containsKey( dc_str ) ) {
+                                protein_length_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() );
+                            }
+                            protein_length_stats_by_dc.get( dc_str ).addValue( protein.getLength() );
+                        }
+                    }
+                    if ( domain_number_stats_by_dc != null ) {
+                        final List<BinaryDomainCombination> dcs = domain_combination.toBinaryDomainCombinations();
+                        for( final BinaryDomainCombination dc : dcs ) {
+                            final String dc_str = dc.toString();
+                            if ( !domain_number_stats_by_dc.containsKey( dc_str ) ) {
+                                domain_number_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() );
+                            }
+                            domain_number_stats_by_dc.get( dc_str ).addValue( protein.getNumberOfProteinDomains() );
+                        }
+                    }
+                    //
                 }
             }
         }
index c0b8b8f..e998e82 100644 (file)
@@ -41,6 +41,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.PriorityQueue;
 import java.util.Set;
 import java.util.SortedMap;
@@ -176,7 +177,9 @@ public final class SurfacingUtil {
                                                                     final String outfilename_for_dc_for_go_mapping,
                                                                     final String outfilename_for_dc_for_go_mapping_unique,
                                                                     final String outfilename_for_rank_counts,
-                                                                    final String outfilename_for_ancestor_species_counts ) {
+                                                                    final String outfilename_for_ancestor_species_counts,
+                                                                    final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
+                                                                    final Map<String, DescriptiveStatistics> domain_number_stats_by_dc ) {
         try {
             final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
             final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
@@ -197,6 +200,8 @@ public final class SurfacingUtil {
             }
             final SortedMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
             final SortedMap<Integer, StringBuilder> domain_lists = new TreeMap<Integer, StringBuilder>();
+            final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_protein_length_stats = new TreeMap<Integer, DescriptiveStatistics>();
+            final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_number_stats = new TreeMap<Integer, DescriptiveStatistics>();
             final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
             final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
             final Set<String> dcs = dc_gain_counts.keySet();
@@ -219,6 +224,20 @@ public final class SurfacingUtil {
                     set.addAll( splitDomainCombination( dc ) );
                     domain_lists_go_unique.put( count, set );
                 }
+                if ( protein_length_stats_by_dc != null ) {
+                    if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) {
+                        dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() );
+                    }
+                    dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc )
+                            .arithmeticMean() );
+                }
+                if ( domain_number_stats_by_dc != null ) {
+                    if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) {
+                        dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() );
+                    }
+                    dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc )
+                            .arithmeticMean() );
+                }
                 if ( count > 1 ) {
                     more_than_once.add( dc );
                 }
@@ -290,6 +309,22 @@ public final class SurfacingUtil {
                                      ForesterUtil.LINE_SEPARATOR );
             out_for_rank_counts.close();
             out_for_ancestor_species_counts.close();
+            System.out.println( "Lengths: " );
+            if ( protein_length_stats_by_dc != null ) {
+                for( final Entry<?, ?> entry : dc_reapp_counts_to_protein_length_stats.entrySet() ) {
+                    System.out.println( entry.getKey().toString() );
+                    System.out.println( ": " );
+                    System.out.println( entry.getValue().toString() );
+                }
+            }
+            System.out.println( "Number of domains: " );
+            if ( domain_number_stats_by_dc != null ) {
+                for( final Entry<?, ?> entry : dc_reapp_counts_to_domain_number_stats.entrySet() ) {
+                    System.out.println( entry.getKey().toString() );
+                    System.out.println( ": " );
+                    System.out.println( entry.getValue().toString() );
+                }
+            }
         }
         catch ( final IOException e ) {
             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
@@ -572,7 +607,9 @@ public final class SurfacingUtil {
                                                  final boolean output_binary_domain_combinations_for_graphs,
                                                  final List<BinaryDomainCombination> all_binary_domains_combination_gained_fitch,
                                                  final List<BinaryDomainCombination> all_binary_domains_combination_lost_fitch,
-                                                 final BinaryDomainCombination.DomainCombinationType dc_type ) {
+                                                 final BinaryDomainCombination.DomainCombinationType dc_type,
+                                                 final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
+                                                 final Map<String, DescriptiveStatistics> domain_number_stats_by_dc ) {
         final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
         final String date_time = ForesterUtil.getCurrentDateTime();
         final SortedSet<String> all_pfams_encountered = new TreeSet<String>();
@@ -789,12 +826,19 @@ public final class SurfacingUtil {
                               parameters_str );
             SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
                     + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH );
-            calculateIndependentDomainCombinationGains( local_phylogeny_l, outfile_name
-                    + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name
-                    + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name
-                    + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, outfile_name
-                    + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, outfile_name
-                    + "_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt" );
+            calculateIndependentDomainCombinationGains( local_phylogeny_l,
+                                                        outfile_name
+                                                                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX,
+                                                        outfile_name
+                                                                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX,
+                                                        outfile_name
+                                                                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX,
+                                                        outfile_name
+                                                                + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX,
+                                                        outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt",
+                                                        outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt",
+                                                        protein_length_stats_by_dc,
+                                                        domain_number_stats_by_dc );
         }
     }
 
@@ -867,7 +911,57 @@ public final class SurfacingUtil {
                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name
                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name
                 + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name
-                + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt" );
+                + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null );
+    }
+
+    public static void doit( final List<Protein> proteins, // Writes one separator-delimited line to 'out' for every protein that passes the species and domain filters below.
+                             final List<DomainId> query_domain_ids_nc_order, // domain ids handed to Protein.contains() for the match test
+                             final Writer out, // destination; flushed at the end, never closed in this method
+                             final String separator, // written between the fields of each output line
+                             final String limit_to_species, // when empty (per ForesterUtil.isEmpty) no species filtering is applied
+                             final Map<String, List<Integer>> average_protein_lengths_by_dc ) throws IOException { // NOTE(review): average_protein_lengths_by_dc is never read in this body; IOException can come from the 'out' write/flush calls
+        for( final Protein protein : proteins ) {
+            if ( ForesterUtil.isEmpty( limit_to_species )
+                    || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { // species id is compared case-insensitively
+                if ( protein.contains( query_domain_ids_nc_order, true ) ) { // presumably 'true' requests an ordered (N-to-C) match -- TODO confirm against Protein.contains
+                    out.write( protein.getSpecies().getSpeciesId() ); // field 1: species id
+                    out.write( separator );
+                    out.write( protein.getProteinId().getId() ); // field 2: protein id
+                    out.write( separator );
+                    out.write( "[" ); // field 3: bracketed list of the protein's distinct domain ids
+                    final Set<DomainId> visited_domain_ids = new HashSet<DomainId>(); // ids already written for this protein, so each id is listed only once
+                    boolean first = true; // true until the first id has been written; suppresses the leading " " delimiter
+                    for( final Domain domain : protein.getProteinDomains() ) {
+                        if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
+                            visited_domain_ids.add( domain.getDomainId() );
+                            if ( first ) {
+                                first = false;
+                            }
+                            else {
+                                out.write( " " ); // single space between consecutive domain entries
+                            }
+                            out.write( domain.getDomainId().getId() );
+                            out.write( " {" );
+                            out.write( "" + domain.getTotalCount() ); // "" + ... selects write(String); if the count is an int, write(int) would emit a single character instead
+                            out.write( "}" );
+                        }
+                    }
+                    out.write( "]" );
+                    out.write( separator );
+                    if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
+                            .equals( SurfacingConstants.NONE ) ) ) { // field 4: description, left blank when empty or equal to SurfacingConstants.NONE
+                        out.write( protein.getDescription() );
+                    }
+                    out.write( separator ); // written unconditionally, so the field count stays the same when the description is omitted
+                    if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
+                            .equals( SurfacingConstants.NONE ) ) ) { // field 5: accession, left blank when empty or equal to SurfacingConstants.NONE
+                        out.write( protein.getAccession() );
+                    }
+                    out.write( SurfacingConstants.NL ); // terminates the line for this protein
+                }
+            }
+        }
+        out.flush(); // flush only; this method does not close 'out'
     }
 
     public static void extractProteinNames( final List<Protein> proteins,
index 8d4186c..11f4246 100644 (file)
@@ -4108,7 +4108,9 @@ public class TestSurfacing {
                                      true,
                                      new BasicSpecies( "human" ),
                                      null,
-                                     DomainCombinationType.BASIC );
+                                     DomainCombinationType.BASIC,
+                                     null,
+                                     null );
             cd = cdcc2.get( new DomainId( "A" ) );
             if ( cd.getKeyDomainCount() != 9 ) {
                 return false;