in progress
[jalview.git] / forester / java / src / org / forester / application / surfacing.java
index fc9567d..a085c9d 100644 (file)
@@ -46,7 +46,7 @@ import java.util.TreeSet;
 
 import org.forester.evoinference.distance.NeighborJoining;
 import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
-import org.forester.evoinference.matrix.distance.DistanceMatrix;
+import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
 import org.forester.go.GoId;
 import org.forester.go.GoNameSpace;
 import org.forester.go.GoTerm;
@@ -64,6 +64,7 @@ import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
 import org.forester.protein.BinaryDomainCombination;
+import org.forester.protein.Domain;
 import org.forester.protein.DomainId;
 import org.forester.protein.Protein;
 import org.forester.species.BasicSpecies;
@@ -234,8 +235,8 @@ public class surfacing {
     final static private String                               INPUT_SPECIES_TREE_OPTION                                                     = "species_tree";
     final static private String                               SEQ_EXTRACT_OPTION                                                            = "prot_extract";
     final static private char                                 SEPARATOR_FOR_INPUT_VALUES                                                    = '#';
-    final static private String                               PRG_VERSION                                                                   = "2.230";
-    final static private String                               PRG_DATE                                                                      = "2012.04.22";
+    final static private String                               PRG_VERSION                                                                   = "2.250";
+    final static private String                               PRG_DATE                                                                      = "2012.05.07";
     final static private String                               E_MAIL                                                                        = "czmasek@burnham.org";
     final static private String                               WWW                                                                           = "www.phylosoft.org/forester/applications/surfacing";
     final static private boolean                              IGNORE_DUFS_DEFAULT                                                           = true;
@@ -285,6 +286,7 @@ public class surfacing {
     public static final String                                INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX                       = "_indep_dc_gains_fitch_lists_MAPPED.txt";
     public static final String                                INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX        = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt";
     public static final String                                INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt";
+    private static final boolean                              PERFORM_DC_REGAIN_PROTEINS_STATS                                              = true;
 
     private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option,
                                                                  final String[][] input_file_properties,
@@ -358,6 +360,7 @@ public class surfacing {
      * @param sum_of_all_domains_encountered
      * @param all_bin_domain_combinations_encountered
      * @param is_gains_analysis
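+     * @param protein_length_stats_by_dc NOTE(review): undocumented; no matching parameter is added to the signature in this hunk - confirm or remove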
      * @throws IOException
      */
     private static void executeFitchGainsAnalysis( final File output_file,
@@ -543,7 +546,8 @@ public class surfacing {
         return intrees;
     }
 
-    private static List<Phylogeny> inferSpeciesTrees( final File outfile, final List<DistanceMatrix> distances_list ) {
+    private static List<Phylogeny> inferSpeciesTrees( final File outfile,
+                                                      final List<BasicSymmetricalDistanceMatrix> distances_list ) {
         final NeighborJoining nj = NeighborJoining.createInstance();
         final List<Phylogeny> phylogenies = nj.execute( distances_list );
         final PhylogenyWriter w = new PhylogenyWriter();
@@ -1412,7 +1416,6 @@ public class surfacing {
         System.out.println( "Ignore combination with self: " + ignore_combination_with_same );
         html_desc.append( "<tr><td>Ignore combination with self for domain combination similarity analyses:</td><td>"
                 + ignore_combination_with_same + "</td></tr>" + nl );
-        ;
         System.out.println( "Consider directedness       : "
                 + ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) );
         html_desc.append( "<tr><td>Consider directedness of binary domain combinations:</td><td>"
@@ -1759,6 +1762,13 @@ public class surfacing {
         catch ( final IOException e3 ) {
             e3.printStackTrace();
         }
+        Map<String, DescriptiveStatistics> protein_length_stats_by_dc = null;
+        Map<String, DescriptiveStatistics> domain_number_stats_by_dc = null;
+        final Map<String, DescriptiveStatistics> domain_length_stats_by_domain = new HashMap<String, DescriptiveStatistics>();
+        if ( PERFORM_DC_REGAIN_PROTEINS_STATS ) {
+            protein_length_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
+            domain_number_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
+        }
         // Main loop:
         for( int i = 0; i < number_of_genomes; ++i ) {
             System.out.println();
@@ -1909,6 +1919,13 @@ public class surfacing {
                     dc_data_writer.write( SurfacingUtil.proteinToDomainCombinations( protein, count + "", "\t" )
                             .toString() );
                     ++count;
+                    for( final Domain d : protein.getProteinDomains() ) {
+                        final String d_str = d.getDomainId().toString();
+                        if ( !domain_length_stats_by_domain.containsKey( d_str ) ) {
+                            domain_length_stats_by_domain.put( d_str, new BasicDescriptiveStatistics() );
+                        }
+                        domain_length_stats_by_domain.get( d_str ).addValue( d.getLength() );
+                    }
                 }
             }
             catch ( final IOException e ) {
@@ -1927,7 +1944,9 @@ public class surfacing {
                                      ignore_combination_with_same,
                                      new BasicSpecies( input_file_properties[ i ][ 1 ] ),
                                      domain_id_to_go_ids_map,
-                                     dc_type ) );
+                                     dc_type,
+                                     protein_length_stats_by_dc,
+                                     domain_number_stats_by_dc ) );
             domain_lengths_table.addLengths( protein_list );
             if ( gwcd_list.get( i ).getSize() > 0 ) {
                 SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties,
@@ -2255,7 +2274,10 @@ public class surfacing {
                                                         output_binary_domain_combinationsfor_graph_analysis,
                                                         all_bin_domain_combinations_gained_fitch,
                                                         all_bin_domain_combinations_lost_fitch,
-                                                        dc_type );
+                                                        dc_type,
+                                                        protein_length_stats_by_dc,
+                                                        domain_number_stats_by_dc,
+                                                        domain_length_stats_by_domain );
                 // Listing of all domain combinations gained is only done if only one input tree is used. 
                 if ( ( domain_id_to_secondary_features_maps != null )
                         && ( domain_id_to_secondary_features_maps.length > 0 ) ) {