in progress
[jalview.git] / forester / java / src / org / forester / application / surfacing.java
index 08dde5e..a085c9d 100644 (file)
@@ -46,7 +46,7 @@ import java.util.TreeSet;
 
 import org.forester.evoinference.distance.NeighborJoining;
 import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
-import org.forester.evoinference.matrix.distance.DistanceMatrix;
+import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
 import org.forester.go.GoId;
 import org.forester.go.GoNameSpace;
 import org.forester.go.GoTerm;
@@ -64,6 +64,7 @@ import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
 import org.forester.protein.BinaryDomainCombination;
+import org.forester.protein.Domain;
 import org.forester.protein.DomainId;
 import org.forester.protein.Protein;
 import org.forester.species.BasicSpecies;
@@ -234,8 +235,8 @@ public class surfacing {
     final static private String                               INPUT_SPECIES_TREE_OPTION                                                     = "species_tree";
     final static private String                               SEQ_EXTRACT_OPTION                                                            = "prot_extract";
     final static private char                                 SEPARATOR_FOR_INPUT_VALUES                                                    = '#';
-    final static private String                               PRG_VERSION                                                                   = "2.240";
-    final static private String                               PRG_DATE                                                                      = "2012.05.04";
+    final static private String                               PRG_VERSION                                                                   = "2.250";
+    final static private String                               PRG_DATE                                                                      = "2012.05.07";
     final static private String                               E_MAIL                                                                        = "czmasek@burnham.org";
     final static private String                               WWW                                                                           = "www.phylosoft.org/forester/applications/surfacing";
     final static private boolean                              IGNORE_DUFS_DEFAULT                                                           = true;
@@ -285,6 +286,7 @@ public class surfacing {
     public static final String                                INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX                       = "_indep_dc_gains_fitch_lists_MAPPED.txt";
     public static final String                                INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX        = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt";
     public static final String                                INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt";
+    private static final boolean                              PERFORM_DC_REGAIN_PROTEINS_STATS                                              = true;
 
     private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option,
                                                                  final String[][] input_file_properties,
@@ -544,7 +546,8 @@ public class surfacing {
         return intrees;
     }
 
-    private static List<Phylogeny> inferSpeciesTrees( final File outfile, final List<DistanceMatrix> distances_list ) {
+    private static List<Phylogeny> inferSpeciesTrees( final File outfile,
+                                                      final List<BasicSymmetricalDistanceMatrix> distances_list ) {
         final NeighborJoining nj = NeighborJoining.createInstance();
         final List<Phylogeny> phylogenies = nj.execute( distances_list );
         final PhylogenyWriter w = new PhylogenyWriter();
@@ -1759,8 +1762,13 @@ public class surfacing {
         catch ( final IOException e3 ) {
             e3.printStackTrace();
         }
-        final Map<String, DescriptiveStatistics> protein_length_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
-        final Map<String, DescriptiveStatistics> domain_number_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
+        Map<String, DescriptiveStatistics> protein_length_stats_by_dc = null;
+        Map<String, DescriptiveStatistics> domain_number_stats_by_dc = null;
+        final Map<String, DescriptiveStatistics> domain_length_stats_by_domain = new HashMap<String, DescriptiveStatistics>();
+        if ( PERFORM_DC_REGAIN_PROTEINS_STATS ) {
+            protein_length_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
+            domain_number_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
+        }
         // Main loop:
         for( int i = 0; i < number_of_genomes; ++i ) {
             System.out.println();
@@ -1911,6 +1919,13 @@ public class surfacing {
                     dc_data_writer.write( SurfacingUtil.proteinToDomainCombinations( protein, count + "", "\t" )
                             .toString() );
                     ++count;
+                    for( final Domain d : protein.getProteinDomains() ) {
+                        final String d_str = d.getDomainId().toString();
+                        if ( !domain_length_stats_by_domain.containsKey( d_str ) ) {
+                            domain_length_stats_by_domain.put( d_str, new BasicDescriptiveStatistics() );
+                        }
+                        domain_length_stats_by_domain.get( d_str ).addValue( d.getLength() );
+                    }
                 }
             }
             catch ( final IOException e ) {
@@ -2261,7 +2276,8 @@ public class surfacing {
                                                         all_bin_domain_combinations_lost_fitch,
                                                         dc_type,
                                                         protein_length_stats_by_dc,
-                                                        domain_number_stats_by_dc );
+                                                        domain_number_stats_by_dc,
+                                                        domain_length_stats_by_domain );
                 // Listing of all domain combinations gained is only done if only one input tree is used. 
                 if ( ( domain_id_to_secondary_features_maps != null )
                         && ( domain_id_to_secondary_features_maps.length > 0 ) ) {