cleanup
[jalview.git] / forester / java / src / org / forester / application / gsdi.java
index 4b625cd..4c17f0e 100644 (file)
@@ -31,9 +31,14 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
 
 import org.forester.io.parsers.PhylogenyParser;
 import org.forester.io.parsers.nhx.NHXParser;
+import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
 import org.forester.io.parsers.util.ParserUtils;
 import org.forester.io.writers.PhylogenyWriter;
@@ -45,8 +50,8 @@ import org.forester.phylogeny.factories.PhylogenyFactory;
 import org.forester.sdi.GSDI;
 import org.forester.sdi.SDI;
 import org.forester.sdi.SDI.TaxonomyComparisonBase;
+import org.forester.sdi.SDIException;
 import org.forester.sdi.SDIse;
-import org.forester.sdi.SdiException;
 import org.forester.util.CommandLineArguments;
 import org.forester.util.EasyWriter;
 import org.forester.util.ForesterConstants;
@@ -203,20 +208,42 @@ public final class gsdi {
                 final TaxonomyComparisonBase comp_base = GSDI.determineTaxonomyComparisonBase( gene_tree );
                 switch ( comp_base ) {
                     case SCIENTIFIC_NAME:
-                        PhylogenyMethods
-                                .transferNodeNameToField( species_tree,
-                                                          PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID_UNIPROT_1,
-                                                          true );
+                        try {
+                            PhylogenyMethods
+                                    .transferNodeNameToField( species_tree,
+                                                              PhylogenyMethods.PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME,
+                                                              true );
+                        }
+                        catch ( final PhyloXmlDataFormatException e ) {
+                            ForesterUtil.fatalError( gsdi.PRG_NAME,
+                                                     "Failed to transfer general node name to scientific name, in ["
+                                                             + species_tree_file + "]: " + e.getMessage() );
+                        }
                         break;
                     case CODE:
-                        PhylogenyMethods.transferNodeNameToField( species_tree,
-                                                                  PhylogenyMethods.PhylogenyNodeField.TAXONOMY_CODE,
-                                                                  true );
+                        try {
+                            PhylogenyMethods
+                                    .transferNodeNameToField( species_tree,
+                                                              PhylogenyMethods.PhylogenyNodeField.TAXONOMY_CODE,
+                                                              true );
+                        }
+                        catch ( final PhyloXmlDataFormatException e ) {
+                            ForesterUtil.fatalError( gsdi.PRG_NAME,
+                                                     "Failed to transfer general node name to taxonomy code, in ["
+                                                             + species_tree_file + "]: " + e.getMessage() );
+                        }
                         break;
                     case ID:
-                        PhylogenyMethods.transferNodeNameToField( species_tree,
-                                                                  PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID,
-                                                                  true );
+                        try {
+                            PhylogenyMethods.transferNodeNameToField( species_tree,
+                                                                      PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID,
+                                                                      true );
+                        }
+                        catch ( final PhyloXmlDataFormatException e ) {
+                            ForesterUtil.fatalError( gsdi.PRG_NAME,
+                                                     "Failed to transfer general node name to taxonomy id, in ["
+                                                             + species_tree_file + "]: " + e.getMessage() );
+                        }
                         break;
                     default:
                         ForesterUtil.fatalError( gsdi.PRG_NAME, "unable to determine comparison base" );
@@ -224,8 +251,8 @@ public final class gsdi {
             }
         }
         catch ( final IOException e ) {
-            ForesterUtil.fatalError( gsdi.PRG_NAME,
-                                     "Failed to read species tree from [" + gene_tree_file + "]: " + e.getMessage() );
+            ForesterUtil.fatalError( gsdi.PRG_NAME, "Failed to read species tree from [" + species_tree_file + "]: "
+                    + e.getMessage() );
         }
         gene_tree.setRooted( true );
         species_tree.setRooted( true );
@@ -241,39 +268,35 @@ public final class gsdi {
                 ForesterUtil.fatalError( gsdi.PRG_NAME, "species tree is not completely binary, use GSDI instead" );
             }
         }
-        // For timing.
-        // gene_tree = Helper.createBalancedTree( 10 );
-        // species_tree = Helper.createBalancedTree( 13 );
-        // species_tree = Helper.createUnbalancedTree( 1024 );
-        // gene_tree = Helper.createUnbalancedTree( 8192 );
-        // species_tree = gene_tree.copyTree();
-        // gene_tree = species_tree.copyTree();
-        // Helper.numberSpeciesInOrder( species_tree );
-        // Helper.numberSpeciesInOrder( gene_tree );
-        // Helper.randomizeSpecies( 1, 8192, gene_tree );
-        // Helper.intervalNumberSpecies( gene_tree, 4096 );
-        // Helper.numberSpeciesInDescOrder( gene_tree );
         log_writer.println( PRG_NAME + " - " + PRG_DESC );
         log_writer.println( "  version         : " + PRG_VERSION );
         log_writer.println( "  date            : " + PRG_DATE );
         log_writer.println( "  forester version: " + ForesterConstants.FORESTER_VERSION );
-        log_writer.println( "Start time: " + new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
-        log_writer.println( "Gene tree file: " + gene_tree_file.getCanonicalPath() );
-        log_writer.println( "Gene tree name: "
+        log_writer.println();
+        log_writer.println( "Start time                               : "
+                + new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
+        System.out.println( "Start time                               : "
+                + new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
+        log_writer.println( "Gene tree file                           : " + gene_tree_file.getCanonicalPath() );
+        System.out.println( "Gene tree file                           : " + gene_tree_file.getCanonicalPath() );
+        log_writer.println( "Gene tree name                           : "
+                + ( ForesterUtil.isEmpty( gene_tree.getName() ) ? "" : gene_tree.getName() ) );
+        System.out.println( "Gene tree name                           : "
                 + ( ForesterUtil.isEmpty( gene_tree.getName() ) ? "" : gene_tree.getName() ) );
-        log_writer.println( "Species tree file: " + species_tree_file.getCanonicalPath() );
-        log_writer.println( "Species tree name: "
+        log_writer.println( "Species tree file                        : " + species_tree_file.getCanonicalPath() );
+        System.out.println( "Species tree file                        : " + species_tree_file.getCanonicalPath() );
+        log_writer.println( "Species tree name                        : "
+                + ( ForesterUtil.isEmpty( species_tree.getName() ) ? "" : gene_tree.getName() ) ); // NOTE(review): tests species_tree.getName() but prints gene_tree.getName() under the "Species tree name" label -- looks like a copy/paste slip (the stdout statement below does the same); confirm species_tree.getName() was intended.
+        System.out.println( "Species tree name                        : "
                 + ( ForesterUtil.isEmpty( species_tree.getName() ) ? "" : gene_tree.getName() ) );
-        System.out.println();
         SDI sdi = null;
         final long start_time = new Date().getTime();
         try {
             if ( base_algorithm == BASE_ALGORITHM.GSDI ) {
-                System.out.println();
-                System.out.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model );
-                System.out.println( "Allow stripping of gene tree nodes    : " + allow_stripping_of_gene_tree );
-                log_writer.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model );
-                log_writer.write( "Allow stripping of gene tree nodes    : " + allow_stripping_of_gene_tree );
+                System.out.println( "Use most parsimonous duplication model   : " + most_parsimonous_duplication_model );
+                System.out.println( "Allow stripping of gene tree nodes       : " + allow_stripping_of_gene_tree );
+                log_writer.println( "Use most parsimonous duplication model   : " + most_parsimonous_duplication_model );
+                log_writer.println( "Allow stripping of gene tree nodes       : " + allow_stripping_of_gene_tree );
                 log_writer.flush();
                 sdi = new GSDI( gene_tree,
                                 species_tree,
@@ -289,7 +312,7 @@ public final class gsdi {
                 sdi = new SDIse( gene_tree, species_tree );
             }
         }
-        catch ( final SdiException e ) {
+        catch ( final SDIException e ) {
             log_writer.println( "User Error: " + e.getLocalizedMessage() );
             log_writer.close();
             ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
@@ -305,9 +328,15 @@ public final class gsdi {
             e.printStackTrace();
             System.exit( -1 );
         }
-        System.out.println();
-        System.out.println( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" );
-        log_writer.println( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" );
+        System.out.println( "Running time (excluding I/O)             : " + ( new Date().getTime() - start_time )
+                + "ms" );
+        log_writer.println( "Running time (excluding I/O)             : " + ( new Date().getTime() - start_time )
+                + "ms" );
+        if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) ) {
+            final GSDI gsdi = ( GSDI ) sdi;
+            System.out.println( "Mapping based on                         : " + gsdi.getTaxCompBase() );
+            log_writer.println( "Mapping based on                         : " + gsdi.getTaxCompBase() );
+        }
         try {
             final PhylogenyWriter writer = new PhylogenyWriter();
             writer.toPhyloXML( out_file, gene_tree, 0 );
@@ -315,18 +344,14 @@ public final class gsdi {
         catch ( final IOException e ) {
             ForesterUtil.fatalError( PRG_NAME, "Failed to write to [" + out_file + "]: " + e.getMessage() );
         }
-        System.out.println();
-        System.out.println( "Wrote resulting gene tree to: " + out_file );
-        System.out.println();
-        log_writer.println( "Wrote resulting gene tree to: " + out_file );
+        System.out.println( "Wrote resulting gene tree to             : " + out_file );
+        log_writer.println( "Wrote resulting gene tree to             : " + out_file );
         if ( base_algorithm == BASE_ALGORITHM.SDI ) {
             sdi.computeMappingCostL();
-            System.out.println( "Mapping cost                    : " + sdi.computeMappingCostL() );
-            log_writer.println( "Mapping cost                    : " + sdi.computeMappingCostL() );
+            System.out.println( "Mapping cost                             : " + sdi.computeMappingCostL() );
+            log_writer.println( "Mapping cost                             : " + sdi.computeMappingCostL() );
         }
-        System.out.println( "Number of duplications          : " + sdi.getDuplicationsSum() );
-        log_writer.println( "Number of duplications          : " + sdi.getDuplicationsSum() );
-        if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) ) {
+        else if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) ) {
             final GSDI gsdi = ( GSDI ) sdi;
             final File species_tree_used_file = new File( out_file + SUFFIX_FOR_SPECIES_TREE_USED );
             try {
@@ -337,35 +362,99 @@ public final class gsdi {
                 ForesterUtil.fatalError( PRG_NAME,
                                          "Failed to write to [" + species_tree_used_file + "]: " + e.getMessage() );
             }
-            System.out.println();
-            System.out.println( "Wrote used species tree to: " + species_tree_used_file );
-            System.out.println();
-            log_writer.println( "Wrote used species tree to: " + species_tree_used_file );
+            System.out.println( "Wrote (stripped) species tree to         : " + species_tree_used_file );
+            log_writer.println( "Wrote (stripped) species tree to         : " + species_tree_used_file );
+        }
+        System.out.println( "Number of external nodes in gene tree    : " + gene_tree.getNumberOfExternalNodes() );
+        log_writer.println( "Number of external nodes in gene tree    : " + gene_tree.getNumberOfExternalNodes() );
+        System.out.println( "Number of external nodes in species tree : "
+                + sdi.getSpeciesTree().getNumberOfExternalNodes() );
+        log_writer.println( "Number of external nodes in species tree : "
+                + sdi.getSpeciesTree().getNumberOfExternalNodes() );
+        if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) ) {
+            final GSDI gsdi = ( GSDI ) sdi;
+            final int poly = PhylogenyMethods.countNumberOfPolytomies( gsdi.getSpeciesTree() );
+            System.out.println( "Number of polytomies in species tree     : " + poly );
+            log_writer.println( "Number of polytomies in species tree     : " + poly );
+            System.out.println( "External nodes stripped from gene tree   : "
+                    + gsdi.getStrippedExternalGeneTreeNodes().size() );
+            log_writer.println( "External nodes stripped from gene tree   : "
+                    + gsdi.getStrippedExternalGeneTreeNodes().size() );
+            System.out.println( "External nodes stripped from species tree: "
+                    + gsdi.getStrippedSpeciesTreeNodes().size() );
+            log_writer.println( "External nodes stripped from species tree: "
+                    + gsdi.getStrippedSpeciesTreeNodes().size() );
+        }
+        System.out.println();
+        System.out.println( "Number of duplications                   : " + sdi.getDuplicationsSum() );
+        log_writer.println( "Number of duplications                   : " + sdi.getDuplicationsSum() );
+        if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) ) {
+            final GSDI gsdi = ( GSDI ) sdi;
             if ( !most_parsimonous_duplication_model ) {
-                final int duplications = gsdi.getSpeciationOrDuplicationEventsSum();
-                System.out.println( "Number of potential duplications: " + duplications );
-                log_writer.println( "Number of potential duplications: " + duplications );
-            }
-            final int spec = gsdi.getSpeciationsSum();
-            System.out.println( "Number of speciations            : " + spec );
-            log_writer.println( "Number of speciations            : " + spec );
-            for( final PhylogenyNode n : gsdi.getMappedExternalSpeciesTreeNodes() ) {
-                System.out.println( n.toString() );
+                final int u = gsdi.getSpeciationOrDuplicationEventsSum();
+                System.out.println( "Number of potential duplications         : " + u );
+                log_writer.println( "Number of potential duplications         : " + u );
             }
+            System.out.println( "Number of speciations                    : " + gsdi.getSpeciationsSum() );
+            log_writer.println( "Number of speciations                    : " + gsdi.getSpeciationsSum() );
+            log_writer.println();
+            printMappedNodesToLog( log_writer, gsdi );
+            log_writer.println();
+            printStrippedGeneTreeNodesToLog( log_writer, gsdi );
+            log_writer.println();
+            printStrippedSpeciesTreeNodesToLog( log_writer, gsdi );
         }
         System.out.println();
+        System.out.println( "Wrote log to                             : " + log_file );
+        System.out.println();
         log_writer.close();
-        //  some stat on gene tree:
-        //      filename, name
-        //      number of external nodes, strppided nodes
-        //  some stats on sepcies tree, external nodes,
-        //  filename, name
-        //  internal nodes
-        //  how many of which are polytomies
-        //wrote log file to
-        //  if ( allow_stripping_of_gene_tree ) {
-        //      stripped x nodes, y external nodes remain
-        //  }
+    }
+
+    private static void printMappedNodesToLog( final EasyWriter log_writer, final GSDI gsdi ) throws IOException { // Writes to log_writer a header line plus one line per distinct toString() of the nodes returned by gsdi.getMappedExternalSpeciesTreeNodes().
+        final SortedSet<String> ss = new TreeSet<String>(); // TreeSet: collapses equal strings and iterates in natural String order
+        for( final PhylogenyNode n : gsdi.getMappedExternalSpeciesTreeNodes() ) {
+            ss.add( n.toString() );
+        }
+        log_writer.println( "The following " + ss.size() + " species were used: " ); // the count is of distinct strings, not of nodes
+        for( final String s : ss ) {
+            log_writer.println( "  " + s ); // entries are indented two spaces under the header
+        }
+    }
+
+    private static void printStrippedGeneTreeNodesToLog( final EasyWriter log_writer, final GSDI gsdi )
+            throws IOException { // Writes to log_writer a header line plus one line per distinct toString() of the nodes returned by gsdi.getStrippedExternalGeneTreeNodes(), with a multiplicity suffix for repeated strings.
+        final SortedMap<String, Integer> sm = new TreeMap<String, Integer>(); // node string -> number of nodes with that string; TreeMap keeps keys in natural String order
+        for( final PhylogenyNode n : gsdi.getStrippedExternalGeneTreeNodes() ) {
+            final String s = n.toString();
+            if ( sm.containsKey( s ) ) {
+                sm.put( s, sm.get( s ) + 1 ); // seen before: bump the occurrence count
+            }
+            else {
+                sm.put( s, 1 ); // first occurrence of this string
+            }
+        }
+        log_writer.println( "The following " + sm.size() + " nodes were stripped from the gene tree: " ); // NOTE(review): sm.size() is the number of distinct strings, which is smaller than the number of stripped nodes whenever a string repeats -- confirm which count is intended.
+        for( final String s : sm.keySet() ) {
+            final int count = sm.get( s );
+            if ( count == 1 ) {
+                log_writer.println( "  " + s ); // unique string: no suffix
+            }
+            else {
+                log_writer.println( "  " + s + " [" + count + "]" ); // repeated string: append its occurrence count in brackets
+            }
+        }
+    }
+
+    private static void printStrippedSpeciesTreeNodesToLog( final EasyWriter log_writer, final GSDI gsdi )
+            throws IOException { // Writes to log_writer a header line plus one line per distinct toString() of the nodes returned by gsdi.getStrippedSpeciesTreeNodes().
+        final SortedSet<String> ss = new TreeSet<String>(); // TreeSet: collapses equal strings and iterates in natural String order
+        for( final PhylogenyNode n : gsdi.getStrippedSpeciesTreeNodes() ) {
+            ss.add( n.toString() );
+        }
+        log_writer.println( "The following " + ss.size() + " nodes were stripped from the species tree: " ); // the count is of distinct strings, not of nodes
+        for( final String n : ss ) {
+            log_writer.println( "  " + n ); // entries are indented two spaces under the header
+        }
+    }
 
     private static void print_help() {