improving GSDI, under construction...
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 19 Jun 2012 00:18:32 +0000 (00:18 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 19 Jun 2012 00:18:32 +0000 (00:18 +0000)
forester/java/src/org/forester/application/gsdi.java
forester/java/src/org/forester/phylogeny/PhylogenyMethods.java
forester/java/src/org/forester/phylogeny/data/Taxonomy.java
forester/java/src/org/forester/sdi/GSDI.java
forester/java/src/org/forester/sdi/SDI.java
forester/java/src/org/forester/util/ForesterUtil.java

index b53d2f1..0c1a387 100644 (file)
@@ -27,6 +27,8 @@ package org.forester.application;
 
 import java.io.File;
 import java.io.IOException;
+import java.io.Writer;
+import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;
@@ -42,70 +44,81 @@ import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.factories.PhylogenyFactory;
 import org.forester.sdi.GSDI;
 import org.forester.sdi.SDI;
+import org.forester.sdi.SDI.TaxonomyComparisonBase;
 import org.forester.sdi.SDIse;
 import org.forester.util.CommandLineArguments;
 import org.forester.util.ForesterUtil;
 
 public final class gsdi {
 
-    final static public boolean REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE = true;
-    final static private String STRIP_OPTION                           = "s";
-    final static private String ALLOW_STRIPPING_OF_GENE_TREE_OPTION       = "g";
-    
-    final static private String SDI_OPTION                             = "b";
-    final static private String MOST_PARSIMONIOUS_OPTION               = "m";
-    final static private String GUESS_FORMAT_OF_SPECIES_TREE           = "q";
-    final static private String HELP_OPTION_1                          = "help";
-    final static private String HELP_OPTION_2                          = "h";
-    final static private String DEFAULT_OUTFILE_SUFFIX                 = "_gsdi_out.phylo.xml";
+    private enum BASE_ALGORITHM {
+        GSDI, SDI
+    }
+    final static public boolean REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE     = true;
+    final static private String STRIP_OPTION                               = "s";
+    final static private String ALLOW_STRIPPING_OF_GENE_TREE_OPTION        = "g";
+    final static private String SDI_OPTION                                 = "b";
+    final static private String MOST_PARSIMONIOUS_OPTION                   = "m";
+    final static private String GUESS_FORMAT_OF_SPECIES_TREE               = "q";
+    final static private String HELP_OPTION_1                              = "help";
+    final static private String HELP_OPTION_2                              = "h";
+    final static private String DEFAULT_OUTFILE_SUFFIX                     = "_gsdi_out.phylo.xml";
     final static private String SUFFIX_FOR_LIST_OF_STIPPED_GENE_TREE_NODES = "_stripped_gene_tree_nodes.txt";
-    final static private String SUFFIX_FOR_LOG_FILE                    = "_gsdi_log.txt";    
-    final static private String PRG_NAME                               = "gsdi";
-    final static private String PRG_VERSION                            = "0.901";
-    final static private String PRG_DATE                               = "120608";
-    final static private String PRG_DESC                               = "general speciation duplication inference";
-    final static private String E_MAIL                                 = "phylosoft@gmail.com";
-    final static private String WWW                                    = "www.phylosoft.org/forester";
+    final static private String LOGFILE_SUFFIX                             = "_gsdi_log.txt";
+    final static private String PRG_NAME                                   = "gsdi";
+    final static private String PRG_VERSION                                = "0.901";
+    final static private String PRG_DATE                                   = "120608";
+    final static private String PRG_DESC                                   = "general speciation duplication inference";
+    final static private String E_MAIL                                     = "phylosoft@gmail.com";
+    final static private String WWW                                        = "www.phylosoft.org/forester";
 
     public static void main( final String args[] ) {
-        ForesterUtil.printProgramInformation( PRG_NAME,
-                                              PRG_DESC,
-                                              PRG_VERSION,
-                                              PRG_DATE,
-                                              E_MAIL,
-                                              WWW,
-                                              ForesterUtil.getForesterLibraryInformation() );
-        CommandLineArguments cla = null;
         try {
-            cla = new CommandLineArguments( args );
-        }
-        catch ( final Exception e ) {
-            ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
-        }
-        if ( cla.isOptionSet( gsdi.HELP_OPTION_1 ) || cla.isOptionSet( gsdi.HELP_OPTION_2 ) ) {
-            System.out.println();
-            gsdi.print_help();
-            System.exit( 0 );
-        }
-        else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() < 2 ) || ( cla.getNumberOfNames() > 3 ) ) {
-            System.out.println();
-            System.out.println( "Wrong number of arguments." );
-            System.out.println();
-            gsdi.print_help();
-            System.exit( -1 );
+            ForesterUtil.printProgramInformation( PRG_NAME,
+                                                  PRG_DESC,
+                                                  PRG_VERSION,
+                                                  PRG_DATE,
+                                                  E_MAIL,
+                                                  WWW,
+                                                  ForesterUtil.getForesterLibraryInformation() );
+            CommandLineArguments cla = null;
+            try {
+                cla = new CommandLineArguments( args );
+            }
+            catch ( final Exception e ) {
+                ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
+            }
+            if ( cla.isOptionSet( gsdi.HELP_OPTION_1 ) || cla.isOptionSet( gsdi.HELP_OPTION_2 ) ) {
+                System.out.println();
+                gsdi.print_help();
+                System.exit( 0 );
+            }
+            else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() < 2 ) || ( cla.getNumberOfNames() > 3 ) ) {
+                System.out.println();
+                System.out.println( "Wrong number of arguments." );
+                System.out.println();
+                gsdi.print_help();
+                System.exit( -1 );
+            }
+            final List<String> allowed_options = new ArrayList<String>();
+            allowed_options.add( gsdi.STRIP_OPTION );
+            allowed_options.add( gsdi.SDI_OPTION );
+            allowed_options.add( gsdi.GUESS_FORMAT_OF_SPECIES_TREE );
+            allowed_options.add( gsdi.MOST_PARSIMONIOUS_OPTION );
+            allowed_options.add( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION );
+            final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
+            if ( dissallowed_options.length() > 0 ) {
+                ForesterUtil.fatalError( gsdi.PRG_NAME, "unknown option(s): " + dissallowed_options );
+            }
+            execute( cla );
         }
-        final List<String> allowed_options = new ArrayList<String>();
-        allowed_options.add( gsdi.STRIP_OPTION );
-        allowed_options.add( gsdi.SDI_OPTION );
-        allowed_options.add( gsdi.GUESS_FORMAT_OF_SPECIES_TREE );
-        allowed_options.add( gsdi.MOST_PARSIMONIOUS_OPTION );
-        allowed_options.add( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION );
-        
-        final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
-        if ( dissallowed_options.length() > 0 ) {
-            ForesterUtil.fatalError( gsdi.PRG_NAME, "unknown option(s): " + dissallowed_options );
+        catch ( final IOException e ) {
+            ForesterUtil.fatalError( gsdi.PRG_NAME, e.getMessage() );
         }
-        boolean use_sdise = false;
+    }
+
+    private static void execute( final CommandLineArguments cla ) throws IOException {
+        BASE_ALGORITHM base_algorithm = BASE_ALGORITHM.GSDI;
         boolean strip = false;
         boolean most_parsimonous_duplication_model = false;
         boolean species_tree_in_phyloxml = true;
@@ -114,10 +127,10 @@ public final class gsdi {
             strip = true;
         }
         if ( cla.isOptionSet( gsdi.SDI_OPTION ) ) {
-            use_sdise = true;
+            base_algorithm = BASE_ALGORITHM.SDI;
         }
         if ( cla.isOptionSet( gsdi.MOST_PARSIMONIOUS_OPTION ) ) {
-            if ( use_sdise ) {
+            if ( base_algorithm != BASE_ALGORITHM.GSDI ) {
                 ForesterUtil.fatalError( gsdi.PRG_NAME, "Can only use most parsimonious duplication mode with GSDI" );
             }
             most_parsimonous_duplication_model = true;
@@ -126,19 +139,18 @@ public final class gsdi {
             species_tree_in_phyloxml = false;
         }
         if ( cla.isOptionSet( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION ) ) {
-            if ( use_sdise ) {
+            if ( base_algorithm != BASE_ALGORITHM.GSDI ) {
                 ForesterUtil.fatalError( gsdi.PRG_NAME, "Can only allow stripping of gene tree with GSDI" );
             }
             allow_stripping_of_gene_tree = true;
         }
-       
-        
-        
         Phylogeny species_tree = null;
         Phylogeny gene_tree = null;
         File gene_tree_file = null;
         File species_tree_file = null;
         File out_file = null;
+        File log_file = null;
+        Writer log_writer = null;
         try {
             gene_tree_file = cla.getFile( 0 );
             species_tree_file = cla.getFile( 1 );
@@ -147,8 +159,8 @@ public final class gsdi {
             }
             else {
                 out_file = new File( ForesterUtil.removeSuffix( gene_tree_file.toString() ) + DEFAULT_OUTFILE_SUFFIX );
-                //out_file = new File( gsdi.DEFAULT_OUTFILE );
             }
+            log_file = new File( ForesterUtil.removeSuffix( out_file.toString() ) + LOGFILE_SUFFIX );
         }
         catch ( final IllegalArgumentException e ) {
             ForesterUtil.fatalError( gsdi.PRG_NAME, "error in command line: " + e.getMessage() );
@@ -162,6 +174,23 @@ public final class gsdi {
         if ( ForesterUtil.isWritableFile( out_file ) != null ) {
             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( out_file ) );
         }
+        if ( ForesterUtil.isWritableFile( log_file ) != null ) {
+            ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( log_file ) );
+        }
+        try {
+            log_writer = ForesterUtil.createBufferedWriter( log_file );
+        }
+        catch ( final IOException e ) {
+            ForesterUtil.fatalError( gsdi.PRG_NAME, "Failed to create [" + log_file + "]: " + e.getMessage() );
+        }
+        try {
+            final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+            gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ];
+        }
+        catch ( final IOException e ) {
+            ForesterUtil.fatalError( gsdi.PRG_NAME,
+                                     "Failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage() );
+        }
         try {
             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
             if ( species_tree_in_phyloxml ) {
@@ -173,31 +202,39 @@ public final class gsdi {
                     ( ( NHXParser ) p ).setReplaceUnderscores( true );
                 }
                 species_tree = factory.create( species_tree_file, p )[ 0 ];
-                PhylogenyMethods.transferNodeNameToField( species_tree,
-                                                          PhylogenyMethods.PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME );
+                final TaxonomyComparisonBase comp_base = GSDI.determineTaxonomyComparisonBase( gene_tree );
+                switch ( comp_base ) {
+                    case SCIENTIFIC_NAME:
+                        PhylogenyMethods
+                                .transferNodeNameToField( species_tree,
+                                                          PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID_UNIPROT_1 );
+                        break;
+                    case CODE:
+                        PhylogenyMethods.transferNodeNameToField( species_tree,
+                                                                  PhylogenyMethods.PhylogenyNodeField.TAXONOMY_CODE );
+                        break;
+                    case ID:
+                        PhylogenyMethods.transferNodeNameToField( species_tree,
+                                                                  PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID );
+                        break;
+                    default:
+                        ForesterUtil.fatalError( gsdi.PRG_NAME, "unable to determine comparison base" );
+                }
             }
         }
         catch ( final IOException e ) {
             ForesterUtil.fatalError( gsdi.PRG_NAME,
                                      "Failed to read species tree from [" + gene_tree_file + "]: " + e.getMessage() );
         }
-        try {
-            final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
-            gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ];
-        }
-        catch ( final IOException e ) {
-            ForesterUtil.fatalError( gsdi.PRG_NAME,
-                                     "Failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage() );
-        }
         gene_tree.setRooted( true );
         species_tree.setRooted( true );
         if ( !gene_tree.isCompletelyBinary() ) {
             ForesterUtil.fatalError( gsdi.PRG_NAME, "gene tree (\"" + gene_tree_file + "\") is not completely binary" );
         }
-        if ( use_sdise ) {
+        if ( base_algorithm != BASE_ALGORITHM.GSDI ) {
             if ( !species_tree.isCompletelyBinary() ) {
                 ForesterUtil.fatalError( gsdi.PRG_NAME, "species tree (\"" + species_tree_file
-                        + "\") is not completely binary" );
+                        + "\") is not completely binary, use GSDI instead" );
             }
         }
         // For timing.
@@ -212,24 +249,41 @@ public final class gsdi {
         // Helper.randomizeSpecies( 1, 8192, gene_tree );
         // Helper.intervalNumberSpecies( gene_tree, 4096 );
         // Helper.numberSpeciesInDescOrder( gene_tree );
+        log_writer.write( PRG_NAME + " " + PRG_VERSION + " " + PRG_DATE );
+        log_writer.write( ForesterUtil.LINE_SEPARATOR );
+        log_writer.write( PRG_DESC );
+        log_writer.write( ForesterUtil.LINE_SEPARATOR );
+        log_writer.write( PRG_DESC );
+        log_writer.write( ForesterUtil.LINE_SEPARATOR );
+        log_writer.write( ForesterUtil.LINE_SEPARATOR );
+        log_writer.write( new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
+        log_writer.write( ForesterUtil.LINE_SEPARATOR );
         System.out.println();
         System.out.println( "Strip species tree: " + strip );
+        log_writer.write( "Strip species tree: " + strip );
+        log_writer.write( ForesterUtil.LINE_SEPARATOR );
         SDI sdi = null;
         final long start_time = new Date().getTime();
         try {
-            if ( use_sdise ) {
+            if ( base_algorithm == BASE_ALGORITHM.GSDI ) {
                 System.out.println();
-                System.out.println( "Using SDIse algorithm" );
-                sdi = new SDIse( gene_tree, species_tree );
+                System.out.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model );
+                System.out.println( "Allow stripping of gene tree nodes    : " + allow_stripping_of_gene_tree );
+                log_writer.write( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model );
+                log_writer.write( ForesterUtil.LINE_SEPARATOR );
+                log_writer.write( "Allow stripping of gene tree nodes    : " + allow_stripping_of_gene_tree );
+                log_writer.write( ForesterUtil.LINE_SEPARATOR );
+                sdi = new GSDI( gene_tree,
+                                species_tree,
+                                most_parsimonous_duplication_model,
+                                allow_stripping_of_gene_tree );
             }
             else {
                 System.out.println();
-                System.out.println( "Using GSDI algorithm" );
-                System.out.println();
-                System.out.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model );
-                System.out.println( "Allow stripping of gene tree nodes    : " + allow_stripping_of_gene_tree );
-                sdi = new GSDI( gene_tree, species_tree, most_parsimonous_duplication_model, allow_stripping_of_gene_tree );
-              
+                System.out.println( "Using SDIse algorithm" );
+                log_writer.write( "Using SDIse algorithm" );
+                log_writer.write( ForesterUtil.LINE_SEPARATOR );
+                sdi = new SDIse( gene_tree, species_tree );
             }
         }
         catch ( final Exception e ) {
@@ -237,6 +291,8 @@ public final class gsdi {
         }
         System.out.println();
         System.out.println( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" );
+        log_writer.write( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" );
+        log_writer.write( ForesterUtil.LINE_SEPARATOR );
         try {
             final PhylogenyWriter writer = new PhylogenyWriter();
             writer.toPhyloXML( out_file, gene_tree, 0 );
@@ -247,32 +303,41 @@ public final class gsdi {
         System.out.println();
         System.out.println( "Successfully wrote resulting gene tree to: " + out_file );
         System.out.println();
-        if ( use_sdise ) {
+        log_writer.write( "Wrote resulting gene tree to: " + out_file );
+        log_writer.write( ForesterUtil.LINE_SEPARATOR );
+        if ( base_algorithm == BASE_ALGORITHM.SDI ) {
             sdi.computeMappingCostL();
             System.out.println( "Mapping cost                    : " + sdi.computeMappingCostL() );
+            log_writer.write( "Mapping cost                    : " + sdi.computeMappingCostL() );
+            log_writer.write( ForesterUtil.LINE_SEPARATOR );
         }
         System.out.println( "Number of duplications          : " + sdi.getDuplicationsSum() );
-        if ( !use_sdise && !most_parsimonous_duplication_model ) {
-            System.out.println( "Number of potential duplications: "
-                    + ( ( GSDI ) sdi ).getSpeciationOrDuplicationEventsSum() );
+        log_writer.write( "Number of duplications          : " + sdi.getDuplicationsSum() );
+        log_writer.write( ForesterUtil.LINE_SEPARATOR );
+        if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) && !most_parsimonous_duplication_model ) {
+            final int duplications = ( ( GSDI ) sdi ).getSpeciationOrDuplicationEventsSum();
+            System.out.println( "Number of potential duplications: " + duplications );
+            log_writer.write( "Number of potential duplications: " + duplications );
+            log_writer.write( ForesterUtil.LINE_SEPARATOR );
         }
-        if ( !use_sdise ) {
-            System.out.println( "Number of speciations            : " + ( ( GSDI ) sdi ).getSpeciationsSum() );
+        if ( base_algorithm == BASE_ALGORITHM.GSDI ) {
+            final int spec = ( ( GSDI ) sdi ).getSpeciationsSum();
+            System.out.println( "Number of speciations            : " + spec );
+            log_writer.write( "Number of speciations            : " + spec );
+            log_writer.write( ForesterUtil.LINE_SEPARATOR );
         }
         System.out.println();
-        some stat on gene tree:
-            filename, name
-            number of external nodes, strppided nodes
-        some stats on sepcies tree, external nodes,
-        filename, name
-        internal nodes
-        how many of which are polytomies
+        //  some stat on gene tree:
+        //      filename, name
+        //      number of external nodes, strppided nodes
+        //  some stats on sepcies tree, external nodes,
+        //  filename, name
+        //  internal nodes
+        //  how many of which are polytomies
         //wrote log file to
-        if ( allow_stripping_of_gene_tree ) {
-            stripped x nodes, y external nodes remain
-            
-            
-        }
+        //  if ( allow_stripping_of_gene_tree ) {
+        //      stripped x nodes, y external nodes remain
+        //  }
     }
 
     private static void print_help() {
@@ -280,28 +345,29 @@ public final class gsdi {
                 + " [-options] <gene tree in phyloXML format> <species tree in phyloXML format> [outfile]" );
         System.out.println();
         System.out.println( "Options:" );
-        System.out.println( " -" + gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION + ": to allow stripping of gene tree nodes without a matching species in the species tree (writes list of stripped nodes to " + );
-        System.out.println( " -" + gsdi.STRIP_OPTION + ": to strip the species tree of unneeded species prior to duplication inference" );
+        //    System.out.println( " -" + gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION + ": to allow stripping of gene tree nodes without a matching species in the species tree (writes list of stripped nodes to " + );
+        System.out.println( " -" + gsdi.STRIP_OPTION
+                + ": to strip the species tree of unneeded species prior to duplication inference" );
         System.out.println( " -" + gsdi.SDI_OPTION + ": to use SDI algorithm instead of GSDI algorithm" );//TODO gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION not allowed
         System.out.println( " -" + gsdi.MOST_PARSIMONIOUS_OPTION
                 + ": use most parimonious duplication model for GSDI: " );
         System.out.println( "     assign nodes as speciations which would otherwise be assiged" );
         System.out.println( "     as unknown because of polytomies in the species tree" );
-        System.out.println( " -" + gsdi.GUESS_FORMAT_OF_SPECIES_TREE + ": to allow species tree in other formats than" );
-        System.out.println( "     phyloXML (Newick, NHX, Nexus)" );
-        
-        
+        System.out.println( " -" + gsdi.GUESS_FORMAT_OF_SPECIES_TREE
+                + ": to allow species tree in other formats than phyloXML (i.e. Newick, NHX, Nexus)" );
         System.out.println();
         System.out.println( "Species tree:" );
-        System.out.println( " In phyloXML format (unless option -" + gsdi.GUESS_FORMAT_OF_SPECIES_TREE
-                + " is used, in which case the species matching between gene tree and species tree must be via scientific name), with taxonomy data in appropriate fields" );
+        System.out
+                .println( " In phyloXML format (unless option -"
+                        + gsdi.GUESS_FORMAT_OF_SPECIES_TREE
+                        + " is used, in which case the species matching between gene tree and species tree must be via scientific name), with taxonomy data in appropriate fields" );
         System.out.println();
         System.out.println( "Gene tree:" );
         System.out.println( " In phyloXM format, with taxonomy and sequence data in appropriate fields" );
         System.out.println();
         System.out.println( "Example:" );
-        System.out.println( "gsdi 
-        System.out.println();
+        //  System.out.println( "gsdi 
+        //    System.out.println();
         System.out.println( "Note -- GSDI algorithm is under development" );
         System.out.println();
     }
index dbf5d5f..bb13e65 100644 (file)
@@ -474,6 +474,13 @@ public class PhylogenyMethods {
                                 .setIdentifier( new Identifier( id, PhyloXmlUtil.UNIPROT_TAX_PROVIDER ) );
                         break;
                     }
+                    case TAXONOMY_ID: {
+                        if ( !n.getNodeData().isHasTaxonomy() ) {
+                            n.getNodeData().setTaxonomy( new Taxonomy() );
+                        }
+                        n.getNodeData().getTaxonomy().setIdentifier( new Identifier( name ) );
+                        break;
+                    }
                 }
             }
         }
@@ -1637,7 +1644,8 @@ public class PhylogenyMethods {
         SEQUENCE_SYMBOL,
         SEQUENCE_NAME,
         TAXONOMY_ID_UNIPROT_1,
-        TAXONOMY_ID_UNIPROT_2;
+        TAXONOMY_ID_UNIPROT_2,
+        TAXONOMY_ID;
     }
 
     public static enum TAXONOMY_EXTRACTION {
index 1375652..6d8ade1 100644 (file)
@@ -197,7 +197,7 @@ public class Taxonomy implements PhylogenyData, MultipleUris, Comparable<Taxonom
 
     @Override
     public int hashCode() {
-        if ( getIdentifier() != null && !ForesterUtil.isEmpty( getIdentifier().getValue() ) ) {
+        if ( ( getIdentifier() != null ) && !ForesterUtil.isEmpty( getIdentifier().getValue() ) ) {
             return getIdentifier().hashCode();
         }
         else if ( !ForesterUtil.isEmpty( getTaxonomyCode() ) ) {
@@ -252,10 +252,8 @@ public class Taxonomy implements PhylogenyData, MultipleUris, Comparable<Taxonom
         }
         final Taxonomy tax = ( Taxonomy ) data;
         if ( ( getIdentifier() != null ) && ( tax.getIdentifier() != null )
-          && !ForesterUtil.isEmpty( getIdentifier().getValue() )  
-          && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() )
-        
-        ) {
+                && !ForesterUtil.isEmpty( getIdentifier().getValue() )
+                && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
             return getIdentifier().isEqual( tax.getIdentifier() );
         }
         else if ( !ForesterUtil.isEmpty( getTaxonomyCode() ) && !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
index e0cd6f3..5939037 100644 (file)
@@ -27,6 +27,7 @@ package org.forester.sdi;
 
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
 
 import org.forester.phylogeny.Phylogeny;
@@ -65,7 +66,7 @@ public final class GSDI extends SDI {
     private final boolean                         _strip_gene_tree;
     private int                                   _speciation_or_duplication_events_sum;
     private int                                   _speciations_sum;
-    private final Set<PhylogenyNode>              _stripped_gene_tree_nodes; 
+    private final Set<PhylogenyNode>              _stripped_gene_tree_nodes;
 
     /**
      * Constructor which sets the gene tree and the species tree to be compared.
@@ -102,9 +103,9 @@ public final class GSDI extends SDI {
         _speciations_sum = 0;
         _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
         _transversal_counts = new HashMap<PhylogenyNode, Integer>();
-        _duplications_sum = 0; 
+        _duplications_sum = 0;
         _strip_gene_tree = strip_gene_tree;
-        _stripped_gene_tree_nodes = new HashSet<PhylogenyNode>(); 
+        _stripped_gene_tree_nodes = new HashSet<PhylogenyNode>();
         getSpeciesTree().preOrderReId();
         linkNodesOfG();
         geneTreePostOrderTraversal( getGeneTree().getRoot() );
@@ -287,6 +288,61 @@ public final class GSDI extends SDI {
         }
     }
 
+    final void linkNodesOfG2() {
+        final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes = createTaxonomyToNodeMap();
+        if ( _strip_gene_tree ) {
+            stripGeneTree( speciestree_ext_nodes );
+            if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) {
+                throw new IllegalArgumentException( "species tree does not contain any"
+                        + " nodes matching species in the gene tree" );
+            }
+        }
+        // Retrieve the reference to the PhylogenyNode with a matching species.
+        for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
+            final PhylogenyNode g = iter.next();
+            if ( !g.getNodeData().isHasTaxonomy() ) {
+                throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" );
+            }
+            final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() );
+            if ( s == null ) {
+                throw new IllegalArgumentException( "species " + g.getNodeData().getTaxonomy()
+                        + " not present in species tree" );
+            }
+            g.setLink( s );
+        }
+        //////
+        final Map<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
+        final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase( _gene_tree );
+        if ( _strip_gene_tree ) {
+            stripGeneTree( speciestree_ext_nodes );
+            if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) {
+                throw new IllegalArgumentException( "species tree does not contain any"
+                        + " nodes matching species in the gene tree" );
+            }
+        }
+        // Put references to all external nodes of the species tree into a map.
+        // Stringyfied taxonomy is the key, node is the value.
+        for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
+            final PhylogenyNode s = iter.next();
+            final String tax_str = taxonomyToString( s, tax_comp_base );
+            if ( speciestree_ext_nodes.containsKey( tax_str ) ) {
+                throw new IllegalArgumentException( "taxonomy [" + s + "] is not unique in species phylogeny" );
+            }
+            speciestree_ext_nodes.put( tax_str, s );
+        }
+        // Retrieve the reference to the node with a matching stringyfied taxonomy.
+        for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
+            final PhylogenyNode g = iter.next();
+            final String tax_str = taxonomyToString( g, tax_comp_base );
+            final PhylogenyNode s = speciestree_ext_nodes.get( tax_str );
+            if ( s == null ) {
+                throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy()
+                        + "] not present in species tree" );
+            }
+            g.setLink( s );
+        }
+    }
+
     final private HashMap<Taxonomy, PhylogenyNode> createTaxonomyToNodeMap() {
         final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes = new HashMap<Taxonomy, PhylogenyNode>();
         for( final PhylogenyNodeIterator iter = _species_tree.iteratorLevelOrder(); iter.hasNext(); ) {
@@ -303,25 +359,64 @@ public final class GSDI extends SDI {
     }
 
     private final void stripGeneTree( final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes ) {
-      //  final Set<PhylogenyNode> to_delete = new HashSet<PhylogenyNode>();
-        
+        //  final Set<PhylogenyNode> to_delete = new HashSet<PhylogenyNode>();
         for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
             final PhylogenyNode g = iter.next();
             if ( !g.getNodeData().isHasTaxonomy() ) {
                 throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" );
             }
-            final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() );
-            if ( s == null ) {
+            if ( !speciestree_ext_nodes.containsKey( g.getNodeData().getTaxonomy() ) ) {
                 _stripped_gene_tree_nodes.add( g );
             }
         }
-        for( final PhylogenyNode n :  _stripped_gene_tree_nodes ) {
+        for( final PhylogenyNode n : _stripped_gene_tree_nodes ) {
             _gene_tree.deleteSubtree( n, true );
-            
         }
-        
     }
-    
+
+    public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) {
+        int with_id_count = 0;
+        int with_code_count = 0;
+        int with_sn_count = 0;
+        int max = 0;
+        for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
+            final PhylogenyNode g = iter.next();
+            if ( g.getNodeData().isHasTaxonomy() ) {
+                final Taxonomy tax = g.getNodeData().getTaxonomy();
+                if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
+                    if ( ++with_id_count > max ) {
+                        max = with_id_count;
+                    }
+                }
+                if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
+                    if ( ++with_code_count > max ) {
+                        max = with_code_count;
+                    }
+                }
+                if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
+                    if ( ++with_sn_count > max ) {
+                        max = with_sn_count;
+                    }
+                }
+            }
+        }
+        if ( max == 0 ) {
+            throw new IllegalArgumentException( "gene tree has no taxonomic data" );
+        }
+        else if ( max == 1 ) {
+            throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" );
+        }
+        else if ( max == with_sn_count ) {
+            return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME;
+        }
+        else if ( max == with_id_count ) {
+            return SDI.TaxonomyComparisonBase.ID;
+        }
+        else {
+            return SDI.TaxonomyComparisonBase.CODE;
+        }
+    }
+
     public Set<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
         return _stripped_gene_tree_nodes;
     }
index 6f163cc..c783ce0 100644 (file)
@@ -131,7 +131,6 @@ public abstract class SDI {
         boolean all_have_id = true;
         boolean all_have_code = true;
         boolean all_have_sn = true;
-        boolean all_have_cn = true;
         for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
             final PhylogenyNode n = iter.next();
             if ( n.getNodeData().isHasTaxonomy() ) {
@@ -145,9 +144,6 @@ public abstract class SDI {
                 if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) {
                     all_have_sn = false;
                 }
-                if ( ForesterUtil.isEmpty( tax.getCommonName() ) ) {
-                    all_have_cn = false;
-                }
             }
             else {
                 throw new IllegalArgumentException( "species tree node [" + n + "] has no taxonomic data" );
@@ -166,9 +162,6 @@ public abstract class SDI {
                 if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) {
                     all_have_sn = false;
                 }
-                if ( ForesterUtil.isEmpty( tax.getCommonName() ) ) {
-                    all_have_cn = false;
-                }
             }
             else {
                 throw new IllegalArgumentException( "gene tree node [" + n + "] has no taxonomic data" );
@@ -183,9 +176,6 @@ public abstract class SDI {
         else if ( all_have_sn ) {
             base = TaxonomyComparisonBase.SCIENTIFIC_NAME;
         }
-        else if ( all_have_cn ) {
-            base = TaxonomyComparisonBase.COMMON_NAME;
-        }
         else {
             throw new IllegalArgumentException( "gene tree and species tree have incomparable taxonomies" );
         }
@@ -296,7 +286,7 @@ public abstract class SDI {
         return sb.toString();
     }
 
-    private static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) {
+    static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) {
         final Taxonomy tax = n.getNodeData().getTaxonomy();
         switch ( base ) {
             case ID:
@@ -305,14 +295,12 @@ public abstract class SDI {
                 return tax.getTaxonomyCode();
             case SCIENTIFIC_NAME:
                 return tax.getScientificName();
-            case COMMON_NAME:
-                return tax.getCommonName();
             default:
                 throw new IllegalArgumentException( "unknown comparison base for taxonomies: " + base );
         }
     }
 
-    enum TaxonomyComparisonBase {
-        ID, CODE, SCIENTIFIC_NAME, COMMON_NAME;
+    public enum TaxonomyComparisonBase {
+        ID, CODE, SCIENTIFIC_NAME
     }
 }
index 1c914c8..36355ac 100644 (file)
@@ -95,9 +95,9 @@ public final class ForesterUtil {
             sb.append( separator );
         }
     }
-    
+
     final public static String getForesterLibraryInformation() {
-        return "forester " + ForesterConstants.FORESTER_VERSION + " (" + ForesterConstants.FORESTER_DATE  + ")";
+        return "forester " + ForesterConstants.FORESTER_VERSION + " (" + ForesterConstants.FORESTER_DATE + ")";
     }
 
     public static boolean seqIsLikelyToBeAa( final String s ) {