From 4cefcbf8c547e86e56ed503f0a3d1a06e462e60e Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Tue, 19 Jun 2012 00:18:32 +0000 Subject: [PATCH] improving GSDI, under construction... --- .../java/src/org/forester/application/gsdi.java | 282 ++++++++++++-------- .../org/forester/phylogeny/PhylogenyMethods.java | 10 +- .../src/org/forester/phylogeny/data/Taxonomy.java | 8 +- forester/java/src/org/forester/sdi/GSDI.java | 117 +++++++- forester/java/src/org/forester/sdi/SDI.java | 18 +- .../java/src/org/forester/util/ForesterUtil.java | 4 +- 6 files changed, 297 insertions(+), 142 deletions(-) diff --git a/forester/java/src/org/forester/application/gsdi.java b/forester/java/src/org/forester/application/gsdi.java index b53d2f1..0c1a387 100644 --- a/forester/java/src/org/forester/application/gsdi.java +++ b/forester/java/src/org/forester/application/gsdi.java @@ -27,6 +27,8 @@ package org.forester.application; import java.io.File; import java.io.IOException; +import java.io.Writer; +import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; @@ -42,70 +44,81 @@ import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.sdi.GSDI; import org.forester.sdi.SDI; +import org.forester.sdi.SDI.TaxonomyComparisonBase; import org.forester.sdi.SDIse; import org.forester.util.CommandLineArguments; import org.forester.util.ForesterUtil; public final class gsdi { - final static public boolean REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE = true; - final static private String STRIP_OPTION = "s"; - final static private String ALLOW_STRIPPING_OF_GENE_TREE_OPTION = "g"; - - final static private String SDI_OPTION = "b"; - final static private String MOST_PARSIMONIOUS_OPTION = "m"; - final static private String GUESS_FORMAT_OF_SPECIES_TREE = "q"; - final static private String HELP_OPTION_1 = "help"; - final static private String HELP_OPTION_2 = "h"; - final static private String DEFAULT_OUTFILE_SUFFIX = "_gsdi_out.phylo.xml"; + private enum BASE_ALGORITHM { + GSDI, SDI + } + final static public boolean REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE = true; + final static private String STRIP_OPTION = "s"; + final static private String ALLOW_STRIPPING_OF_GENE_TREE_OPTION = "g"; + final static private String SDI_OPTION = "b"; + final static private String MOST_PARSIMONIOUS_OPTION = "m"; + final static private String GUESS_FORMAT_OF_SPECIES_TREE = "q"; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String DEFAULT_OUTFILE_SUFFIX = "_gsdi_out.phylo.xml"; final static private String SUFFIX_FOR_LIST_OF_STIPPED_GENE_TREE_NODES = "_stripped_gene_tree_nodes.txt"; - final static private String SUFFIX_FOR_LOG_FILE = "_gsdi_log.txt"; - final static private String PRG_NAME = "gsdi"; - final static private String PRG_VERSION = "0.901"; - final static private String PRG_DATE = "120608"; - final static private String PRG_DESC = "general speciation duplication inference"; - final static private String E_MAIL = "phylosoft@gmail.com"; - final static private String WWW = "www.phylosoft.org/forester"; + final static private String LOGFILE_SUFFIX = "_gsdi_log.txt"; + final static private String PRG_NAME = "gsdi"; + final static private String PRG_VERSION = "0.901"; + final static private String PRG_DATE = "120608"; + final static private String PRG_DESC = "general speciation duplication inference"; + final static private String E_MAIL = "phylosoft@gmail.com"; + final static private String WWW = "www.phylosoft.org/forester"; public static void main( final String args[] ) { - ForesterUtil.printProgramInformation( PRG_NAME, - PRG_DESC, - PRG_VERSION, - PRG_DATE, - E_MAIL, - WWW, - ForesterUtil.getForesterLibraryInformation() ); - CommandLineArguments cla = null; try { - cla = new CommandLineArguments( args ); - } - catch ( final Exception e ) { - ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); - } - if ( cla.isOptionSet( gsdi.HELP_OPTION_1 ) || cla.isOptionSet( gsdi.HELP_OPTION_2 ) ) { - System.out.println(); - gsdi.print_help(); - System.exit( 0 ); - } - else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() < 2 ) || ( cla.getNumberOfNames() > 3 ) ) { - System.out.println(); - System.out.println( "Wrong number of arguments." ); - System.out.println(); - gsdi.print_help(); - System.exit( -1 ); + ForesterUtil.printProgramInformation( PRG_NAME, + PRG_DESC, + PRG_VERSION, + PRG_DATE, + E_MAIL, + WWW, + ForesterUtil.getForesterLibraryInformation() ); + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( gsdi.HELP_OPTION_1 ) || cla.isOptionSet( gsdi.HELP_OPTION_2 ) ) { + System.out.println(); + gsdi.print_help(); + System.exit( 0 ); + } + else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() < 2 ) || ( cla.getNumberOfNames() > 3 ) ) { + System.out.println(); + System.out.println( "Wrong number of arguments." ); + System.out.println(); + gsdi.print_help(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( gsdi.STRIP_OPTION ); + allowed_options.add( gsdi.SDI_OPTION ); + allowed_options.add( gsdi.GUESS_FORMAT_OF_SPECIES_TREE ); + allowed_options.add( gsdi.MOST_PARSIMONIOUS_OPTION ); + allowed_options.add( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION ); + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( gsdi.PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + execute( cla ); } - final List allowed_options = new ArrayList(); - allowed_options.add( gsdi.STRIP_OPTION ); - allowed_options.add( gsdi.SDI_OPTION ); - allowed_options.add( gsdi.GUESS_FORMAT_OF_SPECIES_TREE ); - allowed_options.add( gsdi.MOST_PARSIMONIOUS_OPTION ); - allowed_options.add( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION ); - - final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); - if ( dissallowed_options.length() > 0 ) { - ForesterUtil.fatalError( gsdi.PRG_NAME, "unknown option(s): " + dissallowed_options ); + catch ( final IOException e ) { + ForesterUtil.fatalError( gsdi.PRG_NAME, e.getMessage() ); } - boolean use_sdise = false; + } + + private static void execute( final CommandLineArguments cla ) throws IOException { + BASE_ALGORITHM base_algorithm = BASE_ALGORITHM.GSDI; boolean strip = false; boolean most_parsimonous_duplication_model = false; boolean species_tree_in_phyloxml = true; @@ -114,10 +127,10 @@ public final class gsdi { strip = true; } if ( cla.isOptionSet( gsdi.SDI_OPTION ) ) { - use_sdise = true; + base_algorithm = BASE_ALGORITHM.SDI; } if ( cla.isOptionSet( gsdi.MOST_PARSIMONIOUS_OPTION ) ) { - if ( use_sdise ) { + if ( base_algorithm != BASE_ALGORITHM.GSDI ) { ForesterUtil.fatalError( gsdi.PRG_NAME, "Can only use most parsimonious duplication mode with GSDI" ); } most_parsimonous_duplication_model = true; @@ -126,19 +139,18 @@ public final class gsdi { species_tree_in_phyloxml = false; } if ( cla.isOptionSet( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION ) ) { - if ( use_sdise ) { + if ( base_algorithm != BASE_ALGORITHM.GSDI ) { ForesterUtil.fatalError( gsdi.PRG_NAME, "Can only allow stripping of gene tree with GSDI" ); } allow_stripping_of_gene_tree = true; } - - - Phylogeny species_tree = null; Phylogeny gene_tree = null; File gene_tree_file = null; File species_tree_file = null; File out_file = null; + File log_file = null; + Writer log_writer = null; try { gene_tree_file = cla.getFile( 0 ); species_tree_file = cla.getFile( 1 ); @@ -147,8 +159,8 @@ public final class gsdi { } else { out_file = new File( ForesterUtil.removeSuffix( gene_tree_file.toString() ) + DEFAULT_OUTFILE_SUFFIX ); - //out_file = new File( gsdi.DEFAULT_OUTFILE ); } + log_file = new File( ForesterUtil.removeSuffix( out_file.toString() ) + LOGFILE_SUFFIX ); } catch ( final IllegalArgumentException e ) { ForesterUtil.fatalError( gsdi.PRG_NAME, "error in command line: " + e.getMessage() ); @@ -162,6 +174,23 @@ public final class gsdi { if ( ForesterUtil.isWritableFile( out_file ) != null ) { ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( out_file ) ); } + if ( ForesterUtil.isWritableFile( log_file ) != null ) { + ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( log_file ) ); + } + try { + log_writer = ForesterUtil.createBufferedWriter( log_file ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( gsdi.PRG_NAME, "Failed to create [" + log_file + "]: " + e.getMessage() ); + } + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ]; + } + catch ( final IOException e ) { + ForesterUtil.fatalError( gsdi.PRG_NAME, + "Failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage() ); + } try { final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); if ( species_tree_in_phyloxml ) { @@ -173,31 +202,39 @@ public final class gsdi { ( ( NHXParser ) p ).setReplaceUnderscores( true ); } species_tree = factory.create( species_tree_file, p )[ 0 ]; - PhylogenyMethods.transferNodeNameToField( species_tree, - PhylogenyMethods.PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME ); + final TaxonomyComparisonBase comp_base = GSDI.determineTaxonomyComparisonBase( gene_tree ); + switch ( comp_base ) { + case SCIENTIFIC_NAME: + PhylogenyMethods + .transferNodeNameToField( species_tree, + PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID_UNIPROT_1 ); + break; + case CODE: + PhylogenyMethods.transferNodeNameToField( species_tree, + PhylogenyMethods.PhylogenyNodeField.TAXONOMY_CODE ); + break; + case ID: + PhylogenyMethods.transferNodeNameToField( species_tree, + PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID ); + break; + default: + ForesterUtil.fatalError( gsdi.PRG_NAME, "unable to determine comparison base" ); + } } } catch ( final IOException e ) { ForesterUtil.fatalError( gsdi.PRG_NAME, "Failed to read species tree from [" + gene_tree_file + "]: " + e.getMessage() ); } - try { - final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); - gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ]; - } - catch ( final IOException e ) { - ForesterUtil.fatalError( gsdi.PRG_NAME, - "Failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage() ); - } gene_tree.setRooted( true ); species_tree.setRooted( true ); if ( !gene_tree.isCompletelyBinary() ) { ForesterUtil.fatalError( gsdi.PRG_NAME, "gene tree (\"" + gene_tree_file + "\") is not completely binary" ); } - if ( use_sdise ) { + if ( base_algorithm != BASE_ALGORITHM.GSDI ) { if ( !species_tree.isCompletelyBinary() ) { ForesterUtil.fatalError( gsdi.PRG_NAME, "species tree (\"" + species_tree_file - + "\") is not completely binary" ); + + "\") is not completely binary, use GSDI instead" ); } } // For timing. @@ -212,24 +249,41 @@ public final class gsdi { // Helper.randomizeSpecies( 1, 8192, gene_tree ); // Helper.intervalNumberSpecies( gene_tree, 4096 ); // Helper.numberSpeciesInDescOrder( gene_tree ); + log_writer.write( PRG_NAME + " " + PRG_VERSION + " " + PRG_DATE ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.write( PRG_DESC ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.write( PRG_DESC ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.write( new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); System.out.println(); System.out.println( "Strip species tree: " + strip ); + log_writer.write( "Strip species tree: " + strip ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); SDI sdi = null; final long start_time = new Date().getTime(); try { - if ( use_sdise ) { + if ( base_algorithm == BASE_ALGORITHM.GSDI ) { System.out.println(); - System.out.println( "Using SDIse algorithm" ); - sdi = new SDIse( gene_tree, species_tree ); + System.out.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model ); + System.out.println( "Allow stripping of gene tree nodes : " + allow_stripping_of_gene_tree ); + log_writer.write( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.write( "Allow stripping of gene tree nodes : " + allow_stripping_of_gene_tree ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + sdi = new GSDI( gene_tree, + species_tree, + most_parsimonous_duplication_model, + allow_stripping_of_gene_tree ); } else { System.out.println(); - System.out.println( "Using GSDI algorithm" ); - System.out.println(); - System.out.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model ); - System.out.println( "Allow stripping of gene tree nodes : " + allow_stripping_of_gene_tree ); - sdi = new GSDI( gene_tree, species_tree, most_parsimonous_duplication_model, allow_stripping_of_gene_tree ); - + System.out.println( "Using SDIse algorithm" ); + log_writer.write( "Using SDIse algorithm" ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + sdi = new SDIse( gene_tree, species_tree ); } } catch ( final Exception e ) { @@ -237,6 +291,8 @@ public final class gsdi { } System.out.println(); System.out.println( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" ); + log_writer.write( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); try { final PhylogenyWriter writer = new PhylogenyWriter(); writer.toPhyloXML( out_file, gene_tree, 0 ); @@ -247,32 +303,41 @@ public final class gsdi { System.out.println(); System.out.println( "Successfully wrote resulting gene tree to: " + out_file ); System.out.println(); - if ( use_sdise ) { + log_writer.write( "Wrote resulting gene tree to: " + out_file ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + if ( base_algorithm == BASE_ALGORITHM.SDI ) { sdi.computeMappingCostL(); System.out.println( "Mapping cost : " + sdi.computeMappingCostL() ); + log_writer.write( "Mapping cost : " + sdi.computeMappingCostL() ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); } System.out.println( "Number of duplications : " + sdi.getDuplicationsSum() ); - if ( !use_sdise && !most_parsimonous_duplication_model ) { - System.out.println( "Number of potential duplications: " - + ( ( GSDI ) sdi ).getSpeciationOrDuplicationEventsSum() ); + log_writer.write( "Number of duplications : " + sdi.getDuplicationsSum() ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) && !most_parsimonous_duplication_model ) { + final int duplications = ( ( GSDI ) sdi ).getSpeciationOrDuplicationEventsSum(); + System.out.println( "Number of potential duplications: " + duplications ); + log_writer.write( "Number of potential duplications: " + duplications ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); } - if ( !use_sdise ) { - System.out.println( "Number of speciations : " + ( ( GSDI ) sdi ).getSpeciationsSum() ); + if ( base_algorithm == BASE_ALGORITHM.GSDI ) { + final int spec = ( ( GSDI ) sdi ).getSpeciationsSum(); + System.out.println( "Number of speciations : " + spec ); + log_writer.write( "Number of speciations : " + spec ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); } System.out.println(); - some stat on gene tree: - filename, name - number of external nodes, strppided nodes - some stats on sepcies tree, external nodes, - filename, name - internal nodes - how many of which are polytomies + // some stat on gene tree: + // filename, name + // number of external nodes, strppided nodes + // some stats on sepcies tree, external nodes, + // filename, name + // internal nodes + // how many of which are polytomies //wrote log file to - if ( allow_stripping_of_gene_tree ) { - stripped x nodes, y external nodes remain - - - } + // if ( allow_stripping_of_gene_tree ) { + // stripped x nodes, y external nodes remain + // } } private static void print_help() { @@ -280,28 +345,29 @@ public final class gsdi { + " [-options] [outfile]" ); System.out.println(); System.out.println( "Options:" ); - System.out.println( " -" + gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION + ": to allow stripping of gene tree nodes without a matching species in the species tree (writes list of stripped nodes to " + ); - System.out.println( " -" + gsdi.STRIP_OPTION + ": to strip the species tree of unneeded species prior to duplication inference" ); + // System.out.println( " -" + gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION + ": to allow stripping of gene tree nodes without a matching species in the species tree (writes list of stripped nodes to " + ); + System.out.println( " -" + gsdi.STRIP_OPTION + + ": to strip the species tree of unneeded species prior to duplication inference" ); System.out.println( " -" + gsdi.SDI_OPTION + ": to use SDI algorithm instead of GSDI algorithm" );//TODO gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION not allowed System.out.println( " -" + gsdi.MOST_PARSIMONIOUS_OPTION + ": use most parimonious duplication model for GSDI: " ); System.out.println( " assign nodes as speciations which would otherwise be assiged" ); System.out.println( " as unknown because of polytomies in the species tree" ); - System.out.println( " -" + gsdi.GUESS_FORMAT_OF_SPECIES_TREE + ": to allow species tree in other formats than" ); - System.out.println( " phyloXML (Newick, NHX, Nexus)" ); - - + System.out.println( " -" + gsdi.GUESS_FORMAT_OF_SPECIES_TREE + + ": to allow species tree in other formats than phyloXML (i.e. Newick, NHX, Nexus)" ); System.out.println(); System.out.println( "Species tree:" ); - System.out.println( " In phyloXML format (unless option -" + gsdi.GUESS_FORMAT_OF_SPECIES_TREE - + " is used, in which case the species matching between gene tree and species tree must be via scientific name), with taxonomy data in appropriate fields" ); + System.out + .println( " In phyloXML format (unless option -" + + gsdi.GUESS_FORMAT_OF_SPECIES_TREE + + " is used, in which case the species matching between gene tree and species tree must be via scientific name), with taxonomy data in appropriate fields" ); System.out.println(); System.out.println( "Gene tree:" ); System.out.println( " In phyloXM format, with taxonomy and sequence data in appropriate fields" ); System.out.println(); System.out.println( "Example:" ); - System.out.println( "gsdi - System.out.println(); + // System.out.println( "gsdi + // System.out.println(); System.out.println( "Note -- GSDI algorithm is under development" ); System.out.println(); } diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java index dbf5d5f..bb13e65 100644 --- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java +++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java @@ -474,6 +474,13 @@ public class PhylogenyMethods { .setIdentifier( new Identifier( id, PhyloXmlUtil.UNIPROT_TAX_PROVIDER ) ); break; } + case TAXONOMY_ID: { + if ( !n.getNodeData().isHasTaxonomy() ) { + n.getNodeData().setTaxonomy( new Taxonomy() ); + } + n.getNodeData().getTaxonomy().setIdentifier( new Identifier( name ) ); + break; + } } } } @@ -1637,7 +1644,8 @@ public class PhylogenyMethods { SEQUENCE_SYMBOL, SEQUENCE_NAME, TAXONOMY_ID_UNIPROT_1, - TAXONOMY_ID_UNIPROT_2; + TAXONOMY_ID_UNIPROT_2, + TAXONOMY_ID; } public static enum TAXONOMY_EXTRACTION { diff --git a/forester/java/src/org/forester/phylogeny/data/Taxonomy.java b/forester/java/src/org/forester/phylogeny/data/Taxonomy.java index 1375652..6d8ade1 100644 --- a/forester/java/src/org/forester/phylogeny/data/Taxonomy.java +++ b/forester/java/src/org/forester/phylogeny/data/Taxonomy.java @@ -197,7 +197,7 @@ public class Taxonomy implements PhylogenyData, MultipleUris, Comparable _stripped_gene_tree_nodes; + private final Set _stripped_gene_tree_nodes; /** * Constructor which sets the gene tree and the species tree to be compared. @@ -102,9 +103,9 @@ public final class GSDI extends SDI { _speciations_sum = 0; _most_parsimonious_duplication_model = most_parsimonious_duplication_model; _transversal_counts = new HashMap(); - _duplications_sum = 0; + _duplications_sum = 0; _strip_gene_tree = strip_gene_tree; - _stripped_gene_tree_nodes = new HashSet(); + _stripped_gene_tree_nodes = new HashSet(); getSpeciesTree().preOrderReId(); linkNodesOfG(); geneTreePostOrderTraversal( getGeneTree().getRoot() ); @@ -287,6 +288,61 @@ public final class GSDI extends SDI { } } + final void linkNodesOfG2() { + final HashMap speciestree_ext_nodes = createTaxonomyToNodeMap(); + if ( _strip_gene_tree ) { + stripGeneTree( speciestree_ext_nodes ); + if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) { + throw new IllegalArgumentException( "species tree does not contain any" + + " nodes matching species in the gene tree" ); + } + } + // Retrieve the reference to the PhylogenyNode with a matching species. + for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode g = iter.next(); + if ( !g.getNodeData().isHasTaxonomy() ) { + throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" ); + } + final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() ); + if ( s == null ) { + throw new IllegalArgumentException( "species " + g.getNodeData().getTaxonomy() + + " not present in species tree" ); + } + g.setLink( s ); + } + ////// + final Map speciestree_ext_nodes = new HashMap(); + final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase( _gene_tree ); + if ( _strip_gene_tree ) { + stripGeneTree( speciestree_ext_nodes ); + if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) { + throw new IllegalArgumentException( "species tree does not contain any" + + " nodes matching species in the gene tree" ); + } + } + // Put references to all external nodes of the species tree into a map. + // Stringyfied taxonomy is the key, node is the value. + for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode s = iter.next(); + final String tax_str = taxonomyToString( s, tax_comp_base ); + if ( speciestree_ext_nodes.containsKey( tax_str ) ) { + throw new IllegalArgumentException( "taxonomy [" + s + "] is not unique in species phylogeny" ); + } + speciestree_ext_nodes.put( tax_str, s ); + } + // Retrieve the reference to the node with a matching stringyfied taxonomy. + for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode g = iter.next(); + final String tax_str = taxonomyToString( g, tax_comp_base ); + final PhylogenyNode s = speciestree_ext_nodes.get( tax_str ); + if ( s == null ) { + throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy() + + "] not present in species tree" ); + } + g.setLink( s ); + } + } + final private HashMap createTaxonomyToNodeMap() { final HashMap speciestree_ext_nodes = new HashMap(); for( final PhylogenyNodeIterator iter = _species_tree.iteratorLevelOrder(); iter.hasNext(); ) { @@ -303,25 +359,64 @@ public final class GSDI extends SDI { } private final void stripGeneTree( final HashMap speciestree_ext_nodes ) { - // final Set to_delete = new HashSet(); - + // final Set to_delete = new HashSet(); for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode g = iter.next(); if ( !g.getNodeData().isHasTaxonomy() ) { throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" ); } - final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() ); - if ( s == null ) { + if ( !speciestree_ext_nodes.containsKey( g.getNodeData().getTaxonomy() ) ) { _stripped_gene_tree_nodes.add( g ); } } - for( final PhylogenyNode n : _stripped_gene_tree_nodes ) { + for( final PhylogenyNode n : _stripped_gene_tree_nodes ) { _gene_tree.deleteSubtree( n, true ); - } - } - + + public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) { + int with_id_count = 0; + int with_code_count = 0; + int with_sn_count = 0; + int max = 0; + for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode g = iter.next(); + if ( g.getNodeData().isHasTaxonomy() ) { + final Taxonomy tax = g.getNodeData().getTaxonomy(); + if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) { + if ( ++with_id_count > max ) { + max = with_id_count; + } + } + if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { + if ( ++with_code_count > max ) { + max = with_code_count; + } + } + if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) { + if ( ++with_sn_count > max ) { + max = with_sn_count; + } + } + } + } + if ( max == 0 ) { + throw new IllegalArgumentException( "gene tree has no taxonomic data" ); + } + else if ( max == 1 ) { + throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" ); + } + else if ( max == with_sn_count ) { + return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME; + } + else if ( max == with_id_count ) { + return SDI.TaxonomyComparisonBase.ID; + } + else { + return SDI.TaxonomyComparisonBase.CODE; + } + } + public Set getStrippedExternalGeneTreeNodes() { return _stripped_gene_tree_nodes; } diff --git a/forester/java/src/org/forester/sdi/SDI.java b/forester/java/src/org/forester/sdi/SDI.java index 6f163cc..c783ce0 100644 --- a/forester/java/src/org/forester/sdi/SDI.java +++ b/forester/java/src/org/forester/sdi/SDI.java @@ -131,7 +131,6 @@ public abstract class SDI { boolean all_have_id = true; boolean all_have_code = true; boolean all_have_sn = true; - boolean all_have_cn = true; for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode n = iter.next(); if ( n.getNodeData().isHasTaxonomy() ) { @@ -145,9 +144,6 @@ public abstract class SDI { if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) { all_have_sn = false; } - if ( ForesterUtil.isEmpty( tax.getCommonName() ) ) { - all_have_cn = false; - } } else { throw new IllegalArgumentException( "species tree node [" + n + "] has no taxonomic data" ); @@ -166,9 +162,6 @@ public abstract class SDI { if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) { all_have_sn = false; } - if ( ForesterUtil.isEmpty( tax.getCommonName() ) ) { - all_have_cn = false; - } } else { throw new IllegalArgumentException( "gene tree node [" + n + "] has no taxonomic data" ); @@ -183,9 +176,6 @@ public abstract class SDI { else if ( all_have_sn ) { base = TaxonomyComparisonBase.SCIENTIFIC_NAME; } - else if ( all_have_cn ) { - base = TaxonomyComparisonBase.COMMON_NAME; - } else { throw new IllegalArgumentException( "gene tree and species tree have incomparable taxonomies" ); } @@ -296,7 +286,7 @@ public abstract class SDI { return sb.toString(); } - private static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) { + static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) { final Taxonomy tax = n.getNodeData().getTaxonomy(); switch ( base ) { case ID: @@ -305,14 +295,12 @@ public abstract class SDI { return tax.getTaxonomyCode(); case SCIENTIFIC_NAME: return tax.getScientificName(); - case COMMON_NAME: - return tax.getCommonName(); default: throw new IllegalArgumentException( "unknown comparison base for taxonomies: " + base ); } } - enum TaxonomyComparisonBase { - ID, CODE, SCIENTIFIC_NAME, COMMON_NAME; + public enum TaxonomyComparisonBase { + ID, CODE, SCIENTIFIC_NAME } } diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index 1c914c8..36355ac 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -95,9 +95,9 @@ public final class ForesterUtil { sb.append( separator ); } } - + final public static String getForesterLibraryInformation() { - return "forester " + ForesterConstants.FORESTER_VERSION + " (" + ForesterConstants.FORESTER_DATE + ")"; + return "forester " + ForesterConstants.FORESTER_VERSION + " (" + ForesterConstants.FORESTER_DATE + ")"; } public static boolean seqIsLikelyToBeAa( final String s ) { -- 1.7.10.2