From 6c0feb7d2e02c6287d1328e61353cecff062bcd3 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Thu, 21 Jun 2012 01:35:24 +0000 Subject: [PATCH] improving GSDI, under construction... --- .../java/src/org/forester/application/gsdi.java | 85 +++++-- .../java/src/org/forester/application/rio.java | 3 +- .../java/src/org/forester/application/sdi_dir.java | 4 +- .../java/src/org/forester/application/sdix.java | 3 +- forester/java/src/org/forester/application/ta.java | 242 -------------------- .../archaeopteryx/MainFrameApplication.java | 20 +- forester/java/src/org/forester/sdi/GSDI.java | 214 +++++++++-------- forester/java/src/org/forester/sdi/RIO.java | 6 +- forester/java/src/org/forester/sdi/RIOn.java | 2 +- forester/java/src/org/forester/sdi/SDI.java | 3 +- forester/java/src/org/forester/sdi/SDIR.java | 3 +- forester/java/src/org/forester/sdi/SDIse.java | 3 +- forester/java/src/org/forester/sdi/SDIx.java | 4 +- .../java/src/org/forester/sdi/SdiException.java | 18 ++ .../src/org/forester/sdi/TaxonomyAssigner.java | 4 +- 15 files changed, 219 insertions(+), 395 deletions(-) delete mode 100644 forester/java/src/org/forester/application/ta.java create mode 100644 forester/java/src/org/forester/sdi/SdiException.java diff --git a/forester/java/src/org/forester/application/gsdi.java b/forester/java/src/org/forester/application/gsdi.java index 5bf4e14..3df2a74 100644 --- a/forester/java/src/org/forester/application/gsdi.java +++ b/forester/java/src/org/forester/application/gsdi.java @@ -40,13 +40,16 @@ import org.forester.io.parsers.util.ParserUtils; import org.forester.io.writers.PhylogenyWriter; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.sdi.GSDI; import org.forester.sdi.SDI; import org.forester.sdi.SDI.TaxonomyComparisonBase; 
import org.forester.sdi.SDIse; +import org.forester.sdi.SdiException; import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterConstants; import org.forester.util.ForesterUtil; public final class gsdi { @@ -119,12 +122,12 @@ public final class gsdi { private static void execute( final CommandLineArguments cla ) throws IOException { BASE_ALGORITHM base_algorithm = BASE_ALGORITHM.GSDI; - boolean strip = false; + boolean strip_species_tree = false; boolean most_parsimonous_duplication_model = false; boolean species_tree_in_phyloxml = true; boolean allow_stripping_of_gene_tree = false; if ( cla.isOptionSet( gsdi.STRIP_OPTION ) ) { - strip = true; + strip_species_tree = true; } if ( cla.isOptionSet( gsdi.SDI_OPTION ) ) { base_algorithm = BASE_ALGORITHM.SDI; @@ -229,12 +232,17 @@ public final class gsdi { gene_tree.setRooted( true ); species_tree.setRooted( true ); if ( !gene_tree.isCompletelyBinary() ) { - ForesterUtil.fatalError( gsdi.PRG_NAME, "gene tree (\"" + gene_tree_file + "\") is not completely binary" ); + log_writer.write( "User Error: gene tree is not completely binary" ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.close(); + ForesterUtil.fatalError( gsdi.PRG_NAME, "gene tree is not completely binary" ); } if ( base_algorithm != BASE_ALGORITHM.GSDI ) { if ( !species_tree.isCompletelyBinary() ) { - ForesterUtil.fatalError( gsdi.PRG_NAME, "species tree (\"" + species_tree_file - + "\") is not completely binary, use GSDI instead" ); + log_writer.write( "User Error: species tree is not completely binary, use GSDI instead" ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.close(); + ForesterUtil.fatalError( gsdi.PRG_NAME, "species tree is not completely binary, use GSDI instead" ); } } // For timing. 
@@ -249,18 +257,30 @@ public final class gsdi { // Helper.randomizeSpecies( 1, 8192, gene_tree ); // Helper.intervalNumberSpecies( gene_tree, 4096 ); // Helper.numberSpeciesInDescOrder( gene_tree ); - log_writer.write( PRG_NAME + " " + PRG_VERSION + " " + PRG_DATE ); + log_writer.write( PRG_NAME + " - " + PRG_DESC ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.write( " version : " + PRG_VERSION ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.write( " date : " + PRG_DATE ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.write( " forester version: " + ForesterConstants.FORESTER_VERSION ); log_writer.write( ForesterUtil.LINE_SEPARATOR ); - log_writer.write( PRG_DESC ); log_writer.write( ForesterUtil.LINE_SEPARATOR ); - log_writer.write( PRG_DESC ); + log_writer.write( "Start time: " + new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) ); log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.write( "Gene tree file: " + gene_tree_file.getCanonicalPath() ); log_writer.write( ForesterUtil.LINE_SEPARATOR ); - log_writer.write( new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) ); + log_writer.write( "Gene tree name: " + + ( ForesterUtil.isEmpty( gene_tree.getName() ) ? "" : gene_tree.getName() ) ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.write( "Species tree file: " + species_tree_file.getCanonicalPath() ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.write( "Species tree name: " + + ( ForesterUtil.isEmpty( species_tree.getName() ) ? 
"" : species_tree.getName() ) ); log_writer.write( ForesterUtil.LINE_SEPARATOR ); System.out.println(); - System.out.println( "Strip species tree: " + strip ); - log_writer.write( "Strip species tree: " + strip ); + System.out.println( "Strip species tree: " + strip_species_tree ); + log_writer.write( "Strip species tree: " + strip_species_tree ); log_writer.write( ForesterUtil.LINE_SEPARATOR ); SDI sdi = null; final long start_time = new Date().getTime(); @@ -273,22 +293,40 @@ public final class gsdi { log_writer.write( ForesterUtil.LINE_SEPARATOR ); log_writer.write( "Allow stripping of gene tree nodes : " + allow_stripping_of_gene_tree ); log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.flush(); sdi = new GSDI( gene_tree, species_tree, most_parsimonous_duplication_model, - allow_stripping_of_gene_tree ); + allow_stripping_of_gene_tree, + strip_species_tree ); } else { System.out.println(); System.out.println( "Using SDIse algorithm" ); log_writer.write( "Using SDIse algorithm" ); log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.flush(); sdi = new SDIse( gene_tree, species_tree ); } } + catch ( final SdiException e ) { + log_writer.write( "User Error: " + e.getLocalizedMessage() ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.close(); + ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); + } + catch ( final IOException e ) { + log_writer.write( "Error: " + e ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.close(); + ForesterUtil.fatalError( PRG_NAME, e.toString() ); + } catch ( final Exception e ) { + log_writer.write( "Error: " + e ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + log_writer.close(); e.printStackTrace(); - ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); + System.exit( -1 ); } System.out.println(); System.out.println( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" ); @@ -315,19 +353,24 @@ public final class gsdi { 
System.out.println( "Number of duplications : " + sdi.getDuplicationsSum() ); log_writer.write( "Number of duplications : " + sdi.getDuplicationsSum() ); log_writer.write( ForesterUtil.LINE_SEPARATOR ); - if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) && !most_parsimonous_duplication_model ) { - final int duplications = ( ( GSDI ) sdi ).getSpeciationOrDuplicationEventsSum(); - System.out.println( "Number of potential duplications: " + duplications ); - log_writer.write( "Number of potential duplications: " + duplications ); - log_writer.write( ForesterUtil.LINE_SEPARATOR ); - } - if ( base_algorithm == BASE_ALGORITHM.GSDI ) { - final int spec = ( ( GSDI ) sdi ).getSpeciationsSum(); + if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) ) { + final GSDI gsdi = ( GSDI ) sdi; + if ( !most_parsimonous_duplication_model ) { + final int duplications = gsdi.getSpeciationOrDuplicationEventsSum(); + System.out.println( "Number of potential duplications: " + duplications ); + log_writer.write( "Number of potential duplications: " + duplications ); + log_writer.write( ForesterUtil.LINE_SEPARATOR ); + } + final int spec = gsdi.getSpeciationsSum(); System.out.println( "Number of speciations : " + spec ); log_writer.write( "Number of speciations : " + spec ); log_writer.write( ForesterUtil.LINE_SEPARATOR ); + for( PhylogenyNode n : gsdi.getMappedExternalSpeciesTreeNodes() ) { + System.out.println( n.toString() ); + } } System.out.println(); + log_writer.close(); // some stat on gene tree: // filename, name // number of external nodes, strppided nodes diff --git a/forester/java/src/org/forester/application/rio.java b/forester/java/src/org/forester/application/rio.java index 88e3444..0ec8855 100644 --- a/forester/java/src/org/forester/application/rio.java +++ b/forester/java/src/org/forester/application/rio.java @@ -45,6 +45,7 @@ import org.forester.phylogeny.iterators.PreorderTreeIterator; import org.forester.sdi.DistanceCalculator; import org.forester.sdi.RIO; import 
org.forester.sdi.SDIR; +import org.forester.sdi.SdiException; import org.forester.util.ForesterUtil; public class rio { @@ -145,7 +146,7 @@ public class rio { final int warn_no_orthos, final double warn_one_ortho, final int bootstraps, - final double t_orthologs_dc ) throws IOException { + final double t_orthologs_dc ) throws IOException, SdiException { Phylogeny consensus_tree = null; Phylogeny // to be a consensus tree. diff --git a/forester/java/src/org/forester/application/sdi_dir.java b/forester/java/src/org/forester/application/sdi_dir.java index bc91edb..b76d080 100644 --- a/forester/java/src/org/forester/application/sdi_dir.java +++ b/forester/java/src/org/forester/application/sdi_dir.java @@ -41,6 +41,7 @@ import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.sdi.SDIR; import org.forester.sdi.SDIse; +import org.forester.sdi.SdiException; import org.forester.util.ForesterUtil; /* @@ -144,6 +145,7 @@ public class sdi_dir { * set to true, then out of the resulting trees with minimal * mapping cost or minimal number of duplications the tree with * the minimal height is chosen + * @throws SdiException */ public static void infer( final File indir, final File species_tree_file, @@ -153,7 +155,7 @@ public class sdi_dir { final boolean write_trees, final boolean minimize_mapping_cost, boolean minimize_sum_of_dup, - final boolean minimize_height ) throws IOException { + final boolean minimize_height ) throws IOException, SdiException { final int MIN_EXT_NODES = 4; // Minimal size of trees [in ext nodes] // to be analyzed. 
final int MAX_EXT_NODES = 5000; // Maximal size of trees [in ext nodes] diff --git a/forester/java/src/org/forester/application/sdix.java b/forester/java/src/org/forester/application/sdix.java index f8304dd..88ee713 100644 --- a/forester/java/src/org/forester/application/sdix.java +++ b/forester/java/src/org/forester/application/sdix.java @@ -37,6 +37,7 @@ import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.sdi.SDIx; +import org.forester.sdi.SdiException; import org.forester.util.CommandLineArguments; import org.forester.util.ForesterUtil; @@ -48,7 +49,7 @@ public class sdix { final static private String PRG_VERSION = "0.001 alpha"; final static private String PRG_DATE = "2009.10.14"; - public static void main( final String args[] ) { + public static void main( final String args[] ) throws SdiException { ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE ); System.out.println(); CommandLineArguments cla = null; diff --git a/forester/java/src/org/forester/application/ta.java b/forester/java/src/org/forester/application/ta.java deleted file mode 100644 index 053b342..0000000 --- a/forester/java/src/org/forester/application/ta.java +++ /dev/null @@ -1,242 +0,0 @@ -// $Id: -// FORESTER -- software libraries and applications -// for evolutionary biology research and applications. -// -// Copyright (C) 2008-2009 Christian M. Zmasek -// Copyright (C) 2008-2009 Burnham Institute for Medical Research -// All rights reserved -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. 
-// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -// -// Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester - -package org.forester.application; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Date; -import java.util.List; - -import org.forester.io.parsers.PhylogenyParser; -import org.forester.io.parsers.util.ParserUtils; -import org.forester.io.writers.PhylogenyWriter; -import org.forester.phylogeny.Phylogeny; -import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; -import org.forester.phylogeny.factories.PhylogenyFactory; -import org.forester.sdi.GSDI; -import org.forester.sdi.SDI; -import org.forester.sdi.SDIse; -import org.forester.util.CommandLineArguments; -import org.forester.util.ForesterUtil; - -public class ta { - - final static private String STRIP_OPTION = "s"; - final static private String SDISE_OPTION = "b"; - final static private String MOST_PARSIMONIOUS_OPTION = "m"; - final static private String HELP_OPTION_1 = "help"; - final static private String HELP_OPTION_2 = "h"; - final static private String DEFAULT_OUTFILE = "sdi_out.xml"; - final static private String PRG_NAME = "sdi"; - final static private String PRG_VERSION = "alpha 0.3"; - final static private String PRG_DATE = "2008.03.04"; - - public static void main( final String args[] ) { - ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE ); - CommandLineArguments cla = null; - try { - cla = new CommandLineArguments( args ); - } - catch ( final Exception e ) { - 
ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); - } - if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) { - System.out.println(); - print_help(); - System.exit( 0 ); - } - else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() < 2 ) || ( cla.getNumberOfNames() > 3 ) ) { - System.out.println(); - System.out.println( "Wrong number of arguments." ); - System.out.println(); - print_help(); - System.exit( -1 ); - } - final List allowed_options = new ArrayList(); - allowed_options.add( STRIP_OPTION ); - allowed_options.add( SDISE_OPTION ); - allowed_options.add( MOST_PARSIMONIOUS_OPTION ); - final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); - if ( dissallowed_options.length() > 0 ) { - ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); - } - boolean use_sdise = false; - boolean strip = false; - boolean most_parsimonous_duplication_model = false; - if ( cla.isOptionSet( STRIP_OPTION ) ) { - strip = true; - } - if ( cla.isOptionSet( SDISE_OPTION ) ) { - use_sdise = true; - } - if ( cla.isOptionSet( MOST_PARSIMONIOUS_OPTION ) ) { - if ( use_sdise ) { - ForesterUtil.fatalError( PRG_NAME, "Can only use most parsimonious duplication mode with GSDI" ); - } - most_parsimonous_duplication_model = true; - } - Phylogeny species_tree = null; - Phylogeny gene_tree = null; - File gene_tree_file = null; - File species_tree_file = null; - File out_file = null; - try { - gene_tree_file = cla.getFile( 0 ); - species_tree_file = cla.getFile( 1 ); - if ( cla.getNumberOfNames() == 3 ) { - out_file = cla.getFile( 2 ); - } - else { - out_file = new File( DEFAULT_OUTFILE ); - } - } - catch ( final IllegalArgumentException e ) { - ForesterUtil.fatalError( PRG_NAME, "error in command line: " + e.getMessage() ); - } - if ( ForesterUtil.isReadableFile( gene_tree_file ) != null ) { - ForesterUtil.fatalError( PRG_NAME, ForesterUtil.isReadableFile( gene_tree_file ) ); - } - if ( 
ForesterUtil.isReadableFile( species_tree_file ) != null ) { - ForesterUtil.fatalError( PRG_NAME, ForesterUtil.isReadableFile( species_tree_file ) ); - } - if ( ForesterUtil.isWritableFile( out_file ) != null ) { - ForesterUtil.fatalError( PRG_NAME, ForesterUtil.isWritableFile( out_file ) ); - } - try { - final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); - final PhylogenyParser pp = ParserUtils.createParserDependingOnFileType( species_tree_file, true ); - species_tree = factory.create( species_tree_file, pp )[ 0 ]; - } - catch ( final IOException e ) { - ForesterUtil.fatalError( PRG_NAME, - "Failed to read species tree from \"" + gene_tree_file + "\" [" + e.getMessage() - + "]" ); - } - try { - final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); - final PhylogenyParser pp = ParserUtils.createParserDependingOnFileType( gene_tree_file, true ); - gene_tree = factory.create( gene_tree_file, pp )[ 0 ]; - } - catch ( final IOException e ) { - ForesterUtil.fatalError( PRG_NAME, - "Failed to read gene tree from \"" + gene_tree_file + "\" [" + e.getMessage() - + "]" ); - } - gene_tree.setRooted( true ); - species_tree.setRooted( true ); - if ( !gene_tree.isCompletelyBinary() ) { - ForesterUtil.fatalError( PRG_NAME, "gene tree is not completely binary." ); - } - if ( use_sdise ) { - if ( !species_tree.isCompletelyBinary() ) { - ForesterUtil.fatalError( PRG_NAME, "species tree is not completely binary." ); - } - } - // For timing. 
- // gene_tree = Helper.createBalancedTree( 10 ); - // species_tree = Helper.createBalancedTree( 13 ); - // species_tree = Helper.createUnbalancedTree( 1024 ); - // gene_tree = Helper.createUnbalancedTree( 8192 ); - // species_tree = gene_tree.copyTree(); - // gene_tree = species_tree.copyTree(); - // Helper.numberSpeciesInOrder( species_tree ); - // Helper.numberSpeciesInOrder( gene_tree ); - // Helper.randomizeSpecies( 1, 8192, gene_tree ); - // Helper.intervalNumberSpecies( gene_tree, 4096 ); - // Helper.numberSpeciesInDescOrder( gene_tree ); - System.out.println(); - System.out.println( "Strip species tree: " + strip ); - SDI sdi = null; - final long start_time = new Date().getTime(); - try { - if ( use_sdise ) { - System.out.println(); - System.out.println( "Using SDIse algorithm." ); - sdi = new SDIse( gene_tree, species_tree ); - } - else { - System.out.println(); - System.out.println( "Using GSDI algorithm." ); - System.out.println(); - System.out.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model ); - sdi = new GSDI( gene_tree, species_tree, most_parsimonous_duplication_model ); - } - } - catch ( final Exception e ) { - ForesterUtil.unexpectedFatalError( PRG_NAME, e ); - } - System.out.println(); - System.out.println( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" ); - try { - final PhylogenyWriter writer = new PhylogenyWriter(); - writer.toPhyloXML( out_file, gene_tree, 1 ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( PRG_NAME, "Failed to write to \"" + out_file + "\" [" + e.getMessage() + "]" ); - } - System.out.println(); - System.out.println( "Successfully wrote resulting gene tree to: " + out_file ); - System.out.println(); - // if ( use_sdise ) { - // computeMappingCostL(); - // System.out.println( "Mapping cost : " + computeMappingCostL() ); - // } - // System.out.println( "Number of duplications : " + getDuplicationsSum() ); - if ( !use_sdise && 
!most_parsimonous_duplication_model ) { - System.out.println( "Number of potential duplications: " - + ( ( GSDI ) sdi ).getSpeciationOrDuplicationEventsSum() ); - } - if ( !use_sdise ) { - System.out.println( "Number speciations : " + ( ( GSDI ) sdi ).getSpeciationsSum() ); - } - System.out.println(); - } // main( final String args[] ) - - private static void print_help() { - System.out.println( "Usage: \"" + PRG_NAME - + " [-options] [outfile name]\"" ); - System.out.println(); - System.out.println( "Options:" ); - System.out.println( " -" + STRIP_OPTION + ": to strip the species tree prior to duplication inference" ); - System.out.println( " -" + SDISE_OPTION - + ": to use SDIse algorithm instead of GSDI algorithm (for binary trees only, faster)" ); - System.out.println( " -" + MOST_PARSIMONIOUS_OPTION + ": use most parimonious duplication model for GSDI: " ); - System.out.println( " assign nodes as speciations which would otherwise be assiged" ); - System.out.println( " as unknown because of polytomies in the species tree" ); - System.out.println(); - System.out.println( "Species tree file" ); - System.out.println( " In NHX format, with species names in species name fields unless -n option" ); - System.out.println( " is used." ); - System.out.println(); - System.out.println( "Gene tree file" ); - System.out.println( " In NHX format, with species names in species name fields and sequence names" ); - System.out.println( " in sequence name fields." ); - System.out.println(); - System.out.println( "!! WARNING: GSDI algorithm is under development, please use SDIse (-b) instead !!" 
); - System.out.println(); - } // print_help() -} diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index d4582f6..49ad580 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -953,8 +953,7 @@ public final class MainFrameApplication extends MainFrame { customizeJMenuItem( _obtain_detailed_taxonomic_information_deleting_jmi ); _obtain_detailed_taxonomic_information_deleting_jmi .setToolTipText( "To add additional taxonomic information, deletes nodes for which taxonomy cannot found (from UniProt Taxonomy)" ); - _tools_menu - .add( _obtain_uniprot_seq_information_jmi = new JMenuItem( "Obtain Sequence Information" ) ); + _tools_menu.add( _obtain_uniprot_seq_information_jmi = new JMenuItem( "Obtain Sequence Information" ) ); customizeJMenuItem( _obtain_uniprot_seq_information_jmi ); _obtain_uniprot_seq_information_jmi.setToolTipText( "To add additional sequence information" ); _tools_menu.addSeparator(); @@ -1212,7 +1211,7 @@ public final class MainFrameApplication extends MainFrame { GSDI gsdi = null; int duplications = -1; try { - gsdi = new GSDI( gene_tree, _species_tree.copy(), true, true ); + gsdi = new GSDI( gene_tree, _species_tree.copy(), true, true, false ); duplications = gsdi.getDuplicationsSum(); } catch ( final Exception e ) { @@ -1517,8 +1516,10 @@ public final class MainFrameApplication extends MainFrame { final Phylogeny phy = getCurrentTreePanel().getPhylogeny(); if ( ( phy != null ) && !phy.isEmpty() ) { final TaxonomyDataManager t = new TaxonomyDataManager( this, - _mainpanel.getCurrentTreePanel(), - phy.copy(), false,true ); + _mainpanel.getCurrentTreePanel(), + phy.copy(), + false, + true ); new Thread( t ).start(); } } @@ -1529,9 +1530,10 @@ public final class MainFrameApplication extends MainFrame { final Phylogeny phy = 
getCurrentTreePanel().getPhylogeny(); if ( ( phy != null ) && !phy.isEmpty() ) { final TaxonomyDataManager t = new TaxonomyDataManager( this, - _mainpanel.getCurrentTreePanel(), - phy.copy(), - true,true ); + _mainpanel.getCurrentTreePanel(), + phy.copy(), + true, + true ); new Thread( t ).start(); } } @@ -2714,7 +2716,7 @@ class NHFilter extends FileFilter { final String file_name = f.getName().trim().toLowerCase(); return file_name.endsWith( ".nh" ) || file_name.endsWith( ".newick" ) || file_name.endsWith( ".phy" ) || file_name.endsWith( ".tr" ) || file_name.endsWith( ".tree" ) || file_name.endsWith( ".dnd" ) - || file_name.endsWith( ".ph" ) || file_name.endsWith( ".phb" ) || file_name.endsWith( ".nwk" ) + || file_name.endsWith( ".ph" ) || file_name.endsWith( ".phb" ) || file_name.endsWith( ".nwk" ) || f.isDirectory(); } diff --git a/forester/java/src/org/forester/sdi/GSDI.java b/forester/java/src/org/forester/sdi/GSDI.java index 8e2cdf0..06db0a9 100644 --- a/forester/java/src/org/forester/sdi/GSDI.java +++ b/forester/java/src/org/forester/sdi/GSDI.java @@ -25,10 +25,14 @@ package org.forester.sdi; +import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyNode; @@ -64,9 +68,12 @@ public final class GSDI extends SDI { private final HashMap _transversal_counts; private final boolean _most_parsimonious_duplication_model; private final boolean _strip_gene_tree; + private final boolean _strip_species_tree; private int _speciation_or_duplication_events_sum; private int _speciations_sum; - private final Set _stripped_gene_tree_nodes; + private final List _stripped_gene_tree_nodes; + private final List _stripped_species_tree_nodes; + private final SortedSet _mapped_species_tree_nodes; /** * Constructor which sets the gene tree and the species tree 
to be compared. @@ -92,12 +99,14 @@ public final class GSDI extends SDI { * set to true to assign nodes as speciations which would * otherwise be assiged as unknown because of polytomies in the * species tree. + * @throws SdiException * */ public GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model, - final boolean strip_gene_tree ) { + final boolean strip_gene_tree, + final boolean strip_species_tree ) throws SdiException { super( gene_tree, species_tree ); _speciation_or_duplication_events_sum = 0; _speciations_sum = 0; @@ -105,16 +114,18 @@ public final class GSDI extends SDI { _transversal_counts = new HashMap(); _duplications_sum = 0; _strip_gene_tree = strip_gene_tree; - _stripped_gene_tree_nodes = new HashSet(); + _strip_species_tree = strip_species_tree; + _stripped_gene_tree_nodes = new ArrayList(); + _stripped_species_tree_nodes = new ArrayList(); + _mapped_species_tree_nodes = new TreeSet(); getSpeciesTree().preOrderReId(); linkNodesOfG(); geneTreePostOrderTraversal( getGeneTree().getRoot() ); } - public GSDI( final Phylogeny gene_tree, - final Phylogeny species_tree, - final boolean most_parsimonious_duplication_model ) { - this( gene_tree, species_tree, most_parsimonious_duplication_model, false ); + GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model ) + throws SdiException { + this( gene_tree, species_tree, most_parsimonious_duplication_model, false, false ); } private final Event createDuplicationEvent() { @@ -199,7 +210,6 @@ public final class GSDI extends SDI { * Preconditions: Mapping M for external nodes must have been calculated and * the species tree must be labeled in preorder. *

- * (Last modified: ) * * @param g * starting node of a gene tree - normally the root @@ -211,6 +221,10 @@ public final class GSDI extends SDI { } final PhylogenyNode[] linked_nodes = new PhylogenyNode[ g.getNumberOfDescendants() ]; for( int i = 0; i < linked_nodes.length; ++i ) { + if ( g.getChildNode( i ).getLink() == null ) { + System.out.println( "link is null for " + g.getChildNode( i ) ); + System.exit( -1 ); + } linked_nodes[ i ] = g.getChildNode( i ).getLink(); } final int[] min_max = obtainMinMaxIdIndices( linked_nodes ); @@ -261,6 +275,7 @@ public final class GSDI extends SDI { /** * This allows for linking of internal nodes of the species tree (as opposed * to just external nodes, as in the method it overrides. + * @throws SdiException * */ @Override @@ -287,48 +302,22 @@ public final class GSDI extends SDI { // g.setLink( s ); // } // } - final void linkNodesOfG() { - // final HashMap speciestree_ext_nodes = createTaxonomyToNodeMap(); - // if ( _strip_gene_tree ) { - // stripGeneTree( speciestree_ext_nodes ); - // if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) { - // throw new IllegalArgumentException( "species tree does not contain any" - // + " nodes matching species in the gene tree" ); - // } - // } - // // Retrieve the reference to the PhylogenyNode with a matching species. 
- // for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { - // final PhylogenyNode g = iter.next(); - // if ( !g.getNodeData().isHasTaxonomy() ) { - // throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" ); - // } - // final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() ); - // if ( s == null ) { - // throw new IllegalArgumentException( "species " + g.getNodeData().getTaxonomy() - // + " not present in species tree" ); - // } - // g.setLink( s ); - // } - ////// - final Map speciestree_ext_nodes = new HashMap(); + final void linkNodesOfG() throws SdiException { + final Map species_to_node_map = new HashMap(); + final Set species_tree_ext_nodes = new HashSet(); final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase( _gene_tree ); System.out.println( "comp base is: " + tax_comp_base ); - // if ( _strip_gene_tree ) { - // stripGeneTree2( speciestree_ext_nodes ); - // if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) { - // throw new IllegalArgumentException( "species tree does not contain any" - // + " nodes matching species in the gene tree" ); - // } - //} - // Put references to all external nodes of the species tree into a map. // Stringyfied taxonomy is the key, node is the value. 
for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode s = iter.next(); + species_tree_ext_nodes.add( s ); final String tax_str = taxonomyToString( s, tax_comp_base ); - if ( speciestree_ext_nodes.containsKey( tax_str ) ) { - throw new IllegalArgumentException( "taxonomy [" + s + "] is not unique in species phylogeny" ); + if ( !ForesterUtil.isEmpty( tax_str ) ) { + if ( species_to_node_map.containsKey( tax_str ) ) { + throw new SdiException( "taxonomy \"" + s + "\" is not unique in species tree" ); + } + species_to_node_map.put( tax_str, s ); } - speciestree_ext_nodes.put( tax_str, s ); } // Retrieve the reference to the node with a matching stringyfied taxonomy. for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { @@ -336,10 +325,9 @@ public final class GSDI extends SDI { if ( !g.getNodeData().isHasTaxonomy() ) { if ( _strip_gene_tree ) { _stripped_gene_tree_nodes.add( g ); - continue; } else { - throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" ); + throw new SdiException( "gene tree node \"" + g + "\" has no taxonomic data" ); } } else { @@ -349,86 +337,92 @@ public final class GSDI extends SDI { _stripped_gene_tree_nodes.add( g ); } else { - throw new IllegalArgumentException( "gene tree node " + g - + " has no appropriate taxonomic data" ); + throw new SdiException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" ); } } else { - final PhylogenyNode s = speciestree_ext_nodes.get( tax_str ); - // if ( s == null ) { - // if ( _strip_gene_tree ) { - // _stripped_gene_tree_nodes.add( g ); - // } - // else { - // throw new IllegalArgumentException( "taxonomy " + g.getNodeData().getTaxonomy() - // + " not present in species tree" ); - // } - // } - // else { - g.setLink( s ); - System.out.println( "setting link of " + g + " to " + s ); - // } + final PhylogenyNode s = species_to_node_map.get( tax_str ); + 
if ( s == null ) { + if ( _strip_gene_tree ) { + _stripped_gene_tree_nodes.add( g ); + } + else { + throw new SdiException( "taxonomy \"" + g.getNodeData().getTaxonomy() + + "\" not present in species tree" ); + } + } + else { + g.setLink( s ); + _mapped_species_tree_nodes.add( s ); + System.out.println( "setting link of " + g + " to " + s ); + } } } } // for loop if ( _strip_gene_tree ) { - for( final PhylogenyNode n : _stripped_gene_tree_nodes ) { - // if ( _gene_tree.getNode( n.getId() ) != null ) { - _gene_tree.deleteSubtree( n, true ); - // } + for( final PhylogenyNode g : _stripped_gene_tree_nodes ) { + _gene_tree.deleteSubtree( g, true ); } } - } - - final private HashMap createTaxonomyToNodeMap() { - final HashMap speciestree_ext_nodes = new HashMap(); - for( final PhylogenyNodeIterator iter = _species_tree.iteratorLevelOrder(); iter.hasNext(); ) { - final PhylogenyNode n = iter.next(); - if ( n.getNodeData().isHasTaxonomy() ) { - if ( speciestree_ext_nodes.containsKey( n.getNodeData().getTaxonomy() ) ) { - throw new IllegalArgumentException( "taxonomy [" + n.getNodeData().getTaxonomy() - + "] is not unique in species phylogeny" ); + if ( _strip_species_tree ) { + for( final PhylogenyNode s : species_tree_ext_nodes ) { + if ( !_mapped_species_tree_nodes.contains( s ) ) { + _species_tree.deleteSubtree( s, true ); } - speciestree_ext_nodes.put( n.getNodeData().getTaxonomy(), n ); - } - } - return speciestree_ext_nodes; - } - - private final void stripGeneTree( final HashMap speciestree_ext_nodes ) { - // final Set to_delete = new HashSet(); - for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { - final PhylogenyNode g = iter.next(); - if ( !g.getNodeData().isHasTaxonomy() ) { - throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" ); - } - if ( !speciestree_ext_nodes.containsKey( g.getNodeData().getTaxonomy() ) ) { - _stripped_gene_tree_nodes.add( g ); } } - for( final PhylogenyNode n 
: _stripped_gene_tree_nodes ) { - _gene_tree.deleteSubtree( n, true ); - } } - private final void stripGeneTree2( final HashMap speciestree_ext_nodes ) { - // final Set to_delete = new HashSet(); - for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { - final PhylogenyNode g = iter.next(); - if ( !g.getNodeData().isHasTaxonomy() ) { - _stripped_gene_tree_nodes.add( g ); - } - else { - if ( !speciestree_ext_nodes.containsKey( g.getNodeData().getTaxonomy() ) ) { - _stripped_gene_tree_nodes.add( g ); - } - } - } - for( final PhylogenyNode n : _stripped_gene_tree_nodes ) { - _gene_tree.deleteSubtree( n, true ); - } + public SortedSet getMappedExternalSpeciesTreeNodes() { + return _mapped_species_tree_nodes; } + // final private HashMap createTaxonomyToNodeMap() { + // final HashMap speciestree_ext_nodes = new HashMap(); + // for( final PhylogenyNodeIterator iter = _species_tree.iteratorLevelOrder(); iter.hasNext(); ) { + // final PhylogenyNode n = iter.next(); + // if ( n.getNodeData().isHasTaxonomy() ) { + // if ( speciestree_ext_nodes.containsKey( n.getNodeData().getTaxonomy() ) ) { + // throw new IllegalArgumentException( "taxonomy [" + n.getNodeData().getTaxonomy() + // + "] is not unique in species phylogeny" ); + // } + // speciestree_ext_nodes.put( n.getNodeData().getTaxonomy(), n ); + // } + // } + // return speciestree_ext_nodes; + // } + // private final void stripGeneTree( final HashMap speciestree_ext_nodes ) { + // // final Set to_delete = new HashSet(); + // for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + // final PhylogenyNode g = iter.next(); + // if ( !g.getNodeData().isHasTaxonomy() ) { + // throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" ); + // } + // if ( !speciestree_ext_nodes.containsKey( g.getNodeData().getTaxonomy() ) ) { + // _stripped_gene_tree_nodes.add( g ); + // } + // } + // for( final PhylogenyNode n : 
_stripped_gene_tree_nodes ) { + // _gene_tree.deleteSubtree( n, true ); + // } + // } + // private final void stripGeneTree2( final HashMap speciestree_ext_nodes ) { + // // final Set to_delete = new HashSet(); + // for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + // final PhylogenyNode g = iter.next(); + // if ( !g.getNodeData().isHasTaxonomy() ) { + // _stripped_gene_tree_nodes.add( g ); + // } + // else { + // if ( !speciestree_ext_nodes.containsKey( g.getNodeData().getTaxonomy() ) ) { + // _stripped_gene_tree_nodes.add( g ); + // } + // } + // } + // for( final PhylogenyNode n : _stripped_gene_tree_nodes ) { + // _gene_tree.deleteSubtree( n, true ); + // } + // } public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) { int with_id_count = 0; int with_code_count = 0; @@ -472,7 +466,7 @@ public final class GSDI extends SDI { } } - public Set getStrippedExternalGeneTreeNodes() { + public List getStrippedExternalGeneTreeNodes() { return _stripped_gene_tree_nodes; } diff --git a/forester/java/src/org/forester/sdi/RIO.java b/forester/java/src/org/forester/sdi/RIO.java index 2da3a70..4db4648 100644 --- a/forester/java/src/org/forester/sdi/RIO.java +++ b/forester/java/src/org/forester/sdi/RIO.java @@ -231,9 +231,10 @@ public final class RIO { * @param query * the sequence name of the squence whose orthologs are to be * inferred + * @throws SdiException */ public void inferOrthologs( final File gene_trees_file, final Phylogeny species_tree, final String query ) - throws IOException { + throws IOException, SdiException { int bs = 0; if ( RIO.TIME ) { _time = System.currentTimeMillis(); @@ -281,7 +282,8 @@ public final class RIO { // Helper method which performs the actual ortholog inference for // the external node with seqname query. 
- private void inferOrthologsHelper( final Phylogeny gene_tree, final Phylogeny species_tree, final String query ) { + private void inferOrthologsHelper( final Phylogeny gene_tree, final Phylogeny species_tree, final String query ) + throws SdiException { Phylogeny assigned_tree = null; List nodes = null; final SDIR sdiunrooted = new SDIR(); diff --git a/forester/java/src/org/forester/sdi/RIOn.java b/forester/java/src/org/forester/sdi/RIOn.java index f6dda23..679c2d6 100644 --- a/forester/java/src/org/forester/sdi/RIOn.java +++ b/forester/java/src/org/forester/sdi/RIOn.java @@ -45,7 +45,7 @@ public class RIOn { GeneralTable _super_orthologs = null; GeneralTable _ultra_paralogs = null; - private void doInferOrthologs( final Phylogeny gene_tree, final Phylogeny species_tree ) { + private void doInferOrthologs( final Phylogeny gene_tree, final Phylogeny species_tree ) throws SdiException { final SDIR sdiunrooted = new SDIR(); final Phylogeny assigned_tree = sdiunrooted.infer( gene_tree, species_tree, diff --git a/forester/java/src/org/forester/sdi/SDI.java b/forester/java/src/org/forester/sdi/SDI.java index c783ce0..dec3c0f 100644 --- a/forester/java/src/org/forester/sdi/SDI.java +++ b/forester/java/src/org/forester/sdi/SDI.java @@ -214,8 +214,9 @@ public abstract class SDI { * links (sets the field "link" of PhylogenyNode) each external * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree * which has the same species name. + * @throws SdiException */ - void linkNodesOfG() { + void linkNodesOfG() throws SdiException { final Map speciestree_ext_nodes = new HashMap(); final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase(); // Put references to all external nodes of the species tree into a map. 
diff --git a/forester/java/src/org/forester/sdi/SDIR.java b/forester/java/src/org/forester/sdi/SDIR.java index 98f1cc5..2b72221 100644 --- a/forester/java/src/org/forester/sdi/SDIR.java +++ b/forester/java/src/org/forester/sdi/SDIR.java @@ -212,6 +212,7 @@ public class SDIR { * Array) must be no lower than 1 * @return array of rooted Trees with duplication vs. speciation assigned if * return_trees is set to true, null otherwise + * @throws SdiException */ public Phylogeny[] infer( final Phylogeny gene_tree, final Phylogeny species_tree, @@ -219,7 +220,7 @@ public class SDIR { boolean minimize_sum_of_dup, final boolean minimize_height, final boolean return_trees, - int max_trees_to_return ) { + int max_trees_to_return ) throws SdiException { init(); SDIse sdise = null; final ArrayList trees = new ArrayList(); diff --git a/forester/java/src/org/forester/sdi/SDIse.java b/forester/java/src/org/forester/sdi/SDIse.java index af6653a..0f29bfb 100644 --- a/forester/java/src/org/forester/sdi/SDIse.java +++ b/forester/java/src/org/forester/sdi/SDIse.java @@ -79,8 +79,9 @@ public class SDIse extends SDI { * reference to a rooted binary species Phylogeny which might get * stripped in the process, must have species names in the * species name fields for all external nodes + * @throws SdiException */ - public SDIse( final Phylogeny gene_tree, final Phylogeny species_tree ) { + public SDIse( final Phylogeny gene_tree, final Phylogeny species_tree ) throws SdiException { super( gene_tree, species_tree ); _duplications_sum = 0; getSpeciesTree().preOrderReId(); diff --git a/forester/java/src/org/forester/sdi/SDIx.java b/forester/java/src/org/forester/sdi/SDIx.java index b7ef358..087c032 100644 --- a/forester/java/src/org/forester/sdi/SDIx.java +++ b/forester/java/src/org/forester/sdi/SDIx.java @@ -51,7 +51,7 @@ public class SDIx { private void analyze( final Phylogeny gene_tree, final String gene_tree_file_name, final Phylogeny[] species_trees, - final File out_dir ) throws 
IOException { + final File out_dir ) throws IOException, SdiException { final boolean minimize_cost = true; final boolean minimize_sum_of_dup = true; final boolean minimize_height = true; @@ -101,7 +101,7 @@ public class SDIx { } public void method1( final List gene_tree_files, final Phylogeny[] species_trees, final File out_dir ) - throws IOException { + throws IOException, SdiException { checkSpeciesTreesForEqualNumberOfExtNodes( species_trees ); final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); for( final File gene_tree_file : gene_tree_files ) { diff --git a/forester/java/src/org/forester/sdi/SdiException.java b/forester/java/src/org/forester/sdi/SdiException.java new file mode 100644 index 0000000..31daef4 --- /dev/null +++ b/forester/java/src/org/forester/sdi/SdiException.java @@ -0,0 +1,18 @@ + +package org.forester.sdi; + +public class SdiException extends Exception { + + /** + * + */ + private static final long serialVersionUID = 5154733429066500435L; + + public SdiException() { + super(); + } + + public SdiException( final String message ) { + super( message ); + } +} diff --git a/forester/java/src/org/forester/sdi/TaxonomyAssigner.java b/forester/java/src/org/forester/sdi/TaxonomyAssigner.java index 6e4a2a8..c24d97e 100644 --- a/forester/java/src/org/forester/sdi/TaxonomyAssigner.java +++ b/forester/java/src/org/forester/sdi/TaxonomyAssigner.java @@ -32,7 +32,7 @@ import org.forester.phylogeny.iterators.PhylogenyNodeIterator; public class TaxonomyAssigner extends SDI { - public TaxonomyAssigner( final Phylogeny gene_tree, final Phylogeny species_tree ) { + public TaxonomyAssigner( final Phylogeny gene_tree, final Phylogeny species_tree ) throws SdiException { super( gene_tree, species_tree ); getSpeciesTree().preOrderReId(); linkNodesOfG(); @@ -65,7 +65,7 @@ public class TaxonomyAssigner extends SDI { } } - public static void execute( final Phylogeny gene_tree, final Phylogeny species_tree ) { + public static void execute( final 
Phylogeny gene_tree, final Phylogeny species_tree ) throws SdiException { new TaxonomyAssigner( gene_tree, species_tree ); } } -- 1.7.10.2