From 6005bacbd8aecc0d320b70568a24d321a30bc85c Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Mon, 10 Dec 2012 03:50:39 +0000 Subject: [PATCH] rio - gsdir work... --- .../java/src/org/forester/application/gsdi.java | 76 +++++++++++++---- forester/java/src/org/forester/sdi/GSDI.java | 22 ++++- forester/java/src/org/forester/sdi/GSDIR.java | 87 ++++++++++++++++++++ forester/java/src/org/forester/sdi/TestGSDI.java | 65 ++++++++++++++- .../forester/util/BasicDescriptiveStatistics.java | 8 +- 5 files changed, 231 insertions(+), 27 deletions(-) create mode 100644 forester/java/src/org/forester/sdi/GSDIR.java diff --git a/forester/java/src/org/forester/application/gsdi.java b/forester/java/src/org/forester/application/gsdi.java index 6b58edf..eab52df 100644 --- a/forester/java/src/org/forester/application/gsdi.java +++ b/forester/java/src/org/forester/application/gsdi.java @@ -48,6 +48,7 @@ import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.sdi.GSDI; +import org.forester.sdi.GSDIR; import org.forester.sdi.SDI; import org.forester.sdi.SDI.TaxonomyComparisonBase; import org.forester.sdi.SDIException; @@ -60,11 +61,12 @@ import org.forester.util.ForesterUtil; public final class gsdi { private enum BASE_ALGORITHM { - GSDI, SDI + GSDIR, GSDI, SDI } final static public boolean REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE = true; final static private String ALLOW_STRIPPING_OF_GENE_TREE_OPTION = "g"; final static private String SDISE_OPTION = "b"; + final static private String GSDIR_OPTION = "r"; final static private String MOST_PARSIMONIOUS_OPTION = "m"; final static private String GUESS_FORMAT_OF_SPECIES_TREE = "q"; final static private String HELP_OPTION_1 = "help"; @@ -109,6 +111,7 @@ public final class gsdi { } final List allowed_options = new ArrayList(); allowed_options.add( gsdi.SDISE_OPTION ); + allowed_options.add( gsdi.GSDIR_OPTION ); allowed_options.add( gsdi.GUESS_FORMAT_OF_SPECIES_TREE ); allowed_options.add( gsdi.MOST_PARSIMONIOUS_OPTION ); allowed_options.add( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION ); @@ -127,18 +130,21 @@ public final class gsdi { BASE_ALGORITHM base_algorithm = BASE_ALGORITHM.GSDI; boolean most_parsimonous_duplication_model = false; boolean allow_stripping_of_gene_tree = false; - if ( cla.isOptionSet( gsdi.SDISE_OPTION ) ) { + if ( cla.isOptionSet( gsdi.GSDIR_OPTION ) ) { + base_algorithm = BASE_ALGORITHM.GSDIR; + } + else if ( cla.isOptionSet( gsdi.SDISE_OPTION ) ) { base_algorithm = BASE_ALGORITHM.SDI; } if ( cla.isOptionSet( gsdi.MOST_PARSIMONIOUS_OPTION ) ) { - if ( base_algorithm != BASE_ALGORITHM.GSDI ) { - ForesterUtil.fatalError( gsdi.PRG_NAME, "Can only use most parsimonious duplication mode with GSDI" ); + if ( base_algorithm == BASE_ALGORITHM.SDI ) { + ForesterUtil.fatalError( gsdi.PRG_NAME, "Cannot use most parsimonious duplication mode with SDI" ); } most_parsimonous_duplication_model = true; } if ( cla.isOptionSet( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION ) ) { - if ( base_algorithm != BASE_ALGORITHM.GSDI ) { - ForesterUtil.fatalError( gsdi.PRG_NAME, "Can only allow stripping of gene tree with GSDI" ); + if ( base_algorithm == BASE_ALGORITHM.SDI ) { + ForesterUtil.fatalError( gsdi.PRG_NAME, "Cannot allow stripping of gene tree with SDI" ); } allow_stripping_of_gene_tree = true; } @@ -248,9 +254,11 @@ public final class gsdi { if ( !gene_tree.isCompletelyBinary() ) { fatalError( "user error", "gene tree is not completely binary", log_writer ); } - if ( base_algorithm != BASE_ALGORITHM.GSDI ) { + if ( base_algorithm == BASE_ALGORITHM.SDI ) { if ( !species_tree.isCompletelyBinary() ) { - fatalError( "user error", "species tree is not completely binary, use GSDI instead", log_writer ); + fatalError( "user error", + "species tree is not completely binary, use GSDI or GSDIR instead", + log_writer ); } } log_writer.println( PRG_NAME + " - " + PRG_DESC ); @@ -277,22 +285,35 @@ public final class gsdi { SDI sdi = null; final long start_time = new Date().getTime(); try { - if ( base_algorithm == BASE_ALGORITHM.GSDI ) { + if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) || ( base_algorithm == BASE_ALGORITHM.GSDIR ) ) { + if ( base_algorithm == BASE_ALGORITHM.GSDI ) { + System.out.println( "Algorithm : GSDI" ); + log_writer.println( "Algorithm : GSDI" ); + } + else if ( base_algorithm == BASE_ALGORITHM.GSDIR ) { + System.out.println( "Algorithm : GSDIR" ); + log_writer.println( "Algorithm : GSDIR" ); + } System.out.println( "Use most parsimonous duplication model : " + most_parsimonous_duplication_model ); System.out.println( "Allow stripping of gene tree nodes : " + allow_stripping_of_gene_tree ); log_writer.println( "Use most parsimonous duplication model : " + most_parsimonous_duplication_model ); log_writer.println( "Allow stripping of gene tree nodes : " + allow_stripping_of_gene_tree ); log_writer.flush(); - sdi = new GSDI( gene_tree, - species_tree, - most_parsimonous_duplication_model, - allow_stripping_of_gene_tree, - true ); + if ( base_algorithm == BASE_ALGORITHM.GSDI ) { + sdi = new GSDI( gene_tree, + species_tree, + most_parsimonous_duplication_model, + allow_stripping_of_gene_tree, + true ); + } + else if ( base_algorithm == BASE_ALGORITHM.GSDIR ) { + sdi = new GSDIR( gene_tree, species_tree, allow_stripping_of_gene_tree, 1 ); + } } else { System.out.println(); - System.out.println( "Using SDIse algorithm" ); - log_writer.println( "Using SDIse algorithm" ); + System.out.println( "Algorithm : SDI" ); + log_writer.println( "Algorithm : SDI" ); log_writer.flush(); sdi = new SDIse( gene_tree, species_tree ); } @@ -316,9 +337,26 @@ public final class gsdi { System.out.println( "Mapping based on : " + gsdi.getTaxCompBase() ); log_writer.println( "Mapping based on : " + gsdi.getTaxCompBase() ); } + if ( ( base_algorithm == BASE_ALGORITHM.GSDIR ) ) { + final GSDIR gsdir = ( GSDIR ) sdi; + System.out.println( "Mapping based on : " + gsdir.getTaxCompBase() ); + log_writer.println( "Mapping based on : " + gsdir.getTaxCompBase() ); + System.out.println( "Minimal duplications sum : " + gsdir.getMinDuplicationsSum() ); + log_writer.println( "Minimal duplications sum : " + gsdir.getMinDuplicationsSum() ); + System.out.println( "Duplications sum statistics : " + gsdir.getMinDuplicationsSum() ); + log_writer.println( "Duplications sum statistics : " + gsdir.getMinDuplicationsSum() ); + } try { final PhylogenyWriter writer = new PhylogenyWriter(); - writer.toPhyloXML( out_file, gene_tree, 0 ); + if ( base_algorithm == BASE_ALGORITHM.GSDIR ) { + writer.toPhyloXML( out_file, + ( ( GSDIR ) sdi ).getMinDuplicationsSumGeneTrees(), + 0, + ForesterUtil.LINE_SEPARATOR ); + } + else { + writer.toPhyloXML( out_file, gene_tree, 0 ); + } } catch ( final IOException e ) { ForesterUtil.fatalError( PRG_NAME, @@ -331,7 +369,7 @@ public final class gsdi { System.out.println( "Mapping cost : " + sdi.computeMappingCostL() ); log_writer.println( "Mapping cost : " + sdi.computeMappingCostL() ); } - else if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) ) { + else if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) || ( base_algorithm == BASE_ALGORITHM.GSDIR ) ) { final GSDI gsdi = ( GSDI ) sdi; final File species_tree_used_file = new File( ForesterUtil.removeSuffix( out_file.toString() ) + SUFFIX_FOR_SPECIES_TREE_USED ); @@ -476,6 +514,8 @@ public final class gsdi { + ": to allow species tree in other formats than phyloXML (i.e. Newick, NHX, Nexus)" ); System.out.println( " -" + gsdi.SDISE_OPTION + ": to use SDIse algorithm instead of GSDI algorithm (for binary species trees)" ); + System.out.println( " -" + gsdi.GSDIR_OPTION + + ": to use GSDIR algorithm instead of GSDI algorithm (re-rooting)" ); System.out.println(); System.out.println( "Gene tree:" ); System.out.println( " in phyloXM format, with taxonomy and sequence data in appropriate fields" ); diff --git a/forester/java/src/org/forester/sdi/GSDI.java b/forester/java/src/org/forester/sdi/GSDI.java index 2419d13..19e2b7d 100644 --- a/forester/java/src/org/forester/sdi/GSDI.java +++ b/forester/java/src/org/forester/sdi/GSDI.java @@ -64,7 +64,7 @@ import org.forester.util.ForesterUtil; * * @author Christian M. Zmasek */ -public final class GSDI extends SDI { +public class GSDI extends SDI { private final boolean _most_parsimonious_duplication_model; private final boolean _strip_gene_tree; @@ -102,6 +102,26 @@ public final class GSDI extends SDI { throws SDIException { this( gene_tree, species_tree, most_parsimonious_duplication_model, false, false ); } + + public GSDI( final Phylogeny gene_tree, + final Phylogeny species_tree, + final boolean most_parsimonious_duplication_model, + final boolean strip_gene_tree, + final boolean strip_species_tree, + int x ) throws SDIException { + super( gene_tree, species_tree ); + _speciation_or_duplication_events_sum = 0; + _speciations_sum = 0; + _most_parsimonious_duplication_model = most_parsimonious_duplication_model; + _duplications_sum = 0; + _strip_gene_tree = strip_gene_tree; + _strip_species_tree = strip_species_tree; + _stripped_gene_tree_nodes = new ArrayList(); + _stripped_species_tree_nodes = new ArrayList(); + _mapped_species_tree_nodes = new HashSet(); + _scientific_names_mapped_to_reduced_specificity = new TreeSet(); + + } // s is the node on the species tree g maps to. private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) { diff --git a/forester/java/src/org/forester/sdi/GSDIR.java b/forester/java/src/org/forester/sdi/GSDIR.java new file mode 100644 index 0000000..8eeca62 --- /dev/null +++ b/forester/java/src/org/forester/sdi/GSDIR.java @@ -0,0 +1,87 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2013 Christian M. Zmasek +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org + +package org.forester.sdi; + +import java.util.ArrayList; +import java.util.List; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.BasicDescriptiveStatistics; + +public class GSDIR extends GSDI { + + private int _min_duplications_sum; + private final BasicDescriptiveStatistics _duplications_sum_stats; + private final List _min_duplications_sum_gene_trees; + + public GSDIR( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean strip_gene_tree, final int x ) + throws SDIException { + super( gene_tree, species_tree, true, strip_gene_tree, true, 1 ); + _min_duplications_sum = Integer.MAX_VALUE; + _min_duplications_sum_gene_trees = new ArrayList(); + _duplications_sum_stats = new BasicDescriptiveStatistics(); + linkNodesOfG(); + final List gene_tree_nodes_post_order = new ArrayList(); + for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) { + gene_tree_nodes_post_order.add( it.next() ); + } + for( final PhylogenyNode root : gene_tree_nodes_post_order ) { + _gene_tree.reRoot( root ); + PhylogenyMethods.preOrderReId( getSpeciesTree() ); + //TEST, remove later + for( final PhylogenyNodeIterator it = getGeneTree().iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode g = it.next(); + if ( g.isInternal() ) { + g.setLink( null ); + } + } + geneTreePostOrderTraversal(); + _duplications_sum_stats.addValue( getMinDuplicationsSum() ); + if ( getDuplicationsSum() < getMinDuplicationsSum() ) { + _min_duplications_sum = getDuplicationsSum(); + _min_duplications_sum_gene_trees.clear(); + _min_duplications_sum_gene_trees.add( getGeneTree().copy() ); + } + else if ( getDuplicationsSum() == getMinDuplicationsSum() ) { + _min_duplications_sum_gene_trees.add( getGeneTree().copy() ); + } + } + } + + public int getMinDuplicationsSum() { + return _min_duplications_sum; + } + + public List getMinDuplicationsSumGeneTrees() { + return _min_duplications_sum_gene_trees; + } + + public BasicDescriptiveStatistics getDuplicationsSumStats() { + return _duplications_sum_stats; + } +} diff --git a/forester/java/src/org/forester/sdi/TestGSDI.java b/forester/java/src/org/forester/sdi/TestGSDI.java index 6480281..8ad459e 100644 --- a/forester/java/src/org/forester/sdi/TestGSDI.java +++ b/forester/java/src/org/forester/sdi/TestGSDI.java @@ -97,7 +97,7 @@ public final class TestGSDI { private static boolean testGSDI_general() { try { - final PhylogenyMethods pm = PhylogenyMethods.getInstance(); + final String s2_ = "((" + "([&&NHX:S=a1],[&&NHX:S=a2],[&&NHX:S=a3],[&&NHX:S=a4])," + "([&&NHX:S=b1],[&&NHX:S=b2],[&&NHX:S=b3],[&&NHX:S=b4])," + "([&&NHX:S=c1],[&&NHX:S=c2],[&&NHX:S=c3],[&&NHX:S=c4])," @@ -1423,6 +1423,64 @@ public final class TestGSDI { } return true; } + + private static boolean testGSDIR_general() { + try { + + final String s2_ = "((" + "([&&NHX:S=a1],[&&NHX:S=a2],[&&NHX:S=a3],[&&NHX:S=a4])," + + "([&&NHX:S=b1],[&&NHX:S=b2],[&&NHX:S=b3],[&&NHX:S=b4])," + + "([&&NHX:S=c1],[&&NHX:S=c2],[&&NHX:S=c3],[&&NHX:S=c4])," + + "([&&NHX:S=d1],[&&NHX:S=d2],[&&NHX:S=d3],[&&NHX:S=d4])),(" + + "([&&NHX:S=e1],[&&NHX:S=e2],[&&NHX:S=e3],[&&NHX:S=e4])," + + "([&&NHX:S=f1],[&&NHX:S=f2],[&&NHX:S=f3],[&&NHX:S=f4])," + + "([&&NHX:S=g1],[&&NHX:S=g2],[&&NHX:S=g3],[&&NHX:S=g4])," + + "([&&NHX:S=h1],[&&NHX:S=h2],[&&NHX:S=h3],[&&NHX:S=h4])),(" + + "([&&NHX:S=i1],[&&NHX:S=i2],[&&NHX:S=i3],[&&NHX:S=i4])," + + "([&&NHX:S=j1],[&&NHX:S=j2],[&&NHX:S=j3],[&&NHX:S=j4])," + + "([&&NHX:S=k1],[&&NHX:S=k2],[&&NHX:S=k3],[&&NHX:S=k4])," + + "([&&NHX:S=l1],[&&NHX:S=l2],[&&NHX:S=l3],[&&NHX:S=l4])),(" + + "([&&NHX:S=m1],[&&NHX:S=m2],[&&NHX:S=m3],[&&NHX:S=m4])," + + "([&&NHX:S=n1],[&&NHX:S=n2],[&&NHX:S=n3],[&&NHX:S=n4])," + + "([&&NHX:S=o1],[&&NHX:S=o2],[&&NHX:S=o3],[&&NHX:S=o4])," + + "([&&NHX:S=p1],[&&NHX:S=p2],[&&NHX:S=p3],[&&NHX:S=p4])" + + "),[&&NHX:S=x],[&&NHX:S=y],[&&NHX:S=z])"; + final Phylogeny s2 = ParserBasedPhylogenyFactory.getInstance().create( s2_, new NHXParser() )[ 0 ]; + s2.setRooted( true ); + final String s1_ = "((([&&NHX:S=A2],[&&NHX:S=A1]),[&&NHX:S=B],[&&NHX:S=C]),[&&NHX:S=D])"; + final Phylogeny s1 = ParserBasedPhylogenyFactory.getInstance().create( s1_, new NHXParser() )[ 0 ]; + s1.setRooted( true ); + final Phylogeny g1 = TestGSDI + .createPhylogeny( "((((B[&&NHX:S=B],A1[&&NHX:S=A1]),C[&&NHX:S=C]),A2[&&NHX:S=A2]),D[&&NHX:S=D])" ); + final GSDIR sdi1 = new GSDIR( g1, s1, false, 1 ); + // Archaeopteryx.createApplication( g1 ); + // Archaeopteryx.createApplication( s1 ); + if ( sdi1.getDuplicationsSum() != 1 ) { + return false; + } + if ( !PhylogenyMethods.calculateLCA( g1.getNode( "B" ), g1.getNode( "A1" ) ).getNodeData().getEvent() + .isSpeciation() ) { + return false; + } + if ( !PhylogenyMethods.calculateLCA( g1.getNode( "C" ), g1.getNode( "A1" ) ).getNodeData().getEvent() + .isSpeciationOrDuplication() ) { + return false; + } + if ( !( PhylogenyMethods.calculateLCA( g1.getNode( "A2" ), g1.getNode( "A1" ) ).getNodeData().getEvent() + .isDuplication() ) ) { + return false; + } + if ( !PhylogenyMethods.calculateLCA( g1.getNode( "D" ), g1.getNode( "A1" ) ).getNodeData().getEvent() + .isSpeciation() ) { + return false; + } + + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } public static void main( final String[] args ) { if ( !TestGSDI.testGSDI_against_binary_gene_tree() ) { @@ -1431,8 +1489,11 @@ public final class TestGSDI { if ( !TestGSDI.testGSDI_general() ) { System.out.println( "general failed" ); } + if ( !TestGSDI.testGSDIR_general() ) { + System.out.println( "general re-rooting failed" ); + } else { - System.out.println( "general OK" ); + System.out.println( "OK" ); } // boolean success = test(); // if ( success ) { diff --git a/forester/java/src/org/forester/util/BasicDescriptiveStatistics.java b/forester/java/src/org/forester/util/BasicDescriptiveStatistics.java index 55aa23b..538f6a1 100644 --- a/forester/java/src/org/forester/util/BasicDescriptiveStatistics.java +++ b/forester/java/src/org/forester/util/BasicDescriptiveStatistics.java @@ -146,9 +146,7 @@ public class BasicDescriptiveStatistics implements DescriptiveStatistics { return _sum; } - /* (non-Javadoc) - * @see org.forester.util.DescriptiveStatisticsI#getSummaryAsString() - */ + @Override public String getSummaryAsString() { validate(); @@ -157,9 +155,7 @@ public class BasicDescriptiveStatistics implements DescriptiveStatistics { return "" + mean + ( ( char ) 177 ) + sd + " [" + getMin() + "..." + getMax() + "]"; } - /* (non-Javadoc) - * @see org.forester.util.DescriptiveStatisticsI#getValue(int) - */ + @Override public double getValue( final int index ) { validate(); -- 1.7.10.2