From: cmzmasek@gmail.com Date: Fri, 14 Dec 2012 05:02:39 +0000 (+0000) Subject: "rio" work X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=9bad57b1ba0f75075ab8c6bda1dedb906f8c6280;p=jalview.git "rio" work --- diff --git a/forester/java/src/org/forester/application/gsdi.java b/forester/java/src/org/forester/application/gsdi.java index 304d12d..871658c 100644 --- a/forester/java/src/org/forester/application/gsdi.java +++ b/forester/java/src/org/forester/application/gsdi.java @@ -50,10 +50,10 @@ import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.sdi.GSDI; import org.forester.sdi.GSDIR; import org.forester.sdi.SDI; -import org.forester.sdi.SDI.ALGORITHM; -import org.forester.sdi.SDI.TaxonomyComparisonBase; import org.forester.sdi.SDIException; -import org.forester.sdi.SDIse; +import org.forester.sdi.SDIutil; +import org.forester.sdi.SDIutil.ALGORITHM; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; import org.forester.util.CommandLineArguments; import org.forester.util.EasyWriter; import org.forester.util.ForesterConstants; @@ -200,7 +200,7 @@ public final class gsdi { ( ( NHXParser ) p ).setReplaceUnderscores( true ); } species_tree = factory.create( species_tree_file, p )[ 0 ]; - final TaxonomyComparisonBase comp_base = GSDI.determineTaxonomyComparisonBase( gene_tree ); + final TaxonomyComparisonBase comp_base = SDIutil.determineTaxonomyComparisonBase( gene_tree ); switch ( comp_base ) { case SCIENTIFIC_NAME: try { @@ -280,7 +280,7 @@ public final class gsdi { + ( ForesterUtil.isEmpty( species_tree.getName() ) ? "" : gene_tree.getName() ) ); System.out.println( "Species tree name : " + ( ForesterUtil.isEmpty( species_tree.getName() ) ? "" : gene_tree.getName() ) ); - SDI sdi = null; + Object sdi = null; final long start_time = new Date().getTime(); try { if ( ( base_algorithm == ALGORITHM.GSDI ) || ( base_algorithm == ALGORITHM.GSDIR ) ) { @@ -313,7 +313,7 @@ public final class gsdi { System.out.println( "Algorithm : SDI" ); log_writer.println( "Algorithm : SDI" ); log_writer.flush(); - sdi = new SDIse( gene_tree, species_tree ); + sdi = new SDI( gene_tree, species_tree ); } } catch ( final SDIException e ) { @@ -363,6 +363,7 @@ public final class gsdi { System.out.println( "Wrote resulting gene tree to : " + out_file.getCanonicalPath() ); log_writer.println( "Wrote resulting gene tree to : " + out_file.getCanonicalPath() ); if ( base_algorithm == ALGORITHM.SDI ) { + sdi = sdi; sdi.computeMappingCostL(); System.out.println( "Mapping cost : " + sdi.computeMappingCostL() ); log_writer.println( "Mapping cost : " + sdi.computeMappingCostL() ); diff --git a/forester/java/src/org/forester/application/rio.java b/forester/java/src/org/forester/application/rio.java index 2715461..8ff6def 100644 --- a/forester/java/src/org/forester/application/rio.java +++ b/forester/java/src/org/forester/application/rio.java @@ -43,8 +43,8 @@ import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.rio.RIO; import org.forester.rio.RIOException; -import org.forester.sdi.SDI.ALGORITHM; import org.forester.sdi.SDIException; +import org.forester.sdi.SDIutil.ALGORITHM; import org.forester.util.CommandLineArguments; import org.forester.util.EasyWriter; import org.forester.util.ForesterUtil; diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index aa78613..6a0846b 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -1292,9 +1292,13 @@ public final class MainFrameApplication extends MainFrame { _mainpanel.getTabbedPane().setSelectedIndex( selected ); showWhole(); _mainpanel.getCurrentTreePanel().setEdited( true ); - JOptionPane.showMessageDialog( this, "Duplications: " + gsdir.getDuplicationsSum() + "\n" - + "Potential duplications: " + gsdir.getSpeciationOrDuplicationEventsSum() + "\n" + "Speciations: " - + gsdir.getSpeciationsSum(), "GSDI successfully completed", JOptionPane.INFORMATION_MESSAGE ); + JOptionPane.showMessageDialog( this, + "Duplications: " + gsdir.getMinDuplicationsSum() + "\n" + "Speciations: " + + gsdir.getSpeciationsSum() + "\n" + + "Number of root positions minimizing duplications sum: " + + gsdir.getMinDuplicationsSumGeneTrees().size(), + "GSDIR successfully completed", + JOptionPane.INFORMATION_MESSAGE ); } void executeFunctionAnalysis() { diff --git a/forester/java/src/org/forester/rio/RIO.java b/forester/java/src/org/forester/rio/RIO.java index 84d1d15..a42d1f5 100644 --- a/forester/java/src/org/forester/rio/RIO.java +++ b/forester/java/src/org/forester/rio/RIO.java @@ -47,10 +47,10 @@ import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.sdi.GSDIR; -import org.forester.sdi.SDI.ALGORITHM; -import org.forester.sdi.SDI.TaxonomyComparisonBase; import org.forester.sdi.SDIException; import org.forester.sdi.SDIR; +import org.forester.sdi.SDIutil.ALGORITHM; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.ForesterUtil; diff --git a/forester/java/src/org/forester/rio/TestRIO.java b/forester/java/src/org/forester/rio/TestRIO.java index 00eab1d..06468af 100644 --- a/forester/java/src/org/forester/rio/TestRIO.java +++ b/forester/java/src/org/forester/rio/TestRIO.java @@ -8,8 +8,8 @@ import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyMethods.PhylogenyNodeField; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; -import org.forester.sdi.SDI.ALGORITHM; -import org.forester.sdi.SDI.TaxonomyComparisonBase; +import org.forester.sdi.SDIutil.ALGORITHM; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; import org.forester.util.ForesterUtil; public final class TestRIO { diff --git a/forester/java/src/org/forester/sdi/GSDI.java b/forester/java/src/org/forester/sdi/GSDI.java index fa5265d..d6e6fb6 100644 --- a/forester/java/src/org/forester/sdi/GSDI.java +++ b/forester/java/src/org/forester/sdi/GSDI.java @@ -32,139 +32,98 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedSet; -import java.util.TreeSet; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Event; -import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; import org.forester.util.ForesterUtil; -/* - * Implements our algorithm for speciation - duplication inference (SDI).

- * The initialization is accomplished by:

The recursion part is accomplished by this class' - * method "geneTreePostOrderTraversal(PhylogenyNode)".

Requires JDK 1.5 or - * greater. - * - * @see SDI#linkNodesOfG() - * - * @see Phylogeny#preorderReID(int) - * - * @see - * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny) - * - * @see #geneTreePostOrderTraversal(PhylogenyNode) - * - * @author Christian M. Zmasek - */ -public class GSDI extends SDI { +public final class GSDI { - private final boolean _most_parsimonious_duplication_model; - protected int _speciation_or_duplication_events_sum; - protected int _speciations_sum; - private final List _stripped_gene_tree_nodes; - private final List _stripped_species_tree_nodes; - private final Set _mapped_species_tree_nodes; - private TaxonomyComparisonBase _tax_comp_base; - private final SortedSet _scientific_names_mapped_to_reduced_specificity; + private final boolean _most_parsimonious_duplication_model; + private final int _speciation_or_duplication_events_sum; + private final int _speciations_sum; + private final int _duplications_sum; + private final List _stripped_gene_tree_nodes; + private final List _stripped_species_tree_nodes; + private final Set _mapped_species_tree_nodes; + private final TaxonomyComparisonBase _tax_comp_base; + private final SortedSet _scientific_names_mapped_to_reduced_specificity; public GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model, final boolean strip_gene_tree, final boolean strip_species_tree ) throws SDIException { - super( gene_tree, species_tree ); - _speciation_or_duplication_events_sum = 0; - _speciations_sum = 0; _most_parsimonious_duplication_model = most_parsimonious_duplication_model; - _duplications_sum = 0; - _stripped_gene_tree_nodes = new ArrayList(); - _stripped_species_tree_nodes = new ArrayList(); - _mapped_species_tree_nodes = new HashSet(); - _scientific_names_mapped_to_reduced_specificity = new TreeSet(); - linkNodesOfG( null, strip_gene_tree, strip_species_tree ); - PhylogenyMethods.preOrderReId( getSpeciesTree() ); - geneTreePostOrderTraversal(); + final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree, + species_tree, + null, + strip_gene_tree, + strip_species_tree ); + _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes(); + _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes(); + _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes(); + _scientific_names_mapped_to_reduced_specificity = nodes_linking_result + .getScientificNamesMappedToReducedSpecificity(); + _tax_comp_base = nodes_linking_result.getTaxCompBase(); + PhylogenyMethods.preOrderReId( species_tree ); + final GSDIsummaryResult gsdi_summary_result = new GSDIsummaryResult(); + geneTreePostOrderTraversal( gene_tree, _most_parsimonious_duplication_model, gsdi_summary_result ); + _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum(); + _speciations_sum = gsdi_summary_result.getSpeciationsSum(); + _duplications_sum = gsdi_summary_result.getDuplicationsSum(); } - // Used by GSDIR - protected GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean strip_gene_tree ) - throws SDIException { - super( gene_tree, species_tree ); - _speciation_or_duplication_events_sum = -1; - _speciations_sum = 0; - _most_parsimonious_duplication_model = true; - _duplications_sum = 0; - _stripped_gene_tree_nodes = new ArrayList(); - _stripped_species_tree_nodes = new ArrayList(); - _mapped_species_tree_nodes = new HashSet(); - _scientific_names_mapped_to_reduced_specificity = new TreeSet(); + public int getDuplicationsSum() { + return _duplications_sum; } - // s is the node on the species tree g maps to. - private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) { - boolean oyako = false; - if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) { - oyako = true; - } - if ( g.getLink().getNumberOfDescendants() == 2 ) { - if ( oyako ) { - g.getNodeData().setEvent( createDuplicationEvent() ); - } - else { - g.getNodeData().setEvent( createSpeciationEvent() ); - } - } - else { - if ( oyako ) { - final Set set = new HashSet(); - for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) { - n = n.getLink(); - while ( n.getParent() != s ) { - n = n.getParent(); - if ( n.isRoot() ) { - break; - } - } - set.add( n ); - } - boolean multiple = false; - for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) { - n = n.getLink(); - while ( n.getParent() != s ) { - n = n.getParent(); - if ( n.isRoot() ) { - break; - } - } - if ( set.contains( n ) ) { - multiple = true; - break; - } - } - if ( multiple ) { - g.getNodeData().setEvent( createDuplicationEvent() ); - } - else { - if ( _most_parsimonious_duplication_model ) { - g.getNodeData().setEvent( createSpeciationEvent() ); - } - else { - g.getNodeData().setEvent( createSingleSpeciationOrDuplicationEvent() ); - } - } - } - else { - g.getNodeData().setEvent( createSpeciationEvent() ); - } + public Set getMappedExternalSpeciesTreeNodes() { + return _mapped_species_tree_nodes; + } + + public final SortedSet getReMappedScientificNamesFromGeneTree() { + return _scientific_names_mapped_to_reduced_specificity; + } + + public final int getSpeciationOrDuplicationEventsSum() { + return _speciation_or_duplication_events_sum; + } + + public final int getSpeciationsSum() { + return _speciations_sum; + } + + public List getStrippedExternalGeneTreeNodes() { + return _stripped_gene_tree_nodes; + } + + public List getStrippedSpeciesTreeNodes() { + return _stripped_species_tree_nodes; + } + + public TaxonomyComparisonBase getTaxCompBase() { + return _tax_comp_base; + } + + @Override + public final String toString() { + final StringBuffer sb = new StringBuffer(); + sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "Speciations sum : " + getSpeciationsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "Duplications sum : " + getDuplicationsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); + if ( !_most_parsimonious_duplication_model ) { + sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); } + return sb.toString(); } /** @@ -177,8 +136,10 @@ public class GSDI extends SDI { *

* */ - final void geneTreePostOrderTraversal() { - for( final PhylogenyNodeIterator it = getGeneTree().iteratorPostorder(); it.hasNext(); ) { + final static void geneTreePostOrderTraversal( final Phylogeny gene_tree, + final boolean most_parsimonious_duplication_model, + final GSDIsummaryResult res ) { + for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) { final PhylogenyNode g = it.next(); if ( g.isInternal() ) { PhylogenyNode s1 = g.getChildNode1().getLink(); @@ -192,37 +153,11 @@ public class GSDI extends SDI { } } g.setLink( s1 ); - determineEvent( s1, g ); + determineEvent( s1, g, most_parsimonious_duplication_model, res ); } } } - private final Event createDuplicationEvent() { - final Event event = Event.createSingleDuplicationEvent(); - ++_duplications_sum; - return event; - } - - private final Event createSingleSpeciationOrDuplicationEvent() { - final Event event = Event.createSingleSpeciationOrDuplicationEvent(); - ++_speciation_or_duplication_events_sum; - return event; - } - - private final Event createSpeciationEvent() { - final Event event = Event.createSingleSpeciationEvent(); - ++_speciations_sum; - return event; - } - - public final int getSpeciationOrDuplicationEventsSum() { - return _speciation_or_duplication_events_sum; - } - - public final int getSpeciationsSum() { - return _speciations_sum; - } - /** * This allows for linking of internal nodes of the species tree (as opposed * to just external nodes, as in the method it overrides. @@ -230,23 +165,26 @@ public class GSDI extends SDI { * @throws SDIException * */ - final void linkNodesOfG( final TaxonomyComparisonBase tax_comp_base, - final boolean strip_gene_tree, - final boolean strip_species_tree ) throws SDIException { + final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree, + final Phylogeny species_tree, + final TaxonomyComparisonBase tax_comp_base, + final boolean strip_gene_tree, + final boolean strip_species_tree ) throws SDIException { final Map species_to_node_map = new HashMap(); final List species_tree_ext_nodes = new ArrayList(); + final NodesLinkingResult res = new NodesLinkingResult(); if ( tax_comp_base == null ) { - _tax_comp_base = determineTaxonomyComparisonBase( _gene_tree ); + res.setTaxCompBase( SDIutil.determineTaxonomyComparisonBase( gene_tree ) ); } else { - _tax_comp_base = tax_comp_base; + res.setTaxCompBase( tax_comp_base ); } // Stringyfied taxonomy is the key, node is the value. - for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { + for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode s = iter.next(); species_tree_ext_nodes.add( s ); if ( s.getNodeData().isHasTaxonomy() ) { - final String tax_str = taxonomyToString( s, _tax_comp_base ); + final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() ); if ( !ForesterUtil.isEmpty( tax_str ) ) { if ( species_to_node_map.containsKey( tax_str ) ) { throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" ); @@ -256,21 +194,21 @@ public class GSDI extends SDI { } } // Retrieve the reference to the node with a matching stringyfied taxonomy. - for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode g = iter.next(); if ( !g.getNodeData().isHasTaxonomy() ) { if ( strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + res.getStrippedGeneTreeNodes().add( g ); } else { throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" ); } } else { - final String tax_str = taxonomyToString( g, _tax_comp_base ); + final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() ); if ( ForesterUtil.isEmpty( tax_str ) ) { if ( strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + res.getStrippedGeneTreeNodes().add( g ); } else { throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" ); @@ -278,13 +216,15 @@ public class GSDI extends SDI { } else { PhylogenyNode s = species_to_node_map.get( tax_str ); - if ( ( _tax_comp_base == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null ) + if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null ) && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) { - s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str ); + s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + res.getScientificNamesMappedToReducedSpecificity() ); } if ( s == null ) { if ( strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + res.getStrippedGeneTreeNodes().add( g ); } else { throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy() @@ -293,38 +233,158 @@ public class GSDI extends SDI { } else { g.setLink( s ); - _mapped_species_tree_nodes.add( s ); + res.getMappedSpeciesTreeNodes().add( s ); } } } } // for loop if ( strip_gene_tree ) { - stripGeneTree(); - if ( getGeneTree().isEmpty() || ( getGeneTree().getNumberOfExternalNodes() < 2 ) ) { + stripTree( gene_tree, res.getStrippedGeneTreeNodes() ); + if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) { throw new SDIException( "species could not be mapped between gene tree and species tree" ); } } if ( strip_species_tree ) { - stripSpeciesTree( species_tree_ext_nodes ); + stripSpeciesTree( species_tree, species_tree_ext_nodes, res.getMappedSpeciesTreeNodes() ); } + return res; } - private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, - final String tax_str ) { - PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, " (" ); + private final static void addScientificNamesMappedToReducedSpecificity( final String s1, + final String s2, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { + scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 ); + } + + // Used by GSDIR + // protected GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean strip_gene_tree ) + // throws SDIException { + // super( gene_tree, species_tree ); + // _speciation_or_duplication_events_sum = -1; + // _speciations_sum = 0; + // _most_parsimonious_duplication_model = true; + // _duplications_sum = 0; + // _stripped_gene_tree_nodes = new ArrayList(); + // _stripped_species_tree_nodes = new ArrayList(); + // _mapped_species_tree_nodes = new HashSet(); + // _scientific_names_mapped_to_reduced_specificity = new TreeSet(); + // } + // s is the node on the species tree g maps to. + private final static void determineEvent( final PhylogenyNode s, + final PhylogenyNode g, + final boolean most_parsimonious_duplication_model, + final GSDIsummaryResult res ) { + boolean oyako = false; + if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) { + oyako = true; + } + if ( g.getLink().getNumberOfDescendants() == 2 ) { + if ( oyako ) { + g.getNodeData().setEvent( Event.createSingleDuplicationEvent() ); + res.increaseDuplicationsSum(); + } + else { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } + } + else { + if ( oyako ) { + final Set set = new HashSet(); + for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) { + n = n.getLink(); + while ( n.getParent() != s ) { + n = n.getParent(); + if ( n.isRoot() ) { + break; + } + } + set.add( n ); + } + boolean multiple = false; + for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) { + n = n.getLink(); + while ( n.getParent() != s ) { + n = n.getParent(); + if ( n.isRoot() ) { + break; + } + } + if ( set.contains( n ) ) { + multiple = true; + break; + } + } + if ( multiple ) { + g.getNodeData().setEvent( Event.createSingleDuplicationEvent() ); + res.increaseDuplicationsSum(); + } + else { + if ( most_parsimonious_duplication_model ) { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } + else { + g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() ); + res.increaseSpeciationOrDuplicationEventsSum(); + } + } + } + else { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } + } + } + + private final static List stripSpeciesTree( final Phylogeny species_tree, + final List species_tree_ext_nodes, + final Set keep ) { + final List stripped_species_tree_nodes = new ArrayList(); + for( final PhylogenyNode s : species_tree_ext_nodes ) { + if ( !keep.contains( s ) ) { + species_tree.deleteSubtree( s, true ); + stripped_species_tree_nodes.add( s ); + } + } + species_tree.clearHashIdToNodeMap(); + species_tree.externalNodesHaveChanged(); + return stripped_species_tree_nodes; + } + + private final static void stripTree( final Phylogeny phy, final List strip_nodes ) { + for( final PhylogenyNode g : strip_nodes ) { + phy.deleteSubtree( g, true ); + } + phy.clearHashIdToNodeMap(); + phy.externalNodesHaveChanged(); + } + + private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { + PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + " (", + scientific_names_mapped_to_reduced_specificity ); if ( s == null ) { if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) { final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim(); s = species_to_node_map.get( new_tax_str ); if ( s != null ) { - addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str ); + addScientificNamesMappedToReducedSpecificity( tax_str, + new_tax_str, + scientific_names_mapped_to_reduced_specificity ); } } } if ( s == null ) { for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ", " form ", " subform ", " cultivar ", " section ", " subsection " } ) { - s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, t ); + s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + t, + scientific_names_mapped_to_reduced_specificity ); if ( s != null ) { break; } @@ -333,121 +393,21 @@ public class GSDI extends SDI { return s; } - private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, - final String tax_str, - final String term ) { + private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str, + final String term, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { final int i = tax_str.indexOf( term ); if ( i > 4 ) { final String new_tax_str = tax_str.substring( 0, i ).trim(); final PhylogenyNode s = species_to_node_map.get( new_tax_str ); if ( s != null ) { - addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str ); + addScientificNamesMappedToReducedSpecificity( tax_str, + new_tax_str, + scientific_names_mapped_to_reduced_specificity ); } return s; } return null; } - - private final void addScientificNamesMappedToReducedSpecificity( final String s1, final String s2 ) { - _scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 ); - } - - public final SortedSet getReMappedScientificNamesFromGeneTree() { - return _scientific_names_mapped_to_reduced_specificity; - } - - public TaxonomyComparisonBase getTaxCompBase() { - return _tax_comp_base; - } - - private void stripSpeciesTree( final List species_tree_ext_nodes ) { - for( final PhylogenyNode s : species_tree_ext_nodes ) { - if ( !_mapped_species_tree_nodes.contains( s ) ) { - _species_tree.deleteSubtree( s, true ); - _stripped_species_tree_nodes.add( s ); - } - } - _species_tree.clearHashIdToNodeMap(); - _species_tree.externalNodesHaveChanged(); - } - - public List getStrippedSpeciesTreeNodes() { - return _stripped_species_tree_nodes; - } - - private void stripGeneTree() { - for( final PhylogenyNode g : _stripped_gene_tree_nodes ) { - _gene_tree.deleteSubtree( g, true ); - } - _gene_tree.clearHashIdToNodeMap(); - _gene_tree.externalNodesHaveChanged(); - } - - public Set getMappedExternalSpeciesTreeNodes() { - return _mapped_species_tree_nodes; - } - - public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) { - int with_id_count = 0; - int with_code_count = 0; - int with_sn_count = 0; - int max = 0; - for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) { - final PhylogenyNode g = iter.next(); - if ( g.getNodeData().isHasTaxonomy() ) { - final Taxonomy tax = g.getNodeData().getTaxonomy(); - if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) { - if ( ++with_id_count > max ) { - max = with_id_count; - } - } - if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { - if ( ++with_code_count > max ) { - max = with_code_count; - } - } - if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) { - if ( ++with_sn_count > max ) { - max = with_sn_count; - } - } - } - } - if ( max == 0 ) { - throw new IllegalArgumentException( "gene tree has no taxonomic data" ); - } - else if ( max == 1 ) { - throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" ); - } - else if ( max == with_id_count ) { - return SDI.TaxonomyComparisonBase.ID; - } - else if ( max == with_sn_count ) { - return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME; - } - else { - return SDI.TaxonomyComparisonBase.CODE; - } - } - - public List getStrippedExternalGeneTreeNodes() { - return _stripped_gene_tree_nodes; - } - - @Override - public final String toString() { - final StringBuffer sb = new StringBuffer(); - sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model ); - sb.append( ForesterUtil.getLineSeparator() ); - sb.append( "Speciations sum : " + getSpeciationsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); - sb.append( "Duplications sum : " + getDuplicationsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); - if ( !_most_parsimonious_duplication_model ) { - sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); - } - sb.append( "mapping cost L : " + computeMappingCostL() ); - return sb.toString(); - } } diff --git a/forester/java/src/org/forester/sdi/GSDIR.java b/forester/java/src/org/forester/sdi/GSDIR.java index 5fd0a3b..6c4d267 100644 --- a/forester/java/src/org/forester/sdi/GSDIR.java +++ b/forester/java/src/org/forester/sdi/GSDIR.java @@ -26,28 +26,49 @@ package org.forester.sdi; import java.util.ArrayList; import java.util.List; +import java.util.Set; +import java.util.SortedSet; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyBranch; import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; import org.forester.util.BasicDescriptiveStatistics; -public class GSDIR extends GSDI { +public class GSDIR { private int _min_duplications_sum; private final BasicDescriptiveStatistics _duplications_sum_stats; private final List _min_duplications_sum_gene_trees; + protected int _speciations_sum; + protected int _duplications_sum; + private final List _stripped_gene_tree_nodes; + private final List _stripped_species_tree_nodes; + private final Set _mapped_species_tree_nodes; + private final TaxonomyComparisonBase _tax_comp_base; + private final SortedSet _scientific_names_mapped_to_reduced_specificity; public GSDIR( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean strip_gene_tree, final boolean strip_species_tree ) throws SDIException { - super( gene_tree.copy(), species_tree, strip_gene_tree ); - linkNodesOfG( null, strip_gene_tree, strip_species_tree ); + _speciations_sum = 0; + _duplications_sum = 0; + final NodesLinkingResult nodes_linking_result = GSDI.linkNodesOfG( gene_tree, + species_tree, + null, + strip_gene_tree, + strip_species_tree ); + _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes(); + _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes(); + _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes(); + _scientific_names_mapped_to_reduced_specificity = nodes_linking_result + .getScientificNamesMappedToReducedSpecificity(); + _tax_comp_base = nodes_linking_result.getTaxCompBase(); final List gene_tree_branches_post_order = new ArrayList(); - for( final PhylogenyNodeIterator it = _gene_tree.iteratorPostorder(); it.hasNext(); ) { + for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) { final PhylogenyNode n = it.next(); if ( !n.isRoot() && !( n.getParent().isRoot() && n.isFirstChildNode() ) ) { gene_tree_branches_post_order.add( new PhylogenyBranch( n, n.getParent() ) ); @@ -59,8 +80,8 @@ public class GSDIR extends GSDI { for( final PhylogenyBranch branch : gene_tree_branches_post_order ) { _duplications_sum = 0; _speciations_sum = 0; - _gene_tree.reRoot( branch ); - PhylogenyMethods.preOrderReId( getSpeciesTree() ); + gene_tree.reRoot( branch ); + PhylogenyMethods.preOrderReId( species_tree ); //TEST, remove later // for( final PhylogenyNodeIterator it = _gene_tree.iteratorPostorder(); it.hasNext(); ) { // final PhylogenyNode g = it.next(); @@ -68,14 +89,16 @@ public class GSDIR extends GSDI { // g.setLink( null ); // } // } - geneTreePostOrderTraversal(); + final GSDIsummaryResult gsdi_summary_result = new GSDIsummaryResult(); + GSDI.geneTreePostOrderTraversal( gene_tree, true, gsdi_summary_result ); if ( _duplications_sum < _min_duplications_sum ) { _min_duplications_sum = _duplications_sum; _min_duplications_sum_gene_trees.clear(); - _min_duplications_sum_gene_trees.add( getGeneTree().copy() ); + _min_duplications_sum_gene_trees.add( gene_tree.copy() ); + //_speciations_sum } else if ( _duplications_sum == _min_duplications_sum ) { - _min_duplications_sum_gene_trees.add( getGeneTree().copy() ); + _min_duplications_sum_gene_trees.add( gene_tree.copy() ); } _duplications_sum_stats.addValue( _duplications_sum ); } @@ -92,4 +115,24 @@ public class GSDIR extends GSDI { public BasicDescriptiveStatistics getDuplicationsSumStats() { return _duplications_sum_stats; } + + public Set getMappedExternalSpeciesTreeNodes() { + return _mapped_species_tree_nodes; + } + + public final SortedSet getReMappedScientificNamesFromGeneTree() { + return _scientific_names_mapped_to_reduced_specificity; + } + + public List getStrippedExternalGeneTreeNodes() { + return _stripped_gene_tree_nodes; + } + + public List getStrippedSpeciesTreeNodes() { + return _stripped_species_tree_nodes; + } + + public TaxonomyComparisonBase getTaxCompBase() { + return _tax_comp_base; + } } diff --git a/forester/java/src/org/forester/sdi/GSDIsummaryResult.java b/forester/java/src/org/forester/sdi/GSDIsummaryResult.java new file mode 100644 index 0000000..2fdf251 --- /dev/null +++ b/forester/java/src/org/forester/sdi/GSDIsummaryResult.java @@ -0,0 +1,39 @@ + +package org.forester.sdi; + +final class GSDIsummaryResult { + + private int _speciation_or_duplication_events_sum; + private int _speciations_sum; + private int _duplications_sum; + + GSDIsummaryResult() { + _speciation_or_duplication_events_sum = 0; + _speciations_sum = 0; + _duplications_sum = 0; + } + + final int getDuplicationsSum() { + return _duplications_sum; + } + + final int getSpeciationOrDuplicationEventsSum() { + return _speciation_or_duplication_events_sum; + } + + final int getSpeciationsSum() { + return _speciations_sum; + } + + final void increaseDuplicationsSum() { + ++_duplications_sum; + } + + final void increaseSpeciationOrDuplicationEventsSum() { + ++_speciation_or_duplication_events_sum; + } + + final void increaseSpeciationsSum() { + ++_speciations_sum; + } +} diff --git a/forester/java/src/org/forester/sdi/NodesLinkingResult.java b/forester/java/src/org/forester/sdi/NodesLinkingResult.java new file mode 100644 index 0000000..dd5bc83 --- /dev/null +++ b/forester/java/src/org/forester/sdi/NodesLinkingResult.java @@ -0,0 +1,53 @@ + +package org.forester.sdi; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.phylogeny.PhylogenyNode; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; + +final class NodesLinkingResult { + + private final List _stripped_gene_tree_nodes; + private final List _stripped_species_tree_nodes; + private final Set _mapped_species_tree_nodes; + private TaxonomyComparisonBase _tax_comp_base; + private final SortedSet _scientific_names_mapped_to_reduced_specificity; + + NodesLinkingResult() { + _stripped_gene_tree_nodes = new ArrayList(); + _stripped_species_tree_nodes = new ArrayList(); + _mapped_species_tree_nodes = new HashSet(); + _scientific_names_mapped_to_reduced_specificity = new TreeSet(); + _tax_comp_base = null; + } + + final List getStrippedGeneTreeNodes() { + return _stripped_gene_tree_nodes; + } + + final List getStrippedSpeciesTreeNodes() { + return _stripped_species_tree_nodes; + } + + final Set getMappedSpeciesTreeNodes() { + return _mapped_species_tree_nodes; + } + + final TaxonomyComparisonBase getTaxCompBase() { + return _tax_comp_base; + } + + final void setTaxCompBase( final TaxonomyComparisonBase tax_comp_base ) { + _tax_comp_base = tax_comp_base; + } + + final SortedSet getScientificNamesMappedToReducedSpecificity() { + return _scientific_names_mapped_to_reduced_specificity; + } +} diff --git a/forester/java/src/org/forester/sdi/SDI.java b/forester/java/src/org/forester/sdi/SDI.java index fbc3c65..0bb74d3 100644 --- a/forester/java/src/org/forester/sdi/SDI.java +++ b/forester/java/src/org/forester/sdi/SDI.java @@ -23,7 +23,7 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org +// WWW: www.phylosoft.org/forester package org.forester.sdi; @@ -31,17 +31,42 @@ import java.util.HashMap; import java.util.Map; import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; -import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Event; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; import org.forester.util.ForesterUtil; -public abstract class SDI { +/* + * Implements our algorithm for speciation - duplication inference (SDI).

+ * Reference:

The initialization is accomplished by: + *

The recursion + * part is accomplished by this class' method + * "geneTreePostOrderTraversal(PhylogenyNode)".

Requires JDK 1.2 or greater. + * + * @see SDI#linkNodesOfG() + * + * @see Phylogeny#preorderReID(int) + * + * @see + * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny) + * + * @see #geneTreePostOrderTraversal(PhylogenyNode) + * + * @author Christian M. Zmasek + * + * @version 1.102 -- last modified: 10/02/01 + */ +public class SDI { - public enum ALGORITHM { - GSDIR, GSDI, SDI, SDIR - } final Phylogeny _gene_tree; final Phylogeny _species_tree; int _duplications_sum; // Sum of duplications. @@ -50,22 +75,15 @@ public abstract class SDI { /** * Constructor which sets the gene tree and the species tree to be compared. * species_tree is the species tree to which the gene tree gene_tree will be - * compared to. - * Infers for each PhylogenyNode of gene_tree whether - * it represents a speciation or duplication event by calculating and - * interpreting the mapping function M. The most parsimonious sequence of - * speciation and duplication events is assumed. - * The mapping cost L can be - * calculated with method "computeMappingCost()". + * compared to - with method "infer(boolean)". Both Trees must be completely + * binary and rooted. The actual inference is accomplished with method + * "infer(boolean)". The mapping cost L can then be calculated with method + * "computeMappingCost()". *

- * Conditions: - *

- * + * (Last modified: 01/11/01) * + * @see #infer(boolean) + * @see SDI#computeMappingCostL() * @param gene_tree * reference to a rooted binary gene Phylogeny to which assign * duplication vs speciation, must have species names in the @@ -74,8 +92,9 @@ public abstract class SDI { * reference to a rooted binary species Phylogeny which might get * stripped in the process, must have species names in the * species name fields for all external nodes + * @throws SDIException */ - public SDI( final Phylogeny gene_tree, final Phylogeny species_tree ) { + public SDI( final Phylogeny gene_tree, final Phylogeny species_tree ) throws SDIException { if ( species_tree.isEmpty() || gene_tree.isEmpty() ) { throw new IllegalArgumentException( "attempt to infer duplications using empty tree(s)" ); } @@ -84,10 +103,217 @@ public abstract class SDI { } _gene_tree = gene_tree; _species_tree = species_tree; - _duplications_sum = 0; _mapping_cost = -1; + _duplications_sum = 0; + PhylogenyMethods.preOrderReId( getSpeciesTree() ); + linkNodesOfG(); + geneTreePostOrderTraversal( getGeneTree().getRoot() ); + } + + /** + * Computes the cost of mapping the gene tree gene_tree onto the species + * tree species_tree. Before this method can be called, the mapping has to + * be calculated with method "infer(boolean)". + *

+ * Reference. Zhang, L. (1997) On a Mirkin-Muchnik-Smith Conjecture for + * Comparing Molecular Phylogenies. Journal of Computational Biology 4 + * 177-187. + * + * @return the mapping cost "L" + */ + public int computeMappingCostL() { + _species_tree.levelOrderReID(); + _mapping_cost = 0; + computeMappingCostHelper( _gene_tree.getRoot() ); + return _mapping_cost; + } + + /** + * Returns the number of duplications. + * + * @return number of duplications + */ + public int getDuplicationsSum() { + return _duplications_sum; + } + + /** + * Returns the gene tree. + * + * @return gene tree + */ + public Phylogeny getGeneTree() { + return _gene_tree; + } + + /** + * Returns the species tree. + * + * @return species tree + */ + public Phylogeny getSpeciesTree() { + return _species_tree; + } + + @Override + public String toString() { + final StringBuffer sb = new StringBuffer(); + sb.append( getClass() ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( "Duplications sum : " + getDuplicationsSum() ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( "mapping cost L : " + computeMappingCostL() ); + return sb.toString(); } + /** + * Traverses the subtree of PhylogenyNode g in postorder, calculating the + * mapping function M, and determines which nodes represent speciation + * events and which ones duplication events. + *

+ * Preconditions: Mapping M for external nodes must have been calculated and + * the species tree must be labelled in preorder. + *

+ * (Last modified: 01/11/01) + * + * @param g + * starting node of a gene tree - normally the root + */ + void geneTreePostOrderTraversal( final PhylogenyNode g ) { + PhylogenyNode a, b; + if ( !g.isExternal() ) { + geneTreePostOrderTraversal( g.getChildNode( 0 ) ); + geneTreePostOrderTraversal( g.getChildNode( 1 ) ); + a = g.getChildNode( 0 ).getLink(); + b = g.getChildNode( 1 ).getLink(); + while ( a != b ) { + if ( a.getId() > b.getId() ) { + a = a.getParent(); + } + else { + b = b.getParent(); + } + } + g.setLink( a ); + // Determines whether dup. or spec. + Event event = null; + if ( ( a == g.getChildNode( 0 ).getLink() ) || ( a == g.getChildNode( 1 ).getLink() ) ) { + event = Event.createSingleDuplicationEvent(); + ++_duplications_sum; + } + else { + event = Event.createSingleSpeciationEvent(); + } + g.getNodeData().setEvent( event ); + } + } // geneTreePostOrderTraversal( PhylogenyNode ) + + /** + * Calculates the mapping function for the external nodes of the gene tree: + * links (sets the field "link" of PhylogenyNode) each external + * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree + * which has the same species name. + * @throws SDIException + */ + final void linkNodesOfG() throws SDIException { + final Map speciestree_ext_nodes = new HashMap(); + final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase(); + // Put references to all external nodes of the species tree into a map. + // Stringyfied taxonomy is the key, node is the value. + for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode s = iter.next(); + final String tax_str = SDIutil.taxonomyToString( s, tax_comp_base ); + if ( speciestree_ext_nodes.containsKey( tax_str ) ) { + throw new IllegalArgumentException( "taxonomy [" + s.getNodeData().getTaxonomy() + + "] is not unique in species phylogeny" ); + } + speciestree_ext_nodes.put( tax_str, s ); + } + // Retrieve the reference to the node with a matching stringyfied taxonomy. + for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode g = iter.next(); + final String tax_str = SDIutil.taxonomyToString( g, tax_comp_base ); + final PhylogenyNode s = speciestree_ext_nodes.get( tax_str ); + if ( s == null ) { + throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy() + + "] not present in species tree" ); + } + g.setLink( s ); + } + } + + /** + * Updates the mapping function M after the root of the gene tree has been + * moved by one branch. It calculates M for the root of the gene tree and + * one of its two children. + *

+ * To be used ONLY by method "SDIunrooted.fastInfer(Phylogeny,Phylogeny)". + *

+ * (Last modfied: 10/02/01) + * + * @param prev_root_was_dup + * true if the previous root was a duplication, false otherwise + * @param prev_root_c1 + * child 1 of the previous root + * @param prev_root_c2 + * child 2 of the previous root + * @return number of duplications which have been assigned in gene tree + */ + int updateM( final boolean prev_root_was_dup, final PhylogenyNode prev_root_c1, final PhylogenyNode prev_root_c2 ) { + final PhylogenyNode root = getGeneTree().getRoot(); + if ( ( root.getChildNode1() == prev_root_c1 ) || ( root.getChildNode2() == prev_root_c1 ) ) { + calculateMforNode( prev_root_c1 ); + } + else { + calculateMforNode( prev_root_c2 ); + } + Event event = null; + if ( prev_root_was_dup ) { + event = Event.createSingleDuplicationEvent(); + } + else { + event = Event.createSingleSpeciationEvent(); + } + root.getNodeData().setEvent( event ); + calculateMforNode( root ); + return getDuplicationsSum(); + } // updateM( boolean, PhylogenyNode, PhylogenyNode ) + + // Helper method for updateM( boolean, PhylogenyNode, PhylogenyNode ) + // Calculates M for PhylogenyNode n, given that M for the two children + // of n has been calculated. + // (Last modified: 10/02/01) + private void calculateMforNode( final PhylogenyNode n ) { + if ( !n.isExternal() ) { + final boolean was_duplication = n.isDuplication(); + PhylogenyNode a = n.getChildNode1().getLink(); + PhylogenyNode b = n.getChildNode2().getLink(); + while ( a != b ) { + if ( a.getId() > b.getId() ) { + a = a.getParent(); + } + else { + b = b.getParent(); + } + } + n.setLink( a ); + Event event = null; + if ( ( a == n.getChildNode1().getLink() ) || ( a == n.getChildNode2().getLink() ) ) { + event = Event.createSingleDuplicationEvent(); + if ( !was_duplication ) { + ++_duplications_sum; + } + } + else { + event = Event.createSingleSpeciationEvent(); + if ( was_duplication ) { + --_duplications_sum; + } + } + n.getNodeData().setEvent( event ); + } + } // calculateMforNode( PhylogenyNode ) + // Helper method for "computeMappingCost()". private void computeMappingCostHelper( final PhylogenyNode g ) { if ( !g.isExternal() ) { @@ -109,24 +335,6 @@ public abstract class SDI { } } - /** - * Computes the cost of mapping the gene tree gene_tree onto the species - * tree species_tree. Before this method can be called, the mapping has to - * be calculated with method "infer(boolean)". - *

- * Reference. Zhang, L. (1997) On a Mirkin-Muchnik-Smith Conjecture for - * Comparing Molecular Phylogenies. Journal of Computational Biology 4 - * 177-187. - * - * @return the mapping cost "L" - */ - public int computeMappingCostL() { - _species_tree.levelOrderReID(); - _mapping_cost = 0; - computeMappingCostHelper( _gene_tree.getRoot() ); - return _mapping_cost; - } - private TaxonomyComparisonBase determineTaxonomyComparisonBase() { TaxonomyComparisonBase base = null; boolean all_have_id = true; @@ -184,67 +392,6 @@ public abstract class SDI { } /** - * Returns the number of duplications. - * - * @return number of duplications - */ - public int getDuplicationsSum() { - return _duplications_sum; - } - - /** - * Returns the gene tree. - * - * @return gene tree - */ - public Phylogeny getGeneTree() { - return _gene_tree; - } - - /** - * Returns the species tree. - * - * @return species tree - */ - public Phylogeny getSpeciesTree() { - return _species_tree; - } - - /** - * Calculates the mapping function for the external nodes of the gene tree: - * links (sets the field "link" of PhylogenyNode) each external - * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree - * which has the same species name. - * @throws SDIException - */ - void linkNodesOfG() throws SDIException { - final Map speciestree_ext_nodes = new HashMap(); - final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase(); - // Put references to all external nodes of the species tree into a map. - // Stringyfied taxonomy is the key, node is the value. - for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { - final PhylogenyNode s = iter.next(); - final String tax_str = taxonomyToString( s, tax_comp_base ); - if ( speciestree_ext_nodes.containsKey( tax_str ) ) { - throw new IllegalArgumentException( "taxonomy [" + s.getNodeData().getTaxonomy() - + "] is not unique in species phylogeny" ); - } - speciestree_ext_nodes.put( tax_str, s ); - } - // Retrieve the reference to the node with a matching stringyfied taxonomy. - for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { - final PhylogenyNode g = iter.next(); - final String tax_str = taxonomyToString( g, tax_comp_base ); - final PhylogenyNode s = speciestree_ext_nodes.get( tax_str ); - if ( s == null ) { - throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy() - + "] not present in species tree" ); - } - g.setLink( s ); - } - } - - /** * Calculates the mapping function for the external nodes of the gene tree: * links (sets the field "link" of PhylogenyNode) each external by taxonomy * identifier @@ -252,7 +399,7 @@ public abstract class SDI { * which has the same species name. * Olivier CHABROL : olivier.chabrol@univ-provence.fr */ - void linkNodesOfGByTaxonomyIdentifier() { + private final void linkNodesOfGByTaxonomyIdentifier() { final HashMap speciestree_ext_nodes = new HashMap(); if ( _species_tree.getFirstExternalNode().isRoot() ) { speciestree_ext_nodes.put( _species_tree.getFirstExternalNode().getNodeData().getTaxonomy().getIdentifier() @@ -276,56 +423,4 @@ public abstract class SDI { g.setLink( s ); } } - - @Override - public String toString() { - final StringBuffer sb = new StringBuffer(); - sb.append( getClass() ); - sb.append( ForesterUtil.LINE_SEPARATOR ); - sb.append( "Duplications sum : " + getDuplicationsSum() ); - sb.append( ForesterUtil.LINE_SEPARATOR ); - sb.append( "mapping cost L : " + computeMappingCostL() ); - return sb.toString(); - } - - static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) { - switch ( base ) { - case ID: - final Identifier id = n.getNodeData().getTaxonomy().getIdentifier(); - if ( id == null ) { - return null; - } - return id.getValuePlusProvider(); - case CODE: - return n.getNodeData().getTaxonomy().getTaxonomyCode(); - case SCIENTIFIC_NAME: - return n.getNodeData().getTaxonomy().getScientificName(); - default: - throw new IllegalArgumentException( "unknown comparison base for taxonomies: " + base ); - } - } - - public enum TaxonomyComparisonBase { - ID { - - @Override - public String toString() { - return "taxonomy id"; - } - }, - CODE { - - @Override - public String toString() { - return "taxonomy code/mnemonic"; - } - }, - SCIENTIFIC_NAME { - - @Override - public String toString() { - return "scientific name"; - } - } - } -} +} // End of class SDIse. diff --git a/forester/java/src/org/forester/sdi/SDIR.java b/forester/java/src/org/forester/sdi/SDIR.java index ce7fc00..7af2359 100644 --- a/forester/java/src/org/forester/sdi/SDIR.java +++ b/forester/java/src/org/forester/sdi/SDIR.java @@ -222,7 +222,7 @@ public class SDIR { final boolean return_trees, int max_trees_to_return ) throws SDIException { init(); - SDIse sdise = null; + SDI sdise = null; final ArrayList trees = new ArrayList(); Phylogeny[] tree_array = null; List branches = null; @@ -282,7 +282,7 @@ public class SDIR { g.reRoot( g.getFirstExternalNode() ); branches = SDIR.getBranchesInPreorder( g ); if ( minimize_mapping_cost || minimize_sum_of_dup ) { - sdise = new SDIse( g, species_tree ); + sdise = new SDI( g, species_tree ); duplications = sdise.getDuplicationsSum(); } final Set used_root_placements = new HashSet(); @@ -418,7 +418,7 @@ public class SDIR { height = height__diff[ 0 ]; diff = height__diff[ 1 ]; if ( Math.abs( diff ) < SDIR.ZERO_DIFF ) { - sdise = new SDIse( g, species_tree ); + sdise = new SDI( g, species_tree ); min_duplications = sdise.getDuplicationsSum(); min_height = height; min_diff = Math.abs( diff ); diff --git a/forester/java/src/org/forester/sdi/SDIse.java b/forester/java/src/org/forester/sdi/SDIse.java deleted file mode 100644 index bf06a41..0000000 --- a/forester/java/src/org/forester/sdi/SDIse.java +++ /dev/null @@ -1,206 +0,0 @@ -// $Id: -// FORESTER -- software libraries and applications -// for evolutionary biology research and applications. -// -// Copyright (C) 2008-2009 Christian M. Zmasek -// Copyright (C) 2008-2009 Burnham Institute for Medical Research -// Copyright (C) 2000-2001 Washington University School of Medicine -// and Howard Hughes Medical Institute -// All rights reserved -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -// -// Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester - -package org.forester.sdi; - -import org.forester.phylogeny.Phylogeny; -import org.forester.phylogeny.PhylogenyMethods; -import org.forester.phylogeny.PhylogenyNode; -import org.forester.phylogeny.data.Event; - -/* - * Implements our algorithm for speciation - duplication inference (SDI).

- * Reference:

The initialization is accomplished by: - *

The recursion - * part is accomplished by this class' method - * "geneTreePostOrderTraversal(PhylogenyNode)".

Requires JDK 1.2 or greater. - * - * @see SDI#linkNodesOfG() - * - * @see Phylogeny#preorderReID(int) - * - * @see - * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny) - * - * @see #geneTreePostOrderTraversal(PhylogenyNode) - * - * @author Christian M. Zmasek - * - * @version 1.102 -- last modified: 10/02/01 - */ -public class SDIse extends SDI { - - /** - * Constructor which sets the gene tree and the species tree to be compared. - * species_tree is the species tree to which the gene tree gene_tree will be - * compared to - with method "infer(boolean)". Both Trees must be completely - * binary and rooted. The actual inference is accomplished with method - * "infer(boolean)". The mapping cost L can then be calculated with method - * "computeMappingCost()". - *

- * (Last modified: 01/11/01) - * - * @see #infer(boolean) - * @see SDI#computeMappingCostL() - * @param gene_tree - * reference to a rooted binary gene Phylogeny to which assign - * duplication vs speciation, must have species names in the - * species name fields for all external nodes - * @param species_tree - * reference to a rooted binary species Phylogeny which might get - * stripped in the process, must have species names in the - * species name fields for all external nodes - * @throws SDIException - */ - public SDIse( final Phylogeny gene_tree, final Phylogeny species_tree ) throws SDIException { - super( gene_tree, species_tree ); - _duplications_sum = 0; - PhylogenyMethods.preOrderReId( getSpeciesTree() ); - linkNodesOfG(); - geneTreePostOrderTraversal( getGeneTree().getRoot() ); - } - - // Helper method for updateM( boolean, PhylogenyNode, PhylogenyNode ) - // Calculates M for PhylogenyNode n, given that M for the two children - // of n has been calculated. - // (Last modified: 10/02/01) - private void calculateMforNode( final PhylogenyNode n ) { - if ( !n.isExternal() ) { - final boolean was_duplication = n.isDuplication(); - PhylogenyNode a = n.getChildNode1().getLink(); - PhylogenyNode b = n.getChildNode2().getLink(); - while ( a != b ) { - if ( a.getId() > b.getId() ) { - a = a.getParent(); - } - else { - b = b.getParent(); - } - } - n.setLink( a ); - Event event = null; - if ( ( a == n.getChildNode1().getLink() ) || ( a == n.getChildNode2().getLink() ) ) { - event = Event.createSingleDuplicationEvent(); - if ( !was_duplication ) { - ++_duplications_sum; - } - } - else { - event = Event.createSingleSpeciationEvent(); - if ( was_duplication ) { - --_duplications_sum; - } - } - n.getNodeData().setEvent( event ); - } - } // calculateMforNode( PhylogenyNode ) - - /** - * Traverses the subtree of PhylogenyNode g in postorder, calculating the - * mapping function M, and determines which nodes represent speciation - * events and which ones duplication events. - *

- * Preconditions: Mapping M for external nodes must have been calculated and - * the species tree must be labelled in preorder. - *

- * (Last modified: 01/11/01) - * - * @param g - * starting node of a gene tree - normally the root - */ - void geneTreePostOrderTraversal( final PhylogenyNode g ) { - PhylogenyNode a, b; - if ( !g.isExternal() ) { - geneTreePostOrderTraversal( g.getChildNode( 0 ) ); - geneTreePostOrderTraversal( g.getChildNode( 1 ) ); - a = g.getChildNode( 0 ).getLink(); - b = g.getChildNode( 1 ).getLink(); - while ( a != b ) { - if ( a.getId() > b.getId() ) { - a = a.getParent(); - } - else { - b = b.getParent(); - } - } - g.setLink( a ); - // Determines whether dup. or spec. - Event event = null; - if ( ( a == g.getChildNode( 0 ).getLink() ) || ( a == g.getChildNode( 1 ).getLink() ) ) { - event = Event.createSingleDuplicationEvent(); - ++_duplications_sum; - } - else { - event = Event.createSingleSpeciationEvent(); - } - g.getNodeData().setEvent( event ); - } - } // geneTreePostOrderTraversal( PhylogenyNode ) - - /** - * Updates the mapping function M after the root of the gene tree has been - * moved by one branch. It calculates M for the root of the gene tree and - * one of its two children. - *

- * To be used ONLY by method "SDIunrooted.fastInfer(Phylogeny,Phylogeny)". - *

- * (Last modfied: 10/02/01) - * - * @param prev_root_was_dup - * true if the previous root was a duplication, false otherwise - * @param prev_root_c1 - * child 1 of the previous root - * @param prev_root_c2 - * child 2 of the previous root - * @return number of duplications which have been assigned in gene tree - */ - int updateM( final boolean prev_root_was_dup, final PhylogenyNode prev_root_c1, final PhylogenyNode prev_root_c2 ) { - final PhylogenyNode root = getGeneTree().getRoot(); - if ( ( root.getChildNode1() == prev_root_c1 ) || ( root.getChildNode2() == prev_root_c1 ) ) { - calculateMforNode( prev_root_c1 ); - } - else { - calculateMforNode( prev_root_c2 ); - } - Event event = null; - if ( prev_root_was_dup ) { - event = Event.createSingleDuplicationEvent(); - } - else { - event = Event.createSingleSpeciationEvent(); - } - root.getNodeData().setEvent( event ); - calculateMforNode( root ); - return getDuplicationsSum(); - } // updateM( boolean, PhylogenyNode, PhylogenyNode ) -} // End of class SDIse. diff --git a/forester/java/src/org/forester/sdi/SDIutil.java b/forester/java/src/org/forester/sdi/SDIutil.java new file mode 100644 index 0000000..efe1830 --- /dev/null +++ b/forester/java/src/org/forester/sdi/SDIutil.java @@ -0,0 +1,100 @@ + +package org.forester.sdi; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.ForesterUtil; + +public class SDIutil { + + static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) { + switch ( base ) { + case ID: + final Identifier id = n.getNodeData().getTaxonomy().getIdentifier(); + if ( id == null ) { + return null; + } + return id.getValuePlusProvider(); + case CODE: + return n.getNodeData().getTaxonomy().getTaxonomyCode(); + case SCIENTIFIC_NAME: + return n.getNodeData().getTaxonomy().getScientificName(); + default: + throw new IllegalArgumentException( "unknown comparison base for taxonomies: " + base ); + } + } + + public enum ALGORITHM { + GSDIR, GSDI, SDI, SDIR + } + + public enum TaxonomyComparisonBase { + ID { + + @Override + public String toString() { + return "taxonomy id"; + } + }, + CODE { + + @Override + public String toString() { + return "taxonomy code/mnemonic"; + } + }, + SCIENTIFIC_NAME { + + @Override + public String toString() { + return "scientific name"; + } + } + } + + public final static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) { + int with_id_count = 0; + int with_code_count = 0; + int with_sn_count = 0; + int max = 0; + for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode g = iter.next(); + if ( g.getNodeData().isHasTaxonomy() ) { + final Taxonomy tax = g.getNodeData().getTaxonomy(); + if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) { + if ( ++with_id_count > max ) { + max = with_id_count; + } + } + if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { + if ( ++with_code_count > max ) { + max = with_code_count; + } + } + if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) { + if ( ++with_sn_count > max ) { + max = with_sn_count; + } + } + } + } + if ( max == 0 ) { + throw new IllegalArgumentException( "gene tree has no taxonomic data" ); + } + else if ( max == 1 ) { + throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" ); + } + else if ( max == with_id_count ) { + return TaxonomyComparisonBase.ID; + } + else if ( max == with_sn_count ) { + return TaxonomyComparisonBase.SCIENTIFIC_NAME; + } + else { + return TaxonomyComparisonBase.CODE; + } + } +} diff --git a/forester/java/src/org/forester/sdi/TestGSDI.java b/forester/java/src/org/forester/sdi/TestGSDI.java index a9dfa66..16f6655 100644 --- a/forester/java/src/org/forester/sdi/TestGSDI.java +++ b/forester/java/src/org/forester/sdi/TestGSDI.java @@ -35,7 +35,7 @@ import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.data.Event; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; -import org.forester.sdi.SDI.TaxonomyComparisonBase; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; import org.forester.util.ForesterUtil; public final class TestGSDI { diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 0ab8c91..f739330 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -89,7 +89,7 @@ import org.forester.protein.Protein; import org.forester.rio.TestRIO; import org.forester.sdi.SDI; import org.forester.sdi.SDIR; -import org.forester.sdi.SDIse; +import org.forester.sdi.SDI; import org.forester.sdi.TestGSDI; import org.forester.sequence.BasicSequence; import org.forester.sequence.Sequence; @@ -6255,7 +6255,7 @@ public final class Test { final Phylogeny gene1 = factory.create( "(A1[&&NHX:S=yeast],A2[&&NHX:S=yeast])", new NHXParser() )[ 0 ]; gene1.setRooted( true ); species1.setRooted( true ); - final SDI sdi = new SDIse( gene1, species1 ); + final SDI sdi = new SDI( gene1, species1 ); if ( !gene1.getRoot().isDuplication() ) { return false; } @@ -6267,7 +6267,7 @@ public final class Test { new NHXParser() )[ 0 ]; species2.setRooted( true ); gene2.setRooted( true ); - final SDI sdi2 = new SDIse( gene2, species2 ); + final SDI sdi2 = new SDI( gene2, species2 ); if ( sdi2.getDuplicationsSum() != 0 ) { return false; } @@ -6297,7 +6297,7 @@ public final class Test { new NHXParser() )[ 0 ]; species3.setRooted( true ); gene3.setRooted( true ); - final SDI sdi3 = new SDIse( gene3, species3 ); + final SDI sdi3 = new SDI( gene3, species3 ); if ( sdi3.getDuplicationsSum() != 1 ) { return false; } @@ -6315,7 +6315,7 @@ public final class Test { new NHXParser() )[ 0 ]; species4.setRooted( true ); gene4.setRooted( true ); - final SDI sdi4 = new SDIse( gene4, species4 ); + final SDI sdi4 = new SDI( gene4, species4 ); if ( sdi4.getDuplicationsSum() != 1 ) { return false; } @@ -6342,7 +6342,7 @@ public final class Test { new NHXParser() )[ 0 ]; species5.setRooted( true ); gene5.setRooted( true ); - final SDI sdi5 = new SDIse( gene5, species5 ); + final SDI sdi5 = new SDI( gene5, species5 ); if ( sdi5.getDuplicationsSum() != 2 ) { return false; } @@ -6375,7 +6375,7 @@ public final class Test { new NHXParser() )[ 0 ]; species6.setRooted( true ); gene6.setRooted( true ); - final SDI sdi6 = new SDIse( gene6, species6 ); + final SDI sdi6 = new SDI( gene6, species6 ); if ( sdi6.getDuplicationsSum() != 3 ) { return false; } @@ -6422,7 +6422,7 @@ public final class Test { final Phylogeny gene7_1 = Test .createPhylogeny( "((((((((a1[&&NHX:S=a1],a2[&&NHX:S=a2]),b1[&&NHX:S=b1]),x[&&NHX:S=x]),m1[&&NHX:S=m1]),i1[&&NHX:S=i1]),e1[&&NHX:S=e1]),y[&&NHX:S=y]),z[&&NHX:S=z])" ); gene7_1.setRooted( true ); - final SDI sdi7 = new SDIse( gene7_1, species7 ); + final SDI sdi7 = new SDI( gene7_1, species7 ); if ( sdi7.getDuplicationsSum() != 0 ) { return false; } @@ -6453,7 +6453,7 @@ public final class Test { final Phylogeny gene7_2 = Test .createPhylogeny( "(((((((((a1[&&NHX:S=a1],a2[&&NHX:S=a2]),b1[&&NHX:S=b1]),x[&&NHX:S=x]),m1[&&NHX:S=m1]),i1[&&NHX:S=i1]),j2[&&NHX:S=j2]),e1[&&NHX:S=e1]),y[&&NHX:S=y]),z[&&NHX:S=z])" ); gene7_2.setRooted( true ); - final SDI sdi7_2 = new SDIse( gene7_2, species7 ); + final SDI sdi7_2 = new SDI( gene7_2, species7 ); if ( sdi7_2.getDuplicationsSum() != 1 ) { return false; }