import org.forester.sdi.GSDI;
import org.forester.sdi.GSDIR;
import org.forester.sdi.SDI;
-import org.forester.sdi.SDI.ALGORITHM;
-import org.forester.sdi.SDI.TaxonomyComparisonBase;
import org.forester.sdi.SDIException;
-import org.forester.sdi.SDIse;
+import org.forester.sdi.SDIutil;
+import org.forester.sdi.SDIutil.ALGORITHM;
+import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
import org.forester.util.CommandLineArguments;
import org.forester.util.EasyWriter;
import org.forester.util.ForesterConstants;
( ( NHXParser ) p ).setReplaceUnderscores( true );
}
species_tree = factory.create( species_tree_file, p )[ 0 ];
- final TaxonomyComparisonBase comp_base = GSDI.determineTaxonomyComparisonBase( gene_tree );
+ final TaxonomyComparisonBase comp_base = SDIutil.determineTaxonomyComparisonBase( gene_tree );
switch ( comp_base ) {
case SCIENTIFIC_NAME:
try {
+ ( ForesterUtil.isEmpty( species_tree.getName() ) ? "" : gene_tree.getName() ) );
System.out.println( "Species tree name : "
+ ( ForesterUtil.isEmpty( species_tree.getName() ) ? "" : gene_tree.getName() ) );
- SDI sdi = null;
+ Object sdi = null;
final long start_time = new Date().getTime();
try {
if ( ( base_algorithm == ALGORITHM.GSDI ) || ( base_algorithm == ALGORITHM.GSDIR ) ) {
System.out.println( "Algorithm : SDI" );
log_writer.println( "Algorithm : SDI" );
log_writer.flush();
- sdi = new SDIse( gene_tree, species_tree );
+ sdi = new SDI( gene_tree, species_tree );
}
}
catch ( final SDIException e ) {
System.out.println( "Wrote resulting gene tree to : " + out_file.getCanonicalPath() );
log_writer.println( "Wrote resulting gene tree to : " + out_file.getCanonicalPath() );
if ( base_algorithm == ALGORITHM.SDI ) {
+ sdi = sdi;
sdi.computeMappingCostL();
System.out.println( "Mapping cost : " + sdi.computeMappingCostL() );
log_writer.println( "Mapping cost : " + sdi.computeMappingCostL() );
import org.forester.phylogeny.factories.PhylogenyFactory;
import org.forester.rio.RIO;
import org.forester.rio.RIOException;
-import org.forester.sdi.SDI.ALGORITHM;
import org.forester.sdi.SDIException;
+import org.forester.sdi.SDIutil.ALGORITHM;
import org.forester.util.CommandLineArguments;
import org.forester.util.EasyWriter;
import org.forester.util.ForesterUtil;
_mainpanel.getTabbedPane().setSelectedIndex( selected );
showWhole();
_mainpanel.getCurrentTreePanel().setEdited( true );
- JOptionPane.showMessageDialog( this, "Duplications: " + gsdir.getDuplicationsSum() + "\n"
- + "Potential duplications: " + gsdir.getSpeciationOrDuplicationEventsSum() + "\n" + "Speciations: "
- + gsdir.getSpeciationsSum(), "GSDI successfully completed", JOptionPane.INFORMATION_MESSAGE );
+ JOptionPane.showMessageDialog( this,
+ "Duplications: " + gsdir.getMinDuplicationsSum() + "\n" + "Speciations: "
+ + gsdir.getSpeciationsSum() + "\n"
+ + "Number of root positions minimizing duplications sum: "
+ + gsdir.getMinDuplicationsSumGeneTrees().size(),
+ "GSDIR successfully completed",
+ JOptionPane.INFORMATION_MESSAGE );
}
void executeFunctionAnalysis() {
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.factories.PhylogenyFactory;
import org.forester.sdi.GSDIR;
-import org.forester.sdi.SDI.ALGORITHM;
-import org.forester.sdi.SDI.TaxonomyComparisonBase;
import org.forester.sdi.SDIException;
import org.forester.sdi.SDIR;
+import org.forester.sdi.SDIutil.ALGORITHM;
+import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.ForesterUtil;
import org.forester.phylogeny.PhylogenyMethods.PhylogenyNodeField;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.factories.PhylogenyFactory;
-import org.forester.sdi.SDI.ALGORITHM;
-import org.forester.sdi.SDI.TaxonomyComparisonBase;
+import org.forester.sdi.SDIutil.ALGORITHM;
+import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
import org.forester.util.ForesterUtil;
public final class TestRIO {
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
-import java.util.TreeSet;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.data.Event;
-import org.forester.phylogeny.data.Taxonomy;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
import org.forester.util.ForesterUtil;
-/*
- * Implements our algorithm for speciation - duplication inference (SDI). <p>
- * The initialization is accomplished by: </p> <ul> <li>method
- * "linkExtNodesOfG()" of class SDI: setting the links for the external nodes of
- * the gene tree <li>"preorderReID(int)" from class Phylogeny: numbering of
- * nodes of the species tree in preorder <li>the optional stripping of the
- * species tree is accomplished by method "stripTree(Phylogeny,Phylogeny)" of
- * class Phylogeny </ul> <p> The recursion part is accomplished by this class'
- * method "geneTreePostOrderTraversal(PhylogenyNode)". <p> Requires JDK 1.5 or
- * greater.
- *
- * @see SDI#linkNodesOfG()
- *
- * @see Phylogeny#preorderReID(int)
- *
- * @see
- * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny)
- *
- * @see #geneTreePostOrderTraversal(PhylogenyNode)
- *
- * @author Christian M. Zmasek
- */
-public class GSDI extends SDI {
+public final class GSDI {
- private final boolean _most_parsimonious_duplication_model;
- protected int _speciation_or_duplication_events_sum;
- protected int _speciations_sum;
- private final List<PhylogenyNode> _stripped_gene_tree_nodes;
- private final List<PhylogenyNode> _stripped_species_tree_nodes;
- private final Set<PhylogenyNode> _mapped_species_tree_nodes;
- private TaxonomyComparisonBase _tax_comp_base;
- private final SortedSet<String> _scientific_names_mapped_to_reduced_specificity;
+ private final boolean _most_parsimonious_duplication_model;
+ private final int _speciation_or_duplication_events_sum;
+ private final int _speciations_sum;
+ private final int _duplications_sum;
+ private final List<PhylogenyNode> _stripped_gene_tree_nodes;
+ private final List<PhylogenyNode> _stripped_species_tree_nodes;
+ private final Set<PhylogenyNode> _mapped_species_tree_nodes;
+ private final TaxonomyComparisonBase _tax_comp_base;
+ private final SortedSet<String> _scientific_names_mapped_to_reduced_specificity;
public GSDI( final Phylogeny gene_tree,
final Phylogeny species_tree,
final boolean most_parsimonious_duplication_model,
final boolean strip_gene_tree,
final boolean strip_species_tree ) throws SDIException {
- super( gene_tree, species_tree );
- _speciation_or_duplication_events_sum = 0;
- _speciations_sum = 0;
_most_parsimonious_duplication_model = most_parsimonious_duplication_model;
- _duplications_sum = 0;
- _stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
- _stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
- _mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
- _scientific_names_mapped_to_reduced_specificity = new TreeSet<String>();
- linkNodesOfG( null, strip_gene_tree, strip_species_tree );
- PhylogenyMethods.preOrderReId( getSpeciesTree() );
- geneTreePostOrderTraversal();
+ final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree,
+ species_tree,
+ null,
+ strip_gene_tree,
+ strip_species_tree );
+ _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes();
+ _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes();
+ _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes();
+ _scientific_names_mapped_to_reduced_specificity = nodes_linking_result
+ .getScientificNamesMappedToReducedSpecificity();
+ _tax_comp_base = nodes_linking_result.getTaxCompBase();
+ PhylogenyMethods.preOrderReId( species_tree );
+ final GSDIsummaryResult gsdi_summary_result = new GSDIsummaryResult();
+ geneTreePostOrderTraversal( gene_tree, _most_parsimonious_duplication_model, gsdi_summary_result );
+ _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum();
+ _speciations_sum = gsdi_summary_result.getSpeciationsSum();
+ _duplications_sum = gsdi_summary_result.getDuplicationsSum();
}
- // Used by GSDIR
- protected GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean strip_gene_tree )
- throws SDIException {
- super( gene_tree, species_tree );
- _speciation_or_duplication_events_sum = -1;
- _speciations_sum = 0;
- _most_parsimonious_duplication_model = true;
- _duplications_sum = 0;
- _stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
- _stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
- _mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
- _scientific_names_mapped_to_reduced_specificity = new TreeSet<String>();
+ public int getDuplicationsSum() {
+ return _duplications_sum;
}
- // s is the node on the species tree g maps to.
- private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) {
- boolean oyako = false;
- if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) {
- oyako = true;
- }
- if ( g.getLink().getNumberOfDescendants() == 2 ) {
- if ( oyako ) {
- g.getNodeData().setEvent( createDuplicationEvent() );
- }
- else {
- g.getNodeData().setEvent( createSpeciationEvent() );
- }
- }
- else {
- if ( oyako ) {
- final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
- for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
- n = n.getLink();
- while ( n.getParent() != s ) {
- n = n.getParent();
- if ( n.isRoot() ) {
- break;
- }
- }
- set.add( n );
- }
- boolean multiple = false;
- for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
- n = n.getLink();
- while ( n.getParent() != s ) {
- n = n.getParent();
- if ( n.isRoot() ) {
- break;
- }
- }
- if ( set.contains( n ) ) {
- multiple = true;
- break;
- }
- }
- if ( multiple ) {
- g.getNodeData().setEvent( createDuplicationEvent() );
- }
- else {
- if ( _most_parsimonious_duplication_model ) {
- g.getNodeData().setEvent( createSpeciationEvent() );
- }
- else {
- g.getNodeData().setEvent( createSingleSpeciationOrDuplicationEvent() );
- }
- }
- }
- else {
- g.getNodeData().setEvent( createSpeciationEvent() );
- }
+ public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
+ return _mapped_species_tree_nodes;
+ }
+
+ public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
+ return _scientific_names_mapped_to_reduced_specificity;
+ }
+
+ public final int getSpeciationOrDuplicationEventsSum() {
+ return _speciation_or_duplication_events_sum;
+ }
+
+ public final int getSpeciationsSum() {
+ return _speciations_sum;
+ }
+
+ public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
+ return _stripped_gene_tree_nodes;
+ }
+
+ public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
+ return _stripped_species_tree_nodes;
+ }
+
+ public TaxonomyComparisonBase getTaxCompBase() {
+ return _tax_comp_base;
+ }
+
+ @Override
+ public final String toString() {
+ final StringBuffer sb = new StringBuffer();
+ sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
+ sb.append( ForesterUtil.getLineSeparator() );
+ sb.append( "Speciations sum : " + getSpeciationsSum() );
+ sb.append( ForesterUtil.getLineSeparator() );
+ sb.append( "Duplications sum : " + getDuplicationsSum() );
+ sb.append( ForesterUtil.getLineSeparator() );
+ if ( !_most_parsimonious_duplication_model ) {
+ sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() );
+ sb.append( ForesterUtil.getLineSeparator() );
}
+ return sb.toString();
}
/**
* <p>
*
*/
- final void geneTreePostOrderTraversal() {
- for( final PhylogenyNodeIterator it = getGeneTree().iteratorPostorder(); it.hasNext(); ) {
+ final static void geneTreePostOrderTraversal( final Phylogeny gene_tree,
+ final boolean most_parsimonious_duplication_model,
+ final GSDIsummaryResult res ) {
+ for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {
final PhylogenyNode g = it.next();
if ( g.isInternal() ) {
PhylogenyNode s1 = g.getChildNode1().getLink();
}
}
g.setLink( s1 );
- determineEvent( s1, g );
+ determineEvent( s1, g, most_parsimonious_duplication_model, res );
}
}
}
- private final Event createDuplicationEvent() {
- final Event event = Event.createSingleDuplicationEvent();
- ++_duplications_sum;
- return event;
- }
-
- private final Event createSingleSpeciationOrDuplicationEvent() {
- final Event event = Event.createSingleSpeciationOrDuplicationEvent();
- ++_speciation_or_duplication_events_sum;
- return event;
- }
-
- private final Event createSpeciationEvent() {
- final Event event = Event.createSingleSpeciationEvent();
- ++_speciations_sum;
- return event;
- }
-
- public final int getSpeciationOrDuplicationEventsSum() {
- return _speciation_or_duplication_events_sum;
- }
-
- public final int getSpeciationsSum() {
- return _speciations_sum;
- }
-
/**
* This allows for linking of internal nodes of the species tree (as opposed
* to just external nodes, as in the method it overrides.
* @throws SDIException
*
*/
- final void linkNodesOfG( final TaxonomyComparisonBase tax_comp_base,
- final boolean strip_gene_tree,
- final boolean strip_species_tree ) throws SDIException {
+ final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree,
+ final Phylogeny species_tree,
+ final TaxonomyComparisonBase tax_comp_base,
+ final boolean strip_gene_tree,
+ final boolean strip_species_tree ) throws SDIException {
final Map<String, PhylogenyNode> species_to_node_map = new HashMap<String, PhylogenyNode>();
final List<PhylogenyNode> species_tree_ext_nodes = new ArrayList<PhylogenyNode>();
+ final NodesLinkingResult res = new NodesLinkingResult();
if ( tax_comp_base == null ) {
- _tax_comp_base = determineTaxonomyComparisonBase( _gene_tree );
+ res.setTaxCompBase( SDIutil.determineTaxonomyComparisonBase( gene_tree ) );
}
else {
- _tax_comp_base = tax_comp_base;
+ res.setTaxCompBase( tax_comp_base );
}
// Stringyfied taxonomy is the key, node is the value.
- for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
+ for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) {
final PhylogenyNode s = iter.next();
species_tree_ext_nodes.add( s );
if ( s.getNodeData().isHasTaxonomy() ) {
- final String tax_str = taxonomyToString( s, _tax_comp_base );
+ final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() );
if ( !ForesterUtil.isEmpty( tax_str ) ) {
if ( species_to_node_map.containsKey( tax_str ) ) {
throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" );
}
}
// Retrieve the reference to the node with a matching stringyfied taxonomy.
- for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
+ for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
final PhylogenyNode g = iter.next();
if ( !g.getNodeData().isHasTaxonomy() ) {
if ( strip_gene_tree ) {
- _stripped_gene_tree_nodes.add( g );
+ res.getStrippedGeneTreeNodes().add( g );
}
else {
throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" );
}
}
else {
- final String tax_str = taxonomyToString( g, _tax_comp_base );
+ final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() );
if ( ForesterUtil.isEmpty( tax_str ) ) {
if ( strip_gene_tree ) {
- _stripped_gene_tree_nodes.add( g );
+ res.getStrippedGeneTreeNodes().add( g );
}
else {
throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
}
else {
PhylogenyNode s = species_to_node_map.get( tax_str );
- if ( ( _tax_comp_base == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
+ if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
&& ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) {
- s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str );
+ s = tryMapByRemovingOverlySpecificData( species_to_node_map,
+ tax_str,
+ res.getScientificNamesMappedToReducedSpecificity() );
}
if ( s == null ) {
if ( strip_gene_tree ) {
- _stripped_gene_tree_nodes.add( g );
+ res.getStrippedGeneTreeNodes().add( g );
}
else {
throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy()
}
else {
g.setLink( s );
- _mapped_species_tree_nodes.add( s );
+ res.getMappedSpeciesTreeNodes().add( s );
}
}
}
} // for loop
if ( strip_gene_tree ) {
- stripGeneTree();
- if ( getGeneTree().isEmpty() || ( getGeneTree().getNumberOfExternalNodes() < 2 ) ) {
+ stripTree( gene_tree, res.getStrippedGeneTreeNodes() );
+ if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) {
throw new SDIException( "species could not be mapped between gene tree and species tree" );
}
}
if ( strip_species_tree ) {
- stripSpeciesTree( species_tree_ext_nodes );
+ stripSpeciesTree( species_tree, species_tree_ext_nodes, res.getMappedSpeciesTreeNodes() );
}
+ return res;
}
- private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
- final String tax_str ) {
- PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, " (" );
+ private final static void addScientificNamesMappedToReducedSpecificity( final String s1,
+ final String s2,
+ final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
+ scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 );
+ }
+
+ // Used by GSDIR
+ // protected GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean strip_gene_tree )
+ // throws SDIException {
+ // super( gene_tree, species_tree );
+ // _speciation_or_duplication_events_sum = -1;
+ // _speciations_sum = 0;
+ // _most_parsimonious_duplication_model = true;
+ // _duplications_sum = 0;
+ // _stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
+ // _stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
+ // _mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
+ // _scientific_names_mapped_to_reduced_specificity = new TreeSet<String>();
+ // }
+ // s is the node on the species tree g maps to.
+ private final static void determineEvent( final PhylogenyNode s,
+ final PhylogenyNode g,
+ final boolean most_parsimonious_duplication_model,
+ final GSDIsummaryResult res ) {
+ boolean oyako = false;
+ if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) {
+ oyako = true;
+ }
+ if ( g.getLink().getNumberOfDescendants() == 2 ) {
+ if ( oyako ) {
+ g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
+ res.increaseDuplicationsSum();
+ }
+ else {
+ g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
+ res.increaseSpeciationsSum();
+ }
+ }
+ else {
+ if ( oyako ) {
+ final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
+ for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
+ n = n.getLink();
+ while ( n.getParent() != s ) {
+ n = n.getParent();
+ if ( n.isRoot() ) {
+ break;
+ }
+ }
+ set.add( n );
+ }
+ boolean multiple = false;
+ for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
+ n = n.getLink();
+ while ( n.getParent() != s ) {
+ n = n.getParent();
+ if ( n.isRoot() ) {
+ break;
+ }
+ }
+ if ( set.contains( n ) ) {
+ multiple = true;
+ break;
+ }
+ }
+ if ( multiple ) {
+ g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
+ res.increaseDuplicationsSum();
+ }
+ else {
+ if ( most_parsimonious_duplication_model ) {
+ g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
+ res.increaseSpeciationsSum();
+ }
+ else {
+ g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() );
+ res.increaseSpeciationOrDuplicationEventsSum();
+ }
+ }
+ }
+ else {
+ g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
+ res.increaseSpeciationsSum();
+ }
+ }
+ }
+
+ private final static List<PhylogenyNode> stripSpeciesTree( final Phylogeny species_tree,
+ final List<PhylogenyNode> species_tree_ext_nodes,
+ final Set<PhylogenyNode> keep ) {
+ final List<PhylogenyNode> stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
+ for( final PhylogenyNode s : species_tree_ext_nodes ) {
+ if ( !keep.contains( s ) ) {
+ species_tree.deleteSubtree( s, true );
+ stripped_species_tree_nodes.add( s );
+ }
+ }
+ species_tree.clearHashIdToNodeMap();
+ species_tree.externalNodesHaveChanged();
+ return stripped_species_tree_nodes;
+ }
+
+ private final static void stripTree( final Phylogeny phy, final List<PhylogenyNode> strip_nodes ) {
+ for( final PhylogenyNode g : strip_nodes ) {
+ phy.deleteSubtree( g, true );
+ }
+ phy.clearHashIdToNodeMap();
+ phy.externalNodesHaveChanged();
+ }
+
+ private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
+ final String tax_str,
+ final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
+ PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map,
+ tax_str,
+ " (",
+ scientific_names_mapped_to_reduced_specificity );
if ( s == null ) {
if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) {
final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim();
s = species_to_node_map.get( new_tax_str );
if ( s != null ) {
- addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str );
+ addScientificNamesMappedToReducedSpecificity( tax_str,
+ new_tax_str,
+ scientific_names_mapped_to_reduced_specificity );
}
}
}
if ( s == null ) {
for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ",
" form ", " subform ", " cultivar ", " section ", " subsection " } ) {
- s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, t );
+ s = tryMapByRemovingOverlySpecificData( species_to_node_map,
+ tax_str,
+ t,
+ scientific_names_mapped_to_reduced_specificity );
if ( s != null ) {
break;
}
return s;
}
- private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
- final String tax_str,
- final String term ) {
+ private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
+ final String tax_str,
+ final String term,
+ final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
final int i = tax_str.indexOf( term );
if ( i > 4 ) {
final String new_tax_str = tax_str.substring( 0, i ).trim();
final PhylogenyNode s = species_to_node_map.get( new_tax_str );
if ( s != null ) {
- addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str );
+ addScientificNamesMappedToReducedSpecificity( tax_str,
+ new_tax_str,
+ scientific_names_mapped_to_reduced_specificity );
}
return s;
}
return null;
}
-
- private final void addScientificNamesMappedToReducedSpecificity( final String s1, final String s2 ) {
- _scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 );
- }
-
- public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
- return _scientific_names_mapped_to_reduced_specificity;
- }
-
- public TaxonomyComparisonBase getTaxCompBase() {
- return _tax_comp_base;
- }
-
- private void stripSpeciesTree( final List<PhylogenyNode> species_tree_ext_nodes ) {
- for( final PhylogenyNode s : species_tree_ext_nodes ) {
- if ( !_mapped_species_tree_nodes.contains( s ) ) {
- _species_tree.deleteSubtree( s, true );
- _stripped_species_tree_nodes.add( s );
- }
- }
- _species_tree.clearHashIdToNodeMap();
- _species_tree.externalNodesHaveChanged();
- }
-
- public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
- return _stripped_species_tree_nodes;
- }
-
- private void stripGeneTree() {
- for( final PhylogenyNode g : _stripped_gene_tree_nodes ) {
- _gene_tree.deleteSubtree( g, true );
- }
- _gene_tree.clearHashIdToNodeMap();
- _gene_tree.externalNodesHaveChanged();
- }
-
- public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
- return _mapped_species_tree_nodes;
- }
-
- public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) {
- int with_id_count = 0;
- int with_code_count = 0;
- int with_sn_count = 0;
- int max = 0;
- for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
- final PhylogenyNode g = iter.next();
- if ( g.getNodeData().isHasTaxonomy() ) {
- final Taxonomy tax = g.getNodeData().getTaxonomy();
- if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
- if ( ++with_id_count > max ) {
- max = with_id_count;
- }
- }
- if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
- if ( ++with_code_count > max ) {
- max = with_code_count;
- }
- }
- if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
- if ( ++with_sn_count > max ) {
- max = with_sn_count;
- }
- }
- }
- }
- if ( max == 0 ) {
- throw new IllegalArgumentException( "gene tree has no taxonomic data" );
- }
- else if ( max == 1 ) {
- throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" );
- }
- else if ( max == with_id_count ) {
- return SDI.TaxonomyComparisonBase.ID;
- }
- else if ( max == with_sn_count ) {
- return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME;
- }
- else {
- return SDI.TaxonomyComparisonBase.CODE;
- }
- }
-
- public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
- return _stripped_gene_tree_nodes;
- }
-
- @Override
- public final String toString() {
- final StringBuffer sb = new StringBuffer();
- sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
- sb.append( ForesterUtil.getLineSeparator() );
- sb.append( "Speciations sum : " + getSpeciationsSum() );
- sb.append( ForesterUtil.getLineSeparator() );
- sb.append( "Duplications sum : " + getDuplicationsSum() );
- sb.append( ForesterUtil.getLineSeparator() );
- if ( !_most_parsimonious_duplication_model ) {
- sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() );
- sb.append( ForesterUtil.getLineSeparator() );
- }
- sb.append( "mapping cost L : " + computeMappingCostL() );
- return sb.toString();
- }
}
\r
import java.util.ArrayList;\r
import java.util.List;\r
+import java.util.Set;\r
+import java.util.SortedSet;\r
\r
import org.forester.phylogeny.Phylogeny;\r
import org.forester.phylogeny.PhylogenyBranch;\r
import org.forester.phylogeny.PhylogenyMethods;\r
import org.forester.phylogeny.PhylogenyNode;\r
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;\r
+import org.forester.sdi.SDIutil.TaxonomyComparisonBase;\r
import org.forester.util.BasicDescriptiveStatistics;\r
\r
-public class GSDIR extends GSDI {\r
+public class GSDIR {\r
\r
private int _min_duplications_sum;\r
private final BasicDescriptiveStatistics _duplications_sum_stats;\r
private final List<Phylogeny> _min_duplications_sum_gene_trees;\r
+ protected int _speciations_sum;\r
+ protected int _duplications_sum;\r
+ private final List<PhylogenyNode> _stripped_gene_tree_nodes;\r
+ private final List<PhylogenyNode> _stripped_species_tree_nodes;\r
+ private final Set<PhylogenyNode> _mapped_species_tree_nodes;\r
+ private final TaxonomyComparisonBase _tax_comp_base;\r
+ private final SortedSet<String> _scientific_names_mapped_to_reduced_specificity;\r
\r
public GSDIR( final Phylogeny gene_tree,\r
final Phylogeny species_tree,\r
final boolean strip_gene_tree,\r
final boolean strip_species_tree ) throws SDIException {\r
- super( gene_tree.copy(), species_tree, strip_gene_tree );\r
- linkNodesOfG( null, strip_gene_tree, strip_species_tree );\r
+ _speciations_sum = 0;\r
+ _duplications_sum = 0;\r
+ final NodesLinkingResult nodes_linking_result = GSDI.linkNodesOfG( gene_tree,\r
+ species_tree,\r
+ null,\r
+ strip_gene_tree,\r
+ strip_species_tree );\r
+ _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes();\r
+ _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes();\r
+ _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes();\r
+ _scientific_names_mapped_to_reduced_specificity = nodes_linking_result\r
+ .getScientificNamesMappedToReducedSpecificity();\r
+ _tax_comp_base = nodes_linking_result.getTaxCompBase();\r
final List<PhylogenyBranch> gene_tree_branches_post_order = new ArrayList<PhylogenyBranch>();\r
- for( final PhylogenyNodeIterator it = _gene_tree.iteratorPostorder(); it.hasNext(); ) {\r
+ for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {\r
final PhylogenyNode n = it.next();\r
if ( !n.isRoot() && !( n.getParent().isRoot() && n.isFirstChildNode() ) ) {\r
gene_tree_branches_post_order.add( new PhylogenyBranch( n, n.getParent() ) );\r
for( final PhylogenyBranch branch : gene_tree_branches_post_order ) {\r
_duplications_sum = 0;\r
_speciations_sum = 0;\r
- _gene_tree.reRoot( branch );\r
- PhylogenyMethods.preOrderReId( getSpeciesTree() );\r
+ gene_tree.reRoot( branch );\r
+ PhylogenyMethods.preOrderReId( species_tree );\r
//TEST, remove later\r
// for( final PhylogenyNodeIterator it = _gene_tree.iteratorPostorder(); it.hasNext(); ) {\r
// final PhylogenyNode g = it.next();\r
// g.setLink( null );\r
// }\r
// }\r
- geneTreePostOrderTraversal();\r
+ final GSDIsummaryResult gsdi_summary_result = new GSDIsummaryResult();\r
+ GSDI.geneTreePostOrderTraversal( gene_tree, true, gsdi_summary_result );\r
if ( _duplications_sum < _min_duplications_sum ) {\r
_min_duplications_sum = _duplications_sum;\r
_min_duplications_sum_gene_trees.clear();\r
- _min_duplications_sum_gene_trees.add( getGeneTree().copy() );\r
+ _min_duplications_sum_gene_trees.add( gene_tree.copy() );\r
+ //_speciations_sum\r
}\r
else if ( _duplications_sum == _min_duplications_sum ) {\r
- _min_duplications_sum_gene_trees.add( getGeneTree().copy() );\r
+ _min_duplications_sum_gene_trees.add( gene_tree.copy() );\r
}\r
_duplications_sum_stats.addValue( _duplications_sum );\r
}\r
public BasicDescriptiveStatistics getDuplicationsSumStats() {\r
return _duplications_sum_stats;\r
}\r
+\r
+ public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {\r
+ return _mapped_species_tree_nodes;\r
+ }\r
+\r
+ public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {\r
+ return _scientific_names_mapped_to_reduced_specificity;\r
+ }\r
+\r
+ public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {\r
+ return _stripped_gene_tree_nodes;\r
+ }\r
+\r
+ public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {\r
+ return _stripped_species_tree_nodes;\r
+ }\r
+\r
+ public TaxonomyComparisonBase getTaxCompBase() {\r
+ return _tax_comp_base;\r
+ }\r
}\r
--- /dev/null
+
+package org.forester.sdi;
+
+final class GSDIsummaryResult {
+
+ private int _speciation_or_duplication_events_sum;
+ private int _speciations_sum;
+ private int _duplications_sum;
+
+ GSDIsummaryResult() {
+ _speciation_or_duplication_events_sum = 0;
+ _speciations_sum = 0;
+ _duplications_sum = 0;
+ }
+
+ final int getDuplicationsSum() {
+ return _duplications_sum;
+ }
+
+ final int getSpeciationOrDuplicationEventsSum() {
+ return _speciation_or_duplication_events_sum;
+ }
+
+ final int getSpeciationsSum() {
+ return _speciations_sum;
+ }
+
+ final void increaseDuplicationsSum() {
+ ++_duplications_sum;
+ }
+
+ final void increaseSpeciationOrDuplicationEventsSum() {
+ ++_speciation_or_duplication_events_sum;
+ }
+
+ final void increaseSpeciationsSum() {
+ ++_speciations_sum;
+ }
+}
--- /dev/null
+
+package org.forester.sdi;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.forester.phylogeny.PhylogenyNode;
+import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
+
+final class NodesLinkingResult {
+
+ private final List<PhylogenyNode> _stripped_gene_tree_nodes;
+ private final List<PhylogenyNode> _stripped_species_tree_nodes;
+ private final Set<PhylogenyNode> _mapped_species_tree_nodes;
+ private TaxonomyComparisonBase _tax_comp_base;
+ private final SortedSet<String> _scientific_names_mapped_to_reduced_specificity;
+
+ NodesLinkingResult() {
+ _stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
+ _stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
+ _mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
+ _scientific_names_mapped_to_reduced_specificity = new TreeSet<String>();
+ _tax_comp_base = null;
+ }
+
+ final List<PhylogenyNode> getStrippedGeneTreeNodes() {
+ return _stripped_gene_tree_nodes;
+ }
+
+ final List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
+ return _stripped_species_tree_nodes;
+ }
+
+ final Set<PhylogenyNode> getMappedSpeciesTreeNodes() {
+ return _mapped_species_tree_nodes;
+ }
+
+ final TaxonomyComparisonBase getTaxCompBase() {
+ return _tax_comp_base;
+ }
+
+ final void setTaxCompBase( final TaxonomyComparisonBase tax_comp_base ) {
+ _tax_comp_base = tax_comp_base;
+ }
+
+ final SortedSet<String> getScientificNamesMappedToReducedSpecificity() {
+ return _scientific_names_mapped_to_reduced_specificity;
+ }
+}
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org
+// WWW: www.phylosoft.org/forester
package org.forester.sdi;
import java.util.Map;
import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyNode;
-import org.forester.phylogeny.data.Identifier;
+import org.forester.phylogeny.data.Event;
import org.forester.phylogeny.data.Taxonomy;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
import org.forester.util.ForesterUtil;
-public abstract class SDI {
+/*
+ * Implements our algorithm for speciation - duplication inference (SDI). <p>
+ * Reference: </p> <ul> <li>Zmasek, C.M. and Eddy, S.R. (2001) "A simple
+ * algorithm to infer gene duplication and speciation events on a gene tree".
+ * Bioinformatics, in press. </ul> <p> The initialization is accomplished by:
+ * </p> <ul> <li>method "linkExtNodesOfG()" of class SDI: setting the links for
+ * the external nodes of the gene tree <li>"preorderReID(int)" from class
+ * Phylogeny: numbering of nodes of the species tree in preorder <li>the
+ * optional stripping of the species tree is accomplished by method
+ * "stripTree(Phylogeny,Phylogeny)" of class Phylogeny </ul> <p> The recursion
+ * part is accomplished by this class' method
+ * "geneTreePostOrderTraversal(PhylogenyNode)". <p> Requires JDK 1.2 or greater.
+ *
+ * @see SDI#linkNodesOfG()
+ *
+ * @see Phylogeny#preorderReID(int)
+ *
+ * @see
+ * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny)
+ *
+ * @see #geneTreePostOrderTraversal(PhylogenyNode)
+ *
+ * @author Christian M. Zmasek
+ *
+ * @version 1.102 -- last modified: 10/02/01
+ */
+public class SDI {
- public enum ALGORITHM {
- GSDIR, GSDI, SDI, SDIR
- }
final Phylogeny _gene_tree;
final Phylogeny _species_tree;
int _duplications_sum; // Sum of duplications.
/**
* Constructor which sets the gene tree and the species tree to be compared.
* species_tree is the species tree to which the gene tree gene_tree will be
- * compared to.
- * Infers for each PhylogenyNode of gene_tree whether
- * it represents a speciation or duplication event by calculating and
- * interpreting the mapping function M. The most parsimonious sequence of
- * speciation and duplication events is assumed.
- * The mapping cost L can be
- * calculated with method "computeMappingCost()".
+ * compared to - with method "infer(boolean)". Both Trees must be completely
+ * binary and rooted. The actual inference is accomplished with method
+ * "infer(boolean)". The mapping cost L can then be calculated with method
+ * "computeMappingCost()".
* <p>
- * Conditions:
- * </p>
- * <ul>
- * <li>Both Trees must be rooted
- * <li>Both Trees must have species names in the species name fields of all
- * their external nodes
- * </ul>
+ * (Last modified: 01/11/01)
*
+ * @see #infer(boolean)
+ * @see SDI#computeMappingCostL()
* @param gene_tree
* reference to a rooted binary gene Phylogeny to which assign
* duplication vs speciation, must have species names in the
* reference to a rooted binary species Phylogeny which might get
* stripped in the process, must have species names in the
* species name fields for all external nodes
+ * @throws SDIException
*/
- public SDI( final Phylogeny gene_tree, final Phylogeny species_tree ) {
+ public SDI( final Phylogeny gene_tree, final Phylogeny species_tree ) throws SDIException {
if ( species_tree.isEmpty() || gene_tree.isEmpty() ) {
throw new IllegalArgumentException( "attempt to infer duplications using empty tree(s)" );
}
}
_gene_tree = gene_tree;
_species_tree = species_tree;
- _duplications_sum = 0;
_mapping_cost = -1;
+ _duplications_sum = 0;
+ PhylogenyMethods.preOrderReId( getSpeciesTree() );
+ linkNodesOfG();
+ geneTreePostOrderTraversal( getGeneTree().getRoot() );
+ }
+
+ /**
+ * Computes the cost of mapping the gene tree gene_tree onto the species
+ * tree species_tree. Before this method can be called, the mapping has to
+ * be calculated with method "infer(boolean)".
+ * <p>
+ * Reference. Zhang, L. (1997) On a Mirkin-Muchnik-Smith Conjecture for
+ * Comparing Molecular Phylogenies. Journal of Computational Biology 4
+ * 177-187.
+ *
+ * @return the mapping cost "L"
+ */
+ public int computeMappingCostL() {
+ _species_tree.levelOrderReID();
+ _mapping_cost = 0;
+ computeMappingCostHelper( _gene_tree.getRoot() );
+ return _mapping_cost;
+ }
+
+ /**
+ * Returns the number of duplications.
+ *
+ * @return number of duplications
+ */
+ public int getDuplicationsSum() {
+ return _duplications_sum;
+ }
+
+ /**
+ * Returns the gene tree.
+ *
+ * @return gene tree
+ */
+ public Phylogeny getGeneTree() {
+ return _gene_tree;
+ }
+
+ /**
+ * Returns the species tree.
+ *
+ * @return species tree
+ */
+ public Phylogeny getSpeciesTree() {
+ return _species_tree;
+ }
+
+ @Override
+ public String toString() {
+ final StringBuffer sb = new StringBuffer();
+ sb.append( getClass() );
+ sb.append( ForesterUtil.LINE_SEPARATOR );
+ sb.append( "Duplications sum : " + getDuplicationsSum() );
+ sb.append( ForesterUtil.LINE_SEPARATOR );
+ sb.append( "mapping cost L : " + computeMappingCostL() );
+ return sb.toString();
}
+ /**
+ * Traverses the subtree of PhylogenyNode g in postorder, calculating the
+ * mapping function M, and determines which nodes represent speciation
+ * events and which ones duplication events.
+ * <p>
+ * Preconditions: Mapping M for external nodes must have been calculated and
+ * the species tree must be labelled in preorder.
+ * <p>
+ * (Last modified: 01/11/01)
+ *
+ * @param g
+ * starting node of a gene tree - normally the root
+ */
+ void geneTreePostOrderTraversal( final PhylogenyNode g ) {
+ PhylogenyNode a, b;
+ if ( !g.isExternal() ) {
+ geneTreePostOrderTraversal( g.getChildNode( 0 ) );
+ geneTreePostOrderTraversal( g.getChildNode( 1 ) );
+ a = g.getChildNode( 0 ).getLink();
+ b = g.getChildNode( 1 ).getLink();
+ while ( a != b ) {
+ if ( a.getId() > b.getId() ) {
+ a = a.getParent();
+ }
+ else {
+ b = b.getParent();
+ }
+ }
+ g.setLink( a );
+ // Determines whether dup. or spec.
+ Event event = null;
+ if ( ( a == g.getChildNode( 0 ).getLink() ) || ( a == g.getChildNode( 1 ).getLink() ) ) {
+ event = Event.createSingleDuplicationEvent();
+ ++_duplications_sum;
+ }
+ else {
+ event = Event.createSingleSpeciationEvent();
+ }
+ g.getNodeData().setEvent( event );
+ }
+ } // geneTreePostOrderTraversal( PhylogenyNode )
+
+ /**
+ * Calculates the mapping function for the external nodes of the gene tree:
+ * links (sets the field "link" of PhylogenyNode) each external
+ * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree
+ * which has the same species name.
+ * @throws SDIException
+ */
+ final void linkNodesOfG() throws SDIException {
+ final Map<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
+ final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase();
+ // Put references to all external nodes of the species tree into a map.
+ // Stringyfied taxonomy is the key, node is the value.
+ for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
+ final PhylogenyNode s = iter.next();
+ final String tax_str = SDIutil.taxonomyToString( s, tax_comp_base );
+ if ( speciestree_ext_nodes.containsKey( tax_str ) ) {
+ throw new IllegalArgumentException( "taxonomy [" + s.getNodeData().getTaxonomy()
+ + "] is not unique in species phylogeny" );
+ }
+ speciestree_ext_nodes.put( tax_str, s );
+ }
+ // Retrieve the reference to the node with a matching stringyfied taxonomy.
+ for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
+ final PhylogenyNode g = iter.next();
+ final String tax_str = SDIutil.taxonomyToString( g, tax_comp_base );
+ final PhylogenyNode s = speciestree_ext_nodes.get( tax_str );
+ if ( s == null ) {
+ throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy()
+ + "] not present in species tree" );
+ }
+ g.setLink( s );
+ }
+ }
+
+ /**
+ * Updates the mapping function M after the root of the gene tree has been
+ * moved by one branch. It calculates M for the root of the gene tree and
+ * one of its two children.
+ * <p>
+ * To be used ONLY by method "SDIunrooted.fastInfer(Phylogeny,Phylogeny)".
+ * <p>
+ * (Last modfied: 10/02/01)
+ *
+ * @param prev_root_was_dup
+ * true if the previous root was a duplication, false otherwise
+ * @param prev_root_c1
+ * child 1 of the previous root
+ * @param prev_root_c2
+ * child 2 of the previous root
+ * @return number of duplications which have been assigned in gene tree
+ */
+ int updateM( final boolean prev_root_was_dup, final PhylogenyNode prev_root_c1, final PhylogenyNode prev_root_c2 ) {
+ final PhylogenyNode root = getGeneTree().getRoot();
+ if ( ( root.getChildNode1() == prev_root_c1 ) || ( root.getChildNode2() == prev_root_c1 ) ) {
+ calculateMforNode( prev_root_c1 );
+ }
+ else {
+ calculateMforNode( prev_root_c2 );
+ }
+ Event event = null;
+ if ( prev_root_was_dup ) {
+ event = Event.createSingleDuplicationEvent();
+ }
+ else {
+ event = Event.createSingleSpeciationEvent();
+ }
+ root.getNodeData().setEvent( event );
+ calculateMforNode( root );
+ return getDuplicationsSum();
+ } // updateM( boolean, PhylogenyNode, PhylogenyNode )
+
+ // Helper method for updateM( boolean, PhylogenyNode, PhylogenyNode )
+ // Calculates M for PhylogenyNode n, given that M for the two children
+ // of n has been calculated.
+ // (Last modified: 10/02/01)
+ private void calculateMforNode( final PhylogenyNode n ) {
+ if ( !n.isExternal() ) {
+ final boolean was_duplication = n.isDuplication();
+ PhylogenyNode a = n.getChildNode1().getLink();
+ PhylogenyNode b = n.getChildNode2().getLink();
+ while ( a != b ) {
+ if ( a.getId() > b.getId() ) {
+ a = a.getParent();
+ }
+ else {
+ b = b.getParent();
+ }
+ }
+ n.setLink( a );
+ Event event = null;
+ if ( ( a == n.getChildNode1().getLink() ) || ( a == n.getChildNode2().getLink() ) ) {
+ event = Event.createSingleDuplicationEvent();
+ if ( !was_duplication ) {
+ ++_duplications_sum;
+ }
+ }
+ else {
+ event = Event.createSingleSpeciationEvent();
+ if ( was_duplication ) {
+ --_duplications_sum;
+ }
+ }
+ n.getNodeData().setEvent( event );
+ }
+ } // calculateMforNode( PhylogenyNode )
+
// Helper method for "computeMappingCost()".
private void computeMappingCostHelper( final PhylogenyNode g ) {
if ( !g.isExternal() ) {
}
}
- /**
- * Computes the cost of mapping the gene tree gene_tree onto the species
- * tree species_tree. Before this method can be called, the mapping has to
- * be calculated with method "infer(boolean)".
- * <p>
- * Reference. Zhang, L. (1997) On a Mirkin-Muchnik-Smith Conjecture for
- * Comparing Molecular Phylogenies. Journal of Computational Biology 4
- * 177-187.
- *
- * @return the mapping cost "L"
- */
- public int computeMappingCostL() {
- _species_tree.levelOrderReID();
- _mapping_cost = 0;
- computeMappingCostHelper( _gene_tree.getRoot() );
- return _mapping_cost;
- }
-
private TaxonomyComparisonBase determineTaxonomyComparisonBase() {
TaxonomyComparisonBase base = null;
boolean all_have_id = true;
}
/**
- * Returns the number of duplications.
- *
- * @return number of duplications
- */
- public int getDuplicationsSum() {
- return _duplications_sum;
- }
-
- /**
- * Returns the gene tree.
- *
- * @return gene tree
- */
- public Phylogeny getGeneTree() {
- return _gene_tree;
- }
-
- /**
- * Returns the species tree.
- *
- * @return species tree
- */
- public Phylogeny getSpeciesTree() {
- return _species_tree;
- }
-
- /**
- * Calculates the mapping function for the external nodes of the gene tree:
- * links (sets the field "link" of PhylogenyNode) each external
- * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree
- * which has the same species name.
- * @throws SDIException
- */
- void linkNodesOfG() throws SDIException {
- final Map<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
- final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase();
- // Put references to all external nodes of the species tree into a map.
- // Stringyfied taxonomy is the key, node is the value.
- for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
- final PhylogenyNode s = iter.next();
- final String tax_str = taxonomyToString( s, tax_comp_base );
- if ( speciestree_ext_nodes.containsKey( tax_str ) ) {
- throw new IllegalArgumentException( "taxonomy [" + s.getNodeData().getTaxonomy()
- + "] is not unique in species phylogeny" );
- }
- speciestree_ext_nodes.put( tax_str, s );
- }
- // Retrieve the reference to the node with a matching stringyfied taxonomy.
- for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
- final PhylogenyNode g = iter.next();
- final String tax_str = taxonomyToString( g, tax_comp_base );
- final PhylogenyNode s = speciestree_ext_nodes.get( tax_str );
- if ( s == null ) {
- throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy()
- + "] not present in species tree" );
- }
- g.setLink( s );
- }
- }
-
- /**
* Calculates the mapping function for the external nodes of the gene tree:
* links (sets the field "link" of PhylogenyNode) each external by taxonomy
* identifier
* which has the same species name.
* Olivier CHABROL : olivier.chabrol@univ-provence.fr
*/
- void linkNodesOfGByTaxonomyIdentifier() {
+ private final void linkNodesOfGByTaxonomyIdentifier() {
final HashMap<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
if ( _species_tree.getFirstExternalNode().isRoot() ) {
speciestree_ext_nodes.put( _species_tree.getFirstExternalNode().getNodeData().getTaxonomy().getIdentifier()
g.setLink( s );
}
}
-
- @Override
- public String toString() {
- final StringBuffer sb = new StringBuffer();
- sb.append( getClass() );
- sb.append( ForesterUtil.LINE_SEPARATOR );
- sb.append( "Duplications sum : " + getDuplicationsSum() );
- sb.append( ForesterUtil.LINE_SEPARATOR );
- sb.append( "mapping cost L : " + computeMappingCostL() );
- return sb.toString();
- }
-
- static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) {
- switch ( base ) {
- case ID:
- final Identifier id = n.getNodeData().getTaxonomy().getIdentifier();
- if ( id == null ) {
- return null;
- }
- return id.getValuePlusProvider();
- case CODE:
- return n.getNodeData().getTaxonomy().getTaxonomyCode();
- case SCIENTIFIC_NAME:
- return n.getNodeData().getTaxonomy().getScientificName();
- default:
- throw new IllegalArgumentException( "unknown comparison base for taxonomies: " + base );
- }
- }
-
- public enum TaxonomyComparisonBase {
- ID {
-
- @Override
- public String toString() {
- return "taxonomy id";
- }
- },
- CODE {
-
- @Override
- public String toString() {
- return "taxonomy code/mnemonic";
- }
- },
- SCIENTIFIC_NAME {
-
- @Override
- public String toString() {
- return "scientific name";
- }
- }
- }
-}
+} // End of class SDIse.
final boolean return_trees,
int max_trees_to_return ) throws SDIException {
init();
- SDIse sdise = null;
+ SDI sdise = null;
final ArrayList<Phylogeny> trees = new ArrayList<Phylogeny>();
Phylogeny[] tree_array = null;
List<PhylogenyBranch> branches = null;
g.reRoot( g.getFirstExternalNode() );
branches = SDIR.getBranchesInPreorder( g );
if ( minimize_mapping_cost || minimize_sum_of_dup ) {
- sdise = new SDIse( g, species_tree );
+ sdise = new SDI( g, species_tree );
duplications = sdise.getDuplicationsSum();
}
final Set<PhylogenyBranch> used_root_placements = new HashSet<PhylogenyBranch>();
height = height__diff[ 0 ];
diff = height__diff[ 1 ];
if ( Math.abs( diff ) < SDIR.ZERO_DIFF ) {
- sdise = new SDIse( g, species_tree );
+ sdise = new SDI( g, species_tree );
min_duplications = sdise.getDuplicationsSum();
min_height = height;
min_diff = Math.abs( diff );
+++ /dev/null
-// $Id:
-// FORESTER -- software libraries and applications
-// for evolutionary biology research and applications.
-//
-// Copyright (C) 2008-2009 Christian M. Zmasek
-// Copyright (C) 2008-2009 Burnham Institute for Medical Research
-// Copyright (C) 2000-2001 Washington University School of Medicine
-// and Howard Hughes Medical Institute
-// All rights reserved
-//
-// This library is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 2.1 of the License, or (at your option) any later version.
-//
-// This library is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-// Lesser General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License along with this library; if not, write to the Free Software
-// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
-//
-// Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
-
-package org.forester.sdi;
-
-import org.forester.phylogeny.Phylogeny;
-import org.forester.phylogeny.PhylogenyMethods;
-import org.forester.phylogeny.PhylogenyNode;
-import org.forester.phylogeny.data.Event;
-
-/*
- * Implements our algorithm for speciation - duplication inference (SDI). <p>
- * Reference: </p> <ul> <li>Zmasek, C.M. and Eddy, S.R. (2001) "A simple
- * algorithm to infer gene duplication and speciation events on a gene tree".
- * Bioinformatics, in press. </ul> <p> The initialization is accomplished by:
- * </p> <ul> <li>method "linkExtNodesOfG()" of class SDI: setting the links for
- * the external nodes of the gene tree <li>"preorderReID(int)" from class
- * Phylogeny: numbering of nodes of the species tree in preorder <li>the
- * optional stripping of the species tree is accomplished by method
- * "stripTree(Phylogeny,Phylogeny)" of class Phylogeny </ul> <p> The recursion
- * part is accomplished by this class' method
- * "geneTreePostOrderTraversal(PhylogenyNode)". <p> Requires JDK 1.2 or greater.
- *
- * @see SDI#linkNodesOfG()
- *
- * @see Phylogeny#preorderReID(int)
- *
- * @see
- * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny)
- *
- * @see #geneTreePostOrderTraversal(PhylogenyNode)
- *
- * @author Christian M. Zmasek
- *
- * @version 1.102 -- last modified: 10/02/01
- */
-public class SDIse extends SDI {
-
- /**
- * Constructor which sets the gene tree and the species tree to be compared.
- * species_tree is the species tree to which the gene tree gene_tree will be
- * compared to - with method "infer(boolean)". Both Trees must be completely
- * binary and rooted. The actual inference is accomplished with method
- * "infer(boolean)". The mapping cost L can then be calculated with method
- * "computeMappingCost()".
- * <p>
- * (Last modified: 01/11/01)
- *
- * @see #infer(boolean)
- * @see SDI#computeMappingCostL()
- * @param gene_tree
- * reference to a rooted binary gene Phylogeny to which assign
- * duplication vs speciation, must have species names in the
- * species name fields for all external nodes
- * @param species_tree
- * reference to a rooted binary species Phylogeny which might get
- * stripped in the process, must have species names in the
- * species name fields for all external nodes
- * @throws SDIException
- */
- public SDIse( final Phylogeny gene_tree, final Phylogeny species_tree ) throws SDIException {
- super( gene_tree, species_tree );
- _duplications_sum = 0;
- PhylogenyMethods.preOrderReId( getSpeciesTree() );
- linkNodesOfG();
- geneTreePostOrderTraversal( getGeneTree().getRoot() );
- }
-
- // Helper method for updateM( boolean, PhylogenyNode, PhylogenyNode )
- // Calculates M for PhylogenyNode n, given that M for the two children
- // of n has been calculated.
- // (Last modified: 10/02/01)
- private void calculateMforNode( final PhylogenyNode n ) {
- if ( !n.isExternal() ) {
- final boolean was_duplication = n.isDuplication();
- PhylogenyNode a = n.getChildNode1().getLink();
- PhylogenyNode b = n.getChildNode2().getLink();
- while ( a != b ) {
- if ( a.getId() > b.getId() ) {
- a = a.getParent();
- }
- else {
- b = b.getParent();
- }
- }
- n.setLink( a );
- Event event = null;
- if ( ( a == n.getChildNode1().getLink() ) || ( a == n.getChildNode2().getLink() ) ) {
- event = Event.createSingleDuplicationEvent();
- if ( !was_duplication ) {
- ++_duplications_sum;
- }
- }
- else {
- event = Event.createSingleSpeciationEvent();
- if ( was_duplication ) {
- --_duplications_sum;
- }
- }
- n.getNodeData().setEvent( event );
- }
- } // calculateMforNode( PhylogenyNode )
-
- /**
- * Traverses the subtree of PhylogenyNode g in postorder, calculating the
- * mapping function M, and determines which nodes represent speciation
- * events and which ones duplication events.
- * <p>
- * Preconditions: Mapping M for external nodes must have been calculated and
- * the species tree must be labelled in preorder.
- * <p>
- * (Last modified: 01/11/01)
- *
- * @param g
- * starting node of a gene tree - normally the root
- */
- void geneTreePostOrderTraversal( final PhylogenyNode g ) {
- PhylogenyNode a, b;
- if ( !g.isExternal() ) {
- geneTreePostOrderTraversal( g.getChildNode( 0 ) );
- geneTreePostOrderTraversal( g.getChildNode( 1 ) );
- a = g.getChildNode( 0 ).getLink();
- b = g.getChildNode( 1 ).getLink();
- while ( a != b ) {
- if ( a.getId() > b.getId() ) {
- a = a.getParent();
- }
- else {
- b = b.getParent();
- }
- }
- g.setLink( a );
- // Determines whether dup. or spec.
- Event event = null;
- if ( ( a == g.getChildNode( 0 ).getLink() ) || ( a == g.getChildNode( 1 ).getLink() ) ) {
- event = Event.createSingleDuplicationEvent();
- ++_duplications_sum;
- }
- else {
- event = Event.createSingleSpeciationEvent();
- }
- g.getNodeData().setEvent( event );
- }
- } // geneTreePostOrderTraversal( PhylogenyNode )
-
- /**
- * Updates the mapping function M after the root of the gene tree has been
- * moved by one branch. It calculates M for the root of the gene tree and
- * one of its two children.
- * <p>
- * To be used ONLY by method "SDIunrooted.fastInfer(Phylogeny,Phylogeny)".
- * <p>
- * (Last modfied: 10/02/01)
- *
- * @param prev_root_was_dup
- * true if the previous root was a duplication, false otherwise
- * @param prev_root_c1
- * child 1 of the previous root
- * @param prev_root_c2
- * child 2 of the previous root
- * @return number of duplications which have been assigned in gene tree
- */
- int updateM( final boolean prev_root_was_dup, final PhylogenyNode prev_root_c1, final PhylogenyNode prev_root_c2 ) {
- final PhylogenyNode root = getGeneTree().getRoot();
- if ( ( root.getChildNode1() == prev_root_c1 ) || ( root.getChildNode2() == prev_root_c1 ) ) {
- calculateMforNode( prev_root_c1 );
- }
- else {
- calculateMforNode( prev_root_c2 );
- }
- Event event = null;
- if ( prev_root_was_dup ) {
- event = Event.createSingleDuplicationEvent();
- }
- else {
- event = Event.createSingleSpeciationEvent();
- }
- root.getNodeData().setEvent( event );
- calculateMforNode( root );
- return getDuplicationsSum();
- } // updateM( boolean, PhylogenyNode, PhylogenyNode )
-} // End of class SDIse.
--- /dev/null
+
+package org.forester.sdi;
+
+import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Identifier;
+import org.forester.phylogeny.data.Taxonomy;
+import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.util.ForesterUtil;
+
+public class SDIutil {
+
+ static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) {
+ switch ( base ) {
+ case ID:
+ final Identifier id = n.getNodeData().getTaxonomy().getIdentifier();
+ if ( id == null ) {
+ return null;
+ }
+ return id.getValuePlusProvider();
+ case CODE:
+ return n.getNodeData().getTaxonomy().getTaxonomyCode();
+ case SCIENTIFIC_NAME:
+ return n.getNodeData().getTaxonomy().getScientificName();
+ default:
+ throw new IllegalArgumentException( "unknown comparison base for taxonomies: " + base );
+ }
+ }
+
+ public enum ALGORITHM {
+ GSDIR, GSDI, SDI, SDIR
+ }
+
+ public enum TaxonomyComparisonBase {
+ ID {
+
+ @Override
+ public String toString() {
+ return "taxonomy id";
+ }
+ },
+ CODE {
+
+ @Override
+ public String toString() {
+ return "taxonomy code/mnemonic";
+ }
+ },
+ SCIENTIFIC_NAME {
+
+ @Override
+ public String toString() {
+ return "scientific name";
+ }
+ }
+ }
+
+ public final static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) {
+ int with_id_count = 0;
+ int with_code_count = 0;
+ int with_sn_count = 0;
+ int max = 0;
+ for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
+ final PhylogenyNode g = iter.next();
+ if ( g.getNodeData().isHasTaxonomy() ) {
+ final Taxonomy tax = g.getNodeData().getTaxonomy();
+ if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
+ if ( ++with_id_count > max ) {
+ max = with_id_count;
+ }
+ }
+ if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
+ if ( ++with_code_count > max ) {
+ max = with_code_count;
+ }
+ }
+ if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
+ if ( ++with_sn_count > max ) {
+ max = with_sn_count;
+ }
+ }
+ }
+ }
+ if ( max == 0 ) {
+ throw new IllegalArgumentException( "gene tree has no taxonomic data" );
+ }
+ else if ( max == 1 ) {
+ throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" );
+ }
+ else if ( max == with_id_count ) {
+ return TaxonomyComparisonBase.ID;
+ }
+ else if ( max == with_sn_count ) {
+ return TaxonomyComparisonBase.SCIENTIFIC_NAME;
+ }
+ else {
+ return TaxonomyComparisonBase.CODE;
+ }
+ }
+}
import org.forester.phylogeny.data.Event;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.factories.PhylogenyFactory;
-import org.forester.sdi.SDI.TaxonomyComparisonBase;
+import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
import org.forester.util.ForesterUtil;
public final class TestGSDI {
import org.forester.rio.TestRIO;
import org.forester.sdi.SDI;
import org.forester.sdi.SDIR;
-import org.forester.sdi.SDIse;
+import org.forester.sdi.SDI;
import org.forester.sdi.TestGSDI;
import org.forester.sequence.BasicSequence;
import org.forester.sequence.Sequence;
final Phylogeny gene1 = factory.create( "(A1[&&NHX:S=yeast],A2[&&NHX:S=yeast])", new NHXParser() )[ 0 ];
gene1.setRooted( true );
species1.setRooted( true );
- final SDI sdi = new SDIse( gene1, species1 );
+ final SDI sdi = new SDI( gene1, species1 );
if ( !gene1.getRoot().isDuplication() ) {
return false;
}
new NHXParser() )[ 0 ];
species2.setRooted( true );
gene2.setRooted( true );
- final SDI sdi2 = new SDIse( gene2, species2 );
+ final SDI sdi2 = new SDI( gene2, species2 );
if ( sdi2.getDuplicationsSum() != 0 ) {
return false;
}
new NHXParser() )[ 0 ];
species3.setRooted( true );
gene3.setRooted( true );
- final SDI sdi3 = new SDIse( gene3, species3 );
+ final SDI sdi3 = new SDI( gene3, species3 );
if ( sdi3.getDuplicationsSum() != 1 ) {
return false;
}
new NHXParser() )[ 0 ];
species4.setRooted( true );
gene4.setRooted( true );
- final SDI sdi4 = new SDIse( gene4, species4 );
+ final SDI sdi4 = new SDI( gene4, species4 );
if ( sdi4.getDuplicationsSum() != 1 ) {
return false;
}
new NHXParser() )[ 0 ];
species5.setRooted( true );
gene5.setRooted( true );
- final SDI sdi5 = new SDIse( gene5, species5 );
+ final SDI sdi5 = new SDI( gene5, species5 );
if ( sdi5.getDuplicationsSum() != 2 ) {
return false;
}
new NHXParser() )[ 0 ];
species6.setRooted( true );
gene6.setRooted( true );
- final SDI sdi6 = new SDIse( gene6, species6 );
+ final SDI sdi6 = new SDI( gene6, species6 );
if ( sdi6.getDuplicationsSum() != 3 ) {
return false;
}
final Phylogeny gene7_1 = Test
.createPhylogeny( "((((((((a1[&&NHX:S=a1],a2[&&NHX:S=a2]),b1[&&NHX:S=b1]),x[&&NHX:S=x]),m1[&&NHX:S=m1]),i1[&&NHX:S=i1]),e1[&&NHX:S=e1]),y[&&NHX:S=y]),z[&&NHX:S=z])" );
gene7_1.setRooted( true );
- final SDI sdi7 = new SDIse( gene7_1, species7 );
+ final SDI sdi7 = new SDI( gene7_1, species7 );
if ( sdi7.getDuplicationsSum() != 0 ) {
return false;
}
final Phylogeny gene7_2 = Test
.createPhylogeny( "(((((((((a1[&&NHX:S=a1],a2[&&NHX:S=a2]),b1[&&NHX:S=b1]),x[&&NHX:S=x]),m1[&&NHX:S=m1]),i1[&&NHX:S=i1]),j2[&&NHX:S=j2]),e1[&&NHX:S=e1]),y[&&NHX:S=y]),z[&&NHX:S=z])" );
gene7_2.setRooted( true );
- final SDI sdi7_2 = new SDIse( gene7_2, species7 );
+ final SDI sdi7_2 = new SDI( gene7_2, species7 );
if ( sdi7_2.getDuplicationsSum() != 1 ) {
return false;
}