import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.data.Event;
import org.forester.phylogeny.data.Taxonomy;
*
* @author Christian M. Zmasek
*/
-public final class GSDI extends SDI {
+public class GSDI extends SDI {
private final boolean _most_parsimonious_duplication_model;
private final boolean _strip_gene_tree;
private final List<PhylogenyNode> _stripped_gene_tree_nodes;
private final List<PhylogenyNode> _stripped_species_tree_nodes;
private final Set<PhylogenyNode> _mapped_species_tree_nodes;
+ private TaxonomyComparisonBase _tax_comp_base;
+ private final SortedSet<String> _scientific_names_mapped_to_reduced_specificity;
public GSDI( final Phylogeny gene_tree,
final Phylogeny species_tree,
final boolean most_parsimonious_duplication_model,
final boolean strip_gene_tree,
- final boolean strip_species_tree ) throws SdiException {
+ final boolean strip_species_tree ) throws SDIException {
super( gene_tree, species_tree );
_speciation_or_duplication_events_sum = 0;
_speciations_sum = 0;
_stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
_stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
_mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
- getSpeciesTree().preOrderReId();
+ _scientific_names_mapped_to_reduced_specificity = new TreeSet<String>();
linkNodesOfG();
+ PhylogenyMethods.preOrderReId( getSpeciesTree() );
geneTreePostOrderTraversal();
}
GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model )
- throws SdiException {
+ throws SDIException { // convenience overload: forwards to the five-argument constructor with strip_gene_tree and strip_species_tree both false
this( gene_tree, species_tree, most_parsimonious_duplication_model, false, false );
}
+
+ /**
+  * Field-initializing constructor. Passes both trees to the superclass
+  * constructor, stores the three option flags, resets the event counters to
+  * zero and creates empty bookkeeping collections.
+  * <p>
+  * Unlike the five-argument constructor, this one does not call
+  * linkNodesOfG(), re-id the species tree or run geneTreePostOrderTraversal();
+  * _tax_comp_base is also left unassigned here.
+  *
+  * @param gene_tree the gene tree, handed to the superclass constructor
+  * @param species_tree the species tree, handed to the superclass constructor
+  * @param most_parsimonious_duplication_model stored in _most_parsimonious_duplication_model
+  * @param strip_gene_tree stored in _strip_gene_tree
+  * @param strip_species_tree stored in _strip_species_tree
+  * @param x not read anywhere in this constructor.
+  *          NOTE(review): presumably present only to give this overload a
+  *          distinct signature -- confirm against callers
+  * @throws SDIException declared on the signature; no statement in this body throws it directly
+  */
+ public GSDI( final Phylogeny gene_tree,
+              final Phylogeny species_tree,
+              final boolean most_parsimonious_duplication_model,
+              final boolean strip_gene_tree,
+              final boolean strip_species_tree,
+              int x ) throws SDIException {
+     super( gene_tree, species_tree );
+     // Option flags.
+     _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
+     _strip_gene_tree = strip_gene_tree;
+     _strip_species_tree = strip_species_tree;
+     // Event counters all start at zero.
+     _duplications_sum = 0;
+     _speciations_sum = 0;
+     _speciation_or_duplication_events_sum = 0;
+     // Empty bookkeeping collections.
+     _scientific_names_mapped_to_reduced_specificity = new TreeSet<String>();
+     _mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
+     _stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
+     _stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
+ }
// s is the node on the species tree g maps to.
private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) {
}
else {
if ( oyako ) {
- boolean multiple = false;
final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
n = n.getLink();
}
set.add( n );
}
+ boolean multiple = false;
for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
n = n.getLink();
while ( n.getParent() != s ) {
g.getNodeData().setEvent( createDuplicationEvent() );
}
else {
- g.getNodeData().setEvent( createSingleSpeciationOrDuplicationEvent() );
+ if ( _most_parsimonious_duplication_model ) {
+ g.getNodeData().setEvent( createSpeciationEvent() );
+ }
+ else {
+ g.getNodeData().setEvent( createSingleSpeciationOrDuplicationEvent() );
+ }
}
}
else {
final void geneTreePostOrderTraversal() {
for( final PhylogenyNodeIterator it = getGeneTree().iteratorPostorder(); it.hasNext(); ) {
final PhylogenyNode g = it.next();
- if ( !g.isExternal() ) {
+ if ( g.isInternal() ) {
PhylogenyNode s1 = g.getChildNode1().getLink();
PhylogenyNode s2 = g.getChildNode2().getLink();
while ( s1 != s2 ) {
/**
* This allows for linking of internal nodes of the species tree (as opposed
* to just external nodes, as in the method it overrides).
- * @throws SdiException
+ * @throws SDIException
*
*/
@Override
- final void linkNodesOfG() throws SdiException {
+ final void linkNodesOfG() throws SDIException {
final Map<String, PhylogenyNode> species_to_node_map = new HashMap<String, PhylogenyNode>();
final List<PhylogenyNode> species_tree_ext_nodes = new ArrayList<PhylogenyNode>();
- final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase( _gene_tree );
- // System.out.println( "comp base is: " + tax_comp_base );
+ _tax_comp_base = determineTaxonomyComparisonBase( _gene_tree );
// Stringified taxonomy is the key, node is the value.
for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
final PhylogenyNode s = iter.next();
species_tree_ext_nodes.add( s );
- final String tax_str = taxonomyToString( s, tax_comp_base );
- if ( !ForesterUtil.isEmpty( tax_str ) ) {
- if ( species_to_node_map.containsKey( tax_str ) ) {
- throw new SdiException( "taxonomy \"" + s + "\" is not unique in species tree" );
+ if ( s.getNodeData().isHasTaxonomy() ) {
+ final String tax_str = taxonomyToString( s, _tax_comp_base );
+ if ( !ForesterUtil.isEmpty( tax_str ) ) {
+ if ( species_to_node_map.containsKey( tax_str ) ) {
+ throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" );
+ }
+ species_to_node_map.put( tax_str, s );
}
- species_to_node_map.put( tax_str, s );
}
}
// Retrieve the reference to the node with a matching stringified taxonomy.
_stripped_gene_tree_nodes.add( g );
}
else {
- throw new SdiException( "gene tree node \"" + g + "\" has no taxonomic data" );
+ throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" );
}
}
else {
- final String tax_str = taxonomyToString( g, tax_comp_base );
+ final String tax_str = taxonomyToString( g, _tax_comp_base );
if ( ForesterUtil.isEmpty( tax_str ) ) {
if ( _strip_gene_tree ) {
_stripped_gene_tree_nodes.add( g );
}
else {
- throw new SdiException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
+ throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
}
}
else {
- final PhylogenyNode s = species_to_node_map.get( tax_str );
+ PhylogenyNode s = species_to_node_map.get( tax_str );
+ if ( ( _tax_comp_base == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
+ && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) {
+ s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str );
+ }
if ( s == null ) {
if ( _strip_gene_tree ) {
_stripped_gene_tree_nodes.add( g );
}
else {
- throw new SdiException( "taxonomy \"" + g.getNodeData().getTaxonomy()
+ throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy()
+ "\" not present in species tree" );
}
}
else {
g.setLink( s );
_mapped_species_tree_nodes.add( s );
- // System.out.println( "setting link of " + g + " to " + s );
}
}
}
} // for loop
if ( _strip_gene_tree ) {
- for( final PhylogenyNode g : _stripped_gene_tree_nodes ) {
- _gene_tree.deleteSubtree( g, true );
+ stripGeneTree();
+ if ( getGeneTree().isEmpty() || ( getGeneTree().getNumberOfExternalNodes() < 2 ) ) {
+ throw new SDIException( "species could not be mapped between gene tree and species tree" );
}
}
if ( _strip_species_tree ) {
- for( final PhylogenyNode s : species_tree_ext_nodes ) {
- if ( !_mapped_species_tree_nodes.contains( s ) ) {
- _species_tree.deleteSubtree( s, true );
+ stripSpeciesTree( species_tree_ext_nodes );
+ }
+ }
+
+ /**
+  * Tries to find a species-tree node for a taxonomy string that had no exact
+  * match, by looking up shortened forms of the string in this order:
+  * <ol>
+  * <li>the three-argument overload with the term " (";</li>
+  * <li>if ForesterUtil.countChars( tax_str, ' ' ) equals 2 (presumably: the
+  * string contains exactly two spaces -- confirm), the trimmed text before the
+  * last space;</li>
+  * <li>the three-argument overload with each of a fixed list of qualifier
+  * terms (" subspecies ", " strain ", ...), in array order.</li>
+  * </ol>
+  * The first attempt that yields a node wins. Attempt 2 records the shortening
+  * itself through addScientificNamesMappedToReducedSpecificity(); attempts 1
+  * and 3 are recorded inside the three-argument overload.
+  *
+  * @param species_to_node_map lookup table from taxonomy string to species-tree node
+  * @param tax_str the taxonomy string that is to be shortened
+  * @return the node found for a shortened form, or null if every attempt fails
+  */
+ private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
+                                                                 final String tax_str ) {
+     PhylogenyNode node = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, " (" );
+     if ( node == null ) {
+         if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) {
+             final String shortened = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim();
+             node = species_to_node_map.get( shortened );
+             if ( node != null ) {
+                 addScientificNamesMappedToReducedSpecificity( tax_str, shortened );
             }
         }
     }
+     if ( node != null ) {
+         return node;
+     }
+     // Last resort: cut the string in front of a known qualifier term.
+     final String[] qualifiers = { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ",
+             " form ", " subform ", " cultivar ", " section ", " subsection " };
+     for( final String qualifier : qualifiers ) {
+         final PhylogenyNode hit = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, qualifier );
+         if ( hit != null ) {
+             return hit;
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Cuts tax_str in front of the first occurrence of term and looks the
+  * trimmed remainder up in species_to_node_map. The lookup is only attempted
+  * when term occurs at an index greater than 4. A successful lookup is
+  * recorded through addScientificNamesMappedToReducedSpecificity().
+  *
+  * @param species_to_node_map lookup table from taxonomy string to species-tree node
+  * @param tax_str the taxonomy string that is to be shortened
+  * @param term the text at which tax_str is cut
+  * @return the node mapped to the shortened string; null when term is absent,
+  *         occurs at index 4 or lower, or the shortened string has no mapping
+  */
+ private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
+                                                                 final String tax_str,
+                                                                 final String term ) {
+     final int term_pos = tax_str.indexOf( term );
+     // NOTE(review): the threshold 4 presumably prevents cutting the string
+     // down to a very short prefix -- confirm the intended minimum length.
+     if ( term_pos <= 4 ) {
+         return null;
+     }
+     final String shortened = tax_str.substring( 0, term_pos ).trim();
+     final PhylogenyNode node = species_to_node_map.get( shortened );
+     if ( node != null ) {
+         addScientificNamesMappedToReducedSpecificity( tax_str, shortened );
+     }
+     return node;
+ }
+
+ /**
+  * Records one name reduction as the string "s1 -> s2" in
+  * _scientific_names_mapped_to_reduced_specificity.
+  *
+  * @param s1 left-hand side of the recorded entry (callers pass the original taxonomy string)
+  * @param s2 right-hand side of the recorded entry (callers pass the shortened string)
+  */
+ private final void addScientificNamesMappedToReducedSpecificity( final String s1, final String s2 ) {
+     final String entry = s1 + " -> " + s2;
+     _scientific_names_mapped_to_reduced_specificity.add( entry );
+ }
+
+ /**
+  * Returns the sorted set of recorded name reductions, one "original -> shortened"
+  * string per entry. The returned object is the internal set itself, not a copy.
+  *
+  * @return the set held in _scientific_names_mapped_to_reduced_specificity
+  */
+ public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
+     return this._scientific_names_mapped_to_reduced_specificity;
+ }
+
+ /**
+  * Returns the taxonomy comparison base held in _tax_comp_base. That field is
+  * assigned in linkNodesOfG(); NOTE(review): it presumably stays null if
+  * linkNodesOfG() has not run -- confirm.
+  *
+  * @return the current value of _tax_comp_base
+  */
+ public TaxonomyComparisonBase getTaxCompBase() {
+     return this._tax_comp_base;
+ }
+
+ /**
+  * Deletes from the species tree every node of the given list that is not
+  * contained in _mapped_species_tree_nodes and appends each deleted node to
+  * _stripped_species_tree_nodes. Afterwards calls clearHashIdToNodeMap() and
+  * externalNodesHaveChanged() on the species tree (presumably to invalidate
+  * cached lookups after the structural change -- confirm).
+  *
+  * @param species_tree_ext_nodes the candidate nodes to check
+  */
+ private void stripSpeciesTree( final List<PhylogenyNode> species_tree_ext_nodes ) {
+     for( final PhylogenyNode ext_node : species_tree_ext_nodes ) {
+         if ( _mapped_species_tree_nodes.contains( ext_node ) ) {
+             continue; // mapped nodes are kept
+         }
+         _species_tree.deleteSubtree( ext_node, true );
+         _stripped_species_tree_nodes.add( ext_node );
+     }
+     _species_tree.clearHashIdToNodeMap();
+     _species_tree.externalNodesHaveChanged();
+ }
+
+ /**
+  * Returns the list of species-tree nodes recorded in
+  * _stripped_species_tree_nodes. The returned object is the internal list
+  * itself, not a copy.
+  *
+  * @return the list held in _stripped_species_tree_nodes
+  */
+ public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
+     return this._stripped_species_tree_nodes;
+ }
+
+ /**
+  * Deletes from the gene tree every node collected in
+  * _stripped_gene_tree_nodes, then calls clearHashIdToNodeMap() and
+  * externalNodesHaveChanged() on the gene tree (presumably to invalidate
+  * cached lookups after the structural change -- confirm).
+  */
+ private void stripGeneTree() {
+     for( final PhylogenyNode unmapped : _stripped_gene_tree_nodes ) {
+         _gene_tree.deleteSubtree( unmapped, true );
+     }
+     _gene_tree.clearHashIdToNodeMap();
+     _gene_tree.externalNodesHaveChanged();
 }
public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
else if ( max == 1 ) {
throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" );
}
- else if ( max == with_sn_count ) {
- return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME;
- }
else if ( max == with_id_count ) {
return SDI.TaxonomyComparisonBase.ID;
}
+ else if ( max == with_sn_count ) {
+ return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME;
+ }
else {
return SDI.TaxonomyComparisonBase.CODE;
}