import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.data.Event;
import org.forester.phylogeny.data.Taxonomy;
private final List<PhylogenyNode> _stripped_species_tree_nodes;
private final Set<PhylogenyNode> _mapped_species_tree_nodes;
private TaxonomyComparisonBase _tax_comp_base;
+ private final SortedSet<String> _scientific_names_mapped_to_reduced_specificity;
public GSDI( final Phylogeny gene_tree,
final Phylogeny species_tree,
_stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
_stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
_mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
- getSpeciesTree().preOrderReId();
+ _scientific_names_mapped_to_reduced_specificity = new TreeSet<String>();
linkNodesOfG();
+ PhylogenyMethods.preOrderReId( getSpeciesTree() );
geneTreePostOrderTraversal();
}
for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
final PhylogenyNode s = iter.next();
species_tree_ext_nodes.add( s );
- final String tax_str = taxonomyToString( s, _tax_comp_base );
- if ( !ForesterUtil.isEmpty( tax_str ) ) {
- if ( species_to_node_map.containsKey( tax_str ) ) {
- throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" );
+ if ( s.getNodeData().isHasTaxonomy() ) {
+ final String tax_str = taxonomyToString( s, _tax_comp_base );
+ if ( !ForesterUtil.isEmpty( tax_str ) ) {
+ if ( species_to_node_map.containsKey( tax_str ) ) {
+ throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" );
+ }
+ species_to_node_map.put( tax_str, s );
}
- species_to_node_map.put( tax_str, s );
}
}
// Retrieve the reference to the node with a matching stringified taxonomy.
}
}
else {
- final PhylogenyNode s = species_to_node_map.get( tax_str );
+ PhylogenyNode s = species_to_node_map.get( tax_str );
+ if ( ( _tax_comp_base == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
+ && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) {
+ s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str );
+ }
if ( s == null ) {
if ( _strip_gene_tree ) {
_stripped_gene_tree_nodes.add( g );
} // for loop
if ( _strip_gene_tree ) {
stripGeneTree();
+ if ( getGeneTree().isEmpty() || ( getGeneTree().getNumberOfExternalNodes() < 2 ) ) {
+ throw new SDIException( "species could not be mapped between gene tree and species tree" );
+ }
}
if ( _strip_species_tree ) {
stripSpeciesTree( species_tree_ext_nodes );
}
}
+ /**
+  * Tries to find a species tree node for a taxonomy string that has no direct
+  * entry in the map, by looking up progressively shortened versions of it.
+  * <p>
+  * Attempts are made in this order, stopping at the first hit:
+  * <ol>
+  * <li>everything before the first {@code " ("};</li>
+  * <li>if the string contains exactly two spaces, everything before the last
+  * space;</li>
+  * <li>everything before the first occurrence of each rank-like term
+  * ({@code " subspecies "}, {@code " strain "}, ...), in the listed order.</li>
+  * </ol>
+  * Every successful shortened lookup is recorded through
+  * {@code addScientificNamesMappedToReducedSpecificity}.
+  *
+  * @param species_to_node_map map from taxonomy string to species tree node
+  * @param tax_str the taxonomy string that could not be mapped as-is
+  * @return the node found under a shortened key, or {@code null} if none matched
+  */
+ private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
+                                                                 final String tax_str ) {
+     // 1) Cut at an opening parenthesis.
+     final PhylogenyNode before_paren = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, " (" );
+     if ( before_paren != null ) {
+         return before_paren;
+     }
+     // 2) Exactly two spaces: drop the trailing token.
+     if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) {
+         final int last_space = tax_str.lastIndexOf( ' ' );
+         final String shortened = tax_str.substring( 0, last_space ).trim();
+         final PhylogenyNode without_last_token = species_to_node_map.get( shortened );
+         if ( without_last_token != null ) {
+             addScientificNamesMappedToReducedSpecificity( tax_str, shortened );
+             return without_last_token;
+         }
+     }
+     // 3) Cut at the first matching rank-like term.
+     final String[] rank_terms = { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ",
+             " form ", " subform ", " cultivar ", " section ", " subsection " };
+     for( final String rank_term : rank_terms ) {
+         final PhylogenyNode before_term = tryMapByRemovingOverlySpecificData( species_to_node_map,
+                                                                               tax_str,
+                                                                               rank_term );
+         if ( before_term != null ) {
+             return before_term;
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Looks up the part of {@code tax_str} that precedes the first occurrence of
+  * {@code term}.
+  * <p>
+  * The lookup is only attempted when {@code term} starts at an index greater
+  * than 4; otherwise (including when {@code term} is absent) {@code null} is
+  * returned. A successful lookup is recorded through
+  * {@code addScientificNamesMappedToReducedSpecificity}.
+  *
+  * @param species_to_node_map map from taxonomy string to species tree node
+  * @param tax_str the full taxonomy string
+  * @param term the marker at which {@code tax_str} is cut
+  * @return the node stored under the trimmed prefix, or {@code null}
+  */
+ private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
+                                                                 final String tax_str,
+                                                                 final String term ) {
+     final int term_start = tax_str.indexOf( term );
+     if ( term_start <= 4 ) {
+         // Term missing, or the remaining prefix would be 4 characters or fewer.
+         return null;
+     }
+     final String prefix = tax_str.substring( 0, term_start ).trim();
+     final PhylogenyNode match = species_to_node_map.get( prefix );
+     if ( match == null ) {
+         return null;
+     }
+     addScientificNamesMappedToReducedSpecificity( tax_str, prefix );
+     return match;
+ }
+
+ /**
+  * Records one re-mapping as the string {@code s1 + " -> " + s2} in the sorted
+  * set of re-mapped scientific names.
+  *
+  * @param s1 left-hand side of the recorded entry (callers pass the original name)
+  * @param s2 right-hand side of the recorded entry (callers pass the shortened name)
+  */
+ private final void addScientificNamesMappedToReducedSpecificity( final String s1, final String s2 ) {
+     final String entry = s1 + " -> " + s2;
+     _scientific_names_mapped_to_reduced_specificity.add( entry );
+ }
+
+ /**
+  * Returns the sorted set of {@code "original -> shortened"} entries recorded
+  * while mapping names with reduced specificity.
+  * <p>
+  * The internal set itself is returned, not a copy.
+  *
+  * @return the set of recorded re-mappings
+  */
+ public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
+     return this._scientific_names_mapped_to_reduced_specificity;
+ }
+
/**
 * Returns the taxonomy comparison base currently held by this instance.
 *
 * @return the value of the {@code _tax_comp_base} field
 */
public TaxonomyComparisonBase getTaxCompBase() {
    return this._tax_comp_base;
}
_stripped_species_tree_nodes.add( s );
}
}
+ _species_tree.clearHashIdToNodeMap();
+ _species_tree.externalNodesHaveChanged();
}
public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
for( final PhylogenyNode g : _stripped_gene_tree_nodes ) {
_gene_tree.deleteSubtree( g, true );
}
+ _gene_tree.clearHashIdToNodeMap();
+ _gene_tree.externalNodesHaveChanged();
}
public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
else if ( max == 1 ) {
throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" );
}
- else if ( max == with_sn_count ) {
- return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME;
- }
else if ( max == with_id_count ) {
return SDI.TaxonomyComparisonBase.ID;
}
+ else if ( max == with_sn_count ) {
+ return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME;
+ }
else {
return SDI.TaxonomyComparisonBase.CODE;
}