X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsdi%2FGSDI.java;h=2419d131d03bde83142298d10323a85038230aea;hb=e9ca0dc1764303d53fc6b9b087f33cdee53726ea;hp=0e48d32a2ae83d1551d1fe26fecd469a07bc68a0;hpb=d30f8d155dee1fbfa17946cf80a203cc886ffdd6;p=jalview.git diff --git a/forester/java/src/org/forester/sdi/GSDI.java b/forester/java/src/org/forester/sdi/GSDI.java index 0e48d32..2419d13 100644 --- a/forester/java/src/org/forester/sdi/GSDI.java +++ b/forester/java/src/org/forester/sdi/GSDI.java @@ -31,8 +31,11 @@ import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Event; import org.forester.phylogeny.data.Taxonomy; @@ -72,6 +75,7 @@ public final class GSDI extends SDI { private final List _stripped_species_tree_nodes; private final Set _mapped_species_tree_nodes; private TaxonomyComparisonBase _tax_comp_base; + private final SortedSet _scientific_names_mapped_to_reduced_specificity; public GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, @@ -88,8 +92,9 @@ public final class GSDI extends SDI { _stripped_gene_tree_nodes = new ArrayList(); _stripped_species_tree_nodes = new ArrayList(); _mapped_species_tree_nodes = new HashSet(); - getSpeciesTree().preOrderReId(); + _scientific_names_mapped_to_reduced_specificity = new TreeSet(); linkNodesOfG(); + PhylogenyMethods.preOrderReId( getSpeciesTree() ); geneTreePostOrderTraversal(); } @@ -228,12 +233,14 @@ public final class GSDI extends SDI { for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode s = iter.next(); species_tree_ext_nodes.add( s ); - final String tax_str = taxonomyToString( s, _tax_comp_base ); - if ( !ForesterUtil.isEmpty( tax_str ) ) { - if ( species_to_node_map.containsKey( tax_str ) ) { - throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" ); + if ( s.getNodeData().isHasTaxonomy() ) { + final String tax_str = taxonomyToString( s, _tax_comp_base ); + if ( !ForesterUtil.isEmpty( tax_str ) ) { + if ( species_to_node_map.containsKey( tax_str ) ) { + throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" ); + } + species_to_node_map.put( tax_str, s ); } - species_to_node_map.put( tax_str, s ); } } // Retrieve the reference to the node with a matching stringyfied taxonomy. @@ -258,7 +265,11 @@ public final class GSDI extends SDI { } } else { - final PhylogenyNode s = species_to_node_map.get( tax_str ); + PhylogenyNode s = species_to_node_map.get( tax_str ); + if ( ( _tax_comp_base == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null ) + && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) { + s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str ); + } if ( s == null ) { if ( _strip_gene_tree ) { _stripped_gene_tree_nodes.add( g ); @@ -277,12 +288,62 @@ public final class GSDI extends SDI { } // for loop if ( _strip_gene_tree ) { stripGeneTree(); + if ( getGeneTree().isEmpty() || ( getGeneTree().getNumberOfExternalNodes() < 2 ) ) { + throw new SDIException( "species could not be mapped between gene tree and species tree" ); + } } if ( _strip_species_tree ) { stripSpeciesTree( species_tree_ext_nodes ); } } + private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str ) { + PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, " (" ); + if ( s == null ) { + if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) { + final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim(); + s = species_to_node_map.get( new_tax_str ); + if ( s != null ) { + addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str ); + } + } + } + if ( s == null ) { + for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ", + " form ", " subform ", " cultivar ", " section ", " subsection " } ) { + s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, t ); + if ( s != null ) { + break; + } + } + } + return s; + } + + private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str, + final String term ) { + final int i = tax_str.indexOf( term ); + if ( i > 4 ) { + final String new_tax_str = tax_str.substring( 0, i ).trim(); + final PhylogenyNode s = species_to_node_map.get( new_tax_str ); + if ( s != null ) { + addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str ); + } + return s; + } + return null; + } + + private final void addScientificNamesMappedToReducedSpecificity( final String s1, final String s2 ) { + _scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 ); + } + + public final SortedSet getReMappedScientificNamesFromGeneTree() { + return _scientific_names_mapped_to_reduced_specificity; + } + public TaxonomyComparisonBase getTaxCompBase() { return _tax_comp_base; } @@ -294,6 +355,8 @@ public final class GSDI extends SDI { _stripped_species_tree_nodes.add( s ); } } + _species_tree.clearHashIdToNodeMap(); + _species_tree.externalNodesHaveChanged(); } public List getStrippedSpeciesTreeNodes() { @@ -304,6 +367,8 @@ public final class GSDI extends SDI { for( final PhylogenyNode g : _stripped_gene_tree_nodes ) { _gene_tree.deleteSubtree( g, true ); } + _gene_tree.clearHashIdToNodeMap(); + _gene_tree.externalNodesHaveChanged(); } public Set getMappedExternalSpeciesTreeNodes() { @@ -342,12 +407,12 @@ public final class GSDI extends SDI { else if ( max == 1 ) { throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" ); } - else if ( max == with_sn_count ) { - return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME; - } else if ( max == with_id_count ) { return SDI.TaxonomyComparisonBase.ID; } + else if ( max == with_sn_count ) { + return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME; + } else { return SDI.TaxonomyComparisonBase.CODE; }