X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsdi%2FGSDI.java;h=19e2b7dcab13689c5eb5f25703d16f445d7c779e;hb=6005bacbd8aecc0d320b70568a24d321a30bc85c;hp=9da5f54a0aef66f62a65978d56bd0c5cd54654e7;hpb=14f8357072fabd3b089bbe8c5eee7ce2a5be2c6e;p=jalview.git diff --git a/forester/java/src/org/forester/sdi/GSDI.java b/forester/java/src/org/forester/sdi/GSDI.java index 9da5f54..19e2b7d 100644 --- a/forester/java/src/org/forester/sdi/GSDI.java +++ b/forester/java/src/org/forester/sdi/GSDI.java @@ -31,8 +31,11 @@ import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Event; import org.forester.phylogeny.data.Taxonomy; @@ -61,7 +64,7 @@ import org.forester.util.ForesterUtil; * * @author Christian M. Zmasek */ -public final class GSDI extends SDI { +public class GSDI extends SDI { private final boolean _most_parsimonious_duplication_model; private final boolean _strip_gene_tree; @@ -71,12 +74,14 @@ public final class GSDI extends SDI { private final List _stripped_gene_tree_nodes; private final List _stripped_species_tree_nodes; private final Set _mapped_species_tree_nodes; + private TaxonomyComparisonBase _tax_comp_base; + private final SortedSet _scientific_names_mapped_to_reduced_specificity; public GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model, final boolean strip_gene_tree, - final boolean strip_species_tree ) throws SdiException { + final boolean strip_species_tree ) throws SDIException { super( gene_tree, species_tree ); _speciation_or_duplication_events_sum = 0; _speciations_sum = 0; @@ -87,15 +92,36 @@ public final class GSDI extends SDI { _stripped_gene_tree_nodes = new ArrayList(); _stripped_species_tree_nodes = new ArrayList(); _mapped_species_tree_nodes = new HashSet(); - getSpeciesTree().preOrderReId(); + _scientific_names_mapped_to_reduced_specificity = new TreeSet(); linkNodesOfG(); + PhylogenyMethods.preOrderReId( getSpeciesTree() ); geneTreePostOrderTraversal(); } GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model ) - throws SdiException { + throws SDIException { this( gene_tree, species_tree, most_parsimonious_duplication_model, false, false ); } + + public GSDI( final Phylogeny gene_tree, + final Phylogeny species_tree, + final boolean most_parsimonious_duplication_model, + final boolean strip_gene_tree, + final boolean strip_species_tree, + int x ) throws SDIException { + super( gene_tree, species_tree ); + _speciation_or_duplication_events_sum = 0; + _speciations_sum = 0; + _most_parsimonious_duplication_model = most_parsimonious_duplication_model; + _duplications_sum = 0; + _strip_gene_tree = strip_gene_tree; + _strip_species_tree = strip_species_tree; + _stripped_gene_tree_nodes = new ArrayList(); + _stripped_species_tree_nodes = new ArrayList(); + _mapped_species_tree_nodes = new HashSet(); + _scientific_names_mapped_to_reduced_specificity = new TreeSet(); + + } // s is the node on the species tree g maps to. private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) { @@ -113,7 +139,6 @@ public final class GSDI extends SDI { } else { if ( oyako ) { - boolean multiple = false; final Set set = new HashSet(); for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) { n = n.getLink(); @@ -125,6 +150,7 @@ public final class GSDI extends SDI { } set.add( n ); } + boolean multiple = false; for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) { n = n.getLink(); while ( n.getParent() != s ) { @@ -142,7 +168,12 @@ public final class GSDI extends SDI { g.getNodeData().setEvent( createDuplicationEvent() ); } else { - g.getNodeData().setEvent( createSingleSpeciationOrDuplicationEvent() ); + if ( _most_parsimonious_duplication_model ) { + g.getNodeData().setEvent( createSpeciationEvent() ); + } + else { + g.getNodeData().setEvent( createSingleSpeciationOrDuplicationEvent() ); + } } } else { @@ -164,7 +195,7 @@ public final class GSDI extends SDI { final void geneTreePostOrderTraversal() { for( final PhylogenyNodeIterator it = getGeneTree().iteratorPostorder(); it.hasNext(); ) { final PhylogenyNode g = it.next(); - if ( !g.isExternal() ) { + if ( g.isInternal() ) { PhylogenyNode s1 = g.getChildNode1().getLink(); PhylogenyNode s2 = g.getChildNode2().getLink(); while ( s1 != s2 ) { @@ -210,25 +241,26 @@ public final class GSDI extends SDI { /** * This allows for linking of internal nodes of the species tree (as opposed * to just external nodes, as in the method it overrides. - * @throws SdiException + * @throws SDIException * */ @Override - final void linkNodesOfG() throws SdiException { + final void linkNodesOfG() throws SDIException { final Map species_to_node_map = new HashMap(); final List species_tree_ext_nodes = new ArrayList(); - final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase( _gene_tree ); - // System.out.println( "comp base is: " + tax_comp_base ); + _tax_comp_base = determineTaxonomyComparisonBase( _gene_tree ); // Stringyfied taxonomy is the key, node is the value. for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode s = iter.next(); species_tree_ext_nodes.add( s ); - final String tax_str = taxonomyToString( s, tax_comp_base ); - if ( !ForesterUtil.isEmpty( tax_str ) ) { - if ( species_to_node_map.containsKey( tax_str ) ) { - throw new SdiException( "taxonomy \"" + s + "\" is not unique in species tree" ); + if ( s.getNodeData().isHasTaxonomy() ) { + final String tax_str = taxonomyToString( s, _tax_comp_base ); + if ( !ForesterUtil.isEmpty( tax_str ) ) { + if ( species_to_node_map.containsKey( tax_str ) ) { + throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" ); + } + species_to_node_map.put( tax_str, s ); } - species_to_node_map.put( tax_str, s ); } } // Retrieve the reference to the node with a matching stringyfied taxonomy. @@ -239,50 +271,124 @@ public final class GSDI extends SDI { _stripped_gene_tree_nodes.add( g ); } else { - throw new SdiException( "gene tree node \"" + g + "\" has no taxonomic data" ); + throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" ); } } else { - final String tax_str = taxonomyToString( g, tax_comp_base ); + final String tax_str = taxonomyToString( g, _tax_comp_base ); if ( ForesterUtil.isEmpty( tax_str ) ) { if ( _strip_gene_tree ) { _stripped_gene_tree_nodes.add( g ); } else { - throw new SdiException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" ); + throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" ); } } else { - final PhylogenyNode s = species_to_node_map.get( tax_str ); + PhylogenyNode s = species_to_node_map.get( tax_str ); + if ( ( _tax_comp_base == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null ) + && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) { + s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str ); + } if ( s == null ) { if ( _strip_gene_tree ) { _stripped_gene_tree_nodes.add( g ); } else { - throw new SdiException( "taxonomy \"" + g.getNodeData().getTaxonomy() + throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy() + "\" not present in species tree" ); } } else { g.setLink( s ); _mapped_species_tree_nodes.add( s ); - // System.out.println( "setting link of " + g + " to " + s ); } } } } // for loop if ( _strip_gene_tree ) { - for( final PhylogenyNode g : _stripped_gene_tree_nodes ) { - _gene_tree.deleteSubtree( g, true ); + stripGeneTree(); + if ( getGeneTree().isEmpty() || ( getGeneTree().getNumberOfExternalNodes() < 2 ) ) { + throw new SDIException( "species could not be mapped between gene tree and species tree" ); } } if ( _strip_species_tree ) { - for( final PhylogenyNode s : species_tree_ext_nodes ) { - if ( !_mapped_species_tree_nodes.contains( s ) ) { - _species_tree.deleteSubtree( s, true ); + stripSpeciesTree( species_tree_ext_nodes ); + } + } + + private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str ) { + PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, " (" ); + if ( s == null ) { + if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) { + final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim(); + s = species_to_node_map.get( new_tax_str ); + if ( s != null ) { + addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str ); } } } + if ( s == null ) { + for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ", + " form ", " subform ", " cultivar ", " section ", " subsection " } ) { + s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, t ); + if ( s != null ) { + break; + } + } + } + return s; + } + + private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str, + final String term ) { + final int i = tax_str.indexOf( term ); + if ( i > 4 ) { + final String new_tax_str = tax_str.substring( 0, i ).trim(); + final PhylogenyNode s = species_to_node_map.get( new_tax_str ); + if ( s != null ) { + addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str ); + } + return s; + } + return null; + } + + private final void addScientificNamesMappedToReducedSpecificity( final String s1, final String s2 ) { + _scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 ); + } + + public final SortedSet getReMappedScientificNamesFromGeneTree() { + return _scientific_names_mapped_to_reduced_specificity; + } + + public TaxonomyComparisonBase getTaxCompBase() { + return _tax_comp_base; + } + + private void stripSpeciesTree( final List species_tree_ext_nodes ) { + for( final PhylogenyNode s : species_tree_ext_nodes ) { + if ( !_mapped_species_tree_nodes.contains( s ) ) { + _species_tree.deleteSubtree( s, true ); + _stripped_species_tree_nodes.add( s ); + } + } + _species_tree.clearHashIdToNodeMap(); + _species_tree.externalNodesHaveChanged(); + } + + public List getStrippedSpeciesTreeNodes() { + return _stripped_species_tree_nodes; + } + + private void stripGeneTree() { + for( final PhylogenyNode g : _stripped_gene_tree_nodes ) { + _gene_tree.deleteSubtree( g, true ); + } + _gene_tree.clearHashIdToNodeMap(); + _gene_tree.externalNodesHaveChanged(); } public Set getMappedExternalSpeciesTreeNodes() { @@ -321,12 +427,12 @@ public final class GSDI extends SDI { else if ( max == 1 ) { throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" ); } - else if ( max == with_sn_count ) { - return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME; - } else if ( max == with_id_count ) { return SDI.TaxonomyComparisonBase.ID; } + else if ( max == with_sn_count ) { + return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME; + } else { return SDI.TaxonomyComparisonBase.CODE; }