X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsdi%2FGSDI.java;h=63c6215253b89bc58f5ef013d91167a220504bbf;hb=cec0663378230521f24a851cb1c1c9491026b70a;hp=9da5f54a0aef66f62a65978d56bd0c5cd54654e7;hpb=14f8357072fabd3b089bbe8c5eee7ce2a5be2c6e;p=jalview.git diff --git a/forester/java/src/org/forester/sdi/GSDI.java b/forester/java/src/org/forester/sdi/GSDI.java index 9da5f54..63c6215 100644 --- a/forester/java/src/org/forester/sdi/GSDI.java +++ b/forester/java/src/org/forester/sdi/GSDI.java @@ -31,124 +31,107 @@ import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.SortedSet; import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Event; -import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; import org.forester.util.ForesterUtil; -/* - * Implements our algorithm for speciation - duplication inference (SDI).

- * The initialization is accomplished by:

The recursion part is accomplished by this class' - * method "geneTreePostOrderTraversal(PhylogenyNode)".

Requires JDK 1.5 or - * greater. - * - * @see SDI#linkNodesOfG() - * - * @see Phylogeny#preorderReID(int) - * - * @see - * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny) - * - * @see #geneTreePostOrderTraversal(PhylogenyNode) - * - * @author Christian M. Zmasek - */ -public final class GSDI extends SDI { +public final class GSDI implements GSDII { - private final boolean _most_parsimonious_duplication_model; - private final boolean _strip_gene_tree; - private final boolean _strip_species_tree; - private int _speciation_or_duplication_events_sum; - private int _speciations_sum; - private final List _stripped_gene_tree_nodes; - private final List _stripped_species_tree_nodes; - private final Set _mapped_species_tree_nodes; + private final boolean _most_parsimonious_duplication_model; + private final int _speciation_or_duplication_events_sum; + private final int _speciations_sum; + private final int _duplications_sum; + private final List _stripped_gene_tree_nodes; + private final List _stripped_species_tree_nodes; + private final Set _mapped_species_tree_nodes; + private final TaxonomyComparisonBase _tax_comp_base; + private final SortedSet _scientific_names_mapped_to_reduced_specificity; public GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model, final boolean strip_gene_tree, - final boolean strip_species_tree ) throws SdiException { - super( gene_tree, species_tree ); - _speciation_or_duplication_events_sum = 0; - _speciations_sum = 0; + final boolean strip_species_tree ) throws SDIException { _most_parsimonious_duplication_model = most_parsimonious_duplication_model; - _duplications_sum = 0; - _strip_gene_tree = strip_gene_tree; - _strip_species_tree = strip_species_tree; - _stripped_gene_tree_nodes = new ArrayList(); - _stripped_species_tree_nodes = new ArrayList(); - _mapped_species_tree_nodes = new HashSet(); - getSpeciesTree().preOrderReId(); - linkNodesOfG(); - geneTreePostOrderTraversal(); + if ( gene_tree.getRoot().getNumberOfDescendants() == 3 ) { + gene_tree.reRoot( gene_tree.getRoot().getChildNode( 2 ) ); + } + final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree, + species_tree, + strip_gene_tree, + strip_species_tree ); + _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes(); + _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes(); + _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes(); + _scientific_names_mapped_to_reduced_specificity = nodes_linking_result + .getScientificNamesMappedToReducedSpecificity(); + _tax_comp_base = nodes_linking_result.getTaxCompBase(); + PhylogenyMethods.preOrderReId( species_tree ); + final GSDIsummaryResult gsdi_summary_result = geneTreePostOrderTraversal( gene_tree, + _most_parsimonious_duplication_model ); + _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum(); + _speciations_sum = gsdi_summary_result.getSpeciationsSum(); + _duplications_sum = gsdi_summary_result.getDuplicationsSum(); } - GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model ) - throws SdiException { - this( gene_tree, species_tree, most_parsimonious_duplication_model, false, false ); + public int getDuplicationsSum() { + return _duplications_sum; } - // s is the node on the species tree g maps to. - private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) { - boolean oyako = false; - if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) { - oyako = true; - } - if ( g.getLink().getNumberOfDescendants() == 2 ) { - if ( oyako ) { - g.getNodeData().setEvent( createDuplicationEvent() ); - } - else { - g.getNodeData().setEvent( createSpeciationEvent() ); - } - } - else { - if ( oyako ) { - boolean multiple = false; - final Set set = new HashSet(); - for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) { - n = n.getLink(); - while ( n.getParent() != s ) { - n = n.getParent(); - if ( n.isRoot() ) { - break; - } - } - set.add( n ); - } - for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) { - n = n.getLink(); - while ( n.getParent() != s ) { - n = n.getParent(); - if ( n.isRoot() ) { - break; - } - } - if ( set.contains( n ) ) { - multiple = true; - break; - } - } - if ( multiple ) { - g.getNodeData().setEvent( createDuplicationEvent() ); - } - else { - g.getNodeData().setEvent( createSingleSpeciationOrDuplicationEvent() ); - } - } - else { - g.getNodeData().setEvent( createSpeciationEvent() ); - } + @Override + public Set getMappedExternalSpeciesTreeNodes() { + return _mapped_species_tree_nodes; + } + + @Override + public final SortedSet getReMappedScientificNamesFromGeneTree() { + return _scientific_names_mapped_to_reduced_specificity; + } + + public final int getSpeciationOrDuplicationEventsSum() { + return _speciation_or_duplication_events_sum; + } + + @Override + public final int getSpeciationsSum() { + return _speciations_sum; + } + + @Override + public List getStrippedExternalGeneTreeNodes() { + return _stripped_gene_tree_nodes; + } + + @Override + public List getStrippedSpeciesTreeNodes() { + return _stripped_species_tree_nodes; + } + + @Override + public TaxonomyComparisonBase getTaxCompBase() { + return _tax_comp_base; + } + + @Override + public final String toString() { + final StringBuffer sb = new StringBuffer(); + sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "Speciations sum : " + getSpeciationsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "Duplications sum : " + getDuplicationsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); + if ( !_most_parsimonious_duplication_model ) { + sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); } + return sb.toString(); } /** @@ -159,12 +142,21 @@ public final class GSDI extends SDI { * Preconditions: Mapping M for external nodes must have been calculated and * the species tree must be labeled in preorder. *

+ * @return + * @throws SDIException * */ - final void geneTreePostOrderTraversal() { - for( final PhylogenyNodeIterator it = getGeneTree().iteratorPostorder(); it.hasNext(); ) { + final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree, + final boolean most_parsimonious_duplication_model ) + throws SDIException { + final GSDIsummaryResult res = new GSDIsummaryResult(); + for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) { final PhylogenyNode g = it.next(); - if ( !g.isExternal() ) { + if ( g.isInternal() ) { + if ( g.getNumberOfDescendants() != 2 ) { + throw new SDIException( "gene tree contains internal node with " + g.getNumberOfDescendants() + + " descendents" ); + } PhylogenyNode s1 = g.getChildNode1().getLink(); PhylogenyNode s2 = g.getChildNode2().getLink(); while ( s1 != s2 ) { @@ -176,180 +168,257 @@ public final class GSDI extends SDI { } } g.setLink( s1 ); - determineEvent( s1, g ); + determineEvent( s1, g, most_parsimonious_duplication_model, res ); } } + return res; } - private final Event createDuplicationEvent() { - final Event event = Event.createSingleDuplicationEvent(); - ++_duplications_sum; - return event; - } - - private final Event createSingleSpeciationOrDuplicationEvent() { - final Event event = Event.createSingleSpeciationOrDuplicationEvent(); - ++_speciation_or_duplication_events_sum; - return event; - } - - private final Event createSpeciationEvent() { - final Event event = Event.createSingleSpeciationEvent(); - ++_speciations_sum; - return event; - } - - public final int getSpeciationOrDuplicationEventsSum() { - return _speciation_or_duplication_events_sum; - } - - public final int getSpeciationsSum() { - return _speciations_sum; + final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree, + final Phylogeny species_tree, + final boolean strip_gene_tree, + final boolean strip_species_tree ) throws SDIException { + final TaxonomyComparisonBase tax_comp_base = SDIutil.determineTaxonomyComparisonBase( gene_tree ); + if ( tax_comp_base == null ) { + throw new RuntimeException( "failed to establish taxonomy linking base (taxonomy linking base is null)" ); + } + return linkNodesOfG( gene_tree, species_tree, tax_comp_base, strip_gene_tree, strip_species_tree ); } /** * This allows for linking of internal nodes of the species tree (as opposed * to just external nodes, as in the method it overrides. - * @throws SdiException + * If TaxonomyComparisonBase is null, it will try to determine it. + * @throws SDIException * */ - @Override - final void linkNodesOfG() throws SdiException { + final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree, + final Phylogeny species_tree, + final TaxonomyComparisonBase tax_comp_base, + final boolean strip_gene_tree, + final boolean strip_species_tree ) throws SDIException { + if ( tax_comp_base == null ) { + throw new IllegalArgumentException( "taxonomy linking base is null" ); + } final Map species_to_node_map = new HashMap(); final List species_tree_ext_nodes = new ArrayList(); - final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase( _gene_tree ); - // System.out.println( "comp base is: " + tax_comp_base ); + final NodesLinkingResult res = new NodesLinkingResult(); + res.setTaxCompBase( tax_comp_base ); // Stringyfied taxonomy is the key, node is the value. - for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { + for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode s = iter.next(); species_tree_ext_nodes.add( s ); - final String tax_str = taxonomyToString( s, tax_comp_base ); - if ( !ForesterUtil.isEmpty( tax_str ) ) { - if ( species_to_node_map.containsKey( tax_str ) ) { - throw new SdiException( "taxonomy \"" + s + "\" is not unique in species tree" ); + if ( s.getNodeData().isHasTaxonomy() ) { + final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() ); + if ( !ForesterUtil.isEmpty( tax_str ) ) { + if ( species_to_node_map.containsKey( tax_str ) ) { + throw new SDIException( "taxonomy \"" + tax_str + "\" is not unique in species tree (using " + + res.getTaxCompBase() + " for linking to gene tree)" ); + } + species_to_node_map.put( tax_str, s ); } - species_to_node_map.put( tax_str, s ); } } // Retrieve the reference to the node with a matching stringyfied taxonomy. - for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode g = iter.next(); if ( !g.getNodeData().isHasTaxonomy() ) { - if ( _strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + if ( strip_gene_tree ) { + res.getStrippedGeneTreeNodes().add( g ); } else { - throw new SdiException( "gene tree node \"" + g + "\" has no taxonomic data" ); + throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" ); } } else { - final String tax_str = taxonomyToString( g, tax_comp_base ); + final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() ); if ( ForesterUtil.isEmpty( tax_str ) ) { - if ( _strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + if ( strip_gene_tree ) { + res.getStrippedGeneTreeNodes().add( g ); } else { - throw new SdiException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" ); + throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" ); } } else { - final PhylogenyNode s = species_to_node_map.get( tax_str ); + PhylogenyNode s = species_to_node_map.get( tax_str ); + if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null ) + && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) { + s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + res.getScientificNamesMappedToReducedSpecificity() ); + } if ( s == null ) { - if ( _strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + if ( strip_gene_tree ) { + res.getStrippedGeneTreeNodes().add( g ); } else { - throw new SdiException( "taxonomy \"" + g.getNodeData().getTaxonomy() + throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy() + "\" not present in species tree" ); } } else { g.setLink( s ); - _mapped_species_tree_nodes.add( s ); - // System.out.println( "setting link of " + g + " to " + s ); + res.getMappedSpeciesTreeNodes().add( s ); } } } } // for loop - if ( _strip_gene_tree ) { - for( final PhylogenyNode g : _stripped_gene_tree_nodes ) { - _gene_tree.deleteSubtree( g, true ); + if ( strip_gene_tree ) { + stripTree( gene_tree, res.getStrippedGeneTreeNodes() ); + if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) { + throw new SDIException( "species could not be mapped between gene tree and species tree (based on " + + res.getTaxCompBase() + ")" ); } } - if ( _strip_species_tree ) { - for( final PhylogenyNode s : species_tree_ext_nodes ) { - if ( !_mapped_species_tree_nodes.contains( s ) ) { - _species_tree.deleteSubtree( s, true ); - } - } + if ( strip_species_tree ) { + stripSpeciesTree( species_tree, species_tree_ext_nodes, res ); } + return res; } - public Set getMappedExternalSpeciesTreeNodes() { - return _mapped_species_tree_nodes; + private final static void addScientificNamesMappedToReducedSpecificity( final String s1, + final String s2, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { + scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 ); } - public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) { - int with_id_count = 0; - int with_code_count = 0; - int with_sn_count = 0; - int max = 0; - for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) { - final PhylogenyNode g = iter.next(); - if ( g.getNodeData().isHasTaxonomy() ) { - final Taxonomy tax = g.getNodeData().getTaxonomy(); - if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) { - if ( ++with_id_count > max ) { - max = with_id_count; + private final static void determineEvent( final PhylogenyNode s, + final PhylogenyNode g, + final boolean most_parsimonious_duplication_model, + final GSDIsummaryResult res ) { + boolean oyako = false; + if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) { + oyako = true; + } + if ( g.getLink().getNumberOfDescendants() == 2 ) { + if ( oyako ) { + g.getNodeData().setEvent( Event.createSingleDuplicationEvent() ); + res.increaseDuplicationsSum(); + } + else { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } + } + else { + if ( oyako ) { + final Set set = new HashSet(); + for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) { + n = n.getLink(); + while ( n.getParent() != s ) { + n = n.getParent(); + if ( n.isRoot() ) { + break; + } } + set.add( n ); } - if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { - if ( ++with_code_count > max ) { - max = with_code_count; + boolean multiple = false; + for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) { + n = n.getLink(); + while ( n.getParent() != s ) { + n = n.getParent(); + if ( n.isRoot() ) { + break; + } + } + if ( set.contains( n ) ) { + multiple = true; + break; } } - if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) { - if ( ++with_sn_count > max ) { - max = with_sn_count; + if ( multiple ) { + g.getNodeData().setEvent( Event.createSingleDuplicationEvent() ); + res.increaseDuplicationsSum(); + } + else { + if ( most_parsimonious_duplication_model ) { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } + else { + g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() ); + res.increaseSpeciationOrDuplicationEventsSum(); } } } + else { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } } - if ( max == 0 ) { - throw new IllegalArgumentException( "gene tree has no taxonomic data" ); - } - else if ( max == 1 ) { - throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" ); - } - else if ( max == with_sn_count ) { - return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME; - } - else if ( max == with_id_count ) { - return SDI.TaxonomyComparisonBase.ID; + } + + private final static void stripSpeciesTree( final Phylogeny species_tree, + final List species_tree_ext_nodes, + final NodesLinkingResult res ) { + for( final PhylogenyNode s : species_tree_ext_nodes ) { + if ( !res.getMappedSpeciesTreeNodes().contains( s ) ) { + species_tree.deleteSubtree( s, true ); + res.getStrippedSpeciesTreeNodes().add( s ); + } } - else { - return SDI.TaxonomyComparisonBase.CODE; + species_tree.clearHashIdToNodeMap(); + species_tree.externalNodesHaveChanged(); + } + + private final static void stripTree( final Phylogeny phy, final List strip_nodes ) { + for( final PhylogenyNode g : strip_nodes ) { + phy.deleteSubtree( g, true ); } + phy.clearHashIdToNodeMap(); + phy.externalNodesHaveChanged(); } - public List getStrippedExternalGeneTreeNodes() { - return _stripped_gene_tree_nodes; + private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { + PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + " (", + scientific_names_mapped_to_reduced_specificity ); + if ( s == null ) { + if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) { + final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim(); + s = species_to_node_map.get( new_tax_str ); + if ( s != null ) { + addScientificNamesMappedToReducedSpecificity( tax_str, + new_tax_str, + scientific_names_mapped_to_reduced_specificity ); + } + } + } + if ( s == null ) { + for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ", + " form ", " subform ", " cultivar ", " section ", " subsection " } ) { + s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + t, + scientific_names_mapped_to_reduced_specificity ); + if ( s != null ) { + break; + } + } + } + return s; } - @Override - public final String toString() { - final StringBuffer sb = new StringBuffer(); - sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model ); - sb.append( ForesterUtil.getLineSeparator() ); - sb.append( "Speciations sum : " + getSpeciationsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); - sb.append( "Duplications sum : " + getDuplicationsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); - if ( !_most_parsimonious_duplication_model ) { - sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); + private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str, + final String term, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { + final int i = tax_str.indexOf( term ); + if ( i > 4 ) { + final String new_tax_str = tax_str.substring( 0, i ).trim(); + final PhylogenyNode s = species_to_node_map.get( new_tax_str ); + if ( s != null ) { + addScientificNamesMappedToReducedSpecificity( tax_str, + new_tax_str, + scientific_names_mapped_to_reduced_specificity ); + } + return s; } - sb.append( "mapping cost L : " + computeMappingCostL() ); - return sb.toString(); + return null; } }