X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsdi%2FGSDI.java;h=4ab6395af79a9f6928d9863cf88723acbf5770e5;hb=10297bd8b8a4b4ab198a17a42fc6ff24ae2ed49b;hp=fa5265df404affae78791b4d4fd8dbd8d91d9776;hpb=caebe5e946fc15f9042dc70cac985910c6205e3c;p=jalview.git diff --git a/forester/java/src/org/forester/sdi/GSDI.java b/forester/java/src/org/forester/sdi/GSDI.java index fa5265d..4ab6395 100644 --- a/forester/java/src/org/forester/sdi/GSDI.java +++ b/forester/java/src/org/forester/sdi/GSDI.java @@ -21,7 +21,7 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.sdi; @@ -32,139 +32,116 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedSet; -import java.util.TreeSet; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Event; -import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; import org.forester.util.ForesterUtil; -/* - * Implements our algorithm for speciation - duplication inference (SDI).

- * The initialization is accomplished by:

The recursion part is accomplished by this class' - * method "geneTreePostOrderTraversal(PhylogenyNode)".

Requires JDK 1.5 or - * greater. - * - * @see SDI#linkNodesOfG() - * - * @see Phylogeny#preorderReID(int) - * - * @see - * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny) - * - * @see #geneTreePostOrderTraversal(PhylogenyNode) - * - * @author Christian M. Zmasek - */ -public class GSDI extends SDI { +public final class GSDI implements GSDII { - private final boolean _most_parsimonious_duplication_model; - protected int _speciation_or_duplication_events_sum; - protected int _speciations_sum; - private final List _stripped_gene_tree_nodes; - private final List _stripped_species_tree_nodes; - private final Set _mapped_species_tree_nodes; - private TaxonomyComparisonBase _tax_comp_base; - private final SortedSet _scientific_names_mapped_to_reduced_specificity; + private final boolean _most_parsimonious_duplication_model; + private final int _speciation_or_duplication_events_sum; + private final int _speciations_sum; + private final int _duplications_sum; + private final List _stripped_gene_tree_nodes; + private final List _stripped_species_tree_nodes; + private final Set _mapped_species_tree_nodes; + private final TaxonomyComparisonBase _tax_comp_base; + private final SortedSet _scientific_names_mapped_to_reduced_specificity; public GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model, final boolean strip_gene_tree, final boolean strip_species_tree ) throws SDIException { - super( gene_tree, species_tree ); - _speciation_or_duplication_events_sum = 0; - _speciations_sum = 0; + this( gene_tree, species_tree, most_parsimonious_duplication_model, strip_gene_tree, strip_species_tree, true ); + } + + public GSDI( final Phylogeny gene_tree, + final Phylogeny species_tree, + final boolean most_parsimonious_duplication_model, + final boolean strip_gene_tree, + final boolean strip_species_tree, + final boolean transfer_taxonomy ) throws SDIException { _most_parsimonious_duplication_model = most_parsimonious_duplication_model; - _duplications_sum = 0; - _stripped_gene_tree_nodes = new ArrayList(); - _stripped_species_tree_nodes = new ArrayList(); - _mapped_species_tree_nodes = new HashSet(); - _scientific_names_mapped_to_reduced_specificity = new TreeSet(); - linkNodesOfG( null, strip_gene_tree, strip_species_tree ); - PhylogenyMethods.preOrderReId( getSpeciesTree() ); - geneTreePostOrderTraversal(); + if ( gene_tree.getRoot().getNumberOfDescendants() == 3 ) { + gene_tree.reRoot( gene_tree.getRoot().getChildNode( 2 ) ); + } + final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree, + species_tree, + strip_gene_tree, + strip_species_tree ); + _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes(); + _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes(); + _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes(); + _scientific_names_mapped_to_reduced_specificity = nodes_linking_result + .getScientificNamesMappedToReducedSpecificity(); + _tax_comp_base = nodes_linking_result.getTaxCompBase(); + PhylogenyMethods.preOrderReId( species_tree ); + final GSDIsummaryResult gsdi_summary_result = geneTreePostOrderTraversal( gene_tree, + _most_parsimonious_duplication_model, + transfer_taxonomy ); + _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum(); + _speciations_sum = gsdi_summary_result.getSpeciationsSum(); + _duplications_sum = gsdi_summary_result.getDuplicationsSum(); } - // Used by GSDIR - protected GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean strip_gene_tree ) - throws SDIException { - super( gene_tree, species_tree ); - _speciation_or_duplication_events_sum = -1; - _speciations_sum = 0; - _most_parsimonious_duplication_model = true; - _duplications_sum = 0; - _stripped_gene_tree_nodes = new ArrayList(); - _stripped_species_tree_nodes = new ArrayList(); - _mapped_species_tree_nodes = new HashSet(); - _scientific_names_mapped_to_reduced_specificity = new TreeSet(); + public int getDuplicationsSum() { + return _duplications_sum; } - // s is the node on the species tree g maps to. - private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) { - boolean oyako = false; - if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) { - oyako = true; - } - if ( g.getLink().getNumberOfDescendants() == 2 ) { - if ( oyako ) { - g.getNodeData().setEvent( createDuplicationEvent() ); - } - else { - g.getNodeData().setEvent( createSpeciationEvent() ); - } - } - else { - if ( oyako ) { - final Set set = new HashSet(); - for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) { - n = n.getLink(); - while ( n.getParent() != s ) { - n = n.getParent(); - if ( n.isRoot() ) { - break; - } - } - set.add( n ); - } - boolean multiple = false; - for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) { - n = n.getLink(); - while ( n.getParent() != s ) { - n = n.getParent(); - if ( n.isRoot() ) { - break; - } - } - if ( set.contains( n ) ) { - multiple = true; - break; - } - } - if ( multiple ) { - g.getNodeData().setEvent( createDuplicationEvent() ); - } - else { - if ( _most_parsimonious_duplication_model ) { - g.getNodeData().setEvent( createSpeciationEvent() ); - } - else { - g.getNodeData().setEvent( createSingleSpeciationOrDuplicationEvent() ); - } - } - } - else { - g.getNodeData().setEvent( createSpeciationEvent() ); - } + @Override + public Set getMappedExternalSpeciesTreeNodes() { + return _mapped_species_tree_nodes; + } + + @Override + public final SortedSet getReMappedScientificNamesFromGeneTree() { + return _scientific_names_mapped_to_reduced_specificity; + } + + public final int getSpeciationOrDuplicationEventsSum() { + return _speciation_or_duplication_events_sum; + } + + @Override + public final int getSpeciationsSum() { + return _speciations_sum; + } + + @Override + public List getStrippedExternalGeneTreeNodes() { + return _stripped_gene_tree_nodes; + } + + @Override + public List getStrippedSpeciesTreeNodes() { + return _stripped_species_tree_nodes; + } + + @Override + public TaxonomyComparisonBase getTaxCompBase() { + return _tax_comp_base; + } + + @Override + public final String toString() { + final StringBuffer sb = new StringBuffer(); + sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "Speciations sum : " + getSpeciationsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "Duplications sum : " + getDuplicationsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); + if ( !_most_parsimonious_duplication_model ) { + sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); } + return sb.toString(); } /** @@ -175,12 +152,22 @@ public class GSDI extends SDI { * Preconditions: Mapping M for external nodes must have been calculated and * the species tree must be labeled in preorder. *

- * + * @param transfer_taxonomy + * @return + * @throws SDIException + * */ - final void geneTreePostOrderTraversal() { - for( final PhylogenyNodeIterator it = getGeneTree().iteratorPostorder(); it.hasNext(); ) { + final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree, + final boolean most_parsimonious_duplication_model, + final boolean transfer_taxonomy ) throws SDIException { + final GSDIsummaryResult res = new GSDIsummaryResult(); + for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) { final PhylogenyNode g = it.next(); if ( g.isInternal() ) { + if ( g.getNumberOfDescendants() != 2 ) { + throw new SDIException( "gene tree contains internal node with " + g.getNumberOfDescendants() + + " descendents" ); + } PhylogenyNode s1 = g.getChildNode1().getLink(); PhylogenyNode s2 = g.getChildNode2().getLink(); while ( s1 != s2 ) { @@ -192,85 +179,107 @@ public class GSDI extends SDI { } } g.setLink( s1 ); - determineEvent( s1, g ); + determineEvent( s1, g, most_parsimonious_duplication_model, res ); + } + if ( transfer_taxonomy ) { + transferTaxonomy( g ); } } + return res; } - private final Event createDuplicationEvent() { - final Event event = Event.createSingleDuplicationEvent(); - ++_duplications_sum; - return event; - } - - private final Event createSingleSpeciationOrDuplicationEvent() { - final Event event = Event.createSingleSpeciationOrDuplicationEvent(); - ++_speciation_or_duplication_events_sum; - return event; - } - - private final Event createSpeciationEvent() { - final Event event = Event.createSingleSpeciationEvent(); - ++_speciations_sum; - return event; - } - - public final int getSpeciationOrDuplicationEventsSum() { - return _speciation_or_duplication_events_sum; + final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree, + final boolean most_parsimonious_duplication_model, + final int min_duplications ) throws SDIException { + final GSDIsummaryResult res = new GSDIsummaryResult(); + for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode g = it.next(); + if ( g.isInternal() ) { + if ( g.getNumberOfDescendants() != 2 ) { + throw new SDIException( "gene tree contains internal node with " + g.getNumberOfDescendants() + + " descendents" ); + } + PhylogenyNode s1 = g.getChildNode1().getLink(); + PhylogenyNode s2 = g.getChildNode2().getLink(); + while ( s1 != s2 ) { + if ( s1.getId() > s2.getId() ) { + s1 = s1.getParent(); + } + else { + s2 = s2.getParent(); + } + } + g.setLink( s1 ); + determineEvent( s1, g, most_parsimonious_duplication_model, res ); + if ( res.getDuplicationsSum() > min_duplications ) { + return null; + } + } + } + return res; } - public final int getSpeciationsSum() { - return _speciations_sum; + final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree, + final Phylogeny species_tree, + final boolean strip_gene_tree, + final boolean strip_species_tree ) throws SDIException { + final TaxonomyComparisonBase tax_comp_base = SDIutil.determineTaxonomyComparisonBase( gene_tree ); + if ( tax_comp_base == null ) { + throw new RuntimeException( "failed to establish taxonomy linking base (taxonomy linking base is null)" ); + } + return linkNodesOfG( gene_tree, species_tree, tax_comp_base, strip_gene_tree, strip_species_tree ); } /** * This allows for linking of internal nodes of the species tree (as opposed * to just external nodes, as in the method it overrides. * If TaxonomyComparisonBase is null, it will try to determine it. - * @throws SDIException - * + * @throws SDIException + * */ - final void linkNodesOfG( final TaxonomyComparisonBase tax_comp_base, - final boolean strip_gene_tree, - final boolean strip_species_tree ) throws SDIException { - final Map species_to_node_map = new HashMap(); - final List species_tree_ext_nodes = new ArrayList(); + final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree, + final Phylogeny species_tree, + final TaxonomyComparisonBase tax_comp_base, + final boolean strip_gene_tree, + final boolean strip_species_tree ) throws SDIException { if ( tax_comp_base == null ) { - _tax_comp_base = determineTaxonomyComparisonBase( _gene_tree ); - } - else { - _tax_comp_base = tax_comp_base; + throw new IllegalArgumentException( "taxonomy linking base is null" ); } + final Map species_to_node_map = new HashMap(); + final List species_tree_ext_nodes = new ArrayList(); + final NodesLinkingResult res = new NodesLinkingResult(); + res.setTaxCompBase( tax_comp_base ); // Stringyfied taxonomy is the key, node is the value. - for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { + for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode s = iter.next(); species_tree_ext_nodes.add( s ); if ( s.getNodeData().isHasTaxonomy() ) { - final String tax_str = taxonomyToString( s, _tax_comp_base ); + final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() ); if ( !ForesterUtil.isEmpty( tax_str ) ) { if ( species_to_node_map.containsKey( tax_str ) ) { - throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" ); + throw new SDIException( "taxonomy \"" + tax_str + "\" is not unique in species tree (using " + + res.getTaxCompBase() + " for linking to gene tree)" ); } species_to_node_map.put( tax_str, s ); } } } // Retrieve the reference to the node with a matching stringyfied taxonomy. - for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode g = iter.next(); if ( !g.getNodeData().isHasTaxonomy() ) { if ( strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + res.getStrippedGeneTreeNodes().add( g ); } else { throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" ); } } else { - final String tax_str = taxonomyToString( g, _tax_comp_base ); + final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() ); if ( ForesterUtil.isEmpty( tax_str ) ) { if ( strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + res.getStrippedGeneTreeNodes().add( g ); } else { throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" ); @@ -278,176 +287,217 @@ public class GSDI extends SDI { } else { PhylogenyNode s = species_to_node_map.get( tax_str ); - if ( ( _tax_comp_base == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null ) + if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null ) && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) { - s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str ); + s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + res.getScientificNamesMappedToReducedSpecificity() ); } if ( s == null ) { if ( strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + res.getStrippedGeneTreeNodes().add( g ); } else { throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy() - + "\" not present in species tree" ); + + "\" not present in species tree" ); } } else { g.setLink( s ); - _mapped_species_tree_nodes.add( s ); + res.getMappedSpeciesTreeNodes().add( s ); } } } } // for loop if ( strip_gene_tree ) { - stripGeneTree(); - if ( getGeneTree().isEmpty() || ( getGeneTree().getNumberOfExternalNodes() < 2 ) ) { - throw new SDIException( "species could not be mapped between gene tree and species tree" ); + stripTree( gene_tree, res.getStrippedGeneTreeNodes() ); + if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) { + throw new SDIException( "species could not be mapped between gene tree and species tree (based on " + + res.getTaxCompBase() + ")" ); } } if ( strip_species_tree ) { - stripSpeciesTree( species_tree_ext_nodes ); + stripSpeciesTree( species_tree, species_tree_ext_nodes, res ); } + return res; } - private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, - final String tax_str ) { - PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, " (" ); - if ( s == null ) { - if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) { - final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim(); - s = species_to_node_map.get( new_tax_str ); - if ( s != null ) { - addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str ); - } - } + static final void transferTaxonomy( final PhylogenyNode g ) { + if ( g == null ) { + throw new IllegalArgumentException( "gene tree node is null" ); } + final PhylogenyNode s = g.getLink(); if ( s == null ) { - for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ", - " form ", " subform ", " cultivar ", " section ", " subsection " } ) { - s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, t ); - if ( s != null ) { - break; + throw new IllegalArgumentException( "mapped species tree node is null" ); + } + if ( s.getNodeData().isHasTaxonomy() ) { + g.getNodeData().setTaxonomy( s.getNodeData().getTaxonomy() ); + if ( g.isInternal() ) { + if ( g.getChildNode1().isInternal() && g.getChildNode1().getNodeData().isHasTaxonomy() + && ( g.getChildNode1().getNodeData().getTaxonomy() == s.getNodeData().getTaxonomy() ) ) { + g.getChildNode1().getNodeData().setTaxonomy( null ); + } + if ( g.getChildNode2().isInternal() && g.getChildNode2().getNodeData().isHasTaxonomy() + && ( g.getChildNode2().getNodeData().getTaxonomy() == s.getNodeData().getTaxonomy() ) ) { + g.getChildNode2().getNodeData().setTaxonomy( null ); } } } - return s; - } - - private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, - final String tax_str, - final String term ) { - final int i = tax_str.indexOf( term ); - if ( i > 4 ) { - final String new_tax_str = tax_str.substring( 0, i ).trim(); - final PhylogenyNode s = species_to_node_map.get( new_tax_str ); - if ( s != null ) { - addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str ); + else if ( ForesterUtil.isEmpty( g.getName() ) && !ForesterUtil.isEmpty( s.getName() ) ) { + g.setName( s.getName() ); + if ( g.isInternal() ) { + if ( g.getChildNode1().isInternal() && ( g.getChildNode1().getName() == s.getName() ) ) { + g.getChildNode1().setName( "" ); + } + if ( g.getChildNode2().isInternal() && ( g.getChildNode2().getName() == s.getName() ) ) { + g.getChildNode2().setName( "" ); + } } - return s; } - return null; } - private final void addScientificNamesMappedToReducedSpecificity( final String s1, final String s2 ) { - _scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 ); - } - - public final SortedSet getReMappedScientificNamesFromGeneTree() { - return _scientific_names_mapped_to_reduced_specificity; - } - - public TaxonomyComparisonBase getTaxCompBase() { - return _tax_comp_base; + private final static void addScientificNamesMappedToReducedSpecificity( final String s1, + final String s2, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { + scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 ); } - private void stripSpeciesTree( final List species_tree_ext_nodes ) { - for( final PhylogenyNode s : species_tree_ext_nodes ) { - if ( !_mapped_species_tree_nodes.contains( s ) ) { - _species_tree.deleteSubtree( s, true ); - _stripped_species_tree_nodes.add( s ); - } + private final static void determineEvent( final PhylogenyNode s, + final PhylogenyNode g, + final boolean most_parsimonious_duplication_model, + final GSDIsummaryResult res ) { + boolean oyako = false; + if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) { + oyako = true; } - _species_tree.clearHashIdToNodeMap(); - _species_tree.externalNodesHaveChanged(); - } - - public List getStrippedSpeciesTreeNodes() { - return _stripped_species_tree_nodes; - } - - private void stripGeneTree() { - for( final PhylogenyNode g : _stripped_gene_tree_nodes ) { - _gene_tree.deleteSubtree( g, true ); + if ( g.getLink().getNumberOfDescendants() == 2 ) { + if ( oyako ) { + g.getNodeData().setEvent( Event.createSingleDuplicationEvent() ); + res.increaseDuplicationsSum(); + } + else { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } } - _gene_tree.clearHashIdToNodeMap(); - _gene_tree.externalNodesHaveChanged(); - } - - public Set getMappedExternalSpeciesTreeNodes() { - return _mapped_species_tree_nodes; - } - - public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) { - int with_id_count = 0; - int with_code_count = 0; - int with_sn_count = 0; - int max = 0; - for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) { - final PhylogenyNode g = iter.next(); - if ( g.getNodeData().isHasTaxonomy() ) { - final Taxonomy tax = g.getNodeData().getTaxonomy(); - if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) { - if ( ++with_id_count > max ) { - max = with_id_count; + else { + if ( oyako ) { + final Set set = new HashSet(); + for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) { + n = n.getLink(); + while ( ( n.getParent() != s ) && ( n.getParent() != null ) ) { + n = n.getParent(); + if ( n.isRoot() ) { + break; + } } + set.add( n ); } - if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { - if ( ++with_code_count > max ) { - max = with_code_count; + boolean multiple = false; + for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) { + n = n.getLink(); + while ( ( n.getParent() != s ) && ( n.getParent() != null ) ) { + n = n.getParent(); + if ( n.isRoot() ) { + break; + } } + if ( set.contains( n ) ) { + multiple = true; + break; + } + } + if ( multiple ) { + g.getNodeData().setEvent( Event.createSingleDuplicationEvent() ); + res.increaseDuplicationsSum(); } - if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) { - if ( ++with_sn_count > max ) { - max = with_sn_count; + else { + if ( most_parsimonious_duplication_model ) { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } + else { + g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() ); + res.increaseSpeciationOrDuplicationEventsSum(); } } } + else { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } } - if ( max == 0 ) { - throw new IllegalArgumentException( "gene tree has no taxonomic data" ); - } - else if ( max == 1 ) { - throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" ); - } - else if ( max == with_id_count ) { - return SDI.TaxonomyComparisonBase.ID; - } - else if ( max == with_sn_count ) { - return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME; + } + + private final static void stripSpeciesTree( final Phylogeny species_tree, + final List species_tree_ext_nodes, + final NodesLinkingResult res ) { + for( final PhylogenyNode s : species_tree_ext_nodes ) { + if ( !res.getMappedSpeciesTreeNodes().contains( s ) ) { + species_tree.deleteSubtree( s, true ); + res.getStrippedSpeciesTreeNodes().add( s ); + } } - else { - return SDI.TaxonomyComparisonBase.CODE; + species_tree.clearHashIdToNodeMap(); + species_tree.externalNodesHaveChanged(); + } + + private final static void stripTree( final Phylogeny phy, final List strip_nodes ) { + for( final PhylogenyNode g : strip_nodes ) { + phy.deleteSubtree( g, true ); } + phy.clearHashIdToNodeMap(); + phy.externalNodesHaveChanged(); } - public List getStrippedExternalGeneTreeNodes() { - return _stripped_gene_tree_nodes; + private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { + PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + " (", + scientific_names_mapped_to_reduced_specificity ); + if ( s == null ) { + if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) { + final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim(); + s = species_to_node_map.get( new_tax_str ); + if ( s != null ) { + addScientificNamesMappedToReducedSpecificity( tax_str, + new_tax_str, + scientific_names_mapped_to_reduced_specificity ); + } + } + } + if ( s == null ) { + for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ", + " form ", " subform ", " cultivar ", " section ", " subsection " } ) { + s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + t, + scientific_names_mapped_to_reduced_specificity ); + if ( s != null ) { + break; + } + } + } + return s; } - @Override - public final String toString() { - final StringBuffer sb = new StringBuffer(); - sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model ); - sb.append( ForesterUtil.getLineSeparator() ); - sb.append( "Speciations sum : " + getSpeciationsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); - sb.append( "Duplications sum : " + getDuplicationsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); - if ( !_most_parsimonious_duplication_model ) { - sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); + private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str, + final String term, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { + final int i = tax_str.indexOf( term ); + if ( i > 4 ) { + final String new_tax_str = tax_str.substring( 0, i ).trim(); + final PhylogenyNode s = species_to_node_map.get( new_tax_str ); + if ( s != null ) { + addScientificNamesMappedToReducedSpecificity( tax_str, + new_tax_str, + scientific_names_mapped_to_reduced_specificity ); + } + return s; } - sb.append( "mapping cost L : " + computeMappingCostL() ); - return sb.toString(); + return null; } }