X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsdi%2FGSDI.java;h=63c6215253b89bc58f5ef013d91167a220504bbf;hb=cec0663378230521f24a851cb1c1c9491026b70a;hp=780aba9c24247ea35013d4c361980cfba8d0ef76;hpb=12298ec6ab774c405b20389b81f73329ea3323a0;p=jalview.git diff --git a/forester/java/src/org/forester/sdi/GSDI.java b/forester/java/src/org/forester/sdi/GSDI.java index 780aba9..63c6215 100644 --- a/forester/java/src/org/forester/sdi/GSDI.java +++ b/forester/java/src/org/forester/sdi/GSDI.java @@ -31,196 +31,107 @@ import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.SortedSet; import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Event; -import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; import org.forester.util.ForesterUtil; -/* - * Implements our algorithm for speciation - duplication inference (SDI).

- * The initialization is accomplished by:

The recursion part is accomplished by this class' - * method "geneTreePostOrderTraversal(PhylogenyNode)".

Requires JDK 1.5 or - * greater. - * - * @see SDI#linkNodesOfG() - * - * @see Phylogeny#preorderReID(int) - * - * @see - * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny) - * - * @see #geneTreePostOrderTraversal(PhylogenyNode) - * - * @author Christian M. Zmasek - */ -public final class GSDI extends SDI { +public final class GSDI implements GSDII { - private final HashMap _transversal_counts; - private final boolean _most_parsimonious_duplication_model; - private final boolean _strip_gene_tree; - private final boolean _strip_species_tree; - private int _speciation_or_duplication_events_sum; - private int _speciations_sum; - private final List _stripped_gene_tree_nodes; - private final List _stripped_species_tree_nodes; - private final Set _mapped_species_tree_nodes; + private final boolean _most_parsimonious_duplication_model; + private final int _speciation_or_duplication_events_sum; + private final int _speciations_sum; + private final int _duplications_sum; + private final List _stripped_gene_tree_nodes; + private final List _stripped_species_tree_nodes; + private final Set _mapped_species_tree_nodes; + private final TaxonomyComparisonBase _tax_comp_base; + private final SortedSet _scientific_names_mapped_to_reduced_specificity; - /** - * Constructor which sets the gene tree and the species tree to be compared. - * species_tree is the species tree to which the gene tree gene_tree will be - * compared to - with method "infer(boolean)". Both Trees must be completely - * binary and rooted. The actual inference is accomplished with method - * "infer(boolean)". The mapping cost L can then be calculated with method - * "computeMappingCost()". - *

- * - * @see #infer(boolean) - * @see SDI#computeMappingCostL() - * @param gene_tree - * reference to a rooted gene tree to which assign duplication vs - * speciation, must have species names in the species name fields - * for all external nodes - * @param species_tree - * reference to a rooted binary species tree which might get - * stripped in the process, must have species names in the - * species name fields for all external nodes - * - * @param most_parsimonious_duplication_model - * set to true to assign nodes as speciations which would - * otherwise be assiged as unknown because of polytomies in the - * species tree. - * @throws SdiException - * - */ public GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model, final boolean strip_gene_tree, - final boolean strip_species_tree ) throws SdiException { - super( gene_tree, species_tree ); - _speciation_or_duplication_events_sum = 0; - _speciations_sum = 0; + final boolean strip_species_tree ) throws SDIException { _most_parsimonious_duplication_model = most_parsimonious_duplication_model; - _transversal_counts = new HashMap(); - _duplications_sum = 0; - _strip_gene_tree = strip_gene_tree; - _strip_species_tree = strip_species_tree; - _stripped_gene_tree_nodes = new ArrayList(); - _stripped_species_tree_nodes = new ArrayList(); - _mapped_species_tree_nodes = new HashSet(); - getSpeciesTree().preOrderReId(); - linkNodesOfG(); - geneTreePostOrderTraversal( getGeneTree().getRoot() ); + if ( gene_tree.getRoot().getNumberOfDescendants() == 3 ) { + gene_tree.reRoot( gene_tree.getRoot().getChildNode( 2 ) ); + } + final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree, + species_tree, + strip_gene_tree, + strip_species_tree ); + _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes(); + _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes(); + _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes(); + _scientific_names_mapped_to_reduced_specificity = nodes_linking_result + .getScientificNamesMappedToReducedSpecificity(); + _tax_comp_base = nodes_linking_result.getTaxCompBase(); + PhylogenyMethods.preOrderReId( species_tree ); + final GSDIsummaryResult gsdi_summary_result = geneTreePostOrderTraversal( gene_tree, + _most_parsimonious_duplication_model ); + _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum(); + _speciations_sum = gsdi_summary_result.getSpeciationsSum(); + _duplications_sum = gsdi_summary_result.getDuplicationsSum(); } - GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model ) - throws SdiException { - this( gene_tree, species_tree, most_parsimonious_duplication_model, false, false ); + public int getDuplicationsSum() { + return _duplications_sum; } - private final Event createDuplicationEvent() { - final Event event = Event.createSingleDuplicationEvent(); - ++_duplications_sum; - return event; + @Override + public Set getMappedExternalSpeciesTreeNodes() { + return _mapped_species_tree_nodes; } - private final Event createSingleSpeciationOrDuplicationEvent() { - final Event event = Event.createSingleSpeciationOrDuplicationEvent(); - ++_speciation_or_duplication_events_sum; - return event; + @Override + public final SortedSet getReMappedScientificNamesFromGeneTree() { + return _scientific_names_mapped_to_reduced_specificity; } - private final Event createSpeciationEvent() { - final Event event = Event.createSingleSpeciationEvent(); - ++_speciations_sum; - return event; + public final int getSpeciationOrDuplicationEventsSum() { + return _speciation_or_duplication_events_sum; } - // s is the node on the species tree g maps to. - private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) { - Event event = null; - // Determine how many children map to same node as parent. - int sum_g_childs_mapping_to_s = 0; - for( int i = 0; i < g.getNumberOfDescendants(); ++i ) { - final PhylogenyNode c = g.getChildNode( i ); - if ( c.getLink() == s ) { - ++sum_g_childs_mapping_to_s; - } - } - // Determine the sum of traversals. - int traversals_sum = 0; - int max_traversals = 0; - PhylogenyNode max_traversals_node = null; - if ( !s.isExternal() ) { - for( int i = 0; i < s.getNumberOfDescendants(); ++i ) { - final PhylogenyNode current_node = s.getChildNode( i ); - final int traversals = getTraversalCount( current_node ); - traversals_sum += traversals; - if ( traversals > max_traversals ) { - max_traversals = traversals; - max_traversals_node = current_node; - } - } - } - // System.out.println( " sum=" + traversals_sum ); - // System.out.println( " max=" + max_traversals ); - // System.out.println( " m=" + sum_g_childs_mapping_to_s ); - if ( sum_g_childs_mapping_to_s > 0 ) { - if ( traversals_sum == 2 ) { - event = createDuplicationEvent(); - System.out.print( g.toString() ); - System.out.println( " : ==2" ); - // _transversal_counts.clear(); - } - else if ( traversals_sum > 2 ) { - if ( max_traversals <= 1 ) { - if ( _most_parsimonious_duplication_model ) { - event = createSpeciationEvent(); - } - else { - event = createSingleSpeciationOrDuplicationEvent(); - } - } - else { - event = createDuplicationEvent(); - //System.out.println( g.toString() ); - _transversal_counts.put( max_traversals_node, 1 ); - // _transversal_counts.clear(); - } - } - else { - event = createDuplicationEvent(); - // _transversal_counts.clear(); - } - normalizeTcounts( s ); - } - else { - event = createSpeciationEvent(); - } - g.getNodeData().setEvent( event ); + @Override + public final int getSpeciationsSum() { + return _speciations_sum; } - private void normalizeTcounts( final PhylogenyNode s ) { - int min_traversals = Integer.MAX_VALUE; - for( int i = 0; i < s.getNumberOfDescendants(); ++i ) { - final PhylogenyNode current_node = s.getChildNode( i ); - final int traversals = getTraversalCount( current_node ); - if ( traversals < min_traversals ) { - min_traversals = traversals; - } - } - for( int i = 0; i < s.getNumberOfDescendants(); ++i ) { - final PhylogenyNode current_node = s.getChildNode( i ); - _transversal_counts.put( current_node, getTraversalCount( current_node ) - min_traversals ); + @Override + public List getStrippedExternalGeneTreeNodes() { + return _stripped_gene_tree_nodes; + } + + @Override + public List getStrippedSpeciesTreeNodes() { + return _stripped_species_tree_nodes; + } + + @Override + public TaxonomyComparisonBase getTaxCompBase() { + return _tax_comp_base; + } + + @Override + public final String toString() { + final StringBuffer sb = new StringBuffer(); + sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "Speciations sum : " + getSpeciationsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "Duplications sum : " + getDuplicationsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); + if ( !_most_parsimonious_duplication_model ) { + sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); } + return sb.toString(); } /** @@ -231,381 +142,283 @@ public final class GSDI extends SDI { * Preconditions: Mapping M for external nodes must have been calculated and * the species tree must be labeled in preorder. *

+ * @return + * @throws SDIException * - * @param g - * starting node of a gene tree - normally the root */ - final void geneTreePostOrderTraversal( final PhylogenyNode g ) { - if ( !g.isExternal() ) { - boolean all_ext = true; - for( int i = 0; i < g.getNumberOfDescendants(); ++i ) { - if ( g.getChildNode( i ).isInternal() ) { - all_ext = false; - break; + final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree, + final boolean most_parsimonious_duplication_model ) + throws SDIException { + final GSDIsummaryResult res = new GSDIsummaryResult(); + for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode g = it.next(); + if ( g.isInternal() ) { + if ( g.getNumberOfDescendants() != 2 ) { + throw new SDIException( "gene tree contains internal node with " + g.getNumberOfDescendants() + + " descendents" ); } - } - if ( all_ext ) { - //_transversal_counts.clear(); - } - for( int i = 0; i < g.getNumberOfDescendants(); ++i ) { - geneTreePostOrderTraversal( g.getChildNode( i ) ); - } - final PhylogenyNode[] linked_nodes = new PhylogenyNode[ g.getNumberOfDescendants() ]; - for( int i = 0; i < linked_nodes.length; ++i ) { - if ( g.getChildNode( i ).getLink() == null ) { - System.out.println( "link is null for " + g.getChildNode( i ) ); - System.exit( -1 ); + PhylogenyNode s1 = g.getChildNode1().getLink(); + PhylogenyNode s2 = g.getChildNode2().getLink(); + while ( s1 != s2 ) { + if ( s1.getId() > s2.getId() ) { + s1 = s1.getParent(); + } + else { + s2 = s2.getParent(); + } } - linked_nodes[ i ] = g.getChildNode( i ).getLink(); - } - final int[] min_max = obtainMinMaxIdIndices( linked_nodes ); - int min_i = min_max[ 0 ]; - int max_i = min_max[ 1 ]; - // initTransversalCounts(); - while ( linked_nodes[ min_i ] != linked_nodes[ max_i ] ) { - increaseTraversalCount( linked_nodes[ max_i ] ); - linked_nodes[ max_i ] = linked_nodes[ max_i ].getParent(); - final int[] min_max_ = obtainMinMaxIdIndices( linked_nodes ); - min_i = min_max_[ 0 ]; - max_i = min_max_[ 1 ]; + g.setLink( s1 ); + determineEvent( s1, g, most_parsimonious_duplication_model, res ); } - final PhylogenyNode s = linked_nodes[ max_i ]; - g.setLink( s ); - // Determines whether dup. or spec. - determineEvent( s, g ); } + return res; } - public final int getSpeciationOrDuplicationEventsSum() { - return _speciation_or_duplication_events_sum; - } - - public final int getSpeciationsSum() { - return _speciations_sum; - } - - private final int getTraversalCount( final PhylogenyNode node ) { - if ( _transversal_counts.containsKey( node ) ) { - return _transversal_counts.get( node ); - } - return 0; - } - - private final void increaseTraversalCount( final PhylogenyNode node ) { - if ( _transversal_counts.containsKey( node ) ) { - _transversal_counts.put( node, _transversal_counts.get( node ) + 1 ); - } - else { - _transversal_counts.put( node, 1 ); + final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree, + final Phylogeny species_tree, + final boolean strip_gene_tree, + final boolean strip_species_tree ) throws SDIException { + final TaxonomyComparisonBase tax_comp_base = SDIutil.determineTaxonomyComparisonBase( gene_tree ); + if ( tax_comp_base == null ) { + throw new RuntimeException( "failed to establish taxonomy linking base (taxonomy linking base is null)" ); } - // System.out.println( "count for node " + node.getID() + " is now " - // + getTraversalCount( node ) ); + return linkNodesOfG( gene_tree, species_tree, tax_comp_base, strip_gene_tree, strip_species_tree ); } /** * This allows for linking of internal nodes of the species tree (as opposed * to just external nodes, as in the method it overrides. - * @throws SdiException + * If TaxonomyComparisonBase is null, it will try to determine it. + * @throws SDIException * */ - @Override - // final void linkNodesOfG() { - // final HashMap speciestree_ext_nodes = createTaxonomyToNodeMap(); - // if ( _strip_gene_tree ) { - // stripGeneTree( speciestree_ext_nodes ); - // if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) { - // throw new IllegalArgumentException( "species tree does not contain any" - // + " nodes matching species in the gene tree" ); - // } - // } - // // Retrieve the reference to the PhylogenyNode with a matching species. - // for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { - // final PhylogenyNode g = iter.next(); - // if ( !g.getNodeData().isHasTaxonomy() ) { - // throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" ); - // } - // final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() ); - // if ( s == null ) { - // throw new IllegalArgumentException( "species " + g.getNodeData().getTaxonomy() - // + " not present in species tree" ); - // } - // g.setLink( s ); - // } - // } - final void linkNodesOfG() throws SdiException { + final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree, + final Phylogeny species_tree, + final TaxonomyComparisonBase tax_comp_base, + final boolean strip_gene_tree, + final boolean strip_species_tree ) throws SDIException { + if ( tax_comp_base == null ) { + throw new IllegalArgumentException( "taxonomy linking base is null" ); + } final Map species_to_node_map = new HashMap(); final List species_tree_ext_nodes = new ArrayList(); - final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase( _gene_tree ); - // System.out.println( "comp base is: " + tax_comp_base ); + final NodesLinkingResult res = new NodesLinkingResult(); + res.setTaxCompBase( tax_comp_base ); // Stringyfied taxonomy is the key, node is the value. - for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { + for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode s = iter.next(); species_tree_ext_nodes.add( s ); - final String tax_str = taxonomyToString( s, tax_comp_base ); - if ( !ForesterUtil.isEmpty( tax_str ) ) { - if ( species_to_node_map.containsKey( tax_str ) ) { - throw new SdiException( "taxonomy \"" + s + "\" is not unique in species tree" ); + if ( s.getNodeData().isHasTaxonomy() ) { + final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() ); + if ( !ForesterUtil.isEmpty( tax_str ) ) { + if ( species_to_node_map.containsKey( tax_str ) ) { + throw new SDIException( "taxonomy \"" + tax_str + "\" is not unique in species tree (using " + + res.getTaxCompBase() + " for linking to gene tree)" ); + } + species_to_node_map.put( tax_str, s ); } - species_to_node_map.put( tax_str, s ); } } // Retrieve the reference to the node with a matching stringyfied taxonomy. - for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode g = iter.next(); if ( !g.getNodeData().isHasTaxonomy() ) { - if ( _strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + if ( strip_gene_tree ) { + res.getStrippedGeneTreeNodes().add( g ); } else { - throw new SdiException( "gene tree node \"" + g + "\" has no taxonomic data" ); + throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" ); } } else { - final String tax_str = taxonomyToString( g, tax_comp_base ); + final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() ); if ( ForesterUtil.isEmpty( tax_str ) ) { - if ( _strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + if ( strip_gene_tree ) { + res.getStrippedGeneTreeNodes().add( g ); } else { - throw new SdiException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" ); + throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" ); } } else { - final PhylogenyNode s = species_to_node_map.get( tax_str ); + PhylogenyNode s = species_to_node_map.get( tax_str ); + if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null ) + && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) { + s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + res.getScientificNamesMappedToReducedSpecificity() ); + } if ( s == null ) { - if ( _strip_gene_tree ) { - _stripped_gene_tree_nodes.add( g ); + if ( strip_gene_tree ) { + res.getStrippedGeneTreeNodes().add( g ); } else { - throw new SdiException( "taxonomy \"" + g.getNodeData().getTaxonomy() + throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy() + "\" not present in species tree" ); } } else { g.setLink( s ); - _mapped_species_tree_nodes.add( s ); - // System.out.println( "setting link of " + g + " to " + s ); + res.getMappedSpeciesTreeNodes().add( s ); } } } } // for loop - if ( _strip_gene_tree ) { - for( final PhylogenyNode g : _stripped_gene_tree_nodes ) { - _gene_tree.deleteSubtree( g, true ); + if ( strip_gene_tree ) { + stripTree( gene_tree, res.getStrippedGeneTreeNodes() ); + if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) { + throw new SDIException( "species could not be mapped between gene tree and species tree (based on " + + res.getTaxCompBase() + ")" ); } } - if ( _strip_species_tree ) { - for( final PhylogenyNode s : species_tree_ext_nodes ) { - if ( !_mapped_species_tree_nodes.contains( s ) ) { - _species_tree.deleteSubtree( s, true ); - } - } + if ( strip_species_tree ) { + stripSpeciesTree( species_tree, species_tree_ext_nodes, res ); } + return res; } - public Set getMappedExternalSpeciesTreeNodes() { - return _mapped_species_tree_nodes; + private final static void addScientificNamesMappedToReducedSpecificity( final String s1, + final String s2, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { + scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 ); } - // final private HashMap createTaxonomyToNodeMap() { - // final HashMap speciestree_ext_nodes = new HashMap(); - // for( final PhylogenyNodeIterator iter = _species_tree.iteratorLevelOrder(); iter.hasNext(); ) { - // final PhylogenyNode n = iter.next(); - // if ( n.getNodeData().isHasTaxonomy() ) { - // if ( speciestree_ext_nodes.containsKey( n.getNodeData().getTaxonomy() ) ) { - // throw new IllegalArgumentException( "taxonomy [" + n.getNodeData().getTaxonomy() - // + "] is not unique in species phylogeny" ); - // } - // speciestree_ext_nodes.put( n.getNodeData().getTaxonomy(), n ); - // } - // } - // return speciestree_ext_nodes; - // } - // private final void stripGeneTree( final HashMap speciestree_ext_nodes ) { - // // final Set to_delete = new HashSet(); - // for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { - // final PhylogenyNode g = iter.next(); - // if ( !g.getNodeData().isHasTaxonomy() ) { - // throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" ); - // } - // if ( !speciestree_ext_nodes.containsKey( g.getNodeData().getTaxonomy() ) ) { - // _stripped_gene_tree_nodes.add( g ); - // } - // } - // for( final PhylogenyNode n : _stripped_gene_tree_nodes ) { - // _gene_tree.deleteSubtree( n, true ); - // } - // } - // private final void stripGeneTree2( final HashMap speciestree_ext_nodes ) { - // // final Set to_delete = new HashSet(); - // for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { - // final PhylogenyNode g = iter.next(); - // if ( !g.getNodeData().isHasTaxonomy() ) { - // _stripped_gene_tree_nodes.add( g ); - // } - // else { - // if ( !speciestree_ext_nodes.containsKey( g.getNodeData().getTaxonomy() ) ) { - // _stripped_gene_tree_nodes.add( g ); - // } - // } - // } - // for( final PhylogenyNode n : _stripped_gene_tree_nodes ) { - // _gene_tree.deleteSubtree( n, true ); - // } - // } - public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) { - int with_id_count = 0; - int with_code_count = 0; - int with_sn_count = 0; - int max = 0; - for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) { - final PhylogenyNode g = iter.next(); - if ( g.getNodeData().isHasTaxonomy() ) { - final Taxonomy tax = g.getNodeData().getTaxonomy(); - if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) { - if ( ++with_id_count > max ) { - max = with_id_count; + private final static void determineEvent( final PhylogenyNode s, + final PhylogenyNode g, + final boolean most_parsimonious_duplication_model, + final GSDIsummaryResult res ) { + boolean oyako = false; + if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) { + oyako = true; + } + if ( g.getLink().getNumberOfDescendants() == 2 ) { + if ( oyako ) { + g.getNodeData().setEvent( Event.createSingleDuplicationEvent() ); + res.increaseDuplicationsSum(); + } + else { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } + } + else { + if ( oyako ) { + final Set set = new HashSet(); + for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) { + n = n.getLink(); + while ( n.getParent() != s ) { + n = n.getParent(); + if ( n.isRoot() ) { + break; + } } + set.add( n ); } - if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { - if ( ++with_code_count > max ) { - max = with_code_count; + boolean multiple = false; + for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) { + n = n.getLink(); + while ( n.getParent() != s ) { + n = n.getParent(); + if ( n.isRoot() ) { + break; + } + } + if ( set.contains( n ) ) { + multiple = true; + break; } } - if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) { - if ( ++with_sn_count > max ) { - max = with_sn_count; + if ( multiple ) { + g.getNodeData().setEvent( Event.createSingleDuplicationEvent() ); + res.increaseDuplicationsSum(); + } + else { + if ( most_parsimonious_duplication_model ) { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } + else { + g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() ); + res.increaseSpeciationOrDuplicationEventsSum(); } } } - } - if ( max == 0 ) { - throw new IllegalArgumentException( "gene tree has no taxonomic data" ); - } - else if ( max == 1 ) { - throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" ); - } - else if ( max == with_sn_count ) { - return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME; - } - else if ( max == with_id_count ) { - return SDI.TaxonomyComparisonBase.ID; - } - else { - return SDI.TaxonomyComparisonBase.CODE; + else { + g.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + res.increaseSpeciationsSum(); + } } } - public List getStrippedExternalGeneTreeNodes() { - return _stripped_gene_tree_nodes; + private final static void stripSpeciesTree( final Phylogeny species_tree, + final List species_tree_ext_nodes, + final NodesLinkingResult res ) { + for( final PhylogenyNode s : species_tree_ext_nodes ) { + if ( !res.getMappedSpeciesTreeNodes().contains( s ) ) { + species_tree.deleteSubtree( s, true ); + res.getStrippedSpeciesTreeNodes().add( s ); + } + } + species_tree.clearHashIdToNodeMap(); + species_tree.externalNodesHaveChanged(); } - @Override - public final String toString() { - final StringBuffer sb = new StringBuffer(); - sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model ); - sb.append( ForesterUtil.getLineSeparator() ); - sb.append( "Speciations sum : " + getSpeciationsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); - sb.append( "Duplications sum : " + getDuplicationsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); - if ( !_most_parsimonious_duplication_model ) { - sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() ); - sb.append( ForesterUtil.getLineSeparator() ); + private final static void stripTree( final Phylogeny phy, final List strip_nodes ) { + for( final PhylogenyNode g : strip_nodes ) { + phy.deleteSubtree( g, true ); } - sb.append( "mapping cost L : " + computeMappingCostL() ); - return sb.toString(); + phy.clearHashIdToNodeMap(); + phy.externalNodesHaveChanged(); } - static final int[] obtainMinMaxIdIndices( final PhylogenyNode[] linked_nodes ) { - int max_i = 0; - int min_i = 0; - int max_i_id = -Integer.MAX_VALUE; - int min_i_id = Integer.MAX_VALUE; - for( int i = 0; i < linked_nodes.length; ++i ) { - final int id_i = linked_nodes[ i ].getId(); - if ( id_i > max_i_id ) { - max_i = i; - max_i_id = linked_nodes[ max_i ].getId(); + private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { + PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + " (", + scientific_names_mapped_to_reduced_specificity ); + if ( s == null ) { + if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) { + final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim(); + s = species_to_node_map.get( new_tax_str ); + if ( s != null ) { + addScientificNamesMappedToReducedSpecificity( tax_str, + new_tax_str, + scientific_names_mapped_to_reduced_specificity ); + } } - if ( id_i < min_i_id ) { - min_i = i; - min_i_id = linked_nodes[ min_i ].getId(); + } + if ( s == null ) { + for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ", + " form ", " subform ", " cultivar ", " section ", " subsection " } ) { + s = tryMapByRemovingOverlySpecificData( species_to_node_map, + tax_str, + t, + scientific_names_mapped_to_reduced_specificity ); + if ( s != null ) { + break; + } } } - return new int[] { min_i, max_i }; + return s; + } + + private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map species_to_node_map, + final String tax_str, + final String term, + final SortedSet scientific_names_mapped_to_reduced_specificity ) { + final int i = tax_str.indexOf( term ); + if ( i > 4 ) { + final String new_tax_str = tax_str.substring( 0, i ).trim(); + final PhylogenyNode s = species_to_node_map.get( new_tax_str ); + if ( s != null ) { + addScientificNamesMappedToReducedSpecificity( tax_str, + new_tax_str, + scientific_names_mapped_to_reduced_specificity ); + } + return s; + } + return null; } - /** - * Updates the mapping function M after the root of the gene tree has been - * moved by one branch. It calculates M for the root of the gene tree and - * one of its two children. - *

- * To be used ONLY by method "SDIunrooted.fastInfer(Phylogeny,Phylogeny)". - *

- * (Last modfied: ) - * - * @param prev_root_was_dup - * true if the previous root was a duplication, false otherwise - * @param prev_root_c1 - * child 1 of the previous root - * @param prev_root_c2 - * child 2 of the previous root - * @return number of duplications which have been assigned in gene tree - */ - // int updateM( final boolean prev_root_was_dup, - // final PhylogenyNode prev_root_c1, final PhylogenyNode prev_root_c2 ) { - // final PhylogenyNode root = getGeneTree().getRoot(); - // if ( ( root.getChildNode1() == prev_root_c1 ) - // || ( root.getChildNode2() == prev_root_c1 ) ) { - // calculateMforNode( prev_root_c1 ); - // } - // else { - // calculateMforNode( prev_root_c2 ); - // } - // Event event = null; - // if ( prev_root_was_dup ) { - // event = Event.createSingleDuplicationEvent(); - // } - // else { - // event = Event.createSingleSpeciationEvent(); - // } - // root.getPhylogenyNodeData().setEvent( event ); - // calculateMforNode( root ); - // return getDuplications(); - // } // updateM( boolean, PhylogenyNode, PhylogenyNode ) - // Helper method for updateM( boolean, PhylogenyNode, PhylogenyNode ) - // Calculates M for PhylogenyNode n, given that M for the two children - // of n has been calculated. - // (Last modified: 10/02/01) - // private void calculateMforNode( final PhylogenyNode n ) { - // if ( !n.isExternal() ) { - // boolean was_duplication = n.isDuplication(); - // PhylogenyNode a = n.getChildNode1().getLink(), b = n - // .getChildNode2().getLink(); - // while ( a != b ) { - // if ( a.getID() > b.getID() ) { - // a = a.getParent(); - // } - // else { - // b = b.getParent(); - // } - // } - // n.setLink( a ); - // Event event = null; - // if ( ( a == n.getChildNode1().getLink() ) - // || ( a == n.getChildNode2().getLink() ) ) { - // event = Event.createSingleDuplicationEvent(); - // if ( !was_duplication ) { - // increaseDuplications(); - // } - // } - // else { - // event = Event.createSingleSpeciationEvent(); - // if ( was_duplication ) { - // decreaseDuplications(); - // } - // } - // n.getPhylogenyNodeData().setEvent( event ); - // } - // } // calculateMforNode( PhylogenyNode ) }