--- /dev/null
+#include "muscle.h"\r
+#include "tree.h"\r
+\r
+#define TRACE 0\r
+\r
+/***\r
+Algorithm to compare two trees, X and Y.\r
+\r
+A node x in X and node y in Y are defined to be\r
+similar iff the set of leaves in the subtree under\r
+x is identical to the set of leaves under y.\r
+\r
+A node is defined to be dissimilar iff it is not\r
+similar to any node in the other tree.\r
+\r
+Nodes x and y are defined to be married iff every\r
+node in the subtree under x is similar to a node\r
+in the subtree under y. Married nodes are considered\r
+to be equal. The subtrees under two married nodes can\r
+at most differ by exchanges of left and right branches,\r
+which we do not consider to be significant here.\r
+\r
+A node is defined to be a bachelor iff it is not\r
+married. If a node is a bachelor, then it has a\r
+dissimilar node in its subtree, and it follows\r
+immediately from the definition of marriage that its\r
+parent is also a bachelor. Hence all nodes on the path\r
+from a bachelor node to the root are bachelors.\r
+\r
+We assume the trees have the same set of leaves, so\r
+every leaf is trivially both similar and married to\r
+the same leaf in the opposite tree. Bachelor nodes\r
+are therefore always internal (i.e., non-leaf) nodes.\r
+\r
+A node is defined to be a diff iff (a) it is married\r
+and (b) its parent is a bachelor. The subtree under\r
+a diff is maximally similar to the other tree. (In\r
+other words, you cannot extend the subtree without\r
+adding a bachelor). \r
+\r
+The set of diffs is the subset of the two trees that\r
+we consider to be identical.\r
+\r
+Example:\r
+\r
+ -----A\r
+ -----k\r
+ ----j -----B\r
+--i -----C\r
+ ------D\r
+\r
+\r
+ -----A\r
+ -----p\r
+ ----n -----B\r
+--m -----D\r
+ ------C\r
+\r
+\r
+The following pairs of internal nodes are similar.\r
+\r
+ Nodes Set of leaves\r
+ ----- -------------\r
+ k,p A,B\r
+ i,m A,B,C,D\r
+\r
+Bachelors in the first tree are i and j, bachelors\r
+in the second tree are m and n.\r
+\r
+Node k and p are married, but i and m are not (because j\r
+and n are bachelors). The diffs are C, D and k.\r
+\r
+The set of bachelor nodes can be viewed as the internal\r
+nodes of a tree, the leaves of which are diffs. (To see\r
+that there can't be disjoint subtrees, note that the path\r
+from a diff to a root is all bachelor nodes, so there is\r
+always a path between two diffs that goes through the root).\r
+We call this tree the "diffs tree".\r
+\r
+There is a simple O(N) algorithm to build the diffs tree.\r
+To achieve O(N) we avoid traversing a given subtree multiple\r
+times and also avoid comparing lists of leaves. \r
+\r
+We visit nodes in depth-first order (i.e., a node is visited\r
+before its parent).\r
+\r
+If either child of a node is a bachelor, we flag it as\r
+a bachelor.\r
+\r
+If both children of the node we are visiting are married,\r
+we check whether the spouses of those children have the\r
+same parent in the other tree. If the parents are different,\r
+the current node is a bachelor. If they have the same parent,\r
+then the node we are visiting is the spouse of that parent.\r
+We assign this newly identified married couple a unique integer\r
+id. The id of a node is in one-to-one correspondence with the\r
+set of leaves in its subtree. Two nodes have the same set of\r
+leaves iff they have the same id. Bachelor nodes do not get\r
+an id.\r
+***/\r
+\r
+static void BuildDiffs(const Tree &tree, unsigned uTreeNodeIndex,\r
+ const bool bIsDiff[], Tree &Diffs, unsigned uDiffsNodeIndex,\r
+ unsigned IdToDiffsLeafNodeIndex[])\r
+ {\r
+#if TRACE\r
+ Log("BuildDiffs(TreeNode=%u IsDiff=%d IsLeaf=%d)\n",\r
+ uTreeNodeIndex, bIsDiff[uTreeNodeIndex], tree.IsLeaf(uTreeNodeIndex));\r
+#endif\r
+ if (bIsDiff[uTreeNodeIndex])\r
+ {\r
+ unsigned uLeafCount = tree.GetLeafCount();\r
+ unsigned *Leaves = new unsigned[uLeafCount];\r
+ GetLeaves(tree, uTreeNodeIndex, Leaves, &uLeafCount);\r
+ for (unsigned n = 0; n < uLeafCount; ++n)\r
+ {\r
+ const unsigned uLeafNodeIndex = Leaves[n];\r
+ const unsigned uId = tree.GetLeafId(uLeafNodeIndex);\r
+ if (uId >= tree.GetLeafCount())\r
+ Quit("BuildDiffs, id out of range");\r
+ IdToDiffsLeafNodeIndex[uId] = uDiffsNodeIndex;\r
+#if TRACE\r
+ Log(" Leaf id=%u DiffsNode=%u\n", uId, uDiffsNodeIndex);\r
+#endif\r
+ }\r
+ delete[] Leaves;\r
+ return;\r
+ }\r
+\r
+ if (tree.IsLeaf(uTreeNodeIndex))\r
+ Quit("BuildDiffs: should never reach leaf");\r
+\r
+ const unsigned uTreeLeft = tree.GetLeft(uTreeNodeIndex);\r
+ const unsigned uTreeRight = tree.GetRight(uTreeNodeIndex);\r
+\r
+ const unsigned uDiffsLeft = Diffs.AppendBranch(uDiffsNodeIndex);\r
+ const unsigned uDiffsRight = uDiffsLeft + 1;\r
+\r
+ BuildDiffs(tree, uTreeLeft, bIsDiff, Diffs, uDiffsLeft, IdToDiffsLeafNodeIndex);\r
+ BuildDiffs(tree, uTreeRight, bIsDiff, Diffs, uDiffsRight, IdToDiffsLeafNodeIndex);\r
+ }\r
+\r
+void DiffTrees(const Tree &Tree1, const Tree &Tree2, Tree &Diffs,\r
+ unsigned IdToDiffsLeafNodeIndex[])\r
+ {\r
+#if TRACE\r
+ Log("Tree1:\n");\r
+ Tree1.LogMe();\r
+ Log("\n");\r
+ Log("Tree2:\n");\r
+ Tree2.LogMe();\r
+#endif\r
+\r
+ if (!Tree1.IsRooted() || !Tree2.IsRooted())\r
+ Quit("DiffTrees: requires rooted trees");\r
+\r
+ const unsigned uNodeCount = Tree1.GetNodeCount();\r
+ const unsigned uNodeCount2 = Tree2.GetNodeCount();\r
+ \r
+ const unsigned uLeafCount = Tree1.GetLeafCount();\r
+ const unsigned uLeafCount2 = Tree2.GetLeafCount();\r
+ assert(uLeafCount == uLeafCount2);\r
+\r
+ if (uNodeCount != uNodeCount2)\r
+ Quit("DiffTrees: different node counts");\r
+\r
+// Allocate tables so we can convert tree node index to\r
+// and from the unique id with a O(1) lookup.\r
+ unsigned *NodeIndexToId1 = new unsigned[uNodeCount];\r
+ unsigned *IdToNodeIndex2 = new unsigned[uNodeCount];\r
+\r
+ bool *bIsBachelor1 = new bool[uNodeCount];\r
+ bool *bIsDiff1 = new bool[uNodeCount];\r
+\r
+ for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)\r
+ {\r
+ NodeIndexToId1[uNodeIndex] = uNodeCount;\r
+ bIsBachelor1[uNodeIndex] = false;\r
+ bIsDiff1[uNodeIndex] = false;\r
+\r
+ // Use uNodeCount as value meaning "not set".\r
+ IdToNodeIndex2[uNodeIndex] = uNodeCount;\r
+ }\r
+\r
+// Initialize node index <-> id lookup tables\r
+ for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)\r
+ {\r
+ if (Tree1.IsLeaf(uNodeIndex))\r
+ {\r
+ const unsigned uId = Tree1.GetLeafId(uNodeIndex);\r
+ if (uId >= uNodeCount)\r
+ Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");\r
+ NodeIndexToId1[uNodeIndex] = uId;\r
+ }\r
+\r
+ if (Tree2.IsLeaf(uNodeIndex))\r
+ {\r
+ const unsigned uId = Tree2.GetLeafId(uNodeIndex);\r
+ if (uId >= uNodeCount)\r
+ Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");\r
+ IdToNodeIndex2[uId] = uNodeIndex;\r
+ }\r
+ }\r
+\r
+// Validity check. This verifies that the ids\r
+// pre-assigned to the leaves in Tree1 are unique\r
+// (note that the id<N check above does not rule\r
+// out two leaves having duplicate ids).\r
+ for (unsigned uId = 0; uId < uLeafCount; ++uId)\r
+ {\r
+ unsigned uNodeIndex2 = IdToNodeIndex2[uId];\r
+ if (uNodeCount == uNodeIndex2)\r
+ Quit("DiffTrees, check 2");\r
+ }\r
+\r
+// Ids assigned to internal nodes are N, N+1 ...\r
+// An internal node id uniquely identifies a set\r
+// of two or more leaves.\r
+ unsigned uInternalNodeId = uLeafCount;\r
+\r
+// Depth-first traversal of tree.\r
+// The order guarantees that a node is visited before\r
+// its parent is visited.\r
+ for (unsigned uNodeIndex1 = Tree1.FirstDepthFirstNode();\r
+ NULL_NEIGHBOR != uNodeIndex1;\r
+ uNodeIndex1 = Tree1.NextDepthFirstNode(uNodeIndex1))\r
+ {\r
+#if TRACE\r
+ Log("Main loop: Node1=%u IsLeaf=%d IsBachelor=%d\n",\r
+ uNodeIndex1,\r
+ Tree1.IsLeaf(uNodeIndex1),\r
+ bIsBachelor1[uNodeIndex1]);\r
+#endif\r
+\r
+ // Leaves are trivial; nothing to do.\r
+ if (Tree1.IsLeaf(uNodeIndex1) || bIsBachelor1[uNodeIndex1])\r
+ continue;\r
+\r
+ // If either child is a bachelor, flag\r
+ // this node as a bachelor and continue.\r
+ unsigned uLeft1 = Tree1.GetLeft(uNodeIndex1);\r
+ if (bIsBachelor1[uLeft1])\r
+ {\r
+ bIsBachelor1[uNodeIndex1] = true;\r
+ continue;\r
+ }\r
+\r
+ unsigned uRight1 = Tree1.GetRight(uNodeIndex1);\r
+ if (bIsBachelor1[uRight1])\r
+ {\r
+ bIsBachelor1[uNodeIndex1] = true;\r
+ continue;\r
+ }\r
+\r
+ // Both children are married.\r
+ // Married nodes are guaranteed to have an id.\r
+ unsigned uIdLeft = NodeIndexToId1[uLeft1];\r
+ unsigned uIdRight = NodeIndexToId1[uRight1];\r
+\r
+ if (uIdLeft == uNodeCount || uIdRight == uNodeCount)\r
+ Quit("DiffTrees, check 5");\r
+\r
+ // uLeft2 is the spouse of uLeft1, and similarly for uRight2.\r
+ unsigned uLeft2 = IdToNodeIndex2[uIdLeft];\r
+ unsigned uRight2 = IdToNodeIndex2[uIdRight];\r
+\r
+ if (uLeft2 == uNodeCount || uRight2 == uNodeCount)\r
+ Quit("DiffTrees, check 6");\r
+\r
+ // If the spouses of uLeft1 and uRight1 have the same\r
+ // parent, then this parent is the spouse of uNodeIndex1.\r
+ // Otherwise, uNodeIndex1 is a diff.\r
+ unsigned uParentLeft2 = Tree2.GetParent(uLeft2);\r
+ unsigned uParentRight2 = Tree2.GetParent(uRight2);\r
+\r
+#if TRACE\r
+ Log("L1=%u R1=%u L2=%u R2=%u PL2=%u PR2=%u\n",\r
+ uLeft1,\r
+ uRight1,\r
+ uLeft2,\r
+ uRight2,\r
+ uParentLeft2,\r
+ uParentRight2);\r
+#endif\r
+\r
+ if (uParentLeft2 == uParentRight2)\r
+ {\r
+ NodeIndexToId1[uNodeIndex1] = uInternalNodeId;\r
+ IdToNodeIndex2[uInternalNodeId] = uParentLeft2;\r
+ ++uInternalNodeId;\r
+ }\r
+ else\r
+ bIsBachelor1[uNodeIndex1] = true;\r
+ }\r
+\r
+ unsigned uDiffCount = 0;\r
+ for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)\r
+ {\r
+ if (bIsBachelor1[uNodeIndex])\r
+ continue;\r
+ if (Tree1.IsRoot(uNodeIndex))\r
+ {\r
+ // Special case: if no bachelors, consider the\r
+ // root a diff.\r
+ if (!bIsBachelor1[uNodeIndex])\r
+ bIsDiff1[uNodeIndex] = true;\r
+ continue;\r
+ }\r
+ const unsigned uParent = Tree1.GetParent(uNodeIndex);\r
+ if (bIsBachelor1[uParent])\r
+ {\r
+ bIsDiff1[uNodeIndex] = true;\r
+ ++uDiffCount;\r
+ }\r
+ }\r
+\r
+#if TRACE\r
+ Log("Tree1:\n");\r
+ Log("Node Id Bach Diff Name\n");\r
+ Log("---- ---- ---- ---- ----\n");\r
+ for (unsigned n = 0; n < uNodeCount; ++n)\r
+ {\r
+ Log("%4u %4u %d %d",\r
+ n,\r
+ NodeIndexToId1[n],\r
+ bIsBachelor1[n],\r
+ bIsDiff1[n]);\r
+ if (Tree1.IsLeaf(n))\r
+ Log(" %s", Tree1.GetLeafName(n));\r
+ Log("\n");\r
+ }\r
+ Log("\n");\r
+ Log("Tree2:\n");\r
+ Log("Node Id Name\n");\r
+ Log("---- ---- ----\n");\r
+ for (unsigned n = 0; n < uNodeCount; ++n)\r
+ {\r
+ Log("%4u ", n);\r
+ if (Tree2.IsLeaf(n))\r
+ Log(" %s", Tree2.GetLeafName(n));\r
+ Log("\n");\r
+ }\r
+#endif\r
+\r
+ Diffs.CreateRooted();\r
+ const unsigned uDiffsRootIndex = Diffs.GetRootNodeIndex();\r
+ const unsigned uRootIndex1 = Tree1.GetRootNodeIndex();\r
+\r
+ for (unsigned n = 0; n < uLeafCount; ++n)\r
+ IdToDiffsLeafNodeIndex[n] = uNodeCount;\r
+\r
+ BuildDiffs(Tree1, uRootIndex1, bIsDiff1, Diffs, uDiffsRootIndex,\r
+ IdToDiffsLeafNodeIndex);\r
+\r
+#if TRACE\r
+ Log("\n");\r
+ Log("Diffs:\n");\r
+ Diffs.LogMe();\r
+ Log("\n");\r
+ Log("IdToDiffsLeafNodeIndex:");\r
+ for (unsigned n = 0; n < uLeafCount; ++n)\r
+ {\r
+ if (n%16 == 0)\r
+ Log("\n");\r
+ else\r
+ Log(" ");\r
+ Log("%u=%u", n, IdToDiffsLeafNodeIndex[n]);\r
+ }\r
+ Log("\n");\r
+#endif\r
+\r
+ for (unsigned n = 0; n < uLeafCount; ++n)\r
+ if (IdToDiffsLeafNodeIndex[n] == uNodeCount)\r
+ Quit("TreeDiffs check 7");\r
+\r
+ delete[] NodeIndexToId1;\r
+ delete[] IdToNodeIndex2;\r
+\r
+ delete[] bIsBachelor1;\r
+ delete[] bIsDiff1;\r
+ }\r