Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / muscle / difftrees.cpp
diff --git a/website/archive/binaries/mac/src/muscle/difftrees.cpp b/website/archive/binaries/mac/src/muscle/difftrees.cpp
new file mode 100644 (file)
index 0000000..7445d22
--- /dev/null
@@ -0,0 +1,381 @@
+#include "muscle.h"\r
+#include "tree.h"\r
+\r
+#define TRACE  0\r
+\r
+/***\r
+Algorithm to compare two trees, X and Y.\r
+\r
+A node x in X and node y in Y are defined to be\r
+similar iff the set of leaves in the subtree under\r
+x is identical to the set of leaves under y.\r
+\r
+A node is defined to be dissimilar iff it is not\r
+similar to any node in the other tree.\r
+\r
+Nodes x and y are defined to be married iff every\r
+node in the subtree under x is similar to a node\r
+in the subtree under y. Married nodes are considered\r
+to be equal. The subtrees under two married nodes can\r
+at most differ by exchanges of left and right branches,\r
+which we do not consider to be significant here.\r
+\r
+A node is defined to be a bachelor iff it is not\r
+married. If a node is a bachelor, then it has a\r
+dissimilar node in its subtree, and it follows\r
+immediately from the definition of marriage that its\r
+parent is also a bachelor. Hence all nodes on the path\r
+from a bachelor node to the root are bachelors.\r
+\r
+We assume the trees have the same set of leaves, so\r
+every leaf is trivially both similar and married to\r
+the same leaf in the opposite tree. Bachelor nodes\r
+are therefore always internal (i.e., non-leaf) nodes.\r
+\r
+A node is defined to be a diff iff (a) it is married\r
+and (b) its parent is a bachelor. The subtree under\r
+a diff is maximally similar to the other tree. (In\r
+other words, you cannot extend the subtree without\r
+adding a bachelor). \r
+\r
+The set of diffs is the subset of the two trees that\r
+we consider to be identical.\r
+\r
+Example:\r
+\r
+              -----A\r
+        -----k\r
+   ----j      -----B\r
+--i     -----C\r
+   ------D\r
+\r
+\r
+              -----A\r
+        -----p\r
+   ----n      -----B\r
+--m     -----D\r
+   ------C\r
+\r
+\r
+The following pairs of internal nodes are similar.\r
+\r
+       Nodes   Set of leaves\r
+       -----   -------------\r
+       k,p             A,B\r
+       i,m             A,B,C,D\r
+\r
+Bachelors in the first tree are i and j, bachelors\r
+in the second tree are m and n.\r
+\r
+Node k and p are married, but i and m are not (because j\r
+and n are bachelors). The diffs are C, D and k.\r
+\r
+The set of bachelor nodes can be viewed as the internal\r
+nodes of a tree, the leaves of which are diffs. (To see\r
+that there can't be disjoint subtrees, note that the path\r
+from a diff to a root is all bachelor nodes, so there is\r
+always a path between two diffs that goes through the root).\r
+We call this tree the "diffs tree".\r
+\r
+There is a simple O(N) algorithm to build the diffs tree.\r
+To achieve O(N) we avoid traversing a given subtree multiple\r
+times and also avoid comparing lists of leaves. \r
+\r
+We visit nodes in depth-first order (i.e., a node is visited\r
+before its parent).\r
+\r
+If either child of a node is a bachelor, we flag it as\r
+a bachelor.\r
+\r
+If both children of the node we are visiting are married,\r
+we check whether the spouses of those children have the\r
+same parent in the other tree. If the parents are different,\r
+the current node is a bachelor. If they have the same parent,\r
+then the node we are visiting is the spouse of that parent.\r
+We assign this newly identified married couple a unique integer\r
+id. The id of a node is in one-to-one correspondence with the\r
+set of leaves in its subtree. Two nodes have the same set of\r
+leaves iff they have the same id. Bachelor nodes do not get\r
+an id.\r
+***/\r
+\r
+static void BuildDiffs(const Tree &tree, unsigned uTreeNodeIndex,\r
+  const bool bIsDiff[], Tree &Diffs, unsigned uDiffsNodeIndex,\r
+  unsigned IdToDiffsLeafNodeIndex[])\r
+       {\r
+#if    TRACE\r
+       Log("BuildDiffs(TreeNode=%u IsDiff=%d IsLeaf=%d)\n",\r
+         uTreeNodeIndex, bIsDiff[uTreeNodeIndex], tree.IsLeaf(uTreeNodeIndex));\r
+#endif\r
+       if (bIsDiff[uTreeNodeIndex])\r
+               {\r
+               unsigned uLeafCount = tree.GetLeafCount();\r
+               unsigned *Leaves = new unsigned[uLeafCount];\r
+               GetLeaves(tree, uTreeNodeIndex, Leaves, &uLeafCount);\r
+               for (unsigned n = 0; n < uLeafCount; ++n)\r
+                       {\r
+                       const unsigned uLeafNodeIndex = Leaves[n];\r
+                       const unsigned uId = tree.GetLeafId(uLeafNodeIndex);\r
+                       if (uId >= tree.GetLeafCount())\r
+                               Quit("BuildDiffs, id out of range");\r
+                       IdToDiffsLeafNodeIndex[uId] = uDiffsNodeIndex;\r
+#if    TRACE\r
+                       Log("  Leaf id=%u DiffsNode=%u\n", uId, uDiffsNodeIndex);\r
+#endif\r
+                       }\r
+               delete[] Leaves;\r
+               return;\r
+               }\r
+\r
+       if (tree.IsLeaf(uTreeNodeIndex))\r
+               Quit("BuildDiffs: should never reach leaf");\r
+\r
+       const unsigned uTreeLeft = tree.GetLeft(uTreeNodeIndex);\r
+       const unsigned uTreeRight = tree.GetRight(uTreeNodeIndex);\r
+\r
+       const unsigned uDiffsLeft = Diffs.AppendBranch(uDiffsNodeIndex);\r
+       const unsigned uDiffsRight = uDiffsLeft + 1;\r
+\r
+       BuildDiffs(tree, uTreeLeft, bIsDiff, Diffs, uDiffsLeft, IdToDiffsLeafNodeIndex);\r
+       BuildDiffs(tree, uTreeRight, bIsDiff, Diffs, uDiffsRight, IdToDiffsLeafNodeIndex);\r
+       }\r
+\r
+void DiffTrees(const Tree &Tree1, const Tree &Tree2, Tree &Diffs,\r
+  unsigned IdToDiffsLeafNodeIndex[])\r
+       {\r
+#if    TRACE\r
+       Log("Tree1:\n");\r
+       Tree1.LogMe();\r
+       Log("\n");\r
+       Log("Tree2:\n");\r
+       Tree2.LogMe();\r
+#endif\r
+\r
+       if (!Tree1.IsRooted() || !Tree2.IsRooted())\r
+               Quit("DiffTrees: requires rooted trees");\r
+\r
+       const unsigned uNodeCount = Tree1.GetNodeCount();\r
+       const unsigned uNodeCount2 = Tree2.GetNodeCount();\r
+       \r
+       const unsigned uLeafCount = Tree1.GetLeafCount();\r
+       const unsigned uLeafCount2 = Tree2.GetLeafCount();\r
+       assert(uLeafCount == uLeafCount2);\r
+\r
+       if (uNodeCount != uNodeCount2)\r
+               Quit("DiffTrees: different node counts");\r
+\r
+// Allocate tables so we can convert tree node index to\r
+// and from the unique id with a O(1) lookup.\r
+       unsigned *NodeIndexToId1 = new unsigned[uNodeCount];\r
+       unsigned *IdToNodeIndex2 = new unsigned[uNodeCount];\r
+\r
+       bool *bIsBachelor1 = new bool[uNodeCount];\r
+       bool *bIsDiff1 = new bool[uNodeCount];\r
+\r
+       for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)\r
+               {\r
+               NodeIndexToId1[uNodeIndex] = uNodeCount;\r
+               bIsBachelor1[uNodeIndex] = false;\r
+               bIsDiff1[uNodeIndex] = false;\r
+\r
+       // Use uNodeCount as value meaning "not set".\r
+               IdToNodeIndex2[uNodeIndex] = uNodeCount;\r
+               }\r
+\r
+// Initialize node index <-> id lookup tables\r
+       for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)\r
+               {\r
+               if (Tree1.IsLeaf(uNodeIndex))\r
+                       {\r
+                       const unsigned uId = Tree1.GetLeafId(uNodeIndex);\r
+                       if (uId >= uNodeCount)\r
+                               Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");\r
+                       NodeIndexToId1[uNodeIndex] = uId;\r
+                       }\r
+\r
+               if (Tree2.IsLeaf(uNodeIndex))\r
+                       {\r
+                       const unsigned uId = Tree2.GetLeafId(uNodeIndex);\r
+                       if (uId >= uNodeCount)\r
+                               Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");\r
+                       IdToNodeIndex2[uId] = uNodeIndex;\r
+                       }\r
+               }\r
+\r
+// Validity check. This verifies that the ids\r
+// pre-assigned to the leaves in Tree1 are unique\r
+// (note that the id<N check above does not rule\r
+// out two leaves having duplicate ids).\r
+       for (unsigned uId = 0; uId < uLeafCount; ++uId)\r
+               {\r
+               unsigned uNodeIndex2 = IdToNodeIndex2[uId];\r
+               if (uNodeCount == uNodeIndex2)\r
+                       Quit("DiffTrees, check 2");\r
+               }\r
+\r
+// Ids assigned to internal nodes are N, N+1 ...\r
+// An internal node id uniquely identifies a set\r
+// of two or more leaves.\r
+       unsigned uInternalNodeId = uLeafCount;\r
+\r
+// Depth-first traversal of tree.\r
+// The order guarantees that a node is visited before\r
+// its parent is visited.\r
+       for (unsigned uNodeIndex1 = Tree1.FirstDepthFirstNode();\r
+         NULL_NEIGHBOR != uNodeIndex1;\r
+         uNodeIndex1 = Tree1.NextDepthFirstNode(uNodeIndex1))\r
+               {\r
+#if    TRACE\r
+               Log("Main loop: Node1=%u IsLeaf=%d IsBachelor=%d\n",\r
+                 uNodeIndex1,\r
+                 Tree1.IsLeaf(uNodeIndex1),\r
+                 bIsBachelor1[uNodeIndex1]);\r
+#endif\r
+\r
+       // Leaves are trivial; nothing to do.\r
+               if (Tree1.IsLeaf(uNodeIndex1) || bIsBachelor1[uNodeIndex1])\r
+                       continue;\r
+\r
+       // If either child is a bachelor, flag\r
+       // this node as a bachelor and continue.\r
+               unsigned uLeft1 = Tree1.GetLeft(uNodeIndex1);\r
+               if (bIsBachelor1[uLeft1])\r
+                       {\r
+                       bIsBachelor1[uNodeIndex1] = true;\r
+                       continue;\r
+                       }\r
+\r
+               unsigned uRight1 = Tree1.GetRight(uNodeIndex1);\r
+               if (bIsBachelor1[uRight1])\r
+                       {\r
+                       bIsBachelor1[uNodeIndex1] = true;\r
+                       continue;\r
+                       }\r
+\r
+       // Both children are married.\r
+       // Married nodes are guaranteed to have an id.\r
+               unsigned uIdLeft = NodeIndexToId1[uLeft1];\r
+               unsigned uIdRight = NodeIndexToId1[uRight1];\r
+\r
+               if (uIdLeft == uNodeCount || uIdRight == uNodeCount)\r
+                       Quit("DiffTrees, check 5");\r
+\r
+       // uLeft2 is the spouse of uLeft1, and similarly for uRight2.\r
+               unsigned uLeft2 = IdToNodeIndex2[uIdLeft];\r
+               unsigned uRight2 = IdToNodeIndex2[uIdRight];\r
+\r
+               if (uLeft2 == uNodeCount || uRight2 == uNodeCount)\r
+                       Quit("DiffTrees, check 6");\r
+\r
+       // If the spouses of uLeft1 and uRight1 have the same\r
+       // parent, then this parent is the spouse of uNodeIndex1.\r
+       // Otherwise, uNodeIndex1 is a diff.\r
+               unsigned uParentLeft2 = Tree2.GetParent(uLeft2);\r
+               unsigned uParentRight2 = Tree2.GetParent(uRight2);\r
+\r
+#if    TRACE\r
+               Log("L1=%u R1=%u L2=%u R2=%u PL2=%u PR2=%u\n",\r
+                 uLeft1,\r
+                 uRight1,\r
+                 uLeft2,\r
+                 uRight2,\r
+                 uParentLeft2,\r
+                 uParentRight2);\r
+#endif\r
+\r
+               if (uParentLeft2 == uParentRight2)\r
+                       {\r
+                       NodeIndexToId1[uNodeIndex1] = uInternalNodeId;\r
+                       IdToNodeIndex2[uInternalNodeId] = uParentLeft2;\r
+                       ++uInternalNodeId;\r
+                       }\r
+               else\r
+                       bIsBachelor1[uNodeIndex1] = true;\r
+               }\r
+\r
+       unsigned uDiffCount = 0;\r
+       for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)\r
+               {\r
+               if (bIsBachelor1[uNodeIndex])\r
+                       continue;\r
+               if (Tree1.IsRoot(uNodeIndex))\r
+                       {\r
+               // Special case: if no bachelors, consider the\r
+               // root a diff.\r
+                       if (!bIsBachelor1[uNodeIndex])\r
+                               bIsDiff1[uNodeIndex] = true;\r
+                       continue;\r
+                       }\r
+               const unsigned uParent = Tree1.GetParent(uNodeIndex);\r
+               if (bIsBachelor1[uParent])\r
+                       {\r
+                       bIsDiff1[uNodeIndex] = true;\r
+                       ++uDiffCount;\r
+                       }\r
+               }\r
+\r
+#if    TRACE\r
+       Log("Tree1:\n");\r
+       Log("Node    Id  Bach  Diff  Name\n");\r
+       Log("----  ----  ----  ----  ----\n");\r
+       for (unsigned n = 0; n < uNodeCount; ++n)\r
+               {\r
+               Log("%4u  %4u     %d     %d",\r
+                 n,\r
+                 NodeIndexToId1[n],\r
+                 bIsBachelor1[n],\r
+                 bIsDiff1[n]);\r
+               if (Tree1.IsLeaf(n))\r
+                       Log("  %s", Tree1.GetLeafName(n));\r
+               Log("\n");\r
+               }\r
+       Log("\n");\r
+       Log("Tree2:\n");\r
+       Log("Node    Id              Name\n");\r
+       Log("----  ----              ----\n");\r
+       for (unsigned n = 0; n < uNodeCount; ++n)\r
+               {\r
+               Log("%4u                  ", n);\r
+               if (Tree2.IsLeaf(n))\r
+                       Log("  %s", Tree2.GetLeafName(n));\r
+               Log("\n");\r
+               }\r
+#endif\r
+\r
+       Diffs.CreateRooted();\r
+       const unsigned uDiffsRootIndex = Diffs.GetRootNodeIndex();\r
+       const unsigned uRootIndex1 = Tree1.GetRootNodeIndex();\r
+\r
+       for (unsigned n = 0; n < uLeafCount; ++n)\r
+               IdToDiffsLeafNodeIndex[n] = uNodeCount;\r
+\r
+       BuildDiffs(Tree1, uRootIndex1, bIsDiff1, Diffs, uDiffsRootIndex,\r
+         IdToDiffsLeafNodeIndex);\r
+\r
+#if TRACE\r
+       Log("\n");\r
+       Log("Diffs:\n");\r
+       Diffs.LogMe();\r
+       Log("\n");\r
+       Log("IdToDiffsLeafNodeIndex:");\r
+       for (unsigned n = 0; n < uLeafCount; ++n)\r
+               {\r
+               if (n%16 == 0)\r
+                       Log("\n");\r
+               else\r
+                       Log(" ");\r
+               Log("%u=%u", n, IdToDiffsLeafNodeIndex[n]);\r
+               }\r
+       Log("\n");\r
+#endif\r
+\r
+       for (unsigned n = 0; n < uLeafCount; ++n)\r
+               if (IdToDiffsLeafNodeIndex[n] == uNodeCount)\r
+                       Quit("TreeDiffs check 7");\r
+\r
+       delete[] NodeIndexToId1;\r
+       delete[] IdToNodeIndex2;\r
+\r
+       delete[] bIsBachelor1;\r
+       delete[] bIsDiff1;\r
+       }\r