Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / muscle / setgscweights.cpp
diff --git a/website/archive/binaries/mac/src/muscle/setgscweights.cpp b/website/archive/binaries/mac/src/muscle/setgscweights.cpp
new file mode 100644 (file)
index 0000000..7e7e787
--- /dev/null
@@ -0,0 +1,195 @@
+/***\r
+Gerstein/Sonnhammer/Chothia ad hoc sequence weighting.\r
+The algorithm was deduced by reverse-engineering the\r
+HMMer code.\r
+\r
+I used an alternative representation that I prefer over\r
+HMMer's. The HMMer code is full of tree manipulations\r
+that do something to the left child and then the equivalent\r
+thing to the right child. It was clear that there must be\r
+a re-formulation that does everything once for each node,\r
+which would reduce the number of operations expressed\r
+in the code by a factor of two. This gives a more elegant\r
+and less error-prone way to code it.\r
+\r
+These notes explain the correspondence between my design\r
+and Eddy's.\r
+\r
+HMMer stores a data structure phylo_s for each non-leaf\r
+node in the cluster tree. This structure contains the\r
+following fields:\r
+\r
+       diff            Weight of the node\r
+       lblen           Left branch length\r
+       rblen           Right branch length\r
+\r
+The lblen and rblen branch lengths are calculated as:\r
+\r
+       this.lblen = this.diff - left.diff\r
+       this.rblen = this.diff - right.diff\r
+\r
+My code stores one ClusterNode data structure per node\r
+in the cluster tree, including leaves. I store only the\r
+weight. I can recover the HMMer branch length fields\r
+in a trivial O(1) calculation as follows:\r
+\r
+       lblen = Node.GetWeight() - Node.GetLeft()->GetWeight()\r
+       rblen = Node.GetWeight() - Node.GetRight()->GetWeight()\r
+\r
+For the GSC weights calculation, HMMer constructs the\r
+following vectors, which have entries for all nodes,\r
+including leaves:\r
+\r
+       lwt             Left weight\r
+       rwt             Right weight\r
+\r
+The "left weight" is calculated as the sum of the weights in\r
+all the nodes reachable through the left branch, including\r
+the node itself. (This is not immediately obvious from the\r
+code, which does the calculation using branch lengths rather\r
+than weights, but this is an equivalent, and to my mind clearer,\r
+statement of what they are). Similarly, the "right weight" is\r
+the sum of all weights reachable via the right branch. I define\r
+the "cluster weight" to be the summed weight of all nodes in the\r
+subtree under the node, including the node itself. I provide\r
+a function Node.GetClusterWeight() which calculates the cluster\r
+weight using a O(ln N) recursion through the tree. The lwt and\r
+rwt values can be recovered as follows:\r
+\r
+       lwt             = Node.GetLeft()->GetClusterWeight()\r
+                       + Node.GetWeight()\r
+\r
+       lwt             = Node.GetLeft()->GetClusterWeight()\r
+                       + Node.GetWeight()\r
+\r
+HMMer calculates a further vector fwt as follows.\r
+\r
+       this.fwt = parent.fwt * parent.lwt / (parent.lwt + parent.rwt)\r
+\r
+This applies to nodes reached via a left branch, for nodes reached\r
+via a right branch:\r
+\r
+       this.fwt = parent.fwt * parent.rwt / (parent.lwt + parent.rwt)\r
+\r
+The values of fwt at the leaf nodes are the final GSC weights.\r
+We derive the various terms using our equivalents.\r
+\r
+       parent.lwt      = Parent.GetLeft()->GetClusterWeight()\r
+                               + Parent.GetWeight()\r
+\r
+       parent.rwt      = Parent.GetRight()->GetClusterWeight()\r
+                               + Parent.GetWeight()\r
+\r
+       parent.lwt + parent.rwt =\r
+                               { Parent.GetLeft()->GetClusterWeight()\r
+                               + Parent.GetRight()->GetClusterWeight()\r
+                               + Parent.GetWeight() }\r
+                               + Parent.GetWeight()\r
+\r
+We recognize the term {...} as the cluster weight of the\r
+parent, so\r
+\r
+       parent.lwt + parent.rwt\r
+                               = Parent.GetClusterWeight()\r
+                               + Parent.GetWeight()\r
+\r
+As you would expect, repeating this exercise for parent.rwt gives\r
+exactly the same expression.\r
+\r
+The GSC weights (fwt) are stored in the Weight2 field of the cluster\r
+tree, the Weight field stores the original (BLOSUM) weights used\r
+as input to this algorithm.\r
+***/\r
+\r
+#include "muscle.h"\r
+#include "msa.h"\r
+#include "cluster.h"\r
+#include "distfunc.h"\r
+\r
+// Set weights of all sequences in the subtree under given node.\r
+void MSA::SetSubtreeWeight2(const ClusterNode *ptrNode) const\r
+       {\r
+       if (0 == ptrNode)\r
+               return;\r
+\r
+       const ClusterNode *ptrRight = ptrNode->GetRight();\r
+       const ClusterNode *ptrLeft = ptrNode->GetLeft();\r
+\r
+// If leaf, set weight\r
+       if (0 == ptrRight && 0 == ptrLeft)\r
+               {\r
+               unsigned uIndex = ptrNode->GetIndex();\r
+               double dWeight = ptrNode->GetWeight2();\r
+               WEIGHT w = DoubleToWeight(dWeight);\r
+               m_Weights[uIndex] = w;\r
+               return;\r
+               }\r
+\r
+// Otherwise, recursively set subtrees\r
+       SetSubtreeWeight2(ptrLeft);\r
+       SetSubtreeWeight2(ptrRight);\r
+       }\r
+\r
+void MSA::SetSubtreeGSCWeight(ClusterNode *ptrNode) const\r
+       {\r
+       if (0 == ptrNode)\r
+               return;\r
+\r
+       ClusterNode *ptrParent = ptrNode->GetParent();\r
+       double dParentWeight2 = ptrParent->GetWeight2();\r
+       double dParentClusterWeight = ptrParent->GetClusterWeight();\r
+       if (0.0 == dParentClusterWeight)\r
+               {\r
+               double dThisClusterSize = ptrNode->GetClusterSize();\r
+               double dParentClusterSize = ptrParent->GetClusterSize();\r
+               double dWeight2 =\r
+                 dParentWeight2*dThisClusterSize/dParentClusterSize;\r
+               ptrNode->SetWeight2(dWeight2);\r
+               }\r
+       else\r
+               {\r
+       // Could cache cluster weights for better performance.\r
+       // We calculate cluster weight of each node twice, so this\r
+       // would give x2 improvement.\r
+       // As weighting is not very expensive, we don't care.\r
+               double dThisClusterWeight = ptrNode->GetClusterWeight();\r
+               double dParentWeight = ptrParent->GetWeight();\r
+\r
+               double dNum = dThisClusterWeight + dParentWeight;\r
+               double dDenom = dParentClusterWeight + dParentWeight;\r
+               double dWeight2 = dParentWeight2*(dNum/dDenom);\r
+\r
+               ptrNode->SetWeight2(dWeight2);\r
+               }\r
+\r
+       SetSubtreeGSCWeight(ptrNode->GetLeft());\r
+       SetSubtreeGSCWeight(ptrNode->GetRight());\r
+       }\r
+\r
+void MSA::SetGSCWeights() const\r
+       {\r
+       ClusterTree CT;\r
+       CalcBLOSUMWeights(CT);\r
+\r
+// Calculate weights and store in tree.\r
+       ClusterNode *ptrRoot = CT.GetRoot();\r
+       ptrRoot->SetWeight2(1.0);\r
+       SetSubtreeGSCWeight(ptrRoot->GetLeft());\r
+       SetSubtreeGSCWeight(ptrRoot->GetRight());\r
+\r
+// Copy weights from tree to MSA.\r
+       SetSubtreeWeight2(ptrRoot);\r
+       }\r
\r
+void MSA::ListWeights() const\r
+       {\r
+       const unsigned uSeqCount = GetSeqCount();\r
+       Log("Weights:\n");\r
+       WEIGHT wTotal = 0;\r
+       for (unsigned n = 0; n < uSeqCount; ++n)\r
+               {\r
+               wTotal += GetSeqWeight(n);\r
+               Log("%6.3f %s\n", GetSeqWeight(n), GetSeqName(n));\r
+               }\r
+       Log("Total weights = %6.3f, should be 1.0\n", wTotal);\r
+       }\r