Delete unneeded directory
[jabaws.git] / website / archive / binaries / mac / src / muscle / setgscweights.cpp
diff --git a/website/archive/binaries/mac/src/muscle/setgscweights.cpp b/website/archive/binaries/mac/src/muscle/setgscweights.cpp
deleted file mode 100644 (file)
index 7e7e787..0000000
+++ /dev/null
@@ -1,195 +0,0 @@
-/***\r
-Gerstein/Sonnhammer/Chothia ad hoc sequence weighting.\r
-The algorithm was deduced by reverse-engineering the\r
-HMMer code.\r
-\r
-I used an alternative representation that I prefer over\r
-HMMer's. The HMMer code is full of tree manipulations\r
-that do something to the left child and then the equivalent\r
-thing to the right child. It was clear that there must be\r
-a re-formulation that does everything once for each node,\r
-which would reduce the number of operations expressed\r
-in the code by a factor of two. This gives a more elegant\r
-and less error-prone way to code it.\r
-\r
-These notes explain the correspondence between my design\r
-and Eddy's.\r
-\r
-HMMer stores a data structure phylo_s for each non-leaf\r
-node in the cluster tree. This structure contains the\r
-following fields:\r
-\r
-       diff            Weight of the node\r
-       lblen           Left branch length\r
-       rblen           Right branch length\r
-\r
-The lblen and rblen branch lengths are calculated as:\r
-\r
-       this.lblen = this.diff - left.diff\r
-       this.rblen = this.diff - right.diff\r
-\r
-My code stores one ClusterNode data structure per node\r
-in the cluster tree, including leaves. I store only the\r
-weight. I can recover the HMMer branch length fields\r
-in a trivial O(1) calculation as follows:\r
-\r
-       lblen = Node.GetWeight() - Node.GetLeft()->GetWeight()\r
-       rblen = Node.GetWeight() - Node.GetRight()->GetWeight()\r
-\r
-For the GSC weights calculation, HMMer constructs the\r
-following vectors, which have entries for all nodes,\r
-including leaves:\r
-\r
-       lwt             Left weight\r
-       rwt             Right weight\r
-\r
-The "left weight" is calculated as the sum of the weights in\r
-all the nodes reachable through the left branch, including\r
-the node itself. (This is not immediately obvious from the\r
-code, which does the calculation using branch lengths rather\r
-than weights, but this is an equivalent, and to my mind clearer,\r
-statement of what they are). Similarly, the "right weight" is\r
-the sum of all weights reachable via the right branch. I define\r
-the "cluster weight" to be the summed weight of all nodes in the\r
-subtree under the node, including the node itself. I provide\r
-a function Node.GetClusterWeight() which calculates the cluster\r
-weight using a O(ln N) recursion through the tree. The lwt and\r
-rwt values can be recovered as follows:\r
-\r
-       lwt             = Node.GetLeft()->GetClusterWeight()\r
-                       + Node.GetWeight()\r
-\r
-       lwt             = Node.GetLeft()->GetClusterWeight()\r
-                       + Node.GetWeight()\r
-\r
-HMMer calculates a further vector fwt as follows.\r
-\r
-       this.fwt = parent.fwt * parent.lwt / (parent.lwt + parent.rwt)\r
-\r
-This applies to nodes reached via a left branch, for nodes reached\r
-via a right branch:\r
-\r
-       this.fwt = parent.fwt * parent.rwt / (parent.lwt + parent.rwt)\r
-\r
-The values of fwt at the leaf nodes are the final GSC weights.\r
-We derive the various terms using our equivalents.\r
-\r
-       parent.lwt      = Parent.GetLeft()->GetClusterWeight()\r
-                               + Parent.GetWeight()\r
-\r
-       parent.rwt      = Parent.GetRight()->GetClusterWeight()\r
-                               + Parent.GetWeight()\r
-\r
-       parent.lwt + parent.rwt =\r
-                               { Parent.GetLeft()->GetClusterWeight()\r
-                               + Parent.GetRight()->GetClusterWeight()\r
-                               + Parent.GetWeight() }\r
-                               + Parent.GetWeight()\r
-\r
-We recognize the term {...} as the cluster weight of the\r
-parent, so\r
-\r
-       parent.lwt + parent.rwt\r
-                               = Parent.GetClusterWeight()\r
-                               + Parent.GetWeight()\r
-\r
-As you would expect, repeating this exercise for parent.rwt gives\r
-exactly the same expression.\r
-\r
-The GSC weights (fwt) are stored in the Weight2 field of the cluster\r
-tree, the Weight field stores the original (BLOSUM) weights used\r
-as input to this algorithm.\r
-***/\r
-\r
-#include "muscle.h"\r
-#include "msa.h"\r
-#include "cluster.h"\r
-#include "distfunc.h"\r
-\r
-// Set weights of all sequences in the subtree under given node.\r
-void MSA::SetSubtreeWeight2(const ClusterNode *ptrNode) const\r
-       {\r
-       if (0 == ptrNode)\r
-               return;\r
-\r
-       const ClusterNode *ptrRight = ptrNode->GetRight();\r
-       const ClusterNode *ptrLeft = ptrNode->GetLeft();\r
-\r
-// If leaf, set weight\r
-       if (0 == ptrRight && 0 == ptrLeft)\r
-               {\r
-               unsigned uIndex = ptrNode->GetIndex();\r
-               double dWeight = ptrNode->GetWeight2();\r
-               WEIGHT w = DoubleToWeight(dWeight);\r
-               m_Weights[uIndex] = w;\r
-               return;\r
-               }\r
-\r
-// Otherwise, recursively set subtrees\r
-       SetSubtreeWeight2(ptrLeft);\r
-       SetSubtreeWeight2(ptrRight);\r
-       }\r
-\r
-void MSA::SetSubtreeGSCWeight(ClusterNode *ptrNode) const\r
-       {\r
-       if (0 == ptrNode)\r
-               return;\r
-\r
-       ClusterNode *ptrParent = ptrNode->GetParent();\r
-       double dParentWeight2 = ptrParent->GetWeight2();\r
-       double dParentClusterWeight = ptrParent->GetClusterWeight();\r
-       if (0.0 == dParentClusterWeight)\r
-               {\r
-               double dThisClusterSize = ptrNode->GetClusterSize();\r
-               double dParentClusterSize = ptrParent->GetClusterSize();\r
-               double dWeight2 =\r
-                 dParentWeight2*dThisClusterSize/dParentClusterSize;\r
-               ptrNode->SetWeight2(dWeight2);\r
-               }\r
-       else\r
-               {\r
-       // Could cache cluster weights for better performance.\r
-       // We calculate cluster weight of each node twice, so this\r
-       // would give x2 improvement.\r
-       // As weighting is not very expensive, we don't care.\r
-               double dThisClusterWeight = ptrNode->GetClusterWeight();\r
-               double dParentWeight = ptrParent->GetWeight();\r
-\r
-               double dNum = dThisClusterWeight + dParentWeight;\r
-               double dDenom = dParentClusterWeight + dParentWeight;\r
-               double dWeight2 = dParentWeight2*(dNum/dDenom);\r
-\r
-               ptrNode->SetWeight2(dWeight2);\r
-               }\r
-\r
-       SetSubtreeGSCWeight(ptrNode->GetLeft());\r
-       SetSubtreeGSCWeight(ptrNode->GetRight());\r
-       }\r
-\r
-void MSA::SetGSCWeights() const\r
-       {\r
-       ClusterTree CT;\r
-       CalcBLOSUMWeights(CT);\r
-\r
-// Calculate weights and store in tree.\r
-       ClusterNode *ptrRoot = CT.GetRoot();\r
-       ptrRoot->SetWeight2(1.0);\r
-       SetSubtreeGSCWeight(ptrRoot->GetLeft());\r
-       SetSubtreeGSCWeight(ptrRoot->GetRight());\r
-\r
-// Copy weights from tree to MSA.\r
-       SetSubtreeWeight2(ptrRoot);\r
-       }\r
\r
-void MSA::ListWeights() const\r
-       {\r
-       const unsigned uSeqCount = GetSeqCount();\r
-       Log("Weights:\n");\r
-       WEIGHT wTotal = 0;\r
-       for (unsigned n = 0; n < uSeqCount; ++n)\r
-               {\r
-               wTotal += GetSeqWeight(n);\r
-               Log("%6.3f %s\n", GetSeqWeight(n), GetSeqName(n));\r
-               }\r
-       Log("Total weights = %6.3f, should be 1.0\n", wTotal);\r
-       }\r