Next version of JABA
[jabaws.git] / binaries / src / muscle / setgscweights.cpp
1 /***\r
2 Gerstein/Sonnhammer/Chothia ad hoc sequence weighting.\r
3 The algorithm was deduced by reverse-engineering the\r
4 HMMer code.\r
5 \r
6 I used an alternative representation that I prefer over\r
7 HMMer's. The HMMer code is full of tree manipulations\r
8 that do something to the left child and then the equivalent\r
9 thing to the right child. It was clear that there must be\r
10 a re-formulation that does everything once for each node,\r
11 which would reduce the number of operations expressed\r
12 in the code by a factor of two. This gives a more elegant\r
13 and less error-prone way to code it.\r
14 \r
15 These notes explain the correspondence between my design\r
16 and Eddy's.\r
17 \r
18 HMMer stores a data structure phylo_s for each non-leaf\r
19 node in the cluster tree. This structure contains the\r
20 following fields:\r
21 \r
22         diff            Weight of the node\r
23         lblen           Left branch length\r
24         rblen           Right branch length\r
25 \r
26 The lblen and rblen branch lengths are calculated as:\r
27 \r
28         this.lblen = this.diff - left.diff\r
29         this.rblen = this.diff - right.diff\r
30 \r
31 My code stores one ClusterNode data structure per node\r
32 in the cluster tree, including leaves. I store only the\r
33 weight. I can recover the HMMer branch length fields\r
34 in a trivial O(1) calculation as follows:\r
35 \r
36         lblen = Node.GetWeight() - Node.GetLeft()->GetWeight()\r
37         rblen = Node.GetWeight() - Node.GetRight()->GetWeight()\r
38 \r
39 For the GSC weights calculation, HMMer constructs the\r
40 following vectors, which have entries for all nodes,\r
41 including leaves:\r
42 \r
43         lwt             Left weight\r
44         rwt             Right weight\r
45 \r
46 The "left weight" is calculated as the sum of the weights in\r
47 all the nodes reachable through the left branch, including\r
48 the node itself. (This is not immediately obvious from the\r
49 code, which does the calculation using branch lengths rather\r
50 than weights, but this is an equivalent, and to my mind clearer,\r
51 statement of what they are). Similarly, the "right weight" is\r
52 the sum of all weights reachable via the right branch. I define\r
53 the "cluster weight" to be the summed weight of all nodes in the\r
54 subtree under the node, including the node itself. I provide\r
55 a function Node.GetClusterWeight() which calculates the cluster\r
56 weight using a O(ln N) recursion through the tree. The lwt and\r
57 rwt values can be recovered as follows:\r
58 \r
59         lwt             = Node.GetLeft()->GetClusterWeight()\r
60                         + Node.GetWeight()\r
61 \r
62         lwt             = Node.GetLeft()->GetClusterWeight()\r
63                         + Node.GetWeight()\r
64 \r
65 HMMer calculates a further vector fwt as follows.\r
66 \r
67         this.fwt = parent.fwt * parent.lwt / (parent.lwt + parent.rwt)\r
68 \r
69 This applies to nodes reached via a left branch, for nodes reached\r
70 via a right branch:\r
71 \r
72         this.fwt = parent.fwt * parent.rwt / (parent.lwt + parent.rwt)\r
73 \r
74 The values of fwt at the leaf nodes are the final GSC weights.\r
75 We derive the various terms using our equivalents.\r
76 \r
77         parent.lwt      = Parent.GetLeft()->GetClusterWeight()\r
78                                 + Parent.GetWeight()\r
79 \r
80         parent.rwt      = Parent.GetRight()->GetClusterWeight()\r
81                                 + Parent.GetWeight()\r
82 \r
83         parent.lwt + parent.rwt =\r
84                                 { Parent.GetLeft()->GetClusterWeight()\r
85                                 + Parent.GetRight()->GetClusterWeight()\r
86                                 + Parent.GetWeight() }\r
87                                 + Parent.GetWeight()\r
88 \r
89 We recognize the term {...} as the cluster weight of the\r
90 parent, so\r
91 \r
92         parent.lwt + parent.rwt\r
93                                 = Parent.GetClusterWeight()\r
94                                 + Parent.GetWeight()\r
95 \r
96 As you would expect, repeating this exercise for parent.rwt gives\r
97 exactly the same expression.\r
98 \r
99 The GSC weights (fwt) are stored in the Weight2 field of the cluster\r
100 tree, the Weight field stores the original (BLOSUM) weights used\r
101 as input to this algorithm.\r
102 ***/\r
103 \r
104 #include "muscle.h"\r
105 #include "msa.h"\r
106 #include "cluster.h"\r
107 #include "distfunc.h"\r
108 \r
109 // Set weights of all sequences in the subtree under given node.\r
110 void MSA::SetSubtreeWeight2(const ClusterNode *ptrNode) const\r
111         {\r
112         if (0 == ptrNode)\r
113                 return;\r
114 \r
115         const ClusterNode *ptrRight = ptrNode->GetRight();\r
116         const ClusterNode *ptrLeft = ptrNode->GetLeft();\r
117 \r
118 // If leaf, set weight\r
119         if (0 == ptrRight && 0 == ptrLeft)\r
120                 {\r
121                 unsigned uIndex = ptrNode->GetIndex();\r
122                 double dWeight = ptrNode->GetWeight2();\r
123                 WEIGHT w = DoubleToWeight(dWeight);\r
124                 m_Weights[uIndex] = w;\r
125                 return;\r
126                 }\r
127 \r
128 // Otherwise, recursively set subtrees\r
129         SetSubtreeWeight2(ptrLeft);\r
130         SetSubtreeWeight2(ptrRight);\r
131         }\r
132 \r
133 void MSA::SetSubtreeGSCWeight(ClusterNode *ptrNode) const\r
134         {\r
135         if (0 == ptrNode)\r
136                 return;\r
137 \r
138         ClusterNode *ptrParent = ptrNode->GetParent();\r
139         double dParentWeight2 = ptrParent->GetWeight2();\r
140         double dParentClusterWeight = ptrParent->GetClusterWeight();\r
141         if (0.0 == dParentClusterWeight)\r
142                 {\r
143                 double dThisClusterSize = ptrNode->GetClusterSize();\r
144                 double dParentClusterSize = ptrParent->GetClusterSize();\r
145                 double dWeight2 =\r
146                   dParentWeight2*dThisClusterSize/dParentClusterSize;\r
147                 ptrNode->SetWeight2(dWeight2);\r
148                 }\r
149         else\r
150                 {\r
151         // Could cache cluster weights for better performance.\r
152         // We calculate cluster weight of each node twice, so this\r
153         // would give x2 improvement.\r
154         // As weighting is not very expensive, we don't care.\r
155                 double dThisClusterWeight = ptrNode->GetClusterWeight();\r
156                 double dParentWeight = ptrParent->GetWeight();\r
157 \r
158                 double dNum = dThisClusterWeight + dParentWeight;\r
159                 double dDenom = dParentClusterWeight + dParentWeight;\r
160                 double dWeight2 = dParentWeight2*(dNum/dDenom);\r
161 \r
162                 ptrNode->SetWeight2(dWeight2);\r
163                 }\r
164 \r
165         SetSubtreeGSCWeight(ptrNode->GetLeft());\r
166         SetSubtreeGSCWeight(ptrNode->GetRight());\r
167         }\r
168 \r
169 void MSA::SetGSCWeights() const\r
170         {\r
171         ClusterTree CT;\r
172         CalcBLOSUMWeights(CT);\r
173 \r
174 // Calculate weights and store in tree.\r
175         ClusterNode *ptrRoot = CT.GetRoot();\r
176         ptrRoot->SetWeight2(1.0);\r
177         SetSubtreeGSCWeight(ptrRoot->GetLeft());\r
178         SetSubtreeGSCWeight(ptrRoot->GetRight());\r
179 \r
180 // Copy weights from tree to MSA.\r
181         SetSubtreeWeight2(ptrRoot);\r
182         }\r
183  \r
184 void MSA::ListWeights() const\r
185         {\r
186         const unsigned uSeqCount = GetSeqCount();\r
187         Log("Weights:\n");\r
188         WEIGHT wTotal = 0;\r
189         for (unsigned n = 0; n < uSeqCount; ++n)\r
190                 {\r
191                 wTotal += GetSeqWeight(n);\r
192                 Log("%6.3f %s\n", GetSeqWeight(n), GetSeqName(n));\r
193                 }\r
194         Log("Total weights = %6.3f, should be 1.0\n", wTotal);\r
195         }\r