1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /* Module for deriving sequence weights from a tree. Largely based on
4 * Bob Edgar's Muscle (mainly clwwt.cpp; version 3.7). Ported to pure
5 * C. Most functions where apparently based on Clustal 1.8 anyway.
7 * Muscle's code is public domain and so is this code here.
9 * From http://www.drive5.com/muscle/license.htm:
11 * MUSCLE is public domain software
13 * The MUSCLE software, including object and source code and
14 * documentation, is hereby donated to the public domain.
16 * Disclaimer of warranty
17 * THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
18 * EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * RCS $Id: weights.c 231 2011-04-09 17:13:06Z andreas $
29 * Documentation from Muscle
32 * Compute weights by the CLUSTALW method.
33 * Thompson, Higgins and Gibson (1994), CABIOS (10) 19-29;
34 * see also CLUSTALW paper.
36 * Weights are computed from the edge lengths of a rooted tree.
38 * Define the strength of an edge to be its length divided by the number
39 * of leaves under that edge. The weight of a sequence is then the sum
40 * of edge strengths on the path from the root to the leaf.
47 * --------y ----------- C
49 * 0.4 -------------- D
52 * Edge Length Leaves Strength
53 * ---- ----- ------ --------
61 * Leaf Path Strengths Weight
62 * ---- ---- --------- ------
64 * B xy-yB 0.1 + 0.1 0.2
65 * C xy-yz-zC 0.1 + 0.2 + 0.7 1.0
66 * D xy-yz-zD 0.1 + 0.2 + 0.8 1.1
77 #include "muscle_tree.h"
89 * @param[out] puLeavesUnderNode
93 * @param[in] uNodeIndex
96 * @return The return value
98 * @note see Muscle3.7:clwwt.cpp
102 CountLeaves(uint *puLeavesUnderNode, tree_t *prTree, uint uNodeIndex)
111 if (IsLeaf(uNodeIndex, prTree)) {
112 puLeavesUnderNode[uNodeIndex] = 1;
116 uLeft = GetLeft(uNodeIndex, prTree);
117 uRight = GetRight(uNodeIndex, prTree);
118 uRightCount = CountLeaves(puLeavesUnderNode, prTree, uRight);
119 uLeftCount = CountLeaves(puLeavesUnderNode, prTree, uLeft);
120 uCount = uRightCount + uLeftCount;
122 puLeavesUnderNode[uNodeIndex] = uCount;
126 /*** end: CountLeaves() ***/
132 * @brief Normalise values in a double array to values between 0 and 1.
135 * double array with n elements
137 * number of elements in p
139 * @note From Muscle3.7: intmath.cpp:Normalize()
143 Normalise(double *p, uint n) {
146 for (i = 0; i < n; ++i) {
150 Log(&rLog, LOG_FATAL, "Normalise, sum=0");
152 for (i = 0; i < n; ++i) {
156 /*** end: Normalise() ***/
161 * @brief Calculate "Clustal" weights from a tree.
163 * FIXME see doc in muscle:clwwt.cpp
165 * @param[out] pdWeights_p
166 * Will contain a weight for each leaf/sequence. Allocated here. User
169 * Tree to derive weights from
171 * @return 0 on success, non-zero otherwise
173 * @note Largely copied from Muscle3.7: clwwt.cpp:CalcClustalWWeights()
175 * @warning FIXME Not sure if Muscle routines are most efficient here.
176 * Couldn't we do all this while traversing the tree and thereby safe
181 CalcClustalWeights(double **pdWeights_p, tree_t *prTree)
186 uint *puLeavesUnderNode;
187 uint uLeavesUnderRoot;
191 bool bLogWeights = FALSE; /* verbose output of weights */
194 assert(NULL != pdWeights_p);
195 assert(NULL != prTree);
197 if (rLog.iLogLevelEnabled <= LOG_DEBUG) {
201 uLeafCount = GetLeafCount(prTree);
202 uNodeCount = GetNodeCount(prTree);
205 (*pdWeights_p) = (double *) CKMALLOC(uNodeCount * sizeof(double));
207 if (0 == uLeafCount) {
209 } else if (1 == uLeafCount) {
210 (*pdWeights_p)[0] = 1.0;
212 } else if (2 == uLeafCount) {
213 (*pdWeights_p)[0] = 0.5;
214 (*pdWeights_p)[1] = 0.5;
218 if (!IsRooted(prTree)) {
219 Log(&rLog, LOG_ERROR, "Tree must be rooted to get weights");
226 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Weights follow");
227 fprintf(stderr, "Node Leaves Length Strength\n");
228 fprintf(stderr, "---- ------ -------- --------\n");
229 /* 1234 123456 12345678 12345678 */
232 uRootNodeIndex = GetRootNodeIndex(prTree);
233 puLeavesUnderNode = (uint *) CKCALLOC(uNodeCount, sizeof(uint));
235 uLeavesUnderRoot = CountLeaves(puLeavesUnderNode, prTree, uRootNodeIndex);
236 if (uLeavesUnderRoot != uLeafCount) {
237 Log(&rLog, LOG_FATAL, "Internal error, root count %u %u",
238 uLeavesUnderRoot, uLeafCount);
241 for (uNodeIndex=0; uNodeIndex<uNodeCount; uNodeIndex++) {
242 Log(&rLog, LOG_FORCED_DEBUG, "LeavesUnderNode[%d]=%d", uNodeIndex, puLeavesUnderNode[uNodeIndex]);
246 pdStrengths = (double *) CKMALLOC(uNodeCount * sizeof(double));
248 for (uNodeIndex=0; uNodeIndex < uNodeCount; uNodeIndex++) {
254 if (IsRoot(uNodeIndex, prTree)) {
255 pdStrengths[uNodeIndex] = 0.0;
259 uParent = GetParent(uNodeIndex, prTree);
260 dLength = GetEdgeLength(uNodeIndex, uParent, prTree);
261 uLeaves = puLeavesUnderNode[uNodeIndex];
262 dStrength = dLength / (double) uLeaves;
263 pdStrengths[uNodeIndex] = dStrength;
266 fprintf(stderr, "%4u %6u %8g %8g\n", uNodeIndex, uLeaves, dLength, dStrength);
275 fprintf(stderr, "\n");
276 fprintf(stderr, " Seq Path..Weight\n");
277 fprintf(stderr, "-------------------- ------------\n");
279 for (i=0; i<uLeafCount; i++) {
280 double dWeight = 0.0;
281 unsigned uLeafNodeIndex;
284 uLeafNodeIndex = LeafIndexToNodeIndex(i, prTree);
285 uNode = uLeafNodeIndex;
288 fprintf(stderr, "%20.20s %4u ", GetLeafName(uLeafNodeIndex, prTree), uLeafNodeIndex);
290 if (! IsLeaf(uLeafNodeIndex, prTree)) {
291 Log(&rLog, LOG_FATAL,
292 "Internal error: non-leaf-node %d", uLeafNodeIndex);
295 /*LOG_DEBUG("dWeight = %f", dWeight);*/
296 while (! IsRoot(uNode, prTree)) {
297 dWeight += pdStrengths[uNode];
298 /*LOG_DEBUG("dWeight +== %f", pdStrengths[uNode]);*/
299 uNode = GetParent(uNode, prTree);
301 fprintf(stderr, "->%u(%g)", uNode, pdStrengths[uNode]);
304 /* AW: no idea what this is, but it's done like this in Muscle */
305 if (dWeight < 0.0001) {
307 fprintf(stderr, "zero->one");
312 /* @note: the only difference to the muscle code is here: we
313 * use the input index for storing weights, instead of the
316 (*pdWeights_p)[GetLeafId(uLeafNodeIndex, prTree)] = dWeight;
318 fprintf(stderr, " = %g\n", dWeight);
323 for (i=0; i<uLeafCount; i++) {
324 Log(&rLog, LOG_FORCED_DEBUG, "Weights before normalisation: pdWeights_p[%d]=%f", i, (*pdWeights_p)[i]);
325 /*LOG_DEBUG("Should be %d", GetLeafId(LeafIndexToNodeIndex(i, prTree), prTree));*/
329 Normalise((*pdWeights_p), uLeafCount);
332 CKFREE(puLeavesUnderNode);
337 /*** end: CalcWeights() ***/