Wrapper for Clustal Omega.
[jabaws.git] / binaries / src / clustalo / src / clustal / weights.c
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4;  indent-tabs-mode: nil -*- */
2
3 /* Module for deriving sequence weights from a tree. Largely based on
4  * Bob Edgar's Muscle (mainly clwwt.cpp; version 3.7). Ported to pure
5  * C. Most functions where apparently based on Clustal 1.8 anyway.
6  *
7  * Muscle's code is public domain and so is this code here.
8  *
9  * From http://www.drive5.com/muscle/license.htm:
10  * """
11  * MUSCLE is public domain software
12  *
13  * The MUSCLE software, including object and source code and
14  * documentation, is hereby donated to the public domain.
15  *
16  * Disclaimer of warranty
17  * THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
18  * EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * """
21  *
22  */
23  
24 /*
25  * RCS $Id: weights.c 231 2011-04-09 17:13:06Z andreas $
26  */
27
28 /*
29  * Documentation from Muscle
30  *
31  * """
32  *  Compute weights by the CLUSTALW method.
33  *  Thompson, Higgins and Gibson (1994), CABIOS (10) 19-29;
34  *  see also CLUSTALW paper.
35  *   
36  *  Weights are computed from the edge lengths of a rooted tree.
37  *   
38  *  Define the strength of an edge to be its length divided by the number
39  *  of leaves under that edge. The weight of a sequence is then the sum
40  *  of edge strengths on the path from the root to the leaf.
41  *   
42  *  Example.
43  *   
44  *          0.2
45  *         -----A     0.1
46  *           -x         ------- B     0.7
47  *             --------y           ----------- C
48  *              0.3     ----------z
49  *                      0.4    -------------- D
50  *                                   0.8
51  *   
52  *  Edge    Length  Leaves  Strength
53  *  ----    -----   ------  --------
54  *  xy              0.3             3               0.1
55  *  xA              0.2             1               0.2
56  *  yz              0.4             2               0.2
57  *  yB              0.1             1               0.1
58  *  zC              0.7             1               0.7
59  *  zD              0.8             1               0.8
60  *   
61  *  Leaf    Path            Strengths                       Weight
62  *  ----    ----            ---------                       ------
63  *  A               xA                      0.2                                     0.2
64  *  B               xy-yB           0.1 + 0.1                       0.2
65  *  C               xy-yz-zC        0.1 + 0.2 + 0.7         1.0
66  *  D               xy-yz-zD        0.1 + 0.2 + 0.8         1.1
67  * """
68  *
69  */
70
71 #ifdef HAVE_CONFIG_H
72 #include "config.h"
73 #endif
74
75 #include <assert.h>
76 #include "log.h"
77 #include "muscle_tree.h"
78 #include "weights.h"
79
80
81 /* 
82    #undef DEBUG 
83 */
84
85
86 /**
87  * @brief FIXME
88  *
89  * @param[out] puLeavesUnderNode
90  * FIXME
91  * @param[in] prTree
92  * FIXME
93  * @param[in] uNodeIndex
94  * FIXME
95  *
96  * @return The return value
97  *
98  * @note see Muscle3.7:clwwt.cpp
99  * 
100  */    
101 uint
102 CountLeaves(uint *puLeavesUnderNode, tree_t *prTree, uint uNodeIndex)
103 {
104     uint uLeft;
105     uint uRight;
106     uint uRightCount;
107     uint uLeftCount;
108     uint uCount;
109
110
111     if (IsLeaf(uNodeIndex, prTree)) {
112                 puLeavesUnderNode[uNodeIndex] = 1;
113                 return 1;
114     }
115
116     uLeft = GetLeft(uNodeIndex, prTree);
117     uRight = GetRight(uNodeIndex, prTree);
118     uRightCount = CountLeaves(puLeavesUnderNode, prTree, uRight);
119     uLeftCount = CountLeaves(puLeavesUnderNode, prTree, uLeft);
120     uCount = uRightCount + uLeftCount;
121
122     puLeavesUnderNode[uNodeIndex] = uCount;
123         
124     return uCount;    
125 }
126 /***   end: CountLeaves()   ***/
127
128
129
130
131 /**
132  * @brief Normalise values in a double array to values between 0 and 1.
133  *
134  * @param[out] p
135  * double array with n elements
136  * @param[in] n
137  * number of elements in p
138  *
139  * @note From Muscle3.7: intmath.cpp:Normalize()
140  * 
141  */    
142 void
143 Normalise(double *p, uint n) {
144     unsigned i;
145     double dSum = 0.0;
146     for (i = 0; i < n; ++i) {
147         dSum += p[i];
148     }
149     if (0.0 == dSum) {
150         Log(&rLog, LOG_FATAL, "Normalise, sum=0");
151     }
152     for (i = 0; i < n; ++i) {
153         p[i] /= dSum;
154     }
155 }
156 /***   end: Normalise()   ***/
157
158
159
160 /**
161  * @brief Calculate "Clustal" weights from a tree.
162  *
163  * FIXME see doc in muscle:clwwt.cpp
164  *
165  * @param[out] pdWeights_p
166  * Will contain a weight for each leaf/sequence. Allocated here. User
167  * has to free
168  * @param[in] prTree
169  * Tree to derive weights from
170  *
171  * @return 0 on success, non-zero otherwise
172  *
173  * @note Largely copied from Muscle3.7: clwwt.cpp:CalcClustalWWeights()
174  * 
175  * @warning FIXME Not sure if Muscle routines are most efficient here.
176  * Couldn't we do all this while traversing the tree and thereby safe
177  * time?
178  *
179  */    
180 int
181 CalcClustalWeights(double **pdWeights_p, tree_t *prTree)
182 {
183     int i; /* aux */
184     uint uLeafCount;
185     uint uNodeCount;
186         uint *puLeavesUnderNode;
187         uint uLeavesUnderRoot;
188     uint uRootNodeIndex;
189     double *pdStrengths;
190     uint uNodeIndex;
191     bool bLogWeights = FALSE; /* verbose output of weights */
192
193     
194     assert(NULL != pdWeights_p);
195     assert(NULL != prTree);
196
197     if (rLog.iLogLevelEnabled <= LOG_DEBUG) {
198         bLogWeights = TRUE;
199     }
200     
201     uLeafCount = GetLeafCount(prTree);
202     uNodeCount = GetNodeCount(prTree);
203
204
205     (*pdWeights_p) = (double *) CKMALLOC(uNodeCount * sizeof(double));
206
207     if (0 == uLeafCount) {
208                 return 0;
209     } else if (1 == uLeafCount) {
210                 (*pdWeights_p)[0] = 1.0;
211                 return 0;
212     } else if (2 == uLeafCount) {
213                 (*pdWeights_p)[0] = 0.5;
214                 (*pdWeights_p)[1] = 0.5;
215                 return 0;
216     }
217     
218     if (!IsRooted(prTree)) {
219         Log(&rLog, LOG_ERROR, "Tree must be rooted to get weights");
220         CKFREE(pdWeights_p);
221         return -1;
222     }
223
224
225 #ifdef TRACE
226         Log(&rLog, LOG_FORCED_DEBUG, "%s", "Weights follow");
227         fprintf(stderr, "Node  Leaves    Length  Strength\n");
228         fprintf(stderr, "----  ------  --------  --------\n");
229         /*    1234  123456  12345678  12345678 */
230 #endif
231     
232     uRootNodeIndex = GetRootNodeIndex(prTree);
233     puLeavesUnderNode = (uint *) CKCALLOC(uNodeCount, sizeof(uint));
234
235     uLeavesUnderRoot = CountLeaves(puLeavesUnderNode, prTree, uRootNodeIndex);
236         if (uLeavesUnderRoot != uLeafCount) {
237                 Log(&rLog, LOG_FATAL, "Internal error, root count %u %u",
238               uLeavesUnderRoot, uLeafCount);
239     }
240 #if 0
241     for (uNodeIndex=0; uNodeIndex<uNodeCount; uNodeIndex++) {
242         Log(&rLog, LOG_FORCED_DEBUG, "LeavesUnderNode[%d]=%d", uNodeIndex, puLeavesUnderNode[uNodeIndex]);
243     }
244 #endif
245
246     pdStrengths = (double *) CKMALLOC(uNodeCount * sizeof(double));
247                                       
248     for (uNodeIndex=0; uNodeIndex < uNodeCount; uNodeIndex++) {
249         uint uParent;
250         double dLength;
251         uint uLeaves;
252         double dStrength;
253         
254         if (IsRoot(uNodeIndex, prTree)) {
255             pdStrengths[uNodeIndex] = 0.0;
256             continue;
257         }
258         
259         uParent = GetParent(uNodeIndex, prTree);
260         dLength = GetEdgeLength(uNodeIndex, uParent, prTree);
261         uLeaves = puLeavesUnderNode[uNodeIndex];
262         dStrength = dLength / (double) uLeaves;
263         pdStrengths[uNodeIndex] = dStrength;
264         
265 #ifdef TRACE
266         fprintf(stderr, "%4u  %6u  %8g  %8g\n", uNodeIndex, uLeaves, dLength, dStrength);
267 #endif        
268     }
269
270
271
272
273     
274     if (bLogWeights){
275         fprintf(stderr, "\n");
276         fprintf(stderr, "                 Seq  Path..Weight\n");
277         fprintf(stderr, "--------------------  ------------\n");
278     }
279         for (i=0; i<uLeafCount; i++) {
280                 double dWeight = 0.0;
281                 unsigned uLeafNodeIndex;
282                 unsigned uNode;
283
284         uLeafNodeIndex = LeafIndexToNodeIndex(i, prTree);
285         uNode = uLeafNodeIndex;
286
287         if (bLogWeights){
288             fprintf(stderr, "%20.20s  %4u ", GetLeafName(uLeafNodeIndex, prTree), uLeafNodeIndex);
289         }
290                 if (! IsLeaf(uLeafNodeIndex, prTree)) {
291                         Log(&rLog, LOG_FATAL, 
292                 "Internal error: non-leaf-node %d", uLeafNodeIndex);
293         }
294             
295         /*LOG_DEBUG("dWeight = %f", dWeight);*/
296                 while (! IsRoot(uNode, prTree)) {
297                         dWeight += pdStrengths[uNode];
298             /*LOG_DEBUG("dWeight +== %f", pdStrengths[uNode]);*/
299                         uNode = GetParent(uNode, prTree);
300             if (bLogWeights){
301                 fprintf(stderr, "->%u(%g)", uNode, pdStrengths[uNode]);
302             }
303         }
304         /* AW: no idea what this is, but it's done like this in Muscle */
305                 if (dWeight < 0.0001) {
306 #ifdef TRACE
307                         fprintf(stderr, "zero->one");
308 #endif
309                         dWeight = 1.0;
310         }
311
312         /* @note: the only difference to the muscle code is here: we
313          * use the input index for storing weights, instead of the
314          * tree leaf index
315          */
316         (*pdWeights_p)[GetLeafId(uLeafNodeIndex, prTree)] = dWeight;
317         if (bLogWeights){
318             fprintf(stderr, " = %g\n", dWeight);
319         }
320     }
321
322 #if 0
323     for (i=0; i<uLeafCount; i++) {
324         Log(&rLog, LOG_FORCED_DEBUG, "Weights before normalisation: pdWeights_p[%d]=%f", i, (*pdWeights_p)[i]);
325         /*LOG_DEBUG("Should be %d", GetLeafId(LeafIndexToNodeIndex(i, prTree), prTree));*/
326     }
327 #endif
328
329         Normalise((*pdWeights_p), uLeafCount);
330     
331
332     CKFREE(puLeavesUnderNode);
333     CKFREE(pdStrengths);
334     
335     return 0;
336 }
337 /***   end: CalcWeights()   ***/