4 #include "UPGMAAlgorithm.h"
10 #include "../../general/SymMatrix.h"
11 #include "../../general/debuglogObject.h"
12 #include "../../general/clustalw.h"
13 #include "upgmadata.h"
18 UPGMAAlgorithm::UPGMAAlgorithm()
19 : overwriteMatrix(false),
27 auto_ptr<AlignmentSteps> UPGMAAlgorithm::generateTree(RootedGuideTree* phyTree,
29 SeqInfo* seqInfo, bool overwrite,
32 if (tree == 0 || !tree->is_open())
39 (*tree) << "\n\n\t\t\tUPGMA Method\n"
40 << "\n\n This is a ROOTED tree\n"
41 << "\n Numbers in parentheses are branch lengths\n\n";
44 progSteps.reset(new AlignmentSteps);
48 numSeqs = seqInfo->numSeqs;
49 const int sizeDistMat = ((numSeqs + 1) * (numSeqs + 2)) / 2;
51 double* elements = overwrite ?
52 distMat->getDistMatrix(seqInfo->firstSeq, seqInfo->numSeqs) :
53 (double *)memcpy(new double[sizeDistMat],
54 distMat->getDistMatrix(seqInfo->firstSeq, seqInfo->numSeqs),
55 sizeDistMat * sizeof(double));
57 clusters = initialiseNodes(elements, seqInfo->firstSeq);
58 root = doUPGMA(clusters, tree);
60 phyTree->setRoot(root);
67 distMat->clearSubArray();
73 Node** UPGMAAlgorithm::initialiseNodes(double* distanceMatrix, int fSeq)
78 Node** nodes = new Node*[numSeqs];
79 Node** nodeIter = nodes;
81 *nodes = new Node(firstSeq, 0, 0);
85 // Move to first position in distanceMatrix.
86 for(int elementIndex = 1, e = numSeqs; elementIndex < e;
87 distanceMatrix += ++elementIndex)
89 Node* newcluster = new Node(elementIndex + firstSeq,
90 distanceMatrix, elementIndex);
91 (*nodeIter++)->next = newcluster;
92 *nodeIter = newcluster;
98 void UPGMAAlgorithm::printAllNodes(Node** nodes)
101 for(Node* nodeIter = *nodes; nodeIter; nodeIter = nodeIter->next)
104 cout << "Node " << numNodes << "\n";
105 nodeIter->printElements();
108 cout << "There are " << numNodes << " nodes\n";
111 Node* UPGMAAlgorithm::doUPGMA(Node** nodes, ofstream* tree)
113 if (tree == 0 || !tree->is_open())
121 while((*nodes)->next) // While there is more than 1 node.
126 (*tree) << "\n Cycle" << setw(4) << step << " = ";
128 Node** ptrNodeWithMin;
131 * STEP 1 find node with min distance
133 ptrNodeWithMin = getNodeWithMinDist(nodes);
134 Node* nodeToJoin2 = *ptrNodeWithMin;
135 const int indexToMinDist = nodeToJoin2->indexToMinDist;
136 Node* const nodeToJoin1 = nodes[indexToMinDist];
138 orderNode1 = nodeToJoin1->size;
139 orderNode2 = nodeToJoin2->size;
140 orderNewNode = orderNode1 + orderNode2;
143 * STEP 2 Recompute the dist Matrix row for nodeToJoin1
144 * example: Join nodes 2 and 6
154 double* nodeToJoin2DistIter = nodeToJoin2->ptrToDistMatRow;
156 if (indexToMinDist != 0)
158 recomputeNodeToJoin1DistMatRow(nodeToJoin1, &nodeToJoin2DistIter);
162 * STEP 3 Recompute all distances in column
163 * example: Join nodes 2 and 6
174 computeAllOtherDistsToNewNode(nodeToJoin1, nodeToJoin2, &nodeToJoin2DistIter);
177 * STEP 4 Add the step to the progSteps.
178 * This creates the multiple alignment steps.
180 addAlignmentStep(&nodeToJoin1->allElements, &nodeToJoin2->allElements);
183 double minDistance = (*ptrNodeWithMin)->minDist;
186 height = minDistance / 2.0;
190 if(nodeToJoin1->allElements.size() > 1)
198 if(nodeToJoin2->allElements.size() > 1)
206 (*tree) << type1 << nodeToJoin1->allElements[0] << " (" << setw(9)
207 << setprecision(5) << height << ") joins " << type2
208 << setw(4) << nodeToJoin2->allElements[0] << " ("
209 << setw(9) << setprecision(5) << height << ")";
212 * STEP 5 merge 2 nodes
214 nodeToJoin1->merge(ptrNodeWithMin, height);
222 * This function returns a Node object that has the minimum distance of
223 * all the nodes left.
225 Node** UPGMAAlgorithm::getNodeWithMinDist(Node** nodes)
227 Node** ptrMinNode = NULL;
228 double minDistance = numeric_limits<double>::max();
229 //Start at node 1, check see if it points to something, then move on the next one.
231 for(nodeIter = &((*nodes)->next); *nodeIter;
232 nodeIter = &(*nodeIter)->next)
234 if ((*nodeIter)->getMinDist() < minDistance)
236 minDistance = (*nodeIter)->getMinDist();
237 ptrMinNode = nodeIter; // ptrMinNode will hold a ptr to node with sm'st dist
244 * This function is used to recompute the nodeToJoin1 dist mat row. Each row in the distance
245 * matrix has the distances to all nodes before it, not after. For example the dist mat row
246 * for Node 3 would have the distances to nodes 1 and 2.
249 * 3 X X 0 Dists to node 1 and 2.
255 void UPGMAAlgorithm::recomputeNodeToJoin1DistMatRow(Node* nodeToJoin1,
256 double** nodeToJoin2DistIter)
258 double* nodeToJoin1DistIter = nodeToJoin1->ptrToDistMatRow;
259 // calculate new distance
260 *nodeToJoin1DistIter = calcNewDist(*nodeToJoin1DistIter, **nodeToJoin2DistIter);
262 const double* minIndex1 = nodeToJoin1DistIter;
263 nodeToJoin1DistIter++;
264 (*nodeToJoin2DistIter)++;
265 int numDistToUpdate = nodeToJoin1->numDists - 1;
267 // For each of the distances in nodeToJoin1
268 while(numDistToUpdate > 0)
270 if (*nodeToJoin1DistIter >= 0)
272 // Calculate the average
273 *nodeToJoin1DistIter = calcNewDist(*nodeToJoin1DistIter, **nodeToJoin2DistIter);
275 if (*nodeToJoin1DistIter < *minIndex1)
277 minIndex1 = nodeToJoin1DistIter;
280 nodeToJoin1DistIter++;
281 (*nodeToJoin2DistIter)++;
285 // We have found the minimum distance
286 nodeToJoin1->minDist = *minIndex1;
287 nodeToJoin1->indexToMinDist = minIndex1 - nodeToJoin1->ptrToDistMatRow;
291 * This function is used to recompute all the other distances. It does this by calling
292 * two other functions. We first compute all the distances until we get to node 2.
293 * Then we call another function to do the rest. This is because nodes after node 2 may
294 * have had EITHER node1 of node2 as their min distance.
296 void UPGMAAlgorithm::computeAllOtherDistsToNewNode(Node* nodeToJoin1, Node* nodeToJoin2,
297 double** nodeToJoin2DistIter)
299 computeDistsUpToNodeToJoin2(nodeToJoin1, nodeToJoin2, nodeToJoin2DistIter);
300 computeDistsForNodesAfterNode2(nodeToJoin2);
305 * This function recomputes the distances in column until we get to nodeToJoin2
306 * example: Join nodes 2 and 6
316 * For each node until we get to nodeToJoin2
317 * If newdistance is less than the old min distance, set it to this.
318 * else if its greater than old minDist and indexTominDist is the same, recompute min
319 * else leave the min the same as it hasnt changed.
321 void UPGMAAlgorithm::computeDistsUpToNodeToJoin2(Node* nodeToJoin1, Node* nodeToJoin2, double** nodeToJoin2DistIter)
323 const int indexToMinDist = nodeToJoin2->indexToMinDist;
325 movePtrPastUnusedDistances(nodeToJoin2DistIter);
328 // For each node until we get to the second node we are joining
329 for(nodeIter = nodeToJoin1->next; nodeIter != nodeToJoin2; nodeIter = nodeIter->next)
331 (*nodeToJoin2DistIter)++; // Skip the distance to the node we are joining with
332 movePtrPastUnusedDistances(nodeToJoin2DistIter);
334 double distToNode = nodeIter->ptrToDistMatRow[indexToMinDist];
336 double newDistToNode = calcNewDist(distToNode, **nodeToJoin2DistIter);
337 nodeIter->ptrToDistMatRow[indexToMinDist] = newDistToNode;
340 if (newDistToNode < nodeIter->minDist)
341 { /** new value is smaller than current min. */
342 nodeIter->minDist = newDistToNode;
343 nodeIter->indexToMinDist = indexToMinDist;
345 else if ((newDistToNode > nodeIter->minDist) &&
346 (nodeIter->indexToMinDist == indexToMinDist))
347 { /** indexToMinDist was the min dist, but it is now a larger num, recompute */
348 nodeIter->findMinDist();
356 * STEP 3B Recompute distance for nodes after nodeToJoin2
357 * example: Join nodes 2 and 6
367 * For each node until we get to the end.
368 * if dist is less than minDist, set mindist to new distance, set index to index
369 * else if dist is greater than mindist and the index was either nodetojoin1 or
370 * nodetojoin2, recompute distance, set entry for nodetojoin2 to NOTUSED
371 * else set the distance to unused.
374 void UPGMAAlgorithm::computeDistsForNodesAfterNode2(Node* nodeToJoin2)
376 int indexToNode2 = nodeToJoin2->numDists;
377 const int indexToMinDist = nodeToJoin2->indexToMinDist;
381 for(nodeIter = nodeToJoin2->next; nodeIter; nodeIter = nodeIter->next)
383 double &distUpdate = nodeIter->ptrToDistMatRow[indexToMinDist];
386 ((distUpdate * orderNode1) +
387 (nodeIter->ptrToDistMatRow[indexToNode2] * orderNode2))
390 /* The comparison (distUpdate > nodeIter->minDist) is unsafe.
391 * Specifically, we get different results for 32/64bit machines,
392 * which leads to different branching in the if/else statement
393 * and nasty behaviour down the line.
394 * Using all of Pfam as a benchmark distUpdate can 'wobble' by 40 ULPs
395 * (Unit of Least Precision) which is difficult to translate into
396 * a maximum relative error, so we pick COMPARE_REL_EPSILON
397 * phenomenologically: approx 2E-15 was the biggest we saw in Pfam,
398 * but suggest 1E-14 (for good measure).
399 * Using ULPs, eg, *(long long int*)&(distUpdate),
400 * would be better (more elegant?) but gives problems
401 * with aliasing and/or performance reduction.
404 #define COMPARE_REL_EPSILON 1E-14
405 if ( (distUpdate < nodeIter->minDist) &&
406 ((nodeIter->minDist-distUpdate) /
407 nodeIter->minDist > COMPARE_REL_EPSILON) )
408 { /** new distance is smaller */
409 nodeIter->minDist = distUpdate;
410 nodeIter->indexToMinDist = indexToMinDist;
412 else if (((distUpdate > nodeIter->minDist) &&
413 ((distUpdate-nodeIter->minDist) /
414 distUpdate > COMPARE_REL_EPSILON) &&
415 (nodeIter->indexToMinDist == indexToMinDist)) ||
416 (nodeIter->indexToMinDist == indexToNode2))
417 { /** if old min dist was to either nodeToJoin1 or nodeToJoin2 */
418 nodeIter->ptrToDistMatRow[indexToNode2] = BLANKDIST;
419 nodeIter->findMinDist();
423 nodeIter->ptrToDistMatRow[indexToNode2] = BLANKDIST;
429 void UPGMAAlgorithm::addAlignmentStep(vector<int>* group1, vector<int>* group2)
431 int sizeGroup1 = group1->size();
432 int sizeGroup2 = group2->size();
435 groups.resize(numSeqs + 1, 0);
436 int sizeGroup = groups.size();
438 for(int i = 0; i < sizeGroup1 && (*group1)[i] < sizeGroup; i++)
440 groups[(*group1)[i]] = 1; // Set to be apart of group1
443 for(int i = 0; i < sizeGroup2 && (*group2)[i] < sizeGroup; i++)
445 groups[(*group2)[i]] = 2; // Set to be apart of group1
448 progSteps->saveSet(&groups);
452 * At the moment we are only using average distance, UPGMA, but we can also use this
453 * function to have different distance measures, min, max etc.
455 double UPGMAAlgorithm::calcNewDist(double dist1, double dist2)
457 return ((dist1 * orderNode1) + (dist2 * orderNode2)) / orderNewNode;