#ifdef HAVE_CONFIG_H #include "config.h" #endif #include "UPGMAAlgorithm.h" #include #include #include #include #include "../../general/SymMatrix.h" #include "../../general/debuglogObject.h" #include "../../general/clustalw.h" #include "upgmadata.h" namespace clustalw { UPGMAAlgorithm::UPGMAAlgorithm() : overwriteMatrix(false), numSeqs(0), verbose(false), orderNode1(0), orderNode2(0), orderNewNode(0) {} auto_ptr UPGMAAlgorithm::generateTree(RootedGuideTree* phyTree, DistMatrix* distMat, SeqInfo* seqInfo, bool overwrite, ofstream* tree) { if (tree == 0 || !tree->is_open()) { verbose = false; } if (verbose) { (*tree) << "\n\n\t\t\tUPGMA Method\n" << "\n\n This is a ROOTED tree\n" << "\n Numbers in parentheses are branch lengths\n\n"; } progSteps.reset(new AlignmentSteps); Node** clusters; Node* root; numSeqs = seqInfo->numSeqs; const int sizeDistMat = ((numSeqs + 1) * (numSeqs + 2)) / 2; double* elements = overwrite ? distMat->getDistMatrix(seqInfo->firstSeq, seqInfo->numSeqs) : (double *)memcpy(new double[sizeDistMat], distMat->getDistMatrix(seqInfo->firstSeq, seqInfo->numSeqs), sizeDistMat * sizeof(double)); clusters = initialiseNodes(elements, seqInfo->firstSeq); root = doUPGMA(clusters, tree); phyTree->setRoot(root); delete [] clusters; if(!overwrite) { delete [] elements; } distMat->clearSubArray(); return progSteps; } Node** UPGMAAlgorithm::initialiseNodes(double* distanceMatrix, int fSeq) { int firstSeq = fSeq; Node** nodes = new Node*[numSeqs]; Node** nodeIter = nodes; *nodes = new Node(firstSeq, 0, 0); distanceMatrix++; // Move to first position in distanceMatrix. for(int elementIndex = 1, e = numSeqs; elementIndex < e; distanceMatrix += ++elementIndex) { Node* newcluster = new Node(elementIndex + firstSeq, distanceMatrix, elementIndex); (*nodeIter++)->next = newcluster; *nodeIter = newcluster; } return nodes; } void UPGMAAlgorithm::printAllNodes(Node** nodes) { int numNodes = 0; for(Node* nodeIter = *nodes; nodeIter; nodeIter = nodeIter->next) { numNodes++; cout << "Node " << numNodes << "\n"; nodeIter->printElements(); cout << "\n\n"; } cout << "There are " << numNodes << " nodes\n"; } Node* UPGMAAlgorithm::doUPGMA(Node** nodes, ofstream* tree) { if (tree == 0 || !tree->is_open()) { verbose = false; } string type1, type2; int step = 0; while((*nodes)->next) // While there is more than 1 node. { step++; if (verbose) { (*tree) << "\n Cycle" << setw(4) << step << " = "; } Node** ptrNodeWithMin; /** * STEP 1 find node with min distance */ ptrNodeWithMin = getNodeWithMinDist(nodes); Node* nodeToJoin2 = *ptrNodeWithMin; const int indexToMinDist = nodeToJoin2->indexToMinDist; Node* const nodeToJoin1 = nodes[indexToMinDist]; orderNode1 = nodeToJoin1->size; orderNode2 = nodeToJoin2->size; orderNewNode = orderNode1 + orderNode2; /** * STEP 2 Recompute the dist Matrix row for nodeToJoin1 * example: Join nodes 2 and 6 * * 1 0 * 2 X 0 * 3 0 0 0 * 4 0 0 0 0 * 5 0 0 0 0 0 * 6 0 0 0 0 0 0 * 7 0 0 0 0 0 0 0 */ double* nodeToJoin2DistIter = nodeToJoin2->ptrToDistMatRow; if (indexToMinDist != 0) { recomputeNodeToJoin1DistMatRow(nodeToJoin1, &nodeToJoin2DistIter); } /** * STEP 3 Recompute all distances in column * example: Join nodes 2 and 6 * * 1 0 * 2 C 0 * 3 0 X 0 * 4 0 X 0 0 * 5 0 X 0 0 0 * 6 0 0 0 0 0 0 * 7 0 X 0 0 0 0 0 */ computeAllOtherDistsToNewNode(nodeToJoin1, nodeToJoin2, &nodeToJoin2DistIter); /** * STEP 4 Add the step to the progSteps. * This creates the multiple alignment steps. */ addAlignmentStep(&nodeToJoin1->allElements, &nodeToJoin2->allElements); double minDistance = (*ptrNodeWithMin)->minDist; double height = 0.0; height = minDistance / 2.0; if(verbose) { if(nodeToJoin1->allElements.size() > 1) { type1 = "NODE: "; } else { type1 = "SEQ: "; } if(nodeToJoin2->allElements.size() > 1) { type2 = "NODE: "; } else { type2 = "SEQ: "; } (*tree) << type1 << nodeToJoin1->allElements[0] << " (" << setw(9) << setprecision(5) << height << ") joins " << type2 << setw(4) << nodeToJoin2->allElements[0] << " (" << setw(9) << setprecision(5) << height << ")"; } /** * STEP 5 merge 2 nodes */ nodeToJoin1->merge(ptrNodeWithMin, height); } return *nodes; } /** * This function returns a Node object that has the minimum distance of * all the nodes left. */ Node** UPGMAAlgorithm::getNodeWithMinDist(Node** nodes) { Node** ptrMinNode = NULL; double minDistance = numeric_limits::max(); //Start at node 1, check see if it points to something, then move on the next one. Node** nodeIter; for(nodeIter = &((*nodes)->next); *nodeIter; nodeIter = &(*nodeIter)->next) { if ((*nodeIter)->getMinDist() < minDistance) { minDistance = (*nodeIter)->getMinDist(); ptrMinNode = nodeIter; // ptrMinNode will hold a ptr to node with sm'st dist } } return ptrMinNode; } /** * This function is used to recompute the nodeToJoin1 dist mat row. Each row in the distance * matrix has the distances to all nodes before it, not after. For example the dist mat row * for Node 3 would have the distances to nodes 1 and 2. * 1 0 * 2 0 0 * 3 X X 0 Dists to node 1 and 2. * 4 0 0 0 0 * 5 0 0 0 0 0 * 6 0 0 0 0 0 0 * 7 0 0 0 0 0 0 0 */ void UPGMAAlgorithm::recomputeNodeToJoin1DistMatRow(Node* nodeToJoin1, double** nodeToJoin2DistIter) { double* nodeToJoin1DistIter = nodeToJoin1->ptrToDistMatRow; // calculate new distance *nodeToJoin1DistIter = calcNewDist(*nodeToJoin1DistIter, **nodeToJoin2DistIter); const double* minIndex1 = nodeToJoin1DistIter; nodeToJoin1DistIter++; (*nodeToJoin2DistIter)++; int numDistToUpdate = nodeToJoin1->numDists - 1; // For each of the distances in nodeToJoin1 while(numDistToUpdate > 0) { if (*nodeToJoin1DistIter >= 0) { // Calculate the average *nodeToJoin1DistIter = calcNewDist(*nodeToJoin1DistIter, **nodeToJoin2DistIter); if (*nodeToJoin1DistIter < *minIndex1) { minIndex1 = nodeToJoin1DistIter; } } nodeToJoin1DistIter++; (*nodeToJoin2DistIter)++; numDistToUpdate--; } // We have found the minimum distance nodeToJoin1->minDist = *minIndex1; nodeToJoin1->indexToMinDist = minIndex1 - nodeToJoin1->ptrToDistMatRow; } /** * This function is used to recompute all the other distances. It does this by calling * two other functions. We first compute all the distances until we get to node 2. * Then we call another function to do the rest. This is because nodes after node 2 may * have had EITHER node1 of node2 as their min distance. */ void UPGMAAlgorithm::computeAllOtherDistsToNewNode(Node* nodeToJoin1, Node* nodeToJoin2, double** nodeToJoin2DistIter) { computeDistsUpToNodeToJoin2(nodeToJoin1, nodeToJoin2, nodeToJoin2DistIter); computeDistsForNodesAfterNode2(nodeToJoin2); } /** * STEP 3A: * This function recomputes the distances in column until we get to nodeToJoin2 * example: Join nodes 2 and 6 * * 1 0 * 2 C 0 * 3 0 X 0 * 4 0 X 0 0 * 5 0 X 0 0 0 * 6 0 0 0 0 0 0 * 7 0 0 0 0 0 0 0 * * For each node until we get to nodeToJoin2 * If newdistance is less than the old min distance, set it to this. * else if its greater than old minDist and indexTominDist is the same, recompute min * else leave the min the same as it hasnt changed. */ void UPGMAAlgorithm::computeDistsUpToNodeToJoin2(Node* nodeToJoin1, Node* nodeToJoin2, double** nodeToJoin2DistIter) { const int indexToMinDist = nodeToJoin2->indexToMinDist; movePtrPastUnusedDistances(nodeToJoin2DistIter); Node* nodeIter; // For each node until we get to the second node we are joining for(nodeIter = nodeToJoin1->next; nodeIter != nodeToJoin2; nodeIter = nodeIter->next) { (*nodeToJoin2DistIter)++; // Skip the distance to the node we are joining with movePtrPastUnusedDistances(nodeToJoin2DistIter); double distToNode = nodeIter->ptrToDistMatRow[indexToMinDist]; double newDistToNode = calcNewDist(distToNode, **nodeToJoin2DistIter); nodeIter->ptrToDistMatRow[indexToMinDist] = newDistToNode; if (newDistToNode < nodeIter->minDist) { /** new value is smaller than current min. */ nodeIter->minDist = newDistToNode; nodeIter->indexToMinDist = indexToMinDist; } else if ((newDistToNode > nodeIter->minDist) && (nodeIter->indexToMinDist == indexToMinDist)) { /** indexToMinDist was the min dist, but it is now a larger num, recompute */ nodeIter->findMinDist(); } } } /** * STEP 3B Recompute distance for nodes after nodeToJoin2 * example: Join nodes 2 and 6 * * 1 0 * 2 C 0 * 3 0 C 0 * 4 0 C 0 0 * 5 0 C 0 0 0 * 6 0 0 0 0 0 0 * 7 0 X 0 0 0 0 0 * * For each node until we get to the end. * if dist is less than minDist, set mindist to new distance, set index to index * else if dist is greater than mindist and the index was either nodetojoin1 or * nodetojoin2, recompute distance, set entry for nodetojoin2 to NOTUSED * else set the distance to unused. */ void UPGMAAlgorithm::computeDistsForNodesAfterNode2(Node* nodeToJoin2) { int indexToNode2 = nodeToJoin2->numDists; const int indexToMinDist = nodeToJoin2->indexToMinDist; Node* nodeIter; for(nodeIter = nodeToJoin2->next; nodeIter; nodeIter = nodeIter->next) { double &distUpdate = nodeIter->ptrToDistMatRow[indexToMinDist]; distUpdate = ((distUpdate * orderNode1) + (nodeIter->ptrToDistMatRow[indexToNode2] * orderNode2)) / orderNewNode; /* The comparison (distUpdate > nodeIter->minDist) is unsafe. * Specifically, we get different results for 32/64bit machines, * which leads to different branching in the if/else statement * and nasty behaviour down the line. * Using all of Pfam as a benchmark distUpdate can 'wobble' by 40 ULPs * (Unit of Least Precision) which is difficult to translate into * a maximum relative error, so we pick COMPARE_REL_EPSILON * phenomenologically: approx 2E-15 was the biggest we saw in Pfam, * but suggest 1E-14 (for good measure). * Using ULPs, eg, *(long long int*)&(distUpdate), * would be better (more elegant?) but gives problems * with aliasing and/or performance reduction. * FS, 2009-04-30 */ #define COMPARE_REL_EPSILON 1E-14 if ( (distUpdate < nodeIter->minDist) && ((nodeIter->minDist-distUpdate) / nodeIter->minDist > COMPARE_REL_EPSILON) ) { /** new distance is smaller */ nodeIter->minDist = distUpdate; nodeIter->indexToMinDist = indexToMinDist; } else if (((distUpdate > nodeIter->minDist) && ((distUpdate-nodeIter->minDist) / distUpdate > COMPARE_REL_EPSILON) && (nodeIter->indexToMinDist == indexToMinDist)) || (nodeIter->indexToMinDist == indexToNode2)) { /** if old min dist was to either nodeToJoin1 or nodeToJoin2 */ nodeIter->ptrToDistMatRow[indexToNode2] = BLANKDIST; nodeIter->findMinDist(); } else { nodeIter->ptrToDistMatRow[indexToNode2] = BLANKDIST; } } } void UPGMAAlgorithm::addAlignmentStep(vector* group1, vector* group2) { int sizeGroup1 = group1->size(); int sizeGroup2 = group2->size(); vector groups; groups.resize(numSeqs + 1, 0); int sizeGroup = groups.size(); for(int i = 0; i < sizeGroup1 && (*group1)[i] < sizeGroup; i++) { groups[(*group1)[i]] = 1; // Set to be apart of group1 } for(int i = 0; i < sizeGroup2 && (*group2)[i] < sizeGroup; i++) { groups[(*group2)[i]] = 2; // Set to be apart of group1 } progSteps->saveSet(&groups); } /** * At the moment we are only using average distance, UPGMA, but we can also use this * function to have different distance measures, min, max etc. */ double UPGMAAlgorithm::calcNewDist(double dist1, double dist2) { return ((dist1 * orderNode1) + (dist2 * orderNode2)) / orderNewNode; } }