+++ /dev/null
-#ifdef HAVE_CONFIG_H
- #include "config.h"
-#endif
-#include "UPGMAAlgorithm.h"
-#include <cmath>
-#include <ctime>
-#include <iomanip>
-#include <sstream>
-
-#include "../../general/SymMatrix.h"
-#include "../../general/debuglogObject.h"
-#include "../../general/clustalw.h"
-#include "upgmadata.h"
-
-namespace clustalw
-{
-
-UPGMAAlgorithm::UPGMAAlgorithm()
-: overwriteMatrix(false),
- numSeqs(0),
- verbose(false),
- orderNode1(0),
- orderNode2(0),
- orderNewNode(0)
-{}
-
-auto_ptr<AlignmentSteps> UPGMAAlgorithm::generateTree(RootedGuideTree* phyTree,
- DistMatrix* distMat,
- SeqInfo* seqInfo, bool overwrite,
- ofstream* tree)
-{
- if (tree == 0 || !tree->is_open())
- {
- verbose = false;
- }
-
- if (verbose)
- {
- (*tree) << "\n\n\t\t\tUPGMA Method\n"
- << "\n\n This is a ROOTED tree\n"
- << "\n Numbers in parentheses are branch lengths\n\n";
- }
-
- progSteps.reset(new AlignmentSteps);
-
- Node** clusters;
- Node* root;
- numSeqs = seqInfo->numSeqs;
- const int sizeDistMat = ((numSeqs + 1) * (numSeqs + 2)) / 2;
-
- double* elements = overwrite ?
- distMat->getDistMatrix(seqInfo->firstSeq, seqInfo->numSeqs) :
- (double *)memcpy(new double[sizeDistMat],
- distMat->getDistMatrix(seqInfo->firstSeq, seqInfo->numSeqs),
- sizeDistMat * sizeof(double));
-
- clusters = initialiseNodes(elements, seqInfo->firstSeq);
- root = doUPGMA(clusters, tree);
-
- phyTree->setRoot(root);
- delete [] clusters;
-
- if(!overwrite)
- {
- delete [] elements;
- }
- distMat->clearSubArray();
-
- return progSteps;
-}
-
-
-Node** UPGMAAlgorithm::initialiseNodes(double* distanceMatrix, int fSeq)
-{
- int firstSeq = fSeq;
-
-
- Node** nodes = new Node*[numSeqs];
- Node** nodeIter = nodes;
-
- *nodes = new Node(firstSeq, 0, 0);
-
- distanceMatrix++;
-
- // Move to first position in distanceMatrix.
- for(int elementIndex = 1, e = numSeqs; elementIndex < e;
- distanceMatrix += ++elementIndex)
- {
- Node* newcluster = new Node(elementIndex + firstSeq,
- distanceMatrix, elementIndex);
- (*nodeIter++)->next = newcluster;
- *nodeIter = newcluster;
- }
-
- return nodes;
-}
-
-void UPGMAAlgorithm::printAllNodes(Node** nodes)
-{
- int numNodes = 0;
- for(Node* nodeIter = *nodes; nodeIter; nodeIter = nodeIter->next)
- {
- numNodes++;
- cout << "Node " << numNodes << "\n";
- nodeIter->printElements();
- cout << "\n\n";
- }
- cout << "There are " << numNodes << " nodes\n";
-}
-
-Node* UPGMAAlgorithm::doUPGMA(Node** nodes, ofstream* tree)
-{
- if (tree == 0 || !tree->is_open())
- {
- verbose = false;
- }
-
- string type1, type2;
- int step = 0;
-
- while((*nodes)->next) // While there is more than 1 node.
- {
- step++;
- if (verbose)
- {
- (*tree) << "\n Cycle" << setw(4) << step << " = ";
- }
- Node** ptrNodeWithMin;
-
- /**
- * STEP 1 find node with min distance
- */
- ptrNodeWithMin = getNodeWithMinDist(nodes);
- Node* nodeToJoin2 = *ptrNodeWithMin;
- const int indexToMinDist = nodeToJoin2->indexToMinDist;
- Node* const nodeToJoin1 = nodes[indexToMinDist];
-
- orderNode1 = nodeToJoin1->size;
- orderNode2 = nodeToJoin2->size;
- orderNewNode = orderNode1 + orderNode2;
-
- /**
- * STEP 2 Recompute the dist Matrix row for nodeToJoin1
- * example: Join nodes 2 and 6
- *
- * 1 0
- * 2 X 0
- * 3 0 0 0
- * 4 0 0 0 0
- * 5 0 0 0 0 0
- * 6 0 0 0 0 0 0
- * 7 0 0 0 0 0 0 0
- */
- double* nodeToJoin2DistIter = nodeToJoin2->ptrToDistMatRow;
-
- if (indexToMinDist != 0)
- {
- recomputeNodeToJoin1DistMatRow(nodeToJoin1, &nodeToJoin2DistIter);
- }
-
- /**
- * STEP 3 Recompute all distances in column
- * example: Join nodes 2 and 6
- *
- * 1 0
- * 2 C 0
- * 3 0 X 0
- * 4 0 X 0 0
- * 5 0 X 0 0 0
- * 6 0 0 0 0 0 0
- * 7 0 X 0 0 0 0 0
- */
-
- computeAllOtherDistsToNewNode(nodeToJoin1, nodeToJoin2, &nodeToJoin2DistIter);
-
- /**
- * STEP 4 Add the step to the progSteps.
- * This creates the multiple alignment steps.
- */
- addAlignmentStep(&nodeToJoin1->allElements, &nodeToJoin2->allElements);
-
-
- double minDistance = (*ptrNodeWithMin)->minDist;
-
- double height = 0.0;
- height = minDistance / 2.0;
-
- if(verbose)
- {
- if(nodeToJoin1->allElements.size() > 1)
- {
- type1 = "NODE: ";
- }
- else
- {
- type1 = "SEQ: ";
- }
- if(nodeToJoin2->allElements.size() > 1)
- {
- type2 = "NODE: ";
- }
- else
- {
- type2 = "SEQ: ";
- }
- (*tree) << type1 << nodeToJoin1->allElements[0] << " (" << setw(9)
- << setprecision(5) << height << ") joins " << type2
- << setw(4) << nodeToJoin2->allElements[0] << " ("
- << setw(9) << setprecision(5) << height << ")";
- }
- /**
- * STEP 5 merge 2 nodes
- */
- nodeToJoin1->merge(ptrNodeWithMin, height);
-
- }
-
- return *nodes;
-}
-
-/**
- * This function returns a Node object that has the minimum distance of
- * all the nodes left.
- */
-Node** UPGMAAlgorithm::getNodeWithMinDist(Node** nodes)
-{
- Node** ptrMinNode = NULL;
- double minDistance = numeric_limits<double>::max();
- //Start at node 1, check see if it points to something, then move on the next one.
- Node** nodeIter;
- for(nodeIter = &((*nodes)->next); *nodeIter;
- nodeIter = &(*nodeIter)->next)
- {
- if ((*nodeIter)->getMinDist() < minDistance)
- {
- minDistance = (*nodeIter)->getMinDist();
- ptrMinNode = nodeIter; // ptrMinNode will hold a ptr to node with sm'st dist
- }
- }
- return ptrMinNode;
-}
-
-/**
- * This function is used to recompute the nodeToJoin1 dist mat row. Each row in the distance
- * matrix has the distances to all nodes before it, not after. For example the dist mat row
- * for Node 3 would have the distances to nodes 1 and 2.
- * 1 0
- * 2 0 0
- * 3 X X 0 Dists to node 1 and 2.
- * 4 0 0 0 0
- * 5 0 0 0 0 0
- * 6 0 0 0 0 0 0
- * 7 0 0 0 0 0 0 0
- */
-void UPGMAAlgorithm::recomputeNodeToJoin1DistMatRow(Node* nodeToJoin1,
- double** nodeToJoin2DistIter)
-{
- double* nodeToJoin1DistIter = nodeToJoin1->ptrToDistMatRow;
- // calculate new distance
- *nodeToJoin1DistIter = calcNewDist(*nodeToJoin1DistIter, **nodeToJoin2DistIter);
-
- const double* minIndex1 = nodeToJoin1DistIter;
- nodeToJoin1DistIter++;
- (*nodeToJoin2DistIter)++;
- int numDistToUpdate = nodeToJoin1->numDists - 1;
-
- // For each of the distances in nodeToJoin1
- while(numDistToUpdate > 0)
- {
- if (*nodeToJoin1DistIter >= 0)
- {
- // Calculate the average
- *nodeToJoin1DistIter = calcNewDist(*nodeToJoin1DistIter, **nodeToJoin2DistIter);
-
- if (*nodeToJoin1DistIter < *minIndex1)
- {
- minIndex1 = nodeToJoin1DistIter;
- }
- }
- nodeToJoin1DistIter++;
- (*nodeToJoin2DistIter)++;
- numDistToUpdate--;
- }
-
- // We have found the minimum distance
- nodeToJoin1->minDist = *minIndex1;
- nodeToJoin1->indexToMinDist = minIndex1 - nodeToJoin1->ptrToDistMatRow;
-}
-
-/**
- * This function is used to recompute all the other distances. It does this by calling
- * two other functions. We first compute all the distances until we get to node 2.
- * Then we call another function to do the rest. This is because nodes after node 2 may
- * have had EITHER node1 of node2 as their min distance.
- */
-void UPGMAAlgorithm::computeAllOtherDistsToNewNode(Node* nodeToJoin1, Node* nodeToJoin2,
- double** nodeToJoin2DistIter)
-{
- computeDistsUpToNodeToJoin2(nodeToJoin1, nodeToJoin2, nodeToJoin2DistIter);
- computeDistsForNodesAfterNode2(nodeToJoin2);
-}
-
-/**
- * STEP 3A:
- * This function recomputes the distances in column until we get to nodeToJoin2
- * example: Join nodes 2 and 6
- *
- * 1 0
- * 2 C 0
- * 3 0 X 0
- * 4 0 X 0 0
- * 5 0 X 0 0 0
- * 6 0 0 0 0 0 0
- * 7 0 0 0 0 0 0 0
- *
- * For each node until we get to nodeToJoin2
- * If newdistance is less than the old min distance, set it to this.
- * else if its greater than old minDist and indexTominDist is the same, recompute min
- * else leave the min the same as it hasnt changed.
- */
-void UPGMAAlgorithm::computeDistsUpToNodeToJoin2(Node* nodeToJoin1, Node* nodeToJoin2, double** nodeToJoin2DistIter)
-{
- const int indexToMinDist = nodeToJoin2->indexToMinDist;
-
- movePtrPastUnusedDistances(nodeToJoin2DistIter);
-
- Node* nodeIter;
- // For each node until we get to the second node we are joining
- for(nodeIter = nodeToJoin1->next; nodeIter != nodeToJoin2; nodeIter = nodeIter->next)
- {
- (*nodeToJoin2DistIter)++; // Skip the distance to the node we are joining with
- movePtrPastUnusedDistances(nodeToJoin2DistIter);
-
- double distToNode = nodeIter->ptrToDistMatRow[indexToMinDist];
-
- double newDistToNode = calcNewDist(distToNode, **nodeToJoin2DistIter);
- nodeIter->ptrToDistMatRow[indexToMinDist] = newDistToNode;
-
-
- if (newDistToNode < nodeIter->minDist)
- { /** new value is smaller than current min. */
- nodeIter->minDist = newDistToNode;
- nodeIter->indexToMinDist = indexToMinDist;
- }
- else if ((newDistToNode > nodeIter->minDist) &&
- (nodeIter->indexToMinDist == indexToMinDist))
- { /** indexToMinDist was the min dist, but it is now a larger num, recompute */
- nodeIter->findMinDist();
- }
-
- }
-
-}
-
-/**
- * STEP 3B Recompute distance for nodes after nodeToJoin2
- * example: Join nodes 2 and 6
- *
- * 1 0
- * 2 C 0
- * 3 0 C 0
- * 4 0 C 0 0
- * 5 0 C 0 0 0
- * 6 0 0 0 0 0 0
- * 7 0 X 0 0 0 0 0
- *
- * For each node until we get to the end.
- * if dist is less than minDist, set mindist to new distance, set index to index
- * else if dist is greater than mindist and the index was either nodetojoin1 or
- * nodetojoin2, recompute distance, set entry for nodetojoin2 to NOTUSED
- * else set the distance to unused.
- */
-
-void UPGMAAlgorithm::computeDistsForNodesAfterNode2(Node* nodeToJoin2)
-{
- int indexToNode2 = nodeToJoin2->numDists;
- const int indexToMinDist = nodeToJoin2->indexToMinDist;
-
- Node* nodeIter;
-
- for(nodeIter = nodeToJoin2->next; nodeIter; nodeIter = nodeIter->next)
- {
- double &distUpdate = nodeIter->ptrToDistMatRow[indexToMinDist];
-
- distUpdate =
- ((distUpdate * orderNode1) +
- (nodeIter->ptrToDistMatRow[indexToNode2] * orderNode2))
- / orderNewNode;
-
- /* The comparison (distUpdate > nodeIter->minDist) is unsafe.
- * Specifically, we get different results for 32/64bit machines,
- * which leads to different branching in the if/else statement
- * and nasty behaviour down the line.
- * Using all of Pfam as a benchmark distUpdate can 'wobble' by 40 ULPs
- * (Unit of Least Precision) which is difficult to translate into
- * a maximum relative error, so we pick COMPARE_REL_EPSILON
- * phenomenologically: approx 2E-15 was the biggest we saw in Pfam,
- * but suggest 1E-14 (for good measure).
- * Using ULPs, eg, *(long long int*)&(distUpdate),
- * would be better (more elegant?) but gives problems
- * with aliasing and/or performance reduction.
- * FS, 2009-04-30
- */
-#define COMPARE_REL_EPSILON 1E-14
- if ( (distUpdate < nodeIter->minDist) &&
- ((nodeIter->minDist-distUpdate) /
- nodeIter->minDist > COMPARE_REL_EPSILON) )
- { /** new distance is smaller */
- nodeIter->minDist = distUpdate;
- nodeIter->indexToMinDist = indexToMinDist;
- }
- else if (((distUpdate > nodeIter->minDist) &&
- ((distUpdate-nodeIter->minDist) /
- distUpdate > COMPARE_REL_EPSILON) &&
- (nodeIter->indexToMinDist == indexToMinDist)) ||
- (nodeIter->indexToMinDist == indexToNode2))
- { /** if old min dist was to either nodeToJoin1 or nodeToJoin2 */
- nodeIter->ptrToDistMatRow[indexToNode2] = BLANKDIST;
- nodeIter->findMinDist();
- }
- else
- {
- nodeIter->ptrToDistMatRow[indexToNode2] = BLANKDIST;
- }
- }
-
-}
-
-void UPGMAAlgorithm::addAlignmentStep(vector<int>* group1, vector<int>* group2)
-{
- int sizeGroup1 = group1->size();
- int sizeGroup2 = group2->size();
-
- vector<int> groups;
- groups.resize(numSeqs + 1, 0);
- int sizeGroup = groups.size();
-
- for(int i = 0; i < sizeGroup1 && (*group1)[i] < sizeGroup; i++)
- {
- groups[(*group1)[i]] = 1; // Set to be apart of group1
- }
-
- for(int i = 0; i < sizeGroup2 && (*group2)[i] < sizeGroup; i++)
- {
- groups[(*group2)[i]] = 2; // Set to be apart of group1
- }
-
- progSteps->saveSet(&groups);
-}
-
-/**
- * At the moment we are only using average distance, UPGMA, but we can also use this
- * function to have different distance measures, min, max etc.
- */
-double UPGMAAlgorithm::calcNewDist(double dist1, double dist2)
-{
- return ((dist1 * orderNode1) + (dist2 * orderNode2)) / orderNewNode;
-}
-
-}