/** * Author: Mark Larkin * * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "Tree.h" namespace clustalw { /** * * @param firstSeq * @param lastSeq * @param sweight */ void Tree::calcSeqWeights(int firstSeq, int lastSeq, vector* sweight) { if((int)sweight->size() < lastSeq - 1) { sweight->resize(lastSeq - 1); } int i, _nSeqs; int temp, sum; int *weight; /* * If there are more than three sequences.... */ _nSeqs = lastSeq - firstSeq; if ((_nSeqs >= 2) && (clustalw::userParameters->getDistanceTree() == true) && (clustalw::userParameters->getNoWeights() == false)) { /* * Calculate sequence weights based on Phylip tree. */ weight = new int[lastSeq + 1]; for (i = firstSeq; i < lastSeq; i++) { weight[i] = calcWeight(i); } /* * Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR */ sum = 0; for (i = firstSeq; i < lastSeq; i++) { sum += weight[i]; } if (sum == 0) { for (i = firstSeq; i < lastSeq; i++) { weight[i] = 1; } sum = i; } for (i = firstSeq; i < lastSeq; i++) { (*sweight)[i] = (weight[i] * INT_SCALE_FACTOR) / sum; if ((*sweight)[i] < 1) { (*sweight)[i] = 1; } } delete []weight; weight = NULL; } else { /* * Otherwise, use identity weights. */ temp = INT_SCALE_FACTOR / _nSeqs; // AW 2009-05-28: goes wrong if we have more than // INT_SCALE_FACTOR seqs. if so, set to 1, just as above if (temp < 1) temp = 1; for (i = firstSeq; i < lastSeq; i++) { (*sweight)[i] = temp; } } } /** * * @param alignPtr * @param treeFileName * @param firstSeq * @param lastSeq * @return */ int Tree::readTree(clustalw::Alignment* alignPtr, const string& treeFileName, int firstSeq, int lastSeq) { if(alignPtr == NULL || firstSeq < 0 || lastSeq < 1) { return 0; } char c; string name1 = "", name2 = ""; int i, j, k; bool found; numSeq = 0; nnodes = 0; ntotal = 0; rootedTree = true; // Need to check what happens if I try to open a file that doesnt exist! if(treeFileName.empty()) { return 0; // Cannot open, empty string! } else { file.open(treeFileName.c_str(), ifstream::in); if(!file.is_open()) { clustalw::utilityObject->error("Cannot open output file [%s]\n", treeFileName.c_str()); } } skipSpace(&file); charFromFile = file.get(); if (charFromFile != '(') { clustalw::utilityObject->error("Wrong format in tree file %s\n", treeFileName.c_str()); return 0; } file.seekg(0, ios::beg); // Put get pointer back to the begining! clustalw::userParameters->setDistanceTree(true); // Allocate memory for tree // Allocate memory. nptr = new TreeNode*[3 * (lastSeq - firstSeq + 1)]; ptrs = new TreeNode*[3 * (lastSeq - firstSeq + 1)]; lptr = new TreeNode*[lastSeq - firstSeq + 1]; olptr = new TreeNode*[lastSeq + 1]; seqTree = avail(); setInfo(seqTree, NULL, 0, string(""), 0.0); createTree(seqTree, NULL, &file); file.close(); if (numSeq != lastSeq - firstSeq) { stringstream ss; ss << "tree not compatible with alignment\n(" << lastSeq - firstSeq << " sequences in alignment and "<< numSeq <<" in tree\n"; string errorMes; ss >> errorMes; clustalw::utilityObject->error(errorMes.c_str()); return 0; } // If the tree is unrooted, reroot the tree - ie. minimise the difference // between the mean root->leaf distances for the left and right branches of // the tree. if (clustalw::userParameters->getDistanceTree() == false) { if (rootedTree == false) { clustalw::utilityObject->error("input tree is unrooted and has no distances.\nCannot align sequences"); return 0; } } if (rootedTree == false) { root = reRoot(seqTree, lastSeq - firstSeq + 1); } else { root = seqTree; } // calculate the 'order' of each node. int nameLength; string nameSeq; orderNodes(); if (numSeq >= 2) { // If there are more than three sequences.... // assign the sequence nodes (in the same order as in the alignment file) for (i = firstSeq; i < lastSeq; i++) { nameLength = alignPtr->getName(i + 1).length(); nameSeq = alignPtr->getName(i + 1); name1 = ""; name2 = ""; if (nameLength > clustalw::MAXNAMES) { stringstream ss; ss << "name " << nameSeq << " is too long for PHYLIP tree format (max " << clustalw::MAXNAMES << " chars)"; string msg; ss >> msg; clustalw::utilityObject->warning(msg.c_str());// Mark change 17-5-07 } for (k = 0; k < nameLength && k < clustalw::MAXNAMES; k++) { c = nameSeq[k]; if ((c > 0x40) && (c < 0x5b)) { c = c | 0x20; } if (c == ' ') { c = '_'; } name2 += c; } found = false; for (j = 0; j < numSeq; j++) { name1 = ""; for (k = 0; k < (int)lptr[j]->name.length() && k < clustalw::MAXNAMES; k++) { c = lptr[j]->name[k]; if ((c > 0x40) && (c < 0x5b)) { c = c | 0x20; } name1 += c; } if (name1.compare(name2) == 0) { olptr[i] = lptr[j]; found = true; } } if (found == false) { utilityObject->error("tree not compatible with alignment:\n %s not found\n", name2.c_str()); return 0; } } } return (1); } /** * * @param firstSeq * @param lastSeq * @return */ auto_ptr Tree::createSets(int firstSeq, int lastSeq) { auto_ptr progAlignSteps; progAlignSteps.reset(new AlignmentSteps); int i, j, _nSeqs; numSets = 0; _nSeqs = lastSeq - firstSeq; if (_nSeqs >= 2) { // If there are more than three sequences.... groups = new int[_nSeqs + 1]; groupSeqs(root, groups, _nSeqs, progAlignSteps.get()); delete []groups; groups = NULL; } else { groups = new int[_nSeqs + 1]; for (i = 0; i < _nSeqs - 1; i++) { for (j = 0; j < _nSeqs; j++) if (j <= i) { groups[j] = 1; } else if (j == i + 1) { groups[j] = 2; } else { groups[j] = 0; } progAlignSteps->saveSet(_nSeqs, groups); } delete []groups; groups = NULL; } return progAlignSteps; } /** * calcSimilarities changes the distMat. * @param alignPtr * @param distMat * @return */ int Tree::calcSimilarities(clustalw::Alignment* alignPtr, clustalw::DistMatrix* distMat) { int depth = 0, i, j, k, n; bool found; int nerrs, seq1[MAXERRS], seq2[MAXERRS]; TreeNode *p, **pathToRoot; float dist; float *distToNode, badDist[MAXERRS]; double **dmat; ostringstream err1; char reply; int nSeqs = alignPtr->getNumSeqs(); pathToRoot = new TreeNode*[nSeqs]; distToNode = new float[nSeqs]; dmat = new double*[nSeqs]; for (i = 0; i < nSeqs; i++) { dmat[i] = new double[nSeqs]; } if (nSeqs >= 2) { /* * for each leaf, determine all nodes between the leaf and the root; */ for (i = 0; i < nSeqs; i++) { depth = 0;dist = 0.0; p = olptr[i]; while (p != NULL) { pathToRoot[depth] = p; dist += p->dist; distToNode[depth] = dist; p = p->parent; depth++; } /* * for each pair.... */ for (j = 0; j < i; j++) { p = olptr[j]; dist = 0.0; /* * find the common ancestor. */ found = false; n = 0; while ((found == false) && (p->parent != NULL)) { for (k = 0; k < depth; k++) if (p->parent == pathToRoot[k]) { found = true; n = k; } dist += p->dist; p = p->parent; } dmat[i][j] = dist + distToNode[n - 1]; } } nerrs = 0; for (i = 0; i < nSeqs; i++) { dmat[i][i] = 0.0; for (j = 0; j < i; j++) { if (dmat[i][j] < 0.01) { dmat[i][j] = 0.01; } if (dmat[i][j] > 1.0) { if (dmat[i][j] > 1.1 && nerrs < MAXERRS) { seq1[nerrs] = i; seq2[nerrs] = j; badDist[nerrs] = dmat[i][j]; nerrs++; } dmat[i][j] = 1.0; } } } if (nerrs > 0 && !userParameters->getGui()) { string errMess = "The following sequences are too divergent to be aligned:\n"; for (i = 0; i < nerrs && i < 5; i++) { err1 << " " << alignPtr->getName(seq1[i] + 1) << " and " << alignPtr->getName(seq2[i] + 1) << " (distance " << setprecision(3) << badDist[i] << ")\n"; } errMess += err1.str(); errMess += "(All distances should be between 0.0 and 1.0)\n"; errMess += "This may not be fatal but you have been warned!\n"; errMess += "SUGGESTION: Remove one or more problem sequences and try again"; if (clustalw::userParameters->getInteractive()) { reply = clustalw::utilityObject->promptForYesNo(errMess.c_str(), "Continue "); } else { reply = 'y'; } if ((reply != 'y') && (reply != 'Y')) { return 0; } } } else { for (i = 0; i < nSeqs; i++) { for (j = 0; j < i; j++) { dmat[i][j] = (*distMat)(i + 1, j + 1); } } } delete []pathToRoot; delete []distToNode; double value; for (i = 0; i < nSeqs; i++) { distMat->SetAt(i + 1, i + 1, 0.0); for (j = 0; j < i; j++) { value = 100.0 - (dmat[i][j]) * 100.0; distMat->SetAt(i + 1, j + 1, value); distMat->SetAt(j + 1, i + 1, value); } } for (i = 0; i < nSeqs; i++) { delete [] dmat[i]; } delete []dmat; return 1; } /** ************************************************************************* * Private functions!!!!!!!!!!!!!!! * ****************************************************************************/ /** * * @param ptree * @param parent * @param file */ void Tree::createTree(clustalw::TreeNode* ptree, clustalw::TreeNode* parent, ifstream* file) { TreeNode* p; int i, type; float dist; string name; // is this a node or a leaf ? skipSpace(file); charFromFile = file->get(); if (charFromFile == '(') { // this must be a node.... type = NODE; name = ""; ptrs[ntotal] = nptr[nnodes] = ptree; nnodes++; ntotal++; createNode(ptree, parent); p = ptree->left; createTree(p, ptree, file); if (charFromFile == ',') { p = ptree->right; createTree(p, ptree, file); if (charFromFile == ',') { ptree = insertNode(ptree); ptrs[ntotal] = nptr[nnodes] = ptree; nnodes++; ntotal++; p = ptree->right; createTree(p, ptree, file); rootedTree = false; } } skipSpace(file); charFromFile = file->get(); } // ...otherwise, this is a leaf else { type = LEAF; ptrs[ntotal++] = lptr[numSeq++] = ptree; // get the sequence name name = ""; name += charFromFile; charFromFile = file->get(); i = 1; while ((charFromFile != ':') && (charFromFile != ',') && (charFromFile != ')')) { if (i < MAXNAMES) { name += charFromFile; i++; } charFromFile = file->get(); } if (charFromFile != ':') { clustalw::userParameters->setDistanceTree(false); dist = 0.0; } } // get the distance information dist = 0.0; if (charFromFile == ':') { skipSpace(file); (*file) >> dist; skipSpace(file); charFromFile = file->get(); } setInfo(ptree, parent, type, name, dist); } /** * * @param pptr * @param parent */ void Tree::createNode(TreeNode* pptr, TreeNode* parent) { TreeNode* t; pptr->parent = parent; t = avail(); pptr->left = t; t = avail(); pptr->right = t; } /** * * @param pptr * @return */ TreeNode* Tree::insertNode(TreeNode* pptr) { TreeNode* newnode; newnode = avail(); createNode(newnode, pptr->parent); newnode->left = pptr; pptr->parent = newnode; setInfo(newnode, pptr->parent, NODE, "", 0.0); return newnode; } /** * * @param p */ void Tree::clearTree(clustalw::TreeNode* p) { clearTreeNodes(p); delete [] nptr; nptr = NULL; delete [] ptrs; ptrs = NULL; delete [] lptr; lptr = NULL; delete [] olptr; olptr = NULL; } /** * * @param p */ void Tree::clearTreeNodes(clustalw::TreeNode* p) { if (p == NULL) { p = root; } if (p->left != NULL) { clearTreeNodes(p->left); } if (p->right != NULL) { clearTreeNodes(p->right); } p->left = NULL; p->right = NULL; delete p; p = NULL; } /** * * @param * @param * @return */ void Tree::debugPrintAllNodes(int nseqs) { clustalw::TreeNode *p; int i; float diff, maxDist; cerr << "\nDEBUG: reportAllNodes\n"; for (i = 0; i < ntotal; i++) { p = ptrs[i]; // ios::sync_with_stdio(); // same design as TreeNode if (p->parent == NULL) diff = calcRootMean(p, &maxDist); else diff = calcMean(p, &maxDist, nseqs); fprintf(stdout, "i=%d p=%p: parent=%p left=%p right=%p dist=%f diff=%f\n", i, (void*)p, (void*)p->parent, (void*)p->left, (void*)p->right, p->dist, diff); } } /** * * @param ptree * @param nseqs * @return */ clustalw::TreeNode* Tree::reRoot(clustalw::TreeNode* ptree, int nseqs) { clustalw::TreeNode *p, *rootNode, *rootPtr; float diff, minDiff = 0.0, minDepth = 1.0, maxDist; int i; bool first = true; // find the difference between the means of leaf->node // distances on the left and on the right of each node rootPtr = ptree; for (i = 0; i < ntotal; i++) { p = ptrs[i]; if (p->parent == NULL) { /* AW Bug 94: p->parent must be chosen as rootNode (non-optimized executables (-O0) never do), otherwise insertRoot fails. Is p->parent == NULL valid at all? Simplest thing for now is to continue here. Tree code needs serious dismantling anyway. See debugPrintAllNodes */ continue; //diff = calcRootMean(p, &maxDist); } else { diff = calcMean(p, &maxDist, nseqs); } if ((diff == 0) || ((diff > 0) && (diff < 2 *p->dist))) { if ((maxDist < minDepth) || (first == true)) { first = false; rootPtr = p; minDepth = maxDist; minDiff = diff; } } } // insert a new node as the ancestor of the node which produces the shallowest // tree. /* AW Bug 94: could also be prevented here */ if (rootPtr == ptree) { minDiff = rootPtr->left->dist + rootPtr->right->dist; rootPtr = rootPtr->right; } rootNode = insertRoot(rootPtr, minDiff); diff = calcRootMean(rootNode, &maxDist); return rootNode; } /** * * @param p * @param diff * @return */ clustalw::TreeNode* Tree::insertRoot(clustalw::TreeNode* p, float diff) { clustalw::TreeNode *newp, *prev, *q, *t; float dist, prevDist, td; newp = avail(); if (p->parent==NULL) { // AW bug 94: question remains if access here should be handled differently cerr << "\n\n*** INTERNAL ERROR: Tree::insertRoot: TreeNode p->parent is NULL\n"; cerr << "To help us fix this bug, please send sequence file and used options to clustalw@ucd.ie\n"; exit(1); } t = p->parent; prevDist = t->dist; p->parent = newp; dist = p->dist; p->dist = diff / 2; if (p->dist < 0.0) { p->dist = 0.0; } if (p->dist > dist) { p->dist = dist; } t->dist = dist - p->dist; newp->left = t; newp->right = p; newp->parent = NULL; newp->dist = 0.0; newp->leaf = NODE; if (t->left == p) { t->left = t->parent; } else { t->right = t->parent; } prev = t; q = t->parent; t->parent = newp; while (q != NULL) { if (q->left == prev) { q->left = q->parent; q->parent = prev; td = q->dist; q->dist = prevDist; prevDist = td; prev = q; q = q->left; } else { q->right = q->parent; q->parent = prev; td = q->dist; q->dist = prevDist; prevDist = td; prev = q; q = q->right; } } /* * remove the old root node */ q = prev; if (q->left == NULL) { dist = q->dist; q = q->right; q->dist += dist; q->parent = prev->parent; if (prev->parent->left == prev) { prev->parent->left = q; } else { prev->parent->right = q; } prev->right = NULL; } else { dist = q->dist; q = q->left; q->dist += dist; q->parent = prev->parent; if (prev->parent->left == prev) { prev->parent->left = q; } else { prev->parent->right = q; } prev->left = NULL; } return (newp); } /** * * @param root * @param maxDist * @return */ float Tree::calcRootMean(clustalw::TreeNode* root, float *maxDist) { float dist, leftSum = 0.0, rightSum = 0.0, leftMean, rightMean, diff; clustalw::TreeNode* p; int i; int numLeft, numRight; int direction; // for each leaf, determine whether the leaf is left or right of the root. dist = (*maxDist) = 0; numLeft = numRight = 0; for (i = 0; i < numSeq; i++) { p = lptr[i]; dist = 0.0; while (p->parent != root) { dist += p->dist; p = p->parent; } if (p == root->left) { direction = LEFT; } else { direction = RIGHT; } dist += p->dist; if (direction == LEFT) { leftSum += dist; numLeft++; } else { rightSum += dist; numRight++; } if (dist > (*maxDist)) { *maxDist = dist; } } leftMean = leftSum / numLeft; rightMean = rightSum / numRight; diff = leftMean - rightMean; return diff; } /** * * @param nptr * @param maxDist * @param nSeqs * @return */ float Tree::calcMean(clustalw::TreeNode* nptr, float *maxDist, int nSeqs) { float dist, leftSum = 0.0, rightSum = 0.0, leftMean, rightMean, diff; clustalw::TreeNode* p; clustalw::TreeNode** pathToRoot; float *distToNode; int depth = 0, i, j, n = 0; int numLeft, numRight; int direction; bool found; pathToRoot = new clustalw::TreeNode*[nSeqs]; distToNode = new float[nSeqs]; // determine all nodes between the selected node and the root; depth = 0; (*maxDist) = dist = 0.0; numLeft = numRight = 0; p = nptr; while (p != NULL) { pathToRoot[depth] = p; dist += p->dist; distToNode[depth] = dist; p = p->parent; depth++; } // for each leaf, determine whether the leaf is left or right of the node. // (RIGHT = descendant, LEFT = not descendant) for (i = 0; i < numSeq; i++) { p = lptr[i]; if (p == nptr) { direction = RIGHT; dist = 0.0; } else { direction = LEFT; dist = 0.0; // find the common ancestor. found = false; n = 0; while ((found == false) && (p->parent != NULL)) { for (j = 0; j < depth; j++) if (p->parent == pathToRoot[j]) { found = true; n = j; } dist += p->dist; p = p->parent; } if (p == nptr) { direction = RIGHT; } } if (direction == LEFT) { leftSum += dist; leftSum += distToNode[n - 1]; numLeft++; } else { rightSum += dist; numRight++; } if (dist > (*maxDist)) { *maxDist = dist; } } delete [] distToNode; distToNode = NULL; delete [] pathToRoot; pathToRoot = NULL; leftMean = leftSum / numLeft; rightMean = rightSum / numRight; diff = leftMean - rightMean; return (diff); } /** * */ void Tree::orderNodes() { int i; clustalw::TreeNode* p; for (i = 0; i < numSeq; i++) { p = lptr[i]; while (p != NULL) { p->order++; p = p->parent; } } } /** * * @param leaf * @return */ int Tree::calcWeight(int leaf) { clustalw::TreeNode* p; float weight = 0.0; p = olptr[leaf]; while (p->parent != NULL) { weight += p->dist / p->order; p = p->parent; } weight *= 100.0; return ((int)weight); } /** * skipSpace is used to skip all the spaces at the begining of a file. The next read * will give a character other than a space. * @param file */ void Tree::skipSpace(ifstream* file) { char c; do { c = file->get(); } while (isspace(c)); file->putback(c); } /** * * @param p * @param nextGroups * @param nSeqs * @param stepsPtr */ void Tree::groupSeqs(clustalw::TreeNode* p, int *nextGroups, int nSeqs, AlignmentSteps* stepsPtr) { int i; int *tmpGroups; tmpGroups = new int[nSeqs + 1]; for (i = 0; i < nSeqs; i++) { tmpGroups[i] = 0; } if (p->left != NULL) { if (p->left->leaf == NODE) { groupSeqs(p->left, nextGroups, nSeqs, stepsPtr); for (i = 0; i < nSeqs; i++) if (nextGroups[i] != 0) { tmpGroups[i] = 1; } } else { markGroup1(p->left, tmpGroups, nSeqs); } } if (p->right != NULL) { if (p->right->leaf == NODE) { groupSeqs(p->right, nextGroups, nSeqs, stepsPtr); for (i = 0; i < nSeqs; i++) if (nextGroups[i] != 0) { tmpGroups[i] = 2; } } else { markGroup2(p->right, tmpGroups, nSeqs); } stepsPtr->saveSet(nSeqs, tmpGroups); } for (i = 0; i < nSeqs; i++) { nextGroups[i] = tmpGroups[i]; } delete [] tmpGroups; tmpGroups = NULL; } /** * * @param p * @param groups * @param n */ void Tree::markGroup1(clustalw::TreeNode* p, int *groups, int n) { int i; for (i = 0; i < n; i++) { if (olptr[i] == p) { groups[i] = 1; } else { groups[i] = 0; } } } /** * * @param p * @param groups * @param n */ void Tree::markGroup2(clustalw::TreeNode* p, int *groups, int n) { int i; for (i = 0; i < n; i++) { if (olptr[i] == p) { groups[i] = 2; } else if (groups[i] != 0) { groups[i] = 1; } } } /** * * @return */ clustalw::TreeNode* Tree::avail() { clustalw::TreeNode* p; p = new TreeNode; p->left = NULL; p->right = NULL; p->parent = NULL; p->dist = 0.0; p->leaf = 0; p->order = 0; return (p); } /** * * @param p * @param parent * @param pleaf * @param pname * @param pdist */ void Tree::setInfo(TreeNode* p, TreeNode* parent, int pleaf, string pname, float pdist) { p->parent = parent; p->leaf = pleaf; p->dist = pdist; p->order = 0; p->name = pname; if (p->leaf == true) { p->left = NULL; p->right = NULL; } } }