4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
22 void Tree::calcSeqWeights(int firstSeq, int lastSeq, vector<int>* sweight)
24 if((int)sweight->size() < lastSeq - 1)
26 sweight->resize(lastSeq - 1);
33 * If there are more than three sequences....
35 _nSeqs = lastSeq - firstSeq;
36 if ((_nSeqs >= 2) && (clustalw::userParameters->getDistanceTree() == true) &&
37 (clustalw::userParameters->getNoWeights() == false))
40 * Calculate sequence weights based on Phylip tree.
43 weight = new int[lastSeq + 1];
45 for (i = firstSeq; i < lastSeq; i++)
47 weight[i] = calcWeight(i);
51 * Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
55 for (i = firstSeq; i < lastSeq; i++)
62 for (i = firstSeq; i < lastSeq; i++)
69 for (i = firstSeq; i < lastSeq; i++)
71 (*sweight)[i] = (weight[i] * INT_SCALE_FACTOR) / sum;
72 if ((*sweight)[i] < 1)
85 * Otherwise, use identity weights.
87 temp = INT_SCALE_FACTOR / _nSeqs;
88 // AW 2009-05-28: goes wrong if we have more than
89 // INT_SCALE_FACTOR seqs. if so, set to 1, just as above
93 for (i = firstSeq; i < lastSeq; i++)
104 * @param treeFileName
109 int Tree::readTree(clustalw::Alignment* alignPtr, const string& treeFileName, int firstSeq, int lastSeq)
112 if(alignPtr == NULL || firstSeq < 0 || lastSeq < 1)
117 string name1 = "", name2 = "";
126 // Need to check what happens if I try to open a file that doesnt exist!
127 if(treeFileName.empty())
129 return 0; // Cannot open, empty string!
133 file.open(treeFileName.c_str(), ifstream::in);
136 clustalw::utilityObject->error("Cannot open output file [%s]\n", treeFileName.c_str());
141 charFromFile = file.get();
143 if (charFromFile != '(')
145 clustalw::utilityObject->error("Wrong format in tree file %s\n", treeFileName.c_str());
148 file.seekg(0, ios::beg); // Put get pointer back to the begining!
150 clustalw::userParameters->setDistanceTree(true);
153 // Allocate memory for tree
156 nptr = new TreeNode*[3 * (lastSeq - firstSeq + 1)];
157 ptrs = new TreeNode*[3 * (lastSeq - firstSeq + 1)];
158 lptr = new TreeNode*[lastSeq - firstSeq + 1];
159 olptr = new TreeNode*[lastSeq + 1];
162 setInfo(seqTree, NULL, 0, string(""), 0.0);
164 createTree(seqTree, NULL, &file);
167 if (numSeq != lastSeq - firstSeq)
170 ss << "tree not compatible with alignment\n(" << lastSeq - firstSeq
171 << " sequences in alignment and "<< numSeq <<" in tree\n";
174 clustalw::utilityObject->error(errorMes.c_str());
178 // If the tree is unrooted, reroot the tree - ie. minimise the difference
179 // between the mean root->leaf distances for the left and right branches of
182 if (clustalw::userParameters->getDistanceTree() == false)
184 if (rootedTree == false)
186 clustalw::utilityObject->error("input tree is unrooted and has no distances.\nCannot align sequences");
191 if (rootedTree == false)
193 root = reRoot(seqTree, lastSeq - firstSeq + 1);
200 // calculate the 'order' of each node.
207 // If there are more than three sequences....
208 // assign the sequence nodes (in the same order as in the alignment file)
209 for (i = firstSeq; i < lastSeq; i++)
211 nameLength = alignPtr->getName(i + 1).length();
212 nameSeq = alignPtr->getName(i + 1);
215 if (nameLength > clustalw::MAXNAMES)
218 ss << "name " << nameSeq << " is too long for PHYLIP tree format (max "
219 << clustalw::MAXNAMES << " chars)";
222 clustalw::utilityObject->warning(msg.c_str());// Mark change 17-5-07
225 for (k = 0; k < nameLength && k < clustalw::MAXNAMES; k++)
228 if ((c > 0x40) && (c < 0x5b))
240 for (j = 0; j < numSeq; j++)
243 for (k = 0; k < (int)lptr[j]->name.length() && k < clustalw::MAXNAMES; k++)
245 c = lptr[j]->name[k];
246 if ((c > 0x40) && (c < 0x5b))
254 if (name1.compare(name2) == 0)
263 utilityObject->error("tree not compatible with alignment:\n %s not found\n", name2.c_str());
278 auto_ptr<AlignmentSteps> Tree::createSets(int firstSeq, int lastSeq)
280 auto_ptr<AlignmentSteps> progAlignSteps;
281 progAlignSteps.reset(new AlignmentSteps);
286 _nSeqs = lastSeq - firstSeq;
289 // If there are more than three sequences....
290 groups = new int[_nSeqs + 1];
291 groupSeqs(root, groups, _nSeqs, progAlignSteps.get());
300 groups = new int[_nSeqs + 1];
301 for (i = 0; i < _nSeqs - 1; i++)
303 for (j = 0; j < _nSeqs; j++)
317 progAlignSteps->saveSet(_nSeqs, groups);
323 return progAlignSteps;
327 * calcSimilarities changes the distMat.
332 int Tree::calcSimilarities(clustalw::Alignment* alignPtr, clustalw::DistMatrix* distMat)
334 int depth = 0, i, j, k, n;
336 int nerrs, seq1[MAXERRS], seq2[MAXERRS];
337 TreeNode *p, **pathToRoot;
339 float *distToNode, badDist[MAXERRS];
343 int nSeqs = alignPtr->getNumSeqs();
345 pathToRoot = new TreeNode*[nSeqs];
346 distToNode = new float[nSeqs];
347 dmat = new double*[nSeqs];
349 for (i = 0; i < nSeqs; i++)
351 dmat[i] = new double[nSeqs];
357 * for each leaf, determine all nodes between the leaf and the root;
359 for (i = 0; i < nSeqs; i++)
361 depth = 0;dist = 0.0;
365 pathToRoot[depth] = p;
367 distToNode[depth] = dist;
375 for (j = 0; j < i; j++)
380 * find the common ancestor.
384 while ((found == false) && (p->parent != NULL))
386 for (k = 0; k < depth; k++)
387 if (p->parent == pathToRoot[k])
397 dmat[i][j] = dist + distToNode[n - 1];
402 for (i = 0; i < nSeqs; i++)
405 for (j = 0; j < i; j++)
407 if (dmat[i][j] < 0.01)
411 if (dmat[i][j] > 1.0)
413 if (dmat[i][j] > 1.1 && nerrs < MAXERRS)
417 badDist[nerrs] = dmat[i][j];
424 if (nerrs > 0 && !userParameters->getGui())
426 string errMess = "The following sequences are too divergent to be aligned:\n";
428 for (i = 0; i < nerrs && i < 5; i++)
430 err1 << " " << alignPtr->getName(seq1[i] + 1) << " and "
431 << alignPtr->getName(seq2[i] + 1) << " (distance "
432 << setprecision(3) << badDist[i] << ")\n";
434 errMess += err1.str();
435 errMess += "(All distances should be between 0.0 and 1.0)\n";
436 errMess += "This may not be fatal but you have been warned!\n";
437 errMess += "SUGGESTION: Remove one or more problem sequences and try again";
439 if (clustalw::userParameters->getInteractive())
441 reply = clustalw::utilityObject->promptForYesNo(errMess.c_str(), "Continue ");
447 if ((reply != 'y') && (reply != 'Y'))
455 for (i = 0; i < nSeqs; i++)
457 for (j = 0; j < i; j++)
459 dmat[i][j] = (*distMat)(i + 1, j + 1);
467 for (i = 0; i < nSeqs; i++)
469 distMat->SetAt(i + 1, i + 1, 0.0);
470 for (j = 0; j < i; j++)
472 value = 100.0 - (dmat[i][j]) * 100.0;
473 distMat->SetAt(i + 1, j + 1, value);
474 distMat->SetAt(j + 1, i + 1, value);
478 for (i = 0; i < nSeqs; i++)
489 /** *************************************************************************
490 * Private functions!!!!!!!!!!!!!!! *
491 ****************************************************************************/
499 void Tree::createTree(clustalw::TreeNode* ptree, clustalw::TreeNode* parent, ifstream* file)
508 // is this a node or a leaf ?
510 charFromFile = file->get();
511 if (charFromFile == '(')
513 // this must be a node....
516 ptrs[ntotal] = nptr[nnodes] = ptree;
520 createNode(ptree, parent);
523 createTree(p, ptree, file);
525 if (charFromFile == ',')
528 createTree(p, ptree, file);
529 if (charFromFile == ',')
531 ptree = insertNode(ptree);
532 ptrs[ntotal] = nptr[nnodes] = ptree;
536 createTree(p, ptree, file);
542 charFromFile = file->get();
545 // ...otherwise, this is a leaf
550 ptrs[ntotal++] = lptr[numSeq++] = ptree;
551 // get the sequence name
553 name += charFromFile;
554 charFromFile = file->get();
557 while ((charFromFile != ':') && (charFromFile != ',') && (charFromFile != ')'))
561 name += charFromFile;
564 charFromFile = file->get();
567 if (charFromFile != ':')
569 clustalw::userParameters->setDistanceTree(false);
574 // get the distance information
577 if (charFromFile == ':')
582 charFromFile = file->get();
584 setInfo(ptree, parent, type, name, dist);
592 void Tree::createNode(TreeNode* pptr, TreeNode* parent)
595 pptr->parent = parent;
607 TreeNode* Tree::insertNode(TreeNode* pptr)
613 createNode(newnode, pptr->parent);
615 newnode->left = pptr;
616 pptr->parent = newnode;
618 setInfo(newnode, pptr->parent, NODE, "", 0.0);
627 void Tree::clearTree(clustalw::TreeNode* p)
644 void Tree::clearTreeNodes(clustalw::TreeNode* p)
652 clearTreeNodes(p->left);
654 if (p->right != NULL)
656 clearTreeNodes(p->right);
674 void Tree::debugPrintAllNodes(int nseqs)
676 clustalw::TreeNode *p;
680 cerr << "\nDEBUG: reportAllNodes\n";
681 for (i = 0; i < ntotal; i++) {
683 // ios::sync_with_stdio();
685 // same design as TreeNode
686 if (p->parent == NULL)
687 diff = calcRootMean(p, &maxDist);
689 diff = calcMean(p, &maxDist, nseqs);
691 "i=%d p=%p: parent=%p left=%p right=%p dist=%f diff=%f\n",
692 i, (void*)p, (void*)p->parent, (void*)p->left, (void*)p->right,
707 clustalw::TreeNode* Tree::reRoot(clustalw::TreeNode* ptree, int nseqs)
709 clustalw::TreeNode *p, *rootNode, *rootPtr;
710 float diff, minDiff = 0.0, minDepth = 1.0, maxDist;
714 // find the difference between the means of leaf->node
715 // distances on the left and on the right of each node
717 for (i = 0; i < ntotal; i++)
720 if (p->parent == NULL)
722 /* AW Bug 94: p->parent must be chosen as rootNode
723 (non-optimized executables (-O0) never do), otherwise
725 Is p->parent == NULL valid at all?
726 Simplest thing for now is to continue here. Tree code
727 needs serious dismantling anyway. See debugPrintAllNodes
731 //diff = calcRootMean(p, &maxDist);
735 diff = calcMean(p, &maxDist, nseqs);
738 if ((diff == 0) || ((diff > 0) && (diff < 2 *p->dist)))
740 if ((maxDist < minDepth) || (first == true))
752 // insert a new node as the ancestor of the node which produces the shallowest
754 /* AW Bug 94: could also be prevented here */
755 if (rootPtr == ptree)
757 minDiff = rootPtr->left->dist + rootPtr->right->dist;
758 rootPtr = rootPtr->right;
760 rootNode = insertRoot(rootPtr, minDiff);
762 diff = calcRootMean(rootNode, &maxDist);
773 clustalw::TreeNode* Tree::insertRoot(clustalw::TreeNode* p, float diff)
775 clustalw::TreeNode *newp, *prev, *q, *t;
776 float dist, prevDist, td;
780 if (p->parent==NULL) {
781 // AW bug 94: question remains if access here should be handled differently
782 cerr << "\n\n*** INTERNAL ERROR: Tree::insertRoot: TreeNode p->parent is NULL\n";
783 cerr << "To help us fix this bug, please send sequence file and used options to clustalw@ucd.ie\n";
806 t->dist = dist - p->dist;
820 t->right = t->parent;
842 q->right = q->parent;
853 * remove the old root node
861 q->parent = prev->parent;
863 if (prev->parent->left == prev)
865 prev->parent->left = q;
869 prev->parent->right = q;
879 q->parent = prev->parent;
881 if (prev->parent->left == prev)
883 prev->parent->left = q;
887 prev->parent->right = q;
902 float Tree::calcRootMean(clustalw::TreeNode* root, float *maxDist)
904 float dist, leftSum = 0.0, rightSum = 0.0, leftMean, rightMean, diff;
905 clustalw::TreeNode* p;
907 int numLeft, numRight;
910 // for each leaf, determine whether the leaf is left or right of the root.
912 dist = (*maxDist) = 0;
913 numLeft = numRight = 0;
914 for (i = 0; i < numSeq; i++)
918 while (p->parent != root)
935 if (direction == LEFT)
945 if (dist > (*maxDist))
951 leftMean = leftSum / numLeft;
952 rightMean = rightSum / numRight;
954 diff = leftMean - rightMean;
965 float Tree::calcMean(clustalw::TreeNode* nptr, float *maxDist, int nSeqs)
967 float dist, leftSum = 0.0, rightSum = 0.0, leftMean, rightMean, diff;
968 clustalw::TreeNode* p;
969 clustalw::TreeNode** pathToRoot;
971 int depth = 0, i, j, n = 0;
972 int numLeft, numRight;
976 pathToRoot = new clustalw::TreeNode*[nSeqs];
977 distToNode = new float[nSeqs];
978 // determine all nodes between the selected node and the root;
981 (*maxDist) = dist = 0.0;
982 numLeft = numRight = 0;
986 pathToRoot[depth] = p;
988 distToNode[depth] = dist;
993 // for each leaf, determine whether the leaf is left or right of the node.
994 // (RIGHT = descendant, LEFT = not descendant)
996 for (i = 0; i < numSeq; i++)
1009 // find the common ancestor.
1013 while ((found == false) && (p->parent != NULL))
1015 for (j = 0; j < depth; j++)
1016 if (p->parent == pathToRoot[j])
1031 if (direction == LEFT)
1034 leftSum += distToNode[n - 1];
1043 if (dist > (*maxDist))
1049 delete [] distToNode;
1051 delete [] pathToRoot;
1054 leftMean = leftSum / numLeft;
1055 rightMean = rightSum / numRight;
1057 diff = leftMean - rightMean;
1064 void Tree::orderNodes()
1067 clustalw::TreeNode* p;
1069 for (i = 0; i < numSeq; i++)
1085 int Tree::calcWeight(int leaf)
1087 clustalw::TreeNode* p;
1091 while (p->parent != NULL)
1093 weight += p->dist / p->order;
1099 return ((int)weight);
1103 * skipSpace is used to skip all the spaces at the begining of a file. The next read
1104 * will give a character other than a space.
1107 void Tree::skipSpace(ifstream* file)
1127 void Tree::groupSeqs(clustalw::TreeNode* p, int *nextGroups, int nSeqs, AlignmentSteps* stepsPtr)
1132 tmpGroups = new int[nSeqs + 1];
1133 for (i = 0; i < nSeqs; i++)
1138 if (p->left != NULL)
1140 if (p->left->leaf == NODE)
1142 groupSeqs(p->left, nextGroups, nSeqs, stepsPtr);
1144 for (i = 0; i < nSeqs; i++)
1145 if (nextGroups[i] != 0)
1152 markGroup1(p->left, tmpGroups, nSeqs);
1157 if (p->right != NULL)
1159 if (p->right->leaf == NODE)
1161 groupSeqs(p->right, nextGroups, nSeqs, stepsPtr);
1162 for (i = 0; i < nSeqs; i++)
1163 if (nextGroups[i] != 0)
1170 markGroup2(p->right, tmpGroups, nSeqs);
1172 stepsPtr->saveSet(nSeqs, tmpGroups);
1175 for (i = 0; i < nSeqs; i++)
1177 nextGroups[i] = tmpGroups[i];
1180 delete [] tmpGroups;
1190 void Tree::markGroup1(clustalw::TreeNode* p, int *groups, int n)
1194 for (i = 0; i < n; i++)
1213 void Tree::markGroup2(clustalw::TreeNode* p, int *groups, int n)
1217 for (i = 0; i < n; i++)
1223 else if (groups[i] != 0)
1234 clustalw::TreeNode* Tree::avail()
1236 clustalw::TreeNode* p;
1255 void Tree::setInfo(TreeNode* p, TreeNode* parent, int pleaf, string pname, float
1263 if (p->leaf == true)