4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
9 #include "ClusterTree.h"
10 #include "../general/utils.h"
12 #include "RandomGenerator.h"
15 #include "../general/OutputFile.h"
20 ClusterTree::ClusterTree()
25 bootstrapPrompt = "\nEnter name for bootstrap output file ";
26 bootstrapFileTypeMsg = "Bootstrap output";
31 /** ****************************************************************************************
32 * The rest are private functions. *
36 *******************************************************************************************/
40 void ClusterTree::distanceMatrixOutput(ofstream* outFile, clustalw::DistMatrix* matToPrint,
41 clustalw::Alignment *alignPtr)
43 if(outFile == NULL || !outFile->is_open())
45 clustalw::utilityObject->error("Cannot output the distance matrix, file is not open\n");
50 int _maxNames = alignPtr->getMaxNames();
51 (*outFile) << setw(6) << lastSeq - firstSeq + 1;
53 for (i = 1; i <= lastSeq - firstSeq + 1; i++)
55 (*outFile) << "\n" << left << setw(_maxNames) << alignPtr->getName(i) << " ";
56 for (j = 1; j <= lastSeq - firstSeq + 1; j++)
58 (*outFile) << " " << setw(6) << setprecision(3) << fixed << (*matToPrint)(i, j);
61 if (j != lastSeq - firstSeq + 1)
65 if (j != lastSeq - firstSeq + 1)
74 void ClusterTree::overspillMessage(int overspill,int totalDists)
76 std::stringstream ssOverSpill;
77 ssOverSpill << overspill << " of the distances out of a total of " << totalDists
78 << "\n were out of range for the distance correction." << "\n"
79 << "\n SUGGESTIONS: 1) remove the most distant sequences"
80 << "\n or 2) use the PHYLIP package"
81 << "\n or 3) turn off the correction."
82 << "\n Note: Use option 3 with caution! With this degree"
83 << "\n of divergence you will have great difficulty"
84 << "\n getting robust and reliable trees." << "\n\n";
86 ssOverSpill >> message;
87 clustalw::utilityObject->warning(message.c_str());
91 * The function treeGapDelete flags all the positions that have a gap in any sequence.
94 // NOTE there is something wrong with using _lenFirstSeq. But this is what old clustal does.
95 void ClusterTree::treeGapDelete(clustalw::Alignment *alignPtr)
99 int _maxAlnLength = alignPtr->getMaxAlnLength();
100 int _lenFirstSeq = alignPtr->getSeqLength(firstSeq);
101 int _gapPos1 = clustalw::userParameters->getGapPos1();
102 int _gapPos2 = clustalw::userParameters->getGapPos2();
104 treeGaps.resize(_maxAlnLength + 1);
106 for (posn = 1; posn <= _lenFirstSeq; ++posn)
109 for (seqn = 1; seqn <= lastSeq - firstSeq + 1; ++seqn)
111 const vector<int>* _seqM = alignPtr->getSequence(seqn + firstSeq - 1);
113 if(posn > alignPtr->getSeqLength(seqn + firstSeq - 1))
115 break; // Dont read locations that cannot be read!
117 if (((*_seqM)[posn] == _gapPos1) || ((*_seqM)[posn] == _gapPos2))
126 int ClusterTree::dnaDistanceMatrix(ofstream* treeFile, clustalw::Alignment *alignPtr)
132 double p, q, e, a, b, k;
134 treeGapDelete(alignPtr); // flag positions with gaps (tree_gaps[i] = 1 )
139 (*treeFile) << "\n DIST = percentage divergence (/100)";
140 (*treeFile) << "\n p = rate of transition (A <-> G; C <-> T)";
141 (*treeFile) << "\n q = rate of transversion";
142 (*treeFile) << "\n Length = number of sites used in comparison";
144 if (clustalw::userParameters->getTossGaps())
146 (*treeFile) << "\n All sites with gaps (in any sequence) deleted!";
149 if (clustalw::userParameters->getKimura())
151 (*treeFile) << "\n Distances corrected by Kimura's 2 parameter model:";
152 (*treeFile) << "\n\n Kimura, M. (1980)";
153 (*treeFile) << " A simple method for estimating evolutionary ";
154 (*treeFile) << "rates of base";
155 (*treeFile) << "\n substitutions through comparative studies of ";
156 (*treeFile) << "nucleotide sequences.";
157 (*treeFile) << "\n J. Mol. Evol., 16, 111-120.";
158 (*treeFile) << "\n\n";
162 int _numSeqs = alignPtr->getNumSeqs();
163 quickDistMat.reset(new clustalw::DistMatrix(_numSeqs + 1));
164 int _lenFirstSeq = alignPtr->getSeqLength(firstSeq);
165 int _gapPos1 = clustalw::userParameters->getGapPos1();
166 int _gapPos2 = clustalw::userParameters->getGapPos2();
167 int lenSeqM, lenSeqN;
168 // for every pair of sequence
169 for (m = 1; m < lastSeq - firstSeq + 1; ++m)
171 const vector<int>* _seqM = alignPtr->getSequence(m + firstSeq - 1);
172 lenSeqM = alignPtr->getSeqLength(m + firstSeq - 1);
174 for (n = m + 1; n <= lastSeq - firstSeq + 1; ++n)
176 const vector<int>* _seqN = alignPtr->getSequence(n + firstSeq - 1);
177 lenSeqN = alignPtr->getSeqLength(n + firstSeq - 1);
180 quickDistMat->SetAt(m, n, 0.0);
181 quickDistMat->SetAt(n ,m, 0.0);
183 for (i = 1; i <= _lenFirstSeq; ++i)
185 j = bootPositions[i];
186 if (clustalw::userParameters->getTossGaps() && (treeGaps[j] > 0))
191 /** *******************************************************************************
192 * BUG!!!!!!! NOTE this was found for protein. Presuming the same here *
193 * NOTE: the following if statements were coded in so as to produce *
194 * the same distance results as the old clustal. Old clustal compares *
195 * up to the length of the first sequence. If this is longer than the *
196 * other sequences, then the -3 and 0's are compared at the end of the *
197 * array. These should not be compared, but I need to stick to this to *
198 * produce the same results as the old version for testing! *
199 **********************************************************************************/
230 if ((res1 == _gapPos1) || (res1 == _gapPos2) || (res2 == _gapPos1)
231 || (res2 == _gapPos2))
236 if (!clustalw::userParameters->getUseAmbiguities())
238 if (isAmbiguity(res1) || isAmbiguity(res2))
243 // ambiguity code in a seq
247 if (transition(res1, res2))
260 // Kimura's 2 parameter correction for multiple substitutions
262 if (!clustalw::userParameters->getKimura())
266 cerr << "\n WARNING: sequences " << m << " and " << n
267 << " are non-overlapping\n";
292 quickDistMat->SetAt(m, n, k);
293 quickDistMat->SetAt(n ,m, k);
296 (*treeFile) << setw(4) << m << " vs." << setw(4) << n << ": DIST = "
297 << setw(4) << fixed << setprecision(4)
298 << k << "; p = " << fixed << setprecision(4) << p << "; q = "
299 << fixed << setprecision(4) << q << "; length = " << setw(6)
300 << fixed << setprecision(0) << e << "\n";
307 cerr << "\n WARNING: sequences " << m << " and " << n
308 << " are non-overlapping\n";;
332 if (((2.0 *p) + q) == 1.0)
338 a = 1.0 / (1.0 - (2.0 *p) - q);
347 b = 1.0 / (1.0 - (2.0 *q));
350 // watch for values going off the scale for the correction.
351 if ((a <= 0.0) || (b <= 0.0))
354 k = 3.5; // arbitrary high score
358 k = 0.5 * log(a) + 0.25 * log(b);
360 quickDistMat->SetAt(m, n, k);
361 quickDistMat->SetAt(n ,m, k);
365 (*treeFile) << setw(4) << m << " vs." << setw(4) << n
366 << ": DIST = " << fixed << setprecision(4)
367 << k << "; p = " << fixed << setprecision(4) << p << "; q = "
368 << fixed << setprecision(4) << q << "; length = " << setw(6)
369 << fixed << setprecision(0) << e << "\n";
375 return overspill; // return the number of off-scale values
378 int ClusterTree::protDistanceMatrix(ofstream* treeFile, clustalw::Alignment *alignPtr)
384 double p, e, k, tableEntry;
386 treeGapDelete(alignPtr); // flag positions with gaps (tree_gaps[i] = 1 )
391 (*treeFile) << "\n DIST = percentage divergence (/100)";
392 (*treeFile) << "\n Length = number of sites used in comparison";
393 (*treeFile) << "\n\n";
394 if (clustalw::userParameters->getTossGaps())
396 (*treeFile) << "\n All sites with gaps (in any sequence) deleted";
399 if (clustalw::userParameters->getKimura())
402 "\n Distances up to 0.75 corrected by Kimura's empirical method:";
403 (*treeFile) << "\n\n Kimura, M. (1983)";
404 (*treeFile) << " The Neutral Theory of Molecular Evolution.";
406 "\n Page 75. Cambridge University Press, Cambridge, England.";
407 (*treeFile) << "\n\n";
410 int _numSeqs = alignPtr->getNumSeqs();
411 int _lenSeq1 = alignPtr->getSeqLength(1);
412 quickDistMat.reset(new clustalw::DistMatrix(_numSeqs + 1));
413 int _gapPos1 = clustalw::userParameters->getGapPos1();
414 int _gapPos2 = clustalw::userParameters->getGapPos2();
415 int lenSeqM, lenSeqN;
416 // for every pair of sequence
417 for (m = 1; m < _numSeqs; ++m)
419 const vector<int>* _seqM = alignPtr->getSequence(m);
420 lenSeqM = alignPtr->getSeqLength(m);
421 for (n = m + 1; n <= _numSeqs; ++n)
423 const vector<int>* _seqN = alignPtr->getSequence(n);
424 lenSeqN = alignPtr->getSeqLength(n);
426 quickDistMat->SetAt(m, n, 0.0);
427 quickDistMat->SetAt(n ,m, 0.0);
428 for (i = 1; i <= _lenSeq1; ++i) // It may be this here!
430 j = bootPositions[i];
431 if (clustalw::userParameters->getTossGaps() && (treeGaps[j] > 0))
436 /** *******************************************************************************
438 * NOTE: the following if statements were coded in so as to produce *
439 * the same distance results as the old clustal. Old clustal compares *
440 * up to the length of the first sequence. If this is longer than the *
441 * other sequences, then the -3 and 0's are compared at the end of the *
442 * array. These should not be compared, but I need to stick to this to *
443 * produce the same results as the old version for testing! *
444 **********************************************************************************/
475 if ((res1 == _gapPos1) || (res1 == _gapPos2) || (res2 == _gapPos1)
476 || (res2 == _gapPos2))
498 if (clustalw::userParameters->getKimura())
502 // use Kimura's formula
505 k = - log(1.0 - k - (k * k / 5.0));
513 k = 10.0; // arbitrarily set to 1000%
515 else // dayhoff_pams is from dayhoff.h file
517 tableEntry = (k * 1000.0) - 750.0;
518 k = (double)dayhoff_pams[(int)tableEntry];
524 quickDistMat->SetAt(m, n, k);
525 quickDistMat->SetAt(n ,m, k);
528 (*treeFile) << setw(4) << m << " vs." << setw(4) << n
529 << " DIST = " << fixed << setprecision(4)
530 << k << "; length = " << setw(6)
531 << setprecision(0) << e << "\n";
538 bool ClusterTree::isAmbiguity(int c)
541 char codes[] = "ACGTU";
543 if (clustalw::userParameters->getUseAmbiguities() == true)
548 for (i = 0; i < 5; i++)
549 if (clustalw::userParameters->getAminoAcidCode(c) == codes[i])
558 * The function calcPercIdentity calculates the percent identity of the sequences
559 * and outputs it to a the file pfile. NOTE this is not used at the moment. It was in
560 * the old code, but there was no way to access it from the menu. This may change.
562 void ClusterTree::calcPercIdentity(ofstream* pfile, clustalw::Alignment *alignPtr)
564 clustalw::DistMatrix percentMat;
571 int i,j,k, length_longest;
575 // findout sequence length, longest and shortest;
579 int _numSeqs = alignPtr->getNumSeqs();
581 for (i = 1; i <= _numSeqs; i++)
583 _seqLength = alignPtr->getSeqLength(i);
584 if (length_longest < _seqLength)
586 length_longest = _seqLength;
589 if (length_shortest > _seqLength)
591 length_shortest = _seqLength;
596 percentMat.ResizeRect(_numSeqs + 1);
598 int _lenSeqI, _lenSeqJ;
599 int _maxAA = clustalw::userParameters->getMaxAA();
601 for (i = 1; i <= _numSeqs; i++)
603 const vector<int>* _seqI = alignPtr->getSequence(i);
604 _lenSeqI = alignPtr->getSeqLength(i);
606 for (j = i; j <= _numSeqs; j++)
608 const vector<int>* _seqJ = alignPtr->getSequence(j);
609 _lenSeqJ = alignPtr->getSeqLength(j);
611 cout << "\n " << alignPtr->getName(j) << " ";
614 for(k = 1; k <= length_longest; k++)
616 if((k > _lenSeqI) || (k > _lenSeqJ))
623 if (((val1 < 0) || (val1 > _maxAA)) || ((val2 < 0) || (val2 > _maxAA)))
625 continue; // residue = '-';
637 ident = ident/nmatch * 100.0 ;
638 percentMat.SetAt(i, j, ident);
639 percentMat.SetAt(j, i, ident);
643 int _maxNameSize = alignPtr->getMaxNames();
645 (*pfile) << "#\n#\n# Percent Identity Matrix - created by Clustal"
646 << clustalw::userParameters->getRevisionLevel() << " \n#\n#\n";
647 for(i = 1; i <= _numSeqs; i++)
649 (*pfile) << "\n " << right << setw(5) << i << ": ";
650 (*pfile) << left << setw(_maxNameSize) << alignPtr->getName(i);
652 for(j = 1; j <= _numSeqs; j++)
654 (*pfile) << setw(8) << right << fixed << setprecision(0) << percentMat(i, j);
661 void ClusterTree::compareTree(PhyloTree* tree1, PhyloTree* tree2, vector<int>* hits, int n)
666 for (i = 1; i <= n - 3; i++)
668 for (j = 1; j <= n - 3; j++)
672 for (k = 1; k <= n; k++)
674 if (tree1->treeDesc[i][k] == tree2->treeDesc[j][k])
678 if (tree1->treeDesc[i][k] != tree2->treeDesc[j][k])
683 if ((nhits1 == lastSeq - firstSeq + 1) || (nhits2 == lastSeq -
693 * NOTE this will go into the OutputFile class and will not be needed here anymore.
696 /*string ClusterTree::getOutputFileName(const string prompt, string path,
697 const string fileExtension)
700 string _fileName; // Will return this name.
702 _fileName = path + fileExtension;
704 if(_fileName.compare(clustalw::userParameters->getSeqName()) == 0)
706 cout << "Output file name is the same as input file.\n";
707 if (clustalw::userParameters->getMenuFlag())
709 message = "\n\nEnter new name to avoid overwriting [" + _fileName + "]: ";
710 clustalw::utilityObject->getStr(message, temp);
717 else if (clustalw::userParameters->getMenuFlag())
720 message = prompt + " [" + _fileName + "]";
721 clustalw::utilityObject->getStr(message, temp);
731 bool ClusterTree::transition(int base1, int base2)
733 // assumes that the bases of DNA sequences have been translated as
734 // a,A = 0; c,C = 1; g,G = 2; t,T,u,U = 3; N = 4;
735 // a,A = 0; c,C = 2; g,G = 6; t,T,u,U =17;
736 // A <--> G and T <--> C are transitions; all others are transversions.
737 if (((base1 == 0) && (base2 == 6)) || ((base1 == 6) && (base2 == 0)))
742 if (((base1 == 17) && (base2 == 2)) || ((base1 == 2) && (base2 == 17)))
751 * This function is used to open all the bootstrap tree files. It opens them with the
752 * correct message prompt.
754 bool ClusterTree::openFilesForBootstrap(clustalw::OutputFile* clustalFile, clustalw::OutputFile* phylipFile,
755 clustalw::OutputFile* nexusFile, clustalw::TreeNames* treeNames, string* path)
757 if (clustalw::userParameters->getOutputTreeClustal())
759 if(!clustalFile || !clustalFile->openFile(&(treeNames->clustalName),
760 bootstrapPrompt, path, "njb", bootstrapFileTypeMsg))
766 if (clustalw::userParameters->getOutputTreePhylip())
768 if(!phylipFile || !phylipFile->openFile(&(treeNames->phylipName),
769 bootstrapPrompt, path, "phb", bootstrapFileTypeMsg))
775 if (clustalw::userParameters->getOutputTreeNexus())
777 if(!nexusFile || !nexusFile->openFile(&(treeNames->nexusName),
778 bootstrapPrompt, path, "treb", bootstrapFileTypeMsg))
786 bool ClusterTree::openFilesForTreeFromAlignment(clustalw::OutputFile* clustalFile,
787 clustalw::OutputFile* phylipFile, clustalw::OutputFile* distFile, clustalw::OutputFile* nexusFile, clustalw::OutputFile* pimFile,
788 clustalw::TreeNames* treeNames, string* path)
790 if (clustalw::userParameters->getOutputTreeClustal())
792 if(!clustalFile || !clustalFile->openFile(&(treeNames->clustalName),
793 "\nEnter name for CLUSTAL tree output file ",
794 path, "nj", "Phylogenetic tree"))
800 if (clustalw::userParameters->getOutputTreePhylip())
802 if(!phylipFile || !phylipFile->openFile(&(treeNames->phylipName),
803 "\nEnter name for PHYLIP tree output file ", path, "ph",
804 "Phylogenetic tree"))
810 if (clustalw::userParameters->getOutputTreeDistances())
812 if(!distFile || !distFile->openFile(&(treeNames->distName),
813 "\nEnter name for distance matrix output file ",
814 path, "dst", "Distance matrix"))
820 if (clustalw::userParameters->getOutputTreeNexus())
822 if(!nexusFile || !nexusFile->openFile(&(treeNames->nexusName),
823 "\nEnter name for NEXUS tree output file ", path,
824 "tre", "Nexus tree"))
830 if (clustalw::userParameters->getOutputPim())
832 if(!pimFile || !pimFile->openFile(&(treeNames->pimName),
833 "\nEnter name for % Identity matrix output file ", path, "pim",
842 int ClusterTree::calcQuickDistMatForAll(ofstream* clustalFile, ofstream* phylipFile,
843 ofstream* nexusFile, ofstream* pimFile, ofstream* distFile,
844 clustalw::Alignment* alignPtr)
847 bool _DNAFlag = clustalw::userParameters->getDNAFlag();
849 overspill = calcQuickDistMatForSubSet(clustalFile, phylipFile, nexusFile, alignPtr);
851 if (pimFile && clustalw::userParameters->getOutputPim())
853 verbose = false; // Turn off file output
856 calcPercIdentity(pimFile, alignPtr);
860 calcPercIdentity(pimFile, alignPtr);
865 if (distFile && clustalw::userParameters->getOutputTreeDistances())
867 verbose = false; // Turn off file output
870 overspill = dnaDistanceMatrix(distFile, alignPtr);
874 overspill = protDistanceMatrix(distFile, alignPtr);
876 distanceMatrixOutput(distFile, quickDistMat.get(),
882 int ClusterTree::calcQuickDistMatForSubSet(ofstream* clustalFile, ofstream* phylipFile,
883 ofstream* nexusFile, clustalw::Alignment* alignPtr,
887 bool _DNAFlag = clustalw::userParameters->getDNAFlag();
889 if (clustalFile && clustalw::userParameters->getOutputTreeClustal())
893 verbose = true; // Turn on file output
897 verbose = false; // Turn off when we are in the loop in bootstrap!
901 overspill = dnaDistanceMatrix(clustalFile, alignPtr);
905 overspill = protDistanceMatrix(clustalFile, alignPtr);
909 if (phylipFile && clustalw::userParameters->getOutputTreePhylip())
911 verbose = false; // Turn off file output
914 overspill = dnaDistanceMatrix(phylipFile, alignPtr);
918 overspill = protDistanceMatrix(phylipFile, alignPtr);
922 if (nexusFile && clustalw::userParameters->getOutputTreeNexus())
924 verbose = false; // Turn off file output
927 overspill = dnaDistanceMatrix(nexusFile, alignPtr);
931 overspill = protDistanceMatrix(nexusFile, alignPtr);
937 void ClusterTree::printBootstrapHeaderToClustalFile(ofstream* clustalFile)
941 (*clustalFile) << "\n\n\t\t\tBootstrap Confidence Limits\n\n";
942 (*clustalFile) << "\n Random number generator seed = "
944 << clustalw::userParameters->getBootRanSeed() << "\n";
945 (*clustalFile) << "\n Number of bootstrap trials = " << setw(7)
946 << clustalw::userParameters->getBootNumTrials() << "\n";
947 (*clustalFile) << "\n\n Diagrammatic representation of the above tree: \n";
948 (*clustalFile) << "\n Each row represents 1 tree cycle;";
949 (*clustalFile) << " defining 2 groups.\n";
950 (*clustalFile) << "\n Each column is 1 sequence; ";
951 (*clustalFile) << "the stars in each line show 1 group; ";
952 (*clustalFile) << "\n the dots show the other\n";
953 (*clustalFile) << "\n Numbers show occurences in bootstrap samples.";
957 void ClusterTree::promptForBoolSeedAndNumTrials()
959 if (clustalw::userParameters->getMenuFlag())
961 unsigned int tempSeed;
962 tempSeed = clustalw::utilityObject->getInt(
963 "\n\nEnter seed no. for random number generator ", 1, 1000,
964 clustalw::userParameters->getBootRanSeed());
965 clustalw::userParameters->setBootRanSeed(tempSeed);
967 clustalw::userParameters->setBootNumTrials(
968 clustalw::utilityObject->getInt("\n\nEnter number of bootstrap trials ",
969 1, 10000, clustalw::userParameters->getBootNumTrials()));
973 void ClusterTree::printErrorMessageForBootstrap(int totalOverspill, int totalDists,
977 cerr << "\n WARNING: " << totalOverspill
978 << " of the distances out of a total of "
979 << totalDists << " times" << clustalw::userParameters->getBootNumTrials();
980 cerr << "\n were out of range for the distance correction.";
981 cerr << "\n This affected " << nfails << " out of "
982 << clustalw::userParameters->getBootNumTrials() << " bootstrap trials.";
983 cerr << "\n This may not be fatal but you have been warned!" << "\n";
984 cerr << "\n SUGGESTIONS: 1) turn off the correction";
985 cerr << "\n or 2) remove the most distant sequences";
986 cerr << "\n or 3) use the PHYLIP package.\n\n";
988 if (clustalw::userParameters->getMenuFlag())
991 clustalw::utilityObject->getStr(string("Press [RETURN] to continue"), dummy);
995 bool ClusterTree::checkIfConditionsMet(int numSeqs, int min)
997 if (clustalw::userParameters->getEmpty())
999 clustalw::utilityObject->error("You must load an alignment first");
1005 clustalw::utilityObject->error("Alignment has only %d sequences", numSeqs);