1 /////////////////////////////////////////////////////////////////
3 /////////////////////////////////////////////////////////////////
5 #include "SafeVector.h"
6 #include "MultiSequence.h"
9 #include "ProbabilisticModel.h"
10 #include "EvolutionaryTree.h"
11 #include "SparseMatrix.h"
23 string matrixFilename = "";
24 string parametersInputFilename = "";
25 string parametersOutputFilename = "no training";
27 bool enableTraining = false;
28 bool enableVerbose = false;
29 int numConsistencyReps = 2;
30 int numPreTrainingReps = 0;
31 int numIterativeRefinementReps = 100;
33 float gapOpenPenalty = 0;
34 float gapContinuePenalty = 0;
35 VF initDistrib (NumMatrixTypes);
36 VF gapOpen (2*NumInsertStates);
37 VF gapExtend (2*NumInsertStates);
38 SafeVector<char> alphabet;
42 const int MIN_PRETRAINING_REPS = 0;
43 const int MAX_PRETRAINING_REPS = 20;
44 const int MIN_CONSISTENCY_REPS = 0;
45 const int MAX_CONSISTENCY_REPS = 5;
46 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
47 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
49 /////////////////////////////////////////////////////////////////
50 // Function prototypes
51 /////////////////////////////////////////////////////////////////
54 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
55 const VF &gapExtend, const char *filename);
56 MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model);
57 SafeVector<string> ParseParams (int argc, char **argv);
58 void ReadParameters ();
59 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
60 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
61 const ProbabilisticModel &model);
62 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
63 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
64 const ProbabilisticModel &model);
65 void DoRelaxation (MultiSequence *sequences, SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
66 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
67 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
68 const ProbabilisticModel &model, MultiSequence* &alignment);
69 //float ScoreAlignment (MultiSequence *alignment, MultiSequence *sequences, SparseMatrix **sparseMatrices, const int numSeqs);
71 /////////////////////////////////////////////////////////////////
74 // Calls all initialization routines and runs the PROBCONS
76 /////////////////////////////////////////////////////////////////
78 int main (int argc, char **argv){
81 cerr << "Usage: FixRef inputfile reffile" << endl;
85 string inputFilename = string (argv[1]);
86 string refFilename = string (argv[2]);
90 // build new model for aligning
91 ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
92 alphabet, emitPairs, emitSingle);
94 MultiSequence *inputSeq = new MultiSequence(); inputSeq->LoadMFA (inputFilename);
95 MultiSequence *refSeq = new MultiSequence(); refSeq->LoadMFA (refFilename);
97 SafeVector<char> *ali = new SafeVector<char>;
99 if (refSeq->GetNumSequences() != 2){
100 cerr << "ERROR: Expected two sequences in reference alignment." << endl;
103 set<int> s; s.insert (0);
104 MultiSequence *ref1 = refSeq->Project (s);
105 s.clear(); s.insert (1);
106 MultiSequence *ref2 = refSeq->Project (s);
108 for (int i = 0; i < inputSeq->GetNumSequences(); i++){
109 if (inputSeq->GetSequence(i)->GetHeader() == ref1->GetSequence(0)->GetHeader()){
110 ref1->AddSequence (inputSeq->GetSequence(i)->Clone());
112 if (inputSeq->GetSequence(i)->GetHeader() == ref2->GetSequence(0)->GetHeader())
113 ref2->AddSequence (inputSeq->GetSequence(i)->Clone());
115 if (ref1->GetNumSequences() != 2){
116 cerr << "ERROR: Expected two sequences in reference1 alignment." << endl;
119 if (ref2->GetNumSequences() != 2){
120 cerr << "ERROR: Expected two sequences in reference2 alignment." << endl;
124 ref1->GetSequence(0)->SetLabel(0);
125 ref2->GetSequence(0)->SetLabel(0);
126 ref1->GetSequence(1)->SetLabel(1);
127 ref2->GetSequence(1)->SetLabel(1);
129 // cerr << "Aligning..." << endl;
131 // now, we can perform the alignments and write them out
132 MultiSequence *alignment1 = DoAlign (ref1,
133 ProbabilisticModel (initDistrib, gapOpen, gapExtend,
134 alphabet, emitPairs, emitSingle));
136 //cerr << "Aligning second..." << endl;
137 MultiSequence *alignment2 = DoAlign (ref2,
138 ProbabilisticModel (initDistrib, gapOpen, gapExtend,
139 alphabet, emitPairs, emitSingle));
141 SafeVector<char>::iterator iter1 = alignment1->GetSequence(0)->GetDataPtr();
142 SafeVector<char>::iterator iter2 = alignment1->GetSequence(1)->GetDataPtr();
143 for (int i = 1; i <= alignment1->GetSequence(0)->GetLength(); i++){
144 if (islower(iter1[i])) iter2[i] = tolower(iter2[i]);
145 if (isupper(iter1[i])) iter2[i] = toupper(iter2[i]);
147 iter1 = alignment2->GetSequence(0)->GetDataPtr();
148 iter2 = alignment2->GetSequence(1)->GetDataPtr();
149 for (int i = 1; i <= alignment2->GetSequence(0)->GetLength(); i++){
150 if (islower(iter1[i])) iter2[i] = tolower(iter2[i]);
151 if (isupper(iter1[i])) iter2[i] = toupper(iter2[i]);
153 //alignment1->WriteMFA (cout);
154 //alignment2->WriteMFA (cout);
159 for (int i = 1; i <= refSeq->GetSequence(0)->GetLength(); i++){
161 // catch up in filler sequences
162 if (refSeq->GetSequence(0)->GetPosition(i) != '-'){
165 if (alignment1->GetSequence(0)->GetPosition(a) != '-') break;
166 ali->push_back ('X');
169 if (refSeq->GetSequence(1)->GetPosition(i) != '-'){
172 if (alignment2->GetSequence(0)->GetPosition(b) != '-') break;
173 ali->push_back ('Y');
177 if (refSeq->GetSequence(0)->GetPosition(i) != '-' &&
178 refSeq->GetSequence(1)->GetPosition(i) != '-'){
179 //cerr << "M: " << refSeq->GetSequence(0)->GetPosition(i) << refSeq->GetSequence(1)->GetPosition(i) << endl;
180 ali->push_back ('B');
182 else if (refSeq->GetSequence(0)->GetPosition(i) != '-'){
183 //cerr << "X" << endl;
184 ali->push_back ('X');
186 else if (refSeq->GetSequence(1)->GetPosition(i) != '-'){
187 //cerr << "Y" << endl;
188 ali->push_back ('Y');
192 while (a < alignment1->GetSequence(0)->GetLength()){
194 ali->push_back ('X');
195 if (alignment1->GetSequence(0)->GetPosition(a) != '-') a1++;
197 while (b < alignment2->GetSequence(0)->GetLength()){
199 ali->push_back ('Y');
200 if (alignment2->GetSequence(0)->GetPosition(b) != '-') b1++;
203 Sequence *seq1 = alignment1->GetSequence(1)->AddGaps (ali, 'X');
204 Sequence *seq2 = alignment2->GetSequence(1)->AddGaps (ali, 'Y');
205 seq1->WriteMFA (cout, 60);
206 seq2->WriteMFA (cout, 60);
218 /////////////////////////////////////////////////////////////////
221 // Prints heading for PROBCONS program.
222 /////////////////////////////////////////////////////////////////
224 void PrintHeading (){
226 << "PROBCONS version 1.02 - align multiple protein sequences and print to standard output" << endl
227 << "Copyright (C) 2004 Chuong Ba Do" << endl
231 /////////////////////////////////////////////////////////////////
234 // Prints PROBCONS parameters to STDERR. If a filename is
235 // specified, then the parameters are also written to the file.
236 /////////////////////////////////////////////////////////////////
238 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
239 const VF &gapExtend, const char *filename){
241 // print parameters to the screen
242 cerr << message << endl
243 << " initDistrib[] = { ";
244 for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
246 << " gapOpen[] = { ";
247 for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
249 << " gapExtend[] = { ";
250 for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
254 // if a file name is specified
257 // attempt to open the file for writing
258 FILE *file = fopen (filename, "w");
260 cerr << "ERROR: Unable to write parameter file: " << filename << endl;
264 // if successful, then write the parameters to the file
265 for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
266 for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
267 for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
272 /////////////////////////////////////////////////////////////////
275 // First computes all pairwise posterior probability matrices.
276 // Then, computes new parameters if training, or a final
277 // alignment, otherwise.
278 /////////////////////////////////////////////////////////////////
280 MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model){
284 const int numSeqs = sequences->GetNumSequences();
285 VVF distances (numSeqs, VF (numSeqs, 0));
286 SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
288 // do all pairwise alignments
289 for (int a = 0; a < numSeqs-1; a++){
290 for (int b = a+1; b < numSeqs; b++){
291 Sequence *seq1 = sequences->GetSequence (a);
292 Sequence *seq2 = sequences->GetSequence (b);
296 cerr << "(" << a+1 << ") " << seq1->GetHeader() << " vs. "
297 << "(" << b+1 << ") " << seq2->GetHeader() << ": ";
299 // compute forward and backward probabilities
300 VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
301 VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
303 // if we are training, then we'll simply want to compute the
304 // expected counts for each region within the matrix separately;
305 // otherwise, we'll need to put all of the regions together and
306 // assemble a posterior probability match matrix
308 // compute posterior probability matrix
309 VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
311 // compute "expected accuracy" distance for evolutionary tree computation
312 pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
316 float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
319 cerr << setprecision (10) << distance << endl;
321 // save posterior probability matrices in sparse format
322 distances[a][b] = distances[b][a] = distance;
323 sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
324 sparseMatrices[b][a] = sparseMatrices[a][b]->ComputeTranspose();
326 delete alignment.first;
334 if (!enableTraining){
338 // now, perform the consistency transformation the desired number of times
339 for (int i = 0; i < numConsistencyReps; i++)
340 DoRelaxation (sequences, sparseMatrices);
342 // compute the evolutionary tree
343 TreeNode *tree = TreeNode::ComputeTree (distances);
345 //tree->Print (cerr, sequences);
348 // make the final alignment
349 MultiSequence *alignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
358 /////////////////////////////////////////////////////////////////
361 // Attempts to parse an integer from the character string given.
362 // Returns true only if no parsing error occurs.
363 /////////////////////////////////////////////////////////////////
365 bool GetInteger (char *data, int *val){
372 retVal = strtol (data, &endPtr, 0);
373 if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
374 if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
375 if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
380 /////////////////////////////////////////////////////////////////
383 // Attempts to parse a float from the character string given.
384 // Returns true only if no parsing error occurs.
385 /////////////////////////////////////////////////////////////////
387 bool GetFloat (char *data, float *val){
394 retVal = strtod (data, &endPtr);
395 if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
396 if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
397 *val = (float) retVal;
401 /////////////////////////////////////////////////////////////////
404 // Parse all command-line options.
405 /////////////////////////////////////////////////////////////////
407 SafeVector<string> ParseParams (int argc, char **argv){
411 cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl
412 << "you are welcome to redistribute it under certain conditions. See the" << endl
413 << "file COPYING.txt for details." << endl
416 << " probcons [OPTION]... [MFAFILE]..." << endl
418 << "Description:" << endl
419 << " Align sequences in MFAFILE(s) and print result to standard output" << endl
421 << " -t, --train FILENAME" << endl
422 << " compute EM transition probabilities, store in FILENAME (default: "
423 << parametersOutputFilename << ")" << endl
425 << " -m, --matrixfile FILENAME" << endl
426 << " read transition parameters from FILENAME (default: "
427 << matrixFilename << ")" << endl
429 << " -p, --paramfile FILENAME" << endl
430 << " read scoring matrix probabilities from FILENAME (default: "
431 << parametersInputFilename << ")" << endl
433 << " -c, --consistency REPS" << endl
434 << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
435 << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
437 << " -ir, --iterative-refinement REPS" << endl
438 << " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
439 << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
441 << " -pre, --pre-training REPS" << endl
442 << " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
443 << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
445 << " -go, --gap-open VALUE" << endl
446 << " gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
448 << " -ge, --gap-extension VALUE" << endl
449 << " gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
451 << " -v, --verbose" << endl
452 << " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
458 SafeVector<string> sequenceNames;
462 for (int i = 1; i < argc; i++){
463 if (argv[i][0] == '-'){
466 if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
467 enableTraining = true;
469 parametersOutputFilename = string (argv[++i]);
471 cerr << "ERROR: Filename expected for option " << argv[i] << endl;
476 // scoring matrix file
477 else if (!strcmp (argv[i], "-m") || !strcmp (argv[i], "--matrixfile")){
479 matrixFilename = string (argv[++i]);
481 cerr << "ERROR: Filename expected for option " << argv[i] << endl;
486 // transition/initial distribution parameter file
487 else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
489 parametersInputFilename = string (argv[++i]);
491 cerr << "ERROR: Filename expected for option " << argv[i] << endl;
496 // number of consistency transformations
497 else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
499 if (!GetInteger (argv[++i], &tempInt)){
500 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
504 if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
505 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
506 << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
510 numConsistencyReps = tempInt;
514 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
519 // number of randomized partitioning iterative refinement passes
520 else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
522 if (!GetInteger (argv[++i], &tempInt)){
523 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
527 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
528 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
529 << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
533 numIterativeRefinementReps = tempInt;
537 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
542 // number of EM pre-training rounds
543 else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
545 if (!GetInteger (argv[++i], &tempInt)){
546 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
550 if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
551 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
552 << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
556 numPreTrainingReps = tempInt;
560 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
566 else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
568 if (!GetFloat (argv[++i], &tempFloat)){
569 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
574 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
578 gapOpenPenalty = tempFloat;
582 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
587 // gap extension penalty
588 else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
590 if (!GetFloat (argv[++i], &tempFloat)){
591 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
596 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
600 gapContinuePenalty = tempFloat;
604 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
610 else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
611 enableVerbose = true;
616 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
621 sequenceNames.push_back (string (argv[i]));
625 return sequenceNames;
628 /////////////////////////////////////////////////////////////////
631 // Read initial distribution, transition, and emission
632 // parameters from a file.
633 /////////////////////////////////////////////////////////////////
635 void ReadParameters (){
639 // read initial state distribution and transition parameters
640 if (parametersInputFilename == string ("")){
641 if (NumInsertStates == 1){
642 for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
643 for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
644 for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
646 else if (NumInsertStates == 2){
647 for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
648 for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
649 for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
652 cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
653 << " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;
658 data.open (parametersInputFilename.c_str());
660 cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
663 for (int i = 0; i < NumMatrixTypes; i++) data >> initDistrib[i];
664 for (int i = 0; i < 2*NumInsertStates; i++) data >> gapOpen[i];
665 for (int i = 0; i < 2*NumInsertStates; i++) data >> gapExtend[i];
669 // read emission parameters
670 int alphabetSize = 20;
673 alphabet = SafeVector<char>(alphabetSize);
674 emitPairs = VVF (alphabetSize, VF (alphabetSize, 0));
675 emitSingle = VF (alphabetSize);
677 if (matrixFilename == string ("")){
678 for (int i = 0; i < alphabetSize; i++) alphabet[i] = alphabetDefault[i];
679 for (int i = 0; i < alphabetSize; i++){
680 emitSingle[i] = emitSingleDefault[i];
681 for (int j = 0; j <= i; j++){
682 emitPairs[i][j] = emitPairs[j][i] = (i == j);
687 data.open (matrixFilename.c_str());
689 cerr << "ERROR: Unable to read scoring matrix file: " << matrixFilename << endl;
693 for (int i = 0; i < alphabetSize; i++) data >> alphabet[i];
694 for (int i = 0; i < alphabetSize; i++){
695 for (int j = 0; j <= i; j++){
696 data >> emitPairs[i][j];
697 emitPairs[j][i] = emitPairs[i][j];
700 for (int i = 0; i < alphabetSize; i++){
703 assert (ch == alphabet[i]);
705 for (int i = 0; i < alphabetSize; i++) data >> emitSingle[i];
710 /////////////////////////////////////////////////////////////////
713 // Process the tree recursively. Returns the aligned sequences
714 // corresponding to a node or leaf of the tree.
715 /////////////////////////////////////////////////////////////////
717 MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
718 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
719 const ProbabilisticModel &model){
720 MultiSequence *result;
722 // check if this is a node of the alignment tree
723 if (tree->GetSequenceLabel() == -1){
724 MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
725 MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
730 result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
737 // otherwise, this is a leaf of the alignment tree
739 result = new MultiSequence(); assert (result);
740 result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
746 /////////////////////////////////////////////////////////////////
747 // ComputeFinalAlignment()
749 // Compute the final alignment by calling ProcessTree(), then
750 // performing iterative refinement as needed.
751 /////////////////////////////////////////////////////////////////
753 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
754 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
755 const ProbabilisticModel &model){
757 MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
759 // iterative refinement
760 for (int i = 0; i < numIterativeRefinementReps; i++)
761 DoIterativeRefinement (sparseMatrices, model, alignment);
765 // return final alignment
769 /////////////////////////////////////////////////////////////////
772 // Returns the alignment of two MultiSequence objects.
773 /////////////////////////////////////////////////////////////////
775 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
776 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
777 const ProbabilisticModel &model){
779 // print some info about the alignment
781 for (int i = 0; i < align1->GetNumSequences(); i++)
782 cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
784 for (int i = 0; i < align2->GetNumSequences(); i++)
785 cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
789 VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices);
790 pair<SafeVector<char> *, float> alignment;
792 // choose the alignment routine depending on the "cosmetic" gap penalties used
793 if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
794 alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
796 alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
797 *posterior, align1->GetNumSequences(), align2->GetNumSequences(),
798 gapOpenPenalty, gapContinuePenalty);
804 // compute total length of sequences
806 for (int i = 0; i < align1->GetNumSequences(); i++)
807 for (int j = 0; j < align2->GetNumSequences(); j++)
808 totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
810 // give an "accuracy" measure for the alignment
811 cerr << alignment.second / totLength << endl;
814 // now build final alignment
815 MultiSequence *result = new MultiSequence();
816 for (int i = 0; i < align1->GetNumSequences(); i++)
817 result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
818 for (int i = 0; i < align2->GetNumSequences(); i++)
819 result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
820 result->SortByLabel();
822 // free temporary alignment
823 delete alignment.first;
828 /////////////////////////////////////////////////////////////////
831 // Performs one round of the consistency transformation. The
834 // P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])
837 // where S = {x, y, all other sequences...}
839 /////////////////////////////////////////////////////////////////
841 void DoRelaxation (MultiSequence *sequences, SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
842 const int numSeqs = sequences->GetNumSequences();
844 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
846 // for every pair of sequences
847 for (int i = 0; i < numSeqs; i++){
848 for (int j = i+1; j < numSeqs; j++){
849 Sequence *seq1 = sequences->GetSequence (i);
850 Sequence *seq2 = sequences->GetSequence (j);
853 cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
854 << "(" << j+1 << ") " << seq2->GetHeader() << ": ";
856 // get the original posterior matrix
857 VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
858 VF &posterior = *posteriorPtr;
860 const int seq1Length = seq1->GetLength();
861 const int seq2Length = seq2->GetLength();
863 // contribution from the summation where z = x and z = y
864 for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
867 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
869 // contribution from all other sequences
870 for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
871 Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
874 // now renormalization
875 for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
877 // save the new posterior matrix
878 newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
879 newSparseMatrices[j][i] = newSparseMatrices[i][j]->ComputeTranspose();
882 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
887 cerr << "done." << endl;
891 // now replace the old posterior matrices
892 for (int i = 0; i < numSeqs; i++){
893 for (int j = 0; j < numSeqs; j++){
894 delete sparseMatrices[i][j];
895 sparseMatrices[i][j] = newSparseMatrices[i][j];
900 /////////////////////////////////////////////////////////////////
903 // Computes the consistency transformation for a single sequence
904 // z, and adds the transformed matrix to "posterior".
905 /////////////////////////////////////////////////////////////////
907 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
912 int lengthX = matXZ->GetSeq1Length();
913 int lengthY = matZY->GetSeq2Length();
914 assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
917 for (int i = 1; i <= lengthX; i++){
918 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
919 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
921 VF::iterator base = posterior.begin() + i * (lengthY + 1);
923 // iterate through all x[i]-z[k]
924 while (XZptr != XZend){
925 SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
926 SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
927 const float XZval = XZptr->second;
929 // iterate through all z[k]-y[j]
930 while (ZYptr != ZYend){
931 base[ZYptr->first] += XZval * ZYptr->second;;
939 /////////////////////////////////////////////////////////////////
940 // DoIterativeRefinement()
942 // Performs a single round of randomized partionining iterative
944 /////////////////////////////////////////////////////////////////
946 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
947 const ProbabilisticModel &model, MultiSequence* &alignment){
948 set<int> groupOne, groupTwo;
950 // create two separate groups
951 for (int i = 0; i < alignment->GetNumSequences(); i++){
958 if (groupOne.empty() || groupTwo.empty()) return;
960 // project into the two groups
961 MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
962 MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
966 alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
970 float ScoreAlignment (MultiSequence *alignment, MultiSequence *sequences, SparseMatrix **sparseMatrices, const int numSeqs){
974 for (int a = 0; a < alignment->GetNumSequences(); a++){
975 for (int b = a+1; b < alignment->GetNumSequences(); b++){
976 Sequence *seq1 = alignment->GetSequence(a);
977 Sequence *seq2 = alignment->GetSequence(b);
979 const int seq1Length = sequences->GetSequence(seq1->GetLabel())->GetLength();
980 const int seq2Length = sequences->GetSequence(seq2->GetLabel())->GetLength();
982 totLength += min (seq1Length, seq2Length);
984 int pos1 = 0, pos2 = 0;
985 for (int i = 1; i <= seq1->GetLength(); i++){
986 char ch1 = seq1->GetPosition(i);
987 char ch2 = seq2->GetPosition(i);
989 if (ch1 != '-') pos1++;
990 if (ch2 != '-') pos2++;
991 if (ch1 != '-' && ch2 != '-'){
992 score += sparseMatrices[a * numSeqs + b]->GetValue (pos1, pos2);
998 return score / totLength;