1 /////////////////////////////////////////////////////////////////
4 // Main routines for PROBCONS program.
5 /////////////////////////////////////////////////////////////////
7 #include "SafeVector.h"
8 #include "MultiSequence.h"
10 #include "ScoreType.h"
11 #include "ProbabilisticModel.h"
12 #include "EvolutionaryTree.h"
13 #include "SparseMatrix.h"
28 string parametersInputFilename = "";
29 string parametersOutputFilename = "no training";
30 string annotationFilename = "";
32 bool enableTraining = false;
33 bool enableVerbose = false;
34 bool enableAllPairs = false;
35 bool enableAnnotation = false;
36 bool enableViterbi = false;
37 bool enableClustalWOutput = false;
38 bool enableTrainEmissions = false;
39 bool enableAlignOrder = false;
40 int numConsistencyReps = 2;
41 int numPreTrainingReps = 0;
42 int numIterativeRefinementReps = 100;
45 float gapOpenPenalty = 0;
46 float gapContinuePenalty = 0;
48 VF initDistrib (NumMatrixTypes);
49 VF gapOpen (2*NumInsertStates);
50 VF gapExtend (2*NumInsertStates);
51 VVF emitPairs (256, VF (256, 1e-10));
52 VF emitSingle (256, 1e-5);
54 string alphabet = alphabetDefault;
56 const int MIN_PRETRAINING_REPS = 0;
57 const int MAX_PRETRAINING_REPS = 20;
58 const int MIN_CONSISTENCY_REPS = 0;
59 const int MAX_CONSISTENCY_REPS = 5;
60 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
61 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
63 /////////////////////////////////////////////////////////////////
64 // Function prototypes
65 /////////////////////////////////////////////////////////////////
68 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
69 const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
70 MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend,
71 VVF &emitPairs, VF &emitSingle);
72 SafeVector<string> ParseParams (int argc, char **argv);
73 void ReadParameters ();
74 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
75 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
76 const ProbabilisticModel &model);
77 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
78 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
79 const ProbabilisticModel &model);
80 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
81 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
82 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
83 void Relax1 (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
85 set<int> GetSubtree (const TreeNode *tree);
86 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
87 const ProbabilisticModel &model, MultiSequence* &alignment,
88 const TreeNode *tree);
89 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
90 const ProbabilisticModel &model, MultiSequence* &alignment);
91 void WriteAnnotation (MultiSequence *alignment,
92 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
93 int ComputeScore (const SafeVector<pair<int, int> > &active,
94 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
97 /////////////////////////////////////////////////////////////////
100 // Calls all initialization routines and runs the PROBCONS
102 /////////////////////////////////////////////////////////////////
104 int main (int argc, char **argv){
106 // print PROBCONS heading
109 // parse program parameters
110 SafeVector<string> sequenceNames = ParseParams (argc, argv);
112 PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
114 // now, we'll process all the files given as input. If we are given
115 // several filenames as input, then we'll load all of those sequences
116 // simultaneously, as long as we're not training. On the other hand,
117 // if we are training, then we'll treat each file as a separate
120 // if we are training
123 // build new model for aligning
124 ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
126 // prepare to average parameters
127 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
128 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
129 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
130 if (enableTrainEmissions){
131 for (int i = 0; i < (int) emitPairs.size(); i++)
132 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
133 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
136 // align each file individually
137 for (int i = 0; i < (int) sequenceNames.size(); i++){
139 VF thisInitDistrib (NumMatrixTypes);
140 VF thisGapOpen (2*NumInsertStates);
141 VF thisGapExtend (2*NumInsertStates);
142 VVF thisEmitPairs (256, VF (256, 1e-10));
143 VF thisEmitSingle (256, 1e-5);
145 // load sequence file
146 MultiSequence *sequences = new MultiSequence(); assert (sequences);
147 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
148 sequences->LoadMFA (sequenceNames[i], true);
151 DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
153 // add in contribution of the derived parameters
154 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
155 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
156 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
157 if (enableTrainEmissions){
158 for (int i = 0; i < (int) emitPairs.size(); i++)
159 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
160 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
166 // compute new parameters and print them out
167 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
168 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
169 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
170 if (enableTrainEmissions){
171 for (int i = 0; i < (int) emitPairs.size(); i++)
172 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
173 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
176 PrintParameters ("Trained parameter set:",
177 initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
178 parametersOutputFilename.c_str());
181 // if we are not training, we must simply want to align some sequences
184 // load all files together
185 MultiSequence *sequences = new MultiSequence(); assert (sequences);
186 for (int i = 0; i < (int) sequenceNames.size(); i++){
187 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
188 sequences->LoadMFA (sequenceNames[i], true);
191 // do all "pre-training" repetitions first
192 for (int ct = 0; ct < numPreTrainingReps; ct++){
193 enableTraining = true;
195 // build new model for aligning
196 ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
197 emitPairs, emitSingle);
199 // do initial alignments
200 DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
202 // print new parameters
203 PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
205 enableTraining = false;
208 // now, we can perform the alignments and write them out
209 MultiSequence *alignment = DoAlign (sequences,
210 ProbabilisticModel (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle),
211 initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
213 if (!enableAllPairs){
214 if (enableClustalWOutput)
215 alignment->WriteALN (cout);
217 alignment->WriteMFA (cout);
225 /////////////////////////////////////////////////////////////////
228 // Prints heading for PROBCONS program.
229 /////////////////////////////////////////////////////////////////
231 void PrintHeading (){
233 << "PROBCONS version " << VERSION << " - align multiple protein sequences and print to standard output" << endl
234 << "Written by Chuong Do" << endl
238 /////////////////////////////////////////////////////////////////
241 // Prints PROBCONS parameters to STDERR. If a filename is
242 // specified, then the parameters are also written to the file.
243 /////////////////////////////////////////////////////////////////
245 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
246 const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
248 // print parameters to the screen
249 cerr << message << endl
250 << " initDistrib[] = { ";
251 for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
253 << " gapOpen[] = { ";
254 for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
256 << " gapExtend[] = { ";
257 for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
262 for (int i = 0; i < 5; i++){
263 for (int j = 0; j <= i; j++){
264 cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
269 // if a file name is specified
272 // attempt to open the file for writing
273 FILE *file = fopen (filename, "w");
275 cerr << "ERROR: Unable to write parameter file: " << filename << endl;
279 // if successful, then write the parameters to the file
280 for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
281 for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
282 for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
283 fprintf (file, "%s\n", alphabet.c_str());
284 for (int i = 0; i < (int) alphabet.size(); i++){
285 for (int j = 0; j <= i; j++)
286 fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
287 fprintf (file, "\n");
289 for (int i = 0; i < (int) alphabet.size(); i++)
290 fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
291 fprintf (file, "\n");
296 /////////////////////////////////////////////////////////////////
299 // First computes all pairwise posterior probability matrices.
300 // Then, computes new parameters if training, or a final
301 // alignment, otherwise.
302 /////////////////////////////////////////////////////////////////
304 MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
305 VF &gapExtend, VVF &emitPairs, VF &emitSingle){
309 const int numSeqs = sequences->GetNumSequences();
310 VVF distances (numSeqs, VF (numSeqs, 0));
311 SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
316 // prepare to average parameters
317 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
318 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
319 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
320 if (enableTrainEmissions){
321 for (int i = 0; i < (int) emitPairs.size(); i++)
322 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
323 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
327 // skip posterior calculations if we just want to do Viterbi alignments
330 // do all pairwise alignments for posterior probability matrices
331 for (int a = 0; a < numSeqs-1; a++){
332 for (int b = a+1; b < numSeqs; b++){
333 Sequence *seq1 = sequences->GetSequence (a);
334 Sequence *seq2 = sequences->GetSequence (b);
338 cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
339 << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
341 // compute forward and backward probabilities
342 VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
343 VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
345 // if we are training, then we'll simply want to compute the
346 // expected counts for each region within the matrix separately;
347 // otherwise, we'll need to put all of the regions together and
348 // assemble a posterior probability match matrix
350 // so, if we're training
353 // compute new parameters
354 VF thisInitDistrib (NumMatrixTypes);
355 VF thisGapOpen (2*NumInsertStates);
356 VF thisGapExtend (2*NumInsertStates);
357 VVF thisEmitPairs (256, VF (256, 1e-10));
358 VF thisEmitSingle (256, 1e-5);
360 model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
362 // add in contribution of the derived parameters
363 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
364 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
365 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
366 if (enableTrainEmissions){
367 for (int i = 0; i < (int) emitPairs.size(); i++)
368 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
369 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
372 // let us know that we're done.
373 if (enableVerbose) cerr << "done." << endl;
377 // compute posterior probability matrix
378 VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
380 // compute sparse representations
381 sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
382 sparseMatrices[b][a] = NULL;
384 if (!enableAllPairs){
385 // perform the pairwise sequence alignment
386 pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
390 // compute "expected accuracy" distance for evolutionary tree computation
391 float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
392 distances[a][b] = distances[b][a] = distance;
395 cerr << setprecision (10) << distance << endl;
397 delete alignment.first;
400 // let us know that we're done.
401 if (enableVerbose) cerr << "done." << endl;
413 // now average out parameters derived
416 // compute new parameters
417 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
418 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
419 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
421 if (enableTrainEmissions){
422 for (int i = 0; i < (int) emitPairs.size(); i++)
423 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
424 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
428 // see if we still want to do some alignments
433 // perform the consistency transformation the desired number of times
434 for (int r = 0; r < numConsistencyReps; r++){
435 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
437 // now replace the old posterior matrices
438 for (int i = 0; i < numSeqs; i++){
439 for (int j = 0; j < numSeqs; j++){
440 delete sparseMatrices[i][j];
441 sparseMatrices[i][j] = newSparseMatrices[i][j];
447 MultiSequence *finalAlignment = NULL;
450 for (int a = 0; a < numSeqs-1; a++){
451 for (int b = a+1; b < numSeqs; b++){
452 Sequence *seq1 = sequences->GetSequence (a);
453 Sequence *seq2 = sequences->GetSequence (b);
456 cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
457 << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
460 // perform the pairwise sequence alignment
461 pair<SafeVector<char> *, float> alignment;
463 alignment = model.ComputeViterbiAlignment (seq1, seq2);
466 // build posterior matrix
467 VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
468 int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
469 for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
471 alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior);
475 // write pairwise alignments
476 string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
477 ofstream outfile (name.c_str());
479 MultiSequence *result = new MultiSequence();
480 result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
481 result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
482 if (enableClustalWOutput)
483 result->WriteALN (outfile);
485 result->WriteMFA (outfile);
489 delete alignment.first;
494 // now if we still need to do a final multiple alignment
500 // compute the evolutionary tree
501 TreeNode *tree = TreeNode::ComputeTree (distances);
503 tree->Print (cerr, sequences);
506 // make the final alignment
507 finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
510 if (enableAnnotation){
511 WriteAnnotation (finalAlignment, sparseMatrices);
518 // delete sparse matrices
519 for (int a = 0; a < numSeqs-1; a++){
520 for (int b = a+1; b < numSeqs; b++){
521 delete sparseMatrices[a][b];
522 delete sparseMatrices[b][a];
527 return finalAlignment;
533 /////////////////////////////////////////////////////////////////
536 // Attempts to parse an integer from the character string given.
537 // Returns true only if no parsing error occurs.
538 /////////////////////////////////////////////////////////////////
540 bool GetInteger (char *data, int *val){
547 retVal = strtol (data, &endPtr, 0);
548 if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
549 if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
550 if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
555 /////////////////////////////////////////////////////////////////
558 // Attempts to parse a float from the character string given.
559 // Returns true only if no parsing error occurs.
560 /////////////////////////////////////////////////////////////////
562 bool GetFloat (char *data, float *val){
569 retVal = strtod (data, &endPtr);
570 if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
571 if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
572 *val = (float) retVal;
576 /////////////////////////////////////////////////////////////////
579 // Parse all command-line options.
580 /////////////////////////////////////////////////////////////////
582 SafeVector<string> ParseParams (int argc, char **argv){
586 cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl
587 << "you are welcome to redistribute it under certain conditions. See the" << endl
588 << "file COPYING.txt for details." << endl
591 << " probcons [OPTION]... [MFAFILE]..." << endl
593 << "Description:" << endl
594 << " Align sequences in MFAFILE(s) and print result to standard output" << endl
596 << " -clustalw" << endl
597 << " use CLUSTALW output format instead of MFA" << endl
599 << " -c, --consistency REPS" << endl
600 << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
601 << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
603 << " -ir, --iterative-refinement REPS" << endl
604 << " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
605 << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
607 << " -pre, --pre-training REPS" << endl
608 << " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
609 << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
612 << " generate all-pairs pairwise alignments" << endl
614 << " -viterbi" << endl
615 << " use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
617 << " -v, --verbose" << endl
618 << " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
620 << " -annot FILENAME" << endl
621 << " write annotation for multiple alignment to FILENAME" << endl
623 << " -t, --train FILENAME" << endl
624 << " compute EM transition probabilities, store in FILENAME (default: "
625 << parametersOutputFilename << ")" << endl
627 << " -e, --emissions" << endl
628 << " also reestimate emission probabilities (default: "
629 << (enableTrainEmissions ? "on" : "off") << ")" << endl
631 << " -p, --paramfile FILENAME" << endl
632 << " read parameters from FILENAME (default: "
633 << parametersInputFilename << ")" << endl
635 << " -a, --alignment-order" << endl
636 << " print sequences in alignment order rather than input order (default: "
637 << (enableAlignOrder ? "on" : "off") << ")" << endl
639 // << " -go, --gap-open VALUE" << endl
640 // << " gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
642 // << " -ge, --gap-extension VALUE" << endl
643 // << " gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
645 // << " -co, --cutoff CUTOFF" << endl
646 // << " subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl
652 SafeVector<string> sequenceNames;
656 for (int i = 1; i < argc; i++){
657 if (argv[i][0] == '-'){
660 if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
661 enableTraining = true;
663 parametersOutputFilename = string (argv[++i]);
665 cerr << "ERROR: Filename expected for option " << argv[i] << endl;
671 else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){
672 enableTrainEmissions = true;
676 else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
678 parametersInputFilename = string (argv[++i]);
680 cerr << "ERROR: Filename expected for option " << argv[i] << endl;
685 // number of consistency transformations
686 else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
688 if (!GetInteger (argv[++i], &tempInt)){
689 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
693 if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
694 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
695 << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
699 numConsistencyReps = tempInt;
703 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
708 // number of randomized partitioning iterative refinement passes
709 else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
711 if (!GetInteger (argv[++i], &tempInt)){
712 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
716 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
717 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
718 << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
722 numIterativeRefinementReps = tempInt;
726 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
731 // number of EM pre-training rounds
732 else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
734 if (!GetInteger (argv[++i], &tempInt)){
735 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
739 if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
740 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
741 << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
745 numPreTrainingReps = tempInt;
749 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
755 else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
757 if (!GetFloat (argv[++i], &tempFloat)){
758 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
763 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
767 gapOpenPenalty = tempFloat;
771 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
776 // gap extension penalty
777 else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
779 if (!GetFloat (argv[++i], &tempFloat)){
780 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
785 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
789 gapContinuePenalty = tempFloat;
793 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
798 // all-pairs pairwise alignments
799 else if (!strcmp (argv[i], "-pairs")){
800 enableAllPairs = true;
803 // all-pairs pairwise Viterbi alignments
804 else if (!strcmp (argv[i], "-viterbi")){
805 enableAllPairs = true;
806 enableViterbi = true;
810 else if (!strcmp (argv[i], "-annot")){
811 enableAnnotation = true;
813 annotationFilename = argv[++i];
815 cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
820 // clustalw output format
821 else if (!strcmp (argv[i], "-clustalw")){
822 enableClustalWOutput = true;
826 else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){
828 if (!GetFloat (argv[++i], &tempFloat)){
829 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
833 if (tempFloat < 0 || tempFloat > 1){
834 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
842 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
848 else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
849 enableVerbose = true;
853 else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){
854 enableAlignOrder = true;
859 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
864 sequenceNames.push_back (string (argv[i]));
868 if (enableTrainEmissions && !enableTraining){
869 cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
873 return sequenceNames;
876 /////////////////////////////////////////////////////////////////
879 // Read initial distribution, transition, and emission
880 // parameters from a file.
881 /////////////////////////////////////////////////////////////////
883 void ReadParameters (){
887 emitPairs = VVF (256, VF (256, 1e-10));
888 emitSingle = VF (256, 1e-5);
890 // read initial state distribution and transition parameters
891 if (parametersInputFilename == string ("")){
892 if (NumInsertStates == 1){
893 for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
894 for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
895 for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
897 else if (NumInsertStates == 2){
898 for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
899 for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
900 for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
903 cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
904 << " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;
908 alphabet = alphabetDefault;
910 for (int i = 0; i < (int) alphabet.length(); i++){
911 emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
912 emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
913 for (int j = 0; j <= i; j++){
914 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
915 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
916 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
917 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
918 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
919 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
920 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
921 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
926 data.open (parametersInputFilename.c_str());
928 cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
933 for (int i = 0; i < 3; i++){
934 if (!getline (data, line[i])){
935 cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
940 data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
941 data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
942 data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
944 if (!getline (data, line[0])){
945 cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
949 // read alphabet as concatenation of all characters on alphabet line
952 data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
954 for (int i = 0; i < (int) alphabet.size(); i++){
955 for (int j = 0; j <= i; j++){
958 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
959 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
960 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
961 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
962 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
963 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
964 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
965 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
969 for (int i = 0; i < (int) alphabet.size(); i++){
972 emitSingle[(unsigned char) tolower(alphabet[i])] = val;
973 emitSingle[(unsigned char) toupper(alphabet[i])] = val;
979 /////////////////////////////////////////////////////////////////
982 // Process the tree recursively. Returns the aligned sequences
983 // corresponding to a node or leaf of the tree.
984 /////////////////////////////////////////////////////////////////
986 MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
987 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
988 const ProbabilisticModel &model){
989 MultiSequence *result;
991 // check if this is a node of the alignment tree
992 if (tree->GetSequenceLabel() == -1){
993 MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
994 MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
999 result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
1006 // otherwise, this is a leaf of the alignment tree
1008 result = new MultiSequence(); assert (result);
1009 result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1015 /////////////////////////////////////////////////////////////////
1016 // ComputeFinalAlignment()
1018 // Compute the final alignment by calling ProcessTree(), then
1019 // performing iterative refinement as needed.
1020 /////////////////////////////////////////////////////////////////
1022 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
1023 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1024 const ProbabilisticModel &model){
1026 MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
1028 SafeVector<int> oldOrdering;
1029 if (enableAlignOrder){
1030 for (int i = 0; i < alignment->GetNumSequences(); i++)
1031 oldOrdering.push_back (alignment->GetSequence(i)->GetSortLabel());
1032 alignment->SaveOrdering();
1033 enableAlignOrder = false;
1036 // tree-based refinement
1037 // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
1039 // iterative refinement
1040 for (int i = 0; i < numIterativeRefinementReps; i++)
1041 DoIterativeRefinement (sparseMatrices, model, alignment);
1045 if (oldOrdering.size() > 0){
1046 for (int i = 0; i < (int) oldOrdering.size(); i++){
1047 alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
1051 // return final alignment
1055 /////////////////////////////////////////////////////////////////
1056 // AlignAlignments()
1058 // Returns the alignment of two MultiSequence objects.
1059 /////////////////////////////////////////////////////////////////
1061 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
1062 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1063 const ProbabilisticModel &model){
1065 // print some info about the alignment
1067 for (int i = 0; i < align1->GetNumSequences(); i++)
1068 cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1070 for (int i = 0; i < align2->GetNumSequences(); i++)
1071 cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1075 VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
1076 pair<SafeVector<char> *, float> alignment;
1078 // choose the alignment routine depending on the "cosmetic" gap penalties used
1079 if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
1080 alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
1082 alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1083 *posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1084 gapOpenPenalty, gapContinuePenalty);
1090 // compute total length of sequences
1092 for (int i = 0; i < align1->GetNumSequences(); i++)
1093 for (int j = 0; j < align2->GetNumSequences(); j++)
1094 totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1096 // give an "accuracy" measure for the alignment
1097 cerr << alignment.second / totLength << endl;
1100 // now build final alignment
1101 MultiSequence *result = new MultiSequence();
1102 for (int i = 0; i < align1->GetNumSequences(); i++)
1103 result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1104 for (int i = 0; i < align2->GetNumSequences(); i++)
1105 result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1106 if (!enableAlignOrder)
1107 result->SortByLabel();
1109 // free temporary alignment
1110 delete alignment.first;
1115 /////////////////////////////////////////////////////////////////
1118 // Performs one round of the consistency transformation. The
1121 // P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])
1124 // where S = {x, y, all other sequences...}
1126 /////////////////////////////////////////////////////////////////
1128 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
1129 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1130 const int numSeqs = sequences->GetNumSequences();
1132 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
1134 // for every pair of sequences
1135 for (int i = 0; i < numSeqs; i++){
1136 for (int j = i+1; j < numSeqs; j++){
1137 Sequence *seq1 = sequences->GetSequence (i);
1138 Sequence *seq2 = sequences->GetSequence (j);
1141 cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1142 << "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1144 // get the original posterior matrix
1145 VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1146 VF &posterior = *posteriorPtr;
1148 const int seq1Length = seq1->GetLength();
1149 const int seq2Length = seq2->GetLength();
1151 // contribution from the summation where z = x and z = y
1152 for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1155 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1157 // contribution from all other sequences
1158 for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1160 Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior);
1161 else if (k > i && k < j)
1162 Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1164 SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
1165 Relax (sparseMatrices[i][k], temp, posterior);
1170 // now renormalization
1171 for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1173 // mask out positions not originally in the posterior matrix
1174 SparseMatrix *matXY = sparseMatrices[i][j];
1175 for (int y = 0; y <= seq2Length; y++) posterior[y] = 0;
1176 for (int x = 1; x <= seq1Length; x++){
1177 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1178 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1179 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1181 while (XYptr != XYend){
1183 // zero out all cells until the first filled column
1184 while (curr < XYptr->first){
1189 // now, skip over this column
1194 // zero out cells after last column
1195 while (curr <= seq2Length){
1201 // save the new posterior matrix
1202 newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1203 newSparseMatrices[j][i] = NULL;
1206 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1208 delete posteriorPtr;
1211 cerr << "done." << endl;
1215 return newSparseMatrices;
1218 /////////////////////////////////////////////////////////////////
1221 // Computes the consistency transformation for a single sequence
1222 // z, and adds the transformed matrix to "posterior".
1223 /////////////////////////////////////////////////////////////////
1225 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
1230 int lengthX = matXZ->GetSeq1Length();
1231 int lengthY = matZY->GetSeq2Length();
1232 assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1235 for (int i = 1; i <= lengthX; i++){
1236 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1237 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1239 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1241 // iterate through all x[i]-z[k]
1242 while (XZptr != XZend){
1243 SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1244 SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1245 const float XZval = XZptr->second;
1247 // iterate through all z[k]-y[j]
1248 while (ZYptr != ZYend){
1249 base[ZYptr->first] += XZval * ZYptr->second;
1257 /////////////////////////////////////////////////////////////////
1260 // Computes the consistency transformation for a single sequence
1261 // z, and adds the transformed matrix to "posterior".
1262 /////////////////////////////////////////////////////////////////
1264 void Relax1 (SparseMatrix *matZX, SparseMatrix *matZY, VF &posterior){
1269 int lengthZ = matZX->GetSeq1Length();
1270 int lengthY = matZY->GetSeq2Length();
1273 for (int k = 1; k <= lengthZ; k++){
1274 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1275 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1277 // iterate through all z[k]-x[i]
1278 while (ZXptr != ZXend){
1279 SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1280 SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1281 const float ZXval = ZXptr->second;
1282 VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1);
1284 // iterate through all z[k]-y[j]
1285 while (ZYptr != ZYend){
1286 base[ZYptr->first] += ZXval * ZYptr->second;
1294 /////////////////////////////////////////////////////////////////
1297 // Returns set containing all leaf labels of the current subtree.
1298 /////////////////////////////////////////////////////////////////
1300 set<int> GetSubtree (const TreeNode *tree){
1303 if (tree->GetSequenceLabel() == -1){
1304 s = GetSubtree (tree->GetLeftChild());
1305 set<int> t = GetSubtree (tree->GetRightChild());
1307 for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1311 s.insert (tree->GetSequenceLabel());
1317 /////////////////////////////////////////////////////////////////
1318 // TreeBasedBiPartitioning
1320 // Uses the iterative refinement scheme from MUSCLE.
1321 /////////////////////////////////////////////////////////////////
1323 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1324 const ProbabilisticModel &model, MultiSequence* &alignment,
1325 const TreeNode *tree){
1326 // check if this is a node of the alignment tree
1327 if (tree->GetSequenceLabel() == -1){
1328 TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild());
1329 TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild());
1331 set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1332 set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1333 set<int> leftSubtreeComplement, rightSubtreeComplement;
1335 // calculate complement of each subtree
1336 for (int i = 0; i < alignment->GetNumSequences(); i++){
1337 if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1338 if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1341 // perform realignments for edge to left child
1342 if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1343 MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1344 MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1346 alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1349 // perform realignments for edge to right child
1350 if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1351 MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1352 MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1354 alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1359 /////////////////////////////////////////////////////////////////
1360 // DoIterativeRefinement()
1362 // Performs a single round of randomized partionining iterative
1364 /////////////////////////////////////////////////////////////////
1366 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1367 const ProbabilisticModel &model, MultiSequence* &alignment){
1368 set<int> groupOne, groupTwo;
1370 // create two separate groups
1371 for (int i = 0; i < alignment->GetNumSequences(); i++){
1373 groupOne.insert (i);
1375 groupTwo.insert (i);
1378 if (groupOne.empty() || groupTwo.empty()) return;
1380 // project into the two groups
1381 MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1382 MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1386 alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1388 delete groupOneSeqs;
1389 delete groupTwoSeqs;
1392 /////////////////////////////////////////////////////////////////
1393 // WriteAnnotation()
1395 // Computes annotation for multiple alignment and write values
1397 /////////////////////////////////////////////////////////////////
1399 void WriteAnnotation (MultiSequence *alignment,
1400 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1401 ofstream outfile (annotationFilename.c_str());
1403 if (outfile.fail()){
1404 cerr << "ERROR: Unable to write annotation file." << endl;
1408 const int alignLength = alignment->GetSequence(0)->GetLength();
1409 const int numSeqs = alignment->GetNumSequences();
1411 SafeVector<int> position (numSeqs, 0);
1412 SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1413 for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1414 SafeVector<pair<int,int> > active;
1415 active.reserve (numSeqs);
1417 SafeVector<int> lab;
1418 for (int i = 0; i < numSeqs; i++) lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1421 for (int i = 1; i <= alignLength; i++){
1423 // find all aligned residues in this particular column
1425 for (int j = 0; j < numSeqs; j++){
1426 if (seqs[j][i] != '-'){
1427 active.push_back (make_pair(lab[j], ++position[j]));
1431 sort (active.begin(), active.end());
1432 outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1438 /////////////////////////////////////////////////////////////////
1441 // Computes the annotation score for a particular column.
1442 /////////////////////////////////////////////////////////////////
1444 int ComputeScore (const SafeVector<pair<int, int> > &active,
1445 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1447 if (active.size() <= 1) return 0;
1449 // ALTERNATIVE #1: Compute the average alignment score.
1452 for (int i = 0; i < (int) active.size(); i++){
1453 for (int j = i+1; j < (int) active.size(); j++){
1454 val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1458 return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));