1 /////////////////////////////////////////////////////////////////
4 // Main routines for PROBCONS program.
5 /////////////////////////////////////////////////////////////////
7 #include "SafeVector.h"
8 #include "MultiSequence.h"
10 #include "ScoreType.h"
11 #include "ProbabilisticModel.h"
12 #include "EvolutionaryTree.h"
13 #include "SparseMatrix.h"
27 string parametersInputFilename = "";
28 string parametersOutputFilename = "no training";
29 string annotationFilename = "";
31 bool enableTraining = false;
32 bool enableVerbose = false;
33 bool enableAllPairs = false;
34 bool enableAnnotation = false;
35 bool enableViterbi = false;
36 bool enableClustalWOutput = false;
37 bool enableTrainEmissions = false;
38 bool enableAlignOrder = false;
39 int numConsistencyReps = 2;
40 int numPreTrainingReps = 0;
41 int numIterativeRefinementReps = 100;
44 float gapOpenPenalty = 0;
45 float gapContinuePenalty = 0;
47 VF initDistrib (NumMatrixTypes);
48 VF gapOpen (2*NumInsertStates);
49 VF gapExtend (2*NumInsertStates);
50 VVF emitPairs (256, VF (256, 1e-10));
51 VF emitSingle (256, 1e-5);
53 string alphabet = alphabetDefault;
55 const int MIN_PRETRAINING_REPS = 0;
56 const int MAX_PRETRAINING_REPS = 20;
57 const int MIN_CONSISTENCY_REPS = 0;
58 const int MAX_CONSISTENCY_REPS = 5;
59 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
60 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
62 /////////////////////////////////////////////////////////////////
63 // Function prototypes
64 /////////////////////////////////////////////////////////////////
67 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
68 const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
69 MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend,
70 VVF &emitPairs, VF &emitSingle);
71 SafeVector<string> ParseParams (int argc, char **argv);
72 void ReadParameters ();
73 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
74 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
75 const ProbabilisticModel &model);
76 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
77 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
78 const ProbabilisticModel &model);
79 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
80 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
81 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
82 void Relax1 (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
84 set<int> GetSubtree (const TreeNode *tree);
85 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
86 const ProbabilisticModel &model, MultiSequence* &alignment,
87 const TreeNode *tree);
88 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
89 const ProbabilisticModel &model, MultiSequence* &alignment);
90 void WriteAnnotation (MultiSequence *alignment,
91 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
92 int ComputeScore (const SafeVector<pair<int, int> > &active,
93 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
96 /////////////////////////////////////////////////////////////////
99 // Calls all initialization routines and runs the PROBCONS
101 /////////////////////////////////////////////////////////////////
103 int main (int argc, char **argv){
105 // print PROBCONS heading
108 // parse program parameters
109 SafeVector<string> sequenceNames = ParseParams (argc, argv);
111 PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
113 // now, we'll process all the files given as input. If we are given
114 // several filenames as input, then we'll load all of those sequences
115 // simultaneously, as long as we're not training. On the other hand,
116 // if we are training, then we'll treat each file as a separate
119 // if we are training
122 // build new model for aligning
123 ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
125 // prepare to average parameters
126 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
127 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
128 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
129 if (enableTrainEmissions){
130 for (int i = 0; i < (int) emitPairs.size(); i++)
131 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
132 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
135 // align each file individually
136 for (int i = 0; i < (int) sequenceNames.size(); i++){
138 VF thisInitDistrib (NumMatrixTypes);
139 VF thisGapOpen (2*NumInsertStates);
140 VF thisGapExtend (2*NumInsertStates);
141 VVF thisEmitPairs (256, VF (256, 1e-10));
142 VF thisEmitSingle (256, 1e-5);
144 // load sequence file
145 MultiSequence *sequences = new MultiSequence(); assert (sequences);
146 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
147 sequences->LoadMFA (sequenceNames[i], true);
150 DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
152 // add in contribution of the derived parameters
153 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
154 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
155 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
156 if (enableTrainEmissions){
157 for (int i = 0; i < (int) emitPairs.size(); i++)
158 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
159 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
165 // compute new parameters and print them out
166 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
167 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
168 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
169 if (enableTrainEmissions){
170 for (int i = 0; i < (int) emitPairs.size(); i++)
171 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
172 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
175 PrintParameters ("Trained parameter set:",
176 initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
177 parametersOutputFilename.c_str());
180 // if we are not training, we must simply want to align some sequences
183 // load all files together
184 MultiSequence *sequences = new MultiSequence(); assert (sequences);
185 for (int i = 0; i < (int) sequenceNames.size(); i++){
186 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
187 sequences->LoadMFA (sequenceNames[i], true);
190 // do all "pre-training" repetitions first
191 for (int ct = 0; ct < numPreTrainingReps; ct++){
192 enableTraining = true;
194 // build new model for aligning
195 ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
196 emitPairs, emitSingle);
198 // do initial alignments
199 DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
201 // print new parameters
202 PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
204 enableTraining = false;
207 // now, we can perform the alignments and write them out
208 MultiSequence *alignment = DoAlign (sequences,
209 ProbabilisticModel (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle),
210 initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
212 if (!enableAllPairs){
213 if (enableClustalWOutput)
214 alignment->WriteALN (cout);
216 alignment->WriteMFA (cout);
224 /////////////////////////////////////////////////////////////////
227 // Prints heading for PROBCONS program.
228 /////////////////////////////////////////////////////////////////
230 void PrintHeading (){
232 << "PROBCONS version " << VERSION << " - align multiple protein sequences and print to standard output" << endl
233 << "Written by Chuong Do" << endl
237 /////////////////////////////////////////////////////////////////
240 // Prints PROBCONS parameters to STDERR. If a filename is
241 // specified, then the parameters are also written to the file.
242 /////////////////////////////////////////////////////////////////
244 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
245 const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
247 // print parameters to the screen
248 cerr << message << endl
249 << " initDistrib[] = { ";
250 for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
252 << " gapOpen[] = { ";
253 for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
255 << " gapExtend[] = { ";
256 for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
261 for (int i = 0; i < 5; i++){
262 for (int j = 0; j <= i; j++){
263 cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
268 // if a file name is specified
271 // attempt to open the file for writing
272 FILE *file = fopen (filename, "w");
274 cerr << "ERROR: Unable to write parameter file: " << filename << endl;
278 // if successful, then write the parameters to the file
279 for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
280 for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
281 for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
282 fprintf (file, "%s\n", alphabet.c_str());
283 for (int i = 0; i < (int) alphabet.size(); i++){
284 for (int j = 0; j <= i; j++)
285 fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
286 fprintf (file, "\n");
288 for (int i = 0; i < (int) alphabet.size(); i++)
289 fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
290 fprintf (file, "\n");
295 /////////////////////////////////////////////////////////////////
298 // First computes all pairwise posterior probability matrices.
299 // Then, computes new parameters if training, or a final
300 // alignment, otherwise.
301 /////////////////////////////////////////////////////////////////
303 MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
304 VF &gapExtend, VVF &emitPairs, VF &emitSingle){
308 const int numSeqs = sequences->GetNumSequences();
309 VVF distances (numSeqs, VF (numSeqs, 0));
310 SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
315 // prepare to average parameters
316 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
317 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
318 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
319 if (enableTrainEmissions){
320 for (int i = 0; i < (int) emitPairs.size(); i++)
321 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
322 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
326 // skip posterior calculations if we just want to do Viterbi alignments
329 // do all pairwise alignments for posterior probability matrices
330 for (int a = 0; a < numSeqs-1; a++){
331 for (int b = a+1; b < numSeqs; b++){
332 Sequence *seq1 = sequences->GetSequence (a);
333 Sequence *seq2 = sequences->GetSequence (b);
337 cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
338 << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
340 // compute forward and backward probabilities
341 VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
342 VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
344 // if we are training, then we'll simply want to compute the
345 // expected counts for each region within the matrix separately;
346 // otherwise, we'll need to put all of the regions together and
347 // assemble a posterior probability match matrix
349 // so, if we're training
352 // compute new parameters
353 VF thisInitDistrib (NumMatrixTypes);
354 VF thisGapOpen (2*NumInsertStates);
355 VF thisGapExtend (2*NumInsertStates);
356 VVF thisEmitPairs (256, VF (256, 1e-10));
357 VF thisEmitSingle (256, 1e-5);
359 model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
361 // add in contribution of the derived parameters
362 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
363 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
364 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
365 if (enableTrainEmissions){
366 for (int i = 0; i < (int) emitPairs.size(); i++)
367 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
368 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
371 // let us know that we're done.
372 if (enableVerbose) cerr << "done." << endl;
376 // compute posterior probability matrix
377 VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
379 // compute sparse representations
380 sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
381 sparseMatrices[b][a] = NULL;
383 if (!enableAllPairs){
384 // perform the pairwise sequence alignment
385 pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
389 // compute "expected accuracy" distance for evolutionary tree computation
390 float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
391 distances[a][b] = distances[b][a] = distance;
394 cerr << setprecision (10) << distance << endl;
396 delete alignment.first;
399 // let us know that we're done.
400 if (enableVerbose) cerr << "done." << endl;
412 // now average out parameters derived
415 // compute new parameters
416 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
417 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
418 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
420 if (enableTrainEmissions){
421 for (int i = 0; i < (int) emitPairs.size(); i++)
422 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
423 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
427 // see if we still want to do some alignments
432 // perform the consistency transformation the desired number of times
433 for (int r = 0; r < numConsistencyReps; r++){
434 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
436 // now replace the old posterior matrices
437 for (int i = 0; i < numSeqs; i++){
438 for (int j = 0; j < numSeqs; j++){
439 delete sparseMatrices[i][j];
440 sparseMatrices[i][j] = newSparseMatrices[i][j];
446 MultiSequence *finalAlignment = NULL;
449 for (int a = 0; a < numSeqs-1; a++){
450 for (int b = a+1; b < numSeqs; b++){
451 Sequence *seq1 = sequences->GetSequence (a);
452 Sequence *seq2 = sequences->GetSequence (b);
455 cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
456 << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
459 // perform the pairwise sequence alignment
460 pair<SafeVector<char> *, float> alignment;
462 alignment = model.ComputeViterbiAlignment (seq1, seq2);
465 // build posterior matrix
466 VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
467 int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
468 for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
470 alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior);
474 // write pairwise alignments
475 string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
476 ofstream outfile (name.c_str());
478 MultiSequence *result = new MultiSequence();
479 result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
480 result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
481 if (enableClustalWOutput)
482 result->WriteALN (outfile);
484 result->WriteMFA (outfile);
488 delete alignment.first;
493 // now if we still need to do a final multiple alignment
499 // compute the evolutionary tree
500 TreeNode *tree = TreeNode::ComputeTree (distances);
502 tree->Print (cerr, sequences);
505 // make the final alignment
506 finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
509 if (enableAnnotation){
510 WriteAnnotation (finalAlignment, sparseMatrices);
517 // delete sparse matrices
518 for (int a = 0; a < numSeqs-1; a++){
519 for (int b = a+1; b < numSeqs; b++){
520 delete sparseMatrices[a][b];
521 delete sparseMatrices[b][a];
526 return finalAlignment;
532 /////////////////////////////////////////////////////////////////
535 // Attempts to parse an integer from the character string given.
536 // Returns true only if no parsing error occurs.
537 /////////////////////////////////////////////////////////////////
539 bool GetInteger (char *data, int *val){
546 retVal = strtol (data, &endPtr, 0);
547 if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
548 if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
549 if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
554 /////////////////////////////////////////////////////////////////
557 // Attempts to parse a float from the character string given.
558 // Returns true only if no parsing error occurs.
559 /////////////////////////////////////////////////////////////////
561 bool GetFloat (char *data, float *val){
568 retVal = strtod (data, &endPtr);
569 if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
570 if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
571 *val = (float) retVal;
575 /////////////////////////////////////////////////////////////////
578 // Parse all command-line options.
579 /////////////////////////////////////////////////////////////////
581 SafeVector<string> ParseParams (int argc, char **argv){
585 cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl
586 << "you are welcome to redistribute it under certain conditions. See the" << endl
587 << "file COPYING.txt for details." << endl
590 << " probcons [OPTION]... [MFAFILE]..." << endl
592 << "Description:" << endl
593 << " Align sequences in MFAFILE(s) and print result to standard output" << endl
595 << " -clustalw" << endl
596 << " use CLUSTALW output format instead of MFA" << endl
598 << " -c, --consistency REPS" << endl
599 << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
600 << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
602 << " -ir, --iterative-refinement REPS" << endl
603 << " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
604 << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
606 << " -pre, --pre-training REPS" << endl
607 << " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
608 << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
611 << " generate all-pairs pairwise alignments" << endl
613 << " -viterbi" << endl
614 << " use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
616 << " -v, --verbose" << endl
617 << " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
619 << " -annot FILENAME" << endl
620 << " write annotation for multiple alignment to FILENAME" << endl
622 << " -t, --train FILENAME" << endl
623 << " compute EM transition probabilities, store in FILENAME (default: "
624 << parametersOutputFilename << ")" << endl
626 << " -e, --emissions" << endl
627 << " also reestimate emission probabilities (default: "
628 << (enableTrainEmissions ? "on" : "off") << ")" << endl
630 << " -p, --paramfile FILENAME" << endl
631 << " read parameters from FILENAME (default: "
632 << parametersInputFilename << ")" << endl
634 << " -a, --alignment-order" << endl
635 << " print sequences in alignment order rather than input order (default: "
636 << (enableAlignOrder ? "on" : "off") << ")" << endl
638 // << " -go, --gap-open VALUE" << endl
639 // << " gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
641 // << " -ge, --gap-extension VALUE" << endl
642 // << " gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
644 // << " -co, --cutoff CUTOFF" << endl
645 // << " subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl
651 SafeVector<string> sequenceNames;
655 for (int i = 1; i < argc; i++){
656 if (argv[i][0] == '-'){
659 if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
660 enableTraining = true;
662 parametersOutputFilename = string (argv[++i]);
664 cerr << "ERROR: Filename expected for option " << argv[i] << endl;
670 else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){
671 enableTrainEmissions = true;
675 else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
677 parametersInputFilename = string (argv[++i]);
679 cerr << "ERROR: Filename expected for option " << argv[i] << endl;
684 // number of consistency transformations
685 else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
687 if (!GetInteger (argv[++i], &tempInt)){
688 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
692 if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
693 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
694 << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
698 numConsistencyReps = tempInt;
702 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
707 // number of randomized partitioning iterative refinement passes
708 else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
710 if (!GetInteger (argv[++i], &tempInt)){
711 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
715 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
716 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
717 << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
721 numIterativeRefinementReps = tempInt;
725 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
730 // number of EM pre-training rounds
731 else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
733 if (!GetInteger (argv[++i], &tempInt)){
734 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
738 if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
739 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
740 << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
744 numPreTrainingReps = tempInt;
748 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
754 else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
756 if (!GetFloat (argv[++i], &tempFloat)){
757 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
762 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
766 gapOpenPenalty = tempFloat;
770 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
775 // gap extension penalty
776 else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
778 if (!GetFloat (argv[++i], &tempFloat)){
779 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
784 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
788 gapContinuePenalty = tempFloat;
792 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
797 // all-pairs pairwise alignments
798 else if (!strcmp (argv[i], "-pairs")){
799 enableAllPairs = true;
802 // all-pairs pairwise Viterbi alignments
803 else if (!strcmp (argv[i], "-viterbi")){
804 enableAllPairs = true;
805 enableViterbi = true;
809 else if (!strcmp (argv[i], "-annot")){
810 enableAnnotation = true;
812 annotationFilename = argv[++i];
814 cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
819 // clustalw output format
820 else if (!strcmp (argv[i], "-clustalw")){
821 enableClustalWOutput = true;
825 else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){
827 if (!GetFloat (argv[++i], &tempFloat)){
828 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
832 if (tempFloat < 0 || tempFloat > 1){
833 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
841 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
847 else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
848 enableVerbose = true;
852 else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){
853 enableAlignOrder = true;
858 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
863 sequenceNames.push_back (string (argv[i]));
867 if (enableTrainEmissions && !enableTraining){
868 cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
872 return sequenceNames;
875 /////////////////////////////////////////////////////////////////
878 // Read initial distribution, transition, and emission
879 // parameters from a file.
880 /////////////////////////////////////////////////////////////////
882 void ReadParameters (){
886 emitPairs = VVF (256, VF (256, 1e-10));
887 emitSingle = VF (256, 1e-5);
889 // read initial state distribution and transition parameters
890 if (parametersInputFilename == string ("")){
891 if (NumInsertStates == 1){
892 for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
893 for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
894 for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
896 else if (NumInsertStates == 2){
897 for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
898 for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
899 for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
902 cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
903 << " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;
907 alphabet = alphabetDefault;
909 for (int i = 0; i < (int) alphabet.length(); i++){
910 emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
911 emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
912 for (int j = 0; j <= i; j++){
913 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
914 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
915 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
916 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
917 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
918 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
919 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
920 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
925 data.open (parametersInputFilename.c_str());
927 cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
932 for (int i = 0; i < 3; i++){
933 if (!getline (data, line[i])){
934 cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
939 data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
940 data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
941 data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
943 if (!getline (data, line[0])){
944 cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
948 // read alphabet as concatenation of all characters on alphabet line
951 data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
953 for (int i = 0; i < (int) alphabet.size(); i++){
954 for (int j = 0; j <= i; j++){
957 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
958 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
959 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
960 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
961 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
962 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
963 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
964 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
968 for (int i = 0; i < (int) alphabet.size(); i++){
971 emitSingle[(unsigned char) tolower(alphabet[i])] = val;
972 emitSingle[(unsigned char) toupper(alphabet[i])] = val;
978 /////////////////////////////////////////////////////////////////
981 // Process the tree recursively. Returns the aligned sequences
982 // corresponding to a node or leaf of the tree.
983 /////////////////////////////////////////////////////////////////
985 MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
986 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
987 const ProbabilisticModel &model){
988 MultiSequence *result;
990 // check if this is a node of the alignment tree
991 if (tree->GetSequenceLabel() == -1){
992 MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
993 MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
998 result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
1005 // otherwise, this is a leaf of the alignment tree
1007 result = new MultiSequence(); assert (result);
1008 result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1014 /////////////////////////////////////////////////////////////////
1015 // ComputeFinalAlignment()
1017 // Compute the final alignment by calling ProcessTree(), then
1018 // performing iterative refinement as needed.
1019 /////////////////////////////////////////////////////////////////
1021 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
1022 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1023 const ProbabilisticModel &model){
1025 MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
1027 SafeVector<int> oldOrdering;
1028 if (enableAlignOrder){
1029 for (int i = 0; i < alignment->GetNumSequences(); i++)
1030 oldOrdering.push_back (alignment->GetSequence(i)->GetSortLabel());
1031 alignment->SaveOrdering();
1032 enableAlignOrder = false;
1035 // tree-based refinement
1036 // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
1038 // iterative refinement
1039 for (int i = 0; i < numIterativeRefinementReps; i++)
1040 DoIterativeRefinement (sparseMatrices, model, alignment);
1044 if (oldOrdering.size() > 0){
1045 for (int i = 0; i < (int) oldOrdering.size(); i++){
1046 alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
1050 // return final alignment
1054 /////////////////////////////////////////////////////////////////
1055 // AlignAlignments()
1057 // Returns the alignment of two MultiSequence objects.
1058 /////////////////////////////////////////////////////////////////
1060 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
1061 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1062 const ProbabilisticModel &model){
1064 // print some info about the alignment
1066 for (int i = 0; i < align1->GetNumSequences(); i++)
1067 cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1069 for (int i = 0; i < align2->GetNumSequences(); i++)
1070 cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1074 VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
1075 pair<SafeVector<char> *, float> alignment;
1077 // choose the alignment routine depending on the "cosmetic" gap penalties used
1078 if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
1079 alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
1081 alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1082 *posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1083 gapOpenPenalty, gapContinuePenalty);
1089 // compute total length of sequences
1091 for (int i = 0; i < align1->GetNumSequences(); i++)
1092 for (int j = 0; j < align2->GetNumSequences(); j++)
1093 totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1095 // give an "accuracy" measure for the alignment
1096 cerr << alignment.second / totLength << endl;
1099 // now build final alignment
1100 MultiSequence *result = new MultiSequence();
1101 for (int i = 0; i < align1->GetNumSequences(); i++)
1102 result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1103 for (int i = 0; i < align2->GetNumSequences(); i++)
1104 result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1105 if (!enableAlignOrder)
1106 result->SortByLabel();
1108 // free temporary alignment
1109 delete alignment.first;
1114 /////////////////////////////////////////////////////////////////
1117 // Performs one round of the consistency transformation. The
1120 // P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])
1123 // where S = {x, y, all other sequences...}
1125 /////////////////////////////////////////////////////////////////
1127 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
1128 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1129 const int numSeqs = sequences->GetNumSequences();
1131 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
1133 // for every pair of sequences
1134 for (int i = 0; i < numSeqs; i++){
1135 for (int j = i+1; j < numSeqs; j++){
1136 Sequence *seq1 = sequences->GetSequence (i);
1137 Sequence *seq2 = sequences->GetSequence (j);
1140 cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1141 << "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1143 // get the original posterior matrix
1144 VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1145 VF &posterior = *posteriorPtr;
1147 const int seq1Length = seq1->GetLength();
1148 const int seq2Length = seq2->GetLength();
1150 // contribution from the summation where z = x and z = y
1151 for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1154 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1156 // contribution from all other sequences
1157 for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1159 Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior);
1160 else if (k > i && k < j)
1161 Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1163 SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
1164 Relax (sparseMatrices[i][k], temp, posterior);
1169 // now renormalization
1170 for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1172 // mask out positions not originally in the posterior matrix
1173 SparseMatrix *matXY = sparseMatrices[i][j];
1174 for (int y = 0; y <= seq2Length; y++) posterior[y] = 0;
1175 for (int x = 1; x <= seq1Length; x++){
1176 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1177 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1178 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1180 while (XYptr != XYend){
1182 // zero out all cells until the first filled column
1183 while (curr < XYptr->first){
1188 // now, skip over this column
1193 // zero out cells after last column
1194 while (curr <= seq2Length){
1200 // save the new posterior matrix
1201 newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1202 newSparseMatrices[j][i] = NULL;
1205 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1207 delete posteriorPtr;
1210 cerr << "done." << endl;
1214 return newSparseMatrices;
1217 /////////////////////////////////////////////////////////////////
1220 // Computes the consistency transformation for a single sequence
1221 // z, and adds the transformed matrix to "posterior".
1222 /////////////////////////////////////////////////////////////////
1224 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
1229 int lengthX = matXZ->GetSeq1Length();
1230 int lengthY = matZY->GetSeq2Length();
1231 assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1234 for (int i = 1; i <= lengthX; i++){
1235 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1236 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1238 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1240 // iterate through all x[i]-z[k]
1241 while (XZptr != XZend){
1242 SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1243 SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1244 const float XZval = XZptr->second;
1246 // iterate through all z[k]-y[j]
1247 while (ZYptr != ZYend){
1248 base[ZYptr->first] += XZval * ZYptr->second;
1256 /////////////////////////////////////////////////////////////////
1259 // Computes the consistency transformation for a single sequence
1260 // z, and adds the transformed matrix to "posterior".
1261 /////////////////////////////////////////////////////////////////
1263 void Relax1 (SparseMatrix *matZX, SparseMatrix *matZY, VF &posterior){
1268 int lengthZ = matZX->GetSeq1Length();
1269 int lengthY = matZY->GetSeq2Length();
1272 for (int k = 1; k <= lengthZ; k++){
1273 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1274 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1276 // iterate through all z[k]-x[i]
1277 while (ZXptr != ZXend){
1278 SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1279 SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1280 const float ZXval = ZXptr->second;
1281 VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1);
1283 // iterate through all z[k]-y[j]
1284 while (ZYptr != ZYend){
1285 base[ZYptr->first] += ZXval * ZYptr->second;
1293 /////////////////////////////////////////////////////////////////
1296 // Returns set containing all leaf labels of the current subtree.
1297 /////////////////////////////////////////////////////////////////
1299 set<int> GetSubtree (const TreeNode *tree){
1302 if (tree->GetSequenceLabel() == -1){
1303 s = GetSubtree (tree->GetLeftChild());
1304 set<int> t = GetSubtree (tree->GetRightChild());
1306 for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1310 s.insert (tree->GetSequenceLabel());
1316 /////////////////////////////////////////////////////////////////
1317 // TreeBasedBiPartitioning
1319 // Uses the iterative refinement scheme from MUSCLE.
1320 /////////////////////////////////////////////////////////////////
1322 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1323 const ProbabilisticModel &model, MultiSequence* &alignment,
1324 const TreeNode *tree){
1325 // check if this is a node of the alignment tree
1326 if (tree->GetSequenceLabel() == -1){
1327 TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild());
1328 TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild());
1330 set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1331 set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1332 set<int> leftSubtreeComplement, rightSubtreeComplement;
1334 // calculate complement of each subtree
1335 for (int i = 0; i < alignment->GetNumSequences(); i++){
1336 if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1337 if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1340 // perform realignments for edge to left child
1341 if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1342 MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1343 MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1345 alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1348 // perform realignments for edge to right child
1349 if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1350 MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1351 MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1353 alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1358 /////////////////////////////////////////////////////////////////
1359 // DoIterativeRefinement()
1361 // Performs a single round of randomized partionining iterative
1363 /////////////////////////////////////////////////////////////////
1365 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1366 const ProbabilisticModel &model, MultiSequence* &alignment){
1367 set<int> groupOne, groupTwo;
1369 // create two separate groups
1370 for (int i = 0; i < alignment->GetNumSequences(); i++){
1372 groupOne.insert (i);
1374 groupTwo.insert (i);
1377 if (groupOne.empty() || groupTwo.empty()) return;
1379 // project into the two groups
1380 MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1381 MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1385 alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1387 delete groupOneSeqs;
1388 delete groupTwoSeqs;
1391 /////////////////////////////////////////////////////////////////
1392 // WriteAnnotation()
1394 // Computes annotation for multiple alignment and write values
1396 /////////////////////////////////////////////////////////////////
1398 void WriteAnnotation (MultiSequence *alignment,
1399 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1400 ofstream outfile (annotationFilename.c_str());
1402 if (outfile.fail()){
1403 cerr << "ERROR: Unable to write annotation file." << endl;
1407 const int alignLength = alignment->GetSequence(0)->GetLength();
1408 const int numSeqs = alignment->GetNumSequences();
1410 SafeVector<int> position (numSeqs, 0);
1411 SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1412 for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1413 SafeVector<pair<int,int> > active;
1414 active.reserve (numSeqs);
1416 SafeVector<int> lab;
1417 for (int i = 0; i < numSeqs; i++) lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1420 for (int i = 1; i <= alignLength; i++){
1422 // find all aligned residues in this particular column
1424 for (int j = 0; j < numSeqs; j++){
1425 if (seqs[j][i] != '-'){
1426 active.push_back (make_pair(lab[j], ++position[j]));
1430 sort (active.begin(), active.end());
1431 outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1437 /////////////////////////////////////////////////////////////////
1440 // Computes the annotation score for a particular column.
1441 /////////////////////////////////////////////////////////////////
1443 int ComputeScore (const SafeVector<pair<int, int> > &active,
1444 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1446 if (active.size() <= 1) return 0;
1448 // ALTERNATIVE #1: Compute the average alignment score.
1451 for (int i = 0; i < (int) active.size(); i++){
1452 for (int j = i+1; j < (int) active.size(); j++){
1453 val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1457 return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));