1 /////////////////////////////////////////////////////////////////
4 // Main routines for MXSCARNA program.
5 /////////////////////////////////////////////////////////////////
8 #include "SafeVector.h"
9 #include "MultiSequence.h"
11 #include "ScoreType.h"
12 #include "ProbabilisticModel.h"
13 #include "EvolutionaryTree.h"
14 #include "SparseMatrix.h"
15 #include "BPPMatrix.hpp"
16 #include "StemCandidate.hpp"
17 #include "Globaldp.hpp"
19 #include "AlifoldMEA.h"
33 #include "RfoldWrapper.hpp"
34 //static RFOLD::Rfold folder;
36 using namespace::MXSCARNA;
37 using namespace::RFOLD;
39 string parametersInputFilename = "";
40 string parametersOutputFilename = "no training";
41 string annotationFilename = "";
42 string weboutputFileName = "";
46 bool enableTraining = false;
47 bool enableVerbose = false;
48 bool enableAllPairs = false;
49 bool enableAnnotation = false;
50 bool enableViterbi = false;
51 bool enableClustalWOutput = false;
52 bool enableTrainEmissions = false;
53 bool enableAlignOrder = false;
54 bool enableWebOutput = false;
55 bool enableStockholmOutput = false;
56 bool enableMXSCARNAOutput = false;
57 bool enableMcCaskillMEAMode = false;
58 int numConsistencyReps = 2;
59 int numPreTrainingReps = 0;
60 int numIterativeRefinementReps = 100;
61 int scsLength = SCSLENGTH;
63 float gapOpenPenalty = 0;
64 float gapContinuePenalty = 0;
65 float threshhold = 1.0;
66 float BaseProbThreshold = BASEPROBTHRESHOLD;
67 float BasePairConst = BASEPAIRCONST;
68 int BandWidth = BANDWIDTH;
69 bool useRfold = USERFOLD;
70 SafeVector<string> sequenceNames;
72 VF initDistrib (NumMatrixTypes);
73 VF gapOpen (2*NumInsertStates);
74 VF gapExtend (2*NumInsertStates);
75 VVF emitPairs (256, VF (256, 1e-10));
76 VF emitSingle (256, 1e-5);
78 string alphabet = alphabetDefault;
80 string *ssCons = NULL;
82 const int MIN_PRETRAINING_REPS = 0;
83 const int MAX_PRETRAINING_REPS = 20;
84 const int MIN_CONSISTENCY_REPS = 0;
85 const int MAX_CONSISTENCY_REPS = 5;
86 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
87 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
89 /////////////////////////////////////////////////////////////////
90 // Function prototypes
91 /////////////////////////////////////////////////////////////////
94 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
95 const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
96 MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle);
97 SafeVector<string> ParseParams (int argc, char **argv);
98 void ReadParameters ();
99 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
100 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
101 const ProbabilisticModel &model,
102 SafeVector<BPPMatrix*> &BPPMatrices);
103 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
104 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
105 const ProbabilisticModel &model, SafeVector<BPPMatrix*> &BPPMatrices, float identity);
106 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
107 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
108 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
109 void Relax1 (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
110 void DoBasePairProbabilityRelaxation (MultiSequence *sequences,
111 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
112 SafeVector<BPPMatrix*> &BPPMatrices);
113 set<int> GetSubtree (const TreeNode *tree);
114 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
115 const ProbabilisticModel &model, MultiSequence* &alignment,
116 const TreeNode *tree, SafeVector<BPPMatrix*> &BPPMatrices);
117 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
118 const ProbabilisticModel &model, MultiSequence* &alignment);
119 void WriteAnnotation (MultiSequence *alignment,
120 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
121 int ComputeScore (const SafeVector<pair<int, int> > &active,
122 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
123 std::vector<StemCandidate>* seq2scs(MultiSequence *Sequences, SafeVector<BPPMatrix*> &BPPMatrices, int BandWidth);
124 void removeConflicts(std::vector<StemCandidate> *pscs1, std::vector<StemCandidate> *pscs2, std::vector<int> *matchPSCS1, std::vector<int> *matchPSCS2);
132 /////////////////////////////////////////////////////////////////
135 // Calls all initialization routines and runs the MXSCARNA
137 /////////////////////////////////////////////////////////////////
140 int main (int argc, char **argv){
142 // print MXSCARNA heading
145 // parse program parameters
146 sequenceNames = ParseParams (argc, argv);
148 PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
150 // now, we'll process all the files given as input. If we are given
151 // several filenames as input, then we'll load all of those sequences
152 // simultaneously, as long as we're not training. On the other hand,
153 // if we are training, then we'll treat each file as a separate
156 if (enableMcCaskillMEAMode) {
157 MultiSequence *sequences = new MultiSequence(); assert (sequences);
158 for (int i = 0; i < (int) sequenceNames.size(); i++){
159 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
160 sequences->LoadMFA (sequenceNames[i], true);
163 const int numSeqs = sequences->GetNumSequences();
164 SafeVector<BPPMatrix*> BPPMatrices;
166 // compute the base pairing matrices for each sequences
167 for(int i = 0; i < numSeqs; i++) {
168 Sequence *tmpSeq = sequences->GetSequence(i);
169 string seq = tmpSeq->GetString();
170 int n_seq = tmpSeq->GetLength();
171 BPPMatrix *bppmat = new BPPMatrix(seq, n_seq);
172 BPPMatrices.push_back(bppmat);
174 AlifoldMEA alifold(sequences, BPPMatrices, BasePairConst);
176 ssCons = alifold.getSScons();
178 if (enableStockholmOutput) {
179 sequences->WriteSTOCKHOLM (cout, ssCons);
181 else if (enableMXSCARNAOutput){
182 sequences->WriteMXSCARNA (cout, ssCons);
185 sequences->WriteMFA (cout, ssCons);
190 // if we are training
191 else if (enableTraining){
193 // build new model for aligning
194 ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
196 // prepare to average parameters
197 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
198 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
199 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
200 if (enableTrainEmissions){
201 for (int i = 0; i < (int) emitPairs.size(); i++)
202 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
203 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
206 // align each file individually
207 for (int i = 0; i < (int) sequenceNames.size(); i++){
209 VF thisInitDistrib (NumMatrixTypes);
210 VF thisGapOpen (2*NumInsertStates);
211 VF thisGapExtend (2*NumInsertStates);
212 VVF thisEmitPairs (256, VF (256, 1e-10));
213 VF thisEmitSingle (256, 1e-5);
215 // load sequence file
216 MultiSequence *sequences = new MultiSequence(); assert (sequences);
217 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
218 sequences->LoadMFA (sequenceNames[i], true);
221 DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
223 // add in contribution of the derived parameters
224 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
225 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
226 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
227 if (enableTrainEmissions){
228 for (int i = 0; i < (int) emitPairs.size(); i++)
229 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
230 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
236 // compute new parameters and print them out
237 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
238 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
239 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
240 if (enableTrainEmissions){
241 for (int i = 0; i < (int) emitPairs.size(); i++)
242 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
243 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
246 PrintParameters ("Trained parameter set:",
247 initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
248 parametersOutputFilename.c_str());
251 // if we are not training, we must simply want to align some sequences
253 // load all files together
254 MultiSequence *sequences = new MultiSequence(); assert (sequences);
255 for (int i = 0; i < (int) sequenceNames.size(); i++){
256 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
258 sequences->LoadMFA (sequenceNames[i], true);
261 // do all "pre-training" repetitions first
263 for (int ct = 0; ct < numPreTrainingReps; ct++){
264 enableTraining = true;
266 // build new model for aligning
267 ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
268 emitPairs, emitSingle);
270 // do initial alignments
271 DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
273 // print new parameters
274 PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
276 enableTraining = false;
279 // now, we can perform the alignments and write them out
280 if (enableWebOutput) {
281 outputFile = new ofstream(weboutputFileName.c_str());
283 cerr << "cannot open output file." << weboutputFileName << endl;
286 *outputFile << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << endl;
287 *outputFile << "<result>" << endl;
289 MultiSequence *alignment = DoAlign (sequences,
290 ProbabilisticModel (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle),
291 initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
294 if (!enableAllPairs){
295 if (enableClustalWOutput) {
296 alignment->WriteALN (cout);
298 else if (enableWebOutput) {
299 alignment->WriteWEB (*outputFile, ssCons);
300 // computeStructureWithAlifold ();
302 else if (enableStockholmOutput) {
303 alignment->WriteSTOCKHOLM (cout, ssCons);
305 else if (enableMXSCARNAOutput) {
306 alignment->WriteMXSCARNA (cout, ssCons);
309 alignment->WriteMFA (cout, ssCons);
313 if (enableWebOutput) {
314 *outputFile << "</result>" << endl;
325 /////////////////////////////////////////////////////////////////
328 // Prints heading for PROBCONS program.
329 /////////////////////////////////////////////////////////////////
331 void PrintHeading (){
333 << "Multiplex SCARNA"<< endl
334 << "version " << VERSION << " - align multiple RNA sequences and print to standard output" << endl
335 << "Written by Yasuo Tabei" << endl
339 /////////////////////////////////////////////////////////////////
342 // Prints PROBCONS parameters to STDERR. If a filename is
343 // specified, then the parameters are also written to the file.
344 /////////////////////////////////////////////////////////////////
346 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
347 const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
349 // print parameters to the screen
350 cerr << message << endl
351 << " initDistrib[] = { ";
352 for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
354 << " gapOpen[] = { ";
355 for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
357 << " gapExtend[] = { ";
358 for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
363 for (int i = 0; i < 5; i++){
364 for (int j = 0; j <= i; j++){
365 cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
370 // if a file name is specified
373 // attempt to open the file for writing
374 FILE *file = fopen (filename, "w");
376 cerr << "ERROR: Unable to write parameter file: " << filename << endl;
380 // if successful, then write the parameters to the file
381 for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
382 for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
383 for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
384 fprintf (file, "%s\n", alphabet.c_str());
385 for (int i = 0; i < (int) alphabet.size(); i++){
386 for (int j = 0; j <= i; j++)
387 fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
388 fprintf (file, "\n");
390 for (int i = 0; i < (int) alphabet.size(); i++)
391 fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
392 fprintf (file, "\n");
397 /////////////////////////////////////////////////////////////////
400 // First computes all pairwise posterior probability matrices.
401 // Then, computes new parameters if training, or a final
402 // alignment, otherwise.
403 /////////////////////////////////////////////////////////////////
404 MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle){
408 const int numSeqs = sequences->GetNumSequences();
409 VVF distances (numSeqs, VF (numSeqs, 0));
410 VVF identities (numSeqs, VF (numSeqs, 0));
411 SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
413 SafeVector<BPPMatrix*> BPPMatrices;
415 RfoldWrapper rfoldWrapper;
418 = rfoldWrapper.getProb(sequenceNames, sequences, BASEPROBTHRESHOLD, BandWidth);
421 for(int i = 0; i < numSeqs; i++) {
422 Sequence *tmpSeq = sequences->GetSequence(i);
423 string seq = tmpSeq->GetString();
424 int n_seq = tmpSeq->GetLength();
425 BPPMatrix *bppmat = new BPPMatrix(seq, n_seq, BASEPROBTHRESHOLD);
426 BPPMatrices.push_back(bppmat);
431 // prepare to average parameters
432 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
433 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
434 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
435 if (enableTrainEmissions){
436 for (int i = 0; i < (int) emitPairs.size(); i++)
437 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
438 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
442 // skip posterior calculations if we just want to do Viterbi alignments
445 // do all pairwise alignments for posterior probability matrices
446 for (int a = 0; a < numSeqs-1; a++){
447 for (int b = a+1; b < numSeqs; b++){
448 Sequence *seq1 = sequences->GetSequence (a);
449 Sequence *seq2 = sequences->GetSequence (b);
453 cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
454 << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
456 // compute forward and backward probabilities
457 VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
458 VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
460 // if we are training, then we'll simply want to compute the
461 // expected counts for each region within the matrix separately;
462 // otherwise, we'll need to put all of the regions together and
463 // assemble a posterior probability match matrix
465 // so, if we're training
468 // compute new parameters
469 VF thisInitDistrib (NumMatrixTypes);
470 VF thisGapOpen (2*NumInsertStates);
471 VF thisGapExtend (2*NumInsertStates);
472 VVF thisEmitPairs (256, VF (256, 1e-10));
473 VF thisEmitSingle (256, 1e-5);
475 model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
477 // add in contribution of the derived parameters
478 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
479 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
480 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
481 if (enableTrainEmissions){
482 for (int i = 0; i < (int) emitPairs.size(); i++)
483 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
484 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
487 // let us know that we're done.
488 if (enableVerbose) cerr << "done." << endl;
493 // compute posterior probability matrix
494 VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
496 // compute sparse representations
497 sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
498 sparseMatrices[b][a] = NULL;
500 if (!enableAllPairs){
501 // perform the pairwise sequence alignment
502 pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
506 Sequence *tmpSeq1 = seq1->AddGaps (alignment.first, 'X');
507 Sequence *tmpSeq2 = seq2->AddGaps (alignment.first, 'Y');
509 // compute sequence identity for each pair of sequenceses
510 int length = tmpSeq1->GetLength();
512 int misMatchCount = 0;
513 for (int k = 1; k <= length; k++) {
514 int ch1 = tmpSeq1->GetPosition(k);
515 int ch2 = tmpSeq2->GetPosition(k);
516 if (ch1 == ch2 && ch1 != '-' && ch2 != '-') { ++matchCount; }
517 else if (ch1 != ch2 && ch1 != '-' && ch2 != '-') { ++misMatchCount; }
520 identities[a][b] = identities[b][a] = (float)matchCount/(float)(matchCount + misMatchCount);
522 // compute "expected accuracy" distance for evolutionary tree computation
523 float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
524 distances[a][b] = distances[b][a] = distance;
527 cerr << setprecision (10) << distance << endl;
529 delete alignment.first;
532 // let us know that we're done.
533 if (enableVerbose) cerr << "done." << endl;
545 // now average out parameters derived
547 // compute new parameters
548 for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
549 for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
550 for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
552 if (enableTrainEmissions){
553 for (int i = 0; i < (int) emitPairs.size(); i++)
554 for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
555 for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
559 // see if we still want to do some alignments
564 // perform the consistency transformation the desired number of times
565 for (int r = 0; r < numConsistencyReps; r++){
566 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
568 // now replace the old posterior matrices
569 for (int i = 0; i < numSeqs; i++){
570 for (int j = 0; j < numSeqs; j++){
571 delete sparseMatrices[i][j];
572 sparseMatrices[i][j] = newSparseMatrices[i][j];
577 for (int r = 0; r < 1; r++)
578 DoBasePairProbabilityRelaxation(sequences, sparseMatrices, BPPMatrices);
582 MultiSequence *finalAlignment = NULL;
585 for (int a = 0; a < numSeqs-1; a++){
586 for (int b = a+1; b < numSeqs; b++){
587 Sequence *seq1 = sequences->GetSequence (a);
588 Sequence *seq2 = sequences->GetSequence (b);
591 cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
592 << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
595 // perform the pairwise sequence alignment
596 pair<SafeVector<char> *, float> alignment;
598 alignment = model.ComputeViterbiAlignment (seq1, seq2);
601 // build posterior matrix
602 VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
603 int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
604 for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
606 alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior);
610 // write pairwise alignments
611 string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
612 ofstream outfile (name.c_str());
614 MultiSequence *result = new MultiSequence();
615 result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
616 result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
617 if (enableClustalWOutput)
618 result->WriteALN (outfile);
620 result->WriteMFA (outfile);
624 delete alignment.first;
629 // now if we still need to do a final multiple alignment
635 // compute the evolutionary tree
636 TreeNode *tree = TreeNode::ComputeTree (distances, identities);
638 if (enableWebOutput) {
639 *outputFile << "<tree>" << endl;
640 tree->Print (*outputFile, sequences);
641 *outputFile << "</tree>" << endl;
644 tree->Print (cerr, sequences);
647 // make the final alignment
648 finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model, BPPMatrices);
651 if (enableAnnotation){
652 WriteAnnotation (finalAlignment, sparseMatrices);
659 // delete sparse matrices
660 for (int a = 0; a < numSeqs-1; a++){
661 for (int b = a+1; b < numSeqs; b++){
662 delete sparseMatrices[a][b];
663 delete sparseMatrices[b][a];
668 AlifoldMEA alifold(finalAlignment, BPPMatrices, BasePairConst);
670 ssCons = alifold.getSScons();
672 return finalAlignment;
679 /////////////////////////////////////////////////////////////////
682 // Attempts to parse an integer from the character string given.
683 // Returns true only if no parsing error occurs.
684 /////////////////////////////////////////////////////////////////
686 bool GetInteger (char *data, int *val){
693 retVal = strtol (data, &endPtr, 0);
694 if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
695 if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
696 if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
701 /////////////////////////////////////////////////////////////////
704 // Attempts to parse a float from the character string given.
705 // Returns true only if no parsing error occurs.
706 /////////////////////////////////////////////////////////////////
708 bool GetFloat (char *data, float *val){
715 retVal = strtod (data, &endPtr);
716 if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
717 if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
718 *val = (float) retVal;
722 /////////////////////////////////////////////////////////////////
725 // Parse all command-line options.
726 /////////////////////////////////////////////////////////////////
728 SafeVector<string> ParseParams (int argc, char **argv){
732 cerr << "MXSCARNA comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl
733 << "you are welcome to redistribute it under certain conditions. See the" << endl
734 << "file COPYING.txt for details." << endl
737 << " mxscarna [OPTION]... [MFAFILE]..." << endl
739 << "Description:" << endl
740 << " Align sequences in MFAFILE(s) and print result to standard output" << endl
742 << " -clustalw" << endl
743 << " use CLUSTALW output format instead of MFA" << endl
745 << " -stockholm" << endl
746 << " use STOCKHOLM output format instead of MFA" << endl
748 << " -mxscarna" << endl
749 << " use MXSCARNA output format instead of MFA" << endl
751 << " -weboutput /<output_path>/<outputfilename>" << endl
752 << " use web output format" << endl
754 << " -c, --consistency REPS" << endl
755 << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
756 << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
758 << " -ir, --iterative-refinement REPS" << endl
759 << " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
760 << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
762 << " -pre, --pre-training REPS" << endl
763 << " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
764 << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
767 << " generate all-pairs pairwise alignments" << endl
769 << " -viterbi" << endl
770 << " use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
772 << " -v, --verbose" << endl
773 << " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
775 << " -annot FILENAME" << endl
776 << " write annotation for multiple alignment to FILENAME" << endl
778 << " -t, --train FILENAME" << endl
779 << " compute EM transition probabilities, store in FILENAME (default: "
780 << parametersOutputFilename << ")" << endl
782 << " -e, --emissions" << endl
783 << " also reestimate emission probabilities (default: "
784 << (enableTrainEmissions ? "on" : "off") << ")" << endl
786 << " -p, --paramfile FILENAME" << endl
787 << " read parameters from FILENAME (default: "
788 << parametersInputFilename << ")" << endl
790 << " -a, --alignment-order" << endl
791 << " print sequences in alignment order rather than input order (default: "
792 << (enableAlignOrder ? "on" : "off") << ")" << endl
794 << " -s THRESHOLD" << endl
795 << " the threshold of SCS alignment" << endl
797 << " In default, for less than " << threshhold << ", the SCS aligment is applied. " << endl
798 << " -l SCSLENGTH" << endl
799 << " the length of stem candidates " << SCSLENGTH << endl
801 << " -b BASEPROBTRHESHHOLD" << endl
802 << " the threshold of base pairing probability " << BASEPROBTHRESHOLD << endl
804 << " -m, --mccaskillmea" << endl
805 << " McCaskill MEA MODE: input the clustalw format file and output the secondary structure predicted by McCaskill MEA" << endl
807 << " -g BASEPAIRSCORECONST" << endl
808 << " the control parameter of the prediction of base pairs, default:" << BasePairConst << endl
810 << " -w BANDWIDTH" << endl
811 << " the control parameter of the distance of stem candidates, default:" << BANDWIDTH << endl
813 << " use Rfold instead of global McCaskill algorithm to calcurate base paring probality matrices, default: (" << (useRfold? "on" : "off") << ")" << endl
817 // << " -go, --gap-open VALUE" << endl
818 // << " gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
820 // << " -ge, --gap-extension VALUE" << endl
821 // << " gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
823 // << " -co, --cutoff CUTOFF" << endl
824 // << " subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl
830 SafeVector<string> sequenceNames;
834 for (int i = 1; i < argc; i++){
835 if (argv[i][0] == '-'){
838 if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
839 enableTraining = true;
841 parametersOutputFilename = string (argv[++i]);
843 cerr << "ERROR: Filename expected for option " << argv[i] << endl;
849 else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){
850 enableTrainEmissions = true;
854 else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
856 parametersInputFilename = string (argv[++i]);
858 cerr << "ERROR: Filename expected for option " << argv[i] << endl;
862 else if (! strcmp (argv[i], "-s")) {
864 if (!GetFloat (argv[++i], &tempFloat)){
865 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
870 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
874 threshhold = tempFloat;
878 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
883 else if (! strcmp (argv[i], "-l")) {
885 if (!GetInteger (argv[++i], &tempInt)){
886 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
890 if (tempInt <= 0 || 6 <= tempInt) {
891 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
892 << "1 and 6" << "." << endl;
900 else if (! strcmp (argv[i], "-b")) {
902 if (!GetFloat (argv[++i], &tempFloat)){
903 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
907 if (tempFloat < 0 && 1 < tempFloat) {
908 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
912 BaseProbThreshold = tempFloat;
916 else if (! strcmp (argv[i], "-g")) {
918 if (!GetFloat (argv[++i], &tempFloat)){
919 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
923 if (tempFloat < 0 && 1 < tempFloat) {
924 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
928 BasePairConst = tempFloat;
933 // number of consistency transformations
934 else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
936 if (!GetInteger (argv[++i], &tempInt)){
937 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
941 if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
942 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
943 << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
947 numConsistencyReps = tempInt;
951 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
956 // number of randomized partitioning iterative refinement passes
957 else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
959 if (!GetInteger (argv[++i], &tempInt)){
960 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
964 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
965 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
966 << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
970 numIterativeRefinementReps = tempInt;
974 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
978 else if (!strcmp (argv[i], "-rfold")){
981 // number of EM pre-training rounds
982 else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
984 if (!GetInteger (argv[++i], &tempInt)){
985 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
989 if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
990 cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
991 << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
995 numPreTrainingReps = tempInt;
999 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
1004 // the distance of stem candidate
1005 else if (!strcmp (argv[i], "-w")){
1007 if (!GetInteger (argv[++i], &tempInt)){
1008 cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
1012 BandWidth = tempInt;
1016 cerr << "ERROR: Integer expected for option " << argv[i] << endl;
1022 else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
1024 if (!GetFloat (argv[++i], &tempFloat)){
1025 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1030 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
1034 gapOpenPenalty = tempFloat;
1038 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1043 // gap extension penalty
1044 else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
1046 if (!GetFloat (argv[++i], &tempFloat)){
1047 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1052 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
1056 gapContinuePenalty = tempFloat;
1060 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1065 // all-pairs pairwise alignments
1066 else if (!strcmp (argv[i], "-pairs")){
1067 enableAllPairs = true;
1070 // all-pairs pairwise Viterbi alignments
1071 else if (!strcmp (argv[i], "-viterbi")){
1072 enableAllPairs = true;
1073 enableViterbi = true;
1077 else if (!strcmp (argv[i], "-annot")){
1078 enableAnnotation = true;
1080 annotationFilename = argv[++i];
1082 cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
1087 // clustalw output format
1088 else if (!strcmp (argv[i], "-clustalw")){
1089 enableClustalWOutput = true;
1091 // mxscarna output format
1092 else if (!strcmp (argv[i], "-mxscarna")) {
1093 enableMXSCARNAOutput = true;
1095 // stockholm output format
1096 else if (!strcmp (argv[i], "-stockholm")) {
1097 enableStockholmOutput = true;
1099 // web output format
1100 else if (!strcmp (argv[i], "-weboutput")) {
1102 weboutputFileName = string(argv[++i]);
1105 cerr << "ERROR: Invalid following option " << argv[i-1] << ": " << argv[i] << endl;
1109 enableWebOutput = true;
1113 else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){
1115 if (!GetFloat (argv[++i], &tempFloat)){
1116 cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1120 if (tempFloat < 0 || tempFloat > 1){
1121 cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
1129 cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1134 // verbose reporting
1135 else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
1136 enableVerbose = true;
1140 else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){
1141 enableAlignOrder = true;
1143 // McCaskill MEA MODE
1144 else if (!strcmp (argv[i], "-m") || !strcmp (argv[i], "--mccaskillmea")){
1145 enableMcCaskillMEAMode = true;
1149 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
1154 sequenceNames.push_back (string (argv[i]));
1158 if (enableTrainEmissions && !enableTraining){
1159 cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
1163 return sequenceNames;
1166 /////////////////////////////////////////////////////////////////
1169 // Read initial distribution, transition, and emission
1170 // parameters from a file.
1171 /////////////////////////////////////////////////////////////////
1173 void ReadParameters (){
1177 emitPairs = VVF (256, VF (256, 1e-10));
1178 emitSingle = VF (256, 1e-5);
1180 // read initial state distribution and transition parameters
1182 if (parametersInputFilename == string ("")){
1183 if (NumInsertStates == 1){
1184 for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
1185 for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
1186 for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
1188 else if (NumInsertStates == 2){
1189 for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
1190 for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
1191 for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
1194 cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
1195 << " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;
1199 alphabet = alphabetDefault;
1201 for (int i = 0; i < (int) alphabet.length(); i++){
1202 emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
1203 emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
1204 for (int j = 0; j <= i; j++){
1205 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1206 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1207 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1208 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1209 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1210 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1211 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1212 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1217 data.open (parametersInputFilename.c_str());
1219 cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
1224 for (int i = 0; i < 3; i++){
1225 if (!getline (data, line[i])){
1226 cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
1230 istringstream data2;
1231 data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
1232 data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
1233 data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
1235 if (!getline (data, line[0])){
1236 cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
1240 // read alphabet as concatenation of all characters on alphabet line
1243 data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
1245 for (int i = 0; i < (int) alphabet.size(); i++){
1246 for (int j = 0; j <= i; j++){
1249 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1250 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1251 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1252 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1253 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1254 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1255 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1256 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1260 for (int i = 0; i < (int) alphabet.size(); i++){
1263 emitSingle[(unsigned char) tolower(alphabet[i])] = val;
1264 emitSingle[(unsigned char) toupper(alphabet[i])] = val;
1270 /////////////////////////////////////////////////////////////////
1273 // Process the tree recursively. Returns the aligned sequences
1274 // corresponding to a node or leaf of the tree.
1275 /////////////////////////////////////////////////////////////////
1277 MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
1278 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1279 const ProbabilisticModel &model, SafeVector<BPPMatrix*> &BPPMatrices) {
1280 MultiSequence *result;
1282 // check if this is a node of the alignment tree
1283 if (tree->GetSequenceLabel() == -1){
1284 MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model, BPPMatrices);
1285 MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model, BPPMatrices);
1288 assert (alignRight);
1290 result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model, BPPMatrices, tree->GetIdentity());
1297 // otherwise, this is a leaf of the alignment tree
1299 result = new MultiSequence(); assert (result);
1300 result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1306 /////////////////////////////////////////////////////////////////
1307 // ComputeFinalAlignment()
1309 // Compute the final alignment by calling ProcessTree(), then
1310 // performing iterative refinement as needed.
1311 /////////////////////////////////////////////////////////////////
1313 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
1314 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1315 const ProbabilisticModel &model,
1316 SafeVector<BPPMatrix*> &BPPMatrices) {
1318 MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model, BPPMatrices);
1320 if (enableAlignOrder){
1321 alignment->SaveOrdering();
1322 enableAlignOrder = false;
1325 // tree-based refinement
1326 // if you use the function, you can degrade the quality of the software.
1327 // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree, BPPMatrices);
1329 // iterative refinement
1331 for (int i = 0; i < numIterativeRefinementReps; i++)
1332 DoIterativeRefinement (sparseMatrices, model, alignment);
1336 // return final alignment
1340 /////////////////////////////////////////////////////////////////
1341 // AlignAlignments()
1343 // Returns the alignment of two MultiSequence objects.
1344 /////////////////////////////////////////////////////////////////
1346 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
1347 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1348 const ProbabilisticModel &model,
1349 SafeVector<BPPMatrix*> &BPPMatrices, float identity){
1351 // print some info about the alignment
1353 for (int i = 0; i < align1->GetNumSequences(); i++)
1354 cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1356 for (int i = 0; i < align2->GetNumSequences(); i++)
1357 cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1361 VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
1363 pair<SafeVector<char> *, float> alignment;
1364 // choose the alignment routine depending on the "cosmetic" gap penalties used
1365 if (gapOpenPenalty == 0 && gapContinuePenalty == 0) {
1367 if(identity < threshhold) {
1368 std::vector<StemCandidate> *pscs1, *pscs2;
1369 pscs1 = seq2scs(align1, BPPMatrices, BandWidth);
1370 pscs2 = seq2scs(align2, BPPMatrices, BandWidth);
1371 std::vector<int> *matchPSCS1 = new std::vector<int>;
1372 std::vector<int> *matchPSCS2 = new std::vector<int>;
1374 Globaldp globaldp(pscs1, pscs2, align1, align2, matchPSCS1, matchPSCS2, posterior, BPPMatrices);
1375 //float scsScore = globaldp.Run();
1379 removeConflicts(pscs1, pscs2, matchPSCS1, matchPSCS2);
1381 alignment = model.ComputeAlignment2 (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior, pscs1, pscs2, matchPSCS1, matchPSCS2);
1384 alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
1388 alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1389 *posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1390 gapOpenPenalty, gapContinuePenalty);
1397 // compute total length of sequences
1399 for (int i = 0; i < align1->GetNumSequences(); i++)
1400 for (int j = 0; j < align2->GetNumSequences(); j++)
1401 totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1403 // give an "accuracy" measure for the alignment
1404 cerr << alignment.second / totLength << endl;
1407 // now build final alignment
1408 MultiSequence *result = new MultiSequence();
1409 for (int i = 0; i < align1->GetNumSequences(); i++)
1410 result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1411 for (int i = 0; i < align2->GetNumSequences(); i++)
1412 result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1413 if (!enableAlignOrder)
1414 result->SortByLabel();
1416 // free temporary alignment
1417 delete alignment.first;
1422 /////////////////////////////////////////////////////////////////
1425 // Performs one round of the consistency transformation. The
1428 // P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])
1431 // where S = {x, y, all other sequences...}
1433 /////////////////////////////////////////////////////////////////
1435 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
1436 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1437 const int numSeqs = sequences->GetNumSequences();
1439 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
1441 // for every pair of sequences
1442 for (int i = 0; i < numSeqs; i++){
1443 for (int j = i+1; j < numSeqs; j++){
1444 Sequence *seq1 = sequences->GetSequence (i);
1445 Sequence *seq2 = sequences->GetSequence (j);
1448 cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1449 << "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1451 // get the original posterior matrix
1452 VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1453 VF &posterior = *posteriorPtr;
1455 const int seq1Length = seq1->GetLength();
1456 const int seq2Length = seq2->GetLength();
1458 // contribution from the summation where z = x and z = y
1459 for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1462 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1464 // contribution from all other sequences
1465 for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1467 Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior);
1468 else if (k > i && k < j)
1469 Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1471 SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
1472 Relax (sparseMatrices[i][k], temp, posterior);
1477 // now renormalization
1478 for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1480 // mask out positions not originally in the posterior matrix
1481 SparseMatrix *matXY = sparseMatrices[i][j];
1482 for (int y = 0; y <= seq2Length; y++) posterior[y] = 0;
1483 for (int x = 1; x <= seq1Length; x++){
1484 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1485 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1486 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1488 while (XYptr != XYend){
1490 // zero out all cells until the first filled column
1491 while (curr < XYptr->first){
1496 // now, skip over this column
1501 // zero out cells after last column
1502 while (curr <= seq2Length){
1508 // save the new posterior matrix
1509 newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1510 newSparseMatrices[j][i] = NULL;
1513 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1515 delete posteriorPtr;
1518 cerr << "done." << endl;
1522 return newSparseMatrices;
1525 /////////////////////////////////////////////////////////////////
1528 // Computes the consistency transformation for a single sequence
1529 // z, and adds the transformed matrix to "posterior".
1530 /////////////////////////////////////////////////////////////////
1532 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
1537 int lengthX = matXZ->GetSeq1Length();
1538 int lengthY = matZY->GetSeq2Length();
1539 assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1542 for (int i = 1; i <= lengthX; i++){
1543 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1544 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1546 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1548 // iterate through all x[i]-z[k]
1549 while (XZptr != XZend){
1550 SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1551 SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1552 const float XZval = XZptr->second;
1554 // iterate through all z[k]-y[j]
1555 while (ZYptr != ZYend){
1556 base[ZYptr->first] += XZval * ZYptr->second;
1564 /////////////////////////////////////////////////////////////////
1567 // Computes the consistency transformation for a single sequence
1568 // z, and adds the transformed matrix to "posterior".
1569 /////////////////////////////////////////////////////////////////
1571 void Relax1 (SparseMatrix *matZX, SparseMatrix *matZY, VF &posterior){
1576 int lengthZ = matZX->GetSeq1Length();
1577 int lengthY = matZY->GetSeq2Length();
1580 for (int k = 1; k <= lengthZ; k++){
1581 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1582 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1584 // iterate through all z[k]-x[i]
1585 while (ZXptr != ZXend){
1586 SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1587 SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1588 const float ZXval = ZXptr->second;
1589 VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1);
1591 // iterate through all z[k]-y[j]
1592 while (ZYptr != ZYend){
1593 base[ZYptr->first] += ZXval * ZYptr->second;
1601 void DoBasePairProbabilityRelaxation (MultiSequence *sequences,
1602 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1603 SafeVector<BPPMatrix*> &BPPMatrices) {
1604 const int numSeqs = sequences->GetNumSequences();
1606 for (int i = 0; i < numSeqs; i++) {
1607 Sequence *seq1 = sequences->GetSequence (i);
1608 BPPMatrix *seq1BppMatrix = BPPMatrices[seq1->GetLabel()];
1609 Trimat<float> consBppMat(seq1->GetLength() + 1);
1610 int seq1Length = seq1->GetLength();
1612 for (int k = 1; k <= seq1Length; k++) {
1613 for (int l = k; l <= seq1Length; l++) {
1614 consBppMat.ref(k, l) = seq1BppMatrix->GetProb(k, l);
1618 for (int j = i + 1; j < numSeqs; j++) {
1620 // VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior()
1621 Sequence *seq2 = sequences->GetSequence (j);
1622 BPPMatrix *seq2BppMatrix = BPPMatrices[seq2->GetLabel()];
1623 // int seq2Length = seq2->GetLength();
1624 SparseMatrix *matchProb = sparseMatrices[i][j];
1626 // vector<PIF2> &probs1 = seq1BppMatrix->bppMat.data2;
1627 for(int k = 1; k <= seq1Length; k++) {
1628 for(int m = k, n = k; n <= k + 200 && m >= 1 && n <= seq1Length; m--, n++) {
1630 // for (int k = 0; k < (int)probs1.size(); k++) {
1631 // float tmpProb1 = probs1[k].prob;
1632 // int tmp1I = probs1[k].i;
1633 // int tmp1J = probs1[k].j;
1636 vector<PIF2> &probs2 = seq2BppMatrix->bppMat.data2;
1637 for(int l = 0; l < (int)probs2.size(); l++) {
1638 float tmpProb2 = probs2[l].prob;
1639 int tmp2I = probs2[l].i;
1640 int tmp2J = probs2[l].j;
1641 sumProb += matchProb->GetValue(m, tmp2I)*matchProb->GetValue(n, tmp2J)*tmpProb2;
1644 consBppMat.ref(m, n) += sumProb;
1647 for(int m = k, n = k + 1; n <= k + 200 && m >= 1 && n <= seq1Length; m--, n++) {
1649 // for (int k = 0; k < (int)probs1.size(); k++) {
1650 // float tmpProb1 = probs1[k].prob;
1651 // int tmp1I = probs1[k].i;
1652 // int tmp1J = probs1[k].j;
1655 vector<PIF2> &probs2 = seq2BppMatrix->bppMat.data2;
1656 for(int l = 0; l < (int)probs2.size(); l++) {
1657 float tmpProb2 = probs2[l].prob;
1658 int tmp2I = probs2[l].i;
1659 int tmp2J = probs2[l].j;
1660 sumProb += matchProb->GetValue(m, tmp2I)*matchProb->GetValue(n, tmp2J)*tmpProb2;
1663 consBppMat.ref(m, n) += sumProb;
1669 for(int k = 1; k <= seq1Length; k++) {
1670 for(int m = k, n = k; n <= k + 30 && m >= 1 && n <= seq1Length; m--, n++) {
1671 float tmpProb = seq1BppMatrix->GetProb(m, n);
1672 for(int l = 1; l <= seq2Length; l++) {
1673 for(int s = l, t = l; t <= l + 30 && s >= 1 && t <= seq2Length; s--, t++) {
1674 tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1676 for(int s = l, t = l + 1; t <= l + 31 && s >= 1 && t <= seq2Length; s--, t++) {
1677 tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1680 consBppMat.ref(m, n) += tmpProb;
1683 for(int m = k, n = k + 1; n <= k + 31 && m >= 1 && n <= seq1Length; m--, n++) {
1684 float tmpProb = seq1BppMatrix->GetProb(m, n);
1685 for(int l = 1; l <= seq2Length; l++) {
1686 for(int s = l, t = l; t <= l + 30 && s >= 1 && t <= seq2Length; s--, t++) {
1687 tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1689 for(int s = l, t = l + 1; t <= l + 31 && s >= 1 && t <= seq2Length; s--, t++) {
1690 tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1693 consBppMat.ref(m,n) += tmpProb;
1698 for (int m = 1; m <= seq1Length; m++) {
1699 for (int n = m + 4; n <= seq1Length; n++) {
1700 consBppMat.ref(m,n) = consBppMat.ref(m,n)/(float)numSeqs;
1703 seq1BppMatrix->updateBPPMatrix(consBppMat);
1707 /////////////////////////////////////////////////////////////////
1710 // Returns set containing all leaf labels of the current subtree.
1711 /////////////////////////////////////////////////////////////////
1713 set<int> GetSubtree (const TreeNode *tree){
1716 if (tree->GetSequenceLabel() == -1){
1717 s = GetSubtree (tree->GetLeftChild());
1718 set<int> t = GetSubtree (tree->GetRightChild());
1720 for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1724 s.insert (tree->GetSequenceLabel());
1730 /////////////////////////////////////////////////////////////////
1731 // TreeBasedBiPartitioning
1733 // Uses the iterative refinement scheme from MUSCLE.
1734 /////////////////////////////////////////////////////////////////
1736 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1737 const ProbabilisticModel &model, MultiSequence* &alignment,
1738 const TreeNode *tree, SafeVector<BPPMatrix*> &BPPMatrices){
1739 // check if this is a node of the alignment tree
1740 if (tree->GetSequenceLabel() == -1){
1741 TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild(), BPPMatrices);
1742 TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild(), BPPMatrices);
1744 set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1745 set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1746 set<int> leftSubtreeComplement, rightSubtreeComplement;
1748 // calculate complement of each subtree
1749 for (int i = 0; i < alignment->GetNumSequences(); i++){
1750 if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1751 if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1754 // perform realignments for edge to left child
1755 if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1756 MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1757 MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1759 alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model, BPPMatrices, tree->GetLeftChild()->GetIdentity());
1762 // perform realignments for edge to right child
1763 if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1764 MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1765 MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1767 alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model, BPPMatrices, tree->GetRightChild()->GetIdentity());
1772 /////////////////////////////////////////////////////////////////
1773 // DoterativeRefinement()
1775 // Performs a single round of randomized partionining iterative
1777 /////////////////////////////////////////////////////////////////
1779 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1780 const ProbabilisticModel &model, MultiSequence* &alignment){
1781 set<int> groupOne, groupTwo;
1783 // create two separate groups
1784 for (int i = 0; i < alignment->GetNumSequences(); i++){
1786 groupOne.insert (i);
1788 groupTwo.insert (i);
1791 if (groupOne.empty() || groupTwo.empty()) return;
1793 // project into the two groups
1794 MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1795 MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1799 alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1801 delete groupOneSeqs;
1802 delete groupTwoSeqs;
1806 /////////////////////////////////////////////////////////////////
1807 // WriteAnnotation()
1809 // Computes annotation for multiple alignment and write values
1811 /////////////////////////////////////////////////////////////////
1813 void WriteAnnotation (MultiSequence *alignment,
1814 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1815 ofstream outfile (annotationFilename.c_str());
1817 if (outfile.fail()){
1818 cerr << "ERROR: Unable to write annotation file." << endl;
1822 const int alignLength = alignment->GetSequence(0)->GetLength();
1823 const int numSeqs = alignment->GetNumSequences();
1825 SafeVector<int> position (numSeqs, 0);
1826 SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1827 for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1828 SafeVector<pair<int,int> > active;
1829 active.reserve (numSeqs);
1832 for (int i = 1; i <= alignLength; i++){
1834 // find all aligned residues in this particular column
1836 for (int j = 0; j < numSeqs; j++){
1837 if (seqs[j][i] != '-'){
1838 active.push_back (make_pair(j, ++position[j]));
1842 outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1848 /////////////////////////////////////////////////////////////////
1851 // Computes the annotation score for a particular column.
1852 /////////////////////////////////////////////////////////////////
1854 int ComputeScore (const SafeVector<pair<int, int> > &active,
1855 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1857 if (active.size() <= 1) return 0;
1859 // ALTERNATIVE #1: Compute the average alignment score.
1862 for (int i = 0; i < (int) active.size(); i++){
1863 for (int j = i+1; j < (int) active.size(); j++){
1864 val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1868 return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));