1 /***********************************************
2 * # Copyright 2009-2010. Liu Yongchao
3 * # Contact: Liu Yongchao, School of Computer Engineering,
4 * # Nanyang Technological University.
5 * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
7 * # GPL version 3.0 applies.
9 * ************************************************/
24 #include "MSAClusterTree.h"
31 string parametersInputFilename = "";
32 string parametersOutputFilename = "no training";
33 string annotationFilename = "";
35 bool enableVerbose = false;
36 bool enableAnnotation = false;
37 bool enableClustalWOutput = false;
38 bool enableAlignOrder = false;
39 int numConsistencyReps = 2;
40 int numPreTrainingReps = 0;
41 int numIterativeRefinementReps = 100;
45 VF initDistrib(NumMatrixTypes);
46 VF gapOpen(2 * NumInsertStates);
47 VF gapExtend(2 * NumInsertStates);
48 VVF emitPairs(256, VF(256, 1e-10));
49 VF emitSingle(256, 1e-5);
51 string alphabet = alphabetDefault;
53 const int MIN_PRETRAINING_REPS = 0;
54 const int MAX_PRETRAINING_REPS = 20;
55 const int MIN_CONSISTENCY_REPS = 0;
56 const int MAX_CONSISTENCY_REPS = 5;
57 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
58 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
60 string posteriorProbsFilename = "";
61 bool allscores = true;
64 int flag_gui = 0; //0: no gui related o/p
65 //1: gui related o/p generated
66 int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
67 //1: pp score seq added to o/p fasta alignment
69 ///////////////////////////////
70 // global scoring matrix variables
71 //////////////////////////////
72 float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
73 char *aminos, *bases, matrixtype[20] = "gonnet_160";
76 double sub_matrix[26][26];
77 int firstread = 0; //this makes sure that matrices are read only once
79 float TEMPERATURE = 5;
81 int prot_nuc = 0; //0=prot, 1=nucleotide
94 char opt; //can be 'P' or 'M'
99 argument_decl argument;
101 extern inline void read_sustitution_matrix(char *fileName);
102 extern void setmatrixtype(int le);
103 extern inline int matrixtype_to_int();
104 extern inline void read_dna_matrix();
105 extern inline void read_vtml_la_matrix();
106 extern void init_arguments();
108 MSA::MSA(int argc, char* argv[]) {
109 //parse program parameters
110 SafeVector<string> sequenceNames = ParseParams(argc, argv);
112 //initialize arguments for partition function
116 //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
118 //read the input sequences
119 MultiSequence *sequences = new MultiSequence();
121 for (int i = 0; i < (int) sequenceNames.size(); i++) {
122 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
123 sequences->LoadMFA(sequenceNames[i], true);
125 //allocate space for sequence weights
126 this->seqsWeights = new int[sequences->GetNumSequences()];
127 //initilaize parameters for OPENMP
129 if(numThreads <= 0) {
130 numThreads = omp_get_num_procs();
131 cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
133 cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
135 //set OpenMP to use dynamic number of threads which is equal to the number of processor cores on the host
136 omp_set_num_threads(numThreads);
139 // now, we can perform the alignments and write them out
140 MultiSequence *alignment = doAlign(sequences,
141 ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
142 emitSingle), initDistrib, gapOpen, gapExtend, emitPairs,
145 //write the alignment results to standard output
146 if (enableClustalWOutput) {
147 alignment->WriteALN(*alignOutFile);
149 alignment->WriteMFA(*alignOutFile);
152 delete[] this->seqsWeights;
157 /*close the output file*/
158 if (alignOutFileName.length() > 0) {
159 ((std::ofstream*) alignOutFile)->close();
162 /////////////////////////////////////////////////////////////////
165 // Prints MSAPROBS parameters to STDERR. If a filename is
166 // specified, then the parameters are also written to the file.
167 /////////////////////////////////////////////////////////////////
169 void MSA::PrintParameters(const char *message, const VF &initDistrib,
170 const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
171 const VF &emitSingle, const char *filename) {
173 // print parameters to the screen
174 cerr << message << endl << " initDistrib[] = { ";
175 for (int i = 0; i < NumMatrixTypes; i++)
176 cerr << setprecision(10) << initDistrib[i] << " ";
177 cerr << "}" << endl << " gapOpen[] = { ";
178 for (int i = 0; i < NumInsertStates * 2; i++)
179 cerr << setprecision(10) << gapOpen[i] << " ";
180 cerr << "}" << endl << " gapExtend[] = { ";
181 for (int i = 0; i < NumInsertStates * 2; i++)
182 cerr << setprecision(10) << gapExtend[i] << " ";
183 cerr << "}" << endl << endl;
186 for (int i = 0; i < 5; i++){
187 for (int j = 0; j <= i; j++){
188 cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
193 // if a file name is specified
196 // attempt to open the file for writing
197 FILE *file = fopen(filename, "w");
199 cerr << "ERROR: Unable to write parameter file: " << filename
204 // if successful, then write the parameters to the file
205 for (int i = 0; i < NumMatrixTypes; i++)
206 fprintf(file, "%.10f ", initDistrib[i]);
208 for (int i = 0; i < 2 * NumInsertStates; i++)
209 fprintf(file, "%.10f ", gapOpen[i]);
211 for (int i = 0; i < 2 * NumInsertStates; i++)
212 fprintf(file, "%.10f ", gapExtend[i]);
214 fprintf(file, "%s\n", alphabet.c_str());
215 for (int i = 0; i < (int) alphabet.size(); i++) {
216 for (int j = 0; j <= i; j++)
217 fprintf(file, "%.10f ",
218 emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
221 for (int i = 0; i < (int) alphabet.size(); i++)
222 fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
228 /////////////////////////////////////////////////////////////////
231 // First computes all pairwise posterior probability matrices.
232 // Then, computes new parameters if training, or a final
233 // alignment, otherwise.
234 /////////////////////////////////////////////////////////////////
235 extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
236 MultiSequence* MSA::doAlign(MultiSequence *sequences,
237 const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
238 VF &gapExtend, VVF &emitPairs, VF &emitSingle) {
241 //get the number of sequences
242 const int numSeqs = sequences->GetNumSequences();
244 //create distance matrix
245 VVF probalign_distances(numSeqs, VF(numSeqs, 0));
246 VVF distances(numSeqs, VF(numSeqs, 0));//msa
248 float gl_accuracy = 0;
249 //creat sparseMatrices
250 SafeVector<SafeVector<SparseMatrix *> > probalign_sparseMatrices(numSeqs,
251 SafeVector<SparseMatrix *>(numSeqs, NULL));
252 SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
253 SafeVector<SparseMatrix *>(numSeqs, NULL)); // msa
256 //calculate sequence pairs for openmp model
258 numPairs = (numSeqs - 1) * numSeqs / 2;
259 seqsPairs = new SeqsPair[numPairs];
260 for(int a = 0; a < numSeqs; a++) {
261 for(int b = a + 1; b < numSeqs; b++) {
262 seqsPairs[pairIdx].seq1 = a;
263 seqsPairs[pairIdx].seq2 = b;
268 // do all pairwise alignments for posterior probability matrices
270 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
271 for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
272 int a= seqsPairs[pairIdx].seq1;
273 int b = seqsPairs[pairIdx].seq2;
276 cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
279 for (int a = 0; a < numSeqs - 1; a++) {
280 for (int b = a + 1; b < numSeqs; b++) {
282 Sequence *seq1 = sequences->GetSequence(a);
283 Sequence *seq2 = sequences->GetSequence(b);
287 cerr << "Computing posterior matrix: (" << a + 1 << ") "
288 << seq1->GetHeader() << " vs. " << "(" << b + 1 << ") "
289 << seq2->GetHeader() << " -- ";
293 // compute forward and backward probabilities
294 VF *forward = model.ComputeForwardMatrix(seq1, seq2);
296 VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
298 // compute posterior probability matrix from HMM
299 VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward);
300 assert(probcons_posterior);
305 VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
306 assert(probalign_posterior);
307 probalign_sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),seq2->GetLength(), *probalign_posterior);
308 probalign_sparseMatrices[b][a] = NULL;
309 pair<SafeVector<char> *, float> probalign_alignment = model.ComputeAlignment(
310 seq1->GetLength(), seq2->GetLength(), *probalign_posterior);
311 probalign_distances[a][b] =1.0f - probalign_alignment.second / min(seq1->GetLength(), seq2->GetLength());
312 delete probalign_alignment.first;
315 forward = model.ComputeForwardMatrix(seq1, seq2,false);
317 backward = model.ComputeBackwardMatrix(seq1, seq2,false);
319 VF* local_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
324 //merge probalign + local + probcons
325 VF::iterator ptr1 = probcons_posterior->begin();
326 VF::iterator ptr2 = probalign_posterior->begin();
327 VF::iterator ptr3 = local_posterior->begin();
328 VF* posterior = new VF((seq1->GetLength()+1) * (seq2->GetLength()+1)); assert (posterior); //msa
329 VF::iterator ptr = posterior->begin();
330 for (int i = 0; i <= seq1->GetLength(); i++) {
331 for (int j = 0; j <= seq2->GetLength(); j++) {
335 *ptr = sqrt(v1*v1 + v2*v2 + v3*v3);
342 // perform the pairwise sequence alignment
343 pair<SafeVector<char> *, float> gl_alignment = model.ComputeAlignment(
344 seq1->GetLength(), seq2->GetLength(), *posterior);
346 //compute expected accuracy
347 distances[a][b] = distances[b][a] = 1.0f - gl_alignment.second
348 / (3*min(seq1->GetLength(), seq2->GetLength()));
350 // compute sparse representations
351 sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
352 seq2->GetLength(), *posterior);
353 sparseMatrices[b][a] = NULL;
355 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
356 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
357 float N_correct_match = 0;
359 for (SafeVector<char>::iterator iter = gl_alignment.first->begin();
360 iter != gl_alignment.first->end(); ++iter){
362 unsigned char c1 = (unsigned char) iter1[i++];
363 unsigned char c2 = (unsigned char) iter2[j++];
364 if(c1==c2) N_correct_match += 1;
366 else if(*iter == 'X') i++;
367 else if(*iter == 'Y') j++;
369 if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "GL"<< endl;
370 gl_accuracy += N_correct_match / min(seq1->GetLength(), seq2->GetLength());
372 delete probcons_posterior;
373 delete probalign_posterior;
374 delete local_posterior;
383 gl_accuracy /= numPairs;
384 if(gl_accuracy > 0.4){
385 for (int a = 0; a < numSeqs - 1; a++)
386 for (int b = a + 1; b < numSeqs; b++) {
387 distances[a][b] = distances[b][a] = probalign_distances[a][b];
388 sparseMatrices[a][b] = probalign_sparseMatrices[a][b];
389 sparseMatrices[b][a] = NULL;
393 //create the guide tree
394 this->tree = new MSAClusterTree(this, distances, numSeqs);
395 this->tree->create();
397 // perform the consistency transformation the desired number of times
398 float* fweights = new float[numSeqs];
399 for (int r = 0; r < numSeqs; r++) {
400 fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
403 for (int r = 0; r < numConsistencyReps; r++) {
404 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
405 DoRelaxation(fweights, sequences, sparseMatrices);
407 // now replace the old posterior matrices
408 for (int i = 0; i < numSeqs; i++) {
409 for (int j = 0; j < numSeqs; j++) {
410 delete sparseMatrices[i][j];
411 sparseMatrices[i][j] = newSparseMatrices[i][j];
420 //compute the final multiple sequence alignment
421 MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
422 sparseMatrices, model);
425 if (enableAnnotation) {
426 WriteAnnotation(finalAlignment, sparseMatrices);
428 //destroy the guide tree
432 // delete sparse matrices
433 for (int a = 0; a < numSeqs - 1; a++) {
434 for (int b = a + 1; b < numSeqs; b++) {
435 delete sparseMatrices[a][b];
436 delete sparseMatrices[b][a];
440 return finalAlignment;
443 /////////////////////////////////////////////////////////////////
446 // Attempts to parse an integer from the character string given.
447 // Returns true only if no parsing error occurs.
448 /////////////////////////////////////////////////////////////////
450 bool GetInteger(char *data, int *val) {
457 retVal = strtol(data, &endPtr, 0);
458 if (retVal == 0 && (errno != 0 || data == endPtr))
460 if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN))
462 if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
468 /////////////////////////////////////////////////////////////////
471 // Attempts to parse a float from the character string given.
472 // Returns true only if no parsing error occurs.
473 /////////////////////////////////////////////////////////////////
475 bool GetFloat(char *data, float *val) {
482 retVal = strtod(data, &endPtr);
483 if (retVal == 0 && (errno != 0 || data == endPtr))
485 if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
487 *val = (float) retVal;
491 /////////////////////////////////////////////////////////////////
494 // Read initial distribution, transition, and emission
495 // parameters from a file.
496 /////////////////////////////////////////////////////////////////
498 void MSA::ReadParameters() {
502 emitPairs = VVF(256, VF(256, 1e-10));
503 emitSingle = VF(256, 1e-5);
505 // read initial state distribution and transition parameters
506 if (parametersInputFilename == string("")) {
507 if (NumInsertStates == 1) {
508 for (int i = 0; i < NumMatrixTypes; i++)
509 initDistrib[i] = initDistrib1Default[i];
510 for (int i = 0; i < 2 * NumInsertStates; i++)
511 gapOpen[i] = gapOpen1Default[i];
512 for (int i = 0; i < 2 * NumInsertStates; i++)
513 gapExtend[i] = gapExtend1Default[i];
514 } else if (NumInsertStates == 2) {
515 for (int i = 0; i < NumMatrixTypes; i++)
516 initDistrib[i] = initDistrib2Default[i];
517 for (int i = 0; i < 2 * NumInsertStates; i++)
518 gapOpen[i] = gapOpen2Default[i];
519 for (int i = 0; i < 2 * NumInsertStates; i++)
520 gapExtend[i] = gapExtend2Default[i];
523 << "ERROR: No default initial distribution/parameter settings exist"
524 << endl << " for " << NumInsertStates
525 << " pairs of insert states. Use --paramfile." << endl;
529 alphabet = alphabetDefault;
531 for (int i = 0; i < (int) alphabet.length(); i++) {
532 emitSingle[(unsigned char) tolower(alphabet[i])] =
533 emitSingleDefault[i];
534 emitSingle[(unsigned char) toupper(alphabet[i])] =
535 emitSingleDefault[i];
536 for (int j = 0; j <= i; j++) {
537 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
538 alphabet[j])] = emitPairsDefault[i][j];
539 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
540 alphabet[j])] = emitPairsDefault[i][j];
541 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
542 alphabet[j])] = emitPairsDefault[i][j];
543 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
544 alphabet[j])] = emitPairsDefault[i][j];
545 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
546 alphabet[i])] = emitPairsDefault[i][j];
547 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
548 alphabet[i])] = emitPairsDefault[i][j];
549 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
550 alphabet[i])] = emitPairsDefault[i][j];
551 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
552 alphabet[i])] = emitPairsDefault[i][j];
556 data.open(parametersInputFilename.c_str());
558 cerr << "ERROR: Unable to read parameter file: "
559 << parametersInputFilename << endl;
564 for (int i = 0; i < 3; i++) {
565 if (!getline(data, line[i])) {
567 << "ERROR: Unable to read transition parameters from parameter file: "
568 << parametersInputFilename << endl;
575 for (int i = 0; i < NumMatrixTypes; i++)
576 data2 >> initDistrib[i];
579 for (int i = 0; i < 2 * NumInsertStates; i++)
583 for (int i = 0; i < 2 * NumInsertStates; i++)
584 data2 >> gapExtend[i];
586 if (!getline(data, line[0])) {
587 cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
588 << parametersInputFilename << endl;
592 // read alphabet as concatenation of all characters on alphabet line
597 while (data2 >> token)
600 for (int i = 0; i < (int) alphabet.size(); i++) {
601 for (int j = 0; j <= i; j++) {
604 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
606 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
608 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
610 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
612 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
614 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
616 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
618 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
623 for (int i = 0; i < (int) alphabet.size(); i++) {
626 emitSingle[(unsigned char) tolower(alphabet[i])] = val;
627 emitSingle[(unsigned char) toupper(alphabet[i])] = val;
633 /////////////////////////////////////////////////////////////////
636 // Parse all command-line options.
637 /////////////////////////////////////////////////////////////////
638 void MSA::printUsage() {
640 << "************************************************************************"
642 << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
644 << "\tbased on pair hidden markov model and partition function postirior"
646 << "\tprobabilities. If any comments or problems, please contact"
648 << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
650 << "*************************************************************************"
651 << endl << "Usage:" << endl
652 << " msaprobs [OPTION]... [infile]..." << endl << endl
653 << "Description:" << endl
654 << " Align sequences in multi-FASTA format" << endl << endl
655 << " -o, --outfile <string>" << endl
656 << " specify the output file name (STDOUT by default)"
657 << endl << " -num_threads <integer>" << endl
658 << " specify the number of threads used, and otherwise detect automatically"
659 << endl << " -clustalw" << endl
660 << " use CLUSTALW output format instead of FASTA format"
661 << endl << endl << " -c, --consistency REPS" << endl
662 << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
663 << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
664 << ") passes of consistency transformation" << endl << endl
665 << " -ir, --iterative-refinement REPS" << endl
666 << " use " << MIN_ITERATIVE_REFINEMENT_REPS
667 << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
668 << numIterativeRefinementReps << ") passes of iterative-refinement"
669 << endl << endl << " -v, --verbose" << endl
670 << " report progress while aligning (default: "
671 << (enableVerbose ? "on" : "off") << ")" << endl << endl
672 << " -annot FILENAME" << endl
673 << " write annotation for multiple alignment to FILENAME"
674 << endl << endl << " -a, --alignment-order" << endl
675 << " print sequences in alignment order rather than input order (default: "
676 << (enableAlignOrder ? "on" : "off") << ")" << endl
677 << " -version " << endl
678 << " print out version of MSAPROBS " << endl << endl;
680 SafeVector<string> MSA::ParseParams(int argc, char **argv) {
685 SafeVector<string> sequenceNames;
689 for (int i = 1; i < argc; i++) {
690 if (argv[i][0] == '-') {
692 if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) {
696 } else if (!strcmp(argv[i], "-o")
697 || !strcmp(argv[i], "--outfile")) {
699 alignOutFileName = argv[++i]; //get the file name
701 cerr << "ERROR: String expected for option " << argv[i]
706 } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
708 parametersInputFilename = string (argv[++i]);
710 cerr << "ERROR: Filename expected for option " << argv[i] << endl;
713 //number of threads used
714 } else if (!strcmp(argv[i], "-p")
715 || !strcmp(argv[i], "-num_threads")) {
717 if (!GetInteger(argv[++i], &tempInt)) {
718 cerr << " ERROR: invalid integer following option "
719 << argv[i - 1] << ": " << argv[i] << endl;
725 numThreads = tempInt;
728 cerr << "ERROR: Integer expected for option " << argv[i]
732 // number of consistency transformations
733 } else if (!strcmp(argv[i], "-c")
734 || !strcmp(argv[i], "--consistency")) {
736 if (!GetInteger(argv[++i], &tempInt)) {
737 cerr << "ERROR: Invalid integer following option "
738 << argv[i - 1] << ": " << argv[i] << endl;
741 if (tempInt < MIN_CONSISTENCY_REPS
742 || tempInt > MAX_CONSISTENCY_REPS) {
743 cerr << "ERROR: For option " << argv[i - 1]
744 << ", integer must be between "
745 << MIN_CONSISTENCY_REPS << " and "
746 << MAX_CONSISTENCY_REPS << "." << endl;
749 numConsistencyReps = tempInt;
753 cerr << "ERROR: Integer expected for option " << argv[i]
759 // number of randomized partitioning iterative refinement passes
760 else if (!strcmp(argv[i], "-ir")
761 || !strcmp(argv[i], "--iterative-refinement")) {
763 if (!GetInteger(argv[++i], &tempInt)) {
764 cerr << "ERROR: Invalid integer following option "
765 << argv[i - 1] << ": " << argv[i] << endl;
768 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
769 || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
770 cerr << "ERROR: For option " << argv[i - 1]
771 << ", integer must be between "
772 << MIN_ITERATIVE_REFINEMENT_REPS << " and "
773 << MAX_ITERATIVE_REFINEMENT_REPS << "."
777 numIterativeRefinementReps = tempInt;
780 cerr << "ERROR: Integer expected for option " << argv[i]
787 else if (!strcmp(argv[i], "-annot")) {
788 enableAnnotation = true;
790 annotationFilename = argv[++i];
792 cerr << "ERROR: FILENAME expected for option " << argv[i]
798 // clustalw output format
799 else if (!strcmp(argv[i], "-clustalw")) {
800 enableClustalWOutput = true;
804 else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) {
806 if (!GetFloat(argv[++i], &tempFloat)) {
808 << "ERROR: Invalid floating-point value following option "
809 << argv[i - 1] << ": " << argv[i] << endl;
812 if (tempFloat < 0 || tempFloat > 1) {
813 cerr << "ERROR: For option " << argv[i - 1]
814 << ", floating-point value must be between 0 and 1."
821 cerr << "ERROR: Floating-point value expected for option "
828 else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
829 enableVerbose = true;
833 else if (!strcmp(argv[i], "-a")
834 || !strcmp(argv[i], "--alignment-order")) {
835 enableAlignOrder = true;
839 else if (!strcmp(argv[i], "-version")) {
840 cerr << "MSAPROBS version " << VERSION << endl;
845 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
849 sequenceNames.push_back(string(argv[i]));
853 /*check the output file name*/
854 cerr << "-------------------------------------" << endl;
855 if (alignOutFileName.length() == 0) {
856 cerr << "The final alignments will be printed out to STDOUT" << endl;
857 alignOutFile = &std::cout;
859 cerr << "Open the output file " << alignOutFileName << endl;
860 alignOutFile = new ofstream(alignOutFileName.c_str(),
861 ios::binary | ios::out | ios::trunc);
863 cerr << "-------------------------------------" << endl;
864 return sequenceNames;
867 /////////////////////////////////////////////////////////////////
870 // Process the tree recursively. Returns the aligned sequences
871 // corresponding to a node or leaf of the tree.
872 /////////////////////////////////////////////////////////////////
873 MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
874 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
875 const ProbabilisticModel &model) {
877 MultiSequence *result;
879 // check if this is a node of the alignment tree
880 //if (tree->GetSequenceLabel() == -1){
881 if (tree->leaf == NODE) {
882 MultiSequence *alignLeft = ProcessTree(tree->left, sequences,
883 sparseMatrices, model);
884 MultiSequence *alignRight = ProcessTree(tree->right, sequences,
885 sparseMatrices, model);
890 result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model);
897 // otherwise, this is a leaf of the alignment tree
899 result = new MultiSequence();
901 //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
902 result->AddSequence(sequences->GetSequence(tree->idx)->Clone());
908 /////////////////////////////////////////////////////////////////
909 // ComputeFinalAlignment()
911 // Compute the final alignment by calling ProcessTree(), then
912 // performing iterative refinement as needed.
913 /////////////////////////////////////////////////////////////////
915 MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
916 MultiSequence *sequences,
917 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
918 const ProbabilisticModel &model) {
919 MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
920 sparseMatrices, model);
922 SafeVector<int> oldOrdering;
923 if (enableAlignOrder) {
924 for (int i = 0; i < alignment->GetNumSequences(); i++)
925 oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel());
926 alignment->SaveOrdering();
927 enableAlignOrder = false;
930 // tree-based refinement
931 // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
933 int numSeqs = alignment->GetNumSequences();
934 //if(numSeqs < numIterativeRefinementReps){
935 for(int iter = 0; iter < 5; iter ++){
936 for(int i = 0; i < numSeqs - 1; i++){
937 DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i);
941 //Refinement return false:no improvement
942 for (int i = 0; i < numIterativeRefinementReps; i++) {
943 DoIterativeRefinement(sparseMatrices, model, alignment);
947 if (oldOrdering.size() > 0) {
948 for (int i = 0; i < (int) oldOrdering.size(); i++) {
949 alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
953 // return final alignment
957 /////////////////////////////////////////////////////////////////
960 // Returns the alignment of two MultiSequence objects.
961 /////////////////////////////////////////////////////////////////
963 MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
964 MultiSequence *align2,
965 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
966 const ProbabilisticModel &model) {
968 // print some info about the alignment
970 for (int i = 0; i < align1->GetNumSequences(); i++)
971 cerr << ((i == 0) ? "[" : ",")
972 << align1->GetSequence(i)->GetLabel();
974 for (int i = 0; i < align2->GetNumSequences(); i++)
975 cerr << ((i == 0) ? "[" : ",")
976 << align2->GetSequence(i)->GetLabel();
980 VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
982 VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
983 sparseMatrices, cutoff);
985 // compute an "accuracy" measure for the alignment before refinement
987 pair<SafeVector<char> *, float> alignment;
989 alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(),
990 align2->GetSequence(0)->GetLength(), *posterior);
996 // compute total length of sequences
998 for (int i = 0; i < align1->GetNumSequences(); i++)
999 for (int j = 0; j < align2->GetNumSequences(); j++)
1000 totLength += min(align1->GetSequence(i)->GetLength(),
1001 align2->GetSequence(j)->GetLength());
1003 // give an "accuracy" measure for the alignment
1004 cerr << alignment.second / totLength << endl;
1007 // now build final alignment
1008 MultiSequence *result = new MultiSequence();
1009 for (int i = 0; i < align1->GetNumSequences(); i++)
1010 result->AddSequence(
1011 align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1012 for (int i = 0; i < align2->GetNumSequences(); i++)
1013 result->AddSequence(
1014 align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1015 if (!enableAlignOrder)
1016 result->SortByLabel();
1018 // free temporary alignment
1019 delete alignment.first;
1024 /////////////////////////////////////////////////////////////////
1027 // Performs one round of the weighted probabilistic consistency transformation.
1029 /////////////////////////////////////////////////////////////////
1031 SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
1032 MultiSequence *sequences,
1033 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1034 const int numSeqs = sequences->GetNumSequences();
1036 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
1037 SafeVector<SparseMatrix *>(numSeqs, NULL));
1039 // for every pair of sequences
1042 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
1043 for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
1044 int i = seqsPairs[pairIdx].seq1;
1045 int j = seqsPairs[pairIdx].seq2;
1046 float wi = seqsWeights[i];
1047 float wj = seqsWeights[j];
1049 for (int i = 0; i < numSeqs; i++) {
1050 float wi = seqsWeights[i];
1051 for (int j = i + 1; j < numSeqs; j++) {
1052 float wj = seqsWeights[j];
1054 Sequence *seq1 = sequences->GetSequence(i);
1055 Sequence *seq2 = sequences->GetSequence(j);
1057 if (enableVerbose) {
1059 #pragma omp critical
1061 cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
1062 << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
1065 // get the original posterior matrix
1066 VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
1067 assert(posteriorPtr);
1068 VF &posterior = *posteriorPtr;
1070 const int seq1Length = seq1->GetLength();
1071 const int seq2Length = seq2->GetLength();
1073 // contribution from the summation where z = x and z = y
1074 float w = wi * wi * wj + wi * wj * wj;
1076 for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1077 //posterior[k] = w*posterior[k];
1078 posterior[k] += posterior[k];
1082 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1084 // contribution from all other sequences
1085 for (int k = 0; k < numSeqs; k++) {
1086 if (k != i && k != j) {
1087 float wk = seqsWeights[k];
1088 float w = wi * wj * wk;
1091 Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
1093 else if (k > i && k < j)
1094 Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
1097 SparseMatrix *temp =
1098 sparseMatrices[j][k]->ComputeTranspose();
1099 Relax(w, sparseMatrices[i][k], temp, posterior);
1104 //cerr<<"sumW "<<sumW<<endl;
1105 for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1106 //posterior[k] /= sumW;
1107 posterior[k] /= numSeqs;
1109 // mask out positions not originally in the posterior matrix
1110 SparseMatrix *matXY = sparseMatrices[i][j];
1111 for (int y = 0; y <= seq2Length; y++)
1113 for (int x = 1; x <= seq1Length; x++) {
1114 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1115 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1116 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1118 while (XYptr != XYend) {
1120 // zero out all cells until the first filled column
1121 while (curr < XYptr->first) {
1126 // now, skip over this column
1131 // zero out cells after last column
1132 while (curr <= seq2Length) {
1138 // save the new posterior matrix
1139 newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
1140 seq2->GetLength(), posterior);
1141 newSparseMatrices[j][i] = NULL;
1144 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1146 delete posteriorPtr;
1149 cerr << "done." << endl;
1155 return newSparseMatrices;
1158 /////////////////////////////////////////////////////////////////
1161 // Computes the consistency transformation for a single sequence
1162 // z, and adds the transformed matrix to "posterior".
1163 /////////////////////////////////////////////////////////////////
1165 void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
1171 int lengthX = matXZ->GetSeq1Length();
1172 int lengthY = matZY->GetSeq2Length();
1173 assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1176 for (int i = 1; i <= lengthX; i++) {
1177 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1178 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1180 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1182 // iterate through all x[i]-z[k]
1183 while (XZptr != XZend) {
1184 SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1185 SafeVector<PIF>::iterator ZYend = ZYptr
1186 + matZY->GetRowSize(XZptr->first);
1187 const float XZval = XZptr->second;
1189 // iterate through all z[k]-y[j]
1190 while (ZYptr != ZYend) {
1191 //base[ZYptr->first] += weight * XZval * ZYptr->second;
1192 base[ZYptr->first] += XZval * ZYptr->second;
1200 /////////////////////////////////////////////////////////////////
1203 // Computes the consistency transformation for a single sequence
1204 // z, and adds the transformed matrix to "posterior".
1205 /////////////////////////////////////////////////////////////////
1207 void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
1213 int lengthZ = matZX->GetSeq1Length();
1214 int lengthY = matZY->GetSeq2Length();
1217 for (int k = 1; k <= lengthZ; k++) {
1218 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1219 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1221 // iterate through all z[k]-x[i]
1222 while (ZXptr != ZXend) {
1223 SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1224 SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1225 const float ZXval = ZXptr->second;
1226 VF::iterator base = posterior.begin()
1227 + ZXptr->first * (lengthY + 1);
1229 // iterate through all z[k]-y[j]
1230 while (ZYptr != ZYend) {
1231 //base[ZYptr->first] += weight * ZXval * ZYptr->second;
1232 base[ZYptr->first] += ZXval * ZYptr->second;
1239 /////////////////////////////////////////////////////////////////
1240 // DoIterativeRefinement()
1242 // Performs a single round of randomized partionining iterative
1244 /////////////////////////////////////////////////////////////////
1246 void MSA::DoIterativeRefinement(
1247 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1248 const ProbabilisticModel &model, MultiSequence* &alignment) {
1249 set<int> groupOne, groupTwo;
1250 int numSeqs = alignment->GetNumSequences();
1252 // create two separate groups
1253 for (int i = 0; i < numSeqs; i++) {
1261 if (groupOne.empty() || groupTwo.empty()) return;
1263 // project into the two groups
1264 MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1265 assert(groupOneSeqs);
1266 MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1267 assert(groupTwoSeqs);
1269 //start add by Yongtao
1271 VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
1273 VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs,
1274 sparseMatrices, cutoff);
1277 // compute an "accuracy" measure for the alignment before refinement
1278 SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
1279 SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
1281 for (set<int>::const_iterator iter = groupOne.begin();
1282 iter != groupOne.end(); ++iter) {
1283 oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1286 for (set<int>::const_iterator iter = groupTwo.begin();
1287 iter != groupTwo.end(); ++iter) {
1288 oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1291 VF &posteriorArr = *posterior;
1292 int oldLength = alignment->GetSequence(0)->GetLength();
1293 int groupOneindex=0; int groupTwoindex=0;
1294 float accuracy_before = 0;
1295 for (int i = 1; i <= oldLength; i++) {
1296 // check to see if there is a gap in every sequence of the set
1297 bool foundOne = false;
1298 for (int j = 0; !foundOne && j < (int) groupOne.size(); j++)
1299 foundOne = (oldOnePtrs[j][i] != '-');
1300 // if not, then this column counts towards the sequence length
1301 if (foundOne) groupOneindex ++;
1302 bool foundTwo = false;
1303 for (int j = 0; !foundTwo && j < (int) groupTwo.size(); j++)
1304 foundTwo = (oldTwoPtrs[j][i] != '-');
1305 // if not, then this column counts towards the sequence length
1306 if (foundTwo) groupTwoindex ++;
1307 if(foundOne && foundTwo) accuracy_before +=
1308 posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex];
1311 pair<SafeVector<char> *, float> refinealignment;
1313 refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(),
1314 groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
1316 // now build final alignment
1317 MultiSequence *result = new MultiSequence();
1318 //compare accuracy measure before and after refinement
1319 //if (refinealignment.second > accuracy_before) {
1320 //cerr<<"Before:" << accuracy_before<<" after: "<< refinealignment.second<< endl;
1321 for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++)
1322 result->AddSequence(
1323 groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X'));
1324 for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++)
1325 result->AddSequence(
1326 groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y'));
1327 // free temporary alignment
1328 delete refinealignment.first;
1334 if(numIterativeRefinementReps < 8*numSeqs) numIterativeRefinementReps++;
1335 delete groupOneSeqs;
1336 delete groupTwoSeqs;
1340 //end add by yongtao
1344 alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, model); //original
1345 delete groupOneSeqs;
1346 delete groupTwoSeqs;
1350 void MSA::DoIterativeRefinementTreeNode(
1351 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1352 const ProbabilisticModel &model, MultiSequence* &alignment,
1354 set<int> groupOne, groupTwo;
1355 int numSeqs = alignment->GetNumSequences();
1357 vector<bool> inGroup1;
1358 inGroup1.resize(numSeqs);
1359 for (int i = 0; i < numSeqs; i++) {
1360 inGroup1[i] = false;
1363 AlignmentOrder* orders = this->tree->getAlignOrders();
1364 AlignmentOrder* order = &orders[nodeIndex];
1365 for (int i = 0; i < order->leftNum; i++) {
1366 int si = order->leftLeafs[i];
1367 inGroup1[si] = true;
1369 for (int i = 0; i < order->rightNum; i++) {
1370 int si = order->rightLeafs[i];
1371 inGroup1[si] = true;
1373 // create two separate groups
1374 for (int i = 0; i < numSeqs; i++) {
1381 if (groupOne.empty() || groupTwo.empty())
1384 // project into the two groups
1385 MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1386 assert(groupOneSeqs);
1387 MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1388 assert(groupTwoSeqs);
1392 alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
1395 delete groupOneSeqs;
1396 delete groupTwoSeqs;
1399 /////////////////////////////////////////////////////////////////
1400 // WriteAnnotation()
1402 // Computes annotation for multiple alignment and write values
1404 /////////////////////////////////////////////////////////////////
1406 void MSA::WriteAnnotation(MultiSequence *alignment,
1407 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1408 ofstream outfile(annotationFilename.c_str());
1410 if (outfile.fail()) {
1411 cerr << "ERROR: Unable to write annotation file." << endl;
1415 const int alignLength = alignment->GetSequence(0)->GetLength();
1416 const int numSeqs = alignment->GetNumSequences();
1418 SafeVector<int> position(numSeqs, 0);
1419 SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
1420 for (int i = 0; i < numSeqs; i++)
1421 seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1422 SafeVector<pair<int, int> > active;
1423 active.reserve(numSeqs);
1425 SafeVector<int> lab;
1426 for (int i = 0; i < numSeqs; i++)
1427 lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1430 for (int i = 1; i <= alignLength; i++) {
1432 // find all aligned residues in this particular column
1434 for (int j = 0; j < numSeqs; j++) {
1435 if (seqs[j][i] != '-') {
1436 active.push_back(make_pair(lab[j], ++position[j]));
1440 sort(active.begin(), active.end());
1441 outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
1447 /////////////////////////////////////////////////////////////////
1450 // Computes the annotation score for a particular column.
1451 /////////////////////////////////////////////////////////////////
1453 int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
1454 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1456 if (active.size() <= 1)
1459 // ALTERNATIVE #1: Compute the average alignment score.
1462 for (int i = 0; i < (int) active.size(); i++) {
1463 for (int j = i + 1; j < (int) active.size(); j++) {
1464 val += sparseMatrices[active[i].first][active[j].first]->GetValue(
1465 active[i].second, active[j].second);
1469 return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));