1 /***********************************************
2 * # Copyright 2009-2010. Liu Yongchao
3 * # Contact: Liu Yongchao, School of Computer Engineering,
4 * # Nanyang Technological University.
5 * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
7 * # GPL version 3.0 applies.
9 * ************************************************/
24 #include "MSAClusterTree.h"
31 string parametersInputFilename = "";
32 string parametersOutputFilename = "no training";
33 string annotationFilename = "";
35 bool enableVerbose = false;
36 bool enableAnnotation = false;
37 bool enableClustalWOutput = false;
38 bool enableAlignOrder = false;
39 int numConsistencyReps = 2;
40 int numPreTrainingReps = 0;
41 int numIterativeRefinementReps = 10;
45 VF initDistrib(NumMatrixTypes);
46 VF gapOpen(2 * NumInsertStates);
47 VF gapExtend(2 * NumInsertStates);
48 VVF emitPairs(256, VF(256, 1e-10));
49 VF emitSingle(256, 1e-5);
51 string alphabet = alphabetDefault;
53 const int MIN_PRETRAINING_REPS = 0;
54 const int MAX_PRETRAINING_REPS = 20;
55 const int MIN_CONSISTENCY_REPS = 0;
56 const int MAX_CONSISTENCY_REPS = 5;
57 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
58 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
60 string posteriorProbsFilename = "";
61 bool allscores = true;
64 int flag_gui = 0; //0: no gui related o/p
65 //1: gui related o/p generated
66 int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
67 //1: pp score seq added to o/p fasta alignment
69 ///////////////////////////////
70 // global scoring matrix variables
71 //////////////////////////////
72 float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
73 char *aminos, *bases, matrixtype[20] = "gonnet_160";
76 double sub_matrix[26][26];
77 int firstread = 0; //this makes sure that matrices are read only once
79 float TEMPERATURE = 5;
81 int prot_nuc = 0; //0=prot, 1=nucleotide
94 char opt; //can be 'P' or 'M'
99 argument_decl argument;
101 extern inline void read_sustitution_matrix(char *fileName);
102 extern void setmatrixtype(int le);
103 extern inline int matrixtype_to_int();
104 extern inline void read_dna_matrix();
105 extern inline void read_vtml_la_matrix();
106 extern void init_arguments();
108 MSA::MSA(int argc, char* argv[]) {
109 //parse program parameters
110 SafeVector<string> sequenceNames = ParseParams(argc, argv);
112 //initialize arguments for partition function
116 //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
118 //read the input sequences
119 MultiSequence *sequences = new MultiSequence();
121 for (int i = 0; i < (int) sequenceNames.size(); i++) {
122 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
123 sequences->LoadMFA(sequenceNames[i], true);
125 //allocate space for sequence weights
126 this->seqsWeights = new int[sequences->GetNumSequences()];
127 //initilaize parameters for OPENMP
129 if(numThreads <= 0) {
130 numThreads = omp_get_num_procs();
131 cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
133 cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
135 //set OpenMP to use dynamic number of threads which is equal to the number of processor cores on the host
136 omp_set_num_threads(numThreads);
139 // now, we can perform the alignments and write them out
140 MultiSequence *alignment = doAlign(sequences,
141 ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
142 emitSingle), initDistrib, gapOpen, gapExtend, emitPairs,
145 //write the alignment results to standard output
146 if (enableClustalWOutput) {
147 alignment->WriteALN(*alignOutFile);
149 alignment->WriteMFA(*alignOutFile);
152 delete[] this->seqsWeights;
157 /*close the output file*/
158 if (alignOutFileName.length() > 0) {
159 ((std::ofstream*) alignOutFile)->close();
162 /////////////////////////////////////////////////////////////////
165 // Prints MSAPROBS parameters to STDERR. If a filename is
166 // specified, then the parameters are also written to the file.
167 /////////////////////////////////////////////////////////////////
169 void MSA::PrintParameters(const char *message, const VF &initDistrib,
170 const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
171 const VF &emitSingle, const char *filename) {
173 // print parameters to the screen
174 cerr << message << endl << " initDistrib[] = { ";
175 for (int i = 0; i < NumMatrixTypes; i++)
176 cerr << setprecision(10) << initDistrib[i] << " ";
177 cerr << "}" << endl << " gapOpen[] = { ";
178 for (int i = 0; i < NumInsertStates * 2; i++)
179 cerr << setprecision(10) << gapOpen[i] << " ";
180 cerr << "}" << endl << " gapExtend[] = { ";
181 for (int i = 0; i < NumInsertStates * 2; i++)
182 cerr << setprecision(10) << gapExtend[i] << " ";
183 cerr << "}" << endl << endl;
186 for (int i = 0; i < 5; i++){
187 for (int j = 0; j <= i; j++){
188 cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
193 // if a file name is specified
196 // attempt to open the file for writing
197 FILE *file = fopen(filename, "w");
199 cerr << "ERROR: Unable to write parameter file: " << filename
204 // if successful, then write the parameters to the file
205 for (int i = 0; i < NumMatrixTypes; i++)
206 fprintf(file, "%.10f ", initDistrib[i]);
208 for (int i = 0; i < 2 * NumInsertStates; i++)
209 fprintf(file, "%.10f ", gapOpen[i]);
211 for (int i = 0; i < 2 * NumInsertStates; i++)
212 fprintf(file, "%.10f ", gapExtend[i]);
214 fprintf(file, "%s\n", alphabet.c_str());
215 for (int i = 0; i < (int) alphabet.size(); i++) {
216 for (int j = 0; j <= i; j++)
217 fprintf(file, "%.10f ",
218 emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
221 for (int i = 0; i < (int) alphabet.size(); i++)
222 fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
228 /////////////////////////////////////////////////////////////////
231 // First computes all pairwise posterior probability matrices.
232 // Then, computes new parameters if training, or a final
233 // alignment, otherwise.
234 /////////////////////////////////////////////////////////////////
235 extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
236 MultiSequence* MSA::doAlign(MultiSequence *sequences,
237 const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
238 VF &gapExtend, VVF &emitPairs, VF &emitSingle) {
241 //get the number of sequences
242 const int numSeqs = sequences->GetNumSequences();
244 //create distance matrix
245 VVF distances(numSeqs, VF(numSeqs, 0));
246 SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
247 SafeVector<SparseMatrix *>(numSeqs, NULL));
250 //calculate sequence pairs for openmp model
252 numPairs = (numSeqs - 1) * numSeqs / 2;
253 seqsPairs = new SeqsPair[numPairs];
254 for(int a = 0; a < numSeqs; a++) {
255 for(int b = a + 1; b < numSeqs; b++) {
256 seqsPairs[pairIdx].seq1 = a;
257 seqsPairs[pairIdx].seq2 = b;
262 // do all pairwise alignments for posterior probability matrices
264 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
265 for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
266 int a= seqsPairs[pairIdx].seq1;
267 int b = seqsPairs[pairIdx].seq2;
270 cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
273 for (int a = 0; a < numSeqs - 1; a++) {
274 for (int b = a + 1; b < numSeqs; b++) {
276 Sequence *seq1 = sequences->GetSequence(a);
277 Sequence *seq2 = sequences->GetSequence(b);
281 cerr << "Computing posterior matrix: (" << a + 1 << ") "
282 << seq1->GetHeader() << " vs. " << "(" << b + 1 << ") "
283 << seq2->GetHeader() << " -- ";
286 // compute forward and backward probabilities
287 VF *forward = model.ComputeForwardMatrix(seq1, seq2);
289 VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
292 // compute posterior probability matrix from HMM
293 VF *posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,
299 //compute posterior probability matrix from partition function
300 VF* part_posterior = ::ComputePostProbs(a, b, seq1->GetString(),
302 assert(part_posterior);
304 //merge the two posterior matrices
305 VF::iterator ptr1 = posterior->begin();
306 VF::iterator ptr2 = part_posterior->begin();
307 for (int i = 0; i <= seq1->GetLength(); i++) {
308 for (int j = 0; j <= seq2->GetLength(); j++) {
312 *ptr1 = sqrt((v1 * v1 + v2 * v2) * 0.5f);
317 delete part_posterior;
319 // compute sparse representations
320 sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
321 seq2->GetLength(), *posterior);
322 sparseMatrices[b][a] = NULL;
324 // perform the pairwise sequence alignment
325 pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
326 seq1->GetLength(), seq2->GetLength(), *posterior);
328 //compute the pairwise distance using expected accuracy
329 float accuracy = alignment.second
330 / min(seq1->GetLength(), seq2->GetLength());
331 distances[a][b] = distances[b][a] = 1.0f - accuracy;
334 cerr << setprecision(10) << accuracy << endl;
336 delete alignment.first;
342 //create the guide tree
343 this->tree = new MSAClusterTree(this, distances, numSeqs);
344 this->tree->create();
346 // perform the consistency transformation the desired number of times
347 float* fweights = new float[numSeqs];
348 for (int r = 0; r < numSeqs; r++) {
349 fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
352 for (int r = 0; r < numConsistencyReps; r++) {
353 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
354 DoRelaxation(fweights, sequences, sparseMatrices);
356 // now replace the old posterior matrices
357 for (int i = 0; i < numSeqs; i++) {
358 for (int j = 0; j < numSeqs; j++) {
359 delete sparseMatrices[i][j];
360 sparseMatrices[i][j] = newSparseMatrices[i][j];
369 //compute the final multiple sequence alignment
370 MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
371 sparseMatrices, model);
374 if (enableAnnotation) {
375 WriteAnnotation(finalAlignment, sparseMatrices);
377 //destroy the guide tree
381 // delete sparse matrices
382 for (int a = 0; a < numSeqs - 1; a++) {
383 for (int b = a + 1; b < numSeqs; b++) {
384 delete sparseMatrices[a][b];
385 delete sparseMatrices[b][a];
389 return finalAlignment;
392 /////////////////////////////////////////////////////////////////
395 // Attempts to parse an integer from the character string given.
396 // Returns true only if no parsing error occurs.
397 /////////////////////////////////////////////////////////////////
399 bool GetInteger(char *data, int *val) {
406 retVal = strtol(data, &endPtr, 0);
407 if (retVal == 0 && (errno != 0 || data == endPtr))
409 if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN))
411 if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
417 /////////////////////////////////////////////////////////////////
420 // Attempts to parse a float from the character string given.
421 // Returns true only if no parsing error occurs.
422 /////////////////////////////////////////////////////////////////
424 bool GetFloat(char *data, float *val) {
431 retVal = strtod(data, &endPtr);
432 if (retVal == 0 && (errno != 0 || data == endPtr))
434 if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
436 *val = (float) retVal;
440 /////////////////////////////////////////////////////////////////
443 // Read initial distribution, transition, and emission
444 // parameters from a file.
445 /////////////////////////////////////////////////////////////////
447 void MSA::ReadParameters() {
451 emitPairs = VVF(256, VF(256, 1e-10));
452 emitSingle = VF(256, 1e-5);
454 // read initial state distribution and transition parameters
455 if (parametersInputFilename == string("")) {
456 if (NumInsertStates == 1) {
457 for (int i = 0; i < NumMatrixTypes; i++)
458 initDistrib[i] = initDistrib1Default[i];
459 for (int i = 0; i < 2 * NumInsertStates; i++)
460 gapOpen[i] = gapOpen1Default[i];
461 for (int i = 0; i < 2 * NumInsertStates; i++)
462 gapExtend[i] = gapExtend1Default[i];
463 } else if (NumInsertStates == 2) {
464 for (int i = 0; i < NumMatrixTypes; i++)
465 initDistrib[i] = initDistrib2Default[i];
466 for (int i = 0; i < 2 * NumInsertStates; i++)
467 gapOpen[i] = gapOpen2Default[i];
468 for (int i = 0; i < 2 * NumInsertStates; i++)
469 gapExtend[i] = gapExtend2Default[i];
472 << "ERROR: No default initial distribution/parameter settings exist"
473 << endl << " for " << NumInsertStates
474 << " pairs of insert states. Use --paramfile." << endl;
478 alphabet = alphabetDefault;
480 for (int i = 0; i < (int) alphabet.length(); i++) {
481 emitSingle[(unsigned char) tolower(alphabet[i])] =
482 emitSingleDefault[i];
483 emitSingle[(unsigned char) toupper(alphabet[i])] =
484 emitSingleDefault[i];
485 for (int j = 0; j <= i; j++) {
486 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
487 alphabet[j])] = emitPairsDefault[i][j];
488 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
489 alphabet[j])] = emitPairsDefault[i][j];
490 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
491 alphabet[j])] = emitPairsDefault[i][j];
492 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
493 alphabet[j])] = emitPairsDefault[i][j];
494 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
495 alphabet[i])] = emitPairsDefault[i][j];
496 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
497 alphabet[i])] = emitPairsDefault[i][j];
498 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
499 alphabet[i])] = emitPairsDefault[i][j];
500 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
501 alphabet[i])] = emitPairsDefault[i][j];
505 data.open(parametersInputFilename.c_str());
507 cerr << "ERROR: Unable to read parameter file: "
508 << parametersInputFilename << endl;
513 for (int i = 0; i < 3; i++) {
514 if (!getline(data, line[i])) {
516 << "ERROR: Unable to read transition parameters from parameter file: "
517 << parametersInputFilename << endl;
524 for (int i = 0; i < NumMatrixTypes; i++)
525 data2 >> initDistrib[i];
528 for (int i = 0; i < 2 * NumInsertStates; i++)
532 for (int i = 0; i < 2 * NumInsertStates; i++)
533 data2 >> gapExtend[i];
535 if (!getline(data, line[0])) {
536 cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
537 << parametersInputFilename << endl;
541 // read alphabet as concatenation of all characters on alphabet line
546 while (data2 >> token)
549 for (int i = 0; i < (int) alphabet.size(); i++) {
550 for (int j = 0; j <= i; j++) {
553 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
555 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
557 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
559 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
561 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
563 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
565 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
567 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
572 for (int i = 0; i < (int) alphabet.size(); i++) {
575 emitSingle[(unsigned char) tolower(alphabet[i])] = val;
576 emitSingle[(unsigned char) toupper(alphabet[i])] = val;
582 /////////////////////////////////////////////////////////////////
585 // Parse all command-line options.
586 /////////////////////////////////////////////////////////////////
587 void MSA::printUsage() {
589 << "************************************************************************"
591 << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
593 << "\tbased on pair hidden markov model and partition function postirior"
595 << "\tprobabilities. If any comments or problems, please contact"
597 << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
599 << "*************************************************************************"
600 << endl << "Usage:" << endl
601 << " msaprobs [OPTION]... [infile]..." << endl << endl
602 << "Description:" << endl
603 << " Align sequences in multi-FASTA format" << endl << endl
604 << " -o, --outfile <string>" << endl
605 << " specify the output file name (STDOUT by default)"
606 << endl << " -num_threads <integer>" << endl
607 << " specify the number of threads used, and otherwise detect automatically"
608 << endl << " -clustalw" << endl
609 << " use CLUSTALW output format instead of FASTA format"
610 << endl << endl << " -c, --consistency REPS" << endl
611 << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
612 << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
613 << ") passes of consistency transformation" << endl << endl
614 << " -ir, --iterative-refinement REPS" << endl
615 << " use " << MIN_ITERATIVE_REFINEMENT_REPS
616 << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
617 << numIterativeRefinementReps << ") passes of iterative-refinement"
618 << endl << endl << " -v, --verbose" << endl
619 << " report progress while aligning (default: "
620 << (enableVerbose ? "on" : "off") << ")" << endl << endl
621 << " -annot FILENAME" << endl
622 << " write annotation for multiple alignment to FILENAME"
623 << endl << endl << " -a, --alignment-order" << endl
624 << " print sequences in alignment order rather than input order (default: "
625 << (enableAlignOrder ? "on" : "off") << ")" << endl
626 << " -version " << endl
627 << " print out version of MSAPROBS " << endl << endl;
629 SafeVector<string> MSA::ParseParams(int argc, char **argv) {
634 SafeVector<string> sequenceNames;
638 for (int i = 1; i < argc; i++) {
639 if (argv[i][0] == '-') {
641 if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) {
645 } else if (!strcmp(argv[i], "-o")
646 || !strcmp(argv[i], "--outfile")) {
648 alignOutFileName = argv[++i]; //get the file name
650 cerr << "ERROR: String expected for option " << argv[i]
654 //number of threads used
655 } else if (!strcmp(argv[i], "-p")
656 || !strcmp(argv[i], "-num_threads")) {
658 if (!GetInteger(argv[++i], &tempInt)) {
659 cerr << " ERROR: invalid integer following option "
660 << argv[i - 1] << ": " << argv[i] << endl;
666 numThreads = tempInt;
669 cerr << "ERROR: Integer expected for option " << argv[i]
673 // number of consistency transformations
674 } else if (!strcmp(argv[i], "-c")
675 || !strcmp(argv[i], "--consistency")) {
677 if (!GetInteger(argv[++i], &tempInt)) {
678 cerr << "ERROR: Invalid integer following option "
679 << argv[i - 1] << ": " << argv[i] << endl;
682 if (tempInt < MIN_CONSISTENCY_REPS
683 || tempInt > MAX_CONSISTENCY_REPS) {
684 cerr << "ERROR: For option " << argv[i - 1]
685 << ", integer must be between "
686 << MIN_CONSISTENCY_REPS << " and "
687 << MAX_CONSISTENCY_REPS << "." << endl;
690 numConsistencyReps = tempInt;
694 cerr << "ERROR: Integer expected for option " << argv[i]
700 // number of randomized partitioning iterative refinement passes
701 else if (!strcmp(argv[i], "-ir")
702 || !strcmp(argv[i], "--iterative-refinement")) {
704 if (!GetInteger(argv[++i], &tempInt)) {
705 cerr << "ERROR: Invalid integer following option "
706 << argv[i - 1] << ": " << argv[i] << endl;
709 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
710 || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
711 cerr << "ERROR: For option " << argv[i - 1]
712 << ", integer must be between "
713 << MIN_ITERATIVE_REFINEMENT_REPS << " and "
714 << MAX_ITERATIVE_REFINEMENT_REPS << "."
718 numIterativeRefinementReps = tempInt;
721 cerr << "ERROR: Integer expected for option " << argv[i]
728 else if (!strcmp(argv[i], "-annot")) {
729 enableAnnotation = true;
731 annotationFilename = argv[++i];
733 cerr << "ERROR: FILENAME expected for option " << argv[i]
739 // clustalw output format
740 else if (!strcmp(argv[i], "-clustalw")) {
741 enableClustalWOutput = true;
745 else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) {
747 if (!GetFloat(argv[++i], &tempFloat)) {
749 << "ERROR: Invalid floating-point value following option "
750 << argv[i - 1] << ": " << argv[i] << endl;
753 if (tempFloat < 0 || tempFloat > 1) {
754 cerr << "ERROR: For option " << argv[i - 1]
755 << ", floating-point value must be between 0 and 1."
762 cerr << "ERROR: Floating-point value expected for option "
769 else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
770 enableVerbose = true;
774 else if (!strcmp(argv[i], "-a")
775 || !strcmp(argv[i], "--alignment-order")) {
776 enableAlignOrder = true;
780 else if (!strcmp(argv[i], "-version")) {
781 cerr << "MSAPROBS version " << VERSION << endl;
786 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
790 sequenceNames.push_back(string(argv[i]));
794 /*check the output file name*/
795 cerr << "-------------------------------------" << endl;
796 if (alignOutFileName.length() == 0) {
797 cerr << "The final alignments will be printed out to STDOUT" << endl;
798 alignOutFile = &std::cout;
800 cerr << "Open the output file " << alignOutFileName << endl;
801 alignOutFile = new ofstream(alignOutFileName.c_str(),
802 ios::binary | ios::out | ios::trunc);
804 cerr << "-------------------------------------" << endl;
805 return sequenceNames;
808 /////////////////////////////////////////////////////////////////
811 // Process the tree recursively. Returns the aligned sequences
812 // corresponding to a node or leaf of the tree.
813 /////////////////////////////////////////////////////////////////
814 MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
815 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
816 const ProbabilisticModel &model) {
818 MultiSequence *result;
820 // check if this is a node of the alignment tree
821 //if (tree->GetSequenceLabel() == -1){
822 if (tree->leaf == NODE) {
823 MultiSequence *alignLeft = ProcessTree(tree->left, sequences,
824 sparseMatrices, model);
825 MultiSequence *alignRight = ProcessTree(tree->right, sequences,
826 sparseMatrices, model);
831 result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model);
838 // otherwise, this is a leaf of the alignment tree
840 result = new MultiSequence();
842 //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
843 result->AddSequence(sequences->GetSequence(tree->idx)->Clone());
849 /////////////////////////////////////////////////////////////////
850 // ComputeFinalAlignment()
852 // Compute the final alignment by calling ProcessTree(), then
853 // performing iterative refinement as needed.
854 /////////////////////////////////////////////////////////////////
856 MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
857 MultiSequence *sequences,
858 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
859 const ProbabilisticModel &model) {
860 MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
861 sparseMatrices, model);
863 SafeVector<int> oldOrdering;
864 if (enableAlignOrder) {
865 for (int i = 0; i < alignment->GetNumSequences(); i++)
866 oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel());
867 alignment->SaveOrdering();
868 enableAlignOrder = false;
871 // tree-based refinement
872 // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
873 /*int numSeqs = alignment->GetNumSequences();
874 if(numSeqs < numIterativeRefinementReps){
875 for(int iter = 0; iter < 1; iter ++){
876 for(int i = 0; i < numSeqs - 1; i++){
877 DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i);
881 for (int i = 0; i < numIterativeRefinementReps; i++) {
882 DoIterativeRefinement(sparseMatrices, model, alignment, i);
886 if (oldOrdering.size() > 0) {
887 for (int i = 0; i < (int) oldOrdering.size(); i++) {
888 alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
892 // return final alignment
896 /////////////////////////////////////////////////////////////////
899 // Returns the alignment of two MultiSequence objects.
900 /////////////////////////////////////////////////////////////////
902 MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
903 MultiSequence *align2,
904 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
905 const ProbabilisticModel &model) {
907 // print some info about the alignment
909 for (int i = 0; i < align1->GetNumSequences(); i++)
910 cerr << ((i == 0) ? "[" : ",")
911 << align1->GetSequence(i)->GetLabel();
913 for (int i = 0; i < align2->GetNumSequences(); i++)
914 cerr << ((i == 0) ? "[" : ",")
915 << align2->GetSequence(i)->GetLabel();
919 VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
921 VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
922 sparseMatrices, cutoff);
924 pair<SafeVector<char> *, float> alignment;
927 alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(),
928 align2->GetSequence(0)->GetLength(), *posterior);
934 // compute total length of sequences
936 for (int i = 0; i < align1->GetNumSequences(); i++)
937 for (int j = 0; j < align2->GetNumSequences(); j++)
938 totLength += min(align1->GetSequence(i)->GetLength(),
939 align2->GetSequence(j)->GetLength());
941 // give an "accuracy" measure for the alignment
942 cerr << alignment.second / totLength << endl;
945 // now build final alignment
946 MultiSequence *result = new MultiSequence();
947 for (int i = 0; i < align1->GetNumSequences(); i++)
949 align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
950 for (int i = 0; i < align2->GetNumSequences(); i++)
952 align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
953 if (!enableAlignOrder)
954 result->SortByLabel();
956 // free temporary alignment
957 delete alignment.first;
962 /////////////////////////////////////////////////////////////////
965 // Performs one round of the weighted probabilistic consistency transformation.
967 /////////////////////////////////////////////////////////////////
969 SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
970 MultiSequence *sequences,
971 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
972 const int numSeqs = sequences->GetNumSequences();
974 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
975 SafeVector<SparseMatrix *>(numSeqs, NULL));
977 // for every pair of sequences
980 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
981 for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
982 int i = seqsPairs[pairIdx].seq1;
983 int j = seqsPairs[pairIdx].seq2;
984 float wi = seqsWeights[i];
985 float wj = seqsWeights[j];
987 for (int i = 0; i < numSeqs; i++) {
988 float wi = seqsWeights[i];
989 for (int j = i + 1; j < numSeqs; j++) {
990 float wj = seqsWeights[j];
992 Sequence *seq1 = sequences->GetSequence(i);
993 Sequence *seq2 = sequences->GetSequence(j);
999 cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
1000 << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
1003 // get the original posterior matrix
1004 VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
1005 assert(posteriorPtr);
1006 VF &posterior = *posteriorPtr;
1008 const int seq1Length = seq1->GetLength();
1009 const int seq2Length = seq2->GetLength();
1011 // contribution from the summation where z = x and z = y
1012 float w = wi * wi * wj + wi * wj * wj;
1014 for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1015 posterior[k] = w * posterior[k];
1019 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1021 // contribution from all other sequences
1022 for (int k = 0; k < numSeqs; k++) {
1023 if (k != i && k != j) {
1024 float wk = seqsWeights[k];
1025 float w = wi * wj * wk;
1028 Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
1030 else if (k > i && k < j)
1031 Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
1034 SparseMatrix *temp =
1035 sparseMatrices[j][k]->ComputeTranspose();
1036 Relax(w, sparseMatrices[i][k], temp, posterior);
1041 //cerr<<"sumW "<<sumW<<endl;
1042 for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1043 posterior[k] /= sumW;
1045 // mask out positions not originally in the posterior matrix
1046 SparseMatrix *matXY = sparseMatrices[i][j];
1047 for (int y = 0; y <= seq2Length; y++)
1049 for (int x = 1; x <= seq1Length; x++) {
1050 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1051 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1052 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1054 while (XYptr != XYend) {
1056 // zero out all cells until the first filled column
1057 while (curr < XYptr->first) {
1062 // now, skip over this column
1067 // zero out cells after last column
1068 while (curr <= seq2Length) {
1074 // save the new posterior matrix
1075 newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
1076 seq2->GetLength(), posterior);
1077 newSparseMatrices[j][i] = NULL;
1080 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1082 delete posteriorPtr;
1085 cerr << "done." << endl;
1091 return newSparseMatrices;
1094 /////////////////////////////////////////////////////////////////
1097 // Computes the consistency transformation for a single sequence
1098 // z, and adds the transformed matrix to "posterior".
1099 /////////////////////////////////////////////////////////////////
1101 void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
1107 int lengthX = matXZ->GetSeq1Length();
1108 int lengthY = matZY->GetSeq2Length();
1109 assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1112 for (int i = 1; i <= lengthX; i++) {
1113 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1114 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1116 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1118 // iterate through all x[i]-z[k]
1119 while (XZptr != XZend) {
1120 SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1121 SafeVector<PIF>::iterator ZYend = ZYptr
1122 + matZY->GetRowSize(XZptr->first);
1123 const float XZval = XZptr->second;
1125 // iterate through all z[k]-y[j]
1126 while (ZYptr != ZYend) {
1127 base[ZYptr->first] += weight * XZval * ZYptr->second;
1135 /////////////////////////////////////////////////////////////////
1138 // Computes the consistency transformation for a single sequence
1139 // z, and adds the transformed matrix to "posterior".
1140 /////////////////////////////////////////////////////////////////
1142 void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
1148 int lengthZ = matZX->GetSeq1Length();
1149 int lengthY = matZY->GetSeq2Length();
1152 for (int k = 1; k <= lengthZ; k++) {
1153 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1154 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1156 // iterate through all z[k]-x[i]
1157 while (ZXptr != ZXend) {
1158 SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1159 SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1160 const float ZXval = ZXptr->second;
1161 VF::iterator base = posterior.begin()
1162 + ZXptr->first * (lengthY + 1);
1164 // iterate through all z[k]-y[j]
1165 while (ZYptr != ZYend) {
1166 base[ZYptr->first] += weight * ZXval * ZYptr->second;
1173 /////////////////////////////////////////////////////////////////
1174 // DoIterativeRefinement()
1176 // Performs a single round of randomized partionining iterative
1178 /////////////////////////////////////////////////////////////////
1180 int MSA::GenRandom(int m, int seed, bool init) {
1181 static const int a = 5, b = 3, n = 7;
1188 for (int i = 0; i < n; i++) {
1189 rand1 = (a * rand0 + b) % m;
1195 void MSA::DoIterativeRefinement(
1196 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1197 const ProbabilisticModel &model, MultiSequence* &alignment, int si) {
1198 set<int> groupOne, groupTwo;
1199 int numSeqs = alignment->GetNumSequences();
1201 int index = GenRandom(numSeqs, si, true);
1202 // create two separate groups
1203 for (int i = 0; i < numSeqs; i++) {
1204 index = GenRandom(numSeqs, si);
1211 if (groupOne.empty() || groupTwo.empty())
1214 // project into the two groups
1215 MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1216 assert(groupOneSeqs);
1217 MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1218 assert(groupTwoSeqs);
1222 alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
1225 delete groupOneSeqs;
1226 delete groupTwoSeqs;
1228 void MSA::DoIterativeRefinementTreeNode(
1229 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1230 const ProbabilisticModel &model, MultiSequence* &alignment,
1232 set<int> groupOne, groupTwo;
1233 int numSeqs = alignment->GetNumSequences();
1235 vector<bool> inGroup1;
1236 inGroup1.resize(numSeqs);
1237 for (int i = 0; i < numSeqs; i++) {
1238 inGroup1[i] = false;
1241 AlignmentOrder* orders = this->tree->getAlignOrders();
1242 AlignmentOrder* order = &orders[nodeIndex];
1243 for (int i = 0; i < order->leftNum; i++) {
1244 int si = order->leftLeafs[i];
1245 inGroup1[si] = true;
1247 for (int i = 0; i < order->rightNum; i++) {
1248 int si = order->rightLeafs[i];
1249 inGroup1[si] = true;
1251 // create two separate groups
1252 for (int i = 0; i < numSeqs; i++) {
1259 if (groupOne.empty() || groupTwo.empty())
1262 // project into the two groups
1263 MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1264 assert(groupOneSeqs);
1265 MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1266 assert(groupTwoSeqs);
1270 alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
1273 delete groupOneSeqs;
1274 delete groupTwoSeqs;
1277 /////////////////////////////////////////////////////////////////
1278 // WriteAnnotation()
1280 // Computes annotation for multiple alignment and write values
1282 /////////////////////////////////////////////////////////////////
1284 void MSA::WriteAnnotation(MultiSequence *alignment,
1285 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1286 ofstream outfile(annotationFilename.c_str());
1288 if (outfile.fail()) {
1289 cerr << "ERROR: Unable to write annotation file." << endl;
1293 const int alignLength = alignment->GetSequence(0)->GetLength();
1294 const int numSeqs = alignment->GetNumSequences();
1296 SafeVector<int> position(numSeqs, 0);
1297 SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
1298 for (int i = 0; i < numSeqs; i++)
1299 seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1300 SafeVector<pair<int, int> > active;
1301 active.reserve(numSeqs);
1303 SafeVector<int> lab;
1304 for (int i = 0; i < numSeqs; i++)
1305 lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1308 for (int i = 1; i <= alignLength; i++) {
1310 // find all aligned residues in this particular column
1312 for (int j = 0; j < numSeqs; j++) {
1313 if (seqs[j][i] != '-') {
1314 active.push_back(make_pair(lab[j], ++position[j]));
1318 sort(active.begin(), active.end());
1319 outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
1325 /////////////////////////////////////////////////////////////////
1328 // Computes the annotation score for a particular column.
1329 /////////////////////////////////////////////////////////////////
1331 int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
1332 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1334 if (active.size() <= 1)
1337 // ALTERNATIVE #1: Compute the average alignment score.
1340 for (int i = 0; i < (int) active.size(); i++) {
1341 for (int j = i + 1; j < (int) active.size(); j++) {
1342 val += sparseMatrices[active[i].first][active[j].first]->GetValue(
1343 active[i].second, active[j].second);
1347 return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));