Add GLprobs and MSAprobs to binaries
[jabaws.git] / binaries / src / GLProbs-1.0 / MSAfull.cpp
1 /***********************************************
2  * # Copyright 2009-2010. Liu Yongchao
3  * # Contact: Liu Yongchao, School of Computer Engineering,
4  * #                     Nanyang Technological University.
5  * # Emails:     liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
6  * #
7  * # GPL version 3.0 applies.
8  * #
9  * ************************************************/
10
11 #include <string>
12 #include <sstream>
13 #include <iomanip>
14 #include <iostream>
15 #include <list>
16 #include <set>
17 #include <algorithm>
18 #include <climits>
19 #include <cstdio>
20 #include <cstdlib>
21 #include <cerrno>
22 #include <iomanip>
23 #include "MSA.h"
24 #include "MSAClusterTree.h"
25 #include "Defaults.h"
26
27 #ifdef _OPENMP
28 #include <omp.h>
29 #endif
30
31 string parametersInputFilename = "";
32 string parametersOutputFilename = "no training";
33 string annotationFilename = "";
34
35 bool enableVerbose = false;
36 bool enableAnnotation = false;
37 bool enableClustalWOutput = false;
38 bool enableAlignOrder = false;
39 int numConsistencyReps = 2;
40 int numPreTrainingReps = 0;
41 int numIterativeRefinementReps = 100;
42
43 float cutoff = 0;
44
45 VF initDistrib(NumMatrixTypes);
46 VF gapOpen(2 * NumInsertStates);
47 VF gapExtend(2 * NumInsertStates);
48 VVF emitPairs(256, VF(256, 1e-10));
49 VF emitSingle(256, 1e-5);
50
51 string alphabet = alphabetDefault;
52
53 const int MIN_PRETRAINING_REPS = 0;
54 const int MAX_PRETRAINING_REPS = 20;
55 const int MIN_CONSISTENCY_REPS = 0;
56 const int MAX_CONSISTENCY_REPS = 5;
57 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
58 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
59
60 string posteriorProbsFilename = "";
61 bool allscores = true;
62 string infilename;
63
64 int flag_gui = 0;   //0: no gui related o/p 
65 //1: gui related o/p generated
66 int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
67 //1: pp score seq added to o/p fasta alignment
68
69 ///////////////////////////////
70 // global scoring matrix variables
71 //////////////////////////////
72 float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
73 char *aminos, *bases, matrixtype[20] = "gonnet_160";
74 int subst_index[26];
75
76 double sub_matrix[26][26];
77 int firstread = 0;              //this makes sure that matrices are read only once 
78
79 float TEMPERATURE = 5;
80 int MATRIXTYPE = 160;
81 int prot_nuc = 0;               //0=prot, 1=nucleotide
82
83 float GAPOPEN = 0;
84 float GAPEXT = 0;
85 int numThreads = 0;
86
87 //argument support
88 typedef struct {
89         char input[30];
90         int matrix;
91         int N;
92         float T;
93         float beta;
94         char opt;                       //can be 'P' or 'M'
95         float gapopen;
96         float gapext;
97 } argument_decl;
98
99 argument_decl argument;
100
101 extern inline void read_sustitution_matrix(char *fileName);
102 extern void setmatrixtype(int le);
103 extern inline int matrixtype_to_int();
104 extern inline void read_dna_matrix();
105 extern inline void read_vtml_la_matrix();
106 extern void init_arguments();
107
108 MSA::MSA(int argc, char* argv[]) {
109         //parse program parameters
110         SafeVector<string> sequenceNames = ParseParams(argc, argv);
111
112         //initialize arguments for partition function
113         init_arguments();
114
115         ReadParameters();
116         //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
117
118         //read the input sequences
119         MultiSequence *sequences = new MultiSequence();
120         assert(sequences);
121         for (int i = 0; i < (int) sequenceNames.size(); i++) {
122                 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
123                 sequences->LoadMFA(sequenceNames[i], true);
124         }
125         //allocate space for sequence weights
126         this->seqsWeights = new int[sequences->GetNumSequences()];
127         //initilaize parameters for OPENMP
128 #ifdef _OPENMP
129         if(numThreads <= 0) {
130                 numThreads = omp_get_num_procs();
131                 cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
132         }
133         cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
134
135         //set OpenMP to use dynamic number of threads which is equal to the number of processor cores on the host
136         omp_set_num_threads(numThreads);
137 #endif  
138
139         // now, we can perform the alignments and write them out
140         MultiSequence *alignment = doAlign(sequences,
141                         ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
142                                         emitSingle), initDistrib, gapOpen, gapExtend, emitPairs,
143                         emitSingle);
144
145         //write the alignment results to standard output
146         if (enableClustalWOutput) {
147                 alignment->WriteALN(*alignOutFile);
148         } else {
149                 alignment->WriteMFA(*alignOutFile);
150         }
151         //release resources
152         delete[] this->seqsWeights;
153         delete alignment;
154         delete sequences;
155 }
156 MSA::~MSA() {
157         /*close the output file*/
158         if (alignOutFileName.length() > 0) {
159                 ((std::ofstream*) alignOutFile)->close();
160         }
161 }
162 /////////////////////////////////////////////////////////////////
163 // PrintParameters()
164 //
165 // Prints MSAPROBS parameters to STDERR.  If a filename is
166 // specified, then the parameters are also written to the file.
167 /////////////////////////////////////////////////////////////////
168
169 void MSA::PrintParameters(const char *message, const VF &initDistrib,
170                 const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
171                 const VF &emitSingle, const char *filename) {
172
173         // print parameters to the screen
174         cerr << message << endl << "    initDistrib[] = { ";
175         for (int i = 0; i < NumMatrixTypes; i++)
176                 cerr << setprecision(10) << initDistrib[i] << " ";
177         cerr << "}" << endl << "        gapOpen[] = { ";
178         for (int i = 0; i < NumInsertStates * 2; i++)
179                 cerr << setprecision(10) << gapOpen[i] << " ";
180         cerr << "}" << endl << "      gapExtend[] = { ";
181         for (int i = 0; i < NumInsertStates * 2; i++)
182                 cerr << setprecision(10) << gapExtend[i] << " ";
183         cerr << "}" << endl << endl;
184
185         /*
186          for (int i = 0; i < 5; i++){
187          for (int j = 0; j <= i; j++){
188          cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
189          }
190          cerr << endl;
191          }*/
192
193         // if a file name is specified
194         if (filename) {
195
196                 // attempt to open the file for writing
197                 FILE *file = fopen(filename, "w");
198                 if (!file) {
199                         cerr << "ERROR: Unable to write parameter file: " << filename
200                                         << endl;
201                         exit(1);
202                 }
203
204                 // if successful, then write the parameters to the file
205                 for (int i = 0; i < NumMatrixTypes; i++)
206                         fprintf(file, "%.10f ", initDistrib[i]);
207                 fprintf(file, "\n");
208                 for (int i = 0; i < 2 * NumInsertStates; i++)
209                         fprintf(file, "%.10f ", gapOpen[i]);
210                 fprintf(file, "\n");
211                 for (int i = 0; i < 2 * NumInsertStates; i++)
212                         fprintf(file, "%.10f ", gapExtend[i]);
213                 fprintf(file, "\n");
214                 fprintf(file, "%s\n", alphabet.c_str());
215                 for (int i = 0; i < (int) alphabet.size(); i++) {
216                         for (int j = 0; j <= i; j++)
217                                 fprintf(file, "%.10f ",
218                                                 emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
219                         fprintf(file, "\n");
220                 }
221                 for (int i = 0; i < (int) alphabet.size(); i++)
222                         fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
223                 fprintf(file, "\n");
224                 fclose(file);
225         }
226 }
227
228 /////////////////////////////////////////////////////////////////
229 // doAlign()
230 //
231 // First computes all pairwise posterior probability matrices.
232 // Then, computes new parameters if training, or a final
233 // alignment, otherwise.
234 /////////////////////////////////////////////////////////////////
235 extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
236 MultiSequence* MSA::doAlign(MultiSequence *sequences,
237                 const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
238                 VF &gapExtend, VVF &emitPairs, VF &emitSingle) {
239         assert(sequences);
240
241         //get the number of sequences
242         const int numSeqs = sequences->GetNumSequences();
243
244         //create distance matrix
245         VVF probalign_distances(numSeqs, VF(numSeqs, 0));
246         VVF distances(numSeqs, VF(numSeqs, 0));//msa
247
248         float gl_accuracy = 0;
249         //creat sparseMatrices
250         SafeVector<SafeVector<SparseMatrix *> > probalign_sparseMatrices(numSeqs,
251                         SafeVector<SparseMatrix *>(numSeqs, NULL));
252         SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
253                         SafeVector<SparseMatrix *>(numSeqs, NULL)); // msa
254
255 #ifdef _OPENMP
256         //calculate sequence pairs for openmp model
257         int pairIdx = 0;
258         numPairs = (numSeqs - 1) * numSeqs / 2;
259         seqsPairs = new SeqsPair[numPairs];
260         for(int a = 0; a < numSeqs; a++) {
261                 for(int b = a + 1; b < numSeqs; b++) {
262                         seqsPairs[pairIdx].seq1 = a;
263                         seqsPairs[pairIdx].seq2 = b;
264                         pairIdx++;
265                 }
266         }
267 #endif
268         // do all pairwise alignments for posterior probability matrices
269 #ifdef _OPENMP
270 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
271         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
272                 int a= seqsPairs[pairIdx].seq1;
273                 int b = seqsPairs[pairIdx].seq2;
274                 if(enableVerbose) {
275 #pragma omp critical
276                         cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
277                 }
278 #else
279         for (int a = 0; a < numSeqs - 1; a++) {
280                 for (int b = a + 1; b < numSeqs; b++) {
281 #endif
282                         Sequence *seq1 = sequences->GetSequence(a);
283                         Sequence *seq2 = sequences->GetSequence(b);
284
285                         // verbose output
286                         if (enableVerbose) {
287                                 cerr << "Computing posterior matrix: (" << a + 1 << ") "
288                                                 << seq1->GetHeader() << " vs. " << "(" << b + 1 << ") "
289                                                 << seq2->GetHeader() << " -- ";
290                         }
291
292 //probcons
293                         // compute forward and backward probabilities
294                         VF *forward = model.ComputeForwardMatrix(seq1, seq2);
295                         assert(forward);
296                         VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
297                         assert(backward);
298                         // compute posterior probability matrix from HMM
299                         VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward);
300                         assert(probcons_posterior);             
301                         delete forward;
302                         delete backward;                     
303
304 //probalign
305                         VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
306                         assert(probalign_posterior);
307                         probalign_sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),seq2->GetLength(),                                                                  *probalign_posterior);
308                         probalign_sparseMatrices[b][a] = NULL;                 
309                         pair<SafeVector<char> *, float> probalign_alignment = model.ComputeAlignment(
310                                         seq1->GetLength(), seq2->GetLength(), *probalign_posterior);                     
311                         probalign_distances[a][b] =1.0f - probalign_alignment.second / min(seq1->GetLength(), seq2->GetLength());                       
312                         delete probalign_alignment.first;  
313
314 //local
315                         forward = model.ComputeForwardMatrix(seq1, seq2,false);
316                         assert(forward);
317                         backward = model.ComputeBackwardMatrix(seq1, seq2,false);
318                         assert(backward);
319                         VF* local_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
320                         delete forward;
321                         delete backward; 
322                   
323 //GL
324                         //merge probalign + local + probcons 
325                         VF::iterator ptr1 = probcons_posterior->begin();
326                         VF::iterator ptr2 = probalign_posterior->begin();                       
327                         VF::iterator ptr3 = local_posterior->begin();
328                         VF* posterior = new VF((seq1->GetLength()+1) * (seq2->GetLength()+1)); assert (posterior);  //msa
329                         VF::iterator ptr = posterior->begin();                 
330                         for (int i = 0; i <= seq1->GetLength(); i++) {
331                                 for (int j = 0; j <= seq2->GetLength(); j++) {
332                                         float v1 = *ptr1;
333                                         float v2 = *ptr2;
334                                         float v3 = *ptr3;
335                                         *ptr = sqrt(v1*v1 + v2*v2 + v3*v3);
336                                         ptr1++;
337                                         ptr2++;
338                                         ptr3++;
339                                         ptr++;
340                                 }
341                         }
342                         // perform the pairwise sequence alignment
343                         pair<SafeVector<char> *, float> gl_alignment = model.ComputeAlignment(
344                                         seq1->GetLength(), seq2->GetLength(), *posterior);
345
346                         //compute expected accuracy
347                         distances[a][b] = distances[b][a] = 1.0f - gl_alignment.second
348                                         / (3*min(seq1->GetLength(), seq2->GetLength()));
349
350                         // compute sparse representations
351                         sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
352                         seq2->GetLength(), *posterior);
353                         sparseMatrices[b][a] = NULL;
354                         //
355                         SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
356                         SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
357                         float N_correct_match = 0;
358                         int i = 1;int j = 1;
359                         for (SafeVector<char>::iterator iter = gl_alignment.first->begin(); 
360                                 iter != gl_alignment.first->end(); ++iter){
361                                 if (*iter == 'B'){
362                                         unsigned char c1 = (unsigned char) iter1[i++];
363                                         unsigned char c2 = (unsigned char) iter2[j++];
364                                         if(c1==c2) N_correct_match += 1;
365                                 }
366                                 else if(*iter == 'X') i++;
367                                 else if(*iter == 'Y') j++;
368                         }
369                         if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "GL"<< endl;
370                         gl_accuracy += N_correct_match / min(seq1->GetLength(), seq2->GetLength());
371                         //
372                         delete probcons_posterior;
373                         delete probalign_posterior;
374                         delete local_posterior;
375                         delete posterior;
376
377 #ifndef _OPENMP
378                 }
379 #endif
380         } 
381
382 //self-adaptive   
383         gl_accuracy /= numPairs;
384         if(gl_accuracy > 0.4){
385                 for (int a = 0; a < numSeqs - 1; a++) 
386                         for (int b = a + 1; b < numSeqs; b++) {
387                                 distances[a][b] = distances[b][a] = probalign_distances[a][b];
388                                 sparseMatrices[a][b] = probalign_sparseMatrices[a][b];
389                                 sparseMatrices[b][a] = NULL;
390                         }
391         }
392
393         //create the guide tree
394         this->tree = new MSAClusterTree(this, distances, numSeqs);
395         this->tree->create();
396
397         // perform the consistency transformation the desired number of times
398         float* fweights = new float[numSeqs];
399         for (int r = 0; r < numSeqs; r++) {
400                 fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
401                 fweights[r] *= 10;
402         }
403         for (int r = 0; r < numConsistencyReps; r++) {
404                 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
405                                 DoRelaxation(fweights, sequences, sparseMatrices);
406
407                 // now replace the old posterior matrices
408                 for (int i = 0; i < numSeqs; i++) {
409                         for (int j = 0; j < numSeqs; j++) {
410                                 delete sparseMatrices[i][j];
411                                 sparseMatrices[i][j] = newSparseMatrices[i][j];
412                         }
413                 }
414         }
415         delete[] fweights;
416 #ifdef _OPENMP
417         delete [] seqsPairs;
418 #endif
419
420         //compute the final multiple sequence alignment
421         MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
422                         sparseMatrices, model);
423
424         // build annotation
425         if (enableAnnotation) {
426                 WriteAnnotation(finalAlignment, sparseMatrices);
427         }
428         //destroy the guide tree
429         delete this->tree;
430         this->tree = 0;
431
432         // delete sparse matrices
433         for (int a = 0; a < numSeqs - 1; a++) {
434                 for (int b = a + 1; b < numSeqs; b++) {
435                         delete sparseMatrices[a][b];
436                         delete sparseMatrices[b][a];
437                 }
438         }
439
440         return finalAlignment;
441 }
442
443 /////////////////////////////////////////////////////////////////
444 // GetInteger()
445 //
446 // Attempts to parse an integer from the character string given.
447 // Returns true only if no parsing error occurs.
448 /////////////////////////////////////////////////////////////////
449
450 bool GetInteger(char *data, int *val) {
451         char *endPtr;
452         long int retVal;
453
454         assert(val);
455
456         errno = 0;
457         retVal = strtol(data, &endPtr, 0);
458         if (retVal == 0 && (errno != 0 || data == endPtr))
459                 return false;
460         if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN))
461                 return false;
462         if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
463                 return false;
464         *val = (int) retVal;
465         return true;
466 }
467
468 /////////////////////////////////////////////////////////////////
469 // GetFloat()
470 //
471 // Attempts to parse a float from the character string given.
472 // Returns true only if no parsing error occurs.
473 /////////////////////////////////////////////////////////////////
474
475 bool GetFloat(char *data, float *val) {
476         char *endPtr;
477         double retVal;
478
479         assert(val);
480
481         errno = 0;
482         retVal = strtod(data, &endPtr);
483         if (retVal == 0 && (errno != 0 || data == endPtr))
484                 return false;
485         if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
486                 return false;
487         *val = (float) retVal;
488         return true;
489 }
490
491 /////////////////////////////////////////////////////////////////
492 // ReadParameters()
493 //
494 // Read initial distribution, transition, and emission
495 // parameters from a file.
496 /////////////////////////////////////////////////////////////////
497
498 void MSA::ReadParameters() {
499
500         ifstream data;
501
502         emitPairs = VVF(256, VF(256, 1e-10));
503         emitSingle = VF(256, 1e-5);
504
505         // read initial state distribution and transition parameters
506         if (parametersInputFilename == string("")) {
507                 if (NumInsertStates == 1) {
508                         for (int i = 0; i < NumMatrixTypes; i++)
509                                 initDistrib[i] = initDistrib1Default[i];
510                         for (int i = 0; i < 2 * NumInsertStates; i++)
511                                 gapOpen[i] = gapOpen1Default[i];
512                         for (int i = 0; i < 2 * NumInsertStates; i++)
513                                 gapExtend[i] = gapExtend1Default[i];
514                 } else if (NumInsertStates == 2) {
515                         for (int i = 0; i < NumMatrixTypes; i++)
516                                 initDistrib[i] = initDistrib2Default[i];
517                         for (int i = 0; i < 2 * NumInsertStates; i++)
518                                 gapOpen[i] = gapOpen2Default[i];
519                         for (int i = 0; i < 2 * NumInsertStates; i++)
520                                 gapExtend[i] = gapExtend2Default[i];
521                 } else {
522                         cerr
523                                         << "ERROR: No default initial distribution/parameter settings exist"
524                                         << endl << "       for " << NumInsertStates
525                                         << " pairs of insert states.  Use --paramfile." << endl;
526                         exit(1);
527                 }
528
529                 alphabet = alphabetDefault;
530
531                 for (int i = 0; i < (int) alphabet.length(); i++) {
532                         emitSingle[(unsigned char) tolower(alphabet[i])] =
533                                         emitSingleDefault[i];
534                         emitSingle[(unsigned char) toupper(alphabet[i])] =
535                                         emitSingleDefault[i];
536                         for (int j = 0; j <= i; j++) {
537                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
538                                                 alphabet[j])] = emitPairsDefault[i][j];
539                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
540                                                 alphabet[j])] = emitPairsDefault[i][j];
541                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
542                                                 alphabet[j])] = emitPairsDefault[i][j];
543                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
544                                                 alphabet[j])] = emitPairsDefault[i][j];
545                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
546                                                 alphabet[i])] = emitPairsDefault[i][j];
547                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
548                                                 alphabet[i])] = emitPairsDefault[i][j];
549                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
550                                                 alphabet[i])] = emitPairsDefault[i][j];
551                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
552                                                 alphabet[i])] = emitPairsDefault[i][j];
553                         }
554                 }
555         } else {
556                 data.open(parametersInputFilename.c_str());
557                 if (data.fail()) {
558                         cerr << "ERROR: Unable to read parameter file: "
559                                         << parametersInputFilename << endl;
560                         exit(1);
561                 }
562
563                 string line[3];
564                 for (int i = 0; i < 3; i++) {
565                         if (!getline(data, line[i])) {
566                                 cerr
567                                                 << "ERROR: Unable to read transition parameters from parameter file: "
568                                                 << parametersInputFilename << endl;
569                                 exit(1);
570                         }
571                 }
572                 istringstream data2;
573                 data2.clear();
574                 data2.str(line[0]);
575                 for (int i = 0; i < NumMatrixTypes; i++)
576                         data2 >> initDistrib[i];
577                 data2.clear();
578                 data2.str(line[1]);
579                 for (int i = 0; i < 2 * NumInsertStates; i++)
580                         data2 >> gapOpen[i];
581                 data2.clear();
582                 data2.str(line[2]);
583                 for (int i = 0; i < 2 * NumInsertStates; i++)
584                         data2 >> gapExtend[i];
585
586                 if (!getline(data, line[0])) {
587                         cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
588                                         << parametersInputFilename << endl;
589                         exit(1);
590                 }
591
592                 // read alphabet as concatenation of all characters on alphabet line
593                 alphabet = "";
594                 string token;
595                 data2.clear();
596                 data2.str(line[0]);
597                 while (data2 >> token)
598                         alphabet += token;
599
600                 for (int i = 0; i < (int) alphabet.size(); i++) {
601                         for (int j = 0; j <= i; j++) {
602                                 float val;
603                                 data >> val;
604                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
605                                                 alphabet[j])] = val;
606                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
607                                                 alphabet[j])] = val;
608                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
609                                                 alphabet[j])] = val;
610                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
611                                                 alphabet[j])] = val;
612                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
613                                                 alphabet[i])] = val;
614                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
615                                                 alphabet[i])] = val;
616                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
617                                                 alphabet[i])] = val;
618                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
619                                                 alphabet[i])] = val;
620                         }
621                 }
622
623                 for (int i = 0; i < (int) alphabet.size(); i++) {
624                         float val;
625                         data >> val;
626                         emitSingle[(unsigned char) tolower(alphabet[i])] = val;
627                         emitSingle[(unsigned char) toupper(alphabet[i])] = val;
628                 }
629                 data.close();
630         }
631 }
632
633 /////////////////////////////////////////////////////////////////
634 // ParseParams()
635 //
636 // Parse all command-line options.
637 /////////////////////////////////////////////////////////////////
638 void MSA::printUsage() {
639         cerr
640                         << "************************************************************************"
641                         << endl
642                         << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
643                         << endl
644                         << "\tbased on pair hidden markov model and partition function postirior"
645                         << endl
646                         << "\tprobabilities. If any comments or problems, please contact"
647                         << endl
648                         << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
649                         << endl
650                         << "*************************************************************************"
651                         << endl << "Usage:" << endl
652                         << "       msaprobs [OPTION]... [infile]..." << endl << endl
653                         << "Description:" << endl
654                         << "       Align sequences in multi-FASTA format" << endl << endl
655                         << "       -o, --outfile <string>" << endl
656                         << "              specify the output file name (STDOUT by default)"
657                         << endl << "       -num_threads <integer>" << endl
658                         << "              specify the number of threads used, and otherwise detect automatically"
659                         << endl << "       -clustalw" << endl
660                         << "              use CLUSTALW output format instead of FASTA format"
661                         << endl << endl << "       -c, --consistency REPS" << endl
662                         << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
663                         << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
664                         << ") passes of consistency transformation" << endl << endl
665                         << "       -ir, --iterative-refinement REPS" << endl
666                         << "              use " << MIN_ITERATIVE_REFINEMENT_REPS
667                         << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
668                         << numIterativeRefinementReps << ") passes of iterative-refinement"
669                         << endl << endl << "       -v, --verbose" << endl
670                         << "              report progress while aligning (default: "
671                         << (enableVerbose ? "on" : "off") << ")" << endl << endl
672                         << "       -annot FILENAME" << endl
673                         << "              write annotation for multiple alignment to FILENAME"
674                         << endl << endl << "       -a, --alignment-order" << endl
675                         << "              print sequences in alignment order rather than input order (default: "
676                         << (enableAlignOrder ? "on" : "off") << ")" << endl
677                         << "       -version " << endl
678                         << "              print out version of MSAPROBS " << endl << endl;
679 }
680 SafeVector<string> MSA::ParseParams(int argc, char **argv) {
681         if (argc < 2) {
682                 printUsage();
683                 exit(1);
684         }
685         SafeVector<string> sequenceNames;
686         int tempInt;
687         float tempFloat;
688
689         for (int i = 1; i < argc; i++) {
690                 if (argv[i][0] == '-') {
691                         //help
692                         if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) {
693                                 printUsage();
694                                 exit(1);
695                                 //output file name
696                         } else if (!strcmp(argv[i], "-o")
697                                         || !strcmp(argv[i], "--outfile")) {
698                                 if (i < argc - 1) {
699                                         alignOutFileName = argv[++i];   //get the file name
700                                 } else {
701                                         cerr << "ERROR: String expected for option " << argv[i]
702                                                         << endl;
703                                         exit(1);
704                                 }
705                                 // parameter file
706                         } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
707                                 if (i < argc - 1)
708                                         parametersInputFilename = string (argv[++i]);
709                                 else {
710                                         cerr << "ERROR: Filename expected for option " << argv[i] << endl;
711                                         exit (1);
712                                 }
713                                 //number of threads used
714                         } else if (!strcmp(argv[i], "-p")
715                                         || !strcmp(argv[i], "-num_threads")) {
716                                 if (i < argc - 1) {
717                                         if (!GetInteger(argv[++i], &tempInt)) {
718                                                 cerr << " ERROR: invalid integer following option "
719                                                                 << argv[i - 1] << ": " << argv[i] << endl;
720                                                 exit(1);
721                                         } else {
722                                                 if (tempInt < 0) {
723                                                         tempInt = 0;
724                                                 }
725                                                 numThreads = tempInt;
726                                         }
727                                 } else {
728                                         cerr << "ERROR: Integer expected for option " << argv[i]
729                                                         << endl;
730                                         exit(1);
731                                 }
732                                 // number of consistency transformations
733                         } else if (!strcmp(argv[i], "-c")
734                                         || !strcmp(argv[i], "--consistency")) {
735                                 if (i < argc - 1) {
736                                         if (!GetInteger(argv[++i], &tempInt)) {
737                                                 cerr << "ERROR: Invalid integer following option "
738                                                                 << argv[i - 1] << ": " << argv[i] << endl;
739                                                 exit(1);
740                                         } else {
741                                                 if (tempInt < MIN_CONSISTENCY_REPS
742                                                                 || tempInt > MAX_CONSISTENCY_REPS) {
743                                                         cerr << "ERROR: For option " << argv[i - 1]
744                                                                         << ", integer must be between "
745                                                                         << MIN_CONSISTENCY_REPS << " and "
746                                                                         << MAX_CONSISTENCY_REPS << "." << endl;
747                                                         exit(1);
748                                                 } else {
749                                                         numConsistencyReps = tempInt;
750                                                 }
751                                         }
752                                 } else {
753                                         cerr << "ERROR: Integer expected for option " << argv[i]
754                                                         << endl;
755                                         exit(1);
756                                 }
757                         }
758
759                         // number of randomized partitioning iterative refinement passes
760                         else if (!strcmp(argv[i], "-ir")
761                                         || !strcmp(argv[i], "--iterative-refinement")) {
762                                 if (i < argc - 1) {
763                                         if (!GetInteger(argv[++i], &tempInt)) {
764                                                 cerr << "ERROR: Invalid integer following option "
765                                                                 << argv[i - 1] << ": " << argv[i] << endl;
766                                                 exit(1);
767                                         } else {
768                                                 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
769                                                                 || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
770                                                         cerr << "ERROR: For option " << argv[i - 1]
771                                                                         << ", integer must be between "
772                                                                         << MIN_ITERATIVE_REFINEMENT_REPS << " and "
773                                                                         << MAX_ITERATIVE_REFINEMENT_REPS << "."
774                                                                         << endl;
775                                                         exit(1);
776                                                 } else
777                                                         numIterativeRefinementReps = tempInt;
778                                         }
779                                 } else {
780                                         cerr << "ERROR: Integer expected for option " << argv[i]
781                                                         << endl;
782                                         exit(1);
783                                 }
784                         }
785
786                         // annotation files
787                         else if (!strcmp(argv[i], "-annot")) {
788                                 enableAnnotation = true;
789                                 if (i < argc - 1) {
790                                         annotationFilename = argv[++i];
791                                 } else {
792                                         cerr << "ERROR: FILENAME expected for option " << argv[i]
793                                                         << endl;
794                                         exit(1);
795                                 }
796                         }
797
798                         // clustalw output format
799                         else if (!strcmp(argv[i], "-clustalw")) {
800                                 enableClustalWOutput = true;
801                         }
802
803                         // cutoff
804                         else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) {
805                                 if (i < argc - 1) {
806                                         if (!GetFloat(argv[++i], &tempFloat)) {
807                                                 cerr
808                                                                 << "ERROR: Invalid floating-point value following option "
809                                                                 << argv[i - 1] << ": " << argv[i] << endl;
810                                                 exit(1);
811                                         } else {
812                                                 if (tempFloat < 0 || tempFloat > 1) {
813                                                         cerr << "ERROR: For option " << argv[i - 1]
814                                                                         << ", floating-point value must be between 0 and 1."
815                                                                         << endl;
816                                                         exit(1);
817                                                 } else
818                                                         cutoff = tempFloat;
819                                         }
820                                 } else {
821                                         cerr << "ERROR: Floating-point value expected for option "
822                                                         << argv[i] << endl;
823                                         exit(1);
824                                 }
825                         }
826
827                         // verbose reporting
828                         else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
829                                 enableVerbose = true;
830                         }
831
832                         // alignment order
833                         else if (!strcmp(argv[i], "-a")
834                                         || !strcmp(argv[i], "--alignment-order")) {
835                                 enableAlignOrder = true;
836                         }
837
838                         //print out version
839                         else if (!strcmp(argv[i], "-version")) {
840                                 cerr << "MSAPROBS version " << VERSION << endl;
841                                 exit(1);
842                         }
843                         // bad arguments
844                         else {
845                                 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
846                                 exit(1);
847                         }
848                 } else {
849                         sequenceNames.push_back(string(argv[i]));
850                 }
851         }
852
853         /*check the output file name*/
854         cerr << "-------------------------------------" << endl;
855         if (alignOutFileName.length() == 0) {
856                 cerr << "The final alignments will be printed out to STDOUT" << endl;
857                 alignOutFile = &std::cout;
858         } else {
859                 cerr << "Open the output file " << alignOutFileName << endl;
860                 alignOutFile = new ofstream(alignOutFileName.c_str(),
861                                 ios::binary | ios::out | ios::trunc);
862         }
863         cerr << "-------------------------------------" << endl;
864         return sequenceNames;
865 }
866
867 /////////////////////////////////////////////////////////////////
868 // ProcessTree()
869 //
870 // Process the tree recursively.  Returns the aligned sequences
871 // corresponding to a node or leaf of the tree.
872 /////////////////////////////////////////////////////////////////
873 MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
874                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
875                 const ProbabilisticModel &model) {
876
877         MultiSequence *result;
878
879         // check if this is a node of the alignment tree
880         //if (tree->GetSequenceLabel() == -1){
881         if (tree->leaf == NODE) {
882                 MultiSequence *alignLeft = ProcessTree(tree->left, sequences,
883                                 sparseMatrices, model);
884                 MultiSequence *alignRight = ProcessTree(tree->right, sequences,
885                                 sparseMatrices, model);
886
887                 assert(alignLeft);
888                 assert(alignRight);
889
890                 result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model);
891                 assert(result);
892
893                 delete alignLeft;
894                 delete alignRight;
895         }
896
897         // otherwise, this is a leaf of the alignment tree
898         else {
899                 result = new MultiSequence();
900                 assert(result);
901                 //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
902                 result->AddSequence(sequences->GetSequence(tree->idx)->Clone());
903         }
904
905         return result;
906 }
907
908 /////////////////////////////////////////////////////////////////
909 // ComputeFinalAlignment()
910 //
911 // Compute the final alignment by calling ProcessTree(), then
912 // performing iterative refinement as needed.
913 /////////////////////////////////////////////////////////////////
914
915 MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
916                 MultiSequence *sequences,
917                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
918                 const ProbabilisticModel &model) {
919         MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
920                         sparseMatrices, model);
921
922         SafeVector<int> oldOrdering;
923         if (enableAlignOrder) {
924                 for (int i = 0; i < alignment->GetNumSequences(); i++)
925                         oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel());
926                 alignment->SaveOrdering();
927                 enableAlignOrder = false;
928         }
929
930         // tree-based refinement
931         // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
932         /*
933          int numSeqs = alignment->GetNumSequences();
934          //if(numSeqs < numIterativeRefinementReps){
935          for(int iter = 0; iter < 5; iter ++){
936                 for(int i = 0; i < numSeqs - 1; i++){
937                         DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i);
938                  }
939          }
940          //}*/
941         //Refinement return false:no improvement 
942         for (int i = 0; i < numIterativeRefinementReps; i++) {               
943                 DoIterativeRefinement(sparseMatrices, model, alignment);
944         }
945         cerr << endl;   
946
947         if (oldOrdering.size() > 0) {
948                 for (int i = 0; i < (int) oldOrdering.size(); i++) {
949                         alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
950                 }
951         }
952
953         // return final alignment
954         return alignment;
955 }
956
957 /////////////////////////////////////////////////////////////////
958 // AlignAlignments()
959 //
960 // Returns the alignment of two MultiSequence objects.
961 /////////////////////////////////////////////////////////////////
962
963 MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
964                 MultiSequence *align2,
965                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
966                 const ProbabilisticModel &model) {
967
968         // print some info about the alignment
969         if (enableVerbose) {
970                 for (int i = 0; i < align1->GetNumSequences(); i++)
971                         cerr << ((i == 0) ? "[" : ",")
972                                         << align1->GetSequence(i)->GetLabel();
973                 cerr << "] vs. ";
974                 for (int i = 0; i < align2->GetNumSequences(); i++)
975                         cerr << ((i == 0) ? "[" : ",")
976                                         << align2->GetSequence(i)->GetLabel();
977                 cerr << "]: ";
978         }
979 #if 0
980         VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
981 #else
982         VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
983                         sparseMatrices, cutoff);
984 #endif
985         // compute an "accuracy" measure for the alignment before refinement
986
987         pair<SafeVector<char> *, float> alignment;
988         //perform alignment
989         alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(),
990                         align2->GetSequence(0)->GetLength(), *posterior);
991
992         delete posterior;
993
994         if (enableVerbose) {
995
996                 // compute total length of sequences
997                 int totLength = 0;
998                 for (int i = 0; i < align1->GetNumSequences(); i++)
999                         for (int j = 0; j < align2->GetNumSequences(); j++)
1000                                 totLength += min(align1->GetSequence(i)->GetLength(),
1001                                                 align2->GetSequence(j)->GetLength());
1002
1003                 // give an "accuracy" measure for the alignment
1004                 cerr << alignment.second / totLength << endl;
1005         }
1006
1007         // now build final alignment
1008         MultiSequence *result = new MultiSequence();
1009         for (int i = 0; i < align1->GetNumSequences(); i++)
1010                 result->AddSequence(
1011                                 align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1012         for (int i = 0; i < align2->GetNumSequences(); i++)
1013                 result->AddSequence(
1014                                 align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1015         if (!enableAlignOrder)
1016                 result->SortByLabel();
1017
1018         // free temporary alignment
1019         delete alignment.first;
1020
1021         return result;
1022 }
1023
1024 /////////////////////////////////////////////////////////////////
1025 // DoRelaxation()
1026 //
1027 // Performs one round of the weighted probabilistic consistency transformation.
1028 //                     1
1029 /////////////////////////////////////////////////////////////////
1030
1031 SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
1032                 MultiSequence *sequences,
1033                 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1034         const int numSeqs = sequences->GetNumSequences();
1035
1036         SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
1037                         SafeVector<SparseMatrix *>(numSeqs, NULL));
1038
1039         // for every pair of sequences
1040 #ifdef _OPENMP
1041         int pairIdx;
1042 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
1043         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
1044                 int i = seqsPairs[pairIdx].seq1;
1045                 int j = seqsPairs[pairIdx].seq2;
1046                 float wi = seqsWeights[i];
1047                 float wj = seqsWeights[j];
1048 #else
1049         for (int i = 0; i < numSeqs; i++) {
1050                 float wi = seqsWeights[i];
1051                 for (int j = i + 1; j < numSeqs; j++) {
1052                         float wj = seqsWeights[j];
1053 #endif
1054                         Sequence *seq1 = sequences->GetSequence(i);
1055                         Sequence *seq2 = sequences->GetSequence(j);
1056
1057                         if (enableVerbose) {
1058 #ifdef _OPENMP
1059 #pragma omp critical
1060 #endif
1061                                 cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
1062                                                 << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
1063                                                 << ": ";
1064                         }
1065                         // get the original posterior matrix
1066                         VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
1067                         assert(posteriorPtr);
1068                         VF &posterior = *posteriorPtr;
1069
1070                         const int seq1Length = seq1->GetLength();
1071                         const int seq2Length = seq2->GetLength();
1072
1073                         // contribution from the summation where z = x and z = y
1074                         float w = wi * wi * wj + wi * wj * wj;
1075                         float sumW = w;
1076                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1077                                 //posterior[k] = w*posterior[k];
1078                                 posterior[k] += posterior[k];
1079                         }
1080
1081                         if (enableVerbose)
1082                                 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1083
1084                         // contribution from all other sequences
1085                         for (int k = 0; k < numSeqs; k++) {
1086                                 if (k != i && k != j) {
1087                                         float wk = seqsWeights[k];
1088                                         float w = wi * wj * wk;
1089                                         sumW += w;
1090                                         if (k < i)
1091                                                 Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
1092                                                                 posterior);
1093                                         else if (k > i && k < j)
1094                                                 Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
1095                                                                 posterior);
1096                                         else {
1097                                                 SparseMatrix *temp =
1098                                                                 sparseMatrices[j][k]->ComputeTranspose();
1099                                                 Relax(w, sparseMatrices[i][k], temp, posterior);
1100                                                 delete temp;
1101                                         }
1102                                 }
1103                         }
1104                         //cerr<<"sumW "<<sumW<<endl;
1105                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1106                                 //posterior[k] /= sumW;
1107                                 posterior[k] /= numSeqs;
1108                         }
1109                         // mask out positions not originally in the posterior matrix
1110                         SparseMatrix *matXY = sparseMatrices[i][j];
1111                         for (int y = 0; y <= seq2Length; y++)
1112                                 posterior[y] = 0;
1113                         for (int x = 1; x <= seq1Length; x++) {
1114                                 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1115                                 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1116                                 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1117                                 int curr = 0;
1118                                 while (XYptr != XYend) {
1119
1120                                         // zero out all cells until the first filled column
1121                                         while (curr < XYptr->first) {
1122                                                 base[curr] = 0;
1123                                                 curr++;
1124                                         }
1125
1126                                         // now, skip over this column
1127                                         curr++;
1128                                         ++XYptr;
1129                                 }
1130
1131                                 // zero out cells after last column
1132                                 while (curr <= seq2Length) {
1133                                         base[curr] = 0;
1134                                         curr++;
1135                                 }
1136                         }
1137
1138                         // save the new posterior matrix
1139                         newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
1140                                         seq2->GetLength(), posterior);
1141                         newSparseMatrices[j][i] = NULL;
1142
1143                         if (enableVerbose)
1144                                 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1145
1146                         delete posteriorPtr;
1147
1148                         if (enableVerbose)
1149                                 cerr << "done." << endl;
1150 #ifndef _OPENMP
1151                 }
1152 #endif
1153         }
1154
1155         return newSparseMatrices;
1156 }
1157
1158 /////////////////////////////////////////////////////////////////
1159 // Relax()
1160 //
1161 // Computes the consistency transformation for a single sequence
1162 // z, and adds the transformed matrix to "posterior".
1163 /////////////////////////////////////////////////////////////////
1164
1165 void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
1166                 VF &posterior) {
1167
1168         assert(matXZ);
1169         assert(matZY);
1170
1171         int lengthX = matXZ->GetSeq1Length();
1172         int lengthY = matZY->GetSeq2Length();
1173         assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1174
1175         // for every x[i]
1176         for (int i = 1; i <= lengthX; i++) {
1177                 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1178                 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1179
1180                 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1181
1182                 // iterate through all x[i]-z[k]
1183                 while (XZptr != XZend) {
1184                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1185                         SafeVector<PIF>::iterator ZYend = ZYptr
1186                                         + matZY->GetRowSize(XZptr->first);
1187                         const float XZval = XZptr->second;
1188
1189                         // iterate through all z[k]-y[j]
1190                         while (ZYptr != ZYend) {
1191                                 //base[ZYptr->first] += weight * XZval * ZYptr->second;
1192                                 base[ZYptr->first] += XZval * ZYptr->second;
1193                                 ZYptr++;
1194                         }
1195                         XZptr++;
1196                 }
1197         }
1198 }
1199
1200 /////////////////////////////////////////////////////////////////
1201 // Relax1()
1202 //
1203 // Computes the consistency transformation for a single sequence
1204 // z, and adds the transformed matrix to "posterior".
1205 /////////////////////////////////////////////////////////////////
1206
1207 void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
1208                 VF &posterior) {
1209
1210         assert(matZX);
1211         assert(matZY);
1212
1213         int lengthZ = matZX->GetSeq1Length();
1214         int lengthY = matZY->GetSeq2Length();
1215
1216         // for every z[k]
1217         for (int k = 1; k <= lengthZ; k++) {
1218                 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1219                 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1220
1221                 // iterate through all z[k]-x[i]
1222                 while (ZXptr != ZXend) {
1223                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1224                         SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1225                         const float ZXval = ZXptr->second;
1226                         VF::iterator base = posterior.begin()
1227                                         + ZXptr->first * (lengthY + 1);
1228
1229                         // iterate through all z[k]-y[j]
1230                         while (ZYptr != ZYend) {
1231                                 //base[ZYptr->first] += weight * ZXval * ZYptr->second;
1232                                 base[ZYptr->first] += ZXval * ZYptr->second;
1233                                 ZYptr++;
1234                         }
1235                         ZXptr++;
1236                 }
1237         }
1238 }
1239 /////////////////////////////////////////////////////////////////
1240 // DoIterativeRefinement()
1241 //
1242 // Performs a single round of randomized partionining iterative
1243 // refinement.
1244 /////////////////////////////////////////////////////////////////
1245
1246 void MSA::DoIterativeRefinement(
1247                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1248                 const ProbabilisticModel &model, MultiSequence* &alignment) {
1249         set<int> groupOne, groupTwo;
1250         int numSeqs = alignment->GetNumSequences();
1251
1252         // create two separate groups
1253         for (int i = 0; i < numSeqs; i++) {
1254                 int index = rand();
1255                 if (index % 2) {
1256                         groupOne.insert(i);
1257                 } else {
1258                         groupTwo.insert(i);
1259                 }
1260         }
1261         if (groupOne.empty() || groupTwo.empty()) return;
1262
1263         // project into the two groups
1264         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1265         assert(groupOneSeqs);
1266         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1267         assert(groupTwoSeqs);   
1268 /*
1269 //start add by Yongtao
1270 #if 1
1271         VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
1272 #else
1273         VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs,
1274                         sparseMatrices, cutoff);
1275 #endif
1276
1277         // compute an "accuracy" measure for the alignment before refinement        
1278         SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
1279         SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
1280         int i=0; 
1281         for (set<int>::const_iterator iter = groupOne.begin();
1282                         iter != groupOne.end(); ++iter) {
1283                 oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1284         }
1285         i=0;
1286         for (set<int>::const_iterator iter = groupTwo.begin();
1287                         iter != groupTwo.end(); ++iter) {
1288                 oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1289         }
1290
1291         VF &posteriorArr = *posterior;
1292         int oldLength = alignment->GetSequence(0)->GetLength();
1293         int groupOneindex=0; int groupTwoindex=0;
1294         float accuracy_before = 0; 
1295         for (int i = 1; i <= oldLength; i++) {
1296                 // check to see if there is a gap in every sequence of the set
1297                 bool foundOne = false;
1298                 for (int j = 0; !foundOne && j < (int) groupOne.size(); j++)
1299                         foundOne = (oldOnePtrs[j][i] != '-');
1300                 // if not, then this column counts towards the sequence length
1301                 if (foundOne) groupOneindex ++;
1302                 bool foundTwo = false;
1303                 for (int j = 0; !foundTwo && j < (int) groupTwo.size(); j++)
1304                         foundTwo = (oldTwoPtrs[j][i] != '-');
1305                 // if not, then this column counts towards the sequence length
1306                 if (foundTwo) groupTwoindex ++;
1307                 if(foundOne && foundTwo) accuracy_before += 
1308                                 posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex];
1309         }
1310         
1311         pair<SafeVector<char> *, float> refinealignment;
1312         //perform alignment
1313         refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(),
1314                         groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
1315         delete posterior;
1316         // now build final alignment
1317         MultiSequence *result = new MultiSequence();
1318         //compare accuracy measure before and after refinement
1319         //if (refinealignment.second > accuracy_before) {
1320                 //cerr<<"Before:" << accuracy_before<<" after: "<< refinealignment.second<< endl; 
1321                 for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++)
1322                         result->AddSequence(
1323                                 groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X'));
1324                 for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++)
1325                         result->AddSequence(
1326                                 groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y'));
1327                 // free temporary alignment
1328                 delete refinealignment.first;
1329                 delete alignment;
1330                 alignment = result;
1331
1332         }
1333         else{
1334                 if(numIterativeRefinementReps < 8*numSeqs) numIterativeRefinementReps++;
1335                 delete groupOneSeqs;
1336                 delete groupTwoSeqs;
1337                 return false;
1338         }
1339   */    
1340 //end add by yongtao
1341
1342         //delete alignment;
1343         // realign
1344         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, model); //original
1345                 delete groupOneSeqs;
1346                 delete groupTwoSeqs;            
1347                 
1348 }
1349
1350 void MSA::DoIterativeRefinementTreeNode(
1351                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1352                 const ProbabilisticModel &model, MultiSequence* &alignment,
1353                 int nodeIndex) {
1354         set<int> groupOne, groupTwo;
1355         int numSeqs = alignment->GetNumSequences();
1356
1357         vector<bool> inGroup1;
1358         inGroup1.resize(numSeqs);
1359         for (int i = 0; i < numSeqs; i++) {
1360                 inGroup1[i] = false;
1361         }
1362
1363         AlignmentOrder* orders = this->tree->getAlignOrders();
1364         AlignmentOrder* order = &orders[nodeIndex];
1365         for (int i = 0; i < order->leftNum; i++) {
1366                 int si = order->leftLeafs[i];
1367                 inGroup1[si] = true;
1368         }
1369         for (int i = 0; i < order->rightNum; i++) {
1370                 int si = order->rightLeafs[i];
1371                 inGroup1[si] = true;
1372         }
1373         // create two separate groups
1374         for (int i = 0; i < numSeqs; i++) {
1375                 if (inGroup1[i]) {
1376                         groupOne.insert(i);
1377                 } else {
1378                         groupTwo.insert(i);
1379                 }
1380         }
1381         if (groupOne.empty() || groupTwo.empty())
1382                 return;
1383
1384         // project into the two groups
1385         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1386         assert(groupOneSeqs);
1387         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1388         assert(groupTwoSeqs);
1389         delete alignment;
1390
1391         // realign
1392         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
1393                         model);
1394
1395         delete groupOneSeqs;
1396         delete groupTwoSeqs;
1397 }
1398
1399 /////////////////////////////////////////////////////////////////
1400 // WriteAnnotation()
1401 //
1402 // Computes annotation for multiple alignment and write values
1403 // to a file.
1404 /////////////////////////////////////////////////////////////////
1405
1406 void MSA::WriteAnnotation(MultiSequence *alignment,
1407                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1408         ofstream outfile(annotationFilename.c_str());
1409
1410         if (outfile.fail()) {
1411                 cerr << "ERROR: Unable to write annotation file." << endl;
1412                 exit(1);
1413         }
1414
1415         const int alignLength = alignment->GetSequence(0)->GetLength();
1416         const int numSeqs = alignment->GetNumSequences();
1417
1418         SafeVector<int> position(numSeqs, 0);
1419         SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
1420         for (int i = 0; i < numSeqs; i++)
1421                 seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1422         SafeVector<pair<int, int> > active;
1423         active.reserve(numSeqs);
1424
1425         SafeVector<int> lab;
1426         for (int i = 0; i < numSeqs; i++)
1427                 lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1428
1429         // for every column
1430         for (int i = 1; i <= alignLength; i++) {
1431
1432                 // find all aligned residues in this particular column
1433                 active.clear();
1434                 for (int j = 0; j < numSeqs; j++) {
1435                         if (seqs[j][i] != '-') {
1436                                 active.push_back(make_pair(lab[j], ++position[j]));
1437                         }
1438                 }
1439
1440                 sort(active.begin(), active.end());
1441                 outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
1442         }
1443
1444         outfile.close();
1445 }
1446
1447 /////////////////////////////////////////////////////////////////
1448 // ComputeScore()
1449 //
1450 // Computes the annotation score for a particular column.
1451 /////////////////////////////////////////////////////////////////
1452
1453 int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
1454                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1455
1456         if (active.size() <= 1)
1457                 return 0;
1458
1459         // ALTERNATIVE #1: Compute the average alignment score.
1460
1461         float val = 0;
1462         for (int i = 0; i < (int) active.size(); i++) {
1463                 for (int j = i + 1; j < (int) active.size(); j++) {
1464                         val += sparseMatrices[active[i].first][active[j].first]->GetValue(
1465                                         active[i].second, active[j].second);
1466                 }
1467         }
1468
1469         return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1470
1471 }