Add GLprobs and MSAprobs to binaries
[jabaws.git] / binaries / src / GLProbs-1.0 / MSAdiv3.cpp
1 /***********************************************
2  * # Copyright 2009-2010. Liu Yongchao
3  * # Contact: Liu Yongchao, School of Computer Engineering,
4  * #                     Nanyang Technological University.
5  * # Emails:     liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
6  * #
7  * # GPL version 3.0 applies.
8  * #
9  * ************************************************/
10
11 #include <string>
12 #include <sstream>
13 #include <iomanip>
14 #include <iostream>
15 #include <list>
16 #include <set>
17 #include <algorithm>
18 #include <climits>
19 #include <cstdio>
20 #include <cstdlib>
21 #include <cerrno>
22 #include <iomanip>
23 #include "MSA.h"
24 #include "MSAClusterTree.h"
25 #include "Defaults.h"
26
27 #ifdef _OPENMP
28 #include <omp.h>
29 #endif
30
31 string parametersInputFilename = "";
32 string parametersOutputFilename = "no training";
33 string annotationFilename = "";
34
35 bool enableVerbose = false;
36 bool enableAnnotation = false;
37 bool enableClustalWOutput = false;
38 bool enableAlignOrder = false;
39 int numConsistencyReps = 2;
40 int numPreTrainingReps = 0;
41 int numIterativeRefinementReps = 100;
42
43 float cutoff = 0;
44
45 VF initDistrib(NumMatrixTypes);
46 VF gapOpen(2 * NumInsertStates);
47 VF gapExtend(2 * NumInsertStates);
48 VVF emitPairs(256, VF(256, 1e-10));
49 VF emitSingle(256, 1e-5);
50
51 string alphabet = alphabetDefault;
52
53 const int MIN_PRETRAINING_REPS = 0;
54 const int MAX_PRETRAINING_REPS = 20;
55 const int MIN_CONSISTENCY_REPS = 0;
56 const int MAX_CONSISTENCY_REPS = 5;
57 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
58 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
59
60 string posteriorProbsFilename = "";
61 bool allscores = true;
62 string infilename;
63
64 int flag_gui = 0;   //0: no gui related o/p 
65 //1: gui related o/p generated
66 int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
67 //1: pp score seq added to o/p fasta alignment
68
69 ///////////////////////////////
70 // global scoring matrix variables
71 //////////////////////////////
72 float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
73 char *aminos, *bases, matrixtype[20] = "gonnet_160";
74 int subst_index[26];
75
76 double sub_matrix[26][26];
77 int firstread = 0;              //this makes sure that matrices are read only once 
78
79 float TEMPERATURE = 5;
80 int MATRIXTYPE = 160;
81 int prot_nuc = 0;               //0=prot, 1=nucleotide
82
83 float GAPOPEN = 0;
84 float GAPEXT = 0;
85 int numThreads = 0;
86
87 //argument support
88 typedef struct {
89         char input[30];
90         int matrix;
91         int N;
92         float T;
93         float beta;
94         char opt;                       //can be 'P' or 'M'
95         float gapopen;
96         float gapext;
97 } argument_decl;
98
99 argument_decl argument;
100
101 extern inline void read_sustitution_matrix(char *fileName);
102 extern void setmatrixtype(int le);
103 extern inline int matrixtype_to_int();
104 extern inline void read_dna_matrix();
105 extern inline void read_vtml_la_matrix();
106 extern void init_arguments();
107
108 MSA::MSA(int argc, char* argv[]) {
109         //parse program parameters
110         SafeVector<string> sequenceNames = ParseParams(argc, argv);
111
112         //initialize arguments for partition function
113         init_arguments();
114
115         ReadParameters();
116         //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
117
118         //read the input sequences
119         MultiSequence *sequences = new MultiSequence();
120         assert(sequences);
121         for (int i = 0; i < (int) sequenceNames.size(); i++) {
122                 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
123                 sequences->LoadMFA(sequenceNames[i], true);
124         }
125         //allocate space for sequence weights
126         this->seqsWeights = new int[sequences->GetNumSequences()];
127         //initilaize parameters for OPENMP
128 #ifdef _OPENMP
129         if(numThreads <= 0) {
130                 numThreads = omp_get_num_procs();
131                 cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
132         }
133         cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
134
135         //set OpenMP to use dynamic number of threads which is equal to the number of processor cores on the host
136         omp_set_num_threads(numThreads);
137 #endif  
138
139         // now, we can perform the alignments and write them out
140         MultiSequence *alignment = doAlign(sequences,
141                         ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
142                                         emitSingle), initDistrib, gapOpen, gapExtend, emitPairs,
143                         emitSingle);
144
145         //write the alignment results to standard output
146         if (enableClustalWOutput) {
147                 alignment->WriteALN(*alignOutFile);
148         } else {
149                 alignment->WriteMFA(*alignOutFile);
150         }
151         //release resources
152         delete[] this->seqsWeights;
153         delete alignment;
154         delete sequences;
155 }
156 MSA::~MSA() {
157         /*close the output file*/
158         if (alignOutFileName.length() > 0) {
159                 ((std::ofstream*) alignOutFile)->close();
160         }
161 }
162 /////////////////////////////////////////////////////////////////
163 // PrintParameters()
164 //
165 // Prints MSAPROBS parameters to STDERR.  If a filename is
166 // specified, then the parameters are also written to the file.
167 /////////////////////////////////////////////////////////////////
168
169 void MSA::PrintParameters(const char *message, const VF &initDistrib,
170                 const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
171                 const VF &emitSingle, const char *filename) {
172
173         // print parameters to the screen
174         cerr << message << endl << "    initDistrib[] = { ";
175         for (int i = 0; i < NumMatrixTypes; i++)
176                 cerr << setprecision(10) << initDistrib[i] << " ";
177         cerr << "}" << endl << "        gapOpen[] = { ";
178         for (int i = 0; i < NumInsertStates * 2; i++)
179                 cerr << setprecision(10) << gapOpen[i] << " ";
180         cerr << "}" << endl << "      gapExtend[] = { ";
181         for (int i = 0; i < NumInsertStates * 2; i++)
182                 cerr << setprecision(10) << gapExtend[i] << " ";
183         cerr << "}" << endl << endl;
184
185         /*
186          for (int i = 0; i < 5; i++){
187          for (int j = 0; j <= i; j++){
188          cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
189          }
190          cerr << endl;
191          }*/
192
193         // if a file name is specified
194         if (filename) {
195
196                 // attempt to open the file for writing
197                 FILE *file = fopen(filename, "w");
198                 if (!file) {
199                         cerr << "ERROR: Unable to write parameter file: " << filename
200                                         << endl;
201                         exit(1);
202                 }
203
204                 // if successful, then write the parameters to the file
205                 for (int i = 0; i < NumMatrixTypes; i++)
206                         fprintf(file, "%.10f ", initDistrib[i]);
207                 fprintf(file, "\n");
208                 for (int i = 0; i < 2 * NumInsertStates; i++)
209                         fprintf(file, "%.10f ", gapOpen[i]);
210                 fprintf(file, "\n");
211                 for (int i = 0; i < 2 * NumInsertStates; i++)
212                         fprintf(file, "%.10f ", gapExtend[i]);
213                 fprintf(file, "\n");
214                 fprintf(file, "%s\n", alphabet.c_str());
215                 for (int i = 0; i < (int) alphabet.size(); i++) {
216                         for (int j = 0; j <= i; j++)
217                                 fprintf(file, "%.10f ",
218                                                 emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
219                         fprintf(file, "\n");
220                 }
221                 for (int i = 0; i < (int) alphabet.size(); i++)
222                         fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
223                 fprintf(file, "\n");
224                 fclose(file);
225         }
226 }
227
228 /////////////////////////////////////////////////////////////////
229 // doAlign()
230 //
231 // First computes all pairwise posterior probability matrices.
232 // Then, computes new parameters if training, or a final
233 // alignment, otherwise.
234 /////////////////////////////////////////////////////////////////
235 extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
236 MultiSequence* MSA::doAlign(MultiSequence *sequences,
237                 const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
238                 VF &gapExtend, VVF &emitPairs, VF &emitSingle) {
239         assert(sequences);
240
241         //get the number of sequences
242         const int numSeqs = sequences->GetNumSequences();
243
244         //create distance matrix
245         VVF probalign_distances(numSeqs, VF(numSeqs, 0));
246         VVF distances(numSeqs, VF(numSeqs, 0));//msa
247
248         float gl_accuracy = 0;
249         //creat sparseMatrices
250         SafeVector<SafeVector<SparseMatrix *> > probalign_sparseMatrices(numSeqs,
251                         SafeVector<SparseMatrix *>(numSeqs, NULL));
252         SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
253                         SafeVector<SparseMatrix *>(numSeqs, NULL)); // msa
254
255 #ifdef _OPENMP
256         //calculate sequence pairs for openmp model
257         int pairIdx = 0;
258         numPairs = (numSeqs - 1) * numSeqs / 2;
259         seqsPairs = new SeqsPair[numPairs];
260         for(int a = 0; a < numSeqs; a++) {
261                 for(int b = a + 1; b < numSeqs; b++) {
262                         seqsPairs[pairIdx].seq1 = a;
263                         seqsPairs[pairIdx].seq2 = b;
264                         pairIdx++;
265                 }
266         }
267 #endif
268         // do all pairwise alignments for posterior probability matrices
269 #ifdef _OPENMP
270 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
271         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
272                 int a= seqsPairs[pairIdx].seq1;
273                 int b = seqsPairs[pairIdx].seq2;
274                 if(enableVerbose) {
275 #pragma omp critical
276                         cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
277                 }
278 #else
279         for (int a = 0; a < numSeqs - 1; a++) {
280                 for (int b = a + 1; b < numSeqs; b++) {
281 #endif
282                         Sequence *seq1 = sequences->GetSequence(a);
283                         Sequence *seq2 = sequences->GetSequence(b);
284
285                         // verbose output
286                         if (enableVerbose) {
287                                 cerr << "Computing posterior matrix: (" << a + 1 << ") "
288                                                 << seq1->GetHeader() << " vs. " << "(" << b + 1 << ") "
289                                                 << seq2->GetHeader() << " -- ";
290                         }
291
292 //probcons
293                         // compute forward and backward probabilities
294                         VF *forward = model.ComputeForwardMatrix(seq1, seq2);
295                         assert(forward);
296                         VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
297                         assert(backward);
298                         // compute posterior probability matrix from HMM
299                         VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward);
300                         assert(probcons_posterior);             
301                         delete forward;
302                         delete backward;                     
303
304 //probalign
305                         VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
306                         assert(probalign_posterior);
307                         probalign_sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),seq2->GetLength(),                                                                  *probalign_posterior);
308                         probalign_sparseMatrices[b][a] = NULL;                 
309                         pair<SafeVector<char> *, float> probalign_alignment = model.ComputeAlignment(
310                                         seq1->GetLength(), seq2->GetLength(), *probalign_posterior);                     
311                         probalign_distances[a][b] =1.0f - probalign_alignment.second / min(seq1->GetLength(), seq2->GetLength());                       
312                         delete probalign_alignment.first;  
313
314 //local
315                         forward = model.ComputeForwardMatrix(seq1, seq2,false);
316                         assert(forward);
317                         backward = model.ComputeBackwardMatrix(seq1, seq2,false);
318                         assert(backward);
319                         VF* local_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
320                         delete forward;
321                         delete backward; 
322                   
323 //GL
324                         //merge probalign + local + probcons 
325                         VF::iterator ptr1 = probcons_posterior->begin();
326                         VF::iterator ptr2 = probalign_posterior->begin();                       
327                         VF::iterator ptr3 = local_posterior->begin();
328                         VF* posterior = new VF((seq1->GetLength()+1) * (seq2->GetLength()+1)); assert (posterior);  //msa
329                         VF::iterator ptr = posterior->begin();                 
330                         for (int i = 0; i <= seq1->GetLength(); i++) {
331                                 for (int j = 0; j <= seq2->GetLength(); j++) {
332                                         float v1 = *ptr1;
333                                         float v2 = *ptr2;
334                                         float v3 = *ptr3;
335                                         *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3);
336                                         ptr1++;
337                                         ptr2++;
338                                         ptr3++;
339                                         ptr++;
340                                 }
341                         }
342                         // perform the pairwise sequence alignment
343                         pair<SafeVector<char> *, float> gl_alignment = model.ComputeAlignment(
344                                         seq1->GetLength(), seq2->GetLength(), *posterior);
345
346                         //compute expected accuracy
347                         distances[a][b] = distances[b][a] = 1.0f - gl_alignment.second
348                                         / min(seq1->GetLength(), seq2->GetLength());
349
350                         // compute sparse representations
351                         sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
352                         seq2->GetLength(), *posterior);
353                         sparseMatrices[b][a] = NULL;
354                         //
355                         SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
356                         SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
357                         float N_correct_match = 0;
358                         int i = 1;int j = 1;
359                         for (SafeVector<char>::iterator iter = gl_alignment.first->begin(); 
360                                 iter != gl_alignment.first->end(); ++iter){
361                                 if (*iter == 'B'){
362                                         unsigned char c1 = (unsigned char) iter1[i++];
363                                         unsigned char c2 = (unsigned char) iter2[j++];
364                                         if(c1==c2) N_correct_match += 1;
365                                 }
366                                 else if(*iter == 'X') i++;
367                                 else if(*iter == 'Y') j++;
368                         }
369                         if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "GL"<< endl;
370                         gl_accuracy += N_correct_match / min(seq1->GetLength(), seq2->GetLength());
371                         //
372                         delete probcons_posterior;
373                         delete probalign_posterior;
374                         delete local_posterior;
375                         delete posterior;
376
377 #ifndef _OPENMP
378                 }
379 #endif
380         } 
381
382 /*
383 //self-adaptive   
384         gl_accuracy /= numPairs;
385         if(gl_accuracy > 0.4){
386                 for (int a = 0; a < numSeqs - 1; a++) 
387                         for (int b = a + 1; b < numSeqs; b++) {
388                                 distances[a][b] = distances[b][a] = probalign_distances[a][b];
389                                 sparseMatrices[a][b] = probalign_sparseMatrices[a][b];
390                                 sparseMatrices[b][a] = NULL;
391                         }
392         }
393 */
394         //create the guide tree
395         this->tree = new MSAClusterTree(this, distances, numSeqs);
396         this->tree->create();
397
398         // perform the consistency transformation the desired number of times
399         float* fweights = new float[numSeqs];
400         for (int r = 0; r < numSeqs; r++) {
401                 fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
402                 fweights[r] *= 10;
403         }
404         for (int r = 0; r < numConsistencyReps; r++) {
405                 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
406                                 DoRelaxation(fweights, sequences, sparseMatrices);
407
408                 // now replace the old posterior matrices
409                 for (int i = 0; i < numSeqs; i++) {
410                         for (int j = 0; j < numSeqs; j++) {
411                                 delete sparseMatrices[i][j];
412                                 sparseMatrices[i][j] = newSparseMatrices[i][j];
413                         }
414                 }
415         }
416         delete[] fweights;
417 #ifdef _OPENMP
418         delete [] seqsPairs;
419 #endif
420
421         //compute the final multiple sequence alignment
422         MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
423                         sparseMatrices, model);
424
425         // build annotation
426         if (enableAnnotation) {
427                 WriteAnnotation(finalAlignment, sparseMatrices);
428         }
429         //destroy the guide tree
430         delete this->tree;
431         this->tree = 0;
432
433         // delete sparse matrices
434         for (int a = 0; a < numSeqs - 1; a++) {
435                 for (int b = a + 1; b < numSeqs; b++) {
436                         delete sparseMatrices[a][b];
437                         delete sparseMatrices[b][a];
438                 }
439         }
440
441         return finalAlignment;
442 }
443
444 /////////////////////////////////////////////////////////////////
445 // GetInteger()
446 //
447 // Attempts to parse an integer from the character string given.
448 // Returns true only if no parsing error occurs.
449 /////////////////////////////////////////////////////////////////
450
451 bool GetInteger(char *data, int *val) {
452         char *endPtr;
453         long int retVal;
454
455         assert(val);
456
457         errno = 0;
458         retVal = strtol(data, &endPtr, 0);
459         if (retVal == 0 && (errno != 0 || data == endPtr))
460                 return false;
461         if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN))
462                 return false;
463         if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
464                 return false;
465         *val = (int) retVal;
466         return true;
467 }
468
469 /////////////////////////////////////////////////////////////////
470 // GetFloat()
471 //
472 // Attempts to parse a float from the character string given.
473 // Returns true only if no parsing error occurs.
474 /////////////////////////////////////////////////////////////////
475
476 bool GetFloat(char *data, float *val) {
477         char *endPtr;
478         double retVal;
479
480         assert(val);
481
482         errno = 0;
483         retVal = strtod(data, &endPtr);
484         if (retVal == 0 && (errno != 0 || data == endPtr))
485                 return false;
486         if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
487                 return false;
488         *val = (float) retVal;
489         return true;
490 }
491
492 /////////////////////////////////////////////////////////////////
493 // ReadParameters()
494 //
495 // Read initial distribution, transition, and emission
496 // parameters from a file.
497 /////////////////////////////////////////////////////////////////
498
499 void MSA::ReadParameters() {
500
501         ifstream data;
502
503         emitPairs = VVF(256, VF(256, 1e-10));
504         emitSingle = VF(256, 1e-5);
505
506         // read initial state distribution and transition parameters
507         if (parametersInputFilename == string("")) {
508                 if (NumInsertStates == 1) {
509                         for (int i = 0; i < NumMatrixTypes; i++)
510                                 initDistrib[i] = initDistrib1Default[i];
511                         for (int i = 0; i < 2 * NumInsertStates; i++)
512                                 gapOpen[i] = gapOpen1Default[i];
513                         for (int i = 0; i < 2 * NumInsertStates; i++)
514                                 gapExtend[i] = gapExtend1Default[i];
515                 } else if (NumInsertStates == 2) {
516                         for (int i = 0; i < NumMatrixTypes; i++)
517                                 initDistrib[i] = initDistrib2Default[i];
518                         for (int i = 0; i < 2 * NumInsertStates; i++)
519                                 gapOpen[i] = gapOpen2Default[i];
520                         for (int i = 0; i < 2 * NumInsertStates; i++)
521                                 gapExtend[i] = gapExtend2Default[i];
522                 } else {
523                         cerr
524                                         << "ERROR: No default initial distribution/parameter settings exist"
525                                         << endl << "       for " << NumInsertStates
526                                         << " pairs of insert states.  Use --paramfile." << endl;
527                         exit(1);
528                 }
529
530                 alphabet = alphabetDefault;
531
532                 for (int i = 0; i < (int) alphabet.length(); i++) {
533                         emitSingle[(unsigned char) tolower(alphabet[i])] =
534                                         emitSingleDefault[i];
535                         emitSingle[(unsigned char) toupper(alphabet[i])] =
536                                         emitSingleDefault[i];
537                         for (int j = 0; j <= i; j++) {
538                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
539                                                 alphabet[j])] = emitPairsDefault[i][j];
540                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
541                                                 alphabet[j])] = emitPairsDefault[i][j];
542                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
543                                                 alphabet[j])] = emitPairsDefault[i][j];
544                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
545                                                 alphabet[j])] = emitPairsDefault[i][j];
546                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
547                                                 alphabet[i])] = emitPairsDefault[i][j];
548                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
549                                                 alphabet[i])] = emitPairsDefault[i][j];
550                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
551                                                 alphabet[i])] = emitPairsDefault[i][j];
552                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
553                                                 alphabet[i])] = emitPairsDefault[i][j];
554                         }
555                 }
556         } else {
557                 data.open(parametersInputFilename.c_str());
558                 if (data.fail()) {
559                         cerr << "ERROR: Unable to read parameter file: "
560                                         << parametersInputFilename << endl;
561                         exit(1);
562                 }
563
564                 string line[3];
565                 for (int i = 0; i < 3; i++) {
566                         if (!getline(data, line[i])) {
567                                 cerr
568                                                 << "ERROR: Unable to read transition parameters from parameter file: "
569                                                 << parametersInputFilename << endl;
570                                 exit(1);
571                         }
572                 }
573                 istringstream data2;
574                 data2.clear();
575                 data2.str(line[0]);
576                 for (int i = 0; i < NumMatrixTypes; i++)
577                         data2 >> initDistrib[i];
578                 data2.clear();
579                 data2.str(line[1]);
580                 for (int i = 0; i < 2 * NumInsertStates; i++)
581                         data2 >> gapOpen[i];
582                 data2.clear();
583                 data2.str(line[2]);
584                 for (int i = 0; i < 2 * NumInsertStates; i++)
585                         data2 >> gapExtend[i];
586
587                 if (!getline(data, line[0])) {
588                         cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
589                                         << parametersInputFilename << endl;
590                         exit(1);
591                 }
592
593                 // read alphabet as concatenation of all characters on alphabet line
594                 alphabet = "";
595                 string token;
596                 data2.clear();
597                 data2.str(line[0]);
598                 while (data2 >> token)
599                         alphabet += token;
600
601                 for (int i = 0; i < (int) alphabet.size(); i++) {
602                         for (int j = 0; j <= i; j++) {
603                                 float val;
604                                 data >> val;
605                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
606                                                 alphabet[j])] = val;
607                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
608                                                 alphabet[j])] = val;
609                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
610                                                 alphabet[j])] = val;
611                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
612                                                 alphabet[j])] = val;
613                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
614                                                 alphabet[i])] = val;
615                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
616                                                 alphabet[i])] = val;
617                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
618                                                 alphabet[i])] = val;
619                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
620                                                 alphabet[i])] = val;
621                         }
622                 }
623
624                 for (int i = 0; i < (int) alphabet.size(); i++) {
625                         float val;
626                         data >> val;
627                         emitSingle[(unsigned char) tolower(alphabet[i])] = val;
628                         emitSingle[(unsigned char) toupper(alphabet[i])] = val;
629                 }
630                 data.close();
631         }
632 }
633
634 /////////////////////////////////////////////////////////////////
635 // ParseParams()
636 //
637 // Parse all command-line options.
638 /////////////////////////////////////////////////////////////////
639 void MSA::printUsage() {
640         cerr
641                         << "************************************************************************"
642                         << endl
643                         << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
644                         << endl
645                         << "\tbased on pair hidden markov model and partition function postirior"
646                         << endl
647                         << "\tprobabilities. If any comments or problems, please contact"
648                         << endl
649                         << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
650                         << endl
651                         << "*************************************************************************"
652                         << endl << "Usage:" << endl
653                         << "       msaprobs [OPTION]... [infile]..." << endl << endl
654                         << "Description:" << endl
655                         << "       Align sequences in multi-FASTA format" << endl << endl
656                         << "       -o, --outfile <string>" << endl
657                         << "              specify the output file name (STDOUT by default)"
658                         << endl << "       -num_threads <integer>" << endl
659                         << "              specify the number of threads used, and otherwise detect automatically"
660                         << endl << "       -clustalw" << endl
661                         << "              use CLUSTALW output format instead of FASTA format"
662                         << endl << endl << "       -c, --consistency REPS" << endl
663                         << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
664                         << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
665                         << ") passes of consistency transformation" << endl << endl
666                         << "       -ir, --iterative-refinement REPS" << endl
667                         << "              use " << MIN_ITERATIVE_REFINEMENT_REPS
668                         << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
669                         << numIterativeRefinementReps << ") passes of iterative-refinement"
670                         << endl << endl << "       -v, --verbose" << endl
671                         << "              report progress while aligning (default: "
672                         << (enableVerbose ? "on" : "off") << ")" << endl << endl
673                         << "       -annot FILENAME" << endl
674                         << "              write annotation for multiple alignment to FILENAME"
675                         << endl << endl << "       -a, --alignment-order" << endl
676                         << "              print sequences in alignment order rather than input order (default: "
677                         << (enableAlignOrder ? "on" : "off") << ")" << endl
678                         << "       -version " << endl
679                         << "              print out version of MSAPROBS " << endl << endl;
680 }
681 SafeVector<string> MSA::ParseParams(int argc, char **argv) {
682         if (argc < 2) {
683                 printUsage();
684                 exit(1);
685         }
686         SafeVector<string> sequenceNames;
687         int tempInt;
688         float tempFloat;
689
690         for (int i = 1; i < argc; i++) {
691                 if (argv[i][0] == '-') {
692                         //help
693                         if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) {
694                                 printUsage();
695                                 exit(1);
696                                 //output file name
697                         } else if (!strcmp(argv[i], "-o")
698                                         || !strcmp(argv[i], "--outfile")) {
699                                 if (i < argc - 1) {
700                                         alignOutFileName = argv[++i];   //get the file name
701                                 } else {
702                                         cerr << "ERROR: String expected for option " << argv[i]
703                                                         << endl;
704                                         exit(1);
705                                 }
706                                 // parameter file
707                         } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
708                                 if (i < argc - 1)
709                                         parametersInputFilename = string (argv[++i]);
710                                 else {
711                                         cerr << "ERROR: Filename expected for option " << argv[i] << endl;
712                                         exit (1);
713                                 }
714                                 //number of threads used
715                         } else if (!strcmp(argv[i], "-p")
716                                         || !strcmp(argv[i], "-num_threads")) {
717                                 if (i < argc - 1) {
718                                         if (!GetInteger(argv[++i], &tempInt)) {
719                                                 cerr << " ERROR: invalid integer following option "
720                                                                 << argv[i - 1] << ": " << argv[i] << endl;
721                                                 exit(1);
722                                         } else {
723                                                 if (tempInt < 0) {
724                                                         tempInt = 0;
725                                                 }
726                                                 numThreads = tempInt;
727                                         }
728                                 } else {
729                                         cerr << "ERROR: Integer expected for option " << argv[i]
730                                                         << endl;
731                                         exit(1);
732                                 }
733                                 // number of consistency transformations
734                         } else if (!strcmp(argv[i], "-c")
735                                         || !strcmp(argv[i], "--consistency")) {
736                                 if (i < argc - 1) {
737                                         if (!GetInteger(argv[++i], &tempInt)) {
738                                                 cerr << "ERROR: Invalid integer following option "
739                                                                 << argv[i - 1] << ": " << argv[i] << endl;
740                                                 exit(1);
741                                         } else {
742                                                 if (tempInt < MIN_CONSISTENCY_REPS
743                                                                 || tempInt > MAX_CONSISTENCY_REPS) {
744                                                         cerr << "ERROR: For option " << argv[i - 1]
745                                                                         << ", integer must be between "
746                                                                         << MIN_CONSISTENCY_REPS << " and "
747                                                                         << MAX_CONSISTENCY_REPS << "." << endl;
748                                                         exit(1);
749                                                 } else {
750                                                         numConsistencyReps = tempInt;
751                                                 }
752                                         }
753                                 } else {
754                                         cerr << "ERROR: Integer expected for option " << argv[i]
755                                                         << endl;
756                                         exit(1);
757                                 }
758                         }
759
760                         // number of randomized partitioning iterative refinement passes
761                         else if (!strcmp(argv[i], "-ir")
762                                         || !strcmp(argv[i], "--iterative-refinement")) {
763                                 if (i < argc - 1) {
764                                         if (!GetInteger(argv[++i], &tempInt)) {
765                                                 cerr << "ERROR: Invalid integer following option "
766                                                                 << argv[i - 1] << ": " << argv[i] << endl;
767                                                 exit(1);
768                                         } else {
769                                                 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
770                                                                 || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
771                                                         cerr << "ERROR: For option " << argv[i - 1]
772                                                                         << ", integer must be between "
773                                                                         << MIN_ITERATIVE_REFINEMENT_REPS << " and "
774                                                                         << MAX_ITERATIVE_REFINEMENT_REPS << "."
775                                                                         << endl;
776                                                         exit(1);
777                                                 } else
778                                                         numIterativeRefinementReps = tempInt;
779                                         }
780                                 } else {
781                                         cerr << "ERROR: Integer expected for option " << argv[i]
782                                                         << endl;
783                                         exit(1);
784                                 }
785                         }
786
787                         // annotation files
788                         else if (!strcmp(argv[i], "-annot")) {
789                                 enableAnnotation = true;
790                                 if (i < argc - 1) {
791                                         annotationFilename = argv[++i];
792                                 } else {
793                                         cerr << "ERROR: FILENAME expected for option " << argv[i]
794                                                         << endl;
795                                         exit(1);
796                                 }
797                         }
798
799                         // clustalw output format
800                         else if (!strcmp(argv[i], "-clustalw")) {
801                                 enableClustalWOutput = true;
802                         }
803
804                         // cutoff
805                         else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) {
806                                 if (i < argc - 1) {
807                                         if (!GetFloat(argv[++i], &tempFloat)) {
808                                                 cerr
809                                                                 << "ERROR: Invalid floating-point value following option "
810                                                                 << argv[i - 1] << ": " << argv[i] << endl;
811                                                 exit(1);
812                                         } else {
813                                                 if (tempFloat < 0 || tempFloat > 1) {
814                                                         cerr << "ERROR: For option " << argv[i - 1]
815                                                                         << ", floating-point value must be between 0 and 1."
816                                                                         << endl;
817                                                         exit(1);
818                                                 } else
819                                                         cutoff = tempFloat;
820                                         }
821                                 } else {
822                                         cerr << "ERROR: Floating-point value expected for option "
823                                                         << argv[i] << endl;
824                                         exit(1);
825                                 }
826                         }
827
828                         // verbose reporting
829                         else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
830                                 enableVerbose = true;
831                         }
832
833                         // alignment order
834                         else if (!strcmp(argv[i], "-a")
835                                         || !strcmp(argv[i], "--alignment-order")) {
836                                 enableAlignOrder = true;
837                         }
838
839                         //print out version
840                         else if (!strcmp(argv[i], "-version")) {
841                                 cerr << "MSAPROBS version " << VERSION << endl;
842                                 exit(1);
843                         }
844                         // bad arguments
845                         else {
846                                 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
847                                 exit(1);
848                         }
849                 } else {
850                         sequenceNames.push_back(string(argv[i]));
851                 }
852         }
853
854         /*check the output file name*/
855         cerr << "-------------------------------------" << endl;
856         if (alignOutFileName.length() == 0) {
857                 cerr << "The final alignments will be printed out to STDOUT" << endl;
858                 alignOutFile = &std::cout;
859         } else {
860                 cerr << "Open the output file " << alignOutFileName << endl;
861                 alignOutFile = new ofstream(alignOutFileName.c_str(),
862                                 ios::binary | ios::out | ios::trunc);
863         }
864         cerr << "-------------------------------------" << endl;
865         return sequenceNames;
866 }
867
868 /////////////////////////////////////////////////////////////////
869 // ProcessTree()
870 //
871 // Process the tree recursively.  Returns the aligned sequences
872 // corresponding to a node or leaf of the tree.
873 /////////////////////////////////////////////////////////////////
874 MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
875                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
876                 const ProbabilisticModel &model) {
877
878         MultiSequence *result;
879
880         // check if this is a node of the alignment tree
881         //if (tree->GetSequenceLabel() == -1){
882         if (tree->leaf == NODE) {
883                 MultiSequence *alignLeft = ProcessTree(tree->left, sequences,
884                                 sparseMatrices, model);
885                 MultiSequence *alignRight = ProcessTree(tree->right, sequences,
886                                 sparseMatrices, model);
887
888                 assert(alignLeft);
889                 assert(alignRight);
890
891                 result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model);
892                 assert(result);
893
894                 delete alignLeft;
895                 delete alignRight;
896         }
897
898         // otherwise, this is a leaf of the alignment tree
899         else {
900                 result = new MultiSequence();
901                 assert(result);
902                 //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
903                 result->AddSequence(sequences->GetSequence(tree->idx)->Clone());
904         }
905
906         return result;
907 }
908
909 /////////////////////////////////////////////////////////////////
910 // ComputeFinalAlignment()
911 //
912 // Compute the final alignment by calling ProcessTree(), then
913 // performing iterative refinement as needed.
914 /////////////////////////////////////////////////////////////////
915
916 MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
917                 MultiSequence *sequences,
918                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
919                 const ProbabilisticModel &model) {
920         MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
921                         sparseMatrices, model);
922
923         SafeVector<int> oldOrdering;
924         if (enableAlignOrder) {
925                 for (int i = 0; i < alignment->GetNumSequences(); i++)
926                         oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel());
927                 alignment->SaveOrdering();
928                 enableAlignOrder = false;
929         }
930
931         // tree-based refinement
932         // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
933         /*
934          int numSeqs = alignment->GetNumSequences();
935          //if(numSeqs < numIterativeRefinementReps){
936          for(int iter = 0; iter < 5; iter ++){
937                 for(int i = 0; i < numSeqs - 1; i++){
938                         DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i);
939                  }
940          }
941          //}*/
942         //Refinement return false:no improvement 
943         for (int i = 0; i < numIterativeRefinementReps; i++) {               
944                 DoIterativeRefinement(sparseMatrices, model, alignment);
945         }
946         cerr << endl;   
947
948         if (oldOrdering.size() > 0) {
949                 for (int i = 0; i < (int) oldOrdering.size(); i++) {
950                         alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
951                 }
952         }
953
954         // return final alignment
955         return alignment;
956 }
957
958 /////////////////////////////////////////////////////////////////
959 // AlignAlignments()
960 //
961 // Returns the alignment of two MultiSequence objects.
962 /////////////////////////////////////////////////////////////////
963
964 MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
965                 MultiSequence *align2,
966                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
967                 const ProbabilisticModel &model) {
968
969         // print some info about the alignment
970         if (enableVerbose) {
971                 for (int i = 0; i < align1->GetNumSequences(); i++)
972                         cerr << ((i == 0) ? "[" : ",")
973                                         << align1->GetSequence(i)->GetLabel();
974                 cerr << "] vs. ";
975                 for (int i = 0; i < align2->GetNumSequences(); i++)
976                         cerr << ((i == 0) ? "[" : ",")
977                                         << align2->GetSequence(i)->GetLabel();
978                 cerr << "]: ";
979         }
980 #if 1
981         VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
982 #else
983         VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
984                         sparseMatrices, cutoff);
985 #endif
986         // compute an "accuracy" measure for the alignment before refinement
987
988         pair<SafeVector<char> *, float> alignment;
989         //perform alignment
990         alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(),
991                         align2->GetSequence(0)->GetLength(), *posterior);
992
993         delete posterior;
994
995         if (enableVerbose) {
996
997                 // compute total length of sequences
998                 int totLength = 0;
999                 for (int i = 0; i < align1->GetNumSequences(); i++)
1000                         for (int j = 0; j < align2->GetNumSequences(); j++)
1001                                 totLength += min(align1->GetSequence(i)->GetLength(),
1002                                                 align2->GetSequence(j)->GetLength());
1003
1004                 // give an "accuracy" measure for the alignment
1005                 cerr << alignment.second / totLength << endl;
1006         }
1007
1008         // now build final alignment
1009         MultiSequence *result = new MultiSequence();
1010         for (int i = 0; i < align1->GetNumSequences(); i++)
1011                 result->AddSequence(
1012                                 align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1013         for (int i = 0; i < align2->GetNumSequences(); i++)
1014                 result->AddSequence(
1015                                 align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1016         if (!enableAlignOrder)
1017                 result->SortByLabel();
1018
1019         // free temporary alignment
1020         delete alignment.first;
1021
1022         return result;
1023 }
1024
1025 /////////////////////////////////////////////////////////////////
1026 // DoRelaxation()
1027 //
1028 // Performs one round of the weighted probabilistic consistency transformation.
1029 //                     1
1030 /////////////////////////////////////////////////////////////////
1031
1032 SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
1033                 MultiSequence *sequences,
1034                 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1035         const int numSeqs = sequences->GetNumSequences();
1036
1037         SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
1038                         SafeVector<SparseMatrix *>(numSeqs, NULL));
1039
1040         // for every pair of sequences
1041 #ifdef _OPENMP
1042         int pairIdx;
1043 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
1044         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
1045                 int i = seqsPairs[pairIdx].seq1;
1046                 int j = seqsPairs[pairIdx].seq2;
1047                 float wi = seqsWeights[i];
1048                 float wj = seqsWeights[j];
1049 #else
1050         for (int i = 0; i < numSeqs; i++) {
1051                 float wi = seqsWeights[i];
1052                 for (int j = i + 1; j < numSeqs; j++) {
1053                         float wj = seqsWeights[j];
1054 #endif
1055                         Sequence *seq1 = sequences->GetSequence(i);
1056                         Sequence *seq2 = sequences->GetSequence(j);
1057
1058                         if (enableVerbose) {
1059 #ifdef _OPENMP
1060 #pragma omp critical
1061 #endif
1062                                 cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
1063                                                 << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
1064                                                 << ": ";
1065                         }
1066                         // get the original posterior matrix
1067                         VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
1068                         assert(posteriorPtr);
1069                         VF &posterior = *posteriorPtr;
1070
1071                         const int seq1Length = seq1->GetLength();
1072                         const int seq2Length = seq2->GetLength();
1073
1074                         // contribution from the summation where z = x and z = y
1075                         float w = wi * wi * wj + wi * wj * wj;
1076                         float sumW = w;
1077                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1078                                 //posterior[k] = w*posterior[k];
1079                                 posterior[k] += posterior[k];
1080                         }
1081
1082                         if (enableVerbose)
1083                                 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1084
1085                         // contribution from all other sequences
1086                         for (int k = 0; k < numSeqs; k++) {
1087                                 if (k != i && k != j) {
1088                                         float wk = seqsWeights[k];
1089                                         float w = wi * wj * wk;
1090                                         sumW += w;
1091                                         if (k < i)
1092                                                 Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
1093                                                                 posterior);
1094                                         else if (k > i && k < j)
1095                                                 Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
1096                                                                 posterior);
1097                                         else {
1098                                                 SparseMatrix *temp =
1099                                                                 sparseMatrices[j][k]->ComputeTranspose();
1100                                                 Relax(w, sparseMatrices[i][k], temp, posterior);
1101                                                 delete temp;
1102                                         }
1103                                 }
1104                         }
1105                         //cerr<<"sumW "<<sumW<<endl;
1106                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1107                                 //posterior[k] /= sumW;
1108                                 posterior[k] /= numSeqs;
1109                         }
1110                         // mask out positions not originally in the posterior matrix
1111                         SparseMatrix *matXY = sparseMatrices[i][j];
1112                         for (int y = 0; y <= seq2Length; y++)
1113                                 posterior[y] = 0;
1114                         for (int x = 1; x <= seq1Length; x++) {
1115                                 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1116                                 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1117                                 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1118                                 int curr = 0;
1119                                 while (XYptr != XYend) {
1120
1121                                         // zero out all cells until the first filled column
1122                                         while (curr < XYptr->first) {
1123                                                 base[curr] = 0;
1124                                                 curr++;
1125                                         }
1126
1127                                         // now, skip over this column
1128                                         curr++;
1129                                         ++XYptr;
1130                                 }
1131
1132                                 // zero out cells after last column
1133                                 while (curr <= seq2Length) {
1134                                         base[curr] = 0;
1135                                         curr++;
1136                                 }
1137                         }
1138
1139                         // save the new posterior matrix
1140                         newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
1141                                         seq2->GetLength(), posterior);
1142                         newSparseMatrices[j][i] = NULL;
1143
1144                         if (enableVerbose)
1145                                 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1146
1147                         delete posteriorPtr;
1148
1149                         if (enableVerbose)
1150                                 cerr << "done." << endl;
1151 #ifndef _OPENMP
1152                 }
1153 #endif
1154         }
1155
1156         return newSparseMatrices;
1157 }
1158
1159 /////////////////////////////////////////////////////////////////
1160 // Relax()
1161 //
1162 // Computes the consistency transformation for a single sequence
1163 // z, and adds the transformed matrix to "posterior".
1164 /////////////////////////////////////////////////////////////////
1165
1166 void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
1167                 VF &posterior) {
1168
1169         assert(matXZ);
1170         assert(matZY);
1171
1172         int lengthX = matXZ->GetSeq1Length();
1173         int lengthY = matZY->GetSeq2Length();
1174         assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1175
1176         // for every x[i]
1177         for (int i = 1; i <= lengthX; i++) {
1178                 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1179                 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1180
1181                 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1182
1183                 // iterate through all x[i]-z[k]
1184                 while (XZptr != XZend) {
1185                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1186                         SafeVector<PIF>::iterator ZYend = ZYptr
1187                                         + matZY->GetRowSize(XZptr->first);
1188                         const float XZval = XZptr->second;
1189
1190                         // iterate through all z[k]-y[j]
1191                         while (ZYptr != ZYend) {
1192                                 //base[ZYptr->first] += weight * XZval * ZYptr->second;
1193                                 base[ZYptr->first] += XZval * ZYptr->second;
1194                                 ZYptr++;
1195                         }
1196                         XZptr++;
1197                 }
1198         }
1199 }
1200
1201 /////////////////////////////////////////////////////////////////
1202 // Relax1()
1203 //
1204 // Computes the consistency transformation for a single sequence
1205 // z, and adds the transformed matrix to "posterior".
1206 /////////////////////////////////////////////////////////////////
1207
1208 void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
1209                 VF &posterior) {
1210
1211         assert(matZX);
1212         assert(matZY);
1213
1214         int lengthZ = matZX->GetSeq1Length();
1215         int lengthY = matZY->GetSeq2Length();
1216
1217         // for every z[k]
1218         for (int k = 1; k <= lengthZ; k++) {
1219                 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1220                 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1221
1222                 // iterate through all z[k]-x[i]
1223                 while (ZXptr != ZXend) {
1224                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1225                         SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1226                         const float ZXval = ZXptr->second;
1227                         VF::iterator base = posterior.begin()
1228                                         + ZXptr->first * (lengthY + 1);
1229
1230                         // iterate through all z[k]-y[j]
1231                         while (ZYptr != ZYend) {
1232                                 //base[ZYptr->first] += weight * ZXval * ZYptr->second;
1233                                 base[ZYptr->first] += ZXval * ZYptr->second;
1234                                 ZYptr++;
1235                         }
1236                         ZXptr++;
1237                 }
1238         }
1239 }
1240 /////////////////////////////////////////////////////////////////
1241 // DoIterativeRefinement()
1242 //
1243 // Performs a single round of randomized partionining iterative
1244 // refinement.
1245 /////////////////////////////////////////////////////////////////
1246
1247 void MSA::DoIterativeRefinement(
1248                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1249                 const ProbabilisticModel &model, MultiSequence* &alignment) {
1250         set<int> groupOne, groupTwo;
1251         int numSeqs = alignment->GetNumSequences();
1252
1253         // create two separate groups
1254         for (int i = 0; i < numSeqs; i++) {
1255                 int index = rand();
1256                 if (index % 2) {
1257                         groupOne.insert(i);
1258                 } else {
1259                         groupTwo.insert(i);
1260                 }
1261         }
1262         if (groupOne.empty() || groupTwo.empty()) return;
1263
1264         // project into the two groups
1265         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1266         assert(groupOneSeqs);
1267         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1268         assert(groupTwoSeqs);   
1269 /*
1270 //start add by Yongtao
1271 #if 0
1272         VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
1273 #else
1274         VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs,
1275                         sparseMatrices, cutoff);
1276 #endif
1277
1278         // compute an "accuracy" measure for the alignment before refinement        
1279         SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
1280         SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
1281         int i=0; 
1282         for (set<int>::const_iterator iter = groupOne.begin();
1283                         iter != groupOne.end(); ++iter) {
1284                 oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1285         }
1286         i=0;
1287         for (set<int>::const_iterator iter = groupTwo.begin();
1288                         iter != groupTwo.end(); ++iter) {
1289                 oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1290         }
1291
1292         VF &posteriorArr = *posterior;
1293         int oldLength = alignment->GetSequence(0)->GetLength();
1294         int groupOneindex=0; int groupTwoindex=0;
1295         float accuracy_before = 0; 
1296         for (int i = 1; i <= oldLength; i++) {
1297                 // check to see if there is a gap in every sequence of the set
1298                 bool foundOne = false;
1299                 for (int j = 0; !foundOne && j < (int) groupOne.size(); j++)
1300                         foundOne = (oldOnePtrs[j][i] != '-');
1301                 // if not, then this column counts towards the sequence length
1302                 if (foundOne) groupOneindex ++;
1303                 bool foundTwo = false;
1304                 for (int j = 0; !foundTwo && j < (int) groupTwo.size(); j++)
1305                         foundTwo = (oldTwoPtrs[j][i] != '-');
1306                 // if not, then this column counts towards the sequence length
1307                 if (foundTwo) groupTwoindex ++;
1308                 if(foundOne && foundTwo) accuracy_before += 
1309                                 posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex];
1310         }
1311        
1312         pair<SafeVector<char> *, float> refinealignment;
1313         //perform alignment
1314         refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(),
1315                         groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
1316         delete posterior;
1317         // now build final alignment
1318         MultiSequence *result = new MultiSequence();
1319         //compare accuracy measure before and after refinement
1320         //if (refinealignment.second > accuracy_before) {
1321                 //cerr<<"Before:" << accuracy_before<<" after: "<< refinealignment.second<< endl; 
1322                 for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++)
1323                         result->AddSequence(
1324                                 groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X'));
1325                 for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++)
1326                         result->AddSequence(
1327                                 groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y'));
1328                 // free temporary alignment
1329                 delete refinealignment.first;
1330                 delete alignment;
1331                 alignment = result;
1332
1333         }
1334         else{
1335                 if(numIterativeRefinementReps < 8*numSeqs) numIterativeRefinementReps++;
1336                 delete groupOneSeqs;
1337                 delete groupTwoSeqs;
1338                 return false;
1339         }
1340   */    
1341 //end add by yongtao
1342
1343         delete alignment;
1344         // realign
1345         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, model); //original
1346         delete groupOneSeqs;
1347         delete groupTwoSeqs;            
1348                 
1349 }
1350
1351 void MSA::DoIterativeRefinementTreeNode(
1352                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1353                 const ProbabilisticModel &model, MultiSequence* &alignment,
1354                 int nodeIndex) {
1355         set<int> groupOne, groupTwo;
1356         int numSeqs = alignment->GetNumSequences();
1357
1358         vector<bool> inGroup1;
1359         inGroup1.resize(numSeqs);
1360         for (int i = 0; i < numSeqs; i++) {
1361                 inGroup1[i] = false;
1362         }
1363
1364         AlignmentOrder* orders = this->tree->getAlignOrders();
1365         AlignmentOrder* order = &orders[nodeIndex];
1366         for (int i = 0; i < order->leftNum; i++) {
1367                 int si = order->leftLeafs[i];
1368                 inGroup1[si] = true;
1369         }
1370         for (int i = 0; i < order->rightNum; i++) {
1371                 int si = order->rightLeafs[i];
1372                 inGroup1[si] = true;
1373         }
1374         // create two separate groups
1375         for (int i = 0; i < numSeqs; i++) {
1376                 if (inGroup1[i]) {
1377                         groupOne.insert(i);
1378                 } else {
1379                         groupTwo.insert(i);
1380                 }
1381         }
1382         if (groupOne.empty() || groupTwo.empty())
1383                 return;
1384
1385         // project into the two groups
1386         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1387         assert(groupOneSeqs);
1388         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1389         assert(groupTwoSeqs);
1390         delete alignment;
1391
1392         // realign
1393         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
1394                         model);
1395
1396         delete groupOneSeqs;
1397         delete groupTwoSeqs;
1398 }
1399
1400 /////////////////////////////////////////////////////////////////
1401 // WriteAnnotation()
1402 //
1403 // Computes annotation for multiple alignment and write values
1404 // to a file.
1405 /////////////////////////////////////////////////////////////////
1406
1407 void MSA::WriteAnnotation(MultiSequence *alignment,
1408                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1409         ofstream outfile(annotationFilename.c_str());
1410
1411         if (outfile.fail()) {
1412                 cerr << "ERROR: Unable to write annotation file." << endl;
1413                 exit(1);
1414         }
1415
1416         const int alignLength = alignment->GetSequence(0)->GetLength();
1417         const int numSeqs = alignment->GetNumSequences();
1418
1419         SafeVector<int> position(numSeqs, 0);
1420         SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
1421         for (int i = 0; i < numSeqs; i++)
1422                 seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1423         SafeVector<pair<int, int> > active;
1424         active.reserve(numSeqs);
1425
1426         SafeVector<int> lab;
1427         for (int i = 0; i < numSeqs; i++)
1428                 lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1429
1430         // for every column
1431         for (int i = 1; i <= alignLength; i++) {
1432
1433                 // find all aligned residues in this particular column
1434                 active.clear();
1435                 for (int j = 0; j < numSeqs; j++) {
1436                         if (seqs[j][i] != '-') {
1437                                 active.push_back(make_pair(lab[j], ++position[j]));
1438                         }
1439                 }
1440
1441                 sort(active.begin(), active.end());
1442                 outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
1443         }
1444
1445         outfile.close();
1446 }
1447
1448 /////////////////////////////////////////////////////////////////
1449 // ComputeScore()
1450 //
1451 // Computes the annotation score for a particular column.
1452 /////////////////////////////////////////////////////////////////
1453
1454 int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
1455                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1456
1457         if (active.size() <= 1)
1458                 return 0;
1459
1460         // ALTERNATIVE #1: Compute the average alignment score.
1461
1462         float val = 0;
1463         for (int i = 0; i < (int) active.size(); i++) {
1464                 for (int j = i + 1; j < (int) active.size(); j++) {
1465                         val += sparseMatrices[active[i].first][active[j].first]->GetValue(
1466                                         active[i].second, active[j].second);
1467                 }
1468         }
1469
1470         return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1471
1472 }