Add GLprobs and MSAprobs to binaries
[jabaws.git] / binaries / src / MSAProbs-0.9.7 / MSAProbs / ProbabilisticModel.h
1 /////////////////////////////////////////////////////////////////
2 // ProbabilisticModel.h
3 //
4 // Routines for (1) posterior probability computations
5 //              (2) chained anchoring
6 //              (3) maximum weight trace alignment
7 /////////////////////////////////////////////////////////////////
8
9 #ifndef PROBABILISTICMODEL_H
10 #define PROBABILISTICMODEL_H
11
12 #include <list>
13 #include <cmath>
14 #include <cstdio>
15 #include "SafeVector.h"
16 #include "ScoreType.h"
17 #include "SparseMatrix.h"
18 #include "MultiSequence.h"
19
20 using namespace std;
21
22 const int NumMatchStates = 1;            // note that in this version the number
23 // of match states is fixed at 1...will
24 // change in future versions
25 const int NumInsertStates = 2;
26 const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2;
27
28 /////////////////////////////////////////////////////////////////
29 // ProbabilisticModel
30 //
31 // Class for storing the parameters of a probabilistic model and
32 // performing different computations based on those parameters.
33 // In particular, this class handles the computation of
34 // posterior probabilities that may be used in alignment.
35 /////////////////////////////////////////////////////////////////
36
37 class ProbabilisticModel {
38
39         float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state
40         float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities
41         float matchProb[256][256];        // emission probabilities for match states
42         float insProb[256][NumMatrixTypes]; // emission probabilities for insert states
43
44 public:
45
46         /////////////////////////////////////////////////////////////////
47         // ProbabilisticModel::ProbabilisticModel()
48         //
49         // Constructor.  Builds a new probabilistic model using the
50         // given parameters.
51         /////////////////////////////////////////////////////////////////
52
53         ProbabilisticModel(const VF &initDistribMat, const VF &gapOpen,
54                         const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle) {
55
56                 // build transition matrix
57                 VVF transMat(NumMatrixTypes, VF(NumMatrixTypes, 0.0f));
58                 transMat[0][0] = 1;
59                 for (int i = 0; i < NumInsertStates; i++) {
60                         transMat[0][2 * i + 1] = gapOpen[2 * i];
61                         transMat[0][2 * i + 2] = gapOpen[2 * i + 1];
62                         transMat[0][0] -= (gapOpen[2 * i] + gapOpen[2 * i + 1]);
63                         assert(transMat[0][0] > 0);
64                         transMat[2 * i + 1][2 * i + 1] = gapExtend[2 * i];
65                         transMat[2 * i + 2][2 * i + 2] = gapExtend[2 * i + 1];
66                         transMat[2 * i + 1][2 * i + 2] = 0;
67                         transMat[2 * i + 2][2 * i + 1] = 0;
68                         transMat[2 * i + 1][0] = 1 - gapExtend[2 * i];
69                         transMat[2 * i + 2][0] = 1 - gapExtend[2 * i + 1];
70                 }
71
72                 // create initial and transition probability matrices
73                 for (int i = 0; i < NumMatrixTypes; i++) {
74                         initialDistribution[i] = LOG(initDistribMat[i]);
75                         for (int j = 0; j < NumMatrixTypes; j++)
76                                 transProb[i][j] = LOG(transMat[i][j]);
77                 }
78
79                 // create insertion and match probability matrices
80                 for (int i = 0; i < 256; i++) {
81                         for (int j = 0; j < NumMatrixTypes; j++)
82                                 insProb[i][j] = LOG(emitSingle[i]);
83                         for (int j = 0; j < 256; j++)
84                                 matchProb[i][j] = LOG(emitPairs[i][j]);
85                 }
86         }
87
88         /////////////////////////////////////////////////////////////////
89         // ProbabilisticModel::ComputeForwardMatrix()
90         //
91         // Computes a set of forward probability matrices for aligning
92         // seq1 and seq2.
93         //
94         // For efficiency reasons, a single-dimensional floating-point
95         // array is used here, with the following indexing scheme:
96         //
97         //    forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
98         //    refers to the probability of aligning through j characters
99         //    of the first sequence, k characters of the second sequence,
100         //    and ending in state i.
101         /////////////////////////////////////////////////////////////////
102
103         VF *ComputeForwardMatrix(Sequence *seq1, Sequence *seq2) const {
104
105                 assert(seq1);
106                 assert(seq2);
107
108                 const int seq1Length = seq1->GetLength();
109                 const int seq2Length = seq2->GetLength();
110
111                 // retrieve the points to the beginning of each sequence
112                 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
113                 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
114
115                 // create matrix
116                 VF *forwardPtr = new VF(
117                                 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
118                 assert(forwardPtr);
119                 VF &forward = *forwardPtr;
120
121                 // initialization condition
122                 forward[0 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)] =
123                                 initialDistribution[0]
124                                                 + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
125
126                 for (int k = 0; k < NumInsertStates; k++) {
127                         forward[2 * k + 1 + NumMatrixTypes * (1 * (seq2Length + 1) + 0)] =
128                                         initialDistribution[2 * k + 1]
129                                                         + insProb[(unsigned char) iter1[1]][k];
130                         forward[2 * k + 2 + NumMatrixTypes * (0 * (seq2Length + 1) + 1)] =
131                                         initialDistribution[2 * k + 2]
132                                                         + insProb[(unsigned char) iter2[1]][k];
133                 }
134
135                 // remember offset for each index combination
136                 int ij = 0;
137                 int i1j = -seq2Length - 1;
138                 int ij1 = -1;
139                 int i1j1 = -seq2Length - 2;
140
141                 ij *= NumMatrixTypes;
142                 i1j *= NumMatrixTypes;
143                 ij1 *= NumMatrixTypes;
144                 i1j1 *= NumMatrixTypes;
145
146                 // compute forward scores
147                 for (int i = 0; i <= seq1Length; i++) {
148                         unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
149                         for (int j = 0; j <= seq2Length; j++) {
150                                 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
151
152                                 if (i > 1 || j > 1) {
153                                         if (i > 0 && j > 0) {
154                                                 forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
155                                                 for (int k = 1; k < NumMatrixTypes; k++)
156                                                         LOG_PLUS_EQUALS(forward[0 + ij],
157                                                                         forward[k + i1j1] + transProb[k][0]);
158                                                 forward[0 + ij] += matchProb[c1][c2];
159                                         }
160                                         if (i > 0) {
161                                                 for (int k = 0; k < NumInsertStates; k++)
162                                                         forward[2 * k + 1 + ij] = insProb[c1][k]
163                                                                         + LOG_ADD(
164                                                                                         forward[0 + i1j]
165                                                                                                         + transProb[0][2 * k + 1],
166                                                                                         forward[2 * k + 1 + i1j]
167                                                                                                         + transProb[2 * k + 1][2 * k
168                                                                                                                         + 1]);
169                                         }
170                                         if (j > 0) {
171                                                 for (int k = 0; k < NumInsertStates; k++)
172                                                         forward[2 * k + 2 + ij] = insProb[c2][k]
173                                                                         + LOG_ADD(
174                                                                                         forward[0 + ij1]
175                                                                                                         + transProb[0][2 * k + 2],
176                                                                                         forward[2 * k + 2 + ij1]
177                                                                                                         + transProb[2 * k + 2][2 * k
178                                                                                                                         + 2]);
179                                         }
180                                 }
181
182                                 ij += NumMatrixTypes;
183                                 i1j += NumMatrixTypes;
184                                 ij1 += NumMatrixTypes;
185                                 i1j1 += NumMatrixTypes;
186                         }
187                 }
188
189                 return forwardPtr;
190         }
191
192         /////////////////////////////////////////////////////////////////
193         // ProbabilisticModel::ComputeBackwardMatrix()
194         //
195         // Computes a set of backward probability matrices for aligning
196         // seq1 and seq2.
197         //
198         // For efficiency reasons, a single-dimensional floating-point
199         // array is used here, with the following indexing scheme:
200         //
201         //    backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
202         //    refers to the probability of starting in state i and
203         //    aligning from character j+1 to the end of the first
204         //    sequence and from character k+1 to the end of the second
205         //    sequence.
206         /////////////////////////////////////////////////////////////////
207
208         VF *ComputeBackwardMatrix(Sequence *seq1, Sequence *seq2) const {
209
210                 assert(seq1);
211                 assert(seq2);
212
213                 const int seq1Length = seq1->GetLength();
214                 const int seq2Length = seq2->GetLength();
215                 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
216                 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
217
218                 // create matrix
219                 VF *backwardPtr = new VF(
220                                 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
221                 assert(backwardPtr);
222                 VF &backward = *backwardPtr;
223
224                 // initialization condition
225                 for (int k = 0; k < NumMatrixTypes; k++)
226                         backward[NumMatrixTypes * ((seq1Length + 1) * (seq2Length + 1) - 1)
227                                         + k] = initialDistribution[k];
228
229                 // remember offset for each index combination
230                 int ij = (seq1Length + 1) * (seq2Length + 1) - 1;
231                 int i1j = ij + seq2Length + 1;
232                 int ij1 = ij + 1;
233                 int i1j1 = ij + seq2Length + 2;
234
235                 ij *= NumMatrixTypes;
236                 i1j *= NumMatrixTypes;
237                 ij1 *= NumMatrixTypes;
238                 i1j1 *= NumMatrixTypes;
239
240                 // compute backward scores
241                 for (int i = seq1Length; i >= 0; i--) {
242                         unsigned char c1 =
243                                         (i == seq1Length) ? '~' : (unsigned char) iter1[i + 1];
244                         for (int j = seq2Length; j >= 0; j--) {
245                                 unsigned char c2 =
246                                                 (j == seq2Length) ? '~' : (unsigned char) iter2[j + 1];
247
248                                 if (i < seq1Length && j < seq2Length) {
249                                         const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
250                                         for (int k = 0; k < NumMatrixTypes; k++)
251                                                 LOG_PLUS_EQUALS(backward[k + ij],
252                                                                 ProbXY + transProb[k][0]);
253                                 }
254                                 if (i < seq1Length) {
255                                         for (int k = 0; k < NumInsertStates; k++) {
256                                                 LOG_PLUS_EQUALS(backward[0 + ij],
257                                                                 backward[2 * k + 1 + i1j] + insProb[c1][k]
258                                                                                 + transProb[0][2 * k + 1]);
259                                                 LOG_PLUS_EQUALS(backward[2 * k + 1 + ij],
260                                                                 backward[2 * k + 1 + i1j] + insProb[c1][k]
261                                                                                 + transProb[2 * k + 1][2 * k + 1]);
262                                         }
263                                 }
264                                 if (j < seq2Length) {
265                                         for (int k = 0; k < NumInsertStates; k++) {
266                                                 LOG_PLUS_EQUALS(backward[0 + ij],
267                                                                 backward[2 * k + 2 + ij1] + insProb[c2][k]
268                                                                                 + transProb[0][2 * k + 2]);
269                                                 LOG_PLUS_EQUALS(backward[2 * k + 2 + ij],
270                                                                 backward[2 * k + 2 + ij1] + insProb[c2][k]
271                                                                                 + transProb[2 * k + 2][2 * k + 2]);
272                                         }
273                                 }
274
275                                 ij -= NumMatrixTypes;
276                                 i1j -= NumMatrixTypes;
277                                 ij1 -= NumMatrixTypes;
278                                 i1j1 -= NumMatrixTypes;
279                         }
280                 }
281
282                 return backwardPtr;
283         }
284
285         /////////////////////////////////////////////////////////////////
286         // ProbabilisticModel::ComputeTotalProbability()
287         //
288         // Computes the total probability of an alignment given
289         // the forward and backward matrices.
290         /////////////////////////////////////////////////////////////////
291
292         float ComputeTotalProbability(int seq1Length, int seq2Length,
293                         const VF &forward, const VF &backward) const {
294
295                 // compute total probability
296                 float totalForwardProb = LOG_ZERO;
297                 float totalBackwardProb = LOG_ZERO;
298                 for (int k = 0; k < NumMatrixTypes; k++) {
299                         LOG_PLUS_EQUALS(totalForwardProb,
300                                         forward[k
301                                                         + NumMatrixTypes
302                                                                         * ((seq1Length + 1) * (seq2Length + 1) - 1)]
303                                                         + backward[k
304                                                                         + NumMatrixTypes
305                                                                                         * ((seq1Length + 1)
306                                                                                                         * (seq2Length + 1) - 1)]);
307                 }
308
309                 totalBackwardProb = forward[0
310                                 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)]
311                                 + backward[0 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)];
312
313                 for (int k = 0; k < NumInsertStates; k++) {
314                         LOG_PLUS_EQUALS(totalBackwardProb,
315                                         forward[2 * k + 1
316                                                         + NumMatrixTypes * (1 * (seq2Length + 1) + 0)]
317                                                         + backward[2 * k + 1
318                                                                         + NumMatrixTypes
319                                                                                         * (1 * (seq2Length + 1) + 0)]);
320                         LOG_PLUS_EQUALS(totalBackwardProb,
321                                         forward[2 * k + 2
322                                                         + NumMatrixTypes * (0 * (seq2Length + 1) + 1)]
323                                                         + backward[2 * k + 2
324                                                                         + NumMatrixTypes
325                                                                                         * (0 * (seq2Length + 1) + 1)]);
326                 }
327
328                 //    cerr << totalForwardProb << " " << totalBackwardProb << endl;
329
330                 return (totalForwardProb + totalBackwardProb) / 2;
331         }
332
333         /////////////////////////////////////////////////////////////////
334         // ProbabilisticModel::ComputePosteriorMatrix()
335         //
336         // Computes the posterior probability matrix based on
337         // the forward and backward matrices.
338         /////////////////////////////////////////////////////////////////
339
340         VF *ComputePosteriorMatrix(Sequence *seq1, Sequence *seq2,
341                         const VF &forward, const VF &backward) const {
342
343                 assert(seq1);
344                 assert(seq2);
345
346                 const int seq1Length = seq1->GetLength();
347                 const int seq2Length = seq2->GetLength();
348
349                 float totalProb = ComputeTotalProbability(seq1Length, seq2Length,
350                                 forward, backward);
351
352                 // compute posterior matrices
353                 VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1));
354                 assert(posteriorPtr);
355                 VF &posterior = *posteriorPtr;
356
357                 int ij = 0;
358                 if (totalProb == 0) {
359                         totalProb = 1.0f;
360                 }
361                 VF::iterator ptr = posterior.begin();
362
363                 for (int i = 0; i <= seq1Length; i++) {
364                         for (int j = 0; j <= seq2Length; j++) {
365                                 *(ptr++) = EXP(
366                                                 min(LOG_ONE, forward[ij] + backward[ij] - totalProb));
367                                 ij += NumMatrixTypes;
368                         }
369                 }
370
371                 posterior[0] = 0;
372
373                 return posteriorPtr;
374         }
375
376         /*
377          /////////////////////////////////////////////////////////////////
378          // ProbabilisticModel::ComputeExpectedCounts()
379          //
380          // Computes the expected counts for the various transitions.
381          /////////////////////////////////////////////////////////////////
382
383          VVF *ComputeExpectedCounts () const {
384
385          assert (seq1);
386          assert (seq2);
387
388          const int seq1Length = seq1->GetLength();
389          const int seq2Length = seq2->GetLength();
390          SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
391          SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
392
393          // compute total probability
394          float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
395          forward, backward);
396
397          // initialize expected counts
398          VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr);
399          VVF &counts = *countsPtr;
400
401          // remember offset for each index combination
402          int ij = 0;
403          int i1j = -seq2Length - 1;
404          int ij1 = -1;
405          int i1j1 = -seq2Length - 2;
406
407          ij *= NumMatrixTypes;
408          i1j *= NumMatrixTypes;
409          ij1 *= NumMatrixTypes;
410          i1j1 *= NumMatrixTypes;
411
412          // compute expected counts
413          for (int i = 0; i <= seq1Length; i++){
414          unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
415          for (int j = 0; j <= seq2Length; j++){
416          unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
417
418          if (i > 0 && j > 0){
419          for (int k = 0; k < NumMatrixTypes; k++)
420          LOG_PLUS_EQUALS (counts[k][0],
421          forward[k + i1j1] + transProb[k][0] +
422          matchProb[c1][c2] + backward[0 + ij]);
423          }
424          if (i > 0){
425          for (int k = 0; k < NumInsertStates; k++){
426          LOG_PLUS_EQUALS (counts[0][2*k+1],
427          forward[0 + i1j] + transProb[0][2*k+1] +
428          insProb[c1][k] + backward[2*k+1 + ij]);
429          LOG_PLUS_EQUALS (counts[2*k+1][2*k+1],
430          forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
431          insProb[c1][k] + backward[2*k+1 + ij]);
432          }
433          }
434          if (j > 0){
435          for (int k = 0; k < NumInsertStates; k++){
436          LOG_PLUS_EQUALS (counts[0][2*k+2],
437          forward[0 + ij1] + transProb[0][2*k+2] +
438          insProb[c2][k] + backward[2*k+2 + ij]);
439          LOG_PLUS_EQUALS (counts[2*k+2][2*k+2],
440          forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
441          insProb[c2][k] + backward[2*k+2 + ij]);
442          }
443          }
444
445          ij += NumMatrixTypes;
446          i1j += NumMatrixTypes;
447          ij1 += NumMatrixTypes;
448          i1j1 += NumMatrixTypes;
449          }
450          }
451
452          // scale all expected counts appropriately
453          for (int i = 0; i < NumMatrixTypes; i++)
454          for (int j = 0; j < NumMatrixTypes; j++)
455          counts[i][j] -= totalProb;
456
457          }
458          */
459
460         /////////////////////////////////////////////////////////////////
461         // ProbabilisticModel::ComputeNewParameters()
462         //
463         // Computes a new parameter set based on the expected counts
464         // given.
465         /////////////////////////////////////////////////////////////////
466         void ComputeNewParameters(Sequence *seq1, Sequence *seq2, const VF &forward,
467                         const VF &backward, VF &initDistribMat, VF &gapOpen, VF &gapExtend,
468                         VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
469
470                 assert(seq1);
471                 assert(seq2);
472
473                 const int seq1Length = seq1->GetLength();
474                 const int seq2Length = seq2->GetLength();
475                 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
476                 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
477
478                 // compute total probability
479                 float totalProb = ComputeTotalProbability(seq1Length, seq2Length,
480                                 forward, backward);
481
482                 // initialize expected counts
483                 VVF transCounts(NumMatrixTypes, VF(NumMatrixTypes, LOG_ZERO));
484                 VF initCounts(NumMatrixTypes, LOG_ZERO);
485                 VVF pairCounts(256, VF(256, LOG_ZERO));
486                 VF singleCounts(256, LOG_ZERO);
487
488                 // remember offset for each index combination
489                 int ij = 0;
490                 int i1j = -seq2Length - 1;
491                 int ij1 = -1;
492                 int i1j1 = -seq2Length - 2;
493
494                 ij *= NumMatrixTypes;
495                 i1j *= NumMatrixTypes;
496                 ij1 *= NumMatrixTypes;
497                 i1j1 *= NumMatrixTypes;
498
499                 // compute initial distribution posteriors
500                 initCounts[0] = LOG_ADD(
501                                 forward[0 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)]
502                                                 + backward[0
503                                                                 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)],
504                                 forward[0
505                                                 + NumMatrixTypes
506                                                                 * ((seq1Length + 1) * (seq2Length + 1) - 1)]
507                                                 + backward[0
508                                                                 + NumMatrixTypes
509                                                                                 * ((seq1Length + 1) * (seq2Length + 1)
510                                                                                                 - 1)]);
511                 for (int k = 0; k < NumInsertStates; k++) {
512                         initCounts[2 * k + 1] = LOG_ADD(
513                                         forward[2 * k + 1
514                                                         + NumMatrixTypes * (1 * (seq2Length + 1) + 0)]
515                                                         + backward[2 * k + 1
516                                                                         + NumMatrixTypes
517                                                                                         * (1 * (seq2Length + 1) + 0)],
518                                         forward[2 * k + 1
519                                                         + NumMatrixTypes
520                                                                         * ((seq1Length + 1) * (seq2Length + 1) - 1)]
521                                                         + backward[2 * k + 1
522                                                                         + NumMatrixTypes
523                                                                                         * ((seq1Length + 1)
524                                                                                                         * (seq2Length + 1) - 1)]);
525                         initCounts[2 * k + 2] = LOG_ADD(
526                                         forward[2 * k + 2
527                                                         + NumMatrixTypes * (0 * (seq2Length + 1) + 1)]
528                                                         + backward[2 * k + 2
529                                                                         + NumMatrixTypes
530                                                                                         * (0 * (seq2Length + 1) + 1)],
531                                         forward[2 * k + 2
532                                                         + NumMatrixTypes
533                                                                         * ((seq1Length + 1) * (seq2Length + 1) - 1)]
534                                                         + backward[2 * k + 2
535                                                                         + NumMatrixTypes
536                                                                                         * ((seq1Length + 1)
537                                                                                                         * (seq2Length + 1) - 1)]);
538                 }
539
540                 // compute expected counts
541                 for (int i = 0; i <= seq1Length; i++) {
542                         unsigned char c1 =
543                                         (i == 0) ? '~' : (unsigned char) toupper(iter1[i]);
544                         for (int j = 0; j <= seq2Length; j++) {
545                                 unsigned char c2 =
546                                                 (j == 0) ? '~' : (unsigned char) toupper(iter2[j]);
547
548                                 if (i > 0 && j > 0) {
549                                         if (enableTrainEmissions && i == 1 && j == 1) {
550                                                 LOG_PLUS_EQUALS(pairCounts[c1][c2],
551                                                                 initialDistribution[0] + matchProb[c1][c2]
552                                                                                 + backward[0 + ij]);
553                                                 LOG_PLUS_EQUALS(pairCounts[c2][c1],
554                                                                 initialDistribution[0] + matchProb[c2][c1]
555                                                                                 + backward[0 + ij]);
556                                         }
557
558                                         for (int k = 0; k < NumMatrixTypes; k++) {
559                                                 LOG_PLUS_EQUALS(transCounts[k][0],
560                                                                 forward[k + i1j1] + transProb[k][0]
561                                                                                 + matchProb[c1][c2] + backward[0 + ij]);
562                                                 if (enableTrainEmissions && (i != 1 || j != 1)) {//adding parentheses by Liu Yongchao, 5 Mar, 2010
563                                                         LOG_PLUS_EQUALS(pairCounts[c1][c2],
564                                                                         forward[k + i1j1] + transProb[k][0]
565                                                                                         + matchProb[c1][c2]
566                                                                                         + backward[0 + ij]);
567                                                         LOG_PLUS_EQUALS(pairCounts[c2][c1],
568                                                                         forward[k + i1j1] + transProb[k][0]
569                                                                                         + matchProb[c2][c1]
570                                                                                         + backward[0 + ij]);
571                                                 }
572                                         }
573                                 }
574                                 if (i > 0) {
575                                         for (int k = 0; k < NumInsertStates; k++) {
576                                                 LOG_PLUS_EQUALS(transCounts[0][2 * k + 1],
577                                                                 forward[0 + i1j] + transProb[0][2 * k + 1]
578                                                                                 + insProb[c1][k]
579                                                                                 + backward[2 * k + 1 + ij]);
580                                                 LOG_PLUS_EQUALS(transCounts[2 * k + 1][2 * k + 1],
581                                                                 forward[2 * k + 1 + i1j]
582                                                                                 + transProb[2 * k + 1][2 * k + 1]
583                                                                                 + insProb[c1][k]
584                                                                                 + backward[2 * k + 1 + ij]);
585                                                 if (enableTrainEmissions) {
586                                                         if (i == 1 && j == 0) {
587                                                                 LOG_PLUS_EQUALS(singleCounts[c1],
588                                                                                 initialDistribution[2 * k + 1]
589                                                                                                 + insProb[c1][k]
590                                                                                                 + backward[2 * k + 1 + ij]);
591                                                         } else {
592                                                                 LOG_PLUS_EQUALS(singleCounts[c1],
593                                                                                 forward[0 + i1j]
594                                                                                                 + transProb[0][2 * k + 1]
595                                                                                                 + insProb[c1][k]
596                                                                                                 + backward[2 * k + 1 + ij]);
597                                                                 LOG_PLUS_EQUALS(singleCounts[c1],
598                                                                                 forward[2 * k + 1 + i1j]
599                                                                                                 + transProb[2 * k + 1][2 * k + 1]
600                                                                                                 + insProb[c1][k]
601                                                                                                 + backward[2 * k + 1 + ij]);
602                                                         }
603                                                 }
604                                         }
605                                 }
606                                 if (j > 0) {
607                                         for (int k = 0; k < NumInsertStates; k++) {
608                                                 LOG_PLUS_EQUALS(transCounts[0][2 * k + 2],
609                                                                 forward[0 + ij1] + transProb[0][2 * k + 2]
610                                                                                 + insProb[c2][k]
611                                                                                 + backward[2 * k + 2 + ij]);
612                                                 LOG_PLUS_EQUALS(transCounts[2 * k + 2][2 * k + 2],
613                                                                 forward[2 * k + 2 + ij1]
614                                                                                 + transProb[2 * k + 2][2 * k + 2]
615                                                                                 + insProb[c2][k]
616                                                                                 + backward[2 * k + 2 + ij]);
617                                                 if (enableTrainEmissions) {
618                                                         if (i == 0 && j == 1) {
619                                                                 LOG_PLUS_EQUALS(singleCounts[c2],
620                                                                                 initialDistribution[2 * k + 2]
621                                                                                                 + insProb[c2][k]
622                                                                                                 + backward[2 * k + 2 + ij]);
623                                                         } else {
624                                                                 LOG_PLUS_EQUALS(singleCounts[c2],
625                                                                                 forward[0 + ij1]
626                                                                                                 + transProb[0][2 * k + 2]
627                                                                                                 + insProb[c2][k]
628                                                                                                 + backward[2 * k + 2 + ij]);
629                                                                 LOG_PLUS_EQUALS(singleCounts[c2],
630                                                                                 forward[2 * k + 2 + ij1]
631                                                                                                 + transProb[2 * k + 2][2 * k + 2]
632                                                                                                 + insProb[c2][k]
633                                                                                                 + backward[2 * k + 2 + ij]);
634                                                         }
635                                                 }
636                                         }
637                                 }
638
639                                 ij += NumMatrixTypes;
640                                 i1j += NumMatrixTypes;
641                                 ij1 += NumMatrixTypes;
642                                 i1j1 += NumMatrixTypes;
643                         }
644                 }
645
646                 // scale all expected counts appropriately
647                 for (int i = 0; i < NumMatrixTypes; i++) {
648                         initCounts[i] -= totalProb;
649                         for (int j = 0; j < NumMatrixTypes; j++)
650                                 transCounts[i][j] -= totalProb;
651                 }
652                 if (enableTrainEmissions) {
653                         for (int i = 0; i < 256; i++) {
654                                 for (int j = 0; j < 256; j++)
655                                         pairCounts[i][j] -= totalProb;
656                                 singleCounts[i] -= totalProb;
657                         }
658                 }
659
660                 // compute new initial distribution
661                 float totalInitDistribCounts = 0;
662                 for (int i = 0; i < NumMatrixTypes; i++)
663                         totalInitDistribCounts += exp(initCounts[i]); // should be 2
664                 initDistribMat[0] = min(1.0f,
665                                 max(0.0f, (float) exp(initCounts[0]) / totalInitDistribCounts));
666                 for (int k = 0; k < NumInsertStates; k++) {
667                         float val =
668                                         (exp(initCounts[2 * k + 1]) + exp(initCounts[2 * k + 2]))
669                                                         / 2;
670                         initDistribMat[2 * k + 1] = initDistribMat[2 * k + 2] = min(1.0f,
671                                         max(0.0f, val / totalInitDistribCounts));
672                 }
673
674                 // compute total counts for match state
675                 float inMatchStateCounts = 0;
676                 for (int i = 0; i < NumMatrixTypes; i++)
677                         inMatchStateCounts += exp(transCounts[0][i]);
678                 for (int i = 0; i < NumInsertStates; i++) {
679
680                         // compute total counts for gap state
681                         float inGapStateCounts = exp(transCounts[2 * i + 1][0])
682                                         + exp(transCounts[2 * i + 1][2 * i + 1])
683                                         + exp(transCounts[2 * i + 2][0])
684                                         + exp(transCounts[2 * i + 2][2 * i + 2]);
685
686                         gapOpen[2 * i] = gapOpen[2 * i + 1] = (exp(
687                                         transCounts[0][2 * i + 1]) + exp(transCounts[0][2 * i + 2]))
688                                         / (2 * inMatchStateCounts);
689
690                         gapExtend[2 * i] = gapExtend[2 * i + 1] = (exp(
691                                         transCounts[2 * i + 1][2 * i + 1])
692                                         + exp(transCounts[2 * i + 2][2 * i + 2]))
693                                         / inGapStateCounts;
694                 }
695
696                 if (enableTrainEmissions) {
697                         float totalPairCounts = 0;
698                         float totalSingleCounts = 0;
699                         for (int i = 0; i < 256; i++) {
700                                 for (int j = 0; j <= i; j++)
701                                         totalPairCounts += exp(pairCounts[j][i]);
702                                 totalSingleCounts += exp(singleCounts[i]);
703                         }
704
705                         for (int i = 0; i < 256; i++)
706                                 if (!islower((char) i)) {
707                                         int li = (int) ((unsigned char) tolower((char) i));
708                                         for (int j = 0; j <= i; j++)
709                                                 if (!islower((char) j)) {
710                                                         int lj = (int) ((unsigned char) tolower((char) j));
711                                                         emitPairs[i][j] =
712                                                                         emitPairs[i][lj] =
713                                                                                         emitPairs[li][j] =
714                                                                                                         emitPairs[li][lj] =
715                                                                                                                         emitPairs[j][i] =
716                                                                                                                                         emitPairs[j][li] =
717                                                                                                                                                         emitPairs[lj][i] =
718                                                                                                                                                                         emitPairs[lj][li] =
719                                                                                                                                                                                         exp(
720                                                                                                                                                                                                         pairCounts[j][i])
721                                                                                                                                                                                                         / totalPairCounts;
722                                                 }
723                                         emitSingle[i] = emitSingle[li] = exp(singleCounts[i])
724                                                         / totalSingleCounts;
725                                 }
726                 }
727         }
728
729         /////////////////////////////////////////////////////////////////
730         // ProbabilisticModel::ComputeAlignment()
731         //
732         // Computes an alignment based on the given posterior matrix.
733         // This is done by finding the maximum summing path (or
734         // maximum weight trace) through the posterior matrix.  The
735         // final alignment is returned as a pair consisting of:
736         //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
737         //        denote insertions in one of the two sequences and
738         //        B's denote that both sequences are present (i.e.
739         //        matches).
740         //    (2) a float indicating the sum achieved
741         /////////////////////////////////////////////////////////////////
742
743         pair<SafeVector<char> *, float> ComputeAlignment(int seq1Length,
744                         int seq2Length, const VF &posterior) const {
745
746                 float *twoRows = new float[(seq2Length + 1) * 2];
747                 assert(twoRows);
748                 float *oldRow = twoRows;
749                 float *newRow = twoRows + seq2Length + 1;
750
751                 char *tracebackMatrix = new char[(seq1Length + 1) * (seq2Length + 1)];
752                 assert(tracebackMatrix);
753                 char *tracebackPtr = tracebackMatrix;
754
755                 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
756
757                 // initialization
758                 for (int i = 0; i <= seq2Length; i++) {
759                         oldRow[i] = 0;
760                         *(tracebackPtr++) = 'L';
761                 }
762
763                 // fill in matrix
764                 for (int i = 1; i <= seq1Length; i++) {
765
766                         // initialize left column
767                         newRow[0] = 0;
768                         posteriorPtr++;
769                         *(tracebackPtr++) = 'U';
770
771                         // fill in rest of row
772                         for (int j = 1; j <= seq2Length; j++) {
773                                 ChooseBestOfThree(*(posteriorPtr++) + oldRow[j - 1],
774                                                 newRow[j - 1], oldRow[j], 'D', 'L', 'U', &newRow[j],
775                                                 tracebackPtr++);
776                         }
777
778                         // swap rows
779                         float *temp = oldRow;
780                         oldRow = newRow;
781                         newRow = temp;
782                 }
783
784                 // store best score
785                 float total = oldRow[seq2Length];
786                 delete[] twoRows;
787
788                 // compute traceback
789                 SafeVector<char> *alignment = new SafeVector<char>;
790                 assert(alignment);
791                 int r = seq1Length, c = seq2Length;
792                 while (r != 0 || c != 0) {
793                         char ch = tracebackMatrix[r * (seq2Length + 1) + c];
794                         switch (ch) {
795                         case 'L':
796                                 c--;
797                                 alignment->push_back('Y');
798                                 break;
799                         case 'U':
800                                 r--;
801                                 alignment->push_back('X');
802                                 break;
803                         case 'D':
804                                 c--;
805                                 r--;
806                                 alignment->push_back('B');
807                                 break;
808                         default:
809                                 assert(false);
810                         }
811                 }
812
813                 delete[] tracebackMatrix;
814
815                 reverse(alignment->begin(), alignment->end());
816
817                 return make_pair(alignment, total);
818         }
819
820         /////////////////////////////////////////////////////////////////
821         // ProbabilisticModel::ComputeAlignmentWithGapPenalties()
822         //
823         // Similar to ComputeAlignment() except with gap penalties.
824         /////////////////////////////////////////////////////////////////
825
826         pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties(
827                         MultiSequence *align1, MultiSequence *align2, const VF &posterior,
828                         int numSeqs1, int numSeqs2, float gapOpenPenalty,
829                         float gapContinuePenalty) const {
830                 int seq1Length = align1->GetSequence(0)->GetLength();
831                 int seq2Length = align2->GetSequence(0)->GetLength();
832                 SafeVector<SafeVector<char>::iterator> dataPtrs1(
833                                 align1->GetNumSequences());
834                 SafeVector<SafeVector<char>::iterator> dataPtrs2(
835                                 align2->GetNumSequences());
836
837                 // grab character data
838                 for (int i = 0; i < align1->GetNumSequences(); i++)
839                         dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr();
840                 for (int i = 0; i < align2->GetNumSequences(); i++)
841                         dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr();
842
843                 // the number of active sequences at any given column is defined to be the
844                 // number of non-gap characters in that column; the number of gap opens at
845                 // any given column is defined to be the number of gap characters in that
846                 // column where the previous character in the respective sequence was not
847                 // a gap
848                 SafeVector<int> numActive1(seq1Length + 1), numGapOpens1(
849                                 seq1Length + 1);
850                 SafeVector<int> numActive2(seq2Length + 1), numGapOpens2(
851                                 seq2Length + 1);
852
853                 // compute number of active sequences and gap opens for each group
854                 for (int i = 0; i < align1->GetNumSequences(); i++) {
855                         SafeVector<char>::iterator dataPtr =
856                                         align1->GetSequence(i)->GetDataPtr();
857                         numActive1[0] = numGapOpens1[0] = 0;
858                         for (int j = 1; j <= seq1Length; j++) {
859                                 if (dataPtr[j] != '-') {
860                                         numActive1[j]++;
861                                         numGapOpens1[j] += (j != 1 && dataPtr[j - 1] != '-');
862                                 }
863                         }
864                 }
865                 for (int i = 0; i < align2->GetNumSequences(); i++) {
866                         SafeVector<char>::iterator dataPtr =
867                                         align2->GetSequence(i)->GetDataPtr();
868                         numActive2[0] = numGapOpens2[0] = 0;
869                         for (int j = 1; j <= seq2Length; j++) {
870                                 if (dataPtr[j] != '-') {
871                                         numActive2[j]++;
872                                         numGapOpens2[j] += (j != 1 && dataPtr[j - 1] != '-');
873                                 }
874                         }
875                 }
876
877                 VVF openingPenalty1(numSeqs1 + 1, VF(numSeqs2 + 1));
878                 VF continuingPenalty1(numSeqs1 + 1);
879                 VVF openingPenalty2(numSeqs1 + 1, VF(numSeqs2 + 1));
880                 VF continuingPenalty2(numSeqs2 + 1);
881
882                 // precompute penalties
883                 for (int i = 0; i <= numSeqs1; i++)
884                         for (int j = 0; j <= numSeqs2; j++)
885                                 openingPenalty1[i][j] = i
886                                                 * (gapOpenPenalty * j
887                                                                 + gapContinuePenalty * (numSeqs2 - j));
888                 for (int i = 0; i <= numSeqs1; i++)
889                         continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
890                 for (int i = 0; i <= numSeqs2; i++)
891                         for (int j = 0; j <= numSeqs1; j++)
892                                 openingPenalty2[i][j] = i
893                                                 * (gapOpenPenalty * j
894                                                                 + gapContinuePenalty * (numSeqs1 - j));
895                 for (int i = 0; i <= numSeqs2; i++)
896                         continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
897
898                 float *twoRows = new float[6 * (seq2Length + 1)];
899                 assert(twoRows);
900                 float *oldRowMatch = twoRows;
901                 float *newRowMatch = twoRows + (seq2Length + 1);
902                 float *oldRowInsertX = twoRows + 2 * (seq2Length + 1);
903                 float *newRowInsertX = twoRows + 3 * (seq2Length + 1);
904                 float *oldRowInsertY = twoRows + 4 * (seq2Length + 1);
905                 float *newRowInsertY = twoRows + 5 * (seq2Length + 1);
906
907                 char *tracebackMatrix =
908                                 new char[3 * (seq1Length + 1) * (seq2Length + 1)];
909                 assert(tracebackMatrix);
910                 char *tracebackPtr = tracebackMatrix;
911
912                 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
913
914                 // initialization
915                 for (int i = 0; i <= seq2Length; i++) {
916                         oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
917                         oldRowInsertY[i] =
918                                         (i == 0) ?
919                                                         0 :
920                                                         oldRowInsertY[i - 1]
921                                                                         + continuingPenalty2[numActive2[i]];
922                         *(tracebackPtr) = *(tracebackPtr + 1) = *(tracebackPtr + 2) = 'Y';
923                         tracebackPtr += 3;
924                 }
925
926                 // fill in matrix
927                 for (int i = 1; i <= seq1Length; i++) {
928
929                         // initialize left column
930                         newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
931                         newRowInsertX[0] = oldRowInsertX[0]
932                                         + continuingPenalty1[numActive1[i]];
933                         posteriorPtr++;
934                         *(tracebackPtr) = *(tracebackPtr + 1) = *(tracebackPtr + 2) = 'X';
935                         tracebackPtr += 3;
936
937                         // fill in rest of row
938                         for (int j = 1; j <= seq2Length; j++) {
939
940                                 // going to MATCH state
941                                 ChooseBestOfThree(oldRowMatch[j - 1], oldRowInsertX[j - 1],
942                                                 oldRowInsertY[j - 1], 'M', 'X', 'Y', &newRowMatch[j],
943                                                 tracebackPtr++);
944                                 newRowMatch[j] += *(posteriorPtr++);
945
946                                 // going to INSERT X state
947                                 ChooseBestOfThree(
948                                                 oldRowMatch[j]
949                                                                 + openingPenalty1[numActive1[i]][numGapOpens2[j]],
950                                                 oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
951                                                 oldRowInsertY[j]
952                                                                 + openingPenalty1[numActive1[i]][numGapOpens2[j]],
953                                                 'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
954
955                                 // going to INSERT Y state
956                                 ChooseBestOfThree(
957                                                 newRowMatch[j - 1]
958                                                                 + openingPenalty2[numActive2[j]][numGapOpens1[i]],
959                                                 newRowInsertX[j - 1]
960                                                                 + openingPenalty2[numActive2[j]][numGapOpens1[i]],
961                                                 newRowInsertY[j - 1]
962                                                                 + continuingPenalty2[numActive2[j]], 'M', 'X',
963                                                 'Y', &newRowInsertY[j], tracebackPtr++);
964                         }
965
966                         // swap rows
967                         float *temp;
968                         temp = oldRowMatch;
969                         oldRowMatch = newRowMatch;
970                         newRowMatch = temp;
971                         temp = oldRowInsertX;
972                         oldRowInsertX = newRowInsertX;
973                         newRowInsertX = temp;
974                         temp = oldRowInsertY;
975                         oldRowInsertY = newRowInsertY;
976                         newRowInsertY = temp;
977                 }
978
979                 // store best score
980                 float total;
981                 char matrix;
982                 ChooseBestOfThree(oldRowMatch[seq2Length], oldRowInsertX[seq2Length],
983                                 oldRowInsertY[seq2Length], 'M', 'X', 'Y', &total, &matrix);
984
985                 delete[] twoRows;
986
987                 // compute traceback
988                 SafeVector<char> *alignment = new SafeVector<char>;
989                 assert(alignment);
990                 int r = seq1Length, c = seq2Length;
991                 while (r != 0 || c != 0) {
992
993                         int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
994                         char ch = tracebackMatrix[(r * (seq2Length + 1) + c) * 3 + offset];
995                         switch (matrix) {
996                         case 'Y':
997                                 c--;
998                                 alignment->push_back('Y');
999                                 break;
1000                         case 'X':
1001                                 r--;
1002                                 alignment->push_back('X');
1003                                 break;
1004                         case 'M':
1005                                 c--;
1006                                 r--;
1007                                 alignment->push_back('B');
1008                                 break;
1009                         default:
1010                                 assert(false);
1011                         }
1012                         matrix = ch;
1013                 }
1014
1015                 delete[] tracebackMatrix;
1016
1017                 reverse(alignment->begin(), alignment->end());
1018
1019                 return make_pair(alignment, 1.0f);
1020         }
1021
1022         /////////////////////////////////////////////////////////////////
1023         // ProbabilisticModel::ComputeViterbiAlignment()
1024         //
1025         // Computes the highest probability pairwise alignment using the
1026         // probabilistic model.  The final alignment is returned as a
1027         //  pair consisting of:
1028         //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
1029         //        denote insertions in one of the two sequences and
1030         //        B's denote that both sequences are present (i.e.
1031         //        matches).
1032         //    (2) a float containing the log probability of the best
1033         //        alignment (not used)
1034         /////////////////////////////////////////////////////////////////
1035
1036         pair<SafeVector<char> *, float> ComputeViterbiAlignment(Sequence *seq1,
1037                         Sequence *seq2) const {
1038
1039                 assert(seq1);
1040                 assert(seq2);
1041
1042                 const int seq1Length = seq1->GetLength();
1043                 const int seq2Length = seq2->GetLength();
1044
1045                 // retrieve the points to the beginning of each sequence
1046                 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
1047                 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
1048
1049                 // create viterbi matrix
1050                 VF *viterbiPtr = new VF(
1051                                 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
1052                 assert(viterbiPtr);
1053                 VF &viterbi = *viterbiPtr;
1054
1055                 // create traceback matrix
1056                 VI *tracebackPtr = new VI(
1057                                 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), -1);
1058                 assert(tracebackPtr);
1059                 VI &traceback = *tracebackPtr;
1060
1061                 // initialization condition
1062                 for (int k = 0; k < NumMatrixTypes; k++)
1063                         viterbi[k] = initialDistribution[k];
1064
1065                 // remember offset for each index combination
1066                 int ij = 0;
1067                 int i1j = -seq2Length - 1;
1068                 int ij1 = -1;
1069                 int i1j1 = -seq2Length - 2;
1070
1071                 ij *= NumMatrixTypes;
1072                 i1j *= NumMatrixTypes;
1073                 ij1 *= NumMatrixTypes;
1074                 i1j1 *= NumMatrixTypes;
1075
1076                 // compute viterbi scores
1077                 for (int i = 0; i <= seq1Length; i++) {
1078                         unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
1079                         for (int j = 0; j <= seq2Length; j++) {
1080                                 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
1081
1082                                 if (i > 0 && j > 0) {
1083                                         for (int k = 0; k < NumMatrixTypes; k++) {
1084                                                 float newVal = viterbi[k + i1j1] + transProb[k][0]
1085                                                                 + matchProb[c1][c2];
1086                                                 if (viterbi[0 + ij] < newVal) {
1087                                                         viterbi[0 + ij] = newVal;
1088                                                         traceback[0 + ij] = k;
1089                                                 }
1090                                         }
1091                                 }
1092                                 if (i > 0) {
1093                                         for (int k = 0; k < NumInsertStates; k++) {
1094                                                 float valFromMatch = insProb[c1][k] + viterbi[0 + i1j]
1095                                                                 + transProb[0][2 * k + 1];
1096                                                 float valFromIns = insProb[c1][k]
1097                                                                 + viterbi[2 * k + 1 + i1j]
1098                                                                 + transProb[2 * k + 1][2 * k + 1];
1099                                                 if (valFromMatch >= valFromIns) {
1100                                                         viterbi[2 * k + 1 + ij] = valFromMatch;
1101                                                         traceback[2 * k + 1 + ij] = 0;
1102                                                 } else {
1103                                                         viterbi[2 * k + 1 + ij] = valFromIns;
1104                                                         traceback[2 * k + 1 + ij] = 2 * k + 1;
1105                                                 }
1106                                         }
1107                                 }
1108                                 if (j > 0) {
1109                                         for (int k = 0; k < NumInsertStates; k++) {
1110                                                 float valFromMatch = insProb[c2][k] + viterbi[0 + ij1]
1111                                                                 + transProb[0][2 * k + 2];
1112                                                 float valFromIns = insProb[c2][k]
1113                                                                 + viterbi[2 * k + 2 + ij1]
1114                                                                 + transProb[2 * k + 2][2 * k + 2];
1115                                                 if (valFromMatch >= valFromIns) {
1116                                                         viterbi[2 * k + 2 + ij] = valFromMatch;
1117                                                         traceback[2 * k + 2 + ij] = 0;
1118                                                 } else {
1119                                                         viterbi[2 * k + 2 + ij] = valFromIns;
1120                                                         traceback[2 * k + 2 + ij] = 2 * k + 2;
1121                                                 }
1122                                         }
1123                                 }
1124
1125                                 ij += NumMatrixTypes;
1126                                 i1j += NumMatrixTypes;
1127                                 ij1 += NumMatrixTypes;
1128                                 i1j1 += NumMatrixTypes;
1129                         }
1130                 }
1131
1132                 // figure out best terminating cell
1133                 float bestProb = LOG_ZERO;
1134                 int state = -1;
1135                 for (int k = 0; k < NumMatrixTypes; k++) {
1136                         float thisProb =
1137                                         viterbi[k
1138                                                         + NumMatrixTypes
1139                                                                         * ((seq1Length + 1) * (seq2Length + 1) - 1)]
1140                                                         + initialDistribution[k];
1141                         if (bestProb < thisProb) {
1142                                 bestProb = thisProb;
1143                                 state = k;
1144                         }
1145                 }
1146                 assert(state != -1);
1147
1148                 delete viterbiPtr;
1149
1150                 // compute traceback
1151                 SafeVector<char> *alignment = new SafeVector<char>;
1152                 assert(alignment);
1153                 int r = seq1Length, c = seq2Length;
1154                 while (r != 0 || c != 0) {
1155                         int newState = traceback[state
1156                                         + NumMatrixTypes * (r * (seq2Length + 1) + c)];
1157
1158                         if (state == 0) {
1159                                 c--;
1160                                 r--;
1161                                 alignment->push_back('B');
1162                         } else if (state % 2 == 1) {
1163                                 r--;
1164                                 alignment->push_back('X');
1165                         } else {
1166                                 c--;
1167                                 alignment->push_back('Y');
1168                         }
1169
1170                         state = newState;
1171                 }
1172
1173                 delete tracebackPtr;
1174
1175                 reverse(alignment->begin(), alignment->end());
1176
1177                 return make_pair(alignment, bestProb);
1178         }
1179
1180         /////////////////////////////////////////////////////////////////
1181         // ProbabilisticModel::BuildPosterior()
1182         //
1183         // Builds a posterior probability matrix needed to align a pair
1184         // of alignments.  Mathematically, the returned matrix M is
1185         // defined as follows:
1186         //    M[i,j] =     sum          sum      f(s,t,i,j)
1187         //             s in align1  t in align2
1188         // where
1189         //                  [  P(s[i'] <--> t[j'])
1190         //                  [       if s[i'] is a letter in the ith column of align1 and
1191         //                  [          t[j'] it a letter in the jth column of align2
1192         //    f(s,t,i,j) =  [
1193         //                  [  0    otherwise
1194         //
1195         /////////////////////////////////////////////////////////////////
1196
1197         VF *BuildPosterior(MultiSequence *align1, MultiSequence *align2,
1198                         const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1199                         float cutoff = 0.0f) const {
1200                 const int seq1Length = align1->GetSequence(0)->GetLength();
1201                 const int seq2Length = align2->GetSequence(0)->GetLength();
1202
1203                 VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0);
1204                 assert(posteriorPtr);
1205                 VF &posterior = *posteriorPtr;
1206                 VF::iterator postPtr = posterior.begin();
1207
1208                 // for each s in align1
1209                 for (int i = 0; i < align1->GetNumSequences(); i++) {
1210                         int first = align1->GetSequence(i)->GetLabel();
1211                         SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
1212
1213                         // for each t in align2
1214                         for (int j = 0; j < align2->GetNumSequences(); j++) {
1215                                 int second = align2->GetSequence(j)->GetLabel();
1216                                 SafeVector<int> *mapping2 =
1217                                                 align2->GetSequence(j)->GetMapping();
1218
1219                                 if (first < second) {
1220
1221                                         // get the associated sparse matrix
1222                                         SparseMatrix *matrix = sparseMatrices[first][second];
1223
1224                                         for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) {
1225                                                 SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
1226                                                 int base = (*mapping1)[ii] * (seq2Length + 1);
1227                                                 int rowSize = matrix->GetRowSize(ii);
1228
1229                                                 // add in all relevant values
1230                                                 for (int jj = 0; jj < rowSize; jj++)
1231                                                         posterior[base + (*mapping2)[row[jj].first]] +=
1232                                                                         row[jj].second;
1233
1234                                                 // subtract cutoff 
1235                                                 for (int jj = 0; jj < matrix->GetSeq2Length(); jj++)
1236                                                         posterior[base + (*mapping2)[jj]] -= cutoff;
1237                                         }
1238
1239                                 } else {
1240
1241                                         // get the associated sparse matrix
1242                                         SparseMatrix *matrix = sparseMatrices[second][first];
1243
1244                                         for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) {
1245                                                 SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
1246                                                 int base = (*mapping2)[jj];
1247                                                 int rowSize = matrix->GetRowSize(jj);
1248
1249                                                 // add in all relevant values
1250                                                 for (int ii = 0; ii < rowSize; ii++)
1251                                                         posterior[base
1252                                                                         + (*mapping1)[row[ii].first]
1253                                                                                         * (seq2Length + 1)] +=
1254                                                                         row[ii].second;
1255
1256                                                 // subtract cutoff 
1257                                                 for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
1258                                                         posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -=
1259                                                                         cutoff;
1260                                         }
1261
1262                                 }
1263
1264                                 delete mapping2;
1265                         }
1266
1267                         delete mapping1;
1268                 }
1269
1270                 return posteriorPtr;
1271         }
1272         //added by Liu Yongchao.Feb 23, 2010
1273         VF *BuildPosterior(int* seqsWeights, MultiSequence *align1,
1274                         MultiSequence *align2,
1275                         const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1276                         float cutoff = 0.0f) const {
1277                 const int seq1Length = align1->GetSequence(0)->GetLength();
1278                 const int seq2Length = align2->GetSequence(0)->GetLength();
1279
1280                 VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0);
1281                 assert(posteriorPtr);
1282                 VF &posterior = *posteriorPtr;
1283                 VF::iterator postPtr = posterior.begin();
1284
1285                 //compute the total sum of all weights
1286                 float totalWeights = 0;
1287                 for (int i = 0; i < align1->GetNumSequences(); i++) {
1288                         int first = align1->GetSequence(i)->GetLabel();
1289                         int w1 = seqsWeights[first];
1290                         for (int j = 0; j < align2->GetNumSequences(); j++) {
1291                                 int second = align2->GetSequence(j)->GetLabel();
1292                                 int w2 = seqsWeights[second];
1293
1294                                 totalWeights += w1 * w2;
1295                         }
1296                 }
1297                 // for each s in align1
1298                 for (int i = 0; i < align1->GetNumSequences(); i++) {
1299                         int first = align1->GetSequence(i)->GetLabel();
1300                         int w1 = seqsWeights[first];
1301                         SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
1302                         // for each t in align2
1303                         for (int j = 0; j < align2->GetNumSequences(); j++) {
1304                                 int second = align2->GetSequence(j)->GetLabel();
1305                                 int w2 = seqsWeights[second];
1306                                 SafeVector<int> *mapping2 =
1307                                                 align2->GetSequence(j)->GetMapping();
1308
1309                                 float w = (float) (w1 * w2) / totalWeights;
1310                                 if (first < second) {
1311
1312                                         // get the associated sparse matrix
1313                                         SparseMatrix *matrix = sparseMatrices[first][second];
1314
1315                                         for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) {
1316                                                 SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
1317                                                 int base = (*mapping1)[ii] * (seq2Length + 1);
1318                                                 int rowSize = matrix->GetRowSize(ii);
1319
1320                                                 // add in all relevant values
1321                                                 for (int jj = 0; jj < rowSize; jj++)
1322                                                         posterior[base + (*mapping2)[row[jj].first]] += w
1323                                                                         * row[jj].second;
1324
1325                                                 // subtract cutoff 
1326                                                 for (int jj = 0; jj < matrix->GetSeq2Length(); jj++)
1327                                                         posterior[base + (*mapping2)[jj]] -= w * cutoff;
1328                                         }
1329
1330                                 } else {
1331
1332                                         // get the associated sparse matrix
1333                                         SparseMatrix *matrix = sparseMatrices[second][first];
1334
1335                                         for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) {
1336                                                 SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
1337                                                 int base = (*mapping2)[jj];
1338                                                 int rowSize = matrix->GetRowSize(jj);
1339
1340                                                 // add in all relevant values
1341                                                 for (int ii = 0; ii < rowSize; ii++)
1342                                                         posterior[base
1343                                                                         + (*mapping1)[row[ii].first]
1344                                                                                         * (seq2Length + 1)] += w
1345                                                                         * row[ii].second;
1346
1347                                                 // subtract cutoff 
1348                                                 for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
1349                                                         posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -=
1350                                                                         w * cutoff;
1351                                         }
1352
1353                                 }
1354
1355                                 delete mapping2;
1356                         }
1357
1358                         delete mapping1;
1359                 }
1360
1361                 return posteriorPtr;
1362         }
1363 };
1364
1365 #endif