1 /////////////////////////////////////////////////////////////////
2 // ProbabilisticModel.h
4 // Routines for (1) posterior probability computations
5 // (2) chained anchoring
6 // (3) maximum weight trace alignment
7 /////////////////////////////////////////////////////////////////
9 #ifndef PROBABILISTICMODEL_H
10 #define PROBABILISTICMODEL_H
15 #include "SafeVector.h"
16 #include "ScoreType.h"
17 #include "SparseMatrix.h"
18 #include "MultiSequence.h"
22 const int NumMatchStates = 1; // note that in this version the number
23 // of match states is fixed at 1...will
24 // change in future versions
25 const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2;
27 /////////////////////////////////////////////////////////////////
30 // Class for storing the parameters of a probabilistic model and
31 // performing different computations based on those parameters.
32 // In particular, this class handles the computation of
33 // posterior probabilities that may be used in alignment.
34 /////////////////////////////////////////////////////////////////
36 class ProbabilisticModel {
38 float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state
39 float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities
40 float matchProb[256][256]; // emission probabilities for match states
41 float insProb[256][NumMatrixTypes]; // emission probabilities for insert states
45 /////////////////////////////////////////////////////////////////
46 // ProbabilisticModel::ProbabilisticModel()
48 // Constructor. Builds a new probabilistic model using the
50 /////////////////////////////////////////////////////////////////
52 ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend,
53 const VVF &emitPairs, const VF &emitSingle){
55 // build transition matrix
56 VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f));
58 for (int i = 0; i < NumInsertStates; i++){
59 transMat[0][2*i+1] = gapOpen[2*i];
60 transMat[0][2*i+2] = gapOpen[2*i+1];
61 transMat[0][0] -= (gapOpen[2*i] + gapOpen[2*i+1]);
62 assert (transMat[0][0] > 0);
63 transMat[2*i+1][2*i+1] = gapExtend[2*i];
64 transMat[2*i+2][2*i+2] = gapExtend[2*i+1];
65 transMat[2*i+1][2*i+2] = 0;
66 transMat[2*i+2][2*i+1] = 0;
67 transMat[2*i+1][0] = 1 - gapExtend[2*i];
68 transMat[2*i+2][0] = 1 - gapExtend[2*i+1];
71 // create initial and transition probability matrices
72 for (int i = 0; i < NumMatrixTypes; i++){
73 initialDistribution[i] = LOG (initDistribMat[i]);
74 for (int j = 0; j < NumMatrixTypes; j++)
75 transProb[i][j] = LOG (transMat[i][j]);
78 // create insertion and match probability matrices
79 for (int i = 0; i < 256; i++){
80 for (int j = 0; j < NumMatrixTypes; j++)
81 insProb[i][j] = LOG (emitSingle[i]);
82 for (int j = 0; j < 256; j++)
83 matchProb[i][j] = LOG (emitPairs[i][j]);
87 /////////////////////////////////////////////////////////////////
88 // ProbabilisticModel::ComputeForwardMatrix()
90 // Computes a set of forward probability matrices for aligning
93 // For efficiency reasons, a single-dimensional floating-point
94 // array is used here, with the following indexing scheme:
96 // forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
97 // refers to the probability of aligning through j characters
98 // of the first sequence, k characters of the second sequence,
99 // and ending in state i.
100 /////////////////////////////////////////////////////////////////
102 VF *ComputeForwardMatrix (Sequence *seq1, Sequence *seq2) const {
107 const int seq1Length = seq1->GetLength();
108 const int seq2Length = seq2->GetLength();
110 // retrieve the points to the beginning of each sequence
111 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
112 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
115 VF *forwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
117 VF &forward = *forwardPtr;
119 // initialization condition
120 forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] =
121 initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
123 for (int k = 0; k < NumInsertStates; k++){
124 forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] =
125 initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k];
126 forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] =
127 initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k];
130 // remember offset for each index combination
132 int i1j = -seq2Length - 1;
134 int i1j1 = -seq2Length - 2;
136 ij *= NumMatrixTypes;
137 i1j *= NumMatrixTypes;
138 ij1 *= NumMatrixTypes;
139 i1j1 *= NumMatrixTypes;
141 // compute forward scores
142 for (int i = 0; i <= seq1Length; i++){
143 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
144 for (int j = 0; j <= seq2Length; j++){
145 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
149 forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
150 for (int k = 1; k < NumMatrixTypes; k++)
151 LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]);
152 forward[0 + ij] += matchProb[c1][c2];
155 for (int k = 0; k < NumInsertStates; k++)
156 forward[2*k+1 + ij] = insProb[c1][k] +
157 LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1],
158 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1]);
161 for (int k = 0; k < NumInsertStates; k++)
162 forward[2*k+2 + ij] = insProb[c2][k] +
163 LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2],
164 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2]);
168 ij += NumMatrixTypes;
169 i1j += NumMatrixTypes;
170 ij1 += NumMatrixTypes;
171 i1j1 += NumMatrixTypes;
178 /////////////////////////////////////////////////////////////////
179 // ProbabilisticModel::ComputeBackwardMatrix()
181 // Computes a set of backward probability matrices for aligning
184 // For efficiency reasons, a single-dimensional floating-point
185 // array is used here, with the following indexing scheme:
187 // backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
188 // refers to the probability of starting in state i and
189 // aligning from character j+1 to the end of the first
190 // sequence and from character k+1 to the end of the second
192 /////////////////////////////////////////////////////////////////
194 VF *ComputeBackwardMatrix (Sequence *seq1, Sequence *seq2) const {
199 const int seq1Length = seq1->GetLength();
200 const int seq2Length = seq2->GetLength();
201 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
202 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
205 VF *backwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
206 assert (backwardPtr);
207 VF &backward = *backwardPtr;
209 // initialization condition
210 for (int k = 0; k < NumMatrixTypes; k++)
211 backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k];
213 // remember offset for each index combination
214 int ij = (seq1Length+1) * (seq2Length+1) - 1;
215 int i1j = ij + seq2Length + 1;
217 int i1j1 = ij + seq2Length + 2;
219 ij *= NumMatrixTypes;
220 i1j *= NumMatrixTypes;
221 ij1 *= NumMatrixTypes;
222 i1j1 *= NumMatrixTypes;
224 // compute backward scores
225 for (int i = seq1Length; i >= 0; i--){
226 unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1];
227 for (int j = seq2Length; j >= 0; j--){
228 unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1];
230 if (i < seq1Length && j < seq2Length){
231 const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
232 for (int k = 0; k < NumMatrixTypes; k++)
233 LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]);
236 for (int k = 0; k < NumInsertStates; k++){
237 LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[0][2*k+1]);
238 LOG_PLUS_EQUALS (backward[2*k+1 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[2*k+1][2*k+1]);
242 for (int k = 0; k < NumInsertStates; k++){
243 LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[0][2*k+2]);
244 LOG_PLUS_EQUALS (backward[2*k+2 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[2*k+2][2*k+2]);
248 ij -= NumMatrixTypes;
249 i1j -= NumMatrixTypes;
250 ij1 -= NumMatrixTypes;
251 i1j1 -= NumMatrixTypes;
258 /////////////////////////////////////////////////////////////////
259 // ProbabilisticModel::ComputeTotalProbability()
261 // Computes the total probability of an alignment given
262 // the forward and backward matrices.
263 /////////////////////////////////////////////////////////////////
265 float ComputeTotalProbability (int seq1Length, int seq2Length,
266 const VF &forward, const VF &backward) const {
268 // compute total probability
269 float totalForwardProb = LOG_ZERO;
270 float totalBackwardProb = LOG_ZERO;
271 for (int k = 0; k < NumMatrixTypes; k++){
272 LOG_PLUS_EQUALS (totalForwardProb,
273 forward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
274 backward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
278 forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
279 backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)];
281 for (int k = 0; k < NumInsertStates; k++){
282 LOG_PLUS_EQUALS (totalBackwardProb,
283 forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
284 backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)]);
285 LOG_PLUS_EQUALS (totalBackwardProb,
286 forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
287 backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)]);
290 // cerr << totalForwardProb << " " << totalBackwardProb << endl;
292 return (totalForwardProb + totalBackwardProb) / 2;
295 /////////////////////////////////////////////////////////////////
296 // ProbabilisticModel::ComputePosteriorMatrix()
298 // Computes the posterior probability matrix based on
299 // the forward and backward matrices.
300 /////////////////////////////////////////////////////////////////
302 VF *ComputePosteriorMatrix (Sequence *seq1, Sequence *seq2,
303 const VF &forward, const VF &backward) const {
308 const int seq1Length = seq1->GetLength();
309 const int seq2Length = seq2->GetLength();
311 float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
314 // compute posterior matrices
315 VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr);
316 VF &posterior = *posteriorPtr;
319 VF::iterator ptr = posterior.begin();
321 for (int i = 0; i <= seq1Length; i++){
322 for (int j = 0; j <= seq2Length; j++){
323 *(ptr++) = EXP (min (LOG_ONE, forward[ij] + backward[ij] - totalProb));
324 ij += NumMatrixTypes;
334 /////////////////////////////////////////////////////////////////
335 // ProbabilisticModel::ComputeExpectedCounts()
337 // Computes the expected counts for the various transitions.
338 /////////////////////////////////////////////////////////////////
340 VVF *ComputeExpectedCounts () const {
345 const int seq1Length = seq1->GetLength();
346 const int seq2Length = seq2->GetLength();
347 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
348 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
350 // compute total probability
351 float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
354 // initialize expected counts
355 VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr);
356 VVF &counts = *countsPtr;
358 // remember offset for each index combination
360 int i1j = -seq2Length - 1;
362 int i1j1 = -seq2Length - 2;
364 ij *= NumMatrixTypes;
365 i1j *= NumMatrixTypes;
366 ij1 *= NumMatrixTypes;
367 i1j1 *= NumMatrixTypes;
369 // compute expected counts
370 for (int i = 0; i <= seq1Length; i++){
371 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
372 for (int j = 0; j <= seq2Length; j++){
373 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
376 for (int k = 0; k < NumMatrixTypes; k++)
377 LOG_PLUS_EQUALS (counts[k][0],
378 forward[k + i1j1] + transProb[k][0] +
379 matchProb[c1][c2] + backward[0 + ij]);
382 for (int k = 0; k < NumInsertStates; k++){
383 LOG_PLUS_EQUALS (counts[0][2*k+1],
384 forward[0 + i1j] + transProb[0][2*k+1] +
385 insProb[c1][k] + backward[2*k+1 + ij]);
386 LOG_PLUS_EQUALS (counts[2*k+1][2*k+1],
387 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
388 insProb[c1][k] + backward[2*k+1 + ij]);
392 for (int k = 0; k < NumInsertStates; k++){
393 LOG_PLUS_EQUALS (counts[0][2*k+2],
394 forward[0 + ij1] + transProb[0][2*k+2] +
395 insProb[c2][k] + backward[2*k+2 + ij]);
396 LOG_PLUS_EQUALS (counts[2*k+2][2*k+2],
397 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
398 insProb[c2][k] + backward[2*k+2 + ij]);
402 ij += NumMatrixTypes;
403 i1j += NumMatrixTypes;
404 ij1 += NumMatrixTypes;
405 i1j1 += NumMatrixTypes;
409 // scale all expected counts appropriately
410 for (int i = 0; i < NumMatrixTypes; i++)
411 for (int j = 0; j < NumMatrixTypes; j++)
412 counts[i][j] -= totalProb;
417 /////////////////////////////////////////////////////////////////
418 // ProbabilisticModel::ComputeNewParameters()
420 // Computes a new parameter set based on the expected counts
422 /////////////////////////////////////////////////////////////////
424 void ComputeNewParameters (Sequence *seq1, Sequence *seq2,
425 const VF &forward, const VF &backward,
426 VF &initDistribMat, VF &gapOpen,
427 VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
432 const int seq1Length = seq1->GetLength();
433 const int seq2Length = seq2->GetLength();
434 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
435 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
437 // compute total probability
438 float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
441 // initialize expected counts
442 VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO));
443 VF initCounts (NumMatrixTypes, LOG_ZERO);
444 VVF pairCounts (256, VF (256, LOG_ZERO));
445 VF singleCounts (256, LOG_ZERO);
447 // remember offset for each index combination
449 int i1j = -seq2Length - 1;
451 int i1j1 = -seq2Length - 2;
453 ij *= NumMatrixTypes;
454 i1j *= NumMatrixTypes;
455 ij1 *= NumMatrixTypes;
456 i1j1 *= NumMatrixTypes;
458 // compute initial distribution posteriors
459 initCounts[0] = LOG_ADD (forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
460 backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)],
461 forward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
462 backward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
463 for (int k = 0; k < NumInsertStates; k++){
464 initCounts[2*k+1] = LOG_ADD (forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
465 backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)],
466 forward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
467 backward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
468 initCounts[2*k+2] = LOG_ADD (forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
469 backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)],
470 forward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
471 backward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
474 // compute expected counts
475 for (int i = 0; i <= seq1Length; i++){
476 unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper(iter1[i]);
477 for (int j = 0; j <= seq2Length; j++){
478 unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper(iter2[j]);
481 if (enableTrainEmissions && i == 1 && j == 1){
482 LOG_PLUS_EQUALS (pairCounts[c1][c2],
483 initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]);
484 LOG_PLUS_EQUALS (pairCounts[c2][c1],
485 initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]);
488 for (int k = 0; k < NumMatrixTypes; k++){
489 LOG_PLUS_EQUALS (transCounts[k][0],
490 forward[k + i1j1] + transProb[k][0] +
491 matchProb[c1][c2] + backward[0 + ij]);
492 if (enableTrainEmissions && i != 1 || j != 1){
493 LOG_PLUS_EQUALS (pairCounts[c1][c2],
494 forward[k + i1j1] + transProb[k][0] +
495 matchProb[c1][c2] + backward[0 + ij]);
496 LOG_PLUS_EQUALS (pairCounts[c2][c1],
497 forward[k + i1j1] + transProb[k][0] +
498 matchProb[c2][c1] + backward[0 + ij]);
503 for (int k = 0; k < NumInsertStates; k++){
504 LOG_PLUS_EQUALS (transCounts[0][2*k+1],
505 forward[0 + i1j] + transProb[0][2*k+1] +
506 insProb[c1][k] + backward[2*k+1 + ij]);
507 LOG_PLUS_EQUALS (transCounts[2*k+1][2*k+1],
508 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
509 insProb[c1][k] + backward[2*k+1 + ij]);
510 if (enableTrainEmissions){
511 if (i == 1 && j == 0){
512 LOG_PLUS_EQUALS (singleCounts[c1],
513 initialDistribution[2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]);
516 LOG_PLUS_EQUALS (singleCounts[c1],
517 forward[0 + i1j] + transProb[0][2*k+1] +
518 insProb[c1][k] + backward[2*k+1 + ij]);
519 LOG_PLUS_EQUALS (singleCounts[c1],
520 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
521 insProb[c1][k] + backward[2*k+1 + ij]);
527 for (int k = 0; k < NumInsertStates; k++){
528 LOG_PLUS_EQUALS (transCounts[0][2*k+2],
529 forward[0 + ij1] + transProb[0][2*k+2] +
530 insProb[c2][k] + backward[2*k+2 + ij]);
531 LOG_PLUS_EQUALS (transCounts[2*k+2][2*k+2],
532 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
533 insProb[c2][k] + backward[2*k+2 + ij]);
534 if (enableTrainEmissions){
535 if (i == 0 && j == 1){
536 LOG_PLUS_EQUALS (singleCounts[c2],
537 initialDistribution[2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]);
540 LOG_PLUS_EQUALS (singleCounts[c2],
541 forward[0 + ij1] + transProb[0][2*k+2] +
542 insProb[c2][k] + backward[2*k+2 + ij]);
543 LOG_PLUS_EQUALS (singleCounts[c2],
544 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
545 insProb[c2][k] + backward[2*k+2 + ij]);
551 ij += NumMatrixTypes;
552 i1j += NumMatrixTypes;
553 ij1 += NumMatrixTypes;
554 i1j1 += NumMatrixTypes;
558 // scale all expected counts appropriately
559 for (int i = 0; i < NumMatrixTypes; i++){
560 initCounts[i] -= totalProb;
561 for (int j = 0; j < NumMatrixTypes; j++)
562 transCounts[i][j] -= totalProb;
564 if (enableTrainEmissions){
565 for (int i = 0; i < 256; i++){
566 for (int j = 0; j < 256; j++)
567 pairCounts[i][j] -= totalProb;
568 singleCounts[i] -= totalProb;
572 // compute new initial distribution
573 float totalInitDistribCounts = 0;
574 for (int i = 0; i < NumMatrixTypes; i++)
575 totalInitDistribCounts += exp (initCounts[i]); // should be 2
576 initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts));
577 for (int k = 0; k < NumInsertStates; k++){
578 float val = (exp (initCounts[2*k+1]) + exp (initCounts[2*k+2])) / 2;
579 initDistribMat[2*k+1] = initDistribMat[2*k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts));
582 // compute total counts for match state
583 float inMatchStateCounts = 0;
584 for (int i = 0; i < NumMatrixTypes; i++)
585 inMatchStateCounts += exp (transCounts[0][i]);
586 for (int i = 0; i < NumInsertStates; i++){
588 // compute total counts for gap state
589 float inGapStateCounts =
590 exp (transCounts[2*i+1][0]) +
591 exp (transCounts[2*i+1][2*i+1]) +
592 exp (transCounts[2*i+2][0]) +
593 exp (transCounts[2*i+2][2*i+2]);
595 gapOpen[2*i] = gapOpen[2*i+1] =
596 (exp (transCounts[0][2*i+1]) +
597 exp (transCounts[0][2*i+2])) /
598 (2 * inMatchStateCounts);
600 gapExtend[2*i] = gapExtend[2*i+1] =
601 (exp (transCounts[2*i+1][2*i+1]) +
602 exp (transCounts[2*i+2][2*i+2])) /
606 if (enableTrainEmissions){
607 float totalPairCounts = 0;
608 float totalSingleCounts = 0;
609 for (int i = 0; i < 256; i++){
610 for (int j = 0; j <= i; j++)
611 totalPairCounts += exp (pairCounts[j][i]);
612 totalSingleCounts += exp (singleCounts[i]);
615 for (int i = 0; i < 256; i++) if (!islower ((char) i)){
616 int li = (int)((unsigned char) tolower ((char) i));
617 for (int j = 0; j <= i; j++) if (!islower ((char) j)){
618 int lj = (int)((unsigned char) tolower ((char) j));
619 emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] =
620 emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts;
622 emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts;
627 /////////////////////////////////////////////////////////////////
628 // ProbabilisticModel::ComputeAlignment()
630 // Computes an alignment based on the given posterior matrix.
631 // This is done by finding the maximum summing path (or
632 // maximum weight trace) through the posterior matrix. The
633 // final alignment is returned as a pair consisting of:
634 // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
635 // denote insertions in one of the two sequences and
636 // B's denote that both sequences are present (i.e.
638 // (2) a float indicating the sum achieved
639 /////////////////////////////////////////////////////////////////
641 pair<SafeVector<char> *, float> ComputeAlignment (int seq1Length, int seq2Length,
642 const VF &posterior) const {
644 float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows);
645 float *oldRow = twoRows;
646 float *newRow = twoRows + seq2Length + 1;
648 char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
649 char *tracebackPtr = tracebackMatrix;
651 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
654 for (int i = 0; i <= seq2Length; i++){
656 *(tracebackPtr++) = 'L';
660 for (int i = 1; i <= seq1Length; i++){
662 // initialize left column
665 *(tracebackPtr++) = 'U';
667 // fill in rest of row
668 for (int j = 1; j <= seq2Length; j++){
669 ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1], newRow[j-1], oldRow[j],
670 'D', 'L', 'U', &newRow[j], tracebackPtr++);
674 float *temp = oldRow;
680 float total = oldRow[seq2Length];
684 SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
685 int r = seq1Length, c = seq2Length;
686 while (r != 0 || c != 0){
687 char ch = tracebackMatrix[r*(seq2Length+1) + c];
689 case 'L': c--; alignment->push_back ('Y'); break;
690 case 'U': r--; alignment->push_back ('X'); break;
691 case 'D': c--; r--; alignment->push_back ('B'); break;
692 default: assert (false);
696 delete [] tracebackMatrix;
698 reverse (alignment->begin(), alignment->end());
700 return make_pair(alignment, total);
703 /////////////////////////////////////////////////////////////////
704 // ProbabilisticModel::ComputeAlignmentWithGapPenalties()
706 // Similar to ComputeAlignment() except with gap penalties.
707 /////////////////////////////////////////////////////////////////
709 pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties (MultiSequence *align1,
710 MultiSequence *align2,
711 const VF &posterior, int numSeqs1,
713 float gapOpenPenalty,
714 float gapContinuePenalty) const {
715 int seq1Length = align1->GetSequence(0)->GetLength();
716 int seq2Length = align2->GetSequence(0)->GetLength();
717 SafeVector<SafeVector<char>::iterator > dataPtrs1 (align1->GetNumSequences());
718 SafeVector<SafeVector<char>::iterator > dataPtrs2 (align2->GetNumSequences());
720 // grab character data
721 for (int i = 0; i < align1->GetNumSequences(); i++)
722 dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr();
723 for (int i = 0; i < align2->GetNumSequences(); i++)
724 dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr();
726 // the number of active sequences at any given column is defined to be the
727 // number of non-gap characters in that column; the number of gap opens at
728 // any given column is defined to be the number of gap characters in that
729 // column where the previous character in the respective sequence was not
731 SafeVector<int> numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1);
732 SafeVector<int> numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1);
734 // compute number of active sequences and gap opens for each group
735 for (int i = 0; i < align1->GetNumSequences(); i++){
736 SafeVector<char>::iterator dataPtr = align1->GetSequence(i)->GetDataPtr();
737 numActive1[0] = numGapOpens1[0] = 0;
738 for (int j = 1; j <= seq1Length; j++){
739 if (dataPtr[j] != '-'){
741 numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-');
745 for (int i = 0; i < align2->GetNumSequences(); i++){
746 SafeVector<char>::iterator dataPtr = align2->GetSequence(i)->GetDataPtr();
747 numActive2[0] = numGapOpens2[0] = 0;
748 for (int j = 1; j <= seq2Length; j++){
749 if (dataPtr[j] != '-'){
751 numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-');
756 VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1));
757 VF continuingPenalty1 (numSeqs1+1);
758 VVF openingPenalty2 (numSeqs1+1, VF (numSeqs2+1));
759 VF continuingPenalty2 (numSeqs2+1);
761 // precompute penalties
762 for (int i = 0; i <= numSeqs1; i++)
763 for (int j = 0; j <= numSeqs2; j++)
764 openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j));
765 for (int i = 0; i <= numSeqs1; i++)
766 continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
767 for (int i = 0; i <= numSeqs2; i++)
768 for (int j = 0; j <= numSeqs1; j++)
769 openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j));
770 for (int i = 0; i <= numSeqs2; i++)
771 continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
773 float *twoRows = new float[6*(seq2Length+1)]; assert (twoRows);
774 float *oldRowMatch = twoRows;
775 float *newRowMatch = twoRows + (seq2Length+1);
776 float *oldRowInsertX = twoRows + 2*(seq2Length+1);
777 float *newRowInsertX = twoRows + 3*(seq2Length+1);
778 float *oldRowInsertY = twoRows + 4*(seq2Length+1);
779 float *newRowInsertY = twoRows + 5*(seq2Length+1);
781 char *tracebackMatrix = new char[3*(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
782 char *tracebackPtr = tracebackMatrix;
784 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
787 for (int i = 0; i <= seq2Length; i++){
788 oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
789 oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]];
790 *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'Y';
795 for (int i = 1; i <= seq1Length; i++){
797 // initialize left column
798 newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
799 newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]];
801 *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'X';
804 // fill in rest of row
805 for (int j = 1; j <= seq2Length; j++){
807 // going to MATCH state
808 ChooseBestOfThree (oldRowMatch[j-1],
811 'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++);
812 newRowMatch[j] += *(posteriorPtr++);
814 // going to INSERT X state
815 ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
816 oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
817 oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
818 'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
820 // going to INSERT Y state
821 ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
822 newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
823 newRowInsertY[j-1] + continuingPenalty2[numActive2[j]],
824 'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++);
829 temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp;
830 temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp;
831 temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp;
837 ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length],
838 'M', 'X', 'Y', &total, &matrix);
843 SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
844 int r = seq1Length, c = seq2Length;
845 while (r != 0 || c != 0){
847 int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
848 char ch = tracebackMatrix[(r*(seq2Length+1) + c) * 3 + offset];
850 case 'Y': c--; alignment->push_back ('Y'); break;
851 case 'X': r--; alignment->push_back ('X'); break;
852 case 'M': c--; r--; alignment->push_back ('B'); break;
853 default: assert (false);
858 delete [] tracebackMatrix;
860 reverse (alignment->begin(), alignment->end());
862 return make_pair(alignment, 1.0f);
865 /////////////////////////////////////////////////////////////////
866 // ProbabilisticModel::ComputeViterbiAlignment()
868 // Computes the highest probability pairwise alignment using the
869 // probabilistic model. The final alignment is returned as a
870 // pair consisting of:
871 // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
872 // denote insertions in one of the two sequences and
873 // B's denote that both sequences are present (i.e.
875 // (2) a float containing the log probability of the best
876 // alignment (not used)
877 /////////////////////////////////////////////////////////////////
879 pair<SafeVector<char> *, float> ComputeViterbiAlignment (Sequence *seq1, Sequence *seq2) const {
884 const int seq1Length = seq1->GetLength();
885 const int seq2Length = seq2->GetLength();
887 // retrieve the points to the beginning of each sequence
888 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
889 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
891 // create viterbi matrix
892 VF *viterbiPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
894 VF &viterbi = *viterbiPtr;
896 // create traceback matrix
897 VI *tracebackPtr = new VI (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), -1);
898 assert (tracebackPtr);
899 VI &traceback = *tracebackPtr;
901 // initialization condition
902 for (int k = 0; k < NumMatrixTypes; k++)
903 viterbi[k] = initialDistribution[k];
905 // remember offset for each index combination
907 int i1j = -seq2Length - 1;
909 int i1j1 = -seq2Length - 2;
911 ij *= NumMatrixTypes;
912 i1j *= NumMatrixTypes;
913 ij1 *= NumMatrixTypes;
914 i1j1 *= NumMatrixTypes;
916 // compute viterbi scores
917 for (int i = 0; i <= seq1Length; i++){
918 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
919 for (int j = 0; j <= seq2Length; j++){
920 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
923 for (int k = 0; k < NumMatrixTypes; k++){
924 float newVal = viterbi[k + i1j1] + transProb[k][0] + matchProb[c1][c2];
925 if (viterbi[0 + ij] < newVal){
926 viterbi[0 + ij] = newVal;
927 traceback[0 + ij] = k;
932 for (int k = 0; k < NumInsertStates; k++){
933 float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + transProb[0][2*k+1];
934 float valFromIns = insProb[c1][k] + viterbi[2*k+1 + i1j] + transProb[2*k+1][2*k+1];
935 if (valFromMatch >= valFromIns){
936 viterbi[2*k+1 + ij] = valFromMatch;
937 traceback[2*k+1 + ij] = 0;
940 viterbi[2*k+1 + ij] = valFromIns;
941 traceback[2*k+1 + ij] = 2*k+1;
946 for (int k = 0; k < NumInsertStates; k++){
947 float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + transProb[0][2*k+2];
948 float valFromIns = insProb[c2][k] + viterbi[2*k+2 + ij1] + transProb[2*k+2][2*k+2];
949 if (valFromMatch >= valFromIns){
950 viterbi[2*k+2 + ij] = valFromMatch;
951 traceback[2*k+2 + ij] = 0;
954 viterbi[2*k+2 + ij] = valFromIns;
955 traceback[2*k+2 + ij] = 2*k+2;
960 ij += NumMatrixTypes;
961 i1j += NumMatrixTypes;
962 ij1 += NumMatrixTypes;
963 i1j1 += NumMatrixTypes;
967 // figure out best terminating cell
968 float bestProb = LOG_ZERO;
970 for (int k = 0; k < NumMatrixTypes; k++){
971 float thisProb = viterbi[k + NumMatrixTypes * ((seq1Length+1)*(seq2Length+1) - 1)] + initialDistribution[k];
972 if (bestProb < thisProb){
977 assert (state != -1);
982 SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
983 int r = seq1Length, c = seq2Length;
984 while (r != 0 || c != 0){
985 int newState = traceback[state + NumMatrixTypes * (r * (seq2Length+1) + c)];
987 if (state == 0){ c--; r--; alignment->push_back ('B'); }
988 else if (state % 2 == 1){ r--; alignment->push_back ('X'); }
989 else { c--; alignment->push_back ('Y'); }
996 reverse (alignment->begin(), alignment->end());
998 return make_pair(alignment, bestProb);
1001 /////////////////////////////////////////////////////////////////
1002 // ProbabilisticModel::BuildPosterior()
1004 // Builds a posterior probability matrix needed to align a pair
1005 // of alignments. Mathematically, the returned matrix M is
1006 // defined as follows:
1007 // M[i,j] = sum sum f(s,t,i,j)
1008 // s in align1 t in align2
1010 // [ P(s[i'] <--> t[j'])
1011 // [ if s[i'] is a letter in the ith column of align1 and
1012 // [ t[j'] it a letter in the jth column of align2
1016 /////////////////////////////////////////////////////////////////
1018 VF *BuildPosterior (MultiSequence *align1, MultiSequence *align2,
1019 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1020 float cutoff = 0.0f) const {
1021 const int seq1Length = align1->GetSequence(0)->GetLength();
1022 const int seq2Length = align2->GetSequence(0)->GetLength();
1024 VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1), 0); assert (posteriorPtr);
1025 VF &posterior = *posteriorPtr;
1026 VF::iterator postPtr = posterior.begin();
1028 // for each s in align1
1029 for (int i = 0; i < align1->GetNumSequences(); i++){
1030 int first = align1->GetSequence(i)->GetLabel();
1031 SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
1033 // for each t in align2
1034 for (int j = 0; j < align2->GetNumSequences(); j++){
1035 int second = align2->GetSequence(j)->GetLabel();
1036 SafeVector<int> *mapping2 = align2->GetSequence(j)->GetMapping();
1038 if (first < second){
1040 // get the associated sparse matrix
1041 SparseMatrix *matrix = sparseMatrices[first][second];
1043 for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){
1044 SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
1045 int base = (*mapping1)[ii] * (seq2Length+1);
1046 int rowSize = matrix->GetRowSize(ii);
1048 // add in all relevant values
1049 for (int jj = 0; jj < rowSize; jj++)
1050 posterior[base + (*mapping2)[row[jj].first]] += row[jj].second;
1053 for (int jj = 0; jj < matrix->GetSeq2Length(); jj++)
1054 posterior[base + (*mapping2)[jj]] -= cutoff;
1059 // get the associated sparse matrix
1060 SparseMatrix *matrix = sparseMatrices[second][first];
1062 for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){
1063 SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
1064 int base = (*mapping2)[jj];
1065 int rowSize = matrix->GetRowSize(jj);
1067 // add in all relevant values
1068 for (int ii = 0; ii < rowSize; ii++)
1069 posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second;
1072 for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
1073 posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff;
1085 return posteriorPtr;