1 /////////////////////////////////////////////////////////////////
2 // ProbabilisticModel.h
4 // Routines for (1) posterior probability computations
5 // (2) chained anchoring
6 // (3) maximum weight trace alignment
7 /////////////////////////////////////////////////////////////////
9 #ifndef PROBABILISTICMODEL_H
10 #define PROBABILISTICMODEL_H
15 #include "SafeVector.h"
16 #include "ScoreType.h"
17 #include "SparseMatrix.h"
18 #include "MultiSequence.h"
22 const int NumMatchStates = 1; // note that in this version the number
23 // of match states is fixed at 1...will
24 // change in future versions
25 const int NumInsertStates = 2;
26 const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2;
28 /////////////////////////////////////////////////////////////////
31 // Class for storing the parameters of a probabilistic model and
32 // performing different computations based on those parameters.
33 // In particular, this class handles the computation of
34 // posterior probabilities that may be used in alignment.
35 /////////////////////////////////////////////////////////////////
37 class ProbabilisticModel {
39 float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state
40 float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities
41 float matchProb[256][256]; // emission probabilities for match states
42 float insProb[256][NumMatrixTypes]; // emission probabilities for insert states
46 /////////////////////////////////////////////////////////////////
47 // ProbabilisticModel::ProbabilisticModel()
49 // Constructor. Builds a new probabilistic model using the
51 /////////////////////////////////////////////////////////////////
53 ProbabilisticModel(const VF &initDistribMat, const VF &gapOpen,
54 const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle) {
56 // build transition matrix
57 VVF transMat(NumMatrixTypes, VF(NumMatrixTypes, 0.0f));
59 for (int i = 0; i < NumInsertStates; i++) {
60 transMat[0][2 * i + 1] = gapOpen[2 * i];
61 transMat[0][2 * i + 2] = gapOpen[2 * i + 1];
62 transMat[0][0] -= (gapOpen[2 * i] + gapOpen[2 * i + 1]);
63 assert(transMat[0][0] > 0);
64 transMat[2 * i + 1][2 * i + 1] = gapExtend[2 * i];
65 transMat[2 * i + 2][2 * i + 2] = gapExtend[2 * i + 1];
66 transMat[2 * i + 1][2 * i + 2] = 0;
67 transMat[2 * i + 2][2 * i + 1] = 0;
68 transMat[2 * i + 1][0] = 1 - gapExtend[2 * i];
69 transMat[2 * i + 2][0] = 1 - gapExtend[2 * i + 1];
72 // create initial and transition probability matrices
73 for (int i = 0; i < NumMatrixTypes; i++) {
74 initialDistribution[i] = LOG(initDistribMat[i]);
75 for (int j = 0; j < NumMatrixTypes; j++)
76 transProb[i][j] = LOG(transMat[i][j]);
79 // create insertion and match probability matrices
80 for (int i = 0; i < 256; i++) {
81 for (int j = 0; j < NumMatrixTypes; j++)
82 insProb[i][j] = LOG(emitSingle[i]);
83 for (int j = 0; j < 256; j++)
84 matchProb[i][j] = LOG(emitPairs[i][j]);
88 /////////////////////////////////////////////////////////////////
89 // ProbabilisticModel::ComputeForwardMatrix()
91 // Computes a set of forward probability matrices for aligning
94 // For efficiency reasons, a single-dimensional floating-point
95 // array is used here, with the following indexing scheme:
97 // forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
98 // refers to the probability of aligning through j characters
99 // of the first sequence, k characters of the second sequence,
100 // and ending in state i.
101 /////////////////////////////////////////////////////////////////
103 VF *ComputeForwardMatrix(Sequence *seq1, Sequence *seq2) const {
108 const int seq1Length = seq1->GetLength();
109 const int seq2Length = seq2->GetLength();
111 // retrieve the points to the beginning of each sequence
112 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
113 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
116 VF *forwardPtr = new VF(
117 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
119 VF &forward = *forwardPtr;
121 // initialization condition
122 forward[0 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)] =
123 initialDistribution[0]
124 + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
126 for (int k = 0; k < NumInsertStates; k++) {
127 forward[2 * k + 1 + NumMatrixTypes * (1 * (seq2Length + 1) + 0)] =
128 initialDistribution[2 * k + 1]
129 + insProb[(unsigned char) iter1[1]][k];
130 forward[2 * k + 2 + NumMatrixTypes * (0 * (seq2Length + 1) + 1)] =
131 initialDistribution[2 * k + 2]
132 + insProb[(unsigned char) iter2[1]][k];
135 // remember offset for each index combination
137 int i1j = -seq2Length - 1;
139 int i1j1 = -seq2Length - 2;
141 ij *= NumMatrixTypes;
142 i1j *= NumMatrixTypes;
143 ij1 *= NumMatrixTypes;
144 i1j1 *= NumMatrixTypes;
146 // compute forward scores
147 for (int i = 0; i <= seq1Length; i++) {
148 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
149 for (int j = 0; j <= seq2Length; j++) {
150 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
152 if (i > 1 || j > 1) {
153 if (i > 0 && j > 0) {
154 forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
155 for (int k = 1; k < NumMatrixTypes; k++)
156 LOG_PLUS_EQUALS(forward[0 + ij],
157 forward[k + i1j1] + transProb[k][0]);
158 forward[0 + ij] += matchProb[c1][c2];
161 for (int k = 0; k < NumInsertStates; k++)
162 forward[2 * k + 1 + ij] = insProb[c1][k]
165 + transProb[0][2 * k + 1],
166 forward[2 * k + 1 + i1j]
167 + transProb[2 * k + 1][2 * k
171 for (int k = 0; k < NumInsertStates; k++)
172 forward[2 * k + 2 + ij] = insProb[c2][k]
175 + transProb[0][2 * k + 2],
176 forward[2 * k + 2 + ij1]
177 + transProb[2 * k + 2][2 * k
182 ij += NumMatrixTypes;
183 i1j += NumMatrixTypes;
184 ij1 += NumMatrixTypes;
185 i1j1 += NumMatrixTypes;
192 /////////////////////////////////////////////////////////////////
193 // ProbabilisticModel::ComputeBackwardMatrix()
195 // Computes a set of backward probability matrices for aligning
198 // For efficiency reasons, a single-dimensional floating-point
199 // array is used here, with the following indexing scheme:
201 // backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
202 // refers to the probability of starting in state i and
203 // aligning from character j+1 to the end of the first
204 // sequence and from character k+1 to the end of the second
206 /////////////////////////////////////////////////////////////////
208 VF *ComputeBackwardMatrix(Sequence *seq1, Sequence *seq2) const {
213 const int seq1Length = seq1->GetLength();
214 const int seq2Length = seq2->GetLength();
215 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
216 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
219 VF *backwardPtr = new VF(
220 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
222 VF &backward = *backwardPtr;
224 // initialization condition
225 for (int k = 0; k < NumMatrixTypes; k++)
226 backward[NumMatrixTypes * ((seq1Length + 1) * (seq2Length + 1) - 1)
227 + k] = initialDistribution[k];
229 // remember offset for each index combination
230 int ij = (seq1Length + 1) * (seq2Length + 1) - 1;
231 int i1j = ij + seq2Length + 1;
233 int i1j1 = ij + seq2Length + 2;
235 ij *= NumMatrixTypes;
236 i1j *= NumMatrixTypes;
237 ij1 *= NumMatrixTypes;
238 i1j1 *= NumMatrixTypes;
240 // compute backward scores
241 for (int i = seq1Length; i >= 0; i--) {
243 (i == seq1Length) ? '~' : (unsigned char) iter1[i + 1];
244 for (int j = seq2Length; j >= 0; j--) {
246 (j == seq2Length) ? '~' : (unsigned char) iter2[j + 1];
248 if (i < seq1Length && j < seq2Length) {
249 const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
250 for (int k = 0; k < NumMatrixTypes; k++)
251 LOG_PLUS_EQUALS(backward[k + ij],
252 ProbXY + transProb[k][0]);
254 if (i < seq1Length) {
255 for (int k = 0; k < NumInsertStates; k++) {
256 LOG_PLUS_EQUALS(backward[0 + ij],
257 backward[2 * k + 1 + i1j] + insProb[c1][k]
258 + transProb[0][2 * k + 1]);
259 LOG_PLUS_EQUALS(backward[2 * k + 1 + ij],
260 backward[2 * k + 1 + i1j] + insProb[c1][k]
261 + transProb[2 * k + 1][2 * k + 1]);
264 if (j < seq2Length) {
265 for (int k = 0; k < NumInsertStates; k++) {
266 LOG_PLUS_EQUALS(backward[0 + ij],
267 backward[2 * k + 2 + ij1] + insProb[c2][k]
268 + transProb[0][2 * k + 2]);
269 LOG_PLUS_EQUALS(backward[2 * k + 2 + ij],
270 backward[2 * k + 2 + ij1] + insProb[c2][k]
271 + transProb[2 * k + 2][2 * k + 2]);
275 ij -= NumMatrixTypes;
276 i1j -= NumMatrixTypes;
277 ij1 -= NumMatrixTypes;
278 i1j1 -= NumMatrixTypes;
285 /////////////////////////////////////////////////////////////////
286 // ProbabilisticModel::ComputeTotalProbability()
288 // Computes the total probability of an alignment given
289 // the forward and backward matrices.
290 /////////////////////////////////////////////////////////////////
292 float ComputeTotalProbability(int seq1Length, int seq2Length,
293 const VF &forward, const VF &backward) const {
295 // compute total probability
296 float totalForwardProb = LOG_ZERO;
297 float totalBackwardProb = LOG_ZERO;
298 for (int k = 0; k < NumMatrixTypes; k++) {
299 LOG_PLUS_EQUALS(totalForwardProb,
302 * ((seq1Length + 1) * (seq2Length + 1) - 1)]
306 * (seq2Length + 1) - 1)]);
309 totalBackwardProb = forward[0
310 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)]
311 + backward[0 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)];
313 for (int k = 0; k < NumInsertStates; k++) {
314 LOG_PLUS_EQUALS(totalBackwardProb,
316 + NumMatrixTypes * (1 * (seq2Length + 1) + 0)]
319 * (1 * (seq2Length + 1) + 0)]);
320 LOG_PLUS_EQUALS(totalBackwardProb,
322 + NumMatrixTypes * (0 * (seq2Length + 1) + 1)]
325 * (0 * (seq2Length + 1) + 1)]);
328 // cerr << totalForwardProb << " " << totalBackwardProb << endl;
330 return (totalForwardProb + totalBackwardProb) / 2;
333 /////////////////////////////////////////////////////////////////
334 // ProbabilisticModel::ComputePosteriorMatrix()
336 // Computes the posterior probability matrix based on
337 // the forward and backward matrices.
338 /////////////////////////////////////////////////////////////////
340 VF *ComputePosteriorMatrix(Sequence *seq1, Sequence *seq2,
341 const VF &forward, const VF &backward) const {
346 const int seq1Length = seq1->GetLength();
347 const int seq2Length = seq2->GetLength();
349 float totalProb = ComputeTotalProbability(seq1Length, seq2Length,
352 // compute posterior matrices
353 VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1));
354 assert(posteriorPtr);
355 VF &posterior = *posteriorPtr;
358 if (totalProb == 0) {
361 VF::iterator ptr = posterior.begin();
363 for (int i = 0; i <= seq1Length; i++) {
364 for (int j = 0; j <= seq2Length; j++) {
366 min(LOG_ONE, forward[ij] + backward[ij] - totalProb));
367 ij += NumMatrixTypes;
377 /////////////////////////////////////////////////////////////////
378 // ProbabilisticModel::ComputeExpectedCounts()
380 // Computes the expected counts for the various transitions.
381 /////////////////////////////////////////////////////////////////
383 VVF *ComputeExpectedCounts () const {
388 const int seq1Length = seq1->GetLength();
389 const int seq2Length = seq2->GetLength();
390 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
391 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
393 // compute total probability
394 float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
397 // initialize expected counts
398 VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr);
399 VVF &counts = *countsPtr;
401 // remember offset for each index combination
403 int i1j = -seq2Length - 1;
405 int i1j1 = -seq2Length - 2;
407 ij *= NumMatrixTypes;
408 i1j *= NumMatrixTypes;
409 ij1 *= NumMatrixTypes;
410 i1j1 *= NumMatrixTypes;
412 // compute expected counts
413 for (int i = 0; i <= seq1Length; i++){
414 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
415 for (int j = 0; j <= seq2Length; j++){
416 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
419 for (int k = 0; k < NumMatrixTypes; k++)
420 LOG_PLUS_EQUALS (counts[k][0],
421 forward[k + i1j1] + transProb[k][0] +
422 matchProb[c1][c2] + backward[0 + ij]);
425 for (int k = 0; k < NumInsertStates; k++){
426 LOG_PLUS_EQUALS (counts[0][2*k+1],
427 forward[0 + i1j] + transProb[0][2*k+1] +
428 insProb[c1][k] + backward[2*k+1 + ij]);
429 LOG_PLUS_EQUALS (counts[2*k+1][2*k+1],
430 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
431 insProb[c1][k] + backward[2*k+1 + ij]);
435 for (int k = 0; k < NumInsertStates; k++){
436 LOG_PLUS_EQUALS (counts[0][2*k+2],
437 forward[0 + ij1] + transProb[0][2*k+2] +
438 insProb[c2][k] + backward[2*k+2 + ij]);
439 LOG_PLUS_EQUALS (counts[2*k+2][2*k+2],
440 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
441 insProb[c2][k] + backward[2*k+2 + ij]);
445 ij += NumMatrixTypes;
446 i1j += NumMatrixTypes;
447 ij1 += NumMatrixTypes;
448 i1j1 += NumMatrixTypes;
452 // scale all expected counts appropriately
453 for (int i = 0; i < NumMatrixTypes; i++)
454 for (int j = 0; j < NumMatrixTypes; j++)
455 counts[i][j] -= totalProb;
460 /////////////////////////////////////////////////////////////////
461 // ProbabilisticModel::ComputeNewParameters()
463 // Computes a new parameter set based on the expected counts
465 /////////////////////////////////////////////////////////////////
466 void ComputeNewParameters(Sequence *seq1, Sequence *seq2, const VF &forward,
467 const VF &backward, VF &initDistribMat, VF &gapOpen, VF &gapExtend,
468 VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
473 const int seq1Length = seq1->GetLength();
474 const int seq2Length = seq2->GetLength();
475 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
476 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
478 // compute total probability
479 float totalProb = ComputeTotalProbability(seq1Length, seq2Length,
482 // initialize expected counts
483 VVF transCounts(NumMatrixTypes, VF(NumMatrixTypes, LOG_ZERO));
484 VF initCounts(NumMatrixTypes, LOG_ZERO);
485 VVF pairCounts(256, VF(256, LOG_ZERO));
486 VF singleCounts(256, LOG_ZERO);
488 // remember offset for each index combination
490 int i1j = -seq2Length - 1;
492 int i1j1 = -seq2Length - 2;
494 ij *= NumMatrixTypes;
495 i1j *= NumMatrixTypes;
496 ij1 *= NumMatrixTypes;
497 i1j1 *= NumMatrixTypes;
499 // compute initial distribution posteriors
500 initCounts[0] = LOG_ADD(
501 forward[0 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)]
503 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)],
506 * ((seq1Length + 1) * (seq2Length + 1) - 1)]
509 * ((seq1Length + 1) * (seq2Length + 1)
511 for (int k = 0; k < NumInsertStates; k++) {
512 initCounts[2 * k + 1] = LOG_ADD(
514 + NumMatrixTypes * (1 * (seq2Length + 1) + 0)]
517 * (1 * (seq2Length + 1) + 0)],
520 * ((seq1Length + 1) * (seq2Length + 1) - 1)]
524 * (seq2Length + 1) - 1)]);
525 initCounts[2 * k + 2] = LOG_ADD(
527 + NumMatrixTypes * (0 * (seq2Length + 1) + 1)]
530 * (0 * (seq2Length + 1) + 1)],
533 * ((seq1Length + 1) * (seq2Length + 1) - 1)]
537 * (seq2Length + 1) - 1)]);
540 // compute expected counts
541 for (int i = 0; i <= seq1Length; i++) {
543 (i == 0) ? '~' : (unsigned char) toupper(iter1[i]);
544 for (int j = 0; j <= seq2Length; j++) {
546 (j == 0) ? '~' : (unsigned char) toupper(iter2[j]);
548 if (i > 0 && j > 0) {
549 if (enableTrainEmissions && i == 1 && j == 1) {
550 LOG_PLUS_EQUALS(pairCounts[c1][c2],
551 initialDistribution[0] + matchProb[c1][c2]
553 LOG_PLUS_EQUALS(pairCounts[c2][c1],
554 initialDistribution[0] + matchProb[c2][c1]
558 for (int k = 0; k < NumMatrixTypes; k++) {
559 LOG_PLUS_EQUALS(transCounts[k][0],
560 forward[k + i1j1] + transProb[k][0]
561 + matchProb[c1][c2] + backward[0 + ij]);
562 if (enableTrainEmissions && (i != 1 || j != 1)) {//adding parentheses by Liu Yongchao, 5 Mar, 2010
563 LOG_PLUS_EQUALS(pairCounts[c1][c2],
564 forward[k + i1j1] + transProb[k][0]
567 LOG_PLUS_EQUALS(pairCounts[c2][c1],
568 forward[k + i1j1] + transProb[k][0]
575 for (int k = 0; k < NumInsertStates; k++) {
576 LOG_PLUS_EQUALS(transCounts[0][2 * k + 1],
577 forward[0 + i1j] + transProb[0][2 * k + 1]
579 + backward[2 * k + 1 + ij]);
580 LOG_PLUS_EQUALS(transCounts[2 * k + 1][2 * k + 1],
581 forward[2 * k + 1 + i1j]
582 + transProb[2 * k + 1][2 * k + 1]
584 + backward[2 * k + 1 + ij]);
585 if (enableTrainEmissions) {
586 if (i == 1 && j == 0) {
587 LOG_PLUS_EQUALS(singleCounts[c1],
588 initialDistribution[2 * k + 1]
590 + backward[2 * k + 1 + ij]);
592 LOG_PLUS_EQUALS(singleCounts[c1],
594 + transProb[0][2 * k + 1]
596 + backward[2 * k + 1 + ij]);
597 LOG_PLUS_EQUALS(singleCounts[c1],
598 forward[2 * k + 1 + i1j]
599 + transProb[2 * k + 1][2 * k + 1]
601 + backward[2 * k + 1 + ij]);
607 for (int k = 0; k < NumInsertStates; k++) {
608 LOG_PLUS_EQUALS(transCounts[0][2 * k + 2],
609 forward[0 + ij1] + transProb[0][2 * k + 2]
611 + backward[2 * k + 2 + ij]);
612 LOG_PLUS_EQUALS(transCounts[2 * k + 2][2 * k + 2],
613 forward[2 * k + 2 + ij1]
614 + transProb[2 * k + 2][2 * k + 2]
616 + backward[2 * k + 2 + ij]);
617 if (enableTrainEmissions) {
618 if (i == 0 && j == 1) {
619 LOG_PLUS_EQUALS(singleCounts[c2],
620 initialDistribution[2 * k + 2]
622 + backward[2 * k + 2 + ij]);
624 LOG_PLUS_EQUALS(singleCounts[c2],
626 + transProb[0][2 * k + 2]
628 + backward[2 * k + 2 + ij]);
629 LOG_PLUS_EQUALS(singleCounts[c2],
630 forward[2 * k + 2 + ij1]
631 + transProb[2 * k + 2][2 * k + 2]
633 + backward[2 * k + 2 + ij]);
639 ij += NumMatrixTypes;
640 i1j += NumMatrixTypes;
641 ij1 += NumMatrixTypes;
642 i1j1 += NumMatrixTypes;
646 // scale all expected counts appropriately
647 for (int i = 0; i < NumMatrixTypes; i++) {
648 initCounts[i] -= totalProb;
649 for (int j = 0; j < NumMatrixTypes; j++)
650 transCounts[i][j] -= totalProb;
652 if (enableTrainEmissions) {
653 for (int i = 0; i < 256; i++) {
654 for (int j = 0; j < 256; j++)
655 pairCounts[i][j] -= totalProb;
656 singleCounts[i] -= totalProb;
660 // compute new initial distribution
661 float totalInitDistribCounts = 0;
662 for (int i = 0; i < NumMatrixTypes; i++)
663 totalInitDistribCounts += exp(initCounts[i]); // should be 2
664 initDistribMat[0] = min(1.0f,
665 max(0.0f, (float) exp(initCounts[0]) / totalInitDistribCounts));
666 for (int k = 0; k < NumInsertStates; k++) {
668 (exp(initCounts[2 * k + 1]) + exp(initCounts[2 * k + 2]))
670 initDistribMat[2 * k + 1] = initDistribMat[2 * k + 2] = min(1.0f,
671 max(0.0f, val / totalInitDistribCounts));
674 // compute total counts for match state
675 float inMatchStateCounts = 0;
676 for (int i = 0; i < NumMatrixTypes; i++)
677 inMatchStateCounts += exp(transCounts[0][i]);
678 for (int i = 0; i < NumInsertStates; i++) {
680 // compute total counts for gap state
681 float inGapStateCounts = exp(transCounts[2 * i + 1][0])
682 + exp(transCounts[2 * i + 1][2 * i + 1])
683 + exp(transCounts[2 * i + 2][0])
684 + exp(transCounts[2 * i + 2][2 * i + 2]);
686 gapOpen[2 * i] = gapOpen[2 * i + 1] = (exp(
687 transCounts[0][2 * i + 1]) + exp(transCounts[0][2 * i + 2]))
688 / (2 * inMatchStateCounts);
690 gapExtend[2 * i] = gapExtend[2 * i + 1] = (exp(
691 transCounts[2 * i + 1][2 * i + 1])
692 + exp(transCounts[2 * i + 2][2 * i + 2]))
696 if (enableTrainEmissions) {
697 float totalPairCounts = 0;
698 float totalSingleCounts = 0;
699 for (int i = 0; i < 256; i++) {
700 for (int j = 0; j <= i; j++)
701 totalPairCounts += exp(pairCounts[j][i]);
702 totalSingleCounts += exp(singleCounts[i]);
705 for (int i = 0; i < 256; i++)
706 if (!islower((char) i)) {
707 int li = (int) ((unsigned char) tolower((char) i));
708 for (int j = 0; j <= i; j++)
709 if (!islower((char) j)) {
710 int lj = (int) ((unsigned char) tolower((char) j));
723 emitSingle[i] = emitSingle[li] = exp(singleCounts[i])
729 /////////////////////////////////////////////////////////////////
730 // ProbabilisticModel::ComputeAlignment()
732 // Computes an alignment based on the given posterior matrix.
733 // This is done by finding the maximum summing path (or
734 // maximum weight trace) through the posterior matrix. The
735 // final alignment is returned as a pair consisting of:
736 // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
737 // denote insertions in one of the two sequences and
738 // B's denote that both sequences are present (i.e.
740 // (2) a float indicating the sum achieved
741 /////////////////////////////////////////////////////////////////
743 pair<SafeVector<char> *, float> ComputeAlignment(int seq1Length,
744 int seq2Length, const VF &posterior) const {
746 float *twoRows = new float[(seq2Length + 1) * 2];
748 float *oldRow = twoRows;
749 float *newRow = twoRows + seq2Length + 1;
751 char *tracebackMatrix = new char[(seq1Length + 1) * (seq2Length + 1)];
752 assert(tracebackMatrix);
753 char *tracebackPtr = tracebackMatrix;
755 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
758 for (int i = 0; i <= seq2Length; i++) {
760 *(tracebackPtr++) = 'L';
764 for (int i = 1; i <= seq1Length; i++) {
766 // initialize left column
769 *(tracebackPtr++) = 'U';
771 // fill in rest of row
772 for (int j = 1; j <= seq2Length; j++) {
773 ChooseBestOfThree(*(posteriorPtr++) + oldRow[j - 1],
774 newRow[j - 1], oldRow[j], 'D', 'L', 'U', &newRow[j],
779 float *temp = oldRow;
785 float total = oldRow[seq2Length];
789 SafeVector<char> *alignment = new SafeVector<char>;
791 int r = seq1Length, c = seq2Length;
792 while (r != 0 || c != 0) {
793 char ch = tracebackMatrix[r * (seq2Length + 1) + c];
797 alignment->push_back('Y');
801 alignment->push_back('X');
806 alignment->push_back('B');
813 delete[] tracebackMatrix;
815 reverse(alignment->begin(), alignment->end());
817 return make_pair(alignment, total);
820 /////////////////////////////////////////////////////////////////
821 // ProbabilisticModel::ComputeAlignmentWithGapPenalties()
823 // Similar to ComputeAlignment() except with gap penalties.
824 /////////////////////////////////////////////////////////////////
826 pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties(
827 MultiSequence *align1, MultiSequence *align2, const VF &posterior,
828 int numSeqs1, int numSeqs2, float gapOpenPenalty,
829 float gapContinuePenalty) const {
830 int seq1Length = align1->GetSequence(0)->GetLength();
831 int seq2Length = align2->GetSequence(0)->GetLength();
832 SafeVector<SafeVector<char>::iterator> dataPtrs1(
833 align1->GetNumSequences());
834 SafeVector<SafeVector<char>::iterator> dataPtrs2(
835 align2->GetNumSequences());
837 // grab character data
838 for (int i = 0; i < align1->GetNumSequences(); i++)
839 dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr();
840 for (int i = 0; i < align2->GetNumSequences(); i++)
841 dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr();
843 // the number of active sequences at any given column is defined to be the
844 // number of non-gap characters in that column; the number of gap opens at
845 // any given column is defined to be the number of gap characters in that
846 // column where the previous character in the respective sequence was not
848 SafeVector<int> numActive1(seq1Length + 1), numGapOpens1(
850 SafeVector<int> numActive2(seq2Length + 1), numGapOpens2(
853 // compute number of active sequences and gap opens for each group
854 for (int i = 0; i < align1->GetNumSequences(); i++) {
855 SafeVector<char>::iterator dataPtr =
856 align1->GetSequence(i)->GetDataPtr();
857 numActive1[0] = numGapOpens1[0] = 0;
858 for (int j = 1; j <= seq1Length; j++) {
859 if (dataPtr[j] != '-') {
861 numGapOpens1[j] += (j != 1 && dataPtr[j - 1] != '-');
865 for (int i = 0; i < align2->GetNumSequences(); i++) {
866 SafeVector<char>::iterator dataPtr =
867 align2->GetSequence(i)->GetDataPtr();
868 numActive2[0] = numGapOpens2[0] = 0;
869 for (int j = 1; j <= seq2Length; j++) {
870 if (dataPtr[j] != '-') {
872 numGapOpens2[j] += (j != 1 && dataPtr[j - 1] != '-');
877 VVF openingPenalty1(numSeqs1 + 1, VF(numSeqs2 + 1));
878 VF continuingPenalty1(numSeqs1 + 1);
879 VVF openingPenalty2(numSeqs1 + 1, VF(numSeqs2 + 1));
880 VF continuingPenalty2(numSeqs2 + 1);
882 // precompute penalties
883 for (int i = 0; i <= numSeqs1; i++)
884 for (int j = 0; j <= numSeqs2; j++)
885 openingPenalty1[i][j] = i
886 * (gapOpenPenalty * j
887 + gapContinuePenalty * (numSeqs2 - j));
888 for (int i = 0; i <= numSeqs1; i++)
889 continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
890 for (int i = 0; i <= numSeqs2; i++)
891 for (int j = 0; j <= numSeqs1; j++)
892 openingPenalty2[i][j] = i
893 * (gapOpenPenalty * j
894 + gapContinuePenalty * (numSeqs1 - j));
895 for (int i = 0; i <= numSeqs2; i++)
896 continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
898 float *twoRows = new float[6 * (seq2Length + 1)];
900 float *oldRowMatch = twoRows;
901 float *newRowMatch = twoRows + (seq2Length + 1);
902 float *oldRowInsertX = twoRows + 2 * (seq2Length + 1);
903 float *newRowInsertX = twoRows + 3 * (seq2Length + 1);
904 float *oldRowInsertY = twoRows + 4 * (seq2Length + 1);
905 float *newRowInsertY = twoRows + 5 * (seq2Length + 1);
907 char *tracebackMatrix =
908 new char[3 * (seq1Length + 1) * (seq2Length + 1)];
909 assert(tracebackMatrix);
910 char *tracebackPtr = tracebackMatrix;
912 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
915 for (int i = 0; i <= seq2Length; i++) {
916 oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
921 + continuingPenalty2[numActive2[i]];
922 *(tracebackPtr) = *(tracebackPtr + 1) = *(tracebackPtr + 2) = 'Y';
927 for (int i = 1; i <= seq1Length; i++) {
929 // initialize left column
930 newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
931 newRowInsertX[0] = oldRowInsertX[0]
932 + continuingPenalty1[numActive1[i]];
934 *(tracebackPtr) = *(tracebackPtr + 1) = *(tracebackPtr + 2) = 'X';
937 // fill in rest of row
938 for (int j = 1; j <= seq2Length; j++) {
940 // going to MATCH state
941 ChooseBestOfThree(oldRowMatch[j - 1], oldRowInsertX[j - 1],
942 oldRowInsertY[j - 1], 'M', 'X', 'Y', &newRowMatch[j],
944 newRowMatch[j] += *(posteriorPtr++);
946 // going to INSERT X state
949 + openingPenalty1[numActive1[i]][numGapOpens2[j]],
950 oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
952 + openingPenalty1[numActive1[i]][numGapOpens2[j]],
953 'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
955 // going to INSERT Y state
958 + openingPenalty2[numActive2[j]][numGapOpens1[i]],
960 + openingPenalty2[numActive2[j]][numGapOpens1[i]],
962 + continuingPenalty2[numActive2[j]], 'M', 'X',
963 'Y', &newRowInsertY[j], tracebackPtr++);
969 oldRowMatch = newRowMatch;
971 temp = oldRowInsertX;
972 oldRowInsertX = newRowInsertX;
973 newRowInsertX = temp;
974 temp = oldRowInsertY;
975 oldRowInsertY = newRowInsertY;
976 newRowInsertY = temp;
982 ChooseBestOfThree(oldRowMatch[seq2Length], oldRowInsertX[seq2Length],
983 oldRowInsertY[seq2Length], 'M', 'X', 'Y', &total, &matrix);
988 SafeVector<char> *alignment = new SafeVector<char>;
990 int r = seq1Length, c = seq2Length;
991 while (r != 0 || c != 0) {
993 int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
994 char ch = tracebackMatrix[(r * (seq2Length + 1) + c) * 3 + offset];
998 alignment->push_back('Y');
1002 alignment->push_back('X');
1007 alignment->push_back('B');
1015 delete[] tracebackMatrix;
1017 reverse(alignment->begin(), alignment->end());
1019 return make_pair(alignment, 1.0f);
1022 /////////////////////////////////////////////////////////////////
1023 // ProbabilisticModel::ComputeViterbiAlignment()
1025 // Computes the highest probability pairwise alignment using the
1026 // probabilistic model. The final alignment is returned as a
1027 // pair consisting of:
1028 // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
1029 // denote insertions in one of the two sequences and
1030 // B's denote that both sequences are present (i.e.
1032 // (2) a float containing the log probability of the best
1033 // alignment (not used)
1034 /////////////////////////////////////////////////////////////////
1036 pair<SafeVector<char> *, float> ComputeViterbiAlignment(Sequence *seq1,
1037 Sequence *seq2) const {
1042 const int seq1Length = seq1->GetLength();
1043 const int seq2Length = seq2->GetLength();
1045 // retrieve the points to the beginning of each sequence
1046 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
1047 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
1049 // create viterbi matrix
1050 VF *viterbiPtr = new VF(
1051 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
1053 VF &viterbi = *viterbiPtr;
1055 // create traceback matrix
1056 VI *tracebackPtr = new VI(
1057 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), -1);
1058 assert(tracebackPtr);
1059 VI &traceback = *tracebackPtr;
1061 // initialization condition
1062 for (int k = 0; k < NumMatrixTypes; k++)
1063 viterbi[k] = initialDistribution[k];
1065 // remember offset for each index combination
1067 int i1j = -seq2Length - 1;
1069 int i1j1 = -seq2Length - 2;
1071 ij *= NumMatrixTypes;
1072 i1j *= NumMatrixTypes;
1073 ij1 *= NumMatrixTypes;
1074 i1j1 *= NumMatrixTypes;
1076 // compute viterbi scores
1077 for (int i = 0; i <= seq1Length; i++) {
1078 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
1079 for (int j = 0; j <= seq2Length; j++) {
1080 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
1082 if (i > 0 && j > 0) {
1083 for (int k = 0; k < NumMatrixTypes; k++) {
1084 float newVal = viterbi[k + i1j1] + transProb[k][0]
1085 + matchProb[c1][c2];
1086 if (viterbi[0 + ij] < newVal) {
1087 viterbi[0 + ij] = newVal;
1088 traceback[0 + ij] = k;
1093 for (int k = 0; k < NumInsertStates; k++) {
1094 float valFromMatch = insProb[c1][k] + viterbi[0 + i1j]
1095 + transProb[0][2 * k + 1];
1096 float valFromIns = insProb[c1][k]
1097 + viterbi[2 * k + 1 + i1j]
1098 + transProb[2 * k + 1][2 * k + 1];
1099 if (valFromMatch >= valFromIns) {
1100 viterbi[2 * k + 1 + ij] = valFromMatch;
1101 traceback[2 * k + 1 + ij] = 0;
1103 viterbi[2 * k + 1 + ij] = valFromIns;
1104 traceback[2 * k + 1 + ij] = 2 * k + 1;
1109 for (int k = 0; k < NumInsertStates; k++) {
1110 float valFromMatch = insProb[c2][k] + viterbi[0 + ij1]
1111 + transProb[0][2 * k + 2];
1112 float valFromIns = insProb[c2][k]
1113 + viterbi[2 * k + 2 + ij1]
1114 + transProb[2 * k + 2][2 * k + 2];
1115 if (valFromMatch >= valFromIns) {
1116 viterbi[2 * k + 2 + ij] = valFromMatch;
1117 traceback[2 * k + 2 + ij] = 0;
1119 viterbi[2 * k + 2 + ij] = valFromIns;
1120 traceback[2 * k + 2 + ij] = 2 * k + 2;
1125 ij += NumMatrixTypes;
1126 i1j += NumMatrixTypes;
1127 ij1 += NumMatrixTypes;
1128 i1j1 += NumMatrixTypes;
1132 // figure out best terminating cell
1133 float bestProb = LOG_ZERO;
1135 for (int k = 0; k < NumMatrixTypes; k++) {
1139 * ((seq1Length + 1) * (seq2Length + 1) - 1)]
1140 + initialDistribution[k];
1141 if (bestProb < thisProb) {
1142 bestProb = thisProb;
1146 assert(state != -1);
1150 // compute traceback
1151 SafeVector<char> *alignment = new SafeVector<char>;
1153 int r = seq1Length, c = seq2Length;
1154 while (r != 0 || c != 0) {
1155 int newState = traceback[state
1156 + NumMatrixTypes * (r * (seq2Length + 1) + c)];
1161 alignment->push_back('B');
1162 } else if (state % 2 == 1) {
1164 alignment->push_back('X');
1167 alignment->push_back('Y');
1173 delete tracebackPtr;
1175 reverse(alignment->begin(), alignment->end());
1177 return make_pair(alignment, bestProb);
1180 /////////////////////////////////////////////////////////////////
1181 // ProbabilisticModel::BuildPosterior()
1183 // Builds a posterior probability matrix needed to align a pair
1184 // of alignments. Mathematically, the returned matrix M is
1185 // defined as follows:
1186 // M[i,j] = sum sum f(s,t,i,j)
1187 // s in align1 t in align2
1189 // [ P(s[i'] <--> t[j'])
1190 // [ if s[i'] is a letter in the ith column of align1 and
1191 // [ t[j'] it a letter in the jth column of align2
1195 /////////////////////////////////////////////////////////////////
1197 VF *BuildPosterior(MultiSequence *align1, MultiSequence *align2,
1198 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1199 float cutoff = 0.0f) const {
1200 const int seq1Length = align1->GetSequence(0)->GetLength();
1201 const int seq2Length = align2->GetSequence(0)->GetLength();
1203 VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0);
1204 assert(posteriorPtr);
1205 VF &posterior = *posteriorPtr;
1206 VF::iterator postPtr = posterior.begin();
1208 // for each s in align1
1209 for (int i = 0; i < align1->GetNumSequences(); i++) {
1210 int first = align1->GetSequence(i)->GetLabel();
1211 SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
1213 // for each t in align2
1214 for (int j = 0; j < align2->GetNumSequences(); j++) {
1215 int second = align2->GetSequence(j)->GetLabel();
1216 SafeVector<int> *mapping2 =
1217 align2->GetSequence(j)->GetMapping();
1219 if (first < second) {
1221 // get the associated sparse matrix
1222 SparseMatrix *matrix = sparseMatrices[first][second];
1224 for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) {
1225 SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
1226 int base = (*mapping1)[ii] * (seq2Length + 1);
1227 int rowSize = matrix->GetRowSize(ii);
1229 // add in all relevant values
1230 for (int jj = 0; jj < rowSize; jj++)
1231 posterior[base + (*mapping2)[row[jj].first]] +=
1235 for (int jj = 0; jj < matrix->GetSeq2Length(); jj++)
1236 posterior[base + (*mapping2)[jj]] -= cutoff;
1241 // get the associated sparse matrix
1242 SparseMatrix *matrix = sparseMatrices[second][first];
1244 for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) {
1245 SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
1246 int base = (*mapping2)[jj];
1247 int rowSize = matrix->GetRowSize(jj);
1249 // add in all relevant values
1250 for (int ii = 0; ii < rowSize; ii++)
1252 + (*mapping1)[row[ii].first]
1253 * (seq2Length + 1)] +=
1257 for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
1258 posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -=
1270 return posteriorPtr;
1272 //added by Liu Yongchao.Feb 23, 2010
1273 VF *BuildPosterior(int* seqsWeights, MultiSequence *align1,
1274 MultiSequence *align2,
1275 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1276 float cutoff = 0.0f) const {
1277 const int seq1Length = align1->GetSequence(0)->GetLength();
1278 const int seq2Length = align2->GetSequence(0)->GetLength();
1280 VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0);
1281 assert(posteriorPtr);
1282 VF &posterior = *posteriorPtr;
1283 VF::iterator postPtr = posterior.begin();
1285 //compute the total sum of all weights
1286 float totalWeights = 0;
1287 for (int i = 0; i < align1->GetNumSequences(); i++) {
1288 int first = align1->GetSequence(i)->GetLabel();
1289 int w1 = seqsWeights[first];
1290 for (int j = 0; j < align2->GetNumSequences(); j++) {
1291 int second = align2->GetSequence(j)->GetLabel();
1292 int w2 = seqsWeights[second];
1294 totalWeights += w1 * w2;
1297 // for each s in align1
1298 for (int i = 0; i < align1->GetNumSequences(); i++) {
1299 int first = align1->GetSequence(i)->GetLabel();
1300 int w1 = seqsWeights[first];
1301 SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
1302 // for each t in align2
1303 for (int j = 0; j < align2->GetNumSequences(); j++) {
1304 int second = align2->GetSequence(j)->GetLabel();
1305 int w2 = seqsWeights[second];
1306 SafeVector<int> *mapping2 =
1307 align2->GetSequence(j)->GetMapping();
1309 float w = (float) (w1 * w2) / totalWeights;
1310 if (first < second) {
1312 // get the associated sparse matrix
1313 SparseMatrix *matrix = sparseMatrices[first][second];
1315 for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) {
1316 SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
1317 int base = (*mapping1)[ii] * (seq2Length + 1);
1318 int rowSize = matrix->GetRowSize(ii);
1320 // add in all relevant values
1321 for (int jj = 0; jj < rowSize; jj++)
1322 posterior[base + (*mapping2)[row[jj].first]] += w
1326 for (int jj = 0; jj < matrix->GetSeq2Length(); jj++)
1327 posterior[base + (*mapping2)[jj]] -= w * cutoff;
1332 // get the associated sparse matrix
1333 SparseMatrix *matrix = sparseMatrices[second][first];
1335 for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) {
1336 SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
1337 int base = (*mapping2)[jj];
1338 int rowSize = matrix->GetRowSize(jj);
1340 // add in all relevant values
1341 for (int ii = 0; ii < rowSize; ii++)
1343 + (*mapping1)[row[ii].first]
1344 * (seq2Length + 1)] += w
1348 for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
1349 posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -=
1361 return posteriorPtr;