1 /////////////////////////////////////////////////////////////////
2 // ProbabilisticModel.h
4 // Routines for (1) posterior probability computations
5 // (2) chained anchoring
6 // (3) maximum weight trace alignment
7 /////////////////////////////////////////////////////////////////
9 #ifndef PROBABILISTICMODEL_H
10 #define PROBABILISTICMODEL_H
15 #include "SafeVector.h"
16 #include "ScoreType.h"
17 #include "SparseMatrix.h"
18 #include "MultiSequence.h"
19 #include "StemCandidate.hpp"
26 const int NumMatchStates = 1; // note that in this version the number
27 // of match states is fixed at 1...will
28 // change in future versions
29 const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2;
31 /////////////////////////////////////////////////////////////////
34 // Class for storing the parameters of a probabilistic model and
35 // performing different computations based on those parameters.
36 // In particular, this class handles the computation of
37 // posterior probabilities that may be used in alignment.
38 /////////////////////////////////////////////////////////////////
40 class ProbabilisticModel {
42 float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state
43 float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities
44 float matchProb[256][256]; // emission probabilities for match states
45 float insProb[256][NumMatrixTypes]; // emission probabilities for insert states
50 /////////////////////////////////////////////////////////////////
51 // ProbabilisticModel::ProbabilisticModel()
53 // Constructor. Builds a new probabilistic model using the
55 /////////////////////////////////////////////////////////////////
57 ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend,
58 const VVF &emitPairs, const VF &emitSingle){
60 // build transition matrix
61 VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f));
63 for (int i = 0; i < NumInsertStates; i++){
64 transMat[0][2*i+1] = gapOpen[2*i];
65 transMat[0][2*i+2] = gapOpen[2*i+1];
66 transMat[0][0] -= (gapOpen[2*i] + gapOpen[2*i+1]);
67 assert (transMat[0][0] > 0);
68 transMat[2*i+1][2*i+1] = gapExtend[2*i];
69 transMat[2*i+2][2*i+2] = gapExtend[2*i+1];
70 transMat[2*i+1][2*i+2] = 0;
71 transMat[2*i+2][2*i+1] = 0;
72 transMat[2*i+1][0] = 1 - gapExtend[2*i];
73 transMat[2*i+2][0] = 1 - gapExtend[2*i+1];
76 // create initial and transition probability matrices
77 for (int i = 0; i < NumMatrixTypes; i++){
78 initialDistribution[i] = LOG (initDistribMat[i]);
79 for (int j = 0; j < NumMatrixTypes; j++)
80 transProb[i][j] = LOG (transMat[i][j]);
83 // create insertion and match probability matrices
84 for (int i = 0; i < 256; i++){
85 for (int j = 0; j < NumMatrixTypes; j++)
86 insProb[i][j] = LOG (emitSingle[i]);
87 for (int j = 0; j < 256; j++)
88 matchProb[i][j] = LOG (emitPairs[i][j]);
92 NRMat<float> weightMatchScore(std::vector<StemCandidate> *pscs1, std::vector<StemCandidate> *pscs2,
93 std::vector<int> *matchPSCS1, std::vector<int> *matchPSCS2, NRMat<float> WM) {
95 int size = matchPSCS1->size();
98 for(int iter = 0; iter < size; iter++) {
99 int i = matchPSCS1->at(iter);
100 int j = matchPSCS2->at(iter);
102 const StemCandidate &sc1 = pscs1->at(i);
103 const StemCandidate &sc2 = pscs2->at(j);
105 for(int k = 0; k < len; k++) {
106 WM[sc1.GetPosition() + k][sc2.GetPosition() + k] += weight;
107 // sumWeight += weight;
113 /////////////////////////////////////////////////////////////////
114 // ProbabilisticModel::ComputeForwardMatrix()
116 // Computes a set of forward probability matrices for aligning
119 // For efficiency reasons, a single-dimensional floating-point
120 // array is used here, with the following indexing scheme:
122 // forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
123 // refers to the probability of aligning through j characters
124 // of the first sequence, k characters of the second sequence,
125 // and ending in state i.
126 /////////////////////////////////////////////////////////////////
128 VF *ComputeForwardMatrix (Sequence *seq1, Sequence *seq2) const {
133 const int seq1Length = seq1->GetLength();
134 const int seq2Length = seq2->GetLength();
136 // retrieve the points to the beginning of each sequence
137 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
138 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
141 VF *forwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
143 VF &forward = *forwardPtr;
145 // initialization condition
146 forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] =
147 initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
149 for (int k = 0; k < NumInsertStates; k++){
150 forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] =
151 initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k];
152 forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] =
153 initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k];
156 // remember offset for each index combination
158 int i1j = -seq2Length - 1;
160 int i1j1 = -seq2Length - 2;
162 ij *= NumMatrixTypes;
163 i1j *= NumMatrixTypes;
164 ij1 *= NumMatrixTypes;
165 i1j1 *= NumMatrixTypes;
167 // compute forward scores
168 for (int i = 0; i <= seq1Length; i++){
169 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
170 for (int j = 0; j <= seq2Length; j++){
171 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
175 forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
176 for (int k = 1; k < NumMatrixTypes; k++)
177 LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]);
178 forward[0 + ij] += matchProb[c1][c2];
181 for (int k = 0; k < NumInsertStates; k++)
182 forward[2*k+1 + ij] = insProb[c1][k] +
183 LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1],
184 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1]);
187 for (int k = 0; k < NumInsertStates; k++)
188 forward[2*k+2 + ij] = insProb[c2][k] +
189 LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2],
190 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2]);
194 ij += NumMatrixTypes;
195 i1j += NumMatrixTypes;
196 ij1 += NumMatrixTypes;
197 i1j1 += NumMatrixTypes;
204 /////////////////////////////////////////////////////////////////
205 // ProbabilisticModel::ComputeBackwardMatrix()
207 // Computes a set of backward probability matrices for aligning
210 // For efficiency reasons, a single-dimensional floating-point
211 // array is used here, with the following indexing scheme:
213 // backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
214 // refers to the probability of starting in state i and
215 // aligning from character j+1 to the end of the first
216 // sequence and from character k+1 to the end of the second
218 /////////////////////////////////////////////////////////////////
220 VF *ComputeBackwardMatrix (Sequence *seq1, Sequence *seq2) const {
225 const int seq1Length = seq1->GetLength();
226 const int seq2Length = seq2->GetLength();
227 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
228 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
231 VF *backwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
232 assert (backwardPtr);
233 VF &backward = *backwardPtr;
235 // initialization condition
236 for (int k = 0; k < NumMatrixTypes; k++)
237 backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k];
239 // remember offset for each index combination
240 int ij = (seq1Length+1) * (seq2Length+1) - 1;
241 int i1j = ij + seq2Length + 1;
243 int i1j1 = ij + seq2Length + 2;
245 ij *= NumMatrixTypes;
246 i1j *= NumMatrixTypes;
247 ij1 *= NumMatrixTypes;
248 i1j1 *= NumMatrixTypes;
250 // compute backward scores
251 for (int i = seq1Length; i >= 0; i--){
252 unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1];
253 for (int j = seq2Length; j >= 0; j--){
254 unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1];
256 if (i < seq1Length && j < seq2Length){
257 const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
258 for (int k = 0; k < NumMatrixTypes; k++)
259 LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]);
262 for (int k = 0; k < NumInsertStates; k++){
263 LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[0][2*k+1]);
264 LOG_PLUS_EQUALS (backward[2*k+1 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[2*k+1][2*k+1]);
268 for (int k = 0; k < NumInsertStates; k++){
269 LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[0][2*k+2]);
270 LOG_PLUS_EQUALS (backward[2*k+2 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[2*k+2][2*k+2]);
274 ij -= NumMatrixTypes;
275 i1j -= NumMatrixTypes;
276 ij1 -= NumMatrixTypes;
277 i1j1 -= NumMatrixTypes;
284 /////////////////////////////////////////////////////////////////
285 // ProbabilisticModel::ComputeTotalProbability()
287 // Computes the total probability of an alignment given
288 // the forward and backward matrices.
289 /////////////////////////////////////////////////////////////////
291 float ComputeTotalProbability (int seq1Length, int seq2Length,
292 const VF &forward, const VF &backward) const {
294 // compute total probability
295 float totalForwardProb = LOG_ZERO;
296 float totalBackwardProb = LOG_ZERO;
297 for (int k = 0; k < NumMatrixTypes; k++){
298 LOG_PLUS_EQUALS (totalForwardProb,
299 forward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
300 backward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
304 forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
305 backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)];
307 for (int k = 0; k < NumInsertStates; k++){
308 LOG_PLUS_EQUALS (totalBackwardProb,
309 forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
310 backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)]);
311 LOG_PLUS_EQUALS (totalBackwardProb,
312 forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
313 backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)]);
316 // cerr << totalForwardProb << " " << totalBackwardProb << endl;
318 return (totalForwardProb + totalBackwardProb) / 2;
321 /////////////////////////////////////////////////////////////////
322 // ProbabilisticModel::ComputePosteriorMatrix()
324 // Computes the posterior probability matrix based on
325 // the forward and backward matrices.
326 /////////////////////////////////////////////////////////////////
328 VF *ComputePosteriorMatrix (Sequence *seq1, Sequence *seq2,
329 const VF &forward, const VF &backward) const {
334 const int seq1Length = seq1->GetLength();
335 const int seq2Length = seq2->GetLength();
337 float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
340 // compute posterior matrices
341 VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr);
342 VF &posterior = *posteriorPtr;
345 VF::iterator ptr = posterior.begin();
347 for (int i = 0; i <= seq1Length; i++){
348 for (int j = 0; j <= seq2Length; j++){
349 *(ptr++) = EXP (min (LOG_ONE, forward[ij] + backward[ij] - totalProb));
350 ij += NumMatrixTypes;
360 /////////////////////////////////////////////////////////////////
361 // ProbabilisticModel::ComputeExpectedCounts()
363 // Computes the expected counts for the various transitions.
364 /////////////////////////////////////////////////////////////////
366 VVF *ComputeExpectedCounts () const {
371 const int seq1Length = seq1->GetLength();
372 const int seq2Length = seq2->GetLength();
373 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
374 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
376 // compute total probability
377 float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
380 // initialize expected counts
381 VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr);
382 VVF &counts = *countsPtr;
384 // remember offset for each index combination
386 int i1j = -seq2Length - 1;
388 int i1j1 = -seq2Length - 2;
390 ij *= NumMatrixTypes;
391 i1j *= NumMatrixTypes;
392 ij1 *= NumMatrixTypes;
393 i1j1 *= NumMatrixTypes;
395 // compute expected counts
396 for (int i = 0; i <= seq1Length; i++){
397 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
398 for (int j = 0; j <= seq2Length; j++){
399 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
402 for (int k = 0; k < NumMatrixTypes; k++)
403 LOG_PLUS_EQUALS (counts[k][0],
404 forward[k + i1j1] + transProb[k][0] +
405 matchProb[c1][c2] + backward[0 + ij]);
408 for (int k = 0; k < NumInsertStates; k++){
409 LOG_PLUS_EQUALS (counts[0][2*k+1],
410 forward[0 + i1j] + transProb[0][2*k+1] +
411 insProb[c1][k] + backward[2*k+1 + ij]);
412 LOG_PLUS_EQUALS (counts[2*k+1][2*k+1],
413 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
414 insProb[c1][k] + backward[2*k+1 + ij]);
418 for (int k = 0; k < NumInsertStates; k++){
419 LOG_PLUS_EQUALS (counts[0][2*k+2],
420 forward[0 + ij1] + transProb[0][2*k+2] +
421 insProb[c2][k] + backward[2*k+2 + ij]);
422 LOG_PLUS_EQUALS (counts[2*k+2][2*k+2],
423 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
424 insProb[c2][k] + backward[2*k+2 + ij]);
428 ij += NumMatrixTypes;
429 i1j += NumMatrixTypes;
430 ij1 += NumMatrixTypes;
431 i1j1 += NumMatrixTypes;
435 // scale all expected counts appropriately
436 for (int i = 0; i < NumMatrixTypes; i++)
437 for (int j = 0; j < NumMatrixTypes; j++)
438 counts[i][j] -= totalProb;
443 /////////////////////////////////////////////////////////////////
444 // ProbabilisticModel::ComputeNewParameters()
446 // Computes a new parameter set based on the expected counts
448 /////////////////////////////////////////////////////////////////
450 void ComputeNewParameters (Sequence *seq1, Sequence *seq2,
451 const VF &forward, const VF &backward,
452 VF &initDistribMat, VF &gapOpen,
453 VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
458 const int seq1Length = seq1->GetLength();
459 const int seq2Length = seq2->GetLength();
460 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
461 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
463 // compute total probability
464 float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
467 // initialize expected counts
468 VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO));
469 VF initCounts (NumMatrixTypes, LOG_ZERO);
470 VVF pairCounts (256, VF (256, LOG_ZERO));
471 VF singleCounts (256, LOG_ZERO);
473 // remember offset for each index combination
475 int i1j = -seq2Length - 1;
477 int i1j1 = -seq2Length - 2;
479 ij *= NumMatrixTypes;
480 i1j *= NumMatrixTypes;
481 ij1 *= NumMatrixTypes;
482 i1j1 *= NumMatrixTypes;
484 // compute initial distribution posteriors
485 initCounts[0] = LOG_ADD (forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
486 backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)],
487 forward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
488 backward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
489 for (int k = 0; k < NumInsertStates; k++){
490 initCounts[2*k+1] = LOG_ADD (forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
491 backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)],
492 forward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
493 backward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
494 initCounts[2*k+2] = LOG_ADD (forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
495 backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)],
496 forward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
497 backward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
500 // compute expected counts
501 for (int i = 0; i <= seq1Length; i++){
502 unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper(iter1[i]);
503 for (int j = 0; j <= seq2Length; j++){
504 unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper(iter2[j]);
507 if (enableTrainEmissions && i == 1 && j == 1){
508 LOG_PLUS_EQUALS (pairCounts[c1][c2],
509 initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]);
510 LOG_PLUS_EQUALS (pairCounts[c2][c1],
511 initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]);
514 for (int k = 0; k < NumMatrixTypes; k++){
515 LOG_PLUS_EQUALS (transCounts[k][0],
516 forward[k + i1j1] + transProb[k][0] +
517 matchProb[c1][c2] + backward[0 + ij]);
518 if (enableTrainEmissions && i != 1 || j != 1){
519 LOG_PLUS_EQUALS (pairCounts[c1][c2],
520 forward[k + i1j1] + transProb[k][0] +
521 matchProb[c1][c2] + backward[0 + ij]);
522 LOG_PLUS_EQUALS (pairCounts[c2][c1],
523 forward[k + i1j1] + transProb[k][0] +
524 matchProb[c2][c1] + backward[0 + ij]);
529 for (int k = 0; k < NumInsertStates; k++){
530 LOG_PLUS_EQUALS (transCounts[0][2*k+1],
531 forward[0 + i1j] + transProb[0][2*k+1] +
532 insProb[c1][k] + backward[2*k+1 + ij]);
533 LOG_PLUS_EQUALS (transCounts[2*k+1][2*k+1],
534 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
535 insProb[c1][k] + backward[2*k+1 + ij]);
536 if (enableTrainEmissions){
537 if (i == 1 && j == 0){
538 LOG_PLUS_EQUALS (singleCounts[c1],
539 initialDistribution[2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]);
542 LOG_PLUS_EQUALS (singleCounts[c1],
543 forward[0 + i1j] + transProb[0][2*k+1] +
544 insProb[c1][k] + backward[2*k+1 + ij]);
545 LOG_PLUS_EQUALS (singleCounts[c1],
546 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
547 insProb[c1][k] + backward[2*k+1 + ij]);
553 for (int k = 0; k < NumInsertStates; k++){
554 LOG_PLUS_EQUALS (transCounts[0][2*k+2],
555 forward[0 + ij1] + transProb[0][2*k+2] +
556 insProb[c2][k] + backward[2*k+2 + ij]);
557 LOG_PLUS_EQUALS (transCounts[2*k+2][2*k+2],
558 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
559 insProb[c2][k] + backward[2*k+2 + ij]);
560 if (enableTrainEmissions){
561 if (i == 0 && j == 1){
562 LOG_PLUS_EQUALS (singleCounts[c2],
563 initialDistribution[2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]);
566 LOG_PLUS_EQUALS (singleCounts[c2],
567 forward[0 + ij1] + transProb[0][2*k+2] +
568 insProb[c2][k] + backward[2*k+2 + ij]);
569 LOG_PLUS_EQUALS (singleCounts[c2],
570 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
571 insProb[c2][k] + backward[2*k+2 + ij]);
577 ij += NumMatrixTypes;
578 i1j += NumMatrixTypes;
579 ij1 += NumMatrixTypes;
580 i1j1 += NumMatrixTypes;
584 // scale all expected counts appropriately
585 for (int i = 0; i < NumMatrixTypes; i++){
586 initCounts[i] -= totalProb;
587 for (int j = 0; j < NumMatrixTypes; j++)
588 transCounts[i][j] -= totalProb;
590 if (enableTrainEmissions){
591 for (int i = 0; i < 256; i++){
592 for (int j = 0; j < 256; j++)
593 pairCounts[i][j] -= totalProb;
594 singleCounts[i] -= totalProb;
598 // compute new initial distribution
599 float totalInitDistribCounts = 0;
600 for (int i = 0; i < NumMatrixTypes; i++)
601 totalInitDistribCounts += exp (initCounts[i]); // should be 2
602 initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts));
603 for (int k = 0; k < NumInsertStates; k++){
604 float val = (exp (initCounts[2*k+1]) + exp (initCounts[2*k+2])) / 2;
605 initDistribMat[2*k+1] = initDistribMat[2*k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts));
608 // compute total counts for match state
609 float inMatchStateCounts = 0;
610 for (int i = 0; i < NumMatrixTypes; i++)
611 inMatchStateCounts += exp (transCounts[0][i]);
612 for (int i = 0; i < NumInsertStates; i++){
614 // compute total counts for gap state
615 float inGapStateCounts =
616 exp (transCounts[2*i+1][0]) +
617 exp (transCounts[2*i+1][2*i+1]) +
618 exp (transCounts[2*i+2][0]) +
619 exp (transCounts[2*i+2][2*i+2]);
621 gapOpen[2*i] = gapOpen[2*i+1] =
622 (exp (transCounts[0][2*i+1]) +
623 exp (transCounts[0][2*i+2])) /
624 (2 * inMatchStateCounts);
626 gapExtend[2*i] = gapExtend[2*i+1] =
627 (exp (transCounts[2*i+1][2*i+1]) +
628 exp (transCounts[2*i+2][2*i+2])) /
632 if (enableTrainEmissions){
633 float totalPairCounts = 0;
634 float totalSingleCounts = 0;
635 for (int i = 0; i < 256; i++){
636 for (int j = 0; j <= i; j++)
637 totalPairCounts += exp (pairCounts[j][i]);
638 totalSingleCounts += exp (singleCounts[i]);
641 for (int i = 0; i < 256; i++) if (!islower ((char) i)){
642 int li = (int)((unsigned char) tolower ((char) i));
643 for (int j = 0; j <= i; j++) if (!islower ((char) j)){
644 int lj = (int)((unsigned char) tolower ((char) j));
645 emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] =
646 emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts;
648 emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts;
653 /////////////////////////////////////////////////////////////////
654 // ProbabilisticModel::ComputeAlignment()
656 // Computes an alignment based on the given posterior matrix.
657 // This is done by finding the maximum summing path (or
658 // maximum weight trace) through the posterior matrix. The
659 // final alignment is returned as a pair consisting of:
660 // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
661 // denote insertions in one of the two sequences and
662 // B's denote that both sequences are present (i.e.
664 // (2) a float indicating the sum achieved
665 /////////////////////////////////////////////////////////////////
667 pair<SafeVector<char> *, float> ComputeAlignment (int seq1Length, int seq2Length, const VF &posterior) const {
669 float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows);
670 float *oldRow = twoRows;
671 float *newRow = twoRows + seq2Length + 1;
673 char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
674 char *tracebackPtr = tracebackMatrix;
676 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
679 for (int i = 0; i <= seq2Length; i++){
681 *(tracebackPtr++) = 'L';
685 for (int i = 1; i <= seq1Length; i++){
687 // initialize left column
690 *(tracebackPtr++) = 'U';
692 // fill in rest of row
693 for (int j = 1; j <= seq2Length; j++){
694 ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1], newRow[j-1], oldRow[j],
695 'D', 'L', 'U', &newRow[j], tracebackPtr++); // Match, insert, delete
699 float *temp = oldRow;
705 float total = oldRow[seq2Length];
709 SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
710 int r = seq1Length, c = seq2Length;
711 while (r != 0 || c != 0){
712 char ch = tracebackMatrix[r*(seq2Length+1) + c];
714 case 'L': c--; alignment->push_back ('Y'); break;
715 case 'U': r--; alignment->push_back ('X'); break;
716 case 'D': c--; r--; alignment->push_back ('B'); break;
717 default: assert (false);
721 delete [] tracebackMatrix;
723 reverse (alignment->begin(), alignment->end());
725 return make_pair(alignment, total);
728 /////////////////////////////////////////////////////////////////
729 // ProbabilisticModel::ComputeAlignment2()
731 // Computes an alignment based on the given posterior matrix.
732 // This is done by finding the maximum summing path (or
733 // maximum weight trace) through the posterior matrix. The
734 // final alignment is returned as a pair consisting of:
735 // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
736 // denote insertions in one of the two sequences and
737 // B's denote that both sequences are present (i.e.
739 // (2) a float indicating the sum achieved
740 /////////////////////////////////////////////////////////////////
742 pair<SafeVector<char> *, float> ComputeAlignment2 (int seq1Length, int seq2Length,
743 const VF &posterior, std::vector<StemCandidate> *pscs1, std::vector<StemCandidate> *pscs2,
744 std::vector<int> *matchPSCS1, std::vector<int> *matchPSCS2) const {
745 NRMat<float> WM(seq1Length + 1, seq2Length + 1);
746 for (int i = 0; i <= seq1Length; i++) {
747 for (int j = 0; j <= seq2Length; j++) {
752 int len = WORDLENGTH;
753 int size = matchPSCS1->size();
756 for(int iter = 0; iter < size; iter++) {
757 int i = matchPSCS1->at(iter);
758 int j = matchPSCS2->at(iter);
760 const StemCandidate &sc1 = pscs1->at(i);
761 const StemCandidate &sc2 = pscs2->at(j);
762 for(int k = 0; k < len; k++) {
763 WM[sc1.GetPosition() + k][sc2.GetPosition() + k] += weight;
766 float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows);
767 float *oldRow = twoRows;
768 float *newRow = twoRows + seq2Length + 1;
770 char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
771 char *tracebackPtr = tracebackMatrix;
773 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
776 for (int i = 0; i <= seq2Length; i++){
778 *(tracebackPtr++) = 'L';
782 for (int i = 1; i <= seq1Length; i++){
784 // initialize left column
787 *(tracebackPtr++) = 'U';
789 // fill in rest of row
790 for (int j = 1; j <= seq2Length; j++){
791 ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1] + WM[i][j], newRow[j-1], oldRow[j],
792 'D', 'L', 'U', &newRow[j], tracebackPtr++);
796 float *temp = oldRow;
802 float total = oldRow[seq2Length];
806 SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
807 int r = seq1Length, c = seq2Length;
808 while (r != 0 || c != 0){
809 char ch = tracebackMatrix[r*(seq2Length+1) + c];
811 case 'L': c--; alignment->push_back ('Y'); break;
812 case 'U': r--; alignment->push_back ('X'); break;
813 case 'D': c--; r--; alignment->push_back ('B'); break;
814 default: assert (false);
818 delete [] tracebackMatrix;
820 reverse (alignment->begin(), alignment->end());
822 return make_pair(alignment, total);
825 /////////////////////////////////////////////////////////////////
826 // ProbabilisticModel::ComputeAlignmentWithGapPenalties()
828 // Similar to ComputeAlignment() except with gap penalties.
829 /////////////////////////////////////////////////////////////////
831 pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties (MultiSequence *align1,
832 MultiSequence *align2,
833 const VF &posterior, int numSeqs1,
835 float gapOpenPenalty,
836 float gapContinuePenalty) const {
837 int seq1Length = align1->GetSequence(0)->GetLength();
838 int seq2Length = align2->GetSequence(0)->GetLength();
839 SafeVector<SafeVector<char>::iterator > dataPtrs1 (align1->GetNumSequences());
840 SafeVector<SafeVector<char>::iterator > dataPtrs2 (align2->GetNumSequences());
842 // grab character data
843 for (int i = 0; i < align1->GetNumSequences(); i++)
844 dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr();
845 for (int i = 0; i < align2->GetNumSequences(); i++)
846 dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr();
848 // the number of active sequences at any given column is defined to be the
849 // number of non-gap characters in that column; the number of gap opens at
850 // any given column is defined to be the number of gap characters in that
851 // column where the previous character in the respective sequence was not
853 SafeVector<int> numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1);
854 SafeVector<int> numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1);
856 // compute number of active sequences and gap opens for each group
857 for (int i = 0; i < align1->GetNumSequences(); i++){
858 SafeVector<char>::iterator dataPtr = align1->GetSequence(i)->GetDataPtr();
859 numActive1[0] = numGapOpens1[0] = 0;
860 for (int j = 1; j <= seq1Length; j++){
861 if (dataPtr[j] != '-'){
863 numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-');
867 for (int i = 0; i < align2->GetNumSequences(); i++){
868 SafeVector<char>::iterator dataPtr = align2->GetSequence(i)->GetDataPtr();
869 numActive2[0] = numGapOpens2[0] = 0;
870 for (int j = 1; j <= seq2Length; j++){
871 if (dataPtr[j] != '-'){
873 numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-');
878 VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1));
879 VF continuingPenalty1 (numSeqs1+1);
880 VVF openingPenalty2 (numSeqs1+1, VF (numSeqs2+1));
881 VF continuingPenalty2 (numSeqs2+1);
883 // precompute penalties
884 for (int i = 0; i <= numSeqs1; i++)
885 for (int j = 0; j <= numSeqs2; j++)
886 openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j));
887 for (int i = 0; i <= numSeqs1; i++)
888 continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
889 for (int i = 0; i <= numSeqs2; i++)
890 for (int j = 0; j <= numSeqs1; j++)
891 openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j));
892 for (int i = 0; i <= numSeqs2; i++)
893 continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
895 float *twoRows = new float[6*(seq2Length+1)]; assert (twoRows);
896 float *oldRowMatch = twoRows;
897 float *newRowMatch = twoRows + (seq2Length+1);
898 float *oldRowInsertX = twoRows + 2*(seq2Length+1);
899 float *newRowInsertX = twoRows + 3*(seq2Length+1);
900 float *oldRowInsertY = twoRows + 4*(seq2Length+1);
901 float *newRowInsertY = twoRows + 5*(seq2Length+1);
903 char *tracebackMatrix = new char[3*(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
904 char *tracebackPtr = tracebackMatrix;
906 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
909 for (int i = 0; i <= seq2Length; i++){
910 oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
911 oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]];
912 *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'Y';
917 for (int i = 1; i <= seq1Length; i++){
919 // initialize left column
920 newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
921 newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]];
923 *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'X';
926 // fill in rest of row
927 for (int j = 1; j <= seq2Length; j++){
929 // going to MATCH state
930 ChooseBestOfThree (oldRowMatch[j-1],
933 'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++);
934 newRowMatch[j] += *(posteriorPtr++);
936 // going to INSERT X state
937 ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
938 oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
939 oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
940 'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
942 // going to INSERT Y state
943 ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
944 newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
945 newRowInsertY[j-1] + continuingPenalty2[numActive2[j]],
946 'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++);
951 temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp;
952 temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp;
953 temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp;
959 ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length],
960 'M', 'X', 'Y', &total, &matrix);
965 SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
966 int r = seq1Length, c = seq2Length;
967 while (r != 0 || c != 0){
969 int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
970 char ch = tracebackMatrix[(r*(seq2Length+1) + c) * 3 + offset];
972 case 'Y': c--; alignment->push_back ('Y'); break;
973 case 'X': r--; alignment->push_back ('X'); break;
974 case 'M': c--; r--; alignment->push_back ('B'); break;
975 default: assert (false);
980 delete [] tracebackMatrix;
982 reverse (alignment->begin(), alignment->end());
984 return make_pair(alignment, 1.0f);
988 /////////////////////////////////////////////////////////////////
989 // ProbabilisticModel::ComputeViterbiAlignment()
991 // Computes the highest probability pairwise alignment using the
992 // probabilistic model. The final alignment is returned as a
993 // pair consisting of:
994 // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
995 // denote insertions in one of the two sequences and
996 // B's denote that both sequences are present (i.e.
998 // (2) a float containing the log probability of the best
999 // alignment (not used)
1000 /////////////////////////////////////////////////////////////////
1002 pair<SafeVector<char> *, float> ComputeViterbiAlignment (Sequence *seq1, Sequence *seq2) const {
1007 const int seq1Length = seq1->GetLength();
1008 const int seq2Length = seq2->GetLength();
1010 // retrieve the points to the beginning of each sequence
1011 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
1012 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
1014 // create viterbi matrix
1015 VF *viterbiPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
1016 assert (viterbiPtr);
1017 VF &viterbi = *viterbiPtr;
1019 // create traceback matrix
1020 VI *tracebackPtr = new VI (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), -1);
1021 assert (tracebackPtr);
1022 VI &traceback = *tracebackPtr;
1024 // initialization condition
1025 for (int k = 0; k < NumMatrixTypes; k++)
1026 viterbi[k] = initialDistribution[k];
1028 // remember offset for each index combination
1030 int i1j = -seq2Length - 1;
1032 int i1j1 = -seq2Length - 2;
1034 ij *= NumMatrixTypes;
1035 i1j *= NumMatrixTypes;
1036 ij1 *= NumMatrixTypes;
1037 i1j1 *= NumMatrixTypes;
1039 // compute viterbi scores
1040 for (int i = 0; i <= seq1Length; i++){
1041 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
1042 for (int j = 0; j <= seq2Length; j++){
1043 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
1045 if (i > 0 && j > 0){
1046 for (int k = 0; k < NumMatrixTypes; k++){
1047 float newVal = viterbi[k + i1j1] + transProb[k][0] + matchProb[c1][c2];
1048 if (viterbi[0 + ij] < newVal){
1049 viterbi[0 + ij] = newVal;
1050 traceback[0 + ij] = k;
1055 for (int k = 0; k < NumInsertStates; k++){
1056 float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + transProb[0][2*k+1];
1057 float valFromIns = insProb[c1][k] + viterbi[2*k+1 + i1j] + transProb[2*k+1][2*k+1];
1058 if (valFromMatch >= valFromIns){
1059 viterbi[2*k+1 + ij] = valFromMatch;
1060 traceback[2*k+1 + ij] = 0;
1063 viterbi[2*k+1 + ij] = valFromIns;
1064 traceback[2*k+1 + ij] = 2*k+1;
1069 for (int k = 0; k < NumInsertStates; k++){
1070 float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + transProb[0][2*k+2];
1071 float valFromIns = insProb[c2][k] + viterbi[2*k+2 + ij1] + transProb[2*k+2][2*k+2];
1072 if (valFromMatch >= valFromIns){
1073 viterbi[2*k+2 + ij] = valFromMatch;
1074 traceback[2*k+2 + ij] = 0;
1077 viterbi[2*k+2 + ij] = valFromIns;
1078 traceback[2*k+2 + ij] = 2*k+2;
1083 ij += NumMatrixTypes;
1084 i1j += NumMatrixTypes;
1085 ij1 += NumMatrixTypes;
1086 i1j1 += NumMatrixTypes;
1090 // figure out best terminating cell
1091 float bestProb = LOG_ZERO;
1093 for (int k = 0; k < NumMatrixTypes; k++){
1094 float thisProb = viterbi[k + NumMatrixTypes * ((seq1Length+1)*(seq2Length+1) - 1)] + initialDistribution[k];
1095 if (bestProb < thisProb){
1096 bestProb = thisProb;
1100 assert (state != -1);
1104 // compute traceback
1105 SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
1106 int r = seq1Length, c = seq2Length;
1107 while (r != 0 || c != 0){
1108 int newState = traceback[state + NumMatrixTypes * (r * (seq2Length+1) + c)];
1110 if (state == 0){ c--; r--; alignment->push_back ('B'); }
1111 else if (state % 2 == 1){ r--; alignment->push_back ('X'); }
1112 else { c--; alignment->push_back ('Y'); }
1117 delete tracebackPtr;
1119 reverse (alignment->begin(), alignment->end());
1121 return make_pair(alignment, bestProb);
1124 /////////////////////////////////////////////////////////////////
1125 // ProbabilisticModel::BuildPosterior()
1127 // Builds a posterior probability matrix needed to align a pair
1128 // of alignments. Mathematically, the returned matrix M is
1129 // defined as follows:
1130 // M[i,j] = sum sum f(s,t,i,j)
1131 // s in align1 t in align2
1133 // [ P(s[i'] <--> t[j'])
1134 // [ if s[i'] is a letter in the ith column of align1 and
1135 // [ t[j'] it a letter in the jth column of align2
1139 /////////////////////////////////////////////////////////////////
1141 VF *BuildPosterior (MultiSequence *align1, MultiSequence *align2,
1142 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1143 float cutoff = 0.0f) const {
1144 const int seq1Length = align1->GetSequence(0)->GetLength();
1145 const int seq2Length = align2->GetSequence(0)->GetLength();
1147 VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1), 0); assert (posteriorPtr);
1148 VF &posterior = *posteriorPtr;
1149 VF::iterator postPtr = posterior.begin();
1151 // for each s in align1
1152 for (int i = 0; i < align1->GetNumSequences(); i++){
1153 int first = align1->GetSequence(i)->GetLabel();
1154 SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
1156 // for each t in align2
1157 for (int j = 0; j < align2->GetNumSequences(); j++){
1158 int second = align2->GetSequence(j)->GetLabel();
1159 SafeVector<int> *mapping2 = align2->GetSequence(j)->GetMapping();
1160 if (first < second){
1162 // get the associated sparse matrix
1163 SparseMatrix *matrix = sparseMatrices[first][second];
1165 for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){
1166 SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
1167 int base = (*mapping1)[ii] * (seq2Length+1);
1168 int rowSize = matrix->GetRowSize(ii);
1169 // add in all relevant values
1170 for (int jj = 0; jj < rowSize; jj++)
1171 posterior[base + (*mapping2)[row[jj].first]] += row[jj].second;
1174 for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) {
1175 posterior[base + (*mapping2)[jj]] -= cutoff;
1181 // get the associated sparse matrix
1182 SparseMatrix *matrix = sparseMatrices[second][first];
1184 for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){
1185 SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
1186 int base = (*mapping2)[jj];
1187 int rowSize = matrix->GetRowSize(jj);
1189 // add in all relevant values
1190 for (int ii = 0; ii < rowSize; ii++)
1191 posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second;
1194 for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
1195 posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff;
1207 return posteriorPtr;