1 /////////////////////////////////////////////////////////////////
\r
2 // ProbabilisticModel.h
\r
4 // Routines for (1) posterior probability computations
\r
5 // (2) chained anchoring
\r
6 // (3) maximum weight trace alignment
\r
7 /////////////////////////////////////////////////////////////////
\r
9 #ifndef PROBABILISTICMODEL_H
\r
10 #define PROBABILISTICMODEL_H
\r
15 #include "SafeVector.h"
\r
16 #include "ScoreType.h"
\r
17 #include "SparseMatrix.h"
\r
18 #include "MultiSequence.h"
\r
20 using namespace std;
\r
22 const int NumMatchStates = 1; // note that in this version the number
\r
23 // of match states is fixed at 1...will
\r
24 const int NumInsertStates = 2; // change in future versions
\r
25 const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2;
\r
27 /////////////////////////////////////////////////////////////////
\r
28 // ProbabilisticModel
\r
30 // Class for storing the parameters of a probabilistic model and
\r
31 // performing different computations based on those parameters.
\r
32 // In particular, this class handles the computation of
\r
33 // posterior probabilities that may be used in alignment.
\r
34 /////////////////////////////////////////////////////////////////
\r
36 class ProbabilisticModel {
\r
38 float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state
\r
39 float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities
\r
40 float matchProb[256][256]; // emission probabilities for match states
\r
41 float insProb[256][NumMatrixTypes]; // emission probabilities for insert states
\r
42 float local_transProb[3][3];
\r
43 float random_transProb[2];
\r
47 /////////////////////////////////////////////////////////////////
\r
48 // ProbabilisticModel::ProbabilisticModel()
\r
50 // Constructor. Builds a new probabilistic model using the
\r
51 // given parameters.
\r
52 /////////////////////////////////////////////////////////////////
\r
54 ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend,
\r
55 const VVF &emitPairs, const VF &emitSingle){
\r
58 // build transition matrix
\r
59 VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f));
\r
61 for (int i = 0; i < NumInsertStates; i++){
\r
62 transMat[0][2*i+1] = gapOpen[2*i];
\r
63 transMat[0][2*i+2] = gapOpen[2*i];
\r
64 transMat[0][0] -= (gapOpen[2*i] + gapOpen[2*i]);
\r
65 assert (transMat[0][0] > 0);
\r
66 transMat[2*i+1][2*i+1] = gapExtend[2*i];
\r
67 transMat[2*i+2][2*i+2] = gapExtend[2*i];
\r
68 transMat[2*i+1][2*i+2] = 0;
\r
69 transMat[2*i+2][2*i+1] = 0;
\r
70 transMat[2*i+1][0] = 1 - gapExtend[2*i];
\r
71 transMat[2*i+2][0] = 1 - gapExtend[2*i];
\r
74 // create initial and transition probability matrices
\r
75 for (int i = 0; i < NumMatrixTypes; i++){
\r
76 initialDistribution[i] = LOG (initDistribMat[i]);
\r
77 for (int j = 0; j < NumMatrixTypes; j++)
\r
78 transProb[i][j] = LOG (transMat[i][j]);
\r
80 //due to Local model parameters' initilization, need to correct initialDistribution[2]
\r
81 initialDistribution[2] = LOG (initDistribMat[1]);
\r
83 // create insertion and match probability matrices
\r
84 for (int i = 0; i < 256; i++){
\r
85 for (int j = 0; j < NumMatrixTypes; j++)
\r
86 insProb[i][j] = LOG (emitSingle[i]);
\r
87 for (int j = 0; j < 256; j++)
\r
88 matchProb[i][j] = LOG (emitPairs[i][j]);
\r
92 // build transition matrix
\r
93 VVF ltransMat (3, VF (3, 0.0f));
\r
94 ltransMat[0][0] = 1;
\r
96 ltransMat[0][1] = gapOpen[1];
\r
97 ltransMat[0][2] = gapOpen[1];
\r
98 ltransMat[0][0] -= (gapOpen[1] + gapOpen[1]);
\r
99 assert (ltransMat[0][0] > 0);
\r
100 ltransMat[1][1] = gapExtend[1];
\r
101 ltransMat[2][2] = gapExtend[1];
\r
102 ltransMat[1][2] = 0;
\r
103 ltransMat[2][1] = 0;
\r
104 ltransMat[1][0] = 1 - gapExtend[1];
\r
105 ltransMat[2][0] = 1 - gapExtend[1];
\r
107 // create initial and transition probability matrices
\r
108 for (int i = 0; i < 3; i++){
\r
109 for (int j = 0; j < 3; j++)
\r
110 local_transProb[i][j] = LOG (ltransMat[i][j]);
\r
113 // create initial and transition probability matrices
\r
114 random_transProb[0] = LOG (initDistribMat[2]);//sigma
\r
115 random_transProb[1] = LOG (1-initDistribMat[2]);//1-sigma
\r
119 /////////////////////////////////////////////////////////////////
\r
120 // ProbabilisticModel::ComputeForwardMatrix()
\r
122 // Computes a set of forward probability matrices for aligning
\r
125 // For efficiency reasons, a single-dimensional floating-point
\r
126 // array is used here, with the following indexing scheme:
\r
128 // forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
\r
129 // refers to the probability of aligning through j characters
\r
130 // of the first sequence, k characters of the second sequence,
\r
131 // and ending in state i.
\r
132 // flag: 1 probcons, 0 local
\r
133 /////////////////////////////////////////////////////////////////
\r
135 VF *ComputeForwardMatrix (Sequence *seq1, Sequence *seq2, bool flag=true) const {
\r
140 const int seq1Length = seq1->GetLength();
\r
141 const int seq2Length = seq2->GetLength();
\r
143 // retrieve the points to the beginning of each sequence
\r
144 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
\r
145 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
\r
149 if(flag) forwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
\r
150 else forwardPtr = new VF (3 * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
\r
151 assert (forwardPtr);
\r
152 VF &forward = *forwardPtr;
\r
154 // initialization condition
\r
156 forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] =
\r
157 initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
\r
159 for (int k = 0; k < NumInsertStates; k++){
\r
160 forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] =
\r
161 initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k];
\r
162 forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] =
\r
163 initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k];
\r
167 // remember offset for each index combination
\r
169 int i1j = -seq2Length - 1;
\r
171 int i1j1 = -seq2Length - 2;
\r
174 ij *= NumMatrixTypes;
\r
175 i1j *= NumMatrixTypes;
\r
176 ij1 *= NumMatrixTypes;
\r
177 i1j1 *= NumMatrixTypes;
\r
186 // compute forward scores
\r
187 for (int i = 0; i <= seq1Length; i++){
\r
188 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
\r
189 for (int j = 0; j <= seq2Length; j++){
\r
190 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
\r
192 if(i == 1 && j == 1 && !flag) forward[0 + ij] =
\r
193 matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0] - 2*random_transProb[1];
\r
195 if (i > 1 || j > 1){
\r
196 if (i > 0 && j > 0){
\r
198 forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
\r
199 for (int k = 1; k < NumMatrixTypes; k++)
\r
200 LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]);
\r
201 forward[0 + ij] += matchProb[c1][c2];
\r
205 forward[0 + ij] = matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0] - 2*random_transProb[1];
\r
206 for (int k = 0; k < 3; k++)
\r
207 LOG_PLUS_EQUALS (forward[0 + ij], matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0] +
\r
208 forward[k + i1j1] + local_transProb[k][0] - 2*random_transProb[1]);
\r
213 for (int k = 0; k < NumInsertStates; k++)
\r
214 forward[2*k+1 + ij] = insProb[c1][k] +
\r
215 LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1],
\r
216 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1]);
\r
226 forward[1 + ij] = LOG_ADD (forward[0 + i1j] + local_transProb[0][1] - random_transProb[1],
\r
227 forward[1 + i1j] + local_transProb[1][1] - random_transProb[1]);
\r
233 for (int k = 0; k < NumInsertStates; k++)
\r
234 forward[2*k+2 + ij] = insProb[c2][k] +
\r
235 LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2],
\r
236 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2]);
\r
240 forward[2 + ij] = LOG_ADD (forward[0 + ij1] + local_transProb[0][2] - random_transProb[1],
\r
241 forward[2 + ij1] + local_transProb[2][2] - random_transProb[1]);
\r
246 ij += NumMatrixTypes;
\r
247 i1j += NumMatrixTypes;
\r
248 ij1 += NumMatrixTypes;
\r
249 i1j1 += NumMatrixTypes;
\r
263 /////////////////////////////////////////////////////////////////
\r
264 // ProbabilisticModel::ComputeBackwardMatrix()
\r
266 // Computes a set of backward probability matrices for aligning
\r
269 // For efficiency reasons, a single-dimensional floating-point
\r
270 // array is used here, with the following indexing scheme:
\r
272 // backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
\r
273 // refers to the probability of starting in state i and
\r
274 // aligning from character j+1 to the end of the first
\r
275 // sequence and from character k+1 to the end of the second
\r
277 /////////////////////////////////////////////////////////////////
\r
279 VF *ComputeBackwardMatrix (Sequence *seq1, Sequence *seq2, bool flag=true) const {
\r
284 const int seq1Length = seq1->GetLength();
\r
285 const int seq2Length = seq2->GetLength();
\r
286 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
\r
287 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
\r
291 if(flag) backwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
\r
292 else backwardPtr = new VF (3 * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
\r
293 assert (backwardPtr);
\r
294 VF &backward = *backwardPtr;
\r
296 // initialization condition
\r
298 for (int k = 0; k < NumMatrixTypes; k++)
\r
299 backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k];
\r
301 // remember offset for each index combination
\r
302 int ij = (seq1Length+1) * (seq2Length+1) - 1;
\r
303 int i1j = ij + seq2Length + 1;
\r
305 int i1j1 = ij + seq2Length + 2;
\r
308 ij *= NumMatrixTypes;
\r
309 i1j *= NumMatrixTypes;
\r
310 ij1 *= NumMatrixTypes;
\r
311 i1j1 *= NumMatrixTypes;
\r
320 // compute backward scores
\r
321 for (int i = seq1Length; i >= 0; i--){
\r
322 unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1];
\r
323 for (int j = seq2Length; j >= 0; j--){
\r
324 unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1];
\r
326 if(!flag) backward[0 + ij] = LOG_ONE;//local
\r
327 if (i < seq1Length && j < seq2Length){
\r
329 const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
\r
330 for (int k = 0; k < NumMatrixTypes; k++)
\r
331 LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]);
\r
335 const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0];
\r
336 for (int k = 0; k < 3; k++)
\r
337 LOG_PLUS_EQUALS (backward[k + ij], ProbXY + local_transProb[k][0] - 2*random_transProb[1] );
\r
340 if (i < seq1Length){
\r
342 for (int k = 0; k < NumInsertStates; k++){
\r
343 LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[0][2*k+1]);
\r
344 LOG_PLUS_EQUALS (backward[2*k+1 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[2*k+1][2*k+1]);
\r
349 LOG_PLUS_EQUALS (backward[0 + ij], backward[1 + i1j] + local_transProb[0][1] - random_transProb[1]);
\r
350 LOG_PLUS_EQUALS (backward[1 + ij], backward[1 + i1j] + local_transProb[1][1] - random_transProb[1]);
\r
353 if (j < seq2Length){
\r
355 for (int k = 0; k < NumInsertStates; k++){
\r
356 LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[0][2*k+2]);
\r
357 LOG_PLUS_EQUALS (backward[2*k+2 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[2*k+2][2*k+2]);
\r
362 LOG_PLUS_EQUALS (backward[0 + ij], backward[2 + ij1] + local_transProb[0][2] - random_transProb[1]);
\r
363 LOG_PLUS_EQUALS (backward[2 + ij], backward[2 + ij1] + local_transProb[2][2] - random_transProb[1]);
\r
367 ij -= NumMatrixTypes;
\r
368 i1j -= NumMatrixTypes;
\r
369 ij1 -= NumMatrixTypes;
\r
370 i1j1 -= NumMatrixTypes;
\r
381 return backwardPtr;
\r
384 /////////////////////////////////////////////////////////////////
\r
385 // ProbabilisticModel::ComputeTotalProbability()
\r
387 // Computes the total probability of an alignment given
\r
388 // the forward and backward matrices.
\r
389 // flag: 1 probcons, 0 local
\r
390 /////////////////////////////////////////////////////////////////
\r
392 float ComputeTotalProbability (Sequence *seq1, Sequence *seq2,
\r
393 const VF &forward, const VF &backward, bool flag=true) const {
\r
395 // compute total probability
\r
396 float totalForwardProb = LOG_ZERO;
\r
397 float totalBackwardProb = LOG_ZERO;
\r
398 const int seq1Length = seq1->GetLength();
\r
399 const int seq2Length = seq2->GetLength();
\r
402 for (int k = 0; k < NumMatrixTypes; k++){
\r
403 LOG_PLUS_EQUALS (totalForwardProb,
\r
404 forward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
\r
405 backward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
\r
408 totalBackwardProb =
\r
409 forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
\r
410 backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)];
\r
412 for (int k = 0; k < NumInsertStates; k++){
\r
413 LOG_PLUS_EQUALS (totalBackwardProb,
\r
414 forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
\r
415 backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)]);
\r
416 LOG_PLUS_EQUALS (totalBackwardProb,
\r
417 forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
\r
418 backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)]);
\r
422 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
\r
423 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
\r
425 for (int i = 0; i <= seq1Length; i++){
\r
426 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
\r
427 for (int j = 0; j <= seq2Length; j++){
\r
428 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
\r
430 LOG_PLUS_EQUALS (totalForwardProb,forward[ij]);
\r
431 LOG_PLUS_EQUALS (totalBackwardProb,backward[ij] + matchProb[c1][c2]
\r
432 - insProb[c1][0] - insProb[c2][0] - 2*random_transProb[1]);
\r
440 return (totalForwardProb + totalBackwardProb) / 2;
\r
443 /////////////////////////////////////////////////////////////////
\r
444 // ProbabilisticModel::ComputePosteriorMatrix()
\r
446 // Computes the posterior probability matrix based on
\r
447 // the forward and backward matrices.
\r
448 // flag: 1 probcons, 0 local
\r
449 /////////////////////////////////////////////////////////////////
\r
451 VF *ComputePosteriorMatrix (Sequence *seq1, Sequence *seq2,
\r
452 const VF &forward, const VF &backward, bool flag=true) const {
\r
457 const int seq1Length = seq1->GetLength();
\r
458 const int seq2Length = seq2->GetLength();
\r
460 float totalProb = ComputeTotalProbability (seq1, seq2,forward, backward, flag);
\r
462 // compute posterior matrices
\r
463 VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr);
\r
464 VF &posterior = *posteriorPtr;
\r
467 VF::iterator ptr = posterior.begin();
\r
469 for (int i = 0; i <= seq1Length; i++){
\r
470 for (int j = 0; j <= seq2Length; j++){
\r
471 *(ptr++) = EXP (min (LOG_ONE, forward[ij] + backward[ij] - totalProb));
\r
472 if(flag) ij += NumMatrixTypes;
\r
479 return posteriorPtr;
\r
483 /////////////////////////////////////////////////////////////////
\r
484 // ProbabilisticModel::ComputeExpectedCounts()
\r
486 // Computes the expected counts for the various transitions.
\r
487 /////////////////////////////////////////////////////////////////
\r
489 VVF *ComputeExpectedCounts () const {
\r
494 const int seq1Length = seq1->GetLength();
\r
495 const int seq2Length = seq2->GetLength();
\r
496 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
\r
497 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
\r
499 // compute total probability
\r
500 float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
\r
501 forward, backward);
\r
503 // initialize expected counts
\r
504 VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr);
\r
505 VVF &counts = *countsPtr;
\r
507 // remember offset for each index combination
\r
509 int i1j = -seq2Length - 1;
\r
511 int i1j1 = -seq2Length - 2;
\r
513 ij *= NumMatrixTypes;
\r
514 i1j *= NumMatrixTypes;
\r
515 ij1 *= NumMatrixTypes;
\r
516 i1j1 *= NumMatrixTypes;
\r
518 // compute expected counts
\r
519 for (int i = 0; i <= seq1Length; i++){
\r
520 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
\r
521 for (int j = 0; j <= seq2Length; j++){
\r
522 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
\r
524 if (i > 0 && j > 0){
\r
525 for (int k = 0; k < NumMatrixTypes; k++)
\r
526 LOG_PLUS_EQUALS (counts[k][0],
\r
527 forward[k + i1j1] + transProb[k][0] +
\r
528 matchProb[c1][c2] + backward[0 + ij]);
\r
531 for (int k = 0; k < NumInsertStates; k++){
\r
532 LOG_PLUS_EQUALS (counts[0][2*k+1],
\r
533 forward[0 + i1j] + transProb[0][2*k+1] +
\r
534 insProb[c1][k] + backward[2*k+1 + ij]);
\r
535 LOG_PLUS_EQUALS (counts[2*k+1][2*k+1],
\r
536 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
\r
537 insProb[c1][k] + backward[2*k+1 + ij]);
\r
541 for (int k = 0; k < NumInsertStates; k++){
\r
542 LOG_PLUS_EQUALS (counts[0][2*k+2],
\r
543 forward[0 + ij1] + transProb[0][2*k+2] +
\r
544 insProb[c2][k] + backward[2*k+2 + ij]);
\r
545 LOG_PLUS_EQUALS (counts[2*k+2][2*k+2],
\r
546 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
\r
547 insProb[c2][k] + backward[2*k+2 + ij]);
\r
551 ij += NumMatrixTypes;
\r
552 i1j += NumMatrixTypes;
\r
553 ij1 += NumMatrixTypes;
\r
554 i1j1 += NumMatrixTypes;
\r
558 // scale all expected counts appropriately
\r
559 for (int i = 0; i < NumMatrixTypes; i++)
\r
560 for (int j = 0; j < NumMatrixTypes; j++)
\r
561 counts[i][j] -= totalProb;
\r
566 /////////////////////////////////////////////////////////////////
\r
567 // ProbabilisticModel::ComputeNewParameters()
\r
569 // Computes a new parameter set based on the expected counts
\r
571 /////////////////////////////////////////////////////////////////
\r
573 void ComputeNewParameters (Sequence *seq1, Sequence *seq2,
\r
574 const VF &forward, const VF &backward,
\r
575 VF &initDistribMat, VF &gapOpen,
\r
576 VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
\r
581 const int seq1Length = seq1->GetLength();
\r
582 const int seq2Length = seq2->GetLength();
\r
583 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
\r
584 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
\r
586 // compute total probability
\r
587 float totalProb = ComputeTotalProbability (seq1, seq2,
\r
588 forward, backward);
\r
590 // initialize expected counts
\r
591 VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO));
\r
592 VF initCounts (NumMatrixTypes, LOG_ZERO);
\r
593 VVF pairCounts (256, VF (256, LOG_ZERO));
\r
594 VF singleCounts (256, LOG_ZERO);
\r
596 // remember offset for each index combination
\r
598 int i1j = -seq2Length - 1;
\r
600 int i1j1 = -seq2Length - 2;
\r
602 ij *= NumMatrixTypes;
\r
603 i1j *= NumMatrixTypes;
\r
604 ij1 *= NumMatrixTypes;
\r
605 i1j1 *= NumMatrixTypes;
\r
607 // compute initial distribution posteriors
\r
608 initCounts[0] = LOG_ADD (forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
\r
609 backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)],
\r
610 forward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
\r
611 backward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
\r
612 for (int k = 0; k < NumInsertStates; k++){
\r
613 initCounts[2*k+1] = LOG_ADD (forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
\r
614 backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)],
\r
615 forward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
\r
616 backward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
\r
617 initCounts[2*k+2] = LOG_ADD (forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
\r
618 backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)],
\r
619 forward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
\r
620 backward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
\r
623 // compute expected counts
\r
624 for (int i = 0; i <= seq1Length; i++){
\r
625 unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper(iter1[i]);
\r
626 for (int j = 0; j <= seq2Length; j++){
\r
627 unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper(iter2[j]);
\r
629 if (i > 0 && j > 0){
\r
630 if (enableTrainEmissions && i == 1 && j == 1){
\r
631 LOG_PLUS_EQUALS (pairCounts[c1][c2],
\r
632 initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]);
\r
633 LOG_PLUS_EQUALS (pairCounts[c2][c1],
\r
634 initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]);
\r
637 for (int k = 0; k < NumMatrixTypes; k++){
\r
638 LOG_PLUS_EQUALS (transCounts[k][0],
\r
639 forward[k + i1j1] + transProb[k][0] +
\r
640 matchProb[c1][c2] + backward[0 + ij]);
\r
641 if (enableTrainEmissions && i != 1 || j != 1){
\r
642 LOG_PLUS_EQUALS (pairCounts[c1][c2],
\r
643 forward[k + i1j1] + transProb[k][0] +
\r
644 matchProb[c1][c2] + backward[0 + ij]);
\r
645 LOG_PLUS_EQUALS (pairCounts[c2][c1],
\r
646 forward[k + i1j1] + transProb[k][0] +
\r
647 matchProb[c2][c1] + backward[0 + ij]);
\r
652 for (int k = 0; k < NumInsertStates; k++){
\r
653 LOG_PLUS_EQUALS (transCounts[0][2*k+1],
\r
654 forward[0 + i1j] + transProb[0][2*k+1] +
\r
655 insProb[c1][k] + backward[2*k+1 + ij]);
\r
656 LOG_PLUS_EQUALS (transCounts[2*k+1][2*k+1],
\r
657 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
\r
658 insProb[c1][k] + backward[2*k+1 + ij]);
\r
659 if (enableTrainEmissions){
\r
660 if (i == 1 && j == 0){
\r
661 LOG_PLUS_EQUALS (singleCounts[c1],
\r
662 initialDistribution[2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]);
\r
665 LOG_PLUS_EQUALS (singleCounts[c1],
\r
666 forward[0 + i1j] + transProb[0][2*k+1] +
\r
667 insProb[c1][k] + backward[2*k+1 + ij]);
\r
668 LOG_PLUS_EQUALS (singleCounts[c1],
\r
669 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
\r
670 insProb[c1][k] + backward[2*k+1 + ij]);
\r
676 for (int k = 0; k < NumInsertStates; k++){
\r
677 LOG_PLUS_EQUALS (transCounts[0][2*k+2],
\r
678 forward[0 + ij1] + transProb[0][2*k+2] +
\r
679 insProb[c2][k] + backward[2*k+2 + ij]);
\r
680 LOG_PLUS_EQUALS (transCounts[2*k+2][2*k+2],
\r
681 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
\r
682 insProb[c2][k] + backward[2*k+2 + ij]);
\r
683 if (enableTrainEmissions){
\r
684 if (i == 0 && j == 1){
\r
685 LOG_PLUS_EQUALS (singleCounts[c2],
\r
686 initialDistribution[2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]);
\r
689 LOG_PLUS_EQUALS (singleCounts[c2],
\r
690 forward[0 + ij1] + transProb[0][2*k+2] +
\r
691 insProb[c2][k] + backward[2*k+2 + ij]);
\r
692 LOG_PLUS_EQUALS (singleCounts[c2],
\r
693 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
\r
694 insProb[c2][k] + backward[2*k+2 + ij]);
\r
700 ij += NumMatrixTypes;
\r
701 i1j += NumMatrixTypes;
\r
702 ij1 += NumMatrixTypes;
\r
703 i1j1 += NumMatrixTypes;
\r
707 // scale all expected counts appropriately
\r
708 for (int i = 0; i < NumMatrixTypes; i++){
\r
709 initCounts[i] -= totalProb;
\r
710 for (int j = 0; j < NumMatrixTypes; j++)
\r
711 transCounts[i][j] -= totalProb;
\r
713 if (enableTrainEmissions){
\r
714 for (int i = 0; i < 256; i++){
\r
715 for (int j = 0; j < 256; j++)
\r
716 pairCounts[i][j] -= totalProb;
\r
717 singleCounts[i] -= totalProb;
\r
721 // compute new initial distribution
\r
722 float totalInitDistribCounts = 0;
\r
723 for (int i = 0; i < NumMatrixTypes; i++)
\r
724 totalInitDistribCounts += exp (initCounts[i]); // should be 2
\r
725 initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts));
\r
726 for (int k = 0; k < NumInsertStates; k++){
\r
727 float val = (exp (initCounts[2*k+1]) + exp (initCounts[2*k+2])) / 2;
\r
728 initDistribMat[2*k+1] = initDistribMat[2*k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts));
\r
731 // compute total counts for match state
\r
732 float inMatchStateCounts = 0;
\r
733 for (int i = 0; i < NumMatrixTypes; i++)
\r
734 inMatchStateCounts += exp (transCounts[0][i]);
\r
735 for (int i = 0; i < NumInsertStates; i++){
\r
737 // compute total counts for gap state
\r
738 float inGapStateCounts =
\r
739 exp (transCounts[2*i+1][0]) +
\r
740 exp (transCounts[2*i+1][2*i+1]) +
\r
741 exp (transCounts[2*i+2][0]) +
\r
742 exp (transCounts[2*i+2][2*i+2]);
\r
744 gapOpen[2*i] = gapOpen[2*i+1] =
\r
745 (exp (transCounts[0][2*i+1]) +
\r
746 exp (transCounts[0][2*i+2])) /
\r
747 (2 * inMatchStateCounts);
\r
749 gapExtend[2*i] = gapExtend[2*i+1] =
\r
750 (exp (transCounts[2*i+1][2*i+1]) +
\r
751 exp (transCounts[2*i+2][2*i+2])) /
\r
755 if (enableTrainEmissions){
\r
756 float totalPairCounts = 0;
\r
757 float totalSingleCounts = 0;
\r
758 for (int i = 0; i < 256; i++){
\r
759 for (int j = 0; j <= i; j++)
\r
760 totalPairCounts += exp (pairCounts[j][i]);
\r
761 totalSingleCounts += exp (singleCounts[i]);
\r
764 for (int i = 0; i < 256; i++) if (!islower ((char) i)){
\r
765 int li = (int)((unsigned char) tolower ((char) i));
\r
766 for (int j = 0; j <= i; j++) if (!islower ((char) j)){
\r
767 int lj = (int)((unsigned char) tolower ((char) j));
\r
768 emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] =
\r
769 emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts;
\r
771 emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts;
\r
776 /////////////////////////////////////////////////////////////////
\r
777 // ProbabilisticModel::ComputeAlignment()
\r
779 // Computes an alignment based on the given posterior matrix.
\r
780 // This is done by finding the maximum summing path (or
\r
781 // maximum weight trace) through the posterior matrix. The
\r
782 // final alignment is returned as a pair consisting of:
\r
783 // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
\r
784 // denote insertions in one of the two sequences and
\r
785 // B's denote that both sequences are present (i.e.
\r
787 // (2) a float indicating the sum achieved
\r
788 /////////////////////////////////////////////////////////////////
\r
790 pair<SafeVector<char> *, float> ComputeAlignment (int seq1Length, int seq2Length,
\r
791 const VF &posterior) const {
\r
793 float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows);
\r
794 float *oldRow = twoRows;
\r
795 float *newRow = twoRows + seq2Length + 1;
\r
797 char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
\r
798 char *tracebackPtr = tracebackMatrix;
\r
800 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
\r
803 for (int i = 0; i <= seq2Length; i++){
\r
805 *(tracebackPtr++) = 'L';
\r
809 for (int i = 1; i <= seq1Length; i++){
\r
811 // initialize left column
\r
814 *(tracebackPtr++) = 'U';
\r
816 // fill in rest of row
\r
817 for (int j = 1; j <= seq2Length; j++){
\r
818 ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1], newRow[j-1], oldRow[j],
\r
819 'D', 'L', 'U', &newRow[j], tracebackPtr++);
\r
823 float *temp = oldRow;
\r
828 // store best score
\r
829 float total = oldRow[seq2Length];
\r
832 // compute traceback
\r
833 SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
\r
834 int r = seq1Length, c = seq2Length;
\r
835 while (r != 0 || c != 0){
\r
836 char ch = tracebackMatrix[r*(seq2Length+1) + c];
\r
838 case 'L': c--; alignment->push_back ('Y'); break;
\r
839 case 'U': r--; alignment->push_back ('X'); break;
\r
840 case 'D': c--; r--; alignment->push_back ('B'); break;
\r
841 default: assert (false);
\r
845 delete [] tracebackMatrix;
\r
847 reverse (alignment->begin(), alignment->end());
\r
849 return make_pair(alignment, total);
\r
852 /////////////////////////////////////////////////////////////////
\r
853 // ProbabilisticModel::ComputeAlignmentWithGapPenalties()
\r
855 // Similar to ComputeAlignment() except with gap penalties.
\r
856 /////////////////////////////////////////////////////////////////
\r
858 pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties (MultiSequence *align1,
\r
859 MultiSequence *align2,
\r
860 const VF &posterior, int numSeqs1,
\r
862 float gapOpenPenalty,
\r
863 float gapContinuePenalty) const {
\r
864 int seq1Length = align1->GetSequence(0)->GetLength();
\r
865 int seq2Length = align2->GetSequence(0)->GetLength();
\r
866 SafeVector<SafeVector<char>::iterator > dataPtrs1 (align1->GetNumSequences());
\r
867 SafeVector<SafeVector<char>::iterator > dataPtrs2 (align2->GetNumSequences());
\r
869 // grab character data
\r
870 for (int i = 0; i < align1->GetNumSequences(); i++)
\r
871 dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr();
\r
872 for (int i = 0; i < align2->GetNumSequences(); i++)
\r
873 dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr();
\r
875 // the number of active sequences at any given column is defined to be the
\r
876 // number of non-gap characters in that column; the number of gap opens at
\r
877 // any given column is defined to be the number of gap characters in that
\r
878 // column where the previous character in the respective sequence was not
\r
880 SafeVector<int> numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1);
\r
881 SafeVector<int> numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1);
\r
883 // compute number of active sequences and gap opens for each group
\r
884 for (int i = 0; i < align1->GetNumSequences(); i++){
\r
885 SafeVector<char>::iterator dataPtr = align1->GetSequence(i)->GetDataPtr();
\r
886 numActive1[0] = numGapOpens1[0] = 0;
\r
887 for (int j = 1; j <= seq1Length; j++){
\r
888 if (dataPtr[j] != '-'){
\r
890 numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-');
\r
894 for (int i = 0; i < align2->GetNumSequences(); i++){
\r
895 SafeVector<char>::iterator dataPtr = align2->GetSequence(i)->GetDataPtr();
\r
896 numActive2[0] = numGapOpens2[0] = 0;
\r
897 for (int j = 1; j <= seq2Length; j++){
\r
898 if (dataPtr[j] != '-'){
\r
900 numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-');
\r
905 VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1));
\r
906 VF continuingPenalty1 (numSeqs1+1);
\r
907 VVF openingPenalty2 (numSeqs1+1, VF (numSeqs2+1));
\r
908 VF continuingPenalty2 (numSeqs2+1);
\r
910 // precompute penalties
\r
911 for (int i = 0; i <= numSeqs1; i++)
\r
912 for (int j = 0; j <= numSeqs2; j++)
\r
913 openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j));
\r
914 for (int i = 0; i <= numSeqs1; i++)
\r
915 continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
\r
916 for (int i = 0; i <= numSeqs2; i++)
\r
917 for (int j = 0; j <= numSeqs1; j++)
\r
918 openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j));
\r
919 for (int i = 0; i <= numSeqs2; i++)
\r
920 continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
\r
922 float *twoRows = new float[6*(seq2Length+1)]; assert (twoRows);
\r
923 float *oldRowMatch = twoRows;
\r
924 float *newRowMatch = twoRows + (seq2Length+1);
\r
925 float *oldRowInsertX = twoRows + 2*(seq2Length+1);
\r
926 float *newRowInsertX = twoRows + 3*(seq2Length+1);
\r
927 float *oldRowInsertY = twoRows + 4*(seq2Length+1);
\r
928 float *newRowInsertY = twoRows + 5*(seq2Length+1);
\r
930 char *tracebackMatrix = new char[3*(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
\r
931 char *tracebackPtr = tracebackMatrix;
\r
933 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
\r
936 for (int i = 0; i <= seq2Length; i++){
\r
937 oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
\r
938 oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]];
\r
939 *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'Y';
\r
944 for (int i = 1; i <= seq1Length; i++){
\r
946 // initialize left column
\r
947 newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
\r
948 newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]];
\r
950 *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'X';
\r
953 // fill in rest of row
\r
954 for (int j = 1; j <= seq2Length; j++){
\r
956 // going to MATCH state
\r
957 ChooseBestOfThree (oldRowMatch[j-1],
\r
958 oldRowInsertX[j-1],
\r
959 oldRowInsertY[j-1],
\r
960 'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++);
\r
961 newRowMatch[j] += *(posteriorPtr++);
\r
963 // going to INSERT X state
\r
964 ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
\r
965 oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
\r
966 oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
\r
967 'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
\r
969 // going to INSERT Y state
\r
970 ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
\r
971 newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
\r
972 newRowInsertY[j-1] + continuingPenalty2[numActive2[j]],
\r
973 'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++);
\r
978 temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp;
\r
979 temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp;
\r
980 temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp;
\r
983 // store best score
\r
986 ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length],
\r
987 'M', 'X', 'Y', &total, &matrix);
\r
991 // compute traceback
\r
992 SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
\r
993 int r = seq1Length, c = seq2Length;
\r
994 while (r != 0 || c != 0){
\r
996 int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
\r
997 char ch = tracebackMatrix[(r*(seq2Length+1) + c) * 3 + offset];
\r
999 case 'Y': c--; alignment->push_back ('Y'); break;
\r
1000 case 'X': r--; alignment->push_back ('X'); break;
\r
1001 case 'M': c--; r--; alignment->push_back ('B'); break;
\r
1002 default: assert (false);
\r
1007 delete [] tracebackMatrix;
\r
1009 reverse (alignment->begin(), alignment->end());
\r
1011 return make_pair(alignment, 1.0f);
\r
1014 /////////////////////////////////////////////////////////////////
\r
1015 // ProbabilisticModel::ComputeViterbiAlignment()
\r
1017 // Computes the highest probability pairwise alignment using the
\r
1018 // probabilistic model. The final alignment is returned as a
\r
1019 // pair consisting of:
\r
1020 // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
\r
1021 // denote insertions in one of the two sequences and
\r
1022 // B's denote that both sequences are present (i.e.
\r
1024 // (2) a float containing the log probability of the best
\r
1025 // alignment (not used)
\r
1026 /////////////////////////////////////////////////////////////////
\r
1029 pair<SafeVector<char> *, float> ComputeViterbiAlignment (Sequence *seq1, Sequence *seq2) const {
\r
1034 const int seq1Length = seq1->GetLength();
\r
1035 const int seq2Length = seq2->GetLength();
\r
1037 // retrieve the points to the beginning of each sequence
\r
1038 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
\r
1039 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
\r
1041 // create viterbi matrix
\r
1042 VF *viterbiPtr = new VF (3 * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
\r
1043 assert (viterbiPtr);
\r
1044 VF &viterbi = *viterbiPtr;
\r
1046 // create traceback matrix
\r
1047 VI *tracebackPtr = new VI (3 * (seq1Length+1) * (seq2Length+1), -1);
\r
1048 assert (tracebackPtr);
\r
1049 VI &traceback = *tracebackPtr;
\r
1051 // initialization condition
\r
1053 for (int k = 0; k < NumMatrixTypes; k++)
\r
1054 viterbi[k] = initialDistribution[k];
\r
1056 viterbi[0] = LOG(0.6080327034);
\r
1057 viterbi[1] = LOG(0.1959836632);
\r
1058 viterbi[2] = LOG(0.1959836632);
\r
1060 // remember offset for each index combination
\r
1062 int i1j = -seq2Length - 1;
\r
1064 int i1j1 = -seq2Length - 2;
\r
1071 // compute viterbi scores
\r
1072 for (int i = 0; i <= seq1Length; i++){
\r
1073 unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
\r
1074 for (int j = 0; j <= seq2Length; j++){
\r
1075 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
\r
1077 if (i > 0 && j > 0){
\r
1078 for (int k = 0; k < 3; k++){
\r
1079 float newVal = viterbi[k + i1j1] + local_transProb[k][0] + matchProb[c1][c2];
\r
1080 if (viterbi[0 + ij] < newVal){
\r
1081 viterbi[0 + ij] = newVal;
\r
1082 traceback[0 + ij] = k;
\r
1087 for (int k = 0; k < 1; k++){
\r
1088 float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + local_transProb[0][2*k+1];
\r
1089 float valFromIns = insProb[c1][k] + viterbi[2*k+1 + i1j] + local_transProb[2*k+1][2*k+1];
\r
1090 if (valFromMatch >= valFromIns){
\r
1091 viterbi[2*k+1 + ij] = valFromMatch;
\r
1092 traceback[2*k+1 + ij] = 0;
\r
1095 viterbi[2*k+1 + ij] = valFromIns;
\r
1096 traceback[2*k+1 + ij] = 2*k+1;
\r
1101 for (int k = 0; k < 1; k++){
\r
1102 float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + local_transProb[0][2*k+2];
\r
1103 float valFromIns = insProb[c2][k] + viterbi[2*k+2 + ij1] + local_transProb[2*k+2][2*k+2];
\r
1104 if (valFromMatch >= valFromIns){
\r
1105 viterbi[2*k+2 + ij] = valFromMatch;
\r
1106 traceback[2*k+2 + ij] = 0;
\r
1109 viterbi[2*k+2 + ij] = valFromIns;
\r
1110 traceback[2*k+2 + ij] = 2*k+2;
\r
1122 // figure out best terminating cell
\r
1123 float bestProb = LOG_ZERO;
\r
1125 viterbi[0] = LOG(0.6080327034);
\r
1126 viterbi[1] = LOG(0.1959836632);
\r
1127 viterbi[2] = LOG(0.1959836632);
\r
1129 for (int k = 0; k < 3; k++){
\r
1130 float thisProb = viterbi[k + 3 * ((seq1Length+1)*(seq2Length+1) - 1)] + viterbi[k];
\r
1131 if (bestProb < thisProb){
\r
1132 bestProb = thisProb;
\r
1136 assert (state != -1);
\r
1138 delete viterbiPtr;
\r
1140 // compute traceback
\r
1141 SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
\r
1142 int r = seq1Length, c = seq2Length;
\r
1143 while (r != 0 || c != 0){
\r
1144 int newState = traceback[state + 3 * (r * (seq2Length+1) + c)];
\r
1145 if (state == 0){ c--; r--; alignment->push_back ('B');}
\r
1146 else if (state % 2 == 1){ r--; alignment->push_back ('X'); }
\r
1147 else { c--; alignment->push_back ('Y'); }
\r
1151 delete tracebackPtr;
\r
1153 reverse (alignment->begin(), alignment->end());
\r
1155 return make_pair(alignment, bestProb);
\r
1158 /////////////////////////////////////////////////////////////////
\r
1159 // ProbabilisticModel::BuildPosterior()
\r
1161 // Builds a posterior probability matrix needed to align a pair
\r
1162 // of alignments. Mathematically, the returned matrix M is
\r
1163 // defined as follows:
\r
1164 // M[i,j] = sum sum f(s,t,i,j)
\r
1165 // s in align1 t in align2
\r
1167 // [ P(s[i'] <--> t[j'])
\r
1168 // [ if s[i'] is a letter in the ith column of align1 and
\r
1169 // [ t[j'] it a letter in the jth column of align2
\r
1173 /////////////////////////////////////////////////////////////////
\r
1175 VF *BuildPosterior (MultiSequence *align1, MultiSequence *align2,
\r
1176 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
\r
1177 float cutoff = 0.0f) const {
\r
1178 const int seq1Length = align1->GetSequence(0)->GetLength();
\r
1179 const int seq2Length = align2->GetSequence(0)->GetLength();
\r
1181 VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1), 0); assert (posteriorPtr);
\r
1182 VF &posterior = *posteriorPtr;
\r
1183 VF::iterator postPtr = posterior.begin();
\r
1185 // for each s in align1
\r
1186 for (int i = 0; i < align1->GetNumSequences(); i++){
\r
1187 int first = align1->GetSequence(i)->GetLabel();
\r
1188 SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
\r
1190 // for each t in align2
\r
1191 for (int j = 0; j < align2->GetNumSequences(); j++){
\r
1192 int second = align2->GetSequence(j)->GetLabel();
\r
1193 SafeVector<int> *mapping2 = align2->GetSequence(j)->GetMapping();
\r
1195 if (first < second){
\r
1197 // get the associated sparse matrix
\r
1198 SparseMatrix *matrix = sparseMatrices[first][second];
\r
1200 for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){
\r
1201 SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
\r
1202 int base = (*mapping1)[ii] * (seq2Length+1);
\r
1203 int rowSize = matrix->GetRowSize(ii);
\r
1205 // add in all relevant values
\r
1206 for (int jj = 0; jj < rowSize; jj++)
\r
1207 posterior[base + (*mapping2)[row[jj].first]] += row[jj].second;
\r
1209 // subtract cutoff
\r
1210 for (int jj = 0; jj < matrix->GetSeq2Length(); jj++)
\r
1211 posterior[base + (*mapping2)[jj]] -= cutoff;
\r
1216 // get the associated sparse matrix
\r
1217 SparseMatrix *matrix = sparseMatrices[second][first];
\r
1219 for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){
\r
1220 SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
\r
1221 int base = (*mapping2)[jj];
\r
1222 int rowSize = matrix->GetRowSize(jj);
\r
1224 // add in all relevant values
\r
1225 for (int ii = 0; ii < rowSize; ii++)
\r
1226 posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second;
\r
1228 // subtract cutoff
\r
1229 for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
\r
1230 posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff;
\r
1242 return posteriorPtr;
\r
1245 //added by Liu Yongchao.Feb 23, 2010
\r
1246 VF *BuildPosterior(int* seqsWeights, MultiSequence *align1,
\r
1247 MultiSequence *align2,
\r
1248 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
\r
1249 float cutoff = 0.0f) const {
\r
1250 const int seq1Length = align1->GetSequence(0)->GetLength();
\r
1251 const int seq2Length = align2->GetSequence(0)->GetLength();
\r
1253 VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0);
\r
1254 assert(posteriorPtr);
\r
1255 VF &posterior = *posteriorPtr;
\r
1256 VF::iterator postPtr = posterior.begin();
\r
1258 //compute the total sum of all weights
\r
1259 float totalWeights = 0;
\r
1260 for (int i = 0; i < align1->GetNumSequences(); i++) {
\r
1261 int first = align1->GetSequence(i)->GetLabel();
\r
1262 int w1 = seqsWeights[first];
\r
1263 for (int j = 0; j < align2->GetNumSequences(); j++) {
\r
1264 int second = align2->GetSequence(j)->GetLabel();
\r
1265 int w2 = seqsWeights[second];
\r
1267 totalWeights += w1 * w2;
\r
1270 // for each s in align1
\r
1271 for (int i = 0; i < align1->GetNumSequences(); i++) {
\r
1272 int first = align1->GetSequence(i)->GetLabel();
\r
1273 int w1 = seqsWeights[first];
\r
1274 SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
\r
1275 // for each t in align2
\r
1276 for (int j = 0; j < align2->GetNumSequences(); j++) {
\r
1277 int second = align2->GetSequence(j)->GetLabel();
\r
1278 int w2 = seqsWeights[second];
\r
1279 SafeVector<int> *mapping2 =
\r
1280 align2->GetSequence(j)->GetMapping();
\r
1282 float w = (float) (w1 * w2) / totalWeights;
\r
1283 if (first < second) {
\r
1285 // get the associated sparse matrix
\r
1286 SparseMatrix *matrix = sparseMatrices[first][second];
\r
1288 for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) {
\r
1289 SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
\r
1290 int base = (*mapping1)[ii] * (seq2Length + 1);
\r
1291 int rowSize = matrix->GetRowSize(ii);
\r
1293 // add in all relevant values
\r
1294 for (int jj = 0; jj < rowSize; jj++)
\r
1295 posterior[base + (*mapping2)[row[jj].first]] += w
\r
1298 // subtract cutoff
\r
1299 for (int jj = 0; jj < matrix->GetSeq2Length(); jj++)
\r
1300 posterior[base + (*mapping2)[jj]] -= w * cutoff;
\r
1305 // get the associated sparse matrix
\r
1306 SparseMatrix *matrix = sparseMatrices[second][first];
\r
1308 for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) {
\r
1309 SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
\r
1310 int base = (*mapping2)[jj];
\r
1311 int rowSize = matrix->GetRowSize(jj);
\r
1313 // add in all relevant values
\r
1314 for (int ii = 0; ii < rowSize; ii++)
\r
1316 + (*mapping1)[row[ii].first]
\r
1317 * (seq2Length + 1)] += w
\r
1320 // subtract cutoff
\r
1321 for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
\r
1322 posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -=
\r
1334 return posteriorPtr;
\r