binaries/src/mafft/extensions/mxscarna_src/probconsRNA/ProbabilisticModel.h

   1 /////////////////////////////////////////////////////////////////
   2 // ProbabilisticModel.h
   3 //
   4 // Routines for (1) posterior probability computations
   5 //              (2) chained anchoring
   6 //              (3) maximum weight trace alignment
   7 /////////////////////////////////////////////////////////////////
   8
   9 #ifndef PROBABILISTICMODEL_H
  10 #define PROBABILISTICMODEL_H
  11
  12 #include <list>
  13 #include <cmath>
  14 #include <cstdio>
  15 #include "SafeVector.h"
  16 #include "ScoreType.h"
  17 #include "SparseMatrix.h"
  18 #include "MultiSequence.h"
  19 #include "StemCandidate.hpp"
  20 #include "scarna.hpp"
  21 #include "nrutil.h"
  22 #include <vector>
  23
  24 using namespace std;
  25
  26 const int NumMatchStates = 1;                                    // note that in this version the number
  27                                                                  // of match states is fixed at 1...will
  28                                                                  // change in future versions
  29 const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2;
  30
  31 /////////////////////////////////////////////////////////////////
  32 // ProbabilisticModel
  33 //
  34 // Class for storing the parameters of a probabilistic model and
  35 // performing different computations based on those parameters.
  36 // In particular, this class handles the computation of
  37 // posterior probabilities that may be used in alignment.
  38 /////////////////////////////////////////////////////////////////
  39 namespace MXSCARNA {
  40 class ProbabilisticModel {
  41
  42   float initialDistribution[NumMatrixTypes];               // holds the initial probabilities for each state
  43   float transProb[NumMatrixTypes][NumMatrixTypes];         // holds all state-to-state transition probabilities
  44   float matchProb[256][256];                               // emission probabilities for match states
  45   float insProb[256][NumMatrixTypes];                      // emission probabilities for insert states
  46   NRMat<float> WM;
  47
  48  public:
  49
  50   /////////////////////////////////////////////////////////////////
  51   // ProbabilisticModel::ProbabilisticModel()
  52   //
  53   // Constructor.  Builds a new probabilistic model using the
  54   // given parameters.
  55   /////////////////////////////////////////////////////////////////
  56
  57   ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend,
  58                       const VVF &emitPairs, const VF &emitSingle){
  59
  60     // build transition matrix
  61     VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f));
  62     transMat[0][0] = 1;
  63     for (int i = 0; i < NumInsertStates; i++){
  64       transMat[0][2*i+1] = gapOpen[2*i];
  65       transMat[0][2*i+2] = gapOpen[2*i+1];
  66       transMat[0][0] -= (gapOpen[2*i] + gapOpen[2*i+1]);
  67       assert (transMat[0][0] > 0);
  68       transMat[2*i+1][2*i+1] = gapExtend[2*i];
  69       transMat[2*i+2][2*i+2] = gapExtend[2*i+1];
  70       transMat[2*i+1][2*i+2] = 0;
  71       transMat[2*i+2][2*i+1] = 0;
  72       transMat[2*i+1][0] = 1 - gapExtend[2*i];
  73       transMat[2*i+2][0] = 1 - gapExtend[2*i+1];
  74     }
  75
  76     // create initial and transition probability matrices
  77     for (int i = 0; i < NumMatrixTypes; i++){
  78       initialDistribution[i] = LOG (initDistribMat[i]);
  79       for (int j = 0; j < NumMatrixTypes; j++)
  80         transProb[i][j] = LOG (transMat[i][j]);
  81     }
  82
  83     // create insertion and match probability matrices
  84     for (int i = 0; i < 256; i++){
  85       for (int j = 0; j < NumMatrixTypes; j++)
  86         insProb[i][j] = LOG (emitSingle[i]);
  87       for (int j = 0; j < 256; j++)
  88         matchProb[i][j] = LOG (emitPairs[i][j]);
  89     }
  90   }
  91
  92   NRMat<float> weightMatchScore(std::vector<StemCandidate> *pscs1, std::vector<StemCandidate> *pscs2,
  93                         std::vector<int> *matchPSCS1, std::vector<int> *matchPSCS2, NRMat<float> WM) {
  94       int len      = WORDLENGTH;
  95       int size     = matchPSCS1->size();
  96       float weight = 1000;
  97
  98       for(int iter = 0; iter < size; iter++) {
  99           int i = matchPSCS1->at(iter);
 100           int j = matchPSCS2->at(iter);
 101
 102           const StemCandidate &sc1 = pscs1->at(i);
 103           const StemCandidate &sc2 = pscs2->at(j);
 104
 105           for(int k = 0; k < len; k++) {
 106               WM[sc1.GetPosition() + k][sc2.GetPosition() + k] += weight;
 107 //            sumWeight += weight;
 108           }
 109       }
 110       return WM;
 111   }
 112
 113   /////////////////////////////////////////////////////////////////
 114   // ProbabilisticModel::ComputeForwardMatrix()
 115   //
 116   // Computes a set of forward probability matrices for aligning
 117   // seq1 and seq2.
 118   //
 119   // For efficiency reasons, a single-dimensional floating-point
 120   // array is used here, with the following indexing scheme:
 121   //
 122   //    forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
 123   //    refers to the probability of aligning through j characters
 124   //    of the first sequence, k characters of the second sequence,
 125   //    and ending in state i.
 126   /////////////////////////////////////////////////////////////////
 127
 128   VF *ComputeForwardMatrix (Sequence *seq1, Sequence *seq2) const {
 129
 130     assert (seq1);
 131     assert (seq2);
 132
 133     const int seq1Length = seq1->GetLength();
 134     const int seq2Length = seq2->GetLength();
 135
 136     // retrieve the points to the beginning of each sequence
 137     SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
 138     SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
 139
 140     // create matrix
 141     VF *forwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
 142     assert (forwardPtr);
 143     VF &forward = *forwardPtr;
 144
 145     // initialization condition
 146     forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] =
 147       initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
 148
 149     for (int k = 0; k < NumInsertStates; k++){
 150       forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] =
 151         initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k];
 152       forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] =
 153         initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k];
 154     }
 155
 156     // remember offset for each index combination
 157     int ij = 0;
 158     int i1j = -seq2Length - 1;
 159     int ij1 = -1;
 160     int i1j1 = -seq2Length - 2;
 161
 162     ij *= NumMatrixTypes;
 163     i1j *= NumMatrixTypes;
 164     ij1 *= NumMatrixTypes;
 165     i1j1 *= NumMatrixTypes;
 166
 167     // compute forward scores
 168     for (int i = 0; i <= seq1Length; i++){
 169       unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
 170       for (int j = 0; j <= seq2Length; j++){
 171         unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
 172
 173         if (i > 1 || j > 1){
 174           if (i > 0 && j > 0){
 175             forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
 176             for (int k = 1; k < NumMatrixTypes; k++)
 177               LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]);
 178             forward[0 + ij] += matchProb[c1][c2];
 179           }
 180           if (i > 0){
 181             for (int k = 0; k < NumInsertStates; k++)
 182               forward[2*k+1 + ij] = insProb[c1][k] +
 183                 LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1],
 184                          forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1]);
 185           }
 186           if (j > 0){
 187             for (int k = 0; k < NumInsertStates; k++)
 188               forward[2*k+2 + ij] = insProb[c2][k] +
 189                 LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2],
 190                          forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2]);
 191           }
 192         }
 193
 194         ij += NumMatrixTypes;
 195         i1j += NumMatrixTypes;
 196         ij1 += NumMatrixTypes;
 197         i1j1 += NumMatrixTypes;
 198       }
 199     }
 200
 201     return forwardPtr;
 202   }
 203
 204   /////////////////////////////////////////////////////////////////
 205   // ProbabilisticModel::ComputeBackwardMatrix()
 206   //
 207   // Computes a set of backward probability matrices for aligning
 208   // seq1 and seq2.
 209   //
 210   // For efficiency reasons, a single-dimensional floating-point
 211   // array is used here, with the following indexing scheme:
 212   //
 213   //    backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
 214   //    refers to the probability of starting in state i and
 215   //    aligning from character j+1 to the end of the first
 216   //    sequence and from character k+1 to the end of the second
 217   //    sequence.
 218   /////////////////////////////////////////////////////////////////
 219
 220   VF *ComputeBackwardMatrix (Sequence *seq1, Sequence *seq2) const {
 221
 222     assert (seq1);
 223     assert (seq2);
 224
 225     const int seq1Length = seq1->GetLength();
 226     const int seq2Length = seq2->GetLength();
 227     SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
 228     SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
 229
 230     // create matrix
 231     VF *backwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
 232     assert (backwardPtr);
 233     VF &backward = *backwardPtr;
 234
 235     // initialization condition
 236     for (int k = 0; k < NumMatrixTypes; k++)
 237       backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k];
 238
 239     // remember offset for each index combination
 240     int ij = (seq1Length+1) * (seq2Length+1) - 1;
 241     int i1j = ij + seq2Length + 1;
 242     int ij1 = ij + 1;
 243     int i1j1 = ij + seq2Length + 2;
 244
 245     ij *= NumMatrixTypes;
 246     i1j *= NumMatrixTypes;
 247     ij1 *= NumMatrixTypes;
 248     i1j1 *= NumMatrixTypes;
 249
 250     // compute backward scores
 251     for (int i = seq1Length; i >= 0; i--){
 252       unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1];
 253       for (int j = seq2Length; j >= 0; j--){
 254         unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1];
 255
 256         if (i < seq1Length && j < seq2Length){
 257           const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
 258           for (int k = 0; k < NumMatrixTypes; k++)
 259             LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]);
 260         }
 261         if (i < seq1Length){
 262           for (int k = 0; k < NumInsertStates; k++){
 263             LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[0][2*k+1]);
 264             LOG_PLUS_EQUALS (backward[2*k+1 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[2*k+1][2*k+1]);
 265           }
 266         }
 267         if (j < seq2Length){
 268           for (int k = 0; k < NumInsertStates; k++){
 269             LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[0][2*k+2]);
 270             LOG_PLUS_EQUALS (backward[2*k+2 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[2*k+2][2*k+2]);
 271           }
 272         }
 273
 274         ij -= NumMatrixTypes;
 275         i1j -= NumMatrixTypes;
 276         ij1 -= NumMatrixTypes;
 277         i1j1 -= NumMatrixTypes;
 278       }
 279     }
 280
 281     return backwardPtr;
 282   }
 283
 284   /////////////////////////////////////////////////////////////////
 285   // ProbabilisticModel::ComputeTotalProbability()
 286   //
 287   // Computes the total probability of an alignment given
 288   // the forward and backward matrices.
 289   /////////////////////////////////////////////////////////////////
 290
 291   float ComputeTotalProbability (int seq1Length, int seq2Length,
 292                                  const VF &forward, const VF &backward) const {
 293
 294     // compute total probability
 295     float totalForwardProb = LOG_ZERO;
 296     float totalBackwardProb = LOG_ZERO;
 297     for (int k = 0; k < NumMatrixTypes; k++){
 298       LOG_PLUS_EQUALS (totalForwardProb,
 299                        forward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
 300                        backward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
 301     }
 302
 303     totalBackwardProb =
 304       forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
 305       backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)];
 306
 307     for (int k = 0; k < NumInsertStates; k++){
 308       LOG_PLUS_EQUALS (totalBackwardProb,
 309                        forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
 310                        backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)]);
 311       LOG_PLUS_EQUALS (totalBackwardProb,
 312                        forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
 313                        backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)]);
 314     }
 315
 316     //    cerr << totalForwardProb << " " << totalBackwardProb << endl;
 317
 318     return (totalForwardProb + totalBackwardProb) / 2;
 319   }
 320
 321   /////////////////////////////////////////////////////////////////
 322   // ProbabilisticModel::ComputePosteriorMatrix()
 323   //
 324   // Computes the posterior probability matrix based on
 325   // the forward and backward matrices.
 326   /////////////////////////////////////////////////////////////////
 327
 328   VF *ComputePosteriorMatrix (Sequence *seq1, Sequence *seq2,
 329                               const VF &forward, const VF &backward) const {
 330
 331     assert (seq1);
 332     assert (seq2);
 333
 334     const int seq1Length = seq1->GetLength();
 335     const int seq2Length = seq2->GetLength();
 336
 337     float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
 338                                                forward, backward);
 339
 340     // compute posterior matrices
 341     VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr);
 342     VF &posterior = *posteriorPtr;
 343
 344     int ij = 0;
 345     VF::iterator ptr = posterior.begin();
 346
 347     for (int i = 0; i <= seq1Length; i++){
 348       for (int j = 0; j <= seq2Length; j++){
 349         *(ptr++) = EXP (min (LOG_ONE, forward[ij] + backward[ij] - totalProb));
 350         ij += NumMatrixTypes;
 351       }
 352     }
 353
 354     posterior[0] = 0;
 355
 356     return posteriorPtr;
 357   }
 358
 359   /*
 360   /////////////////////////////////////////////////////////////////
 361   // ProbabilisticModel::ComputeExpectedCounts()
 362   //
 363   // Computes the expected counts for the various transitions.
 364   /////////////////////////////////////////////////////////////////
 365
 366   VVF *ComputeExpectedCounts () const {
 367
 368     assert (seq1);
 369     assert (seq2);
 370
 371     const int seq1Length = seq1->GetLength();
 372     const int seq2Length = seq2->GetLength();
 373     SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
 374     SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
 375
 376     // compute total probability
 377     float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
 378                                                forward, backward);
 379
 380     // initialize expected counts
 381     VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr);
 382     VVF &counts = *countsPtr;
 383
 384     // remember offset for each index combination
 385     int ij = 0;
 386     int i1j = -seq2Length - 1;
 387     int ij1 = -1;
 388     int i1j1 = -seq2Length - 2;
 389
 390     ij *= NumMatrixTypes;
 391     i1j *= NumMatrixTypes;
 392     ij1 *= NumMatrixTypes;
 393     i1j1 *= NumMatrixTypes;
 394
 395     // compute expected counts
 396     for (int i = 0; i <= seq1Length; i++){
 397       unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
 398       for (int j = 0; j <= seq2Length; j++){
 399         unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
 400
 401         if (i > 0 && j > 0){
 402           for (int k = 0; k < NumMatrixTypes; k++)
 403             LOG_PLUS_EQUALS (counts[k][0],
 404                              forward[k + i1j1] + transProb[k][0] +
 405                              matchProb[c1][c2] + backward[0 + ij]);
 406         }
 407         if (i > 0){
 408           for (int k = 0; k < NumInsertStates; k++){
 409             LOG_PLUS_EQUALS (counts[0][2*k+1],
 410                              forward[0 + i1j] + transProb[0][2*k+1] +
 411                              insProb[c1][k] + backward[2*k+1 + ij]);
 412             LOG_PLUS_EQUALS (counts[2*k+1][2*k+1],
 413                              forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
 414                              insProb[c1][k] + backward[2*k+1 + ij]);
 415           }
 416         }
 417         if (j > 0){
 418           for (int k = 0; k < NumInsertStates; k++){
 419             LOG_PLUS_EQUALS (counts[0][2*k+2],
 420                              forward[0 + ij1] + transProb[0][2*k+2] +
 421                              insProb[c2][k] + backward[2*k+2 + ij]);
 422             LOG_PLUS_EQUALS (counts[2*k+2][2*k+2],
 423                              forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
 424                              insProb[c2][k] + backward[2*k+2 + ij]);
 425           }
 426         }
 427
 428         ij += NumMatrixTypes;
 429         i1j += NumMatrixTypes;
 430         ij1 += NumMatrixTypes;
 431         i1j1 += NumMatrixTypes;
 432       }
 433     }
 434
 435     // scale all expected counts appropriately
 436     for (int i = 0; i < NumMatrixTypes; i++)
 437       for (int j = 0; j < NumMatrixTypes; j++)
 438         counts[i][j] -= totalProb;
 439
 440   }
 441   */
 442
 443   /////////////////////////////////////////////////////////////////
 444   // ProbabilisticModel::ComputeNewParameters()
 445   //
 446   // Computes a new parameter set based on the expected counts
 447   // given.
 448   /////////////////////////////////////////////////////////////////
 449
 450   void ComputeNewParameters (Sequence *seq1, Sequence *seq2,
 451                              const VF &forward, const VF &backward,
 452                              VF &initDistribMat, VF &gapOpen,
 453                              VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
 454
 455     assert (seq1);
 456     assert (seq2);
 457
 458     const int seq1Length = seq1->GetLength();
 459     const int seq2Length = seq2->GetLength();
 460     SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
 461     SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
 462
 463     // compute total probability
 464     float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
 465                                                forward, backward);
 466
 467     // initialize expected counts
 468     VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO));
 469     VF initCounts (NumMatrixTypes, LOG_ZERO);
 470     VVF pairCounts (256, VF (256, LOG_ZERO));
 471     VF singleCounts (256, LOG_ZERO);
 472
 473     // remember offset for each index combination
 474     int ij = 0;
 475     int i1j = -seq2Length - 1;
 476     int ij1 = -1;
 477     int i1j1 = -seq2Length - 2;
 478
 479     ij *= NumMatrixTypes;
 480     i1j *= NumMatrixTypes;
 481     ij1 *= NumMatrixTypes;
 482     i1j1 *= NumMatrixTypes;
 483
 484     // compute initial distribution posteriors
 485     initCounts[0] = LOG_ADD (forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
 486                              backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)],
 487                              forward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
 488                              backward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
 489     for (int k = 0; k < NumInsertStates; k++){
 490       initCounts[2*k+1] = LOG_ADD (forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
 491                                    backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)],
 492                                    forward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
 493                                    backward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
 494       initCounts[2*k+2] = LOG_ADD (forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
 495                                    backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)],
 496                                    forward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
 497                                    backward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
 498     }
 499
 500     // compute expected counts
 501     for (int i = 0; i <= seq1Length; i++){
 502       unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper(iter1[i]);
 503       for (int j = 0; j <= seq2Length; j++){
 504         unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper(iter2[j]);
 505
 506         if (i > 0 && j > 0){
 507           if (enableTrainEmissions && i == 1 && j == 1){
 508             LOG_PLUS_EQUALS (pairCounts[c1][c2],
 509                              initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]);
 510             LOG_PLUS_EQUALS (pairCounts[c2][c1],
 511                              initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]);
 512           }
 513
 514           for (int k = 0; k < NumMatrixTypes; k++){
 515             LOG_PLUS_EQUALS (transCounts[k][0],
 516                              forward[k + i1j1] + transProb[k][0] +
 517                              matchProb[c1][c2] + backward[0 + ij]);
 518             if (enableTrainEmissions && i != 1 || j != 1){
 519               LOG_PLUS_EQUALS (pairCounts[c1][c2],
 520                                forward[k + i1j1] + transProb[k][0] +
 521                                matchProb[c1][c2] + backward[0 + ij]);
 522               LOG_PLUS_EQUALS (pairCounts[c2][c1],
 523                                forward[k + i1j1] + transProb[k][0] +
 524                                matchProb[c2][c1] + backward[0 + ij]);
 525             }
 526           }
 527         }
 528         if (i > 0){
 529           for (int k = 0; k < NumInsertStates; k++){
 530             LOG_PLUS_EQUALS (transCounts[0][2*k+1],
 531                              forward[0 + i1j] + transProb[0][2*k+1] +
 532                              insProb[c1][k] + backward[2*k+1 + ij]);
 533             LOG_PLUS_EQUALS (transCounts[2*k+1][2*k+1],
 534                              forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
 535                              insProb[c1][k] + backward[2*k+1 + ij]);
 536             if (enableTrainEmissions){
 537               if (i == 1 && j == 0){
 538                 LOG_PLUS_EQUALS (singleCounts[c1],
 539                                  initialDistribution[2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]);
 540               }
 541               else {
 542                 LOG_PLUS_EQUALS (singleCounts[c1],
 543                                  forward[0 + i1j] + transProb[0][2*k+1] +
 544                                  insProb[c1][k] + backward[2*k+1 + ij]);
 545                 LOG_PLUS_EQUALS (singleCounts[c1],
 546                                  forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
 547                                  insProb[c1][k] + backward[2*k+1 + ij]);
 548               }
 549             }
 550           }
 551         }
 552         if (j > 0){
 553           for (int k = 0; k < NumInsertStates; k++){
 554             LOG_PLUS_EQUALS (transCounts[0][2*k+2],
 555                              forward[0 + ij1] + transProb[0][2*k+2] +
 556                              insProb[c2][k] + backward[2*k+2 + ij]);
 557             LOG_PLUS_EQUALS (transCounts[2*k+2][2*k+2],
 558                              forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
 559                              insProb[c2][k] + backward[2*k+2 + ij]);
 560             if (enableTrainEmissions){
 561               if (i == 0 && j == 1){
 562                 LOG_PLUS_EQUALS (singleCounts[c2],
 563                                  initialDistribution[2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]);
 564               }
 565               else {
 566                 LOG_PLUS_EQUALS (singleCounts[c2],
 567                                  forward[0 + ij1] + transProb[0][2*k+2] +
 568                                  insProb[c2][k] + backward[2*k+2 + ij]);
 569                 LOG_PLUS_EQUALS (singleCounts[c2],
 570                                  forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
 571                                  insProb[c2][k] + backward[2*k+2 + ij]);
 572               }
 573             }
 574           }
 575         }
 576
 577         ij += NumMatrixTypes;
 578         i1j += NumMatrixTypes;
 579         ij1 += NumMatrixTypes;
 580         i1j1 += NumMatrixTypes;
 581       }
 582     }
 583
 584     // scale all expected counts appropriately
 585     for (int i = 0; i < NumMatrixTypes; i++){
 586       initCounts[i] -= totalProb;
 587       for (int j = 0; j < NumMatrixTypes; j++)
 588         transCounts[i][j] -= totalProb;
 589     }
 590     if (enableTrainEmissions){
 591       for (int i = 0; i < 256; i++){
 592         for (int j = 0; j < 256; j++)
 593           pairCounts[i][j] -= totalProb;
 594         singleCounts[i] -= totalProb;
 595       }
 596     }
 597
 598     // compute new initial distribution
 599     float totalInitDistribCounts = 0;
 600     for (int i = 0; i < NumMatrixTypes; i++)
 601       totalInitDistribCounts += exp (initCounts[i]); // should be 2
 602     initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts));
 603     for (int k = 0; k < NumInsertStates; k++){
 604       float val = (exp (initCounts[2*k+1]) + exp (initCounts[2*k+2])) / 2;
 605       initDistribMat[2*k+1] = initDistribMat[2*k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts));
 606     }
 607
 608     // compute total counts for match state
 609     float inMatchStateCounts = 0;
 610     for (int i = 0; i < NumMatrixTypes; i++)
 611       inMatchStateCounts += exp (transCounts[0][i]);
 612     for (int i = 0; i < NumInsertStates; i++){
 613
 614       // compute total counts for gap state
 615       float inGapStateCounts =
 616         exp (transCounts[2*i+1][0]) +
 617         exp (transCounts[2*i+1][2*i+1]) +
 618         exp (transCounts[2*i+2][0]) +
 619         exp (transCounts[2*i+2][2*i+2]);
 620
 621       gapOpen[2*i] = gapOpen[2*i+1] =
 622         (exp (transCounts[0][2*i+1]) +
 623          exp (transCounts[0][2*i+2])) /
 624         (2 * inMatchStateCounts);
 625
 626       gapExtend[2*i] = gapExtend[2*i+1] =
 627         (exp (transCounts[2*i+1][2*i+1]) +
 628          exp (transCounts[2*i+2][2*i+2])) /
 629         inGapStateCounts;
 630     }
 631
 632     if (enableTrainEmissions){
 633       float totalPairCounts = 0;
 634       float totalSingleCounts = 0;
 635       for (int i = 0; i < 256; i++){
 636         for (int j = 0; j <= i; j++)
 637           totalPairCounts += exp (pairCounts[j][i]);
 638         totalSingleCounts += exp (singleCounts[i]);
 639       }
 640
 641       for (int i = 0; i < 256; i++) if (!islower ((char) i)){
 642         int li = (int)((unsigned char) tolower ((char) i));
 643         for (int j = 0; j <= i; j++) if (!islower ((char) j)){
 644           int lj = (int)((unsigned char) tolower ((char) j));
 645           emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] =
 646             emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts;
 647         }
 648         emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts;
 649       }
 650     }
 651   }
 652
 653   /////////////////////////////////////////////////////////////////
 654   // ProbabilisticModel::ComputeAlignment()
 655   //
 656   // Computes an alignment based on the given posterior matrix.
 657   // This is done by finding the maximum summing path (or
 658   // maximum weight trace) through the posterior matrix.  The
 659   // final alignment is returned as a pair consisting of:
 660   //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
 661   //        denote insertions in one of the two sequences and
 662   //        B's denote that both sequences are present (i.e.
 663   //        matches).
 664   //    (2) a float indicating the sum achieved
 665   /////////////////////////////////////////////////////////////////
 666
 667   pair<SafeVector<char> *, float> ComputeAlignment (int seq1Length, int seq2Length, const VF &posterior) const {
 668
 669     float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows);
 670     float *oldRow = twoRows;
 671     float *newRow = twoRows + seq2Length + 1;
 672
 673     char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
 674     char *tracebackPtr = tracebackMatrix;
 675
 676     VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
 677
 678     // initialization
 679     for (int i = 0; i <= seq2Length; i++){
 680       oldRow[i] = 0;
 681       *(tracebackPtr++) = 'L';
 682     }
 683
 684     // fill in matrix
 685     for (int i = 1; i <= seq1Length; i++){
 686
 687       // initialize left column
 688       newRow[0] = 0;
 689       posteriorPtr++;
 690       *(tracebackPtr++) = 'U';
 691
 692       // fill in rest of row
 693       for (int j = 1; j <= seq2Length; j++){
 694         ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1], newRow[j-1], oldRow[j],
 695                            'D', 'L', 'U', &newRow[j], tracebackPtr++); // Match, insert, delete
 696       }
 697
 698       // swap rows
 699       float *temp = oldRow;
 700       oldRow = newRow;
 701       newRow = temp;
 702     }
 703
 704     // store best score
 705     float total = oldRow[seq2Length];
 706     delete [] twoRows;
 707
 708     // compute traceback
 709     SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
 710     int r = seq1Length, c = seq2Length;
 711     while (r != 0 || c != 0){
 712       char ch = tracebackMatrix[r*(seq2Length+1) + c];
 713       switch (ch){
 714       case 'L': c--; alignment->push_back ('Y'); break;
 715       case 'U': r--; alignment->push_back ('X'); break;
 716       case 'D': c--; r--; alignment->push_back ('B'); break;
 717       default: assert (false);
 718       }
 719     }
 720
 721     delete [] tracebackMatrix;
 722
 723     reverse (alignment->begin(), alignment->end());
 724
 725     return make_pair(alignment, total);
 726   }
 727
 728   /////////////////////////////////////////////////////////////////
 729   // ProbabilisticModel::ComputeAlignment2()
 730   //
 731   // Computes an alignment based on the given posterior matrix.
 732   // This is done by finding the maximum summing path (or
 733   // maximum weight trace) through the posterior matrix.  The
 734   // final alignment is returned as a pair consisting of:
 735   //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
 736   //        denote insertions in one of the two sequences and
 737   //        B's denote that both sequences are present (i.e.
 738   //        matches).
 739   //    (2) a float indicating the sum achieved
 740   /////////////////////////////////////////////////////////////////
 741
 742   pair<SafeVector<char> *, float> ComputeAlignment2 (int seq1Length, int seq2Length,
 743                                                      const VF &posterior, std::vector<StemCandidate> *pscs1, std::vector<StemCandidate> *pscs2,
 744                                                      std::vector<int> *matchPSCS1, std::vector<int> *matchPSCS2) const {
 745     NRMat<float> WM(seq1Length + 1, seq2Length + 1);
 746     for (int i = 0; i <= seq1Length; i++) {
 747         for (int j = 0; j <= seq2Length; j++) {
 748             WM[i][j] = 0;
 749         }
 750     }
 751
 752     int len      = WORDLENGTH;
 753     int size     = matchPSCS1->size();
 754     float weight = 1000;
 755
 756     for(int iter = 0; iter < size; iter++) {
 757         int i = matchPSCS1->at(iter);
 758         int j = matchPSCS2->at(iter);
 759
 760         const StemCandidate &sc1 = pscs1->at(i);
 761         const StemCandidate &sc2 = pscs2->at(j);
 762         for(int k = 0; k < len; k++) {
 763             WM[sc1.GetPosition() + k][sc2.GetPosition() + k] += weight;
 764         }
 765     }
 766     float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows);
 767     float *oldRow = twoRows;
 768     float *newRow = twoRows + seq2Length + 1;
 769
 770     char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
 771     char *tracebackPtr = tracebackMatrix;
 772
 773     VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
 774
 775     // initialization
 776     for (int i = 0; i <= seq2Length; i++){
 777       oldRow[i] = 0;
 778       *(tracebackPtr++) = 'L';
 779     }
 780
 781     // fill in matrix
 782     for (int i = 1; i <= seq1Length; i++){
 783
 784       // initialize left column
 785       newRow[0] = 0;
 786       posteriorPtr++;
 787       *(tracebackPtr++) = 'U';
 788
 789       // fill in rest of row
 790       for (int j = 1; j <= seq2Length; j++){
 791         ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1] + WM[i][j], newRow[j-1], oldRow[j],
 792                            'D', 'L', 'U', &newRow[j], tracebackPtr++);
 793       }
 794
 795       // swap rows
 796       float *temp = oldRow;
 797       oldRow = newRow;
 798       newRow = temp;
 799     }
 800
 801     // store best score
 802     float total = oldRow[seq2Length];
 803     delete [] twoRows;
 804
 805     // compute traceback
 806     SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
 807     int r = seq1Length, c = seq2Length;
 808     while (r != 0 || c != 0){
 809       char ch = tracebackMatrix[r*(seq2Length+1) + c];
 810       switch (ch){
 811       case 'L': c--; alignment->push_back ('Y'); break;
 812       case 'U': r--; alignment->push_back ('X'); break;
 813       case 'D': c--; r--; alignment->push_back ('B'); break;
 814       default: assert (false);
 815       }
 816     }
 817
 818     delete [] tracebackMatrix;
 819
 820     reverse (alignment->begin(), alignment->end());
 821
 822     return make_pair(alignment, total);
 823   }
 824
 825   /////////////////////////////////////////////////////////////////
 826   // ProbabilisticModel::ComputeAlignmentWithGapPenalties()
 827   //
 828   // Similar to ComputeAlignment() except with gap penalties.
 829   /////////////////////////////////////////////////////////////////
 830
 831   pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties (MultiSequence *align1,
 832                                                                     MultiSequence *align2,
 833                                                                     const VF &posterior, int numSeqs1,
 834                                                                     int numSeqs2,
 835                                                                     float gapOpenPenalty,
 836                                                                     float gapContinuePenalty) const {
 837     int seq1Length = align1->GetSequence(0)->GetLength();
 838     int seq2Length = align2->GetSequence(0)->GetLength();
 839     SafeVector<SafeVector<char>::iterator > dataPtrs1 (align1->GetNumSequences());
 840     SafeVector<SafeVector<char>::iterator > dataPtrs2 (align2->GetNumSequences());
 841
 842     // grab character data
 843     for (int i = 0; i < align1->GetNumSequences(); i++)
 844       dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr();
 845     for (int i = 0; i < align2->GetNumSequences(); i++)
 846       dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr();
 847
 848     // the number of active sequences at any given column is defined to be the
 849     // number of non-gap characters in that column; the number of gap opens at
 850     // any given column is defined to be the number of gap characters in that
 851     // column where the previous character in the respective sequence was not
 852     // a gap
 853     SafeVector<int> numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1);
 854     SafeVector<int> numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1);
 855
 856     // compute number of active sequences and gap opens for each group
 857     for (int i = 0; i < align1->GetNumSequences(); i++){
 858       SafeVector<char>::iterator dataPtr = align1->GetSequence(i)->GetDataPtr();
 859       numActive1[0] = numGapOpens1[0] = 0;
 860       for (int j = 1; j <= seq1Length; j++){
 861         if (dataPtr[j] != '-'){
 862           numActive1[j]++;
 863           numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-');
 864         }
 865       }
 866     }
 867     for (int i = 0; i < align2->GetNumSequences(); i++){
 868       SafeVector<char>::iterator dataPtr = align2->GetSequence(i)->GetDataPtr();
 869       numActive2[0] = numGapOpens2[0] = 0;
 870       for (int j = 1; j <= seq2Length; j++){
 871         if (dataPtr[j] != '-'){
 872           numActive2[j]++;
 873           numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-');
 874         }
 875       }
 876     }
 877
 878     VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1));
 879     VF continuingPenalty1 (numSeqs1+1);
 880     VVF openingPenalty2 (numSeqs1+1, VF (numSeqs2+1));
 881     VF continuingPenalty2 (numSeqs2+1);
 882
 883     // precompute penalties
 884     for (int i = 0; i <= numSeqs1; i++)
 885       for (int j = 0; j <= numSeqs2; j++)
 886         openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j));
 887     for (int i = 0; i <= numSeqs1; i++)
 888       continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
 889     for (int i = 0; i <= numSeqs2; i++)
 890       for (int j = 0; j <= numSeqs1; j++)
 891         openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j));
 892     for (int i = 0; i <= numSeqs2; i++)
 893       continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
 894
 895     float *twoRows = new float[6*(seq2Length+1)]; assert (twoRows);
 896     float *oldRowMatch = twoRows;
 897     float *newRowMatch = twoRows + (seq2Length+1);
 898     float *oldRowInsertX = twoRows + 2*(seq2Length+1);
 899     float *newRowInsertX = twoRows + 3*(seq2Length+1);
 900     float *oldRowInsertY = twoRows + 4*(seq2Length+1);
 901     float *newRowInsertY = twoRows + 5*(seq2Length+1);
 902
 903     char *tracebackMatrix = new char[3*(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
 904     char *tracebackPtr = tracebackMatrix;
 905
 906     VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
 907
 908     // initialization
 909     for (int i = 0; i <= seq2Length; i++){
 910       oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
 911       oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]];
 912       *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'Y';
 913       tracebackPtr += 3;
 914     }
 915
 916     // fill in matrix
 917     for (int i = 1; i <= seq1Length; i++){
 918
 919       // initialize left column
 920       newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
 921       newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]];
 922       posteriorPtr++;
 923       *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'X';
 924       tracebackPtr += 3;
 925
 926       // fill in rest of row
 927       for (int j = 1; j <= seq2Length; j++){
 928
 929         // going to MATCH state
 930         ChooseBestOfThree (oldRowMatch[j-1],
 931                            oldRowInsertX[j-1],
 932                            oldRowInsertY[j-1],
 933                            'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++);
 934         newRowMatch[j] += *(posteriorPtr++);
 935
 936         // going to INSERT X state
 937         ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
 938                            oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
 939                            oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
 940                            'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
 941
 942         // going to INSERT Y state
 943         ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
 944                            newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
 945                            newRowInsertY[j-1] + continuingPenalty2[numActive2[j]],
 946                            'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++);
 947       }
 948
 949       // swap rows
 950       float *temp;
 951       temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp;
 952       temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp;
 953       temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp;
 954     }
 955
 956     // store best score
 957     float total;
 958     char matrix;
 959     ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length],
 960                        'M', 'X', 'Y', &total, &matrix);
 961
 962     delete [] twoRows;
 963
 964     // compute traceback
 965     SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
 966     int r = seq1Length, c = seq2Length;
 967     while (r != 0 || c != 0){
 968
 969       int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
 970       char ch = tracebackMatrix[(r*(seq2Length+1) + c) * 3 + offset];
 971       switch (matrix){
 972       case 'Y': c--; alignment->push_back ('Y'); break;
 973       case 'X': r--; alignment->push_back ('X'); break;
 974       case 'M': c--; r--; alignment->push_back ('B'); break;
 975       default: assert (false);
 976       }
 977       matrix = ch;
 978     }
 979
 980     delete [] tracebackMatrix;
 981
 982     reverse (alignment->begin(), alignment->end());
 983
 984     return make_pair(alignment, 1.0f);
 985   }
 986
 987
 988   /////////////////////////////////////////////////////////////////
 989   // ProbabilisticModel::ComputeViterbiAlignment()
 990   //
 991   // Computes the highest probability pairwise alignment using the
 992   // probabilistic model.  The final alignment is returned as a
 993   //  pair consisting of:
 994   //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
 995   //        denote insertions in one of the two sequences and
 996   //        B's denote that both sequences are present (i.e.
 997   //        matches).
 998   //    (2) a float containing the log probability of the best
 999   //        alignment (not used)
1000   /////////////////////////////////////////////////////////////////
1001
1002   pair<SafeVector<char> *, float> ComputeViterbiAlignment (Sequence *seq1, Sequence *seq2) const {
1003
1004     assert (seq1);
1005     assert (seq2);
1006
1007     const int seq1Length = seq1->GetLength();
1008     const int seq2Length = seq2->GetLength();
1009
1010     // retrieve the points to the beginning of each sequence
1011     SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
1012     SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
1013
1014     // create viterbi matrix
1015     VF *viterbiPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
1016     assert (viterbiPtr);
1017     VF &viterbi = *viterbiPtr;
1018
1019     // create traceback matrix
1020     VI *tracebackPtr = new VI (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), -1);
1021     assert (tracebackPtr);
1022     VI &traceback = *tracebackPtr;
1023
1024     // initialization condition
1025     for (int k = 0; k < NumMatrixTypes; k++)
1026       viterbi[k] = initialDistribution[k];
1027
1028     // remember offset for each index combination
1029     int ij = 0;
1030     int i1j = -seq2Length - 1;
1031     int ij1 = -1;
1032     int i1j1 = -seq2Length - 2;
1033
1034     ij *= NumMatrixTypes;
1035     i1j *= NumMatrixTypes;
1036     ij1 *= NumMatrixTypes;
1037     i1j1 *= NumMatrixTypes;
1038
1039     // compute viterbi scores
1040     for (int i = 0; i <= seq1Length; i++){
1041       unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
1042       for (int j = 0; j <= seq2Length; j++){
1043         unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
1044
1045         if (i > 0 && j > 0){
1046           for (int k = 0; k < NumMatrixTypes; k++){
1047             float newVal = viterbi[k + i1j1] + transProb[k][0] + matchProb[c1][c2];
1048             if (viterbi[0 + ij] < newVal){
1049               viterbi[0 + ij] = newVal;
1050               traceback[0 + ij] = k;
1051             }
1052           }
1053         }
1054         if (i > 0){
1055           for (int k = 0; k < NumInsertStates; k++){
1056             float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + transProb[0][2*k+1];
1057             float valFromIns = insProb[c1][k] + viterbi[2*k+1 + i1j] + transProb[2*k+1][2*k+1];
1058             if (valFromMatch >= valFromIns){
1059               viterbi[2*k+1 + ij] = valFromMatch;
1060               traceback[2*k+1 + ij] = 0;
1061             }
1062             else {
1063               viterbi[2*k+1 + ij] = valFromIns;
1064               traceback[2*k+1 + ij] = 2*k+1;
1065             }
1066           }
1067         }
1068         if (j > 0){
1069           for (int k = 0; k < NumInsertStates; k++){
1070             float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + transProb[0][2*k+2];
1071             float valFromIns = insProb[c2][k] + viterbi[2*k+2 + ij1] + transProb[2*k+2][2*k+2];
1072             if (valFromMatch >= valFromIns){
1073               viterbi[2*k+2 + ij] = valFromMatch;
1074               traceback[2*k+2 + ij] = 0;
1075             }
1076             else {
1077               viterbi[2*k+2 + ij] = valFromIns;
1078               traceback[2*k+2 + ij] = 2*k+2;
1079             }
1080           }
1081         }
1082
1083         ij += NumMatrixTypes;
1084         i1j += NumMatrixTypes;
1085         ij1 += NumMatrixTypes;
1086         i1j1 += NumMatrixTypes;
1087       }
1088     }
1089
1090     // figure out best terminating cell
1091     float bestProb = LOG_ZERO;
1092     int state = -1;
1093     for (int k = 0; k < NumMatrixTypes; k++){
1094       float thisProb = viterbi[k + NumMatrixTypes * ((seq1Length+1)*(seq2Length+1) - 1)] + initialDistribution[k];
1095       if (bestProb < thisProb){
1096         bestProb = thisProb;
1097         state = k;
1098       }
1099     }
1100     assert (state != -1);
1101
1102     delete viterbiPtr;
1103
1104     // compute traceback
1105     SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
1106     int r = seq1Length, c = seq2Length;
1107     while (r != 0 || c != 0){
1108       int newState = traceback[state + NumMatrixTypes * (r * (seq2Length+1) + c)];
1109
1110       if (state == 0){ c--; r--; alignment->push_back ('B'); }
1111       else if (state % 2 == 1){ r--; alignment->push_back ('X'); }
1112       else { c--; alignment->push_back ('Y'); }
1113
1114       state = newState;
1115     }
1116
1117     delete tracebackPtr;
1118
1119     reverse (alignment->begin(), alignment->end());
1120
1121     return make_pair(alignment, bestProb);
1122   }
1123
1124   /////////////////////////////////////////////////////////////////
1125   // ProbabilisticModel::BuildPosterior()
1126   //
1127   // Builds a posterior probability matrix needed to align a pair
1128   // of alignments.  Mathematically, the returned matrix M is
1129   // defined as follows:
1130   //    M[i,j] =     sum          sum      f(s,t,i,j)
1131   //             s in align1  t in align2
1132   // where
1133   //                  [  P(s[i'] <--> t[j'])
1134   //                  [       if s[i'] is a letter in the ith column of align1 and
1135   //                  [          t[j'] it a letter in the jth column of align2
1136   //    f(s,t,i,j) =  [
1137   //                  [  0    otherwise
1138   //
1139   /////////////////////////////////////////////////////////////////
1140
1141   VF *BuildPosterior (MultiSequence *align1, MultiSequence *align2,
1142                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1143                       float cutoff = 0.0f) const {
1144     const int seq1Length = align1->GetSequence(0)->GetLength();
1145     const int seq2Length = align2->GetSequence(0)->GetLength();
1146
1147     VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1), 0); assert (posteriorPtr);
1148     VF &posterior = *posteriorPtr;
1149     VF::iterator postPtr = posterior.begin();
1150
1151     // for each s in align1
1152     for (int i = 0; i < align1->GetNumSequences(); i++){
1153       int first = align1->GetSequence(i)->GetLabel();
1154       SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
1155
1156       // for each t in align2
1157       for (int j = 0; j < align2->GetNumSequences(); j++){
1158         int second = align2->GetSequence(j)->GetLabel();
1159         SafeVector<int> *mapping2 = align2->GetSequence(j)->GetMapping();
1160         if (first < second){
1161
1162           // get the associated sparse matrix
1163           SparseMatrix *matrix = sparseMatrices[first][second];
1164
1165           for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){
1166             SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
1167             int base = (*mapping1)[ii] * (seq2Length+1);
1168             int rowSize = matrix->GetRowSize(ii);
1169             // add in all relevant values
1170             for (int jj = 0; jj < rowSize; jj++)
1171               posterior[base + (*mapping2)[row[jj].first]] += row[jj].second;
1172
1173             // subtract cutoff
1174             for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) {
1175               posterior[base + (*mapping2)[jj]] -= cutoff;
1176             }
1177
1178           }
1179
1180         } else {
1181           // get the associated sparse matrix
1182           SparseMatrix *matrix = sparseMatrices[second][first];
1183
1184           for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){
1185             SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
1186             int base = (*mapping2)[jj];
1187             int rowSize = matrix->GetRowSize(jj);
1188
1189             // add in all relevant values
1190             for (int ii = 0; ii < rowSize; ii++)
1191               posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second;
1192
1193             // subtract cutoff
1194             for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
1195               posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff;
1196           }
1197
1198         }
1199
1200
1201         delete mapping2;
1202       }
1203
1204       delete mapping1;
1205     }
1206
1207     return posteriorPtr;
1208   }
1209 };
1210 }
1211 #endif