binaries/src/MSAProbs-0.9.7/MSAProbs/ProbabilisticModel.h

   1 /////////////////////////////////////////////////////////////////
   2 // ProbabilisticModel.h
   3 //
   4 // Routines for (1) posterior probability computations
   5 //              (2) chained anchoring
   6 //              (3) maximum weight trace alignment
   7 /////////////////////////////////////////////////////////////////
   8
   9 #ifndef PROBABILISTICMODEL_H
  10 #define PROBABILISTICMODEL_H
  11
  12 #include <list>
  13 #include <cmath>
  14 #include <cstdio>
  15 #include "SafeVector.h"
  16 #include "ScoreType.h"
  17 #include "SparseMatrix.h"
  18 #include "MultiSequence.h"
  19
  20 using namespace std;
  21
  22 const int NumMatchStates = 1;            // note that in this version the number
  23 // of match states is fixed at 1...will
  24 // change in future versions
  25 const int NumInsertStates = 2;
  26 const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2;
  27
  28 /////////////////////////////////////////////////////////////////
  29 // ProbabilisticModel
  30 //
  31 // Class for storing the parameters of a probabilistic model and
  32 // performing different computations based on those parameters.
  33 // In particular, this class handles the computation of
  34 // posterior probabilities that may be used in alignment.
  35 /////////////////////////////////////////////////////////////////
  36
  37 class ProbabilisticModel {
  38
  39         float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state
  40         float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities
  41         float matchProb[256][256];        // emission probabilities for match states
  42         float insProb[256][NumMatrixTypes]; // emission probabilities for insert states
  43
  44 public:
  45
  46         /////////////////////////////////////////////////////////////////
  47         // ProbabilisticModel::ProbabilisticModel()
  48         //
  49         // Constructor.  Builds a new probabilistic model using the
  50         // given parameters.
  51         /////////////////////////////////////////////////////////////////
  52
  53         ProbabilisticModel(const VF &initDistribMat, const VF &gapOpen,
  54                         const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle) {
  55
  56                 // build transition matrix
  57                 VVF transMat(NumMatrixTypes, VF(NumMatrixTypes, 0.0f));
  58                 transMat[0][0] = 1;
  59                 for (int i = 0; i < NumInsertStates; i++) {
  60                         transMat[0][2 * i + 1] = gapOpen[2 * i];
  61                         transMat[0][2 * i + 2] = gapOpen[2 * i + 1];
  62                         transMat[0][0] -= (gapOpen[2 * i] + gapOpen[2 * i + 1]);
  63                         assert(transMat[0][0] > 0);
  64                         transMat[2 * i + 1][2 * i + 1] = gapExtend[2 * i];
  65                         transMat[2 * i + 2][2 * i + 2] = gapExtend[2 * i + 1];
  66                         transMat[2 * i + 1][2 * i + 2] = 0;
  67                         transMat[2 * i + 2][2 * i + 1] = 0;
  68                         transMat[2 * i + 1][0] = 1 - gapExtend[2 * i];
  69                         transMat[2 * i + 2][0] = 1 - gapExtend[2 * i + 1];
  70                 }
  71
  72                 // create initial and transition probability matrices
  73                 for (int i = 0; i < NumMatrixTypes; i++) {
  74                         initialDistribution[i] = LOG(initDistribMat[i]);
  75                         for (int j = 0; j < NumMatrixTypes; j++)
  76                                 transProb[i][j] = LOG(transMat[i][j]);
  77                 }
  78
  79                 // create insertion and match probability matrices
  80                 for (int i = 0; i < 256; i++) {
  81                         for (int j = 0; j < NumMatrixTypes; j++)
  82                                 insProb[i][j] = LOG(emitSingle[i]);
  83                         for (int j = 0; j < 256; j++)
  84                                 matchProb[i][j] = LOG(emitPairs[i][j]);
  85                 }
  86         }
  87
  88         /////////////////////////////////////////////////////////////////
  89         // ProbabilisticModel::ComputeForwardMatrix()
  90         //
  91         // Computes a set of forward probability matrices for aligning
  92         // seq1 and seq2.
  93         //
  94         // For efficiency reasons, a single-dimensional floating-point
  95         // array is used here, with the following indexing scheme:
  96         //
  97         //    forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
  98         //    refers to the probability of aligning through j characters
  99         //    of the first sequence, k characters of the second sequence,
 100         //    and ending in state i.
 101         /////////////////////////////////////////////////////////////////
 102
 103         VF *ComputeForwardMatrix(Sequence *seq1, Sequence *seq2) const {
 104
 105                 assert(seq1);
 106                 assert(seq2);
 107
 108                 const int seq1Length = seq1->GetLength();
 109                 const int seq2Length = seq2->GetLength();
 110
 111                 // retrieve the points to the beginning of each sequence
 112                 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
 113                 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
 114
 115                 // create matrix
 116                 VF *forwardPtr = new VF(
 117                                 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
 118                 assert(forwardPtr);
 119                 VF &forward = *forwardPtr;
 120
 121                 // initialization condition
 122                 forward[0 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)] =
 123                                 initialDistribution[0]
 124                                                 + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
 125
 126                 for (int k = 0; k < NumInsertStates; k++) {
 127                         forward[2 * k + 1 + NumMatrixTypes * (1 * (seq2Length + 1) + 0)] =
 128                                         initialDistribution[2 * k + 1]
 129                                                         + insProb[(unsigned char) iter1[1]][k];
 130                         forward[2 * k + 2 + NumMatrixTypes * (0 * (seq2Length + 1) + 1)] =
 131                                         initialDistribution[2 * k + 2]
 132                                                         + insProb[(unsigned char) iter2[1]][k];
 133                 }
 134
 135                 // remember offset for each index combination
 136                 int ij = 0;
 137                 int i1j = -seq2Length - 1;
 138                 int ij1 = -1;
 139                 int i1j1 = -seq2Length - 2;
 140
 141                 ij *= NumMatrixTypes;
 142                 i1j *= NumMatrixTypes;
 143                 ij1 *= NumMatrixTypes;
 144                 i1j1 *= NumMatrixTypes;
 145
 146                 // compute forward scores
 147                 for (int i = 0; i <= seq1Length; i++) {
 148                         unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
 149                         for (int j = 0; j <= seq2Length; j++) {
 150                                 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
 151
 152                                 if (i > 1 || j > 1) {
 153                                         if (i > 0 && j > 0) {
 154                                                 forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
 155                                                 for (int k = 1; k < NumMatrixTypes; k++)
 156                                                         LOG_PLUS_EQUALS(forward[0 + ij],
 157                                                                         forward[k + i1j1] + transProb[k][0]);
 158                                                 forward[0 + ij] += matchProb[c1][c2];
 159                                         }
 160                                         if (i > 0) {
 161                                                 for (int k = 0; k < NumInsertStates; k++)
 162                                                         forward[2 * k + 1 + ij] = insProb[c1][k]
 163                                                                         + LOG_ADD(
 164                                                                                         forward[0 + i1j]
 165                                                                                                         + transProb[0][2 * k + 1],
 166                                                                                         forward[2 * k + 1 + i1j]
 167                                                                                                         + transProb[2 * k + 1][2 * k
 168                                                                                                                         + 1]);
 169                                         }
 170                                         if (j > 0) {
 171                                                 for (int k = 0; k < NumInsertStates; k++)
 172                                                         forward[2 * k + 2 + ij] = insProb[c2][k]
 173                                                                         + LOG_ADD(
 174                                                                                         forward[0 + ij1]
 175                                                                                                         + transProb[0][2 * k + 2],
 176                                                                                         forward[2 * k + 2 + ij1]
 177                                                                                                         + transProb[2 * k + 2][2 * k
 178                                                                                                                         + 2]);
 179                                         }
 180                                 }
 181
 182                                 ij += NumMatrixTypes;
 183                                 i1j += NumMatrixTypes;
 184                                 ij1 += NumMatrixTypes;
 185                                 i1j1 += NumMatrixTypes;
 186                         }
 187                 }
 188
 189                 return forwardPtr;
 190         }
 191
 192         /////////////////////////////////////////////////////////////////
 193         // ProbabilisticModel::ComputeBackwardMatrix()
 194         //
 195         // Computes a set of backward probability matrices for aligning
 196         // seq1 and seq2.
 197         //
 198         // For efficiency reasons, a single-dimensional floating-point
 199         // array is used here, with the following indexing scheme:
 200         //
 201         //    backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
 202         //    refers to the probability of starting in state i and
 203         //    aligning from character j+1 to the end of the first
 204         //    sequence and from character k+1 to the end of the second
 205         //    sequence.
 206         /////////////////////////////////////////////////////////////////
 207
 208         VF *ComputeBackwardMatrix(Sequence *seq1, Sequence *seq2) const {
 209
 210                 assert(seq1);
 211                 assert(seq2);
 212
 213                 const int seq1Length = seq1->GetLength();
 214                 const int seq2Length = seq2->GetLength();
 215                 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
 216                 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
 217
 218                 // create matrix
 219                 VF *backwardPtr = new VF(
 220                                 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
 221                 assert(backwardPtr);
 222                 VF &backward = *backwardPtr;
 223
 224                 // initialization condition
 225                 for (int k = 0; k < NumMatrixTypes; k++)
 226                         backward[NumMatrixTypes * ((seq1Length + 1) * (seq2Length + 1) - 1)
 227                                         + k] = initialDistribution[k];
 228
 229                 // remember offset for each index combination
 230                 int ij = (seq1Length + 1) * (seq2Length + 1) - 1;
 231                 int i1j = ij + seq2Length + 1;
 232                 int ij1 = ij + 1;
 233                 int i1j1 = ij + seq2Length + 2;
 234
 235                 ij *= NumMatrixTypes;
 236                 i1j *= NumMatrixTypes;
 237                 ij1 *= NumMatrixTypes;
 238                 i1j1 *= NumMatrixTypes;
 239
 240                 // compute backward scores
 241                 for (int i = seq1Length; i >= 0; i--) {
 242                         unsigned char c1 =
 243                                         (i == seq1Length) ? '~' : (unsigned char) iter1[i + 1];
 244                         for (int j = seq2Length; j >= 0; j--) {
 245                                 unsigned char c2 =
 246                                                 (j == seq2Length) ? '~' : (unsigned char) iter2[j + 1];
 247
 248                                 if (i < seq1Length && j < seq2Length) {
 249                                         const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
 250                                         for (int k = 0; k < NumMatrixTypes; k++)
 251                                                 LOG_PLUS_EQUALS(backward[k + ij],
 252                                                                 ProbXY + transProb[k][0]);
 253                                 }
 254                                 if (i < seq1Length) {
 255                                         for (int k = 0; k < NumInsertStates; k++) {
 256                                                 LOG_PLUS_EQUALS(backward[0 + ij],
 257                                                                 backward[2 * k + 1 + i1j] + insProb[c1][k]
 258                                                                                 + transProb[0][2 * k + 1]);
 259                                                 LOG_PLUS_EQUALS(backward[2 * k + 1 + ij],
 260                                                                 backward[2 * k + 1 + i1j] + insProb[c1][k]
 261                                                                                 + transProb[2 * k + 1][2 * k + 1]);
 262                                         }
 263                                 }
 264                                 if (j < seq2Length) {
 265                                         for (int k = 0; k < NumInsertStates; k++) {
 266                                                 LOG_PLUS_EQUALS(backward[0 + ij],
 267                                                                 backward[2 * k + 2 + ij1] + insProb[c2][k]
 268                                                                                 + transProb[0][2 * k + 2]);
 269                                                 LOG_PLUS_EQUALS(backward[2 * k + 2 + ij],
 270                                                                 backward[2 * k + 2 + ij1] + insProb[c2][k]
 271                                                                                 + transProb[2 * k + 2][2 * k + 2]);
 272                                         }
 273                                 }
 274
 275                                 ij -= NumMatrixTypes;
 276                                 i1j -= NumMatrixTypes;
 277                                 ij1 -= NumMatrixTypes;
 278                                 i1j1 -= NumMatrixTypes;
 279                         }
 280                 }
 281
 282                 return backwardPtr;
 283         }
 284
 285         /////////////////////////////////////////////////////////////////
 286         // ProbabilisticModel::ComputeTotalProbability()
 287         //
 288         // Computes the total probability of an alignment given
 289         // the forward and backward matrices.
 290         /////////////////////////////////////////////////////////////////
 291
 292         float ComputeTotalProbability(int seq1Length, int seq2Length,
 293                         const VF &forward, const VF &backward) const {
 294
 295                 // compute total probability
 296                 float totalForwardProb = LOG_ZERO;
 297                 float totalBackwardProb = LOG_ZERO;
 298                 for (int k = 0; k < NumMatrixTypes; k++) {
 299                         LOG_PLUS_EQUALS(totalForwardProb,
 300                                         forward[k
 301                                                         + NumMatrixTypes
 302                                                                         * ((seq1Length + 1) * (seq2Length + 1) - 1)]
 303                                                         + backward[k
 304                                                                         + NumMatrixTypes
 305                                                                                         * ((seq1Length + 1)
 306                                                                                                         * (seq2Length + 1) - 1)]);
 307                 }
 308
 309                 totalBackwardProb = forward[0
 310                                 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)]
 311                                 + backward[0 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)];
 312
 313                 for (int k = 0; k < NumInsertStates; k++) {
 314                         LOG_PLUS_EQUALS(totalBackwardProb,
 315                                         forward[2 * k + 1
 316                                                         + NumMatrixTypes * (1 * (seq2Length + 1) + 0)]
 317                                                         + backward[2 * k + 1
 318                                                                         + NumMatrixTypes
 319                                                                                         * (1 * (seq2Length + 1) + 0)]);
 320                         LOG_PLUS_EQUALS(totalBackwardProb,
 321                                         forward[2 * k + 2
 322                                                         + NumMatrixTypes * (0 * (seq2Length + 1) + 1)]
 323                                                         + backward[2 * k + 2
 324                                                                         + NumMatrixTypes
 325                                                                                         * (0 * (seq2Length + 1) + 1)]);
 326                 }
 327
 328                 //    cerr << totalForwardProb << " " << totalBackwardProb << endl;
 329
 330                 return (totalForwardProb + totalBackwardProb) / 2;
 331         }
 332
 333         /////////////////////////////////////////////////////////////////
 334         // ProbabilisticModel::ComputePosteriorMatrix()
 335         //
 336         // Computes the posterior probability matrix based on
 337         // the forward and backward matrices.
 338         /////////////////////////////////////////////////////////////////
 339
 340         VF *ComputePosteriorMatrix(Sequence *seq1, Sequence *seq2,
 341                         const VF &forward, const VF &backward) const {
 342
 343                 assert(seq1);
 344                 assert(seq2);
 345
 346                 const int seq1Length = seq1->GetLength();
 347                 const int seq2Length = seq2->GetLength();
 348
 349                 float totalProb = ComputeTotalProbability(seq1Length, seq2Length,
 350                                 forward, backward);
 351
 352                 // compute posterior matrices
 353                 VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1));
 354                 assert(posteriorPtr);
 355                 VF &posterior = *posteriorPtr;
 356
 357                 int ij = 0;
 358                 if (totalProb == 0) {
 359                         totalProb = 1.0f;
 360                 }
 361                 VF::iterator ptr = posterior.begin();
 362
 363                 for (int i = 0; i <= seq1Length; i++) {
 364                         for (int j = 0; j <= seq2Length; j++) {
 365                                 *(ptr++) = EXP(
 366                                                 min(LOG_ONE, forward[ij] + backward[ij] - totalProb));
 367                                 ij += NumMatrixTypes;
 368                         }
 369                 }
 370
 371                 posterior[0] = 0;
 372
 373                 return posteriorPtr;
 374         }
 375
 376         /*
 377          /////////////////////////////////////////////////////////////////
 378          // ProbabilisticModel::ComputeExpectedCounts()
 379          //
 380          // Computes the expected counts for the various transitions.
 381          /////////////////////////////////////////////////////////////////
 382
 383          VVF *ComputeExpectedCounts () const {
 384
 385          assert (seq1);
 386          assert (seq2);
 387
 388          const int seq1Length = seq1->GetLength();
 389          const int seq2Length = seq2->GetLength();
 390          SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
 391          SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
 392
 393          // compute total probability
 394          float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
 395          forward, backward);
 396
 397          // initialize expected counts
 398          VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr);
 399          VVF &counts = *countsPtr;
 400
 401          // remember offset for each index combination
 402          int ij = 0;
 403          int i1j = -seq2Length - 1;
 404          int ij1 = -1;
 405          int i1j1 = -seq2Length - 2;
 406
 407          ij *= NumMatrixTypes;
 408          i1j *= NumMatrixTypes;
 409          ij1 *= NumMatrixTypes;
 410          i1j1 *= NumMatrixTypes;
 411
 412          // compute expected counts
 413          for (int i = 0; i <= seq1Length; i++){
 414          unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
 415          for (int j = 0; j <= seq2Length; j++){
 416          unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
 417
 418          if (i > 0 && j > 0){
 419          for (int k = 0; k < NumMatrixTypes; k++)
 420          LOG_PLUS_EQUALS (counts[k][0],
 421          forward[k + i1j1] + transProb[k][0] +
 422          matchProb[c1][c2] + backward[0 + ij]);
 423          }
 424          if (i > 0){
 425          for (int k = 0; k < NumInsertStates; k++){
 426          LOG_PLUS_EQUALS (counts[0][2*k+1],
 427          forward[0 + i1j] + transProb[0][2*k+1] +
 428          insProb[c1][k] + backward[2*k+1 + ij]);
 429          LOG_PLUS_EQUALS (counts[2*k+1][2*k+1],
 430          forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
 431          insProb[c1][k] + backward[2*k+1 + ij]);
 432          }
 433          }
 434          if (j > 0){
 435          for (int k = 0; k < NumInsertStates; k++){
 436          LOG_PLUS_EQUALS (counts[0][2*k+2],
 437          forward[0 + ij1] + transProb[0][2*k+2] +
 438          insProb[c2][k] + backward[2*k+2 + ij]);
 439          LOG_PLUS_EQUALS (counts[2*k+2][2*k+2],
 440          forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
 441          insProb[c2][k] + backward[2*k+2 + ij]);
 442          }
 443          }
 444
 445          ij += NumMatrixTypes;
 446          i1j += NumMatrixTypes;
 447          ij1 += NumMatrixTypes;
 448          i1j1 += NumMatrixTypes;
 449          }
 450          }
 451
 452          // scale all expected counts appropriately
 453          for (int i = 0; i < NumMatrixTypes; i++)
 454          for (int j = 0; j < NumMatrixTypes; j++)
 455          counts[i][j] -= totalProb;
 456
 457          }
 458          */
 459
 460         /////////////////////////////////////////////////////////////////
 461         // ProbabilisticModel::ComputeNewParameters()
 462         //
 463         // Computes a new parameter set based on the expected counts
 464         // given.
 465         /////////////////////////////////////////////////////////////////
 466         void ComputeNewParameters(Sequence *seq1, Sequence *seq2, const VF &forward,
 467                         const VF &backward, VF &initDistribMat, VF &gapOpen, VF &gapExtend,
 468                         VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
 469
 470                 assert(seq1);
 471                 assert(seq2);
 472
 473                 const int seq1Length = seq1->GetLength();
 474                 const int seq2Length = seq2->GetLength();
 475                 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
 476                 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
 477
 478                 // compute total probability
 479                 float totalProb = ComputeTotalProbability(seq1Length, seq2Length,
 480                                 forward, backward);
 481
 482                 // initialize expected counts
 483                 VVF transCounts(NumMatrixTypes, VF(NumMatrixTypes, LOG_ZERO));
 484                 VF initCounts(NumMatrixTypes, LOG_ZERO);
 485                 VVF pairCounts(256, VF(256, LOG_ZERO));
 486                 VF singleCounts(256, LOG_ZERO);
 487
 488                 // remember offset for each index combination
 489                 int ij = 0;
 490                 int i1j = -seq2Length - 1;
 491                 int ij1 = -1;
 492                 int i1j1 = -seq2Length - 2;
 493
 494                 ij *= NumMatrixTypes;
 495                 i1j *= NumMatrixTypes;
 496                 ij1 *= NumMatrixTypes;
 497                 i1j1 *= NumMatrixTypes;
 498
 499                 // compute initial distribution posteriors
 500                 initCounts[0] = LOG_ADD(
 501                                 forward[0 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)]
 502                                                 + backward[0
 503                                                                 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)],
 504                                 forward[0
 505                                                 + NumMatrixTypes
 506                                                                 * ((seq1Length + 1) * (seq2Length + 1) - 1)]
 507                                                 + backward[0
 508                                                                 + NumMatrixTypes
 509                                                                                 * ((seq1Length + 1) * (seq2Length + 1)
 510                                                                                                 - 1)]);
 511                 for (int k = 0; k < NumInsertStates; k++) {
 512                         initCounts[2 * k + 1] = LOG_ADD(
 513                                         forward[2 * k + 1
 514                                                         + NumMatrixTypes * (1 * (seq2Length + 1) + 0)]
 515                                                         + backward[2 * k + 1
 516                                                                         + NumMatrixTypes
 517                                                                                         * (1 * (seq2Length + 1) + 0)],
 518                                         forward[2 * k + 1
 519                                                         + NumMatrixTypes
 520                                                                         * ((seq1Length + 1) * (seq2Length + 1) - 1)]
 521                                                         + backward[2 * k + 1
 522                                                                         + NumMatrixTypes
 523                                                                                         * ((seq1Length + 1)
 524                                                                                                         * (seq2Length + 1) - 1)]);
 525                         initCounts[2 * k + 2] = LOG_ADD(
 526                                         forward[2 * k + 2
 527                                                         + NumMatrixTypes * (0 * (seq2Length + 1) + 1)]
 528                                                         + backward[2 * k + 2
 529                                                                         + NumMatrixTypes
 530                                                                                         * (0 * (seq2Length + 1) + 1)],
 531                                         forward[2 * k + 2
 532                                                         + NumMatrixTypes
 533                                                                         * ((seq1Length + 1) * (seq2Length + 1) - 1)]
 534                                                         + backward[2 * k + 2
 535                                                                         + NumMatrixTypes
 536                                                                                         * ((seq1Length + 1)
 537                                                                                                         * (seq2Length + 1) - 1)]);
 538                 }
 539
 540                 // compute expected counts
 541                 for (int i = 0; i <= seq1Length; i++) {
 542                         unsigned char c1 =
 543                                         (i == 0) ? '~' : (unsigned char) toupper(iter1[i]);
 544                         for (int j = 0; j <= seq2Length; j++) {
 545                                 unsigned char c2 =
 546                                                 (j == 0) ? '~' : (unsigned char) toupper(iter2[j]);
 547
 548                                 if (i > 0 && j > 0) {
 549                                         if (enableTrainEmissions && i == 1 && j == 1) {
 550                                                 LOG_PLUS_EQUALS(pairCounts[c1][c2],
 551                                                                 initialDistribution[0] + matchProb[c1][c2]
 552                                                                                 + backward[0 + ij]);
 553                                                 LOG_PLUS_EQUALS(pairCounts[c2][c1],
 554                                                                 initialDistribution[0] + matchProb[c2][c1]
 555                                                                                 + backward[0 + ij]);
 556                                         }
 557
 558                                         for (int k = 0; k < NumMatrixTypes; k++) {
 559                                                 LOG_PLUS_EQUALS(transCounts[k][0],
 560                                                                 forward[k + i1j1] + transProb[k][0]
 561                                                                                 + matchProb[c1][c2] + backward[0 + ij]);
 562                                                 if (enableTrainEmissions && (i != 1 || j != 1)) {//adding parentheses by Liu Yongchao, 5 Mar, 2010
 563                                                         LOG_PLUS_EQUALS(pairCounts[c1][c2],
 564                                                                         forward[k + i1j1] + transProb[k][0]
 565                                                                                         + matchProb[c1][c2]
 566                                                                                         + backward[0 + ij]);
 567                                                         LOG_PLUS_EQUALS(pairCounts[c2][c1],
 568                                                                         forward[k + i1j1] + transProb[k][0]
 569                                                                                         + matchProb[c2][c1]
 570                                                                                         + backward[0 + ij]);
 571                                                 }
 572                                         }
 573                                 }
 574                                 if (i > 0) {
 575                                         for (int k = 0; k < NumInsertStates; k++) {
 576                                                 LOG_PLUS_EQUALS(transCounts[0][2 * k + 1],
 577                                                                 forward[0 + i1j] + transProb[0][2 * k + 1]
 578                                                                                 + insProb[c1][k]
 579                                                                                 + backward[2 * k + 1 + ij]);
 580                                                 LOG_PLUS_EQUALS(transCounts[2 * k + 1][2 * k + 1],
 581                                                                 forward[2 * k + 1 + i1j]
 582                                                                                 + transProb[2 * k + 1][2 * k + 1]
 583                                                                                 + insProb[c1][k]
 584                                                                                 + backward[2 * k + 1 + ij]);
 585                                                 if (enableTrainEmissions) {
 586                                                         if (i == 1 && j == 0) {
 587                                                                 LOG_PLUS_EQUALS(singleCounts[c1],
 588                                                                                 initialDistribution[2 * k + 1]
 589                                                                                                 + insProb[c1][k]
 590                                                                                                 + backward[2 * k + 1 + ij]);
 591                                                         } else {
 592                                                                 LOG_PLUS_EQUALS(singleCounts[c1],
 593                                                                                 forward[0 + i1j]
 594                                                                                                 + transProb[0][2 * k + 1]
 595                                                                                                 + insProb[c1][k]
 596                                                                                                 + backward[2 * k + 1 + ij]);
 597                                                                 LOG_PLUS_EQUALS(singleCounts[c1],
 598                                                                                 forward[2 * k + 1 + i1j]
 599                                                                                                 + transProb[2 * k + 1][2 * k + 1]
 600                                                                                                 + insProb[c1][k]
 601                                                                                                 + backward[2 * k + 1 + ij]);
 602                                                         }
 603                                                 }
 604                                         }
 605                                 }
 606                                 if (j > 0) {
 607                                         for (int k = 0; k < NumInsertStates; k++) {
 608                                                 LOG_PLUS_EQUALS(transCounts[0][2 * k + 2],
 609                                                                 forward[0 + ij1] + transProb[0][2 * k + 2]
 610                                                                                 + insProb[c2][k]
 611                                                                                 + backward[2 * k + 2 + ij]);
 612                                                 LOG_PLUS_EQUALS(transCounts[2 * k + 2][2 * k + 2],
 613                                                                 forward[2 * k + 2 + ij1]
 614                                                                                 + transProb[2 * k + 2][2 * k + 2]
 615                                                                                 + insProb[c2][k]
 616                                                                                 + backward[2 * k + 2 + ij]);
 617                                                 if (enableTrainEmissions) {
 618                                                         if (i == 0 && j == 1) {
 619                                                                 LOG_PLUS_EQUALS(singleCounts[c2],
 620                                                                                 initialDistribution[2 * k + 2]
 621                                                                                                 + insProb[c2][k]
 622                                                                                                 + backward[2 * k + 2 + ij]);
 623                                                         } else {
 624                                                                 LOG_PLUS_EQUALS(singleCounts[c2],
 625                                                                                 forward[0 + ij1]
 626                                                                                                 + transProb[0][2 * k + 2]
 627                                                                                                 + insProb[c2][k]
 628                                                                                                 + backward[2 * k + 2 + ij]);
 629                                                                 LOG_PLUS_EQUALS(singleCounts[c2],
 630                                                                                 forward[2 * k + 2 + ij1]
 631                                                                                                 + transProb[2 * k + 2][2 * k + 2]
 632                                                                                                 + insProb[c2][k]
 633                                                                                                 + backward[2 * k + 2 + ij]);
 634                                                         }
 635                                                 }
 636                                         }
 637                                 }
 638
 639                                 ij += NumMatrixTypes;
 640                                 i1j += NumMatrixTypes;
 641                                 ij1 += NumMatrixTypes;
 642                                 i1j1 += NumMatrixTypes;
 643                         }
 644                 }
 645
 646                 // scale all expected counts appropriately
 647                 for (int i = 0; i < NumMatrixTypes; i++) {
 648                         initCounts[i] -= totalProb;
 649                         for (int j = 0; j < NumMatrixTypes; j++)
 650                                 transCounts[i][j] -= totalProb;
 651                 }
 652                 if (enableTrainEmissions) {
 653                         for (int i = 0; i < 256; i++) {
 654                                 for (int j = 0; j < 256; j++)
 655                                         pairCounts[i][j] -= totalProb;
 656                                 singleCounts[i] -= totalProb;
 657                         }
 658                 }
 659
 660                 // compute new initial distribution
 661                 float totalInitDistribCounts = 0;
 662                 for (int i = 0; i < NumMatrixTypes; i++)
 663                         totalInitDistribCounts += exp(initCounts[i]); // should be 2
 664                 initDistribMat[0] = min(1.0f,
 665                                 max(0.0f, (float) exp(initCounts[0]) / totalInitDistribCounts));
 666                 for (int k = 0; k < NumInsertStates; k++) {
 667                         float val =
 668                                         (exp(initCounts[2 * k + 1]) + exp(initCounts[2 * k + 2]))
 669                                                         / 2;
 670                         initDistribMat[2 * k + 1] = initDistribMat[2 * k + 2] = min(1.0f,
 671                                         max(0.0f, val / totalInitDistribCounts));
 672                 }
 673
 674                 // compute total counts for match state
 675                 float inMatchStateCounts = 0;
 676                 for (int i = 0; i < NumMatrixTypes; i++)
 677                         inMatchStateCounts += exp(transCounts[0][i]);
 678                 for (int i = 0; i < NumInsertStates; i++) {
 679
 680                         // compute total counts for gap state
 681                         float inGapStateCounts = exp(transCounts[2 * i + 1][0])
 682                                         + exp(transCounts[2 * i + 1][2 * i + 1])
 683                                         + exp(transCounts[2 * i + 2][0])
 684                                         + exp(transCounts[2 * i + 2][2 * i + 2]);
 685
 686                         gapOpen[2 * i] = gapOpen[2 * i + 1] = (exp(
 687                                         transCounts[0][2 * i + 1]) + exp(transCounts[0][2 * i + 2]))
 688                                         / (2 * inMatchStateCounts);
 689
 690                         gapExtend[2 * i] = gapExtend[2 * i + 1] = (exp(
 691                                         transCounts[2 * i + 1][2 * i + 1])
 692                                         + exp(transCounts[2 * i + 2][2 * i + 2]))
 693                                         / inGapStateCounts;
 694                 }
 695
 696                 if (enableTrainEmissions) {
 697                         float totalPairCounts = 0;
 698                         float totalSingleCounts = 0;
 699                         for (int i = 0; i < 256; i++) {
 700                                 for (int j = 0; j <= i; j++)
 701                                         totalPairCounts += exp(pairCounts[j][i]);
 702                                 totalSingleCounts += exp(singleCounts[i]);
 703                         }
 704
 705                         for (int i = 0; i < 256; i++)
 706                                 if (!islower((char) i)) {
 707                                         int li = (int) ((unsigned char) tolower((char) i));
 708                                         for (int j = 0; j <= i; j++)
 709                                                 if (!islower((char) j)) {
 710                                                         int lj = (int) ((unsigned char) tolower((char) j));
 711                                                         emitPairs[i][j] =
 712                                                                         emitPairs[i][lj] =
 713                                                                                         emitPairs[li][j] =
 714                                                                                                         emitPairs[li][lj] =
 715                                                                                                                         emitPairs[j][i] =
 716                                                                                                                                         emitPairs[j][li] =
 717                                                                                                                                                         emitPairs[lj][i] =
 718                                                                                                                                                                         emitPairs[lj][li] =
 719                                                                                                                                                                                         exp(
 720                                                                                                                                                                                                         pairCounts[j][i])
 721                                                                                                                                                                                                         / totalPairCounts;
 722                                                 }
 723                                         emitSingle[i] = emitSingle[li] = exp(singleCounts[i])
 724                                                         / totalSingleCounts;
 725                                 }
 726                 }
 727         }
 728
 729         /////////////////////////////////////////////////////////////////
 730         // ProbabilisticModel::ComputeAlignment()
 731         //
 732         // Computes an alignment based on the given posterior matrix.
 733         // This is done by finding the maximum summing path (or
 734         // maximum weight trace) through the posterior matrix.  The
 735         // final alignment is returned as a pair consisting of:
 736         //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
 737         //        denote insertions in one of the two sequences and
 738         //        B's denote that both sequences are present (i.e.
 739         //        matches).
 740         //    (2) a float indicating the sum achieved
 741         /////////////////////////////////////////////////////////////////
 742
 743         pair<SafeVector<char> *, float> ComputeAlignment(int seq1Length,
 744                         int seq2Length, const VF &posterior) const {
 745
 746                 float *twoRows = new float[(seq2Length + 1) * 2];
 747                 assert(twoRows);
 748                 float *oldRow = twoRows;
 749                 float *newRow = twoRows + seq2Length + 1;
 750
 751                 char *tracebackMatrix = new char[(seq1Length + 1) * (seq2Length + 1)];
 752                 assert(tracebackMatrix);
 753                 char *tracebackPtr = tracebackMatrix;
 754
 755                 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
 756
 757                 // initialization
 758                 for (int i = 0; i <= seq2Length; i++) {
 759                         oldRow[i] = 0;
 760                         *(tracebackPtr++) = 'L';
 761                 }
 762
 763                 // fill in matrix
 764                 for (int i = 1; i <= seq1Length; i++) {
 765
 766                         // initialize left column
 767                         newRow[0] = 0;
 768                         posteriorPtr++;
 769                         *(tracebackPtr++) = 'U';
 770
 771                         // fill in rest of row
 772                         for (int j = 1; j <= seq2Length; j++) {
 773                                 ChooseBestOfThree(*(posteriorPtr++) + oldRow[j - 1],
 774                                                 newRow[j - 1], oldRow[j], 'D', 'L', 'U', &newRow[j],
 775                                                 tracebackPtr++);
 776                         }
 777
 778                         // swap rows
 779                         float *temp = oldRow;
 780                         oldRow = newRow;
 781                         newRow = temp;
 782                 }
 783
 784                 // store best score
 785                 float total = oldRow[seq2Length];
 786                 delete[] twoRows;
 787
 788                 // compute traceback
 789                 SafeVector<char> *alignment = new SafeVector<char>;
 790                 assert(alignment);
 791                 int r = seq1Length, c = seq2Length;
 792                 while (r != 0 || c != 0) {
 793                         char ch = tracebackMatrix[r * (seq2Length + 1) + c];
 794                         switch (ch) {
 795                         case 'L':
 796                                 c--;
 797                                 alignment->push_back('Y');
 798                                 break;
 799                         case 'U':
 800                                 r--;
 801                                 alignment->push_back('X');
 802                                 break;
 803                         case 'D':
 804                                 c--;
 805                                 r--;
 806                                 alignment->push_back('B');
 807                                 break;
 808                         default:
 809                                 assert(false);
 810                         }
 811                 }
 812
 813                 delete[] tracebackMatrix;
 814
 815                 reverse(alignment->begin(), alignment->end());
 816
 817                 return make_pair(alignment, total);
 818         }
 819
 820         /////////////////////////////////////////////////////////////////
 821         // ProbabilisticModel::ComputeAlignmentWithGapPenalties()
 822         //
 823         // Similar to ComputeAlignment() except with gap penalties.
 824         /////////////////////////////////////////////////////////////////
 825
 826         pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties(
 827                         MultiSequence *align1, MultiSequence *align2, const VF &posterior,
 828                         int numSeqs1, int numSeqs2, float gapOpenPenalty,
 829                         float gapContinuePenalty) const {
 830                 int seq1Length = align1->GetSequence(0)->GetLength();
 831                 int seq2Length = align2->GetSequence(0)->GetLength();
 832                 SafeVector<SafeVector<char>::iterator> dataPtrs1(
 833                                 align1->GetNumSequences());
 834                 SafeVector<SafeVector<char>::iterator> dataPtrs2(
 835                                 align2->GetNumSequences());
 836
 837                 // grab character data
 838                 for (int i = 0; i < align1->GetNumSequences(); i++)
 839                         dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr();
 840                 for (int i = 0; i < align2->GetNumSequences(); i++)
 841                         dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr();
 842
 843                 // the number of active sequences at any given column is defined to be the
 844                 // number of non-gap characters in that column; the number of gap opens at
 845                 // any given column is defined to be the number of gap characters in that
 846                 // column where the previous character in the respective sequence was not
 847                 // a gap
 848                 SafeVector<int> numActive1(seq1Length + 1), numGapOpens1(
 849                                 seq1Length + 1);
 850                 SafeVector<int> numActive2(seq2Length + 1), numGapOpens2(
 851                                 seq2Length + 1);
 852
 853                 // compute number of active sequences and gap opens for each group
 854                 for (int i = 0; i < align1->GetNumSequences(); i++) {
 855                         SafeVector<char>::iterator dataPtr =
 856                                         align1->GetSequence(i)->GetDataPtr();
 857                         numActive1[0] = numGapOpens1[0] = 0;
 858                         for (int j = 1; j <= seq1Length; j++) {
 859                                 if (dataPtr[j] != '-') {
 860                                         numActive1[j]++;
 861                                         numGapOpens1[j] += (j != 1 && dataPtr[j - 1] != '-');
 862                                 }
 863                         }
 864                 }
 865                 for (int i = 0; i < align2->GetNumSequences(); i++) {
 866                         SafeVector<char>::iterator dataPtr =
 867                                         align2->GetSequence(i)->GetDataPtr();
 868                         numActive2[0] = numGapOpens2[0] = 0;
 869                         for (int j = 1; j <= seq2Length; j++) {
 870                                 if (dataPtr[j] != '-') {
 871                                         numActive2[j]++;
 872                                         numGapOpens2[j] += (j != 1 && dataPtr[j - 1] != '-');
 873                                 }
 874                         }
 875                 }
 876
 877                 VVF openingPenalty1(numSeqs1 + 1, VF(numSeqs2 + 1));
 878                 VF continuingPenalty1(numSeqs1 + 1);
 879                 VVF openingPenalty2(numSeqs1 + 1, VF(numSeqs2 + 1));
 880                 VF continuingPenalty2(numSeqs2 + 1);
 881
 882                 // precompute penalties
 883                 for (int i = 0; i <= numSeqs1; i++)
 884                         for (int j = 0; j <= numSeqs2; j++)
 885                                 openingPenalty1[i][j] = i
 886                                                 * (gapOpenPenalty * j
 887                                                                 + gapContinuePenalty * (numSeqs2 - j));
 888                 for (int i = 0; i <= numSeqs1; i++)
 889                         continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
 890                 for (int i = 0; i <= numSeqs2; i++)
 891                         for (int j = 0; j <= numSeqs1; j++)
 892                                 openingPenalty2[i][j] = i
 893                                                 * (gapOpenPenalty * j
 894                                                                 + gapContinuePenalty * (numSeqs1 - j));
 895                 for (int i = 0; i <= numSeqs2; i++)
 896                         continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
 897
 898                 float *twoRows = new float[6 * (seq2Length + 1)];
 899                 assert(twoRows);
 900                 float *oldRowMatch = twoRows;
 901                 float *newRowMatch = twoRows + (seq2Length + 1);
 902                 float *oldRowInsertX = twoRows + 2 * (seq2Length + 1);
 903                 float *newRowInsertX = twoRows + 3 * (seq2Length + 1);
 904                 float *oldRowInsertY = twoRows + 4 * (seq2Length + 1);
 905                 float *newRowInsertY = twoRows + 5 * (seq2Length + 1);
 906
 907                 char *tracebackMatrix =
 908                                 new char[3 * (seq1Length + 1) * (seq2Length + 1)];
 909                 assert(tracebackMatrix);
 910                 char *tracebackPtr = tracebackMatrix;
 911
 912                 VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
 913
 914                 // initialization
 915                 for (int i = 0; i <= seq2Length; i++) {
 916                         oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
 917                         oldRowInsertY[i] =
 918                                         (i == 0) ?
 919                                                         0 :
 920                                                         oldRowInsertY[i - 1]
 921                                                                         + continuingPenalty2[numActive2[i]];
 922                         *(tracebackPtr) = *(tracebackPtr + 1) = *(tracebackPtr + 2) = 'Y';
 923                         tracebackPtr += 3;
 924                 }
 925
 926                 // fill in matrix
 927                 for (int i = 1; i <= seq1Length; i++) {
 928
 929                         // initialize left column
 930                         newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
 931                         newRowInsertX[0] = oldRowInsertX[0]
 932                                         + continuingPenalty1[numActive1[i]];
 933                         posteriorPtr++;
 934                         *(tracebackPtr) = *(tracebackPtr + 1) = *(tracebackPtr + 2) = 'X';
 935                         tracebackPtr += 3;
 936
 937                         // fill in rest of row
 938                         for (int j = 1; j <= seq2Length; j++) {
 939
 940                                 // going to MATCH state
 941                                 ChooseBestOfThree(oldRowMatch[j - 1], oldRowInsertX[j - 1],
 942                                                 oldRowInsertY[j - 1], 'M', 'X', 'Y', &newRowMatch[j],
 943                                                 tracebackPtr++);
 944                                 newRowMatch[j] += *(posteriorPtr++);
 945
 946                                 // going to INSERT X state
 947                                 ChooseBestOfThree(
 948                                                 oldRowMatch[j]
 949                                                                 + openingPenalty1[numActive1[i]][numGapOpens2[j]],
 950                                                 oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
 951                                                 oldRowInsertY[j]
 952                                                                 + openingPenalty1[numActive1[i]][numGapOpens2[j]],
 953                                                 'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
 954
 955                                 // going to INSERT Y state
 956                                 ChooseBestOfThree(
 957                                                 newRowMatch[j - 1]
 958                                                                 + openingPenalty2[numActive2[j]][numGapOpens1[i]],
 959                                                 newRowInsertX[j - 1]
 960                                                                 + openingPenalty2[numActive2[j]][numGapOpens1[i]],
 961                                                 newRowInsertY[j - 1]
 962                                                                 + continuingPenalty2[numActive2[j]], 'M', 'X',
 963                                                 'Y', &newRowInsertY[j], tracebackPtr++);
 964                         }
 965
 966                         // swap rows
 967                         float *temp;
 968                         temp = oldRowMatch;
 969                         oldRowMatch = newRowMatch;
 970                         newRowMatch = temp;
 971                         temp = oldRowInsertX;
 972                         oldRowInsertX = newRowInsertX;
 973                         newRowInsertX = temp;
 974                         temp = oldRowInsertY;
 975                         oldRowInsertY = newRowInsertY;
 976                         newRowInsertY = temp;
 977                 }
 978
 979                 // store best score
 980                 float total;
 981                 char matrix;
 982                 ChooseBestOfThree(oldRowMatch[seq2Length], oldRowInsertX[seq2Length],
 983                                 oldRowInsertY[seq2Length], 'M', 'X', 'Y', &total, &matrix);
 984
 985                 delete[] twoRows;
 986
 987                 // compute traceback
 988                 SafeVector<char> *alignment = new SafeVector<char>;
 989                 assert(alignment);
 990                 int r = seq1Length, c = seq2Length;
 991                 while (r != 0 || c != 0) {
 992
 993                         int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
 994                         char ch = tracebackMatrix[(r * (seq2Length + 1) + c) * 3 + offset];
 995                         switch (matrix) {
 996                         case 'Y':
 997                                 c--;
 998                                 alignment->push_back('Y');
 999                                 break;
1000                         case 'X':
1001                                 r--;
1002                                 alignment->push_back('X');
1003                                 break;
1004                         case 'M':
1005                                 c--;
1006                                 r--;
1007                                 alignment->push_back('B');
1008                                 break;
1009                         default:
1010                                 assert(false);
1011                         }
1012                         matrix = ch;
1013                 }
1014
1015                 delete[] tracebackMatrix;
1016
1017                 reverse(alignment->begin(), alignment->end());
1018
1019                 return make_pair(alignment, 1.0f);
1020         }
1021
1022         /////////////////////////////////////////////////////////////////
1023         // ProbabilisticModel::ComputeViterbiAlignment()
1024         //
1025         // Computes the highest probability pairwise alignment using the
1026         // probabilistic model.  The final alignment is returned as a
1027         //  pair consisting of:
1028         //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
1029         //        denote insertions in one of the two sequences and
1030         //        B's denote that both sequences are present (i.e.
1031         //        matches).
1032         //    (2) a float containing the log probability of the best
1033         //        alignment (not used)
1034         /////////////////////////////////////////////////////////////////
1035
1036         pair<SafeVector<char> *, float> ComputeViterbiAlignment(Sequence *seq1,
1037                         Sequence *seq2) const {
1038
1039                 assert(seq1);
1040                 assert(seq2);
1041
1042                 const int seq1Length = seq1->GetLength();
1043                 const int seq2Length = seq2->GetLength();
1044
1045                 // retrieve the points to the beginning of each sequence
1046                 SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
1047                 SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
1048
1049                 // create viterbi matrix
1050                 VF *viterbiPtr = new VF(
1051                                 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
1052                 assert(viterbiPtr);
1053                 VF &viterbi = *viterbiPtr;
1054
1055                 // create traceback matrix
1056                 VI *tracebackPtr = new VI(
1057                                 NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), -1);
1058                 assert(tracebackPtr);
1059                 VI &traceback = *tracebackPtr;
1060
1061                 // initialization condition
1062                 for (int k = 0; k < NumMatrixTypes; k++)
1063                         viterbi[k] = initialDistribution[k];
1064
1065                 // remember offset for each index combination
1066                 int ij = 0;
1067                 int i1j = -seq2Length - 1;
1068                 int ij1 = -1;
1069                 int i1j1 = -seq2Length - 2;
1070
1071                 ij *= NumMatrixTypes;
1072                 i1j *= NumMatrixTypes;
1073                 ij1 *= NumMatrixTypes;
1074                 i1j1 *= NumMatrixTypes;
1075
1076                 // compute viterbi scores
1077                 for (int i = 0; i <= seq1Length; i++) {
1078                         unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
1079                         for (int j = 0; j <= seq2Length; j++) {
1080                                 unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
1081
1082                                 if (i > 0 && j > 0) {
1083                                         for (int k = 0; k < NumMatrixTypes; k++) {
1084                                                 float newVal = viterbi[k + i1j1] + transProb[k][0]
1085                                                                 + matchProb[c1][c2];
1086                                                 if (viterbi[0 + ij] < newVal) {
1087                                                         viterbi[0 + ij] = newVal;
1088                                                         traceback[0 + ij] = k;
1089                                                 }
1090                                         }
1091                                 }
1092                                 if (i > 0) {
1093                                         for (int k = 0; k < NumInsertStates; k++) {
1094                                                 float valFromMatch = insProb[c1][k] + viterbi[0 + i1j]
1095                                                                 + transProb[0][2 * k + 1];
1096                                                 float valFromIns = insProb[c1][k]
1097                                                                 + viterbi[2 * k + 1 + i1j]
1098                                                                 + transProb[2 * k + 1][2 * k + 1];
1099                                                 if (valFromMatch >= valFromIns) {
1100                                                         viterbi[2 * k + 1 + ij] = valFromMatch;
1101                                                         traceback[2 * k + 1 + ij] = 0;
1102                                                 } else {
1103                                                         viterbi[2 * k + 1 + ij] = valFromIns;
1104                                                         traceback[2 * k + 1 + ij] = 2 * k + 1;
1105                                                 }
1106                                         }
1107                                 }
1108                                 if (j > 0) {
1109                                         for (int k = 0; k < NumInsertStates; k++) {
1110                                                 float valFromMatch = insProb[c2][k] + viterbi[0 + ij1]
1111                                                                 + transProb[0][2 * k + 2];
1112                                                 float valFromIns = insProb[c2][k]
1113                                                                 + viterbi[2 * k + 2 + ij1]
1114                                                                 + transProb[2 * k + 2][2 * k + 2];
1115                                                 if (valFromMatch >= valFromIns) {
1116                                                         viterbi[2 * k + 2 + ij] = valFromMatch;
1117                                                         traceback[2 * k + 2 + ij] = 0;
1118                                                 } else {
1119                                                         viterbi[2 * k + 2 + ij] = valFromIns;
1120                                                         traceback[2 * k + 2 + ij] = 2 * k + 2;
1121                                                 }
1122                                         }
1123                                 }
1124
1125                                 ij += NumMatrixTypes;
1126                                 i1j += NumMatrixTypes;
1127                                 ij1 += NumMatrixTypes;
1128                                 i1j1 += NumMatrixTypes;
1129                         }
1130                 }
1131
1132                 // figure out best terminating cell
1133                 float bestProb = LOG_ZERO;
1134                 int state = -1;
1135                 for (int k = 0; k < NumMatrixTypes; k++) {
1136                         float thisProb =
1137                                         viterbi[k
1138                                                         + NumMatrixTypes
1139                                                                         * ((seq1Length + 1) * (seq2Length + 1) - 1)]
1140                                                         + initialDistribution[k];
1141                         if (bestProb < thisProb) {
1142                                 bestProb = thisProb;
1143                                 state = k;
1144                         }
1145                 }
1146                 assert(state != -1);
1147
1148                 delete viterbiPtr;
1149
1150                 // compute traceback
1151                 SafeVector<char> *alignment = new SafeVector<char>;
1152                 assert(alignment);
1153                 int r = seq1Length, c = seq2Length;
1154                 while (r != 0 || c != 0) {
1155                         int newState = traceback[state
1156                                         + NumMatrixTypes * (r * (seq2Length + 1) + c)];
1157
1158                         if (state == 0) {
1159                                 c--;
1160                                 r--;
1161                                 alignment->push_back('B');
1162                         } else if (state % 2 == 1) {
1163                                 r--;
1164                                 alignment->push_back('X');
1165                         } else {
1166                                 c--;
1167                                 alignment->push_back('Y');
1168                         }
1169
1170                         state = newState;
1171                 }
1172
1173                 delete tracebackPtr;
1174
1175                 reverse(alignment->begin(), alignment->end());
1176
1177                 return make_pair(alignment, bestProb);
1178         }
1179
1180         /////////////////////////////////////////////////////////////////
1181         // ProbabilisticModel::BuildPosterior()
1182         //
1183         // Builds a posterior probability matrix needed to align a pair
1184         // of alignments.  Mathematically, the returned matrix M is
1185         // defined as follows:
1186         //    M[i,j] =     sum          sum      f(s,t,i,j)
1187         //             s in align1  t in align2
1188         // where
1189         //                  [  P(s[i'] <--> t[j'])
1190         //                  [       if s[i'] is a letter in the ith column of align1 and
1191         //                  [          t[j'] it a letter in the jth column of align2
1192         //    f(s,t,i,j) =  [
1193         //                  [  0    otherwise
1194         //
1195         /////////////////////////////////////////////////////////////////
1196
1197         VF *BuildPosterior(MultiSequence *align1, MultiSequence *align2,
1198                         const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1199                         float cutoff = 0.0f) const {
1200                 const int seq1Length = align1->GetSequence(0)->GetLength();
1201                 const int seq2Length = align2->GetSequence(0)->GetLength();
1202
1203                 VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0);
1204                 assert(posteriorPtr);
1205                 VF &posterior = *posteriorPtr;
1206                 VF::iterator postPtr = posterior.begin();
1207
1208                 // for each s in align1
1209                 for (int i = 0; i < align1->GetNumSequences(); i++) {
1210                         int first = align1->GetSequence(i)->GetLabel();
1211                         SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
1212
1213                         // for each t in align2
1214                         for (int j = 0; j < align2->GetNumSequences(); j++) {
1215                                 int second = align2->GetSequence(j)->GetLabel();
1216                                 SafeVector<int> *mapping2 =
1217                                                 align2->GetSequence(j)->GetMapping();
1218
1219                                 if (first < second) {
1220
1221                                         // get the associated sparse matrix
1222                                         SparseMatrix *matrix = sparseMatrices[first][second];
1223
1224                                         for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) {
1225                                                 SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
1226                                                 int base = (*mapping1)[ii] * (seq2Length + 1);
1227                                                 int rowSize = matrix->GetRowSize(ii);
1228
1229                                                 // add in all relevant values
1230                                                 for (int jj = 0; jj < rowSize; jj++)
1231                                                         posterior[base + (*mapping2)[row[jj].first]] +=
1232                                                                         row[jj].second;
1233
1234                                                 // subtract cutoff
1235                                                 for (int jj = 0; jj < matrix->GetSeq2Length(); jj++)
1236                                                         posterior[base + (*mapping2)[jj]] -= cutoff;
1237                                         }
1238
1239                                 } else {
1240
1241                                         // get the associated sparse matrix
1242                                         SparseMatrix *matrix = sparseMatrices[second][first];
1243
1244                                         for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) {
1245                                                 SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
1246                                                 int base = (*mapping2)[jj];
1247                                                 int rowSize = matrix->GetRowSize(jj);
1248
1249                                                 // add in all relevant values
1250                                                 for (int ii = 0; ii < rowSize; ii++)
1251                                                         posterior[base
1252                                                                         + (*mapping1)[row[ii].first]
1253                                                                                         * (seq2Length + 1)] +=
1254                                                                         row[ii].second;
1255
1256                                                 // subtract cutoff
1257                                                 for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
1258                                                         posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -=
1259                                                                         cutoff;
1260                                         }
1261
1262                                 }
1263
1264                                 delete mapping2;
1265                         }
1266
1267                         delete mapping1;
1268                 }
1269
1270                 return posteriorPtr;
1271         }
1272         //added by Liu Yongchao.Feb 23, 2010
1273         VF *BuildPosterior(int* seqsWeights, MultiSequence *align1,
1274                         MultiSequence *align2,
1275                         const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1276                         float cutoff = 0.0f) const {
1277                 const int seq1Length = align1->GetSequence(0)->GetLength();
1278                 const int seq2Length = align2->GetSequence(0)->GetLength();
1279
1280                 VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0);
1281                 assert(posteriorPtr);
1282                 VF &posterior = *posteriorPtr;
1283                 VF::iterator postPtr = posterior.begin();
1284
1285                 //compute the total sum of all weights
1286                 float totalWeights = 0;
1287                 for (int i = 0; i < align1->GetNumSequences(); i++) {
1288                         int first = align1->GetSequence(i)->GetLabel();
1289                         int w1 = seqsWeights[first];
1290                         for (int j = 0; j < align2->GetNumSequences(); j++) {
1291                                 int second = align2->GetSequence(j)->GetLabel();
1292                                 int w2 = seqsWeights[second];
1293
1294                                 totalWeights += w1 * w2;
1295                         }
1296                 }
1297                 // for each s in align1
1298                 for (int i = 0; i < align1->GetNumSequences(); i++) {
1299                         int first = align1->GetSequence(i)->GetLabel();
1300                         int w1 = seqsWeights[first];
1301                         SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
1302                         // for each t in align2
1303                         for (int j = 0; j < align2->GetNumSequences(); j++) {
1304                                 int second = align2->GetSequence(j)->GetLabel();
1305                                 int w2 = seqsWeights[second];
1306                                 SafeVector<int> *mapping2 =
1307                                                 align2->GetSequence(j)->GetMapping();
1308
1309                                 float w = (float) (w1 * w2) / totalWeights;
1310                                 if (first < second) {
1311
1312                                         // get the associated sparse matrix
1313                                         SparseMatrix *matrix = sparseMatrices[first][second];
1314
1315                                         for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) {
1316                                                 SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
1317                                                 int base = (*mapping1)[ii] * (seq2Length + 1);
1318                                                 int rowSize = matrix->GetRowSize(ii);
1319
1320                                                 // add in all relevant values
1321                                                 for (int jj = 0; jj < rowSize; jj++)
1322                                                         posterior[base + (*mapping2)[row[jj].first]] += w
1323                                                                         * row[jj].second;
1324
1325                                                 // subtract cutoff
1326                                                 for (int jj = 0; jj < matrix->GetSeq2Length(); jj++)
1327                                                         posterior[base + (*mapping2)[jj]] -= w * cutoff;
1328                                         }
1329
1330                                 } else {
1331
1332                                         // get the associated sparse matrix
1333                                         SparseMatrix *matrix = sparseMatrices[second][first];
1334
1335                                         for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) {
1336                                                 SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
1337                                                 int base = (*mapping2)[jj];
1338                                                 int rowSize = matrix->GetRowSize(jj);
1339
1340                                                 // add in all relevant values
1341                                                 for (int ii = 0; ii < rowSize; ii++)
1342                                                         posterior[base
1343                                                                         + (*mapping1)[row[ii].first]
1344                                                                                         * (seq2Length + 1)] += w
1345                                                                         * row[ii].second;
1346
1347                                                 // subtract cutoff
1348                                                 for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
1349                                                         posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -=
1350                                                                         w * cutoff;
1351                                         }
1352
1353                                 }
1354
1355                                 delete mapping2;
1356                         }
1357
1358                         delete mapping1;
1359                 }
1360
1361                 return posteriorPtr;
1362         }
1363 };
1364
1365 #endif