binaries/src/clustalw/src/pairwise/FullPairwiseAlign.cpp

   1 /**
   2  * Author: Mark Larkin
   3  *
   4  * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
   5  */
   6 #ifdef HAVE_CONFIG_H
   7     #include "config.h"
   8 #endif
   9 #include "FullPairwiseAlign.h"
  10 #include <math.h>
  11
  12 namespace clustalw
  13 {
  14
  15 FullPairwiseAlign::FullPairwiseAlign()
  16 : _maxAlnLength(0),
  17   intScale(0),
  18   mmScore(0),
  19   printPtr(0),
  20   lastPrint(0),
  21   _gapOpen(0),
  22   _gapExtend(0),
  23   seq1(0),
  24   seq2(0),
  25   maxScore(0),
  26   sb1(0),
  27   sb2(0),
  28   se1(0),
  29   se2(0)
  30 {
  31
  32 }
  33
  34 void FullPairwiseAlign::pairwiseAlign(Alignment *alignPtr, DistMatrix *distMat, int iStart, int iEnd, int jStart, int jEnd)
  35 {
  36     int si, sj, i;
  37     int n, m, len1, len2;
  38     int maxRes;
  39     int _matAvgScore;
  40     int res;
  41     double _score;
  42     float gapOpenScale, gapExtendScale;
  43
  44     try
  45     {
  46
  47         if(distMat->getSize() != alignPtr->getNumSeqs() + 1)
  48         {
  49             cerr << "The distance matrix is not the right size!\n"
  50                  << "Need to terminate program.\n";
  51             exit(1);
  52         }
  53         if((iStart < 0) || (iEnd < iStart) || (jStart < 0) || (jEnd < jStart))
  54         {
  55             cerr << "The range for pairwise Alignment is incorrect.\n"
  56                  << "Need to terminate program.\n";
  57             exit(1);
  58         }
  59
  60         _maxAlnLength = alignPtr->getMaxAlnLength();
  61
  62         int _numSeqs = alignPtr->getNumSeqs();
  63         if(_numSeqs == 0)
  64         {
  65             return;
  66         }
  67
  68         int num = (2 * _maxAlnLength) + 1;
  69         bool _DNAFlag = userParameters->getDNAFlag();
  70         float _pwGapOpen, _pwGapExtend;
  71         _pwGapOpen = userParameters->getPWGapOpen();
  72         _pwGapExtend = userParameters->getPWGapExtend();
  73
  74         displ.resize(num);
  75         HH.resize(_maxAlnLength);
  76         DD.resize(_maxAlnLength);
  77         RR.resize(_maxAlnLength);
  78         SS.resize(_maxAlnLength);
  79         // Note these 2 lines replace the stuff above because it is all done in the SubMatrix
  80         PairScaleValues scaleValues;
  81         maxRes = subMatrix->getPairwiseMatrix(matrix, scaleValues, _matAvgScore);
  82         if (maxRes == 0)
  83         {
  84             cerr << "Could not get the substitution matrix\n";
  85             return;
  86         }
  87
  88         intScale = scaleValues.intScale;
  89         gapOpenScale = scaleValues.gapOpenScale;
  90         gapExtendScale = scaleValues.gapExtendScale;
  91
  92         int _gapPos1, _gapPos2;
  93         _gapPos1 = userParameters->getGapPos1();
  94         _gapPos2 = userParameters->getGapPos2();
  95         const SeqArray* _ptrToSeqArray = alignPtr->getSeqArray(); //This is faster!
  96
  97         for (si = utilityObject->MAX(0, iStart); si < _numSeqs && si < iEnd; si++)
  98         {
  99             n = alignPtr->getSeqLength(si + 1);
 100             len1 = 0;
 101             for (i = 1; i <= n; i++)
 102             {
 103                 res = (*_ptrToSeqArray)[si + 1][i];
 104                 if ((res != _gapPos1) && (res != _gapPos2))
 105                 {
 106                     len1++;
 107                 }
 108             }
 109
 110             for (sj = utilityObject->MAX(si+1, jStart+1); sj < _numSeqs && sj < jEnd; sj++)
 111             {
 112                 m = alignPtr->getSeqLength(sj + 1);
 113                 if (n == 0 || m == 0)
 114                 {
 115                     distMat->SetAt(si + 1, sj + 1, 1.0);
 116                     distMat->SetAt(sj + 1, si + 1, 1.0);
 117                     continue;
 118                 }
 119                 len2 = 0;
 120                 for (i = 1; i <= m; i++)
 121                 {
 122                     res = (*_ptrToSeqArray)[sj + 1][i];
 123                     if ((res != _gapPos1) && (res != _gapPos2))
 124                     {
 125                         len2++;
 126                     }
 127                 }
 128
 129                 if (_DNAFlag)
 130                 {
 131                     _gapOpen = static_cast<int>(2 * _pwGapOpen * intScale *
 132                                     gapOpenScale);
 133                     _gapExtend = static_cast<int>(_pwGapExtend * intScale * gapExtendScale);
 134                 }
 135                 else
 136                 {
 137                     if (_matAvgScore <= 0)
 138                     {
 139                         _gapOpen = 2 * static_cast<int>((_pwGapOpen +
 140                                log(static_cast<double>(utilityObject->MIN(n, m)))) * intScale);
 141                     }
 142                     else
 143                     {
 144                         _gapOpen = static_cast<int>(2 * _matAvgScore * (_pwGapOpen +
 145                         log(static_cast<double>(utilityObject->MIN(n, m)))) * gapOpenScale);
 146                     }
 147                     _gapExtend = static_cast<int>(_pwGapExtend * intScale);
 148                 }
 149                 // align the sequences
 150
 151                 seq1 = si + 1;
 152                 seq2 = sj + 1;
 153
 154                 _ptrToSeq1 = alignPtr->getSequence(seq1);
 155                 _ptrToSeq2 = alignPtr->getSequence(seq2);
 156
 157                 forwardPass(_ptrToSeq1, _ptrToSeq2, n, m);
 158                 reversePass(_ptrToSeq1, _ptrToSeq2);
 159
 160                 lastPrint = 0;
 161                 printPtr = 1;
 162
 163                 // use Myers and Miller to align two sequences
 164
 165                 maxScore = diff(sb1 - 1, sb2 - 1, se1 - sb1 + 1, se2 - sb2 + 1,
 166                     (int)0, (int)0);
 167
 168                 // calculate percentage residue identity
 169
 170                 mmScore = tracePath(sb1, sb2);
 171
 172                 if (len1 == 0 || len2 == 0)
 173                 {
 174                     mmScore = 0;
 175                 }
 176                 else
 177                 {
 178                     mmScore /= (float)utilityObject->MIN(len1, len2);
 179                 }
 180
 181                 _score = ((float)100.0 - mmScore) / (float)100.0;
 182                 distMat->SetAt(si + 1, sj + 1, _score);
 183                 distMat->SetAt(sj + 1, si + 1, _score);
 184
 185                 if(userParameters->getDisplayInfo())
 186                 {
 187                     utilityObject->info("Sequences (%d:%d) Aligned. Score:  %d",
 188                                         si+1, sj+1, (int)mmScore);
 189                 }
 190             }
 191         }
 192
 193         displ.clear();
 194         HH.clear();
 195         DD.clear();
 196         RR.clear();
 197         SS.clear();
 198     }
 199     catch(const exception& e)
 200     {
 201         cerr << "An exception has occured in the FullPairwiseAlign class.\n"
 202              << e.what() << "\n";
 203         exit(1);
 204     }
 205 }
 206
 207 void FullPairwiseAlign::add(int v)
 208 {
 209     if (lastPrint < 0)
 210     {
 211         displ[printPtr - 1] = v;
 212         displ[printPtr++] = lastPrint;
 213     }
 214     else
 215     {
 216         lastPrint = displ[printPtr++] = v;
 217     }
 218 }
 219
 220 inline int FullPairwiseAlign::calcScore(int iat, int jat, int v1, int v2)
 221 {
 222     return matrix[(*_ptrToSeq1)[v1 + iat]][(*_ptrToSeq2)[v2 + jat]];
 223 }
 224
 225 float FullPairwiseAlign::tracePath(int tsb1, int tsb2)
 226 {
 227     int res1, res2;
 228     int i1, i2;
 229     int i, k, pos, toDo;
 230     int count;
 231     float score;
 232
 233     toDo = printPtr - 1;
 234     i1 = tsb1;
 235     i2 = tsb2;
 236
 237     pos = 0;
 238     count = 0;
 239     for (i = 1; i <= toDo; ++i)
 240     {
 241         if (displ[i] == 0)
 242         {
 243             res1 = (*_ptrToSeq1)[i1];
 244             res2 = (*_ptrToSeq2)[i2];
 245
 246             if ((res1 != userParameters->getGapPos1()) &&
 247                 (res2 != userParameters->getGapPos2()) && (res1 == res2))
 248             {
 249                 count++;
 250             }
 251             ++i1;
 252             ++i2;
 253             ++pos;
 254         }
 255         else
 256         {
 257             if ((k = displ[i]) > 0)
 258             {
 259                 i2 += k;
 260                 pos += k;
 261             }
 262             else
 263             {
 264                 i1 -= k;
 265                 pos -= k;
 266             }
 267         }
 268     }
 269
 270     score = 100.0 *(float)count;
 271     return (score);
 272 }
 273
 274 void FullPairwiseAlign::forwardPass(const vector<int>* seq1, const vector<int>* seq2, int n, int m)
 275 {
 276     int i, j;
 277     int f, hh, p, t;
 278
 279     maxScore = 0;
 280     se1 = se2 = 0;
 281     for (i = 0; i <= m; i++)
 282     {
 283         HH[i] = 0;
 284         DD[i] =  -_gapOpen;
 285     }
 286
 287     for (i = 1; i <= n; i++)
 288     {
 289         hh = p = 0;
 290         f =  -_gapOpen;
 291
 292         for (j = 1; j <= m; j++)
 293         {
 294
 295             f -= _gapExtend;
 296             t = hh - _gapOpen - _gapExtend;
 297             if (f < t)
 298             {
 299                 f = t;
 300             }
 301
 302             DD[j] -= _gapExtend;
 303             t = HH[j] - _gapOpen - _gapExtend;
 304             if (DD[j] < t)
 305             {
 306                 DD[j] = t;
 307             }
 308
 309             hh = p + matrix[(*seq1)[i]][(*seq2)[j]];
 310             if (hh < f)
 311             {
 312                 hh = f;
 313             }
 314             if (hh < DD[j])
 315             {
 316                 hh = DD[j];
 317             }
 318             if (hh < 0)
 319             {
 320                 hh = 0;
 321             }
 322
 323             p = HH[j];
 324             HH[j] = hh;
 325
 326             if (hh > maxScore)
 327             {
 328                 maxScore = hh;
 329                 se1 = i;
 330                 se2 = j;
 331             }
 332         }
 333     }
 334
 335 }
 336
 337 void FullPairwiseAlign::reversePass(const vector<int>* seq1, const vector<int>* seq2)
 338 {
 339     int i, j;
 340     int f, hh, p, t;
 341     int cost;
 342
 343     cost = 0;
 344     sb1 = sb2 = 1;
 345     for (i = se2; i > 0; i--)
 346     {
 347         HH[i] =  - 1;
 348         DD[i] =  - 1;
 349     }
 350
 351     for (i = se1; i > 0; i--)
 352     {
 353         hh = f =  - 1;
 354         if (i == se1)
 355         {
 356             p = 0;
 357         }
 358         else
 359         {
 360             p =  - 1;
 361         }
 362
 363         for (j = se2; j > 0; j--)
 364         {
 365
 366             f -= _gapExtend;
 367             t = hh - _gapOpen - _gapExtend;
 368             if (f < t)
 369             {
 370                 f = t;
 371             }
 372
 373             DD[j] -= _gapExtend;
 374             t = HH[j] - _gapOpen - _gapExtend;
 375             if (DD[j] < t)
 376             {
 377                 DD[j] = t;
 378             }
 379
 380             hh = p + matrix[(*seq1)[i]][(*seq2)[j]];
 381             if (hh < f)
 382             {
 383                 hh = f;
 384             }
 385             if (hh < DD[j])
 386             {
 387                 hh = DD[j];
 388             }
 389
 390             p = HH[j];
 391             HH[j] = hh;
 392
 393             if (hh > cost)
 394             {
 395                 cost = hh;
 396                 sb1 = i;
 397                 sb2 = j;
 398                 if (cost >= maxScore)
 399                 {
 400                     break;
 401                 }
 402             }
 403         }
 404         if (cost >= maxScore)
 405         {
 406             break;
 407         }
 408     }
 409
 410 }
 411
 412 int FullPairwiseAlign::diff(int A, int B, int M, int N, int tb, int te)
 413 {
 414     int type;
 415     int midi, midj, i, j;
 416     int midh;
 417     static int f, hh, e, s, t;
 418
 419     if (N <= 0)
 420     {
 421         if (M > 0)
 422         {
 423             del(M);
 424         }
 425
 426         return ( -(int)tbgap(M, tb));
 427     }
 428
 429     if (M <= 1)
 430     {
 431         if (M <= 0)
 432         {
 433             add(N);
 434             return ( -(int)tbgap(N, tb));
 435         }
 436
 437         midh =  - (tb + _gapExtend) - tegap(N, te);
 438         hh =  - (te + _gapExtend) - tbgap(N, tb);
 439         if (hh > midh)
 440         {
 441             midh = hh;
 442         }
 443         midj = 0;
 444         for (j = 1; j <= N; j++)
 445         {
 446             hh = calcScore(1, j, A, B) - tegap(N - j, te) - tbgap(j - 1, tb);
 447             if (hh > midh)
 448             {
 449                 midh = hh;
 450                 midj = j;
 451             }
 452         }
 453
 454         if (midj == 0)
 455         {
 456             del(1);
 457             add(N);
 458         }
 459         else
 460         {
 461             if (midj > 1)
 462             {
 463                 add(midj - 1);
 464             }
 465             displ[printPtr++] = lastPrint = 0;
 466             if (midj < N)
 467             {
 468                 add(N - midj);
 469             }
 470         }
 471         return midh;
 472     }
 473
 474     // Divide: Find optimum midpoint (midi,midj) of cost midh
 475
 476     midi = M / 2;
 477     HH[0] = 0;
 478     t =  - tb;
 479     for (j = 1; j <= N; j++)
 480     {
 481         HH[j] = t = t - _gapExtend;
 482         DD[j] = t - _gapOpen;
 483     }
 484
 485     t =  - tb;
 486     for (i = 1; i <= midi; i++)
 487     {
 488         s = HH[0];
 489         HH[0] = hh = t = t - _gapExtend;
 490         f = t - _gapOpen;
 491         for (j = 1; j <= N; j++)
 492         {
 493             if ((hh = hh - _gapOpen - _gapExtend) > (f = f - _gapExtend))
 494             {
 495                 f = hh;
 496             }
 497             if ((hh = HH[j] - _gapOpen - _gapExtend) > (e = DD[j] - _gapExtend))
 498             {
 499                 e = hh;
 500             }
 501             hh = s + calcScore(i, j, A, B);
 502             if (f > hh)
 503             {
 504                 hh = f;
 505             }
 506             if (e > hh)
 507             {
 508                 hh = e;
 509             }
 510
 511             s = HH[j];
 512             HH[j] = hh;
 513             DD[j] = e;
 514         }
 515     }
 516
 517     DD[0] = HH[0];
 518
 519     RR[N] = 0;
 520     t =  - te;
 521     for (j = N - 1; j >= 0; j--)
 522     {
 523         RR[j] = t = t - _gapExtend;
 524         SS[j] = t - _gapOpen;
 525     }
 526
 527     t =  - te;
 528     for (i = M - 1; i >= midi; i--)
 529     {
 530         s = RR[N];
 531         RR[N] = hh = t = t - _gapExtend;
 532         f = t - _gapOpen;
 533
 534         for (j = N - 1; j >= 0; j--)
 535         {
 536
 537             if ((hh = hh - _gapOpen - _gapExtend) > (f = f - _gapExtend))
 538             {
 539                 f = hh;
 540             }
 541             if ((hh = RR[j] - _gapOpen - _gapExtend) > (e = SS[j] - _gapExtend))
 542             {
 543                 e = hh;
 544             }
 545             hh = s + calcScore(i + 1, j + 1, A, B);
 546             if (f > hh)
 547             {
 548                 hh = f;
 549             }
 550             if (e > hh)
 551             {
 552                 hh = e;
 553             }
 554
 555             s = RR[j];
 556             RR[j] = hh;
 557             SS[j] = e;
 558
 559         }
 560     }
 561
 562     SS[N] = RR[N];
 563
 564     midh = HH[0] + RR[0];
 565     midj = 0;
 566     type = 1;
 567     for (j = 0; j <= N; j++)
 568     {
 569         hh = HH[j] + RR[j];
 570         if (hh >= midh)
 571         if (hh > midh || (HH[j] != DD[j] && RR[j] == SS[j]))
 572         {
 573             midh = hh;
 574             midj = j;
 575         }
 576     }
 577
 578     for (j = N; j >= 0; j--)
 579     {
 580         hh = DD[j] + SS[j] + _gapOpen;
 581         if (hh > midh)
 582         {
 583             midh = hh;
 584             midj = j;
 585             type = 2;
 586         }
 587     }
 588
 589     // Conquer recursively around midpoint
 590
 591
 592     if (type == 1)
 593     {
 594         // Type 1 gaps
 595         diff(A, B, midi, midj, tb, _gapOpen);
 596         diff(A + midi, B + midj, M - midi, N - midj, _gapOpen, te);
 597     }
 598     else
 599     {
 600         diff(A, B, midi - 1, midj, tb, 0);
 601         del(2);
 602         diff(A + midi + 1, B + midj, M - midi - 1, N - midj, 0, te);
 603     }
 604
 605     return midh; // Return the score of the best alignment
 606 }
 607
 608 void FullPairwiseAlign::del(int k)
 609 {
 610     if (lastPrint < 0)
 611     {
 612         lastPrint = displ[printPtr - 1] -= k;
 613     }
 614     else
 615     {
 616         lastPrint = displ[printPtr++] =  - (k);
 617     }
 618 }
 619
 620 int FullPairwiseAlign::gap(int k)
 621 {
 622     if(k <= 0)
 623     {
 624         return 0;
 625     }
 626     else
 627     {
 628         return _gapOpen + _gapExtend * k;
 629     }
 630 }
 631
 632 int FullPairwiseAlign::tbgap(int k, int tb)
 633 {
 634     if(k <= 0)
 635     {
 636         return 0;
 637     }
 638     else
 639     {
 640         return tb + _gapExtend * k;
 641     }
 642 }
 643
 644 int FullPairwiseAlign::tegap(int k, int te)
 645 {
 646     if(k <= 0)
 647     {
 648         return 0;
 649     }
 650     else
 651     {
 652         return te + _gapExtend * k;
 653     }
 654 }
 655
 656 }