4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
5 * Changes: Mark 20-6-07, I added a call to calculateMaxlengths
10 #include "MyersMillerProfileAlign.h"
11 #include "Iteration.h"
21 MyersMillerProfileAlign::MyersMillerProfileAlign()
22 : _gapPos1(userParameters->getGapPos1()),
23 _gapPos2(userParameters->getGapPos2())
36 int MyersMillerProfileAlign::profileAlign(Alignment* alnPtr, DistMatrix* distMat,
37 vector<int>* group, int* aligned)
39 bool negative = false;
40 int i = 0, j = 0, count = 0;
43 int len = 0, len1 = 0, len2 = 0, is = 0, minLen = 0;
44 int se1 = 0, se2 = 0, sb1 = 0, sb2 = 0;
48 double logmin = 0.0, logdiff = 0.0;
50 int matrix[NUMRES][NUMRES];
51 int numSeqsProf1 = 0, numSeqsProf2 = 0;
52 // Initialise the matrix. To get rid of a valgrind error.
53 for(int i = 0; i < NUMRES; i++)
55 for(int j = 0; j < NUMRES; j++)
61 numSeq = alnPtr->getNumSeqs();
62 seqArray.resize(numSeq);
63 alnWeight.resize(numSeq);
65 for (i = 0; i < numSeq; i++)
67 if (aligned[i + 1] == 0)
74 for (i = 0; i < numSeq; i++)
76 if ((*group)[i + 1] == 1)
80 else if ((*group)[i + 1] == 2)
86 if ((nseqs1 == 0) || (nseqs2 == 0))
90 numSeqsProf1 = nseqs1;
91 numSeqsProf2 = nseqs2;
95 switchProfiles = true;
96 for (i = 0; i < numSeq; i++)
98 if ((*group)[i + 1] == 1)
102 else if ((*group)[i + 1] == 2)
110 switchProfiles = false;
113 // calculate the mean of the sequence pc identities between the two groups
117 negative = userParameters->getUseNegMatrix();
119 for (i = 0; i < numSeq; i++)
121 if ((*group)[i + 1] == 1)
123 for (j = 0; j < numSeq; j++)
125 if ((*group)[j + 1] == 2)
128 pcid += (*distMat)(i + 1, j + 1);
134 pcid = pcid / (float)count;
136 // Make the first profile.
139 for (i = 0; i < numSeq; i++)
141 if ((*group)[i + 1] == 1)
143 if (alnPtr->getSeqLength(i + 1) > prfLength1)
145 prfLength1 = alnPtr->getSeqLength(i + 1);
151 for (i = 0; i < numSeq; i++)
153 if ((*group)[i + 1] == 1)
155 len = alnPtr->getSeqLength(i + 1);
157 // initialise with the other vector!
158 seqArray[nseqs1] = vector<int>(alnPtr->getSequence(i + 1)->begin() + 1,
159 alnPtr->getSequence(i + 1)->end());
160 seqArray[nseqs1].resize(prfLength1 + extraEndElemNum);
162 for (j = len; j < prfLength1; j++)
164 seqArray[nseqs1][j] = userParameters->getGapPos1();
166 alnWeight[nseqs1] = alnPtr->getSeqWeight(i);
172 // Make the second profile.
175 for (i = 0; i < numSeq; i++)
177 if ((*group)[i + 1] == 2)
179 if (alnPtr->getSeqLength(i + 1) > prfLength2)
181 prfLength2 = alnPtr->getSeqLength(i + 1);
187 for (i = 0; i < numSeq; i++)
189 if ((*group)[i + 1] == 2)
191 len = alnPtr->getSeqLength(i + 1);
193 seqArray[nseqs1 + nseqs2] = vector<int>(alnPtr->getSequence(i + 1)->begin() + 1,
194 alnPtr->getSequence(i + 1)->end());
195 seqArray[nseqs1 + nseqs2].resize(prfLength2 + extraEndElemNum);
197 for (j = len; j < prfLength2; j++)
199 seqArray[nseqs1 + nseqs2][j] = userParameters->getGapPos1();
202 seqArray[nseqs1 + nseqs2][j] = ENDALN;
203 alnWeight[nseqs1 + nseqs2] = alnPtr->getSeqWeight(i);
204 //cout << "weight " << nseqs1 + nseqs2 << alnWeight[nseqs1 + nseqs2] << "\n";
209 // Change the Max alignment length in the Alignment Object!
210 alnPtr->setMaxAlnLength(prfLength1 + prfLength2 + 2);
212 // calculate real length of profiles - removing gaps!
215 for (i = 0; i < nseqs1; i++)
218 for (j = 0; j < utilityObject->MIN(static_cast<int>(seqArray[i].size()), prfLength1); j++)
221 if ((c != _gapPos1) && (c != _gapPos2))
228 len1 = static_cast<int>(len1 / (float)nseqs1);
231 for (i = nseqs1; i < nseqs2 + nseqs1; i++)
234 for (j = 0; j < utilityObject->MIN(static_cast<int>(seqArray[i].size()), prfLength2);
238 if ((c != _gapPos1) && (c != _gapPos2))
245 len2 = static_cast<int>(len2 / (float)nseqs2);
247 PrfScaleValues scaleVals;
248 scaleVals.scale = 1.0;
249 scaleVals.intScale = 100.0;
252 minLen = utilityObject->MIN(len1, len2);
255 // Get the substitution matrix that will be stored in 'matrix'
256 maxRes = subMatrix->getProfileAlignMatrix(matrix, pcid, minLen, scaleVals, matAvgScore);
258 if (maxRes == 0 || maxRes == -1)
262 if (userParameters->getDNAFlag())
264 gapcoef1 = gapcoef2 = static_cast<int>(100.0 * userParameters->getGapOpen() * scaleVals.scale);
265 lencoef1 = lencoef2 = static_cast<int>(100.0 * userParameters->getGapExtend() * scaleVals.scale);
269 if (len1 == 0 || len2 == 0)
276 logmin = 1.0 / log10((double)minLen);
279 logdiff = 1.0 + 0.5 * log10((double)((float)len2 / (float)len1));
281 else if (len1 < len2)
283 logdiff = 1.0 + 0.5 * log10((double)((float)len1 / (float)len2));
298 gapcoef1 = gapcoef2 = static_cast<int>(100.0 *(float)(userParameters->getGapOpen()));
299 lencoef1 = lencoef2 = static_cast<int>(100.0 * userParameters->getGapExtend());
303 if (matAvgScore <= 0)
305 gapcoef1 = gapcoef2 = static_cast<int>(100.0 *(float)(userParameters->getGapOpen() + logmin));
309 gapcoef1 = gapcoef2 = static_cast<int>(scaleVals.scale * matAvgScore *
310 (float)(userParameters->getGapOpen() / (logdiff * logmin)));
312 lencoef1 = lencoef2 = static_cast<int>(100.0 * userParameters->getGapExtend());
315 // We need one profile with substitution matrix information and one without!
316 // But this will change when we have the LE scoring function.
317 profileWithSub = new ProfileWithSub(prfLength1, 0, nseqs1);
318 profileStandard = new ProfileStandard(prfLength2, nseqs1, nseqs1 + nseqs2);
320 // Calculate the profile array with Substitution matrix info
321 // calculate the Gap Coefficients.
323 gaps.resize(alnPtr->getMaxAlnLength() + 1);
326 if (switchProfiles == false)
328 profile1Pen = userParameters->getStructPenalties1() && userParameters->getUseSS1();
329 profileWithSub->calcGapCoeff(&seqArray, &gaps, profile1Pen,
330 alnPtr->getGapPenaltyMask1(), gapcoef1, lencoef1);
334 profile1Pen = userParameters->getStructPenalties2() && userParameters->getUseSS2();
335 profileWithSub->calcGapCoeff(&seqArray, &gaps, profile1Pen,
336 alnPtr->getGapPenaltyMask2(), gapcoef1, lencoef1);
338 // calculate the profile matrix.
339 profileWithSub->calcProfileWithSub(&seqArray, &gaps, matrix, &alnWeight);
340 profile1 = profileWithSub->getProfilePtr();
342 if (userParameters->getDebug() > 4)
344 string aminoAcidCodes = userParameters->getAminoAcidCodes();
345 for (j = 0; j <= userParameters->getMaxAA(); j++)
347 cout << aminoAcidCodes[j] << " ";
350 for (i = 0; i < prfLength1; i++)
352 for (j = 0; j <= userParameters->getMaxAA(); j++)
354 cout << (*profile1)[i + 1][j] << " ";
356 cout << (*profile1)[i + 1][_gapPos1]<< " ";
357 cout << (*profile1)[i + 1][_gapPos2]<< " ";
358 cout << (*profile1)[i + 1][GAPCOL] << " " << (*profile1)[i + 1][LENCOL]
362 // Calculate the standard profile array
363 // calculate the Gap Coefficients.
365 if (switchProfiles == false)
367 profile2Pen = userParameters->getStructPenalties2() && userParameters->getUseSS2();
368 profileStandard->calcGapCoeff(&seqArray, &gaps, profile2Pen,
369 alnPtr->getGapPenaltyMask2(), gapcoef2, lencoef2);
373 profile2Pen = userParameters->getStructPenalties1() && userParameters->getUseSS1();
374 profileStandard->calcGapCoeff(&seqArray, &gaps, profile2Pen,
375 alnPtr->getGapPenaltyMask1(), gapcoef2, lencoef2);
378 // calculate the profile matrix.
380 profileStandard->calcStandardProfile(&seqArray, &alnWeight);
381 profile2 = profileStandard->getProfilePtr();
383 if (userParameters->getDebug() > 4)
385 string aminoAcidCodes = userParameters->getAminoAcidCodes();
386 for (j = 0; j <= userParameters->getMaxAA(); j++)
388 cout << aminoAcidCodes[j] << " ";
391 for (i = 0; i < prfLength2; i++)
393 for (j = 0; j <= userParameters->getMaxAA(); j++)
395 cout << (*profile2)[i + 1][j] << " ";
397 cout << (*profile2)[i + 1][_gapPos1]<< " ";
398 cout << (*profile2)[i + 1][_gapPos2]<< " ";
399 cout << (*profile2)[i + 1][GAPCOL] << " " << (*profile2)[i + 1][LENCOL]
405 int _maxAlnLength = alnPtr->getMaxAlnLength();
407 alnPath1.resize(_maxAlnLength + 1);
408 alnPath2.resize(_maxAlnLength + 1);
412 // use Myers and Miller to align two sequences
421 HH.resize(_maxAlnLength + 1);
422 DD.resize(_maxAlnLength + 1);
423 RR.resize(_maxAlnLength + 1);
424 SS.resize(_maxAlnLength + 1);
425 gS.resize(_maxAlnLength + 1);
426 displ.resize(_maxAlnLength + 1);
428 score = progDiff(sb1, sb2, se1 - sb1, se2 - sb2, (*profile1)[0][GAPCOL],
429 (*profile1)[prfLength1][GAPCOL]);
431 // May not need if we recreate the Profile every time!
438 alignmentLength = progTracepath();
442 addGGaps(alnPtr, &seqArray);
444 profileStandard->resetProfile();
445 profileWithSub->resetProfile();
447 prfLength1 = alignmentLength;
452 if(userParameters->getDoRemoveFirstIteration() == TREE)
454 Iteration iterateObj;
455 iterateObj.iterationOnTreeNode(numSeqsProf1, numSeqsProf2, prfLength1, prfLength2,
459 // Now we resize the SeqArray that holds the sequences in the alignment class
460 // and also update it with the new aligned sequences
462 SeqArray* newSequences = alnPtr->getSeqArrayForRealloc();
464 for (j = 0; j < numSeq; j++)
466 if ((*group)[j + 1] == 1)
468 (*newSequences)[j + 1].clear();
469 (*newSequences)[j + 1].resize(prfLength1 + 1);
470 for (i = 0; i < prfLength1; i++)
472 (*newSequences)[j + 1][i + 1] = seqArray[seqNum][i];
477 for (j = 0; j < numSeq; j++)
479 if ((*group)[j + 1] == 2)
481 (*newSequences)[j + 1].clear();
482 (*newSequences)[j + 1].resize(prfLength1 + 1);
483 for (i = 0; i < prfLength1; i++)
485 (*newSequences)[j + 1][i + 1] = seqArray[seqNum][i];
491 alnPtr->calculateMaxLengths(); // Mark change 20-6-07
496 delete profileWithSub;
497 delete profileStandard;
499 int retScore = (score / 100);
505 /** ****************************************************************************************
506 * Private functions *
507 *******************************************************************************************/
519 int MyersMillerProfileAlign::progDiff(int A, int B, int M, int N, int go1, int go2)
521 int midi, midj, type;
524 static int t, tl, g, h;
528 static int hh, f, e, s;
530 /* Boundary cases: M <= 1 or N == 0 */
531 if (userParameters->getDebug() > 2)
533 cout << "A " << A << " B " << B << " M " << M << " N " << N
534 << " midi " << M / 2 << " go1 " << go1 << " go2 " << go2<< "\n";
537 /* if sequence B is empty.... */
542 /* if sequence A is not empty.... */
547 /* delete residues A[1] to A[M] */
551 return ( - gapPenalty1(A, B, M));
554 /* if sequence A is empty.... */
561 /* insert residues B[1] to B[N] */
564 return ( - gapPenalty2(A, B, N));
567 /* if sequence A has just one residue.... */
571 midh = - gapPenalty1(A + 1, B + 1, N);
575 midh = - gapPenalty2(A + 1, B, 1) - gapPenalty1(A + 1, B + 1,
579 for (j = 1; j <= N; j++)
581 hh = - gapPenalty1(A, B + 1, j - 1) + prfScore(A + 1, B + j)
582 - gapPenalty1(A + 1, B + j + 1, N - j);
611 /* Divide sequence A in half: midi */
615 /* In a forward phase, calculate all HH[j] and HH[j] */
618 t = - openPenalty1(A, B + 1);
619 tl = - extPenalty1(A, B + 1);
620 for (j = 1; j <= N; j++)
623 DD[j] = t - openPenalty2(A + 1, B + j);
632 t = - openPenalty2(A + 1, B);
634 tl = - extPenalty2(A + 1, B);
635 for (i = 1; i <= midi; i++)
638 HH[0] = hh = t = t + tl;
639 f = t - openPenalty1(A + i, B + 1);
641 for (j = 1; j <= N; j++)
643 g = openPenalty1(A + i, B + j);
644 h = extPenalty1(A + i, B + j);
645 if ((hh = hh - g - h) > (f = f - h))
649 g = openPenalty2(A + i, B + j);
650 h = extPenalty2(A + i, B + j);
651 if ((hh = HH[j] - g - h) > (e = DD[j] - h))
655 hh = s + prfScore(A + i, B + j);
674 /* In a reverse phase, calculate all RR[j] and SS[j] */
678 for (j = N - 1; j >= 0; j--)
680 g = - openPenalty1(A + M, B + j + 1);
681 tl -= extPenalty1(A + M, B + j + 1);
683 SS[j] = RR[j] - openPenalty2(A + M, B + j);
684 gS[j] = openPenalty2(A + M, B + j);
688 for (i = M - 1; i >= midi; i--)
697 g = - openPenalty2(A + i + 1, B + N);
699 tl -= extPenalty2(A + i + 1, B + N);
701 t = openPenalty1(A + i, B + N);
704 for (j = N - 1; j >= 0; j--)
706 g = openPenalty1(A + i, B + j + 1);
707 h = extPenalty1(A + i, B + j + 1);
708 if ((hh = hh - g - h) > (f = f - h - g + t))
713 g = openPenalty2(A + i + 1, B + j);
714 h = extPenalty2(A + i + 1, B + j);
722 e = SS[j] - h - g + openPenalty2(A + i + 2, B + j);
729 hh = s + prfScore(A + i + 1, B + j + 1);
746 gS[N] = openPenalty2(A + midi + 1, B + N);
748 /* find midj, such that HH[j]+RR[j] or DD[j]+SS[j]+gap is the maximum */
750 midh = HH[0] + RR[0];
753 for (j = 0; j <= N; j++)
757 if (hh > midh || (HH[j] != DD[j] && RR[j] == SS[j]))
764 for (j = N; j >= 0; j--)
766 hh = DD[j] + SS[j] + gS[j];
776 /* Conquer recursively around midpoint */
782 if (userParameters->getDebug() > 2)
784 cout << "Type 1,1: midj " << midj << "\n";
786 progDiff(A, B, midi, midj, go1, 1);
787 if (userParameters->getDebug() > 2)
789 cout << "Type 1,2: midj " << midj << "\n";
791 progDiff(A + midi, B + midj, M - midi, N - midj, 1, go2);
795 if (userParameters->getDebug() > 2)
797 cout << "Type 2,1: midj " << midj << "\n";
799 progDiff(A, B, midi - 1, midj, go1, 0);
801 if (userParameters->getDebug() > 2)
803 cout << "Type 2,2: midj " << midj << "\n";
805 progDiff(A + midi + 1, B + midj, M - midi - 1, N - midj, 0, go2);
808 return midh; /* Return the score of the best alignment */
816 void MyersMillerProfileAlign::addGGaps(Alignment* alnPtr, SeqArray* seqArray)
823 ta.resize(alignmentLength + 1);
825 for (j = 0; j < nseqs1; j++)
828 for (i = 0; i < alignmentLength; i++)
830 if (alnPath1[i] == 2)
832 if (ix < ((int)(*seqArray)[j].size() - extraEndElemNum))
834 ta[i] = (*seqArray)[j][ix];
842 else if (alnPath1[i] == 1)
845 insertion in first alignment...
851 cerr << "Error in aln_path\n";
856 len = alignmentLength;
858 (*seqArray)[j].resize(len + 2);
860 for (i = 0; i < len; i++)
862 (*seqArray)[j][i] = ta[i];
864 (*seqArray)[j][len] = ENDALN;
868 for (j = nseqs1; j < nseqs1 + nseqs2; j++)
871 for (i = 0; i < alignmentLength; i++)
873 if (alnPath2[i] == 2)
875 if (ix < ((int)(*seqArray)[j].size() - extraEndElemNum))
877 ta[i] = (*seqArray)[j][ix];
885 else if (alnPath2[i] == 1)
888 insertion in second alignment...
894 cerr << "Error in alnPath\n";
899 len = alignmentLength;
901 (*seqArray)[j].resize(len + 2);
903 for (i = 0; i < len; i++)
905 (*seqArray)[j][i] = ta[i];
907 (*seqArray)[j][len] = ENDALN;
911 if (userParameters->getStructPenalties1() != NONE)
913 addGGapsMask(alnPtr->getGapPenaltyMask1(), alignmentLength,
914 &alnPath1, &alnPath2);
916 if (userParameters->getStructPenalties1() == SECST)
918 addGGapsMask(alnPtr->getSecStructMask1(), alignmentLength,
919 &alnPath1, &alnPath2);
922 if (userParameters->getStructPenalties2() != NONE)
924 addGGapsMask(alnPtr->getGapPenaltyMask2(), alignmentLength,
925 &alnPath2, &alnPath1);
927 if (userParameters->getStructPenalties2() == SECST)
929 addGGapsMask(alnPtr->getSecStructMask2(), alignmentLength,
930 &alnPath2, &alnPath1);
942 void MyersMillerProfileAlign::addGGapsMask(vector<char>* mask, int len, vector<int>* path1,
948 ta = new char[len + 1];
951 if (switchProfiles == false)
953 for (i = 0; i < len; i++)
955 if ((*path1)[i] == 2)
960 else if ((*path1)[i] == 1)
968 for (i = 0; i < len; i++)
970 if ((*path2)[i] == 2)
975 else if ((*path2)[i] == 1)
981 mask->resize(len + 2);
983 for (i = 0; i < len; i++)
1000 inline int MyersMillerProfileAlign::prfScore(int n, int m)
1006 int _maxAA = userParameters->getMaxAA(); // NOTE Change here!
1007 for (ix = 0; ix <= _maxAA; ix++)
1009 score += ((*profile1)[n][ix] * (*profile2)[m][ix]);
1011 score += ((*profile1)[n][_gapPos1] * (*profile2)[m][_gapPos1]);
1012 score += ((*profile1)[n][_gapPos2] * (*profile2)[m][_gapPos2]);
1013 return (score / 10);
1021 int MyersMillerProfileAlign::progTracepath()
1023 int i, j, k, pos, toDo;
1027 toDo = printPtr - 1;
1029 for (i = 1; i <= toDo; ++i)
1031 if (userParameters->getDebug() > 1)
1033 cout << displ[i] << " ";
1043 if ((k = displ[i]) > 0)
1045 for (j = 0; j <= k - 1; ++j)
1047 alnPath2[pos + j] = 2;
1048 alnPath1[pos + j] = 1;
1054 k = (displ[i] < 0) ? displ[i] * - 1: displ[i];
1055 for (j = 0; j <= k - 1; ++j)
1057 alnPath1[pos + j] = 2;
1058 alnPath2[pos + j] = 1;
1064 if (userParameters->getDebug() > 1)
1078 void MyersMillerProfileAlign::progDel(int k)
1082 lastPrint = displ[printPtr - 1] -= k;
1086 lastPrint = displ[printPtr++] = - (k);
1095 void MyersMillerProfileAlign::progAdd(int k)
1099 displ[printPtr - 1] = k;
1100 displ[printPtr++] = lastPrint;
1104 lastPrint = displ[printPtr++] = k;
1112 void MyersMillerProfileAlign::progAlign()
1114 displ[printPtr++] = lastPrint = 0;
1124 inline int MyersMillerProfileAlign::openPenalty1(int i, int j) // NOTE Change here!
1127 if (!userParameters->getEndGapPenalties() && (i == 0 || i == prfLength1))
1132 g = (*profile2)[j][GAPCOL] + (*profile1)[i][GAPCOL];
1143 inline int MyersMillerProfileAlign::extPenalty1(int i, int j) // NOTE Change here!
1147 if (!userParameters->getEndGapPenalties() && (i == 0 || i == prfLength1))
1152 h = (*profile2)[j][LENCOL];
1164 int MyersMillerProfileAlign::gapPenalty1(int i, int j, int k)
1174 if (!userParameters->getEndGapPenalties() && (i == 0 || i == prfLength1))
1179 g = (*profile2)[j][GAPCOL] + (*profile1)[i][GAPCOL];
1180 for (ix = 0; ix < k && ix + j < prfLength2; ix++)
1182 h += (*profile2)[ix + j][LENCOL];
1196 inline int MyersMillerProfileAlign::openPenalty2(int i, int j) // NOTE Change here!
1200 if (!userParameters->getEndGapPenalties() && (j == 0 || j == prfLength2))
1205 g = (*profile1)[i][GAPCOL] + (*profile2)[j][GAPCOL];
1216 inline int MyersMillerProfileAlign::extPenalty2(int i, int j) // NOTE Change here!
1220 if (!userParameters->getEndGapPenalties() && (j == 0 || j == prfLength2))
1225 h = (*profile1)[i][LENCOL];
1237 int MyersMillerProfileAlign::gapPenalty2(int i, int j, int k)
1247 if (!userParameters->getEndGapPenalties() && (j == 0 || j == prfLength2))
1252 g = (*profile1)[i][GAPCOL] + (*profile2)[j][GAPCOL];
1253 for (ix = 0; ix < k && ix + i < prfLength1; ix++)
1255 h += (*profile1)[ix + i][LENCOL];