4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
9 #include "ProfileBase.h"
20 ProfileBase::ProfileBase(int prfLen, int firstS, int lastS)
28 vlut.resize(numLetters, vector<int>(numLetters));
29 for(int i = 0; i < numLetters; i++)
34 pascarellaRes = "ACDEFGHKILMNPQRSTVYW";
35 int pasprob[] = { 87, 87, 104, 69, 80, 139, 100, 104, 68, 79,
36 71, 137, 126, 93, 128, 124, 111, 75, 100, 77};
37 pascarellaProb = vector<int>(pasprob, pasprob + 20);
39 profile.resize(prfLength + 2, vector<int>(LENCOL + 2));
47 * @param useStructPenalties
48 * @param gapPenaltyMask
52 void ProfileBase::calcGapCoeff(SeqArray* seqArray, vector<int>* gaps,
53 bool useStructPenalties, vector<char>* gapPenaltyMask, int gapCoef, int lenCoef)
58 int _numSeq, val, pcid;
60 vector<int> vWeight, resWeight, hydWeight;
62 int _maxAA = userParameters->getMaxAA();
64 _numSeq = lastSeq - firstSeq;
67 pcid = static_cast<int>(percentId(&(*seqArray)[firstSeq], &(*seqArray)[firstSeq + 1]));
72 for (j = 0; j < prfLength; j++)
77 // Check for a gap penalty mask
79 if (useStructPenalties != false)
81 nVarPen = nHydPen = nPrefPen = true;
84 else if (userParameters->getNoVarPenalties() == false && pcid > 60)
86 nHydPen = nPrefPen = true;
92 nHydPen = userParameters->getNoHydPenalties();
93 nPrefPen = userParameters->getNoPrefPenalties();
94 gdist = userParameters->getGapDist();
97 for (i = firstSeq; i < lastSeq; i++)
99 // Include end gaps as gaps ?
102 if (userParameters->getUseEndGaps() == false &&
103 userParameters->getEndGapPenalties() == false)
105 for (j = 0; j < prfLength; j++)
107 c = (*seqArray)[i][j];
108 if ((c < 0) || (c > _maxAA))
113 for (j = prfLength - 1; j >= 0; j--)
115 c = (*seqArray)[i][j];
116 if ((c < 0) || (c > _maxAA))
123 for (j = is; j < ie; j++)
125 if (((*seqArray)[i][j] < 0) || ((*seqArray)[i][j] > _maxAA))
132 int _DNAFlag = userParameters->getDNAFlag();
134 if ((!_DNAFlag) && (nVarPen == false))
136 vWeight.resize(prfLength + 2);
137 calcVPenalties(seqArray, &vWeight);
141 if ((!_DNAFlag) && (nPrefPen == false))
143 resWeight.resize(prfLength + 2);
144 calcResidueSpecificPen(seqArray, &resWeight);
147 if ((!_DNAFlag) && (nHydPen == false))
149 hydWeight.resize(prfLength + 2);
150 calcHydrophilicPen(seqArray, &hydWeight);
153 gapPos.resize(prfLength + 2);
155 // mark the residues close to an existing gap (set gaps[i] = -ve)
157 if (_DNAFlag || (gdist <= 0))
159 for (i = 0; i < prfLength; i++)
161 gapPos[i] = (*gaps)[i];
167 while (i < prfLength)
171 gapPos[i] = (*gaps)[i];
176 for (j = -gdist + 1; j < 0; j++)
178 if ((i + j >= 0) && (i + j < prfLength) &&
179 (((*gaps)[i + j] == 0) || ((*gaps)[i + j] < j)))
184 while ((*gaps)[i] > 0)
190 gapPos[i] = (*gaps)[i];
193 for (j = 0; j < gdist; j++)
195 if ((*gaps)[i + j] > 0)
199 if ((i + j >= 0) && (i + j < prfLength) &&
200 (((*gaps)[i + j] == 0) || ((*gaps)[i + j] < -j)))
202 gapPos[i + j] = -j-1;
210 for (j = 0; j < prfLength; j++)
215 // apply residue-specific and hydrophilic gap penalties.
219 profile[j + 1][GAPCOL] = localPenalty(gapCoef, j, &resWeight, &hydWeight,
221 profile[j+1][LENCOL] = lenCoef;
225 profile[j + 1][GAPCOL] = gapCoef;
226 profile[j + 1][LENCOL] = lenCoef;
229 // increase gap penalty near to existing gaps.
233 profile[j + 1][GAPCOL] = static_cast<int>((profile[j + 1][GAPCOL] * (2.0 + 2.0 * (gdist + gapPos[j]) / gdist)));
238 scale = ((float)(_numSeq - (*gaps)[j]) / (float)_numSeq) * reducedGap;
239 profile[j + 1][GAPCOL] = static_cast<int>(scale * gapCoef);
240 profile[j + 1][LENCOL] = static_cast<int>(0.5 * lenCoef);
243 // apply the gap penalty mask
245 if (useStructPenalties != NONE)
247 val = (*gapPenaltyMask)[j] - '0';
248 if (val > 0 && val < 10)
250 profile[j + 1][GAPCOL] *= val;
251 profile[j + 1][LENCOL] *= val;
255 // make sure no penalty is zero - even for all-gap positions
257 if (profile[j + 1][GAPCOL] <= 0)
259 profile[j + 1][GAPCOL] = 1;
261 if (profile[j + 1][LENCOL] <= 0)
263 profile[j + 1][LENCOL] = 1;
267 // set the penalties at the beginning and end of the profile
268 if(userParameters->getEndGapPenalties() == true)
270 profile[0][GAPCOL] = gapCoef;
271 profile[0][LENCOL] = lenCoef;
275 profile[0][GAPCOL] = 0;
276 profile[0][LENCOL] = 0;
277 profile[prfLength][GAPCOL] = 0;
278 profile[prfLength][LENCOL] = 0;
280 if (userParameters->getDebug() > 0)
282 cout << "Opening penalties:\n";
283 for(i = 0; i <= prfLength; i++)
285 cout <<" " << i << ":" << profile[i][GAPCOL]<< " ";
289 if (userParameters->getDebug() > 0)
291 cout << "Extension penalties:\n";
292 for(i = 0; i <= prfLength; i++)
294 cout << i << ":" << profile[i][LENCOL] << " ";
300 /** **************************************************************************************
301 * Protected functions *
302 *****************************************************************************************/
310 void ProfileBase::calcVPenalties(SeqArray* aln, vector<int>* weight)
314 int _maxAA = userParameters->getMaxAA();
315 int aminoCodeix1, aminoCodeix2;
317 for (i = 0; i < prfLength; i++)
321 for(j = i - vwindow;j < i + vwindow; j++)
323 if(j >= 0 && j < prfLength)
325 ix1 = (*aln)[firstSeq][j];
326 ix2 = (*aln)[firstSeq + 1][j];
327 if ((ix1 < 0) || (ix1 > _maxAA) || (ix2 < 0) || (ix2 > _maxAA))
331 aminoCodeix1 = userParameters->getAminoAcidCode(ix1);
332 aminoCodeix2 = userParameters->getAminoAcidCode(ix2);
333 (*weight)[i] += vlut[aminoCodeix1 - 'A'][aminoCodeix2 - 'A'];
337 /* now we have a weight -t < w < t */
340 (*weight)[i] = ((*weight)[i] * 100)/(2 * t);
343 /* now we have a weight vll < w < 100 */
344 if ((*weight)[i] < vll)
355 void ProfileBase::calcResidueSpecificPen(SeqArray* aln, vector<int>* weight)
360 int _maxAA = userParameters->getMaxAA();
361 int _pascarellaNumRes = pascarellaRes.size();
363 _numSeq = lastSeq - firstSeq;
364 for (i = 0; i < prfLength; i++)
367 for (k = firstSeq; k < lastSeq; k++)
369 for (j = 0; j < _pascarellaNumRes; j++)
372 if ((ix < 0) || (ix > _maxAA))
374 if (userParameters->getAminoAcidCode(ix) == pascarellaRes[j])
376 (*weight)[i] += (180 - pascarellaProb[j]);
381 (*weight)[i] /= _numSeq;
390 void ProfileBase::calcHydrophilicPen(SeqArray* aln, vector<int>* weight)
393 int numHydResidues, j, k;
397 int _maxAA = userParameters->getMaxAA();
399 hyd.resize(prfLength + 2);
400 string _hydResidues(userParameters->getHydResidues());
401 numHydResidues = _hydResidues.size();
403 for (i = 0; i < prfLength; i++)
408 for (k = firstSeq; k < lastSeq; k++)
410 for (i = 0; i < prfLength; i++)
413 for (j = 0; j < numHydResidues; j++)
416 if ((res < 0) || (res > _maxAA))
418 if (userParameters->getAminoAcidCode(res) == _hydResidues[j])
426 while (i < prfLength)
433 while ((hyd[i] != 0) && (i < prfLength))
440 for (j = s; j < e; j++)
449 scale = lastSeq - firstSeq;
450 for (i = 0; i < prfLength; i++)
452 (*weight)[i] = static_cast<int>(((*weight)[i] / scale)); // Mark change 17-5-07
466 int ProfileBase::localPenalty(int penalty, int n, vector<int>* resWeight,
467 vector<int>* hydWeight, vector<int>* vWeight)
472 if (userParameters->getDNAFlag())
478 if (nVarPen == false)
480 gw *= (*vWeight)[n] / 100.0;
483 if (nHydPen == false)
485 if ((*hydWeight)[n] > 0)
491 if ((nPrefPen == false) && (h == false))
493 gw *= ((*resWeight)[n] / 100.0);
501 /** *********************************************************************
502 * Note dont think this will work. Dont have -3. Need to use lengths! *
503 ************************************************************************/
511 float ProfileBase::percentId(vector<int>* s1, vector<int>* s2)
518 for (i = 0; i < prfLength; i++)
520 if (((*s1)[i] >= 0) && ((*s1)[i] < userParameters->getMaxAA()))
523 if ((*s1)[i] == (*s2)[i])
528 if ((*s1)[i]==(-3) || (*s2)[i]==(-3))
530 break; // I dont have -3 at the end!
541 score = 100.0 * (float)count / (float)total;