4 #include "objscore.h"
\r
7 #define TRACE_SEQPAIR 0
\r
8 #define TEST_SPFAST 0
\r
10 extern SCOREMATRIX VTML_LA;
\r
11 extern SCOREMATRIX PAM200;
\r
12 extern SCOREMATRIX PAM200NoCenter;
\r
13 extern SCOREMATRIX VTML_SP;
\r
14 extern SCOREMATRIX VTML_SPNoCenter;
\r
15 extern SCOREMATRIX NUC_SP;
\r
17 SCORE g_SPScoreLetters;
\r
18 SCORE g_SPScoreGaps;
\r
20 static SCORE TermGapScore(bool Gap)
\r
29 return g_scoreGapOpen/2;
\r
34 return g_scoreGapExtend;
\r
37 Quit("TermGapScore?!");
\r
41 SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1,
\r
42 const MSA &msa2, unsigned uSeqIndex2)
\r
44 const unsigned uColCount = msa1.GetColCount();
\r
45 const unsigned uColCount2 = msa2.GetColCount();
\r
46 if (uColCount != uColCount2)
\r
47 Quit("ScoreSeqPairLetters, different lengths");
\r
52 Log("ScoreSeqPairLetters\n");
\r
54 msaTmp.SetSize(2, uColCount);
\r
55 msaTmp.CopySeq(0, msa1, uSeqIndex1);
\r
56 msaTmp.CopySeq(1, msa2, uSeqIndex2);
\r
61 SCORE scoreLetters = 0;
\r
62 SCORE scoreGaps = 0;
\r
63 bool bGapping1 = false;
\r
64 bool bGapping2 = false;
\r
66 unsigned uColStart = 0;
\r
67 bool bLeftTermGap = false;
\r
68 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
70 bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
\r
71 bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
\r
72 if (!bGap1 || !bGap2)
\r
75 bLeftTermGap = true;
\r
76 uColStart = uColIndex;
\r
81 unsigned uColEnd = uColCount - 1;
\r
82 bool bRightTermGap = false;
\r
83 for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)
\r
85 bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);
\r
86 bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);
\r
87 if (!bGap1 || !bGap2)
\r
90 bRightTermGap = true;
\r
91 uColEnd = (unsigned) iColIndex;
\r
97 Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);
\r
100 for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)
\r
102 unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex);
\r
103 if (uLetter1 >= g_AlphaSize)
\r
105 unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex);
\r
106 if (uLetter2 >= g_AlphaSize)
\r
109 SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2];
\r
110 scoreLetters += scoreMatch;
\r
112 return scoreLetters;
\r
115 SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1,
\r
116 const MSA &msa2, unsigned uSeqIndex2)
\r
118 const unsigned uColCount = msa1.GetColCount();
\r
119 const unsigned uColCount2 = msa2.GetColCount();
\r
120 if (uColCount != uColCount2)
\r
121 Quit("ScoreSeqPairGaps, different lengths");
\r
126 Log("ScoreSeqPairGaps\n");
\r
128 msaTmp.SetSize(2, uColCount);
\r
129 msaTmp.CopySeq(0, msa1, uSeqIndex1);
\r
130 msaTmp.CopySeq(1, msa2, uSeqIndex2);
\r
135 SCORE scoreGaps = 0;
\r
136 bool bGapping1 = false;
\r
137 bool bGapping2 = false;
\r
139 unsigned uColStart = 0;
\r
140 bool bLeftTermGap = false;
\r
141 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
143 bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
\r
144 bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
\r
145 if (!bGap1 || !bGap2)
\r
147 if (bGap1 || bGap2)
\r
148 bLeftTermGap = true;
\r
149 uColStart = uColIndex;
\r
154 unsigned uColEnd = uColCount - 1;
\r
155 bool bRightTermGap = false;
\r
156 for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)
\r
158 bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);
\r
159 bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);
\r
160 if (!bGap1 || !bGap2)
\r
162 if (bGap1 || bGap2)
\r
163 bRightTermGap = true;
\r
164 uColEnd = (unsigned) iColIndex;
\r
170 Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);
\r
173 for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)
\r
175 bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
\r
176 bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
\r
178 if (bGap1 && bGap2)
\r
186 Log("Gap open seq 1 col %d\n", uColIndex);
\r
188 if (uColIndex == uColStart)
\r
189 scoreGaps += TermGapScore(true);
\r
191 scoreGaps += g_scoreGapOpen;
\r
195 scoreGaps += g_scoreGapExtend;
\r
204 Log("Gap open seq 2 col %d\n", uColIndex);
\r
206 if (uColIndex == uColStart)
\r
207 scoreGaps += TermGapScore(true);
\r
209 scoreGaps += g_scoreGapOpen;
\r
213 scoreGaps += g_scoreGapExtend;
\r
221 if (bGapping1 || bGapping2)
\r
223 scoreGaps -= g_scoreGapOpen;
\r
224 scoreGaps += TermGapScore(true);
\r
229 // The usual sum-of-pairs objective score: sum the score
\r
230 // of the alignment of each pair of sequences.
\r
231 SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[])
\r
234 Log("==================ObjScoreSP==============\n");
\r
238 g_SPScoreLetters = 0;
\r
241 if (0 != MatchScore)
\r
243 const unsigned uColCount = msa.GetColCount();
\r
244 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
245 MatchScore[uColIndex] = 0;
\r
248 const unsigned uSeqCount = msa.GetSeqCount();
\r
249 SCORE scoreTotal = 0;
\r
250 unsigned uPairCount = 0;
\r
252 Log("Seq1 Seq2 wt1 wt2 Letters Gaps Unwt.Score Wt.Score Total\n");
\r
253 Log("---- ---- ------ ------ ---------- ---------- ---------- ---------- ----------\n");
\r
255 for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
\r
257 const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1);
\r
258 for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2)
\r
260 const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2);
\r
261 const WEIGHT w = w1*w2;
\r
263 SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqIndex1, msa, uSeqIndex2);
\r
264 SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqIndex1, msa, uSeqIndex2);
\r
265 SCORE scorePair = scoreLetters + scoreGaps;
\r
268 scoreTotal += w*scorePair;
\r
270 g_SPScoreLetters += w*scoreLetters;
\r
271 g_SPScoreGaps += w*scoreGaps;
\r
273 Log("%4d %4d %6.3f %6.3f %10.2f %10.2f %10.2f %10.2f %10.2f >%s >%s\n",
\r
283 msa.GetSeqName(uSeqIndex1),
\r
284 msa.GetSeqName(uSeqIndex2));
\r
290 SCORE f = ObjScoreSPFast(msa);
\r
291 Log("Fast = %.6g\n", f);
\r
292 Log("Brute = %.6g\n", scoreTotal);
\r
293 if (BTEq(f, scoreTotal))
\r
296 Log("** DISAGREE **\n");
\r
299 // return scoreTotal / uPairCount;
\r
303 // Objective score defined as the dynamic programming score.
\r
304 // Input is two alignments, which must be of the same length.
\r
305 // Result is the same profile-profile score that is optimized
\r
306 // by dynamic programming.
\r
307 SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[])
\r
309 const unsigned uColCount = msa1.GetColCount();
\r
310 if (msa2.GetColCount() != uColCount)
\r
311 Quit("ObjScoreDP, must be same length");
\r
313 const unsigned uColCount1 = msa1.GetColCount();
\r
314 const unsigned uColCount2 = msa2.GetColCount();
\r
316 const ProfPos *PA = ProfileFromMSA(msa1);
\r
317 const ProfPos *PB = ProfileFromMSA(msa2);
\r
319 return ObjScoreDP_Profs(PA, PB, uColCount1, MatchScore);
\r
322 SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount,
\r
323 SCORE MatchScore[])
\r
326 // Log("Profile 1:\n");
\r
327 // ListProfile(PA, uColCount, &msa1);
\r
329 // Log("Profile 2:\n");
\r
330 // ListProfile(PB, uColCount, &msa2);
\r
333 SCORE scoreTotal = 0;
\r
335 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
337 const ProfPos &PPA = PA[uColIndex];
\r
338 const ProfPos &PPB = PB[uColIndex];
\r
340 SCORE scoreGap = 0;
\r
341 SCORE scoreMatch = 0;
\r
342 // If gapped column...
\r
343 if (PPA.m_bAllGaps && PPB.m_bAllGaps)
\r
345 else if (PPA.m_bAllGaps)
\r
347 if (uColCount - 1 == uColIndex || !PA[uColIndex+1].m_bAllGaps)
\r
348 scoreGap = PPB.m_scoreGapClose;
\r
349 if (0 == uColIndex || !PA[uColIndex-1].m_bAllGaps)
\r
350 scoreGap += PPB.m_scoreGapOpen;
\r
351 //if (0 == scoreGap)
\r
352 // scoreGap = PPB.m_scoreGapExtend;
\r
354 else if (PPB.m_bAllGaps)
\r
356 if (uColCount - 1 == uColIndex || !PB[uColIndex+1].m_bAllGaps)
\r
357 scoreGap = PPA.m_scoreGapClose;
\r
358 if (0 == uColIndex || !PB[uColIndex-1].m_bAllGaps)
\r
359 scoreGap += PPA.m_scoreGapOpen;
\r
360 //if (0 == scoreGap)
\r
361 // scoreGap = PPA.m_scoreGapExtend;
\r
364 scoreMatch = ScoreProfPos2(PPA, PPB);
\r
366 if (0 != MatchScore)
\r
367 MatchScore[uColIndex] = scoreMatch;
\r
369 scoreTotal += scoreMatch + scoreGap;
\r
371 extern bool g_bTracePPScore;
\r
372 extern MSA *g_ptrPPScoreMSA1;
\r
373 extern MSA *g_ptrPPScoreMSA2;
\r
374 if (g_bTracePPScore)
\r
376 const MSA &msa1 = *g_ptrPPScoreMSA1;
\r
377 const MSA &msa2 = *g_ptrPPScoreMSA2;
\r
378 const unsigned uSeqCount1 = msa1.GetSeqCount();
\r
379 const unsigned uSeqCount2 = msa2.GetSeqCount();
\r
381 for (unsigned n = 0; n < uSeqCount1; ++n)
\r
382 Log("%c", msa1.GetChar(n, uColIndex));
\r
384 for (unsigned n = 0; n < uSeqCount2; ++n)
\r
385 Log("%c", msa2.GetChar(n, uColIndex));
\r
386 Log(" %10.3f", scoreMatch);
\r
388 Log(" %10.3f", scoreGap);
\r
399 // Objective score defined as the sum of profile-sequence
\r
400 // scores for each sequence in the alignment. The profile
\r
401 // is computed from the entire alignment, so this includes
\r
402 // the score of each sequence against itself. This is to
\r
403 // avoid recomputing the profile each time, so we reduce
\r
404 // complexity but introduce a questionable approximation.
\r
405 // The goal is to see if we can exploit the apparent
\r
406 // improvement in performance of log-expectation score
\r
407 // over the usual sum-of-pairs by optimizing this
\r
408 // objective score in the iterative refinement stage.
\r
409 SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[])
\r
411 if (g_PPScore != PPSCORE_LE)
\r
412 Quit("FastScoreMSA_LASimple: LA");
\r
414 const unsigned uSeqCount = msa.GetSeqCount();
\r
415 const unsigned uColCount = msa.GetColCount();
\r
417 const ProfPos *Prof = ProfileFromMSA(msa);
\r
419 if (0 != MatchScore)
\r
420 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
421 MatchScore[uColIndex] = 0;
\r
423 SCORE scoreTotal = 0;
\r
424 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
426 const WEIGHT weightSeq = msa.GetSeqWeight(uSeqIndex);
\r
427 SCORE scoreSeq = 0;
\r
428 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
430 const ProfPos &PP = Prof[uColIndex];
\r
431 if (msa.IsGap(uSeqIndex, uColIndex))
\r
433 bool bOpen = (0 == uColIndex ||
\r
434 !msa.IsGap(uSeqIndex, uColIndex - 1));
\r
435 bool bClose = (uColCount - 1 == uColIndex ||
\r
436 !msa.IsGap(uSeqIndex, uColIndex + 1));
\r
439 scoreSeq += PP.m_scoreGapOpen;
\r
441 scoreSeq += PP.m_scoreGapClose;
\r
442 //if (!bOpen && !bClose)
\r
443 // scoreSeq += PP.m_scoreGapExtend;
\r
445 else if (msa.IsWildcard(uSeqIndex, uColIndex))
\r
449 unsigned uLetter = msa.GetLetter(uSeqIndex, uColIndex);
\r
450 const SCORE scoreMatch = PP.m_AAScores[uLetter];
\r
451 if (0 != MatchScore)
\r
452 MatchScore[uColIndex] += weightSeq*scoreMatch;
\r
453 scoreSeq += scoreMatch;
\r
456 scoreTotal += weightSeq*scoreSeq;
\r
463 // The XP score is the sum of the score of each pair of
\r
464 // sequences between two profiles which are aligned to
\r
465 // each other. Notice that for two given profiles aligned
\r
466 // in different ways, the difference in XP score must be
\r
467 // the same as the difference in SP score because the
\r
468 // score of a pair of sequences in one profile doesn't
\r
469 // depend on the alignment.
\r
470 SCORE ObjScoreXP(const MSA &msa1, const MSA &msa2)
\r
472 const unsigned uColCount1 = msa1.GetColCount();
\r
473 const unsigned uColCount2 = msa2.GetColCount();
\r
474 if (uColCount1 != uColCount2)
\r
475 Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2);
\r
477 const unsigned uSeqCount1 = msa1.GetSeqCount();
\r
478 const unsigned uSeqCount2 = msa2.GetSeqCount();
\r
481 Log(" Score Weight Weight Total\n");
\r
482 Log("---------- ------ ------ ----------\n");
\r
485 SCORE scoreTotal = 0;
\r
486 unsigned uPairCount = 0;
\r
487 for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)
\r
489 const WEIGHT w1 = msa1.GetSeqWeight(uSeqIndex1);
\r
490 for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)
\r
492 const WEIGHT w2 = msa2.GetSeqWeight(uSeqIndex2);
\r
493 const WEIGHT w = w1*w2;
\r
494 SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqIndex1, msa2, uSeqIndex2);
\r
495 SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqIndex1, msa2, uSeqIndex2);
\r
496 SCORE scorePair = scoreLetters + scoreGaps;
\r
497 scoreTotal += w1*w2*scorePair;
\r
500 Log("%10.2f %6.3f %6.3f %10.2f >%s >%s\n",
\r
505 msa1.GetSeqName(uSeqIndex1),
\r
506 msa2.GetSeqName(uSeqIndex2));
\r
510 if (0 == uPairCount)
\r
511 Quit("0 == uPairCount");
\r
518 Log("XP=%g\n", scoreTotal);
\r
520 // return scoreTotal / uPairCount;
\r