Next version of JABA
[jabaws.git] / binaries / src / muscle / objscore2.cpp
1 #include "muscle.h"\r
2 #include "msa.h"\r
3 #include "profile.h"\r
4 #include "objscore.h"\r
5 \r
6 #define TRACE                   0\r
7 #define TRACE_SEQPAIR   0\r
8 #define TEST_SPFAST             0\r
9 \r
10 extern SCOREMATRIX VTML_LA;\r
11 extern SCOREMATRIX PAM200;\r
12 extern SCOREMATRIX PAM200NoCenter;\r
13 extern SCOREMATRIX VTML_SP;\r
14 extern SCOREMATRIX VTML_SPNoCenter;\r
15 extern SCOREMATRIX NUC_SP;\r
16 \r
17 SCORE g_SPScoreLetters;\r
18 SCORE g_SPScoreGaps;\r
19 \r
20 static SCORE TermGapScore(bool Gap)\r
21         {\r
22         switch (g_TermGaps)\r
23                 {\r
24         case TERMGAPS_Full:\r
25                 return 0;\r
26 \r
27         case TERMGAPS_Half:\r
28                 if (Gap)\r
29                         return g_scoreGapOpen/2;\r
30                 return 0;\r
31 \r
32         case TERMGAPS_Ext:\r
33                 if (Gap)\r
34                         return g_scoreGapExtend;\r
35                 return 0;\r
36                 }\r
37         Quit("TermGapScore?!");\r
38         return 0;\r
39         }\r
40 \r
41 SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1,\r
42   const MSA &msa2, unsigned uSeqIndex2)\r
43         {\r
44         const unsigned uColCount = msa1.GetColCount();\r
45         const unsigned uColCount2 = msa2.GetColCount();\r
46         if (uColCount != uColCount2)\r
47                 Quit("ScoreSeqPairLetters, different lengths");\r
48 \r
49 #if     TRACE_SEQPAIR\r
50         {\r
51         Log("\n");\r
52         Log("ScoreSeqPairLetters\n");\r
53         MSA msaTmp;\r
54         msaTmp.SetSize(2, uColCount);\r
55         msaTmp.CopySeq(0, msa1, uSeqIndex1);\r
56         msaTmp.CopySeq(1, msa2, uSeqIndex2);\r
57         msaTmp.LogMe();\r
58         }\r
59 #endif\r
60 \r
61         SCORE scoreLetters = 0;\r
62         SCORE scoreGaps = 0;\r
63         bool bGapping1 = false;\r
64         bool bGapping2 = false;\r
65 \r
66         unsigned uColStart = 0;\r
67         bool bLeftTermGap = false;\r
68         for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
69                 {\r
70                 bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);\r
71                 bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);\r
72                 if (!bGap1 || !bGap2)\r
73                         {\r
74                         if (bGap1 || bGap2)\r
75                                 bLeftTermGap = true;\r
76                         uColStart = uColIndex;\r
77                         break;\r
78                         }\r
79                 }\r
80 \r
81         unsigned uColEnd = uColCount - 1;\r
82         bool bRightTermGap = false;\r
83         for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)\r
84                 {\r
85                 bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);\r
86                 bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);\r
87                 if (!bGap1 || !bGap2)\r
88                         {\r
89                         if (bGap1 || bGap2)\r
90                                 bRightTermGap = true;\r
91                         uColEnd = (unsigned) iColIndex;\r
92                         break;\r
93                         }\r
94                 }\r
95 \r
96 #if     TRACE_SEQPAIR\r
97         Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);\r
98 #endif\r
99 \r
100         for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)\r
101                 {\r
102                 unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex);\r
103                 if (uLetter1 >= g_AlphaSize)\r
104                         continue;\r
105                 unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex);\r
106                 if (uLetter2 >= g_AlphaSize)\r
107                         continue;\r
108 \r
109                 SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2];\r
110                 scoreLetters += scoreMatch;\r
111                 }\r
112         return scoreLetters;\r
113         }\r
114 \r
115 SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1,\r
116   const MSA &msa2, unsigned uSeqIndex2)\r
117         {\r
118         const unsigned uColCount = msa1.GetColCount();\r
119         const unsigned uColCount2 = msa2.GetColCount();\r
120         if (uColCount != uColCount2)\r
121                 Quit("ScoreSeqPairGaps, different lengths");\r
122 \r
123 #if     TRACE_SEQPAIR\r
124         {\r
125         Log("\n");\r
126         Log("ScoreSeqPairGaps\n");\r
127         MSA msaTmp;\r
128         msaTmp.SetSize(2, uColCount);\r
129         msaTmp.CopySeq(0, msa1, uSeqIndex1);\r
130         msaTmp.CopySeq(1, msa2, uSeqIndex2);\r
131         msaTmp.LogMe();\r
132         }\r
133 #endif\r
134 \r
135         SCORE scoreGaps = 0;\r
136         bool bGapping1 = false;\r
137         bool bGapping2 = false;\r
138 \r
139         unsigned uColStart = 0;\r
140         bool bLeftTermGap = false;\r
141         for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
142                 {\r
143                 bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);\r
144                 bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);\r
145                 if (!bGap1 || !bGap2)\r
146                         {\r
147                         if (bGap1 || bGap2)\r
148                                 bLeftTermGap = true;\r
149                         uColStart = uColIndex;\r
150                         break;\r
151                         }\r
152                 }\r
153 \r
154         unsigned uColEnd = uColCount - 1;\r
155         bool bRightTermGap = false;\r
156         for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)\r
157                 {\r
158                 bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);\r
159                 bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);\r
160                 if (!bGap1 || !bGap2)\r
161                         {\r
162                         if (bGap1 || bGap2)\r
163                                 bRightTermGap = true;\r
164                         uColEnd = (unsigned) iColIndex;\r
165                         break;\r
166                         }\r
167                 }\r
168 \r
169 #if     TRACE_SEQPAIR\r
170         Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);\r
171 #endif\r
172 \r
173         for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)\r
174                 {\r
175                 bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);\r
176                 bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);\r
177 \r
178                 if (bGap1 && bGap2)\r
179                         continue;\r
180 \r
181                 if (bGap1)\r
182                         {\r
183                         if (!bGapping1)\r
184                                 {\r
185 #if     TRACE_SEQPAIR\r
186                                 Log("Gap open seq 1 col %d\n", uColIndex);\r
187 #endif\r
188                                 if (uColIndex == uColStart)\r
189                                         scoreGaps += TermGapScore(true);\r
190                                 else\r
191                                         scoreGaps += g_scoreGapOpen;\r
192                                 bGapping1 = true;\r
193                                 }\r
194                         else\r
195                                 scoreGaps += g_scoreGapExtend;\r
196                         continue;\r
197                         }\r
198 \r
199                 else if (bGap2)\r
200                         {\r
201                         if (!bGapping2)\r
202                                 {\r
203 #if     TRACE_SEQPAIR\r
204                                 Log("Gap open seq 2 col %d\n", uColIndex);\r
205 #endif\r
206                                 if (uColIndex == uColStart)\r
207                                         scoreGaps += TermGapScore(true);\r
208                                 else\r
209                                         scoreGaps += g_scoreGapOpen;\r
210                                 bGapping2 = true;\r
211                                 }\r
212                         else\r
213                                 scoreGaps += g_scoreGapExtend;\r
214                         continue;\r
215                         }\r
216 \r
217                 bGapping1 = false;\r
218                 bGapping2 = false;\r
219                 }\r
220 \r
221         if (bGapping1 || bGapping2)\r
222                 {\r
223                 scoreGaps -= g_scoreGapOpen;\r
224                 scoreGaps += TermGapScore(true);\r
225                 }\r
226         return scoreGaps;\r
227         }\r
228 \r
229 // The usual sum-of-pairs objective score: sum the score\r
230 // of the alignment of each pair of sequences.\r
231 SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[])\r
232         {\r
233 #if     TRACE\r
234         Log("==================ObjScoreSP==============\n");\r
235         Log("msa=\n");\r
236         msa.LogMe();\r
237 #endif\r
238         g_SPScoreLetters = 0;\r
239         g_SPScoreGaps = 0;\r
240 \r
241         if (0 != MatchScore)\r
242                 {\r
243                 const unsigned uColCount = msa.GetColCount();\r
244                 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
245                         MatchScore[uColIndex] = 0;\r
246                 }\r
247 \r
248         const unsigned uSeqCount = msa.GetSeqCount();\r
249         SCORE scoreTotal = 0;\r
250         unsigned uPairCount = 0;\r
251 #if     TRACE\r
252         Log("Seq1  Seq2     wt1     wt2    Letters         Gaps  Unwt.Score    Wt.Score       Total\n");\r
253         Log("----  ----  ------  ------  ----------  ----------  ----------  ----------  ----------\n");\r
254 #endif\r
255         for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)\r
256                 {\r
257                 const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1);\r
258                 for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2)\r
259                         {\r
260                         const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2);\r
261                         const WEIGHT w = w1*w2;\r
262 \r
263                         SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqIndex1, msa, uSeqIndex2);\r
264                         SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqIndex1, msa, uSeqIndex2);\r
265                         SCORE scorePair = scoreLetters + scoreGaps;\r
266                         ++uPairCount;\r
267 \r
268                         scoreTotal += w*scorePair;\r
269 \r
270                         g_SPScoreLetters += w*scoreLetters;\r
271                         g_SPScoreGaps += w*scoreGaps;\r
272 #if     TRACE\r
273                         Log("%4d  %4d  %6.3f  %6.3f  %10.2f  %10.2f  %10.2f  %10.2f  %10.2f >%s >%s\n",\r
274                           uSeqIndex1,\r
275                           uSeqIndex2,\r
276                           w1,\r
277                           w2,\r
278                           scoreLetters,\r
279                           scoreGaps,\r
280                           scorePair,\r
281                           scorePair*w1*w2,\r
282                           scoreTotal,\r
283                           msa.GetSeqName(uSeqIndex1),\r
284                           msa.GetSeqName(uSeqIndex2));\r
285 #endif\r
286                         }\r
287                 }\r
288 #if     TEST_SPFAST\r
289         {\r
290         SCORE f = ObjScoreSPFast(msa);\r
291         Log("Fast  = %.6g\n", f);\r
292         Log("Brute = %.6g\n", scoreTotal);\r
293         if (BTEq(f, scoreTotal))\r
294                 Log("Agree\n");\r
295         else\r
296                 Log("** DISAGREE **\n");\r
297         }\r
298 #endif\r
299 //      return scoreTotal / uPairCount;\r
300         return scoreTotal;\r
301         }\r
302 \r
303 // Objective score defined as the dynamic programming score.\r
304 // Input is two alignments, which must be of the same length.\r
305 // Result is the same profile-profile score that is optimized\r
306 // by dynamic programming.\r
307 SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[])\r
308         {\r
309         const unsigned uColCount = msa1.GetColCount();\r
310         if (msa2.GetColCount() != uColCount)\r
311                 Quit("ObjScoreDP, must be same length");\r
312 \r
313         const unsigned uColCount1 = msa1.GetColCount();\r
314         const unsigned uColCount2 = msa2.GetColCount();\r
315 \r
316         const ProfPos *PA = ProfileFromMSA(msa1);\r
317         const ProfPos *PB = ProfileFromMSA(msa2);\r
318 \r
319         return ObjScoreDP_Profs(PA, PB, uColCount1, MatchScore);\r
320         }\r
321 \r
322 SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount,\r
323   SCORE MatchScore[])\r
324         {\r
325 //#if   TRACE\r
326 //      Log("Profile 1:\n");\r
327 //      ListProfile(PA, uColCount, &msa1);\r
328 //\r
329 //      Log("Profile 2:\n");\r
330 //      ListProfile(PB, uColCount, &msa2);\r
331 //#endif\r
332 \r
333         SCORE scoreTotal = 0;\r
334 \r
335         for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
336                 {\r
337                 const ProfPos &PPA = PA[uColIndex];\r
338                 const ProfPos &PPB = PB[uColIndex];\r
339 \r
340                 SCORE scoreGap = 0;\r
341                 SCORE scoreMatch = 0;\r
342         // If gapped column...\r
343                 if (PPA.m_bAllGaps && PPB.m_bAllGaps)\r
344                         scoreGap = 0;\r
345                 else if (PPA.m_bAllGaps)\r
346                         {\r
347                         if (uColCount - 1 == uColIndex || !PA[uColIndex+1].m_bAllGaps)\r
348                                 scoreGap = PPB.m_scoreGapClose;\r
349                         if (0 == uColIndex || !PA[uColIndex-1].m_bAllGaps)\r
350                                 scoreGap += PPB.m_scoreGapOpen;\r
351                         //if (0 == scoreGap)\r
352                         //      scoreGap = PPB.m_scoreGapExtend;\r
353                         }\r
354                 else if (PPB.m_bAllGaps)\r
355                         {\r
356                         if (uColCount - 1 == uColIndex || !PB[uColIndex+1].m_bAllGaps)\r
357                                 scoreGap = PPA.m_scoreGapClose;\r
358                         if (0 == uColIndex || !PB[uColIndex-1].m_bAllGaps)\r
359                                 scoreGap += PPA.m_scoreGapOpen;\r
360                         //if (0 == scoreGap)\r
361                         //      scoreGap = PPA.m_scoreGapExtend;\r
362                         }\r
363                 else\r
364                         scoreMatch = ScoreProfPos2(PPA, PPB);\r
365 \r
366                 if (0 != MatchScore)\r
367                         MatchScore[uColIndex] = scoreMatch;\r
368 \r
369                 scoreTotal += scoreMatch + scoreGap;\r
370 \r
371                 extern bool g_bTracePPScore;\r
372                 extern MSA *g_ptrPPScoreMSA1;\r
373                 extern MSA *g_ptrPPScoreMSA2;\r
374                 if (g_bTracePPScore)\r
375                         {\r
376                         const MSA &msa1 = *g_ptrPPScoreMSA1;\r
377                         const MSA &msa2 = *g_ptrPPScoreMSA2;\r
378                         const unsigned uSeqCount1 = msa1.GetSeqCount();\r
379                         const unsigned uSeqCount2 = msa2.GetSeqCount();\r
380 \r
381                         for (unsigned n = 0; n < uSeqCount1; ++n)\r
382                                 Log("%c", msa1.GetChar(n, uColIndex));\r
383                         Log("  ");\r
384                         for (unsigned n = 0; n < uSeqCount2; ++n)\r
385                                 Log("%c", msa2.GetChar(n, uColIndex));\r
386                         Log("  %10.3f", scoreMatch);\r
387                         if (scoreGap != 0)\r
388                                 Log("  %10.3f", scoreGap);\r
389                         Log("\n");\r
390                         }\r
391                 }\r
392 \r
393         delete[] PA;\r
394         delete[] PB;\r
395 \r
396         return scoreTotal;\r
397         }\r
398 \r
399 // Objective score defined as the sum of profile-sequence\r
400 // scores for each sequence in the alignment. The profile\r
401 // is computed from the entire alignment, so this includes\r
402 // the score of each sequence against itself. This is to\r
403 // avoid recomputing the profile each time, so we reduce\r
404 // complexity but introduce a questionable approximation.\r
405 // The goal is to see if we can exploit the apparent\r
406 // improvement in performance of log-expectation score\r
407 // over the usual sum-of-pairs by optimizing this\r
408 // objective score in the iterative refinement stage.\r
409 SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[])\r
410         {\r
411         if (g_PPScore != PPSCORE_LE)\r
412                 Quit("FastScoreMSA_LASimple: LA");\r
413 \r
414         const unsigned uSeqCount = msa.GetSeqCount();\r
415         const unsigned uColCount = msa.GetColCount();\r
416 \r
417         const ProfPos *Prof = ProfileFromMSA(msa);\r
418 \r
419         if (0 != MatchScore)\r
420                 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
421                         MatchScore[uColIndex] = 0;\r
422 \r
423         SCORE scoreTotal = 0;\r
424         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
425                 {\r
426                 const WEIGHT weightSeq = msa.GetSeqWeight(uSeqIndex);\r
427                 SCORE scoreSeq = 0;\r
428                 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
429                         {\r
430                         const ProfPos &PP = Prof[uColIndex];\r
431                         if (msa.IsGap(uSeqIndex, uColIndex))\r
432                                 {\r
433                                 bool bOpen = (0 == uColIndex ||\r
434                                   !msa.IsGap(uSeqIndex, uColIndex - 1));\r
435                                 bool bClose = (uColCount - 1 == uColIndex ||\r
436                                   !msa.IsGap(uSeqIndex, uColIndex + 1));\r
437 \r
438                                 if (bOpen)\r
439                                         scoreSeq += PP.m_scoreGapOpen;\r
440                                 if (bClose)\r
441                                         scoreSeq += PP.m_scoreGapClose;\r
442                                 //if (!bOpen && !bClose)\r
443                                 //      scoreSeq += PP.m_scoreGapExtend;\r
444                                 }\r
445                         else if (msa.IsWildcard(uSeqIndex, uColIndex))\r
446                                 continue;\r
447                         else\r
448                                 {\r
449                                 unsigned uLetter = msa.GetLetter(uSeqIndex, uColIndex);\r
450                                 const SCORE scoreMatch = PP.m_AAScores[uLetter];\r
451                                 if (0 != MatchScore)\r
452                                         MatchScore[uColIndex] += weightSeq*scoreMatch;\r
453                                 scoreSeq += scoreMatch;\r
454                                 }\r
455                         }\r
456                 scoreTotal += weightSeq*scoreSeq;\r
457                 }\r
458 \r
459         delete[] Prof;\r
460         return scoreTotal;\r
461         }\r
462 \r
463 // The XP score is the sum of the score of each pair of\r
464 // sequences between two profiles which are aligned to\r
465 // each other. Notice that for two given profiles aligned\r
466 // in different ways, the difference in XP score must be\r
467 // the same as the difference in SP score because the\r
468 // score of a pair of sequences in one profile doesn't\r
469 // depend on the alignment.\r
470 SCORE ObjScoreXP(const MSA &msa1, const MSA &msa2)\r
471         {\r
472         const unsigned uColCount1 = msa1.GetColCount();\r
473         const unsigned uColCount2 = msa2.GetColCount();\r
474         if (uColCount1 != uColCount2)\r
475                 Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2);\r
476 \r
477         const unsigned uSeqCount1 = msa1.GetSeqCount();\r
478         const unsigned uSeqCount2 = msa2.GetSeqCount();\r
479 \r
480 #if     TRACE\r
481         Log("     Score  Weight  Weight       Total\n");\r
482         Log("----------  ------  ------  ----------\n");\r
483 #endif\r
484 \r
485         SCORE scoreTotal = 0;\r
486         unsigned uPairCount = 0;\r
487         for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)\r
488                 {\r
489                 const WEIGHT w1 = msa1.GetSeqWeight(uSeqIndex1);\r
490                 for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)\r
491                         {\r
492                         const WEIGHT w2 = msa2.GetSeqWeight(uSeqIndex2);\r
493                         const WEIGHT w = w1*w2;\r
494                         SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqIndex1, msa2, uSeqIndex2);\r
495                         SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqIndex1, msa2, uSeqIndex2);\r
496                         SCORE scorePair = scoreLetters + scoreGaps;\r
497                         scoreTotal += w1*w2*scorePair;\r
498                         ++uPairCount;\r
499 #if     TRACE\r
500                         Log("%10.2f  %6.3f  %6.3f  %10.2f  >%s >%s\n",\r
501                           scorePair,\r
502                           w1,\r
503                           w2,\r
504                           scorePair*w1*w2,\r
505                           msa1.GetSeqName(uSeqIndex1),\r
506                           msa2.GetSeqName(uSeqIndex2));\r
507 #endif\r
508                         }\r
509                 }\r
510         if (0 == uPairCount)\r
511                 Quit("0 == uPairCount");\r
512 \r
513 #if     TRACE\r
514         Log("msa1=\n");\r
515         msa1.LogMe();\r
516         Log("msa2=\n");\r
517         msa2.LogMe();\r
518         Log("XP=%g\n", scoreTotal);\r
519 #endif\r
520 //      return scoreTotal / uPairCount;\r
521         return scoreTotal;\r
522         }\r