--- /dev/null
+#include "muscle.h"\r
+#include "msa.h"\r
+#include "profile.h"\r
+#include "objscore.h"\r
+\r
+#define TRACE 0\r
+#define TRACE_SEQPAIR 0\r
+#define TEST_SPFAST 0\r
+\r
+extern SCOREMATRIX VTML_LA;\r
+extern SCOREMATRIX PAM200;\r
+extern SCOREMATRIX PAM200NoCenter;\r
+extern SCOREMATRIX VTML_SP;\r
+extern SCOREMATRIX VTML_SPNoCenter;\r
+extern SCOREMATRIX NUC_SP;\r
+\r
+SCORE g_SPScoreLetters;\r
+SCORE g_SPScoreGaps;\r
+\r
+static SCORE TermGapScore(bool Gap)\r
+ {\r
+ switch (g_TermGaps)\r
+ {\r
+ case TERMGAPS_Full:\r
+ return 0;\r
+\r
+ case TERMGAPS_Half:\r
+ if (Gap)\r
+ return g_scoreGapOpen/2;\r
+ return 0;\r
+\r
+ case TERMGAPS_Ext:\r
+ if (Gap)\r
+ return g_scoreGapExtend;\r
+ return 0;\r
+ }\r
+ Quit("TermGapScore?!");\r
+ return 0;\r
+ }\r
+\r
+SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1,\r
+ const MSA &msa2, unsigned uSeqIndex2)\r
+ {\r
+ const unsigned uColCount = msa1.GetColCount();\r
+ const unsigned uColCount2 = msa2.GetColCount();\r
+ if (uColCount != uColCount2)\r
+ Quit("ScoreSeqPairLetters, different lengths");\r
+\r
+#if TRACE_SEQPAIR\r
+ {\r
+ Log("\n");\r
+ Log("ScoreSeqPairLetters\n");\r
+ MSA msaTmp;\r
+ msaTmp.SetSize(2, uColCount);\r
+ msaTmp.CopySeq(0, msa1, uSeqIndex1);\r
+ msaTmp.CopySeq(1, msa2, uSeqIndex2);\r
+ msaTmp.LogMe();\r
+ }\r
+#endif\r
+\r
+ SCORE scoreLetters = 0;\r
+ SCORE scoreGaps = 0;\r
+ bool bGapping1 = false;\r
+ bool bGapping2 = false;\r
+\r
+ unsigned uColStart = 0;\r
+ bool bLeftTermGap = false;\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);\r
+ bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);\r
+ if (!bGap1 || !bGap2)\r
+ {\r
+ if (bGap1 || bGap2)\r
+ bLeftTermGap = true;\r
+ uColStart = uColIndex;\r
+ break;\r
+ }\r
+ }\r
+\r
+ unsigned uColEnd = uColCount - 1;\r
+ bool bRightTermGap = false;\r
+ for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)\r
+ {\r
+ bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);\r
+ bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);\r
+ if (!bGap1 || !bGap2)\r
+ {\r
+ if (bGap1 || bGap2)\r
+ bRightTermGap = true;\r
+ uColEnd = (unsigned) iColIndex;\r
+ break;\r
+ }\r
+ }\r
+\r
+#if TRACE_SEQPAIR\r
+ Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);\r
+#endif\r
+\r
+ for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)\r
+ {\r
+ unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex);\r
+ if (uLetter1 >= g_AlphaSize)\r
+ continue;\r
+ unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex);\r
+ if (uLetter2 >= g_AlphaSize)\r
+ continue;\r
+\r
+ SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2];\r
+ scoreLetters += scoreMatch;\r
+ }\r
+ return scoreLetters;\r
+ }\r
+\r
+SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1,\r
+ const MSA &msa2, unsigned uSeqIndex2)\r
+ {\r
+ const unsigned uColCount = msa1.GetColCount();\r
+ const unsigned uColCount2 = msa2.GetColCount();\r
+ if (uColCount != uColCount2)\r
+ Quit("ScoreSeqPairGaps, different lengths");\r
+\r
+#if TRACE_SEQPAIR\r
+ {\r
+ Log("\n");\r
+ Log("ScoreSeqPairGaps\n");\r
+ MSA msaTmp;\r
+ msaTmp.SetSize(2, uColCount);\r
+ msaTmp.CopySeq(0, msa1, uSeqIndex1);\r
+ msaTmp.CopySeq(1, msa2, uSeqIndex2);\r
+ msaTmp.LogMe();\r
+ }\r
+#endif\r
+\r
+ SCORE scoreGaps = 0;\r
+ bool bGapping1 = false;\r
+ bool bGapping2 = false;\r
+\r
+ unsigned uColStart = 0;\r
+ bool bLeftTermGap = false;\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);\r
+ bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);\r
+ if (!bGap1 || !bGap2)\r
+ {\r
+ if (bGap1 || bGap2)\r
+ bLeftTermGap = true;\r
+ uColStart = uColIndex;\r
+ break;\r
+ }\r
+ }\r
+\r
+ unsigned uColEnd = uColCount - 1;\r
+ bool bRightTermGap = false;\r
+ for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)\r
+ {\r
+ bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);\r
+ bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);\r
+ if (!bGap1 || !bGap2)\r
+ {\r
+ if (bGap1 || bGap2)\r
+ bRightTermGap = true;\r
+ uColEnd = (unsigned) iColIndex;\r
+ break;\r
+ }\r
+ }\r
+\r
+#if TRACE_SEQPAIR\r
+ Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);\r
+#endif\r
+\r
+ for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)\r
+ {\r
+ bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);\r
+ bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);\r
+\r
+ if (bGap1 && bGap2)\r
+ continue;\r
+\r
+ if (bGap1)\r
+ {\r
+ if (!bGapping1)\r
+ {\r
+#if TRACE_SEQPAIR\r
+ Log("Gap open seq 1 col %d\n", uColIndex);\r
+#endif\r
+ if (uColIndex == uColStart)\r
+ scoreGaps += TermGapScore(true);\r
+ else\r
+ scoreGaps += g_scoreGapOpen;\r
+ bGapping1 = true;\r
+ }\r
+ else\r
+ scoreGaps += g_scoreGapExtend;\r
+ continue;\r
+ }\r
+\r
+ else if (bGap2)\r
+ {\r
+ if (!bGapping2)\r
+ {\r
+#if TRACE_SEQPAIR\r
+ Log("Gap open seq 2 col %d\n", uColIndex);\r
+#endif\r
+ if (uColIndex == uColStart)\r
+ scoreGaps += TermGapScore(true);\r
+ else\r
+ scoreGaps += g_scoreGapOpen;\r
+ bGapping2 = true;\r
+ }\r
+ else\r
+ scoreGaps += g_scoreGapExtend;\r
+ continue;\r
+ }\r
+\r
+ bGapping1 = false;\r
+ bGapping2 = false;\r
+ }\r
+\r
+ if (bGapping1 || bGapping2)\r
+ {\r
+ scoreGaps -= g_scoreGapOpen;\r
+ scoreGaps += TermGapScore(true);\r
+ }\r
+ return scoreGaps;\r
+ }\r
+\r
+// The usual sum-of-pairs objective score: sum the score\r
+// of the alignment of each pair of sequences.\r
+SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[])\r
+ {\r
+#if TRACE\r
+ Log("==================ObjScoreSP==============\n");\r
+ Log("msa=\n");\r
+ msa.LogMe();\r
+#endif\r
+ g_SPScoreLetters = 0;\r
+ g_SPScoreGaps = 0;\r
+\r
+ if (0 != MatchScore)\r
+ {\r
+ const unsigned uColCount = msa.GetColCount();\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ MatchScore[uColIndex] = 0;\r
+ }\r
+\r
+ const unsigned uSeqCount = msa.GetSeqCount();\r
+ SCORE scoreTotal = 0;\r
+ unsigned uPairCount = 0;\r
+#if TRACE\r
+ Log("Seq1 Seq2 wt1 wt2 Letters Gaps Unwt.Score Wt.Score Total\n");\r
+ Log("---- ---- ------ ------ ---------- ---------- ---------- ---------- ----------\n");\r
+#endif\r
+ for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)\r
+ {\r
+ const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1);\r
+ for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2)\r
+ {\r
+ const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2);\r
+ const WEIGHT w = w1*w2;\r
+\r
+ SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqIndex1, msa, uSeqIndex2);\r
+ SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqIndex1, msa, uSeqIndex2);\r
+ SCORE scorePair = scoreLetters + scoreGaps;\r
+ ++uPairCount;\r
+\r
+ scoreTotal += w*scorePair;\r
+\r
+ g_SPScoreLetters += w*scoreLetters;\r
+ g_SPScoreGaps += w*scoreGaps;\r
+#if TRACE\r
+ Log("%4d %4d %6.3f %6.3f %10.2f %10.2f %10.2f %10.2f %10.2f >%s >%s\n",\r
+ uSeqIndex1,\r
+ uSeqIndex2,\r
+ w1,\r
+ w2,\r
+ scoreLetters,\r
+ scoreGaps,\r
+ scorePair,\r
+ scorePair*w1*w2,\r
+ scoreTotal,\r
+ msa.GetSeqName(uSeqIndex1),\r
+ msa.GetSeqName(uSeqIndex2));\r
+#endif\r
+ }\r
+ }\r
+#if TEST_SPFAST\r
+ {\r
+ SCORE f = ObjScoreSPFast(msa);\r
+ Log("Fast = %.6g\n", f);\r
+ Log("Brute = %.6g\n", scoreTotal);\r
+ if (BTEq(f, scoreTotal))\r
+ Log("Agree\n");\r
+ else\r
+ Log("** DISAGREE **\n");\r
+ }\r
+#endif\r
+// return scoreTotal / uPairCount;\r
+ return scoreTotal;\r
+ }\r
+\r
+// Objective score defined as the dynamic programming score.\r
+// Input is two alignments, which must be of the same length.\r
+// Result is the same profile-profile score that is optimized\r
+// by dynamic programming.\r
+SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[])\r
+ {\r
+ const unsigned uColCount = msa1.GetColCount();\r
+ if (msa2.GetColCount() != uColCount)\r
+ Quit("ObjScoreDP, must be same length");\r
+\r
+ const unsigned uColCount1 = msa1.GetColCount();\r
+ const unsigned uColCount2 = msa2.GetColCount();\r
+\r
+ const ProfPos *PA = ProfileFromMSA(msa1);\r
+ const ProfPos *PB = ProfileFromMSA(msa2);\r
+\r
+ return ObjScoreDP_Profs(PA, PB, uColCount1, MatchScore);\r
+ }\r
+\r
+SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount,\r
+ SCORE MatchScore[])\r
+ {\r
+//#if TRACE\r
+// Log("Profile 1:\n");\r
+// ListProfile(PA, uColCount, &msa1);\r
+//\r
+// Log("Profile 2:\n");\r
+// ListProfile(PB, uColCount, &msa2);\r
+//#endif\r
+\r
+ SCORE scoreTotal = 0;\r
+\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ const ProfPos &PPA = PA[uColIndex];\r
+ const ProfPos &PPB = PB[uColIndex];\r
+\r
+ SCORE scoreGap = 0;\r
+ SCORE scoreMatch = 0;\r
+ // If gapped column...\r
+ if (PPA.m_bAllGaps && PPB.m_bAllGaps)\r
+ scoreGap = 0;\r
+ else if (PPA.m_bAllGaps)\r
+ {\r
+ if (uColCount - 1 == uColIndex || !PA[uColIndex+1].m_bAllGaps)\r
+ scoreGap = PPB.m_scoreGapClose;\r
+ if (0 == uColIndex || !PA[uColIndex-1].m_bAllGaps)\r
+ scoreGap += PPB.m_scoreGapOpen;\r
+ //if (0 == scoreGap)\r
+ // scoreGap = PPB.m_scoreGapExtend;\r
+ }\r
+ else if (PPB.m_bAllGaps)\r
+ {\r
+ if (uColCount - 1 == uColIndex || !PB[uColIndex+1].m_bAllGaps)\r
+ scoreGap = PPA.m_scoreGapClose;\r
+ if (0 == uColIndex || !PB[uColIndex-1].m_bAllGaps)\r
+ scoreGap += PPA.m_scoreGapOpen;\r
+ //if (0 == scoreGap)\r
+ // scoreGap = PPA.m_scoreGapExtend;\r
+ }\r
+ else\r
+ scoreMatch = ScoreProfPos2(PPA, PPB);\r
+\r
+ if (0 != MatchScore)\r
+ MatchScore[uColIndex] = scoreMatch;\r
+\r
+ scoreTotal += scoreMatch + scoreGap;\r
+\r
+ extern bool g_bTracePPScore;\r
+ extern MSA *g_ptrPPScoreMSA1;\r
+ extern MSA *g_ptrPPScoreMSA2;\r
+ if (g_bTracePPScore)\r
+ {\r
+ const MSA &msa1 = *g_ptrPPScoreMSA1;\r
+ const MSA &msa2 = *g_ptrPPScoreMSA2;\r
+ const unsigned uSeqCount1 = msa1.GetSeqCount();\r
+ const unsigned uSeqCount2 = msa2.GetSeqCount();\r
+\r
+ for (unsigned n = 0; n < uSeqCount1; ++n)\r
+ Log("%c", msa1.GetChar(n, uColIndex));\r
+ Log(" ");\r
+ for (unsigned n = 0; n < uSeqCount2; ++n)\r
+ Log("%c", msa2.GetChar(n, uColIndex));\r
+ Log(" %10.3f", scoreMatch);\r
+ if (scoreGap != 0)\r
+ Log(" %10.3f", scoreGap);\r
+ Log("\n");\r
+ }\r
+ }\r
+\r
+ delete[] PA;\r
+ delete[] PB;\r
+\r
+ return scoreTotal;\r
+ }\r
+\r
+// Objective score defined as the sum of profile-sequence\r
+// scores for each sequence in the alignment. The profile\r
+// is computed from the entire alignment, so this includes\r
+// the score of each sequence against itself. This is to\r
+// avoid recomputing the profile each time, so we reduce\r
+// complexity but introduce a questionable approximation.\r
+// The goal is to see if we can exploit the apparent\r
+// improvement in performance of log-expectation score\r
+// over the usual sum-of-pairs by optimizing this\r
+// objective score in the iterative refinement stage.\r
+SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[])\r
+ {\r
+ if (g_PPScore != PPSCORE_LE)\r
+ Quit("FastScoreMSA_LASimple: LA");\r
+\r
+ const unsigned uSeqCount = msa.GetSeqCount();\r
+ const unsigned uColCount = msa.GetColCount();\r
+\r
+ const ProfPos *Prof = ProfileFromMSA(msa);\r
+\r
+ if (0 != MatchScore)\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ MatchScore[uColIndex] = 0;\r
+\r
+ SCORE scoreTotal = 0;\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ const WEIGHT weightSeq = msa.GetSeqWeight(uSeqIndex);\r
+ SCORE scoreSeq = 0;\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ const ProfPos &PP = Prof[uColIndex];\r
+ if (msa.IsGap(uSeqIndex, uColIndex))\r
+ {\r
+ bool bOpen = (0 == uColIndex ||\r
+ !msa.IsGap(uSeqIndex, uColIndex - 1));\r
+ bool bClose = (uColCount - 1 == uColIndex ||\r
+ !msa.IsGap(uSeqIndex, uColIndex + 1));\r
+\r
+ if (bOpen)\r
+ scoreSeq += PP.m_scoreGapOpen;\r
+ if (bClose)\r
+ scoreSeq += PP.m_scoreGapClose;\r
+ //if (!bOpen && !bClose)\r
+ // scoreSeq += PP.m_scoreGapExtend;\r
+ }\r
+ else if (msa.IsWildcard(uSeqIndex, uColIndex))\r
+ continue;\r
+ else\r
+ {\r
+ unsigned uLetter = msa.GetLetter(uSeqIndex, uColIndex);\r
+ const SCORE scoreMatch = PP.m_AAScores[uLetter];\r
+ if (0 != MatchScore)\r
+ MatchScore[uColIndex] += weightSeq*scoreMatch;\r
+ scoreSeq += scoreMatch;\r
+ }\r
+ }\r
+ scoreTotal += weightSeq*scoreSeq;\r
+ }\r
+\r
+ delete[] Prof;\r
+ return scoreTotal;\r
+ }\r
+\r
+// The XP score is the sum of the score of each pair of\r
+// sequences between two profiles which are aligned to\r
+// each other. Notice that for two given profiles aligned\r
+// in different ways, the difference in XP score must be\r
+// the same as the difference in SP score because the\r
+// score of a pair of sequences in one profile doesn't\r
+// depend on the alignment.\r
+SCORE ObjScoreXP(const MSA &msa1, const MSA &msa2)\r
+ {\r
+ const unsigned uColCount1 = msa1.GetColCount();\r
+ const unsigned uColCount2 = msa2.GetColCount();\r
+ if (uColCount1 != uColCount2)\r
+ Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2);\r
+\r
+ const unsigned uSeqCount1 = msa1.GetSeqCount();\r
+ const unsigned uSeqCount2 = msa2.GetSeqCount();\r
+\r
+#if TRACE\r
+ Log(" Score Weight Weight Total\n");\r
+ Log("---------- ------ ------ ----------\n");\r
+#endif\r
+\r
+ SCORE scoreTotal = 0;\r
+ unsigned uPairCount = 0;\r
+ for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)\r
+ {\r
+ const WEIGHT w1 = msa1.GetSeqWeight(uSeqIndex1);\r
+ for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)\r
+ {\r
+ const WEIGHT w2 = msa2.GetSeqWeight(uSeqIndex2);\r
+ const WEIGHT w = w1*w2;\r
+ SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqIndex1, msa2, uSeqIndex2);\r
+ SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqIndex1, msa2, uSeqIndex2);\r
+ SCORE scorePair = scoreLetters + scoreGaps;\r
+ scoreTotal += w1*w2*scorePair;\r
+ ++uPairCount;\r
+#if TRACE\r
+ Log("%10.2f %6.3f %6.3f %10.2f >%s >%s\n",\r
+ scorePair,\r
+ w1,\r
+ w2,\r
+ scorePair*w1*w2,\r
+ msa1.GetSeqName(uSeqIndex1),\r
+ msa2.GetSeqName(uSeqIndex2));\r
+#endif\r
+ }\r
+ }\r
+ if (0 == uPairCount)\r
+ Quit("0 == uPairCount");\r
+\r
+#if TRACE\r
+ Log("msa1=\n");\r
+ msa1.LogMe();\r
+ Log("msa2=\n");\r
+ msa2.LogMe();\r
+ Log("XP=%g\n", scoreTotal);\r
+#endif\r
+// return scoreTotal / uPairCount;\r
+ return scoreTotal;\r
+ }\r