--- /dev/null
+#include "muscle.h"\r
+#include "msa.h"\r
+#include "seqvect.h"\r
+#include "profile.h"\r
+#include "tree.h"\r
+\r
+// These global variables are a hack to allow the tree\r
+// dependent iteration code to communicate the edge\r
+// used to divide the tree. The three-way weighting\r
+// scheme needs to know this edge in order to compute\r
+// sequence weights.\r
+static const Tree *g_ptrMuscleTree = 0;\r
+unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR;\r
+unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR;\r
+\r
+void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,\r
+ FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,\r
+ FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc,\r
+ FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const\r
+ {\r
+ const unsigned uSeqCount = GetSeqCount();\r
+ const unsigned uColCount = GetColCount();\r
+\r
+ memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT));\r
+ WEIGHT wTotal = 0;\r
+ FCOUNT fGap = 0;\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ const WEIGHT w = GetSeqWeight(uSeqIndex);\r
+ if (IsGap(uSeqIndex, uColIndex))\r
+ {\r
+ fGap += w;\r
+ continue;\r
+ }\r
+ else if (IsWildcard(uSeqIndex, uColIndex))\r
+ {\r
+ const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);\r
+ switch (g_Alpha)\r
+ {\r
+ case ALPHA_Amino:\r
+ switch (uLetter)\r
+ {\r
+ case AX_B: // D or N\r
+ fcCounts[AX_D] += w/2;\r
+ fcCounts[AX_N] += w/2;\r
+ break;\r
+ case AX_Z: // E or Q\r
+ fcCounts[AX_E] += w/2;\r
+ fcCounts[AX_Q] += w/2;\r
+ break;\r
+ default: // any\r
+ {\r
+ const FCOUNT f = w/20;\r
+ for (unsigned uLetter = 0; uLetter < 20; ++uLetter)\r
+ fcCounts[uLetter] += f;\r
+ break;\r
+ }\r
+ }\r
+ break;\r
+\r
+ case ALPHA_DNA:\r
+ case ALPHA_RNA:\r
+ switch (uLetter)\r
+ {\r
+ case AX_R: // G or A\r
+ fcCounts[NX_G] += w/2;\r
+ fcCounts[NX_A] += w/2;\r
+ break;\r
+ case AX_Y: // C or T/U\r
+ fcCounts[NX_C] += w/2;\r
+ fcCounts[NX_T] += w/2;\r
+ break;\r
+ default: // any\r
+ const FCOUNT f = w/20;\r
+ for (unsigned uLetter = 0; uLetter < 4; ++uLetter)\r
+ fcCounts[uLetter] += f;\r
+ break;\r
+ }\r
+ break;\r
+\r
+ default:\r
+ Quit("Alphabet %d not supported", g_Alpha);\r
+ }\r
+ continue;\r
+ }\r
+ unsigned uLetter = GetLetter(uSeqIndex, uColIndex);\r
+ fcCounts[uLetter] += w;\r
+ wTotal += w;\r
+ }\r
+ *ptrfOcc = (float) (1.0 - fGap);\r
+\r
+ if (bNormalize && wTotal > 0)\r
+ {\r
+ if (wTotal > 1.001)\r
+ Quit("wTotal=%g\n", wTotal);\r
+ for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)\r
+ fcCounts[uLetter] /= wTotal;\r
+// AssertNormalized(fcCounts);\r
+ }\r
+\r
+ FCOUNT fcStartCount = 0;\r
+ if (uColIndex == 0)\r
+ {\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ if (IsGap(uSeqIndex, uColIndex))\r
+ fcStartCount += GetSeqWeight(uSeqIndex);\r
+ }\r
+ else\r
+ {\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1))\r
+ fcStartCount += GetSeqWeight(uSeqIndex);\r
+ }\r
+\r
+ FCOUNT fcEndCount = 0;\r
+ if (uColCount - 1 == uColIndex)\r
+ {\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ if (IsGap(uSeqIndex, uColIndex))\r
+ fcEndCount += GetSeqWeight(uSeqIndex);\r
+ }\r
+ else\r
+ {\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1))\r
+ fcEndCount += GetSeqWeight(uSeqIndex);\r
+ }\r
+\r
+ FCOUNT LL = 0;\r
+ FCOUNT LG = 0;\r
+ FCOUNT GL = 0;\r
+ FCOUNT GG = 0;\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ WEIGHT w = GetSeqWeight(uSeqIndex);\r
+ bool bLetterHere = !IsGap(uSeqIndex, uColIndex);\r
+ bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1));\r
+ if (bLetterHere)\r
+ {\r
+ if (bLetterPrev)\r
+ LL += w;\r
+ else\r
+ GL += w;\r
+ }\r
+ else\r
+ {\r
+ if (bLetterPrev)\r
+ LG += w;\r
+ else\r
+ GG += w;\r
+ }\r
+ }\r
+\r
+ FCOUNT fcExtendCount = 0;\r
+ if (uColIndex > 0 && uColIndex < GetColCount() - 1)\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) &&\r
+ IsGap(uSeqIndex, uColIndex + 1))\r
+ fcExtendCount += GetSeqWeight(uSeqIndex);\r
+\r
+ *ptrfcLL = LL;\r
+ *ptrfcLG = LG;\r
+ *ptrfcGL = GL;\r
+ *ptrfcGG = GG;\r
+ *ptrfcGapStart = fcStartCount;\r
+ *ptrfcGapEnd = fcEndCount;\r
+ *ptrfcGapExtend = fcExtendCount;\r
+ }\r
+\r
+// Return true if the given column has no gaps and all\r
+// its residues are in the same biochemical group.\r
+bool MSAColIsConservative(const MSA &msa, unsigned uColIndex)\r
+ {\r
+ extern unsigned ResidueGroup[];\r
+\r
+ const unsigned uSeqCount = msa.GetColCount();\r
+ if (0 == uSeqCount)\r
+ Quit("MSAColIsConservative: empty alignment");\r
+\r
+ if (msa.IsGap(0, uColIndex))\r
+ return false;\r
+\r
+ unsigned uLetter = msa.GetLetterEx(0, uColIndex);\r
+ const unsigned uGroup = ResidueGroup[uLetter];\r
+\r
+ for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ if (msa.IsGap(uSeqIndex, uColIndex))\r
+ return false;\r
+ uLetter = msa.GetLetter(uSeqIndex, uColIndex);\r
+ if (ResidueGroup[uLetter] != uGroup)\r
+ return false;\r
+ }\r
+ return true;\r
+ }\r
+\r
+void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount,\r
+ MSA &msaOut)\r
+ {\r
+ const unsigned uColCount = msaIn.GetColCount();\r
+ msaOut.SetSize(uSeqCount, uColCount);\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex);\r
+ msaOut.SetSeqName(uSeqIndex, ptrName);\r
+\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex);\r
+ msaOut.SetChar(uSeqIndex, uColIndex, c);\r
+ }\r
+ }\r
+ }\r
+\r
+void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,\r
+ MSA &msaOut)\r
+ {\r
+ const unsigned uSeqCount = msaIn.GetSeqCount();\r
+ const unsigned uInColCount = msaIn.GetColCount();\r
+\r
+ if (uFromColIndex + uColCount - 1 > uInColCount)\r
+ Quit("MSAFromColRange, out of bounds");\r
+\r
+ msaOut.SetSize(uSeqCount, uColCount);\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ const char *ptrName = msaIn.GetSeqName(uSeqIndex);\r
+ unsigned uId = msaIn.GetSeqId(uSeqIndex);\r
+ msaOut.SetSeqName(uSeqIndex, ptrName);\r
+ msaOut.SetSeqId(uSeqIndex, uId);\r
+\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex);\r
+ msaOut.SetChar(uSeqIndex, uColIndex, c);\r
+ }\r
+ }\r
+ }\r
+\r
+void SeqVectFromMSA(const MSA &msa, SeqVect &v)\r
+ {\r
+ v.Clear();\r
+ const unsigned uSeqCount = msa.GetSeqCount();\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ Seq s;\r
+ msa.GetSeq(uSeqIndex, s);\r
+\r
+ s.StripGaps();\r
+ //if (0 == s.Length())\r
+ // continue;\r
+\r
+ const char *ptrName = msa.GetSeqName(uSeqIndex);\r
+ s.SetName(ptrName);\r
+\r
+ v.AppendSeq(s);\r
+ }\r
+ }\r
+\r
+void DeleteGappedCols(MSA &msa)\r
+ {\r
+ unsigned uColIndex = 0;\r
+ for (;;)\r
+ {\r
+ if (uColIndex >= msa.GetColCount())\r
+ break;\r
+ if (msa.IsGapColumn(uColIndex))\r
+ msa.DeleteCol(uColIndex);\r
+ else\r
+ ++uColIndex;\r
+ }\r
+ }\r
+\r
+void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,\r
+ MSA &msaOut)\r
+ {\r
+ const unsigned uColCount = msaIn.GetColCount();\r
+ msaOut.SetSize(uSeqCount, uColCount);\r
+ for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut)\r
+ {\r
+ unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut];\r
+ const char *ptrName = msaIn.GetSeqName(uSeqIndexIn);\r
+ unsigned uId = msaIn.GetSeqId(uSeqIndexIn);\r
+ msaOut.SetSeqName(uSeqIndexOut, ptrName);\r
+ msaOut.SetSeqId(uSeqIndexOut, uId);\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ const char c = msaIn.GetChar(uSeqIndexIn, uColIndex);\r
+ msaOut.SetChar(uSeqIndexOut, uColIndex, c);\r
+ }\r
+ }\r
+ }\r
+\r
+void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2)\r
+ {\r
+ const unsigned uSeqCount1 = msa1.GetSeqCount();\r
+ const unsigned uSeqCount2 = msa2.GetSeqCount();\r
+ if (uSeqCount1 != uSeqCount2)\r
+ Quit("Seq count differs");\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)\r
+ {\r
+ Seq seq1;\r
+ msa1.GetSeq(uSeqIndex, seq1);\r
+\r
+ unsigned uId = msa1.GetSeqId(uSeqIndex);\r
+ unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
+\r
+ Seq seq2;\r
+ msa2.GetSeq(uSeqIndex2, seq2);\r
+\r
+ if (!seq1.EqIgnoreCaseAndGaps(seq2))\r
+ {\r
+ Log("Input:\n");\r
+ seq1.LogMe();\r
+ Log("Output:\n");\r
+ seq2.LogMe();\r
+ Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));\r
+ }\r
+ }\r
+ }\r
+\r
+void AssertMSAEq(const MSA &msa1, const MSA &msa2)\r
+ {\r
+ const unsigned uSeqCount1 = msa1.GetSeqCount();\r
+ const unsigned uSeqCount2 = msa2.GetSeqCount();\r
+ if (uSeqCount1 != uSeqCount2)\r
+ Quit("Seq count differs");\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)\r
+ {\r
+ Seq seq1;\r
+ msa1.GetSeq(uSeqIndex, seq1);\r
+\r
+ unsigned uId = msa1.GetSeqId(uSeqIndex);\r
+ unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
+\r
+ Seq seq2;\r
+ msa2.GetSeq(uSeqIndex2, seq2);\r
+\r
+ if (!seq1.Eq(seq2))\r
+ {\r
+ Log("Input:\n");\r
+ seq1.LogMe();\r
+ Log("Output:\n");\r
+ seq2.LogMe();\r
+ Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));\r
+ }\r
+ }\r
+ }\r
+\r
+void SetMSAWeightsMuscle(MSA &msa)\r
+ {\r
+ SEQWEIGHT Method = GetSeqWeightMethod();\r
+ switch (Method)\r
+ {\r
+ case SEQWEIGHT_None:\r
+ msa.SetUniformWeights();\r
+ return;\r
+\r
+ case SEQWEIGHT_Henikoff:\r
+ msa.SetHenikoffWeights();\r
+ return;\r
+\r
+ case SEQWEIGHT_HenikoffPB:\r
+ msa.SetHenikoffWeightsPB();\r
+ return;\r
+\r
+ case SEQWEIGHT_GSC:\r
+ msa.SetGSCWeights();\r
+ return;\r
+\r
+ case SEQWEIGHT_ClustalW:\r
+ SetClustalWWeightsMuscle(msa);\r
+ return;\r
+ \r
+ case SEQWEIGHT_ThreeWay:\r
+ SetThreeWayWeightsMuscle(msa);\r
+ return;\r
+ }\r
+ Quit("SetMSAWeightsMuscle, Invalid method=%d", Method);\r
+ }\r
+\r
+static WEIGHT *g_MuscleWeights;\r
+static unsigned g_uMuscleIdCount;\r
+\r
+WEIGHT GetMuscleSeqWeightById(unsigned uId)\r
+ {\r
+ if (0 == g_MuscleWeights)\r
+ Quit("g_MuscleWeights = 0");\r
+ if (uId >= g_uMuscleIdCount)\r
+ Quit("GetMuscleSeqWeightById(%u): count=%u",\r
+ uId, g_uMuscleIdCount);\r
+\r
+ return g_MuscleWeights[uId];\r
+ }\r
+\r
+void SetMuscleTree(const Tree &tree)\r
+ {\r
+ g_ptrMuscleTree = &tree;\r
+\r
+ if (SEQWEIGHT_ClustalW != GetSeqWeightMethod())\r
+ return;\r
+\r
+ delete[] g_MuscleWeights;\r
+\r
+ const unsigned uLeafCount = tree.GetLeafCount();\r
+ g_uMuscleIdCount = uLeafCount;\r
+ g_MuscleWeights = new WEIGHT[uLeafCount];\r
+ CalcClustalWWeights(tree, g_MuscleWeights);\r
+ }\r
+\r
+void SetClustalWWeightsMuscle(MSA &msa)\r
+ {\r
+ if (0 == g_MuscleWeights)\r
+ Quit("g_MuscleWeights = 0");\r
+ const unsigned uSeqCount = msa.GetSeqCount();\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ const unsigned uId = msa.GetSeqId(uSeqIndex);\r
+ if (uId >= g_uMuscleIdCount)\r
+ Quit("SetClustalWWeightsMuscle: id out of range");\r
+ msa.SetSeqWeight(uSeqIndex, g_MuscleWeights[uId]);\r
+ }\r
+ msa.NormalizeWeights((WEIGHT) 1.0);\r
+ }\r
+\r
+#define LOCAL_VERBOSE 0\r
+\r
+void SetThreeWayWeightsMuscle(MSA &msa)\r
+ {\r
+ if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2)\r
+ {\r
+ msa.SetHenikoffWeightsPB();\r
+ return;\r
+ }\r
+\r
+ const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount();\r
+ WEIGHT *Weights = new WEIGHT[uMuscleSeqCount];\r
+\r
+ CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2,\r
+ Weights);\r
+\r
+ const unsigned uMSASeqCount = msa.GetSeqCount();\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex)\r
+ {\r
+ const unsigned uId = msa.GetSeqId(uSeqIndex);\r
+ if (uId >= uMuscleSeqCount)\r
+ Quit("SetThreeWayWeightsMuscle: id out of range");\r
+ msa.SetSeqWeight(uSeqIndex, Weights[uId]);\r
+ }\r
+#if LOCAL_VERBOSE\r
+ {\r
+ Log("SetThreeWayWeightsMuscle\n");\r
+ for (unsigned n = 0; n < uMSASeqCount; ++n)\r
+ {\r
+ const unsigned uId = msa.GetSeqId(n);\r
+ Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]);\r
+ }\r
+ }\r
+#endif\r
+ msa.NormalizeWeights((WEIGHT) 1.0);\r
+\r
+ delete[] Weights;\r
+ }\r
+\r
+// Append msa2 at the end of msa1\r
+void MSAAppend(MSA &msa1, const MSA &msa2)\r
+ {\r
+ const unsigned uSeqCount = msa1.GetSeqCount();\r
+\r
+ const unsigned uColCount1 = msa1.GetColCount();\r
+ const unsigned uColCount2 = msa2.GetColCount();\r
+ const unsigned uColCountCat = uColCount1 + uColCount2;\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ unsigned uId = msa1.GetSeqId(uSeqIndex);\r
+ unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
+ {\r
+ const char c = msa2.GetChar(uSeqIndex2, uColIndex);\r
+ msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c);\r
+ }\r
+ }\r
+ }\r
+\r
+// "Catenate" two MSAs (by bad analogy with UNIX cat command).\r
+// msa1 and msa2 must have same sequence names, but possibly\r
+// in a different order.\r
+// msaCat is the combined alignment produce by appending\r
+// sequences in msa2 to sequences in msa1.\r
+void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat)\r
+ {\r
+ const unsigned uSeqCount = msa1.GetSeqCount();\r
+\r
+ const unsigned uColCount1 = msa1.GetColCount();\r
+ const unsigned uColCount2 = msa2.GetColCount();\r
+ const unsigned uColCountCat = uColCount1 + uColCount2;\r
+\r
+ msaCat.SetSize(uSeqCount, uColCountCat);\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex)\r
+ {\r
+ const char c = msa1.GetChar(uSeqIndex, uColIndex);\r
+ msaCat.SetChar(uSeqIndex, uColIndex, c);\r
+ }\r
+\r
+ const char *ptrSeqName = msa1.GetSeqName(uSeqIndex);\r
+ unsigned uSeqIndex2;\r
+ msaCat.SetSeqName(uSeqIndex, ptrSeqName);\r
+ bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2);\r
+ if (bFound)\r
+ {\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
+ {\r
+ const char c = msa2.GetChar(uSeqIndex2, uColIndex);\r
+ msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c);\r
+ }\r
+ }\r
+ else\r
+ {\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
+ msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-');\r
+ }\r
+ }\r
+ }\r