+++ /dev/null
-#include "muscle.h"\r
-#include "msa.h"\r
-#include "seqvect.h"\r
-#include "profile.h"\r
-#include "tree.h"\r
-\r
-// These global variables are a hack to allow the tree\r
-// dependent iteration code to communicate the edge\r
-// used to divide the tree. The three-way weighting\r
-// scheme needs to know this edge in order to compute\r
-// sequence weights.\r
-static const Tree *g_ptrMuscleTree = 0;\r
-unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR;\r
-unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR;\r
-\r
-void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,\r
- FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,\r
- FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc,\r
- FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const\r
- {\r
- const unsigned uSeqCount = GetSeqCount();\r
- const unsigned uColCount = GetColCount();\r
-\r
- memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT));\r
- WEIGHT wTotal = 0;\r
- FCOUNT fGap = 0;\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- const WEIGHT w = GetSeqWeight(uSeqIndex);\r
- if (IsGap(uSeqIndex, uColIndex))\r
- {\r
- fGap += w;\r
- continue;\r
- }\r
- else if (IsWildcard(uSeqIndex, uColIndex))\r
- {\r
- const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);\r
- switch (g_Alpha)\r
- {\r
- case ALPHA_Amino:\r
- switch (uLetter)\r
- {\r
- case AX_B: // D or N\r
- fcCounts[AX_D] += w/2;\r
- fcCounts[AX_N] += w/2;\r
- break;\r
- case AX_Z: // E or Q\r
- fcCounts[AX_E] += w/2;\r
- fcCounts[AX_Q] += w/2;\r
- break;\r
- default: // any\r
- {\r
- const FCOUNT f = w/20;\r
- for (unsigned uLetter = 0; uLetter < 20; ++uLetter)\r
- fcCounts[uLetter] += f;\r
- break;\r
- }\r
- }\r
- break;\r
-\r
- case ALPHA_DNA:\r
- case ALPHA_RNA:\r
- switch (uLetter)\r
- {\r
- case AX_R: // G or A\r
- fcCounts[NX_G] += w/2;\r
- fcCounts[NX_A] += w/2;\r
- break;\r
- case AX_Y: // C or T/U\r
- fcCounts[NX_C] += w/2;\r
- fcCounts[NX_T] += w/2;\r
- break;\r
- default: // any\r
- const FCOUNT f = w/20;\r
- for (unsigned uLetter = 0; uLetter < 4; ++uLetter)\r
- fcCounts[uLetter] += f;\r
- break;\r
- }\r
- break;\r
-\r
- default:\r
- Quit("Alphabet %d not supported", g_Alpha);\r
- }\r
- continue;\r
- }\r
- unsigned uLetter = GetLetter(uSeqIndex, uColIndex);\r
- fcCounts[uLetter] += w;\r
- wTotal += w;\r
- }\r
- *ptrfOcc = (float) (1.0 - fGap);\r
-\r
- if (bNormalize && wTotal > 0)\r
- {\r
- if (wTotal > 1.001)\r
- Quit("wTotal=%g\n", wTotal);\r
- for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)\r
- fcCounts[uLetter] /= wTotal;\r
-// AssertNormalized(fcCounts);\r
- }\r
-\r
- FCOUNT fcStartCount = 0;\r
- if (uColIndex == 0)\r
- {\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- if (IsGap(uSeqIndex, uColIndex))\r
- fcStartCount += GetSeqWeight(uSeqIndex);\r
- }\r
- else\r
- {\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1))\r
- fcStartCount += GetSeqWeight(uSeqIndex);\r
- }\r
-\r
- FCOUNT fcEndCount = 0;\r
- if (uColCount - 1 == uColIndex)\r
- {\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- if (IsGap(uSeqIndex, uColIndex))\r
- fcEndCount += GetSeqWeight(uSeqIndex);\r
- }\r
- else\r
- {\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1))\r
- fcEndCount += GetSeqWeight(uSeqIndex);\r
- }\r
-\r
- FCOUNT LL = 0;\r
- FCOUNT LG = 0;\r
- FCOUNT GL = 0;\r
- FCOUNT GG = 0;\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- WEIGHT w = GetSeqWeight(uSeqIndex);\r
- bool bLetterHere = !IsGap(uSeqIndex, uColIndex);\r
- bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1));\r
- if (bLetterHere)\r
- {\r
- if (bLetterPrev)\r
- LL += w;\r
- else\r
- GL += w;\r
- }\r
- else\r
- {\r
- if (bLetterPrev)\r
- LG += w;\r
- else\r
- GG += w;\r
- }\r
- }\r
-\r
- FCOUNT fcExtendCount = 0;\r
- if (uColIndex > 0 && uColIndex < GetColCount() - 1)\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) &&\r
- IsGap(uSeqIndex, uColIndex + 1))\r
- fcExtendCount += GetSeqWeight(uSeqIndex);\r
-\r
- *ptrfcLL = LL;\r
- *ptrfcLG = LG;\r
- *ptrfcGL = GL;\r
- *ptrfcGG = GG;\r
- *ptrfcGapStart = fcStartCount;\r
- *ptrfcGapEnd = fcEndCount;\r
- *ptrfcGapExtend = fcExtendCount;\r
- }\r
-\r
-// Return true if the given column has no gaps and all\r
-// its residues are in the same biochemical group.\r
-bool MSAColIsConservative(const MSA &msa, unsigned uColIndex)\r
- {\r
- extern unsigned ResidueGroup[];\r
-\r
- const unsigned uSeqCount = msa.GetColCount();\r
- if (0 == uSeqCount)\r
- Quit("MSAColIsConservative: empty alignment");\r
-\r
- if (msa.IsGap(0, uColIndex))\r
- return false;\r
-\r
- unsigned uLetter = msa.GetLetterEx(0, uColIndex);\r
- const unsigned uGroup = ResidueGroup[uLetter];\r
-\r
- for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- if (msa.IsGap(uSeqIndex, uColIndex))\r
- return false;\r
- uLetter = msa.GetLetter(uSeqIndex, uColIndex);\r
- if (ResidueGroup[uLetter] != uGroup)\r
- return false;\r
- }\r
- return true;\r
- }\r
-\r
-void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount,\r
- MSA &msaOut)\r
- {\r
- const unsigned uColCount = msaIn.GetColCount();\r
- msaOut.SetSize(uSeqCount, uColCount);\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex);\r
- msaOut.SetSeqName(uSeqIndex, ptrName);\r
-\r
- for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
- {\r
- const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex);\r
- msaOut.SetChar(uSeqIndex, uColIndex, c);\r
- }\r
- }\r
- }\r
-\r
-void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,\r
- MSA &msaOut)\r
- {\r
- const unsigned uSeqCount = msaIn.GetSeqCount();\r
- const unsigned uInColCount = msaIn.GetColCount();\r
-\r
- if (uFromColIndex + uColCount - 1 > uInColCount)\r
- Quit("MSAFromColRange, out of bounds");\r
-\r
- msaOut.SetSize(uSeqCount, uColCount);\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- const char *ptrName = msaIn.GetSeqName(uSeqIndex);\r
- unsigned uId = msaIn.GetSeqId(uSeqIndex);\r
- msaOut.SetSeqName(uSeqIndex, ptrName);\r
- msaOut.SetSeqId(uSeqIndex, uId);\r
-\r
- for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
- {\r
- const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex);\r
- msaOut.SetChar(uSeqIndex, uColIndex, c);\r
- }\r
- }\r
- }\r
-\r
-void SeqVectFromMSA(const MSA &msa, SeqVect &v)\r
- {\r
- v.Clear();\r
- const unsigned uSeqCount = msa.GetSeqCount();\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- Seq s;\r
- msa.GetSeq(uSeqIndex, s);\r
-\r
- s.StripGaps();\r
- //if (0 == s.Length())\r
- // continue;\r
-\r
- const char *ptrName = msa.GetSeqName(uSeqIndex);\r
- s.SetName(ptrName);\r
-\r
- v.AppendSeq(s);\r
- }\r
- }\r
-\r
-void DeleteGappedCols(MSA &msa)\r
- {\r
- unsigned uColIndex = 0;\r
- for (;;)\r
- {\r
- if (uColIndex >= msa.GetColCount())\r
- break;\r
- if (msa.IsGapColumn(uColIndex))\r
- msa.DeleteCol(uColIndex);\r
- else\r
- ++uColIndex;\r
- }\r
- }\r
-\r
-void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,\r
- MSA &msaOut)\r
- {\r
- const unsigned uColCount = msaIn.GetColCount();\r
- msaOut.SetSize(uSeqCount, uColCount);\r
- for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut)\r
- {\r
- unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut];\r
- const char *ptrName = msaIn.GetSeqName(uSeqIndexIn);\r
- unsigned uId = msaIn.GetSeqId(uSeqIndexIn);\r
- msaOut.SetSeqName(uSeqIndexOut, ptrName);\r
- msaOut.SetSeqId(uSeqIndexOut, uId);\r
- for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
- {\r
- const char c = msaIn.GetChar(uSeqIndexIn, uColIndex);\r
- msaOut.SetChar(uSeqIndexOut, uColIndex, c);\r
- }\r
- }\r
- }\r
-\r
-void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2)\r
- {\r
- const unsigned uSeqCount1 = msa1.GetSeqCount();\r
- const unsigned uSeqCount2 = msa2.GetSeqCount();\r
- if (uSeqCount1 != uSeqCount2)\r
- Quit("Seq count differs");\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)\r
- {\r
- Seq seq1;\r
- msa1.GetSeq(uSeqIndex, seq1);\r
-\r
- unsigned uId = msa1.GetSeqId(uSeqIndex);\r
- unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
-\r
- Seq seq2;\r
- msa2.GetSeq(uSeqIndex2, seq2);\r
-\r
- if (!seq1.EqIgnoreCaseAndGaps(seq2))\r
- {\r
- Log("Input:\n");\r
- seq1.LogMe();\r
- Log("Output:\n");\r
- seq2.LogMe();\r
- Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));\r
- }\r
- }\r
- }\r
-\r
-void AssertMSAEq(const MSA &msa1, const MSA &msa2)\r
- {\r
- const unsigned uSeqCount1 = msa1.GetSeqCount();\r
- const unsigned uSeqCount2 = msa2.GetSeqCount();\r
- if (uSeqCount1 != uSeqCount2)\r
- Quit("Seq count differs");\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)\r
- {\r
- Seq seq1;\r
- msa1.GetSeq(uSeqIndex, seq1);\r
-\r
- unsigned uId = msa1.GetSeqId(uSeqIndex);\r
- unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
-\r
- Seq seq2;\r
- msa2.GetSeq(uSeqIndex2, seq2);\r
-\r
- if (!seq1.Eq(seq2))\r
- {\r
- Log("Input:\n");\r
- seq1.LogMe();\r
- Log("Output:\n");\r
- seq2.LogMe();\r
- Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));\r
- }\r
- }\r
- }\r
-\r
-void SetMSAWeightsMuscle(MSA &msa)\r
- {\r
- SEQWEIGHT Method = GetSeqWeightMethod();\r
- switch (Method)\r
- {\r
- case SEQWEIGHT_None:\r
- msa.SetUniformWeights();\r
- return;\r
-\r
- case SEQWEIGHT_Henikoff:\r
- msa.SetHenikoffWeights();\r
- return;\r
-\r
- case SEQWEIGHT_HenikoffPB:\r
- msa.SetHenikoffWeightsPB();\r
- return;\r
-\r
- case SEQWEIGHT_GSC:\r
- msa.SetGSCWeights();\r
- return;\r
-\r
- case SEQWEIGHT_ClustalW:\r
- SetClustalWWeightsMuscle(msa);\r
- return;\r
- \r
- case SEQWEIGHT_ThreeWay:\r
- SetThreeWayWeightsMuscle(msa);\r
- return;\r
- }\r
- Quit("SetMSAWeightsMuscle, Invalid method=%d", Method);\r
- }\r
-\r
-static WEIGHT *g_MuscleWeights;\r
-static unsigned g_uMuscleIdCount;\r
-\r
-WEIGHT GetMuscleSeqWeightById(unsigned uId)\r
- {\r
- if (0 == g_MuscleWeights)\r
- Quit("g_MuscleWeights = 0");\r
- if (uId >= g_uMuscleIdCount)\r
- Quit("GetMuscleSeqWeightById(%u): count=%u",\r
- uId, g_uMuscleIdCount);\r
-\r
- return g_MuscleWeights[uId];\r
- }\r
-\r
-void SetMuscleTree(const Tree &tree)\r
- {\r
- g_ptrMuscleTree = &tree;\r
-\r
- if (SEQWEIGHT_ClustalW != GetSeqWeightMethod())\r
- return;\r
-\r
- delete[] g_MuscleWeights;\r
-\r
- const unsigned uLeafCount = tree.GetLeafCount();\r
- g_uMuscleIdCount = uLeafCount;\r
- g_MuscleWeights = new WEIGHT[uLeafCount];\r
- CalcClustalWWeights(tree, g_MuscleWeights);\r
- }\r
-\r
-void SetClustalWWeightsMuscle(MSA &msa)\r
- {\r
- if (0 == g_MuscleWeights)\r
- Quit("g_MuscleWeights = 0");\r
- const unsigned uSeqCount = msa.GetSeqCount();\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- const unsigned uId = msa.GetSeqId(uSeqIndex);\r
- if (uId >= g_uMuscleIdCount)\r
- Quit("SetClustalWWeightsMuscle: id out of range");\r
- msa.SetSeqWeight(uSeqIndex, g_MuscleWeights[uId]);\r
- }\r
- msa.NormalizeWeights((WEIGHT) 1.0);\r
- }\r
-\r
-#define LOCAL_VERBOSE 0\r
-\r
-void SetThreeWayWeightsMuscle(MSA &msa)\r
- {\r
- if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2)\r
- {\r
- msa.SetHenikoffWeightsPB();\r
- return;\r
- }\r
-\r
- const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount();\r
- WEIGHT *Weights = new WEIGHT[uMuscleSeqCount];\r
-\r
- CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2,\r
- Weights);\r
-\r
- const unsigned uMSASeqCount = msa.GetSeqCount();\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex)\r
- {\r
- const unsigned uId = msa.GetSeqId(uSeqIndex);\r
- if (uId >= uMuscleSeqCount)\r
- Quit("SetThreeWayWeightsMuscle: id out of range");\r
- msa.SetSeqWeight(uSeqIndex, Weights[uId]);\r
- }\r
-#if LOCAL_VERBOSE\r
- {\r
- Log("SetThreeWayWeightsMuscle\n");\r
- for (unsigned n = 0; n < uMSASeqCount; ++n)\r
- {\r
- const unsigned uId = msa.GetSeqId(n);\r
- Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]);\r
- }\r
- }\r
-#endif\r
- msa.NormalizeWeights((WEIGHT) 1.0);\r
-\r
- delete[] Weights;\r
- }\r
-\r
-// Append msa2 at the end of msa1\r
-void MSAAppend(MSA &msa1, const MSA &msa2)\r
- {\r
- const unsigned uSeqCount = msa1.GetSeqCount();\r
-\r
- const unsigned uColCount1 = msa1.GetColCount();\r
- const unsigned uColCount2 = msa2.GetColCount();\r
- const unsigned uColCountCat = uColCount1 + uColCount2;\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- unsigned uId = msa1.GetSeqId(uSeqIndex);\r
- unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
- for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
- {\r
- const char c = msa2.GetChar(uSeqIndex2, uColIndex);\r
- msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c);\r
- }\r
- }\r
- }\r
-\r
-// "Catenate" two MSAs (by bad analogy with UNIX cat command).\r
-// msa1 and msa2 must have same sequence names, but possibly\r
-// in a different order.\r
-// msaCat is the combined alignment produce by appending\r
-// sequences in msa2 to sequences in msa1.\r
-void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat)\r
- {\r
- const unsigned uSeqCount = msa1.GetSeqCount();\r
-\r
- const unsigned uColCount1 = msa1.GetColCount();\r
- const unsigned uColCount2 = msa2.GetColCount();\r
- const unsigned uColCountCat = uColCount1 + uColCount2;\r
-\r
- msaCat.SetSize(uSeqCount, uColCountCat);\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex)\r
- {\r
- const char c = msa1.GetChar(uSeqIndex, uColIndex);\r
- msaCat.SetChar(uSeqIndex, uColIndex, c);\r
- }\r
-\r
- const char *ptrSeqName = msa1.GetSeqName(uSeqIndex);\r
- unsigned uSeqIndex2;\r
- msaCat.SetSeqName(uSeqIndex, ptrSeqName);\r
- bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2);\r
- if (bFound)\r
- {\r
- for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
- {\r
- const char c = msa2.GetChar(uSeqIndex2, uColIndex);\r
- msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c);\r
- }\r
- }\r
- else\r
- {\r
- for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
- msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-');\r
- }\r
- }\r
- }\r