Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / muscle / msa2.cpp
diff --git a/website/archive/binaries/mac/src/muscle/msa2.cpp b/website/archive/binaries/mac/src/muscle/msa2.cpp
new file mode 100644 (file)
index 0000000..c82b9d7
--- /dev/null
@@ -0,0 +1,531 @@
+#include "muscle.h"\r
+#include "msa.h"\r
+#include "seqvect.h"\r
+#include "profile.h"\r
+#include "tree.h"\r
+\r
+// These global variables are a hack to allow the tree\r
+// dependent iteration code to communicate the edge\r
+// used to divide the tree. The three-way weighting\r
+// scheme needs to know this edge in order to compute\r
+// sequence weights.\r
+static const Tree *g_ptrMuscleTree = 0;\r
+unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR;\r
+unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR;\r
+\r
+void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,\r
+  FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,\r
+  FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc,\r
+  FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const\r
+       {\r
+       const unsigned uSeqCount = GetSeqCount();\r
+       const unsigned uColCount = GetColCount();\r
+\r
+       memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT));\r
+       WEIGHT wTotal = 0;\r
+       FCOUNT fGap = 0;\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               const WEIGHT w = GetSeqWeight(uSeqIndex);\r
+               if (IsGap(uSeqIndex, uColIndex))\r
+                       {\r
+                       fGap += w;\r
+                       continue;\r
+                       }\r
+               else if (IsWildcard(uSeqIndex, uColIndex))\r
+                       {\r
+                       const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);\r
+                       switch (g_Alpha)\r
+                               {\r
+                       case ALPHA_Amino:\r
+                               switch (uLetter)\r
+                                       {\r
+                               case AX_B:              // D or N\r
+                                       fcCounts[AX_D] += w/2;\r
+                                       fcCounts[AX_N] += w/2;\r
+                                       break;\r
+                               case AX_Z:              // E or Q\r
+                                       fcCounts[AX_E] += w/2;\r
+                                       fcCounts[AX_Q] += w/2;\r
+                                       break;\r
+                               default:                // any\r
+                                       {\r
+                                       const FCOUNT f = w/20;\r
+                                       for (unsigned uLetter = 0; uLetter < 20; ++uLetter)\r
+                                               fcCounts[uLetter] += f;\r
+                                       break;\r
+                                       }\r
+                                       }\r
+                               break;\r
+\r
+                       case ALPHA_DNA:\r
+                       case ALPHA_RNA:\r
+                               switch (uLetter)\r
+                                       {\r
+                               case AX_R:      // G or A\r
+                                       fcCounts[NX_G] += w/2;\r
+                                       fcCounts[NX_A] += w/2;\r
+                                       break;\r
+                               case AX_Y:      // C or T/U\r
+                                       fcCounts[NX_C] += w/2;\r
+                                       fcCounts[NX_T] += w/2;\r
+                                       break;\r
+                               default:        // any\r
+                                       const FCOUNT f = w/20;\r
+                                       for (unsigned uLetter = 0; uLetter < 4; ++uLetter)\r
+                                               fcCounts[uLetter] += f;\r
+                                       break;\r
+                                       }\r
+                               break;\r
+\r
+                       default:\r
+                               Quit("Alphabet %d not supported", g_Alpha);\r
+                               }\r
+                       continue;\r
+                       }\r
+               unsigned uLetter = GetLetter(uSeqIndex, uColIndex);\r
+               fcCounts[uLetter] += w;\r
+               wTotal += w;\r
+               }\r
+       *ptrfOcc = (float) (1.0 - fGap);\r
+\r
+       if (bNormalize && wTotal > 0)\r
+               {\r
+               if (wTotal > 1.001)\r
+                       Quit("wTotal=%g\n", wTotal);\r
+               for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)\r
+                       fcCounts[uLetter] /= wTotal;\r
+//             AssertNormalized(fcCounts);\r
+               }\r
+\r
+       FCOUNT fcStartCount = 0;\r
+       if (uColIndex == 0)\r
+               {\r
+               for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+                       if (IsGap(uSeqIndex, uColIndex))\r
+                               fcStartCount += GetSeqWeight(uSeqIndex);\r
+               }\r
+       else\r
+               {\r
+               for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+                       if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1))\r
+                               fcStartCount += GetSeqWeight(uSeqIndex);\r
+               }\r
+\r
+       FCOUNT fcEndCount = 0;\r
+       if (uColCount - 1 == uColIndex)\r
+               {\r
+               for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+                       if (IsGap(uSeqIndex, uColIndex))\r
+                               fcEndCount += GetSeqWeight(uSeqIndex);\r
+               }\r
+       else\r
+               {\r
+               for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+                       if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1))\r
+                               fcEndCount += GetSeqWeight(uSeqIndex);\r
+               }\r
+\r
+       FCOUNT LL = 0;\r
+       FCOUNT LG = 0;\r
+       FCOUNT GL = 0;\r
+       FCOUNT GG = 0;\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               WEIGHT w = GetSeqWeight(uSeqIndex);\r
+               bool bLetterHere = !IsGap(uSeqIndex, uColIndex);\r
+               bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1));\r
+               if (bLetterHere)\r
+                       {\r
+                       if (bLetterPrev)\r
+                               LL += w;\r
+                       else\r
+                               GL += w;\r
+                       }\r
+               else\r
+                       {\r
+                       if (bLetterPrev)\r
+                               LG += w;\r
+                       else\r
+                               GG += w;\r
+                       }\r
+               }\r
+\r
+       FCOUNT fcExtendCount = 0;\r
+       if (uColIndex > 0 && uColIndex < GetColCount() - 1)\r
+               for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+                       if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) &&\r
+                         IsGap(uSeqIndex, uColIndex + 1))\r
+                               fcExtendCount += GetSeqWeight(uSeqIndex);\r
+\r
+       *ptrfcLL = LL;\r
+       *ptrfcLG = LG;\r
+       *ptrfcGL = GL;\r
+       *ptrfcGG = GG;\r
+       *ptrfcGapStart = fcStartCount;\r
+       *ptrfcGapEnd = fcEndCount;\r
+       *ptrfcGapExtend = fcExtendCount;\r
+       }\r
+\r
+// Return true if the given column has no gaps and all\r
+// its residues are in the same biochemical group.\r
+bool MSAColIsConservative(const MSA &msa, unsigned uColIndex)\r
+       {\r
+       extern unsigned ResidueGroup[];\r
+\r
+       const unsigned uSeqCount = msa.GetColCount();\r
+       if (0 == uSeqCount)\r
+               Quit("MSAColIsConservative: empty alignment");\r
+\r
+       if (msa.IsGap(0, uColIndex))\r
+               return false;\r
+\r
+       unsigned uLetter = msa.GetLetterEx(0, uColIndex);\r
+       const unsigned uGroup = ResidueGroup[uLetter];\r
+\r
+       for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               if (msa.IsGap(uSeqIndex, uColIndex))\r
+                       return false;\r
+               uLetter = msa.GetLetter(uSeqIndex, uColIndex);\r
+               if (ResidueGroup[uLetter] != uGroup)\r
+                       return false;\r
+               }\r
+       return true;\r
+       }\r
+\r
+void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount,\r
+  MSA &msaOut)\r
+       {\r
+       const unsigned uColCount = msaIn.GetColCount();\r
+       msaOut.SetSize(uSeqCount, uColCount);\r
+\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex);\r
+               msaOut.SetSeqName(uSeqIndex, ptrName);\r
+\r
+               for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+                       {\r
+                       const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex);\r
+                       msaOut.SetChar(uSeqIndex, uColIndex, c);\r
+                       }\r
+               }\r
+       }\r
+\r
+void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,\r
+  MSA &msaOut)\r
+       {\r
+       const unsigned uSeqCount = msaIn.GetSeqCount();\r
+       const unsigned uInColCount = msaIn.GetColCount();\r
+\r
+       if (uFromColIndex + uColCount - 1 > uInColCount)\r
+               Quit("MSAFromColRange, out of bounds");\r
+\r
+       msaOut.SetSize(uSeqCount, uColCount);\r
+\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               const char *ptrName = msaIn.GetSeqName(uSeqIndex);\r
+               unsigned uId = msaIn.GetSeqId(uSeqIndex);\r
+               msaOut.SetSeqName(uSeqIndex, ptrName);\r
+               msaOut.SetSeqId(uSeqIndex, uId);\r
+\r
+               for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+                       {\r
+                       const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex);\r
+                       msaOut.SetChar(uSeqIndex, uColIndex, c);\r
+                       }\r
+               }\r
+       }\r
+\r
+void SeqVectFromMSA(const MSA &msa, SeqVect &v)\r
+       {\r
+       v.Clear();\r
+       const unsigned uSeqCount = msa.GetSeqCount();\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               Seq s;\r
+               msa.GetSeq(uSeqIndex, s);\r
+\r
+               s.StripGaps();\r
+               //if (0 == s.Length())\r
+               //      continue;\r
+\r
+               const char *ptrName = msa.GetSeqName(uSeqIndex);\r
+               s.SetName(ptrName);\r
+\r
+               v.AppendSeq(s);\r
+               }\r
+       }\r
+\r
+void DeleteGappedCols(MSA &msa)\r
+       {\r
+       unsigned uColIndex = 0;\r
+       for (;;)\r
+               {\r
+               if (uColIndex >= msa.GetColCount())\r
+                       break;\r
+               if (msa.IsGapColumn(uColIndex))\r
+                       msa.DeleteCol(uColIndex);\r
+               else\r
+                       ++uColIndex;\r
+               }\r
+       }\r
+\r
+void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,\r
+  MSA &msaOut)\r
+       {\r
+       const unsigned uColCount = msaIn.GetColCount();\r
+       msaOut.SetSize(uSeqCount, uColCount);\r
+       for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut)\r
+               {\r
+               unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut];\r
+               const char *ptrName = msaIn.GetSeqName(uSeqIndexIn);\r
+               unsigned uId = msaIn.GetSeqId(uSeqIndexIn);\r
+               msaOut.SetSeqName(uSeqIndexOut, ptrName);\r
+               msaOut.SetSeqId(uSeqIndexOut, uId);\r
+               for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+                       {\r
+                       const char c = msaIn.GetChar(uSeqIndexIn, uColIndex);\r
+                       msaOut.SetChar(uSeqIndexOut, uColIndex, c);\r
+                       }\r
+               }\r
+       }\r
+\r
+void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2)\r
+       {\r
+       const unsigned uSeqCount1 = msa1.GetSeqCount();\r
+       const unsigned uSeqCount2 = msa2.GetSeqCount();\r
+       if (uSeqCount1 != uSeqCount2)\r
+               Quit("Seq count differs");\r
+\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)\r
+               {\r
+               Seq seq1;\r
+               msa1.GetSeq(uSeqIndex, seq1);\r
+\r
+               unsigned uId = msa1.GetSeqId(uSeqIndex);\r
+               unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
+\r
+               Seq seq2;\r
+               msa2.GetSeq(uSeqIndex2, seq2);\r
+\r
+               if (!seq1.EqIgnoreCaseAndGaps(seq2))\r
+                       {\r
+                       Log("Input:\n");\r
+                       seq1.LogMe();\r
+                       Log("Output:\n");\r
+                       seq2.LogMe();\r
+                       Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));\r
+                       }\r
+               }\r
+       }\r
+\r
+void AssertMSAEq(const MSA &msa1, const MSA &msa2)\r
+       {\r
+       const unsigned uSeqCount1 = msa1.GetSeqCount();\r
+       const unsigned uSeqCount2 = msa2.GetSeqCount();\r
+       if (uSeqCount1 != uSeqCount2)\r
+               Quit("Seq count differs");\r
+\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)\r
+               {\r
+               Seq seq1;\r
+               msa1.GetSeq(uSeqIndex, seq1);\r
+\r
+               unsigned uId = msa1.GetSeqId(uSeqIndex);\r
+               unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
+\r
+               Seq seq2;\r
+               msa2.GetSeq(uSeqIndex2, seq2);\r
+\r
+               if (!seq1.Eq(seq2))\r
+                       {\r
+                       Log("Input:\n");\r
+                       seq1.LogMe();\r
+                       Log("Output:\n");\r
+                       seq2.LogMe();\r
+                       Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));\r
+                       }\r
+               }\r
+       }\r
+\r
+void SetMSAWeightsMuscle(MSA &msa)\r
+       {\r
+       SEQWEIGHT Method = GetSeqWeightMethod();\r
+       switch (Method)\r
+               {\r
+       case SEQWEIGHT_None:\r
+               msa.SetUniformWeights();\r
+               return;\r
+\r
+       case SEQWEIGHT_Henikoff:\r
+               msa.SetHenikoffWeights();\r
+               return;\r
+\r
+       case SEQWEIGHT_HenikoffPB:\r
+               msa.SetHenikoffWeightsPB();\r
+               return;\r
+\r
+       case SEQWEIGHT_GSC:\r
+               msa.SetGSCWeights();\r
+               return;\r
+\r
+       case SEQWEIGHT_ClustalW:\r
+               SetClustalWWeightsMuscle(msa);\r
+               return;\r
+       \r
+       case SEQWEIGHT_ThreeWay:\r
+               SetThreeWayWeightsMuscle(msa);\r
+               return;\r
+               }\r
+       Quit("SetMSAWeightsMuscle, Invalid method=%d", Method);\r
+       }\r
+\r
+static WEIGHT *g_MuscleWeights;\r
+static unsigned g_uMuscleIdCount;\r
+\r
+WEIGHT GetMuscleSeqWeightById(unsigned uId)\r
+       {\r
+       if (0 == g_MuscleWeights)\r
+               Quit("g_MuscleWeights = 0");\r
+       if (uId >= g_uMuscleIdCount)\r
+               Quit("GetMuscleSeqWeightById(%u): count=%u",\r
+                 uId, g_uMuscleIdCount);\r
+\r
+       return g_MuscleWeights[uId];\r
+       }\r
+\r
+void SetMuscleTree(const Tree &tree)\r
+       {\r
+       g_ptrMuscleTree = &tree;\r
+\r
+       if (SEQWEIGHT_ClustalW != GetSeqWeightMethod())\r
+               return;\r
+\r
+       delete[] g_MuscleWeights;\r
+\r
+       const unsigned uLeafCount = tree.GetLeafCount();\r
+       g_uMuscleIdCount = uLeafCount;\r
+       g_MuscleWeights = new WEIGHT[uLeafCount];\r
+       CalcClustalWWeights(tree, g_MuscleWeights);\r
+       }\r
+\r
+void SetClustalWWeightsMuscle(MSA &msa)\r
+       {\r
+       if (0 == g_MuscleWeights)\r
+               Quit("g_MuscleWeights = 0");\r
+       const unsigned uSeqCount = msa.GetSeqCount();\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               const unsigned uId = msa.GetSeqId(uSeqIndex);\r
+               if (uId >= g_uMuscleIdCount)\r
+                       Quit("SetClustalWWeightsMuscle: id out of range");\r
+               msa.SetSeqWeight(uSeqIndex, g_MuscleWeights[uId]);\r
+               }\r
+       msa.NormalizeWeights((WEIGHT) 1.0);\r
+       }\r
+\r
+#define        LOCAL_VERBOSE   0\r
+\r
+void SetThreeWayWeightsMuscle(MSA &msa)\r
+       {\r
+       if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2)\r
+               {\r
+               msa.SetHenikoffWeightsPB();\r
+               return;\r
+               }\r
+\r
+       const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount();\r
+       WEIGHT *Weights = new WEIGHT[uMuscleSeqCount];\r
+\r
+       CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2,\r
+         Weights);\r
+\r
+       const unsigned uMSASeqCount = msa.GetSeqCount();\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex)\r
+               {\r
+               const unsigned uId = msa.GetSeqId(uSeqIndex);\r
+               if (uId >= uMuscleSeqCount)\r
+                       Quit("SetThreeWayWeightsMuscle: id out of range");\r
+               msa.SetSeqWeight(uSeqIndex, Weights[uId]);\r
+               }\r
+#if    LOCAL_VERBOSE\r
+       {\r
+       Log("SetThreeWayWeightsMuscle\n");\r
+       for (unsigned n = 0; n < uMSASeqCount; ++n)\r
+               {\r
+               const unsigned uId = msa.GetSeqId(n);\r
+               Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]);\r
+               }\r
+       }\r
+#endif\r
+       msa.NormalizeWeights((WEIGHT) 1.0);\r
+\r
+       delete[] Weights;\r
+       }\r
+\r
+// Append msa2 at the end of msa1\r
+void MSAAppend(MSA &msa1, const MSA &msa2)\r
+       {\r
+       const unsigned uSeqCount = msa1.GetSeqCount();\r
+\r
+       const unsigned uColCount1 = msa1.GetColCount();\r
+       const unsigned uColCount2 = msa2.GetColCount();\r
+       const unsigned uColCountCat = uColCount1 + uColCount2;\r
+\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               unsigned uId = msa1.GetSeqId(uSeqIndex);\r
+               unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
+               for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
+                       {\r
+                       const char c = msa2.GetChar(uSeqIndex2, uColIndex);\r
+                       msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c);\r
+                       }\r
+               }\r
+       }\r
+\r
+// "Catenate" two MSAs (by bad analogy with UNIX cat command).\r
+// msa1 and msa2 must have same sequence names, but possibly\r
+// in a different order.\r
+// msaCat is the combined alignment produce by appending\r
+// sequences in msa2 to sequences in msa1.\r
+void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat)\r
+       {\r
+       const unsigned uSeqCount = msa1.GetSeqCount();\r
+\r
+       const unsigned uColCount1 = msa1.GetColCount();\r
+       const unsigned uColCount2 = msa2.GetColCount();\r
+       const unsigned uColCountCat = uColCount1 + uColCount2;\r
+\r
+       msaCat.SetSize(uSeqCount, uColCountCat);\r
+\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex)\r
+                       {\r
+                       const char c = msa1.GetChar(uSeqIndex, uColIndex);\r
+                       msaCat.SetChar(uSeqIndex, uColIndex, c);\r
+                       }\r
+\r
+               const char *ptrSeqName = msa1.GetSeqName(uSeqIndex);\r
+               unsigned uSeqIndex2;\r
+               msaCat.SetSeqName(uSeqIndex, ptrSeqName);\r
+               bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2);\r
+               if (bFound)\r
+                       {\r
+                       for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
+                               {\r
+                               const char c = msa2.GetChar(uSeqIndex2, uColIndex);\r
+                               msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c);\r
+                               }\r
+                       }\r
+               else\r
+                       {\r
+                       for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
+                               msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-');\r
+                       }\r
+               }\r
+       }\r