+++ /dev/null
-#include "muscle.h"\r
-#include "distfunc.h"\r
-#include "seqvect.h"\r
-#include <math.h>\r
-\r
-#define TRACE 0\r
-\r
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))\r
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))\r
-\r
-const unsigned TUPLE_COUNT = 6*6*6*6*6*6;\r
-static unsigned char Count1[TUPLE_COUNT];\r
-static unsigned char Count2[TUPLE_COUNT];\r
-\r
-// Amino acid groups according to MAFFT (sextet5)\r
-// 0 = A G P S T\r
-// 1 = I L M V\r
-// 2 = N D Q E B Z\r
-// 3 = R H K\r
-// 4 = F W Y\r
-// 5 = C\r
-// 6 = X . - U\r
-unsigned ResidueGroup[] =\r
- {\r
- 0, // AX_A,\r
- 5, // AX_C,\r
- 2, // AX_D,\r
- 2, // AX_E,\r
- 4, // AX_F,\r
- 0, // AX_G,\r
- 3, // AX_H,\r
- 1, // AX_I,\r
- 3, // AX_K,\r
- 1, // AX_L,\r
- 1, // AX_M,\r
- 2, // AX_N,\r
- 0, // AX_P,\r
- 2, // AX_Q,\r
- 3, // AX_R,\r
- 0, // AX_S,\r
- 0, // AX_T,\r
- 1, // AX_V,\r
- 4, // AX_W,\r
- 4, // AX_Y,\r
-\r
- 2, // AX_B, // D or N\r
- 2, // AX_Z, // E or Q\r
- 0, // AX_X, // Unknown // ******** TODO *************\r
- // This isn't the correct way of avoiding group 6\r
- 0 // AX_GAP, // ******** TODO ******************\r
- };\r
-unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);\r
-\r
-static char *TupleToStr(int t)\r
- {\r
- static char s[7];\r
- int t1, t2, t3, t4, t5, t6;\r
-\r
- t1 = t%6;\r
- t2 = (t/6)%6;\r
- t3 = (t/(6*6))%6;\r
- t4 = (t/(6*6*6))%6;\r
- t5 = (t/(6*6*6*6))%6;\r
- t6 = (t/(6*6*6*6*6))%6;\r
-\r
- s[5] = '0' + t1;\r
- s[4] = '0' + t2;\r
- s[3] = '0' + t3;\r
- s[2] = '0' + t4;\r
- s[1] = '0' + t5;\r
- s[0] = '0' + t6;\r
- return s;\r
- }\r
-\r
-static unsigned GetTuple(const unsigned uLetters[], unsigned n)\r
- {\r
- assert(uLetters[n] < uResidueGroupCount);\r
- assert(uLetters[n+1] < uResidueGroupCount);\r
- assert(uLetters[n+2] < uResidueGroupCount);\r
- assert(uLetters[n+3] < uResidueGroupCount);\r
- assert(uLetters[n+4] < uResidueGroupCount);\r
- assert(uLetters[n+5] < uResidueGroupCount);\r
-\r
- unsigned u1 = ResidueGroup[uLetters[n]];\r
- unsigned u2 = ResidueGroup[uLetters[n+1]];\r
- unsigned u3 = ResidueGroup[uLetters[n+2]];\r
- unsigned u4 = ResidueGroup[uLetters[n+3]];\r
- unsigned u5 = ResidueGroup[uLetters[n+4]];\r
- unsigned u6 = ResidueGroup[uLetters[n+5]];\r
-\r
- return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6;\r
- }\r
-\r
-static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[])\r
- {\r
- memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));\r
- for (unsigned n = 0; n < uTupleCount; ++n)\r
- {\r
- const unsigned uTuple = GetTuple(L, n);\r
- ++(Count[uTuple]);\r
- }\r
- }\r
-\r
-static void ListCount(const unsigned char Count[])\r
- {\r
- for (unsigned n = 0; n < TUPLE_COUNT; ++n)\r
- {\r
- if (0 == Count[n])\r
- continue;\r
- Log("%s %u\n", TupleToStr(n), Count[n]);\r
- }\r
- }\r
-\r
-void DistKmer6_6(const SeqVect &v, DistFunc &DF)\r
- {\r
- const unsigned uSeqCount = v.Length();\r
-\r
- DF.SetCount(uSeqCount);\r
- if (0 == uSeqCount)\r
- return;\r
-\r
-// Initialize distance matrix to zero\r
- for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
- {\r
- DF.SetDist(uSeq1, uSeq1, 0);\r
- for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
- DF.SetDist(uSeq1, uSeq2, 0);\r
- }\r
-\r
-// Convert to letters\r
- unsigned **Letters = new unsigned *[uSeqCount];\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- Seq &s = *(v[uSeqIndex]);\r
- const unsigned uSeqLength = s.Length();\r
- unsigned *L = new unsigned[uSeqLength];\r
- Letters[uSeqIndex] = L;\r
- for (unsigned n = 0; n < uSeqLength; ++n)\r
- {\r
- char c = s[n];\r
- L[n] = CharToLetterEx(c);\r
- assert(L[n] < uResidueGroupCount);\r
- }\r
- }\r
-\r
- unsigned **uCommonTupleCount = new unsigned *[uSeqCount];\r
- for (unsigned n = 0; n < uSeqCount; ++n)\r
- {\r
- uCommonTupleCount[n] = new unsigned[uSeqCount];\r
- memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));\r
- }\r
-\r
- const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;\r
- unsigned uCount = 0;\r
- for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
- {\r
- Seq &seq1 = *(v[uSeq1]);\r
- const unsigned uSeqLength1 = seq1.Length();\r
- if (uSeqLength1 < 5)\r
- continue;\r
-\r
- const unsigned uTupleCount = uSeqLength1 - 5;\r
- const unsigned *L = Letters[uSeq1];\r
- CountTuples(L, uTupleCount, Count1);\r
-#if TRACE\r
- {\r
- Log("Seq1=%d\n", uSeq1);\r
- Log("Groups:\n");\r
- for (unsigned n = 0; n < uSeqLength1; ++n)\r
- Log("%u", ResidueGroup[L[n]]);\r
- Log("\n");\r
-\r
- Log("Tuples:\n");\r
- ListCount(Count1);\r
- }\r
-#endif\r
-\r
- SetProgressDesc("K-mer dist pass 1");\r
- for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)\r
- {\r
- if (0 == uCount%500)\r
- Progress(uCount, uPairCount);\r
- ++uCount;\r
- Seq &seq2 = *(v[uSeq2]);\r
- const unsigned uSeqLength2 = seq2.Length();\r
- if (uSeqLength2 < 5)\r
- {\r
- if (uSeq1 == uSeq2)\r
- DF.SetDist(uSeq1, uSeq2, 0);\r
- else\r
- DF.SetDist(uSeq1, uSeq2, 1);\r
- continue;\r
- }\r
-\r
- // First pass through seq 2 to count tuples\r
- const unsigned uTupleCount = uSeqLength2 - 5;\r
- const unsigned *L = Letters[uSeq2];\r
- CountTuples(L, uTupleCount, Count2);\r
-#if TRACE\r
- Log("Seq2=%d Counts=\n", uSeq2);\r
- ListCount(Count2);\r
-#endif\r
-\r
- // Second pass to accumulate sum of shared tuples\r
- // MAFFT defines this as the sum over unique tuples\r
- // in seq2 of the minimum of the number of tuples found\r
- // in the two sequences.\r
- unsigned uSum = 0;\r
- for (unsigned n = 0; n < uTupleCount; ++n)\r
- {\r
- const unsigned uTuple = GetTuple(L, n);\r
- uSum += MIN(Count1[uTuple], Count2[uTuple]);\r
-\r
- // This is a hack to make sure each unique tuple counted only once.\r
- Count2[uTuple] = 0;\r
- }\r
-#if TRACE\r
- {\r
- Seq &s1 = *(v[uSeq1]);\r
- Seq &s2 = *(v[uSeq2]);\r
- const char *pName1 = s1.GetName();\r
- const char *pName2 = s2.GetName();\r
- Log("Common count %s(%d) - %s(%d) =%u\n",\r
- pName1, uSeq1, pName2, uSeq2, uSum);\r
- }\r
-#endif\r
- uCommonTupleCount[uSeq1][uSeq2] = uSum;\r
- uCommonTupleCount[uSeq2][uSeq1] = uSum;\r
- }\r
- }\r
- ProgressStepsDone();\r
-\r
- uCount = 0;\r
- SetProgressDesc("K-mer dist pass 2");\r
- for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
- {\r
- Seq &s1 = *(v[uSeq1]);\r
- const char *pName1 = s1.GetName();\r
-\r
- double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];\r
- if (0 == dCommonTupleCount11)\r
- dCommonTupleCount11 = 1;\r
-\r
- DF.SetDist(uSeq1, uSeq1, 0);\r
- for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
- {\r
- if (0 == uCount%500)\r
- Progress(uCount, uPairCount);\r
- ++uCount;\r
-\r
- double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];\r
- if (0 == dCommonTupleCount22)\r
- dCommonTupleCount22 = 1;\r
-\r
- const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])\r
- /dCommonTupleCount11;\r
- const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])\r
- /dCommonTupleCount22;\r
-\r
- // dMinDist is the value used for tree-building in MAFFT\r
- const double dMinDist = MIN(dDist1, dDist2);\r
- DF.SetDist(uSeq1, uSeq2, (float) dMinDist);\r
-\r
- //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);\r
- //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);\r
- // **** TODO **** why does this make score slightly worse??\r
- //const double dKimuraDist = KimuraDist(dEstimatedPctId);\r
- //DF.SetDist(uSeq1, uSeq2, dKimuraDist);\r
- }\r
- }\r
- ProgressStepsDone();\r
-\r
- for (unsigned n = 0; n < uSeqCount; ++n)\r
- delete[] uCommonTupleCount[n];\r
- delete[] uCommonTupleCount;\r
- delete[] Letters;\r
- }\r
-\r
-double PctIdToMAFFTDist(double dPctId)\r
- {\r
- if (dPctId < 0.05)\r
- dPctId = 0.05;\r
- double dDist = -log(dPctId);\r
- return dDist;\r
- }\r
-\r
-double PctIdToHeightMAFFT(double dPctId)\r
- {\r
- return PctIdToMAFFTDist(dPctId);\r
- }\r