--- /dev/null
+#include "muscle.h"\r
+#include "msa.h"\r
+#include "seqvect.h"\r
+#include "seq.h"\r
+#include "distfunc.h"\r
+#include <math.h>\r
+\r
+#define TRACE 0\r
+\r
+/***\r
+Some candidate alphabets considered because they\r
+have high correlations and small table sizes.\r
+Correlation coefficent is between k-mer distance\r
+and %id D measured from a CLUSTALW alignment.\r
+Table size is N^k where N is size of alphabet.\r
+A is standard (uncompressed) amino alphabet.\r
+\r
+ Correlation\r
+Alpha N k Table Size all 25-50%\r
+----- -- - ---------- ---- ------\r
+A 20 3 8,000 0.943 0.575\r
+A 20 4 160,000 0.962 0.685 <<\r
+LiA 14 4 38,416 0.966 0.645\r
+SEB 14 4 38,416 0.964 0.634\r
+LiA 13 4 28,561 0.965 0.640\r
+LiA 12 4 20,736 0.963 0.620\r
+LiA 10 5 100,000 0.964 0.652\r
+\r
+We select A with k=4 because it has the best\r
+correlations. The only drawback is a large table\r
+size, but space is readily available and the only \r
+additional time cost is in resetting the table to\r
+zero, which can be done quickly with memset or by\r
+keeping a list of the k-mers that were found (should\r
+test to see which is faster, and may vary by compiler\r
+and processor type). It also has the minor advantage\r
+that we don't need to convert the alphabet.\r
+\r
+Fractional identity d is estimated as follows.\r
+\r
+ F = fractional k-mer count\r
+ if F is 0: F = 0.01\r
+ Y = log(0.02 + F)\r
+ d = -4.1 + 4.12*Y\r
+\r
+The constant 0.02 was chosen to make the relationship\r
+between Y and D linear. The constants -4.1 and 4.12\r
+were chosen to fit a straight line to the scatterplot\r
+of Y vs D.\r
+***/\r
+\r
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))\r
+\r
+const unsigned K = 4;\r
+const unsigned N = 20;\r
+const unsigned N_2 = 20*20;\r
+const unsigned N_3 = 20*20*20;\r
+const unsigned N_4 = 20*20*20*20;\r
+\r
+const unsigned TABLE_SIZE = N_4;\r
+\r
+// For debug output\r
+const char *KmerToStr(unsigned Kmer)\r
+ {\r
+ static char s[5];\r
+\r
+ unsigned c3 = (Kmer/N_3)%N;\r
+ unsigned c2 = (Kmer/N_2)%N;\r
+ unsigned c1 = (Kmer/N)%N;\r
+ unsigned c0 = Kmer%N;\r
+\r
+ s[0] = LetterToChar(c3);\r
+ s[1] = LetterToChar(c2);\r
+ s[2] = LetterToChar(c1);\r
+ s[3] = LetterToChar(c0);\r
+ return s;\r
+ }\r
+\r
+void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[])\r
+ {\r
+#if TRACE\r
+ Log("CountKmers\n");\r
+#endif\r
+ memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte));\r
+\r
+ const byte *ptrKmerStart = s;\r
+ const byte *ptrKmerEnd = s + 4;\r
+ const byte *ptrSeqEnd = s + uSeqLength;\r
+\r
+ unsigned c3 = s[0]*N_3;\r
+ unsigned c2 = s[1]*N_2;\r
+ unsigned c1 = s[2]*N;\r
+ unsigned c0 = s[3];\r
+\r
+ unsigned Kmer = c3 + c2 + c1 + c0;\r
+\r
+ for (;;)\r
+ {\r
+ assert(Kmer < TABLE_SIZE);\r
+\r
+#if TRACE\r
+ Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer));\r
+#endif\r
+ ++(KmerCounts[Kmer]);\r
+\r
+ if (ptrKmerEnd == ptrSeqEnd)\r
+ break;\r
+\r
+ // Compute k-mer as function of previous k-mer:\r
+ // 1. Subtract first letter from previous k-mer.\r
+ // 2. Multiply by N.\r
+ // 3. Add next letter.\r
+ c3 = (*ptrKmerStart++) * N_3;\r
+ Kmer = (Kmer - c3)*N;\r
+ Kmer += *ptrKmerEnd++;\r
+ }\r
+ }\r
+\r
+unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength,\r
+ const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2)\r
+ {\r
+ byte KmerCounts2[TABLE_SIZE];\r
+ CountKmers(Seq2, uSeqLength2, KmerCounts2);\r
+\r
+ const byte *ptrKmerStart = Seq;\r
+ const byte *ptrKmerEnd = Seq + 4;\r
+ const byte *ptrSeqEnd = Seq + uSeqLength;\r
+\r
+ unsigned c3 = Seq[0]*N_3;\r
+ unsigned c2 = Seq[1]*N_2;\r
+ unsigned c1 = Seq[2]*N;\r
+ unsigned c0 = Seq[3];\r
+\r
+ unsigned Kmer = c3 + c2 + c1 + c0;\r
+\r
+ unsigned uCommonCount = 0;\r
+ for (;;)\r
+ {\r
+ assert(Kmer < TABLE_SIZE);\r
+\r
+ const byte Count1 = KmerCounts1[Kmer];\r
+ const byte Count2 = KmerCounts2[Kmer];\r
+\r
+ uCommonCount += MIN(Count1, Count2);\r
+\r
+ // Hack so we don't double-count\r
+ KmerCounts2[Kmer] = 0;\r
+\r
+ if (ptrKmerEnd == ptrSeqEnd)\r
+ break;\r
+\r
+ // Compute k-mer as function of previous k-mer:\r
+ // 1. Subtract first letter from previous k-mer.\r
+ // 2. Multiply by N.\r
+ // 3. Add next letter.\r
+ c3 = (*ptrKmerStart++) * N_3;\r
+ Kmer = (Kmer - c3)*N;\r
+ Kmer += *ptrKmerEnd++;\r
+ }\r
+ return uCommonCount;\r
+ }\r
+\r
+static void SeqToLetters(const Seq &s, byte Letters[])\r
+ {\r
+ const unsigned uSeqLength = s.Length();\r
+ for (unsigned uCol = 0; uCol < uSeqLength; ++uCol)\r
+ {\r
+ char c = s.GetChar(uCol);\r
+ // Ugly hack. My k-mer counting code isn't wild-card\r
+ // aware. Arbitrarily replace wildcards by a specific\r
+ // amino acid.\r
+ if (IsWildcardChar(c))\r
+ c = 'A';\r
+ *Letters++ = CharToLetter(c);\r
+ }\r
+ }\r
+\r
+void FastDistKmer(const SeqVect &v, DistFunc &DF)\r
+ {\r
+ byte KmerCounts[TABLE_SIZE];\r
+\r
+ const unsigned uSeqCount = v.GetSeqCount();\r
+\r
+ DF.SetCount(uSeqCount);\r
+ if (0 == uSeqCount)\r
+ return;\r
+\r
+// Initialize distance matrix to zero\r
+ for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
+ {\r
+ DF.SetDist(uSeq1, uSeq1, 0);\r
+ for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
+ DF.SetDist(uSeq1, uSeq2, 0);\r
+ }\r
+\r
+ unsigned uMaxLength = 0;\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ const Seq &s = v.GetSeq(uSeqIndex);\r
+ unsigned uSeqLength = s.Length();\r
+ if (uSeqLength > uMaxLength)\r
+ uMaxLength = uSeqLength;\r
+ }\r
+ if (0 == uMaxLength)\r
+ return;\r
+\r
+ byte *Seq1Letters = new byte[uMaxLength];\r
+ byte *Seq2Letters = new byte[uMaxLength];\r
+\r
+ for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1)\r
+ {\r
+ const Seq &s1 = v.GetSeq(uSeqIndex1);\r
+ const unsigned uSeqLength1 = s1.Length();\r
+\r
+ SeqToLetters(s1, Seq1Letters);\r
+ CountKmers(Seq1Letters, uSeqLength1, KmerCounts);\r
+\r
+ for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;\r
+ ++uSeqIndex2)\r
+ {\r
+ const Seq &s2 = v.GetSeq(uSeqIndex2);\r
+ const unsigned uSeqLength2 = s2.Length();\r
+\r
+ SeqToLetters(s2, Seq2Letters);\r
+\r
+ unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1,\r
+ KmerCounts, Seq2Letters, uSeqLength2);\r
+\r
+ unsigned uMinLength = MIN(uSeqLength1, uSeqLength2);\r
+ double F = (double) uCommonKmerCount / (uMinLength - K + 1);\r
+ if (0.0 == F)\r
+ F = 0.01;\r
+ double Y = log(0.02 + F);\r
+ double EstimatedPctId = Y/4.12 + 0.995;\r
+ double KD = KimuraDist(EstimatedPctId);\r
+// DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD);\r
+ DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F));\r
+#if TRACE\r
+ Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n",\r
+ uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD);\r
+#endif\r
+ }\r
+ }\r
+\r
+ delete[] Seq1Letters;\r
+ delete[] Seq2Letters;\r
+ }\r