+++ /dev/null
-#include "muscle.h"\r
-#include "msa.h"\r
-#include "seqvect.h"\r
-#include "seq.h"\r
-#include "distfunc.h"\r
-#include <math.h>\r
-\r
-#define TRACE 0\r
-\r
-/***\r
-Some candidate alphabets considered because they\r
-have high correlations and small table sizes.\r
-Correlation coefficent is between k-mer distance\r
-and %id D measured from a CLUSTALW alignment.\r
-Table size is N^k where N is size of alphabet.\r
-A is standard (uncompressed) amino alphabet.\r
-\r
- Correlation\r
-Alpha N k Table Size all 25-50%\r
------ -- - ---------- ---- ------\r
-A 20 3 8,000 0.943 0.575\r
-A 20 4 160,000 0.962 0.685 <<\r
-LiA 14 4 38,416 0.966 0.645\r
-SEB 14 4 38,416 0.964 0.634\r
-LiA 13 4 28,561 0.965 0.640\r
-LiA 12 4 20,736 0.963 0.620\r
-LiA 10 5 100,000 0.964 0.652\r
-\r
-We select A with k=4 because it has the best\r
-correlations. The only drawback is a large table\r
-size, but space is readily available and the only \r
-additional time cost is in resetting the table to\r
-zero, which can be done quickly with memset or by\r
-keeping a list of the k-mers that were found (should\r
-test to see which is faster, and may vary by compiler\r
-and processor type). It also has the minor advantage\r
-that we don't need to convert the alphabet.\r
-\r
-Fractional identity d is estimated as follows.\r
-\r
- F = fractional k-mer count\r
- if F is 0: F = 0.01\r
- Y = log(0.02 + F)\r
- d = -4.1 + 4.12*Y\r
-\r
-The constant 0.02 was chosen to make the relationship\r
-between Y and D linear. The constants -4.1 and 4.12\r
-were chosen to fit a straight line to the scatterplot\r
-of Y vs D.\r
-***/\r
-\r
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))\r
-\r
-const unsigned K = 4;\r
-const unsigned N = 20;\r
-const unsigned N_2 = 20*20;\r
-const unsigned N_3 = 20*20*20;\r
-const unsigned N_4 = 20*20*20*20;\r
-\r
-const unsigned TABLE_SIZE = N_4;\r
-\r
-// For debug output\r
-const char *KmerToStr(unsigned Kmer)\r
- {\r
- static char s[5];\r
-\r
- unsigned c3 = (Kmer/N_3)%N;\r
- unsigned c2 = (Kmer/N_2)%N;\r
- unsigned c1 = (Kmer/N)%N;\r
- unsigned c0 = Kmer%N;\r
-\r
- s[0] = LetterToChar(c3);\r
- s[1] = LetterToChar(c2);\r
- s[2] = LetterToChar(c1);\r
- s[3] = LetterToChar(c0);\r
- return s;\r
- }\r
-\r
-void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[])\r
- {\r
-#if TRACE\r
- Log("CountKmers\n");\r
-#endif\r
- memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte));\r
-\r
- const byte *ptrKmerStart = s;\r
- const byte *ptrKmerEnd = s + 4;\r
- const byte *ptrSeqEnd = s + uSeqLength;\r
-\r
- unsigned c3 = s[0]*N_3;\r
- unsigned c2 = s[1]*N_2;\r
- unsigned c1 = s[2]*N;\r
- unsigned c0 = s[3];\r
-\r
- unsigned Kmer = c3 + c2 + c1 + c0;\r
-\r
- for (;;)\r
- {\r
- assert(Kmer < TABLE_SIZE);\r
-\r
-#if TRACE\r
- Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer));\r
-#endif\r
- ++(KmerCounts[Kmer]);\r
-\r
- if (ptrKmerEnd == ptrSeqEnd)\r
- break;\r
-\r
- // Compute k-mer as function of previous k-mer:\r
- // 1. Subtract first letter from previous k-mer.\r
- // 2. Multiply by N.\r
- // 3. Add next letter.\r
- c3 = (*ptrKmerStart++) * N_3;\r
- Kmer = (Kmer - c3)*N;\r
- Kmer += *ptrKmerEnd++;\r
- }\r
- }\r
-\r
-unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength,\r
- const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2)\r
- {\r
- byte KmerCounts2[TABLE_SIZE];\r
- CountKmers(Seq2, uSeqLength2, KmerCounts2);\r
-\r
- const byte *ptrKmerStart = Seq;\r
- const byte *ptrKmerEnd = Seq + 4;\r
- const byte *ptrSeqEnd = Seq + uSeqLength;\r
-\r
- unsigned c3 = Seq[0]*N_3;\r
- unsigned c2 = Seq[1]*N_2;\r
- unsigned c1 = Seq[2]*N;\r
- unsigned c0 = Seq[3];\r
-\r
- unsigned Kmer = c3 + c2 + c1 + c0;\r
-\r
- unsigned uCommonCount = 0;\r
- for (;;)\r
- {\r
- assert(Kmer < TABLE_SIZE);\r
-\r
- const byte Count1 = KmerCounts1[Kmer];\r
- const byte Count2 = KmerCounts2[Kmer];\r
-\r
- uCommonCount += MIN(Count1, Count2);\r
-\r
- // Hack so we don't double-count\r
- KmerCounts2[Kmer] = 0;\r
-\r
- if (ptrKmerEnd == ptrSeqEnd)\r
- break;\r
-\r
- // Compute k-mer as function of previous k-mer:\r
- // 1. Subtract first letter from previous k-mer.\r
- // 2. Multiply by N.\r
- // 3. Add next letter.\r
- c3 = (*ptrKmerStart++) * N_3;\r
- Kmer = (Kmer - c3)*N;\r
- Kmer += *ptrKmerEnd++;\r
- }\r
- return uCommonCount;\r
- }\r
-\r
-static void SeqToLetters(const Seq &s, byte Letters[])\r
- {\r
- const unsigned uSeqLength = s.Length();\r
- for (unsigned uCol = 0; uCol < uSeqLength; ++uCol)\r
- {\r
- char c = s.GetChar(uCol);\r
- // Ugly hack. My k-mer counting code isn't wild-card\r
- // aware. Arbitrarily replace wildcards by a specific\r
- // amino acid.\r
- if (IsWildcardChar(c))\r
- c = 'A';\r
- *Letters++ = CharToLetter(c);\r
- }\r
- }\r
-\r
-void FastDistKmer(const SeqVect &v, DistFunc &DF)\r
- {\r
- byte KmerCounts[TABLE_SIZE];\r
-\r
- const unsigned uSeqCount = v.GetSeqCount();\r
-\r
- DF.SetCount(uSeqCount);\r
- if (0 == uSeqCount)\r
- return;\r
-\r
-// Initialize distance matrix to zero\r
- for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
- {\r
- DF.SetDist(uSeq1, uSeq1, 0);\r
- for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
- DF.SetDist(uSeq1, uSeq2, 0);\r
- }\r
-\r
- unsigned uMaxLength = 0;\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- const Seq &s = v.GetSeq(uSeqIndex);\r
- unsigned uSeqLength = s.Length();\r
- if (uSeqLength > uMaxLength)\r
- uMaxLength = uSeqLength;\r
- }\r
- if (0 == uMaxLength)\r
- return;\r
-\r
- byte *Seq1Letters = new byte[uMaxLength];\r
- byte *Seq2Letters = new byte[uMaxLength];\r
-\r
- for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1)\r
- {\r
- const Seq &s1 = v.GetSeq(uSeqIndex1);\r
- const unsigned uSeqLength1 = s1.Length();\r
-\r
- SeqToLetters(s1, Seq1Letters);\r
- CountKmers(Seq1Letters, uSeqLength1, KmerCounts);\r
-\r
- for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;\r
- ++uSeqIndex2)\r
- {\r
- const Seq &s2 = v.GetSeq(uSeqIndex2);\r
- const unsigned uSeqLength2 = s2.Length();\r
-\r
- SeqToLetters(s2, Seq2Letters);\r
-\r
- unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1,\r
- KmerCounts, Seq2Letters, uSeqLength2);\r
-\r
- unsigned uMinLength = MIN(uSeqLength1, uSeqLength2);\r
- double F = (double) uCommonKmerCount / (uMinLength - K + 1);\r
- if (0.0 == F)\r
- F = 0.01;\r
- double Y = log(0.02 + F);\r
- double EstimatedPctId = Y/4.12 + 0.995;\r
- double KD = KimuraDist(EstimatedPctId);\r
-// DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD);\r
- DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F));\r
-#if TRACE\r
- Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n",\r
- uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD);\r
-#endif\r
- }\r
- }\r
-\r
- delete[] Seq1Letters;\r
- delete[] Seq2Letters;\r
- }\r