5 #include "distfunc.h"
\r
11 Some candidate alphabets considered because they
\r
12 have high correlations and small table sizes.
\r
13 Correlation coefficent is between k-mer distance
\r
14 and %id D measured from a CLUSTALW alignment.
\r
15 Table size is N^k where N is size of alphabet.
\r
16 A is standard (uncompressed) amino alphabet.
\r
19 Alpha N k Table Size all 25-50%
\r
20 ----- -- - ---------- ---- ------
\r
21 A 20 3 8,000 0.943 0.575
\r
22 A 20 4 160,000 0.962 0.685 <<
\r
23 LiA 14 4 38,416 0.966 0.645
\r
24 SEB 14 4 38,416 0.964 0.634
\r
25 LiA 13 4 28,561 0.965 0.640
\r
26 LiA 12 4 20,736 0.963 0.620
\r
27 LiA 10 5 100,000 0.964 0.652
\r
29 We select A with k=4 because it has the best
\r
30 correlations. The only drawback is a large table
\r
31 size, but space is readily available and the only
\r
32 additional time cost is in resetting the table to
\r
33 zero, which can be done quickly with memset or by
\r
34 keeping a list of the k-mers that were found (should
\r
35 test to see which is faster, and may vary by compiler
\r
36 and processor type). It also has the minor advantage
\r
37 that we don't need to convert the alphabet.
\r
39 Fractional identity d is estimated as follows.
\r
41 F = fractional k-mer count
\r
46 The constant 0.02 was chosen to make the relationship
\r
47 between Y and D linear. The constants -4.1 and 4.12
\r
48 were chosen to fit a straight line to the scatterplot
\r
52 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
\r
54 const unsigned K = 4;
\r
55 const unsigned N = 20;
\r
56 const unsigned N_2 = 20*20;
\r
57 const unsigned N_3 = 20*20*20;
\r
58 const unsigned N_4 = 20*20*20*20;
\r
60 const unsigned TABLE_SIZE = N_4;
\r
63 const char *KmerToStr(unsigned Kmer)
\r
67 unsigned c3 = (Kmer/N_3)%N;
\r
68 unsigned c2 = (Kmer/N_2)%N;
\r
69 unsigned c1 = (Kmer/N)%N;
\r
70 unsigned c0 = Kmer%N;
\r
72 s[0] = LetterToChar(c3);
\r
73 s[1] = LetterToChar(c2);
\r
74 s[2] = LetterToChar(c1);
\r
75 s[3] = LetterToChar(c0);
\r
79 void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[])
\r
82 Log("CountKmers\n");
\r
84 memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte));
\r
86 const byte *ptrKmerStart = s;
\r
87 const byte *ptrKmerEnd = s + 4;
\r
88 const byte *ptrSeqEnd = s + uSeqLength;
\r
90 unsigned c3 = s[0]*N_3;
\r
91 unsigned c2 = s[1]*N_2;
\r
92 unsigned c1 = s[2]*N;
\r
95 unsigned Kmer = c3 + c2 + c1 + c0;
\r
99 assert(Kmer < TABLE_SIZE);
\r
102 Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer));
\r
104 ++(KmerCounts[Kmer]);
\r
106 if (ptrKmerEnd == ptrSeqEnd)
\r
109 // Compute k-mer as function of previous k-mer:
\r
110 // 1. Subtract first letter from previous k-mer.
\r
111 // 2. Multiply by N.
\r
112 // 3. Add next letter.
\r
113 c3 = (*ptrKmerStart++) * N_3;
\r
114 Kmer = (Kmer - c3)*N;
\r
115 Kmer += *ptrKmerEnd++;
\r
119 unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength,
\r
120 const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2)
\r
122 byte KmerCounts2[TABLE_SIZE];
\r
123 CountKmers(Seq2, uSeqLength2, KmerCounts2);
\r
125 const byte *ptrKmerStart = Seq;
\r
126 const byte *ptrKmerEnd = Seq + 4;
\r
127 const byte *ptrSeqEnd = Seq + uSeqLength;
\r
129 unsigned c3 = Seq[0]*N_3;
\r
130 unsigned c2 = Seq[1]*N_2;
\r
131 unsigned c1 = Seq[2]*N;
\r
132 unsigned c0 = Seq[3];
\r
134 unsigned Kmer = c3 + c2 + c1 + c0;
\r
136 unsigned uCommonCount = 0;
\r
139 assert(Kmer < TABLE_SIZE);
\r
141 const byte Count1 = KmerCounts1[Kmer];
\r
142 const byte Count2 = KmerCounts2[Kmer];
\r
144 uCommonCount += MIN(Count1, Count2);
\r
146 // Hack so we don't double-count
\r
147 KmerCounts2[Kmer] = 0;
\r
149 if (ptrKmerEnd == ptrSeqEnd)
\r
152 // Compute k-mer as function of previous k-mer:
\r
153 // 1. Subtract first letter from previous k-mer.
\r
154 // 2. Multiply by N.
\r
155 // 3. Add next letter.
\r
156 c3 = (*ptrKmerStart++) * N_3;
\r
157 Kmer = (Kmer - c3)*N;
\r
158 Kmer += *ptrKmerEnd++;
\r
160 return uCommonCount;
\r
163 static void SeqToLetters(const Seq &s, byte Letters[])
\r
165 const unsigned uSeqLength = s.Length();
\r
166 for (unsigned uCol = 0; uCol < uSeqLength; ++uCol)
\r
168 char c = s.GetChar(uCol);
\r
169 // Ugly hack. My k-mer counting code isn't wild-card
\r
170 // aware. Arbitrarily replace wildcards by a specific
\r
172 if (IsWildcardChar(c))
\r
174 *Letters++ = CharToLetter(c);
\r
178 void FastDistKmer(const SeqVect &v, DistFunc &DF)
\r
180 byte KmerCounts[TABLE_SIZE];
\r
182 const unsigned uSeqCount = v.GetSeqCount();
\r
184 DF.SetCount(uSeqCount);
\r
185 if (0 == uSeqCount)
\r
188 // Initialize distance matrix to zero
\r
189 for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
\r
191 DF.SetDist(uSeq1, uSeq1, 0);
\r
192 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
\r
193 DF.SetDist(uSeq1, uSeq2, 0);
\r
196 unsigned uMaxLength = 0;
\r
197 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
199 const Seq &s = v.GetSeq(uSeqIndex);
\r
200 unsigned uSeqLength = s.Length();
\r
201 if (uSeqLength > uMaxLength)
\r
202 uMaxLength = uSeqLength;
\r
204 if (0 == uMaxLength)
\r
207 byte *Seq1Letters = new byte[uMaxLength];
\r
208 byte *Seq2Letters = new byte[uMaxLength];
\r
210 for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1)
\r
212 const Seq &s1 = v.GetSeq(uSeqIndex1);
\r
213 const unsigned uSeqLength1 = s1.Length();
\r
215 SeqToLetters(s1, Seq1Letters);
\r
216 CountKmers(Seq1Letters, uSeqLength1, KmerCounts);
\r
218 for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;
\r
221 const Seq &s2 = v.GetSeq(uSeqIndex2);
\r
222 const unsigned uSeqLength2 = s2.Length();
\r
224 SeqToLetters(s2, Seq2Letters);
\r
226 unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1,
\r
227 KmerCounts, Seq2Letters, uSeqLength2);
\r
229 unsigned uMinLength = MIN(uSeqLength1, uSeqLength2);
\r
230 double F = (double) uCommonKmerCount / (uMinLength - K + 1);
\r
233 double Y = log(0.02 + F);
\r
234 double EstimatedPctId = Y/4.12 + 0.995;
\r
235 double KD = KimuraDist(EstimatedPctId);
\r
236 // DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD);
\r
237 DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F));
\r
239 Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n",
\r
240 uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD);
\r
245 delete[] Seq1Letters;
\r
246 delete[] Seq2Letters;
\r