2 #include "distfunc.h"
\r
8 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
\r
9 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
\r
11 const unsigned TUPLE_COUNT = 6*6*6*6*6*6;
\r
12 static unsigned char Count1[TUPLE_COUNT];
\r
13 static unsigned char Count2[TUPLE_COUNT];
\r
15 // Nucleotide groups according to MAFFT (sextet5)
\r
22 static unsigned ResidueGroup[] =
\r
33 static unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);
\r
35 static char *TupleToStr(int t)
\r
38 int t1, t2, t3, t4, t5, t6;
\r
44 t5 = (t/(6*6*6*6))%6;
\r
45 t6 = (t/(6*6*6*6*6))%6;
\r
56 static unsigned GetTuple(const unsigned uLetters[], unsigned n)
\r
58 assert(uLetters[n] < uResidueGroupCount);
\r
59 assert(uLetters[n+1] < uResidueGroupCount);
\r
60 assert(uLetters[n+2] < uResidueGroupCount);
\r
61 assert(uLetters[n+3] < uResidueGroupCount);
\r
62 assert(uLetters[n+4] < uResidueGroupCount);
\r
63 assert(uLetters[n+5] < uResidueGroupCount);
\r
65 unsigned u1 = ResidueGroup[uLetters[n]];
\r
66 unsigned u2 = ResidueGroup[uLetters[n+1]];
\r
67 unsigned u3 = ResidueGroup[uLetters[n+2]];
\r
68 unsigned u4 = ResidueGroup[uLetters[n+3]];
\r
69 unsigned u5 = ResidueGroup[uLetters[n+4]];
\r
70 unsigned u6 = ResidueGroup[uLetters[n+5]];
\r
72 return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6;
\r
75 static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[])
\r
77 memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));
\r
78 for (unsigned n = 0; n < uTupleCount; ++n)
\r
80 const unsigned uTuple = GetTuple(L, n);
\r
85 static void ListCount(const unsigned char Count[])
\r
87 for (unsigned n = 0; n < TUPLE_COUNT; ++n)
\r
91 Log("%s %u\n", TupleToStr(n), Count[n]);
\r
95 void DistKmer4_6(const SeqVect &v, DistFunc &DF)
\r
97 if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha)
\r
98 Quit("DistKmer4_6 requires nucleo alphabet");
\r
100 const unsigned uSeqCount = v.Length();
\r
102 DF.SetCount(uSeqCount);
\r
103 if (0 == uSeqCount)
\r
106 // Initialize distance matrix to zero
\r
107 for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
\r
109 DF.SetDist(uSeq1, uSeq1, 0);
\r
110 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
\r
111 DF.SetDist(uSeq1, uSeq2, 0);
\r
114 // Convert to letters
\r
115 unsigned **Letters = new unsigned *[uSeqCount];
\r
116 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
118 Seq &s = *(v[uSeqIndex]);
\r
119 const unsigned uSeqLength = s.Length();
\r
120 unsigned *L = new unsigned[uSeqLength];
\r
121 Letters[uSeqIndex] = L;
\r
122 for (unsigned n = 0; n < uSeqLength; ++n)
\r
125 L[n] = CharToLetterEx(c);
\r
131 unsigned **uCommonTupleCount = new unsigned *[uSeqCount];
\r
132 for (unsigned n = 0; n < uSeqCount; ++n)
\r
134 uCommonTupleCount[n] = new unsigned[uSeqCount];
\r
135 memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));
\r
138 const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
\r
139 unsigned uCount = 0;
\r
140 for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
\r
142 Seq &seq1 = *(v[uSeq1]);
\r
143 const unsigned uSeqLength1 = seq1.Length();
\r
144 if (uSeqLength1 < 5)
\r
147 const unsigned uTupleCount = uSeqLength1 - 5;
\r
148 const unsigned *L = Letters[uSeq1];
\r
149 CountTuples(L, uTupleCount, Count1);
\r
152 Log("Seq1=%d\n", uSeq1);
\r
154 for (unsigned n = 0; n < uSeqLength1; ++n)
\r
155 Log("%u", ResidueGroup[L[n]]);
\r
163 SetProgressDesc("K-mer dist pass 1");
\r
164 for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)
\r
166 if (0 == uCount%500)
\r
167 Progress(uCount, uPairCount);
\r
169 Seq &seq2 = *(v[uSeq2]);
\r
170 const unsigned uSeqLength2 = seq2.Length();
\r
171 if (uSeqLength2 < 5)
\r
173 if (uSeq1 == uSeq2)
\r
174 DF.SetDist(uSeq1, uSeq2, 0);
\r
176 DF.SetDist(uSeq1, uSeq2, 1);
\r
180 // First pass through seq 2 to count tuples
\r
181 const unsigned uTupleCount = uSeqLength2 - 5;
\r
182 const unsigned *L = Letters[uSeq2];
\r
183 CountTuples(L, uTupleCount, Count2);
\r
185 Log("Seq2=%d Counts=\n", uSeq2);
\r
189 // Second pass to accumulate sum of shared tuples
\r
190 // MAFFT defines this as the sum over unique tuples
\r
191 // in seq2 of the minimum of the number of tuples found
\r
192 // in the two sequences.
\r
194 for (unsigned n = 0; n < uTupleCount; ++n)
\r
196 const unsigned uTuple = GetTuple(L, n);
\r
197 uSum += MIN(Count1[uTuple], Count2[uTuple]);
\r
199 // This is a hack to make sure each unique tuple counted only once.
\r
200 Count2[uTuple] = 0;
\r
204 Seq &s1 = *(v[uSeq1]);
\r
205 Seq &s2 = *(v[uSeq2]);
\r
206 const char *pName1 = s1.GetName();
\r
207 const char *pName2 = s2.GetName();
\r
208 Log("Common count %s(%d) - %s(%d) =%u\n",
\r
209 pName1, uSeq1, pName2, uSeq2, uSum);
\r
212 uCommonTupleCount[uSeq1][uSeq2] = uSum;
\r
213 uCommonTupleCount[uSeq2][uSeq1] = uSum;
\r
216 ProgressStepsDone();
\r
219 SetProgressDesc("K-mer dist pass 2");
\r
220 for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
\r
222 Seq &s1 = *(v[uSeq1]);
\r
223 const char *pName1 = s1.GetName();
\r
225 double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];
\r
226 if (0 == dCommonTupleCount11)
\r
227 dCommonTupleCount11 = 1;
\r
229 DF.SetDist(uSeq1, uSeq1, 0);
\r
230 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
\r
232 if (0 == uCount%500)
\r
233 Progress(uCount, uPairCount);
\r
236 double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];
\r
237 if (0 == dCommonTupleCount22)
\r
238 dCommonTupleCount22 = 1;
\r
240 const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])
\r
241 /dCommonTupleCount11;
\r
242 const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])
\r
243 /dCommonTupleCount22;
\r
245 // dMinDist is the value used for tree-building in MAFFT
\r
246 const double dMinDist = MIN(dDist1, dDist2);
\r
247 DF.SetDist(uSeq1, uSeq2, (float) dMinDist);
\r
249 //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);
\r
250 //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);
\r
251 // **** TODO **** why does this make score slightly worse??
\r
252 //const double dKimuraDist = KimuraDist(dEstimatedPctId);
\r
253 //DF.SetDist(uSeq1, uSeq2, dKimuraDist);
\r
256 ProgressStepsDone();
\r
258 for (unsigned n = 0; n < uSeqCount; ++n)
\r
260 delete[] uCommonTupleCount[n];
\r
261 delete[] Letters[n];
\r
263 delete[] uCommonTupleCount;
\r