2 #include "distfunc.h"
\r
8 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
\r
9 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
\r
11 const unsigned TUPLE_COUNT = 6*6*6*6*6*6;
\r
12 static unsigned char Count1[TUPLE_COUNT];
\r
13 static unsigned char Count2[TUPLE_COUNT];
\r
15 // Amino acid groups according to MAFFT (sextet5)
\r
23 unsigned ResidueGroup[] =
\r
46 2, // AX_B, // D or N
\r
47 2, // AX_Z, // E or Q
\r
48 0, // AX_X, // Unknown // ******** TODO *************
\r
49 // This isn't the correct way of avoiding group 6
\r
50 0 // AX_GAP, // ******** TODO ******************
\r
52 unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);
\r
54 static char *TupleToStr(int t)
\r
57 int t1, t2, t3, t4, t5, t6;
\r
63 t5 = (t/(6*6*6*6))%6;
\r
64 t6 = (t/(6*6*6*6*6))%6;
\r
75 static unsigned GetTuple(const unsigned uLetters[], unsigned n)
\r
77 assert(uLetters[n] < uResidueGroupCount);
\r
78 assert(uLetters[n+1] < uResidueGroupCount);
\r
79 assert(uLetters[n+2] < uResidueGroupCount);
\r
80 assert(uLetters[n+3] < uResidueGroupCount);
\r
81 assert(uLetters[n+4] < uResidueGroupCount);
\r
82 assert(uLetters[n+5] < uResidueGroupCount);
\r
84 unsigned u1 = ResidueGroup[uLetters[n]];
\r
85 unsigned u2 = ResidueGroup[uLetters[n+1]];
\r
86 unsigned u3 = ResidueGroup[uLetters[n+2]];
\r
87 unsigned u4 = ResidueGroup[uLetters[n+3]];
\r
88 unsigned u5 = ResidueGroup[uLetters[n+4]];
\r
89 unsigned u6 = ResidueGroup[uLetters[n+5]];
\r
91 return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6;
\r
94 static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[])
\r
96 memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));
\r
97 for (unsigned n = 0; n < uTupleCount; ++n)
\r
99 const unsigned uTuple = GetTuple(L, n);
\r
104 static void ListCount(const unsigned char Count[])
\r
106 for (unsigned n = 0; n < TUPLE_COUNT; ++n)
\r
110 Log("%s %u\n", TupleToStr(n), Count[n]);
\r
114 void DistKmer6_6(const SeqVect &v, DistFunc &DF)
\r
116 const unsigned uSeqCount = v.Length();
\r
118 DF.SetCount(uSeqCount);
\r
119 if (0 == uSeqCount)
\r
122 // Initialize distance matrix to zero
\r
123 for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
\r
125 DF.SetDist(uSeq1, uSeq1, 0);
\r
126 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
\r
127 DF.SetDist(uSeq1, uSeq2, 0);
\r
130 // Convert to letters
\r
131 unsigned **Letters = new unsigned *[uSeqCount];
\r
132 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
134 Seq &s = *(v[uSeqIndex]);
\r
135 const unsigned uSeqLength = s.Length();
\r
136 unsigned *L = new unsigned[uSeqLength];
\r
137 Letters[uSeqIndex] = L;
\r
138 for (unsigned n = 0; n < uSeqLength; ++n)
\r
141 L[n] = CharToLetterEx(c);
\r
142 assert(L[n] < uResidueGroupCount);
\r
146 unsigned **uCommonTupleCount = new unsigned *[uSeqCount];
\r
147 for (unsigned n = 0; n < uSeqCount; ++n)
\r
149 uCommonTupleCount[n] = new unsigned[uSeqCount];
\r
150 memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));
\r
153 const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
\r
154 unsigned uCount = 0;
\r
155 for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
\r
157 Seq &seq1 = *(v[uSeq1]);
\r
158 const unsigned uSeqLength1 = seq1.Length();
\r
159 if (uSeqLength1 < 5)
\r
162 const unsigned uTupleCount = uSeqLength1 - 5;
\r
163 const unsigned *L = Letters[uSeq1];
\r
164 CountTuples(L, uTupleCount, Count1);
\r
167 Log("Seq1=%d\n", uSeq1);
\r
169 for (unsigned n = 0; n < uSeqLength1; ++n)
\r
170 Log("%u", ResidueGroup[L[n]]);
\r
178 SetProgressDesc("K-mer dist pass 1");
\r
179 for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)
\r
181 if (0 == uCount%500)
\r
182 Progress(uCount, uPairCount);
\r
184 Seq &seq2 = *(v[uSeq2]);
\r
185 const unsigned uSeqLength2 = seq2.Length();
\r
186 if (uSeqLength2 < 5)
\r
188 if (uSeq1 == uSeq2)
\r
189 DF.SetDist(uSeq1, uSeq2, 0);
\r
191 DF.SetDist(uSeq1, uSeq2, 1);
\r
195 // First pass through seq 2 to count tuples
\r
196 const unsigned uTupleCount = uSeqLength2 - 5;
\r
197 const unsigned *L = Letters[uSeq2];
\r
198 CountTuples(L, uTupleCount, Count2);
\r
200 Log("Seq2=%d Counts=\n", uSeq2);
\r
204 // Second pass to accumulate sum of shared tuples
\r
205 // MAFFT defines this as the sum over unique tuples
\r
206 // in seq2 of the minimum of the number of tuples found
\r
207 // in the two sequences.
\r
209 for (unsigned n = 0; n < uTupleCount; ++n)
\r
211 const unsigned uTuple = GetTuple(L, n);
\r
212 uSum += MIN(Count1[uTuple], Count2[uTuple]);
\r
214 // This is a hack to make sure each unique tuple counted only once.
\r
215 Count2[uTuple] = 0;
\r
219 Seq &s1 = *(v[uSeq1]);
\r
220 Seq &s2 = *(v[uSeq2]);
\r
221 const char *pName1 = s1.GetName();
\r
222 const char *pName2 = s2.GetName();
\r
223 Log("Common count %s(%d) - %s(%d) =%u\n",
\r
224 pName1, uSeq1, pName2, uSeq2, uSum);
\r
227 uCommonTupleCount[uSeq1][uSeq2] = uSum;
\r
228 uCommonTupleCount[uSeq2][uSeq1] = uSum;
\r
231 ProgressStepsDone();
\r
234 SetProgressDesc("K-mer dist pass 2");
\r
235 for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
\r
237 Seq &s1 = *(v[uSeq1]);
\r
238 const char *pName1 = s1.GetName();
\r
240 double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];
\r
241 if (0 == dCommonTupleCount11)
\r
242 dCommonTupleCount11 = 1;
\r
244 DF.SetDist(uSeq1, uSeq1, 0);
\r
245 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
\r
247 if (0 == uCount%500)
\r
248 Progress(uCount, uPairCount);
\r
251 double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];
\r
252 if (0 == dCommonTupleCount22)
\r
253 dCommonTupleCount22 = 1;
\r
255 const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])
\r
256 /dCommonTupleCount11;
\r
257 const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])
\r
258 /dCommonTupleCount22;
\r
260 // dMinDist is the value used for tree-building in MAFFT
\r
261 const double dMinDist = MIN(dDist1, dDist2);
\r
262 DF.SetDist(uSeq1, uSeq2, (float) dMinDist);
\r
264 //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);
\r
265 //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);
\r
266 // **** TODO **** why does this make score slightly worse??
\r
267 //const double dKimuraDist = KimuraDist(dEstimatedPctId);
\r
268 //DF.SetDist(uSeq1, uSeq2, dKimuraDist);
\r
271 ProgressStepsDone();
\r
273 for (unsigned n = 0; n < uSeqCount; ++n)
\r
274 delete[] uCommonTupleCount[n];
\r
275 delete[] uCommonTupleCount;
\r
279 double PctIdToMAFFTDist(double dPctId)
\r
283 double dDist = -log(dPctId);
\r
287 double PctIdToHeightMAFFT(double dPctId)
\r
289 return PctIdToMAFFTDist(dPctId);
\r