Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / muscle / fastdistmafft.cpp
1 #include "muscle.h"\r
2 #include "distfunc.h"\r
3 #include "seqvect.h"\r
4 #include <math.h>\r
5 \r
6 #define TRACE 0\r
7 \r
8 #define MIN(x, y)       (((x) < (y)) ? (x) : (y))\r
9 #define MAX(x, y)       (((x) > (y)) ? (x) : (y))\r
10 \r
11 const unsigned TUPLE_COUNT = 6*6*6*6*6*6;\r
12 static unsigned char Count1[TUPLE_COUNT];\r
13 static unsigned char Count2[TUPLE_COUNT];\r
14 \r
15 // Amino acid groups according to MAFFT (sextet5)\r
16 // 0 =  A G P S T\r
17 // 1 =  I L M V\r
18 // 2 =  N D Q E B Z\r
19 // 3 =  R H K\r
20 // 4 =  F W Y\r
21 // 5 =  C\r
22 // 6 =  X . - U\r
23 unsigned ResidueGroup[] =\r
24         {\r
25         0,              // AX_A,\r
26         5,              // AX_C,\r
27         2,              // AX_D,\r
28         2,              // AX_E,\r
29         4,              // AX_F,\r
30         0,              // AX_G,\r
31         3,              // AX_H,\r
32         1,              // AX_I,\r
33         3,              // AX_K,\r
34         1,              // AX_L,\r
35         1,              // AX_M,\r
36         2,              // AX_N,\r
37         0,              // AX_P,\r
38         2,              // AX_Q,\r
39         3,              // AX_R,\r
40         0,              // AX_S,\r
41         0,              // AX_T,\r
42         1,              // AX_V,\r
43         4,              // AX_W,\r
44         4,              // AX_Y,\r
45 \r
46         2,              // AX_B,        // D or N\r
47         2,              // AX_Z,        // E or Q\r
48         0,              // AX_X,        // Unknown              // ******** TODO *************\r
49                                                                                 // This isn't the correct way of avoiding group 6\r
50         0               // AX_GAP,                                      // ******** TODO ******************\r
51         };\r
52 unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);\r
53 \r
54 static char *TupleToStr(int t)\r
55         {\r
56         static char s[7];\r
57         int t1, t2, t3, t4, t5, t6;\r
58 \r
59         t1 = t%6;\r
60         t2 = (t/6)%6;\r
61         t3 = (t/(6*6))%6;\r
62         t4 = (t/(6*6*6))%6;\r
63         t5 = (t/(6*6*6*6))%6;\r
64         t6 = (t/(6*6*6*6*6))%6;\r
65 \r
66         s[5] = '0' + t1;\r
67         s[4] = '0' + t2;\r
68         s[3] = '0' + t3;\r
69         s[2] = '0' + t4;\r
70         s[1] = '0' + t5;\r
71         s[0] = '0' + t6;\r
72         return s;\r
73         }\r
74 \r
75 static unsigned GetTuple(const unsigned uLetters[], unsigned n)\r
76         {\r
77         assert(uLetters[n] < uResidueGroupCount);\r
78         assert(uLetters[n+1] < uResidueGroupCount);\r
79         assert(uLetters[n+2] < uResidueGroupCount);\r
80         assert(uLetters[n+3] < uResidueGroupCount);\r
81         assert(uLetters[n+4] < uResidueGroupCount);\r
82         assert(uLetters[n+5] < uResidueGroupCount);\r
83 \r
84         unsigned u1 = ResidueGroup[uLetters[n]];\r
85         unsigned u2 = ResidueGroup[uLetters[n+1]];\r
86         unsigned u3 = ResidueGroup[uLetters[n+2]];\r
87         unsigned u4 = ResidueGroup[uLetters[n+3]];\r
88         unsigned u5 = ResidueGroup[uLetters[n+4]];\r
89         unsigned u6 = ResidueGroup[uLetters[n+5]];\r
90 \r
91         return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6;\r
92         }\r
93 \r
94 static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[])\r
95         {\r
96         memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));\r
97         for (unsigned n = 0; n < uTupleCount; ++n)\r
98                 {\r
99                 const unsigned uTuple = GetTuple(L, n);\r
100                 ++(Count[uTuple]);\r
101                 }\r
102         }\r
103 \r
104 static void ListCount(const unsigned char Count[])\r
105         {\r
106         for (unsigned n = 0; n < TUPLE_COUNT; ++n)\r
107                 {\r
108                 if (0 == Count[n])\r
109                         continue;\r
110                 Log("%s  %u\n", TupleToStr(n), Count[n]);\r
111                 }\r
112         }\r
113 \r
114 void DistKmer6_6(const SeqVect &v, DistFunc &DF)\r
115         {\r
116         const unsigned uSeqCount = v.Length();\r
117 \r
118         DF.SetCount(uSeqCount);\r
119         if (0 == uSeqCount)\r
120                 return;\r
121 \r
122 // Initialize distance matrix to zero\r
123         for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
124                 {\r
125                 DF.SetDist(uSeq1, uSeq1, 0);\r
126                 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
127                         DF.SetDist(uSeq1, uSeq2, 0);\r
128                 }\r
129 \r
130 // Convert to letters\r
131         unsigned **Letters = new unsigned *[uSeqCount];\r
132         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
133                 {\r
134                 Seq &s = *(v[uSeqIndex]);\r
135                 const unsigned uSeqLength = s.Length();\r
136                 unsigned *L = new unsigned[uSeqLength];\r
137                 Letters[uSeqIndex] = L;\r
138                 for (unsigned n = 0; n < uSeqLength; ++n)\r
139                         {\r
140                         char c = s[n];\r
141                         L[n] = CharToLetterEx(c);\r
142                         assert(L[n] < uResidueGroupCount);\r
143                         }\r
144                 }\r
145 \r
146         unsigned **uCommonTupleCount = new unsigned *[uSeqCount];\r
147         for (unsigned n = 0; n < uSeqCount; ++n)\r
148                 {\r
149                 uCommonTupleCount[n] = new unsigned[uSeqCount];\r
150                 memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));\r
151                 }\r
152 \r
153         const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;\r
154         unsigned uCount = 0;\r
155         for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
156                 {\r
157                 Seq &seq1 = *(v[uSeq1]);\r
158                 const unsigned uSeqLength1 = seq1.Length();\r
159                 if (uSeqLength1 < 5)\r
160                         continue;\r
161 \r
162                 const unsigned uTupleCount = uSeqLength1 - 5;\r
163                 const unsigned *L = Letters[uSeq1];\r
164                 CountTuples(L, uTupleCount, Count1);\r
165 #if     TRACE\r
166                 {\r
167                 Log("Seq1=%d\n", uSeq1);\r
168                 Log("Groups:\n");\r
169                 for (unsigned n = 0; n < uSeqLength1; ++n)\r
170                         Log("%u", ResidueGroup[L[n]]);\r
171                 Log("\n");\r
172 \r
173                 Log("Tuples:\n");\r
174                 ListCount(Count1);\r
175                 }\r
176 #endif\r
177 \r
178                 SetProgressDesc("K-mer dist pass 1");\r
179                 for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)\r
180                         {\r
181                         if (0 == uCount%500)\r
182                                 Progress(uCount, uPairCount);\r
183                         ++uCount;\r
184                         Seq &seq2 = *(v[uSeq2]);\r
185                         const unsigned uSeqLength2 = seq2.Length();\r
186                         if (uSeqLength2 < 5)\r
187                                 {\r
188                                 if (uSeq1 == uSeq2)\r
189                                         DF.SetDist(uSeq1, uSeq2, 0);\r
190                                 else\r
191                                         DF.SetDist(uSeq1, uSeq2, 1);\r
192                                 continue;\r
193                                 }\r
194 \r
195                 // First pass through seq 2 to count tuples\r
196                         const unsigned uTupleCount = uSeqLength2 - 5;\r
197                         const unsigned *L = Letters[uSeq2];\r
198                         CountTuples(L, uTupleCount, Count2);\r
199 #if     TRACE\r
200                         Log("Seq2=%d Counts=\n", uSeq2);\r
201                         ListCount(Count2);\r
202 #endif\r
203 \r
204                 // Second pass to accumulate sum of shared tuples\r
205                 // MAFFT defines this as the sum over unique tuples\r
206                 // in seq2 of the minimum of the number of tuples found\r
207                 // in the two sequences.\r
208                         unsigned uSum = 0;\r
209                         for (unsigned n = 0; n < uTupleCount; ++n)\r
210                                 {\r
211                                 const unsigned uTuple = GetTuple(L, n);\r
212                                 uSum += MIN(Count1[uTuple], Count2[uTuple]);\r
213 \r
214                         // This is a hack to make sure each unique tuple counted only once.\r
215                                 Count2[uTuple] = 0;\r
216                                 }\r
217 #if     TRACE\r
218                         {\r
219                         Seq &s1 = *(v[uSeq1]);\r
220                         Seq &s2 = *(v[uSeq2]);\r
221                         const char *pName1 = s1.GetName();\r
222                         const char *pName2 = s2.GetName();\r
223                         Log("Common count %s(%d) - %s(%d) =%u\n",\r
224                           pName1, uSeq1, pName2, uSeq2, uSum);\r
225                         }\r
226 #endif\r
227                         uCommonTupleCount[uSeq1][uSeq2] = uSum;\r
228                         uCommonTupleCount[uSeq2][uSeq1] = uSum;\r
229                         }\r
230                 }\r
231         ProgressStepsDone();\r
232 \r
233         uCount = 0;\r
234         SetProgressDesc("K-mer dist pass 2");\r
235         for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
236                 {\r
237                 Seq &s1 = *(v[uSeq1]);\r
238                 const char *pName1 = s1.GetName();\r
239 \r
240                 double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];\r
241                 if (0 == dCommonTupleCount11)\r
242                         dCommonTupleCount11 = 1;\r
243 \r
244                 DF.SetDist(uSeq1, uSeq1, 0);\r
245                 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
246                         {\r
247                         if (0 == uCount%500)\r
248                                 Progress(uCount, uPairCount);\r
249                         ++uCount;\r
250 \r
251                         double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];\r
252                         if (0 == dCommonTupleCount22)\r
253                                 dCommonTupleCount22 = 1;\r
254 \r
255                         const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])\r
256                           /dCommonTupleCount11;\r
257                         const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])\r
258                           /dCommonTupleCount22;\r
259 \r
260                 // dMinDist is the value used for tree-building in MAFFT\r
261                         const double dMinDist = MIN(dDist1, dDist2);\r
262                         DF.SetDist(uSeq1, uSeq2, (float) dMinDist);\r
263 \r
264                         //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);\r
265                         //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);\r
266                 // **** TODO **** why does this make score slightly worse??\r
267                         //const double dKimuraDist = KimuraDist(dEstimatedPctId);\r
268                         //DF.SetDist(uSeq1, uSeq2, dKimuraDist);\r
269                         }\r
270                 }\r
271         ProgressStepsDone();\r
272 \r
273         for (unsigned n = 0; n < uSeqCount; ++n)\r
274                 delete[] uCommonTupleCount[n];\r
275         delete[] uCommonTupleCount;\r
276         delete[] Letters;\r
277         }\r
278 \r
279 double PctIdToMAFFTDist(double dPctId)\r
280         {\r
281         if (dPctId < 0.05)\r
282                 dPctId = 0.05;\r
283         double dDist = -log(dPctId);\r
284         return dDist;\r
285         }\r
286 \r
287 double PctIdToHeightMAFFT(double dPctId)\r
288         {\r
289         return PctIdToMAFFTDist(dPctId);\r
290         }\r