Next version of JABA
[jabaws.git] / binaries / src / muscle / fastdistnuc.cpp
1 #include "muscle.h"\r
2 #include "distfunc.h"\r
3 #include "seqvect.h"\r
4 #include <math.h>\r
5 \r
6 #define TRACE 0\r
7 \r
8 #define MIN(x, y)       (((x) < (y)) ? (x) : (y))\r
9 #define MAX(x, y)       (((x) > (y)) ? (x) : (y))\r
10 \r
11 const unsigned TUPLE_COUNT = 6*6*6*6*6*6;\r
12 static unsigned char Count1[TUPLE_COUNT];\r
13 static unsigned char Count2[TUPLE_COUNT];\r
14 \r
15 // Nucleotide groups according to MAFFT (sextet5)\r
16 // 0 =  A\r
17 // 1 =  C\r
18 // 2 =  G\r
19 // 3 =  T\r
20 // 4 =  other\r
21 \r
22 static unsigned ResidueGroup[] =\r
23         {\r
24         0,              // NX_A,\r
25         1,              // NX_C,\r
26         2,              // NX_G,\r
27         3,              // NX_T/U\r
28         4,              // NX_N,\r
29         4,              // NX_R,\r
30         4,              // NX_Y,\r
31         4,              // NX_GAP\r
32         };\r
33 static unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);\r
34 \r
35 static char *TupleToStr(int t)\r
36         {\r
37         static char s[7];\r
38         int t1, t2, t3, t4, t5, t6;\r
39 \r
40         t1 = t%6;\r
41         t2 = (t/6)%6;\r
42         t3 = (t/(6*6))%6;\r
43         t4 = (t/(6*6*6))%6;\r
44         t5 = (t/(6*6*6*6))%6;\r
45         t6 = (t/(6*6*6*6*6))%6;\r
46 \r
47         s[5] = '0' + t1;\r
48         s[4] = '0' + t2;\r
49         s[3] = '0' + t3;\r
50         s[2] = '0' + t4;\r
51         s[1] = '0' + t5;\r
52         s[0] = '0' + t6;\r
53         return s;\r
54         }\r
55 \r
56 static unsigned GetTuple(const unsigned uLetters[], unsigned n)\r
57         {\r
58         assert(uLetters[n] < uResidueGroupCount);\r
59         assert(uLetters[n+1] < uResidueGroupCount);\r
60         assert(uLetters[n+2] < uResidueGroupCount);\r
61         assert(uLetters[n+3] < uResidueGroupCount);\r
62         assert(uLetters[n+4] < uResidueGroupCount);\r
63         assert(uLetters[n+5] < uResidueGroupCount);\r
64 \r
65         unsigned u1 = ResidueGroup[uLetters[n]];\r
66         unsigned u2 = ResidueGroup[uLetters[n+1]];\r
67         unsigned u3 = ResidueGroup[uLetters[n+2]];\r
68         unsigned u4 = ResidueGroup[uLetters[n+3]];\r
69         unsigned u5 = ResidueGroup[uLetters[n+4]];\r
70         unsigned u6 = ResidueGroup[uLetters[n+5]];\r
71 \r
72         return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6;\r
73         }\r
74 \r
75 static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[])\r
76         {\r
77         memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));\r
78         for (unsigned n = 0; n < uTupleCount; ++n)\r
79                 {\r
80                 const unsigned uTuple = GetTuple(L, n);\r
81                 ++(Count[uTuple]);\r
82                 }\r
83         }\r
84 \r
85 static void ListCount(const unsigned char Count[])\r
86         {\r
87         for (unsigned n = 0; n < TUPLE_COUNT; ++n)\r
88                 {\r
89                 if (0 == Count[n])\r
90                         continue;\r
91                 Log("%s  %u\n", TupleToStr(n), Count[n]);\r
92                 }\r
93         }\r
94 \r
95 void DistKmer4_6(const SeqVect &v, DistFunc &DF)\r
96         {\r
97         if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha)\r
98                 Quit("DistKmer4_6 requires nucleo alphabet");\r
99 \r
100         const unsigned uSeqCount = v.Length();\r
101 \r
102         DF.SetCount(uSeqCount);\r
103         if (0 == uSeqCount)\r
104                 return;\r
105 \r
106 // Initialize distance matrix to zero\r
107         for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
108                 {\r
109                 DF.SetDist(uSeq1, uSeq1, 0);\r
110                 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
111                         DF.SetDist(uSeq1, uSeq2, 0);\r
112                 }\r
113 \r
114 // Convert to letters\r
115         unsigned **Letters = new unsigned *[uSeqCount];\r
116         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
117                 {\r
118                 Seq &s = *(v[uSeqIndex]);\r
119                 const unsigned uSeqLength = s.Length();\r
120                 unsigned *L = new unsigned[uSeqLength];\r
121                 Letters[uSeqIndex] = L;\r
122                 for (unsigned n = 0; n < uSeqLength; ++n)\r
123                         {\r
124                         char c = s[n];\r
125                         L[n] = CharToLetterEx(c);\r
126                         if (L[n] >= 4)\r
127                                 L[n] = 4;\r
128                         }\r
129                 }\r
130 \r
131         unsigned **uCommonTupleCount = new unsigned *[uSeqCount];\r
132         for (unsigned n = 0; n < uSeqCount; ++n)\r
133                 {\r
134                 uCommonTupleCount[n] = new unsigned[uSeqCount];\r
135                 memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));\r
136                 }\r
137 \r
138         const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;\r
139         unsigned uCount = 0;\r
140         for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
141                 {\r
142                 Seq &seq1 = *(v[uSeq1]);\r
143                 const unsigned uSeqLength1 = seq1.Length();\r
144                 if (uSeqLength1 < 5)\r
145                         continue;\r
146 \r
147                 const unsigned uTupleCount = uSeqLength1 - 5;\r
148                 const unsigned *L = Letters[uSeq1];\r
149                 CountTuples(L, uTupleCount, Count1);\r
150 #if     TRACE\r
151                 {\r
152                 Log("Seq1=%d\n", uSeq1);\r
153                 Log("Groups:\n");\r
154                 for (unsigned n = 0; n < uSeqLength1; ++n)\r
155                         Log("%u", ResidueGroup[L[n]]);\r
156                 Log("\n");\r
157 \r
158                 Log("Tuples:\n");\r
159                 ListCount(Count1);\r
160                 }\r
161 #endif\r
162 \r
163                 SetProgressDesc("K-mer dist pass 1");\r
164                 for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)\r
165                         {\r
166                         if (0 == uCount%500)\r
167                                 Progress(uCount, uPairCount);\r
168                         ++uCount;\r
169                         Seq &seq2 = *(v[uSeq2]);\r
170                         const unsigned uSeqLength2 = seq2.Length();\r
171                         if (uSeqLength2 < 5)\r
172                                 {\r
173                                 if (uSeq1 == uSeq2)\r
174                                         DF.SetDist(uSeq1, uSeq2, 0);\r
175                                 else\r
176                                         DF.SetDist(uSeq1, uSeq2, 1);\r
177                                 continue;\r
178                                 }\r
179 \r
180                 // First pass through seq 2 to count tuples\r
181                         const unsigned uTupleCount = uSeqLength2 - 5;\r
182                         const unsigned *L = Letters[uSeq2];\r
183                         CountTuples(L, uTupleCount, Count2);\r
184 #if     TRACE\r
185                         Log("Seq2=%d Counts=\n", uSeq2);\r
186                         ListCount(Count2);\r
187 #endif\r
188 \r
189                 // Second pass to accumulate sum of shared tuples\r
190                 // MAFFT defines this as the sum over unique tuples\r
191                 // in seq2 of the minimum of the number of tuples found\r
192                 // in the two sequences.\r
193                         unsigned uSum = 0;\r
194                         for (unsigned n = 0; n < uTupleCount; ++n)\r
195                                 {\r
196                                 const unsigned uTuple = GetTuple(L, n);\r
197                                 uSum += MIN(Count1[uTuple], Count2[uTuple]);\r
198 \r
199                         // This is a hack to make sure each unique tuple counted only once.\r
200                                 Count2[uTuple] = 0;\r
201                                 }\r
202 #if     TRACE\r
203                         {\r
204                         Seq &s1 = *(v[uSeq1]);\r
205                         Seq &s2 = *(v[uSeq2]);\r
206                         const char *pName1 = s1.GetName();\r
207                         const char *pName2 = s2.GetName();\r
208                         Log("Common count %s(%d) - %s(%d) =%u\n",\r
209                           pName1, uSeq1, pName2, uSeq2, uSum);\r
210                         }\r
211 #endif\r
212                         uCommonTupleCount[uSeq1][uSeq2] = uSum;\r
213                         uCommonTupleCount[uSeq2][uSeq1] = uSum;\r
214                         }\r
215                 }\r
216         ProgressStepsDone();\r
217 \r
218         uCount = 0;\r
219         SetProgressDesc("K-mer dist pass 2");\r
220         for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
221                 {\r
222                 Seq &s1 = *(v[uSeq1]);\r
223                 const char *pName1 = s1.GetName();\r
224 \r
225                 double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];\r
226                 if (0 == dCommonTupleCount11)\r
227                         dCommonTupleCount11 = 1;\r
228 \r
229                 DF.SetDist(uSeq1, uSeq1, 0);\r
230                 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
231                         {\r
232                         if (0 == uCount%500)\r
233                                 Progress(uCount, uPairCount);\r
234                         ++uCount;\r
235 \r
236                         double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];\r
237                         if (0 == dCommonTupleCount22)\r
238                                 dCommonTupleCount22 = 1;\r
239 \r
240                         const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])\r
241                           /dCommonTupleCount11;\r
242                         const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])\r
243                           /dCommonTupleCount22;\r
244 \r
245                 // dMinDist is the value used for tree-building in MAFFT\r
246                         const double dMinDist = MIN(dDist1, dDist2);\r
247                         DF.SetDist(uSeq1, uSeq2, (float) dMinDist);\r
248 \r
249                         //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);\r
250                         //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);\r
251                 // **** TODO **** why does this make score slightly worse??\r
252                         //const double dKimuraDist = KimuraDist(dEstimatedPctId);\r
253                         //DF.SetDist(uSeq1, uSeq2, dKimuraDist);\r
254                         }\r
255                 }\r
256         ProgressStepsDone();\r
257 \r
258         for (unsigned n = 0; n < uSeqCount; ++n)\r
259                 {\r
260                 delete[] uCommonTupleCount[n];\r
261                 delete[] Letters[n];\r
262                 }\r
263         delete[] uCommonTupleCount;\r
264         delete[] Letters;\r
265         }\r