Next version of JABA
[jabaws.git] / binaries / src / muscle / fastdistjones.cpp
1 #include "muscle.h"\r
2 #include "distfunc.h"\r
3 #include "seqvect.h"\r
4 #include <math.h>\r
5 \r
6 const unsigned TRIPLE_COUNT = 20*20*20;\r
7 \r
8 struct TripleCount\r
9         {\r
10         unsigned m_uSeqCount;                   // How many sequences have this triple?\r
11         unsigned short *m_Counts;               // m_Counts[s] = nr of times triple found in seq s\r
12         };\r
13 static TripleCount *TripleCounts;\r
14 \r
15 // WARNING: Sequences MUST be stripped of gaps and upper case!\r
16 void DistKmer20_3(const SeqVect &v, DistFunc &DF)\r
17         {\r
18         const unsigned uSeqCount = v.Length();\r
19 \r
20         DF.SetCount(uSeqCount);\r
21         if (0 == uSeqCount)\r
22                 return;\r
23         for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
24                 {\r
25                 DF.SetDist(uSeq1, uSeq1, 0);\r
26                 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
27                         DF.SetDist(uSeq1, uSeq2, 0);\r
28                 }\r
29 \r
30         const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);\r
31         TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);\r
32         if (0 == TripleCounts)\r
33                 Quit("Not enough memory (TripleCounts)");\r
34         memset(TripleCounts, 0, uTripleArrayBytes);\r
35 \r
36         for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
37                 {\r
38                 TripleCount &tc = *(TripleCounts + uWord);\r
39                 const unsigned uBytes = uSeqCount*sizeof(short);\r
40                 tc.m_Counts = (unsigned short *) malloc(uBytes);\r
41                 memset(tc.m_Counts, 0, uBytes);\r
42                 }\r
43 \r
44         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
45                 {\r
46                 Seq &s = *(v[uSeqIndex]);\r
47                 const unsigned uSeqLength = s.Length();\r
48                 for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos)\r
49                         {\r
50                         const unsigned uLetter1 = CharToLetterEx(s[uPos]);\r
51                         if (uLetter1 >= 20)\r
52                                 continue;\r
53                         const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);\r
54                         if (uLetter2 >= 20)\r
55                                 continue;\r
56                         const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);\r
57                         if (uLetter3 >= 20)\r
58                                 continue;\r
59 \r
60                         const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;\r
61                         assert(uWord < TRIPLE_COUNT);\r
62 \r
63                         TripleCount &tc = *(TripleCounts + uWord);\r
64                         const unsigned uOldCount = tc.m_Counts[uSeqIndex];\r
65                         if (0 == uOldCount)\r
66                                 ++(tc.m_uSeqCount);\r
67 \r
68                         ++(tc.m_Counts[uSeqIndex]);\r
69                         }\r
70                 }\r
71 \r
72 #if TRACE\r
73         {\r
74         Log("TripleCounts\n");\r
75         unsigned uGrandTotal = 0;\r
76         for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
77                 {\r
78                 const TripleCount &tc = *(TripleCounts + uWord);\r
79                 if (0 == tc.m_uSeqCount)\r
80                         continue;\r
81 \r
82                 const unsigned uLetter3 = uWord/(20*20);\r
83                 const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;\r
84                 const unsigned uLetter1 = uWord%20;\r
85                 Log("Word %6u %c%c%c   %6u",\r
86                   uWord,\r
87                   LetterToCharAmino(uLetter1),\r
88                   LetterToCharAmino(uLetter2),\r
89                   LetterToCharAmino(uLetter3),\r
90                   tc.m_uSeqCount);\r
91 \r
92                 unsigned uSeqCountWithThisWord = 0;\r
93                 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
94                         {\r
95                         const unsigned uCount = tc.m_Counts[uSeqIndex];\r
96                         if (uCount > 0)\r
97                                 {\r
98                                 ++uSeqCountWithThisWord;\r
99                                 Log(" %u=%u", uSeqIndex, uCount);\r
100                                 uGrandTotal += uCount;\r
101                                 }\r
102                         }\r
103                 if (uSeqCountWithThisWord != tc.m_uSeqCount)\r
104                         Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);\r
105                 Log("\n");\r
106                 }\r
107         \r
108         unsigned uTotalBySeqLength = 0;\r
109         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
110                 {\r
111                 Seq &s = *(v[uSeqIndex]);\r
112                 const unsigned uSeqLength = s.Length();\r
113                 uTotalBySeqLength += uSeqLength - 2;\r
114                 }\r
115         if (uGrandTotal != uTotalBySeqLength)\r
116                 Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);\r
117         }\r
118 #endif\r
119 \r
120         const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);\r
121         unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes);\r
122 \r
123         for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
124                 {\r
125                 const TripleCount &tc = *(TripleCounts + uWord);\r
126                 if (0 == tc.m_uSeqCount)\r
127                         continue;\r
128 \r
129                 unsigned uSeqCountFound = 0;\r
130                 memset(SeqList, 0, uSeqListBytes);\r
131 \r
132                 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
133                         {\r
134                         if (tc.m_Counts[uSeqIndex] > 0)\r
135                                 {\r
136                                 SeqList[uSeqCountFound] = uSeqIndex;\r
137                                 ++uSeqCountFound;\r
138                                 if (uSeqCountFound == tc.m_uSeqCount)\r
139                                         break;\r
140                                 }\r
141                         }\r
142                 assert(uSeqCountFound == tc.m_uSeqCount);\r
143 \r
144                 for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)\r
145                         {\r
146                         const unsigned uSeqIndex1 = SeqList[uSeq1];\r
147                         const unsigned uCount1 = tc.m_Counts[uSeqIndex1];\r
148                         for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
149                                 {\r
150                                 const unsigned uSeqIndex2 = SeqList[uSeq2];\r
151                                 const unsigned uCount2 = tc.m_Counts[uSeqIndex2];\r
152                                 const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;\r
153                                 const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);\r
154                                 DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));\r
155                                 }\r
156                         }\r
157                 }\r
158         delete[] SeqList;\r
159         free(TripleCounts);\r
160 \r
161         unsigned uDone = 0;\r
162         const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;\r
163         for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
164                 {\r
165                 DF.SetDist(uSeq1, uSeq1, 0.0);\r
166 \r
167                 const Seq &s1 = *(v[uSeq1]);\r
168                 const unsigned uLength1 = s1.Length();\r
169 \r
170                 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
171                         {\r
172                         const Seq &s2 = *(v[uSeq2]);\r
173                         const unsigned uLength2 = s2.Length();\r
174                         unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;\r
175                         if (uMinLength < 3)\r
176                                 {\r
177                                 DF.SetDist(uSeq1, uSeq2, 1.0);\r
178                                 continue;\r
179                                 }\r
180 \r
181                         const double dTripleCount = DF.GetDist(uSeq1, uSeq2);\r
182                         if (dTripleCount == 0)\r
183                                 {\r
184                                 DF.SetDist(uSeq1, uSeq2, 1.0);\r
185                                 continue;\r
186                                 }\r
187                         double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);\r
188                         //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore));\r
189                         //if (dEstimatedPairwiseIdentity > 1)\r
190                         //      dEstimatedPairwiseIdentity = 1;\r
191 //                      DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity));\r
192                         DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);\r
193 \r
194 #if     TRACE\r
195                         {\r
196                         Log("%s - %s  Triplet count = %g  Lengths %u, %u Estimated pwid = %g\n",\r
197                           s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,\r
198                           dEstimatedPairwiseIdentity);\r
199                         }\r
200 #endif\r
201                         if (uDone%1000 == 0)\r
202                                 Progress(uDone, uTotal);\r
203                         }\r
204                 }\r
205         ProgressStepsDone();\r
206         }\r