Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / muscle / fastdistkbit.cpp
1 #include "muscle.h"\r
2 #include "distfunc.h"\r
3 #include "seqvect.h"\r
4 #include <math.h>\r
5 \r
6 #define MIN(x, y)       ((x) < (y) ? (x) : (y))\r
7 \r
8 static void SetKmerBitVector(const Seq &s, byte Bits[])\r
9         {\r
10         const unsigned uLength = s.Length();\r
11         const unsigned k = 3;   // kmer length\r
12         unsigned i = 0;\r
13         unsigned c = 0;\r
14         unsigned h = 0;\r
15         for (unsigned j = 0; j < k - 1; ++j)\r
16                 {\r
17                 unsigned x = CharToLetterEx(s[i++]);\r
18                 if (x <= AX_Y)\r
19                         c = c*20 + x;\r
20                 else\r
21                         {\r
22                         c = 0;\r
23                         h = j + 1;\r
24                         }\r
25                 }\r
26         for ( ; i < uLength; ++i)\r
27                 {\r
28                 unsigned x = CharToLetterEx(s[i++]);\r
29                 if (x <= AX_Y)\r
30                         c = (c*20 + x)%8000;\r
31                 else\r
32                         {\r
33                         c = 0;\r
34                         h = i + k;\r
35                         }\r
36                 if (i >= h)\r
37                         {\r
38                         unsigned ByteOffset = c/8;\r
39                         unsigned BitOffset = c%8;\r
40                         Bits[ByteOffset] |= (1 << BitOffset);\r
41                         }\r
42                 }\r
43         }\r
44 \r
45 static unsigned CommonBitCount(const byte Bits1[], const byte Bits2[])\r
46         {\r
47         const byte * const p1end = Bits1 + 1000;\r
48         const byte *p2 = Bits2;\r
49 \r
50         unsigned uCount = 0;\r
51         for (const byte *p1 = Bits1; p1 != p1end; ++p1)\r
52                 {\r
53         // Here is a cute trick for efficiently counting the\r
54         // bits common between two bytes by combining them into\r
55         // a single word.\r
56                 unsigned b = *p1 | (*p2 << 8);\r
57                 while (b != 0)\r
58                         {\r
59                         if (b & 0x101)\r
60                                 ++uCount;\r
61                         b >>= 1;\r
62                         }\r
63                 ++p2;\r
64                 }\r
65         return uCount;\r
66         }\r
67 \r
68 void DistKbit20_3(const SeqVect &v, DistFunc &DF)\r
69         {\r
70         const unsigned uSeqCount = v.Length();\r
71         DF.SetCount(uSeqCount);\r
72 \r
73 // There are 20^3 = 8,000 distinct kmers in the 20-letter alphabet.\r
74 // For each sequence, we create a bit vector of length 8,000, i.e.\r
75 // 1,000 bytes, having one bit per kmer. The bit is set to 1 if the\r
76 // kmer is present in the sequence.\r
77         const unsigned uBytes = uSeqCount*1000;\r
78         byte *BitVector = new byte[uBytes];\r
79         memset(BitVector, 0, uBytes);\r
80 \r
81         SetProgressDesc("K-bit distance matrix");\r
82         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
83                 SetKmerBitVector(*v[uSeqIndex], BitVector + uSeqIndex*1000);\r
84 \r
85         unsigned uDone = 0;\r
86         const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;\r
87         for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)\r
88                 {\r
89                 const byte *Bits1 = BitVector + uSeqIndex1*1000;\r
90                 const unsigned uLength1 = v[uSeqIndex1]->Length();\r
91                 for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2)\r
92                         {\r
93                         const byte *Bits2 = BitVector + uSeqIndex2*1000;\r
94                         const unsigned uLength2 = v[uSeqIndex2]->Length();\r
95                         const float fCount = (float) CommonBitCount(Bits1, Bits2);\r
96 \r
97                 // Distance measure = K / min(L1, L2)\r
98                 // K is number of distinct kmers that are found in both sequences\r
99                         const float fDist = fCount / MIN(uLength1, uLength2);\r
100                         DF.SetDist(uSeqIndex1, uSeqIndex2, fDist);\r
101                         if (uDone%10000 == 0)\r
102                                 Progress(uDone, uTotal);\r
103                         ++uDone;\r
104                         }\r
105                 }\r
106         ProgressStepsDone();\r
107 \r
108         delete[] BitVector;\r
109         }\r