2 #include "distfunc.h"
\r
6 #define MIN(x, y) ((x) < (y) ? (x) : (y))
\r
8 static void SetKmerBitVector(const Seq &s, byte Bits[])
\r
10 const unsigned uLength = s.Length();
\r
11 const unsigned k = 3; // kmer length
\r
15 for (unsigned j = 0; j < k - 1; ++j)
\r
17 unsigned x = CharToLetterEx(s[i++]);
\r
26 for ( ; i < uLength; ++i)
\r
28 unsigned x = CharToLetterEx(s[i++]);
\r
30 c = (c*20 + x)%8000;
\r
38 unsigned ByteOffset = c/8;
\r
39 unsigned BitOffset = c%8;
\r
40 Bits[ByteOffset] |= (1 << BitOffset);
\r
45 static unsigned CommonBitCount(const byte Bits1[], const byte Bits2[])
\r
47 const byte * const p1end = Bits1 + 1000;
\r
48 const byte *p2 = Bits2;
\r
50 unsigned uCount = 0;
\r
51 for (const byte *p1 = Bits1; p1 != p1end; ++p1)
\r
53 // Here is a cute trick for efficiently counting the
\r
54 // bits common between two bytes by combining them into
\r
56 unsigned b = *p1 | (*p2 << 8);
\r
68 void DistKbit20_3(const SeqVect &v, DistFunc &DF)
\r
70 const unsigned uSeqCount = v.Length();
\r
71 DF.SetCount(uSeqCount);
\r
73 // There are 20^3 = 8,000 distinct kmers in the 20-letter alphabet.
\r
74 // For each sequence, we create a bit vector of length 8,000, i.e.
\r
75 // 1,000 bytes, having one bit per kmer. The bit is set to 1 if the
\r
76 // kmer is present in the sequence.
\r
77 const unsigned uBytes = uSeqCount*1000;
\r
78 byte *BitVector = new byte[uBytes];
\r
79 memset(BitVector, 0, uBytes);
\r
81 SetProgressDesc("K-bit distance matrix");
\r
82 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
83 SetKmerBitVector(*v[uSeqIndex], BitVector + uSeqIndex*1000);
\r
86 const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;
\r
87 for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
\r
89 const byte *Bits1 = BitVector + uSeqIndex1*1000;
\r
90 const unsigned uLength1 = v[uSeqIndex1]->Length();
\r
91 for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2)
\r
93 const byte *Bits2 = BitVector + uSeqIndex2*1000;
\r
94 const unsigned uLength2 = v[uSeqIndex2]->Length();
\r
95 const float fCount = (float) CommonBitCount(Bits1, Bits2);
\r
97 // Distance measure = K / min(L1, L2)
\r
98 // K is number of distinct kmers that are found in both sequences
\r
99 const float fDist = fCount / MIN(uLength1, uLength2);
\r
100 DF.SetDist(uSeqIndex1, uSeqIndex2, fDist);
\r
101 if (uDone%10000 == 0)
\r
102 Progress(uDone, uTotal);
\r
106 ProgressStepsDone();
\r
108 delete[] BitVector;
\r