2 #include "distfunc.h"
\r
6 const unsigned TRIPLE_COUNT = 20*20*20;
\r
10 unsigned m_uSeqCount; // How many sequences have this triple?
\r
11 unsigned short *m_Counts; // m_Counts[s] = nr of times triple found in seq s
\r
13 static TripleCount *TripleCounts;
\r
15 // WARNING: Sequences MUST be stripped of gaps and upper case!
\r
16 void DistKmer20_3(const SeqVect &v, DistFunc &DF)
\r
18 const unsigned uSeqCount = v.Length();
\r
20 DF.SetCount(uSeqCount);
\r
23 for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
\r
25 DF.SetDist(uSeq1, uSeq1, 0);
\r
26 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
\r
27 DF.SetDist(uSeq1, uSeq2, 0);
\r
30 const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);
\r
31 TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);
\r
32 if (0 == TripleCounts)
\r
33 Quit("Not enough memory (TripleCounts)");
\r
34 memset(TripleCounts, 0, uTripleArrayBytes);
\r
36 for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
\r
38 TripleCount &tc = *(TripleCounts + uWord);
\r
39 const unsigned uBytes = uSeqCount*sizeof(short);
\r
40 tc.m_Counts = (unsigned short *) malloc(uBytes);
\r
41 memset(tc.m_Counts, 0, uBytes);
\r
44 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
46 Seq &s = *(v[uSeqIndex]);
\r
47 const unsigned uSeqLength = s.Length();
\r
48 for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos)
\r
50 const unsigned uLetter1 = CharToLetterEx(s[uPos]);
\r
53 const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);
\r
56 const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);
\r
60 const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;
\r
61 assert(uWord < TRIPLE_COUNT);
\r
63 TripleCount &tc = *(TripleCounts + uWord);
\r
64 const unsigned uOldCount = tc.m_Counts[uSeqIndex];
\r
68 ++(tc.m_Counts[uSeqIndex]);
\r
74 Log("TripleCounts\n");
\r
75 unsigned uGrandTotal = 0;
\r
76 for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
\r
78 const TripleCount &tc = *(TripleCounts + uWord);
\r
79 if (0 == tc.m_uSeqCount)
\r
82 const unsigned uLetter3 = uWord/(20*20);
\r
83 const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;
\r
84 const unsigned uLetter1 = uWord%20;
\r
85 Log("Word %6u %c%c%c %6u",
\r
87 LetterToCharAmino(uLetter1),
\r
88 LetterToCharAmino(uLetter2),
\r
89 LetterToCharAmino(uLetter3),
\r
92 unsigned uSeqCountWithThisWord = 0;
\r
93 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
95 const unsigned uCount = tc.m_Counts[uSeqIndex];
\r
98 ++uSeqCountWithThisWord;
\r
99 Log(" %u=%u", uSeqIndex, uCount);
\r
100 uGrandTotal += uCount;
\r
103 if (uSeqCountWithThisWord != tc.m_uSeqCount)
\r
104 Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);
\r
108 unsigned uTotalBySeqLength = 0;
\r
109 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
111 Seq &s = *(v[uSeqIndex]);
\r
112 const unsigned uSeqLength = s.Length();
\r
113 uTotalBySeqLength += uSeqLength - 2;
\r
115 if (uGrandTotal != uTotalBySeqLength)
\r
116 Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);
\r
120 const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);
\r
121 unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes);
\r
123 for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
\r
125 const TripleCount &tc = *(TripleCounts + uWord);
\r
126 if (0 == tc.m_uSeqCount)
\r
129 unsigned uSeqCountFound = 0;
\r
130 memset(SeqList, 0, uSeqListBytes);
\r
132 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
134 if (tc.m_Counts[uSeqIndex] > 0)
\r
136 SeqList[uSeqCountFound] = uSeqIndex;
\r
138 if (uSeqCountFound == tc.m_uSeqCount)
\r
142 assert(uSeqCountFound == tc.m_uSeqCount);
\r
144 for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)
\r
146 const unsigned uSeqIndex1 = SeqList[uSeq1];
\r
147 const unsigned uCount1 = tc.m_Counts[uSeqIndex1];
\r
148 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
\r
150 const unsigned uSeqIndex2 = SeqList[uSeq2];
\r
151 const unsigned uCount2 = tc.m_Counts[uSeqIndex2];
\r
152 const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;
\r
153 const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);
\r
154 DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));
\r
159 free(TripleCounts);
\r
161 unsigned uDone = 0;
\r
162 const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;
\r
163 for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
\r
165 DF.SetDist(uSeq1, uSeq1, 0.0);
\r
167 const Seq &s1 = *(v[uSeq1]);
\r
168 const unsigned uLength1 = s1.Length();
\r
170 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
\r
172 const Seq &s2 = *(v[uSeq2]);
\r
173 const unsigned uLength2 = s2.Length();
\r
174 unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;
\r
175 if (uMinLength < 3)
\r
177 DF.SetDist(uSeq1, uSeq2, 1.0);
\r
181 const double dTripleCount = DF.GetDist(uSeq1, uSeq2);
\r
182 if (dTripleCount == 0)
\r
184 DF.SetDist(uSeq1, uSeq2, 1.0);
\r
187 double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);
\r
188 //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore));
\r
189 //if (dEstimatedPairwiseIdentity > 1)
\r
190 // dEstimatedPairwiseIdentity = 1;
\r
191 // DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity));
\r
192 DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);
\r
196 Log("%s - %s Triplet count = %g Lengths %u, %u Estimated pwid = %g\n",
\r
197 s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,
\r
198 dEstimatedPairwiseIdentity);
\r
201 if (uDone%1000 == 0)
\r
202 Progress(uDone, uTotal);
\r
205 ProgressStepsDone();
\r