--- /dev/null
+#include "muscle.h"\r
+#include "distfunc.h"\r
+#include "seqvect.h"\r
+#include <math.h>\r
+\r
+const unsigned TRIPLE_COUNT = 20*20*20;\r
+\r
+struct TripleCount\r
+ {\r
+ unsigned m_uSeqCount; // How many sequences have this triple?\r
+ unsigned short *m_Counts; // m_Counts[s] = nr of times triple found in seq s\r
+ };\r
+static TripleCount *TripleCounts;\r
+\r
+// WARNING: Sequences MUST be stripped of gaps and upper case!\r
+void DistKmer20_3(const SeqVect &v, DistFunc &DF)\r
+ {\r
+ const unsigned uSeqCount = v.Length();\r
+\r
+ DF.SetCount(uSeqCount);\r
+ if (0 == uSeqCount)\r
+ return;\r
+ for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
+ {\r
+ DF.SetDist(uSeq1, uSeq1, 0);\r
+ for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
+ DF.SetDist(uSeq1, uSeq2, 0);\r
+ }\r
+\r
+ const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);\r
+ TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);\r
+ if (0 == TripleCounts)\r
+ Quit("Not enough memory (TripleCounts)");\r
+ memset(TripleCounts, 0, uTripleArrayBytes);\r
+\r
+ for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
+ {\r
+ TripleCount &tc = *(TripleCounts + uWord);\r
+ const unsigned uBytes = uSeqCount*sizeof(short);\r
+ tc.m_Counts = (unsigned short *) malloc(uBytes);\r
+ memset(tc.m_Counts, 0, uBytes);\r
+ }\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ Seq &s = *(v[uSeqIndex]);\r
+ const unsigned uSeqLength = s.Length();\r
+ for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos)\r
+ {\r
+ const unsigned uLetter1 = CharToLetterEx(s[uPos]);\r
+ if (uLetter1 >= 20)\r
+ continue;\r
+ const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);\r
+ if (uLetter2 >= 20)\r
+ continue;\r
+ const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);\r
+ if (uLetter3 >= 20)\r
+ continue;\r
+\r
+ const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;\r
+ assert(uWord < TRIPLE_COUNT);\r
+\r
+ TripleCount &tc = *(TripleCounts + uWord);\r
+ const unsigned uOldCount = tc.m_Counts[uSeqIndex];\r
+ if (0 == uOldCount)\r
+ ++(tc.m_uSeqCount);\r
+\r
+ ++(tc.m_Counts[uSeqIndex]);\r
+ }\r
+ }\r
+\r
+#if TRACE\r
+ {\r
+ Log("TripleCounts\n");\r
+ unsigned uGrandTotal = 0;\r
+ for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
+ {\r
+ const TripleCount &tc = *(TripleCounts + uWord);\r
+ if (0 == tc.m_uSeqCount)\r
+ continue;\r
+\r
+ const unsigned uLetter3 = uWord/(20*20);\r
+ const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;\r
+ const unsigned uLetter1 = uWord%20;\r
+ Log("Word %6u %c%c%c %6u",\r
+ uWord,\r
+ LetterToCharAmino(uLetter1),\r
+ LetterToCharAmino(uLetter2),\r
+ LetterToCharAmino(uLetter3),\r
+ tc.m_uSeqCount);\r
+\r
+ unsigned uSeqCountWithThisWord = 0;\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ const unsigned uCount = tc.m_Counts[uSeqIndex];\r
+ if (uCount > 0)\r
+ {\r
+ ++uSeqCountWithThisWord;\r
+ Log(" %u=%u", uSeqIndex, uCount);\r
+ uGrandTotal += uCount;\r
+ }\r
+ }\r
+ if (uSeqCountWithThisWord != tc.m_uSeqCount)\r
+ Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);\r
+ Log("\n");\r
+ }\r
+ \r
+ unsigned uTotalBySeqLength = 0;\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ Seq &s = *(v[uSeqIndex]);\r
+ const unsigned uSeqLength = s.Length();\r
+ uTotalBySeqLength += uSeqLength - 2;\r
+ }\r
+ if (uGrandTotal != uTotalBySeqLength)\r
+ Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);\r
+ }\r
+#endif\r
+\r
+ const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);\r
+ unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes);\r
+\r
+ for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
+ {\r
+ const TripleCount &tc = *(TripleCounts + uWord);\r
+ if (0 == tc.m_uSeqCount)\r
+ continue;\r
+\r
+ unsigned uSeqCountFound = 0;\r
+ memset(SeqList, 0, uSeqListBytes);\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ if (tc.m_Counts[uSeqIndex] > 0)\r
+ {\r
+ SeqList[uSeqCountFound] = uSeqIndex;\r
+ ++uSeqCountFound;\r
+ if (uSeqCountFound == tc.m_uSeqCount)\r
+ break;\r
+ }\r
+ }\r
+ assert(uSeqCountFound == tc.m_uSeqCount);\r
+\r
+ for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)\r
+ {\r
+ const unsigned uSeqIndex1 = SeqList[uSeq1];\r
+ const unsigned uCount1 = tc.m_Counts[uSeqIndex1];\r
+ for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
+ {\r
+ const unsigned uSeqIndex2 = SeqList[uSeq2];\r
+ const unsigned uCount2 = tc.m_Counts[uSeqIndex2];\r
+ const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;\r
+ const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);\r
+ DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));\r
+ }\r
+ }\r
+ }\r
+ delete[] SeqList;\r
+ free(TripleCounts);\r
+\r
+ unsigned uDone = 0;\r
+ const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;\r
+ for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
+ {\r
+ DF.SetDist(uSeq1, uSeq1, 0.0);\r
+\r
+ const Seq &s1 = *(v[uSeq1]);\r
+ const unsigned uLength1 = s1.Length();\r
+\r
+ for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
+ {\r
+ const Seq &s2 = *(v[uSeq2]);\r
+ const unsigned uLength2 = s2.Length();\r
+ unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;\r
+ if (uMinLength < 3)\r
+ {\r
+ DF.SetDist(uSeq1, uSeq2, 1.0);\r
+ continue;\r
+ }\r
+\r
+ const double dTripleCount = DF.GetDist(uSeq1, uSeq2);\r
+ if (dTripleCount == 0)\r
+ {\r
+ DF.SetDist(uSeq1, uSeq2, 1.0);\r
+ continue;\r
+ }\r
+ double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);\r
+ //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore));\r
+ //if (dEstimatedPairwiseIdentity > 1)\r
+ // dEstimatedPairwiseIdentity = 1;\r
+// DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity));\r
+ DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);\r
+\r
+#if TRACE\r
+ {\r
+ Log("%s - %s Triplet count = %g Lengths %u, %u Estimated pwid = %g\n",\r
+ s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,\r
+ dEstimatedPairwiseIdentity);\r
+ }\r
+#endif\r
+ if (uDone%1000 == 0)\r
+ Progress(uDone, uTotal);\r
+ }\r
+ }\r
+ ProgressStepsDone();\r
+ }\r