+++ /dev/null
-#include "muscle.h"\r
-#include "distfunc.h"\r
-#include "seqvect.h"\r
-#include <math.h>\r
-\r
-const unsigned TRIPLE_COUNT = 20*20*20;\r
-\r
-struct TripleCount\r
- {\r
- unsigned m_uSeqCount; // How many sequences have this triple?\r
- unsigned short *m_Counts; // m_Counts[s] = nr of times triple found in seq s\r
- };\r
-static TripleCount *TripleCounts;\r
-\r
-// WARNING: Sequences MUST be stripped of gaps and upper case!\r
-void DistKmer20_3(const SeqVect &v, DistFunc &DF)\r
- {\r
- const unsigned uSeqCount = v.Length();\r
-\r
- DF.SetCount(uSeqCount);\r
- if (0 == uSeqCount)\r
- return;\r
- for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
- {\r
- DF.SetDist(uSeq1, uSeq1, 0);\r
- for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
- DF.SetDist(uSeq1, uSeq2, 0);\r
- }\r
-\r
- const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);\r
- TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);\r
- if (0 == TripleCounts)\r
- Quit("Not enough memory (TripleCounts)");\r
- memset(TripleCounts, 0, uTripleArrayBytes);\r
-\r
- for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
- {\r
- TripleCount &tc = *(TripleCounts + uWord);\r
- const unsigned uBytes = uSeqCount*sizeof(short);\r
- tc.m_Counts = (unsigned short *) malloc(uBytes);\r
- memset(tc.m_Counts, 0, uBytes);\r
- }\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- Seq &s = *(v[uSeqIndex]);\r
- const unsigned uSeqLength = s.Length();\r
- for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos)\r
- {\r
- const unsigned uLetter1 = CharToLetterEx(s[uPos]);\r
- if (uLetter1 >= 20)\r
- continue;\r
- const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);\r
- if (uLetter2 >= 20)\r
- continue;\r
- const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);\r
- if (uLetter3 >= 20)\r
- continue;\r
-\r
- const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;\r
- assert(uWord < TRIPLE_COUNT);\r
-\r
- TripleCount &tc = *(TripleCounts + uWord);\r
- const unsigned uOldCount = tc.m_Counts[uSeqIndex];\r
- if (0 == uOldCount)\r
- ++(tc.m_uSeqCount);\r
-\r
- ++(tc.m_Counts[uSeqIndex]);\r
- }\r
- }\r
-\r
-#if TRACE\r
- {\r
- Log("TripleCounts\n");\r
- unsigned uGrandTotal = 0;\r
- for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
- {\r
- const TripleCount &tc = *(TripleCounts + uWord);\r
- if (0 == tc.m_uSeqCount)\r
- continue;\r
-\r
- const unsigned uLetter3 = uWord/(20*20);\r
- const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;\r
- const unsigned uLetter1 = uWord%20;\r
- Log("Word %6u %c%c%c %6u",\r
- uWord,\r
- LetterToCharAmino(uLetter1),\r
- LetterToCharAmino(uLetter2),\r
- LetterToCharAmino(uLetter3),\r
- tc.m_uSeqCount);\r
-\r
- unsigned uSeqCountWithThisWord = 0;\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- const unsigned uCount = tc.m_Counts[uSeqIndex];\r
- if (uCount > 0)\r
- {\r
- ++uSeqCountWithThisWord;\r
- Log(" %u=%u", uSeqIndex, uCount);\r
- uGrandTotal += uCount;\r
- }\r
- }\r
- if (uSeqCountWithThisWord != tc.m_uSeqCount)\r
- Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);\r
- Log("\n");\r
- }\r
- \r
- unsigned uTotalBySeqLength = 0;\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- Seq &s = *(v[uSeqIndex]);\r
- const unsigned uSeqLength = s.Length();\r
- uTotalBySeqLength += uSeqLength - 2;\r
- }\r
- if (uGrandTotal != uTotalBySeqLength)\r
- Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);\r
- }\r
-#endif\r
-\r
- const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);\r
- unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes);\r
-\r
- for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
- {\r
- const TripleCount &tc = *(TripleCounts + uWord);\r
- if (0 == tc.m_uSeqCount)\r
- continue;\r
-\r
- unsigned uSeqCountFound = 0;\r
- memset(SeqList, 0, uSeqListBytes);\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- if (tc.m_Counts[uSeqIndex] > 0)\r
- {\r
- SeqList[uSeqCountFound] = uSeqIndex;\r
- ++uSeqCountFound;\r
- if (uSeqCountFound == tc.m_uSeqCount)\r
- break;\r
- }\r
- }\r
- assert(uSeqCountFound == tc.m_uSeqCount);\r
-\r
- for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)\r
- {\r
- const unsigned uSeqIndex1 = SeqList[uSeq1];\r
- const unsigned uCount1 = tc.m_Counts[uSeqIndex1];\r
- for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
- {\r
- const unsigned uSeqIndex2 = SeqList[uSeq2];\r
- const unsigned uCount2 = tc.m_Counts[uSeqIndex2];\r
- const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;\r
- const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);\r
- DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));\r
- }\r
- }\r
- }\r
- delete[] SeqList;\r
- free(TripleCounts);\r
-\r
- unsigned uDone = 0;\r
- const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;\r
- for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
- {\r
- DF.SetDist(uSeq1, uSeq1, 0.0);\r
-\r
- const Seq &s1 = *(v[uSeq1]);\r
- const unsigned uLength1 = s1.Length();\r
-\r
- for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
- {\r
- const Seq &s2 = *(v[uSeq2]);\r
- const unsigned uLength2 = s2.Length();\r
- unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;\r
- if (uMinLength < 3)\r
- {\r
- DF.SetDist(uSeq1, uSeq2, 1.0);\r
- continue;\r
- }\r
-\r
- const double dTripleCount = DF.GetDist(uSeq1, uSeq2);\r
- if (dTripleCount == 0)\r
- {\r
- DF.SetDist(uSeq1, uSeq2, 1.0);\r
- continue;\r
- }\r
- double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);\r
- //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore));\r
- //if (dEstimatedPairwiseIdentity > 1)\r
- // dEstimatedPairwiseIdentity = 1;\r
-// DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity));\r
- DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);\r
-\r
-#if TRACE\r
- {\r
- Log("%s - %s Triplet count = %g Lengths %u, %u Estimated pwid = %g\n",\r
- s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,\r
- dEstimatedPairwiseIdentity);\r
- }\r
-#endif\r
- if (uDone%1000 == 0)\r
- Progress(uDone, uTotal);\r
- }\r
- }\r
- ProgressStepsDone();\r
- }\r