Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / muscle / fastdistjones.cpp
diff --git a/website/archive/binaries/mac/src/muscle/fastdistjones.cpp b/website/archive/binaries/mac/src/muscle/fastdistjones.cpp
new file mode 100644 (file)
index 0000000..bacc485
--- /dev/null
@@ -0,0 +1,206 @@
+#include "muscle.h"\r
+#include "distfunc.h"\r
+#include "seqvect.h"\r
+#include <math.h>\r
+\r
+const unsigned TRIPLE_COUNT = 20*20*20;\r
+\r
+struct TripleCount\r
+       {\r
+       unsigned m_uSeqCount;                   // How many sequences have this triple?\r
+       unsigned short *m_Counts;               // m_Counts[s] = nr of times triple found in seq s\r
+       };\r
+static TripleCount *TripleCounts;\r
+\r
+// WARNING: Sequences MUST be stripped of gaps and upper case!\r
+void DistKmer20_3(const SeqVect &v, DistFunc &DF)\r
+       {\r
+       const unsigned uSeqCount = v.Length();\r
+\r
+       DF.SetCount(uSeqCount);\r
+       if (0 == uSeqCount)\r
+               return;\r
+       for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
+               {\r
+               DF.SetDist(uSeq1, uSeq1, 0);\r
+               for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
+                       DF.SetDist(uSeq1, uSeq2, 0);\r
+               }\r
+\r
+       const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);\r
+       TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);\r
+       if (0 == TripleCounts)\r
+               Quit("Not enough memory (TripleCounts)");\r
+       memset(TripleCounts, 0, uTripleArrayBytes);\r
+\r
+       for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
+               {\r
+               TripleCount &tc = *(TripleCounts + uWord);\r
+               const unsigned uBytes = uSeqCount*sizeof(short);\r
+               tc.m_Counts = (unsigned short *) malloc(uBytes);\r
+               memset(tc.m_Counts, 0, uBytes);\r
+               }\r
+\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               Seq &s = *(v[uSeqIndex]);\r
+               const unsigned uSeqLength = s.Length();\r
+               for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos)\r
+                       {\r
+                       const unsigned uLetter1 = CharToLetterEx(s[uPos]);\r
+                       if (uLetter1 >= 20)\r
+                               continue;\r
+                       const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);\r
+                       if (uLetter2 >= 20)\r
+                               continue;\r
+                       const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);\r
+                       if (uLetter3 >= 20)\r
+                               continue;\r
+\r
+                       const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;\r
+                       assert(uWord < TRIPLE_COUNT);\r
+\r
+                       TripleCount &tc = *(TripleCounts + uWord);\r
+                       const unsigned uOldCount = tc.m_Counts[uSeqIndex];\r
+                       if (0 == uOldCount)\r
+                               ++(tc.m_uSeqCount);\r
+\r
+                       ++(tc.m_Counts[uSeqIndex]);\r
+                       }\r
+               }\r
+\r
+#if TRACE\r
+       {\r
+       Log("TripleCounts\n");\r
+       unsigned uGrandTotal = 0;\r
+       for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
+               {\r
+               const TripleCount &tc = *(TripleCounts + uWord);\r
+               if (0 == tc.m_uSeqCount)\r
+                       continue;\r
+\r
+               const unsigned uLetter3 = uWord/(20*20);\r
+               const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;\r
+               const unsigned uLetter1 = uWord%20;\r
+               Log("Word %6u %c%c%c   %6u",\r
+                 uWord,\r
+                 LetterToCharAmino(uLetter1),\r
+                 LetterToCharAmino(uLetter2),\r
+                 LetterToCharAmino(uLetter3),\r
+                 tc.m_uSeqCount);\r
+\r
+               unsigned uSeqCountWithThisWord = 0;\r
+               for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+                       {\r
+                       const unsigned uCount = tc.m_Counts[uSeqIndex];\r
+                       if (uCount > 0)\r
+                               {\r
+                               ++uSeqCountWithThisWord;\r
+                               Log(" %u=%u", uSeqIndex, uCount);\r
+                               uGrandTotal += uCount;\r
+                               }\r
+                       }\r
+               if (uSeqCountWithThisWord != tc.m_uSeqCount)\r
+                       Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);\r
+               Log("\n");\r
+               }\r
+       \r
+       unsigned uTotalBySeqLength = 0;\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               Seq &s = *(v[uSeqIndex]);\r
+               const unsigned uSeqLength = s.Length();\r
+               uTotalBySeqLength += uSeqLength - 2;\r
+               }\r
+       if (uGrandTotal != uTotalBySeqLength)\r
+               Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);\r
+       }\r
+#endif\r
+\r
+       const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);\r
+       unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes);\r
+\r
+       for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)\r
+               {\r
+               const TripleCount &tc = *(TripleCounts + uWord);\r
+               if (0 == tc.m_uSeqCount)\r
+                       continue;\r
+\r
+               unsigned uSeqCountFound = 0;\r
+               memset(SeqList, 0, uSeqListBytes);\r
+\r
+               for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+                       {\r
+                       if (tc.m_Counts[uSeqIndex] > 0)\r
+                               {\r
+                               SeqList[uSeqCountFound] = uSeqIndex;\r
+                               ++uSeqCountFound;\r
+                               if (uSeqCountFound == tc.m_uSeqCount)\r
+                                       break;\r
+                               }\r
+                       }\r
+               assert(uSeqCountFound == tc.m_uSeqCount);\r
+\r
+               for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)\r
+                       {\r
+                       const unsigned uSeqIndex1 = SeqList[uSeq1];\r
+                       const unsigned uCount1 = tc.m_Counts[uSeqIndex1];\r
+                       for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
+                               {\r
+                               const unsigned uSeqIndex2 = SeqList[uSeq2];\r
+                               const unsigned uCount2 = tc.m_Counts[uSeqIndex2];\r
+                               const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;\r
+                               const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);\r
+                               DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));\r
+                               }\r
+                       }\r
+               }\r
+       delete[] SeqList;\r
+       free(TripleCounts);\r
+\r
+       unsigned uDone = 0;\r
+       const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;\r
+       for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
+               {\r
+               DF.SetDist(uSeq1, uSeq1, 0.0);\r
+\r
+               const Seq &s1 = *(v[uSeq1]);\r
+               const unsigned uLength1 = s1.Length();\r
+\r
+               for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
+                       {\r
+                       const Seq &s2 = *(v[uSeq2]);\r
+                       const unsigned uLength2 = s2.Length();\r
+                       unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;\r
+                       if (uMinLength < 3)\r
+                               {\r
+                               DF.SetDist(uSeq1, uSeq2, 1.0);\r
+                               continue;\r
+                               }\r
+\r
+                       const double dTripleCount = DF.GetDist(uSeq1, uSeq2);\r
+                       if (dTripleCount == 0)\r
+                               {\r
+                               DF.SetDist(uSeq1, uSeq2, 1.0);\r
+                               continue;\r
+                               }\r
+                       double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);\r
+                       //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore));\r
+                       //if (dEstimatedPairwiseIdentity > 1)\r
+                       //      dEstimatedPairwiseIdentity = 1;\r
+//                     DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity));\r
+                       DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);\r
+\r
+#if    TRACE\r
+                       {\r
+                       Log("%s - %s  Triplet count = %g  Lengths %u, %u Estimated pwid = %g\n",\r
+                         s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,\r
+                         dEstimatedPairwiseIdentity);\r
+                       }\r
+#endif\r
+                       if (uDone%1000 == 0)\r
+                               Progress(uDone, uTotal);\r
+                       }\r
+               }\r
+       ProgressStepsDone();\r
+       }\r