Delete unneeded directory

[jabaws.git] / website / archive / binaries / mac / src / muscle / fastdistkmer.cpp
diff --git a/website/archive/binaries/mac/src/muscle/fastdistkmer.cpp b/website/archive/binaries/mac/src/muscle/fastdistkmer.cpp

deleted file mode 100644 (file)

index 569a75b..0000000
--- a/website/archive/binaries/mac/src/muscle/fastdistkmer.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-#include "muscle.h"\r
-#include "msa.h"\r
-#include "seqvect.h"\r
-#include "seq.h"\r
-#include "distfunc.h"\r
-#include <math.h>\r
-\r
-#define TRACE  0\r
-\r
-/***\r
-Some candidate alphabets considered because they\r
-have high correlations and small table sizes.\r
-Correlation coefficent is between k-mer distance\r
-and %id D measured from a CLUSTALW alignment.\r
-Table size is N^k where N is size of alphabet.\r
-A is standard (uncompressed) amino alphabet.\r
-\r
-                           Correlation\r
-Alpha   N  k  Table Size   all   25-50%\r
------  --  -  ----------   ----  ------\r
-A      20  3       8,000  0.943   0.575\r
-A      20  4     160,000  0.962   0.685 <<\r
-LiA    14  4      38,416  0.966   0.645\r
-SEB    14  4      38,416  0.964   0.634\r
-LiA    13  4      28,561  0.965   0.640\r
-LiA    12  4      20,736  0.963   0.620\r
-LiA    10  5     100,000  0.964   0.652\r
-\r
-We select A with k=4 because it has the best\r
-correlations. The only drawback is a large table\r
-size, but space is readily available and the only \r
-additional time cost is in resetting the table to\r
-zero, which can be done quickly with memset or by\r
-keeping a list of the k-mers that were found (should\r
-test to see which is faster, and may vary by compiler\r
-and processor type). It also has the minor advantage\r
-that we don't need to convert the alphabet.\r
-\r
-Fractional identity d is estimated as follows.\r
-\r
-       F = fractional k-mer count\r
-       if F is 0: F = 0.01\r
-       Y = log(0.02 + F)\r
-       d = -4.1 + 4.12*Y\r
-\r
-The constant 0.02 was chosen to make the relationship\r
-between Y and D linear. The constants -4.1 and 4.12\r
-were chosen to fit a straight line to the scatterplot\r
-of Y vs D.\r
-***/\r
-\r
-#define MIN(x, y)      (((x) < (y)) ? (x) : (y))\r
-\r
-const unsigned K = 4;\r
-const unsigned N = 20;\r
-const unsigned N_2 = 20*20;\r
-const unsigned N_3 = 20*20*20;\r
-const unsigned N_4 = 20*20*20*20;\r
-\r
-const unsigned TABLE_SIZE = N_4;\r
-\r
-// For debug output\r
-const char *KmerToStr(unsigned Kmer)\r
-       {\r
-       static char s[5];\r
-\r
-       unsigned c3 = (Kmer/N_3)%N;\r
-       unsigned c2 = (Kmer/N_2)%N;\r
-       unsigned c1 = (Kmer/N)%N;\r
-       unsigned c0 = Kmer%N;\r
-\r
-       s[0] = LetterToChar(c3);\r
-       s[1] = LetterToChar(c2);\r
-       s[2] = LetterToChar(c1);\r
-       s[3] = LetterToChar(c0);\r
-       return s;\r
-       }\r
-\r
-void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[])\r
-       {\r
-#if    TRACE\r
-       Log("CountKmers\n");\r
-#endif\r
-       memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte));\r
-\r
-       const byte *ptrKmerStart = s;\r
-       const byte *ptrKmerEnd = s + 4;\r
-       const byte *ptrSeqEnd = s + uSeqLength;\r
-\r
-       unsigned c3 = s[0]*N_3;\r
-       unsigned c2 = s[1]*N_2;\r
-       unsigned c1 = s[2]*N;\r
-       unsigned c0 = s[3];\r
-\r
-       unsigned Kmer = c3 + c2 + c1 + c0;\r
-\r
-       for (;;)\r
-               {\r
-               assert(Kmer < TABLE_SIZE);\r
-\r
-#if    TRACE\r
-               Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer));\r
-#endif\r
-               ++(KmerCounts[Kmer]);\r
-\r
-               if (ptrKmerEnd == ptrSeqEnd)\r
-                       break;\r
-\r
-       // Compute k-mer as function of previous k-mer:\r
-       // 1. Subtract first letter from previous k-mer.\r
-       // 2. Multiply by N.\r
-       // 3. Add next letter.\r
-               c3 = (*ptrKmerStart++) * N_3;\r
-               Kmer = (Kmer - c3)*N;\r
-               Kmer += *ptrKmerEnd++;\r
-               }\r
-       }\r
-\r
-unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength,\r
-  const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2)\r
-       {\r
-       byte KmerCounts2[TABLE_SIZE];\r
-       CountKmers(Seq2, uSeqLength2, KmerCounts2);\r
-\r
-       const byte *ptrKmerStart = Seq;\r
-       const byte *ptrKmerEnd = Seq + 4;\r
-       const byte *ptrSeqEnd = Seq + uSeqLength;\r
-\r
-       unsigned c3 = Seq[0]*N_3;\r
-       unsigned c2 = Seq[1]*N_2;\r
-       unsigned c1 = Seq[2]*N;\r
-       unsigned c0 = Seq[3];\r
-\r
-       unsigned Kmer = c3 + c2 + c1 + c0;\r
-\r
-       unsigned uCommonCount = 0;\r
-       for (;;)\r
-               {\r
-               assert(Kmer < TABLE_SIZE);\r
-\r
-               const byte Count1 = KmerCounts1[Kmer];\r
-               const byte Count2 = KmerCounts2[Kmer];\r
-\r
-               uCommonCount += MIN(Count1, Count2);\r
-\r
-       // Hack so we don't double-count\r
-               KmerCounts2[Kmer] = 0;\r
-\r
-               if (ptrKmerEnd == ptrSeqEnd)\r
-                       break;\r
-\r
-       // Compute k-mer as function of previous k-mer:\r
-       // 1. Subtract first letter from previous k-mer.\r
-       // 2. Multiply by N.\r
-       // 3. Add next letter.\r
-               c3 = (*ptrKmerStart++) * N_3;\r
-               Kmer = (Kmer - c3)*N;\r
-               Kmer += *ptrKmerEnd++;\r
-               }\r
-       return uCommonCount;\r
-       }\r
-\r
-static void SeqToLetters(const Seq &s, byte Letters[])\r
-       {\r
-       const unsigned uSeqLength = s.Length();\r
-       for (unsigned uCol = 0; uCol < uSeqLength; ++uCol)\r
-               {\r
-               char c = s.GetChar(uCol);\r
-       // Ugly hack. My k-mer counting code isn't wild-card\r
-       // aware. Arbitrarily replace wildcards by a specific\r
-       // amino acid.\r
-               if (IsWildcardChar(c))\r
-                       c = 'A';\r
-               *Letters++ = CharToLetter(c);\r
-               }\r
-       }\r
-\r
-void FastDistKmer(const SeqVect &v, DistFunc &DF)\r
-       {\r
-       byte KmerCounts[TABLE_SIZE];\r
-\r
-       const unsigned uSeqCount = v.GetSeqCount();\r
-\r
-       DF.SetCount(uSeqCount);\r
-       if (0 == uSeqCount)\r
-               return;\r
-\r
-// Initialize distance matrix to zero\r
-       for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
-               {\r
-               DF.SetDist(uSeq1, uSeq1, 0);\r
-               for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
-                       DF.SetDist(uSeq1, uSeq2, 0);\r
-               }\r
-\r
-       unsigned uMaxLength = 0;\r
-       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
-               {\r
-               const Seq &s = v.GetSeq(uSeqIndex);\r
-               unsigned uSeqLength = s.Length();\r
-               if (uSeqLength > uMaxLength)\r
-                       uMaxLength = uSeqLength;\r
-               }\r
-       if (0 == uMaxLength)\r
-               return;\r
-\r
-       byte *Seq1Letters = new byte[uMaxLength];\r
-       byte *Seq2Letters = new byte[uMaxLength];\r
-\r
-       for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1)\r
-               {\r
-               const Seq &s1 = v.GetSeq(uSeqIndex1);\r
-               const unsigned uSeqLength1 = s1.Length();\r
-\r
-               SeqToLetters(s1, Seq1Letters);\r
-               CountKmers(Seq1Letters, uSeqLength1, KmerCounts);\r
-\r
-               for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;\r
-                 ++uSeqIndex2)\r
-                       {\r
-                       const Seq &s2 = v.GetSeq(uSeqIndex2);\r
-                       const unsigned uSeqLength2 = s2.Length();\r
-\r
-                       SeqToLetters(s2, Seq2Letters);\r
-\r
-                       unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1,\r
-                         KmerCounts, Seq2Letters, uSeqLength2);\r
-\r
-                       unsigned uMinLength = MIN(uSeqLength1, uSeqLength2);\r
-                       double F = (double) uCommonKmerCount / (uMinLength - K + 1);\r
-                       if (0.0 == F)\r
-                               F = 0.01;\r
-                       double Y = log(0.02 + F);\r
-                       double EstimatedPctId = Y/4.12 + 0.995;\r
-                       double KD = KimuraDist(EstimatedPctId);\r
-//                     DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD);\r
-                       DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F));\r
-#if    TRACE\r
-                       Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n",\r
-                         uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD);\r
-#endif\r
-                       }\r
-               }\r
-\r
-       delete[] Seq1Letters;\r
-       delete[] Seq2Letters;\r
-       }\r