binaries/src/muscle/fastdistkmer.cpp

   1 #include "muscle.h"\r
   2 #include "msa.h"\r
   3 #include "seqvect.h"\r
   4 #include "seq.h"\r
   5 #include "distfunc.h"\r
   6 #include <math.h>\r
   7 \r
   8 #define TRACE   0\r
   9 \r
  10 /***\r
  11 Some candidate alphabets considered because they\r
  12 have high correlations and small table sizes.\r
  13 Correlation coefficent is between k-mer distance\r
  14 and %id D measured from a CLUSTALW alignment.\r
  15 Table size is N^k where N is size of alphabet.\r
  16 A is standard (uncompressed) amino alphabet.\r
  17 \r
  18                            Correlation\r
  19 Alpha   N  k  Table Size   all   25-50%\r
  20 -----  --  -  ----------   ----  ------\r
  21 A      20  3       8,000  0.943   0.575\r
  22 A      20  4     160,000  0.962   0.685 <<\r
  23 LiA    14  4      38,416  0.966   0.645\r
  24 SEB    14  4      38,416  0.964   0.634\r
  25 LiA    13  4      28,561  0.965   0.640\r
  26 LiA    12  4      20,736  0.963   0.620\r
  27 LiA    10  5     100,000  0.964   0.652\r
  28 \r
  29 We select A with k=4 because it has the best\r
  30 correlations. The only drawback is a large table\r
  31 size, but space is readily available and the only \r
  32 additional time cost is in resetting the table to\r
  33 zero, which can be done quickly with memset or by\r
  34 keeping a list of the k-mers that were found (should\r
  35 test to see which is faster, and may vary by compiler\r
  36 and processor type). It also has the minor advantage\r
  37 that we don't need to convert the alphabet.\r
  38 \r
  39 Fractional identity d is estimated as follows.\r
  40 \r
  41         F = fractional k-mer count\r
  42         if F is 0: F = 0.01\r
  43         Y = log(0.02 + F)\r
  44         d = -4.1 + 4.12*Y\r
  45 \r
  46 The constant 0.02 was chosen to make the relationship\r
  47 between Y and D linear. The constants -4.1 and 4.12\r
  48 were chosen to fit a straight line to the scatterplot\r
  49 of Y vs D.\r
  50 ***/\r
  51 \r
  52 #define MIN(x, y)       (((x) < (y)) ? (x) : (y))\r
  53 \r
  54 const unsigned K = 4;\r
  55 const unsigned N = 20;\r
  56 const unsigned N_2 = 20*20;\r
  57 const unsigned N_3 = 20*20*20;\r
  58 const unsigned N_4 = 20*20*20*20;\r
  59 \r
  60 const unsigned TABLE_SIZE = N_4;\r
  61 \r
  62 // For debug output\r
  63 const char *KmerToStr(unsigned Kmer)\r
  64         {\r
  65         static char s[5];\r
  66 \r
  67         unsigned c3 = (Kmer/N_3)%N;\r
  68         unsigned c2 = (Kmer/N_2)%N;\r
  69         unsigned c1 = (Kmer/N)%N;\r
  70         unsigned c0 = Kmer%N;\r
  71 \r
  72         s[0] = LetterToChar(c3);\r
  73         s[1] = LetterToChar(c2);\r
  74         s[2] = LetterToChar(c1);\r
  75         s[3] = LetterToChar(c0);\r
  76         return s;\r
  77         }\r
  78 \r
  79 void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[])\r
  80         {\r
  81 #if     TRACE\r
  82         Log("CountKmers\n");\r
  83 #endif\r
  84         memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte));\r
  85 \r
  86         const byte *ptrKmerStart = s;\r
  87         const byte *ptrKmerEnd = s + 4;\r
  88         const byte *ptrSeqEnd = s + uSeqLength;\r
  89 \r
  90         unsigned c3 = s[0]*N_3;\r
  91         unsigned c2 = s[1]*N_2;\r
  92         unsigned c1 = s[2]*N;\r
  93         unsigned c0 = s[3];\r
  94 \r
  95         unsigned Kmer = c3 + c2 + c1 + c0;\r
  96 \r
  97         for (;;)\r
  98                 {\r
  99                 assert(Kmer < TABLE_SIZE);\r
 100 \r
 101 #if     TRACE\r
 102                 Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer));\r
 103 #endif\r
 104                 ++(KmerCounts[Kmer]);\r
 105 \r
 106                 if (ptrKmerEnd == ptrSeqEnd)\r
 107                         break;\r
 108 \r
 109         // Compute k-mer as function of previous k-mer:\r
 110         // 1. Subtract first letter from previous k-mer.\r
 111         // 2. Multiply by N.\r
 112         // 3. Add next letter.\r
 113                 c3 = (*ptrKmerStart++) * N_3;\r
 114                 Kmer = (Kmer - c3)*N;\r
 115                 Kmer += *ptrKmerEnd++;\r
 116                 }\r
 117         }\r
 118 \r
 119 unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength,\r
 120   const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2)\r
 121         {\r
 122         byte KmerCounts2[TABLE_SIZE];\r
 123         CountKmers(Seq2, uSeqLength2, KmerCounts2);\r
 124 \r
 125         const byte *ptrKmerStart = Seq;\r
 126         const byte *ptrKmerEnd = Seq + 4;\r
 127         const byte *ptrSeqEnd = Seq + uSeqLength;\r
 128 \r
 129         unsigned c3 = Seq[0]*N_3;\r
 130         unsigned c2 = Seq[1]*N_2;\r
 131         unsigned c1 = Seq[2]*N;\r
 132         unsigned c0 = Seq[3];\r
 133 \r
 134         unsigned Kmer = c3 + c2 + c1 + c0;\r
 135 \r
 136         unsigned uCommonCount = 0;\r
 137         for (;;)\r
 138                 {\r
 139                 assert(Kmer < TABLE_SIZE);\r
 140 \r
 141                 const byte Count1 = KmerCounts1[Kmer];\r
 142                 const byte Count2 = KmerCounts2[Kmer];\r
 143 \r
 144                 uCommonCount += MIN(Count1, Count2);\r
 145 \r
 146         // Hack so we don't double-count\r
 147                 KmerCounts2[Kmer] = 0;\r
 148 \r
 149                 if (ptrKmerEnd == ptrSeqEnd)\r
 150                         break;\r
 151 \r
 152         // Compute k-mer as function of previous k-mer:\r
 153         // 1. Subtract first letter from previous k-mer.\r
 154         // 2. Multiply by N.\r
 155         // 3. Add next letter.\r
 156                 c3 = (*ptrKmerStart++) * N_3;\r
 157                 Kmer = (Kmer - c3)*N;\r
 158                 Kmer += *ptrKmerEnd++;\r
 159                 }\r
 160         return uCommonCount;\r
 161         }\r
 162 \r
 163 static void SeqToLetters(const Seq &s, byte Letters[])\r
 164         {\r
 165         const unsigned uSeqLength = s.Length();\r
 166         for (unsigned uCol = 0; uCol < uSeqLength; ++uCol)\r
 167                 {\r
 168                 char c = s.GetChar(uCol);\r
 169         // Ugly hack. My k-mer counting code isn't wild-card\r
 170         // aware. Arbitrarily replace wildcards by a specific\r
 171         // amino acid.\r
 172                 if (IsWildcardChar(c))\r
 173                         c = 'A';\r
 174                 *Letters++ = CharToLetter(c);\r
 175                 }\r
 176         }\r
 177 \r
 178 void FastDistKmer(const SeqVect &v, DistFunc &DF)\r
 179         {\r
 180         byte KmerCounts[TABLE_SIZE];\r
 181 \r
 182         const unsigned uSeqCount = v.GetSeqCount();\r
 183 \r
 184         DF.SetCount(uSeqCount);\r
 185         if (0 == uSeqCount)\r
 186                 return;\r
 187 \r
 188 // Initialize distance matrix to zero\r
 189         for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)\r
 190                 {\r
 191                 DF.SetDist(uSeq1, uSeq1, 0);\r
 192                 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)\r
 193                         DF.SetDist(uSeq1, uSeq2, 0);\r
 194                 }\r
 195 \r
 196         unsigned uMaxLength = 0;\r
 197         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
 198                 {\r
 199                 const Seq &s = v.GetSeq(uSeqIndex);\r
 200                 unsigned uSeqLength = s.Length();\r
 201                 if (uSeqLength > uMaxLength)\r
 202                         uMaxLength = uSeqLength;\r
 203                 }\r
 204         if (0 == uMaxLength)\r
 205                 return;\r
 206 \r
 207         byte *Seq1Letters = new byte[uMaxLength];\r
 208         byte *Seq2Letters = new byte[uMaxLength];\r
 209 \r
 210         for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1)\r
 211                 {\r
 212                 const Seq &s1 = v.GetSeq(uSeqIndex1);\r
 213                 const unsigned uSeqLength1 = s1.Length();\r
 214 \r
 215                 SeqToLetters(s1, Seq1Letters);\r
 216                 CountKmers(Seq1Letters, uSeqLength1, KmerCounts);\r
 217 \r
 218                 for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;\r
 219                   ++uSeqIndex2)\r
 220                         {\r
 221                         const Seq &s2 = v.GetSeq(uSeqIndex2);\r
 222                         const unsigned uSeqLength2 = s2.Length();\r
 223 \r
 224                         SeqToLetters(s2, Seq2Letters);\r
 225 \r
 226                         unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1,\r
 227                           KmerCounts, Seq2Letters, uSeqLength2);\r
 228 \r
 229                         unsigned uMinLength = MIN(uSeqLength1, uSeqLength2);\r
 230                         double F = (double) uCommonKmerCount / (uMinLength - K + 1);\r
 231                         if (0.0 == F)\r
 232                                 F = 0.01;\r
 233                         double Y = log(0.02 + F);\r
 234                         double EstimatedPctId = Y/4.12 + 0.995;\r
 235                         double KD = KimuraDist(EstimatedPctId);\r
 236 //                      DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD);\r
 237                         DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F));\r
 238 #if     TRACE\r
 239                         Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n",\r
 240                           uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD);\r
 241 #endif\r
 242                         }\r
 243                 }\r
 244 \r
 245         delete[] Seq1Letters;\r
 246         delete[] Seq2Letters;\r
 247         }\r