#include "muscle.h" #include "distfunc.h" #include "seqvect.h" #include #define TRACE 0 #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) const unsigned TUPLE_COUNT = 6*6*6*6*6*6; static unsigned char Count1[TUPLE_COUNT]; static unsigned char Count2[TUPLE_COUNT]; // Amino acid groups according to MAFFT (sextet5) // 0 = A G P S T // 1 = I L M V // 2 = N D Q E B Z // 3 = R H K // 4 = F W Y // 5 = C // 6 = X . - U unsigned ResidueGroup[] = { 0, // AX_A, 5, // AX_C, 2, // AX_D, 2, // AX_E, 4, // AX_F, 0, // AX_G, 3, // AX_H, 1, // AX_I, 3, // AX_K, 1, // AX_L, 1, // AX_M, 2, // AX_N, 0, // AX_P, 2, // AX_Q, 3, // AX_R, 0, // AX_S, 0, // AX_T, 1, // AX_V, 4, // AX_W, 4, // AX_Y, 2, // AX_B, // D or N 2, // AX_Z, // E or Q 0, // AX_X, // Unknown // ******** TODO ************* // This isn't the correct way of avoiding group 6 0 // AX_GAP, // ******** TODO ****************** }; unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]); static char *TupleToStr(int t) { static char s[7]; int t1, t2, t3, t4, t5, t6; t1 = t%6; t2 = (t/6)%6; t3 = (t/(6*6))%6; t4 = (t/(6*6*6))%6; t5 = (t/(6*6*6*6))%6; t6 = (t/(6*6*6*6*6))%6; s[5] = '0' + t1; s[4] = '0' + t2; s[3] = '0' + t3; s[2] = '0' + t4; s[1] = '0' + t5; s[0] = '0' + t6; return s; } static unsigned GetTuple(const unsigned uLetters[], unsigned n) { assert(uLetters[n] < uResidueGroupCount); assert(uLetters[n+1] < uResidueGroupCount); assert(uLetters[n+2] < uResidueGroupCount); assert(uLetters[n+3] < uResidueGroupCount); assert(uLetters[n+4] < uResidueGroupCount); assert(uLetters[n+5] < uResidueGroupCount); unsigned u1 = ResidueGroup[uLetters[n]]; unsigned u2 = ResidueGroup[uLetters[n+1]]; unsigned u3 = ResidueGroup[uLetters[n+2]]; unsigned u4 = ResidueGroup[uLetters[n+3]]; unsigned u5 = ResidueGroup[uLetters[n+4]]; unsigned u6 = ResidueGroup[uLetters[n+5]]; return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6; } static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[]) { memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char)); for (unsigned n = 0; n < uTupleCount; ++n) { const unsigned uTuple = GetTuple(L, n); ++(Count[uTuple]); } } static void ListCount(const unsigned char Count[]) { for (unsigned n = 0; n < TUPLE_COUNT; ++n) { if (0 == Count[n]) continue; Log("%s %u\n", TupleToStr(n), Count[n]); } } void DistKmer6_6(const SeqVect &v, DistFunc &DF) { const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); if (0 == uSeqCount) return; // Initialize distance matrix to zero for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) DF.SetDist(uSeq1, uSeq2, 0); } // Convert to letters unsigned **Letters = new unsigned *[uSeqCount]; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); unsigned *L = new unsigned[uSeqLength]; Letters[uSeqIndex] = L; for (unsigned n = 0; n < uSeqLength; ++n) { char c = s[n]; L[n] = CharToLetterEx(c); assert(L[n] < uResidueGroupCount); } } unsigned **uCommonTupleCount = new unsigned *[uSeqCount]; for (unsigned n = 0; n < uSeqCount; ++n) { uCommonTupleCount[n] = new unsigned[uSeqCount]; memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned)); } const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; unsigned uCount = 0; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { Seq &seq1 = *(v[uSeq1]); const unsigned uSeqLength1 = seq1.Length(); if (uSeqLength1 < 5) continue; const unsigned uTupleCount = uSeqLength1 - 5; const unsigned *L = Letters[uSeq1]; CountTuples(L, uTupleCount, Count1); #if TRACE { Log("Seq1=%d\n", uSeq1); Log("Groups:\n"); for (unsigned n = 0; n < uSeqLength1; ++n) Log("%u", ResidueGroup[L[n]]); Log("\n"); Log("Tuples:\n"); ListCount(Count1); } #endif SetProgressDesc("K-mer dist pass 1"); for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2) { if (0 == uCount%500) Progress(uCount, uPairCount); ++uCount; Seq &seq2 = *(v[uSeq2]); const unsigned uSeqLength2 = seq2.Length(); if (uSeqLength2 < 5) { if (uSeq1 == uSeq2) DF.SetDist(uSeq1, uSeq2, 0); else DF.SetDist(uSeq1, uSeq2, 1); continue; } // First pass through seq 2 to count tuples const unsigned uTupleCount = uSeqLength2 - 5; const unsigned *L = Letters[uSeq2]; CountTuples(L, uTupleCount, Count2); #if TRACE Log("Seq2=%d Counts=\n", uSeq2); ListCount(Count2); #endif // Second pass to accumulate sum of shared tuples // MAFFT defines this as the sum over unique tuples // in seq2 of the minimum of the number of tuples found // in the two sequences. unsigned uSum = 0; for (unsigned n = 0; n < uTupleCount; ++n) { const unsigned uTuple = GetTuple(L, n); uSum += MIN(Count1[uTuple], Count2[uTuple]); // This is a hack to make sure each unique tuple counted only once. Count2[uTuple] = 0; } #if TRACE { Seq &s1 = *(v[uSeq1]); Seq &s2 = *(v[uSeq2]); const char *pName1 = s1.GetName(); const char *pName2 = s2.GetName(); Log("Common count %s(%d) - %s(%d) =%u\n", pName1, uSeq1, pName2, uSeq2, uSum); } #endif uCommonTupleCount[uSeq1][uSeq2] = uSum; uCommonTupleCount[uSeq2][uSeq1] = uSum; } } ProgressStepsDone(); uCount = 0; SetProgressDesc("K-mer dist pass 2"); for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { Seq &s1 = *(v[uSeq1]); const char *pName1 = s1.GetName(); double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1]; if (0 == dCommonTupleCount11) dCommonTupleCount11 = 1; DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { if (0 == uCount%500) Progress(uCount, uPairCount); ++uCount; double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2]; if (0 == dCommonTupleCount22) dCommonTupleCount22 = 1; const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2]) /dCommonTupleCount11; const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2]) /dCommonTupleCount22; // dMinDist is the value used for tree-building in MAFFT const double dMinDist = MIN(dDist1, dDist2); DF.SetDist(uSeq1, uSeq2, (float) dMinDist); //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist); //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId); // **** TODO **** why does this make score slightly worse?? //const double dKimuraDist = KimuraDist(dEstimatedPctId); //DF.SetDist(uSeq1, uSeq2, dKimuraDist); } } ProgressStepsDone(); for (unsigned n = 0; n < uSeqCount; ++n) delete[] uCommonTupleCount[n]; delete[] uCommonTupleCount; delete[] Letters; } double PctIdToMAFFTDist(double dPctId) { if (dPctId < 0.05) dPctId = 0.05; double dDist = -log(dPctId); return dDist; } double PctIdToHeightMAFFT(double dPctId) { return PctIdToMAFFTDist(dPctId); }