+++ /dev/null
-#include "muscle.h"\r
-#include "pwpath.h"\r
-#include "estring.h"\r
-#include "seq.h"\r
-#include "msa.h"\r
-\r
-/***\r
-An "estring" is an edit string that operates on a sequence.\r
-An estring is represented as a vector of integers.\r
-It is interpreted in order of increasing suffix.\r
-A positive value n means copy n letters.\r
-A negative value -n means insert n indels.\r
-Zero marks the end of the vector.\r
-Consecutive entries must have opposite sign, i.e. the\r
-shortest possible representation must be used.\r
-\r
-A "tpair" is a traceback path for a pairwise alignment\r
-represented as two estrings, one for each sequence.\r
-***/\r
-\r
-#define c2(c,d) (((unsigned char) c) << 8 | (unsigned char) d)\r
-\r
-unsigned LengthEstring(const short es[])\r
- {\r
- unsigned i = 0;\r
- while (*es++ != 0)\r
- ++i;\r
- return i;\r
- }\r
-\r
-short *EstringNewCopy(const short es[])\r
- {\r
- unsigned n = LengthEstring(es) + 1;\r
- short *esNew = new short[n];\r
- memcpy(esNew, es, n*sizeof(short));\r
- return esNew;\r
- }\r
-\r
-void LogEstring(const short es[])\r
- {\r
- Log("<");\r
- for (unsigned i = 0; es[i] != 0; ++i)\r
- {\r
- if (i > 0)\r
- Log(" ");\r
- Log("%d", es[i]);\r
- }\r
- Log(">");\r
- }\r
-\r
-static bool EstringsEq(const short es1[], const short es2[])\r
- {\r
- for (;;)\r
- {\r
- if (*es1 != *es2)\r
- return false;\r
- if (0 == *es1)\r
- break;\r
- ++es1;\r
- ++es2;\r
- }\r
- return true;\r
- }\r
-\r
-static void EstringCounts(const short es[], unsigned *ptruSymbols,\r
- unsigned *ptruIndels)\r
- {\r
- unsigned uSymbols = 0;\r
- unsigned uIndels = 0;\r
- for (unsigned i = 0; es[i] != 0; ++i)\r
- {\r
- short n = es[i];\r
- if (n > 0)\r
- uSymbols += n;\r
- else if (n < 0)\r
- uIndels += -n;\r
- }\r
- *ptruSymbols = uSymbols;\r
- *ptruIndels = uIndels;\r
- }\r
-\r
-static char *EstringOp(const short es[], const char s[])\r
- {\r
- unsigned uSymbols;\r
- unsigned uIndels;\r
- EstringCounts(es, &uSymbols, &uIndels);\r
- assert((unsigned) strlen(s) == uSymbols);\r
- char *sout = new char[uSymbols + uIndels + 1];\r
- char *psout = sout;\r
- for (;;)\r
- {\r
- int n = *es++;\r
- if (0 == n)\r
- break;\r
- if (n > 0)\r
- for (int i = 0; i < n; ++i)\r
- *psout++ = *s++;\r
- else\r
- for (int i = 0; i < -n; ++i)\r
- *psout++ = '-';\r
- }\r
- assert(0 == *s);\r
- *psout = 0;\r
- return sout;\r
- }\r
-\r
-void EstringOp(const short es[], const Seq &sIn, Seq &sOut)\r
- {\r
-#if DEBUG\r
- unsigned uSymbols;\r
- unsigned uIndels;\r
- EstringCounts(es, &uSymbols, &uIndels);\r
- assert(sIn.Length() == uSymbols);\r
-#endif\r
- sOut.Clear();\r
- sOut.SetName(sIn.GetName());\r
- int p = 0;\r
- for (;;)\r
- {\r
- int n = *es++;\r
- if (0 == n)\r
- break;\r
- if (n > 0)\r
- for (int i = 0; i < n; ++i)\r
- {\r
- char c = sIn[p++];\r
- sOut.push_back(c);\r
- }\r
- else\r
- for (int i = 0; i < -n; ++i)\r
- sOut.push_back('-');\r
- }\r
- }\r
-\r
-unsigned EstringOp(const short es[], const Seq &sIn, MSA &a)\r
- {\r
- unsigned uSymbols;\r
- unsigned uIndels;\r
- EstringCounts(es, &uSymbols, &uIndels);\r
- assert(sIn.Length() == uSymbols);\r
-\r
- unsigned uColCount = uSymbols + uIndels;\r
-\r
- a.Clear();\r
- a.SetSize(1, uColCount);\r
-\r
- a.SetSeqName(0, sIn.GetName());\r
- a.SetSeqId(0, sIn.GetId());\r
-\r
- unsigned p = 0;\r
- unsigned uColIndex = 0;\r
- for (;;)\r
- {\r
- int n = *es++;\r
- if (0 == n)\r
- break;\r
- if (n > 0)\r
- for (int i = 0; i < n; ++i)\r
- {\r
- char c = sIn[p++];\r
- a.SetChar(0, uColIndex++, c);\r
- }\r
- else\r
- for (int i = 0; i < -n; ++i)\r
- a.SetChar(0, uColIndex++, '-');\r
- }\r
- assert(uColIndex == uColCount);\r
- return uColCount;\r
- }\r
-\r
-void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB)\r
- {\r
-// First pass to determine size of estrings esA and esB\r
- const unsigned uEdgeCount = Path.GetEdgeCount();\r
- if (0 == uEdgeCount)\r
- {\r
- short *esA = new short[1];\r
- short *esB = new short[1];\r
- esA[0] = 0;\r
- esB[0] = 0;\r
- *ptresA = esA;\r
- *ptresB = esB;\r
- return;\r
- }\r
-\r
- unsigned iLengthA = 1;\r
- unsigned iLengthB = 1;\r
- const char cFirstEdgeType = Path.GetEdge(0).cType;\r
- char cPrevEdgeType = cFirstEdgeType;\r
- for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)\r
- {\r
- const PWEdge &Edge = Path.GetEdge(uEdgeIndex);\r
- char cEdgeType = Edge.cType;\r
-\r
- switch (c2(cPrevEdgeType, cEdgeType))\r
- {\r
- case c2('M', 'M'):\r
- case c2('D', 'D'):\r
- case c2('I', 'I'):\r
- break;\r
-\r
- case c2('D', 'M'):\r
- case c2('M', 'D'):\r
- ++iLengthB;\r
- break;\r
-\r
- case c2('I', 'M'):\r
- case c2('M', 'I'):\r
- ++iLengthA;\r
- break;\r
-\r
- case c2('I', 'D'):\r
- case c2('D', 'I'):\r
- ++iLengthB;\r
- ++iLengthA;\r
- break;\r
-\r
- default:\r
- assert(false);\r
- }\r
- cPrevEdgeType = cEdgeType;\r
- }\r
-\r
-// Pass2 for seq A\r
- {\r
- short *esA = new short[iLengthA+1];\r
- unsigned iA = 0;\r
- switch (Path.GetEdge(0).cType)\r
- {\r
- case 'M':\r
- case 'D':\r
- esA[0] = 1;\r
- break;\r
-\r
- case 'I':\r
- esA[0] = -1;\r
- break;\r
-\r
- default:\r
- assert(false);\r
- }\r
-\r
- char cPrevEdgeType = cFirstEdgeType;\r
- for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)\r
- {\r
- const PWEdge &Edge = Path.GetEdge(uEdgeIndex);\r
- char cEdgeType = Edge.cType;\r
-\r
- switch (c2(cPrevEdgeType, cEdgeType))\r
- {\r
- case c2('M', 'M'):\r
- case c2('D', 'D'):\r
- case c2('D', 'M'):\r
- case c2('M', 'D'):\r
- ++(esA[iA]);\r
- break;\r
-\r
- case c2('I', 'D'):\r
- case c2('I', 'M'):\r
- ++iA;\r
- esA[iA] = 1;\r
- break;\r
-\r
- case c2('M', 'I'):\r
- case c2('D', 'I'):\r
- ++iA;\r
- esA[iA] = -1;\r
- break;\r
-\r
- case c2('I', 'I'):\r
- --(esA[iA]);\r
- break;\r
-\r
- default:\r
- assert(false);\r
- }\r
-\r
- cPrevEdgeType = cEdgeType;\r
- }\r
- assert(iA == iLengthA - 1);\r
- esA[iLengthA] = 0;\r
- *ptresA = esA;\r
- }\r
-\r
- {\r
-// Pass2 for seq B\r
- short *esB = new short[iLengthB+1];\r
- unsigned iB = 0;\r
- switch (Path.GetEdge(0).cType)\r
- {\r
- case 'M':\r
- case 'I':\r
- esB[0] = 1;\r
- break;\r
-\r
- case 'D':\r
- esB[0] = -1;\r
- break;\r
-\r
- default:\r
- assert(false);\r
- }\r
-\r
- char cPrevEdgeType = cFirstEdgeType;\r
- for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)\r
- {\r
- const PWEdge &Edge = Path.GetEdge(uEdgeIndex);\r
- char cEdgeType = Edge.cType;\r
-\r
- switch (c2(cPrevEdgeType, cEdgeType))\r
- {\r
- case c2('M', 'M'):\r
- case c2('I', 'I'):\r
- case c2('I', 'M'):\r
- case c2('M', 'I'):\r
- ++(esB[iB]);\r
- break;\r
-\r
- case c2('D', 'I'):\r
- case c2('D', 'M'):\r
- ++iB;\r
- esB[iB] = 1;\r
- break;\r
-\r
- case c2('M', 'D'):\r
- case c2('I', 'D'):\r
- ++iB;\r
- esB[iB] = -1;\r
- break;\r
-\r
- case c2('D', 'D'):\r
- --(esB[iB]);\r
- break;\r
-\r
- default:\r
- assert(false);\r
- }\r
-\r
- cPrevEdgeType = cEdgeType;\r
- }\r
- assert(iB == iLengthB - 1);\r
- esB[iLengthB] = 0;\r
- *ptresB = esB;\r
- }\r
-\r
-#if DEBUG\r
- {\r
- const PWEdge &LastEdge = Path.GetEdge(uEdgeCount - 1);\r
- unsigned uSymbols;\r
- unsigned uIndels;\r
- EstringCounts(*ptresA, &uSymbols, &uIndels);\r
- assert(uSymbols == LastEdge.uPrefixLengthA);\r
- assert(uSymbols + uIndels == uEdgeCount);\r
-\r
- EstringCounts(*ptresB, &uSymbols, &uIndels);\r
- assert(uSymbols == LastEdge.uPrefixLengthB);\r
- assert(uSymbols + uIndels == uEdgeCount);\r
-\r
- PWPath TmpPath;\r
- EstringsToPath(*ptresA, *ptresB, TmpPath);\r
- TmpPath.AssertEqual(Path);\r
- }\r
-#endif\r
- }\r
-\r
-void EstringsToPath(const short esA[], const short esB[], PWPath &Path)\r
- {\r
- Path.Clear();\r
- unsigned iA = 0;\r
- unsigned iB = 0;\r
- int nA = esA[iA++];\r
- int nB = esB[iB++];\r
- unsigned uPrefixLengthA = 0;\r
- unsigned uPrefixLengthB = 0;\r
- for (;;)\r
- {\r
- char cType;\r
- if (nA > 0)\r
- {\r
- if (nB > 0)\r
- {\r
- cType = 'M';\r
- --nA;\r
- --nB;\r
- }\r
- else if (nB < 0)\r
- {\r
- cType = 'D';\r
- --nA;\r
- ++nB;\r
- }\r
- else\r
- assert(false);\r
- }\r
- else if (nA < 0)\r
- {\r
- if (nB > 0)\r
- {\r
- cType = 'I';\r
- ++nA;\r
- --nB;\r
- }\r
- else\r
- assert(false);\r
- }\r
- else\r
- assert(false);\r
-\r
- switch (cType)\r
- {\r
- case 'M':\r
- ++uPrefixLengthA;\r
- ++uPrefixLengthB;\r
- break;\r
- case 'D':\r
- ++uPrefixLengthA;\r
- break;\r
- case 'I':\r
- ++uPrefixLengthB;\r
- break;\r
- }\r
-\r
- PWEdge Edge;\r
- Edge.cType = cType;\r
- Edge.uPrefixLengthA = uPrefixLengthA;\r
- Edge.uPrefixLengthB = uPrefixLengthB;\r
- Path.AppendEdge(Edge);\r
-\r
- if (nA == 0)\r
- {\r
- if (0 == esA[iA])\r
- {\r
- assert(0 == esB[iB]);\r
- break;\r
- }\r
- nA = esA[iA++];\r
- }\r
- if (nB == 0)\r
- nB = esB[iB++];\r
- }\r
- }\r
-\r
-/***\r
-Multiply two estrings to make a third estring.\r
-The product of two estrings e1*e2 is defined to be\r
-the estring that produces the same result as applying\r
-e1 then e2. Multiplication is not commutative. In fact,\r
-the reversed order is undefined unless both estrings\r
-consist of a single, identical, positive entry.\r
-A primary motivation for using estrings is that\r
-multiplication is very fast, reducing the time\r
-needed to construct the root alignment.\r
-\r
-Example\r
-\r
- <-1,3>(XXX) = -XXX\r
- <2,-1,2>(-XXX) = -X-XX\r
-\r
-Therefore,\r
-\r
- <-1,3>*<2,-1,2> = <-1,1,-1,2>\r
-***/\r
-\r
-static bool CanMultiplyEstrings(const short es1[], const short es2[])\r
- {\r
- unsigned uSymbols1;\r
- unsigned uSymbols2;\r
- unsigned uIndels1;\r
- unsigned uIndels2;\r
- EstringCounts(es1, &uSymbols1, &uIndels1);\r
- EstringCounts(es2, &uSymbols2, &uIndels2);\r
- return uSymbols1 + uIndels1 == uSymbols2;\r
- }\r
-\r
-static inline void AppendGaps(short esp[], int &ip, int n)\r
- {\r
- if (-1 == ip)\r
- esp[++ip] = n;\r
- else if (esp[ip] < 0)\r
- esp[ip] += n;\r
- else\r
- esp[++ip] = n;\r
- }\r
-\r
-static inline void AppendSymbols(short esp[], int &ip, int n)\r
- {\r
- if (-1 == ip)\r
- esp[++ip] = n;\r
- else if (esp[ip] > 0)\r
- esp[ip] += n;\r
- else\r
- esp[++ip] = n;\r
- }\r
-\r
-void MulEstrings(const short es1[], const short es2[], short esp[])\r
- {\r
- assert(CanMultiplyEstrings(es1, es2));\r
-\r
- unsigned i1 = 0;\r
- int ip = -1;\r
- int n1 = es1[i1++];\r
- for (unsigned i2 = 0; ; ++i2)\r
- {\r
- int n2 = es2[i2];\r
- if (0 == n2)\r
- break;\r
- if (n2 > 0)\r
- {\r
- for (;;)\r
- {\r
- if (n1 < 0)\r
- {\r
- if (n2 > -n1)\r
- {\r
- AppendGaps(esp, ip, n1);\r
- n2 += n1;\r
- n1 = es1[i1++];\r
- }\r
- else if (n2 == -n1)\r
- {\r
- AppendGaps(esp, ip, n1);\r
- n1 = es1[i1++];\r
- break;\r
- }\r
- else\r
- {\r
- assert(n2 < -n1);\r
- AppendGaps(esp, ip, -n2);\r
- n1 += n2;\r
- break;\r
- }\r
- }\r
- else\r
- {\r
- assert(n1 > 0);\r
- if (n2 > n1)\r
- {\r
- AppendSymbols(esp, ip, n1);\r
- n2 -= n1;\r
- n1 = es1[i1++];\r
- }\r
- else if (n2 == n1)\r
- {\r
- AppendSymbols(esp, ip, n1);\r
- n1 = es1[i1++];\r
- break;\r
- }\r
- else\r
- {\r
- assert(n2 < n1);\r
- AppendSymbols(esp, ip, n2);\r
- n1 -= n2;\r
- break;\r
- }\r
- }\r
- }\r
- }\r
- else\r
- {\r
- assert(n2 < 0);\r
- AppendGaps(esp, ip, n2);\r
- }\r
- }\r
- esp[++ip] = 0;\r
-\r
-#if DEBUG\r
- {\r
- int MaxLen = (int) (LengthEstring(es1) + LengthEstring(es2) + 1);\r
- assert(ip < MaxLen);\r
- if (ip >= 2)\r
- for (int i = 0; i < ip - 2; ++i)\r
- {\r
- if (!(esp[i] > 0 && esp[i+1] < 0 || esp[i] < 0 && esp[i+1] > 0))\r
- {\r
- Log("Bad result of MulEstring: ");\r
- LogEstring(esp);\r
- Quit("Assert failed (alternating signs)");\r
- }\r
- }\r
- unsigned uSymbols1;\r
- unsigned uSymbols2;\r
- unsigned uSymbolsp;\r
- unsigned uIndels1;\r
- unsigned uIndels2;\r
- unsigned uIndelsp;\r
- EstringCounts(es1, &uSymbols1, &uIndels1);\r
- EstringCounts(es2, &uSymbols2, &uIndels2);\r
- EstringCounts(esp, &uSymbolsp, &uIndelsp);\r
- if (uSymbols1 + uIndels1 != uSymbols2)\r
- {\r
- Log("Bad result of MulEstring: ");\r
- LogEstring(esp);\r
- Quit("Assert failed (counts1 %u %u %u)",\r
- uSymbols1, uIndels1, uSymbols2);\r
- }\r
- }\r
-#endif\r
- }\r
-\r
-static void test(const short es1[], const short es2[], const short esa[])\r
- {\r
- unsigned uSymbols1;\r
- unsigned uSymbols2;\r
- unsigned uIndels1;\r
- unsigned uIndels2;\r
- EstringCounts(es1, &uSymbols1, &uIndels1);\r
- EstringCounts(es2, &uSymbols2, &uIndels2);\r
-\r
- char s[4096];\r
- memset(s, 'X', sizeof(s));\r
- s[uSymbols1] = 0;\r
-\r
- char *s1 = EstringOp(es1, s);\r
- char *s12 = EstringOp(es2, s1);\r
-\r
- memset(s, 'X', sizeof(s));\r
- s[uSymbols2] = 0;\r
- char *s2 = EstringOp(es2, s);\r
-\r
- Log("%s * %s = %s\n", s1, s2, s12);\r
-\r
- LogEstring(es1);\r
- Log(" * ");\r
- LogEstring(es2);\r
- Log(" = ");\r
- LogEstring(esa);\r
- Log("\n");\r
-\r
- short esp[4096];\r
- MulEstrings(es1, es2, esp);\r
- LogEstring(esp);\r
- if (!EstringsEq(esp, esa))\r
- Log(" *ERROR* ");\r
- Log("\n");\r
-\r
- memset(s, 'X', sizeof(s));\r
- s[uSymbols1] = 0;\r
- char *sp = EstringOp(esp, s);\r
- Log("%s\n", sp);\r
- Log("\n==========\n\n");\r
- }\r
-\r
-void TestEstrings()\r
- {\r
- SetListFileName("c:\\tmp\\muscle.log", false);\r
- //{\r
- //short es1[] = { -1, 1, -1, 0 };\r
- //short es2[] = { 1, -1, 2, 0 };\r
- //short esa[] = { -2, 1, -1, 0 };\r
- //test(es1, es2, esa);\r
- //}\r
- //{\r
- //short es1[] = { 2, -1, 2, 0 };\r
- //short es2[] = { 1, -1, 3, -1, 1, 0 };\r
- //short esa[] = { 1, -1, 1, -1, 1, -1, 1, 0 };\r
- //test(es1, es2, esa);\r
- //}\r
- //{\r
- //short es1[] = { -1, 3, 0 };\r
- //short es2[] = { 2, -1, 2, 0 };\r
- //short esa[] = { -1, 1, -1, 2, 0 };\r
- //test(es1, es2, esa);\r
- //}\r
- //{\r
- //short es1[] = { -1, 1, -1, 1, 0};\r
- //short es2[] = { 4, 0 };\r
- //short esa[] = { -1, 1, -1, 1, 0};\r
- //test(es1, es2, esa);\r
- //}\r
- //{\r
- //short es1[] = { 1, -1, 1, -1, 0};\r
- //short es2[] = { 4, 0 };\r
- //short esa[] = { 1, -1, 1, -1, 0};\r
- //test(es1, es2, esa);\r
- //}\r
- //{\r
- //short es1[] = { 1, -1, 1, -1, 0};\r
- //short es2[] = { -1, 4, -1, 0 };\r
- //short esa[] = { -1, 1, -1, 1, -2, 0};\r
- //test(es1, es2, esa);\r
- //}\r
- {\r
- short es1[] = { 106, -77, 56, -2, 155, -3, 123, -2, 0};\r
- short es2[] = { 50, -36, 34, -3, 12, -6, 1, -6, 18, -17, 60, -5, 349, -56, 0 };\r
- short esa[] = { 0 };\r
- test(es1, es2, esa);\r
- }\r
- exit(0);\r
- }\r