--- /dev/null
+#include "muscle.h"\r
+#include "pwpath.h"\r
+#include "estring.h"\r
+#include "seq.h"\r
+#include "msa.h"\r
+\r
+/***\r
+An "estring" is an edit string that operates on a sequence.\r
+An estring is represented as a vector of integers.\r
+It is interpreted in order of increasing suffix.\r
+A positive value n means copy n letters.\r
+A negative value -n means insert n indels.\r
+Zero marks the end of the vector.\r
+Consecutive entries must have opposite sign, i.e. the\r
+shortest possible representation must be used.\r
+\r
+A "tpair" is a traceback path for a pairwise alignment\r
+represented as two estrings, one for each sequence.\r
+***/\r
+\r
+#define c2(c,d) (((unsigned char) c) << 8 | (unsigned char) d)\r
+\r
+unsigned LengthEstring(const short es[])\r
+ {\r
+ unsigned i = 0;\r
+ while (*es++ != 0)\r
+ ++i;\r
+ return i;\r
+ }\r
+\r
+short *EstringNewCopy(const short es[])\r
+ {\r
+ unsigned n = LengthEstring(es) + 1;\r
+ short *esNew = new short[n];\r
+ memcpy(esNew, es, n*sizeof(short));\r
+ return esNew;\r
+ }\r
+\r
+void LogEstring(const short es[])\r
+ {\r
+ Log("<");\r
+ for (unsigned i = 0; es[i] != 0; ++i)\r
+ {\r
+ if (i > 0)\r
+ Log(" ");\r
+ Log("%d", es[i]);\r
+ }\r
+ Log(">");\r
+ }\r
+\r
+static bool EstringsEq(const short es1[], const short es2[])\r
+ {\r
+ for (;;)\r
+ {\r
+ if (*es1 != *es2)\r
+ return false;\r
+ if (0 == *es1)\r
+ break;\r
+ ++es1;\r
+ ++es2;\r
+ }\r
+ return true;\r
+ }\r
+\r
+static void EstringCounts(const short es[], unsigned *ptruSymbols,\r
+ unsigned *ptruIndels)\r
+ {\r
+ unsigned uSymbols = 0;\r
+ unsigned uIndels = 0;\r
+ for (unsigned i = 0; es[i] != 0; ++i)\r
+ {\r
+ short n = es[i];\r
+ if (n > 0)\r
+ uSymbols += n;\r
+ else if (n < 0)\r
+ uIndels += -n;\r
+ }\r
+ *ptruSymbols = uSymbols;\r
+ *ptruIndels = uIndels;\r
+ }\r
+\r
+static char *EstringOp(const short es[], const char s[])\r
+ {\r
+ unsigned uSymbols;\r
+ unsigned uIndels;\r
+ EstringCounts(es, &uSymbols, &uIndels);\r
+ assert((unsigned) strlen(s) == uSymbols);\r
+ char *sout = new char[uSymbols + uIndels + 1];\r
+ char *psout = sout;\r
+ for (;;)\r
+ {\r
+ int n = *es++;\r
+ if (0 == n)\r
+ break;\r
+ if (n > 0)\r
+ for (int i = 0; i < n; ++i)\r
+ *psout++ = *s++;\r
+ else\r
+ for (int i = 0; i < -n; ++i)\r
+ *psout++ = '-';\r
+ }\r
+ assert(0 == *s);\r
+ *psout = 0;\r
+ return sout;\r
+ }\r
+\r
+void EstringOp(const short es[], const Seq &sIn, Seq &sOut)\r
+ {\r
+#if DEBUG\r
+ unsigned uSymbols;\r
+ unsigned uIndels;\r
+ EstringCounts(es, &uSymbols, &uIndels);\r
+ assert(sIn.Length() == uSymbols);\r
+#endif\r
+ sOut.Clear();\r
+ sOut.SetName(sIn.GetName());\r
+ int p = 0;\r
+ for (;;)\r
+ {\r
+ int n = *es++;\r
+ if (0 == n)\r
+ break;\r
+ if (n > 0)\r
+ for (int i = 0; i < n; ++i)\r
+ {\r
+ char c = sIn[p++];\r
+ sOut.push_back(c);\r
+ }\r
+ else\r
+ for (int i = 0; i < -n; ++i)\r
+ sOut.push_back('-');\r
+ }\r
+ }\r
+\r
+unsigned EstringOp(const short es[], const Seq &sIn, MSA &a)\r
+ {\r
+ unsigned uSymbols;\r
+ unsigned uIndels;\r
+ EstringCounts(es, &uSymbols, &uIndels);\r
+ assert(sIn.Length() == uSymbols);\r
+\r
+ unsigned uColCount = uSymbols + uIndels;\r
+\r
+ a.Clear();\r
+ a.SetSize(1, uColCount);\r
+\r
+ a.SetSeqName(0, sIn.GetName());\r
+ a.SetSeqId(0, sIn.GetId());\r
+\r
+ unsigned p = 0;\r
+ unsigned uColIndex = 0;\r
+ for (;;)\r
+ {\r
+ int n = *es++;\r
+ if (0 == n)\r
+ break;\r
+ if (n > 0)\r
+ for (int i = 0; i < n; ++i)\r
+ {\r
+ char c = sIn[p++];\r
+ a.SetChar(0, uColIndex++, c);\r
+ }\r
+ else\r
+ for (int i = 0; i < -n; ++i)\r
+ a.SetChar(0, uColIndex++, '-');\r
+ }\r
+ assert(uColIndex == uColCount);\r
+ return uColCount;\r
+ }\r
+\r
+void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB)\r
+ {\r
+// First pass to determine size of estrings esA and esB\r
+ const unsigned uEdgeCount = Path.GetEdgeCount();\r
+ if (0 == uEdgeCount)\r
+ {\r
+ short *esA = new short[1];\r
+ short *esB = new short[1];\r
+ esA[0] = 0;\r
+ esB[0] = 0;\r
+ *ptresA = esA;\r
+ *ptresB = esB;\r
+ return;\r
+ }\r
+\r
+ unsigned iLengthA = 1;\r
+ unsigned iLengthB = 1;\r
+ const char cFirstEdgeType = Path.GetEdge(0).cType;\r
+ char cPrevEdgeType = cFirstEdgeType;\r
+ for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)\r
+ {\r
+ const PWEdge &Edge = Path.GetEdge(uEdgeIndex);\r
+ char cEdgeType = Edge.cType;\r
+\r
+ switch (c2(cPrevEdgeType, cEdgeType))\r
+ {\r
+ case c2('M', 'M'):\r
+ case c2('D', 'D'):\r
+ case c2('I', 'I'):\r
+ break;\r
+\r
+ case c2('D', 'M'):\r
+ case c2('M', 'D'):\r
+ ++iLengthB;\r
+ break;\r
+\r
+ case c2('I', 'M'):\r
+ case c2('M', 'I'):\r
+ ++iLengthA;\r
+ break;\r
+\r
+ case c2('I', 'D'):\r
+ case c2('D', 'I'):\r
+ ++iLengthB;\r
+ ++iLengthA;\r
+ break;\r
+\r
+ default:\r
+ assert(false);\r
+ }\r
+ cPrevEdgeType = cEdgeType;\r
+ }\r
+\r
+// Pass2 for seq A\r
+ {\r
+ short *esA = new short[iLengthA+1];\r
+ unsigned iA = 0;\r
+ switch (Path.GetEdge(0).cType)\r
+ {\r
+ case 'M':\r
+ case 'D':\r
+ esA[0] = 1;\r
+ break;\r
+\r
+ case 'I':\r
+ esA[0] = -1;\r
+ break;\r
+\r
+ default:\r
+ assert(false);\r
+ }\r
+\r
+ char cPrevEdgeType = cFirstEdgeType;\r
+ for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)\r
+ {\r
+ const PWEdge &Edge = Path.GetEdge(uEdgeIndex);\r
+ char cEdgeType = Edge.cType;\r
+\r
+ switch (c2(cPrevEdgeType, cEdgeType))\r
+ {\r
+ case c2('M', 'M'):\r
+ case c2('D', 'D'):\r
+ case c2('D', 'M'):\r
+ case c2('M', 'D'):\r
+ ++(esA[iA]);\r
+ break;\r
+\r
+ case c2('I', 'D'):\r
+ case c2('I', 'M'):\r
+ ++iA;\r
+ esA[iA] = 1;\r
+ break;\r
+\r
+ case c2('M', 'I'):\r
+ case c2('D', 'I'):\r
+ ++iA;\r
+ esA[iA] = -1;\r
+ break;\r
+\r
+ case c2('I', 'I'):\r
+ --(esA[iA]);\r
+ break;\r
+\r
+ default:\r
+ assert(false);\r
+ }\r
+\r
+ cPrevEdgeType = cEdgeType;\r
+ }\r
+ assert(iA == iLengthA - 1);\r
+ esA[iLengthA] = 0;\r
+ *ptresA = esA;\r
+ }\r
+\r
+ {\r
+// Pass2 for seq B\r
+ short *esB = new short[iLengthB+1];\r
+ unsigned iB = 0;\r
+ switch (Path.GetEdge(0).cType)\r
+ {\r
+ case 'M':\r
+ case 'I':\r
+ esB[0] = 1;\r
+ break;\r
+\r
+ case 'D':\r
+ esB[0] = -1;\r
+ break;\r
+\r
+ default:\r
+ assert(false);\r
+ }\r
+\r
+ char cPrevEdgeType = cFirstEdgeType;\r
+ for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)\r
+ {\r
+ const PWEdge &Edge = Path.GetEdge(uEdgeIndex);\r
+ char cEdgeType = Edge.cType;\r
+\r
+ switch (c2(cPrevEdgeType, cEdgeType))\r
+ {\r
+ case c2('M', 'M'):\r
+ case c2('I', 'I'):\r
+ case c2('I', 'M'):\r
+ case c2('M', 'I'):\r
+ ++(esB[iB]);\r
+ break;\r
+\r
+ case c2('D', 'I'):\r
+ case c2('D', 'M'):\r
+ ++iB;\r
+ esB[iB] = 1;\r
+ break;\r
+\r
+ case c2('M', 'D'):\r
+ case c2('I', 'D'):\r
+ ++iB;\r
+ esB[iB] = -1;\r
+ break;\r
+\r
+ case c2('D', 'D'):\r
+ --(esB[iB]);\r
+ break;\r
+\r
+ default:\r
+ assert(false);\r
+ }\r
+\r
+ cPrevEdgeType = cEdgeType;\r
+ }\r
+ assert(iB == iLengthB - 1);\r
+ esB[iLengthB] = 0;\r
+ *ptresB = esB;\r
+ }\r
+\r
+#if DEBUG\r
+ {\r
+ const PWEdge &LastEdge = Path.GetEdge(uEdgeCount - 1);\r
+ unsigned uSymbols;\r
+ unsigned uIndels;\r
+ EstringCounts(*ptresA, &uSymbols, &uIndels);\r
+ assert(uSymbols == LastEdge.uPrefixLengthA);\r
+ assert(uSymbols + uIndels == uEdgeCount);\r
+\r
+ EstringCounts(*ptresB, &uSymbols, &uIndels);\r
+ assert(uSymbols == LastEdge.uPrefixLengthB);\r
+ assert(uSymbols + uIndels == uEdgeCount);\r
+\r
+ PWPath TmpPath;\r
+ EstringsToPath(*ptresA, *ptresB, TmpPath);\r
+ TmpPath.AssertEqual(Path);\r
+ }\r
+#endif\r
+ }\r
+\r
+void EstringsToPath(const short esA[], const short esB[], PWPath &Path)\r
+ {\r
+ Path.Clear();\r
+ unsigned iA = 0;\r
+ unsigned iB = 0;\r
+ int nA = esA[iA++];\r
+ int nB = esB[iB++];\r
+ unsigned uPrefixLengthA = 0;\r
+ unsigned uPrefixLengthB = 0;\r
+ for (;;)\r
+ {\r
+ char cType;\r
+ if (nA > 0)\r
+ {\r
+ if (nB > 0)\r
+ {\r
+ cType = 'M';\r
+ --nA;\r
+ --nB;\r
+ }\r
+ else if (nB < 0)\r
+ {\r
+ cType = 'D';\r
+ --nA;\r
+ ++nB;\r
+ }\r
+ else\r
+ assert(false);\r
+ }\r
+ else if (nA < 0)\r
+ {\r
+ if (nB > 0)\r
+ {\r
+ cType = 'I';\r
+ ++nA;\r
+ --nB;\r
+ }\r
+ else\r
+ assert(false);\r
+ }\r
+ else\r
+ assert(false);\r
+\r
+ switch (cType)\r
+ {\r
+ case 'M':\r
+ ++uPrefixLengthA;\r
+ ++uPrefixLengthB;\r
+ break;\r
+ case 'D':\r
+ ++uPrefixLengthA;\r
+ break;\r
+ case 'I':\r
+ ++uPrefixLengthB;\r
+ break;\r
+ }\r
+\r
+ PWEdge Edge;\r
+ Edge.cType = cType;\r
+ Edge.uPrefixLengthA = uPrefixLengthA;\r
+ Edge.uPrefixLengthB = uPrefixLengthB;\r
+ Path.AppendEdge(Edge);\r
+\r
+ if (nA == 0)\r
+ {\r
+ if (0 == esA[iA])\r
+ {\r
+ assert(0 == esB[iB]);\r
+ break;\r
+ }\r
+ nA = esA[iA++];\r
+ }\r
+ if (nB == 0)\r
+ nB = esB[iB++];\r
+ }\r
+ }\r
+\r
+/***\r
+Multiply two estrings to make a third estring.\r
+The product of two estrings e1*e2 is defined to be\r
+the estring that produces the same result as applying\r
+e1 then e2. Multiplication is not commutative. In fact,\r
+the reversed order is undefined unless both estrings\r
+consist of a single, identical, positive entry.\r
+A primary motivation for using estrings is that\r
+multiplication is very fast, reducing the time\r
+needed to construct the root alignment.\r
+\r
+Example\r
+\r
+ <-1,3>(XXX) = -XXX\r
+ <2,-1,2>(-XXX) = -X-XX\r
+\r
+Therefore,\r
+\r
+ <-1,3>*<2,-1,2> = <-1,1,-1,2>\r
+***/\r
+\r
+static bool CanMultiplyEstrings(const short es1[], const short es2[])\r
+ {\r
+ unsigned uSymbols1;\r
+ unsigned uSymbols2;\r
+ unsigned uIndels1;\r
+ unsigned uIndels2;\r
+ EstringCounts(es1, &uSymbols1, &uIndels1);\r
+ EstringCounts(es2, &uSymbols2, &uIndels2);\r
+ return uSymbols1 + uIndels1 == uSymbols2;\r
+ }\r
+\r
+static inline void AppendGaps(short esp[], int &ip, int n)\r
+ {\r
+ if (-1 == ip)\r
+ esp[++ip] = n;\r
+ else if (esp[ip] < 0)\r
+ esp[ip] += n;\r
+ else\r
+ esp[++ip] = n;\r
+ }\r
+\r
+static inline void AppendSymbols(short esp[], int &ip, int n)\r
+ {\r
+ if (-1 == ip)\r
+ esp[++ip] = n;\r
+ else if (esp[ip] > 0)\r
+ esp[ip] += n;\r
+ else\r
+ esp[++ip] = n;\r
+ }\r
+\r
+void MulEstrings(const short es1[], const short es2[], short esp[])\r
+ {\r
+ assert(CanMultiplyEstrings(es1, es2));\r
+\r
+ unsigned i1 = 0;\r
+ int ip = -1;\r
+ int n1 = es1[i1++];\r
+ for (unsigned i2 = 0; ; ++i2)\r
+ {\r
+ int n2 = es2[i2];\r
+ if (0 == n2)\r
+ break;\r
+ if (n2 > 0)\r
+ {\r
+ for (;;)\r
+ {\r
+ if (n1 < 0)\r
+ {\r
+ if (n2 > -n1)\r
+ {\r
+ AppendGaps(esp, ip, n1);\r
+ n2 += n1;\r
+ n1 = es1[i1++];\r
+ }\r
+ else if (n2 == -n1)\r
+ {\r
+ AppendGaps(esp, ip, n1);\r
+ n1 = es1[i1++];\r
+ break;\r
+ }\r
+ else\r
+ {\r
+ assert(n2 < -n1);\r
+ AppendGaps(esp, ip, -n2);\r
+ n1 += n2;\r
+ break;\r
+ }\r
+ }\r
+ else\r
+ {\r
+ assert(n1 > 0);\r
+ if (n2 > n1)\r
+ {\r
+ AppendSymbols(esp, ip, n1);\r
+ n2 -= n1;\r
+ n1 = es1[i1++];\r
+ }\r
+ else if (n2 == n1)\r
+ {\r
+ AppendSymbols(esp, ip, n1);\r
+ n1 = es1[i1++];\r
+ break;\r
+ }\r
+ else\r
+ {\r
+ assert(n2 < n1);\r
+ AppendSymbols(esp, ip, n2);\r
+ n1 -= n2;\r
+ break;\r
+ }\r
+ }\r
+ }\r
+ }\r
+ else\r
+ {\r
+ assert(n2 < 0);\r
+ AppendGaps(esp, ip, n2);\r
+ }\r
+ }\r
+ esp[++ip] = 0;\r
+\r
+#if DEBUG\r
+ {\r
+ int MaxLen = (int) (LengthEstring(es1) + LengthEstring(es2) + 1);\r
+ assert(ip < MaxLen);\r
+ if (ip >= 2)\r
+ for (int i = 0; i < ip - 2; ++i)\r
+ {\r
+ if (!(esp[i] > 0 && esp[i+1] < 0 || esp[i] < 0 && esp[i+1] > 0))\r
+ {\r
+ Log("Bad result of MulEstring: ");\r
+ LogEstring(esp);\r
+ Quit("Assert failed (alternating signs)");\r
+ }\r
+ }\r
+ unsigned uSymbols1;\r
+ unsigned uSymbols2;\r
+ unsigned uSymbolsp;\r
+ unsigned uIndels1;\r
+ unsigned uIndels2;\r
+ unsigned uIndelsp;\r
+ EstringCounts(es1, &uSymbols1, &uIndels1);\r
+ EstringCounts(es2, &uSymbols2, &uIndels2);\r
+ EstringCounts(esp, &uSymbolsp, &uIndelsp);\r
+ if (uSymbols1 + uIndels1 != uSymbols2)\r
+ {\r
+ Log("Bad result of MulEstring: ");\r
+ LogEstring(esp);\r
+ Quit("Assert failed (counts1 %u %u %u)",\r
+ uSymbols1, uIndels1, uSymbols2);\r
+ }\r
+ }\r
+#endif\r
+ }\r
+\r
+static void test(const short es1[], const short es2[], const short esa[])\r
+ {\r
+ unsigned uSymbols1;\r
+ unsigned uSymbols2;\r
+ unsigned uIndels1;\r
+ unsigned uIndels2;\r
+ EstringCounts(es1, &uSymbols1, &uIndels1);\r
+ EstringCounts(es2, &uSymbols2, &uIndels2);\r
+\r
+ char s[4096];\r
+ memset(s, 'X', sizeof(s));\r
+ s[uSymbols1] = 0;\r
+\r
+ char *s1 = EstringOp(es1, s);\r
+ char *s12 = EstringOp(es2, s1);\r
+\r
+ memset(s, 'X', sizeof(s));\r
+ s[uSymbols2] = 0;\r
+ char *s2 = EstringOp(es2, s);\r
+\r
+ Log("%s * %s = %s\n", s1, s2, s12);\r
+\r
+ LogEstring(es1);\r
+ Log(" * ");\r
+ LogEstring(es2);\r
+ Log(" = ");\r
+ LogEstring(esa);\r
+ Log("\n");\r
+\r
+ short esp[4096];\r
+ MulEstrings(es1, es2, esp);\r
+ LogEstring(esp);\r
+ if (!EstringsEq(esp, esa))\r
+ Log(" *ERROR* ");\r
+ Log("\n");\r
+\r
+ memset(s, 'X', sizeof(s));\r
+ s[uSymbols1] = 0;\r
+ char *sp = EstringOp(esp, s);\r
+ Log("%s\n", sp);\r
+ Log("\n==========\n\n");\r
+ }\r
+\r
+void TestEstrings()\r
+ {\r
+ SetListFileName("c:\\tmp\\muscle.log", false);\r
+ //{\r
+ //short es1[] = { -1, 1, -1, 0 };\r
+ //short es2[] = { 1, -1, 2, 0 };\r
+ //short esa[] = { -2, 1, -1, 0 };\r
+ //test(es1, es2, esa);\r
+ //}\r
+ //{\r
+ //short es1[] = { 2, -1, 2, 0 };\r
+ //short es2[] = { 1, -1, 3, -1, 1, 0 };\r
+ //short esa[] = { 1, -1, 1, -1, 1, -1, 1, 0 };\r
+ //test(es1, es2, esa);\r
+ //}\r
+ //{\r
+ //short es1[] = { -1, 3, 0 };\r
+ //short es2[] = { 2, -1, 2, 0 };\r
+ //short esa[] = { -1, 1, -1, 2, 0 };\r
+ //test(es1, es2, esa);\r
+ //}\r
+ //{\r
+ //short es1[] = { -1, 1, -1, 1, 0};\r
+ //short es2[] = { 4, 0 };\r
+ //short esa[] = { -1, 1, -1, 1, 0};\r
+ //test(es1, es2, esa);\r
+ //}\r
+ //{\r
+ //short es1[] = { 1, -1, 1, -1, 0};\r
+ //short es2[] = { 4, 0 };\r
+ //short esa[] = { 1, -1, 1, -1, 0};\r
+ //test(es1, es2, esa);\r
+ //}\r
+ //{\r
+ //short es1[] = { 1, -1, 1, -1, 0};\r
+ //short es2[] = { -1, 4, -1, 0 };\r
+ //short esa[] = { -1, 1, -1, 1, -2, 0};\r
+ //test(es1, es2, esa);\r
+ //}\r
+ {\r
+ short es1[] = { 106, -77, 56, -2, 155, -3, 123, -2, 0};\r
+ short es2[] = { 50, -36, 34, -3, 12, -6, 1, -6, 18, -17, 60, -5, 349, -56, 0 };\r
+ short esa[] = { 0 };\r
+ test(es1, es2, esa);\r
+ }\r
+ exit(0);\r
+ }\r