Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / muscle / estring.cpp
diff --git a/website/archive/binaries/mac/src/muscle/estring.cpp b/website/archive/binaries/mac/src/muscle/estring.cpp
new file mode 100644 (file)
index 0000000..ef7a576
--- /dev/null
@@ -0,0 +1,689 @@
+#include "muscle.h"\r
+#include "pwpath.h"\r
+#include "estring.h"\r
+#include "seq.h"\r
+#include "msa.h"\r
+\r
+/***\r
+An "estring" is an edit string that operates on a sequence.\r
+An estring is represented as a vector of integers.\r
+It is interpreted in order of increasing suffix.\r
+A positive value n means copy n letters.\r
+A negative value -n means insert n indels.\r
+Zero marks the end of the vector.\r
+Consecutive entries must have opposite sign, i.e. the\r
+shortest possible representation must be used.\r
+\r
+A "tpair" is a traceback path for a pairwise alignment\r
+represented as two estrings, one for each sequence.\r
+***/\r
+\r
+#define c2(c,d)        (((unsigned char) c) << 8 | (unsigned char) d)\r
+\r
+unsigned LengthEstring(const short es[])\r
+       {\r
+       unsigned i = 0;\r
+       while (*es++ != 0)\r
+               ++i;\r
+       return i;\r
+       }\r
+\r
+short *EstringNewCopy(const short es[])\r
+       {\r
+       unsigned n = LengthEstring(es) + 1;\r
+       short *esNew = new short[n];\r
+       memcpy(esNew, es, n*sizeof(short));\r
+       return esNew;\r
+       }\r
+\r
+void LogEstring(const short es[])\r
+       {\r
+       Log("<");\r
+       for (unsigned i = 0; es[i] != 0; ++i)\r
+               {\r
+               if (i > 0)\r
+                       Log(" ");\r
+               Log("%d", es[i]);\r
+               }\r
+       Log(">");\r
+       }\r
+\r
+static bool EstringsEq(const short es1[], const short es2[])\r
+       {\r
+       for (;;)\r
+               {\r
+               if (*es1 != *es2)\r
+                       return false;\r
+               if (0 == *es1)\r
+                       break;\r
+               ++es1;\r
+               ++es2;\r
+               }\r
+       return true;\r
+       }\r
+\r
+static void EstringCounts(const short es[], unsigned *ptruSymbols,\r
+  unsigned *ptruIndels)\r
+       {\r
+       unsigned uSymbols = 0;\r
+       unsigned uIndels = 0;\r
+       for (unsigned i = 0; es[i] != 0; ++i)\r
+               {\r
+               short n = es[i];\r
+               if (n > 0)\r
+                       uSymbols += n;\r
+               else if (n < 0)\r
+                       uIndels += -n;\r
+               }\r
+       *ptruSymbols = uSymbols;\r
+       *ptruIndels = uIndels;\r
+       }\r
+\r
+static char *EstringOp(const short es[], const char s[])\r
+       {\r
+       unsigned uSymbols;\r
+       unsigned uIndels;\r
+       EstringCounts(es, &uSymbols, &uIndels);\r
+       assert((unsigned) strlen(s) == uSymbols);\r
+       char *sout = new char[uSymbols + uIndels + 1];\r
+       char *psout = sout;\r
+       for (;;)\r
+               {\r
+               int n = *es++;\r
+               if (0 == n)\r
+                       break;\r
+               if (n > 0)\r
+                       for (int i = 0; i < n; ++i)\r
+                               *psout++ = *s++;\r
+               else\r
+                       for (int i = 0; i < -n; ++i)\r
+                               *psout++ = '-';\r
+               }\r
+       assert(0 == *s);\r
+       *psout = 0;\r
+       return sout;\r
+       }\r
+\r
+void EstringOp(const short es[], const Seq &sIn, Seq &sOut)\r
+       {\r
+#if    DEBUG\r
+       unsigned uSymbols;\r
+       unsigned uIndels;\r
+       EstringCounts(es, &uSymbols, &uIndels);\r
+       assert(sIn.Length() == uSymbols);\r
+#endif\r
+       sOut.Clear();\r
+       sOut.SetName(sIn.GetName());\r
+       int p = 0;\r
+       for (;;)\r
+               {\r
+               int n = *es++;\r
+               if (0 == n)\r
+                       break;\r
+               if (n > 0)\r
+                       for (int i = 0; i < n; ++i)\r
+                               {\r
+                               char c = sIn[p++];\r
+                               sOut.push_back(c);\r
+                               }\r
+               else\r
+                       for (int i = 0; i < -n; ++i)\r
+                               sOut.push_back('-');\r
+               }\r
+       }\r
+\r
+unsigned EstringOp(const short es[], const Seq &sIn, MSA &a)\r
+       {\r
+       unsigned uSymbols;\r
+       unsigned uIndels;\r
+       EstringCounts(es, &uSymbols, &uIndels);\r
+       assert(sIn.Length() == uSymbols);\r
+\r
+       unsigned uColCount = uSymbols + uIndels;\r
+\r
+       a.Clear();\r
+       a.SetSize(1, uColCount);\r
+\r
+       a.SetSeqName(0, sIn.GetName());\r
+       a.SetSeqId(0, sIn.GetId());\r
+\r
+       unsigned p = 0;\r
+       unsigned uColIndex = 0;\r
+       for (;;)\r
+               {\r
+               int n = *es++;\r
+               if (0 == n)\r
+                       break;\r
+               if (n > 0)\r
+                       for (int i = 0; i < n; ++i)\r
+                               {\r
+                               char c = sIn[p++];\r
+                               a.SetChar(0, uColIndex++, c);\r
+                               }\r
+               else\r
+                       for (int i = 0; i < -n; ++i)\r
+                               a.SetChar(0, uColIndex++, '-');\r
+               }\r
+       assert(uColIndex == uColCount);\r
+       return uColCount;\r
+       }\r
+\r
+void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB)\r
+       {\r
+// First pass to determine size of estrings esA and esB\r
+       const unsigned uEdgeCount = Path.GetEdgeCount();\r
+       if (0 == uEdgeCount)\r
+               {\r
+               short *esA = new short[1];\r
+               short *esB = new short[1];\r
+               esA[0] = 0;\r
+               esB[0] = 0;\r
+               *ptresA = esA;\r
+               *ptresB = esB;\r
+               return;\r
+               }\r
+\r
+       unsigned iLengthA = 1;\r
+       unsigned iLengthB = 1;\r
+       const char cFirstEdgeType = Path.GetEdge(0).cType;\r
+       char cPrevEdgeType = cFirstEdgeType;\r
+       for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)\r
+               {\r
+               const PWEdge &Edge = Path.GetEdge(uEdgeIndex);\r
+               char cEdgeType = Edge.cType;\r
+\r
+               switch (c2(cPrevEdgeType, cEdgeType))\r
+                       {\r
+               case c2('M', 'M'):\r
+               case c2('D', 'D'):\r
+               case c2('I', 'I'):\r
+                       break;\r
+\r
+               case c2('D', 'M'):\r
+               case c2('M', 'D'):\r
+                       ++iLengthB;\r
+                       break;\r
+\r
+               case c2('I', 'M'):\r
+               case c2('M', 'I'):\r
+                       ++iLengthA;\r
+                       break;\r
+\r
+               case c2('I', 'D'):\r
+               case c2('D', 'I'):\r
+                       ++iLengthB;\r
+                       ++iLengthA;\r
+                       break;\r
+\r
+               default:\r
+                       assert(false);\r
+                       }\r
+               cPrevEdgeType = cEdgeType;\r
+               }\r
+\r
+// Pass2 for seq A\r
+       {\r
+       short *esA = new short[iLengthA+1];\r
+       unsigned iA = 0;\r
+       switch (Path.GetEdge(0).cType)\r
+               {\r
+       case 'M':\r
+       case 'D':\r
+               esA[0] = 1;\r
+               break;\r
+\r
+       case 'I':\r
+               esA[0] = -1;\r
+               break;\r
+\r
+       default:\r
+               assert(false);\r
+               }\r
+\r
+       char cPrevEdgeType = cFirstEdgeType;\r
+       for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)\r
+               {\r
+               const PWEdge &Edge = Path.GetEdge(uEdgeIndex);\r
+               char cEdgeType = Edge.cType;\r
+\r
+               switch (c2(cPrevEdgeType, cEdgeType))\r
+                       {\r
+               case c2('M', 'M'):\r
+               case c2('D', 'D'):\r
+               case c2('D', 'M'):\r
+               case c2('M', 'D'):\r
+                       ++(esA[iA]);\r
+                       break;\r
+\r
+               case c2('I', 'D'):\r
+               case c2('I', 'M'):\r
+                       ++iA;\r
+                       esA[iA] = 1;\r
+                       break;\r
+\r
+               case c2('M', 'I'):\r
+               case c2('D', 'I'):\r
+                       ++iA;\r
+                       esA[iA] = -1;\r
+                       break;\r
+\r
+               case c2('I', 'I'):\r
+                       --(esA[iA]);\r
+                       break;\r
+\r
+               default:\r
+                       assert(false);\r
+                       }\r
+\r
+               cPrevEdgeType = cEdgeType;\r
+               }\r
+       assert(iA == iLengthA - 1);\r
+       esA[iLengthA] = 0;\r
+       *ptresA = esA;\r
+       }\r
+\r
+       {\r
+// Pass2 for seq B\r
+       short *esB = new short[iLengthB+1];\r
+       unsigned iB = 0;\r
+       switch (Path.GetEdge(0).cType)\r
+               {\r
+       case 'M':\r
+       case 'I':\r
+               esB[0] = 1;\r
+               break;\r
+\r
+       case 'D':\r
+               esB[0] = -1;\r
+               break;\r
+\r
+       default:\r
+               assert(false);\r
+               }\r
+\r
+       char cPrevEdgeType = cFirstEdgeType;\r
+       for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)\r
+               {\r
+               const PWEdge &Edge = Path.GetEdge(uEdgeIndex);\r
+               char cEdgeType = Edge.cType;\r
+\r
+               switch (c2(cPrevEdgeType, cEdgeType))\r
+                       {\r
+               case c2('M', 'M'):\r
+               case c2('I', 'I'):\r
+               case c2('I', 'M'):\r
+               case c2('M', 'I'):\r
+                       ++(esB[iB]);\r
+                       break;\r
+\r
+               case c2('D', 'I'):\r
+               case c2('D', 'M'):\r
+                       ++iB;\r
+                       esB[iB] = 1;\r
+                       break;\r
+\r
+               case c2('M', 'D'):\r
+               case c2('I', 'D'):\r
+                       ++iB;\r
+                       esB[iB] = -1;\r
+                       break;\r
+\r
+               case c2('D', 'D'):\r
+                       --(esB[iB]);\r
+                       break;\r
+\r
+               default:\r
+                       assert(false);\r
+                       }\r
+\r
+               cPrevEdgeType = cEdgeType;\r
+               }\r
+       assert(iB == iLengthB - 1);\r
+       esB[iLengthB] = 0;\r
+       *ptresB = esB;\r
+       }\r
+\r
+#if    DEBUG\r
+       {\r
+       const PWEdge &LastEdge = Path.GetEdge(uEdgeCount - 1);\r
+       unsigned uSymbols;\r
+       unsigned uIndels;\r
+       EstringCounts(*ptresA, &uSymbols, &uIndels);\r
+       assert(uSymbols == LastEdge.uPrefixLengthA);\r
+       assert(uSymbols + uIndels == uEdgeCount);\r
+\r
+       EstringCounts(*ptresB, &uSymbols, &uIndels);\r
+       assert(uSymbols == LastEdge.uPrefixLengthB);\r
+       assert(uSymbols + uIndels == uEdgeCount);\r
+\r
+       PWPath TmpPath;\r
+       EstringsToPath(*ptresA, *ptresB, TmpPath);\r
+       TmpPath.AssertEqual(Path);\r
+       }\r
+#endif\r
+       }\r
+\r
+void EstringsToPath(const short esA[], const short esB[], PWPath &Path)\r
+       {\r
+       Path.Clear();\r
+       unsigned iA = 0;\r
+       unsigned iB = 0;\r
+       int nA = esA[iA++];\r
+       int nB = esB[iB++];\r
+       unsigned uPrefixLengthA = 0;\r
+       unsigned uPrefixLengthB = 0;\r
+       for (;;)\r
+               {\r
+               char cType;\r
+               if (nA > 0)\r
+                       {\r
+                       if (nB > 0)\r
+                               {\r
+                               cType = 'M';\r
+                               --nA;\r
+                               --nB;\r
+                               }\r
+                       else if (nB < 0)\r
+                               {\r
+                               cType = 'D';\r
+                               --nA;\r
+                               ++nB;\r
+                               }\r
+                       else\r
+                               assert(false);\r
+                       }\r
+               else if (nA < 0)\r
+                       {\r
+                       if (nB > 0)\r
+                               {\r
+                               cType = 'I';\r
+                               ++nA;\r
+                               --nB;\r
+                               }\r
+                       else\r
+                               assert(false);\r
+                       }\r
+               else\r
+                       assert(false);\r
+\r
+               switch (cType)\r
+                       {\r
+               case 'M':\r
+                       ++uPrefixLengthA;\r
+                       ++uPrefixLengthB;\r
+                       break;\r
+               case 'D':\r
+                       ++uPrefixLengthA;\r
+                       break;\r
+               case 'I':\r
+                       ++uPrefixLengthB;\r
+                       break;\r
+                       }\r
+\r
+               PWEdge Edge;\r
+               Edge.cType = cType;\r
+               Edge.uPrefixLengthA = uPrefixLengthA;\r
+               Edge.uPrefixLengthB = uPrefixLengthB;\r
+               Path.AppendEdge(Edge);\r
+\r
+               if (nA == 0)\r
+                       {\r
+                       if (0 == esA[iA])\r
+                               {\r
+                               assert(0 == esB[iB]);\r
+                               break;\r
+                               }\r
+                       nA = esA[iA++];\r
+                       }\r
+               if (nB == 0)\r
+                       nB = esB[iB++];\r
+               }\r
+       }\r
+\r
+/***\r
+Multiply two estrings to make a third estring.\r
+The product of two estrings e1*e2 is defined to be\r
+the estring that produces the same result as applying\r
+e1 then e2. Multiplication is not commutative. In fact,\r
+the reversed order is undefined unless both estrings\r
+consist of a single, identical, positive entry.\r
+A primary motivation for using estrings is that\r
+multiplication is very fast, reducing the time\r
+needed to construct the root alignment.\r
+\r
+Example\r
+\r
+       <-1,3>(XXX)     = -XXX\r
+       <2,-1,2>(-XXX) = -X-XX\r
+\r
+Therefore,\r
+\r
+       <-1,3>*<2,-1,2> = <-1,1,-1,2>\r
+***/\r
+\r
+static bool CanMultiplyEstrings(const short es1[], const short es2[])\r
+       {\r
+       unsigned uSymbols1;\r
+       unsigned uSymbols2;\r
+       unsigned uIndels1;\r
+       unsigned uIndels2;\r
+       EstringCounts(es1, &uSymbols1, &uIndels1);\r
+       EstringCounts(es2, &uSymbols2, &uIndels2);\r
+       return uSymbols1 + uIndels1 == uSymbols2;\r
+       }\r
+\r
+static inline void AppendGaps(short esp[], int &ip, int n)\r
+       {\r
+       if (-1 == ip)\r
+               esp[++ip] = n;\r
+       else if (esp[ip] < 0)\r
+               esp[ip] += n;\r
+       else\r
+               esp[++ip] = n;\r
+       }\r
+\r
+static inline void AppendSymbols(short esp[], int &ip, int n)\r
+       {\r
+       if (-1 == ip)\r
+               esp[++ip] = n;\r
+       else if (esp[ip] > 0)\r
+               esp[ip] += n;\r
+       else\r
+               esp[++ip] = n;\r
+       }\r
+\r
+void MulEstrings(const short es1[], const short es2[], short esp[])\r
+       {\r
+       assert(CanMultiplyEstrings(es1, es2));\r
+\r
+       unsigned i1 = 0;\r
+       int ip = -1;\r
+       int n1 = es1[i1++];\r
+       for (unsigned i2 = 0; ; ++i2)\r
+               {\r
+               int n2 = es2[i2];\r
+               if (0 == n2)\r
+                       break;\r
+               if (n2 > 0)\r
+                       {\r
+                       for (;;)\r
+                               {\r
+                               if (n1 < 0)\r
+                                       {\r
+                                       if (n2 > -n1)\r
+                                               {\r
+                                               AppendGaps(esp, ip, n1);\r
+                                               n2 += n1;\r
+                                               n1 = es1[i1++];\r
+                                               }\r
+                                       else if (n2 == -n1)\r
+                                               {\r
+                                               AppendGaps(esp, ip, n1);\r
+                                               n1 = es1[i1++];\r
+                                               break;\r
+                                               }\r
+                                       else\r
+                                               {\r
+                                               assert(n2 < -n1);\r
+                                               AppendGaps(esp, ip, -n2);\r
+                                               n1 += n2;\r
+                                               break;\r
+                                               }\r
+                                       }\r
+                               else\r
+                                       {\r
+                                       assert(n1 > 0);\r
+                                       if (n2 > n1)\r
+                                               {\r
+                                               AppendSymbols(esp, ip, n1);\r
+                                               n2 -= n1;\r
+                                               n1 = es1[i1++];\r
+                                               }\r
+                                       else if (n2 == n1)\r
+                                               {\r
+                                               AppendSymbols(esp, ip, n1);\r
+                                               n1 = es1[i1++];\r
+                                               break;\r
+                                               }\r
+                                       else\r
+                                               {\r
+                                               assert(n2 < n1);\r
+                                               AppendSymbols(esp, ip, n2);\r
+                                               n1 -= n2;\r
+                                               break;\r
+                                               }\r
+                                       }\r
+                               }\r
+                       }\r
+               else\r
+                       {\r
+                       assert(n2 < 0);\r
+                       AppendGaps(esp, ip, n2);\r
+                       }\r
+               }\r
+       esp[++ip] = 0;\r
+\r
+#if    DEBUG\r
+       {\r
+       int MaxLen = (int) (LengthEstring(es1) + LengthEstring(es2) + 1);\r
+       assert(ip < MaxLen);\r
+       if (ip >= 2)\r
+               for (int i = 0; i < ip - 2; ++i)\r
+                       {\r
+                       if (!(esp[i] > 0 && esp[i+1] < 0 || esp[i] < 0 && esp[i+1] > 0))\r
+                               {\r
+                               Log("Bad result of MulEstring: ");\r
+                               LogEstring(esp);\r
+                               Quit("Assert failed (alternating signs)");\r
+                               }\r
+                       }\r
+       unsigned uSymbols1;\r
+       unsigned uSymbols2;\r
+       unsigned uSymbolsp;\r
+       unsigned uIndels1;\r
+       unsigned uIndels2;\r
+       unsigned uIndelsp;\r
+       EstringCounts(es1, &uSymbols1, &uIndels1);\r
+       EstringCounts(es2, &uSymbols2, &uIndels2);\r
+       EstringCounts(esp, &uSymbolsp, &uIndelsp);\r
+       if (uSymbols1 + uIndels1 != uSymbols2)\r
+               {\r
+               Log("Bad result of MulEstring: ");\r
+               LogEstring(esp);\r
+               Quit("Assert failed (counts1 %u %u %u)",\r
+                 uSymbols1, uIndels1, uSymbols2);\r
+               }\r
+       }\r
+#endif\r
+       }\r
+\r
+static void test(const short es1[], const short es2[], const short esa[])\r
+       {\r
+       unsigned uSymbols1;\r
+       unsigned uSymbols2;\r
+       unsigned uIndels1;\r
+       unsigned uIndels2;\r
+       EstringCounts(es1, &uSymbols1, &uIndels1);\r
+       EstringCounts(es2, &uSymbols2, &uIndels2);\r
+\r
+       char s[4096];\r
+       memset(s, 'X', sizeof(s));\r
+       s[uSymbols1] = 0;\r
+\r
+       char *s1 = EstringOp(es1, s);\r
+       char *s12 = EstringOp(es2, s1);\r
+\r
+       memset(s, 'X', sizeof(s));\r
+       s[uSymbols2] = 0;\r
+       char *s2 = EstringOp(es2, s);\r
+\r
+       Log("%s * %s = %s\n", s1, s2, s12);\r
+\r
+       LogEstring(es1);\r
+       Log(" * ");\r
+       LogEstring(es2);\r
+       Log(" = ");\r
+       LogEstring(esa);\r
+       Log("\n");\r
+\r
+       short esp[4096];\r
+       MulEstrings(es1, es2, esp);\r
+       LogEstring(esp);\r
+       if (!EstringsEq(esp, esa))\r
+               Log(" *ERROR* ");\r
+       Log("\n");\r
+\r
+       memset(s, 'X', sizeof(s));\r
+       s[uSymbols1] = 0;\r
+       char *sp = EstringOp(esp, s);\r
+       Log("%s\n", sp);\r
+       Log("\n==========\n\n");\r
+       }\r
+\r
+void TestEstrings()\r
+       {\r
+       SetListFileName("c:\\tmp\\muscle.log", false);\r
+       //{\r
+       //short es1[] = { -1, 1, -1, 0 };\r
+       //short es2[] = { 1, -1, 2, 0 };\r
+       //short esa[] = { -2, 1, -1, 0 };\r
+       //test(es1, es2, esa);\r
+       //}\r
+       //{\r
+       //short es1[] = { 2, -1, 2, 0 };\r
+       //short es2[] = { 1, -1, 3, -1, 1, 0 };\r
+       //short esa[] = { 1, -1, 1, -1, 1, -1, 1, 0 };\r
+       //test(es1, es2, esa);\r
+       //}\r
+       //{\r
+       //short es1[] = { -1, 3, 0 };\r
+       //short es2[] = { 2, -1, 2, 0 };\r
+       //short esa[] = { -1, 1, -1, 2, 0 };\r
+       //test(es1, es2, esa);\r
+       //}\r
+       //{\r
+       //short es1[] = { -1, 1, -1, 1, 0};\r
+       //short es2[] = { 4, 0 };\r
+       //short esa[] = { -1, 1, -1, 1, 0};\r
+       //test(es1, es2, esa);\r
+       //}\r
+       //{\r
+       //short es1[] = { 1, -1, 1, -1, 0};\r
+       //short es2[] = { 4, 0 };\r
+       //short esa[] = { 1, -1, 1, -1, 0};\r
+       //test(es1, es2, esa);\r
+       //}\r
+       //{\r
+       //short es1[] = { 1, -1, 1, -1, 0};\r
+       //short es2[] = { -1, 4, -1, 0 };\r
+       //short esa[] = { -1, 1, -1, 1, -2, 0};\r
+       //test(es1, es2, esa);\r
+       //}\r
+       {\r
+       short es1[] = { 106, -77, 56, -2, 155, -3, 123, -2, 0};\r
+       short es2[] = { 50, -36, 34, -3, 12, -6, 1, -6, 18, -17, 60, -5, 349, -56, 0 };\r
+       short esa[] = { 0 };\r
+       test(es1, es2, esa);\r
+       }\r
+       exit(0);\r
+       }\r