8 An "estring" is an edit string that operates on a sequence.
\r
9 An estring is represented as a vector of integers.
\r
10 It is interpreted in order of increasing suffix.
\r
11 A positive value n means copy n letters.
\r
12 A negative value -n means insert n indels.
\r
13 Zero marks the end of the vector.
\r
14 Consecutive entries must have opposite sign, i.e. the
\r
15 shortest possible representation must be used.
\r
17 A "tpair" is a traceback path for a pairwise alignment
\r
18 represented as two estrings, one for each sequence.
\r
21 #define c2(c,d) (((unsigned char) c) << 8 | (unsigned char) d)
\r
23 unsigned LengthEstring(const short es[])
\r
31 short *EstringNewCopy(const short es[])
\r
33 unsigned n = LengthEstring(es) + 1;
\r
34 short *esNew = new short[n];
\r
35 memcpy(esNew, es, n*sizeof(short));
\r
39 void LogEstring(const short es[])
\r
42 for (unsigned i = 0; es[i] != 0; ++i)
\r
51 static bool EstringsEq(const short es1[], const short es2[])
\r
65 static void EstringCounts(const short es[], unsigned *ptruSymbols,
\r
66 unsigned *ptruIndels)
\r
68 unsigned uSymbols = 0;
\r
69 unsigned uIndels = 0;
\r
70 for (unsigned i = 0; es[i] != 0; ++i)
\r
78 *ptruSymbols = uSymbols;
\r
79 *ptruIndels = uIndels;
\r
82 static char *EstringOp(const short es[], const char s[])
\r
86 EstringCounts(es, &uSymbols, &uIndels);
\r
87 assert((unsigned) strlen(s) == uSymbols);
\r
88 char *sout = new char[uSymbols + uIndels + 1];
\r
96 for (int i = 0; i < n; ++i)
\r
99 for (int i = 0; i < -n; ++i)
\r
107 void EstringOp(const short es[], const Seq &sIn, Seq &sOut)
\r
112 EstringCounts(es, &uSymbols, &uIndels);
\r
113 assert(sIn.Length() == uSymbols);
\r
116 sOut.SetName(sIn.GetName());
\r
124 for (int i = 0; i < n; ++i)
\r
130 for (int i = 0; i < -n; ++i)
\r
131 sOut.push_back('-');
\r
135 unsigned EstringOp(const short es[], const Seq &sIn, MSA &a)
\r
139 EstringCounts(es, &uSymbols, &uIndels);
\r
140 assert(sIn.Length() == uSymbols);
\r
142 unsigned uColCount = uSymbols + uIndels;
\r
145 a.SetSize(1, uColCount);
\r
147 a.SetSeqName(0, sIn.GetName());
\r
148 a.SetSeqId(0, sIn.GetId());
\r
151 unsigned uColIndex = 0;
\r
158 for (int i = 0; i < n; ++i)
\r
161 a.SetChar(0, uColIndex++, c);
\r
164 for (int i = 0; i < -n; ++i)
\r
165 a.SetChar(0, uColIndex++, '-');
\r
167 assert(uColIndex == uColCount);
\r
171 void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB)
\r
173 // First pass to determine size of estrings esA and esB
\r
174 const unsigned uEdgeCount = Path.GetEdgeCount();
\r
175 if (0 == uEdgeCount)
\r
177 short *esA = new short[1];
\r
178 short *esB = new short[1];
\r
186 unsigned iLengthA = 1;
\r
187 unsigned iLengthB = 1;
\r
188 const char cFirstEdgeType = Path.GetEdge(0).cType;
\r
189 char cPrevEdgeType = cFirstEdgeType;
\r
190 for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
\r
192 const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
\r
193 char cEdgeType = Edge.cType;
\r
195 switch (c2(cPrevEdgeType, cEdgeType))
\r
221 cPrevEdgeType = cEdgeType;
\r
226 short *esA = new short[iLengthA+1];
\r
228 switch (Path.GetEdge(0).cType)
\r
243 char cPrevEdgeType = cFirstEdgeType;
\r
244 for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
\r
246 const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
\r
247 char cEdgeType = Edge.cType;
\r
249 switch (c2(cPrevEdgeType, cEdgeType))
\r
278 cPrevEdgeType = cEdgeType;
\r
280 assert(iA == iLengthA - 1);
\r
287 short *esB = new short[iLengthB+1];
\r
289 switch (Path.GetEdge(0).cType)
\r
304 char cPrevEdgeType = cFirstEdgeType;
\r
305 for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
\r
307 const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
\r
308 char cEdgeType = Edge.cType;
\r
310 switch (c2(cPrevEdgeType, cEdgeType))
\r
339 cPrevEdgeType = cEdgeType;
\r
341 assert(iB == iLengthB - 1);
\r
348 const PWEdge &LastEdge = Path.GetEdge(uEdgeCount - 1);
\r
351 EstringCounts(*ptresA, &uSymbols, &uIndels);
\r
352 assert(uSymbols == LastEdge.uPrefixLengthA);
\r
353 assert(uSymbols + uIndels == uEdgeCount);
\r
355 EstringCounts(*ptresB, &uSymbols, &uIndels);
\r
356 assert(uSymbols == LastEdge.uPrefixLengthB);
\r
357 assert(uSymbols + uIndels == uEdgeCount);
\r
360 EstringsToPath(*ptresA, *ptresB, TmpPath);
\r
361 TmpPath.AssertEqual(Path);
\r
366 void EstringsToPath(const short esA[], const short esB[], PWPath &Path)
\r
371 int nA = esA[iA++];
\r
372 int nB = esB[iB++];
\r
373 unsigned uPrefixLengthA = 0;
\r
374 unsigned uPrefixLengthB = 0;
\r
424 Edge.cType = cType;
\r
425 Edge.uPrefixLengthA = uPrefixLengthA;
\r
426 Edge.uPrefixLengthB = uPrefixLengthB;
\r
427 Path.AppendEdge(Edge);
\r
433 assert(0 == esB[iB]);
\r
444 Multiply two estrings to make a third estring.
\r
445 The product of two estrings e1*e2 is defined to be
\r
446 the estring that produces the same result as applying
\r
447 e1 then e2. Multiplication is not commutative. In fact,
\r
448 the reversed order is undefined unless both estrings
\r
449 consist of a single, identical, positive entry.
\r
450 A primary motivation for using estrings is that
\r
451 multiplication is very fast, reducing the time
\r
452 needed to construct the root alignment.
\r
457 <2,-1,2>(-XXX) = -X-XX
\r
461 <-1,3>*<2,-1,2> = <-1,1,-1,2>
\r
464 static bool CanMultiplyEstrings(const short es1[], const short es2[])
\r
466 unsigned uSymbols1;
\r
467 unsigned uSymbols2;
\r
470 EstringCounts(es1, &uSymbols1, &uIndels1);
\r
471 EstringCounts(es2, &uSymbols2, &uIndels2);
\r
472 return uSymbols1 + uIndels1 == uSymbols2;
\r
475 static inline void AppendGaps(short esp[], int &ip, int n)
\r
479 else if (esp[ip] < 0)
\r
485 static inline void AppendSymbols(short esp[], int &ip, int n)
\r
489 else if (esp[ip] > 0)
\r
495 void MulEstrings(const short es1[], const short es2[], short esp[])
\r
497 assert(CanMultiplyEstrings(es1, es2));
\r
501 int n1 = es1[i1++];
\r
502 for (unsigned i2 = 0; ; ++i2)
\r
515 AppendGaps(esp, ip, n1);
\r
519 else if (n2 == -n1)
\r
521 AppendGaps(esp, ip, n1);
\r
528 AppendGaps(esp, ip, -n2);
\r
538 AppendSymbols(esp, ip, n1);
\r
544 AppendSymbols(esp, ip, n1);
\r
551 AppendSymbols(esp, ip, n2);
\r
561 AppendGaps(esp, ip, n2);
\r
568 int MaxLen = (int) (LengthEstring(es1) + LengthEstring(es2) + 1);
\r
569 assert(ip < MaxLen);
\r
571 for (int i = 0; i < ip - 2; ++i)
\r
573 if (!(esp[i] > 0 && esp[i+1] < 0 || esp[i] < 0 && esp[i+1] > 0))
\r
575 Log("Bad result of MulEstring: ");
\r
577 Quit("Assert failed (alternating signs)");
\r
580 unsigned uSymbols1;
\r
581 unsigned uSymbols2;
\r
582 unsigned uSymbolsp;
\r
586 EstringCounts(es1, &uSymbols1, &uIndels1);
\r
587 EstringCounts(es2, &uSymbols2, &uIndels2);
\r
588 EstringCounts(esp, &uSymbolsp, &uIndelsp);
\r
589 if (uSymbols1 + uIndels1 != uSymbols2)
\r
591 Log("Bad result of MulEstring: ");
\r
593 Quit("Assert failed (counts1 %u %u %u)",
\r
594 uSymbols1, uIndels1, uSymbols2);
\r
600 static void test(const short es1[], const short es2[], const short esa[])
\r
602 unsigned uSymbols1;
\r
603 unsigned uSymbols2;
\r
606 EstringCounts(es1, &uSymbols1, &uIndels1);
\r
607 EstringCounts(es2, &uSymbols2, &uIndels2);
\r
610 memset(s, 'X', sizeof(s));
\r
613 char *s1 = EstringOp(es1, s);
\r
614 char *s12 = EstringOp(es2, s1);
\r
616 memset(s, 'X', sizeof(s));
\r
618 char *s2 = EstringOp(es2, s);
\r
620 Log("%s * %s = %s\n", s1, s2, s12);
\r
630 MulEstrings(es1, es2, esp);
\r
632 if (!EstringsEq(esp, esa))
\r
636 memset(s, 'X', sizeof(s));
\r
638 char *sp = EstringOp(esp, s);
\r
640 Log("\n==========\n\n");
\r
643 void TestEstrings()
\r
645 SetListFileName("c:\\tmp\\muscle.log", false);
\r
647 //short es1[] = { -1, 1, -1, 0 };
\r
648 //short es2[] = { 1, -1, 2, 0 };
\r
649 //short esa[] = { -2, 1, -1, 0 };
\r
650 //test(es1, es2, esa);
\r
653 //short es1[] = { 2, -1, 2, 0 };
\r
654 //short es2[] = { 1, -1, 3, -1, 1, 0 };
\r
655 //short esa[] = { 1, -1, 1, -1, 1, -1, 1, 0 };
\r
656 //test(es1, es2, esa);
\r
659 //short es1[] = { -1, 3, 0 };
\r
660 //short es2[] = { 2, -1, 2, 0 };
\r
661 //short esa[] = { -1, 1, -1, 2, 0 };
\r
662 //test(es1, es2, esa);
\r
665 //short es1[] = { -1, 1, -1, 1, 0};
\r
666 //short es2[] = { 4, 0 };
\r
667 //short esa[] = { -1, 1, -1, 1, 0};
\r
668 //test(es1, es2, esa);
\r
671 //short es1[] = { 1, -1, 1, -1, 0};
\r
672 //short es2[] = { 4, 0 };
\r
673 //short esa[] = { 1, -1, 1, -1, 0};
\r
674 //test(es1, es2, esa);
\r
677 //short es1[] = { 1, -1, 1, -1, 0};
\r
678 //short es2[] = { -1, 4, -1, 0 };
\r
679 //short esa[] = { -1, 1, -1, 1, -2, 0};
\r
680 //test(es1, es2, esa);
\r
683 short es1[] = { 106, -77, 56, -2, 155, -3, 123, -2, 0};
\r
684 short es2[] = { 50, -36, 34, -3, 12, -6, 1, -6, 18, -17, 60, -5, 349, -56, 0 };
\r
685 short esa[] = { 0 };
\r
686 test(es1, es2, esa);
\r