1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
11 /*****************************************************************
12 * This code is an altered version of Henry Spencer's
13 * regex library. Alterations are limited to minor streamlining,
14 * and some name changes to protect the SQUID namespace.
15 * Henry's copyright notice appears below.
16 * You can obtain the original from
17 * ftp://ftp.zoo.toronto.edu/pub/bookregex.tar.Z
20 * SRE, Fri Aug 28 11:10:17 1998
21 * RCS $Id: hsregex.c,v 1.1.1.1 2005/03/22 08:34:17 cmzmasek Exp $
22 *****************************************************************/
30 /* global sqd_parse[] are managed by Strparse().
31 * WARNING: TODO: this code is not threadsafe, and needs to be revised.
35 /* Function: Strparse()
37 * Purpose: Match a regexp to a string. Returns 1 if pattern matches,
40 * Much like Perl, Strparse() makes copies of the matching
41 * substrings available via globals, sqd_parse[].
42 * sqd_parse[0] contains a copy of the complete matched
43 * text. sqd_parse[1-9] contain copies of up to nine
44 * different substrings matched within parentheses.
45 * The memory for these strings is internally managed and
46 * volatile; the next call to Strparse() may destroy them.
47 * If the caller needs the matched substrings to persist
48 * beyond a new Strparse() call, it must make its own
51 * A minor drawback of the memory management is that
52 * there will be a small amount of unfree'd memory being
53 * managed by Strparse() when a program exits; this may
54 * confuse memory debugging (Purify, dbmalloc). The
55 * general cleanup function SqdClean() is provided;
56 * you can call this before exiting.
58 * Uses an extended POSIX regular expression interface.
59 * A copylefted GNU implementation is included in the squid
60 * implementation (gnuregex.c) for use on non-POSIX compliant
61 * systems. POSIX 1003.2-compliant systems (all UNIX,
62 * some WinNT, I believe) can omit the GNU code if necessary.
64 * I built this for ease of use, not speed nor efficiency.
66 * Example: Strparse("foo-...-baz", "foo-bar-baz") returns 0
67 * Strparse("foo-(...)-baz", "foo-bar-baz")
68 * returns 0; sqd_parse[0] is "foo-bar-baz";
69 * sqd_parse[1] is "bar".
71 * Args: rexp - regular expression, extended POSIX form
72 * s - string to match against
73 * ntok - number of () substrings we will save (maximum NSUBEXP-1)
75 * Return: 1 on match, 0 if no match
78 Strparse(char *rexp, char *s, int ntok)
85 if (ntok >= NSUBEXP ) Die("Strparse(): ntok must be <= %d", NSUBEXP-1);
87 /* Free previous global substring buffers
89 for (i = 0; i <= ntok; i++)
90 if (sqd_parse[i] != NULL)
96 /* Compile and match the pattern, using our modified
97 * copy of Henry Spencer's regexp library
99 if ((pat = sqd_regcomp(rexp)) == NULL)
100 Die("regexp compilation failed.");
101 code = sqd_regexec(pat, s);
103 /* Fill the global substring buffers
106 for (i = 0; i <= ntok; i++)
107 if (pat->startp[i] != NULL && pat->endp[i] != NULL)
109 len = pat->endp[i] - pat->startp[i];
110 sqd_parse[i] = (char *) MallocOrDie(sizeof(char) * (len+1));
111 strncpy(sqd_parse[i], pat->startp[i], len);
112 sqd_parse[i][len] = '\0';
119 /* Function: SqdClean()
120 * Date: SRE, Wed Oct 29 12:52:08 1997 [TWA 721]
122 * Purpose: Clean up any squid library allocations before exiting
123 * a program, so we don't leave unfree'd memory around
124 * and confuse a malloc debugger like Purify or dbmalloc.
131 /* Free global substring buffers that Strparse() uses
133 for (i = 0; i <= 9; i++)
134 if (sqd_parse[i] != NULL) {
142 /* all code below is:
143 * Copyright (c) 1986, 1993, 1995 by University of Toronto.
144 * Written by Henry Spencer. Not derived from licensed software.
146 * Permission is granted to anyone to use this software for any
147 * purpose on any computer system, and to redistribute it in any way,
148 * subject to the following restrictions:
150 * 1. The author is not responsible for the consequences of use of
151 * this software, no matter how awful, even if they arise
152 * from defects in it.
154 * 2. The origin of this software must not be misrepresented, either
155 * by explicit claim or by omission.
157 * 3. Altered versions must be plainly marked as such, and must not
158 * be misrepresented (by explicit claim or omission) as being
159 * the original software.
161 * 4. This notice must not be removed or altered.
165 * sqd_regcomp and sqd_regexec -- sqd_regsub and sqd_regerror are elsewhere
169 * The first byte of the regexp internal "program" is actually this magic
170 * number; the start node begins in the second byte.
172 #define SQD_REGMAGIC 0234
175 * The "internal use only" fields in regexp.h are present to pass info from
176 * compile to execute that permits the execute phase to run lots faster on
177 * simple cases. They are:
179 * regstart char that must begin a match; '\0' if none obvious
180 * reganch is the match anchored (at beginning-of-line only)?
181 * regmust string (pointer into program) that match must include, or NULL
182 * regmlen length of regmust string
184 * Regstart and reganch permit very fast decisions on suitable starting points
185 * for a match, cutting down the work a lot. Regmust permits fast rejection
186 * of lines that cannot possibly match. The regmust tests are costly enough
187 * that sqd_regcomp() supplies a regmust only if the r.e. contains something
188 * potentially expensive (at present, the only such thing detected is * or +
189 * at the start of the r.e., which can involve a lot of backup). Regmlen is
190 * supplied because the test in sqd_regexec() needs it and sqd_regcomp() is computing
195 * Structure for regexp "program". This is essentially a linear encoding
196 * of a nondeterministic finite-state machine (aka syntax charts or
197 * "railroad normal form" in parsing technology). Each node is an opcode
198 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
199 * all nodes except BRANCH implement concatenation; a "next" pointer with
200 * a BRANCH on both ends of it is connecting two alternatives. (Here we
201 * have one of the subtle syntax dependencies: an individual BRANCH (as
202 * opposed to a collection of them) is never concatenated with anything
203 * because of operator precedence.) The operand of some types of node is
204 * a literal string; for others, it is a node leading into a sub-FSM. In
205 * particular, the operand of a BRANCH node is the first node of the branch.
206 * (NB this is *not* a tree structure: the tail of the branch connects
207 * to the thing following the set of BRANCHes.) The opcodes are:
210 /* definition number opnd? meaning */
211 #define END 0 /* no End of program. */
212 #define BOL 1 /* no Match beginning of line. */
213 #define EOL 2 /* no Match end of line. */
214 #define ANY 3 /* no Match any character. */
215 #define ANYOF 4 /* str Match any of these. */
216 #define ANYBUT 5 /* str Match any but one of these. */
217 #define BRANCH 6 /* node Match this, or the next..\&. */
218 #define BACK 7 /* no "next" ptr points backward. */
219 #define EXACTLY 8 /* str Match this string. */
220 #define NOTHING 9 /* no Match empty string. */
221 #define STAR 10 /* node Match this 0 or more times. */
222 #define PLUS 11 /* node Match this 1 or more times. */
223 #define OPEN 20 /* no Sub-RE starts here. */
224 /* OPEN+1 is number 1, etc. */
225 #define CLOSE 30 /* no Analogous to OPEN. */
230 * BRANCH The set of branches constituting a single choice are hooked
231 * together with their "next" pointers, since precedence prevents
232 * anything being concatenated to any individual branch. The
233 * "next" pointer of the last BRANCH in a choice points to the
234 * thing following the whole choice. This is also where the
235 * final "next" pointer of each individual branch points; each
236 * branch starts with the operand node of a BRANCH node.
238 * BACK Normal "next" pointers all implicitly point forward; BACK
239 * exists to make loop structures possible.
241 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
242 * BRANCH structures using BACK. Simple cases (one character
243 * per match) are implemented with STAR and PLUS for speed
244 * and to minimize recursive plunges.
246 * OPEN,CLOSE ...are numbered at compile time.
250 * A node is one char of opcode followed by two chars of "next" pointer.
251 * "Next" pointers are stored as two 8-bit pieces, high order first. The
252 * value is a positive offset from the opcode of the node containing it.
253 * An operand, if any, simply follows the node. (Note that much of the
254 * code generation knows about this implicit relationship.)
256 * Using two bytes for the "next" pointer is vast overkill for most things,
257 * but allows patterns to get big without disasters.
260 #define NEXT(p) (((*((p)+1)&0177)<<8) + (*((p)+2)&0377))
261 #define OPERAND(p) ((p) + 3)
264 * Utility definitions.
266 #define FAIL(m) { sqd_regerror(m); return(NULL); }
267 #define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?')
268 #define META "^$.[()|?+*\\"
271 * Flags to be passed up and down.
273 #define HASWIDTH 01 /* Known never to match null string. */
274 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
275 #define SPSTART 04 /* Starts with * or +. */
276 #define WORST 0 /* Worst case. */
279 * Work-variable struct for sqd_regcomp().
282 char *regparse; /* Input-scan pointer. */
283 int regnpar; /* () count. */
284 char *regcode; /* Code-emit pointer; ®dummy = don't. */
285 char regdummy[3]; /* NOTHING, 0 next ptr */
286 long regsize; /* Code size. */
288 #define EMITTING(cp) ((cp)->regcode != (cp)->regdummy)
291 * Forward declarations for sqd_regcomp()'s friends.
293 static char *reg(struct comp *cp, int paren, int *flagp);
294 static char *regbranch(struct comp *cp, int *flagp);
295 static char *regpiece(struct comp *cp, int *flagp);
296 static char *regatom(struct comp *cp, int *flagp);
297 static char *regnode(struct comp *cp, int op);
298 static char *regnext(char *node);
299 static void regc(struct comp *cp, int c);
300 static void reginsert(struct comp *cp, int op, char *opnd);
301 static void regtail(struct comp *cp, char *p, char *val);
302 static void regoptail(struct comp *cp, char *p, char *val);
305 - sqd_regcomp - compile a regular expression into internal code
307 * We can't allocate space until we know how big the compiled form will be,
308 * but we can't compile it (and thus know how big it is) until we've got a
309 * place to put the code. So we cheat: we compile it twice, once with code
310 * generation turned off and size counting turned on, and once "for real".
311 * This also means that we don't allocate space until we are sure that the
312 * thing really will compile successfully, and we never have to move the
313 * code and thus invalidate pointers into it. (Note that it has to be in
314 * one piece because free() must be able to free it all.)
316 * Beware that the optimization-preparation code in here knows about some
317 * of the structure of the compiled regexp.
323 register sqd_regexp *r;
329 FAIL("NULL argument to sqd_regcomp");
331 /* First pass: determine size, legality. */
332 co.regparse = (char *)exp;
335 co.regdummy[0] = NOTHING;
336 co.regdummy[1] = co.regdummy[2] = 0;
337 co.regcode = co.regdummy;
338 regc(&co, SQD_REGMAGIC);
339 if (reg(&co, 0, &flags) == NULL)
342 /* Small enough for pointer-storage convention? */
343 if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */
344 FAIL("regexp too big");
346 /* Allocate space. */
347 r = (sqd_regexp *)malloc(sizeof(sqd_regexp) + (size_t)co.regsize);
349 FAIL("out of space");
351 /* Second pass: emit code. */
352 co.regparse = (char *)exp;
354 co.regcode = r->program;
355 regc(&co, SQD_REGMAGIC);
356 if (reg(&co, 0, &flags) == NULL)
359 /* Dig out information for optimizations. */
360 r->regstart = '\0'; /* Worst-case defaults. */
364 scan = r->program+1; /* First BRANCH. */
365 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
366 scan = OPERAND(scan);
368 /* Starting-point info. */
369 if (OP(scan) == EXACTLY)
370 r->regstart = *OPERAND(scan);
371 else if (OP(scan) == BOL)
375 * If there's something expensive in the r.e., find the
376 * longest literal string that must appear and make it the
377 * regmust. Resolve ties in favor of later strings, since
378 * the regstart check works with the beginning of the r.e.
379 * and avoiding duplication strengthens checking. Not a
380 * strong reason, but sufficient in the absence of others.
383 register char *longest = NULL;
384 register size_t len = 0;
386 for (; scan != NULL; scan = regnext(scan))
387 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
388 longest = OPERAND(scan);
389 len = strlen(OPERAND(scan));
391 r->regmust = longest;
392 r->regmlen = (int)len;
400 - reg - regular expression, i.e. main body or parenthesized thing
402 * Caller must absorb opening parenthesis.
404 * Combining parenthesis handling with the base level of regular expression
405 * is a trifle forced, but the need to tie the tails of the branches to what
406 * follows makes it hard to avoid.
409 reg(cp, paren, flagp)
410 register struct comp *cp;
411 int paren; /* Parenthesized? */
414 register char *ret = NULL; /* SRE: NULL init added to silence gcc */
416 register char *ender;
417 register int parno = 0; /* SRE: init added to silence gcc */
420 *flagp = HASWIDTH; /* Tentatively. */
423 /* Make an OPEN node. */
424 if (cp->regnpar >= NSUBEXP)
428 ret = regnode(cp, OPEN+parno);
431 /* Pick up the branches, linking them together. */
432 br = regbranch(cp, &flags);
436 regtail(cp, ret, br); /* OPEN -> first. */
439 *flagp &= ~(~flags&HASWIDTH); /* Clear bit if bit 0. */
440 *flagp |= flags&SPSTART;
441 while (*cp->regparse == '|') {
443 br = regbranch(cp, &flags);
446 regtail(cp, ret, br); /* BRANCH -> BRANCH. */
447 *flagp &= ~(~flags&HASWIDTH);
448 *flagp |= flags&SPSTART;
451 /* Make a closing node, and hook it on the end. */
452 ender = regnode(cp, (paren) ? CLOSE+parno : END);
453 regtail(cp, ret, ender);
455 /* Hook the tails of the branches to the closing node. */
456 for (br = ret; br != NULL; br = regnext(br))
457 regoptail(cp, br, ender);
459 /* Check for proper termination. */
460 if (paren && *cp->regparse++ != ')') {
461 FAIL("unterminated ()");
462 } else if (!paren && *cp->regparse != '\0') {
463 if (*cp->regparse == ')') {
464 FAIL("unmatched ()");
466 FAIL("internal error: junk on end");
474 - regbranch - one alternative of an | operator
476 * Implements the concatenation operator.
480 register struct comp *cp;
484 register char *chain;
485 register char *latest;
489 *flagp = WORST; /* Tentatively. */
491 ret = regnode(cp, BRANCH);
493 while ((c = *cp->regparse) != '\0' && c != '|' && c != ')') {
494 latest = regpiece(cp, &flags);
497 *flagp |= flags&HASWIDTH;
498 if (chain == NULL) /* First piece. */
499 *flagp |= flags&SPSTART;
501 regtail(cp, chain, latest);
504 if (chain == NULL) /* Loop ran zero times. */
505 (void) regnode(cp, NOTHING);
511 - regpiece - something followed by possible [*+?]
513 * Note that the branching code sequences used for ? and the general cases
514 * of * and + are somewhat optimized: they use the same NOTHING node as
515 * both the endmarker for their branch list and the body of the last branch.
516 * It might seem that this node could be dispensed with entirely, but the
517 * endmarker role is not redundant.
521 register struct comp *cp;
529 ret = regatom(cp, &flags);
539 if (!(flags&HASWIDTH) && op != '?')
540 FAIL("*+ operand could be empty");
542 case '*': *flagp = WORST|SPSTART; break;
543 case '+': *flagp = WORST|SPSTART|HASWIDTH; break;
544 case '?': *flagp = WORST; break;
547 if (op == '*' && (flags&SIMPLE))
548 reginsert(cp, STAR, ret);
549 else if (op == '*') {
550 /* Emit x* as (x&|), where & means "self". */
551 reginsert(cp, BRANCH, ret); /* Either x */
552 regoptail(cp, ret, regnode(cp, BACK)); /* and loop */
553 regoptail(cp, ret, ret); /* back */
554 regtail(cp, ret, regnode(cp, BRANCH)); /* or */
555 regtail(cp, ret, regnode(cp, NOTHING)); /* null. */
556 } else if (op == '+' && (flags&SIMPLE))
557 reginsert(cp, PLUS, ret);
558 else if (op == '+') {
559 /* Emit x+ as x(&|), where & means "self". */
560 next = regnode(cp, BRANCH); /* Either */
561 regtail(cp, ret, next);
562 regtail(cp, regnode(cp, BACK), ret); /* loop back */
563 regtail(cp, next, regnode(cp, BRANCH)); /* or */
564 regtail(cp, ret, regnode(cp, NOTHING)); /* null. */
565 } else if (op == '?') {
566 /* Emit x? as (x|) */
567 reginsert(cp, BRANCH, ret); /* Either x */
568 regtail(cp, ret, regnode(cp, BRANCH)); /* or */
569 next = regnode(cp, NOTHING); /* null. */
570 regtail(cp, ret, next);
571 regoptail(cp, ret, next);
574 if (ISREPN(*cp->regparse))
581 - regatom - the lowest level
583 * Optimization: gobbles an entire sequence of ordinary characters so that
584 * it can turn them into a single node, which is smaller to store and
585 * faster to run. Backslashed characters are exceptions, each becoming a
586 * separate node; the code is simpler that way and it's not worth fixing.
590 register struct comp *cp;
596 *flagp = WORST; /* Tentatively. */
598 switch (*cp->regparse++) {
600 ret = regnode(cp, BOL);
603 ret = regnode(cp, EOL);
606 ret = regnode(cp, ANY);
607 *flagp |= HASWIDTH|SIMPLE;
611 register int rangeend;
614 if (*cp->regparse == '^') { /* Complement of range. */
615 ret = regnode(cp, ANYBUT);
618 ret = regnode(cp, ANYOF);
619 if ((c = *cp->regparse) == ']' || c == '-') {
623 while ((c = *cp->regparse++) != '\0' && c != ']') {
626 else if ((c = *cp->regparse) == ']' || c == '\0')
629 range = (unsigned char)*(cp->regparse-2);
630 rangeend = (unsigned char)c;
631 if (range > rangeend)
632 FAIL("invalid [] range");
633 for (range++; range <= rangeend; range++)
640 FAIL("unmatched []");
641 *flagp |= HASWIDTH|SIMPLE;
645 ret = reg(cp, 1, &flags);
648 *flagp |= flags&(HASWIDTH|SPSTART);
653 /* supposed to be caught earlier */
654 FAIL("internal error: \\0|) unexpected");
659 FAIL("?+* follows nothing");
662 if (*cp->regparse == '\0')
664 ret = regnode(cp, EXACTLY);
665 regc(cp, *cp->regparse++);
667 *flagp |= HASWIDTH|SIMPLE;
674 len = strcspn(cp->regparse, META);
676 FAIL("internal error: strcspn 0");
677 ender = *(cp->regparse+len);
678 if (len > 1 && ISREPN(ender))
679 len--; /* Back off clear of ?+* operand. */
683 ret = regnode(cp, EXACTLY);
684 for (; len > 0; len--)
685 regc(cp, *cp->regparse++);
695 - regnode - emit a node
697 static char * /* Location. */
699 register struct comp *cp;
702 register char *const ret = cp->regcode;
712 *ptr++ = '\0'; /* Null next pointer. */
720 - regc - emit (if appropriate) a byte of code
724 register struct comp *cp;
734 - reginsert - insert an operator in front of already-emitted operand
736 * Means relocating the operand.
739 reginsert(cp, op, opnd)
740 register struct comp *cp;
744 register char *place;
751 (void) memmove(opnd+3, opnd, (size_t)(cp->regcode - opnd));
754 place = opnd; /* Op node, where operand used to be. */
761 - regtail - set the next-pointer at the end of a node chain
765 register struct comp *cp;
776 /* Find last node. */
777 for (scan = p; (temp = regnext(scan)) != NULL; scan = temp)
780 offset = (OP(scan) == BACK) ? scan - val : val - scan;
781 *(scan+1) = (offset>>8)&0177;
782 *(scan+2) = offset&0377;
786 - regoptail - regtail on operand of first argument; nop if operandless
789 regoptail(cp, p, val)
790 register struct comp *cp;
794 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
795 if (!EMITTING(cp) || OP(p) != BRANCH)
797 regtail(cp, OPERAND(p), val);
801 * sqd_regexec and friends
805 * Work-variable struct for sqd_regexec().
808 char *reginput; /* String-input pointer. */
809 char *regbol; /* Beginning of input, for ^ check. */
810 char **regstartp; /* Pointer to startp array. */
811 char **regendp; /* Ditto for endp. */
817 static int regtry(struct exec *ep, sqd_regexp *rp, char *string);
818 static int regmatch(struct exec *ep, char *prog);
819 static size_t regrepeat(struct exec *ep, char *node);
824 static char *regprop();
828 - sqd_regexec - match a regexp against a string
831 sqd_regexec(prog, str)
832 register sqd_regexp *prog;
835 register char *string = (char *)str; /* avert const poisoning */
840 if (prog == NULL || string == NULL) {
841 sqd_regerror("NULL argument to sqd_regexec");
845 /* Check validity of program. */
846 if ((unsigned char)*prog->program != SQD_REGMAGIC) {
847 sqd_regerror("corrupted regexp");
851 /* If there is a "must appear" string, look for it. */
852 if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL)
855 /* Mark beginning of line for ^ . */
857 ex.regstartp = prog->startp;
858 ex.regendp = prog->endp;
860 /* Simplest case: anchored match need be tried only once. */
862 return(regtry(&ex, prog, string));
864 /* Messy cases: unanchored match. */
865 if (prog->regstart != '\0') {
866 /* We know what char it must start with. */
867 for (s = string; s != NULL; s = strchr(s+1, prog->regstart))
868 if (regtry(&ex, prog, s))
872 /* We don't -- general case. */
873 for (s = string; !regtry(&ex, prog, s); s++)
882 - regtry - try match at specific point
884 static int /* 0 failure, 1 success */
885 regtry(ep, prog, string)
886 register struct exec *ep;
894 ep->reginput = string;
898 for (i = NSUBEXP; i > 0; i--) {
902 if (regmatch(ep, prog->program + 1)) {
903 prog->startp[0] = string;
904 prog->endp[0] = ep->reginput;
911 - regmatch - main matching routine
913 * Conceptually the strategy is simple: check to see whether the current
914 * node matches, call self recursively to see whether the rest matches,
915 * and then act accordingly. In practice we make some effort to avoid
916 * recursion, in particular by going through "ordinary" nodes (that don't
917 * need to know whether the rest of the match failed) by a loop instead of
920 static int /* 0 failure, 1 success */
922 register struct exec *ep;
925 register char *scan; /* Current node. */
926 char *next; /* Next node. */
929 if (prog != NULL && regnarrate)
930 fprintf(stderr, "%s(\n", regprop(prog));
932 for (scan = prog; scan != NULL; scan = next) {
935 fprintf(stderr, "%s...\n", regprop(scan));
937 next = regnext(scan);
941 if (ep->reginput != ep->regbol)
945 if (*ep->reginput != '\0')
949 if (*ep->reginput == '\0')
955 register char *const opnd = OPERAND(scan);
957 /* Inline the first character, for speed. */
958 if (*opnd != *ep->reginput)
961 if (len > 1 && strncmp(opnd, ep->reginput, len) != 0)
967 if (*ep->reginput == '\0' ||
968 strchr(OPERAND(scan), *ep->reginput) == NULL)
973 if (*ep->reginput == '\0' ||
974 strchr(OPERAND(scan), *ep->reginput) != NULL)
982 case OPEN+1: case OPEN+2: case OPEN+3:
983 case OPEN+4: case OPEN+5: case OPEN+6:
984 case OPEN+7: case OPEN+8: case OPEN+9: {
985 register const int no = OP(scan) - OPEN;
986 register char *const input = ep->reginput;
988 if (regmatch(ep, next)) {
990 * Don't set startp if some later
991 * invocation of the same parentheses
994 if (ep->regstartp[no] == NULL)
995 ep->regstartp[no] = input;
1001 case CLOSE+1: case CLOSE+2: case CLOSE+3:
1002 case CLOSE+4: case CLOSE+5: case CLOSE+6:
1003 case CLOSE+7: case CLOSE+8: case CLOSE+9: {
1004 register const int no = OP(scan) - CLOSE;
1005 register char *const input = ep->reginput;
1007 if (regmatch(ep, next)) {
1009 * Don't set endp if some later
1010 * invocation of the same parentheses
1013 if (ep->regendp[no] == NULL)
1014 ep->regendp[no] = input;
1021 register char *const save = ep->reginput;
1023 if (OP(next) != BRANCH) /* No choice. */
1024 next = OPERAND(scan); /* Avoid recursion. */
1026 while (OP(scan) == BRANCH) {
1027 if (regmatch(ep, OPERAND(scan)))
1029 ep->reginput = save;
1030 scan = regnext(scan);
1037 case STAR: case PLUS: {
1038 register const char nextch =
1039 (OP(next) == EXACTLY) ? *OPERAND(next) : '\0';
1041 register char *const save = ep->reginput;
1042 register const size_t min = (OP(scan) == STAR) ? 0 : 1;
1044 for (no = regrepeat(ep, OPERAND(scan)) + 1; no > min; no--) {
1045 ep->reginput = save + no - 1;
1046 /* If it could work, try it. */
1047 if (nextch == '\0' || *ep->reginput == nextch)
1048 if (regmatch(ep, next))
1055 return(1); /* Success! */
1058 sqd_regerror("regexp corruption");
1065 * We get here only if there's trouble -- normally "case END" is
1066 * the terminating point.
1068 sqd_regerror("corrupted pointers");
1073 - regrepeat - report how many times something simple would match
1077 register struct exec *ep;
1080 register size_t count;
1081 register char *scan;
1086 return(strlen(ep->reginput));
1089 ch = *OPERAND(node);
1091 for (scan = ep->reginput; *scan == ch; scan++)
1096 return(strspn(ep->reginput, OPERAND(node)));
1099 return(strcspn(ep->reginput, OPERAND(node)));
1101 default: /* Oh dear. Called inappropriately. */
1102 sqd_regerror("internal error: bad call of regrepeat");
1103 return(0); /* Best compromise. */
1110 - regnext - dig the "next" pointer out of a node
1116 register const int offset = NEXT(p);
1121 return((OP(p) == BACK) ? p-offset : p+offset);
1126 static char *regprop();
1129 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1136 register char op = EXACTLY; /* Arbitrary non-END op. */
1137 register char *next;
1141 while (op != END) { /* While that wasn't END last time... */
1143 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1145 if (next == NULL) /* Next ptr. */
1148 printf("(%d)", (s-r->program)+(next-s));
1150 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1151 /* Literal string, where present. */
1152 while (*s != '\0') {
1161 /* Header fields of interest. */
1162 if (r->regstart != '\0')
1163 printf("start `%c' ", r->regstart);
1165 printf("anchored ");
1166 if (r->regmust != NULL)
1167 printf("must have \"%s\"", r->regmust);
1172 - regprop - printable representation of opcode
1179 static char buf[50];
1181 (void) strcpy(buf, ":");
1223 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1235 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1245 sqd_regerror("corrupted opcode");
1249 (void) strcat(buf, p);
1256 - sqd_regsub - perform substitutions after a regexp match
1259 sqd_regsub(rp, source, dest)
1260 const sqd_regexp *rp;
1264 register sqd_regexp * const prog = (sqd_regexp *)rp;
1265 register char *src = (char *)source;
1266 register char *dst = dest;
1269 register size_t len;
1271 if (prog == NULL || source == NULL || dest == NULL) {
1272 sqd_regerror("NULL parameter to sqd_regsub");
1275 if ((unsigned char)*(prog->program) != SQD_REGMAGIC) {
1276 sqd_regerror("damaged regexp");
1280 while ((c = *src++) != '\0') {
1283 else if (c == '\\' && isdigit((int) (*src)))
1288 if (no < 0) { /* Ordinary character. */
1289 if (c == '\\' && (*src == '\\' || *src == '&'))
1292 } else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
1293 prog->endp[no] > prog->startp[no]) {
1294 len = prog->endp[no] - prog->startp[no];
1295 (void) strncpy(dst, prog->startp[no], len);
1297 if (*(dst-1) == '\0') { /* strncpy hit NUL. */
1298 sqd_regerror("damaged match string");
1311 fprintf(stderr, "regexp(3): %s\n", s);