1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
8 *****************************************************************/
10 /*****************************************************************
11 * This code is an altered version of Henry Spencer's
12 * regex library. Alterations are limited to minor streamlining,
13 * and some name changes to protect the SQUID namespace.
14 * Henry's copyright notice appears below.
15 * You can obtain the original from
16 * ftp://ftp.zoo.toronto.edu/pub/bookregex.tar.Z
19 * The magic word for compiling a testdriver: NBA_TEAM_IN_STL
20 * gcc -o test -g -DNBA_TEAM_IN_STL -L. hsregex.c -lsquid -lm
23 * test <pattern> <ntok> <string>
25 * SRE, Fri Aug 28 11:10:17 1998
26 * CVS $Id: hsregex.c,v 1.7 2001/08/09 17:50:17 eddy Exp)
27 *****************************************************************/
35 /* global sqd_parse[] are managed by Strparse().
36 * WARNING: TODO: this code is not threadsafe, and needs to be revised.
40 /* Function: Strparse()
42 * Purpose: Match a regexp to a string. Returns 1 if pattern matches,
45 * Much like Perl, Strparse() makes copies of the matching
46 * substrings available via globals, sqd_parse[].
47 * sqd_parse[0] contains a copy of the complete matched
48 * text. sqd_parse[1-9] contain copies of up to nine
49 * different substrings matched within parentheses.
50 * The memory for these strings is internally managed and
51 * volatile; the next call to Strparse() may destroy them.
52 * If the caller needs the matched substrings to persist
53 * beyond a new Strparse() call, it must make its own
56 * A minor drawback of the memory management is that
57 * there will be a small amount of unfree'd memory being
58 * managed by Strparse() when a program exits; this may
59 * confuse memory debugging (Purify, dbmalloc). The
60 * general cleanup function SqdClean() is provided;
61 * you can call this before exiting.
63 * Uses an extended POSIX regular expression interface.
64 * A copylefted GNU implementation is included in the squid
65 * implementation (gnuregex.c) for use on non-POSIX compliant
66 * systems. POSIX 1003.2-compliant systems (all UNIX,
67 * some WinNT, I believe) can omit the GNU code if necessary.
69 * I built this for ease of use, not speed nor efficiency.
71 * Example: Strparse("foo-...-baz", "foo-bar-baz") returns 0
72 * Strparse("foo-(...)-baz", "foo-bar-baz")
73 * returns 0; sqd_parse[0] is "foo-bar-baz";
74 * sqd_parse[1] is "bar".
77 * s = ">gnl|ti|3 G10P69425RH2.T0 {SUB 81..737} /len=657"
78 * pat = "SUB ([0-9]+)"
80 * returns 1; sqd_parse[1] is "81".
82 * Args: rexp - regular expression, extended POSIX form
83 * s - string to match against
84 * ntok - number of () substrings we will save (maximum NSUBEXP-1)
86 * Return: 1 on match, 0 if no match
89 Strparse(char *rexp, char *s, int ntok)
96 if (ntok >= NSUBEXP ) Die("Strparse(): ntok must be <= %d", NSUBEXP-1);
98 /* Free previous global substring buffers
100 for (i = 0; i <= ntok; i++)
101 if (sqd_parse[i] != NULL)
107 /* Compile and match the pattern, using our modified
108 * copy of Henry Spencer's regexp library
110 if ((pat = sqd_regcomp(rexp)) == NULL)
111 Die("regexp compilation failed.");
112 code = sqd_regexec(pat, s);
114 /* Fill the global substring buffers
117 for (i = 0; i <= ntok; i++)
118 if (pat->startp[i] != NULL && pat->endp[i] != NULL)
120 len = pat->endp[i] - pat->startp[i];
121 sqd_parse[i] = (char *) MallocOrDie(sizeof(char) * (len+1));
122 strncpy(sqd_parse[i], pat->startp[i], len);
123 sqd_parse[i][len] = '\0';
130 /* Function: SqdClean()
131 * Date: SRE, Wed Oct 29 12:52:08 1997 [TWA 721]
133 * Purpose: Clean up any squid library allocations before exiting
134 * a program, so we don't leave unfree'd memory around
135 * and confuse a malloc debugger like Purify or dbmalloc.
142 /* Free global substring buffers that Strparse() uses
144 for (i = 0; i <= 9; i++)
145 if (sqd_parse[i] != NULL) {
153 /* all code below is:
154 * Copyright (c) 1986, 1993, 1995 by University of Toronto.
155 * Written by Henry Spencer. Not derived from licensed software.
157 * Permission is granted to anyone to use this software for any
158 * purpose on any computer system, and to redistribute it in any way,
159 * subject to the following restrictions:
161 * 1. The author is not responsible for the consequences of use of
162 * this software, no matter how awful, even if they arise
163 * from defects in it.
165 * 2. The origin of this software must not be misrepresented, either
166 * by explicit claim or by omission.
168 * 3. Altered versions must be plainly marked as such, and must not
169 * be misrepresented (by explicit claim or omission) as being
170 * the original software.
172 * 4. This notice must not be removed or altered.
176 * sqd_regcomp and sqd_regexec -- sqd_regsub and sqd_regerror are elsewhere
180 * The first byte of the regexp internal "program" is actually this magic
181 * number; the start node begins in the second byte.
183 #define SQD_REGMAGIC 0234
186 * The "internal use only" fields in regexp.h are present to pass info from
187 * compile to execute that permits the execute phase to run lots faster on
188 * simple cases. They are:
190 * regstart char that must begin a match; '\0' if none obvious
191 * reganch is the match anchored (at beginning-of-line only)?
192 * regmust string (pointer into program) that match must include, or NULL
193 * regmlen length of regmust string
195 * Regstart and reganch permit very fast decisions on suitable starting points
196 * for a match, cutting down the work a lot. Regmust permits fast rejection
197 * of lines that cannot possibly match. The regmust tests are costly enough
198 * that sqd_regcomp() supplies a regmust only if the r.e. contains something
199 * potentially expensive (at present, the only such thing detected is * or +
200 * at the start of the r.e., which can involve a lot of backup). Regmlen is
201 * supplied because the test in sqd_regexec() needs it and sqd_regcomp() is computing
206 * Structure for regexp "program". This is essentially a linear encoding
207 * of a nondeterministic finite-state machine (aka syntax charts or
208 * "railroad normal form" in parsing technology). Each node is an opcode
209 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
210 * all nodes except BRANCH implement concatenation; a "next" pointer with
211 * a BRANCH on both ends of it is connecting two alternatives. (Here we
212 * have one of the subtle syntax dependencies: an individual BRANCH (as
213 * opposed to a collection of them) is never concatenated with anything
214 * because of operator precedence.) The operand of some types of node is
215 * a literal string; for others, it is a node leading into a sub-FSM. In
216 * particular, the operand of a BRANCH node is the first node of the branch.
217 * (NB this is *not* a tree structure: the tail of the branch connects
218 * to the thing following the set of BRANCHes.) The opcodes are:
221 /* definition number opnd? meaning */
222 #define END 0 /* no End of program. */
223 #define BOL 1 /* no Match beginning of line. */
224 #define EOL 2 /* no Match end of line. */
225 #define ANY 3 /* no Match any character. */
226 #define ANYOF 4 /* str Match any of these. */
227 #define ANYBUT 5 /* str Match any but one of these. */
228 #define BRANCH 6 /* node Match this, or the next..\&. */
229 #define BACK 7 /* no "next" ptr points backward. */
230 #define EXACTLY 8 /* str Match this string. */
231 #define NOTHING 9 /* no Match empty string. */
232 #define STAR 10 /* node Match this 0 or more times. */
233 #define PLUS 11 /* node Match this 1 or more times. */
234 #define OPEN 20 /* no Sub-RE starts here. */
235 /* OPEN+1 is number 1, etc. */
236 #define CLOSE 30 /* no Analogous to OPEN. */
241 * BRANCH The set of branches constituting a single choice are hooked
242 * together with their "next" pointers, since precedence prevents
243 * anything being concatenated to any individual branch. The
244 * "next" pointer of the last BRANCH in a choice points to the
245 * thing following the whole choice. This is also where the
246 * final "next" pointer of each individual branch points; each
247 * branch starts with the operand node of a BRANCH node.
249 * BACK Normal "next" pointers all implicitly point forward; BACK
250 * exists to make loop structures possible.
252 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
253 * BRANCH structures using BACK. Simple cases (one character
254 * per match) are implemented with STAR and PLUS for speed
255 * and to minimize recursive plunges.
257 * OPEN,CLOSE ...are numbered at compile time.
261 * A node is one char of opcode followed by two chars of "next" pointer.
262 * "Next" pointers are stored as two 8-bit pieces, high order first. The
263 * value is a positive offset from the opcode of the node containing it.
264 * An operand, if any, simply follows the node. (Note that much of the
265 * code generation knows about this implicit relationship.)
267 * Using two bytes for the "next" pointer is vast overkill for most things,
268 * but allows patterns to get big without disasters.
271 #define NEXT(p) (((*((p)+1)&0177)<<8) + (*((p)+2)&0377))
272 #define OPERAND(p) ((p) + 3)
275 * Utility definitions.
277 #define FAIL(m) { sqd_regerror(m); return(NULL); }
278 #define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?')
279 #define META "^$.[()|?+*\\"
282 * Flags to be passed up and down.
284 #define HASWIDTH 01 /* Known never to match null string. */
285 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
286 #define SPSTART 04 /* Starts with * or +. */
287 #define WORST 0 /* Worst case. */
290 * Work-variable struct for sqd_regcomp().
293 char *regparse; /* Input-scan pointer. */
294 int regnpar; /* () count. */
295 char *regcode; /* Code-emit pointer; ®dummy = don't. */
296 char regdummy[3]; /* NOTHING, 0 next ptr */
297 long regsize; /* Code size. */
299 #define EMITTING(cp) ((cp)->regcode != (cp)->regdummy)
302 * Forward declarations for sqd_regcomp()'s friends.
304 static char *reg(struct comp *cp, int paren, int *flagp);
305 static char *regbranch(struct comp *cp, int *flagp);
306 static char *regpiece(struct comp *cp, int *flagp);
307 static char *regatom(struct comp *cp, int *flagp);
308 static char *regnode(struct comp *cp, int op);
309 static char *regnext(char *node);
310 static void regc(struct comp *cp, int c);
311 static void reginsert(struct comp *cp, int op, char *opnd);
312 static void regtail(struct comp *cp, char *p, char *val);
313 static void regoptail(struct comp *cp, char *p, char *val);
316 - sqd_regcomp - compile a regular expression into internal code
318 * We can't allocate space until we know how big the compiled form will be,
319 * but we can't compile it (and thus know how big it is) until we've got a
320 * place to put the code. So we cheat: we compile it twice, once with code
321 * generation turned off and size counting turned on, and once "for real".
322 * This also means that we don't allocate space until we are sure that the
323 * thing really will compile successfully, and we never have to move the
324 * code and thus invalidate pointers into it. (Note that it has to be in
325 * one piece because free() must be able to free it all.)
327 * Beware that the optimization-preparation code in here knows about some
328 * of the structure of the compiled regexp.
334 register sqd_regexp *r;
340 FAIL("NULL argument to sqd_regcomp");
342 /* First pass: determine size, legality. */
343 co.regparse = (char *)exp;
346 co.regdummy[0] = NOTHING;
347 co.regdummy[1] = co.regdummy[2] = 0;
348 co.regcode = co.regdummy;
349 regc(&co, SQD_REGMAGIC);
350 if (reg(&co, 0, &flags) == NULL)
353 /* Small enough for pointer-storage convention? */
354 if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */
355 FAIL("regexp too big");
357 /* Allocate space. */
358 r = (sqd_regexp *)malloc(sizeof(sqd_regexp) + (size_t)co.regsize);
360 FAIL("out of space");
362 /* Second pass: emit code. */
363 co.regparse = (char *)exp;
365 co.regcode = r->program;
366 regc(&co, SQD_REGMAGIC);
367 if (reg(&co, 0, &flags) == NULL)
370 /* Dig out information for optimizations. */
371 r->regstart = '\0'; /* Worst-case defaults. */
375 scan = r->program+1; /* First BRANCH. */
376 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
377 scan = OPERAND(scan);
379 /* Starting-point info. */
380 if (OP(scan) == EXACTLY)
381 r->regstart = *OPERAND(scan);
382 else if (OP(scan) == BOL)
386 * If there's something expensive in the r.e., find the
387 * longest literal string that must appear and make it the
388 * regmust. Resolve ties in favor of later strings, since
389 * the regstart check works with the beginning of the r.e.
390 * and avoiding duplication strengthens checking. Not a
391 * strong reason, but sufficient in the absence of others.
394 register char *longest = NULL;
395 register size_t len = 0;
397 for (; scan != NULL; scan = regnext(scan))
398 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
399 longest = OPERAND(scan);
400 len = strlen(OPERAND(scan));
402 r->regmust = longest;
403 r->regmlen = (int)len;
411 - reg - regular expression, i.e. main body or parenthesized thing
413 * Caller must absorb opening parenthesis.
415 * Combining parenthesis handling with the base level of regular expression
416 * is a trifle forced, but the need to tie the tails of the branches to what
417 * follows makes it hard to avoid.
420 reg(cp, paren, flagp)
421 register struct comp *cp;
422 int paren; /* Parenthesized? */
425 register char *ret = NULL; /* SRE: NULL init added to silence gcc */
427 register char *ender;
428 register int parno = 0; /* SRE: init added to silence gcc */
431 *flagp = HASWIDTH; /* Tentatively. */
434 /* Make an OPEN node. */
435 if (cp->regnpar >= NSUBEXP)
439 ret = regnode(cp, OPEN+parno);
442 /* Pick up the branches, linking them together. */
443 br = regbranch(cp, &flags);
447 regtail(cp, ret, br); /* OPEN -> first. */
450 *flagp &= ~(~flags&HASWIDTH); /* Clear bit if bit 0. */
451 *flagp |= flags&SPSTART;
452 while (*cp->regparse == '|') {
454 br = regbranch(cp, &flags);
457 regtail(cp, ret, br); /* BRANCH -> BRANCH. */
458 *flagp &= ~(~flags&HASWIDTH);
459 *flagp |= flags&SPSTART;
462 /* Make a closing node, and hook it on the end. */
463 ender = regnode(cp, (paren) ? CLOSE+parno : END);
464 regtail(cp, ret, ender);
466 /* Hook the tails of the branches to the closing node. */
467 for (br = ret; br != NULL; br = regnext(br))
468 regoptail(cp, br, ender);
470 /* Check for proper termination. */
471 if (paren && *cp->regparse++ != ')') {
472 FAIL("unterminated ()");
473 } else if (!paren && *cp->regparse != '\0') {
474 if (*cp->regparse == ')') {
475 FAIL("unmatched ()");
477 FAIL("internal error: junk on end");
485 - regbranch - one alternative of an | operator
487 * Implements the concatenation operator.
491 register struct comp *cp;
495 register char *chain;
496 register char *latest;
500 *flagp = WORST; /* Tentatively. */
502 ret = regnode(cp, BRANCH);
504 while ((c = *cp->regparse) != '\0' && c != '|' && c != ')') {
505 latest = regpiece(cp, &flags);
508 *flagp |= flags&HASWIDTH;
509 if (chain == NULL) /* First piece. */
510 *flagp |= flags&SPSTART;
512 regtail(cp, chain, latest);
515 if (chain == NULL) /* Loop ran zero times. */
516 (void) regnode(cp, NOTHING);
522 - regpiece - something followed by possible [*+?]
524 * Note that the branching code sequences used for ? and the general cases
525 * of * and + are somewhat optimized: they use the same NOTHING node as
526 * both the endmarker for their branch list and the body of the last branch.
527 * It might seem that this node could be dispensed with entirely, but the
528 * endmarker role is not redundant.
532 register struct comp *cp;
540 ret = regatom(cp, &flags);
550 if (!(flags&HASWIDTH) && op != '?')
551 FAIL("*+ operand could be empty");
553 case '*': *flagp = WORST|SPSTART; break;
554 case '+': *flagp = WORST|SPSTART|HASWIDTH; break;
555 case '?': *flagp = WORST; break;
558 if (op == '*' && (flags&SIMPLE))
559 reginsert(cp, STAR, ret);
560 else if (op == '*') {
561 /* Emit x* as (x&|), where & means "self". */
562 reginsert(cp, BRANCH, ret); /* Either x */
563 regoptail(cp, ret, regnode(cp, BACK)); /* and loop */
564 regoptail(cp, ret, ret); /* back */
565 regtail(cp, ret, regnode(cp, BRANCH)); /* or */
566 regtail(cp, ret, regnode(cp, NOTHING)); /* null. */
567 } else if (op == '+' && (flags&SIMPLE))
568 reginsert(cp, PLUS, ret);
569 else if (op == '+') {
570 /* Emit x+ as x(&|), where & means "self". */
571 next = regnode(cp, BRANCH); /* Either */
572 regtail(cp, ret, next);
573 regtail(cp, regnode(cp, BACK), ret); /* loop back */
574 regtail(cp, next, regnode(cp, BRANCH)); /* or */
575 regtail(cp, ret, regnode(cp, NOTHING)); /* null. */
576 } else if (op == '?') {
577 /* Emit x? as (x|) */
578 reginsert(cp, BRANCH, ret); /* Either x */
579 regtail(cp, ret, regnode(cp, BRANCH)); /* or */
580 next = regnode(cp, NOTHING); /* null. */
581 regtail(cp, ret, next);
582 regoptail(cp, ret, next);
585 if (ISREPN(*cp->regparse))
592 - regatom - the lowest level
594 * Optimization: gobbles an entire sequence of ordinary characters so that
595 * it can turn them into a single node, which is smaller to store and
596 * faster to run. Backslashed characters are exceptions, each becoming a
597 * separate node; the code is simpler that way and it's not worth fixing.
601 register struct comp *cp;
607 *flagp = WORST; /* Tentatively. */
609 switch (*cp->regparse++) {
611 ret = regnode(cp, BOL);
614 ret = regnode(cp, EOL);
617 ret = regnode(cp, ANY);
618 *flagp |= HASWIDTH|SIMPLE;
622 register int rangeend;
625 if (*cp->regparse == '^') { /* Complement of range. */
626 ret = regnode(cp, ANYBUT);
629 ret = regnode(cp, ANYOF);
630 if ((c = *cp->regparse) == ']' || c == '-') {
634 while ((c = *cp->regparse++) != '\0' && c != ']') {
637 else if ((c = *cp->regparse) == ']' || c == '\0')
640 range = (unsigned char)*(cp->regparse-2);
641 rangeend = (unsigned char)c;
642 if (range > rangeend)
643 FAIL("invalid [] range");
644 for (range++; range <= rangeend; range++)
651 FAIL("unmatched []");
652 *flagp |= HASWIDTH|SIMPLE;
656 ret = reg(cp, 1, &flags);
659 *flagp |= flags&(HASWIDTH|SPSTART);
664 /* supposed to be caught earlier */
665 FAIL("internal error: \\0|) unexpected");
670 FAIL("?+* follows nothing");
673 if (*cp->regparse == '\0')
675 ret = regnode(cp, EXACTLY);
676 regc(cp, *cp->regparse++);
678 *flagp |= HASWIDTH|SIMPLE;
685 len = strcspn(cp->regparse, META);
687 FAIL("internal error: strcspn 0");
688 ender = *(cp->regparse+len);
689 if (len > 1 && ISREPN(ender))
690 len--; /* Back off clear of ?+* operand. */
694 ret = regnode(cp, EXACTLY);
695 for (; len > 0; len--)
696 regc(cp, *cp->regparse++);
706 - regnode - emit a node
708 static char * /* Location. */
710 register struct comp *cp;
713 register char *const ret = cp->regcode;
723 *ptr++ = '\0'; /* Null next pointer. */
731 - regc - emit (if appropriate) a byte of code
735 register struct comp *cp;
745 - reginsert - insert an operator in front of already-emitted operand
747 * Means relocating the operand.
750 reginsert(cp, op, opnd)
751 register struct comp *cp;
755 register char *place;
762 (void) memmove(opnd+3, opnd, (size_t)(cp->regcode - opnd));
765 place = opnd; /* Op node, where operand used to be. */
772 - regtail - set the next-pointer at the end of a node chain
776 register struct comp *cp;
787 /* Find last node. */
788 for (scan = p; (temp = regnext(scan)) != NULL; scan = temp)
791 offset = (OP(scan) == BACK) ? scan - val : val - scan;
792 *(scan+1) = (offset>>8)&0177;
793 *(scan+2) = offset&0377;
797 - regoptail - regtail on operand of first argument; nop if operandless
800 regoptail(cp, p, val)
801 register struct comp *cp;
805 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
806 if (!EMITTING(cp) || OP(p) != BRANCH)
808 regtail(cp, OPERAND(p), val);
812 * sqd_regexec and friends
816 * Work-variable struct for sqd_regexec().
819 char *reginput; /* String-input pointer. */
820 char *regbol; /* Beginning of input, for ^ check. */
821 char **regstartp; /* Pointer to startp array. */
822 char **regendp; /* Ditto for endp. */
828 static int regtry(struct exec *ep, sqd_regexp *rp, char *string);
829 static int regmatch(struct exec *ep, char *prog);
830 static size_t regrepeat(struct exec *ep, char *node);
835 static char *regprop();
839 - sqd_regexec - match a regexp against a string
842 sqd_regexec(prog, str)
843 register sqd_regexp *prog;
846 register char *string = (char *)str; /* avert const poisoning */
851 if (prog == NULL || string == NULL) {
852 sqd_regerror("NULL argument to sqd_regexec");
856 /* Check validity of program. */
857 if ((unsigned char)*prog->program != SQD_REGMAGIC) {
858 sqd_regerror("corrupted regexp");
862 /* If there is a "must appear" string, look for it. */
863 if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL)
866 /* Mark beginning of line for ^ . */
868 ex.regstartp = prog->startp;
869 ex.regendp = prog->endp;
871 /* Simplest case: anchored match need be tried only once. */
873 return(regtry(&ex, prog, string));
875 /* Messy cases: unanchored match. */
876 if (prog->regstart != '\0') {
877 /* We know what char it must start with. */
878 for (s = string; s != NULL; s = strchr(s+1, prog->regstart))
879 if (regtry(&ex, prog, s))
883 /* We don't -- general case. */
884 for (s = string; !regtry(&ex, prog, s); s++)
893 - regtry - try match at specific point
895 static int /* 0 failure, 1 success */
896 regtry(ep, prog, string)
897 register struct exec *ep;
905 ep->reginput = string;
909 for (i = NSUBEXP; i > 0; i--) {
913 if (regmatch(ep, prog->program + 1)) {
914 prog->startp[0] = string;
915 prog->endp[0] = ep->reginput;
922 - regmatch - main matching routine
924 * Conceptually the strategy is simple: check to see whether the current
925 * node matches, call self recursively to see whether the rest matches,
926 * and then act accordingly. In practice we make some effort to avoid
927 * recursion, in particular by going through "ordinary" nodes (that don't
928 * need to know whether the rest of the match failed) by a loop instead of
931 static int /* 0 failure, 1 success */
933 register struct exec *ep;
936 register char *scan; /* Current node. */
937 char *next; /* Next node. */
940 if (prog != NULL && regnarrate)
941 fprintf(stderr, "%s(\n", regprop(prog));
943 for (scan = prog; scan != NULL; scan = next) {
946 fprintf(stderr, "%s...\n", regprop(scan));
948 next = regnext(scan);
952 if (ep->reginput != ep->regbol)
956 if (*ep->reginput != '\0')
960 if (*ep->reginput == '\0')
966 register char *const opnd = OPERAND(scan);
968 /* Inline the first character, for speed. */
969 if (*opnd != *ep->reginput)
972 if (len > 1 && strncmp(opnd, ep->reginput, len) != 0)
978 if (*ep->reginput == '\0' ||
979 strchr(OPERAND(scan), *ep->reginput) == NULL)
984 if (*ep->reginput == '\0' ||
985 strchr(OPERAND(scan), *ep->reginput) != NULL)
993 case OPEN+1: case OPEN+2: case OPEN+3:
994 case OPEN+4: case OPEN+5: case OPEN+6:
995 case OPEN+7: case OPEN+8: case OPEN+9: {
996 register const int no = OP(scan) - OPEN;
997 register char *const input = ep->reginput;
999 if (regmatch(ep, next)) {
1001 * Don't set startp if some later
1002 * invocation of the same parentheses
1005 if (ep->regstartp[no] == NULL)
1006 ep->regstartp[no] = input;
1012 case CLOSE+1: case CLOSE+2: case CLOSE+3:
1013 case CLOSE+4: case CLOSE+5: case CLOSE+6:
1014 case CLOSE+7: case CLOSE+8: case CLOSE+9: {
1015 register const int no = OP(scan) - CLOSE;
1016 register char *const input = ep->reginput;
1018 if (regmatch(ep, next)) {
1020 * Don't set endp if some later
1021 * invocation of the same parentheses
1024 if (ep->regendp[no] == NULL)
1025 ep->regendp[no] = input;
1032 register char *const save = ep->reginput;
1034 if (OP(next) != BRANCH) /* No choice. */
1035 next = OPERAND(scan); /* Avoid recursion. */
1037 while (OP(scan) == BRANCH) {
1038 if (regmatch(ep, OPERAND(scan)))
1040 ep->reginput = save;
1041 scan = regnext(scan);
1048 case STAR: case PLUS: {
1049 register const char nextch =
1050 (OP(next) == EXACTLY) ? *OPERAND(next) : '\0';
1052 register char *const save = ep->reginput;
1053 register const size_t min = (OP(scan) == STAR) ? 0 : 1;
1055 for (no = regrepeat(ep, OPERAND(scan)) + 1; no > min; no--) {
1056 ep->reginput = save + no - 1;
1057 /* If it could work, try it. */
1058 if (nextch == '\0' || *ep->reginput == nextch)
1059 if (regmatch(ep, next))
1066 return(1); /* Success! */
1069 sqd_regerror("regexp corruption");
1076 * We get here only if there's trouble -- normally "case END" is
1077 * the terminating point.
1079 sqd_regerror("corrupted pointers");
1084 - regrepeat - report how many times something simple would match
1088 register struct exec *ep;
1091 register size_t count;
1092 register char *scan;
1097 return(strlen(ep->reginput));
1100 ch = *OPERAND(node);
1102 for (scan = ep->reginput; *scan == ch; scan++)
1107 return(strspn(ep->reginput, OPERAND(node)));
1110 return(strcspn(ep->reginput, OPERAND(node)));
1112 default: /* Oh dear. Called inappropriately. */
1113 sqd_regerror("internal error: bad call of regrepeat");
1114 return(0); /* Best compromise. */
1121 - regnext - dig the "next" pointer out of a node
1127 register const int offset = NEXT(p);
1132 return((OP(p) == BACK) ? p-offset : p+offset);
1137 static char *regprop();
1140 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1147 register char op = EXACTLY; /* Arbitrary non-END op. */
1148 register char *next;
1152 while (op != END) { /* While that wasn't END last time... */
1154 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1156 if (next == NULL) /* Next ptr. */
1159 printf("(%d)", (s-r->program)+(next-s));
1161 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1162 /* Literal string, where present. */
1163 while (*s != '\0') {
1172 /* Header fields of interest. */
1173 if (r->regstart != '\0')
1174 printf("start `%c' ", r->regstart);
1176 printf("anchored ");
1177 if (r->regmust != NULL)
1178 printf("must have \"%s\"", r->regmust);
1183 - regprop - printable representation of opcode
1190 static char buf[50];
1192 (void) strcpy(buf, ":");
1234 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1246 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1256 sqd_regerror("corrupted opcode");
1260 (void) strcat(buf, p);
1267 - sqd_regsub - perform substitutions after a regexp match
1270 sqd_regsub(rp, source, dest)
1271 const sqd_regexp *rp;
1275 register sqd_regexp * const prog = (sqd_regexp *)rp;
1276 register char *src = (char *)source;
1277 register char *dst = dest;
1280 register size_t len;
1282 if (prog == NULL || source == NULL || dest == NULL) {
1283 sqd_regerror("NULL parameter to sqd_regsub");
1286 if ((unsigned char)*(prog->program) != SQD_REGMAGIC) {
1287 sqd_regerror("damaged regexp");
1291 while ((c = *src++) != '\0') {
1294 else if (c == '\\' && isdigit((int) (*src)))
1299 if (no < 0) { /* Ordinary character. */
1300 if (c == '\\' && (*src == '\\' || *src == '&'))
1303 } else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
1304 prog->endp[no] > prog->startp[no]) {
1305 len = prog->endp[no] - prog->startp[no];
1306 (void) strncpy(dst, prog->startp[no], len);
1308 if (*(dst-1) == '\0') { /* strncpy hit NUL. */
1309 sqd_regerror("damaged match string");
1322 fprintf(stderr, "regexp(3): %s\n", s);
1327 #ifdef NBA_TEAM_IN_STL
1329 main(int argc, char **argv)
1337 ntok = atoi(argv[2]);
1340 status = Strparse(pat, s, ntok);
1342 printf("no match\n");
1346 for (i = 1; i <= ntok; i++)
1347 printf("matched token %1d: %s\n", i, sqd_parse[i]);
1350 #endif /*NBA_TEAM_IN_STL*/