X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fcom%2Fstevesoft%2Fpat%2FRegex.java;h=04bb0daea5b381b984e9bb2aeda42d0070a8d526;hb=26e4caf025740afea870ed6ceb5894bc56316ba6;hp=af5cbcce89b3349d032d52204ab5dff80ec57eca;hpb=7bc226b58110fa26d9dbd3f0c78095d06909ffc3;p=jalview.git diff --git a/src/com/stevesoft/pat/Regex.java b/src/com/stevesoft/pat/Regex.java index af5cbcc..04bb0da 100755 --- a/src/com/stevesoft/pat/Regex.java +++ b/src/com/stevesoft/pat/Regex.java @@ -1,2025 +1,2085 @@ -// -// This software is now distributed according to -// the Lesser Gnu Public License. Please see -// http://www.gnu.org/copyleft/lesser.txt for -// the details. -// -- Happy Computing! -// -package com.stevesoft.pat; - -import java.io.*; -import java.util.*; - -import com.stevesoft.pat.wrap.*; - -/** Matches a Unicode punctuation character. */ -class UnicodePunct - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && Prop.isPunct(s.charAt(from)) ? to : -1; - } -} - -/** Matches a Unicode white space character. */ -class UnicodeWhite - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && Prop.isWhite(s.charAt(from)) ? to : -1; - } -} - -/** Matches a character that is not a Unicode punctuation - * character. - */ -class NUnicodePunct - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && !Prop.isPunct(s.charAt(from)) ? to : -1; - } -} - -/** Matches a character that is not a - * Unicode white space character. - */ -class NUnicodeWhite - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && !Prop.isWhite(s.charAt(from)) ? to : -1; - } -} - -/** Matches a Unicode word character: an alphanumeric or underscore. 
*/ -class UnicodeW - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - if (from >= s.length()) - { - return -1; - } - char c = s.charAt(from); - return (Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to : - -1; - } -} - -/** Matches a character that is not a Unicode alphanumeric or underscore. */ -class NUnicodeW - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - if (from >= s.length()) - { - return -1; - } - char c = s.charAt(from); - return! (Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to : - -1; - } -} - -/** Matches a Unicode decimal digit. */ -class UnicodeDigit - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && Prop.isDecimalDigit(s.charAt(from)) ? to : -1; - } -} - -/** Matches a character that is not a Unicode digit.*/ -class NUnicodeDigit - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && !Prop.isDecimalDigit(s.charAt(from)) ? to : -1; - } -} - -/** Matches a Unicode math character. */ -class UnicodeMath - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && Prop.isMath(s.charAt(from)) ? to : -1; - } -} - -/** Matches a non-math Unicode character. */ -class NUnicodeMath - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && !Prop.isMath(s.charAt(from)) ? to : -1; - } -} - -/** Matches a Unicode currency symbol. */ -class UnicodeCurrency - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && Prop.isCurrency(s.charAt(from)) ? to : -1; - } -} - -/** Matches a non-currency symbol Unicode character. 
*/ -class NUnicodeCurrency - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && !Prop.isCurrency(s.charAt(from)) ? to : -1; - } -} - -/** Matches a Unicode alphabetic character. */ -class UnicodeAlpha - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && Prop.isAlphabetic(s.charAt(from)) ? to : -1; - } -} - -/** Matches a non-alphabetic Unicode character. */ -class NUnicodeAlpha - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && !Prop.isAlphabetic(s.charAt(from)) ? to : -1; - } -} - -/** Matches an upper case Unicode character. */ -class UnicodeUpper - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && isUpper(s.charAt(from)) ? to : -1; - } - - final boolean isUpper(char c) - { - return c == CaseMgr.toUpperCase(c) && c != CaseMgr.toLowerCase(c); - } -} - -/** Matches an upper case Unicode character. */ -class UnicodeLower - extends UniValidator -{ - public int validate(StringLike s, int from, int to) - { - return from < s.length() && isLower(s.charAt(from)) ? to : -1; - } - - final boolean isLower(char c) - { - return c != CaseMgr.toUpperCase(c) && c == CaseMgr.toLowerCase(c); - } -} - -/** - Regex provides the parser which constructs the linked list of - Pattern classes from a String. -
- For the purpose of this documentation, the fact that java interprets the - backslash will be ignored. In practice, however, you will need a - double backslash to obtain a string that contains a single backslash - character. Thus, the example pattern "\b" should really be typed - as "\\b" inside java code. -
- Note that Regex is part of package "com.stevesoft.pat". - To use it, simply import - com.stevesoft.pat.Regex at the top of your file. -
- Regex is made with a constructor that takes a String that defines - the regular expression. Thus, for example -
- Regex r = new Regex("[a-c]*"); -- matches any number of characters so long as they are 'a', 'b', or 'c'. -
- To attempt to match the Pattern to a given string, you can use either - the search(String) member function, or the matchAt(String,int position) - member function. These functions return a boolean which tells you - whether or not the match succeeded, and update the values reported by "charsMatched()" - and "matchedFrom()" in the Regex object accordingly. -
- The portion of the string before the match can be obtained by the - left() member, and the portion after the match can be obtained - by the right() member. -
- Essentially, this package implements a syntax that is very much - like the perl 5 regular expression syntax. - - Longer example: -
- Regex r = new Regex("x(a|b)y"); - r.matchAt("xay",0); - System.out.println("sub = "+r.stringMatched(1)); -- The above would print "sub = a". -
- r.left() // would return "x" - r.right() // would return "y" --
- Differences between this package and perl5:
- The extended Pattern for setting flags is now supported,
- but the flags are different. "(?i)" tells the pattern to
- ignore case, "(?Q)" sets the "dontMatchInQuotes" flag, and
- "(?iQ)" sets them both. You can change the escape character.
- The pattern
(?e=#)#d+is the same as
\d+, - but note that the sequence
(?e=#)must occur - at the very beginning of the pattern. There may be other small - differences as well. I will either make my package conform - or note them as I become aware of them. -
- This package supports additional patterns not in perl5: -
(?@()) | Group | This matches all characters between - the '(' character and the balancing ')' character. Thus, it will - match "()" as well as "(())". The balancing characters are - arbitrary, thus (?@{}) matches on "{}" and "{{}}". | -
(?<1) | Backup | Moves the pointer backwards within the text. - This allows you to make a "look behind." It fails if it - attempts to move to a position before the beginning of the string. - "x(?<1)" is equivalent to "(?=x)". The number, 1 in this example, - is the number of characters to move backwards. | -
start*/ - public boolean searchFrom(String s, int start) - { - if (s == null) - { - throw new NullPointerException("Null String Given to Regex.searchFrom"); - } - return _search(s, start, s.length()); - } - - public boolean searchFrom(StringLike s, int start) - { - if (s == null) - { - throw new NullPointerException("Null String Given to Regex.searchFrom"); - } - return _search(s, start, s.length()); - } - - /** Search through a region of a String - for the first occurence of a match. */ - public boolean searchRegion(String s, int start, int end) - { - if (s == null) - { - throw new NullPointerException("Null String Given to Regex.searchRegion"); - } - return _search(s, start, end); - } - - /** Set this to change the default behavior of the "." pattern. - By default it now matches perl's behavior and fails to - match the '\n' character. */ - public static boolean dotDoesntMatchCR = true; - StringLike gFlags; - int gFlagto = 0; - boolean gFlag = false; - /** Set the 'g' flag */ - public void setGFlag(boolean b) - { - gFlag = b; - } - - /** Get the state of the 'g' flag. */ - public boolean getGFlag() - { - return gFlag; - } - - boolean sFlag = false; - /** Get the state of the sFlag */ - public boolean getSFlag() - { - return sFlag; - } - - boolean mFlag = false; - /** Get the state of the sFlag */ - public boolean getMFlag() - { - return mFlag; - } - - final boolean _search(String s, int start, int end) - { - return _search(new StringWrap(s), start, end); - } - - final boolean _search(StringLike s, int start, int end) - { - if (gFlag && gFlagto > 0 && gFlags != null && s.unwrap() == gFlags.unwrap()) - { - start = gFlagto; - } - gFlags = null; - - Pthings pt = prep(s); - - int up = (minMatch == null ? 
end : end - minMatch.i); - - if (up < start && end >= start) - { - up = start; - } - - if (skipper == null) - { - for (int i = start; i <= up; i++) - { - charsMatched_ = thePattern.matchAt(s, i, pt); - if (charsMatched_ >= 0) - { - matchFrom_ = thePattern.mfrom; - marks = pt.marks; - gFlagto = matchFrom_ + charsMatched_; - gFlags = s; - return didMatch_ = true; - } - } - } - else - { - pt.no_check = true; - for (int i = start; i <= up; i++) - { - i = skipper.find(src, i, up); - if (i < 0) - { - charsMatched_ = matchFrom_ = -1; - return didMatch_ = false; - } - charsMatched_ = thePattern.matchAt(s, i, pt); - if (charsMatched_ >= 0) - { - matchFrom_ = thePattern.mfrom; - marks = pt.marks; - gFlagto = matchFrom_ + charsMatched_; - gFlags = s; - return didMatch_ = true; - } - } - } - return didMatch_ = false; - } - - /*final boolean _search(LongStringLike s,long start,long end) { - if(gFlag && gFlagto > 0 && s==gFlags) - start = gFlagto; - gFlags = null; - - Pthings pt=prep(s); - - int up = end;//(minMatch == null ? 
end : end-minMatch.i); - - if(up < start && end >= start) up = start; - - if(skipper == null) { - for(long i=start;i<=up;i++) { - charsMatched_ = thePattern.matchAt(s,i,pt); - if(charsMatched_ >= 0) { - matchFrom_ = thePattern.mfrom; - marks = pt.marks; - gFlagto = matchFrom_+charsMatched_; - return didMatch_=true; - } - } - } else { - pt.no_check = true; - for(long i=start;i<=up;i++) { - i = skipper.find(src,i,up); - if(i<0) { - charsMatched_ = matchFrom_ = -1; - return didMatch_ = false; - } - charsMatched_ = thePattern.matchAt(s,i,pt); - if(charsMatched_ >= 0) { - matchFrom_ = thePattern.mfrom; - marks = pt.marks; - gFlagto = matchFrom_+charsMatched_; - gFlags = s; - return didMatch_=true; - } else { - i = s.adjustIndex(i); - up = s.adjustEnd(i); - } - } - } - return didMatch_=false; - }*/ - - boolean _reverseSearch(String s, int start, int end) - { - return _reverseSearch(new StringWrap(s), start, end); - } - - boolean _reverseSearch(StringLike s, int start, int end) - { - if (gFlag && gFlagto > 0 && s.unwrap() == gFlags.unwrap()) - { - end = gFlagto; - } - gFlags = null; - Pthings pt = prep(s); - for (int i = end; i >= start; i--) - { - charsMatched_ = thePattern.matchAt(s, i, pt); - if (charsMatched_ >= 0) - { - matchFrom_ = thePattern.mfrom; - marks = pt.marks; - gFlagto = matchFrom_ - 1; - gFlags = s; - return didMatch_ = true; - } - } - return didMatch_ = false; - } - - // This routine sets the cbits variable - // of class Pattern. Cbits is true for - // the bit corresponding to a character inside - // a set of quotes. 
- static StringLike lasts = null; - static BitSet lastbs = null; - static void setCbits(StringLike s, Pthings pt) - { - if (s == lasts) - { - pt.cbits = lastbs; - return; - } - BitSet bs = new BitSet(s.length()); - char qc = ' '; - boolean setBit = false; - for (int i = 0; i < s.length(); i++) - { - if (setBit) - { - bs.set(i); - } - char c = s.charAt(i); - if (!setBit && c == '"') - { - qc = c; - setBit = true; - bs.set(i); - } - else if (!setBit && c == '\'') - { - qc = c; - setBit = true; - bs.set(i); - } - else if (setBit && c == qc) - { - setBit = false; - } - else if (setBit && c == '\\' && i + 1 < s.length()) - { - i++; - if (setBit) - { - bs.set(i); - } - } - } - pt.cbits = lastbs = bs; - lasts = s; - } - - // Wanted user to over-ride this in alpha version, - // but it wasn't really necessary because of this trick: - Regex newRegex() - { - try - { - return (Regex) getClass().newInstance(); - } - catch (InstantiationException ie) - { - return null; - } - catch (IllegalAccessException iae) - { - return null; - } - } - - /** Only needed for creating your own extensions of - Regex. This method adds the next Pattern in the chain - of patterns or sets the Pattern if it is the first call. */ - protected void add(Pattern p2) - { - if (p == null) - { - p = p2; - } - else - { - p.add(p2); - p2 = p; - } - } - - /** You only need to use this method if you are creating - your own extentions to Regex. - compile1 compiles one Pattern element, it can be - over-ridden to allow the Regex compiler to understand - new syntax. See deriv.java for an example. This routine - is the heart of class Regex. Rthings has one integer - member called intValue, it is used to keep track of the number - of ()'s in the Pattern. - @exception com.stevesoft.pat.RegSyntax is thrown when a nonsensensical - pattern is supplied. For example, a pattern beginning - with *. 
*/ - protected void compile1(StrPos sp, Rthings mk) - throws RegSyntax - { - if (sp.match('[')) - { - sp.inc(); - add(matchBracket(sp)); - } - else if (sp.match('|')) - { - if (or == null) - { - or = new Or(); - } - if (p == null) - { - p = new NullPattern(); - } - or.addOr(p); - p = null; - } - else if (sp.incMatch("(?<")) - { - patInt i = sp.getPatInt(); - if (i == null) - { - RegSyntaxError.endItAll("No int after (?<"); - } - add(new Backup(i.intValue())); - if (!sp.match(')')) - { - RegSyntaxError.endItAll("No ) after (?<"); - } - } - else if (sp.incMatch("(?>")) - { - patInt i = sp.getPatInt(); - if (i == null) - { - RegSyntaxError.endItAll("No int after (?>"); - } - add(new Backup( -i.intValue())); - if (!sp.match(')')) - { - RegSyntaxError.endItAll("No ) after (?<"); - } - } - else if (sp.incMatch("(?@")) - { - char op = sp.c; - sp.inc(); - char cl = sp.c; - sp.inc(); - if (!sp.match(')')) - { - RegSyntaxError.endItAll( - "(?@ does not have closing paren"); - } - add(new Group(op, cl)); - } - else if (sp.incMatch("(?#")) - { - while (!sp.match(')')) - { - sp.inc(); - } - } - else if (sp.dontMatch && sp.c == 'w') - { - //Regex r = new Regex(); - //r._compile("[a-zA-Z0-9_]",mk); - //add(new Goop("\\w",r.thePattern)); - Bracket b = new Bracket(false); - b.addOr(new Range('a', 'z')); - b.addOr(new Range('A', 'Z')); - b.addOr(new Range('0', '9')); - b.addOr(new oneChar('_')); - add(b); - } - else if (sp.dontMatch && sp.c == 'G') - { - add(new BackG()); - } - else if (sp.dontMatch && sp.c == 's') - { - //Regex r = new Regex(); - //r._compile("[ \t\n\r\b]",mk); - //add(new Goop("\\s",r.thePattern)); - Bracket b = new Bracket(false); - b.addOr(new oneChar( (char) 32)); - b.addOr(new Range( (char) 8, (char) 10)); - b.addOr(new oneChar( (char) 13)); - add(b); - } - else if (sp.dontMatch && sp.c == 'd') - { - //Regex r = new Regex(); - //r._compile("[0-9]",mk); - //add(new Goop("\\d",r.thePattern)); - Range digit = new Range('0', '9'); - digit.printBrackets = true; - 
add(digit); - } - else if (sp.dontMatch && sp.c == 'W') - { - //Regex r = new Regex(); - //r._compile("[^a-zA-Z0-9_]",mk); - //add(new Goop("\\W",r.thePattern)); - Bracket b = new Bracket(true); - b.addOr(new Range('a', 'z')); - b.addOr(new Range('A', 'Z')); - b.addOr(new Range('0', '9')); - b.addOr(new oneChar('_')); - add(b); - } - else if (sp.dontMatch && sp.c == 'S') - { - //Regex r = new Regex(); - //r._compile("[^ \t\n\r\b]",mk); - //add(new Goop("\\S",r.thePattern)); - Bracket b = new Bracket(true); - b.addOr(new oneChar( (char) 32)); - b.addOr(new Range( (char) 8, (char) 10)); - b.addOr(new oneChar( (char) 13)); - add(b); - } - else if (sp.dontMatch && sp.c == 'D') - { - //Regex r = new Regex(); - //r._compile("[^0-9]",mk); - //add(new Goop("\\D",r.thePattern)); - Bracket b = new Bracket(true); - b.addOr(new Range('0', '9')); - add(b); - } - else if (sp.dontMatch && sp.c == 'B') - { - Regex r = new Regex(); - r._compile("(?!" + back_slash + "b)", mk); - add(r.thePattern); - } - else if (isOctalString(sp)) - { - int d = sp.c - '0'; - sp.inc(); - d = 8 * d + sp.c - '0'; - StrPos sp2 = new StrPos(sp); - sp2.inc(); - if (isOctalDigit(sp2, false)) - { - sp.inc(); - d = 8 * d + sp.c - '0'; - } - add(new oneChar( (char) d)); - } - else if (sp.dontMatch && sp.c >= '1' && sp.c <= '9') - { - int iv = sp.c - '0'; - StrPos s2 = new StrPos(sp); - s2.inc(); - if (!s2.dontMatch && s2.c >= '0' && s2.c <= '9') - { - iv = 10 * iv + (s2.c - '0'); - sp.inc(); - } - add(new BackMatch(iv)); - } - else if (sp.dontMatch && sp.c == 'b') - { - add(new Boundary()); - } - else if (sp.match('\b')) - { - add(new Boundary()); - } - else if (sp.match('$')) - { - add(new End(true)); - } - else if (sp.dontMatch && sp.c == 'Z') - { - add(new End(false)); - } - else if (sp.match('.')) - { - add(new Any()); - } - else if (sp.incMatch("(??")) - { - StringBuffer sb = new StringBuffer(); - StringBuffer sb2 = new StringBuffer(); - while (!sp.match(')') && !sp.match(':')) - { - sb.append(sp.c); - 
sp.inc(); - } - if (sp.incMatch(":")) - { - while (!sp.match(')')) - { - sb2.append(sp.c); - sp.inc(); - } - } - String sbs = sb.toString(); - if (validators.get(sbs) instanceof String) - { - String pat = (String) validators.get(sbs); - Regex r = newRegex(); - Rthings rth = new Rthings(this); - rth.noBackRefs = true; - r._compile(pat, rth); - add(r.thePattern); - } - else - { - Custom cm = new Custom(sb.toString()); - if (cm.v != null) - { - Validator v2 = cm.v.arg(sb2.toString()); - if (v2 != null) - { - v2.argsave = sb2.toString(); - String p = cm.v.pattern; - cm.v = v2; - v2.pattern = p; - } - Regex r = newRegex(); - Rthings rth = new Rthings(this); - rth.noBackRefs = true; - r._compile(cm.v.pattern, rth); - cm.sub = r.thePattern; - cm.sub.add(new CustomEndpoint(cm)); - cm.sub.setParent(cm); - add(cm); - } - } - } - else if (sp.match('(')) - { - mk.parenLevel++; - Regex r = newRegex(); - // r.or = new Or(); - sp.inc(); - if (sp.incMatch("?:")) - { - r.or = new Or(); - } - else if (sp.incMatch("?=")) - { - r.or = new lookAhead(false); - } - else if (sp.incMatch("?!")) - { - r.or = new lookAhead(true); - } - else if (sp.match('?')) - { - sp.inc(); - do - { - if (sp.c == 'i') - { - mk.ignoreCase = true; - } - if (sp.c == 'Q') - { - mk.dontMatchInQuotes = true; - } - if (sp.c == 'o') - { - mk.optimizeMe = true; - } - if (sp.c == 'g') - { - mk.gFlag = true; - } - if (sp.c == 's') - { - mk.sFlag = true; - } - if (sp.c == 'm') - { - mk.mFlag = true; - } - sp.inc(); - } - while (!sp.match(')') && !sp.eos); - r = null; - mk.parenLevel--; - if (sp.eos) //throw new RegSyntax - { - RegSyntaxError.endItAll("Unclosed ()"); - } - } - else - { // just ordinary parenthesis - r.or = mk.noBackRefs ? 
new Or() : new OrMark(mk.val++); - } - if (r != null) - { - add(r._compile(sp, mk)); - } - } - else if (sp.match('^')) - { - add(new Start(true)); - } - else if (sp.dontMatch && sp.c == 'A') - { - add(new Start(false)); - } - else if (sp.match('*')) - { - addMulti(new patInt(0), new patInf()); - } - else if (sp.match('+')) - { - addMulti(new patInt(1), new patInf()); - } - else if (sp.match('?')) - { - addMulti(new patInt(0), new patInt(1)); - } - else if (sp.match('{')) - { - boolean bad = false; - StrPos sp2 = new StrPos(sp); - //StringBuffer sb = new StringBuffer(); - sp.inc(); - patInt i1 = sp.getPatInt(); - patInt i2 = null; - if (sp.match('}')) - { - i2 = i1; - } - else - { - if (!sp.match(',')) /* - { - RegSyntaxError.endItAll( - "String \"{"+i2+ - "\" should be followed with , or }");*/ - bad = true; - } - sp.inc(); - if (sp.match('}')) - { - i2 = new patInf(); - } - else - { - i2 = sp.getPatInt(); - } - } - if (i1 == null || i2 == null) /* - { - throw new RegSyntax("Badly formatted Multi: " - +"{"+i1+","+i2+"}"); */bad = true; - } - if (bad) - { - sp.dup(sp2); - add(new oneChar(sp.c)); - } - else - { - addMulti(i1, i2); - } - } - else if (sp.escMatch('x') && next2Hex(sp)) - { - sp.inc(); - int d = getHexDigit(sp); - sp.inc(); - d = 16 * d + getHexDigit(sp); - add(new oneChar( (char) d)); - } - else if (sp.escMatch('c')) - { - sp.inc(); - if (sp.c < Ctrl.cmap.length) - { - add(new oneChar(Ctrl.cmap[sp.c])); - } - else - { - add(new oneChar(sp.c)); - } - } - else if (sp.escMatch('f')) - { - add(new oneChar( (char) 12)); - } - else if (sp.escMatch('a')) - { - add(new oneChar( (char) 7)); - } - else if (sp.escMatch('t')) - { - add(new oneChar('\t')); - } - else if (sp.escMatch('n')) - { - add(new oneChar('\n')); - } - else if (sp.escMatch('r')) - { - add(new oneChar('\r')); - } - else if (sp.escMatch('b')) - { - add(new oneChar('\b')); - } - else if (sp.escMatch('e')) - { - add(new oneChar( (char) 27)); - } - else - { - add(new oneChar(sp.c)); - if 
(sp.match(')')) - { - RegSyntaxError.endItAll("Unmatched right paren in pattern"); - } - } - } - - // compiles all Pattern elements, internal method - private Pattern _compile(String pat, Rthings mk) - throws RegSyntax - { - minMatch = null; - sFlag = mFlag = ignoreCase = gFlag = false; - StrPos sp = new StrPos(pat, 0); - thePattern = _compile(sp, mk); - pt.marks = null; - return thePattern; - } - - Pattern p = null; - Or or = null; - Pattern _compile(StrPos sp, Rthings mk) - throws RegSyntax - { - while (! (sp.eos || (or != null && sp.match(')')))) - { - compile1(sp, mk); - sp.inc(); - } - if (sp.match(')')) - { - mk.parenLevel--; - } - else if (sp.eos && mk.parenLevel != 0) - { - RegSyntaxError.endItAll("Unclosed Parenthesis! lvl=" + mk.parenLevel); - } - if (or != null) - { - if (p == null) - { - p = new NullPattern(); - } - or.addOr(p); - return or; - } - return p == null ? new NullPattern() : p; - } - - // add a multi object to the end of the chain - // which applies to the last object - void addMulti(patInt i1, patInt i2) - throws RegSyntax - { - Pattern last, last2; - for (last = p; last != null && last.next != null; last = last.next) - { - ; - } - if (last == null || last == p) - { - last2 = null; - } - else - { - for (last2 = p; last2.next != last; last2 = last2.next) - { - ; - } - } - if (last instanceof Multi && i1.intValue() == 0 && - i2.intValue() == 1) - { - ( (Multi) last).matchFewest = true; - } - else if (last instanceof FastMulti && i1.intValue() == 0 && - i2.intValue() == 1) - { - ( (FastMulti) last).matchFewest = true; - } - else if (last instanceof DotMulti && i1.intValue() == 0 && - i2.intValue() == 1) - { - ( (DotMulti) last).matchFewest = true; - } - else if (last instanceof Multi - || last instanceof DotMulti - || last instanceof FastMulti) - { - throw new RegSyntax("Syntax error."); - } - else if (last2 == null) - { - p = mkMulti(i1, i2, p); - } - else - { - last2.next = mkMulti(i1, i2, last); - } - } - - final static Pattern 
mkMulti(patInt lo, patInt hi, Pattern p) - throws RegSyntax - { - if (p instanceof Any && p.next == null) - { - return (Pattern)new DotMulti(lo, hi); - } - return RegOpt.safe4fm(p) ? (Pattern)new FastMulti(lo, hi, p) : - (Pattern)new Multi(lo, hi, p); - } - - // process the bracket operator - Pattern matchBracket(StrPos sp) - throws RegSyntax - { - Bracket ret; - if (sp.match('^')) - { - ret = new Bracket(true); - sp.inc(); - } - else - { - ret = new Bracket(false); - } - if (sp.match(']')) - { - //throw new RegSyntax - RegSyntaxError.endItAll("Unmatched []"); - } - - while (!sp.eos && !sp.match(']')) - { - StrPos s1 = new StrPos(sp); - s1.inc(); - StrPos s1_ = new StrPos(s1); - s1_.inc(); - if (s1.match('-') && !s1_.match(']')) - { - StrPos s2 = new StrPos(s1); - s2.inc(); - if (!s2.eos) - { - ret.addOr(new Range(sp.c, s2.c)); - } - sp.inc(); - sp.inc(); - } - else if (sp.escMatch('Q')) - { - sp.inc(); - while (!sp.escMatch('E')) - { - ret.addOr(new oneChar(sp.c)); - sp.inc(); - } - } - else if (sp.escMatch('d')) - { - ret.addOr(new Range('0', '9')); - } - else if (sp.escMatch('s')) - { - ret.addOr(new oneChar( (char) 32)); - ret.addOr(new Range( (char) 8, (char) 10)); - ret.addOr(new oneChar( (char) 13)); - } - else if (sp.escMatch('w')) - { - ret.addOr(new Range('a', 'z')); - ret.addOr(new Range('A', 'Z')); - ret.addOr(new Range('0', '9')); - ret.addOr(new oneChar('_')); - } - else if (sp.escMatch('D')) - { - ret.addOr(new Range( (char) 0, (char) 47)); - ret.addOr(new Range( (char) 58, (char) 65535)); - } - else if (sp.escMatch('S')) - { - ret.addOr(new Range( (char) 0, (char) 7)); - ret.addOr(new Range( (char) 11, (char) 12)); - ret.addOr(new Range( (char) 14, (char) 31)); - ret.addOr(new Range( (char) 33, (char) 65535)); - } - else if (sp.escMatch('W')) - { - ret.addOr(new Range( (char) 0, (char) 64)); - ret.addOr(new Range( (char) 91, (char) 94)); - ret.addOr(new oneChar( (char) 96)); - ret.addOr(new Range( (char) 123, (char) 65535)); - } - else if 
(sp.escMatch('x') && next2Hex(sp)) - { - sp.inc(); - int d = getHexDigit(sp); - sp.inc(); - d = 16 * d + getHexDigit(sp); - ret.addOr(new oneChar( (char) d)); - } - else if (sp.escMatch('a')) - { - ret.addOr(new oneChar( (char) 7)); - } - else if (sp.escMatch('f')) - { - ret.addOr(new oneChar( (char) 12)); - } - else if (sp.escMatch('e')) - { - ret.addOr(new oneChar( (char) 27)); - } - else if (sp.escMatch('n')) - { - ret.addOr(new oneChar('\n')); - } - else if (sp.escMatch('t')) - { - ret.addOr(new oneChar('\t')); - } - else if (sp.escMatch('r')) - { - ret.addOr(new oneChar('\r')); - } - else if (sp.escMatch('c')) - { - sp.inc(); - if (sp.c < Ctrl.cmap.length) - { - ret.addOr(new oneChar(Ctrl.cmap[sp.c])); - } - else - { - ret.addOr(new oneChar(sp.c)); - } - } - else if (isOctalString(sp)) - { - int d = sp.c - '0'; - sp.inc(); - d = 8 * d + sp.c - '0'; - StrPos sp2 = new StrPos(sp); - sp2.inc(); - if (isOctalDigit(sp2, false)) - { - sp.inc(); - d = 8 * d + sp.c - '0'; - } - ret.addOr(new oneChar( (char) d)); - } - else - { - ret.addOr(new oneChar(sp.c)); - } - sp.inc(); - } - return ret; - } - - /** Converts the stored Pattern to a String -- this is a - decompile. Note that \t and \n will really print out here, - Not just the two character representations. - Also be prepared to see some strange output if your characters - are not printable. 
*/ - public String toString() - { - if (false && thePattern == null) - { - return ""; - } - else - { - StringBuffer sb = new StringBuffer(); - if (esc != Pattern.ESC) - { - sb.append("(?e="); - sb.append(esc); - sb.append(")"); - } - if (gFlag - || mFlag - || !dotDoesntMatchCR - || sFlag - || ignoreCase - || dontMatchInQuotes - || optimized()) - { - sb.append("(?"); - if (ignoreCase) - { - sb.append("i"); - } - if (mFlag) - { - sb.append("m"); - } - if (sFlag || !dotDoesntMatchCR) - { - sb.append("s"); - } - if (dontMatchInQuotes) - { - sb.append("Q"); - } - if (optimized()) - { - sb.append("o"); - } - if (gFlag) - { - sb.append("g"); - } - sb.append(")"); - } - String patstr = thePattern.toString(); - if (esc != Pattern.ESC) - { - patstr = reEscape(patstr, Pattern.ESC, esc); - } - sb.append(patstr); - return sb.toString(); - } - } - - // Re-escape Pattern, allows us to use a different escape - // character. - static String reEscape(String s, char oldEsc, char newEsc) - { - if (oldEsc == newEsc) - { - return s; - } - int i; - StringBuffer sb = new StringBuffer(); - for (i = 0; i < s.length(); i++) - { - if (s.charAt(i) == oldEsc && i + 1 < s.length()) - { - if (s.charAt(i + 1) == oldEsc) - { - sb.append(oldEsc); - } - else - { - sb.append(newEsc); - sb.append(s.charAt(i + 1)); - } - i++; - } - else if (s.charAt(i) == newEsc) - { - sb.append(newEsc); - sb.append(newEsc); - } - else - { - sb.append(s.charAt(i)); - } - } - return sb.toString(); - } - - /** This method implements FilenameFilter, allowing one - to use a Regex to search through a directory using File.list. - There is a FileRegex now that does this better. 
- @see com.stevesoft.pat.FileRegex - */ - public boolean accept(File dir, String s) - { - return search(s); - } - - /** The version of this package */ - final static public String version() - { - return "lgpl release 1.5.3"; - } - - /** Once this method is called, the state of variables - ignoreCase and dontMatchInQuotes should not be changed as the - results will be unpredictable. However, - search and matchAt will run more quickly. Note that you - can check to see if the pattern has been optimized by calling - the optimized() method.
This method will attempt to rewrite - your pattern in a way that makes it faster (not all patterns - execute at the same speed). In general, "(?: ... )" will be - faster than "( ... )" so if you don't need the backreference, - you should group using the former pattern.
It will also - introduce new pattern elements that you can't get to otherwise, - for example if you have a large table of strings, i.e. the - months of the year "(January|February|...)" optimize() will make - a Hashtable that takes it to the next appropriate pattern - element -- eliminating the need for a linear search. - @see com.stevesoft.pat.Regex#optimized - @see com.stevesoft.pat.Regex#ignoreCase - @see com.stevesoft.pat.Regex#dontMatchInQuotes - @see com.stevesoft.pat.Regex#matchAt - @see com.stevesoft.pat.Regex#search - */ - public void optimize() - { - if (optimized() || thePattern == null) - { - return; - } - minMatch = new patInt(0); //thePattern.countMinChars(); - thePattern = RegOpt.opt(thePattern, ignoreCase, - dontMatchInQuotes); - skipper = Skip.findSkip(this); - //RegOpt.setParents(this); - return; - } - - Skip skipper; - /** This function returns true if the optimize method has - been called. */ - public boolean optimized() - { - return minMatch != null; - } - - /** A bit of syntactic surgar for those who want to make - their code look more perl-like. To use this initialize - your Regex object by saying: -
- Regex r1 = Regex.perlCode("s/hello/goodbye/"); - Regex r2 = Regex.perlCode("s'fish'frog'i"); - Regex r3 = Regex.perlCode("m'hello'"); -- The i for ignoreCase is supported in - this syntax, as well as m, s, and x. The g flag - is a bit of a special case.
- If you wish to replace all occurrences of a pattern, you - do not put a 'g' in the perlCode, but call Regex's - replaceAll method.
- If you wish to simply - and only do a search for r2's pattern, you can do this - by calling the searchFrom method repeatedly, or - by calling search repeatedly if the g flag is set. -
- Note: Currently perlCode does not - support the (?e=#) syntax for - changing the escape character. - */ - - public static Regex perlCode(String s) - { - // this file is big enough, see parsePerl.java - // for this function. - return parsePerl.parse(s); - } - - static final char back_slash = '\\'; - - /** Checks to see if there are only literal and no special - pattern elements in this Regex. */ - public boolean isLiteral() - { - Pattern x = thePattern; - while (x != null) - { - if (x instanceof oneChar) - { - ; - } - else if (x instanceof Skipped) - { - ; - } - else - { - return false; - } - x = x.next; - } - return true; - } - - /** You only need to know about this if you are inventing - your own pattern elements. */ - public patInt countMinChars() - { - return thePattern.countMinChars(); - } - - /** You only need to know about this if you are inventing - your own pattern elements. */ - public patInt countMaxChars() - { - return thePattern.countMaxChars(); - } - - boolean isHexDigit(StrPos sp) - { - boolean r = - !sp.eos && !sp.dontMatch - && ( (sp.c >= '0' && sp.c <= '9') - || (sp.c >= 'a' && sp.c <= 'f') - || (sp.c >= 'A' && sp.c <= 'F')); - return r; - } - - boolean isOctalDigit(StrPos sp, boolean first) - { - boolean r = - !sp.eos && ! 
(first ^ sp.dontMatch) - && sp.c >= '0' && sp.c <= '7'; - return r; - } - - int getHexDigit(StrPos sp) - { - if (sp.c >= '0' && sp.c <= '9') - { - return sp.c - '0'; - } - if (sp.c >= 'a' && sp.c <= 'f') - { - return sp.c - 'a' + 10; - } - return sp.c - 'A' + 10; - } - - boolean next2Hex(StrPos sp) - { - StrPos sp2 = new StrPos(sp); - sp2.inc(); - if (!isHexDigit(sp2)) - { - return false; - } - sp2.inc(); - if (!isHexDigit(sp2)) - { - return false; - } - return true; - } - - boolean isOctalString(StrPos sp) - { - if (!isOctalDigit(sp, true)) - { - return false; - } - StrPos sp2 = new StrPos(sp); - sp2.inc(); - if (!isOctalDigit(sp2, false)) - { - return false; - } - return true; - } -} +// +// This software is now distributed according to +// the Lesser Gnu Public License. Please see +// http://www.gnu.org/copyleft/lesser.txt for +// the details. +// -- Happy Computing! +// +package com.stevesoft.pat; + +import java.io.*; +import java.util.*; + +import com.stevesoft.pat.wrap.*; + +/** Matches a Unicode punctuation character. */ +class UnicodePunct extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isPunct(s.charAt(from)) ? to : -1; + } +} + +/** Matches a Unicode white space character. */ +class UnicodeWhite extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isWhite(s.charAt(from)) ? to : -1; + } +} + +/** + * Matches a character that is not a Unicode punctuation character. + */ +class NUnicodePunct extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isPunct(s.charAt(from)) ? to : -1; + } +} + +/** + * Matches a character that is not a Unicode white space character. + */ +class NUnicodeWhite extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isWhite(s.charAt(from)) ? 
to : -1; + } +} + +/** Matches a Unicode word character: an alphanumeric or underscore. */ +class UnicodeW extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + if (from >= s.length()) + { + return -1; + } + char c = s.charAt(from); + return (Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to + : -1; + } +} + +/** Matches a character that is not a Unicode alphanumeric or underscore. */ +class NUnicodeW extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + if (from >= s.length()) + { + return -1; + } + char c = s.charAt(from); + return !(Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to + : -1; + } +} + +/** Matches a Unicode decimal digit. */ +class UnicodeDigit extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isDecimalDigit(s.charAt(from)) ? to + : -1; + } +} + +/** Matches a character that is not a Unicode digit. */ +class NUnicodeDigit extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isDecimalDigit(s.charAt(from)) ? to + : -1; + } +} + +/** Matches a Unicode math character. */ +class UnicodeMath extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isMath(s.charAt(from)) ? to : -1; + } +} + +/** Matches a non-math Unicode character. */ +class NUnicodeMath extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isMath(s.charAt(from)) ? to : -1; + } +} + +/** Matches a Unicode currency symbol. */ +class UnicodeCurrency extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isCurrency(s.charAt(from)) ? to : -1; + } +} + +/** Matches a non-currency symbol Unicode character. 
*/ +class NUnicodeCurrency extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isCurrency(s.charAt(from)) ? to : -1; + } +} + +/** Matches a Unicode alphabetic character. */ +class UnicodeAlpha extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isAlphabetic(s.charAt(from)) ? to : -1; + } +} + +/** Matches a non-alphabetic Unicode character. */ +class NUnicodeAlpha extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isAlphabetic(s.charAt(from)) ? to + : -1; + } +} + +/** Matches an upper case Unicode character. */ +class UnicodeUpper extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && isUpper(s.charAt(from)) ? to : -1; + } + + final boolean isUpper(char c) + { + return c == CaseMgr.toUpperCase(c) && c != CaseMgr.toLowerCase(c); + } +} + +/** Matches an upper case Unicode character. */ +class UnicodeLower extends UniValidator +{ + public int validate(StringLike s, int from, int to) + { + return from < s.length() && isLower(s.charAt(from)) ? to : -1; + } + + final boolean isLower(char c) + { + return c != CaseMgr.toUpperCase(c) && c == CaseMgr.toLowerCase(c); + } +} + +/** + * Regex provides the parser which constructs the linked list of Pattern classes + * from a String. + *
+ * For the purpose of this documentation, the fact that java interprets the + * backslash will be ignored. In practice, however, you will need a double + * backslash to obtain a string that contains a single backslash character. + * Thus, the example pattern "\b" should really be typed as "\\b" inside java + * code. + *
+ * Note that Regex is part of package "com.stevesoft.pat". To use it, simply + * import com.stevesoft.pat.Regex at the top of your file. + *
+ * Regex is made with a constructor that takes a String that defines the regular + * expression. Thus, for example + * + *
+ * Regex r = new Regex("[a-c]*"); + *+ * + * matches any number of characters (so long as they are 'a', 'b', or 'c'). + *
+ * To attempt to match the Pattern to a given string, you can use either the + * search(String) member function, or the matchAt(String,int position) member + * function. These functions return a boolean which tells you whether or not the + * thing worked, and set the values returned by the methods "charsMatched()" and "matchedFrom()" in + * the Regex object appropriately. + *
+ * The portion of the string before the match can be obtained by the left() + * member, and the portion after the match can be obtained by the right() + * member. + *
+ * Essentially, this package implements a syntax that is very much like the perl + * 5 regular expression syntax. + * + * Longer example: + * + *
+ * Regex r = new Regex("x(a|b)y"); + * r.matchAt("xay", 0); + * System.out.println("sub = " + r.stringMatched(1)); + *+ * + * The above would print "sub = a". + * + *
+ * r.left() // would return "x" + * r.right() // would return "y" + *+ * + *
+ * Differences between this package and perl5:
+ * The extended Pattern for setting flags is now supported, but the flags are
+ * different. "(?i)" tells the pattern to ignore case, "(?Q)" sets the
+ * "dontMatchInQuotes" flag, and "(?iQ)" sets them both. You can change the
+ * escape character. The pattern
+ *
+ *
+ * (?e=#)#d+ + *+ * + * is the same as + * + *
+ * \d+ + *, but note that the sequence + * + *
+ * (?e=#) + *+ * + * must occur at the very beginning of the pattern. There may be other + * small differences as well. I will either make my package conform or note them + * as I become aware of them. + *
+ * This package supports additional patterns not in perl5:
(?@()) | + *Group | + *This matches all characters between the '(' character and the balancing + * ')' character. Thus, it will match "()" as well as "(())". The balancing + * characters are arbitrary, thus (?@{}) matches on "{}" and "{{}}". | + *
(?<1) | + *Backup | + *Moves the pointer backwards within the text. This allows you to make a + * "look behind." It fails if it attempts to move to a position before the + * beginning of the string. "x(?<1)" is equivalent to "(?=x)". The number, 1 + * in this example, is the number of characters to move backwards. | + *
+ * start + *+ */ + public boolean searchFrom(String s, int start) + { + if (s == null) + { + throw new NullPointerException( + "Null String Given to Regex.searchFrom"); + } + return _search(s, start, s.length()); + } + + public boolean searchFrom(StringLike s, int start) + { + if (s == null) + { + throw new NullPointerException( + "Null String Given to Regex.searchFrom"); + } + return _search(s, start, s.length()); + } + + /** + * Search through a region of a String for the first occurence of a match. + */ + public boolean searchRegion(String s, int start, int end) + { + if (s == null) + { + throw new NullPointerException( + "Null String Given to Regex.searchRegion"); + } + return _search(s, start, end); + } + + /** + * Set this to change the default behavior of the "." pattern. By default it + * now matches perl's behavior and fails to match the '\n' character. + */ + public static boolean dotDoesntMatchCR = true; + + StringLike gFlags; + + int gFlagto = 0; + + boolean gFlag = false; + + /** Set the 'g' flag */ + public void setGFlag(boolean b) + { + gFlag = b; + } + + /** Get the state of the 'g' flag. */ + public boolean getGFlag() + { + return gFlag; + } + + boolean sFlag = false; + + /** Get the state of the sFlag */ + public boolean getSFlag() + { + return sFlag; + } + + boolean mFlag = false; + + /** Get the state of the sFlag */ + public boolean getMFlag() + { + return mFlag; + } + + final boolean _search(String s, int start, int end) + { + return _search(new StringWrap(s), start, end); + } + + final boolean _search(StringLike s, int start, int end) + { + if (gFlag && gFlagto > 0 && gFlags != null + && s.unwrap() == gFlags.unwrap()) + { + start = gFlagto; + } + gFlags = null; + + Pthings pt = prep(s); + + int up = (minMatch == null ? 
end : end - minMatch.i); + + if (up < start && end >= start) + { + up = start; + } + + if (skipper == null) + { + for (int i = start; i <= up; i++) + { + charsMatched_ = thePattern.matchAt(s, i, pt); + if (charsMatched_ >= 0) + { + matchFrom_ = thePattern.mfrom; + marks = pt.marks; + gFlagto = matchFrom_ + charsMatched_; + gFlags = s; + return didMatch_ = true; + } + } + } + else + { + pt.no_check = true; + for (int i = start; i <= up; i++) + { + i = skipper.find(src, i, up); + if (i < 0) + { + charsMatched_ = matchFrom_ = -1; + return didMatch_ = false; + } + charsMatched_ = thePattern.matchAt(s, i, pt); + if (charsMatched_ >= 0) + { + matchFrom_ = thePattern.mfrom; + marks = pt.marks; + gFlagto = matchFrom_ + charsMatched_; + gFlags = s; + return didMatch_ = true; + } + } + } + return didMatch_ = false; + } + + /* + * final boolean _search(LongStringLike s,long start,long end) { if(gFlag && + * gFlagto > 0 && s==gFlags) start = gFlagto; gFlags = null; + * + * Pthings pt=prep(s); + * + * int up = end;//(minMatch == null ? 
end : end-minMatch.i); + * + * if(up < start && end >= start) up = start; + * + * if(skipper == null) { for(long i=start;i<=up;i++) { charsMatched_ = + * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ = + * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_; + * return didMatch_=true; } } } else { pt.no_check = true; for(long i=start;i<=up;i++) { + * i = skipper.find(src,i,up); if(i<0) { charsMatched_ = matchFrom_ = -1; + * return didMatch_ = false; } charsMatched_ = thePattern.matchAt(s,i,pt); + * if(charsMatched_ >= 0) { matchFrom_ = thePattern.mfrom; marks = pt.marks; + * gFlagto = matchFrom_+charsMatched_; gFlags = s; return didMatch_=true; } + * else { i = s.adjustIndex(i); up = s.adjustEnd(i); } } } return + * didMatch_=false; } + */ + + boolean _reverseSearch(String s, int start, int end) + { + return _reverseSearch(new StringWrap(s), start, end); + } + + boolean _reverseSearch(StringLike s, int start, int end) + { + if (gFlag && gFlagto > 0 && s.unwrap() == gFlags.unwrap()) + { + end = gFlagto; + } + gFlags = null; + Pthings pt = prep(s); + for (int i = end; i >= start; i--) + { + charsMatched_ = thePattern.matchAt(s, i, pt); + if (charsMatched_ >= 0) + { + matchFrom_ = thePattern.mfrom; + marks = pt.marks; + gFlagto = matchFrom_ - 1; + gFlags = s; + return didMatch_ = true; + } + } + return didMatch_ = false; + } + + // This routine sets the cbits variable + // of class Pattern. Cbits is true for + // the bit corresponding to a character inside + // a set of quotes. 
+ static StringLike lasts = null; + + static BitSet lastbs = null; + + static void setCbits(StringLike s, Pthings pt) + { + if (s == lasts) + { + pt.cbits = lastbs; + return; + } + BitSet bs = new BitSet(s.length()); + char qc = ' '; + boolean setBit = false; + for (int i = 0; i < s.length(); i++) + { + if (setBit) + { + bs.set(i); + } + char c = s.charAt(i); + if (!setBit && c == '"') + { + qc = c; + setBit = true; + bs.set(i); + } + else if (!setBit && c == '\'') + { + qc = c; + setBit = true; + bs.set(i); + } + else if (setBit && c == qc) + { + setBit = false; + } + else if (setBit && c == '\\' && i + 1 < s.length()) + { + i++; + if (setBit) + { + bs.set(i); + } + } + } + pt.cbits = lastbs = bs; + lasts = s; + } + + // Wanted user to over-ride this in alpha version, + // but it wasn't really necessary because of this trick: + Regex newRegex() + { + try + { + return (Regex) getClass().newInstance(); + } catch (InstantiationException ie) + { + return null; + } catch (IllegalAccessException iae) + { + return null; + } + } + + /** + * Only needed for creating your own extensions of Regex. This method adds the + * next Pattern in the chain of patterns or sets the Pattern if it is the + * first call. + */ + protected void add(Pattern p2) + { + if (p == null) + { + p = p2; + } + else + { + p.add(p2); + p2 = p; + } + } + + /** + * You only need to use this method if you are creating your own extentions to + * Regex. compile1 compiles one Pattern element, it can be over-ridden to + * allow the Regex compiler to understand new syntax. See deriv.java for an + * example. This routine is the heart of class Regex. Rthings has one integer + * member called intValue, it is used to keep track of the number of ()'s in + * the Pattern. + * + * @exception com.stevesoft.pat.RegSyntax + * is thrown when a nonsensensical pattern is supplied. For + * example, a pattern beginning with *. 
+ */ + protected void compile1(StrPos sp, Rthings mk) throws RegSyntax + { + if (sp.match('[')) + { + sp.inc(); + add(matchBracket(sp)); + } + else if (sp.match('|')) + { + if (or == null) + { + or = new Or(); + } + if (p == null) + { + p = new NullPattern(); + } + or.addOr(p); + p = null; + } + else if (sp.incMatch("(?<")) + { + patInt i = sp.getPatInt(); + if (i == null) + { + RegSyntaxError.endItAll("No int after (?<"); + } + add(new Backup(i.intValue())); + if (!sp.match(')')) + { + RegSyntaxError.endItAll("No ) after (?<"); + } + } + else if (sp.incMatch("(?>")) + { + patInt i = sp.getPatInt(); + if (i == null) + { + RegSyntaxError.endItAll("No int after (?>"); + } + add(new Backup(-i.intValue())); + if (!sp.match(')')) + { + RegSyntaxError.endItAll("No ) after (?<"); + } + } + else if (sp.incMatch("(?@")) + { + char op = sp.c; + sp.inc(); + char cl = sp.c; + sp.inc(); + if (!sp.match(')')) + { + RegSyntaxError.endItAll("(?@ does not have closing paren"); + } + add(new Group(op, cl)); + } + else if (sp.incMatch("(?#")) + { + while (!sp.match(')')) + { + sp.inc(); + } + } + else if (sp.dontMatch && sp.c == 'w') + { + // Regex r = new Regex(); + // r._compile("[a-zA-Z0-9_]",mk); + // add(new Goop("\\w",r.thePattern)); + Bracket b = new Bracket(false); + b.addOr(new Range('a', 'z')); + b.addOr(new Range('A', 'Z')); + b.addOr(new Range('0', '9')); + b.addOr(new oneChar('_')); + add(b); + } + else if (sp.dontMatch && sp.c == 'G') + { + add(new BackG()); + } + else if (sp.dontMatch && sp.c == 's') + { + // Regex r = new Regex(); + // r._compile("[ \t\n\r\b]",mk); + // add(new Goop("\\s",r.thePattern)); + Bracket b = new Bracket(false); + b.addOr(new oneChar((char) 32)); + b.addOr(new Range((char) 8, (char) 10)); + b.addOr(new oneChar((char) 13)); + add(b); + } + else if (sp.dontMatch && sp.c == 'd') + { + // Regex r = new Regex(); + // r._compile("[0-9]",mk); + // add(new Goop("\\d",r.thePattern)); + Range digit = new Range('0', '9'); + digit.printBrackets = true; + 
add(digit); + } + else if (sp.dontMatch && sp.c == 'W') + { + // Regex r = new Regex(); + // r._compile("[^a-zA-Z0-9_]",mk); + // add(new Goop("\\W",r.thePattern)); + Bracket b = new Bracket(true); + b.addOr(new Range('a', 'z')); + b.addOr(new Range('A', 'Z')); + b.addOr(new Range('0', '9')); + b.addOr(new oneChar('_')); + add(b); + } + else if (sp.dontMatch && sp.c == 'S') + { + // Regex r = new Regex(); + // r._compile("[^ \t\n\r\b]",mk); + // add(new Goop("\\S",r.thePattern)); + Bracket b = new Bracket(true); + b.addOr(new oneChar((char) 32)); + b.addOr(new Range((char) 8, (char) 10)); + b.addOr(new oneChar((char) 13)); + add(b); + } + else if (sp.dontMatch && sp.c == 'D') + { + // Regex r = new Regex(); + // r._compile("[^0-9]",mk); + // add(new Goop("\\D",r.thePattern)); + Bracket b = new Bracket(true); + b.addOr(new Range('0', '9')); + add(b); + } + else if (sp.dontMatch && sp.c == 'B') + { + Regex r = new Regex(); + r._compile("(?!" + back_slash + "b)", mk); + add(r.thePattern); + } + else if (isOctalString(sp)) + { + int d = sp.c - '0'; + sp.inc(); + d = 8 * d + sp.c - '0'; + StrPos sp2 = new StrPos(sp); + sp2.inc(); + if (isOctalDigit(sp2, false)) + { + sp.inc(); + d = 8 * d + sp.c - '0'; + } + add(new oneChar((char) d)); + } + else if (sp.dontMatch && sp.c >= '1' && sp.c <= '9') + { + int iv = sp.c - '0'; + StrPos s2 = new StrPos(sp); + s2.inc(); + if (!s2.dontMatch && s2.c >= '0' && s2.c <= '9') + { + iv = 10 * iv + (s2.c - '0'); + sp.inc(); + } + add(new BackMatch(iv)); + } + else if (sp.dontMatch && sp.c == 'b') + { + add(new Boundary()); + } + else if (sp.match('\b')) + { + add(new Boundary()); + } + else if (sp.match('$')) + { + add(new End(true)); + } + else if (sp.dontMatch && sp.c == 'Z') + { + add(new End(false)); + } + else if (sp.match('.')) + { + add(new Any()); + } + else if (sp.incMatch("(??")) + { + StringBuffer sb = new StringBuffer(); + StringBuffer sb2 = new StringBuffer(); + while (!sp.match(')') && !sp.match(':')) + { + 
sb.append(sp.c); + sp.inc(); + } + if (sp.incMatch(":")) + { + while (!sp.match(')')) + { + sb2.append(sp.c); + sp.inc(); + } + } + String sbs = sb.toString(); + if (validators.get(sbs) instanceof String) + { + String pat = (String) validators.get(sbs); + Regex r = newRegex(); + Rthings rth = new Rthings(this); + rth.noBackRefs = true; + r._compile(pat, rth); + add(r.thePattern); + } + else + { + Custom cm = new Custom(sb.toString()); + if (cm.v != null) + { + Validator v2 = cm.v.arg(sb2.toString()); + if (v2 != null) + { + v2.argsave = sb2.toString(); + String p = cm.v.pattern; + cm.v = v2; + v2.pattern = p; + } + Regex r = newRegex(); + Rthings rth = new Rthings(this); + rth.noBackRefs = true; + r._compile(cm.v.pattern, rth); + cm.sub = r.thePattern; + cm.sub.add(new CustomEndpoint(cm)); + cm.sub.setParent(cm); + add(cm); + } + } + } + else if (sp.match('(')) + { + mk.parenLevel++; + Regex r = newRegex(); + // r.or = new Or(); + sp.inc(); + if (sp.incMatch("?:")) + { + r.or = new Or(); + } + else if (sp.incMatch("?=")) + { + r.or = new lookAhead(false); + } + else if (sp.incMatch("?!")) + { + r.or = new lookAhead(true); + } + else if (sp.match('?')) + { + sp.inc(); + do + { + if (sp.c == 'i') + { + mk.ignoreCase = true; + } + if (sp.c == 'Q') + { + mk.dontMatchInQuotes = true; + } + if (sp.c == 'o') + { + mk.optimizeMe = true; + } + if (sp.c == 'g') + { + mk.gFlag = true; + } + if (sp.c == 's') + { + mk.sFlag = true; + } + if (sp.c == 'm') + { + mk.mFlag = true; + } + sp.inc(); + } while (!sp.match(')') && !sp.eos); + r = null; + mk.parenLevel--; + if (sp.eos) // throw new RegSyntax + { + RegSyntaxError.endItAll("Unclosed ()"); + } + } + else + { // just ordinary parenthesis + r.or = mk.noBackRefs ? 
new Or() : new OrMark(mk.val++); + } + if (r != null) + { + add(r._compile(sp, mk)); + } + } + else if (sp.match('^')) + { + add(new Start(true)); + } + else if (sp.dontMatch && sp.c == 'A') + { + add(new Start(false)); + } + else if (sp.match('*')) + { + addMulti(new patInt(0), new patInf()); + } + else if (sp.match('+')) + { + addMulti(new patInt(1), new patInf()); + } + else if (sp.match('?')) + { + addMulti(new patInt(0), new patInt(1)); + } + else if (sp.match('{')) + { + boolean bad = false; + StrPos sp2 = new StrPos(sp); + // StringBuffer sb = new StringBuffer(); + sp.inc(); + patInt i1 = sp.getPatInt(); + patInt i2 = null; + if (sp.match('}')) + { + i2 = i1; + } + else + { + if (!sp.match(',')) + { + /* + * RegSyntaxError.endItAll( "String \"{"+i2+ "\" should be followed + * with , or }"); + */ + bad = true; + } + sp.inc(); + if (sp.match('}')) + { + i2 = new patInf(); + } + else + { + i2 = sp.getPatInt(); + } + } + if (i1 == null || i2 == null) + { + /* + * throw new RegSyntax("Badly formatted Multi: " +"{"+i1+","+i2+"}"); + */ + bad = true; + } + if (bad) + { + sp.dup(sp2); + add(new oneChar(sp.c)); + } + else + { + addMulti(i1, i2); + } + } + else if (sp.escMatch('x') && next2Hex(sp)) + { + sp.inc(); + int d = getHexDigit(sp); + sp.inc(); + d = 16 * d + getHexDigit(sp); + add(new oneChar((char) d)); + } + else if (sp.escMatch('c')) + { + sp.inc(); + if (sp.c < Ctrl.cmap.length) + { + add(new oneChar(Ctrl.cmap[sp.c])); + } + else + { + add(new oneChar(sp.c)); + } + } + else if (sp.escMatch('f')) + { + add(new oneChar((char) 12)); + } + else if (sp.escMatch('a')) + { + add(new oneChar((char) 7)); + } + else if (sp.escMatch('t')) + { + add(new oneChar('\t')); + } + else if (sp.escMatch('n')) + { + add(new oneChar('\n')); + } + else if (sp.escMatch('r')) + { + add(new oneChar('\r')); + } + else if (sp.escMatch('b')) + { + add(new oneChar('\b')); + } + else if (sp.escMatch('e')) + { + add(new oneChar((char) 27)); + } + else + { + add(new oneChar(sp.c)); + if 
(sp.match(')')) + { + RegSyntaxError.endItAll("Unmatched right paren in pattern"); + } + } + } + + // compiles all Pattern elements, internal method + private Pattern _compile(String pat, Rthings mk) throws RegSyntax + { + minMatch = null; + sFlag = mFlag = ignoreCase = gFlag = false; + StrPos sp = new StrPos(pat, 0); + thePattern = _compile(sp, mk); + pt.marks = null; + return thePattern; + } + + Pattern p = null; + + Or or = null; + + Pattern _compile(StrPos sp, Rthings mk) throws RegSyntax + { + while (!(sp.eos || (or != null && sp.match(')')))) + { + compile1(sp, mk); + sp.inc(); + } + if (sp.match(')')) + { + mk.parenLevel--; + } + else if (sp.eos && mk.parenLevel != 0) + { + RegSyntaxError.endItAll("Unclosed Parenthesis! lvl=" + mk.parenLevel); + } + if (or != null) + { + if (p == null) + { + p = new NullPattern(); + } + or.addOr(p); + return or; + } + return p == null ? new NullPattern() : p; + } + + // add a multi object to the end of the chain + // which applies to the last object + void addMulti(patInt i1, patInt i2) throws RegSyntax + { + Pattern last, last2; + for (last = p; last != null && last.next != null; last = last.next) + { + ; + } + if (last == null || last == p) + { + last2 = null; + } + else + { + for (last2 = p; last2.next != last; last2 = last2.next) + { + ; + } + } + if (last instanceof Multi && i1.intValue() == 0 && i2.intValue() == 1) + { + ((Multi) last).matchFewest = true; + } + else if (last instanceof FastMulti && i1.intValue() == 0 + && i2.intValue() == 1) + { + ((FastMulti) last).matchFewest = true; + } + else if (last instanceof DotMulti && i1.intValue() == 0 + && i2.intValue() == 1) + { + ((DotMulti) last).matchFewest = true; + } + else if (last instanceof Multi || last instanceof DotMulti + || last instanceof FastMulti) + { + throw new RegSyntax("Syntax error."); + } + else if (last2 == null) + { + p = mkMulti(i1, i2, p); + } + else + { + last2.next = mkMulti(i1, i2, last); + } + } + + final static Pattern mkMulti(patInt lo, 
patInt hi, Pattern p) + throws RegSyntax + { + if (p instanceof Any && p.next == null) + { + return (Pattern) new DotMulti(lo, hi); + } + return RegOpt.safe4fm(p) ? (Pattern) new FastMulti(lo, hi, p) + : (Pattern) new Multi(lo, hi, p); + } + + // process the bracket operator + Pattern matchBracket(StrPos sp) throws RegSyntax + { + Bracket ret; + if (sp.match('^')) + { + ret = new Bracket(true); + sp.inc(); + } + else + { + ret = new Bracket(false); + } + if (sp.match(']')) + { + // throw new RegSyntax + RegSyntaxError.endItAll("Unmatched []"); + } + + while (!sp.eos && !sp.match(']')) + { + StrPos s1 = new StrPos(sp); + s1.inc(); + StrPos s1_ = new StrPos(s1); + s1_.inc(); + if (s1.match('-') && !s1_.match(']')) + { + StrPos s2 = new StrPos(s1); + s2.inc(); + if (!s2.eos) + { + ret.addOr(new Range(sp.c, s2.c)); + } + sp.inc(); + sp.inc(); + } + else if (sp.escMatch('Q')) + { + sp.inc(); + while (!sp.escMatch('E')) + { + ret.addOr(new oneChar(sp.c)); + sp.inc(); + } + } + else if (sp.escMatch('d')) + { + ret.addOr(new Range('0', '9')); + } + else if (sp.escMatch('s')) + { + ret.addOr(new oneChar((char) 32)); + ret.addOr(new Range((char) 8, (char) 10)); + ret.addOr(new oneChar((char) 13)); + } + else if (sp.escMatch('w')) + { + ret.addOr(new Range('a', 'z')); + ret.addOr(new Range('A', 'Z')); + ret.addOr(new Range('0', '9')); + ret.addOr(new oneChar('_')); + } + else if (sp.escMatch('D')) + { + ret.addOr(new Range((char) 0, (char) 47)); + ret.addOr(new Range((char) 58, (char) 65535)); + } + else if (sp.escMatch('S')) + { + ret.addOr(new Range((char) 0, (char) 7)); + ret.addOr(new Range((char) 11, (char) 12)); + ret.addOr(new Range((char) 14, (char) 31)); + ret.addOr(new Range((char) 33, (char) 65535)); + } + else if (sp.escMatch('W')) + { + ret.addOr(new Range((char) 0, (char) 64)); + ret.addOr(new Range((char) 91, (char) 94)); + ret.addOr(new oneChar((char) 96)); + ret.addOr(new Range((char) 123, (char) 65535)); + } + else if (sp.escMatch('x') && next2Hex(sp)) + { + 
sp.inc(); + int d = getHexDigit(sp); + sp.inc(); + d = 16 * d + getHexDigit(sp); + ret.addOr(new oneChar((char) d)); + } + else if (sp.escMatch('a')) + { + ret.addOr(new oneChar((char) 7)); + } + else if (sp.escMatch('f')) + { + ret.addOr(new oneChar((char) 12)); + } + else if (sp.escMatch('e')) + { + ret.addOr(new oneChar((char) 27)); + } + else if (sp.escMatch('n')) + { + ret.addOr(new oneChar('\n')); + } + else if (sp.escMatch('t')) + { + ret.addOr(new oneChar('\t')); + } + else if (sp.escMatch('r')) + { + ret.addOr(new oneChar('\r')); + } + else if (sp.escMatch('c')) + { + sp.inc(); + if (sp.c < Ctrl.cmap.length) + { + ret.addOr(new oneChar(Ctrl.cmap[sp.c])); + } + else + { + ret.addOr(new oneChar(sp.c)); + } + } + else if (isOctalString(sp)) + { + int d = sp.c - '0'; + sp.inc(); + d = 8 * d + sp.c - '0'; + StrPos sp2 = new StrPos(sp); + sp2.inc(); + if (isOctalDigit(sp2, false)) + { + sp.inc(); + d = 8 * d + sp.c - '0'; + } + ret.addOr(new oneChar((char) d)); + } + else + { + ret.addOr(new oneChar(sp.c)); + } + sp.inc(); + } + return ret; + } + + /** + * Converts the stored Pattern to a String -- this is a decompile. Note that + * \t and \n will really print out here, Not just the two character + * representations. Also be prepared to see some strange output if your + * characters are not printable. 
+ */ + public String toString() + { + if (false && thePattern == null) + { + return ""; + } + else + { + StringBuffer sb = new StringBuffer(); + if (esc != Pattern.ESC) + { + sb.append("(?e="); + sb.append(esc); + sb.append(")"); + } + if (gFlag || mFlag || !dotDoesntMatchCR || sFlag || ignoreCase + || dontMatchInQuotes || optimized()) + { + sb.append("(?"); + if (ignoreCase) + { + sb.append("i"); + } + if (mFlag) + { + sb.append("m"); + } + if (sFlag || !dotDoesntMatchCR) + { + sb.append("s"); + } + if (dontMatchInQuotes) + { + sb.append("Q"); + } + if (optimized()) + { + sb.append("o"); + } + if (gFlag) + { + sb.append("g"); + } + sb.append(")"); + } + String patstr = thePattern.toString(); + if (esc != Pattern.ESC) + { + patstr = reEscape(patstr, Pattern.ESC, esc); + } + sb.append(patstr); + return sb.toString(); + } + } + + // Re-escape Pattern, allows us to use a different escape + // character. + static String reEscape(String s, char oldEsc, char newEsc) + { + if (oldEsc == newEsc) + { + return s; + } + int i; + StringBuffer sb = new StringBuffer(); + for (i = 0; i < s.length(); i++) + { + if (s.charAt(i) == oldEsc && i + 1 < s.length()) + { + if (s.charAt(i + 1) == oldEsc) + { + sb.append(oldEsc); + } + else + { + sb.append(newEsc); + sb.append(s.charAt(i + 1)); + } + i++; + } + else if (s.charAt(i) == newEsc) + { + sb.append(newEsc); + sb.append(newEsc); + } + else + { + sb.append(s.charAt(i)); + } + } + return sb.toString(); + } + + /** + * This method implements FilenameFilter, allowing one to use a Regex to + * search through a directory using File.list. There is a FileRegex now that + * does this better. 
+ * + * @see com.stevesoft.pat.FileRegex + */ + public boolean accept(File dir, String s) + { + return search(s); + } + + /** The version of this package */ + final static public String version() + { + return "lgpl release 1.5.3"; + } + + /** + * Once this method is called, the state of variables ignoreCase and + * dontMatchInQuotes should not be changed as the results will be + * unpredictable. However, search and matchAt will run more quickly. Note that + * you can check to see if the pattern has been optimized by calling the + * optimized() method. + *
+ * This method will attempt to rewrite your pattern in a way that makes it + * faster (not all patterns execute at the same speed). In general, "(?: ... )" + * will be faster than "( ... )" so if you don't need the backreference, you + * should group using the former pattern. + *
+ * It will also introduce new pattern elements that you can't get to + * otherwise, for example if you have a large table of strings, i.e. the + * months of the year "(January|February|...)" optimize() will make a + * Hashtable that takes it to the next appropriate pattern element -- + * eliminating the need for a linear search. + * + * @see com.stevesoft.pat.Regex#optimized + * @see com.stevesoft.pat.Regex#ignoreCase + * @see com.stevesoft.pat.Regex#dontMatchInQuotes + * @see com.stevesoft.pat.Regex#matchAt + * @see com.stevesoft.pat.Regex#search + */ + public void optimize() + { + if (optimized() || thePattern == null) + { + return; + } + minMatch = new patInt(0); // thePattern.countMinChars(); + thePattern = RegOpt.opt(thePattern, ignoreCase, dontMatchInQuotes); + skipper = Skip.findSkip(this); + // RegOpt.setParents(this); + return; + } + + Skip skipper; + + /** + * This function returns true if the optimize method has been called. + */ + public boolean optimized() + { + return minMatch != null; + } + + /** + * A bit of syntactic surgar for those who want to make their code look more + * perl-like. To use this initialize your Regex object by saying: + * + *
+ * Regex r1 = Regex.perlCode("s/hello/goodbye/"); + * Regex r2 = Regex.perlCode("s'fish'frog'i"); + * Regex r3 = Regex.perlCode("m'hello'"); + *+ * + * The i for ignoreCase is supported in this syntax, as well as m, s, and x. + * The g flag is a bit of a special case. + *
+ * If you wish to replace all occurrences of a pattern, you do not put a 'g' in + * the perlCode, but call Regex's replaceAll method. + *
+ * If you wish to simply and only do a search for r2's pattern, you can do + * this by calling the searchFrom method repeatedly, or by calling + * search repeatedly if the g flag is set. + *
+ * Note: Currently perlCode does not support the (?e=#) syntax for + * changing the escape character. + */ + + public static Regex perlCode(String s) + { + // this file is big enough, see parsePerl.java + // for this function. + return parsePerl.parse(s); + } + + static final char back_slash = '\\'; + + /** + * Checks to see if there are only literal and no special pattern elements in + * this Regex. + */ + public boolean isLiteral() + { + Pattern x = thePattern; + while (x != null) + { + if (x instanceof oneChar) + { + ; + } + else if (x instanceof Skipped) + { + ; + } + else + { + return false; + } + x = x.next; + } + return true; + } + + /** + * You only need to know about this if you are inventing your own pattern + * elements. + */ + public patInt countMinChars() + { + return thePattern.countMinChars(); + } + + /** + * You only need to know about this if you are inventing your own pattern + * elements. + */ + public patInt countMaxChars() + { + return thePattern.countMaxChars(); + } + + boolean isHexDigit(StrPos sp) + { + boolean r = !sp.eos + && !sp.dontMatch + && ((sp.c >= '0' && sp.c <= '9') + || (sp.c >= 'a' && sp.c <= 'f') || (sp.c >= 'A' && sp.c <= 'F')); + return r; + } + + boolean isOctalDigit(StrPos sp, boolean first) + { + boolean r = !sp.eos && !(first ^ sp.dontMatch) && sp.c >= '0' + && sp.c <= '7'; + return r; + } + + int getHexDigit(StrPos sp) + { + if (sp.c >= '0' && sp.c <= '9') + { + return sp.c - '0'; + } + if (sp.c >= 'a' && sp.c <= 'f') + { + return sp.c - 'a' + 10; + } + return sp.c - 'A' + 10; + } + + boolean next2Hex(StrPos sp) + { + StrPos sp2 = new StrPos(sp); + sp2.inc(); + if (!isHexDigit(sp2)) + { + return false; + } + sp2.inc(); + if (!isHexDigit(sp2)) + { + return false; + } + return true; + } + + boolean isOctalString(StrPos sp) + { + if (!isOctalDigit(sp, true)) + { + return false; + } + StrPos sp2 = new StrPos(sp); + sp2.inc(); + if (!isOctalDigit(sp2, false)) + { + return false; + } + return true; + } +}