X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fcom%2Fstevesoft%2Fpat%2FRegex.java;h=fea3d07f77ad7f0ad39a9749db56874cebf123f5;hb=57738a1f3c19b1c3a00bd3ac5108f8cd0af32f99;hp=665632e67a7e29eadc10c99157b4070b124fc45e;hpb=f24dacb1da56fccf05d684e2f4899facec2aecf7;p=jalview.git diff --git a/src/com/stevesoft/pat/Regex.java b/src/com/stevesoft/pat/Regex.java index 665632e..fea3d07 100755 --- a/src/com/stevesoft/pat/Regex.java +++ b/src/com/stevesoft/pat/Regex.java @@ -1,1429 +1,2116 @@ -// -// This software is now distributed according to -// the Lesser Gnu Public License. Please see -// http://www.gnu.org/copyleft/lesser.txt for -// the details. -// -- Happy Computing! -// -package com.stevesoft.pat; -import java.util.*; -import java.io.*; -import com.stevesoft.pat.wrap.StringWrap; - - -/** Matches a Unicode punctuation character. */ -class UnicodePunct extends UniValidator { - public int validate(StringLike s,int from,int to) { - return from= s.length()) return -1; - char c = s.charAt(from); - return (Prop.isAlphabetic(c)||Prop.isDecimalDigit(c)||c=='_') ? to : -1; - } -} - -/** Matches a character that is not a Unicode alphanumeric or underscore. */ -class NUnicodeW extends UniValidator { - public int validate(StringLike s,int from,int to) { - if(from >= s.length()) return -1; - char c = s.charAt(from); - return !(Prop.isAlphabetic(c)||Prop.isDecimalDigit(c)||c=='_') ? to : -1; - } -} - -/** Matches a Unicode decimal digit. */ -class UnicodeDigit extends UniValidator { - public int validate(StringLike s,int from,int to) { - return from -For the purpose of this documentation, the fact that java interprets the -backslash will be ignored. In practice, however, you will need a -double backslash to obtain a string that contains a single backslash -character. Thus, the example pattern "\b" should really be typed -as "\\b" inside java code. -

-Note that Regex is part of package "com.stevesoft.pat". -To use it, simply import -com.stevesoft.pat.Regex at the top of your file. -

-Regex is made with a constructor that takes a String that defines -the regular expression. Thus, for example -

-      Regex r = new Regex("[a-c]*");
-
-matches any number of characters so long as the are 'a', 'b', or 'c'). -

-To attempt to match the Pattern to a given string, you can use either -the search(String) member function, or the matchAt(String,int position) -member function. These functions return a boolean which tells you -whether or not the thing worked, and sets the methods "charsMatched()" -and "matchedFrom()" in the Regex object appropriately. -

-The portion of the string before the match can be obtained by the -left() member, and the portion after the match can be obtained -by the right() member. -

-Essentially, this package implements a syntax that is very much -like the perl 5 regular expression syntax. - -Longer example: -

-        Regex r = new Regex("x(a|b)y");
-        r.matchAt("xay",0);
-        System.out.println("sub = "+r.stringMatched(1));
-
-The above would print "sub = a". -
-        r.left() // would return "x"
-        r.right() // would return "y"
-
-

-Differences between this package and perl5:
-The extended Pattern for setting flags, is now supported, -but the flags are different. "(?i)" tells the pattern to -ignore case, "(?Q)" sets the "dontMatchInQuotes" flag, and -"(?iQ)" sets them both. You can change the escape character. -The pattern

(?e=#)#d+
is the same as
\d+
, -but note that the sequence
(?e=#)
must occur -at the very beginning of the pattern. There may be other small -differences as well. I will either make my package conform -or note them as I become aware of them. -

-This package supports additional patterns not in perl5: -

- - - -
(?@())GroupThis matches all characters between -the '(' character and the balancing ')' character. Thus, it will -match "()" as well as "(())". The balancing characters are -arbitrary, thus (?@{}) matches on "{}" and "{{}}".
(?<1)BackupMoves the pointer backwards within the text. -This allows you to make a "look behind." It fails if it -attempts to move to a position before the beginning of the string. -"x(?<1)" is equivalent to "(?=x)". The number, 1 in this example, -is the number of characters to move backwards.
-
- -@author Steven R. Brandt -@version package com.stevesoft.pat, release 1.5.3 -@see Pattern -*/ -public class Regex extends RegRes implements FilenameFilter { - /** BackRefOffset gives the identity number of the first - pattern. Version 1.0 used zero, version 1.1 uses 1 to be - more compatible with perl. */ - static int BackRefOffset = 1; - private static Pattern none = new NoPattern(); - Pattern thePattern = none; - patInt minMatch = new patInt(0); - - static Hashtable validators = new Hashtable(); - static { - define("p","(?>1)",new UnicodePunct()); - define("P","(?>1)",new NUnicodePunct()); - define("s","(?>1)",new UnicodeWhite()); - define("S","(?>1)",new NUnicodeWhite()); - define("w","(?>1)",new UnicodeW()); - define("W","(?>1)",new NUnicodeW()); - define("d","(?>1)",new UnicodeDigit()); - define("D","(?>1)",new NUnicodeDigit()); - define("m","(?>1)",new UnicodeMath()); - define("M","(?>1)",new NUnicodeMath()); - define("c","(?>1)",new UnicodeCurrency()); - define("C","(?>1)",new NUnicodeCurrency()); - define("a","(?>1)",new UnicodeAlpha()); - define("A","(?>1)",new NUnicodeAlpha()); - define("uc","(?>1)",new UnicodeUpper()); - define("lc","(?>1)",new UnicodeLower()); - } - - /** Set the dontMatch in quotes flag. */ - public void setDontMatchInQuotes(boolean b) { - dontMatchInQuotes = b; - } - /** Find out if the dontMatchInQuotes flag is enabled. */ - public boolean getDontMatchInQuotes() { - return dontMatchInQuotes; - } - boolean dontMatchInQuotes = false; - - /** Set the state of the ignoreCase flag. If set to true, then - the pattern matcher will ignore case when searching for a - match. */ - public void setIgnoreCase(boolean b) { - ignoreCase = b; - } - /** Get the state of the ignoreCase flag. Returns true if we - are ignoring the case of the pattern, false otherwise. */ - public boolean getIgnoreCase() { - return ignoreCase; - } - boolean ignoreCase = false; - - static boolean defaultMFlag = false; - /** Set the default value of the m flag. 
If it - is set to true, then the MFlag will be on - for any regex search executed. */ - public static void setDefaultMFlag(boolean mFlag) { - defaultMFlag = mFlag; - } - /** Get the default value of the m flag. If it - is set to true, then the MFlag will be on - for any regex search executed. */ - public static boolean getDefaultMFlag() { - return defaultMFlag; - } - - /** Initializes the object without a Pattern. To supply a Pattern - use compile(String s). - @see com.stevesoft.pat.Regex#compile(java.lang.String) - */ - public Regex() {} - /** Create and compile a Regex, but do not throw any exceptions. - If you wish to have exceptions thrown for syntax errors, - you must use the Regex(void) constructor to create the - Regex object, and then call the compile method. Therefore, you - should only call this method when you know your pattern is right. - I will probably become more like - @see com.stevesoft.pat.Regex#search(java.lang.String) - @see com.stevesoft.pat.Regex#compile(java.lang.String) - */ - public Regex(String s) { - try { - compile(s); - } catch(RegSyntax rs) {} - } - - ReplaceRule rep = null; - /** Create and compile both a Regex and a ReplaceRule. - @see com.stevesoft.pat.ReplaceRule - @see com.stevesoft.pat.Regex#compile(java.lang.String) - */ - public Regex(String s,String rp) { - this(s); - rep = ReplaceRule.perlCode(rp); - } - /** Create and compile a Regex, but give it the ReplaceRule - specified. This allows the user finer control of the - Replacement process, if that is desired. - @see com.stevesoft.pat.ReplaceRule - @see com.stevesoft.pat.Regex#compile(java.lang.String) - */ - public Regex(String s,ReplaceRule rp) { - this(s); - rep = rp; - } - - /** Change the ReplaceRule of this Regex by compiling - a new one using String rp. */ - public void setReplaceRule(String rp) { - rep = ReplaceRule.perlCode(rp); - repr = null; // Clear Replacer history - } - - /** Change the ReplaceRule of this Regex to rp. 
*/ - public void setReplaceRule(ReplaceRule rp) { - rep = rp; - } - /** Test to see if a custom defined rule exists. - @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator) - */ - public static boolean isDefined(String nm) { - return validators.get(nm) != null; - } - /** Removes a custom defined rule. - @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator) - */ - public static void undefine(String nm) { - validators.remove(nm); - } - /** Defines a method to create a new rule. See test/deriv2.java - and test/deriv3.java for examples of how to use it. */ - public static void define(String nm,String pat,Validator v) { - v.pattern = pat; - validators.put(nm,v); - } - /** Defines a shorthand for a pattern. The pattern will be - invoked by a string that has the form "(??"+nm+")". - */ - public static void define(String nm,String pat) { - validators.put(nm,pat); - } - - /** Get the current ReplaceRule. */ - public ReplaceRule getReplaceRule() { return rep; } - - Replacer repr = null; - final Replacer _getReplacer() { - return repr==null ? repr=new Replacer() : repr; - } - public Replacer getReplacer() { - if(repr == null) - repr = new Replacer(); - repr.rh.me = this; - repr.rh.prev = null; - return repr; - } - /** Replace the first occurence of this pattern in String s - according to the ReplaceRule. - @see com.stevesoft.pat.ReplaceRule - @see com.stevesoft.pat.Regex#getReplaceRule() - */ - public String replaceFirst(String s) { - return _getReplacer().replaceFirstRegion(s,this,0,s.length()).toString(); - } - /** Replace the first occurence of this pattern in String s - beginning with position pos according to the ReplaceRule. 
- @see com.stevesoft.pat.ReplaceRule - @see com.stevesoft.pat.Regex#getReplaceRule() - */ - public String replaceFirstFrom(String s,int pos) { - return _getReplacer().replaceFirstRegion(s,this,pos,s.length()).toString(); - } - /** Replace the first occurence of this pattern in String s - beginning with position start and ending with end - according to the ReplaceRule. - @see com.stevesoft.pat.ReplaceRule - @see com.stevesoft.pat.Regex#getReplaceRule() - */ - public String replaceFirstRegion(String s,int start,int end) { - return _getReplacer().replaceFirstRegion(s,this,start,end).toString(); - } - - /** Replace all occurences of this pattern in String s - according to the ReplaceRule. - @see com.stevesoft.pat.ReplaceRule - @see com.stevesoft.pat.Regex#getReplaceRule() - */ - public String replaceAll(String s) { - return _getReplacer().replaceAllRegion(s,this,0,s.length()).toString(); - } - public StringLike replaceAll(StringLike s) { - return _getReplacer().replaceAllRegion(s,this,0,s.length()); - } - /** Replace all occurences of this pattern in String s - beginning with position pos according to the ReplaceRule. - @see com.stevesoft.pat.ReplaceRule - @see com.stevesoft.pat.Regex#getReplaceRule() - */ - public String replaceAllFrom(String s,int pos) { - return _getReplacer().replaceAllRegion(s,this,pos,s.length()).toString(); - } - /** Replace all occurences of this pattern in String s - beginning with position start and ending with end - according to the ReplaceRule. 
- @see com.stevesoft.pat.ReplaceRule - @see com.stevesoft.pat.Regex#getReplaceRule() - */ - public String replaceAllRegion(String s,int start,int end) { - return _getReplacer().replaceAllRegion(s,this,start,end).toString(); - } - - - /** Essentially clones the Regex object */ - public Regex(Regex r) { - super((RegRes)r); - dontMatchInQuotes = r.dontMatchInQuotes; - esc = r.esc; - ignoreCase = r.ignoreCase; - gFlag = r.gFlag; - if(r.rep==null) - rep = null; - else - rep = (ReplaceRule)r.rep.clone(); - /* try { - compile(r.toString()); - } catch(RegSyntax r_) {} */ - thePattern = r.thePattern.clone(new Hashtable()); - minMatch = r.minMatch; - skipper = r.skipper; - } - - /** By default, - the escape character is the backslash, but you can - make it anything you want by setting this variable. */ - public char esc = Pattern.ESC; - /** This method compiles a regular expression, making it - possible to call the search or matchAt methods. - @exception com.stevesoft.pat.RegSyntax - is thrown if a syntax error is encountered - in the pattern. - For example, "x{3,1}" or "*a" are not valid - patterns. - @see com.stevesoft.pat.Regex#search - @see com.stevesoft.pat.Regex#matchAt - */ - public void compile(String prepat) throws RegSyntax { - String postpat = parsePerl.codify(prepat,true); - String pat = postpat==null ? 
prepat : postpat; - minMatch = null; - ignoreCase = false; - dontMatchInQuotes = false; - Rthings mk = new Rthings(this); - int offset = mk.val; - String newpat = pat; - thePattern = none; - p = null; - or = null; - minMatch = new patInt(0); - StrPos sp = new StrPos(pat,0); - if(sp.incMatch("(?e=")) { - char newEsc = sp.c; - sp.inc(); - if(sp.match(')')) - newpat = reEscape(pat.substring(6), - newEsc,Pattern.ESC); - } else if(esc != Pattern.ESC) - newpat = reEscape(pat,esc,Pattern.ESC); - thePattern = _compile(newpat,mk); - numSubs_ = mk.val-offset; - mk.set(this); - } - - /* If a Regex is compared against a Regex, a check is - done to see that the patterns are equal as well as - the most recent match. If a Regex is compare with - a RegRes, only the result of the most recent match - is compared. */ - public boolean equals(Object o) { - if(o instanceof Regex) { - if(toString().equals(o.toString())) - return super.equals(o); - else - return false; - } else return super.equals(o); - } - - /** A clone by any other name would smell as sweet. */ - public Object clone() { - return new Regex(this); - } - /** Return a clone of the underlying RegRes object. */ - public RegRes result() { - return (RegRes)super.clone(); - } - - // prep sets global variables of class - // Pattern so that it can access them - // during an attempt at a match - Pthings pt = new Pthings(); - final Pthings prep(StringLike s) { - //if(gFlag) - pt.lastPos = matchedTo(); - if(pt.lastPos < 0) pt.lastPos = 0; - if( (s==null ? null : s.unwrap()) != (src==null ? 
null : s.unwrap()) ) - pt.lastPos = 0; - src = s; - pt.dotDoesntMatchCR=dotDoesntMatchCR && (!sFlag); - pt.mFlag = (mFlag | defaultMFlag); - pt.ignoreCase = ignoreCase; - pt.no_check = false; - if(pt.marks != null) - for(int i=0;istart*/ - public boolean searchFrom(String s,int start) { - if(s==null) - throw new NullPointerException("Null String Given to Regex.searchFrom"); - return _search(s,start,s.length()); - } - public boolean searchFrom(StringLike s,int start) { - if(s==null) - throw new NullPointerException("Null String Given to Regex.searchFrom"); - return _search(s,start,s.length()); - } - /** Search through a region of a String - for the first occurence of a match. */ - public boolean searchRegion(String s,int start,int end) { - if(s==null) - throw new NullPointerException("Null String Given to Regex.searchRegion"); - return _search(s,start,end); - } - /** Set this to change the default behavior of the "." pattern. - By default it now matches perl's behavior and fails to - match the '\n' character. */ - public static boolean dotDoesntMatchCR = true; - StringLike gFlags; - int gFlagto = 0; - boolean gFlag = false; - /** Set the 'g' flag */ - public void setGFlag(boolean b) { - gFlag = b; - } - /** Get the state of the 'g' flag. */ - public boolean getGFlag() { - return gFlag; - } - boolean sFlag = false; - /** Get the state of the sFlag */ - public boolean getSFlag() { - return sFlag; - } - boolean mFlag = false; - /** Get the state of the sFlag */ - public boolean getMFlag() { - return mFlag; - } - - final boolean _search(String s,int start,int end) { - return _search(new StringWrap(s),start,end); - } - final boolean _search(StringLike s,int start,int end) { - if(gFlag && gFlagto > 0 && gFlags!=null && s.unwrap()==gFlags.unwrap()) - start = gFlagto; - gFlags = null; - - Pthings pt=prep(s); - - int up = (minMatch == null ? 
end : end-minMatch.i); - - if(up < start && end >= start) up = start; - - if(skipper == null) { - for(int i=start;i<=up;i++) { - charsMatched_ = thePattern.matchAt(s,i,pt); - if(charsMatched_ >= 0) { - matchFrom_ = thePattern.mfrom; - marks = pt.marks; - gFlagto = matchFrom_+charsMatched_; - gFlags = s; - return didMatch_=true; - } - } - } else { - pt.no_check = true; - for(int i=start;i<=up;i++) { - i = skipper.find(src,i,up); - if(i<0) { - charsMatched_ = matchFrom_ = -1; - return didMatch_ = false; - } - charsMatched_ = thePattern.matchAt(s,i,pt); - if(charsMatched_ >= 0) { - matchFrom_ = thePattern.mfrom; - marks = pt.marks; - gFlagto = matchFrom_+charsMatched_; - gFlags = s; - return didMatch_=true; - } - } - } - return didMatch_=false; - } - /*final boolean _search(LongStringLike s,long start,long end) { - if(gFlag && gFlagto > 0 && s==gFlags) - start = gFlagto; - gFlags = null; - - Pthings pt=prep(s); - - int up = end;//(minMatch == null ? end : end-minMatch.i); - - if(up < start && end >= start) up = start; - - if(skipper == null) { - for(long i=start;i<=up;i++) { - charsMatched_ = thePattern.matchAt(s,i,pt); - if(charsMatched_ >= 0) { - matchFrom_ = thePattern.mfrom; - marks = pt.marks; - gFlagto = matchFrom_+charsMatched_; - return didMatch_=true; - } - } - } else { - pt.no_check = true; - for(long i=start;i<=up;i++) { - i = skipper.find(src,i,up); - if(i<0) { - charsMatched_ = matchFrom_ = -1; - return didMatch_ = false; - } - charsMatched_ = thePattern.matchAt(s,i,pt); - if(charsMatched_ >= 0) { - matchFrom_ = thePattern.mfrom; - marks = pt.marks; - gFlagto = matchFrom_+charsMatched_; - gFlags = s; - return didMatch_=true; - } else { - i = s.adjustIndex(i); - up = s.adjustEnd(i); - } - } - } - return didMatch_=false; - }*/ - - boolean _reverseSearch(String s,int start,int end) { - return _reverseSearch(new StringWrap(s),start,end); - } - boolean _reverseSearch(StringLike s,int start,int end) { - if(gFlag && gFlagto > 0 && s.unwrap()==gFlags.unwrap()) - 
end = gFlagto; - gFlags = null; - Pthings pt=prep(s); - for(int i=end;i>=start;i--) { - charsMatched_ = thePattern.matchAt(s,i,pt); - if(charsMatched_ >= 0) { - matchFrom_ = thePattern.mfrom; - marks = pt.marks; - gFlagto = matchFrom_-1; - gFlags = s; - return didMatch_=true; - } - } - return didMatch_=false; - } - - // This routine sets the cbits variable - // of class Pattern. Cbits is true for - // the bit corresponding to a character inside - // a set of quotes. - static StringLike lasts=null; - static BitSet lastbs=null; - static void setCbits(StringLike s,Pthings pt) { - if(s == lasts) { - pt.cbits = lastbs; - return; - } - BitSet bs = new BitSet(s.length()); - char qc = ' '; - boolean setBit = false; - for(int i=0;i")) { - patInt i = sp.getPatInt(); - if(i==null) RegSyntaxError.endItAll("No int after (?>"); - add(new Backup(-i.intValue())); - if(!sp.match(')')) RegSyntaxError.endItAll("No ) after (?<"); - } else if(sp.incMatch("(?@")) { - char op = sp.c; - sp.inc(); - char cl = sp.c; - sp.inc(); - if(!sp.match(')')) - RegSyntaxError.endItAll( - "(?@ does not have closing paren"); - add(new Group(op,cl)); - } else if(sp.incMatch("(?#")) { - while(!sp.match(')')) - sp.inc(); - } else if(sp.dontMatch && sp.c == 'w') { - //Regex r = new Regex(); - //r._compile("[a-zA-Z0-9_]",mk); - //add(new Goop("\\w",r.thePattern)); - Bracket b = new Bracket(false); - b.addOr(new Range('a','z')); - b.addOr(new Range('A','Z')); - b.addOr(new Range('0','9')); - b.addOr(new oneChar('_')); - add(b); - } else if(sp.dontMatch && sp.c == 'G') { - add(new BackG()); - } else if(sp.dontMatch && sp.c == 's') { - //Regex r = new Regex(); - //r._compile("[ \t\n\r\b]",mk); - //add(new Goop("\\s",r.thePattern)); - Bracket b = new Bracket(false); - b.addOr(new oneChar((char)32)); - b.addOr(new Range((char)8,(char)10)); - b.addOr(new oneChar((char)13)); - add(b); - } else if(sp.dontMatch && sp.c == 'd') { - //Regex r = new Regex(); - //r._compile("[0-9]",mk); - //add(new 
Goop("\\d",r.thePattern)); - Range digit = new Range('0','9'); - digit.printBrackets = true; - add(digit); - } else if(sp.dontMatch && sp.c == 'W') { - //Regex r = new Regex(); - //r._compile("[^a-zA-Z0-9_]",mk); - //add(new Goop("\\W",r.thePattern)); - Bracket b = new Bracket(true); - b.addOr(new Range('a','z')); - b.addOr(new Range('A','Z')); - b.addOr(new Range('0','9')); - b.addOr(new oneChar('_')); - add(b); - } else if(sp.dontMatch && sp.c == 'S') { - //Regex r = new Regex(); - //r._compile("[^ \t\n\r\b]",mk); - //add(new Goop("\\S",r.thePattern)); - Bracket b = new Bracket(true); - b.addOr(new oneChar((char)32)); - b.addOr(new Range((char)8,(char)10)); - b.addOr(new oneChar((char)13)); - add(b); - } else if(sp.dontMatch && sp.c == 'D') { - //Regex r = new Regex(); - //r._compile("[^0-9]",mk); - //add(new Goop("\\D",r.thePattern)); - Bracket b = new Bracket(true); - b.addOr(new Range('0','9')); - add(b); - } else if(sp.dontMatch && sp.c == 'B') { - Regex r = new Regex(); - r._compile("(?!"+back_slash+"b)",mk); - add(r.thePattern); - } else if(isOctalString(sp)) { - int d = sp.c - '0'; - sp.inc(); - d = 8*d + sp.c - '0'; - StrPos sp2 = new StrPos(sp); - sp2.inc(); - if(isOctalDigit(sp2,false)) { - sp.inc(); - d = 8*d + sp.c - '0'; - } - add(new oneChar((char)d)); - } else if(sp.dontMatch && sp.c >= '1' && sp.c <= '9') { - int iv = sp.c-'0'; - StrPos s2 = new StrPos(sp); - s2.inc(); - if(!s2.dontMatch && s2.c >= '0' && s2.c <= '9') { - iv = 10*iv+(s2.c-'0'); - sp.inc(); - } - add(new BackMatch(iv)); - } else if(sp.dontMatch && sp.c == 'b') { - add(new Boundary()); - } else if(sp.match('\b')) { - add(new Boundary()); - } else if(sp.match('$')) { - add(new End(true)); - } else if(sp.dontMatch && sp.c == 'Z') { - add(new End(false)); - } else if(sp.match('.')) { - add(new Any()); - } else if(sp.incMatch("(??")) { - StringBuffer sb = new StringBuffer(); - StringBuffer sb2 = new StringBuffer(); - while(!sp.match(')') && !sp.match(':')) { - sb.append(sp.c); - 
sp.inc(); - } - if(sp.incMatch(":")) { - while(!sp.match(')')) { - sb2.append(sp.c); - sp.inc(); - } - } - String sbs = sb.toString(); - if(validators.get(sbs) instanceof String) { - String pat = (String)validators.get(sbs); - Regex r = newRegex(); - Rthings rth = new Rthings(this); - rth.noBackRefs = true; - r._compile(pat,rth); - add(r.thePattern); - } else { - Custom cm = new Custom(sb.toString()); - if(cm.v != null) { - Validator v2 = cm.v.arg(sb2.toString()); - if(v2 != null) { - v2.argsave = sb2.toString(); - String p = cm.v.pattern; - cm.v = v2; - v2.pattern = p; - } - Regex r = newRegex(); - Rthings rth = new Rthings(this); - rth.noBackRefs = true; - r._compile(cm.v.pattern,rth); - cm.sub = r.thePattern; - cm.sub.add(new CustomEndpoint(cm)); - cm.sub.setParent(cm); - add(cm); - } - } - } else if(sp.match('(')) { - mk.parenLevel++; - Regex r = newRegex(); - // r.or = new Or(); - sp.inc(); - if(sp.incMatch("?:")) { - r.or = new Or(); - } else if(sp.incMatch("?=")) { - r.or = new lookAhead(false); - } else if(sp.incMatch("?!")) { - r.or = new lookAhead(true); - } else if(sp.match('?')) { - sp.inc(); - do { - if(sp.c=='i')mk.ignoreCase = true; - if(sp.c=='Q')mk.dontMatchInQuotes = true; - if(sp.c=='o')mk.optimizeMe = true; - if(sp.c=='g')mk.gFlag = true; - if(sp.c=='s')mk.sFlag = true; - if(sp.c=='m')mk.mFlag = true; - sp.inc(); - } while(!sp.match(')') && !sp.eos); - r = null; - mk.parenLevel--; - if(sp.eos) //throw new RegSyntax - RegSyntaxError.endItAll("Unclosed ()"); - } else { // just ordinary parenthesis - r.or = mk.noBackRefs ? 
new Or() : new OrMark(mk.val++); - } - if(r != null) add(r._compile(sp,mk)); - } else if(sp.match('^')) { - add(new Start(true)); - } else if(sp.dontMatch && sp.c=='A') { - add(new Start(false)); - } else if(sp.match('*')) { - addMulti(new patInt(0),new patInf()); - } else if(sp.match('+')) { - addMulti(new patInt(1),new patInf()); - } else if(sp.match('?')) { - addMulti(new patInt(0),new patInt(1)); - } else if(sp.match('{')) { - boolean bad = false; - StrPos sp2 = new StrPos(sp); - //StringBuffer sb = new StringBuffer(); - sp.inc(); - patInt i1 = sp.getPatInt(); - patInt i2 = null; - if(sp.match('}')) { - i2 = i1; - } else { - if(!sp.match(','))/* - RegSyntaxError.endItAll( - "String \"{"+i2+ - "\" should be followed with , or }");*/ - bad = true; - sp.inc(); - if(sp.match('}')) - i2 = new patInf(); - else - i2 = sp.getPatInt(); - } - if(i1 == null || i2 == null) /* - throw new RegSyntax("Badly formatted Multi: " - +"{"+i1+","+i2+"}"); */ bad = true; - if(bad) { - sp.dup(sp2); - add(new oneChar(sp.c)); - } else - addMulti(i1,i2); - } else if(sp.escMatch('x') && next2Hex(sp)) { - sp.inc(); - int d = getHexDigit(sp); - sp.inc(); - d = 16*d + getHexDigit(sp); - add(new oneChar((char)d)); - } else if(sp.escMatch('c')) { - sp.inc(); - if(sp.c < Ctrl.cmap.length) - add(new oneChar(Ctrl.cmap[sp.c])); - else - add(new oneChar(sp.c)); - } else if(sp.escMatch('f')) { - add(new oneChar((char)12)); - } else if(sp.escMatch('a')) { - add(new oneChar((char)7)); - } else if(sp.escMatch('t')) { - add(new oneChar('\t')); - } else if(sp.escMatch('n')) { - add(new oneChar('\n')); - } else if(sp.escMatch('r')) { - add(new oneChar('\r')); - } else if(sp.escMatch('b')) { - add(new oneChar('\b')); - } else if(sp.escMatch('e')) { - add(new oneChar((char)27)); - } else { - add(new oneChar(sp.c)); - if(sp.match(')')) - RegSyntaxError.endItAll("Unmatched right paren in pattern"); - } - } - - // compiles all Pattern elements, internal method - private Pattern _compile(String pat,Rthings mk) 
throws RegSyntax { - minMatch = null; - sFlag = mFlag = ignoreCase = gFlag = false; - StrPos sp = new StrPos(pat,0); - thePattern = _compile(sp,mk); - pt.marks = null; - return thePattern; - } - - Pattern p = null; - Or or = null; - Pattern _compile(StrPos sp,Rthings mk) throws RegSyntax { - while(!(sp.eos || (or != null && sp.match(')')) )) { - compile1(sp,mk); - sp.inc(); - } - if(sp.match(')')) mk.parenLevel--; - else if(sp.eos && mk.parenLevel != 0) { - RegSyntaxError.endItAll("Unclosed Parenthesis! lvl="+mk.parenLevel); - } if(or != null) { - if(p == null) p = new NullPattern(); - or.addOr(p); - return or; - } - return p==null ? new NullPattern() : p; - } - - // add a multi object to the end of the chain - // which applies to the last object - void addMulti(patInt i1,patInt i2) throws RegSyntax { - Pattern last,last2; - for(last = p;last != null && last.next != null;last=last.next) - ; - if(last == null || last == p) - last2 = null; - else - for(last2 = p;last2.next != last;last2=last2.next) - ; - if(last instanceof Multi && i1.intValue()==0 && - i2.intValue()==1) - ((Multi)last).matchFewest = true; - else if(last instanceof FastMulti && i1.intValue()==0 && - i2.intValue()==1) - ((FastMulti)last).matchFewest = true; - else if(last instanceof DotMulti && i1.intValue()==0 && - i2.intValue()==1) - ((DotMulti)last).matchFewest = true; - else if(last instanceof Multi - || last instanceof DotMulti - || last instanceof FastMulti) - throw new RegSyntax("Syntax error."); - else if(last2 == null) - p = mkMulti(i1,i2,p); - else - last2.next = mkMulti(i1,i2,last); - } - final static Pattern mkMulti(patInt lo,patInt hi,Pattern p) throws RegSyntax { - if(p instanceof Any && p.next == null) - return (Pattern)new DotMulti(lo,hi); - return RegOpt.safe4fm(p) ? 
(Pattern)new FastMulti(lo,hi,p) : - (Pattern)new Multi(lo,hi,p); - } - // process the bracket operator - Pattern matchBracket(StrPos sp) throws RegSyntax { - Bracket ret; - if(sp.match('^')) { - ret = new Bracket(true); - sp.inc(); - } else - ret = new Bracket(false); - if(sp.match(']')) - //throw new RegSyntax - RegSyntaxError.endItAll("Unmatched []"); - - while(!sp.eos && !sp.match(']')) { - StrPos s1 = new StrPos(sp); - s1.inc(); - StrPos s1_ = new StrPos(s1); - s1_.inc(); - if(s1.match('-') && !s1_.match(']')) { - StrPos s2 = new StrPos(s1); - s2.inc(); - if(!s2.eos) - ret.addOr(new Range(sp.c,s2.c)); - sp.inc(); - sp.inc(); - } else if(sp.escMatch('Q')) { - sp.inc(); - while(!sp.escMatch('E')) { - ret.addOr(new oneChar(sp.c)); - sp.inc(); - } - } else if(sp.escMatch('d')) { - ret.addOr(new Range('0','9')); - } else if(sp.escMatch('s')) { - ret.addOr(new oneChar((char)32)); - ret.addOr(new Range((char)8,(char)10)); - ret.addOr(new oneChar((char)13)); - } else if(sp.escMatch('w')) { - ret.addOr(new Range('a','z')); - ret.addOr(new Range('A','Z')); - ret.addOr(new Range('0','9')); - ret.addOr(new oneChar('_')); - } else if(sp.escMatch('D')) { - ret.addOr(new Range((char)0,(char)47)); - ret.addOr(new Range((char)58,(char)65535)); - } else if(sp.escMatch('S')) { - ret.addOr(new Range((char)0,(char)7)); - ret.addOr(new Range((char)11,(char)12)); - ret.addOr(new Range((char)14,(char)31)); - ret.addOr(new Range((char)33,(char)65535)); - } else if(sp.escMatch('W')) { - ret.addOr(new Range((char)0,(char)64)); - ret.addOr(new Range((char)91,(char)94)); - ret.addOr(new oneChar((char)96)); - ret.addOr(new Range((char)123,(char)65535)); - } else if(sp.escMatch('x') && next2Hex(sp)) { - sp.inc(); - int d = getHexDigit(sp); - sp.inc(); - d = 16*d + getHexDigit(sp); - ret.addOr(new oneChar((char)d)); - } else if(sp.escMatch('a')) { - ret.addOr(new oneChar((char)7)); - } else if(sp.escMatch('f')) { - ret.addOr(new oneChar((char)12)); - } else if(sp.escMatch('e')) { - 
ret.addOr(new oneChar((char)27)); - } else if(sp.escMatch('n')) { - ret.addOr(new oneChar('\n')); - } else if(sp.escMatch('t')) { - ret.addOr(new oneChar('\t')); - } else if(sp.escMatch('r')) { - ret.addOr(new oneChar('\r')); - } else if(sp.escMatch('c')) { - sp.inc(); - if(sp.c < Ctrl.cmap.length) - ret.addOr(new oneChar(Ctrl.cmap[sp.c])); - else - ret.addOr(new oneChar(sp.c)); - } else if(isOctalString(sp)) { - int d = sp.c - '0'; - sp.inc(); - d = 8*d + sp.c - '0'; - StrPos sp2 = new StrPos(sp); - sp2.inc(); - if(isOctalDigit(sp2,false)) { - sp.inc(); - d = 8*d + sp.c - '0'; - } - ret.addOr(new oneChar((char)d)); - } else - ret.addOr(new oneChar(sp.c)); - sp.inc(); - } - return ret; - } - - /** Converts the stored Pattern to a String -- this is a - decompile. Note that \t and \n will really print out here, - Not just the two character representations. - Also be prepared to see some strange output if your characters - are not printable. */ - public String toString() { - if( false && thePattern == null ) - return ""; - else { - StringBuffer sb = new StringBuffer(); - if(esc != Pattern.ESC) { - sb.append("(?e="); - sb.append(esc); - sb.append(")"); - } - if(gFlag - ||mFlag - ||!dotDoesntMatchCR - ||sFlag - ||ignoreCase - ||dontMatchInQuotes - ||optimized()) { - sb.append("(?"); - if(ignoreCase)sb.append("i"); - if(mFlag)sb.append("m"); - if(sFlag||!dotDoesntMatchCR)sb.append("s"); - if(dontMatchInQuotes)sb.append("Q"); - if(optimized())sb.append("o"); - if(gFlag)sb.append("g"); - sb.append(")"); - } - String patstr = thePattern.toString(); - if(esc != Pattern.ESC) - patstr = reEscape(patstr,Pattern.ESC,esc); - sb.append(patstr); - return sb.toString(); - } - } - // Re-escape Pattern, allows us to use a different escape - // character. 
- static String reEscape(String s,char oldEsc,char newEsc) { - if(oldEsc == newEsc) return s; - int i; - StringBuffer sb = new StringBuffer(); - for(i=0;iThis method will attempt to rewrite - your pattern in a way that makes it faster (not all patterns - execute at the same speed). In general, "(?: ... )" will be - faster than "( ... )" so if you don't need the backreference, - you should group using the former pattern.

It will also - introduce new pattern elements that you can't get to otherwise, - for example if you have a large table of strings, i.e. the - months of the year "(January|February|...)" optimize() will make - a Hashtable that takes it to the next appropriate pattern - element -- eliminating the need for a linear search. - @see com.stevesoft.pat.Regex#optimized - @see com.stevesoft.pat.Regex#ignoreCase - @see com.stevesoft.pat.Regex#dontMatchInQuotes - @see com.stevesoft.pat.Regex#matchAt - @see com.stevesoft.pat.Regex#search - */ - public void optimize() { - if(optimized()||thePattern==null) return; - minMatch = new patInt(0);//thePattern.countMinChars(); - thePattern = RegOpt.opt(thePattern,ignoreCase, - dontMatchInQuotes); - skipper = Skip.findSkip(this); - //RegOpt.setParents(this); - return; - } - Skip skipper; - /** This function returns true if the optimize method has - been called. */ - public boolean optimized() { - return minMatch != null; - } - - /** A bit of syntactic surgar for those who want to make - their code look more perl-like. To use this initialize - your Regex object by saying: -

-        Regex r1 = Regex.perlCode("s/hello/goodbye/");
-        Regex r2 = Regex.perlCode("s'fish'frog'i");
-        Regex r3 = Regex.perlCode("m'hello'");
-        
- The i for ignoreCase is supported in - this syntax, as well as m, s, and x. The g flag - is a bit of a special case.

- If you wish to replace all occurrences of a pattern, you - do not put a 'g' in the perlCode, but call Regex's - replaceAll method.

- If you wish to simply - and only do a search for r2's pattern, you can do this - by calling the searchFrom method repeatedly, or - by calling search repeatedly if the g flag is set. -

- Note: Currently perlCode does not - support the (?e=#) syntax for - changing the escape character. - */ - - public static Regex perlCode(String s) { - // this file is big enough, see parsePerl.java - // for this function. - return parsePerl.parse(s); - } - static final char back_slash = '\\'; - - /** Checks to see if there are only literal and no special - pattern elements in this Regex. */ - public boolean isLiteral() { - Pattern x = thePattern; - while(x != null) { - if(x instanceof oneChar) - ; - else if(x instanceof Skipped) - ; - else - return false; - x = x.next; - } - return true; - } - - /** You only need to know about this if you are inventing - your own pattern elements. */ - public patInt countMinChars() { return thePattern.countMinChars(); } - /** You only need to know about this if you are inventing - your own pattern elements. */ - public patInt countMaxChars() { return thePattern.countMaxChars(); } - - boolean isHexDigit(StrPos sp) { - boolean r = - !sp.eos && !sp.dontMatch - && ((sp.c>='0'&&sp.c<='9') - ||(sp.c>='a'&&sp.c<='f') - ||(sp.c>='A'&&sp.c<='F')); - return r; - } - boolean isOctalDigit(StrPos sp,boolean first) { - boolean r = - !sp.eos && !(first^sp.dontMatch) - && sp.c>='0'&&sp.c<='7'; - return r; - } - int getHexDigit(StrPos sp) { - if(sp.c >= '0' && sp.c <= '9') - return sp.c - '0'; - if(sp.c >= 'a' && sp.c <= 'f') - return sp.c - 'a' + 10; - return sp.c - 'A' + 10; - } - boolean next2Hex(StrPos sp) { - StrPos sp2 = new StrPos(sp); - sp2.inc(); - if(!isHexDigit(sp2)) - return false; - sp2.inc(); - if(!isHexDigit(sp2)) - return false; - return true; - } - boolean isOctalString(StrPos sp) { - if(!isOctalDigit(sp,true)) - return false; - StrPos sp2 = new StrPos(sp); - sp2.inc(); - if(!isOctalDigit(sp2,false)) - return false; - return true; - } -} +// +// This software is now distributed according to +// the Lesser Gnu Public License. Please see +// http://www.gnu.org/copyleft/lesser.txt for +// the details. +// -- Happy Computing! 
+// +package com.stevesoft.pat; + +import jalview.util.MessageManager; + +import java.io.File; +import java.io.FilenameFilter; +import java.util.BitSet; +import java.util.Hashtable; + +import com.stevesoft.pat.wrap.StringWrap; + +/** Matches a Unicode punctuation character. */ +class UnicodePunct extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isPunct(s.charAt(from)) ? to : -1; + } +} + +/** Matches a Unicode white space character. */ +class UnicodeWhite extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isWhite(s.charAt(from)) ? to : -1; + } +} + +/** + * Matches a character that is not a Unicode punctuation character. + */ +class NUnicodePunct extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isPunct(s.charAt(from)) ? to : -1; + } +} + +/** + * Matches a character that is not a Unicode white space character. + */ +class NUnicodeWhite extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isWhite(s.charAt(from)) ? to : -1; + } +} + +/** Matches a Unicode word character: an alphanumeric or underscore. */ +class UnicodeW extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + if (from >= s.length()) + { + return -1; + } + char c = s.charAt(from); + return (Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to + : -1; + } +} + +/** Matches a character that is not a Unicode alphanumeric or underscore. */ +class NUnicodeW extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + if (from >= s.length()) + { + return -1; + } + char c = s.charAt(from); + return !(Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') + ? 
to + : -1; + } +} + +/** Matches a Unicode decimal digit. */ +class UnicodeDigit extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isDecimalDigit(s.charAt(from)) ? to + : -1; + } +} + +/** Matches a character that is not a Unicode digit. */ +class NUnicodeDigit extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isDecimalDigit(s.charAt(from)) ? to + : -1; + } +} + +/** Matches a Unicode math character. */ +class UnicodeMath extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isMath(s.charAt(from)) ? to : -1; + } +} + +/** Matches a non-math Unicode character. */ +class NUnicodeMath extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isMath(s.charAt(from)) ? to : -1; + } +} + +/** Matches a Unicode currency symbol. */ +class UnicodeCurrency extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isCurrency(s.charAt(from)) ? to : -1; + } +} + +/** Matches a non-currency symbol Unicode character. */ +class NUnicodeCurrency extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isCurrency(s.charAt(from)) ? to : -1; + } +} + +/** Matches a Unicode alphabetic character. */ +class UnicodeAlpha extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && Prop.isAlphabetic(s.charAt(from)) ? to : -1; + } +} + +/** Matches a non-alphabetic Unicode character. */ +class NUnicodeAlpha extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && !Prop.isAlphabetic(s.charAt(from)) ? 
to + : -1; + } +} + +/** Matches an upper case Unicode character. */ +class UnicodeUpper extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && isUpper(s.charAt(from)) ? to : -1; + } + + final boolean isUpper(char c) + { + return c == CaseMgr.toUpperCase(c) && c != CaseMgr.toLowerCase(c); + } +} + +/** Matches a lower case Unicode character. */ +class UnicodeLower extends UniValidator +{ + @Override + public int validate(StringLike s, int from, int to) + { + return from < s.length() && isLower(s.charAt(from)) ? to : -1; + } + + final boolean isLower(char c) + { + return c != CaseMgr.toUpperCase(c) && c == CaseMgr.toLowerCase(c); + } +} + +/** + * Regex provides the parser which constructs the linked list of Pattern classes + * from a String. + *

+ * For the purpose of this documentation, the fact that java interprets the + * backslash will be ignored. In practice, however, you will need a double + * backslash to obtain a string that contains a single backslash character. + * Thus, the example pattern "\b" should really be typed as "\\b" inside java + * code. + *

+ * Note that Regex is part of package "com.stevesoft.pat". To use it, simply + * import com.stevesoft.pat.Regex at the top of your file. + *

+ * Regex is made with a constructor that takes a String that defines the regular + * expression. Thus, for example + * + *

+ * Regex r = new Regex("[a-c]*");
+ * 
+ * + * matches any number of characters so long as they are 'a', 'b', or 'c'. + *

+ * To attempt to match the Pattern to a given string, you can use either the + * search(String) member function, or the matchAt(String,int position) member + * function. These functions return a boolean which tells you whether or not the + * match succeeded, and update the values returned by the methods "charsMatched()" and "matchedFrom()" in + * the Regex object appropriately. + *

+ * The portion of the string before the match can be obtained by the left() + * member, and the portion after the match can be obtained by the right() + * member. + *

+ * Essentially, this package implements a syntax that is very much like the perl + * 5 regular expression syntax. + * + * Longer example: + * + *

+ * Regex r = new Regex("x(a|b)y");
+ * r.matchAt("xay", 0);
+ * System.out.println("sub = " + r.stringMatched(1));
+ * 
+ * + * The above would print "sub = a". + * + *
+ *  r.left() // would return "x"
+ *  r.right() // would return "y"
+ * 
+ * + *

+ * Differences between this package and perl5:
+ * The extended Pattern for setting flags is now supported, but the flags are + * different. "(?i)" tells the pattern to ignore case, "(?Q)" sets the + * "dontMatchInQuotes" flag, and "(?iQ)" sets them both. You can change the + * escape character. The pattern + * + *

+ * (?e=#)#d+
+ * 
+ * + * is the same as + * + *
+ * \d+
+ * 
+ * + * , but note that the sequence + * + *
+ * (?e=#)
+ * 
+ * + * must occur at the very beginning of the pattern. There may be other + * small differences as well. I will either make my package conform or note them + * as I become aware of them. + *

+ * This package supports additional patterns not in perl5:

+ * + * + * + * + * + * + * + * + *
(?@()) -- Group -- This matches all characters between the '(' character and the balancing + * ')' character. Thus, it will match "()" as well as "(())". The balancing + * characters are arbitrary, thus (?@{}) matches on "{}" and "{{}}".
(?<1) -- Backup -- Moves the pointer backwards within the text. This allows you to make a + * "look behind." It fails if it attempts to move to a position before the + * beginning of the string. "x(?<1)" is equivalent to "(?=x)". The number, 1 + * in this example, is the number of characters to move backwards.
+ *
+ * + * + * @author Steven R. Brandt + * @version package com.stevesoft.pat, release 1.5.3 + * @see Pattern + */ +public class Regex extends RegRes implements FilenameFilter +{ + /** + * BackRefOffset gives the identity number of the first pattern. Version 1.0 + * used zero, version 1.1 uses 1 to be more compatible with perl. + */ + static int BackRefOffset = 1; + + private static Pattern none = new NoPattern(); + + Pattern thePattern = none; + + patInt minMatch = new patInt(0); + + static Hashtable validators = new Hashtable(); + static + { + define("p", "(?>1)", new UnicodePunct()); + define("P", "(?>1)", new NUnicodePunct()); + define("s", "(?>1)", new UnicodeWhite()); + define("S", "(?>1)", new NUnicodeWhite()); + define("w", "(?>1)", new UnicodeW()); + define("W", "(?>1)", new NUnicodeW()); + define("d", "(?>1)", new UnicodeDigit()); + define("D", "(?>1)", new NUnicodeDigit()); + define("m", "(?>1)", new UnicodeMath()); + define("M", "(?>1)", new NUnicodeMath()); + define("c", "(?>1)", new UnicodeCurrency()); + define("C", "(?>1)", new NUnicodeCurrency()); + define("a", "(?>1)", new UnicodeAlpha()); + define("A", "(?>1)", new NUnicodeAlpha()); + define("uc", "(?>1)", new UnicodeUpper()); + define("lc", "(?>1)", new UnicodeLower()); + } + + /** Set the dontMatch in quotes flag. */ + public void setDontMatchInQuotes(boolean b) + { + dontMatchInQuotes = b; + } + + /** Find out if the dontMatchInQuotes flag is enabled. */ + public boolean getDontMatchInQuotes() + { + return dontMatchInQuotes; + } + + boolean dontMatchInQuotes = false; + + /** + * Set the state of the ignoreCase flag. If set to true, then the pattern + * matcher will ignore case when searching for a match. + */ + public void setIgnoreCase(boolean b) + { + ignoreCase = b; + } + + /** + * Get the state of the ignoreCase flag. Returns true if we are ignoring the + * case of the pattern, false otherwise. 
+ */ + public boolean getIgnoreCase() + { + return ignoreCase; + } + + boolean ignoreCase = false; + + static boolean defaultMFlag = false; + + /** + * Set the default value of the m flag. If it is set to true, then the MFlag + * will be on for any regex search executed. + */ + public static void setDefaultMFlag(boolean mFlag) + { + defaultMFlag = mFlag; + } + + /** + * Get the default value of the m flag. If it is set to true, then the MFlag + * will be on for any regex search executed. + */ + public static boolean getDefaultMFlag() + { + return defaultMFlag; + } + + /** + * Initializes the object without a Pattern. To supply a Pattern use + * compile(String s). + * + * @see com.stevesoft.pat.Regex#compile(java.lang.String) + */ + public Regex() + { + } + + /** + * Create and compile a Regex, but do not throw any exceptions. If you wish to + * have exceptions thrown for syntax errors, you must use the Regex(void) + * constructor to create the Regex object, and then call the compile method. + * Therefore, you should only call this method when you know your pattern is + * right. I will probably become more like + * + * @see com.stevesoft.pat.Regex#search(java.lang.String) + * @see com.stevesoft.pat.Regex#compile(java.lang.String) + */ + public Regex(String s) + { + try + { + compile(s); + } catch (RegSyntax rs) + { + } + } + + ReplaceRule rep = null; + + /** + * Create and compile both a Regex and a ReplaceRule. + * + * @see com.stevesoft.pat.ReplaceRule + * @see com.stevesoft.pat.Regex#compile(java.lang.String) + */ + public Regex(String s, String rp) + { + this(s); + rep = ReplaceRule.perlCode(rp); + } + + /** + * Create and compile a Regex, but give it the ReplaceRule specified. This + * allows the user finer control of the Replacement process, if that is + * desired. 
+ * + * @see com.stevesoft.pat.ReplaceRule + * @see com.stevesoft.pat.Regex#compile(java.lang.String) + */ + public Regex(String s, ReplaceRule rp) + { + this(s); + rep = rp; + } + + /** + * Change the ReplaceRule of this Regex by compiling a new one using String + * rp. + */ + public void setReplaceRule(String rp) + { + rep = ReplaceRule.perlCode(rp); + repr = null; // Clear Replacer history + } + + /** Change the ReplaceRule of this Regex to rp. */ + public void setReplaceRule(ReplaceRule rp) + { + rep = rp; + } + + /** + * Test to see if a custom defined rule exists. + * + * @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator) + */ + public static boolean isDefined(String nm) + { + return validators.get(nm) != null; + } + + /** + * Removes a custom defined rule. + * + * @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator) + */ + public static void undefine(String nm) + { + validators.remove(nm); + } + + /** + * Defines a method to create a new rule. See test/deriv2.java and + * test/deriv3.java for examples of how to use it. + */ + public static void define(String nm, String pat, Validator v) + { + v.pattern = pat; + validators.put(nm, v); + } + + /** + * Defines a shorthand for a pattern. The pattern will be invoked by a string + * that has the form "(??"+nm+")". + */ + public static void define(String nm, String pat) + { + validators.put(nm, pat); + } + + /** Get the current ReplaceRule. */ + public ReplaceRule getReplaceRule() + { + return rep; + } + + Replacer repr = null; + + final Replacer _getReplacer() + { + return repr == null ? repr = new Replacer() : repr; + } + + public Replacer getReplacer() + { + if (repr == null) + { + repr = new Replacer(); + } + repr.rh.me = this; + repr.rh.prev = null; + return repr; + } + + /** + * Replace the first occurence of this pattern in String s according to the + * ReplaceRule. 
+ * + * @see com.stevesoft.pat.ReplaceRule + * @see com.stevesoft.pat.Regex#getReplaceRule() + */ + public String replaceFirst(String s) + { + return _getReplacer().replaceFirstRegion(s, this, 0, s.length()) + .toString(); + } + + /** + * Replace the first occurence of this pattern in String s beginning with + * position pos according to the ReplaceRule. + * + * @see com.stevesoft.pat.ReplaceRule + * @see com.stevesoft.pat.Regex#getReplaceRule() + */ + public String replaceFirstFrom(String s, int pos) + { + return _getReplacer().replaceFirstRegion(s, this, pos, s.length()) + .toString(); + } + + /** + * Replace the first occurence of this pattern in String s beginning with + * position start and ending with end according to the ReplaceRule. + * + * @see com.stevesoft.pat.ReplaceRule + * @see com.stevesoft.pat.Regex#getReplaceRule() + */ + public String replaceFirstRegion(String s, int start, int end) + { + return _getReplacer().replaceFirstRegion(s, this, start, end) + .toString(); + } + + /** + * Replace all occurences of this pattern in String s according to the + * ReplaceRule. + * + * @see com.stevesoft.pat.ReplaceRule + * @see com.stevesoft.pat.Regex#getReplaceRule() + */ + public String replaceAll(String s) + { + return _getReplacer().replaceAllRegion(s, this, 0, s.length()) + .toString(); + } + + public StringLike replaceAll(StringLike s) + { + return _getReplacer().replaceAllRegion(s, this, 0, s.length()); + } + + /** + * Replace all occurences of this pattern in String s beginning with position + * pos according to the ReplaceRule. + * + * @see com.stevesoft.pat.ReplaceRule + * @see com.stevesoft.pat.Regex#getReplaceRule() + */ + public String replaceAllFrom(String s, int pos) + { + return _getReplacer().replaceAllRegion(s, this, pos, s.length()) + .toString(); + } + + /** + * Replace all occurences of this pattern in String s beginning with position + * start and ending with end according to the ReplaceRule. 
+ * + * @see com.stevesoft.pat.ReplaceRule + * @see com.stevesoft.pat.Regex#getReplaceRule() + */ + public String replaceAllRegion(String s, int start, int end) + { + return _getReplacer().replaceAllRegion(s, this, start, end).toString(); + } + + /** Essentially clones the Regex object */ + public Regex(Regex r) + { + super(r); + dontMatchInQuotes = r.dontMatchInQuotes; + esc = r.esc; + ignoreCase = r.ignoreCase; + gFlag = r.gFlag; + if (r.rep == null) + { + rep = null; + } + else + { + rep = (ReplaceRule) r.rep.clone(); + } + /* + * try { compile(r.toString()); } catch(RegSyntax r_) {} + */ + thePattern = r.thePattern.clone(new Hashtable()); + minMatch = r.minMatch; + skipper = r.skipper; + } + + /** + * By default, the escape character is the backslash, but you can make it + * anything you want by setting this variable. + */ + public char esc = Pattern.ESC; + + /** + * This method compiles a regular expression, making it possible to call the + * search or matchAt methods. + * + * @exception com.stevesoft.pat.RegSyntax + * is thrown if a syntax error is encountered in the pattern. For + * example, "x{3,1}" or "*a" are not valid patterns. + * @see com.stevesoft.pat.Regex#search + * @see com.stevesoft.pat.Regex#matchAt + */ + public void compile(String prepat) throws RegSyntax + { + String postpat = parsePerl.codify(prepat, true); + String pat = postpat == null ? 
prepat : postpat; + minMatch = null; + ignoreCase = false; + dontMatchInQuotes = false; + Rthings mk = new Rthings(this); + int offset = mk.val; + String newpat = pat; + thePattern = none; + p = null; + or = null; + minMatch = new patInt(0); + StrPos sp = new StrPos(pat, 0); + if (sp.incMatch("(?e=")) + { + char newEsc = sp.c; + sp.inc(); + if (sp.match(')')) + { + newpat = reEscape(pat.substring(6), newEsc, Pattern.ESC); + } + } + else if (esc != Pattern.ESC) + { + newpat = reEscape(pat, esc, Pattern.ESC); + } + thePattern = _compile(newpat, mk); + numSubs_ = mk.val - offset; + mk.set(this); + } + + /* + * If a Regex is compared against a Regex, a check is done to see that the + * patterns are equal as well as the most recent match. If a Regex is compare + * with a RegRes, only the result of the most recent match is compared. + */ + @Override + public boolean equals(Object o) + { + if (o instanceof Regex) + { + if (toString().equals(o.toString())) + { + return super.equals(o); + } + else + { + return false; + } + } + else + { + return super.equals(o); + } + } + + /** A clone by any other name would smell as sweet. */ + @Override + public Object clone() + { + return new Regex(this); + } + + /** Return a clone of the underlying RegRes object. */ + public RegRes result() + { + return (RegRes) super.clone(); + } + + // prep sets global variables of class + // Pattern so that it can access them + // during an attempt at a match + Pthings pt = new Pthings(); + + final Pthings prep(StringLike s) + { + // if(gFlag) + pt.lastPos = matchedTo(); + if (pt.lastPos < 0) + { + pt.lastPos = 0; + } + if ((s == null ? null : s.unwrap()) != (src == null ? 
null + : s.unwrap())) + { + pt.lastPos = 0; + } + src = s; + pt.dotDoesntMatchCR = dotDoesntMatchCR && (!sFlag); + pt.mFlag = (mFlag | defaultMFlag); + pt.ignoreCase = ignoreCase; + pt.no_check = false; + if (pt.marks != null) + { + for (int i = 0; i < pt.marks.length; i++) + { + pt.marks[i] = -1; + } + } + pt.marks = null; + pt.nMarks = numSubs_; + pt.src = s; + if (dontMatchInQuotes) + { + setCbits(s, pt); + } + else + { + pt.cbits = null; + } + return pt; + } + + /** + * Attempt to match a Pattern beginning at a specified location within the + * string. + * + * @see com.stevesoft.pat.Regex#search + */ + public boolean matchAt(String s, int start_pos) + { + return _search(s, start_pos, start_pos); + } + + /** + * Attempt to match a Pattern beginning at a specified location within the + * StringLike. + * + * @see com.stevesoft.pat.Regex#search + */ + public boolean matchAt(StringLike s, int start_pos) + { + return _search(s, start_pos, start_pos); + } + + /** + * Search through a String for the first occurrence of a match. 
+ * + * @see com.stevesoft.pat.Regex#searchFrom + * @see com.stevesoft.pat.Regex#matchAt + */ + public boolean search(String s) + { + if (s == null) + { + throw new NullPointerException(MessageManager + .getString("exception.null_string_given_to_regex_search")); + } + return _search(s, 0, s.length()); + } + + public boolean search(StringLike sl) + { + if (sl == null) + { + throw new NullPointerException(MessageManager.getString( + "exception.null_string_like_given_to_regex_search")); + } + return _search(sl, 0, sl.length()); + } + + public boolean reverseSearch(String s) + { + if (s == null) + { + throw new NullPointerException(MessageManager.getString( + "exception.null_string_given_to_regex_reverse_search")); + } + return _reverseSearch(s, 0, s.length()); + } + + public boolean reverseSearch(StringLike sl) + { + if (sl == null) + { + throw new NullPointerException(MessageManager.getString( + "exception.null_string_like_given_to_regex_reverse_search")); + } + return _reverseSearch(sl, 0, sl.length()); + } + + /** + * Search through a String for the first occurence of a match, but start at + * position + * + *
+   * start
+   * 
+ */ + public boolean searchFrom(String s, int start) + { + if (s == null) + { + throw new NullPointerException(MessageManager.getString( + "exception.null_string_like_given_to_regex_search_from")); + } + return _search(s, start, s.length()); + } + + public boolean searchFrom(StringLike s, int start) + { + if (s == null) + { + throw new NullPointerException(MessageManager.getString( + "exception.null_string_like_given_to_regex_search_from")); + } + return _search(s, start, s.length()); + } + + /** + * Search through a region of a String for the first occurence of a match. + */ + public boolean searchRegion(String s, int start, int end) + { + if (s == null) + { + throw new NullPointerException(MessageManager.getString( + "exception.null_string_like_given_to_regex_search_region")); + } + return _search(s, start, end); + } + + /** + * Set this to change the default behavior of the "." pattern. By default it + * now matches perl's behavior and fails to match the '\n' character. + */ + public static boolean dotDoesntMatchCR = true; + + StringLike gFlags; + + int gFlagto = 0; + + boolean gFlag = false; + + /** Set the 'g' flag */ + public void setGFlag(boolean b) + { + gFlag = b; + } + + /** Get the state of the 'g' flag. */ + public boolean getGFlag() + { + return gFlag; + } + + boolean sFlag = false; + + /** Get the state of the sFlag */ + public boolean getSFlag() + { + return sFlag; + } + + boolean mFlag = false; + + /** Get the state of the sFlag */ + public boolean getMFlag() + { + return mFlag; + } + + final boolean _search(String s, int start, int end) + { + return _search(new StringWrap(s), start, end); + } + + final boolean _search(StringLike s, int start, int end) + { + if (gFlag && gFlagto > 0 && gFlags != null + && s.unwrap() == gFlags.unwrap()) + { + start = gFlagto; + } + gFlags = null; + + Pthings pt = prep(s); + + int up = (minMatch == null ? 
end : end - minMatch.i); + + if (up < start && end >= start) + { + up = start; + } + + if (skipper == null) + { + for (int i = start; i <= up; i++) + { + charsMatched_ = thePattern.matchAt(s, i, pt); + if (charsMatched_ >= 0) + { + matchFrom_ = thePattern.mfrom; + marks = pt.marks; + gFlagto = matchFrom_ + charsMatched_; + gFlags = s; + return didMatch_ = true; + } + } + } + else + { + pt.no_check = true; + for (int i = start; i <= up; i++) + { + i = skipper.find(src, i, up); + if (i < 0) + { + charsMatched_ = matchFrom_ = -1; + return didMatch_ = false; + } + charsMatched_ = thePattern.matchAt(s, i, pt); + if (charsMatched_ >= 0) + { + matchFrom_ = thePattern.mfrom; + marks = pt.marks; + gFlagto = matchFrom_ + charsMatched_; + gFlags = s; + return didMatch_ = true; + } + } + } + return didMatch_ = false; + } + + /* + * final boolean _search(LongStringLike s,long start,long end) { if(gFlag && + * gFlagto > 0 && s==gFlags) start = gFlagto; gFlags = null; + * + * Pthings pt=prep(s); + * + * int up = end;//(minMatch == null ? 
end : end-minMatch.i); + * + * if(up < start && end >= start) up = start; + * + * if(skipper == null) { for(long i=start;i<=up;i++) { charsMatched_ = + * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ = + * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_; + * return didMatch_=true; } } } else { pt.no_check = true; for(long + * i=start;i<=up;i++) { i = skipper.find(src,i,up); if(i<0) { charsMatched_ = + * matchFrom_ = -1; return didMatch_ = false; } charsMatched_ = + * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ = + * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_; + * gFlags = s; return didMatch_=true; } else { i = s.adjustIndex(i); up = + * s.adjustEnd(i); } } } return didMatch_=false; } + */ + + boolean _reverseSearch(String s, int start, int end) + { + return _reverseSearch(new StringWrap(s), start, end); + } + + boolean _reverseSearch(StringLike s, int start, int end) + { + if (gFlag && gFlagto > 0 && s.unwrap() == gFlags.unwrap()) + { + end = gFlagto; + } + gFlags = null; + Pthings pt = prep(s); + for (int i = end; i >= start; i--) + { + charsMatched_ = thePattern.matchAt(s, i, pt); + if (charsMatched_ >= 0) + { + matchFrom_ = thePattern.mfrom; + marks = pt.marks; + gFlagto = matchFrom_ - 1; + gFlags = s; + return didMatch_ = true; + } + } + return didMatch_ = false; + } + + // This routine sets the cbits variable + // of class Pattern. Cbits is true for + // the bit corresponding to a character inside + // a set of quotes. 
+ static StringLike lasts = null; + + static BitSet lastbs = null; + + static void setCbits(StringLike s, Pthings pt) + { + if (s == lasts) + { + pt.cbits = lastbs; + return; + } + BitSet bs = new BitSet(s.length()); + char qc = ' '; + boolean setBit = false; + for (int i = 0; i < s.length(); i++) + { + if (setBit) + { + bs.set(i); + } + char c = s.charAt(i); + if (!setBit && c == '"') + { + qc = c; + setBit = true; + bs.set(i); + } + else if (!setBit && c == '\'') + { + qc = c; + setBit = true; + bs.set(i); + } + else if (setBit && c == qc) + { + setBit = false; + } + else if (setBit && c == '\\' && i + 1 < s.length()) + { + i++; + if (setBit) + { + bs.set(i); + } + } + } + pt.cbits = lastbs = bs; + lasts = s; + } + + // Wanted user to over-ride this in alpha version, + // but it wasn't really necessary because of this trick: + Regex newRegex() + { + try + { + return getClass().getDeclaredConstructor().newInstance(); + } catch (InstantiationException ie) + { + return null; + } catch (IllegalAccessException iae) + { + return null; + } catch (ReflectiveOperationException roe) + { + return null; + } + } + + /** + * Only needed for creating your own extensions of Regex. This method adds the + * next Pattern in the chain of patterns or sets the Pattern if it is the + * first call. + */ + protected void add(Pattern p2) + { + if (p == null) + { + p = p2; + } + else + { + p.add(p2); + p2 = p; + } + } + + /** + * You only need to use this method if you are creating your own extentions to + * Regex. compile1 compiles one Pattern element, it can be over-ridden to + * allow the Regex compiler to understand new syntax. See deriv.java for an + * example. This routine is the heart of class Regex. Rthings has one integer + * member called intValue, it is used to keep track of the number of ()'s in + * the Pattern. + * + * @exception com.stevesoft.pat.RegSyntax + * is thrown when a nonsensensical pattern is supplied. For + * example, a pattern beginning with *. 
+ */ + protected void compile1(StrPos sp, Rthings mk) throws RegSyntax + { + if (sp.match('[')) + { + sp.inc(); + add(matchBracket(sp)); + } + else if (sp.match('|')) + { + if (or == null) + { + or = new Or(); + } + if (p == null) + { + p = new NullPattern(); + } + or.addOr(p); + p = null; + } + else if (sp.incMatch("(?<")) + { + patInt i = sp.getPatInt(); + if (i == null) + { + RegSyntaxError.endItAll("No int after (?<"); + } + add(new Backup(i.intValue())); + if (!sp.match(')')) + { + RegSyntaxError.endItAll("No ) after (?<"); + } + } + else if (sp.incMatch("(?>")) + { + patInt i = sp.getPatInt(); + if (i == null) + { + RegSyntaxError.endItAll("No int after (?>"); + } + add(new Backup(-i.intValue())); + if (!sp.match(')')) + { + RegSyntaxError.endItAll("No ) after (?<"); + } + } + else if (sp.incMatch("(?@")) + { + char op = sp.c; + sp.inc(); + char cl = sp.c; + sp.inc(); + if (!sp.match(')')) + { + RegSyntaxError.endItAll("(?@ does not have closing paren"); + } + add(new Group(op, cl)); + } + else if (sp.incMatch("(?#")) + { + while (!sp.match(')')) + { + sp.inc(); + } + } + else if (sp.dontMatch && sp.c == 'w') + { + // Regex r = new Regex(); + // r._compile("[a-zA-Z0-9_]",mk); + // add(new Goop("\\w",r.thePattern)); + Bracket b = new Bracket(false); + b.addOr(new Range('a', 'z')); + b.addOr(new Range('A', 'Z')); + b.addOr(new Range('0', '9')); + b.addOr(new oneChar('_')); + add(b); + } + else if (sp.dontMatch && sp.c == 'G') + { + add(new BackG()); + } + else if (sp.dontMatch && sp.c == 's') + { + // Regex r = new Regex(); + // r._compile("[ \t\n\r\b]",mk); + // add(new Goop("\\s",r.thePattern)); + Bracket b = new Bracket(false); + b.addOr(new oneChar((char) 32)); + b.addOr(new Range((char) 8, (char) 10)); + b.addOr(new oneChar((char) 13)); + add(b); + } + else if (sp.dontMatch && sp.c == 'd') + { + // Regex r = new Regex(); + // r._compile("[0-9]",mk); + // add(new Goop("\\d",r.thePattern)); + Range digit = new Range('0', '9'); + digit.printBrackets = true; + 
add(digit); + } + else if (sp.dontMatch && sp.c == 'W') + { + // Regex r = new Regex(); + // r._compile("[^a-zA-Z0-9_]",mk); + // add(new Goop("\\W",r.thePattern)); + Bracket b = new Bracket(true); + b.addOr(new Range('a', 'z')); + b.addOr(new Range('A', 'Z')); + b.addOr(new Range('0', '9')); + b.addOr(new oneChar('_')); + add(b); + } + else if (sp.dontMatch && sp.c == 'S') + { + // Regex r = new Regex(); + // r._compile("[^ \t\n\r\b]",mk); + // add(new Goop("\\S",r.thePattern)); + Bracket b = new Bracket(true); + b.addOr(new oneChar((char) 32)); + b.addOr(new Range((char) 8, (char) 10)); + b.addOr(new oneChar((char) 13)); + add(b); + } + else if (sp.dontMatch && sp.c == 'D') + { + // Regex r = new Regex(); + // r._compile("[^0-9]",mk); + // add(new Goop("\\D",r.thePattern)); + Bracket b = new Bracket(true); + b.addOr(new Range('0', '9')); + add(b); + } + else if (sp.dontMatch && sp.c == 'B') + { + Regex r = new Regex(); + r._compile("(?!" + back_slash + "b)", mk); + add(r.thePattern); + } + else if (isOctalString(sp)) + { + int d = sp.c - '0'; + sp.inc(); + d = 8 * d + sp.c - '0'; + StrPos sp2 = new StrPos(sp); + sp2.inc(); + if (isOctalDigit(sp2, false)) + { + sp.inc(); + d = 8 * d + sp.c - '0'; + } + add(new oneChar((char) d)); + } + else if (sp.dontMatch && sp.c >= '1' && sp.c <= '9') + { + int iv = sp.c - '0'; + StrPos s2 = new StrPos(sp); + s2.inc(); + if (!s2.dontMatch && s2.c >= '0' && s2.c <= '9') + { + iv = 10 * iv + (s2.c - '0'); + sp.inc(); + } + add(new BackMatch(iv)); + } + else if (sp.dontMatch && sp.c == 'b') + { + add(new Boundary()); + } + else if (sp.match('\b')) + { + add(new Boundary()); + } + else if (sp.match('$')) + { + add(new End(true)); + } + else if (sp.dontMatch && sp.c == 'Z') + { + add(new End(false)); + } + else if (sp.match('.')) + { + add(new Any()); + } + else if (sp.incMatch("(??")) + { + StringBuffer sb = new StringBuffer(); + StringBuffer sb2 = new StringBuffer(); + while (!sp.match(')') && !sp.match(':')) + { + 
sb.append(sp.c); + sp.inc(); + } + if (sp.incMatch(":")) + { + while (!sp.match(')')) + { + sb2.append(sp.c); + sp.inc(); + } + } + String sbs = sb.toString(); + if (validators.get(sbs) instanceof String) + { + String pat = (String) validators.get(sbs); + Regex r = newRegex(); + Rthings rth = new Rthings(this); + rth.noBackRefs = true; + r._compile(pat, rth); + add(r.thePattern); + } + else + { + Custom cm = new Custom(sb.toString()); + if (cm.v != null) + { + Validator v2 = cm.v.arg(sb2.toString()); + if (v2 != null) + { + v2.argsave = sb2.toString(); + String p = cm.v.pattern; + cm.v = v2; + v2.pattern = p; + } + Regex r = newRegex(); + Rthings rth = new Rthings(this); + rth.noBackRefs = true; + r._compile(cm.v.pattern, rth); + cm.sub = r.thePattern; + cm.sub.add(new CustomEndpoint(cm)); + cm.sub.setParent(cm); + add(cm); + } + } + } + else if (sp.match('(')) + { + mk.parenLevel++; + Regex r = newRegex(); + // r.or = new Or(); + sp.inc(); + if (sp.incMatch("?:")) + { + r.or = new Or(); + } + else if (sp.incMatch("?=")) + { + r.or = new lookAhead(false); + } + else if (sp.incMatch("?!")) + { + r.or = new lookAhead(true); + } + else if (sp.match('?')) + { + sp.inc(); + do + { + if (sp.c == 'i') + { + mk.ignoreCase = true; + } + if (sp.c == 'Q') + { + mk.dontMatchInQuotes = true; + } + if (sp.c == 'o') + { + mk.optimizeMe = true; + } + if (sp.c == 'g') + { + mk.gFlag = true; + } + if (sp.c == 's') + { + mk.sFlag = true; + } + if (sp.c == 'm') + { + mk.mFlag = true; + } + sp.inc(); + } while (!sp.match(')') && !sp.eos); + r = null; + mk.parenLevel--; + if (sp.eos) // throw new RegSyntax + { + RegSyntaxError.endItAll("Unclosed ()"); + } + } + else + { // just ordinary parenthesis + r.or = mk.noBackRefs ? 
new Or() : new OrMark(mk.val++); + } + if (r != null) + { + add(r._compile(sp, mk)); + } + } + else if (sp.match('^')) + { + add(new Start(true)); + } + else if (sp.dontMatch && sp.c == 'A') + { + add(new Start(false)); + } + else if (sp.match('*')) + { + addMulti(new patInt(0), new patInf()); + } + else if (sp.match('+')) + { + addMulti(new patInt(1), new patInf()); + } + else if (sp.match('?')) + { + addMulti(new patInt(0), new patInt(1)); + } + else if (sp.match('{')) + { + boolean bad = false; + StrPos sp2 = new StrPos(sp); + // StringBuffer sb = new StringBuffer(); + sp.inc(); + patInt i1 = sp.getPatInt(); + patInt i2 = null; + if (sp.match('}')) + { + i2 = i1; + } + else + { + if (!sp.match(',')) + { + /* + * RegSyntaxError.endItAll( "String \"{"+i2+ "\" should be followed + * with , or }"); + */ + bad = true; + } + sp.inc(); + if (sp.match('}')) + { + i2 = new patInf(); + } + else + { + i2 = sp.getPatInt(); + } + } + if (i1 == null || i2 == null) + { + /* + * throw new RegSyntax("Badly formatted Multi: " +"{"+i1+","+i2+"}"); + */ + bad = true; + } + if (bad) + { + sp.dup(sp2); + add(new oneChar(sp.c)); + } + else + { + addMulti(i1, i2); + } + } + else if (sp.escMatch('x') && next2Hex(sp)) + { + sp.inc(); + int d = getHexDigit(sp); + sp.inc(); + d = 16 * d + getHexDigit(sp); + add(new oneChar((char) d)); + } + else if (sp.escMatch('c')) + { + sp.inc(); + if (sp.c < Ctrl.cmap.length) + { + add(new oneChar(Ctrl.cmap[sp.c])); + } + else + { + add(new oneChar(sp.c)); + } + } + else if (sp.escMatch('f')) + { + add(new oneChar((char) 12)); + } + else if (sp.escMatch('a')) + { + add(new oneChar((char) 7)); + } + else if (sp.escMatch('t')) + { + add(new oneChar('\t')); + } + else if (sp.escMatch('n')) + { + add(new oneChar('\n')); + } + else if (sp.escMatch('r')) + { + add(new oneChar('\r')); + } + else if (sp.escMatch('b')) + { + add(new oneChar('\b')); + } + else if (sp.escMatch('e')) + { + add(new oneChar((char) 27)); + } + else + { + add(new oneChar(sp.c)); + if 
(sp.match(')')) + { + RegSyntaxError.endItAll("Unmatched right paren in pattern"); + } + } + } + + // compiles all Pattern elements, internal method + private Pattern _compile(String pat, Rthings mk) throws RegSyntax + { + minMatch = null; + sFlag = mFlag = ignoreCase = gFlag = false; + StrPos sp = new StrPos(pat, 0); + thePattern = _compile(sp, mk); + pt.marks = null; + return thePattern; + } + + Pattern p = null; + + Or or = null; + + Pattern _compile(StrPos sp, Rthings mk) throws RegSyntax + { + while (!(sp.eos || (or != null && sp.match(')')))) + { + compile1(sp, mk); + sp.inc(); + } + if (sp.match(')')) + { + mk.parenLevel--; + } + else if (sp.eos && mk.parenLevel != 0) + { + RegSyntaxError.endItAll("Unclosed Parenthesis! lvl=" + mk.parenLevel); + } + if (or != null) + { + if (p == null) + { + p = new NullPattern(); + } + or.addOr(p); + return or; + } + return p == null ? new NullPattern() : p; + } + + // add a multi object to the end of the chain + // which applies to the last object + void addMulti(patInt i1, patInt i2) throws RegSyntax + { + Pattern last, last2; + for (last = p; last != null && last.next != null; last = last.next) + { + ; + } + if (last == null || last == p) + { + last2 = null; + } + else + { + for (last2 = p; last2.next != last; last2 = last2.next) + { + ; + } + } + if (last instanceof Multi && i1.intValue() == 0 && i2.intValue() == 1) + { + ((Multi) last).matchFewest = true; + } + else if (last instanceof FastMulti && i1.intValue() == 0 + && i2.intValue() == 1) + { + ((FastMulti) last).matchFewest = true; + } + else if (last instanceof DotMulti && i1.intValue() == 0 + && i2.intValue() == 1) + { + ((DotMulti) last).matchFewest = true; + } + else if (last instanceof Multi || last instanceof DotMulti + || last instanceof FastMulti) + { + throw new RegSyntax("Syntax error."); + } + else if (last2 == null) + { + p = mkMulti(i1, i2, p); + } + else + { + last2.next = mkMulti(i1, i2, last); + } + } + + final static Pattern mkMulti(patInt lo, 
patInt hi, Pattern p) + throws RegSyntax + { + if (p instanceof Any && p.next == null) + { + return new DotMulti(lo, hi); + } + return RegOpt.safe4fm(p) ? (Pattern) new FastMulti(lo, hi, p) + : (Pattern) new Multi(lo, hi, p); + } + + // process the bracket operator + Pattern matchBracket(StrPos sp) throws RegSyntax + { + Bracket ret; + if (sp.match('^')) + { + ret = new Bracket(true); + sp.inc(); + } + else + { + ret = new Bracket(false); + } + if (sp.match(']')) + { + // throw new RegSyntax + RegSyntaxError.endItAll("Unmatched []"); + } + + while (!sp.eos && !sp.match(']')) + { + StrPos s1 = new StrPos(sp); + s1.inc(); + StrPos s1_ = new StrPos(s1); + s1_.inc(); + if (s1.match('-') && !s1_.match(']')) + { + StrPos s2 = new StrPos(s1); + s2.inc(); + if (!s2.eos) + { + ret.addOr(new Range(sp.c, s2.c)); + } + sp.inc(); + sp.inc(); + } + else if (sp.escMatch('Q')) + { + sp.inc(); + while (!sp.escMatch('E')) + { + ret.addOr(new oneChar(sp.c)); + sp.inc(); + } + } + else if (sp.escMatch('d')) + { + ret.addOr(new Range('0', '9')); + } + else if (sp.escMatch('s')) + { + ret.addOr(new oneChar((char) 32)); + ret.addOr(new Range((char) 8, (char) 10)); + ret.addOr(new oneChar((char) 13)); + } + else if (sp.escMatch('w')) + { + ret.addOr(new Range('a', 'z')); + ret.addOr(new Range('A', 'Z')); + ret.addOr(new Range('0', '9')); + ret.addOr(new oneChar('_')); + } + else if (sp.escMatch('D')) + { + ret.addOr(new Range((char) 0, (char) 47)); + ret.addOr(new Range((char) 58, (char) 65535)); + } + else if (sp.escMatch('S')) + { + ret.addOr(new Range((char) 0, (char) 7)); + ret.addOr(new Range((char) 11, (char) 12)); + ret.addOr(new Range((char) 14, (char) 31)); + ret.addOr(new Range((char) 33, (char) 65535)); + } + else if (sp.escMatch('W')) + { + ret.addOr(new Range((char) 0, (char) 64)); + ret.addOr(new Range((char) 91, (char) 94)); + ret.addOr(new oneChar((char) 96)); + ret.addOr(new Range((char) 123, (char) 65535)); + } + else if (sp.escMatch('x') && next2Hex(sp)) + { + sp.inc(); 
+ int d = getHexDigit(sp); + sp.inc(); + d = 16 * d + getHexDigit(sp); + ret.addOr(new oneChar((char) d)); + } + else if (sp.escMatch('a')) + { + ret.addOr(new oneChar((char) 7)); + } + else if (sp.escMatch('f')) + { + ret.addOr(new oneChar((char) 12)); + } + else if (sp.escMatch('e')) + { + ret.addOr(new oneChar((char) 27)); + } + else if (sp.escMatch('n')) + { + ret.addOr(new oneChar('\n')); + } + else if (sp.escMatch('t')) + { + ret.addOr(new oneChar('\t')); + } + else if (sp.escMatch('r')) + { + ret.addOr(new oneChar('\r')); + } + else if (sp.escMatch('c')) + { + sp.inc(); + if (sp.c < Ctrl.cmap.length) + { + ret.addOr(new oneChar(Ctrl.cmap[sp.c])); + } + else + { + ret.addOr(new oneChar(sp.c)); + } + } + else if (isOctalString(sp)) + { + int d = sp.c - '0'; + sp.inc(); + d = 8 * d + sp.c - '0'; + StrPos sp2 = new StrPos(sp); + sp2.inc(); + if (isOctalDigit(sp2, false)) + { + sp.inc(); + d = 8 * d + sp.c - '0'; + } + ret.addOr(new oneChar((char) d)); + } + else + { + ret.addOr(new oneChar(sp.c)); + } + sp.inc(); + } + return ret; + } + + /** + * Converts the stored Pattern to a String -- this is a decompile. Note that + * \t and \n will really print out here, Not just the two character + * representations. Also be prepared to see some strange output if your + * characters are not printable. 
+ */ + @Override + public String toString() + { + if (false && thePattern == null) + { + return ""; + } + else + { + StringBuffer sb = new StringBuffer(); + if (esc != Pattern.ESC) + { + sb.append("(?e="); + sb.append(esc); + sb.append(")"); + } + if (gFlag || mFlag || !dotDoesntMatchCR || sFlag || ignoreCase + || dontMatchInQuotes || optimized()) + { + sb.append("(?"); + if (ignoreCase) + { + sb.append("i"); + } + if (mFlag) + { + sb.append("m"); + } + if (sFlag || !dotDoesntMatchCR) + { + sb.append("s"); + } + if (dontMatchInQuotes) + { + sb.append("Q"); + } + if (optimized()) + { + sb.append("o"); + } + if (gFlag) + { + sb.append("g"); + } + sb.append(")"); + } + String patstr = thePattern.toString(); + if (esc != Pattern.ESC) + { + patstr = reEscape(patstr, Pattern.ESC, esc); + } + sb.append(patstr); + return sb.toString(); + } + } + + // Re-escape Pattern, allows us to use a different escape + // character. + static String reEscape(String s, char oldEsc, char newEsc) + { + if (oldEsc == newEsc) + { + return s; + } + int i; + StringBuffer sb = new StringBuffer(); + for (i = 0; i < s.length(); i++) + { + if (s.charAt(i) == oldEsc && i + 1 < s.length()) + { + if (s.charAt(i + 1) == oldEsc) + { + sb.append(oldEsc); + } + else + { + sb.append(newEsc); + sb.append(s.charAt(i + 1)); + } + i++; + } + else if (s.charAt(i) == newEsc) + { + sb.append(newEsc); + sb.append(newEsc); + } + else + { + sb.append(s.charAt(i)); + } + } + return sb.toString(); + } + + /** + * This method implements FilenameFilter, allowing one to use a Regex to + * search through a directory using File.list. There is a FileRegex now that + * does this better. 
+ * + * @see com.stevesoft.pat.FileRegex + */ + @Override + public boolean accept(File dir, String s) + { + return search(s); + } + + /** The version of this package */ + final static public String version() + { + return "lgpl release 1.5.3"; + } + + /** + * Once this method is called, the state of variables ignoreCase and + * dontMatchInQuotes should not be changed as the results will be + * unpredictable. However, search and matchAt will run more quickly. Note that + * you can check to see if the pattern has been optimized by calling the + * optimized() method. + *

+ * This method will attempt to rewrite your pattern in a way that makes it + * faster (not all patterns execute at the same speed). In general, "(?: ... + * )" will be faster than "( ... )" so if you don't need the backreference, + * you should group using the former pattern. + *

+ * It will also introduce new pattern elements that you can't get to + * otherwise, for example if you have a large table of strings, i.e. the + * months of the year "(January|February|...)" optimize() will make a + * Hashtable that takes it to the next appropriate pattern element -- + * eliminating the need for a linear search. + * + * @see com.stevesoft.pat.Regex#optimized + * @see com.stevesoft.pat.Regex#ignoreCase + * @see com.stevesoft.pat.Regex#dontMatchInQuotes + * @see com.stevesoft.pat.Regex#matchAt + * @see com.stevesoft.pat.Regex#search + */ + public void optimize() + { + if (optimized() || thePattern == null) + { + return; + } + minMatch = new patInt(0); // thePattern.countMinChars(); + thePattern = RegOpt.opt(thePattern, ignoreCase, dontMatchInQuotes); + skipper = Skip.findSkip(this); + // RegOpt.setParents(this); + return; + } + + Skip skipper; + + /** + * This function returns true if the optimize method has been called. + */ + public boolean optimized() + { + return minMatch != null; + } + + /** + * A bit of syntactic surgar for those who want to make their code look more + * perl-like. To use this initialize your Regex object by saying: + * + *

+   *       Regex r1 = Regex.perlCode("s/hello/goodbye/");
+   *       Regex r2 = Regex.perlCode("s'fish'frog'i");
+   *       Regex r3 = Regex.perlCode("m'hello'");
+   * 
+ * + * The i for ignoreCase is supported in this syntax, as well as m, s, and x. + * The g flag is a bit of a special case. + *

+ * If you wish to replace all occurrences of a pattern, you do not put a 'g' in + * the perlCode, but call Regex's replaceAll method. + *

+ * If you wish to simply and only do a search for r2's pattern, you can do + * this by calling the searchFrom method repeatedly, or by calling + * search repeatedly if the g flag is set. + *

* Note: Currently perlCode does not support the (?e=#) syntax for
   * changing the escape character.
   */
  public static Regex perlCode(String s) {
    // The parser lives in parsePerl to keep this file smaller.
    return parsePerl.parse(s);
  }

  static final char back_slash = '\\';

  /**
   * Reports whether this Regex consists solely of literal elements, i.e.
   * every element of the pattern chain is a oneChar or a Skipped.
   */
  public boolean isLiteral() {
    for (Pattern x = thePattern; x != null; x = x.next) {
      boolean literal = x instanceof oneChar || x instanceof Skipped;
      if (!literal) {
        return false;
      }
    }
    return true;
  }

  /**
   * Delegates to the compiled pattern. You only need to know about this if
   * you are inventing your own pattern elements.
   */
  public patInt countMinChars() {
    return thePattern.countMinChars();
  }

  /**
   * Delegates to the compiled pattern. You only need to know about this if
   * you are inventing your own pattern elements.
   */
  public patInt countMaxChars() {
    return thePattern.countMaxChars();
  }

  // True when sp is on an unescaped hexadecimal digit (0-9, a-f, A-F).
  boolean isHexDigit(StrPos sp) {
    if (sp.eos || sp.dontMatch) {
      return false;
    }
    if (sp.c >= '0' && sp.c <= '9') {
      return true;
    }
    if (sp.c >= 'a' && sp.c <= 'f') {
      return true;
    }
    return sp.c >= 'A' && sp.c <= 'F';
  }

  // True when sp is on a digit 0-7 whose escaped state (sp.dontMatch)
  // equals 'first'.
  boolean isOctalDigit(StrPos sp, boolean first) {
    if (sp.eos) {
      return false;
    }
    if (first != sp.dontMatch) {
      return false;
    }
    return sp.c >= '0' && sp.c <= '7';
  }

  // Numeric value of the hex digit at sp; anything that is not 0-9 or a-f
  // is treated as an upper-case digit.
  int getHexDigit(StrPos sp) {
    if (sp.c >= '0' && sp.c <= '9') {
      return sp.c - '0';
    } else if (sp.c >= 'a' && sp.c <= 'f') {
      return sp.c - 'a' + 10;
    } else {
      return sp.c - 'A' + 10;
    }
  }

  // True when the two positions following sp both hold hex digits. Works on
  // a copy, so sp itself is not advanced.
  boolean next2Hex(StrPos sp) {
    StrPos probe = new StrPos(sp);
    probe.inc();
    if (!isHexDigit(probe)) {
      return false;
    }
    probe.inc();
    return isHexDigit(probe);
  }

  // True when sp starts an octal escape: a digit accepted by
  // isOctalDigit(sp, true) followed by at least one more octal digit.
  boolean isOctalString(StrPos sp) {
    if (!isOctalDigit(sp, true)) {
      return false;
    }
    StrPos following = new StrPos(sp);
    following.inc();
    return isOctalDigit(following, false);
  }
}