2 // This software is now distributed according to
3 // the Lesser Gnu Public License. Please see
4 // http://www.gnu.org/copyleft/lesser.txt for
8 package com.stevesoft.pat;
13 import com.stevesoft.pat.wrap.*;
15 /** Matches a Unicode punctuation character. */
16 class UnicodePunct extends UniValidator
18 public int validate(StringLike s, int from, int to)
20 return from < s.length() && Prop.isPunct(s.charAt(from)) ? to : -1;
24 /** Matches a Unicode white space character. */
25 class UnicodeWhite extends UniValidator
27 public int validate(StringLike s, int from, int to)
29 return from < s.length() && Prop.isWhite(s.charAt(from)) ? to : -1;
34 * Matches a character that is not a Unicode punctuation character.
36 class NUnicodePunct extends UniValidator
38 public int validate(StringLike s, int from, int to)
40 return from < s.length() && !Prop.isPunct(s.charAt(from)) ? to : -1;
45 * Matches a character that is not a Unicode white space character.
47 class NUnicodeWhite extends UniValidator
49 public int validate(StringLike s, int from, int to)
51 return from < s.length() && !Prop.isWhite(s.charAt(from)) ? to : -1;
55 /** Matches a Unicode word character: an alphanumeric or underscore. */
56 class UnicodeW extends UniValidator
58 public int validate(StringLike s, int from, int to)
60 if (from >= s.length())
64 char c = s.charAt(from);
65 return (Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to
70 /** Matches a character that is not a Unicode alphanumeric or underscore. */
71 class NUnicodeW extends UniValidator
73 public int validate(StringLike s, int from, int to)
75 if (from >= s.length())
79 char c = s.charAt(from);
80 return !(Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to
85 /** Matches a Unicode decimal digit. */
86 class UnicodeDigit extends UniValidator
88 public int validate(StringLike s, int from, int to)
90 return from < s.length() && Prop.isDecimalDigit(s.charAt(from)) ? to
95 /** Matches a character that is not a Unicode digit. */
96 class NUnicodeDigit extends UniValidator
98 public int validate(StringLike s, int from, int to)
100 return from < s.length() && !Prop.isDecimalDigit(s.charAt(from)) ? to
105 /** Matches a Unicode math character. */
106 class UnicodeMath extends UniValidator
108 public int validate(StringLike s, int from, int to)
110 return from < s.length() && Prop.isMath(s.charAt(from)) ? to : -1;
114 /** Matches a non-math Unicode character. */
115 class NUnicodeMath extends UniValidator
117 public int validate(StringLike s, int from, int to)
119 return from < s.length() && !Prop.isMath(s.charAt(from)) ? to : -1;
123 /** Matches a Unicode currency symbol. */
124 class UnicodeCurrency extends UniValidator
126 public int validate(StringLike s, int from, int to)
128 return from < s.length() && Prop.isCurrency(s.charAt(from)) ? to : -1;
132 /** Matches a non-currency symbol Unicode character. */
133 class NUnicodeCurrency extends UniValidator
135 public int validate(StringLike s, int from, int to)
137 return from < s.length() && !Prop.isCurrency(s.charAt(from)) ? to : -1;
141 /** Matches a Unicode alphabetic character. */
142 class UnicodeAlpha extends UniValidator
144 public int validate(StringLike s, int from, int to)
146 return from < s.length() && Prop.isAlphabetic(s.charAt(from)) ? to : -1;
150 /** Matches a non-alphabetic Unicode character. */
151 class NUnicodeAlpha extends UniValidator
153 public int validate(StringLike s, int from, int to)
155 return from < s.length() && !Prop.isAlphabetic(s.charAt(from)) ? to
160 /** Matches an upper case Unicode character. */
161 class UnicodeUpper extends UniValidator
163 public int validate(StringLike s, int from, int to)
165 return from < s.length() && isUpper(s.charAt(from)) ? to : -1;
168 final boolean isUpper(char c)
170 return c == CaseMgr.toUpperCase(c) && c != CaseMgr.toLowerCase(c);
174 /** Matches an upper case Unicode character. */
175 class UnicodeLower extends UniValidator
177 public int validate(StringLike s, int from, int to)
179 return from < s.length() && isLower(s.charAt(from)) ? to : -1;
182 final boolean isLower(char c)
184 return c != CaseMgr.toUpperCase(c) && c == CaseMgr.toLowerCase(c);
189 * Regex provides the parser which constructs the linked list of Pattern classes
192 * For the purpose of this documentation, the fact that java interprets the
193 * backslash will be ignored. In practice, however, you will need a double
194 * backslash to obtain a string that contains a single backslash character.
195 * Thus, the example pattern "\b" should really be typed as "\\b" inside java
198 * Note that Regex is part of package "com.stevesoft.pat". To use it, simply
199 * import com.stevesoft.pat.Regex at the top of your file.
201 * Regex is made with a constructor that takes a String that defines the regular
202 * expression. Thus, for example
205 * Regex r = new Regex("[a-c]*");
208 * matches any number of characters so long as they are 'a', 'b', or 'c').
210 * To attempt to match the Pattern to a given string, you can use either the
211 * search(String) member function, or the matchAt(String,int position) member
212 * function. These functions return a boolean which tells you whether or not the
213 * thing worked, and sets the methods "charsMatched()" and "matchedFrom()" in
214 * the Regex object appropriately.
216 * The portion of the string before the match can be obtained by the left()
217 * member, and the portion after the match can be obtained by the right()
220 * Essentially, this package implements a syntax that is very much like the perl
221 * 5 regular expression syntax.
226 * Regex r = new Regex("x(a|b)y");
227 * r.matchAt("xay", 0);
228 * System.out.println("sub = " + r.stringMatched(1));
231 * The above would print "sub = a".
234 * r.left() // would return "x"
235 * r.right() // would return "y"
239 * Differences between this package and perl5:<br>
240 * The extended Pattern for setting flags is now supported, but the flags are
241 * different. "(?i)" tells the pattern to ignore case, "(?Q)" sets the
242 * "dontMatchInQuotes" flag, and "(?iQ)" sets them both. You can change the
243 * escape character. The pattern
253 * </pre>, but note that the sequence
259 * <b>must</b> occur at the very beginning of the pattern. There may be other
260 * small differences as well. I will either make my package conform or note them
261 * as I become aware of them.
263 * This package supports additional patterns not in perl5: <center> <table
268 * <td>This matches all characters between the '(' character and the balancing
269 * ')' character. Thus, it will match "()" as well as "(())". The balancing
270 * characters are arbitrary, thus (?@{}) matches on "{}" and "{{}}".</td>
274 * <td>Moves the pointer backwards within the text. This allows you to make a
275 * "look behind." It fails if it attempts to move to a position before the
276 * beginning of the string. "x(?<1)" is equivalent to "(?=x)". The number, 1
277 * in this example, is the number of characters to move backwards.</td>
281 * @author Steven R. Brandt
282 * @version package com.stevesoft.pat, release 1.5.3
285 public class Regex extends RegRes implements FilenameFilter
288 * BackRefOffset gives the identity number of the first pattern. Version 1.0
289 * used zero, version 1.1 uses 1 to be more compatible with perl.
291 static int BackRefOffset = 1;
293 private static Pattern none = new NoPattern();
295 Pattern thePattern = none;
297 patInt minMatch = new patInt(0);
299 static Hashtable validators = new Hashtable();
302 define("p", "(?>1)", new UnicodePunct());
303 define("P", "(?>1)", new NUnicodePunct());
304 define("s", "(?>1)", new UnicodeWhite());
305 define("S", "(?>1)", new NUnicodeWhite());
306 define("w", "(?>1)", new UnicodeW());
307 define("W", "(?>1)", new NUnicodeW());
308 define("d", "(?>1)", new UnicodeDigit());
309 define("D", "(?>1)", new NUnicodeDigit());
310 define("m", "(?>1)", new UnicodeMath());
311 define("M", "(?>1)", new NUnicodeMath());
312 define("c", "(?>1)", new UnicodeCurrency());
313 define("C", "(?>1)", new NUnicodeCurrency());
314 define("a", "(?>1)", new UnicodeAlpha());
315 define("A", "(?>1)", new NUnicodeAlpha());
316 define("uc", "(?>1)", new UnicodeUpper());
317 define("lc", "(?>1)", new UnicodeLower());
320 /** Set the dontMatch in quotes flag. */
321 public void setDontMatchInQuotes(boolean b)
323 dontMatchInQuotes = b;
326 /** Find out if the dontMatchInQuotes flag is enabled. */
327 public boolean getDontMatchInQuotes()
329 return dontMatchInQuotes;
332 boolean dontMatchInQuotes = false;
335 * Set the state of the ignoreCase flag. If set to true, then the pattern
336 * matcher will ignore case when searching for a match.
338 public void setIgnoreCase(boolean b)
344 * Get the state of the ignoreCase flag. Returns true if we are ignoring the
345 * case of the pattern, false otherwise.
347 public boolean getIgnoreCase()
352 boolean ignoreCase = false;
354 static boolean defaultMFlag = false;
357 * Set the default value of the m flag. If it is set to true, then the MFlag
358 * will be on for any regex search executed.
360 public static void setDefaultMFlag(boolean mFlag)
362 defaultMFlag = mFlag;
366 * Get the default value of the m flag. If it is set to true, then the MFlag
367 * will be on for any regex search executed.
369 public static boolean getDefaultMFlag()
375 * Initializes the object without a Pattern. To supply a Pattern use
378 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
385 * Create and compile a Regex, but do not throw any exceptions. If you wish to
386 * have exceptions thrown for syntax errors, you must use the Regex(void)
387 * constructor to create the Regex object, and then call the compile method.
388 * Therefore, you should only call this method when you know your pattern is
389 * right. I will probably become more like
391 * @see com.stevesoft.pat.Regex#search(java.lang.String)
392 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
394 public Regex(String s)
399 } catch (RegSyntax rs)
404 ReplaceRule rep = null;
407 * Create and compile both a Regex and a ReplaceRule.
409 * @see com.stevesoft.pat.ReplaceRule
410 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
412 public Regex(String s, String rp)
415 rep = ReplaceRule.perlCode(rp);
419 * Create and compile a Regex, but give it the ReplaceRule specified. This
420 * allows the user finer control of the Replacement process, if that is
423 * @see com.stevesoft.pat.ReplaceRule
424 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
426 public Regex(String s, ReplaceRule rp)
433 * Change the ReplaceRule of this Regex by compiling a new one using String
436 public void setReplaceRule(String rp)
438 rep = ReplaceRule.perlCode(rp);
439 repr = null; // Clear Replacer history
442 /** Change the ReplaceRule of this Regex to rp. */
443 public void setReplaceRule(ReplaceRule rp)
449 * Test to see if a custom defined rule exists.
451 * @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator)
453 public static boolean isDefined(String nm)
455 return validators.get(nm) != null;
459 * Removes a custom defined rule.
461 * @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator)
463 public static void undefine(String nm)
465 validators.remove(nm);
469 * Defines a method to create a new rule. See test/deriv2.java and
470 * test/deriv3.java for examples of how to use it.
472 public static void define(String nm, String pat, Validator v)
475 validators.put(nm, v);
479 * Defines a shorthand for a pattern. The pattern will be invoked by a string
480 * that has the form "(??"+nm+")".
482 public static void define(String nm, String pat)
484 validators.put(nm, pat);
487 /** Get the current ReplaceRule. */
488 public ReplaceRule getReplaceRule()
493 Replacer repr = null;
495 final Replacer _getReplacer()
497 return repr == null ? repr = new Replacer() : repr;
500 public Replacer getReplacer()
504 repr = new Replacer();
512 * Replace the first occurence of this pattern in String s according to the
515 * @see com.stevesoft.pat.ReplaceRule
516 * @see com.stevesoft.pat.Regex#getReplaceRule()
518 public String replaceFirst(String s)
520 return _getReplacer().replaceFirstRegion(s, this, 0, s.length())
525 * Replace the first occurence of this pattern in String s beginning with
526 * position pos according to the ReplaceRule.
528 * @see com.stevesoft.pat.ReplaceRule
529 * @see com.stevesoft.pat.Regex#getReplaceRule()
531 public String replaceFirstFrom(String s, int pos)
533 return _getReplacer().replaceFirstRegion(s, this, pos, s.length())
538 * Replace the first occurence of this pattern in String s beginning with
539 * position start and ending with end according to the ReplaceRule.
541 * @see com.stevesoft.pat.ReplaceRule
542 * @see com.stevesoft.pat.Regex#getReplaceRule()
544 public String replaceFirstRegion(String s, int start, int end)
546 return _getReplacer().replaceFirstRegion(s, this, start, end)
551 * Replace all occurences of this pattern in String s according to the
554 * @see com.stevesoft.pat.ReplaceRule
555 * @see com.stevesoft.pat.Regex#getReplaceRule()
557 public String replaceAll(String s)
559 return _getReplacer().replaceAllRegion(s, this, 0, s.length())
563 public StringLike replaceAll(StringLike s)
565 return _getReplacer().replaceAllRegion(s, this, 0, s.length());
569 * Replace all occurences of this pattern in String s beginning with position
570 * pos according to the ReplaceRule.
572 * @see com.stevesoft.pat.ReplaceRule
573 * @see com.stevesoft.pat.Regex#getReplaceRule()
575 public String replaceAllFrom(String s, int pos)
577 return _getReplacer().replaceAllRegion(s, this, pos, s.length())
582 * Replace all occurences of this pattern in String s beginning with position
583 * start and ending with end according to the ReplaceRule.
585 * @see com.stevesoft.pat.ReplaceRule
586 * @see com.stevesoft.pat.Regex#getReplaceRule()
588 public String replaceAllRegion(String s, int start, int end)
590 return _getReplacer().replaceAllRegion(s, this, start, end).toString();
593 /** Essentially clones the Regex object */
594 public Regex(Regex r)
597 dontMatchInQuotes = r.dontMatchInQuotes;
599 ignoreCase = r.ignoreCase;
607 rep = (ReplaceRule) r.rep.clone();
610 * try { compile(r.toString()); } catch(RegSyntax r_) {}
612 thePattern = r.thePattern.clone(new Hashtable());
613 minMatch = r.minMatch;
618 * By default, the escape character is the backslash, but you can make it
619 * anything you want by setting this variable.
621 public char esc = Pattern.ESC;
624 * This method compiles a regular expression, making it possible to call the
625 * search or matchAt methods.
627 * @exception com.stevesoft.pat.RegSyntax
628 * is thrown if a syntax error is encountered in the
629 * pattern. For example, "x{3,1}" or "*a" are not valid
631 * @see com.stevesoft.pat.Regex#search
632 * @see com.stevesoft.pat.Regex#matchAt
634 public void compile(String prepat) throws RegSyntax
636 String postpat = parsePerl.codify(prepat, true);
637 String pat = postpat == null ? prepat : postpat;
640 dontMatchInQuotes = false;
641 Rthings mk = new Rthings(this);
647 minMatch = new patInt(0);
648 StrPos sp = new StrPos(pat, 0);
649 if (sp.incMatch("(?e="))
655 newpat = reEscape(pat.substring(6), newEsc, Pattern.ESC);
658 else if (esc != Pattern.ESC)
660 newpat = reEscape(pat, esc, Pattern.ESC);
662 thePattern = _compile(newpat, mk);
663 numSubs_ = mk.val - offset;
668 * If a Regex is compared against a Regex, a check is done to see that the
669 * patterns are equal as well as the most recent match. If a Regex is compared
670 * with a RegRes, only the result of the most recent match is compared.
672 public boolean equals(Object o)
674 if (o instanceof Regex)
676 if (toString().equals(o.toString()))
678 return super.equals(o);
687 return super.equals(o);
691 /** A clone by any other name would smell as sweet. */
692 public Object clone()
694 return new Regex(this);
697 /** Return a clone of the underlying RegRes object. */
698 public RegRes result()
700 return (RegRes) super.clone();
703 // prep sets global variables of class
704 // Pattern so that it can access them
705 // during an attempt at a match
706 Pthings pt = new Pthings();
708 final Pthings prep(StringLike s)
711 pt.lastPos = matchedTo();
716 if ((s == null ? null : s.unwrap()) != (src == null ? null : s.unwrap()))
721 pt.dotDoesntMatchCR = dotDoesntMatchCR && (!sFlag);
722 pt.mFlag = (mFlag | defaultMFlag);
723 pt.ignoreCase = ignoreCase;
725 if (pt.marks != null)
727 for (int i = 0; i < pt.marks.length; i++)
733 pt.nMarks = numSubs_;
735 if (dontMatchInQuotes)
747 * Attempt to match a Pattern beginning at a specified location within the
750 * @see com.stevesoft.pat.Regex#search
752 public boolean matchAt(String s, int start_pos)
754 return _search(s, start_pos, start_pos);
758 * Attempt to match a Pattern beginning at a specified location within the
761 * @see com.stevesoft.pat.Regex#search
763 public boolean matchAt(StringLike s, int start_pos)
765 return _search(s, start_pos, start_pos);
769 * Search through a String for the first occurrence of a match.
771 * @see com.stevesoft.pat.Regex#searchFrom
772 * @see com.stevesoft.pat.Regex#matchAt
774 public boolean search(String s)
778 throw new NullPointerException("Null String Given to Regex.search");
780 return _search(s, 0, s.length());
783 public boolean search(StringLike sl)
787 throw new NullPointerException(
788 "Null StringLike Given to Regex.search");
790 return _search(sl, 0, sl.length());
793 public boolean reverseSearch(String s)
797 throw new NullPointerException(
798 "Null String Given to Regex.reverseSearch");
800 return _reverseSearch(s, 0, s.length());
803 public boolean reverseSearch(StringLike sl)
807 throw new NullPointerException(
808 "Null StringLike Given to Regex.reverseSearch");
810 return _reverseSearch(sl, 0, sl.length());
814 * Search through a String for the first occurence of a match, but start at
821 public boolean searchFrom(String s, int start)
825 throw new NullPointerException(
826 "Null String Given to Regex.searchFrom");
828 return _search(s, start, s.length());
831 public boolean searchFrom(StringLike s, int start)
835 throw new NullPointerException(
836 "Null String Given to Regex.searchFrom");
838 return _search(s, start, s.length());
842 * Search through a region of a String for the first occurence of a match.
844 public boolean searchRegion(String s, int start, int end)
848 throw new NullPointerException(
849 "Null String Given to Regex.searchRegion");
851 return _search(s, start, end);
855 * Set this to change the default behavior of the "." pattern. By default it
856 * now matches perl's behavior and fails to match the '\n' character.
858 public static boolean dotDoesntMatchCR = true;
864 boolean gFlag = false;
866 /** Set the 'g' flag */
867 public void setGFlag(boolean b)
872 /** Get the state of the 'g' flag. */
873 public boolean getGFlag()
878 boolean sFlag = false;
880 /** Get the state of the sFlag */
881 public boolean getSFlag()
886 boolean mFlag = false;
888 /** Get the state of the mFlag */
889 public boolean getMFlag()
894 final boolean _search(String s, int start, int end)
896 return _search(new StringWrap(s), start, end);
899 final boolean _search(StringLike s, int start, int end)
901 if (gFlag && gFlagto > 0 && gFlags != null
902 && s.unwrap() == gFlags.unwrap())
908 Pthings pt = prep(s);
910 int up = (minMatch == null ? end : end - minMatch.i);
912 if (up < start && end >= start)
919 for (int i = start; i <= up; i++)
921 charsMatched_ = thePattern.matchAt(s, i, pt);
922 if (charsMatched_ >= 0)
924 matchFrom_ = thePattern.mfrom;
926 gFlagto = matchFrom_ + charsMatched_;
928 return didMatch_ = true;
935 for (int i = start; i <= up; i++)
937 i = skipper.find(src, i, up);
940 charsMatched_ = matchFrom_ = -1;
941 return didMatch_ = false;
943 charsMatched_ = thePattern.matchAt(s, i, pt);
944 if (charsMatched_ >= 0)
946 matchFrom_ = thePattern.mfrom;
948 gFlagto = matchFrom_ + charsMatched_;
950 return didMatch_ = true;
954 return didMatch_ = false;
958 * final boolean _search(LongStringLike s,long start,long end) { if(gFlag &&
959 * gFlagto > 0 && s==gFlags) start = gFlagto; gFlags = null;
961 * Pthings pt=prep(s);
963 * int up = end;//(minMatch == null ? end : end-minMatch.i);
965 * if(up < start && end >= start) up = start;
967 * if(skipper == null) { for(long i=start;i<=up;i++) { charsMatched_ =
968 * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ =
969 * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_;
970 * return didMatch_=true; } } } else { pt.no_check = true; for(long i=start;i<=up;i++) {
971 * i = skipper.find(src,i,up); if(i<0) { charsMatched_ = matchFrom_ = -1;
972 * return didMatch_ = false; } charsMatched_ = thePattern.matchAt(s,i,pt);
973 * if(charsMatched_ >= 0) { matchFrom_ = thePattern.mfrom; marks = pt.marks;
974 * gFlagto = matchFrom_+charsMatched_; gFlags = s; return didMatch_=true; }
975 * else { i = s.adjustIndex(i); up = s.adjustEnd(i); } } } return
979 boolean _reverseSearch(String s, int start, int end)
981 return _reverseSearch(new StringWrap(s), start, end);
984 boolean _reverseSearch(StringLike s, int start, int end)
986 if (gFlag && gFlagto > 0 && s.unwrap() == gFlags.unwrap())
991 Pthings pt = prep(s);
992 for (int i = end; i >= start; i--)
994 charsMatched_ = thePattern.matchAt(s, i, pt);
995 if (charsMatched_ >= 0)
997 matchFrom_ = thePattern.mfrom;
999 gFlagto = matchFrom_ - 1;
1001 return didMatch_ = true;
1004 return didMatch_ = false;
1007 // This routine sets the cbits variable
1008 // of class Pattern. Cbits is true for
1009 // the bit corresponding to a character inside
1011 static StringLike lasts = null;
1013 static BitSet lastbs = null;
1015 static void setCbits(StringLike s, Pthings pt)
1022 BitSet bs = new BitSet(s.length());
1024 boolean setBit = false;
1025 for (int i = 0; i < s.length(); i++)
1031 char c = s.charAt(i);
1032 if (!setBit && c == '"')
1038 else if (!setBit && c == '\'')
1044 else if (setBit && c == qc)
1048 else if (setBit && c == '\\' && i + 1 < s.length())
1057 pt.cbits = lastbs = bs;
1061 // Wanted user to over-ride this in alpha version,
1062 // but it wasn't really necessary because of this trick:
1067 return (Regex) getClass().newInstance();
1068 } catch (InstantiationException ie)
1071 } catch (IllegalAccessException iae)
1078 * Only needed for creating your own extensions of Regex. This method adds the
1079 * next Pattern in the chain of patterns or sets the Pattern if it is the
1082 protected void add(Pattern p2)
1096 * You only need to use this method if you are creating your own extensions to
1097 * Regex. compile1 compiles one Pattern element, it can be over-ridden to
1098 * allow the Regex compiler to understand new syntax. See deriv.java for an
1099 * example. This routine is the heart of class Regex. Rthings has one integer
1100 * member called intValue, it is used to keep track of the number of ()'s in
1103 * @exception com.stevesoft.pat.RegSyntax
1104 * is thrown when a nonsensical pattern is supplied. For
1105 * example, a pattern beginning with *.
1107 protected void compile1(StrPos sp, Rthings mk) throws RegSyntax
1112 add(matchBracket(sp));
1114 else if (sp.match('|'))
1122 p = new NullPattern();
1127 else if (sp.incMatch("(?<"))
1129 patInt i = sp.getPatInt();
1132 RegSyntaxError.endItAll("No int after (?<");
1134 add(new Backup(i.intValue()));
1137 RegSyntaxError.endItAll("No ) after (?<");
1140 else if (sp.incMatch("(?>"))
1142 patInt i = sp.getPatInt();
1145 RegSyntaxError.endItAll("No int after (?>");
1147 add(new Backup(-i.intValue()));
1150 RegSyntaxError.endItAll("No ) after (?<");
1153 else if (sp.incMatch("(?@"))
1161 RegSyntaxError.endItAll("(?@ does not have closing paren");
1163 add(new Group(op, cl));
1165 else if (sp.incMatch("(?#"))
1167 while (!sp.match(')'))
1172 else if (sp.dontMatch && sp.c == 'w')
1174 // Regex r = new Regex();
1175 // r._compile("[a-zA-Z0-9_]",mk);
1176 // add(new Goop("\\w",r.thePattern));
1177 Bracket b = new Bracket(false);
1178 b.addOr(new Range('a', 'z'));
1179 b.addOr(new Range('A', 'Z'));
1180 b.addOr(new Range('0', '9'));
1181 b.addOr(new oneChar('_'));
1184 else if (sp.dontMatch && sp.c == 'G')
1188 else if (sp.dontMatch && sp.c == 's')
1190 // Regex r = new Regex();
1191 // r._compile("[ \t\n\r\b]",mk);
1192 // add(new Goop("\\s",r.thePattern));
1193 Bracket b = new Bracket(false);
1194 b.addOr(new oneChar((char) 32));
1195 b.addOr(new Range((char) 8, (char) 10));
1196 b.addOr(new oneChar((char) 13));
1199 else if (sp.dontMatch && sp.c == 'd')
1201 // Regex r = new Regex();
1202 // r._compile("[0-9]",mk);
1203 // add(new Goop("\\d",r.thePattern));
1204 Range digit = new Range('0', '9');
1205 digit.printBrackets = true;
1208 else if (sp.dontMatch && sp.c == 'W')
1210 // Regex r = new Regex();
1211 // r._compile("[^a-zA-Z0-9_]",mk);
1212 // add(new Goop("\\W",r.thePattern));
1213 Bracket b = new Bracket(true);
1214 b.addOr(new Range('a', 'z'));
1215 b.addOr(new Range('A', 'Z'));
1216 b.addOr(new Range('0', '9'));
1217 b.addOr(new oneChar('_'));
1220 else if (sp.dontMatch && sp.c == 'S')
1222 // Regex r = new Regex();
1223 // r._compile("[^ \t\n\r\b]",mk);
1224 // add(new Goop("\\S",r.thePattern));
1225 Bracket b = new Bracket(true);
1226 b.addOr(new oneChar((char) 32));
1227 b.addOr(new Range((char) 8, (char) 10));
1228 b.addOr(new oneChar((char) 13));
1231 else if (sp.dontMatch && sp.c == 'D')
1233 // Regex r = new Regex();
1234 // r._compile("[^0-9]",mk);
1235 // add(new Goop("\\D",r.thePattern));
1236 Bracket b = new Bracket(true);
1237 b.addOr(new Range('0', '9'));
1240 else if (sp.dontMatch && sp.c == 'B')
1242 Regex r = new Regex();
1243 r._compile("(?!" + back_slash + "b)", mk);
1246 else if (isOctalString(sp))
1250 d = 8 * d + sp.c - '0';
1251 StrPos sp2 = new StrPos(sp);
1253 if (isOctalDigit(sp2, false))
1256 d = 8 * d + sp.c - '0';
1258 add(new oneChar((char) d));
1260 else if (sp.dontMatch && sp.c >= '1' && sp.c <= '9')
1262 int iv = sp.c - '0';
1263 StrPos s2 = new StrPos(sp);
1265 if (!s2.dontMatch && s2.c >= '0' && s2.c <= '9')
1267 iv = 10 * iv + (s2.c - '0');
1270 add(new BackMatch(iv));
1272 else if (sp.dontMatch && sp.c == 'b')
1274 add(new Boundary());
1276 else if (sp.match('\b'))
1278 add(new Boundary());
1280 else if (sp.match('$'))
1284 else if (sp.dontMatch && sp.c == 'Z')
1286 add(new End(false));
1288 else if (sp.match('.'))
1292 else if (sp.incMatch("(??"))
1294 StringBuffer sb = new StringBuffer();
1295 StringBuffer sb2 = new StringBuffer();
1296 while (!sp.match(')') && !sp.match(':'))
1301 if (sp.incMatch(":"))
1303 while (!sp.match(')'))
1309 String sbs = sb.toString();
1310 if (validators.get(sbs) instanceof String)
1312 String pat = (String) validators.get(sbs);
1313 Regex r = newRegex();
1314 Rthings rth = new Rthings(this);
1315 rth.noBackRefs = true;
1316 r._compile(pat, rth);
1321 Custom cm = new Custom(sb.toString());
1324 Validator v2 = cm.v.arg(sb2.toString());
1327 v2.argsave = sb2.toString();
1328 String p = cm.v.pattern;
1332 Regex r = newRegex();
1333 Rthings rth = new Rthings(this);
1334 rth.noBackRefs = true;
1335 r._compile(cm.v.pattern, rth);
1336 cm.sub = r.thePattern;
1337 cm.sub.add(new CustomEndpoint(cm));
1338 cm.sub.setParent(cm);
1343 else if (sp.match('('))
1346 Regex r = newRegex();
1349 if (sp.incMatch("?:"))
1353 else if (sp.incMatch("?="))
1355 r.or = new lookAhead(false);
1357 else if (sp.incMatch("?!"))
1359 r.or = new lookAhead(true);
1361 else if (sp.match('?'))
1368 mk.ignoreCase = true;
1372 mk.dontMatchInQuotes = true;
1376 mk.optimizeMe = true;
1391 } while (!sp.match(')') && !sp.eos);
1394 if (sp.eos) // throw new RegSyntax
1396 RegSyntaxError.endItAll("Unclosed ()");
1400 { // just ordinary parenthesis
1401 r.or = mk.noBackRefs ? new Or() : new OrMark(mk.val++);
1405 add(r._compile(sp, mk));
1408 else if (sp.match('^'))
1410 add(new Start(true));
1412 else if (sp.dontMatch && sp.c == 'A')
1414 add(new Start(false));
1416 else if (sp.match('*'))
1418 addMulti(new patInt(0), new patInf());
1420 else if (sp.match('+'))
1422 addMulti(new patInt(1), new patInf());
1424 else if (sp.match('?'))
1426 addMulti(new patInt(0), new patInt(1));
1428 else if (sp.match('{'))
1430 boolean bad = false;
1431 StrPos sp2 = new StrPos(sp);
1432 // StringBuffer sb = new StringBuffer();
1434 patInt i1 = sp.getPatInt();
1445 * RegSyntaxError.endItAll( "String \"{"+i2+ "\" should be followed
1457 i2 = sp.getPatInt();
1460 if (i1 == null || i2 == null)
1463 * throw new RegSyntax("Badly formatted Multi: " +"{"+i1+","+i2+"}");
1470 add(new oneChar(sp.c));
1477 else if (sp.escMatch('x') && next2Hex(sp))
1480 int d = getHexDigit(sp);
1482 d = 16 * d + getHexDigit(sp);
1483 add(new oneChar((char) d));
1485 else if (sp.escMatch('c'))
1488 if (sp.c < Ctrl.cmap.length)
1490 add(new oneChar(Ctrl.cmap[sp.c]));
1494 add(new oneChar(sp.c));
1497 else if (sp.escMatch('f'))
1499 add(new oneChar((char) 12));
1501 else if (sp.escMatch('a'))
1503 add(new oneChar((char) 7));
1505 else if (sp.escMatch('t'))
1507 add(new oneChar('\t'));
1509 else if (sp.escMatch('n'))
1511 add(new oneChar('\n'));
1513 else if (sp.escMatch('r'))
1515 add(new oneChar('\r'));
1517 else if (sp.escMatch('b'))
1519 add(new oneChar('\b'));
1521 else if (sp.escMatch('e'))
1523 add(new oneChar((char) 27));
1527 add(new oneChar(sp.c));
1530 RegSyntaxError.endItAll("Unmatched right paren in pattern");
1535 // compiles all Pattern elements, internal method
1536 private Pattern _compile(String pat, Rthings mk) throws RegSyntax
1539 sFlag = mFlag = ignoreCase = gFlag = false;
1540 StrPos sp = new StrPos(pat, 0);
1541 thePattern = _compile(sp, mk);
1550 Pattern _compile(StrPos sp, Rthings mk) throws RegSyntax
1552 while (!(sp.eos || (or != null && sp.match(')'))))
1561 else if (sp.eos && mk.parenLevel != 0)
1563 RegSyntaxError.endItAll("Unclosed Parenthesis! lvl=" + mk.parenLevel);
1569 p = new NullPattern();
1574 return p == null ? new NullPattern() : p;
1577 // add a multi object to the end of the chain
1578 // which applies to the last object
1579 void addMulti(patInt i1, patInt i2) throws RegSyntax
1581 Pattern last, last2;
1582 for (last = p; last != null && last.next != null; last = last.next)
1586 if (last == null || last == p)
1592 for (last2 = p; last2.next != last; last2 = last2.next)
1597 if (last instanceof Multi && i1.intValue() == 0 && i2.intValue() == 1)
1599 ((Multi) last).matchFewest = true;
1601 else if (last instanceof FastMulti && i1.intValue() == 0
1602 && i2.intValue() == 1)
1604 ((FastMulti) last).matchFewest = true;
1606 else if (last instanceof DotMulti && i1.intValue() == 0
1607 && i2.intValue() == 1)
1609 ((DotMulti) last).matchFewest = true;
1611 else if (last instanceof Multi || last instanceof DotMulti
1612 || last instanceof FastMulti)
1614 throw new RegSyntax("Syntax error.");
1616 else if (last2 == null)
1618 p = mkMulti(i1, i2, p);
1622 last2.next = mkMulti(i1, i2, last);
1626 final static Pattern mkMulti(patInt lo, patInt hi, Pattern p)
1629 if (p instanceof Any && p.next == null)
1631 return (Pattern) new DotMulti(lo, hi);
1633 return RegOpt.safe4fm(p) ? (Pattern) new FastMulti(lo, hi, p)
1634 : (Pattern) new Multi(lo, hi, p);
1637 // process the bracket operator
// Parses a bracket expression starting at sp and collects its members into a
// Bracket via addOr: single characters become oneChar, spans become Range.
// Reports "Unmatched []" through RegSyntaxError.endItAll.
1638 Pattern matchBracket(StrPos sp) throws RegSyntax
// NOTE(review): the boolean passed to Bracket presumably selects the negated
// form of the class -- confirm against Bracket's constructor.
1643 ret = new Bracket(true);
1648 ret = new Bracket(false);
1652 // throw new RegSyntax
1653 RegSyntaxError.endItAll("Unmatched []");
// Consume members until the closing ']' or the end of the input.
1656 while (!sp.eos && !sp.match(']'))
// Range detection works on separate StrPos objects built from sp, so the
// look-ahead tests are not made on sp itself.
1658 StrPos s1 = new StrPos(sp);
1660 StrPos s1_ = new StrPos(s1);
// A '-' that is not directly followed by ']' denotes a range.
1662 if (s1.match('-') && !s1_.match(']'))
1664 StrPos s2 = new StrPos(s1);
1668 ret.addOr(new Range(sp.c, s2.c));
// \Q ... \E: every character up to \E is added as a literal member.
1673 else if (sp.escMatch('Q'))
1676 while (!sp.escMatch('E'))
1678 ret.addOr(new oneChar(sp.c));
// \d: ASCII digits.
1682 else if (sp.escMatch('d'))
1684 ret.addOr(new Range('0', '9'));
// \s: chars 32, 8-10 and 13.
// NOTE(review): this includes char 8 (backspace) and omits char 12 (form
// feed), unlike Perl's \s; the \S branch below is its exact complement, so
// the two agree with each other -- confirm whether this is intended.
1686 else if (sp.escMatch('s'))
1688 ret.addOr(new oneChar((char) 32));
1689 ret.addOr(new Range((char) 8, (char) 10));
1690 ret.addOr(new oneChar((char) 13));
// \w: ASCII letters, ASCII digits and underscore.
1692 else if (sp.escMatch('w'))
1694 ret.addOr(new Range('a', 'z'));
1695 ret.addOr(new Range('A', 'Z'));
1696 ret.addOr(new Range('0', '9'));
1697 ret.addOr(new oneChar('_'));
// \D: everything except chars 48-57 ('0'-'9').
1699 else if (sp.escMatch('D'))
1701 ret.addOr(new Range((char) 0, (char) 47));
1702 ret.addOr(new Range((char) 58, (char) 65535));
// \S: everything except the chars added for \s above.
1704 else if (sp.escMatch('S'))
1706 ret.addOr(new Range((char) 0, (char) 7));
1707 ret.addOr(new Range((char) 11, (char) 12));
1708 ret.addOr(new Range((char) 14, (char) 31));
1709 ret.addOr(new Range((char) 33, (char) 65535));
// \W: chars 0-64, 91-94, 96 and 123-65535.
// NOTE(review): the first range 0-64 contains '0'-'9' (48-57), which the \w
// branch above treats as word characters. The complement of \w would be
// 0-47 plus 58-64. This looks like a defect -- confirm and split the range.
1711 else if (sp.escMatch('W'))
1713 ret.addOr(new Range((char) 0, (char) 64));
1714 ret.addOr(new Range((char) 91, (char) 94));
1715 ret.addOr(new oneChar((char) 96));
1716 ret.addOr(new Range((char) 123, (char) 65535));
// \xHH: taken only when next2Hex(sp) holds; the two hex digits are combined
// into one char value.
1718 else if (sp.escMatch('x') && next2Hex(sp))
1721 int d = getHexDigit(sp);
1723 d = 16 * d + getHexDigit(sp);
1724 ret.addOr(new oneChar((char) d));
// Single-character escapes: \a=7, \f=12, \e=27, then \n, \t and \r.
1726 else if (sp.escMatch('a'))
1728 ret.addOr(new oneChar((char) 7));
1730 else if (sp.escMatch('f'))
1732 ret.addOr(new oneChar((char) 12));
1734 else if (sp.escMatch('e'))
1736 ret.addOr(new oneChar((char) 27));
1738 else if (sp.escMatch('n'))
1740 ret.addOr(new oneChar('\n'));
1742 else if (sp.escMatch('t'))
1744 ret.addOr(new oneChar('\t'));
1746 else if (sp.escMatch('r'))
1748 ret.addOr(new oneChar('\r'));
// \c: the following character is translated through Ctrl.cmap when it is
// inside the table, otherwise it is added unchanged.
1750 else if (sp.escMatch('c'))
1753 if (sp.c < Ctrl.cmap.length)
1755 ret.addOr(new oneChar(Ctrl.cmap[sp.c]));
1759 ret.addOr(new oneChar(sp.c));
// Octal escape: digits are accumulated base 8 into d; a further digit is
// accepted only if isOctalDigit approves the look-ahead position sp2.
1762 else if (isOctalString(sp))
1766 d = 8 * d + sp.c - '0';
1767 StrPos sp2 = new StrPos(sp);
1769 if (isOctalDigit(sp2, false))
1772 d = 8 * d + sp.c - '0';
1774 ret.addOr(new oneChar((char) d));
// Anything else is a literal member of the class.
1778 ret.addOr(new oneChar(sp.c));
1786 * Converts the stored Pattern to a String -- this is a decompile. Note that
1787 * \t and \n will really print out here, not just the two-character
1788 * representations. Also be prepared to see some strange output if your
1789 * characters are not printable.
1791 public String toString()
// NOTE(review): the leading 'false &&' makes this condition always false, so
// the guarded statement never runs and a null thePattern is not handled
// here. thePattern is dereferenced unconditionally further down.
1793 if (false && thePattern == null)
1799 StringBuffer sb = new StringBuffer();
// Taken when this Regex uses an escape character other than Pattern.ESC.
1800 if (esc != Pattern.ESC)
// Taken when any flag is set or the pattern has been optimized.
1806 if (gFlag || mFlag || !dotDoesntMatchCR || sFlag || ignoreCase
1807 || dontMatchInQuotes || optimized())
1818 if (sFlag || !dotDoesntMatchCR)
1822 if (dontMatchInQuotes)
// The pattern chain's own text is produced with Pattern.ESC; it is rewritten
// with reEscape when this Regex uses a different escape character.
1836 String patstr = thePattern.toString();
1837 if (esc != Pattern.ESC)
1839 patstr = reEscape(patstr, Pattern.ESC, esc);
1842 return sb.toString();
1846 // Re-escape Pattern, allows us to use a different escape
// Parameters: s is the pattern text, oldEsc the escape character it currently
// uses, newEsc the escape character wanted. Returns the text built in sb.
1848 static String reEscape(String s, char oldEsc, char newEsc)
// Same escape character on both sides: no translation is needed.
1850 if (oldEsc == newEsc)
1855 StringBuffer sb = new StringBuffer();
1856 for (i = 0; i < s.length(); i++)
// oldEsc with at least one character after it is handled as a pair.
1858 if (s.charAt(i) == oldEsc && i + 1 < s.length())
// Special case of the pair: oldEsc followed by oldEsc.
1860 if (s.charAt(i + 1) == oldEsc)
// The character that followed oldEsc is carried over.
1867 sb.append(s.charAt(i + 1));
// A bare newEsc in the input gets its own branch.
// NOTE(review): presumably it is escaped so it stays literal -- confirm.
1871 else if (s.charAt(i) == newEsc)
// Every other character is copied through as is.
1878 sb.append(s.charAt(i));
1881 return sb.toString();
1885 * This method implements FilenameFilter, allowing one to use a Regex to
1886 * search through a directory using File.list. There is a FileRegex now that
1889 * @see com.stevesoft.pat.FileRegex
// FilenameFilter callback: dir is the directory being listed and s is the
// name of the candidate entry.
// NOTE(review): presumably returns whether s matches this Regex -- confirm.
1891 public boolean accept(File dir, String s)
1896 /** The version of this package */
// Returns a fixed release identifier string.
1897 final static public String version()
1899 return "lgpl release 1.5.3";
1903 * Once this method is called, the state of variables ignoreCase and
1904 * dontMatchInQuotes should not be changed as the results will be
1905 * unpredictable. However, search and matchAt will run more quickly. Note that
1906 * you can check to see if the pattern has been optimized by calling the
1907 * optimized() method.
1909 * This method will attempt to rewrite your pattern in a way that makes it
1910 * faster (not all patterns execute at the same speed). In general, "(?: ... )"
1911 * will be faster than "( ... )" so if you don't need the backreference, you
1912 * should group using the former pattern.
1914 * It will also introduce new pattern elements that you can't get to
1915 * otherwise, for example if you have a large table of strings, i.e. the
1916 * months of the year "(January|February|...)" optimize() will make a
1917 * Hashtable that takes it to the next appropriate pattern element --
1918 * eliminating the need for a linear search.
1920 * @see com.stevesoft.pat.Regex#optimized
1921 * @see com.stevesoft.pat.Regex#ignoreCase
1922 * @see com.stevesoft.pat.Regex#dontMatchInQuotes
1923 * @see com.stevesoft.pat.Regex#matchAt
1924 * @see com.stevesoft.pat.Regex#search
1926 public void optimize()
// Guard: tested when the pattern is already optimized or there is no pattern.
1928 if (optimized() || thePattern == null)
// A non-null minMatch is what optimized() tests. It is set to a constant 0
// here; the computed minimum is only present as a commented-out call.
1932 minMatch = new patInt(0); // thePattern.countMinChars();
// Replace the pattern chain with the result of RegOpt.opt, passing the
// current ignoreCase and dontMatchInQuotes settings.
1933 thePattern = RegOpt.opt(thePattern, ignoreCase, dontMatchInQuotes);
// Store the result of Skip.findSkip for this Regex in skipper.
1934 skipper = Skip.findSkip(this);
1935 // RegOpt.setParents(this);
1942 * This function returns true if the optimize method has been called.
1944 public boolean optimized()
// minMatch is assigned by optimize(), so a non-null value marks an optimized
// Regex.
1946 return minMatch != null;
1950 * A bit of syntactic sugar for those who want to make their code look more
1951 * perl-like. To use this initialize your Regex object by saying:
1954 * Regex r1 = Regex.perlCode("s/hello/goodbye/");
1955 * Regex r2 = Regex.perlCode("s'fish'frog'i");
1956 * Regex r3 = Regex.perlCode("m'hello'");
1959 * The i for ignoreCase is supported in this syntax, as well as m, s, and x.
1960 * The g flag is a bit of a special case.
1962 * If you wish to replace all occurrences of a pattern, you do not put a 'g' in
1963 * the perlCode, but call Regex's replaceAll method.
1965 * If you wish to simply and only do a search for r2's pattern, you can do
1966 * this by calling the searchFrom method repeatedly, or by calling
1967 * search repeatedly if the g flag is set.
1969 * Note: Currently perlCode does <em>not</em> support the (?e=#) syntax for
1970 * changing the escape character.
// Parameter s is the Perl-style expression text. The work is delegated
// entirely to parsePerl.parse, whose result is returned unchanged.
1973 public static Regex perlCode(String s)
1975 // this file is big enough, see parsePerl.java
1976 // for this function.
1977 return parsePerl.parse(s);
// The backslash character as a named constant.
1980 static final char back_slash = '\\';
1983 * Checks to see if there are only literal and no special pattern elements in
1986 public boolean isLiteral()
// Inspection starts at the head of the compiled pattern chain.
1988 Pattern x = thePattern;
// oneChar and Skipped are the two element types tested for explicitly.
// NOTE(review): presumably these are the elements that count as literal --
// confirm.
1991 if (x instanceof oneChar)
1995 else if (x instanceof Skipped)
2009 * You only need to know about this if you are inventing your own pattern
// Returns whatever the head of the pattern chain reports from its own
// countMinChars().
// NOTE(review): thePattern is dereferenced without a null check, while
// optimize() allows for a null thePattern -- confirm it cannot be null here.
2012 public patInt countMinChars()
2014 return thePattern.countMinChars();
2018 * You only need to know about this if you are inventing your own pattern
// Returns whatever the head of the pattern chain reports from its own
// countMaxChars().
// NOTE(review): thePattern is dereferenced without a null check, while
// optimize() allows for a null thePattern -- confirm it cannot be null here.
2021 public patInt countMaxChars()
2023 return thePattern.countMaxChars();
// Tests the current character of sp (sp.c) against the hexadecimal digit
// ranges 0-9, a-f and A-F; this test is and-ed with the conditions that
// precede it in the expression.
2026 boolean isHexDigit(StrPos sp)
2030 && ((sp.c >= '0' && sp.c <= '9')
2031 || (sp.c >= 'a' && sp.c <= 'f') || (sp.c >= 'A' && sp.c <= 'F'));
// Tests whether sp is positioned on an acceptable octal digit. The position
// must not be at the end of the input, the first argument must equal
// sp.dontMatch, and sp.c must be at least '0'.
// NOTE(review): presumably dontMatch marks an escaped character, so only the
// first digit of an octal escape may be escaped -- confirm against StrPos.
2035 boolean isOctalDigit(StrPos sp, boolean first)
2037 boolean r = !sp.eos && !(first ^ sp.dontMatch) && sp.c >= '0'
// Converts the hexadecimal digit at sp.c to its numeric value.
// There is no validation: any character that is neither 0-9 nor a-f is
// treated as an upper-case A-F digit.
// NOTE(review): callers presumably check with isHexDigit first -- confirm.
2042 int getHexDigit(StrPos sp)
2044 if (sp.c >= '0' && sp.c <= '9')
// Lower-case digits map to 10..15.
2048 if (sp.c >= 'a' && sp.c <= 'f')
2050 return sp.c - 'a' + 10;
// Fallthrough: assumed to be upper-case A-F.
2052 return sp.c - 'A' + 10;
// Look-ahead check on the two characters that are expected to be hexadecimal
// digits. It works on sp2, a separate StrPos built from sp, and applies
// isHexDigit to it twice.
2055 boolean next2Hex(StrPos sp)
2057 StrPos sp2 = new StrPos(sp);
2059 if (!isHexDigit(sp2))
2064 if (!isHexDigit(sp2))
2071 boolean isOctalString(StrPos sp)
2073 if (!isOctalDigit(sp, true))
2077 StrPos sp2 = new StrPos(sp);
2079 if (!isOctalDigit(sp2, false))