2 // This software is now distributed according to
3 // the Lesser Gnu Public License. Please see
4 // http://www.gnu.org/copyleft/lesser.txt for
8 package com.stevesoft.pat;
10 import com.stevesoft.pat.MessageManager;
12 import jalview.jsdev.api.RegExpInterface;
15 import java.util.BitSet;
16 import java.util.Hashtable;
18 import com.stevesoft.pat.wrap.StringWrap;
20 /** Matches a Unicode punctuation character. */
21 class UnicodePunct extends UniValidator
23 public int validate(StringLike s, int from, int to)
25 return from < s.length() && Prop.isPunct(s.charAt(from)) ? to : -1;
29 /** Matches a Unicode white space character. */
30 class UnicodeWhite extends UniValidator
32 public int validate(StringLike s, int from, int to)
34 return from < s.length() && Prop.isWhite(s.charAt(from)) ? to : -1;
39 * Matches a character that is not a Unicode punctuation character.
41 class NUnicodePunct extends UniValidator
43 public int validate(StringLike s, int from, int to)
45 return from < s.length() && !Prop.isPunct(s.charAt(from)) ? to : -1;
50 * Matches a character that is not a Unicode white space character.
52 class NUnicodeWhite extends UniValidator
54 public int validate(StringLike s, int from, int to)
56 return from < s.length() && !Prop.isWhite(s.charAt(from)) ? to : -1;
60 /** Matches a Unicode word character: an alphanumeric or underscore. */
61 class UnicodeW extends UniValidator
63 public int validate(StringLike s, int from, int to)
65 if (from >= s.length())
69 char c = s.charAt(from);
70 return (Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to
75 /** Matches a character that is not a Unicode alphanumeric or underscore. */
76 class NUnicodeW extends UniValidator
78 public int validate(StringLike s, int from, int to)
80 if (from >= s.length())
84 char c = s.charAt(from);
85 return !(Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to
90 /** Matches a Unicode decimal digit. */
91 class UnicodeDigit extends UniValidator
93 public int validate(StringLike s, int from, int to)
95 return from < s.length() && Prop.isDecimalDigit(s.charAt(from)) ? to
100 /** Matches a character that is not a Unicode digit. */
101 class NUnicodeDigit extends UniValidator
103 public int validate(StringLike s, int from, int to)
105 return from < s.length() && !Prop.isDecimalDigit(s.charAt(from)) ? to
110 /** Matches a Unicode math character. */
111 class UnicodeMath extends UniValidator
113 public int validate(StringLike s, int from, int to)
115 return from < s.length() && Prop.isMath(s.charAt(from)) ? to : -1;
119 /** Matches a non-math Unicode character. */
120 class NUnicodeMath extends UniValidator
122 public int validate(StringLike s, int from, int to)
124 return from < s.length() && !Prop.isMath(s.charAt(from)) ? to : -1;
128 /** Matches a Unicode currency symbol. */
129 class UnicodeCurrency extends UniValidator
131 public int validate(StringLike s, int from, int to)
133 return from < s.length() && Prop.isCurrency(s.charAt(from)) ? to : -1;
137 /** Matches a non-currency symbol Unicode character. */
138 class NUnicodeCurrency extends UniValidator
140 public int validate(StringLike s, int from, int to)
142 return from < s.length() && !Prop.isCurrency(s.charAt(from)) ? to : -1;
146 /** Matches a Unicode alphabetic character. */
147 class UnicodeAlpha extends UniValidator
149 public int validate(StringLike s, int from, int to)
151 return from < s.length() && Prop.isAlphabetic(s.charAt(from)) ? to : -1;
155 /** Matches a non-alphabetic Unicode character. */
156 class NUnicodeAlpha extends UniValidator
158 public int validate(StringLike s, int from, int to)
160 return from < s.length() && !Prop.isAlphabetic(s.charAt(from)) ? to
165 /** Matches an upper case Unicode character. */
166 class UnicodeUpper extends UniValidator
168 public int validate(StringLike s, int from, int to)
170 return from < s.length() && isUpper(s.charAt(from)) ? to : -1;
173 final boolean isUpper(char c)
175 return c == CaseMgr.toUpperCaseC(c) && c != CaseMgr.toLowerCaseC(c);
179 /** Matches a lower case Unicode character. */
180 class UnicodeLower extends UniValidator
182 public int validate(StringLike s, int from, int to)
184 return from < s.length() && isLower(s.charAt(from)) ? to : -1;
187 final boolean isLower(char c)
189 return c != CaseMgr.toUpperCaseC(c) && c == CaseMgr.toLowerCaseC(c);
194 * Regex provides the parser which constructs the linked list of Pattern classes
197 * For the purpose of this documentation, the fact that java interprets the
198 * backslash will be ignored. In practice, however, you will need a double
199 * backslash to obtain a string that contains a single backslash character.
200 * Thus, the example pattern "\b" should really be typed as "\\b" inside java
203 * Note that Regex is part of package "com.stevesoft.pat". To use it, simply
204 * import com.stevesoft.pat.Regex at the top of your file.
206 * Regex is made with a constructor that takes a String that defines the regular
207 * expression. Thus, for example
210 * Regex r = new Regex("[a-c]*");
213 * matches any number of characters so long as they are 'a', 'b', or 'c').
215 * To attempt to match the Pattern to a given string, you can use either the
216 * search(String) member function, or the matchAt(String,int position) member
217 * function. These functions return a boolean which tells you whether or not the
218 * thing worked, and sets the methods "charsMatched()" and "matchedFrom()" in
219 * the Regex object appropriately.
221 * The portion of the string before the match can be obtained by the left()
222 * member, and the portion after the match can be obtained by the right()
225 * Essentially, this package implements a syntax that is very much like the perl
226 * 5 regular expression syntax.
231 * Regex r = new Regex("x(a|b)y");
232 * r.matchAt("xay", 0);
233 * System.out.println("sub = " + r.stringMatched(1));
236 * The above would print "sub = a".
239 * r.left() // would return "x"
240 * r.right() // would return "y"
244 * Differences between this package and perl5:<br>
245 * The extended Pattern for setting flags is now supported, but the flags are
246 * different. "(?i)" tells the pattern to ignore case, "(?Q)" sets the
247 * "dontMatchInQuotes" flag, and "(?iQ)" sets them both. You can change the
248 * escape character. The pattern
258 * </pre>, but note that the sequence
264 * <b>must</b> occur at the very beginning of the pattern. There may be other
265 * small differences as well. I will either make my package conform or note them
266 * as I become aware of them.
268 * This package supports additional patterns not in perl5: <center> <table
273 * <td>This matches all characters between the '(' character and the balancing
274 * ')' character. Thus, it will match "()" as well as "(())". The balancing
275 * characters are arbitrary, thus (?@{}) matches on "{}" and "{{}}".</td>
279 * <td>Moves the pointer backwards within the text. This allows you to make a
280 * "look behind." It fails if it attempts to move to a position before the
281 * beginning of the string. "x(?<1)" is equivalent to "(?=x)". The number, 1
282 * in this example, is the number of characters to move backwards.</td>
286 * @author Steven R. Brandt
287 * @version package com.stevesoft.pat, release 1.5.3
290 public class Regex extends RegRes implements RegExpInterface, Cloneable //implements FilenameFilter
293 * BackRefOffset gives the identity number of the first pattern. Version 1.0
294 * used zero, version 1.1 uses 1 to be more compatible with perl.
296 static int BackRefOffset = 1;
298 private static Pattern none = new NoPattern();
300 Pattern thePattern = none;
302 patInt minMatch = new patInt(0);
304 static Hashtable<Object, Object> validators = new Hashtable<Object, Object>();
307 defineV("p", "(?>1)", new UnicodePunct());
308 defineV("P", "(?>1)", new NUnicodePunct());
309 defineV("s", "(?>1)", new UnicodeWhite());
310 defineV("S", "(?>1)", new NUnicodeWhite());
311 defineV("w", "(?>1)", new UnicodeW());
312 defineV("W", "(?>1)", new NUnicodeW());
313 defineV("d", "(?>1)", new UnicodeDigit());
314 defineV("D", "(?>1)", new NUnicodeDigit());
315 defineV("m", "(?>1)", new UnicodeMath());
316 defineV("M", "(?>1)", new NUnicodeMath());
317 defineV("c", "(?>1)", new UnicodeCurrency());
318 defineV("C", "(?>1)", new NUnicodeCurrency());
319 defineV("a", "(?>1)", new UnicodeAlpha());
320 defineV("A", "(?>1)", new NUnicodeAlpha());
321 defineV("uc", "(?>1)", new UnicodeUpper());
322 defineV("lc", "(?>1)", new UnicodeLower());
325 ReplaceRule rep = null;
329 * Initializes the object without a Pattern. To supply a Pattern use
334 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
342 * Create and compile both a Regex and a ReplaceRule.
344 * @see com.stevesoft.pat.ReplaceRule
345 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
347 public Regex(String s, String strRp)
353 if (strRp.length() > 0)
354 rep = ReplaceRule.perlCode(strRp);
355 } catch (RegSyntax rs)
361 * Create and compile a Regex, but give it the ReplaceRule specified. This
362 * allows the user finer control of the Replacement process, if that is
367 * @see com.stevesoft.pat.ReplaceRule
368 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
370 public Regex(String s, ReplaceRule rp)
377 * Create and compile a Regex, but do not throw any exceptions. If you wish to
378 * have exceptions thrown for syntax errors, you must use the Regex(void)
379 * constructor to create the Regex object, and then call the compile method.
380 * Therefore, you should only call this method when you know your pattern is
381 * right. I will probably become more like
385 * @see com.stevesoft.pat.Regex#search(java.lang.String)
386 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
389 public Regex(String s)
394 /** A clone by any other name would smell as sweet. */
395 public Object clone()
397 return new Regex(null, "").cloneFrom(this);
400 // /** Return a clone of the underlying RegRes object. */
401 // public RegRes result()
403 // return (RegRes) super.clone();
406 /** Essentially clones the Regex object */
407 public Regex cloneFrom(Regex r)
410 // super((RegRes) r);
411 dontMatchInQuotes = r.dontMatchInQuotes;
413 ignoreCase = r.ignoreCase;
421 rep = (ReplaceRule) r.rep.clone();
424 * try { compile(r.toString()); } catch(RegSyntax r_) {}
426 thePattern = r.thePattern.clone(new Hashtable<Object, Object>());
427 minMatch = r.minMatch;
432 /** Set the dontMatchInQuotes flag. */
433 public void setDontMatchInQuotes(boolean b)
435 dontMatchInQuotes = b;
438 /** Find out if the dontMatchInQuotes flag is enabled. */
439 public boolean getDontMatchInQuotes()
441 return dontMatchInQuotes;
444 boolean dontMatchInQuotes = false;
447 * Set the state of the ignoreCase flag. If set to true, then the pattern
448 * matcher will ignore case when searching for a match.
450 public void setIgnoreCase(boolean b)
456 * Get the state of the ignoreCase flag. Returns true if we are ignoring the
457 * case of the pattern, false otherwise.
459 public boolean getIgnoreCase()
464 boolean ignoreCase = false;
466 static boolean defaultMFlag = false;
469 * Set the default value of the m flag. If it is set to true, then the MFlag
470 * will be on for any regex search executed.
472 public static void setDefaultMFlag(boolean mFlag)
474 defaultMFlag = mFlag;
478 * Get the default value of the m flag. If it is set to true, then the MFlag
479 * will be on for any regex search executed.
481 public static boolean getDefaultMFlag()
487 * Change the ReplaceRule of this Regex by compiling a new one using String
490 public void setReplaceRuleStr(String rp)
492 rep = ReplaceRule.perlCode(rp);
493 repr = null; // Clear Replacer history
496 /** Change the ReplaceRule of this Regex to rp. */
497 public void setReplaceRule(ReplaceRule rp)
503 * Test to see if a custom defined rule exists.
505 * @see com.stevesoft.pat.Regex#defineV(java.lang.String,java.lang.String,Validator)
507 public static boolean isDefined(String nm)
509 return validators.get(nm) != null;
513 * Removes a custom defined rule.
515 * @see com.stevesoft.pat.Regex#defineV(java.lang.String,java.lang.String,Validator)
517 public static void undefine(String nm)
519 validators.remove(nm);
523 * Defines a method to create a new rule. See test/deriv2.java and
524 * test/deriv3.java for examples of how to use it.
526 public static void defineV(String nm, String pat, Validator v)
529 validators.put(nm, v);
533 * Defines a shorthand for a pattern. The pattern will be invoked by a string
534 * that has the form "(??"+nm+")".
536 public static void define(String nm, String pat)
538 validators.put(nm, pat);
541 /** Get the current ReplaceRule. */
542 public ReplaceRule getReplaceRule()
547 Replacer repr = null;
549 final Replacer _getReplacer()
551 return repr == null ? repr = new Replacer() : repr;
554 public Replacer getReplacer()
558 repr = new Replacer();
566 * Replace the first occurrence of this pattern in String s according to the
569 * @see com.stevesoft.pat.ReplaceRule
570 * @see com.stevesoft.pat.Regex#getReplaceRule()
572 public String replaceFirst(String s)
574 return _getReplacer().replaceFirstRegion(s, this, 0, s.length())
579 * Replace the first occurrence of this pattern in String s beginning with
580 * position pos according to the ReplaceRule.
582 * @see com.stevesoft.pat.ReplaceRule
583 * @see com.stevesoft.pat.Regex#getReplaceRule()
585 public String replaceFirstFrom(String s, int pos)
587 return _getReplacer().replaceFirstRegion(s, this, pos, s.length())
592 * Replace the first occurrence of this pattern in String s beginning with
593 * position start and ending with end according to the ReplaceRule.
595 * @see com.stevesoft.pat.ReplaceRule
596 * @see com.stevesoft.pat.Regex#getReplaceRule()
598 public String replaceFirstRegion(String s, int start, int end)
600 return _getReplacer().replaceFirstRegion(s, this, start, end)
605 * Replace all occurrences of this pattern in String s according to the
608 * @see com.stevesoft.pat.ReplaceRule
609 * @see com.stevesoft.pat.Regex#getReplaceRule()
611 public String replaceAll(String s)
613 return _getReplacer().replaceAllRegion(s, this, 0, s.length())
617 public StringLike replaceAllLike(StringLike s)
619 return _getReplacer().replaceAllRegion(s, this, 0, s.length());
623 * Replace all occurrences of this pattern in String s beginning with position
624 * pos according to the ReplaceRule.
626 * @see com.stevesoft.pat.ReplaceRule
627 * @see com.stevesoft.pat.Regex#getReplaceRule()
629 public String replaceAllFrom(String s, int pos)
631 return _getReplacer().replaceAllRegion(s, this, pos, s.length())
636 * Replace all occurrences of this pattern in String s beginning with position
637 * start and ending with end according to the ReplaceRule.
639 * @see com.stevesoft.pat.ReplaceRule
640 * @see com.stevesoft.pat.Regex#getReplaceRule()
642 public String replaceAllRegion(String s, int start, int end)
644 return _getReplacer().replaceAllRegion(s, this, start, end).toString();
648 * By default, the escape character is the backslash, but you can make it
649 * anything you want by setting this variable.
651 public char esc = Pattern.ESC;
654 * This method compiles a regular expression, making it possible to call the
655 * search or matchAt methods.
657 * @exception com.stevesoft.pat.RegSyntax
658 * is thrown if a syntax error is encountered in the
659 * pattern. For example, "x{3,1}" or "*a" are not valid
661 * @see com.stevesoft.pat.Regex#search
662 * @see com.stevesoft.pat.Regex#matchAt
664 public void compile(String prepat) throws RegSyntax
666 String postpat = parsePerl.codify(prepat, true);
667 String pat = postpat == null ? prepat : postpat;
670 dontMatchInQuotes = false;
671 Rthings mk = new Rthings(this);
677 minMatch = new patInt(0);
678 StrPos sp = new StrPos(pat, 0);
679 if (sp.incMatch("(?e="))
685 newpat = reEscape(pat.substring(6), newEsc, Pattern.ESC);
688 else if (esc != Pattern.ESC)
690 newpat = reEscape(pat, esc, Pattern.ESC);
692 thePattern = _compile(newpat, mk);
693 numSubs_ = mk.val - offset;
698 * If a Regex is compared against a Regex, a check is done to see that the
699 * patterns are equal as well as the most recent match. If a Regex is compared
700 * with a RegRes, only the result of the most recent match is compared.
702 public boolean equals(Object o)
704 if (o instanceof Regex)
706 if (toString().equals(o.toString()))
708 return super.equals(o);
717 return super.equals(o);
721 // prep sets global variables of class
722 // Pattern so that it can access them
723 // during an attempt at a match
724 Pthings pt = new Pthings();
726 final Pthings prep(StringLike s)
729 pt.lastPos = matchedTo();
734 if ((s == null ? null : s.unwrap()) != (src == null ? null : s.unwrap()))
739 pt.dotDoesntMatchCR = dotDoesntMatchCR && (!sFlag);
740 pt.mFlag = (mFlag | defaultMFlag);
741 pt.ignoreCase = ignoreCase;
743 if (pt.marks != null)
745 for (int i = 0; i < pt.marks.length; i++)
751 pt.nMarks = numSubs_;
753 if (dontMatchInQuotes)
765 * Attempt to match a Pattern beginning at a specified location within the
768 * @see com.stevesoft.pat.Regex#search
770 public boolean matchAt(String s, int start_pos)
772 return _search(s, start_pos, start_pos);
776 * Attempt to match a Pattern beginning at a specified location within the
779 * @see com.stevesoft.pat.Regex#search
781 public boolean matchAtLike(StringLike s, int start_pos)
783 return _searchLike(s, start_pos, start_pos);
787 * Search through a String for the first occurrence of a match.
789 * @see com.stevesoft.pat.Regex#searchFrom
790 * @see com.stevesoft.pat.Regex#matchAt
792 public boolean search(String s)
796 throw new NullPointerException(MessageManager.getString("exception.null_string_given_to_regex_search"));
798 return _search(s, 0, s.length());
801 public boolean searchLike(StringLike sl)
805 throw new NullPointerException(MessageManager.getString("exception.null_string_like_given_to_regex_search"));
807 return _searchLike(sl, 0, sl.length());
810 public boolean reverseSearch(String s)
814 throw new NullPointerException(MessageManager.getString("exception.null_string_given_to_regex_reverse_search"));
816 return _reverseSearch(s, 0, s.length());
819 public boolean reverseSearchLike(StringLike sl)
823 throw new NullPointerException(MessageManager.getString("exception.null_string_like_given_to_regex_reverse_search"));
825 return _reverseSearchLike(sl, 0, sl.length());
829 * Search through a String for the first occurrence of a match, but start at
836 public boolean searchFrom(String s, int start)
840 throw new NullPointerException(MessageManager.getString("exception.null_string_like_given_to_regex_search_from"));
842 return _search(s, start, s.length());
845 public boolean searchFromLike(StringLike s, int start)
849 throw new NullPointerException(MessageManager.getString("exception.null_string_like_given_to_regex_search_from"));
851 return _searchLike(s, start, s.length());
855 * Search through a region of a String for the first occurrence of a match.
857 public boolean searchRegion(String s, int start, int end)
861 throw new NullPointerException(MessageManager.getString("exception.null_string_like_given_to_regex_search_region"));
863 return _search(s, start, end);
867 * Set this to change the default behavior of the "." pattern. By default it
868 * now matches perl's behavior and fails to match the '\n' character.
870 public static boolean dotDoesntMatchCR = true;
876 boolean gFlag = false;
878 /** Set the 'g' flag */
879 public void setGFlag(boolean b)
884 /** Get the state of the 'g' flag. */
885 public boolean getGFlag()
890 boolean sFlag = false;
892 /** Get the state of the sFlag */
893 public boolean getSFlag()
898 boolean mFlag = false;
900 /** Get the state of the mFlag */
901 public boolean getMFlag()
906 final boolean _search(String s, int start, int end)
908 return _searchLike(new StringWrap(s), start, end);
911 final boolean _searchLike(StringLike s, int start, int end)
913 if (gFlag && gFlagto > 0 && gFlags != null
914 && s.unwrap() == gFlags.unwrap())
920 Pthings pt = prep(s);
922 int up = (minMatch == null ? end : end - minMatch.i);
924 if (up < start && end >= start)
931 for (int i = start; i <= up; i++)
933 charsMatched_ = thePattern.matchAt(s, i, pt);
934 if (charsMatched_ >= 0)
936 matchFrom_ = thePattern.mfrom;
938 gFlagto = matchFrom_ + charsMatched_;
940 return didMatch_ = true;
947 for (int i = start; i <= up; i++)
949 i = skipper.find(src, i, up);
952 charsMatched_ = matchFrom_ = -1;
953 return didMatch_ = false;
955 charsMatched_ = thePattern.matchAt(s, i, pt);
956 if (charsMatched_ >= 0)
958 matchFrom_ = thePattern.mfrom;
960 gFlagto = matchFrom_ + charsMatched_;
962 return didMatch_ = true;
966 return didMatch_ = false;
970 * final boolean _search(LongStringLike s,long start,long end) { if(gFlag &&
971 * gFlagto > 0 && s==gFlags) start = gFlagto; gFlags = null;
973 * Pthings pt=prep(s);
975 * int up = end;//(minMatch == null ? end : end-minMatch.i);
977 * if(up < start && end >= start) up = start;
979 * if(skipper == null) { for(long i=start;i<=up;i++) { charsMatched_ =
980 * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ =
981 * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_;
982 * return didMatch_=true; } } } else { pt.no_check = true; for(long i=start;i<=up;i++) {
983 * i = skipper.find(src,i,up); if(i<0) { charsMatched_ = matchFrom_ = -1;
984 * return didMatch_ = false; } charsMatched_ = thePattern.matchAt(s,i,pt);
985 * if(charsMatched_ >= 0) { matchFrom_ = thePattern.mfrom; marks = pt.marks;
986 * gFlagto = matchFrom_+charsMatched_; gFlags = s; return didMatch_=true; }
987 * else { i = s.adjustIndex(i); up = s.adjustEnd(i); } } } return
991 boolean _reverseSearch(String s, int start, int end)
993 return _reverseSearchLike(new StringWrap(s), start, end);
996 boolean _reverseSearchLike(StringLike s, int start, int end)
998 if (gFlag && gFlagto > 0 && s.unwrap() == gFlags.unwrap())
1003 Pthings pt = prep(s);
1004 for (int i = end; i >= start; i--)
1006 charsMatched_ = thePattern.matchAt(s, i, pt);
1007 if (charsMatched_ >= 0)
1009 matchFrom_ = thePattern.mfrom;
1011 gFlagto = matchFrom_ - 1;
1013 return didMatch_ = true;
1016 return didMatch_ = false;
1019 // This routine sets the cbits variable
1020 // of class Pattern. Cbits is true for
1021 // the bit corresponding to a character inside
1023 static StringLike lasts = null;
1025 static BitSet lastbs = null;
1027 static void setCbits(StringLike s, Pthings pt)
1034 BitSet bs = new BitSet(s.length());
1036 boolean setBit = false;
1037 for (int i = 0; i < s.length(); i++)
1043 char c = s.charAt(i);
1044 if (!setBit && c == '"')
1050 else if (!setBit && c == '\'')
1056 else if (setBit && c == qc)
1060 else if (setBit && c == '\\' && i + 1 < s.length())
1069 pt.cbits = lastbs = bs;
1073 // // Wanted user to over-ride this in alpha version,
1074 // // but it wasn't really necessary because of this trick:
1079 // return (Regex) getClass().newInstance();
1080 // } catch (InstantiationException ie)
1083 // } catch (IllegalAccessException iae)
1090 * Only needed for creating your own extensions of Regex. This method adds the
1091 * next Pattern in the chain of patterns or sets the Pattern if it is the
1094 protected void add(Pattern p2)
1108 * You only need to use this method if you are creating your own extensions to
1109 * Regex. compile1 compiles one Pattern element, it can be over-ridden to
1110 * allow the Regex compiler to understand new syntax. See deriv.java for an
1111 * example. This routine is the heart of class Regex. Rthings has one integer
1112 * member called intValue, it is used to keep track of the number of ()'s in
1115 * @exception com.stevesoft.pat.RegSyntax
1116 * is thrown when a nonsensical pattern is supplied. For
1117 * example, a pattern beginning with *.
1119 protected void compileSP(StrPos sp, Rthings mk) throws RegSyntax
1124 add(matchBracket(sp));
1126 else if (sp.match('|'))
1134 p = new NullPattern();
1139 else if (sp.incMatch("(?<"))
1141 patInt i = sp.getPatInt();
1144 RegSyntaxError.endItAll("No int after (?<");
1146 add(new Backup(i.intValue()));
1149 RegSyntaxError.endItAll("No ) after (?<");
1152 else if (sp.incMatch("(?>"))
1154 patInt i = sp.getPatInt();
1157 RegSyntaxError.endItAll("No int after (?>");
1159 add(new Backup(-i.intValue()));
1162 RegSyntaxError.endItAll("No ) after (?<");
1165 else if (sp.incMatch("(?@"))
1173 RegSyntaxError.endItAll("(?@ does not have closing paren");
1175 add(new Group(op, cl));
1177 else if (sp.incMatch("(?#"))
1179 while (!sp.match(')'))
1184 else if (sp.dontMatch && sp.c == 'w')
1186 // Regex r = new Regex();
1187 // r._compile("[a-zA-Z0-9_]",mk);
1188 // add(new Goop("\\w",r.thePattern));
1189 Bracket b = new Bracket(false);
1190 b.addOr(new Range('a', 'z'));
1191 b.addOr(new Range('A', 'Z'));
1192 b.addOr(new Range('0', '9'));
1193 b.addOr(new oneChar('_'));
1196 else if (sp.dontMatch && sp.c == 'G')
1200 else if (sp.dontMatch && sp.c == 's')
1202 // Regex r = new Regex();
1203 // r._compile("[ \t\n\r\b]",mk);
1204 // add(new Goop("\\s",r.thePattern));
1205 Bracket b = new Bracket(false);
1206 b.addOr(new oneChar((char) 32));
1207 b.addOr(new Range((char) 8, (char) 10));
1208 b.addOr(new oneChar((char) 13));
1211 else if (sp.dontMatch && sp.c == 'd')
1213 // Regex r = new Regex();
1214 // r._compile("[0-9]",mk);
1215 // add(new Goop("\\d",r.thePattern));
1216 Range digit = new Range('0', '9');
1217 digit.printBrackets = true;
1220 else if (sp.dontMatch && sp.c == 'W')
1222 // Regex r = new Regex();
1223 // r._compile("[^a-zA-Z0-9_]",mk);
1224 // add(new Goop("\\W",r.thePattern));
1225 Bracket b = new Bracket(true);
1226 b.addOr(new Range('a', 'z'));
1227 b.addOr(new Range('A', 'Z'));
1228 b.addOr(new Range('0', '9'));
1229 b.addOr(new oneChar('_'));
1232 else if (sp.dontMatch && sp.c == 'S')
1234 // Regex r = new Regex();
1235 // r._compile("[^ \t\n\r\b]",mk);
1236 // add(new Goop("\\S",r.thePattern));
1237 Bracket b = new Bracket(true);
1238 b.addOr(new oneChar((char) 32));
1239 b.addOr(new Range((char) 8, (char) 10));
1240 b.addOr(new oneChar((char) 13));
1243 else if (sp.dontMatch && sp.c == 'D')
1245 // Regex r = new Regex();
1246 // r._compile("[^0-9]",mk);
1247 // add(new Goop("\\D",r.thePattern));
1248 Bracket b = new Bracket(true);
1249 b.addOr(new Range('0', '9'));
1252 else if (sp.dontMatch && sp.c == 'B')
1254 Regex r = new Regex(null, "");
1255 r._compile("(?!" + back_slash + "b)", mk);
1258 else if (isOctalString(sp))
1262 d = 8 * d + sp.c - '0';
1263 StrPos sp2 = new StrPos(sp);
1265 if (isOctalDigit(sp2, false))
1268 d = 8 * d + sp.c - '0';
1270 add(new oneChar((char) d));
1272 else if (sp.dontMatch && sp.c >= '1' && sp.c <= '9')
1274 int iv = sp.c - '0';
1275 StrPos s2 = new StrPos(sp);
1277 if (!s2.dontMatch && s2.c >= '0' && s2.c <= '9')
1279 iv = 10 * iv + (s2.c - '0');
1282 add(new BackMatch(iv));
1284 else if (sp.dontMatch && sp.c == 'b')
1286 add(new Boundary());
1288 else if (sp.match('\b'))
1290 add(new Boundary());
1292 else if (sp.match('$'))
1296 else if (sp.dontMatch && sp.c == 'Z')
1298 add(new End(false));
1300 else if (sp.match('.'))
1304 else if (sp.incMatch("(??"))
1306 StringBuffer sb = new StringBuffer();
1307 StringBuffer sb2 = new StringBuffer();
1308 while (!sp.match(')') && !sp.match(':'))
1313 if (sp.incMatch(":"))
1315 while (!sp.match(')'))
1321 String sbs = sb.toString();
1322 if (validators.get(sbs) instanceof String)
1324 String pat = (String) validators.get(sbs);
1325 Regex r = new Regex(null, "");
1326 Rthings rth = new Rthings(this);
1327 rth.noBackRefs = true;
1328 r._compile(pat, rth);
1333 Custom cm = new Custom(sb.toString());
1336 Validator v2 = cm.v.arg(sb2.toString());
1339 v2.argsave = sb2.toString();
1340 String p = cm.v.pattern;
1344 Regex r = new Regex(null, "");
1345 Rthings rth = new Rthings(this);
1346 rth.noBackRefs = true;
1347 r._compile(cm.v.pattern, rth);
1348 cm.sub = r.thePattern;
1349 cm.sub.add(new CustomEndpoint(cm));
1350 cm.sub.setParent(cm);
1355 else if (sp.match('('))
1358 Regex r = new Regex(null, "");
1361 if (sp.incMatch("?:"))
1365 else if (sp.incMatch("?="))
1367 r.or = new lookAhead(false);
1369 else if (sp.incMatch("?!"))
1371 r.or = new lookAhead(true);
1373 else if (sp.match('?'))
1380 mk.ignoreCase = true;
1384 mk.dontMatchInQuotes = true;
1388 mk.optimizeMe = true;
1403 } while (!sp.match(')') && !sp.eos);
1406 if (sp.eos) // throw new RegSyntax
1408 RegSyntaxError.endItAll("Unclosed ()");
1412 { // just ordinary parenthesis
1413 r.or = mk.noBackRefs ? new Or() : new OrMark(mk.val++);
1417 add(r._compileSP(sp, mk));
1420 else if (sp.match('^'))
1422 add(new Start(true));
1424 else if (sp.dontMatch && sp.c == 'A')
1426 add(new Start(false));
1428 else if (sp.match('*'))
1430 addMulti(new patInt(0), new patInf());
1432 else if (sp.match('+'))
1434 addMulti(new patInt(1), new patInf());
1436 else if (sp.match('?'))
1438 addMulti(new patInt(0), new patInt(1));
1440 else if (sp.match('{'))
1442 boolean bad = false;
1443 StrPos sp2 = new StrPos(sp);
1444 // StringBuffer sb = new StringBuffer();
1446 patInt i1 = sp.getPatInt();
1457 * RegSyntaxError.endItAll( "String \"{"+i2+ "\" should be followed
1469 i2 = sp.getPatInt();
1472 if (i1 == null || i2 == null)
1475 * throw new RegSyntax("Badly formatted Multi: " +"{"+i1+","+i2+"}");
1482 add(new oneChar(sp.c));
1489 else if (sp.escMatch('x') && next2Hex(sp))
1492 int d = getHexDigit(sp);
1494 d = 16 * d + getHexDigit(sp);
1495 add(new oneChar((char) d));
1497 else if (sp.escMatch('c'))
1500 if (sp.c < Ctrl.cmap.length)
1502 add(new oneChar(Ctrl.cmap[sp.c]));
1506 add(new oneChar(sp.c));
1509 else if (sp.escMatch('f'))
1511 add(new oneChar((char) 12));
1513 else if (sp.escMatch('a'))
1515 add(new oneChar((char) 7));
1517 else if (sp.escMatch('t'))
1519 add(new oneChar('\t'));
1521 else if (sp.escMatch('n'))
1523 add(new oneChar('\n'));
1525 else if (sp.escMatch('r'))
1527 add(new oneChar('\r'));
1529 else if (sp.escMatch('b'))
1531 add(new oneChar('\b'));
1533 else if (sp.escMatch('e'))
1535 add(new oneChar((char) 27));
1539 add(new oneChar(sp.c));
1542 RegSyntaxError.endItAll("Unmatched right paren in pattern");
1547 // compiles all Pattern elements, internal method
1548 private Pattern _compile(String pat, Rthings mk) throws RegSyntax
1551 sFlag = mFlag = ignoreCase = gFlag = false;
1552 StrPos sp = new StrPos(pat, 0);
1553 thePattern = _compileSP(sp, mk);
1562 Pattern _compileSP(StrPos sp, Rthings mk) throws RegSyntax
1564 while (!(sp.eos || (or != null && sp.match(')'))))
1573 else if (sp.eos && mk.parenLevel != 0)
1575 RegSyntaxError.endItAll("Unclosed Parenthesis! lvl=" + mk.parenLevel);
1581 p = new NullPattern();
1586 return p == null ? new NullPattern() : p;
1589 // add a multi object to the end of the chain
1590 // which applies to the last object
1591 void addMulti(patInt i1, patInt i2) throws RegSyntax
1593 Pattern last, last2;
1594 for (last = p; last != null && last.next != null; last = last.next)
1598 if (last == null || last == p)
1604 for (last2 = p; last2.next != last; last2 = last2.next)
1609 if (last instanceof Multi && i1.intValue() == 0 && i2.intValue() == 1)
1611 ((Multi) last).matchFewest = true;
1613 else if (last instanceof FastMulti && i1.intValue() == 0
1614 && i2.intValue() == 1)
1616 ((FastMulti) last).matchFewest = true;
1618 else if (last instanceof DotMulti && i1.intValue() == 0
1619 && i2.intValue() == 1)
1621 ((DotMulti) last).matchFewest = true;
1623 else if (last instanceof Multi || last instanceof DotMulti
1624 || last instanceof FastMulti)
1626 throw new RegSyntax("Syntax error.");
1628 else if (last2 == null)
1630 p = mkMulti(i1, i2, p);
1634 last2.next = mkMulti(i1, i2, last);
1638 final static Pattern mkMulti(patInt lo, patInt hi, Pattern p)
1641 if (p instanceof Any && p.next == null)
1643 return (Pattern) new DotMulti(lo, hi);
1645 return RegOpt.safe4fm(p) ? (Pattern) new FastMulti(lo, hi, p)
1646 : (Pattern) new Multi(lo, hi, p);
1649 // process the bracket operator
1650 Pattern matchBracket(StrPos sp) throws RegSyntax
1655 ret = new Bracket(true);
1660 ret = new Bracket(false);
1664 // throw new RegSyntax
1665 RegSyntaxError.endItAll("Unmatched []");
1668 while (!sp.eos && !sp.match(']'))
1670 StrPos s1 = new StrPos(sp);
1672 StrPos s1_ = new StrPos(s1);
1674 if (s1.match('-') && !s1_.match(']'))
1676 StrPos s2 = new StrPos(s1);
1680 ret.addOr(new Range(sp.c, s2.c));
1685 else if (sp.escMatch('Q'))
1688 while (!sp.escMatch('E'))
1690 ret.addOr(new oneChar(sp.c));
1694 else if (sp.escMatch('d'))
1696 ret.addOr(new Range('0', '9'));
1698 else if (sp.escMatch('s'))
1700 ret.addOr(new oneChar((char) 32));
1701 ret.addOr(new Range((char) 8, (char) 10));
1702 ret.addOr(new oneChar((char) 13));
1704 else if (sp.escMatch('w'))
1706 ret.addOr(new Range('a', 'z'));
1707 ret.addOr(new Range('A', 'Z'));
1708 ret.addOr(new Range('0', '9'));
1709 ret.addOr(new oneChar('_'));
1711 else if (sp.escMatch('D'))
1713 ret.addOr(new Range((char) 0, (char) 47));
1714 ret.addOr(new Range((char) 58, (char) 65535));
1716 else if (sp.escMatch('S'))
1718 ret.addOr(new Range((char) 0, (char) 7));
1719 ret.addOr(new Range((char) 11, (char) 12));
1720 ret.addOr(new Range((char) 14, (char) 31));
1721 ret.addOr(new Range((char) 33, (char) 65535));
1723 else if (sp.escMatch('W'))
1725 ret.addOr(new Range((char) 0, (char) 64));
1726 ret.addOr(new Range((char) 91, (char) 94));
1727 ret.addOr(new oneChar((char) 96));
1728 ret.addOr(new Range((char) 123, (char) 65535));
1730 else if (sp.escMatch('x') && next2Hex(sp))
1733 int d = getHexDigit(sp);
1735 d = 16 * d + getHexDigit(sp);
1736 ret.addOr(new oneChar((char) d));
1738 else if (sp.escMatch('a'))
1740 ret.addOr(new oneChar((char) 7));
1742 else if (sp.escMatch('f'))
1744 ret.addOr(new oneChar((char) 12));
1746 else if (sp.escMatch('e'))
1748 ret.addOr(new oneChar((char) 27));
1750 else if (sp.escMatch('n'))
1752 ret.addOr(new oneChar('\n'));
1754 else if (sp.escMatch('t'))
1756 ret.addOr(new oneChar('\t'));
1758 else if (sp.escMatch('r'))
1760 ret.addOr(new oneChar('\r'));
1762 else if (sp.escMatch('c'))
1765 if (sp.c < Ctrl.cmap.length)
1767 ret.addOr(new oneChar(Ctrl.cmap[sp.c]));
1771 ret.addOr(new oneChar(sp.c));
1774 else if (isOctalString(sp))
1778 d = 8 * d + sp.c - '0';
1779 StrPos sp2 = new StrPos(sp);
1781 if (isOctalDigit(sp2, false))
1784 d = 8 * d + sp.c - '0';
1786 ret.addOr(new oneChar((char) d));
1790 ret.addOr(new oneChar(sp.c));
1798 * Converts the stored Pattern to a String -- this is a decompile. Note that
1799 * \t and \n will really print out here, not just the two-character
1800 * representations. Also be prepared to see some strange output if your
1801 * characters are not printable.
1803 public String toString()
1805 // if (false && thePattern == null)
1811 StringBuffer sb = new StringBuffer();
1812 if (esc != Pattern.ESC)
1818 if (gFlag || mFlag || !dotDoesntMatchCR || sFlag || ignoreCase
1819 || dontMatchInQuotes || optimized())
1830 if (sFlag || !dotDoesntMatchCR)
1834 if (dontMatchInQuotes)
1848 String patstr = thePattern.toString();
1849 if (esc != Pattern.ESC)
1851 patstr = reEscape(patstr, Pattern.ESC, esc);
1854 return sb.toString();
1858 // Re-escape Pattern, allows us to use a different escape
1860 static String reEscape(String s, char oldEsc, char newEsc)
1862 if (oldEsc == newEsc)
1867 StringBuffer sb = new StringBuffer();
1868 for (i = 0; i < s.length(); i++)
1870 if (s.charAt(i) == oldEsc && i + 1 < s.length())
1872 if (s.charAt(i + 1) == oldEsc)
1879 sb.append(s.charAt(i + 1));
1883 else if (s.charAt(i) == newEsc)
1890 sb.append(s.charAt(i));
1893 return sb.toString();
1897 * This method implements FilenameFilter, allowing one to use a Regex to
1898 * search through a directory using File.list. There is a FileRegex now that
1901 * @see com.stevesoft.pat.FileRegex
1903 public boolean accept(File dir, String s)
1908 /** The version of this package */
1909 final static public String version()
1911 return "lgpl release 1.5.3";
1915 * Once this method is called, the state of variables ignoreCase and
1916 * dontMatchInQuotes should not be changed as the results will be
1917 * unpredictable. However, search and matchAt will run more quickly. Note that
1918 * you can check to see if the pattern has been optimized by calling the
1919 * optimized() method.
1921 * This method will attempt to rewrite your pattern in a way that makes it
1922 * faster (not all patterns execute at the same speed). In general, "(?: ... )"
1923 * will be faster than "( ... )" so if you don't need the backreference, you
1924 * should group using the former pattern.
1926 * It will also introduce new pattern elements that you can't get to
1927 * otherwise, for example if you have a large table of strings, i.e. the
1928 * months of the year "(January|February|...)" optimize() will make a
1929 * Hashtable that takes it to the next appropriate pattern element --
1930 * eliminating the need for a linear search.
1932 * @see com.stevesoft.pat.Regex#optimized
1933 * @see com.stevesoft.pat.Regex#ignoreCase
1934 * @see com.stevesoft.pat.Regex#dontMatchInQuotes
1935 * @see com.stevesoft.pat.Regex#matchAt
1936 * @see com.stevesoft.pat.Regex#search
1938 public void optimize()
1940 if (optimized() || thePattern == null)
1944 minMatch = new patInt(0); // thePattern.countMinChars();
1945 thePattern = RegOpt.opt(thePattern, ignoreCase, dontMatchInQuotes);
1946 skipper = Skip.findSkipRegex(this);
1947 // RegOpt.setParents(this);
1954 * This function returns true if the optimize method has been called.
1956 public boolean optimized()
1958 return minMatch != null;
1962 * A bit of syntactic sugar for those who want to make their code look more
1963 * perl-like. To use this initialize your Regex object by saying:
1966 * Regex r1 = Regex.perlCode("s/hello/goodbye/");
1967 * Regex r2 = Regex.perlCode("s'fish'frog'i");
1968 * Regex r3 = Regex.perlCode("m'hello'");
1971 * The i for ignoreCase is supported in this syntax, as well as m, s, and x.
1972 * The g flag is a bit of a special case.
1974 * If you wish to replace all occurrences of a pattern, you do not put a 'g' in
1975 * the perlCode, but call Regex's replaceAll method.
1977 * If you wish to simply and only do a search for r2's pattern, you can do
1978 * this by calling the searchFrom method repeatedly, or by calling
1979 * search repeatedly if the g flag is set.
1981 * Note: Currently perlCode does <em>not</em> support the (?e=#) syntax for
1982 * changing the escape character.
1985 public static Regex perlCode(String s)
1987 // this file is big enough, see parsePerl.java
1988 // for this function.
1989 return parsePerl.parse(s);
1992 static final char back_slash = '\\';
1995 * Checks to see if there are only literal and no special pattern elements in
1998 public boolean isLiteral()
2000 Pattern x = thePattern;
2003 if (x instanceof oneChar)
2007 else if (x instanceof Skipped)
2021 * You only need to know about this if you are inventing your own pattern
2024 public patInt countMinChars()
2026 return thePattern.countMinChars();
2030 * You only need to know about this if you are inventing your own pattern
2033 public patInt countMaxChars()
2035 return thePattern.countMaxChars();
2038 boolean isHexDigit(StrPos sp)
2042 && ((sp.c >= '0' && sp.c <= '9')
2043 || (sp.c >= 'a' && sp.c <= 'f') || (sp.c >= 'A' && sp.c <= 'F'));
2047 boolean isOctalDigit(StrPos sp, boolean first)
2049 boolean r = !sp.eos && !(first ^ sp.dontMatch) && sp.c >= '0'
2054 int getHexDigit(StrPos sp)
2056 if (sp.c >= '0' && sp.c <= '9')
2060 if (sp.c >= 'a' && sp.c <= 'f')
2062 return sp.c - 'a' + 10;
2064 return sp.c - 'A' + 10;
2067 boolean next2Hex(StrPos sp)
2069 StrPos sp2 = new StrPos(sp);
2071 if (!isHexDigit(sp2))
2076 if (!isHexDigit(sp2))
2083 boolean isOctalString(StrPos sp)
2085 if (!isOctalDigit(sp, true))
2089 StrPos sp2 = new StrPos(sp);
2091 if (!isOctalDigit(sp2, false))