2 // This software is now distributed according to
3 // the Lesser Gnu Public License. Please see
4 // http://www.gnu.org/copyleft/lesser.txt for
8 package com.stevesoft.pat;
10 import jalview.util.MessageManager;
13 import java.io.FilenameFilter;
14 import java.util.BitSet;
15 import java.util.Hashtable;
17 import com.stevesoft.pat.wrap.StringWrap;
19 /** Matches a Unicode punctuation character. */
// Validator that accepts a single punctuation character (per Prop.isPunct)
// at index `from`.
20 class UnicodePunct extends UniValidator
// Returns `to` on success and -1 on failure. The bounds test short-circuits
// before charAt is called, and the whole && expression is the condition of
// the ?: operator, so an out-of-range `from` yields -1.
22 public int validate(StringLike s, int from, int to)
24 return from < s.length() && Prop.isPunct(s.charAt(from)) ? to : -1;
28 /** Matches a Unicode white space character. */
// Validator that accepts a single white space character (per Prop.isWhite)
// at index `from`.
29 class UnicodeWhite extends UniValidator
// Returns `to` when the character exists and is white space, otherwise -1;
// the length check runs first so charAt is never reached past the end.
31 public int validate(StringLike s, int from, int to)
33 return from < s.length() && Prop.isWhite(s.charAt(from)) ? to : -1;
38 * Matches a character that is not a Unicode punctuation character.
// Negated counterpart of the punctuation validator: accepts a character at
// `from` for which Prop.isPunct is false.
40 class NUnicodePunct extends UniValidator
// Returns `to` on success and -1 otherwise. An out-of-range `from` still
// fails, so the negation never succeeds at the end of the input.
42 public int validate(StringLike s, int from, int to)
44 return from < s.length() && !Prop.isPunct(s.charAt(from)) ? to : -1;
49 * Matches a character that is not a Unicode white space character.
// Negated counterpart of the white space validator: accepts a character at
// `from` for which Prop.isWhite is false.
51 class NUnicodeWhite extends UniValidator
// Returns `to` on success and -1 otherwise; an out-of-range `from` fails
// rather than counting as "not white space".
53 public int validate(StringLike s, int from, int to)
55 return from < s.length() && !Prop.isWhite(s.charAt(from)) ? to : -1;
59 /** Matches a Unicode word character: an alphanumeric or underscore. */
// Validator for a "word" character: alphabetic (Prop.isAlphabetic), decimal
// digit (Prop.isDecimalDigit) or the underscore.
60 class UnicodeW extends UniValidator
62 public int validate(StringLike s, int from, int to)
// An index at or beyond the end of s is handled before charAt is called.
64 if (from >= s.length())
68 char c = s.charAt(from);
// `to` is the success value of the conditional; the failure value is on the
// continuation of this statement.
69 return (Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to
74 /** Matches a character that is not a Unicode alphanumeric or underscore. */
// Negated word-character validator: accepts a character that is neither
// alphabetic, nor a decimal digit, nor the underscore.
75 class NUnicodeW extends UniValidator
77 public int validate(StringLike s, int from, int to)
// An index at or beyond the end of s is handled before charAt is called.
79 if (from >= s.length())
83 char c = s.charAt(from);
// The three-way test is negated as a whole; `to` is the success value.
84 return !(Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to
89 /** Matches a Unicode decimal digit. */
// Validator that accepts a single decimal digit (per Prop.isDecimalDigit)
// at index `from`.
90 class UnicodeDigit extends UniValidator
92 public int validate(StringLike s, int from, int to)
// The bounds test short-circuits before charAt; `to` is the success value
// of the conditional.
94 return from < s.length() && Prop.isDecimalDigit(s.charAt(from)) ? to
99 /** Matches a character that is not a Unicode digit. */
// Negated digit validator: accepts a character at `from` for which
// Prop.isDecimalDigit is false.
100 class NUnicodeDigit extends UniValidator
102 public int validate(StringLike s, int from, int to)
// The character must exist: when `from` is out of range the && condition is
// false, so the negation does not succeed at the end of the input.
104 return from < s.length() && !Prop.isDecimalDigit(s.charAt(from)) ? to
109 /** Matches a Unicode math character. */
// Validator that accepts a single math character (per Prop.isMath) at
// index `from`.
110 class UnicodeMath extends UniValidator
// Returns `to` on success, -1 on failure or when `from` is out of range.
112 public int validate(StringLike s, int from, int to)
114 return from < s.length() && Prop.isMath(s.charAt(from)) ? to : -1;
118 /** Matches a non-math Unicode character. */
// Negated math validator: accepts a character at `from` for which
// Prop.isMath is false.
119 class NUnicodeMath extends UniValidator
// Returns `to` on success and -1 otherwise; an out-of-range `from` fails.
121 public int validate(StringLike s, int from, int to)
123 return from < s.length() && !Prop.isMath(s.charAt(from)) ? to : -1;
127 /** Matches a Unicode currency symbol. */
// Validator that accepts a single currency symbol (per Prop.isCurrency) at
// index `from`.
128 class UnicodeCurrency extends UniValidator
// Returns `to` on success, -1 on failure or when `from` is out of range.
130 public int validate(StringLike s, int from, int to)
132 return from < s.length() && Prop.isCurrency(s.charAt(from)) ? to : -1;
136 /** Matches a non-currency symbol Unicode character. */
// Negated currency validator: accepts a character at `from` for which
// Prop.isCurrency is false.
137 class NUnicodeCurrency extends UniValidator
// Returns `to` on success and -1 otherwise; an out-of-range `from` fails.
139 public int validate(StringLike s, int from, int to)
141 return from < s.length() && !Prop.isCurrency(s.charAt(from)) ? to : -1;
145 /** Matches a Unicode alphabetic character. */
// Validator that accepts a single alphabetic character (per
// Prop.isAlphabetic) at index `from`.
146 class UnicodeAlpha extends UniValidator
// Returns `to` on success, -1 on failure or when `from` is out of range.
148 public int validate(StringLike s, int from, int to)
150 return from < s.length() && Prop.isAlphabetic(s.charAt(from)) ? to : -1;
154 /** Matches a non-alphabetic Unicode character. */
// Negated alphabetic validator: accepts a character at `from` for which
// Prop.isAlphabetic is false.
155 class NUnicodeAlpha extends UniValidator
157 public int validate(StringLike s, int from, int to)
// The character must exist for the negated test to succeed; `to` is the
// success value of the conditional.
159 return from < s.length() && !Prop.isAlphabetic(s.charAt(from)) ? to
164 /** Matches an upper case Unicode character. */
// Validator that accepts a single upper case character at index `from`.
165 class UnicodeUpper extends UniValidator
// Returns `to` on success, -1 on failure or when `from` is out of range.
167 public int validate(StringLike s, int from, int to)
169 return from < s.length() && isUpper(s.charAt(from)) ? to : -1;
// A character counts as upper case when CaseMgr's upper-case mapping leaves
// it unchanged AND its lower-case mapping differs from it. The second test
// excludes caseless characters, for which both mappings are the identity.
172 final boolean isUpper(char c)
174 return c == CaseMgr.toUpperCase(c) && c != CaseMgr.toLowerCase(c);
178 /** Matches a lower case Unicode character. */
// Validator that accepts a single lower case character at index `from`.
179 class UnicodeLower extends UniValidator
// Returns `to` on success, -1 on failure or when `from` is out of range.
181 public int validate(StringLike s, int from, int to)
183 return from < s.length() && isLower(s.charAt(from)) ? to : -1;
// Mirror image of the upper-case test: the lower-case mapping must leave the
// character unchanged while the upper-case mapping changes it, so caseless
// characters are rejected.
186 final boolean isLower(char c)
188 return c != CaseMgr.toUpperCase(c) && c == CaseMgr.toLowerCase(c);
193 * Regex provides the parser which constructs the linked list of Pattern classes
196 * For the purpose of this documentation, the fact that java interprets the
197 * backslash will be ignored. In practice, however, you will need a double
198 * backslash to obtain a string that contains a single backslash character.
199 * Thus, the example pattern "\b" should really be typed as "\\b" inside java
202 * Note that Regex is part of package "com.stevesoft.pat". To use it, simply
203 * import com.stevesoft.pat.Regex at the top of your file.
205 * Regex is made with a constructor that takes a String that defines the regular
206 * expression. Thus, for example
209 * Regex r = new Regex("[a-c]*");
212 * matches any number of characters so long as they are 'a', 'b', or 'c').
214 * To attempt to match the Pattern to a given string, you can use either the
215 * search(String) member function, or the matchAt(String,int position) member
216 * function. These functions return a boolean which tells you whether or not the
217 * thing worked, and sets the methods "charsMatched()" and "matchedFrom()" in
218 * the Regex object appropriately.
220 * The portion of the string before the match can be obtained by the left()
221 * member, and the portion after the match can be obtained by the right()
224 * Essentially, this package implements a syntax that is very much like the perl
225 * 5 regular expression syntax.
230 * Regex r = new Regex("x(a|b)y");
231 * r.matchAt("xay", 0);
232 * System.out.println("sub = " + r.stringMatched(1));
235 * The above would print "sub = a".
238 * r.left() // would return "x"
239 * r.right() // would return "y"
243 * Differences between this package and perl5:<br>
244 * The extended Pattern for setting flags, is now supported, but the flags are
245 * different. "(?i)" tells the pattern to ignore case, "(?Q)" sets the
246 * "dontMatchInQuotes" flag, and "(?iQ)" sets them both. You can change the
247 * escape character. The pattern
259 * , but note that the sequence
265 * <b>must</b> occur at the very beginning of the pattern. There may be other
266 * small differences as well. I will either make my package conform or note them
267 * as I become aware of them.
269 * This package supports additional patterns not in perl5: <center>
274 * <td>This matches all characters between the '(' character and the balancing
275 * ')' character. Thus, it will match "()" as well as "(())". The balancing
276 * characters are arbitrary, thus (?@{}) matches on "{}" and "{{}}".</td>
280 * <td>Moves the pointer backwards within the text. This allows you to make a
281 * "look behind." It fails if it attempts to move to a position before the
282 * beginning of the string. "x(?<1)" is equivalent to "(?=x)". The number, 1
283 * in this example, is the number of characters to move backwards.</td>
287 * @author Steven R. Brandt
288 * @version package com.stevesoft.pat, release 1.5.3
291 public class Regex extends RegRes implements FilenameFilter
294 * BackRefOffset gives the identity number of the first pattern. Version 1.0
295 * used zero, version 1.1 uses 1 to be more compatible with perl.
297 static int BackRefOffset = 1;
299 private static Pattern none = new NoPattern();
301 Pattern thePattern = none;
303 patInt minMatch = new patInt(0);
305 static Hashtable validators = new Hashtable();
// Registers the built-in named rules in the static `validators` table.
// Single-letter names come in pairs: the lower-case name maps to a validator
// class and the upper-case name to its N-prefixed negation; "uc" and "lc"
// map to the upper/lower case validators. Every rule uses the pattern
// "(?>1)", which compile1 turns into Backup(-1).
308 define("p", "(?>1)", new UnicodePunct());
309 define("P", "(?>1)", new NUnicodePunct());
310 define("s", "(?>1)", new UnicodeWhite());
311 define("S", "(?>1)", new NUnicodeWhite());
312 define("w", "(?>1)", new UnicodeW());
313 define("W", "(?>1)", new NUnicodeW());
314 define("d", "(?>1)", new UnicodeDigit());
315 define("D", "(?>1)", new NUnicodeDigit());
316 define("m", "(?>1)", new UnicodeMath());
317 define("M", "(?>1)", new NUnicodeMath());
318 define("c", "(?>1)", new UnicodeCurrency());
319 define("C", "(?>1)", new NUnicodeCurrency());
320 define("a", "(?>1)", new UnicodeAlpha());
321 define("A", "(?>1)", new NUnicodeAlpha());
322 define("uc", "(?>1)", new UnicodeUpper());
323 define("lc", "(?>1)", new UnicodeLower());
326 /** Set the dontMatch in quotes flag. */
327 public void setDontMatchInQuotes(boolean b)
329 dontMatchInQuotes = b;
332 /** Find out if the dontMatchInQuotes flag is enabled. */
333 public boolean getDontMatchInQuotes()
335 return dontMatchInQuotes;
338 boolean dontMatchInQuotes = false;
341 * Set the state of the ignoreCase flag. If set to true, then the pattern
342 * matcher will ignore case when searching for a match.
344 public void setIgnoreCase(boolean b)
350 * Get the state of the ignoreCase flag. Returns true if we are ignoring the
351 * case of the pattern, false otherwise.
353 public boolean getIgnoreCase()
358 boolean ignoreCase = false;
360 static boolean defaultMFlag = false;
363 * Set the default value of the m flag. If it is set to true, then the MFlag
364 * will be on for any regex search executed.
366 public static void setDefaultMFlag(boolean mFlag)
368 defaultMFlag = mFlag;
372 * Get the default value of the m flag. If it is set to true, then the MFlag
373 * will be on for any regex search executed.
375 public static boolean getDefaultMFlag()
381 * Initializes the object without a Pattern. To supply a Pattern use
384 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
391 * Create and compile a Regex, but do not throw any exceptions. If you wish to
392 * have exceptions thrown for syntax errors, you must use the Regex(void)
393 * constructor to create the Regex object, and then call the compile method.
394 * Therefore, you should only call this method when you know your pattern is
395 * right. I will probably become more like
397 * @see com.stevesoft.pat.Regex#search(java.lang.String)
398 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
400 public Regex(String s)
405 } catch (RegSyntax rs)
410 ReplaceRule rep = null;
413 * Create and compile both a Regex and a ReplaceRule.
415 * @see com.stevesoft.pat.ReplaceRule
416 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
418 public Regex(String s, String rp)
421 rep = ReplaceRule.perlCode(rp);
425 * Create and compile a Regex, but give it the ReplaceRule specified. This
426 * allows the user finer control of the Replacement process, if that is
429 * @see com.stevesoft.pat.ReplaceRule
430 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
432 public Regex(String s, ReplaceRule rp)
439 * Change the ReplaceRule of this Regex by compiling a new one using String
442 public void setReplaceRule(String rp)
444 rep = ReplaceRule.perlCode(rp);
445 repr = null; // Clear Replacer history
448 /** Change the ReplaceRule of this Regex to rp. */
449 public void setReplaceRule(ReplaceRule rp)
455 * Test to see if a custom defined rule exists.
457 * @see com.stevesoft.pat.Regex#define(java.lang.String,java.lang.String,Validator)
459 public static boolean isDefined(String nm)
461 return validators.get(nm) != null;
465 * Removes a custom defined rule.
467 * @see com.stevesoft.pat.Regex#define(java.lang.String,java.lang.String,Validator)
469 public static void undefine(String nm)
471 validators.remove(nm);
475 * Defines a method to create a new rule. See test/deriv2.java and
476 * test/deriv3.java for examples of how to use it.
// Registers validator v under the name nm in the shared static `validators`
// table, replacing any previous entry with that name.
// NOTE(review): `pat` is not used on the statement visible here; presumably
// it is attached to v beforehand (compile1 reads cm.v.pattern) - confirm.
478 public static void define(String nm, String pat, Validator v)
481 validators.put(nm, v);
485 * Defines a shorthand for a pattern. The pattern will be invoked by a string
486 * that has the form "(??"+nm+")".
// Registers a pattern shorthand: the pattern text itself is stored as the
// table value. compile1 tells shorthands apart from validators by testing
// the stored value with `instanceof String`.
488 public static void define(String nm, String pat)
490 validators.put(nm, pat);
493 /** Get the current ReplaceRule. */
494 public ReplaceRule getReplaceRule()
499 Replacer repr = null;
// Returns the cached Replacer, creating and caching one on first use.
// setReplaceRule(String) sets `repr` back to null, which forces a fresh
// Replacer on the next call.
501 final Replacer _getReplacer()
503 return repr == null ? repr = new Replacer() : repr;
506 public Replacer getReplacer()
510 repr = new Replacer();
518 * Replace the first occurrence of this pattern in String s according to the
521 * @see com.stevesoft.pat.ReplaceRule
522 * @see com.stevesoft.pat.Regex#getReplaceRule()
524 public String replaceFirst(String s)
526 return _getReplacer().replaceFirstRegion(s, this, 0, s.length())
531 * Replace the first occurrence of this pattern in String s beginning with
532 * position pos according to the ReplaceRule.
534 * @see com.stevesoft.pat.ReplaceRule
535 * @see com.stevesoft.pat.Regex#getReplaceRule()
537 public String replaceFirstFrom(String s, int pos)
539 return _getReplacer().replaceFirstRegion(s, this, pos, s.length())
544 * Replace the first occurrence of this pattern in String s beginning with
545 * position start and ending with end according to the ReplaceRule.
547 * @see com.stevesoft.pat.ReplaceRule
548 * @see com.stevesoft.pat.Regex#getReplaceRule()
550 public String replaceFirstRegion(String s, int start, int end)
552 return _getReplacer().replaceFirstRegion(s, this, start, end)
557 * Replace all occurrences of this pattern in String s according to the
560 * @see com.stevesoft.pat.ReplaceRule
561 * @see com.stevesoft.pat.Regex#getReplaceRule()
563 public String replaceAll(String s)
565 return _getReplacer().replaceAllRegion(s, this, 0, s.length())
569 public StringLike replaceAll(StringLike s)
571 return _getReplacer().replaceAllRegion(s, this, 0, s.length());
575 * Replace all occurrences of this pattern in String s beginning with position
576 * pos according to the ReplaceRule.
578 * @see com.stevesoft.pat.ReplaceRule
579 * @see com.stevesoft.pat.Regex#getReplaceRule()
581 public String replaceAllFrom(String s, int pos)
583 return _getReplacer().replaceAllRegion(s, this, pos, s.length())
588 * Replace all occurrences of this pattern in String s beginning with position
589 * start and ending with end according to the ReplaceRule.
591 * @see com.stevesoft.pat.ReplaceRule
592 * @see com.stevesoft.pat.Regex#getReplaceRule()
594 public String replaceAllRegion(String s, int start, int end)
596 return _getReplacer().replaceAllRegion(s, this, start, end).toString();
599 /** Essentially clones the Regex object */
// Copy constructor: copies the flag fields from r, clones r's ReplaceRule,
// and clones r's pattern chain (passing a fresh Hashtable to Pattern.clone)
// instead of recompiling from r.toString().
// NOTE(review): minMatch is copied by reference, so the copy and r share one
// patInt instance. compile() replaces minMatch with a new patInt rather than
// mutating it - confirm nothing else modifies minMatch.i in place.
600 public Regex(Regex r)
603 dontMatchInQuotes = r.dontMatchInQuotes;
605 ignoreCase = r.ignoreCase;
613 rep = (ReplaceRule) r.rep.clone();
616 * try { compile(r.toString()); } catch(RegSyntax r_) {}
618 thePattern = r.thePattern.clone(new Hashtable());
619 minMatch = r.minMatch;
624 * By default, the escape character is the backslash, but you can make it
625 * anything you want by setting this variable.
627 public char esc = Pattern.ESC;
630 * This method compiles a regular expression, making it possible to call the
631 * search or matchAt methods.
633 * @exception com.stevesoft.pat.RegSyntax
634 * is thrown if a syntax error is encountered in the pattern. For
635 * example, "x{3,1}" or "*a" are not valid patterns.
636 * @see com.stevesoft.pat.Regex#search
637 * @see com.stevesoft.pat.Regex#matchAt
// Compiles `prepat` into thePattern and records the number of sub-patterns.
// Throws RegSyntax on a malformed pattern (propagated from _compile).
639 public void compile(String prepat) throws RegSyntax
// parsePerl.codify may return null; in that case the input is compiled as
// given.
641 String postpat = parsePerl.codify(prepat, true);
642 String pat = postpat == null ? prepat : postpat;
// Per-pattern state is reset before parsing.
645 dontMatchInQuotes = false;
646 Rthings mk = new Rthings(this);
652 minMatch = new patInt(0);
653 StrPos sp = new StrPos(pat, 0);
// A leading "(?e=" selects a pattern-specific escape character. substring(6)
// skips the four-character prefix plus two more characters - presumably the
// escape character and the closing ')'; confirm against the omitted lines.
654 if (sp.incMatch("(?e="))
660 newpat = reEscape(pat.substring(6), newEsc, Pattern.ESC);
// Otherwise a non-default instance-wide `esc` is translated to Pattern.ESC.
663 else if (esc != Pattern.ESC)
665 newpat = reEscape(pat, esc, Pattern.ESC);
667 thePattern = _compile(newpat, mk);
// Sub-pattern count: marks allocated during compilation minus `offset`
// (declared on a line not shown here).
668 numSubs_ = mk.val - offset;
673 * If a Regex is compared against a Regex, a check is done to see that the
674 * patterns are equal as well as the most recent match. If a Regex is compared
675 * with a RegRes, only the result of the most recent match is compared.
// Regex-to-Regex comparison first compares the two patterns through their
// toString() text and, when they agree, delegates to the superclass (RegRes)
// equals for the match state. The final statement delegates to super.equals
// as well.
// NOTE(review): no matching hashCode override is visible in this part of the
// file - confirm one exists if instances are used as hash keys.
677 public boolean equals(Object o)
679 if (o instanceof Regex)
681 if (toString().equals(o.toString()))
683 return super.equals(o);
692 return super.equals(o);
696 /** A clone by any other name would smell as sweet. */
697 public Object clone()
699 return new Regex(this);
702 /** Return a clone of the underlying RegRes object. */
703 public RegRes result()
705 return (RegRes) super.clone();
708 // prep sets global variables of class
709 // Pattern so that it can access them
710 // during an attempt at a match
711 Pthings pt = new Pthings();
// Loads the shared Pthings instance `pt` with the state a match attempt
// needs: the last match position, the effective flags, cleared marks, the
// number of sub-patterns and (when dontMatchInQuotes is set) quote handling.
// @param s the subject about to be searched
// @return the prepared Pthings
713 final Pthings prep(StringLike s)
716 pt.lastPos = matchedTo();
// Detect a change of subject by comparing the underlying object of the new
// subject with that of the previously searched one (`src`). The right-hand
// side previously called s.unwrap(), which compared `s` with itself and
// threw a NullPointerException when s was null while src was not.
721 if ((s == null ? null : s.unwrap()) != (src == null ? null : src.unwrap()))
// '.' only refuses line terminators when the s flag is off.
726 pt.dotDoesntMatchCR = dotDoesntMatchCR && (!sFlag);
// The m flag is on if set on this Regex or as the class-wide default.
727 pt.mFlag = (mFlag | defaultMFlag);
728 pt.ignoreCase = ignoreCase;
// Marks left over from an earlier match are visited before reuse.
730 if (pt.marks != null)
732 for (int i = 0; i < pt.marks.length; i++)
738 pt.nMarks = numSubs_;
740 if (dontMatchInQuotes)
752 * Attempt to match a Pattern beginning at a specified location within the
755 * @see com.stevesoft.pat.Regex#search
757 public boolean matchAt(String s, int start_pos)
759 return _search(s, start_pos, start_pos);
763 * Attempt to match a Pattern beginning at a specified location within the
766 * @see com.stevesoft.pat.Regex#search
768 public boolean matchAt(StringLike s, int start_pos)
770 return _search(s, start_pos, start_pos);
774 * Search through a String for the first occurrence of a match.
776 * @see com.stevesoft.pat.Regex#searchFrom
777 * @see com.stevesoft.pat.Regex#matchAt
779 public boolean search(String s)
783 throw new NullPointerException(
785 .getString("exception.null_string_given_to_regex_search"));
787 return _search(s, 0, s.length());
790 public boolean search(StringLike sl)
794 throw new NullPointerException(
796 .getString("exception.null_string_like_given_to_regex_search"));
798 return _search(sl, 0, sl.length());
801 public boolean reverseSearch(String s)
805 throw new NullPointerException(
807 .getString("exception.null_string_given_to_regex_reverse_search"));
809 return _reverseSearch(s, 0, s.length());
812 public boolean reverseSearch(StringLike sl)
816 throw new NullPointerException(
818 .getString("exception.null_string_like_given_to_regex_reverse_search"));
820 return _reverseSearch(sl, 0, sl.length());
824 * Search through a String for the first occurrence of a match, but start at
831 public boolean searchFrom(String s, int start)
835 throw new NullPointerException(
837 .getString("exception.null_string_like_given_to_regex_search_from"));
839 return _search(s, start, s.length());
842 public boolean searchFrom(StringLike s, int start)
846 throw new NullPointerException(
848 .getString("exception.null_string_like_given_to_regex_search_from"));
850 return _search(s, start, s.length());
854 * Search through a region of a String for the first occurrence of a match.
856 public boolean searchRegion(String s, int start, int end)
860 throw new NullPointerException(
862 .getString("exception.null_string_like_given_to_regex_search_region"));
864 return _search(s, start, end);
868 * Set this to change the default behavior of the "." pattern. By default it
869 * now matches perl's behavior and fails to match the '\n' character.
871 public static boolean dotDoesntMatchCR = true;
877 boolean gFlag = false;
879 /** Set the 'g' flag */
880 public void setGFlag(boolean b)
885 /** Get the state of the 'g' flag. */
886 public boolean getGFlag()
891 boolean sFlag = false;
893 /** Get the state of the sFlag */
894 public boolean getSFlag()
899 boolean mFlag = false;
901 /** Get the state of the mFlag */
902 public boolean getMFlag()
// Convenience overload: wraps the String in a StringWrap and delegates to
// the StringLike version with the same bounds.
907 final boolean _search(String s, int start, int end)
909 return _search(new StringWrap(s), start, end);
// Core forward search. Tries thePattern.matchAt at successive offsets from
// `start` up to `up` and records the first hit in charsMatched_, matchFrom_
// and didMatch_. Returns true on a match, false otherwise.
912 final boolean _search(StringLike s, int start, int end)
// 'g' flag handling: applies only when a previous match position exists and
// the subject wraps the same underlying object as last time (identity
// comparison of the unwrap() results).
914 if (gFlag && gFlagto > 0 && gFlags != null
915 && s.unwrap() == gFlags.unwrap())
921 Pthings pt = prep(s);
// Last offset worth trying: a match needs at least minMatch.i characters
// before `end`.
923 int up = (minMatch == null ? end : end - minMatch.i);
925 if (up < start && end >= start)
// First scan loop: attempt a match at every offset in turn.
932 for (int i = start; i <= up; i++)
934 charsMatched_ = thePattern.matchAt(s, i, pt);
935 if (charsMatched_ >= 0)
937 matchFrom_ = thePattern.mfrom;
// Remember where this match ended for a later 'g'-flag search.
939 gFlagto = matchFrom_ + charsMatched_;
941 return didMatch_ = true;
// Second scan loop: skipper.find proposes the next candidate offset, so
// offsets it rules out are never handed to matchAt.
948 for (int i = start; i <= up; i++)
950 i = skipper.find(src, i, up);
// No candidate offset: reset the result fields and report failure.
953 charsMatched_ = matchFrom_ = -1;
954 return didMatch_ = false;
956 charsMatched_ = thePattern.matchAt(s, i, pt);
957 if (charsMatched_ >= 0)
959 matchFrom_ = thePattern.mfrom;
961 gFlagto = matchFrom_ + charsMatched_;
963 return didMatch_ = true;
967 return didMatch_ = false;
971 * final boolean _search(LongStringLike s,long start,long end) { if(gFlag &&
972 * gFlagto > 0 && s==gFlags) start = gFlagto; gFlags = null;
974 * Pthings pt=prep(s);
976 * int up = end;//(minMatch == null ? end : end-minMatch.i);
978 * if(up < start && end >= start) up = start;
980 * if(skipper == null) { for(long i=start;i<=up;i++) { charsMatched_ =
981 * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ =
982 * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_;
983 * return didMatch_=true; } } } else { pt.no_check = true; for(long
984 * i=start;i<=up;i++) { i = skipper.find(src,i,up); if(i<0) { charsMatched_ =
985 * matchFrom_ = -1; return didMatch_ = false; } charsMatched_ =
986 * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ =
987 * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_;
988 * gFlags = s; return didMatch_=true; } else { i = s.adjustIndex(i); up =
989 * s.adjustEnd(i); } } } return didMatch_=false; }
992 boolean _reverseSearch(String s, int start, int end)
994 return _reverseSearch(new StringWrap(s), start, end);
// Backward search: tries thePattern.matchAt at offsets from `end` down to
// `start` and records the first hit in charsMatched_, matchFrom_ and
// didMatch_.
// @param s the subject to search
// @param start lowest offset to try
// @param end offset at which the scan begins
// @return true when a match was found
997 boolean _reverseSearch(StringLike s, int start, int end)
// 'g' flag handling mirrors _search. gFlags may be null when no earlier
// search recorded a subject, so it is checked before being dereferenced
// (the forward search already does this).
999 if (gFlag && gFlagto > 0 && gFlags != null && s.unwrap() == gFlags.unwrap())
1004 Pthings pt = prep(s);
1005 for (int i = end; i >= start; i--)
1007 charsMatched_ = thePattern.matchAt(s, i, pt);
1008 if (charsMatched_ >= 0)
1010 matchFrom_ = thePattern.mfrom;
// The next 'g'-flag reverse search continues just before this match.
1012 gFlagto = matchFrom_ - 1;
1014 return didMatch_ = true;
1017 return didMatch_ = false;
1020 // This routine sets the cbits variable
1021 // of class Pattern. Cbits is true for
1022 // the bit corresponding to a character inside
1024 static StringLike lasts = null;
1026 static BitSet lastbs = null;
// Scans s once and builds a BitSet, sized to s, describing its quoted
// regions; the result is stored in pt.cbits and cached in the static lastbs.
// NOTE(review): lasts/lastbs are static mutable caches shared by every
// Regex - concurrent use from several threads looks unsafe; confirm.
1028 static void setCbits(StringLike s, Pthings pt)
1035 BitSet bs = new BitSet(s.length());
// setBit is true while the scan is inside a quoted region.
1037 boolean setBit = false;
1038 for (int i = 0; i < s.length(); i++)
1044 char c = s.charAt(i);
// A quote opens on a double or single quote character when not already
// inside one ...
1045 if (!setBit && c == '"')
1051 else if (!setBit && c == '\'')
// ... and closes only on the quote character held in qc.
1057 else if (setBit && c == qc)
// A backslash inside a quote gets its own branch, provided another
// character follows it.
1061 else if (setBit && c == '\\' && i + 1 < s.length())
1070 pt.cbits = lastbs = bs;
1074 // Wanted user to over-ride this in alpha version,
1075 // but it wasn't really necessary because of this trick:
1080 return (Regex) getClass().newInstance();
1081 } catch (InstantiationException ie)
1084 } catch (IllegalAccessException iae)
1091 * Only needed for creating your own extensions of Regex. This method adds the
1092 * next Pattern in the chain of patterns or sets the Pattern if it is the
1095 protected void add(Pattern p2)
1109 * You only need to use this method if you are creating your own extensions to
1110 * Regex. compile1 compiles one Pattern element, it can be over-ridden to
1111 * allow the Regex compiler to understand new syntax. See deriv.java for an
1112 * example. This routine is the heart of class Regex. Rthings has one integer
1113 * member called intValue, it is used to keep track of the number of ()'s in
1116 * @exception com.stevesoft.pat.RegSyntax
1117 * is thrown when a nonsensical pattern is supplied. For
1118 * example, a pattern beginning with *.
// Compiles exactly one pattern element at the current position of `sp` and
// appends it to the chain with add()/addMulti(). The long else-if chain
// below recognises, in order: bracket expressions, alternation, the
// (?<n) / (?>n) / (?@..) / (?#..) extensions, backslash classes, octal and
// back-reference escapes, anchors, (??name) custom rules, parenthesised
// groups, quantifiers, and single-character escapes.
// @param sp cursor over the pattern text
// @param mk compile-time bookkeeping (mark counter, flags, paren level)
// @throws RegSyntax when the pattern text is malformed
1120 protected void compile1(StrPos sp, Rthings mk) throws RegSyntax
1125 add(matchBracket(sp));
1127 else if (sp.match('|'))
1135 p = new NullPattern();
// (?<n): move the match pointer n characters backwards.
1140 else if (sp.incMatch("(?<"))
1142 patInt i = sp.getPatInt();
1145 RegSyntaxError.endItAll("No int after (?<");
1147 add(new Backup(i.intValue()));
1150 RegSyntaxError.endItAll("No ) after (?<");
// (?>n): the forward counterpart, implemented as a negative Backup.
1153 else if (sp.incMatch("(?>"))
1155 patInt i = sp.getPatInt();
1158 RegSyntaxError.endItAll("No int after (?>");
1160 add(new Backup(-i.intValue()));
// The message names the construct actually being parsed; it previously
// repeated the "(?<" text from the branch above.
1163 RegSyntaxError.endItAll("No ) after (?>");
1166 else if (sp.incMatch("(?@"))
1174 RegSyntaxError.endItAll("(?@ does not have closing paren");
1176 add(new Group(op, cl));
// (?# ... ): comment, consumed up to the closing parenthesis.
1178 else if (sp.incMatch("(?#"))
1180 while (!sp.match(')'))
1185 else if (sp.dontMatch && sp.c == 'w')
1187 // Regex r = new Regex();
1188 // r._compile("[a-zA-Z0-9_]",mk);
1189 // add(new Goop("\\w",r.thePattern));
1190 Bracket b = new Bracket(false);
1191 b.addOr(new Range('a', 'z'));
1192 b.addOr(new Range('A', 'Z'));
1193 b.addOr(new Range('0', '9'));
1194 b.addOr(new oneChar('_'));
1197 else if (sp.dontMatch && sp.c == 'G')
1201 else if (sp.dontMatch && sp.c == 's')
1203 // Regex r = new Regex();
1204 // r._compile("[ \t\n\r\b]",mk);
1205 // add(new Goop("\\s",r.thePattern));
1206 Bracket b = new Bracket(false);
1207 b.addOr(new oneChar((char) 32));
// Characters 8..10 are backspace, tab and newline.
1208 b.addOr(new Range((char) 8, (char) 10));
1209 b.addOr(new oneChar((char) 13));
1212 else if (sp.dontMatch && sp.c == 'd')
1214 // Regex r = new Regex();
1215 // r._compile("[0-9]",mk);
1216 // add(new Goop("\\d",r.thePattern));
1217 Range digit = new Range('0', '9');
1218 digit.printBrackets = true;
1221 else if (sp.dontMatch && sp.c == 'W')
1223 // Regex r = new Regex();
1224 // r._compile("[^a-zA-Z0-9_]",mk);
1225 // add(new Goop("\\W",r.thePattern));
1226 Bracket b = new Bracket(true);
1227 b.addOr(new Range('a', 'z'));
1228 b.addOr(new Range('A', 'Z'));
1229 b.addOr(new Range('0', '9'));
1230 b.addOr(new oneChar('_'));
1233 else if (sp.dontMatch && sp.c == 'S')
1235 // Regex r = new Regex();
1236 // r._compile("[^ \t\n\r\b]",mk);
1237 // add(new Goop("\\S",r.thePattern));
1238 Bracket b = new Bracket(true);
1239 b.addOr(new oneChar((char) 32));
1240 b.addOr(new Range((char) 8, (char) 10));
1241 b.addOr(new oneChar((char) 13));
1244 else if (sp.dontMatch && sp.c == 'D')
1246 // Regex r = new Regex();
1247 // r._compile("[^0-9]",mk);
1248 // add(new Goop("\\D",r.thePattern));
1249 Bracket b = new Bracket(true);
1250 b.addOr(new Range('0', '9'));
// \B is compiled as a negative look-ahead on \b.
1253 else if (sp.dontMatch && sp.c == 'B')
1255 Regex r = new Regex();
1256 r._compile("(?!" + back_slash + "b)", mk);
1259 else if (isOctalString(sp))
1263 d = 8 * d + sp.c - '0';
1264 StrPos sp2 = new StrPos(sp);
1266 if (isOctalDigit(sp2, false))
1269 d = 8 * d + sp.c - '0';
1271 add(new oneChar((char) d));
// Back-reference: one digit 1-9, optionally followed by a second digit.
1273 else if (sp.dontMatch && sp.c >= '1' && sp.c <= '9')
1275 int iv = sp.c - '0';
1276 StrPos s2 = new StrPos(sp);
1278 if (!s2.dontMatch && s2.c >= '0' && s2.c <= '9')
1280 iv = 10 * iv + (s2.c - '0');
1283 add(new BackMatch(iv));
1285 else if (sp.dontMatch && sp.c == 'b')
1287 add(new Boundary());
// A literal backspace character in the pattern text is also treated as a
// word boundary.
1289 else if (sp.match('\b'))
1291 add(new Boundary());
1293 else if (sp.match('$'))
1297 else if (sp.dontMatch && sp.c == 'Z')
1299 add(new End(false));
1301 else if (sp.match('.'))
// (??name) or (??name:arg): look up a rule registered through define().
1305 else if (sp.incMatch("(??"))
1307 StringBuffer sb = new StringBuffer();
1308 StringBuffer sb2 = new StringBuffer();
1309 while (!sp.match(')') && !sp.match(':'))
1314 if (sp.incMatch(":"))
1316 while (!sp.match(')'))
1322 String sbs = sb.toString();
// A String entry is a pattern shorthand; it is compiled without allocating
// back-reference marks.
1323 if (validators.get(sbs) instanceof String)
1325 String pat = (String) validators.get(sbs);
1326 Regex r = newRegex();
1327 Rthings rth = new Rthings(this);
1328 rth.noBackRefs = true;
1329 r._compile(pat, rth);
1334 Custom cm = new Custom(sb.toString());
1337 Validator v2 = cm.v.arg(sb2.toString());
1340 v2.argsave = sb2.toString();
1341 String p = cm.v.pattern;
1345 Regex r = newRegex();
1346 Rthings rth = new Rthings(this);
1347 rth.noBackRefs = true;
1348 r._compile(cm.v.pattern, rth);
1349 cm.sub = r.thePattern;
1350 cm.sub.add(new CustomEndpoint(cm));
1351 cm.sub.setParent(cm);
1356 else if (sp.match('('))
1359 Regex r = newRegex();
1362 if (sp.incMatch("?:"))
1366 else if (sp.incMatch("?="))
1368 r.or = new lookAhead(false);
1370 else if (sp.incMatch("?!"))
1372 r.or = new lookAhead(true);
1374 else if (sp.match('?'))
1381 mk.ignoreCase = true;
1385 mk.dontMatchInQuotes = true;
1389 mk.optimizeMe = true;
1404 } while (!sp.match(')') && !sp.eos);
1407 if (sp.eos) // throw new RegSyntax
1409 RegSyntaxError.endItAll("Unclosed ()");
1413 { // just ordinary parenthesis
// Capturing group: takes the next mark number unless back-references are
// disabled for this compilation.
1414 r.or = mk.noBackRefs ? new Or() : new OrMark(mk.val++);
1418 add(r._compile(sp, mk));
1421 else if (sp.match('^'))
1423 add(new Start(true));
1425 else if (sp.dontMatch && sp.c == 'A')
1427 add(new Start(false));
// Quantifiers: '*' is {0,inf}, '+' is {1,inf}, '?' is {0,1}.
1429 else if (sp.match('*'))
1431 addMulti(new patInt(0), new patInf());
1433 else if (sp.match('+'))
1435 addMulti(new patInt(1), new patInf());
1437 else if (sp.match('?'))
1439 addMulti(new patInt(0), new patInt(1));
1441 else if (sp.match('{'))
1443 boolean bad = false;
1444 StrPos sp2 = new StrPos(sp);
1445 // StringBuffer sb = new StringBuffer();
1447 patInt i1 = sp.getPatInt();
1458 * RegSyntaxError.endItAll( "String \"{"+i2+ "\" should be followed
1470 i2 = sp.getPatInt();
1473 if (i1 == null || i2 == null)
1476 * throw new RegSyntax("Badly formatted Multi: " +"{"+i1+","+i2+"}");
1483 add(new oneChar(sp.c));
1490 else if (sp.escMatch('x') && next2Hex(sp))
1493 int d = getHexDigit(sp);
1495 d = 16 * d + getHexDigit(sp);
1496 add(new oneChar((char) d));
1498 else if (sp.escMatch('c'))
1501 if (sp.c < Ctrl.cmap.length)
1503 add(new oneChar(Ctrl.cmap[sp.c]));
1507 add(new oneChar(sp.c));
1510 else if (sp.escMatch('f'))
1512 add(new oneChar((char) 12));
1514 else if (sp.escMatch('a'))
1516 add(new oneChar((char) 7));
1518 else if (sp.escMatch('t'))
1520 add(new oneChar('\t'));
1522 else if (sp.escMatch('n'))
1524 add(new oneChar('\n'));
1526 else if (sp.escMatch('r'))
1528 add(new oneChar('\r'));
1530 else if (sp.escMatch('b'))
1532 add(new oneChar('\b'));
1534 else if (sp.escMatch('e'))
1536 add(new oneChar((char) 27));
1540 add(new oneChar(sp.c));
1543 RegSyntaxError.endItAll("Unmatched right paren in pattern");
1548 // compiles all Pattern elements, internal method
// Compiles pattern text: clears every per-pattern flag (s, m, ignoreCase,
// g), wraps the text in a StrPos cursor starting at offset 0, and stores the
// result of the StrPos-based _compile in thePattern.
1549 private Pattern _compile(String pat, Rthings mk) throws RegSyntax
1552 sFlag = mFlag = ignoreCase = gFlag = false;
1553 StrPos sp = new StrPos(pat, 0);
1554 thePattern = _compile(sp, mk);
// Compiles elements from the cursor until the end of the pattern text or,
// when this Regex is compiling a parenthesised group (`or` is non-null),
// until the closing ')'. Never returns null: when no element was produced a
// NullPattern is returned instead.
1563 Pattern _compile(StrPos sp, Rthings mk) throws RegSyntax
1565 while (!(sp.eos || (or != null && sp.match(')'))))
// Running out of text while parentheses are still open is a syntax error.
1574 else if (sp.eos && mk.parenLevel != 0)
1576 RegSyntaxError.endItAll("Unclosed Parenthesis! lvl=" + mk.parenLevel);
1582 p = new NullPattern();
1587 return p == null ? new NullPattern() : p;
1590 // add a multi object to the end of the chain
1591 // which applies to the last object
// Applies the quantifier {i1,i2} to the last element of the chain headed by
// `p`. Throws RegSyntax when a quantifier other than {0,1} directly follows
// another quantifier.
1592 void addMulti(patInt i1, patInt i2) throws RegSyntax
1594 Pattern last, last2;
// Walk to the tail of the chain: that is the element being quantified.
1595 for (last = p; last != null && last.next != null; last = last.next)
1599 if (last == null || last == p)
// Otherwise locate the predecessor of `last` so the quantified element can
// be spliced back in.
1605 for (last2 = p; last2.next != last; last2 = last2.next)
// {0,1} (the '?' quantifier) directly after an existing quantifier does not
// nest; it switches that quantifier to fewest-match mode.
1610 if (last instanceof Multi && i1.intValue() == 0 && i2.intValue() == 1)
1612 ((Multi) last).matchFewest = true;
1614 else if (last instanceof FastMulti && i1.intValue() == 0
1615 && i2.intValue() == 1)
1617 ((FastMulti) last).matchFewest = true;
1619 else if (last instanceof DotMulti && i1.intValue() == 0
1620 && i2.intValue() == 1)
1622 ((DotMulti) last).matchFewest = true;
// Any other quantifier stacked on a quantifier is rejected.
1624 else if (last instanceof Multi || last instanceof DotMulti
1625 || last instanceof FastMulti)
1627 throw new RegSyntax("Syntax error.");
// No predecessor: the quantified element becomes the new head of the chain.
1629 else if (last2 == null)
1631 p = mkMulti(i1, i2, p);
// Otherwise the quantified element replaces `last` after its predecessor.
1635 last2.next = mkMulti(i1, i2, last);
// Chooses the quantifier implementation for pattern p repeated lo..hi times:
// DotMulti for a lone Any element with nothing after it, FastMulti when
// RegOpt.safe4fm(p) approves, and the general Multi otherwise.
1639 final static Pattern mkMulti(patInt lo, patInt hi, Pattern p)
1642 if (p instanceof Any && p.next == null)
1644 return (Pattern) new DotMulti(lo, hi);
1646 return RegOpt.safe4fm(p) ? (Pattern) new FastMulti(lo, hi, p)
1647 : (Pattern) new Multi(lo, hi, p);
1650 // process the bracket operator
// Builds the Bracket `ret`, adding one alternative per class member with
// addOr. Only part of the body is visible in this listing; the notes below
// describe the visible statements only.
1651 Pattern matchBracket(StrPos sp) throws RegSyntax
// Two construction paths, Bracket(true) and Bracket(false). The condition
// choosing between them is not visible here -- presumably a negated "[^"
// class versus a plain "[" class; TODO confirm.
1656 ret = new Bracket(true);
1661 ret = new Bracket(false);
1665 // throw new RegSyntax
1666 RegSyntaxError.endItAll("Unmatched []");
// Consume members until the closing ']' is matched or the input ends.
1669 while (!sp.eos && !sp.match(']'))
// s1, s1_ and s2 are built from existing positions and used for look-ahead
// (presumably copies, so probing does not move sp -- confirm).
1671 StrPos s1 = new StrPos(sp);
1673 StrPos s1_ = new StrPos(s1);
// Range member: s1 matches '-' and s1_ does not match ']'.
1675 if (s1.match('-') && !s1_.match(']'))
1677 StrPos s2 = new StrPos(s1);
1681 ret.addOr(new Range(sp.c, s2.c));
// \Q ... \E: every character up to the \E escape is added as a literal.
1686 else if (sp.escMatch('Q'))
1689 while (!sp.escMatch('E'))
1691 ret.addOr(new oneChar(sp.c));
// \d: the ASCII digits.
1695 else if (sp.escMatch('d'))
1697 ret.addOr(new Range('0', '9'));
// \s: space (32), characters 8 through 10, and carriage return (13).
// NOTE(review): the range starts at 8 (backspace) and form feed (12) is not
// included; the \S branch below mirrors this same set -- confirm intended.
1699 else if (sp.escMatch('s'))
1701 ret.addOr(new oneChar((char) 32));
1702 ret.addOr(new Range((char) 8, (char) 10));
1703 ret.addOr(new oneChar((char) 13));
// \w: ASCII letters, ASCII digits and underscore.
1705 else if (sp.escMatch('w'))
1707 ret.addOr(new Range('a', 'z'));
1708 ret.addOr(new Range('A', 'Z'));
1709 ret.addOr(new Range('0', '9'));
1710 ret.addOr(new oneChar('_'));
// \D: everything except '0'..'9' (48..57).
1712 else if (sp.escMatch('D'))
1714 ret.addOr(new Range((char) 0, (char) 47));
1715 ret.addOr(new Range((char) 58, (char) 65535));
// \S: everything except the characters added by the \s branch above.
1717 else if (sp.escMatch('S'))
1719 ret.addOr(new Range((char) 0, (char) 7));
1720 ret.addOr(new Range((char) 11, (char) 12));
1721 ret.addOr(new Range((char) 14, (char) 31));
1722 ret.addOr(new Range((char) 33, (char) 65535));
// \W: skips 'A'..'Z' (65..90), '_' (95) and 'a'..'z' (97..122).
// NOTE(review): the first range 0..64 still contains the digits '0'..'9'
// (48..57), which the \w branch above treats as word characters. It looks
// like it should be split into 0..47 and 58..64 -- confirm and fix.
1724 else if (sp.escMatch('W'))
1726 ret.addOr(new Range((char) 0, (char) 64));
1727 ret.addOr(new Range((char) 91, (char) 94));
1728 ret.addOr(new oneChar((char) 96));
1729 ret.addOr(new Range((char) 123, (char) 65535));
// \x taken only when next2Hex(sp) holds; the two digits are combined as
// 16 * first + second.
1731 else if (sp.escMatch('x') && next2Hex(sp))
1734 int d = getHexDigit(sp);
1736 d = 16 * d + getHexDigit(sp);
1737 ret.addOr(new oneChar((char) d));
// Single-character escapes: \a = 7, \f = 12, \e = 27, then \n, \t, \r.
1739 else if (sp.escMatch('a'))
1741 ret.addOr(new oneChar((char) 7));
1743 else if (sp.escMatch('f'))
1745 ret.addOr(new oneChar((char) 12));
1747 else if (sp.escMatch('e'))
1749 ret.addOr(new oneChar((char) 27));
1751 else if (sp.escMatch('n'))
1753 ret.addOr(new oneChar('\n'));
1755 else if (sp.escMatch('t'))
1757 ret.addOr(new oneChar('\t'));
1759 else if (sp.escMatch('r'))
1761 ret.addOr(new oneChar('\r'));
// \c: the character is translated through Ctrl.cmap when it is a valid
// index into that table; the other addOr below adds sp.c unchanged
// (presumably the else branch -- its keyword is not visible here).
1763 else if (sp.escMatch('c'))
1766 if (sp.c < Ctrl.cmap.length)
1768 ret.addOr(new oneChar(Ctrl.cmap[sp.c]));
1772 ret.addOr(new oneChar(sp.c));
// Octal escape: digits are accumulated base 8 into d. sp2 is used to test
// for a further octal digit before another one is folded in.
1775 else if (isOctalString(sp))
1779 d = 8 * d + sp.c - '0';
1780 StrPos sp2 = new StrPos(sp);
1782 if (isOctalDigit(sp2, false))
1785 d = 8 * d + sp.c - '0';
1787 ret.addOr(new oneChar((char) d));
// Remaining case (presumably the final else): the character is a literal.
1791 ret.addOr(new oneChar(sp.c));
1799 * Converts the stored Pattern to a String -- this is a decompile. Note that
1800 * \t and \n will really print out here, Not just the two character
1801 * representations. Also be prepared to see some strange output if your
1802 * characters are not printable.
// Builds the textual form of this Regex in a StringBuffer and returns it.
// Only part of the body is visible in this listing.
1804 public String toString()
// NOTE(review): the leading "false &&" makes this condition always false, so
// whatever it guards never runs.
1806 if (false && thePattern == null)
1812 StringBuffer sb = new StringBuffer();
// A non-default escape character is handled first (body not visible here).
1813 if (esc != Pattern.ESC)
// Entered when any of these flags is set or optimized() is true; what is
// emitted is not visible in this listing.
1819 if (gFlag || mFlag || !dotDoesntMatchCR || sFlag || ignoreCase
1820 || dontMatchInQuotes || optimized())
1831 if (sFlag || !dotDoesntMatchCR)
1835 if (dontMatchInQuotes)
// NOTE(review): with the null test above disabled, a null thePattern would
// be dereferenced here -- confirm a pattern is always present at this point.
1849 String patstr = thePattern.toString();
// Convert escapes from the default Pattern.ESC to this Regex's esc.
1850 if (esc != Pattern.ESC)
1852 patstr = reEscape(patstr, Pattern.ESC, esc);
1855 return sb.toString();
1859 // Re-escape Pattern, allows us to use a different escape
// Scans s character by character and builds the result in a StringBuffer.
// Only part of the body is visible in this listing.
1861 static String reEscape(String s, char oldEsc, char newEsc)
// Identical escape characters are handled up front (presumably s is
// returned unchanged -- confirm; the body is not visible here).
1863 if (oldEsc == newEsc)
1868 StringBuffer sb = new StringBuffer();
1869 for (i = 0; i < s.length(); i++)
// An oldEsc that still has a character after it.
1871 if (s.charAt(i) == oldEsc && i + 1 < s.length())
// A doubled oldEsc is distinguished from oldEsc followed by another char.
1873 if (s.charAt(i + 1) == oldEsc)
1880 sb.append(s.charAt(i + 1));
// A newEsc found in the input gets its own handling (not visible here).
1884 else if (s.charAt(i) == newEsc)
// Appends the current character as is (presumably the default branch).
1891 sb.append(s.charAt(i));
1894 return sb.toString();
1898 * This method implements FilenameFilter, allowing one to use a Regex to
1899 * search through a directory using File.list. There is a FileRegex now that
1902 * @see com.stevesoft.pat.FileRegex
// FilenameFilter callback; the body is not visible in this listing.
// Presumably dir is the directory being listed and s the candidate file
// name, as in java.io.FilenameFilter -- confirm against the implementation.
1904 public boolean accept(File dir, String s)
1909 /** The version of this package */
// Returns a fixed, hard-coded string; nothing is computed at run time.
1910 final static public String version()
1912 return "lgpl release 1.5.3";
1916 * Once this method is called, the state of variables ignoreCase and
1917 * dontMatchInQuotes should not be changed as the results will be
1918 * unpredictable. However, search and matchAt will run more quickly. Note that
1919 * you can check to see if the pattern has been optimized by calling the
1920 * optimized() method.
1922 * This method will attempt to rewrite your pattern in a way that makes it
1923 * faster (not all patterns execute at the same speed). In general,
1924 * "(?: ... )" will be faster than "( ... )" so if you don't need the
1925 * backreference, you should group using the former pattern.
1927 * It will also introduce new pattern elements that you can't get to
1928 * otherwise, for example if you have a large table of strings, i.e. the
1929 * months of the year "(January|February|...)" optimize() will make a
1930 * Hashtable that takes it to the next appropriate pattern element --
1931 * eliminating the need for a linear search.
1933 * @see com.stevesoft.pat.Regex#optimized
1934 * @see com.stevesoft.pat.Regex#ignoreCase
1935 * @see com.stevesoft.pat.Regex#dontMatchInQuotes
1936 * @see com.stevesoft.pat.Regex#matchAt
1937 * @see com.stevesoft.pat.Regex#search
1939 public void optimize()
// Guard for a Regex that is already optimized or has no pattern (the guarded
// statement is not visible here -- presumably an early return).
1941 if (optimized() || thePattern == null)
// Assigning minMatch is what makes optimized() report true afterwards.
1945 minMatch = new patInt(0); // thePattern.countMinChars();
// Replace the pattern with the result of RegOpt.opt, passing the current
// ignoreCase and dontMatchInQuotes settings.
1946 thePattern = RegOpt.opt(thePattern, ignoreCase, dontMatchInQuotes);
// Store the result of Skip.findSkip(this) in skipper.
1947 skipper = Skip.findSkip(this);
1948 // RegOpt.setParents(this);
1955 * This function returns true if the optimize method has been called.
1957 public boolean optimized()
// optimize() assigns minMatch, so a non-null minMatch is used as the
// "already optimized" marker.
1959 return minMatch != null;
1963 * A bit of syntactic sugar for those who want to make their code look more
1964 * perl-like. To use this initialize your Regex object by saying:
1967 * Regex r1 = Regex.perlCode("s/hello/goodbye/");
1968 * Regex r2 = Regex.perlCode("s'fish'frog'i");
1969 * Regex r3 = Regex.perlCode("m'hello'");
1972 * The i for ignoreCase is supported in this syntax, as well as m, s, and x.
1973 * The g flag is a bit of a special case.
1975 * If you wish to replace all occurrences of a pattern, you do not put a 'g' in
1976 * the perlCode, but call Regex's replaceAll method.
1978 * If you wish to simply and only do a search for r2's pattern, you can do
1979 * this by calling the searchFrom method repeatedly, or by calling
1980 * search repeatedly if the g flag is set.
1982 * Note: Currently perlCode does <em>not</em> support the (?e=#) syntax for
1983 * changing the escape character.
1986 public static Regex perlCode(String s)
1988 // this file is big enough, see parsePerl.java
1989 // for this function.
// Pure delegation: the result of parsePerl.parse(s) is returned unchanged.
1990 return parsePerl.parse(s);
// Named constant for the single backslash character.
1993 static final char back_slash = '\\';
1996 * Checks to see if there are only literal and no special pattern elements in
// Only part of the body is visible in this listing.
1999 public boolean isLiteral()
// Inspection starts at the head of the compiled pattern.
2001 Pattern x = thePattern;
// oneChar and Skipped elements are each tested for explicitly; what happens
// for them, and for any other element type, is not visible here.
2004 if (x instanceof oneChar)
2008 else if (x instanceof Skipped)
2022 * You only need to know about this if you are inventing your own pattern
2025 public patInt countMinChars()
// Delegates to the compiled pattern.
// NOTE(review): optimize() tests thePattern for null, so it can apparently
// be null; this call would then fail -- confirm callers compile first.
2027 return thePattern.countMinChars();
2031 * You only need to know about this if you are inventing your own pattern
2034 public patInt countMaxChars()
// Delegates to the compiled pattern.
// NOTE(review): optimize() tests thePattern for null, so it can apparently
// be null; this call would then fail -- confirm callers compile first.
2036 return thePattern.countMaxChars();
// Tests whether sp.c is a hexadecimal digit: '0'..'9', 'a'..'f' or 'A'..'F'.
// The start of the condition (before the "&&") is not visible in this
// listing.
2039 boolean isHexDigit(StrPos sp)
2043 && ((sp.c >= '0' && sp.c <= '9')
2044 || (sp.c >= 'a' && sp.c <= 'f') || (sp.c >= 'A' && sp.c <= 'F'));
// Octal-digit test on sp.c; the end of the condition is not visible in this
// listing.
2048 boolean isOctalDigit(StrPos sp, boolean first)
// Visible requirements: not at end of input, `first` equal to sp.dontMatch
// (that is what !(first ^ sp.dontMatch) means), and sp.c at least '0'.
2050 boolean r = !sp.eos && !(first ^ sp.dontMatch) && sp.c >= '0'
// Converts the hexadecimal digit in sp.c to its numeric value.
// NOTE(review): no validation is visible here; the final return applies the
// upper-case formula to whatever is left. The \x branch of matchBracket
// calls this only after next2Hex(sp) -- confirm other callers do the same.
2055 int getHexDigit(StrPos sp)
// Decimal digit case (its return is not visible in this listing).
2057 if (sp.c >= '0' && sp.c <= '9')
// 'a'..'f' map to 10..15.
2061 if (sp.c >= 'a' && sp.c <= 'f')
2063 return sp.c - 'a' + 10;
2065 return sp.c - 'A' + 10;
// Applies isHexDigit twice through sp2; the statements between the two
// tests and the return values are not visible in this listing.
2068 boolean next2Hex(StrPos sp)
// sp2 is constructed from sp (presumably a copy, so that sp itself is not
// advanced by the look-ahead -- confirm).
2070 StrPos sp2 = new StrPos(sp);
2072 if (!isHexDigit(sp2))
2077 if (!isHexDigit(sp2))
2084 boolean isOctalString(StrPos sp)
2086 if (!isOctalDigit(sp, true))
2090 StrPos sp2 = new StrPos(sp);
2092 if (!isOctalDigit(sp2, false))