2 // This software is now distributed according to
3 // the Lesser Gnu Public License. Please see
4 // http://www.gnu.org/copyleft/lesser.txt for
8 package com.stevesoft.pat;
10 import jalview.util.MessageManager;
13 import java.io.FilenameFilter;
14 import java.util.BitSet;
15 import java.util.Hashtable;
17 import com.stevesoft.pat.wrap.StringWrap;
20 /** Matches a Unicode punctuation character. */
21 class UnicodePunct extends UniValidator
24 public int validate(StringLike s, int from, int to)
26 return from < s.length() && Prop.isPunct(s.charAt(from)) ? to : -1;
30 /** Matches a Unicode white space character. */
31 class UnicodeWhite extends UniValidator
34 public int validate(StringLike s, int from, int to)
36 return from < s.length() && Prop.isWhite(s.charAt(from)) ? to : -1;
41 * Matches a character that is not a Unicode punctuation character.
43 class NUnicodePunct extends UniValidator
46 public int validate(StringLike s, int from, int to)
48 return from < s.length() && !Prop.isPunct(s.charAt(from)) ? to : -1;
53 * Matches a character that is not a Unicode white space character.
55 class NUnicodeWhite extends UniValidator
58 public int validate(StringLike s, int from, int to)
60 return from < s.length() && !Prop.isWhite(s.charAt(from)) ? to : -1;
64 /** Matches a Unicode word character: an alphanumeric or underscore. */
65 class UnicodeW extends UniValidator
68 public int validate(StringLike s, int from, int to)
70 if (from >= s.length())
74 char c = s.charAt(from);
75 return (Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to
80 /** Matches a character that is not a Unicode alphanumeric or underscore. */
81 class NUnicodeW extends UniValidator
84 public int validate(StringLike s, int from, int to)
86 if (from >= s.length())
90 char c = s.charAt(from);
91 return !(Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to
96 /** Matches a Unicode decimal digit. */
97 class UnicodeDigit extends UniValidator
100 public int validate(StringLike s, int from, int to)
102 return from < s.length() && Prop.isDecimalDigit(s.charAt(from)) ? to
107 /** Matches a character that is not a Unicode digit. */
108 class NUnicodeDigit extends UniValidator
111 public int validate(StringLike s, int from, int to)
113 return from < s.length() && !Prop.isDecimalDigit(s.charAt(from)) ? to
118 /** Matches a Unicode math character. */
119 class UnicodeMath extends UniValidator
122 public int validate(StringLike s, int from, int to)
124 return from < s.length() && Prop.isMath(s.charAt(from)) ? to : -1;
128 /** Matches a non-math Unicode character. */
129 class NUnicodeMath extends UniValidator
132 public int validate(StringLike s, int from, int to)
134 return from < s.length() && !Prop.isMath(s.charAt(from)) ? to : -1;
138 /** Matches a Unicode currency symbol. */
139 class UnicodeCurrency extends UniValidator
142 public int validate(StringLike s, int from, int to)
144 return from < s.length() && Prop.isCurrency(s.charAt(from)) ? to : -1;
148 /** Matches a non-currency symbol Unicode character. */
149 class NUnicodeCurrency extends UniValidator
152 public int validate(StringLike s, int from, int to)
154 return from < s.length() && !Prop.isCurrency(s.charAt(from)) ? to : -1;
158 /** Matches a Unicode alphabetic character. */
159 class UnicodeAlpha extends UniValidator
162 public int validate(StringLike s, int from, int to)
164 return from < s.length() && Prop.isAlphabetic(s.charAt(from)) ? to : -1;
168 /** Matches a non-alphabetic Unicode character. */
169 class NUnicodeAlpha extends UniValidator
172 public int validate(StringLike s, int from, int to)
174 return from < s.length() && !Prop.isAlphabetic(s.charAt(from)) ? to
179 /** Matches an upper case Unicode character. */
180 class UnicodeUpper extends UniValidator
183 public int validate(StringLike s, int from, int to)
185 return from < s.length() && isUpper(s.charAt(from)) ? to : -1;
188 final boolean isUpper(char c)
190 return c == CaseMgr.toUpperCase(c) && c != CaseMgr.toLowerCase(c);
194 /** Matches an upper case Unicode character. */
195 class UnicodeLower extends UniValidator
198 public int validate(StringLike s, int from, int to)
200 return from < s.length() && isLower(s.charAt(from)) ? to : -1;
203 final boolean isLower(char c)
205 return c != CaseMgr.toUpperCase(c) && c == CaseMgr.toLowerCase(c);
210 * Regex provides the parser which constructs the linked list of Pattern classes
213 * For the purpose of this documentation, the fact that java interprets the
214 * backslash will be ignored. In practice, however, you will need a double
215 * backslash to obtain a string that contains a single backslash character.
216 * Thus, the example pattern "\b" should really be typed as "\\b" inside java
219 * Note that Regex is part of package "com.stevesoft.pat". To use it, simply
220 * import com.stevesoft.pat.Regex at the top of your file.
222 * Regex is made with a constructor that takes a String that defines the regular
223 * expression. Thus, for example
226 * Regex r = new Regex("[a-c]*");
229 * matches any number of characters so long as they are 'a', 'b', or 'c').
231 * To attempt to match the Pattern to a given string, you can use either the
232 * search(String) member function, or the matchAt(String,int position) member
233 * function. These functions return a boolean which tells you whether or not the
234 * thing worked, and sets the methods "charsMatched()" and "matchedFrom()" in
235 * the Regex object appropriately.
237 * The portion of the string before the match can be obtained by the left()
238 * member, and the portion after the match can be obtained by the right()
241 * Essentially, this package implements a syntax that is very much like the perl
242 * 5 regular expression syntax.
247 * Regex r = new Regex("x(a|b)y");
248 * r.matchAt("xay", 0);
249 * System.out.println("sub = " + r.stringMatched(1));
252 * The above would print "sub = a".
255 * r.left() // would return "x"
256 * r.right() // would return "y"
260 * Differences between this package and perl5:<br>
261 * The extended Pattern for setting flags, is now supported, but the flags are
262 * different. "(?i)" tells the pattern to ignore case, "(?Q)" sets the
263 * "dontMatchInQuotes" flag, and "(?iQ)" sets them both. You can change the
264 * escape character. The pattern
276 * , but note that the sequence
282 * <b>must</b> occur at the very beginning of the pattern. There may be other
283 * small differences as well. I will either make my package conform or note them
284 * as I become aware of them.
286 * This package supports additional patterns not in perl5: <center>
291 * <td>This matches all characters between the '(' character and the balancing
292 * ')' character. Thus, it will match "()" as well as "(())". The balancing
293 * characters are arbitrary, thus (?@{}) matches on "{}" and "{{}}".</td>
297 * <td>Moves the pointer backwards within the text. This allows you to make a
298 * "look behind." It fails if it attempts to move to a position before the
299 * beginning of the string. "x(?<1)" is equivalent to "(?=x)". The number, 1
300 * in this example, is the number of characters to move backwards.</td>
304 * @author Steven R. Brandt
305 * @version package com.stevesoft.pat, release 1.5.3
308 public class Regex extends RegRes implements FilenameFilter
314 * This is the entry class. Load the core file directly, if it exists. See
320 * swingjs.JSUtil.loadStaticResource$S("core/core_stevesoft.z.js");
325 * BackRefOffset gives the identity number of the first pattern. Version 1.0
326 * used zero, version 1.1 uses 1 to be more compatible with perl.
328 static int BackRefOffset = 1;
330 private static Pattern none = new NoPattern();
332 Pattern thePattern = none;
334 patInt minMatch = new patInt(0);
336 static Hashtable validators = new Hashtable();
339 define("p", "(?>1)", new UnicodePunct());
340 define("P", "(?>1)", new NUnicodePunct());
341 define("s", "(?>1)", new UnicodeWhite());
342 define("S", "(?>1)", new NUnicodeWhite());
343 define("w", "(?>1)", new UnicodeW());
344 define("W", "(?>1)", new NUnicodeW());
345 define("d", "(?>1)", new UnicodeDigit());
346 define("D", "(?>1)", new NUnicodeDigit());
347 define("m", "(?>1)", new UnicodeMath());
348 define("M", "(?>1)", new NUnicodeMath());
349 define("c", "(?>1)", new UnicodeCurrency());
350 define("C", "(?>1)", new NUnicodeCurrency());
351 define("a", "(?>1)", new UnicodeAlpha());
352 define("A", "(?>1)", new NUnicodeAlpha());
353 define("uc", "(?>1)", new UnicodeUpper());
354 define("lc", "(?>1)", new UnicodeLower());
357 /** Set the dontMatch in quotes flag. */
358 public void setDontMatchInQuotes(boolean b)
360 dontMatchInQuotes = b;
363 /** Find out if the dontMatchInQuotes flag is enabled. */
364 public boolean getDontMatchInQuotes()
366 return dontMatchInQuotes;
369 boolean dontMatchInQuotes = false;
372 * Set the state of the ignoreCase flag. If set to true, then the pattern
373 * matcher will ignore case when searching for a match.
375 public void setIgnoreCase(boolean b)
381 * Get the state of the ignoreCase flag. Returns true if we are ignoring the
382 * case of the pattern, false otherwise.
384 public boolean getIgnoreCase()
389 boolean ignoreCase = false;
391 static boolean defaultMFlag = false;
394 * Set the default value of the m flag. If it is set to true, then the MFlag
395 * will be on for any regex search executed.
397 public static void setDefaultMFlag(boolean mFlag)
399 defaultMFlag = mFlag;
403 * Get the default value of the m flag. If it is set to true, then the MFlag
404 * will be on for any regex search executed.
406 public static boolean getDefaultMFlag()
412 * Initializes the object without a Pattern. To supply a Pattern use
415 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
422 * Create and compile a Regex, but do not throw any exceptions. If you wish to
423 * have exceptions thrown for syntax errors, you must use the Regex(void)
424 * constructor to create the Regex object, and then call the compile method.
425 * Therefore, you should only call this method when you know your pattern is
426 * right. I will probably become more like
428 * @see com.stevesoft.pat.Regex#search(java.lang.String)
429 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
431 public Regex(String s)
436 } catch (RegSyntax rs)
441 ReplaceRule rep = null;
444 * Create and compile both a Regex and a ReplaceRule.
446 * @see com.stevesoft.pat.ReplaceRule
447 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
449 public Regex(String s, String rp)
452 rep = ReplaceRule.perlCode(rp);
456 * Create and compile a Regex, but give it the ReplaceRule specified. This
457 * allows the user finer control of the Replacement process, if that is
460 * @see com.stevesoft.pat.ReplaceRule
461 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
463 public Regex(String s, ReplaceRule rp)
470 * Change the ReplaceRule of this Regex by compiling a new one using String
473 public void setReplaceRule(String rp)
475 rep = ReplaceRule.perlCode(rp);
476 repr = null; // Clear Replacer history
479 /** Change the ReplaceRule of this Regex to rp. */
480 public void setReplaceRule(ReplaceRule rp)
486 * Test to see if a custom defined rule exists.
488 * @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator)
490 public static boolean isDefined(String nm)
492 return validators.get(nm) != null;
496 * Removes a custom defined rule.
498 * @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator)
500 public static void undefine(String nm)
502 validators.remove(nm);
506 * Defines a method to create a new rule. See test/deriv2.java and
507 * test/deriv3.java for examples of how to use it.
509 public static void define(String nm, String pat, Validator v)
512 validators.put(nm, v);
516 * Defines a shorthand for a pattern. The pattern will be invoked by a string
517 * that has the form "(??"+nm+")".
519 public static void define(String nm, String pat)
521 validators.put(nm, pat);
524 /** Get the current ReplaceRule. */
525 public ReplaceRule getReplaceRule()
530 Replacer repr = null;
532 final Replacer _getReplacer()
534 return repr == null ? repr = new Replacer() : repr;
537 public Replacer getReplacer()
541 repr = new Replacer();
549 * Replace the first occurence of this pattern in String s according to the
552 * @see com.stevesoft.pat.ReplaceRule
553 * @see com.stevesoft.pat.Regex#getReplaceRule()
555 public String replaceFirst(String s)
557 return _getReplacer().replaceFirstRegion(s, this, 0, s.length())
562 * Replace the first occurence of this pattern in String s beginning with
563 * position pos according to the ReplaceRule.
565 * @see com.stevesoft.pat.ReplaceRule
566 * @see com.stevesoft.pat.Regex#getReplaceRule()
568 public String replaceFirstFrom(String s, int pos)
570 return _getReplacer().replaceFirstRegion(s, this, pos, s.length())
575 * Replace the first occurence of this pattern in String s beginning with
576 * position start and ending with end according to the ReplaceRule.
578 * @see com.stevesoft.pat.ReplaceRule
579 * @see com.stevesoft.pat.Regex#getReplaceRule()
581 public String replaceFirstRegion(String s, int start, int end)
583 return _getReplacer().replaceFirstRegion(s, this, start, end)
588 * Replace all occurences of this pattern in String s according to the
591 * @see com.stevesoft.pat.ReplaceRule
592 * @see com.stevesoft.pat.Regex#getReplaceRule()
594 public String replaceAll(String s)
596 return _getReplacer().replaceAllRegion(s, this, 0, s.length())
600 public StringLike replaceAll(StringLike s)
602 return _getReplacer().replaceAllRegion(s, this, 0, s.length());
606 * Replace all occurences of this pattern in String s beginning with position
607 * pos according to the ReplaceRule.
609 * @see com.stevesoft.pat.ReplaceRule
610 * @see com.stevesoft.pat.Regex#getReplaceRule()
612 public String replaceAllFrom(String s, int pos)
614 return _getReplacer().replaceAllRegion(s, this, pos, s.length())
619 * Replace all occurences of this pattern in String s beginning with position
620 * start and ending with end according to the ReplaceRule.
622 * @see com.stevesoft.pat.ReplaceRule
623 * @see com.stevesoft.pat.Regex#getReplaceRule()
625 public String replaceAllRegion(String s, int start, int end)
627 return _getReplacer().replaceAllRegion(s, this, start, end).toString();
630 /** Essentially clones the Regex object */
631 public Regex(Regex r)
634 dontMatchInQuotes = r.dontMatchInQuotes;
636 ignoreCase = r.ignoreCase;
644 rep = (ReplaceRule) r.rep.clone();
647 * try { compile(r.toString()); } catch(RegSyntax r_) {}
649 thePattern = r.thePattern.clone(new Hashtable());
650 minMatch = r.minMatch;
655 * By default, the escape character is the backslash, but you can make it
656 * anything you want by setting this variable.
658 public char esc = Pattern.ESC;
661 * This method compiles a regular expression, making it possible to call the
662 * search or matchAt methods.
664 * @exception com.stevesoft.pat.RegSyntax
665 * is thrown if a syntax error is encountered in the pattern. For
666 * example, "x{3,1}" or "*a" are not valid patterns.
667 * @see com.stevesoft.pat.Regex#search
668 * @see com.stevesoft.pat.Regex#matchAt
670 public void compile(String prepat) throws RegSyntax
672 String postpat = parsePerl.codify(prepat, true);
673 String pat = postpat == null ? prepat : postpat;
676 dontMatchInQuotes = false;
677 Rthings mk = new Rthings(this);
683 minMatch = new patInt(0);
684 StrPos sp = new StrPos(pat, 0);
685 if (sp.incMatch("(?e="))
691 newpat = reEscape(pat.substring(6), newEsc, Pattern.ESC);
694 else if (esc != Pattern.ESC)
696 newpat = reEscape(pat, esc, Pattern.ESC);
698 thePattern = _compile(newpat, mk);
699 numSubs_ = mk.val - offset;
704 * If a Regex is compared against a Regex, a check is done to see that the
705 * patterns are equal as well as the most recent match. If a Regex is compared
706 * with a RegRes, only the result of the most recent match is compared.
709 public boolean equals(Object o)
711 if (o instanceof Regex)
713 if (toString().equals(o.toString()))
715 return super.equals(o);
724 return super.equals(o);
728 /** A clone by any other name would smell as sweet. */
730 public Object clone()
732 return new Regex(this);
735 /** Return a clone of the underlying RegRes object. */
736 public RegRes result()
738 return (RegRes) super.clone();
741 // prep sets global variables of class
742 // Pattern so that it can access them
743 // during an attempt at a match
744 Pthings pt = new Pthings();
// Loads per-search state into the shared Pthings instance pt before a match
// attempt on s: the end of the previous match (matchedTo()), the dot/newline,
// m and ignore-case flags, and the sub-pattern count (numSubs_).
746 final Pthings prep(StringLike s)
749 pt.lastPos = matchedTo();
// NOTE(review): the right-hand side null-checks src but then calls
// s.unwrap(), so when both s and src are non-null this compares s.unwrap()
// with itself. This looks like a typo for src.unwrap() -- confirm before
// relying on this test.
754 if ((s == null ? null : s.unwrap()) != (src == null ? null : s.unwrap()))
// The static dotDoesntMatchCR default only takes effect while the s flag is
// off.
759 pt.dotDoesntMatchCR = dotDoesntMatchCR && (!sFlag);
// The m flag is on when either this instance or the static default enables it.
760 pt.mFlag = (mFlag | defaultMFlag);
761 pt.ignoreCase = ignoreCase;
// Any marks array left from an earlier search is walked element by element.
763 if (pt.marks != null)
765 for (int i = 0; i < pt.marks.length; i++)
771 pt.nMarks = numSubs_;
773 if (dontMatchInQuotes)
785 * Attempt to match a Pattern beginning at a specified location within the
788 * @see com.stevesoft.pat.Regex#search
790 public boolean matchAt(String s, int start_pos)
792 return _search(s, start_pos, start_pos);
796 * Attempt to match a Pattern beginning at a specified location within the
799 * @see com.stevesoft.pat.Regex#search
801 public boolean matchAt(StringLike s, int start_pos)
803 return _search(s, start_pos, start_pos);
807 * Search through a String for the first occurrence of a match.
809 * @see com.stevesoft.pat.Regex#searchFrom
810 * @see com.stevesoft.pat.Regex#matchAt
812 public boolean search(String s)
816 throw new NullPointerException(
818 .getString("exception.null_string_given_to_regex_search"));
820 return _search(s, 0, s.length());
823 public boolean search(StringLike sl)
827 throw new NullPointerException(
829 .getString("exception.null_string_like_given_to_regex_search"));
831 return _search(sl, 0, sl.length());
834 public boolean reverseSearch(String s)
838 throw new NullPointerException(
840 .getString("exception.null_string_given_to_regex_reverse_search"));
842 return _reverseSearch(s, 0, s.length());
845 public boolean reverseSearch(StringLike sl)
849 throw new NullPointerException(
851 .getString("exception.null_string_like_given_to_regex_reverse_search"));
853 return _reverseSearch(sl, 0, sl.length());
857 * Search through a String for the first occurrence of a match, but start at
864 public boolean searchFrom(String s, int start)
868 throw new NullPointerException(
870 .getString("exception.null_string_like_given_to_regex_search_from"));
872 return _search(s, start, s.length());
875 public boolean searchFrom(StringLike s, int start)
879 throw new NullPointerException(
881 .getString("exception.null_string_like_given_to_regex_search_from"));
883 return _search(s, start, s.length());
887 * Search through a region of a String for the first occurrence of a match.
889 public boolean searchRegion(String s, int start, int end)
893 throw new NullPointerException(
895 .getString("exception.null_string_like_given_to_regex_search_region"));
897 return _search(s, start, end);
901 * Set this to change the default behavior of the "." pattern. By default it
902 * now matches perl's behavior and fails to match the '\n' character.
904 public static boolean dotDoesntMatchCR = true;
910 boolean gFlag = false;
912 /** Set the 'g' flag */
913 public void setGFlag(boolean b)
918 /** Get the state of the 'g' flag. */
919 public boolean getGFlag()
924 boolean sFlag = false;
926 /** Get the state of the sFlag */
927 public boolean getSFlag()
932 boolean mFlag = false;
934 /** Get the state of the mFlag */
935 public boolean getMFlag()
940 final boolean _search(String s, int start, int end)
942 return _search(new StringWrap(s), start, end);
// Core forward search: tries thePattern.matchAt at successive indices from
// start up to `up`, recording the match position and length on success.
// Returns the value assigned to didMatch_.
945 final boolean _search(StringLike s, int start, int end)
// g-flag continuation: applies only when a previous g-flag match was
// recorded (gFlagto > 0, gFlags set) against this same underlying object
// (identity comparison of the unwrapped values).
947 if (gFlag && gFlagto > 0 && gFlags != null
948 && s.unwrap() == gFlags.unwrap())
954 Pthings pt = prep(s);
// Last index worth trying: end reduced by minMatch.i when minMatch is set.
956 int up = (minMatch == null ? end : end - minMatch.i);
958 if (up < start && end >= start)
// First scan loop: tests every index from start to up in turn.
965 for (int i = start; i <= up; i++)
967 charsMatched_ = thePattern.matchAt(s, i, pt);
968 if (charsMatched_ >= 0)
970 matchFrom_ = thePattern.mfrom;
// Remember where this match ended for a later g-flag search.
972 gFlagto = matchFrom_ + charsMatched_;
974 return didMatch_ = true;
// Second scan loop: skipper.find supplies the next index to test.
// NOTE(review): presumably used when a skipper is configured -- the
// selecting condition is not visible here.
981 for (int i = start; i <= up; i++)
983 i = skipper.find(src, i, up);
// Failure exit inside the skipper loop: match position and length are
// reset to -1.
986 charsMatched_ = matchFrom_ = -1;
987 return didMatch_ = false;
989 charsMatched_ = thePattern.matchAt(s, i, pt);
990 if (charsMatched_ >= 0)
992 matchFrom_ = thePattern.mfrom;
994 gFlagto = matchFrom_ + charsMatched_;
996 return didMatch_ = true;
1000 return didMatch_ = false;
1004 * final boolean _search(LongStringLike s,long start,long end) { if(gFlag &&
1005 * gFlagto > 0 && s==gFlags) start = gFlagto; gFlags = null;
1007 * Pthings pt=prep(s);
1009 * int up = end;//(minMatch == null ? end : end-minMatch.i);
1011 * if(up < start && end >= start) up = start;
1013 * if(skipper == null) { for(long i=start;i<=up;i++) { charsMatched_ =
1014 * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ =
1015 * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_;
1016 * return didMatch_=true; } } } else { pt.no_check = true; for(long
1017 * i=start;i<=up;i++) { i = skipper.find(src,i,up); if(i<0) { charsMatched_ =
1018 * matchFrom_ = -1; return didMatch_ = false; } charsMatched_ =
1019 * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ =
1020 * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_;
1021 * gFlags = s; return didMatch_=true; } else { i = s.adjustIndex(i); up =
1022 * s.adjustEnd(i); } } } return didMatch_=false; }
1025 boolean _reverseSearch(String s, int start, int end)
1027 return _reverseSearch(new StringWrap(s), start, end);
// Backward search: tries thePattern.matchAt at each index from end down to
// start and stops at the first index that matches. Returns the value assigned
// to didMatch_.
1030 boolean _reverseSearch(StringLike s, int start, int end)
// NOTE(review): unlike the forward _search, this condition dereferences
// gFlags without a `gFlags != null` guard, so it can throw a
// NullPointerException when the g flag is set, gFlagto > 0 and gFlags is
// null -- confirm whether that state is reachable.
1032 if (gFlag && gFlagto > 0 && s.unwrap() == gFlags.unwrap())
1037 Pthings pt = prep(s);
1038 for (int i = end; i >= start; i--)
1040 charsMatched_ = thePattern.matchAt(s, i, pt);
1041 if (charsMatched_ >= 0)
1043 matchFrom_ = thePattern.mfrom;
// For reverse searching the continuation point is the index just before the
// start of this match.
1045 gFlagto = matchFrom_ - 1;
1047 return didMatch_ = true;
1050 return didMatch_ = false;
1053 // This routine sets the cbits variable
1054 // of class Pattern. Cbits is true for
1055 // the bit corresponding to a character inside
1057 static StringLike lasts = null;
1059 static BitSet lastbs = null;
1061 static void setCbits(StringLike s, Pthings pt)
1068 BitSet bs = new BitSet(s.length());
1070 boolean setBit = false;
1071 for (int i = 0; i < s.length(); i++)
1077 char c = s.charAt(i);
1078 if (!setBit && c == '"')
1084 else if (!setBit && c == '\'')
1090 else if (setBit && c == qc)
1094 else if (setBit && c == '\\' && i + 1 < s.length())
1103 pt.cbits = lastbs = bs;
1107 // Wanted user to over-ride this in alpha version,
1108 // but it wasn't really necessary because of this trick:
1113 return getClass().newInstance();
1114 } catch (InstantiationException ie)
1117 } catch (IllegalAccessException iae)
1124 * Only needed for creating your own extensions of Regex. This method adds the
1125 * next Pattern in the chain of patterns or sets the Pattern if it is the
1128 protected void add(Pattern p2)
1142 * You only need to use this method if you are creating your own extensions to
1143 * Regex. compile1 compiles one Pattern element, it can be over-ridden to
1144 * allow the Regex compiler to understand new syntax. See deriv.java for an
1145 * example. This routine is the heart of class Regex. Rthings has one integer
1146 * member called intValue, it is used to keep track of the number of ()'s in
1149 * @exception com.stevesoft.pat.RegSyntax
1150 * is thrown when a nonsensical pattern is supplied. For
1151 * example, a pattern beginning with *.
1153 protected void compile1(StrPos sp, Rthings mk) throws RegSyntax
1158 add(matchBracket(sp));
1160 else if (sp.match('|'))
1168 p = new NullPattern();
1173 else if (sp.incMatch("(?<"))
1175 patInt i = sp.getPatInt();
1178 RegSyntaxError.endItAll("No int after (?<");
1180 add(new Backup(i.intValue()));
1183 RegSyntaxError.endItAll("No ) after (?<");
1186 else if (sp.incMatch("(?>"))
1188 patInt i = sp.getPatInt();
1191 RegSyntaxError.endItAll("No int after (?>");
1193 add(new Backup(-i.intValue()));
1196 RegSyntaxError.endItAll("No ) after (?<");
1199 else if (sp.incMatch("(?@"))
1207 RegSyntaxError.endItAll("(?@ does not have closing paren");
1209 add(new Group(op, cl));
1211 else if (sp.incMatch("(?#"))
1213 while (!sp.match(')'))
1218 else if (sp.dontMatch && sp.c == 'w')
1220 // Regex r = new Regex();
1221 // r._compile("[a-zA-Z0-9_]",mk);
1222 // add(new Goop("\\w",r.thePattern));
1223 Bracket b = new Bracket(false);
1224 b.addOr(new Range('a', 'z'));
1225 b.addOr(new Range('A', 'Z'));
1226 b.addOr(new Range('0', '9'));
1227 b.addOr(new oneChar('_'));
1230 else if (sp.dontMatch && sp.c == 'G')
1234 else if (sp.dontMatch && sp.c == 's')
1236 // Regex r = new Regex();
1237 // r._compile("[ \t\n\r\b]",mk);
1238 // add(new Goop("\\s",r.thePattern));
1239 Bracket b = new Bracket(false);
1240 b.addOr(new oneChar((char) 32));
1241 b.addOr(new Range((char) 8, (char) 10));
1242 b.addOr(new oneChar((char) 13));
1245 else if (sp.dontMatch && sp.c == 'd')
1247 // Regex r = new Regex();
1248 // r._compile("[0-9]",mk);
1249 // add(new Goop("\\d",r.thePattern));
1250 Range digit = new Range('0', '9');
1251 digit.printBrackets = true;
1254 else if (sp.dontMatch && sp.c == 'W')
1256 // Regex r = new Regex();
1257 // r._compile("[^a-zA-Z0-9_]",mk);
1258 // add(new Goop("\\W",r.thePattern));
1259 Bracket b = new Bracket(true);
1260 b.addOr(new Range('a', 'z'));
1261 b.addOr(new Range('A', 'Z'));
1262 b.addOr(new Range('0', '9'));
1263 b.addOr(new oneChar('_'));
1266 else if (sp.dontMatch && sp.c == 'S')
1268 // Regex r = new Regex();
1269 // r._compile("[^ \t\n\r\b]",mk);
1270 // add(new Goop("\\S",r.thePattern));
1271 Bracket b = new Bracket(true);
1272 b.addOr(new oneChar((char) 32));
1273 b.addOr(new Range((char) 8, (char) 10));
1274 b.addOr(new oneChar((char) 13));
1277 else if (sp.dontMatch && sp.c == 'D')
1279 // Regex r = new Regex();
1280 // r._compile("[^0-9]",mk);
1281 // add(new Goop("\\D",r.thePattern));
1282 Bracket b = new Bracket(true);
1283 b.addOr(new Range('0', '9'));
1286 else if (sp.dontMatch && sp.c == 'B')
1288 Regex r = new Regex();
1289 r._compile("(?!" + back_slash + "b)", mk);
1292 else if (isOctalString(sp))
1296 d = 8 * d + sp.c - '0';
1297 StrPos sp2 = new StrPos(sp);
1299 if (isOctalDigit(sp2, false))
1302 d = 8 * d + sp.c - '0';
1304 add(new oneChar((char) d));
1306 else if (sp.dontMatch && sp.c >= '1' && sp.c <= '9')
1308 int iv = sp.c - '0';
1309 StrPos s2 = new StrPos(sp);
1311 if (!s2.dontMatch && s2.c >= '0' && s2.c <= '9')
1313 iv = 10 * iv + (s2.c - '0');
1316 add(new BackMatch(iv));
1318 else if (sp.dontMatch && sp.c == 'b')
1320 add(new Boundary());
1322 else if (sp.match('\b'))
1324 add(new Boundary());
1326 else if (sp.match('$'))
1330 else if (sp.dontMatch && sp.c == 'Z')
1332 add(new End(false));
1334 else if (sp.match('.'))
1338 else if (sp.incMatch("(??"))
1340 StringBuffer sb = new StringBuffer();
1341 StringBuffer sb2 = new StringBuffer();
1342 while (!sp.match(')') && !sp.match(':'))
1347 if (sp.incMatch(":"))
1349 while (!sp.match(')'))
1355 String sbs = sb.toString();
1356 if (validators.get(sbs) instanceof String)
1358 String pat = (String) validators.get(sbs);
1359 Regex r = newRegex();
1360 Rthings rth = new Rthings(this);
1361 rth.noBackRefs = true;
1362 r._compile(pat, rth);
1367 Custom cm = new Custom(sb.toString());
1370 Validator v2 = cm.v.arg(sb2.toString());
1373 v2.argsave = sb2.toString();
1374 String p = cm.v.pattern;
1378 Regex r = newRegex();
1379 Rthings rth = new Rthings(this);
1380 rth.noBackRefs = true;
1381 r._compile(cm.v.pattern, rth);
1382 cm.sub = r.thePattern;
1383 cm.sub.add(new CustomEndpoint(cm));
1384 cm.sub.setParent(cm);
1389 else if (sp.match('('))
1392 Regex r = newRegex();
1395 if (sp.incMatch("?:"))
1399 else if (sp.incMatch("?="))
1401 r.or = new lookAhead(false);
1403 else if (sp.incMatch("?!"))
1405 r.or = new lookAhead(true);
1407 else if (sp.match('?'))
1414 mk.ignoreCase = true;
1418 mk.dontMatchInQuotes = true;
1422 mk.optimizeMe = true;
1437 } while (!sp.match(')') && !sp.eos);
1440 if (sp.eos) // throw new RegSyntax
1442 RegSyntaxError.endItAll("Unclosed ()");
1446 { // just ordinary parenthesis
1447 r.or = mk.noBackRefs ? new Or() : new OrMark(mk.val++);
1451 add(r._compile(sp, mk));
1454 else if (sp.match('^'))
1456 add(new Start(true));
1458 else if (sp.dontMatch && sp.c == 'A')
1460 add(new Start(false));
1462 else if (sp.match('*'))
1464 addMulti(new patInt(0), new patInf());
1466 else if (sp.match('+'))
1468 addMulti(new patInt(1), new patInf());
1470 else if (sp.match('?'))
1472 addMulti(new patInt(0), new patInt(1));
1474 else if (sp.match('{'))
1476 boolean bad = false;
1477 StrPos sp2 = new StrPos(sp);
1478 // StringBuffer sb = new StringBuffer();
1480 patInt i1 = sp.getPatInt();
1491 * RegSyntaxError.endItAll( "String \"{"+i2+ "\" should be followed
1503 i2 = sp.getPatInt();
1506 if (i1 == null || i2 == null)
1509 * throw new RegSyntax("Badly formatted Multi: " +"{"+i1+","+i2+"}");
1516 add(new oneChar(sp.c));
1523 else if (sp.escMatch('x') && next2Hex(sp))
1526 int d = getHexDigit(sp);
1528 d = 16 * d + getHexDigit(sp);
1529 add(new oneChar((char) d));
1531 else if (sp.escMatch('c'))
1534 if (sp.c < Ctrl.cmap.length)
1536 add(new oneChar(Ctrl.cmap[sp.c]));
1540 add(new oneChar(sp.c));
1543 else if (sp.escMatch('f'))
1545 add(new oneChar((char) 12));
1547 else if (sp.escMatch('a'))
1549 add(new oneChar((char) 7));
1551 else if (sp.escMatch('t'))
1553 add(new oneChar('\t'));
1555 else if (sp.escMatch('n'))
1557 add(new oneChar('\n'));
1559 else if (sp.escMatch('r'))
1561 add(new oneChar('\r'));
1563 else if (sp.escMatch('b'))
1565 add(new oneChar('\b'));
1567 else if (sp.escMatch('e'))
1569 add(new oneChar((char) 27));
1573 add(new oneChar(sp.c));
1576 RegSyntaxError.endItAll("Unmatched right paren in pattern");
1581 // compiles all Pattern elements, internal method
1582 private Pattern _compile(String pat, Rthings mk) throws RegSyntax
1585 sFlag = mFlag = ignoreCase = gFlag = false;
1586 StrPos sp = new StrPos(pat, 0);
1587 thePattern = _compile(sp, mk);
1596 Pattern _compile(StrPos sp, Rthings mk) throws RegSyntax
1598 while (!(sp.eos || (or != null && sp.match(')'))))
1607 else if (sp.eos && mk.parenLevel != 0)
1609 RegSyntaxError.endItAll("Unclosed Parenthesis! lvl=" + mk.parenLevel);
1615 p = new NullPattern();
1620 return p == null ? new NullPattern() : p;
1623 // add a multi object to the end of the chain
1624 // which applies to the last object
1625 void addMulti(patInt i1, patInt i2) throws RegSyntax
1627 Pattern last, last2;
1628 for (last = p; last != null && last.next != null; last = last.next)
1632 if (last == null || last == p)
1638 for (last2 = p; last2.next != last; last2 = last2.next)
1643 if (last instanceof Multi && i1.intValue() == 0 && i2.intValue() == 1)
1645 ((Multi) last).matchFewest = true;
1647 else if (last instanceof FastMulti && i1.intValue() == 0
1648 && i2.intValue() == 1)
1650 ((FastMulti) last).matchFewest = true;
1652 else if (last instanceof DotMulti && i1.intValue() == 0
1653 && i2.intValue() == 1)
1655 ((DotMulti) last).matchFewest = true;
1657 else if (last instanceof Multi || last instanceof DotMulti
1658 || last instanceof FastMulti)
1660 throw new RegSyntax("Syntax error.");
1662 else if (last2 == null)
1664 p = mkMulti(i1, i2, p);
1668 last2.next = mkMulti(i1, i2, last);
1672 final static Pattern mkMulti(patInt lo, patInt hi, Pattern p)
1675 if (p instanceof Any && p.next == null)
1677 return new DotMulti(lo, hi);
1679 return RegOpt.safe4fm(p) ? (Pattern) new FastMulti(lo, hi, p)
1680 : (Pattern) new Multi(lo, hi, p);
1683 // process the bracket operator
// Builds a Bracket from the text at sp, adding one alternative (addOr) per
// character, range or escape until ']' or end-of-string is reached.
1684 Pattern matchBracket(StrPos sp) throws RegSyntax
// NOTE(review): the boolean passed to Bracket presumably selects a negated
// class -- confirm against the Bracket constructor.
1689 ret = new Bracket(true);
1694 ret = new Bracket(false);
1698 // throw new RegSyntax
1699 RegSyntaxError.endItAll("Unmatched []");
1702 while (!sp.eos && !sp.match(']'))
// s1, s1_ and s2 are copies of the position used to look ahead for "x-y".
1704 StrPos s1 = new StrPos(sp);
1706 StrPos s1_ = new StrPos(s1);
// A '-' counts as a range operator only when s1_ does not match ']'.
1708 if (s1.match('-') && !s1_.match(']'))
1710 StrPos s2 = new StrPos(s1);
1714 ret.addOr(new Range(sp.c, s2.c));
// \Q ... \E: every character up to the \E escape is added literally.
1719 else if (sp.escMatch('Q'))
1722 while (!sp.escMatch('E'))
1724 ret.addOr(new oneChar(sp.c));
// \d: the ASCII digits.
1728 else if (sp.escMatch('d'))
1730 ret.addOr(new Range('0', '9'));
// \s: space (32), chars 8..10 (backspace, tab, line feed) and CR (13).
// NOTE(review): char 8 is backspace, which is not normally whitespace --
// confirm it is intended.
1732 else if (sp.escMatch('s'))
1734 ret.addOr(new oneChar((char) 32));
1735 ret.addOr(new Range((char) 8, (char) 10));
1736 ret.addOr(new oneChar((char) 13));
// \w: ASCII letters, ASCII digits and underscore.
1738 else if (sp.escMatch('w'))
1740 ret.addOr(new Range('a', 'z'));
1741 ret.addOr(new Range('A', 'Z'));
1742 ret.addOr(new Range('0', '9'));
1743 ret.addOr(new oneChar('_'));
// \D: everything except '0'..'9' (chars 48..57).
1745 else if (sp.escMatch('D'))
1747 ret.addOr(new Range((char) 0, (char) 47));
1748 ret.addOr(new Range((char) 58, (char) 65535));
// \S: everything except the characters added for \s above.
1750 else if (sp.escMatch('S'))
1752 ret.addOr(new Range((char) 0, (char) 7));
1753 ret.addOr(new Range((char) 11, (char) 12));
1754 ret.addOr(new Range((char) 14, (char) 31));
1755 ret.addOr(new Range((char) 33, (char) 65535));
// \W: skips 'A'..'Z' (65..90), '_' (95) and 'a'..'z' (97..122).
// NOTE(review): the first range 0..64 still contains the digits 48..57,
// which \w above also accepts, so \W is not the exact complement of \w --
// confirm whether this is intended.
1757 else if (sp.escMatch('W'))
1759 ret.addOr(new Range((char) 0, (char) 64));
1760 ret.addOr(new Range((char) 91, (char) 94));
1761 ret.addOr(new oneChar((char) 96));
1762 ret.addOr(new Range((char) 123, (char) 65535));
// \x followed by two hex digits: d = 16 * first + second.
1764 else if (sp.escMatch('x') && next2Hex(sp))
1767 int d = getHexDigit(sp);
1769 d = 16 * d + getHexDigit(sp);
1770 ret.addOr(new oneChar((char) d));
// \a: char 7 (bell).
1772 else if (sp.escMatch('a'))
1774 ret.addOr(new oneChar((char) 7));
// \f: char 12 (form feed).
1776 else if (sp.escMatch('f'))
1778 ret.addOr(new oneChar((char) 12));
// \e: char 27 (escape).
1780 else if (sp.escMatch('e'))
1782 ret.addOr(new oneChar((char) 27));
1784 else if (sp.escMatch('n'))
1786 ret.addOr(new oneChar('\n'));
1788 else if (sp.escMatch('t'))
1790 ret.addOr(new oneChar('\t'));
1792 else if (sp.escMatch('r'))
1794 ret.addOr(new oneChar('\r'));
// \c: the character is translated through Ctrl.cmap when it indexes into
// that table, and added unchanged otherwise.
1796 else if (sp.escMatch('c'))
1799 if (sp.c < Ctrl.cmap.length)
1801 ret.addOr(new oneChar(Ctrl.cmap[sp.c]));
1805 ret.addOr(new oneChar(sp.c));
// Octal escape: d is accumulated base 8, one digit per step.
1808 else if (isOctalString(sp))
1812 d = 8 * d + sp.c - '0';
// sp2 is a copy used to test for a further octal digit.
1813 StrPos sp2 = new StrPos(sp);
1815 if (isOctalDigit(sp2, false))
1818 d = 8 * d + sp.c - '0';
1820 ret.addOr(new oneChar((char) d));
// Anything else is an ordinary literal character.
1824 ret.addOr(new oneChar(sp.c));
1832 * Converts the stored Pattern to a String -- this is a decompile. Note that
1833 * \t and \n will really print out here, Not just the two character
1834 * representations. Also be prepared to see some strange output if your
1835 * characters are not printable.
// Rebuilds the textual form of the pattern in a StringBuffer.
1838 public String toString()
// `false &&` short-circuits, so this null guard is disabled and never runs.
1840 if (false && thePattern == null)
1846 StringBuffer sb = new StringBuffer();
// A non-default escape character is in use.
1847 if (esc != Pattern.ESC)
// At least one flag or option is set, or the pattern has been optimized.
1853 if (gFlag || mFlag || !dotDoesntMatchCR || sFlag || ignoreCase
1854 || dontMatchInQuotes || optimized())
1865 if (sFlag || !dotDoesntMatchCR)
1869 if (dontMatchInQuotes)
// NOTE(review): thePattern is dereferenced without a null check, and the
// guard at the top is disabled -- confirm thePattern cannot be null here.
1883 String patstr = thePattern.toString();
// Rewrite the decompiled text from the default escape to the custom one.
1884 if (esc != Pattern.ESC)
1886 patstr = reEscape(patstr, Pattern.ESC, esc);
1889 return sb.toString();
1893 // Re-escape Pattern, allows us to use a different escape
// s is scanned one character at a time and the rewritten text is collected
// in a StringBuffer, which is returned as a String.
1895 static String reEscape(String s, char oldEsc, char newEsc)
// Same escape character on both sides.
1897 if (oldEsc == newEsc)
1902 StringBuffer sb = new StringBuffer();
1903 for (i = 0; i < s.length(); i++)
// An old escape that still has a character after it.
1905 if (s.charAt(i) == oldEsc && i + 1 < s.length())
// A doubled old escape.
1907 if (s.charAt(i + 1) == oldEsc)
1914 sb.append(s.charAt(i + 1));
// An occurrence of the new escape character in the input.
1918 else if (s.charAt(i) == newEsc)
// Every other character is copied through unchanged.
1925 sb.append(s.charAt(i));
1928 return sb.toString();
1932 * This method implements FilenameFilter, allowing one to use a Regex to
1933 * search through a directory using File.list. There is a FileRegex now that
1936 * @see com.stevesoft.pat.FileRegex
1939 public boolean accept(File dir, String s)
1944 /** The version of this package */
// Returns a fixed string; nothing is computed at run time.
1945 final static public String version()
1947 return "lgpl release 1.5.3";
1951 * Once this method is called, the state of variables ignoreCase and
1952 * dontMatchInQuotes should not be changed as the results will be
1953 * unpredictable. However, search and matchAt will run more quickly. Note that
1954 * you can check to see if the pattern has been optimized by calling the
1955 * optimized() method.
1957 * This method will attempt to rewrite your pattern in a way that makes it
1958 * faster (not all patterns execute at the same speed). In general,
1959 * "(?: ... )" will be faster than "( ... )" so if you don't need the
1960 * backreference, you should group using the former pattern.
1962 * It will also introduce new pattern elements that you can't get to
1963 * otherwise, for example if you have a large table of strings, i.e. the
1964 * months of the year "(January|February|...)" optimize() will make a
1965 * Hashtable that takes it to the next appropriate pattern element --
1966 * eliminating the need for a linear search.
1968 * @see com.stevesoft.pat.Regex#optimized
1969 * @see com.stevesoft.pat.Regex#ignoreCase
1970 * @see com.stevesoft.pat.Regex#dontMatchInQuotes
1971 * @see com.stevesoft.pat.Regex#matchAt
1972 * @see com.stevesoft.pat.Regex#search
// Replaces thePattern with the result of RegOpt.opt and sets skipper from
// Skip.findSkip.
1974 public void optimize()
// Guard: the pattern is already optimized, or there is no pattern.
1976 if (optimized() || thePattern == null)
// A non-null minMatch is what optimized() tests for.
1980 minMatch = new patInt(0); // thePattern.countMinChars();
1981 thePattern = RegOpt.opt(thePattern, ignoreCase, dontMatchInQuotes);
1982 skipper = Skip.findSkip(this);
1983 // RegOpt.setParents(this);
1990 * This function returns true if the optimize method has been called.
// True once minMatch is non-null; optimize() is what assigns it.
1992 public boolean optimized()
1994 return minMatch != null;
1998 * A bit of syntactic sugar for those who want to make their code look more
1999 * perl-like. To use this initialize your Regex object by saying:
2002 * Regex r1 = Regex.perlCode("s/hello/goodbye/");
2003 * Regex r2 = Regex.perlCode("s'fish'frog'i");
2004 * Regex r3 = Regex.perlCode("m'hello'");
2007 * The i for ignoreCase is supported in this syntax, as well as m, s, and x.
2008 * The g flag is a bit of a special case.
2010 * If you wish to replace all occurrences of a pattern, you do not put a 'g' in
2011 * the perlCode, but call Regex's replaceAll method.
2013 * If you wish to simply and only do a search for r2's pattern, you can do
2014 * this by calling the searchFrom method repeatedly, or by calling
2015 * search repeatedly if the g flag is set.
2017 * Note: Currently perlCode does <em>not</em> support the (?e=#) syntax for
2018 * changing the escape character.
// Thin wrapper: the parsing itself is done by parsePerl.parse.
2021 public static Regex perlCode(String s)
2023 // this file is big enough, see parsePerl.java
2024 // for this function.
2025 return parsePerl.parse(s);
// The single backslash character.
2028 static final char back_slash = '\\';
2031 * Checks to see if there are only literal and no special pattern elements in
// Inspects elements starting at thePattern, testing each against the
// oneChar and Skipped element types.
2034 public boolean isLiteral()
2036 Pattern x = thePattern;
2039 if (x instanceof oneChar)
2043 else if (x instanceof Skipped)
2057 * You only need to know about this if you are inventing your own pattern
// Delegates to thePattern.countMinChars().
// NOTE(review): thePattern is not null-checked here, although optimize()
// allows for thePattern == null -- confirm callers never hit that state.
2060 public patInt countMinChars()
2062 return thePattern.countMinChars();
2066 * You only need to know about this if you are inventing your own pattern
// Delegates to thePattern.countMaxChars().
// NOTE(review): thePattern is not null-checked here, although optimize()
// allows for thePattern == null -- confirm callers never hit that state.
2069 public patInt countMaxChars()
2071 return thePattern.countMaxChars();
// Tests whether sp.c is a hexadecimal digit: '0'..'9', 'a'..'f' or 'A'..'F'.
2074 boolean isHexDigit(StrPos sp)
2078 && ((sp.c >= '0' && sp.c <= '9')
2079 || (sp.c >= 'a' && sp.c <= 'f') || (sp.c >= 'A' && sp.c <= 'F'));
// Tests the character at sp for being an octal digit.
// `first` must equal sp.dontMatch (their XOR must be false) for a match.
// NOTE(review): sp.dontMatch presumably marks an escaped character, so the
// first digit must be escaped and later ones must not be -- confirm in StrPos.
2083 boolean isOctalDigit(StrPos sp, boolean first)
2085 boolean r = !sp.eos && !(first ^ sp.dontMatch) && sp.c >= '0'
// Converts the hex digit in sp.c to its numeric value.
// There is no validation: anything that is not '0'..'9' or 'a'..'f' is
// treated as an upper-case digit, so sp.c should already have been checked
// with isHexDigit.
2090 int getHexDigit(StrPos sp)
2092 if (sp.c >= '0' && sp.c <= '9')
// 'a'..'f' map to 10..15.
2096 if (sp.c >= 'a' && sp.c <= 'f')
2098 return sp.c - 'a' + 10;
// Fallthrough: 'A'..'F' map to 10..15.
2100 return sp.c - 'A' + 10;
// Applies isHexDigit twice to sp2, a copy of sp, so the caller's sp is
// never passed to the check itself.
2103 boolean next2Hex(StrPos sp)
2105 StrPos sp2 = new StrPos(sp);
2107 if (!isHexDigit(sp2))
2112 if (!isHexDigit(sp2))
2119 boolean isOctalString(StrPos sp)
2121 if (!isOctalDigit(sp, true))
2125 StrPos sp2 = new StrPos(sp);
2127 if (!isOctalDigit(sp2, false))