1 /*******************************************************************************
2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $(date) The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
20 *******************************************************************************/
22 // This software is now distributed according to
23 // the Lesser Gnu Public License. Please see
24 // http://www.gnu.org/copyleft/lesser.txt for
26 // -- Happy Computing!
28 package com.stevesoft.pat;
30 import jalview.util.MessageManager;
33 import java.io.FilenameFilter;
34 import java.util.BitSet;
35 import java.util.Hashtable;
37 import com.stevesoft.pat.wrap.StringWrap;
39 /** Matches a Unicode punctuation character. */
40 class UnicodePunct extends UniValidator
42 public int validate(StringLike s, int from, int to)
44 return from < s.length() && Prop.isPunct(s.charAt(from)) ? to : -1;
48 /** Matches a Unicode white space character. */
49 class UnicodeWhite extends UniValidator
51 public int validate(StringLike s, int from, int to)
53 return from < s.length() && Prop.isWhite(s.charAt(from)) ? to : -1;
58 * Matches a character that is not a Unicode punctuation character.
60 class NUnicodePunct extends UniValidator
62 public int validate(StringLike s, int from, int to)
64 return from < s.length() && !Prop.isPunct(s.charAt(from)) ? to : -1;
69 * Matches a character that is not a Unicode white space character.
71 class NUnicodeWhite extends UniValidator
73 public int validate(StringLike s, int from, int to)
75 return from < s.length() && !Prop.isWhite(s.charAt(from)) ? to : -1;
79 /** Matches a Unicode word character: an alphanumeric or underscore. */
80 class UnicodeW extends UniValidator
82 public int validate(StringLike s, int from, int to)
84 if (from >= s.length())
88 char c = s.charAt(from);
89 return (Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to
94 /** Matches a character that is not a Unicode alphanumeric or underscore. */
95 class NUnicodeW extends UniValidator
97 public int validate(StringLike s, int from, int to)
99 if (from >= s.length())
103 char c = s.charAt(from);
104 return !(Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to
109 /** Matches a Unicode decimal digit. */
110 class UnicodeDigit extends UniValidator
112 public int validate(StringLike s, int from, int to)
114 return from < s.length() && Prop.isDecimalDigit(s.charAt(from)) ? to
119 /** Matches a character that is not a Unicode digit. */
120 class NUnicodeDigit extends UniValidator
122 public int validate(StringLike s, int from, int to)
124 return from < s.length() && !Prop.isDecimalDigit(s.charAt(from)) ? to
129 /** Matches a Unicode math character. */
130 class UnicodeMath extends UniValidator
132 public int validate(StringLike s, int from, int to)
134 return from < s.length() && Prop.isMath(s.charAt(from)) ? to : -1;
138 /** Matches a non-math Unicode character. */
139 class NUnicodeMath extends UniValidator
141 public int validate(StringLike s, int from, int to)
143 return from < s.length() && !Prop.isMath(s.charAt(from)) ? to : -1;
147 /** Matches a Unicode currency symbol. */
148 class UnicodeCurrency extends UniValidator
150 public int validate(StringLike s, int from, int to)
152 return from < s.length() && Prop.isCurrency(s.charAt(from)) ? to : -1;
156 /** Matches a non-currency symbol Unicode character. */
157 class NUnicodeCurrency extends UniValidator
159 public int validate(StringLike s, int from, int to)
161 return from < s.length() && !Prop.isCurrency(s.charAt(from)) ? to : -1;
165 /** Matches a Unicode alphabetic character. */
166 class UnicodeAlpha extends UniValidator
168 public int validate(StringLike s, int from, int to)
170 return from < s.length() && Prop.isAlphabetic(s.charAt(from)) ? to : -1;
174 /** Matches a non-alphabetic Unicode character. */
175 class NUnicodeAlpha extends UniValidator
177 public int validate(StringLike s, int from, int to)
179 return from < s.length() && !Prop.isAlphabetic(s.charAt(from)) ? to
184 /** Matches an upper case Unicode character. */
185 class UnicodeUpper extends UniValidator
187 public int validate(StringLike s, int from, int to)
189 return from < s.length() && isUpper(s.charAt(from)) ? to : -1;
192 final boolean isUpper(char c)
194 return c == CaseMgr.toUpperCase(c) && c != CaseMgr.toLowerCase(c);
198 /** Matches an upper case Unicode character. */
199 class UnicodeLower extends UniValidator
201 public int validate(StringLike s, int from, int to)
203 return from < s.length() && isLower(s.charAt(from)) ? to : -1;
206 final boolean isLower(char c)
208 return c != CaseMgr.toUpperCase(c) && c == CaseMgr.toLowerCase(c);
213 * Regex provides the parser which constructs the linked list of Pattern classes
216 * For the purpose of this documentation, the fact that java interprets the
217 * backslash will be ignored. In practice, however, you will need a double
218 * backslash to obtain a string that contains a single backslash character.
219 * Thus, the example pattern "\b" should really be typed as "\\b" inside java
222 * Note that Regex is part of package "com.stevesoft.pat". To use it, simply
223 * import com.stevesoft.pat.Regex at the top of your file.
225 * Regex is made with a constructor that takes a String that defines the regular
226 * expression. Thus, for example
229 * Regex r = new Regex("[a-c]*");
232 * matches any number of characters so long as they are 'a', 'b', or 'c').
234 * To attempt to match the Pattern to a given string, you can use either the
235 * search(String) member function, or the matchAt(String,int position) member
236 * function. These functions return a boolean which tells you whether or not the
237 * thing worked, and sets the methods "charsMatched()" and "matchedFrom()" in
238 * the Regex object appropriately.
240 * The portion of the string before the match can be obtained by the left()
241 * member, and the portion after the match can be obtained by the right()
244 * Essentially, this package implements a syntax that is very much like the perl
245 * 5 regular expression syntax.
250 * Regex r = new Regex("x(a|b)y");
251 * r.matchAt("xay", 0);
252 * System.out.println("sub = " + r.stringMatched(1));
255 * The above would print "sub = a".
258 * r.left() // would return "x"
259 * r.right() // would return "y"
263 * Differences between this package and perl5:<br>
264 * The extended Pattern for setting flags, is now supported, but the flags are
265 * different. "(?i)" tells the pattern to ignore case, "(?Q)" sets the
266 * "dontMatchInQuotes" flag, and "(?iQ)" sets them both. You can change the
267 * escape character. The pattern
279 * , but note that the sequence
285 * <b>must</b> occur at the very beginning of the pattern. There may be other
286 * small differences as well. I will either make my package conform or note them
287 * as I become aware of them.
289 * This package supports additional patterns not in perl5: <center>
294 * <td>This matches all characters between the '(' character and the balancing
295 * ')' character. Thus, it will match "()" as well as "(())". The balancing
296 * characters are arbitrary, thus (?@{}) matches on "{}" and "{{}}".</td>
300 * <td>Moves the pointer backwards within the text. This allows you to make a
301 * "look behind." It fails if it attempts to move to a position before the
302 * beginning of the string. "x(?<1)" is equivalent to "(?=x)". The number, 1
303 * in this example, is the number of characters to move backwards.</td>
307 * @author Steven R. Brandt
308 * @version package com.stevesoft.pat, release 1.5.3
311 public class Regex extends RegRes implements FilenameFilter
314 * BackRefOffset gives the identity number of the first pattern. Version 1.0
315 * used zero, version 1.1 uses 1 to be more compatible with perl.
317 static int BackRefOffset = 1;
319 private static Pattern none = new NoPattern();
321 Pattern thePattern = none;
323 patInt minMatch = new patInt(0);
325 static Hashtable validators = new Hashtable();
328 define("p", "(?>1)", new UnicodePunct());
329 define("P", "(?>1)", new NUnicodePunct());
330 define("s", "(?>1)", new UnicodeWhite());
331 define("S", "(?>1)", new NUnicodeWhite());
332 define("w", "(?>1)", new UnicodeW());
333 define("W", "(?>1)", new NUnicodeW());
334 define("d", "(?>1)", new UnicodeDigit());
335 define("D", "(?>1)", new NUnicodeDigit());
336 define("m", "(?>1)", new UnicodeMath());
337 define("M", "(?>1)", new NUnicodeMath());
338 define("c", "(?>1)", new UnicodeCurrency());
339 define("C", "(?>1)", new NUnicodeCurrency());
340 define("a", "(?>1)", new UnicodeAlpha());
341 define("A", "(?>1)", new NUnicodeAlpha());
342 define("uc", "(?>1)", new UnicodeUpper());
343 define("lc", "(?>1)", new UnicodeLower());
346 /** Set the dontMatch in quotes flag. */
347 public void setDontMatchInQuotes(boolean b)
349 dontMatchInQuotes = b;
352 /** Find out if the dontMatchInQuotes flag is enabled. */
353 public boolean getDontMatchInQuotes()
355 return dontMatchInQuotes;
358 boolean dontMatchInQuotes = false;
361 * Set the state of the ignoreCase flag. If set to true, then the pattern
362 * matcher will ignore case when searching for a match.
364 public void setIgnoreCase(boolean b)
370 * Get the state of the ignoreCase flag. Returns true if we are ignoring the
371 * case of the pattern, false otherwise.
373 public boolean getIgnoreCase()
378 boolean ignoreCase = false;
380 static boolean defaultMFlag = false;
383 * Set the default value of the m flag. If it is set to true, then the MFlag
384 * will be on for any regex search executed.
386 public static void setDefaultMFlag(boolean mFlag)
388 defaultMFlag = mFlag;
392 * Get the default value of the m flag. If it is set to true, then the MFlag
393 * will be on for any regex search executed.
395 public static boolean getDefaultMFlag()
401 * Initializes the object without a Pattern. To supply a Pattern use
404 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
411 * Create and compile a Regex, but do not throw any exceptions. If you wish to
412 * have exceptions thrown for syntax errors, you must use the Regex(void)
413 * constructor to create the Regex object, and then call the compile method.
414 * Therefore, you should only call this method when you know your pattern is
415 * right. I will probably become more like
417 * @see com.stevesoft.pat.Regex#search(java.lang.String)
418 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
420 public Regex(String s)
425 } catch (RegSyntax rs)
430 ReplaceRule rep = null;
433 * Create and compile both a Regex and a ReplaceRule.
435 * @see com.stevesoft.pat.ReplaceRule
436 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
438 public Regex(String s, String rp)
441 rep = ReplaceRule.perlCode(rp);
445 * Create and compile a Regex, but give it the ReplaceRule specified. This
446 * allows the user finer control of the Replacement process, if that is
449 * @see com.stevesoft.pat.ReplaceRule
450 * @see com.stevesoft.pat.Regex#compile(java.lang.String)
452 public Regex(String s, ReplaceRule rp)
459 * Change the ReplaceRule of this Regex by compiling a new one using String
462 public void setReplaceRule(String rp)
464 rep = ReplaceRule.perlCode(rp);
465 repr = null; // Clear Replacer history
468 /** Change the ReplaceRule of this Regex to rp. */
469 public void setReplaceRule(ReplaceRule rp)
475 * Test to see if a custom defined rule exists.
477 * @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator)
479 public static boolean isDefined(String nm)
481 return validators.get(nm) != null;
485 * Removes a custom defined rule.
487 * @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator)
489 public static void undefine(String nm)
491 validators.remove(nm);
495 * Defines a method to create a new rule. See test/deriv2.java and
496 * test/deriv3.java for examples of how to use it.
498 public static void define(String nm, String pat, Validator v)
501 validators.put(nm, v);
505 * Defines a shorthand for a pattern. The pattern will be invoked by a string
506 * that has the form "(??"+nm+")".
508 public static void define(String nm, String pat)
510 validators.put(nm, pat);
513 /** Get the current ReplaceRule. */
514 public ReplaceRule getReplaceRule()
519 Replacer repr = null;
521 final Replacer _getReplacer()
523 return repr == null ? repr = new Replacer() : repr;
526 public Replacer getReplacer()
530 repr = new Replacer();
538 * Replace the first occurence of this pattern in String s according to the
541 * @see com.stevesoft.pat.ReplaceRule
542 * @see com.stevesoft.pat.Regex#getReplaceRule()
544 public String replaceFirst(String s)
546 return _getReplacer().replaceFirstRegion(s, this, 0, s.length())
551 * Replace the first occurence of this pattern in String s beginning with
552 * position pos according to the ReplaceRule.
554 * @see com.stevesoft.pat.ReplaceRule
555 * @see com.stevesoft.pat.Regex#getReplaceRule()
557 public String replaceFirstFrom(String s, int pos)
559 return _getReplacer().replaceFirstRegion(s, this, pos, s.length())
564 * Replace the first occurence of this pattern in String s beginning with
565 * position start and ending with end according to the ReplaceRule.
567 * @see com.stevesoft.pat.ReplaceRule
568 * @see com.stevesoft.pat.Regex#getReplaceRule()
570 public String replaceFirstRegion(String s, int start, int end)
572 return _getReplacer().replaceFirstRegion(s, this, start, end)
577 * Replace all occurences of this pattern in String s according to the
580 * @see com.stevesoft.pat.ReplaceRule
581 * @see com.stevesoft.pat.Regex#getReplaceRule()
583 public String replaceAll(String s)
585 return _getReplacer().replaceAllRegion(s, this, 0, s.length())
589 public StringLike replaceAll(StringLike s)
591 return _getReplacer().replaceAllRegion(s, this, 0, s.length());
595 * Replace all occurences of this pattern in String s beginning with position
596 * pos according to the ReplaceRule.
598 * @see com.stevesoft.pat.ReplaceRule
599 * @see com.stevesoft.pat.Regex#getReplaceRule()
601 public String replaceAllFrom(String s, int pos)
603 return _getReplacer().replaceAllRegion(s, this, pos, s.length())
608 * Replace all occurences of this pattern in String s beginning with position
609 * start and ending with end according to the ReplaceRule.
611 * @see com.stevesoft.pat.ReplaceRule
612 * @see com.stevesoft.pat.Regex#getReplaceRule()
614 public String replaceAllRegion(String s, int start, int end)
616 return _getReplacer().replaceAllRegion(s, this, start, end).toString();
619 /** Essentially clones the Regex object */
620 public Regex(Regex r)
623 dontMatchInQuotes = r.dontMatchInQuotes;
625 ignoreCase = r.ignoreCase;
633 rep = (ReplaceRule) r.rep.clone();
636 * try { compile(r.toString()); } catch(RegSyntax r_) {}
638 thePattern = r.thePattern.clone(new Hashtable());
639 minMatch = r.minMatch;
644 * By default, the escape character is the backslash, but you can make it
645 * anything you want by setting this variable.
647 public char esc = Pattern.ESC;
650 * This method compiles a regular expression, making it possible to call the
651 * search or matchAt methods.
653 * @exception com.stevesoft.pat.RegSyntax
654 * is thrown if a syntax error is encountered in the pattern. For
655 * example, "x{3,1}" or "*a" are not valid patterns.
656 * @see com.stevesoft.pat.Regex#search
657 * @see com.stevesoft.pat.Regex#matchAt
659 public void compile(String prepat) throws RegSyntax
661 String postpat = parsePerl.codify(prepat, true);
662 String pat = postpat == null ? prepat : postpat;
665 dontMatchInQuotes = false;
666 Rthings mk = new Rthings(this);
672 minMatch = new patInt(0);
673 StrPos sp = new StrPos(pat, 0);
674 if (sp.incMatch("(?e="))
680 newpat = reEscape(pat.substring(6), newEsc, Pattern.ESC);
683 else if (esc != Pattern.ESC)
685 newpat = reEscape(pat, esc, Pattern.ESC);
687 thePattern = _compile(newpat, mk);
688 numSubs_ = mk.val - offset;
693 * If a Regex is compared against a Regex, a check is done to see that the
694 * patterns are equal as well as the most recent match. If a Regex is compared
695 * with a RegRes, only the result of the most recent match is compared.
697 public boolean equals(Object o)
699 if (o instanceof Regex)
701 if (toString().equals(o.toString()))
703 return super.equals(o);
712 return super.equals(o);
716 /** A clone by any other name would smell as sweet. */
717 public Object clone()
719 return new Regex(this);
722 /** Return a clone of the underlying RegRes object. */
723 public RegRes result()
725 return (RegRes) super.clone();
728 // prep sets global variables of class
729 // Pattern so that it can access them
730 // during an attempt at a match
731 Pthings pt = new Pthings();
// Loads this Regex's current flags and sub-pattern count into the shared
// Pthings instance pt before a match attempt on s.
733 final Pthings prep(StringLike s)
736 pt.lastPos = matchedTo();
// NOTE(review): the right-hand side tests src for null but then calls
// s.unwrap(), so whenever both s and src are non-null the two operands are
// the same call on the same object. This looks like it was meant to be
// src.unwrap() (compare the new text with the previous one) - confirm and fix.
741 if ((s == null ? null : s.unwrap()) != (src == null ? null : s.unwrap()))
// '.' refuses CR only if the static default asks for it and the s flag is off
746 pt.dotDoesntMatchCR = dotDoesntMatchCR && (!sFlag);
// the m flag is in effect if either this Regex or the static default sets it
747 pt.mFlag = (mFlag | defaultMFlag);
748 pt.ignoreCase = ignoreCase;
750 if (pt.marks != null)
752 for (int i = 0; i < pt.marks.length; i++)
758 pt.nMarks = numSubs_;
760 if (dontMatchInQuotes)
772 * Attempt to match a Pattern beginning at a specified location within the
775 * @see com.stevesoft.pat.Regex#search
777 public boolean matchAt(String s, int start_pos)
779 return _search(s, start_pos, start_pos);
783 * Attempt to match a Pattern beginning at a specified location within the
786 * @see com.stevesoft.pat.Regex#search
788 public boolean matchAt(StringLike s, int start_pos)
790 return _search(s, start_pos, start_pos);
794 * Search through a String for the first occurrence of a match.
796 * @see com.stevesoft.pat.Regex#searchFrom
797 * @see com.stevesoft.pat.Regex#matchAt
799 public boolean search(String s)
803 throw new NullPointerException(
805 .getString("exception.null_string_given_to_regex_search"));
807 return _search(s, 0, s.length());
810 public boolean search(StringLike sl)
814 throw new NullPointerException(
816 .getString("exception.null_string_like_given_to_regex_search"));
818 return _search(sl, 0, sl.length());
821 public boolean reverseSearch(String s)
825 throw new NullPointerException(
827 .getString("exception.null_string_given_to_regex_reverse_search"));
829 return _reverseSearch(s, 0, s.length());
832 public boolean reverseSearch(StringLike sl)
836 throw new NullPointerException(
838 .getString("exception.null_string_like_given_to_regex_reverse_search"));
840 return _reverseSearch(sl, 0, sl.length());
844 * Search through a String for the first occurence of a match, but start at
851 public boolean searchFrom(String s, int start)
855 throw new NullPointerException(
857 .getString("exception.null_string_like_given_to_regex_search_from"));
859 return _search(s, start, s.length());
862 public boolean searchFrom(StringLike s, int start)
866 throw new NullPointerException(
868 .getString("exception.null_string_like_given_to_regex_search_from"));
870 return _search(s, start, s.length());
874 * Search through a region of a String for the first occurence of a match.
876 public boolean searchRegion(String s, int start, int end)
880 throw new NullPointerException(
882 .getString("exception.null_string_like_given_to_regex_search_region"));
884 return _search(s, start, end);
888 * Set this to change the default behavior of the "." pattern. By default it
889 * now matches perl's behavior and fails to match the '\n' character.
891 public static boolean dotDoesntMatchCR = true;
897 boolean gFlag = false;
899 /** Set the 'g' flag */
900 public void setGFlag(boolean b)
905 /** Get the state of the 'g' flag. */
906 public boolean getGFlag()
911 boolean sFlag = false;
913 /** Get the state of the sFlag */
914 public boolean getSFlag()
919 boolean mFlag = false;
921 /** Get the state of the mFlag */
922 public boolean getMFlag()
927 final boolean _search(String s, int start, int end)
929 return _search(new StringWrap(s), start, end);
932 final boolean _search(StringLike s, int start, int end)
934 if (gFlag && gFlagto > 0 && gFlags != null
935 && s.unwrap() == gFlags.unwrap())
941 Pthings pt = prep(s);
943 int up = (minMatch == null ? end : end - minMatch.i);
945 if (up < start && end >= start)
952 for (int i = start; i <= up; i++)
954 charsMatched_ = thePattern.matchAt(s, i, pt);
955 if (charsMatched_ >= 0)
957 matchFrom_ = thePattern.mfrom;
959 gFlagto = matchFrom_ + charsMatched_;
961 return didMatch_ = true;
968 for (int i = start; i <= up; i++)
970 i = skipper.find(src, i, up);
973 charsMatched_ = matchFrom_ = -1;
974 return didMatch_ = false;
976 charsMatched_ = thePattern.matchAt(s, i, pt);
977 if (charsMatched_ >= 0)
979 matchFrom_ = thePattern.mfrom;
981 gFlagto = matchFrom_ + charsMatched_;
983 return didMatch_ = true;
987 return didMatch_ = false;
991 * final boolean _search(LongStringLike s,long start,long end) { if(gFlag &&
992 * gFlagto > 0 && s==gFlags) start = gFlagto; gFlags = null;
994 * Pthings pt=prep(s);
996 * int up = end;//(minMatch == null ? end : end-minMatch.i);
998 * if(up < start && end >= start) up = start;
1000 * if(skipper == null) { for(long i=start;i<=up;i++) { charsMatched_ =
1001 * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ =
1002 * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_;
1003 * return didMatch_=true; } } } else { pt.no_check = true; for(long
1004 * i=start;i<=up;i++) { i = skipper.find(src,i,up); if(i<0) { charsMatched_ =
1005 * matchFrom_ = -1; return didMatch_ = false; } charsMatched_ =
1006 * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ =
1007 * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_;
1008 * gFlags = s; return didMatch_=true; } else { i = s.adjustIndex(i); up =
1009 * s.adjustEnd(i); } } } return didMatch_=false; }
1012 boolean _reverseSearch(String s, int start, int end)
1014 return _reverseSearch(new StringWrap(s), start, end);
1017 boolean _reverseSearch(StringLike s, int start, int end)
1019 if (gFlag && gFlagto > 0 && s.unwrap() == gFlags.unwrap())
1024 Pthings pt = prep(s);
1025 for (int i = end; i >= start; i--)
1027 charsMatched_ = thePattern.matchAt(s, i, pt);
1028 if (charsMatched_ >= 0)
1030 matchFrom_ = thePattern.mfrom;
1032 gFlagto = matchFrom_ - 1;
1034 return didMatch_ = true;
1037 return didMatch_ = false;
1040 // This routine sets the cbits variable
1041 // of class Pattern. Cbits is true for
1042 // the bit corresponding to a character inside
1044 static StringLike lasts = null;
1046 static BitSet lastbs = null;
1048 static void setCbits(StringLike s, Pthings pt)
1055 BitSet bs = new BitSet(s.length());
1057 boolean setBit = false;
1058 for (int i = 0; i < s.length(); i++)
1064 char c = s.charAt(i);
1065 if (!setBit && c == '"')
1071 else if (!setBit && c == '\'')
1077 else if (setBit && c == qc)
1081 else if (setBit && c == '\\' && i + 1 < s.length())
1090 pt.cbits = lastbs = bs;
1094 // Wanted user to over-ride this in alpha version,
1095 // but it wasn't really necessary because of this trick:
1100 return (Regex) getClass().newInstance();
1101 } catch (InstantiationException ie)
1104 } catch (IllegalAccessException iae)
1111 * Only needed for creating your own extensions of Regex. This method adds the
1112 * next Pattern in the chain of patterns or sets the Pattern if it is the
1115 protected void add(Pattern p2)
1129 * You only need to use this method if you are creating your own extensions to
1130 * Regex. compile1 compiles one Pattern element, it can be over-ridden to
1131 * allow the Regex compiler to understand new syntax. See deriv.java for an
1132 * example. This routine is the heart of class Regex. Rthings has one integer
1133 * member called intValue, it is used to keep track of the number of ()'s in
1136 * @exception com.stevesoft.pat.RegSyntax
1137 * is thrown when a nonsensical pattern is supplied. For
1138 * example, a pattern beginning with *.
1140 protected void compile1(StrPos sp, Rthings mk) throws RegSyntax
1145 add(matchBracket(sp));
1147 else if (sp.match('|'))
1155 p = new NullPattern();
1160 else if (sp.incMatch("(?<"))
1162 patInt i = sp.getPatInt();
1165 RegSyntaxError.endItAll("No int after (?<");
1167 add(new Backup(i.intValue()));
1170 RegSyntaxError.endItAll("No ) after (?<");
1173 else if (sp.incMatch("(?>"))
1175 patInt i = sp.getPatInt();
1178 RegSyntaxError.endItAll("No int after (?>");
1180 add(new Backup(-i.intValue()));
1183 RegSyntaxError.endItAll("No ) after (?<");
1186 else if (sp.incMatch("(?@"))
1194 RegSyntaxError.endItAll("(?@ does not have closing paren");
1196 add(new Group(op, cl));
1198 else if (sp.incMatch("(?#"))
1200 while (!sp.match(')'))
1205 else if (sp.dontMatch && sp.c == 'w')
1207 // Regex r = new Regex();
1208 // r._compile("[a-zA-Z0-9_]",mk);
1209 // add(new Goop("\\w",r.thePattern));
1210 Bracket b = new Bracket(false);
1211 b.addOr(new Range('a', 'z'));
1212 b.addOr(new Range('A', 'Z'));
1213 b.addOr(new Range('0', '9'));
1214 b.addOr(new oneChar('_'));
1217 else if (sp.dontMatch && sp.c == 'G')
1221 else if (sp.dontMatch && sp.c == 's')
1223 // Regex r = new Regex();
1224 // r._compile("[ \t\n\r\b]",mk);
1225 // add(new Goop("\\s",r.thePattern));
1226 Bracket b = new Bracket(false);
1227 b.addOr(new oneChar((char) 32));
1228 b.addOr(new Range((char) 8, (char) 10));
1229 b.addOr(new oneChar((char) 13));
1232 else if (sp.dontMatch && sp.c == 'd')
1234 // Regex r = new Regex();
1235 // r._compile("[0-9]",mk);
1236 // add(new Goop("\\d",r.thePattern));
1237 Range digit = new Range('0', '9');
1238 digit.printBrackets = true;
1241 else if (sp.dontMatch && sp.c == 'W')
1243 // Regex r = new Regex();
1244 // r._compile("[^a-zA-Z0-9_]",mk);
1245 // add(new Goop("\\W",r.thePattern));
1246 Bracket b = new Bracket(true);
1247 b.addOr(new Range('a', 'z'));
1248 b.addOr(new Range('A', 'Z'));
1249 b.addOr(new Range('0', '9'));
1250 b.addOr(new oneChar('_'));
1253 else if (sp.dontMatch && sp.c == 'S')
1255 // Regex r = new Regex();
1256 // r._compile("[^ \t\n\r\b]",mk);
1257 // add(new Goop("\\S",r.thePattern));
1258 Bracket b = new Bracket(true);
1259 b.addOr(new oneChar((char) 32));
1260 b.addOr(new Range((char) 8, (char) 10));
1261 b.addOr(new oneChar((char) 13));
1264 else if (sp.dontMatch && sp.c == 'D')
1266 // Regex r = new Regex();
1267 // r._compile("[^0-9]",mk);
1268 // add(new Goop("\\D",r.thePattern));
1269 Bracket b = new Bracket(true);
1270 b.addOr(new Range('0', '9'));
1273 else if (sp.dontMatch && sp.c == 'B')
1275 Regex r = new Regex();
1276 r._compile("(?!" + back_slash + "b)", mk);
1279 else if (isOctalString(sp))
1283 d = 8 * d + sp.c - '0';
1284 StrPos sp2 = new StrPos(sp);
1286 if (isOctalDigit(sp2, false))
1289 d = 8 * d + sp.c - '0';
1291 add(new oneChar((char) d));
1293 else if (sp.dontMatch && sp.c >= '1' && sp.c <= '9')
1295 int iv = sp.c - '0';
1296 StrPos s2 = new StrPos(sp);
1298 if (!s2.dontMatch && s2.c >= '0' && s2.c <= '9')
1300 iv = 10 * iv + (s2.c - '0');
1303 add(new BackMatch(iv));
1305 else if (sp.dontMatch && sp.c == 'b')
1307 add(new Boundary());
1309 else if (sp.match('\b'))
1311 add(new Boundary());
1313 else if (sp.match('$'))
1317 else if (sp.dontMatch && sp.c == 'Z')
1319 add(new End(false));
1321 else if (sp.match('.'))
1325 else if (sp.incMatch("(??"))
1327 StringBuffer sb = new StringBuffer();
1328 StringBuffer sb2 = new StringBuffer();
1329 while (!sp.match(')') && !sp.match(':'))
1334 if (sp.incMatch(":"))
1336 while (!sp.match(')'))
1342 String sbs = sb.toString();
1343 if (validators.get(sbs) instanceof String)
1345 String pat = (String) validators.get(sbs);
1346 Regex r = newRegex();
1347 Rthings rth = new Rthings(this);
1348 rth.noBackRefs = true;
1349 r._compile(pat, rth);
1354 Custom cm = new Custom(sb.toString());
1357 Validator v2 = cm.v.arg(sb2.toString());
1360 v2.argsave = sb2.toString();
1361 String p = cm.v.pattern;
1365 Regex r = newRegex();
1366 Rthings rth = new Rthings(this);
1367 rth.noBackRefs = true;
1368 r._compile(cm.v.pattern, rth);
1369 cm.sub = r.thePattern;
1370 cm.sub.add(new CustomEndpoint(cm));
1371 cm.sub.setParent(cm);
1376 else if (sp.match('('))
1379 Regex r = newRegex();
1382 if (sp.incMatch("?:"))
1386 else if (sp.incMatch("?="))
1388 r.or = new lookAhead(false);
1390 else if (sp.incMatch("?!"))
1392 r.or = new lookAhead(true);
1394 else if (sp.match('?'))
1401 mk.ignoreCase = true;
1405 mk.dontMatchInQuotes = true;
1409 mk.optimizeMe = true;
1424 } while (!sp.match(')') && !sp.eos);
1427 if (sp.eos) // throw new RegSyntax
1429 RegSyntaxError.endItAll("Unclosed ()");
1433 { // just ordinary parenthesis
1434 r.or = mk.noBackRefs ? new Or() : new OrMark(mk.val++);
1438 add(r._compile(sp, mk));
1441 else if (sp.match('^'))
1443 add(new Start(true));
1445 else if (sp.dontMatch && sp.c == 'A')
1447 add(new Start(false));
1449 else if (sp.match('*'))
1451 addMulti(new patInt(0), new patInf());
1453 else if (sp.match('+'))
1455 addMulti(new patInt(1), new patInf());
1457 else if (sp.match('?'))
1459 addMulti(new patInt(0), new patInt(1));
1461 else if (sp.match('{'))
1463 boolean bad = false;
1464 StrPos sp2 = new StrPos(sp);
1465 // StringBuffer sb = new StringBuffer();
1467 patInt i1 = sp.getPatInt();
1478 * RegSyntaxError.endItAll( "String \"{"+i2+ "\" should be followed
1490 i2 = sp.getPatInt();
1493 if (i1 == null || i2 == null)
1496 * throw new RegSyntax("Badly formatted Multi: " +"{"+i1+","+i2+"}");
1503 add(new oneChar(sp.c));
1510 else if (sp.escMatch('x') && next2Hex(sp))
1513 int d = getHexDigit(sp);
1515 d = 16 * d + getHexDigit(sp);
1516 add(new oneChar((char) d));
1518 else if (sp.escMatch('c'))
1521 if (sp.c < Ctrl.cmap.length)
1523 add(new oneChar(Ctrl.cmap[sp.c]));
1527 add(new oneChar(sp.c));
1530 else if (sp.escMatch('f'))
1532 add(new oneChar((char) 12));
1534 else if (sp.escMatch('a'))
1536 add(new oneChar((char) 7));
1538 else if (sp.escMatch('t'))
1540 add(new oneChar('\t'));
1542 else if (sp.escMatch('n'))
1544 add(new oneChar('\n'));
1546 else if (sp.escMatch('r'))
1548 add(new oneChar('\r'));
1550 else if (sp.escMatch('b'))
1552 add(new oneChar('\b'));
1554 else if (sp.escMatch('e'))
1556 add(new oneChar((char) 27));
1560 add(new oneChar(sp.c));
1563 RegSyntaxError.endItAll("Unmatched right paren in pattern");
1568 // compiles all Pattern elements, internal method
1569 private Pattern _compile(String pat, Rthings mk) throws RegSyntax
1572 sFlag = mFlag = ignoreCase = gFlag = false;
1573 StrPos sp = new StrPos(pat, 0);
1574 thePattern = _compile(sp, mk);
1583 Pattern _compile(StrPos sp, Rthings mk) throws RegSyntax
1585 while (!(sp.eos || (or != null && sp.match(')'))))
1594 else if (sp.eos && mk.parenLevel != 0)
1596 RegSyntaxError.endItAll("Unclosed Parenthesis! lvl=" + mk.parenLevel);
1602 p = new NullPattern();
1607 return p == null ? new NullPattern() : p;
1610 // add a multi object to the end of the chain
1611 // which applies to the last object
// i1 and i2 are the bounds handed to mkMulti. Throws RegSyntax("Syntax error.")
// when the last element is already a Multi/DotMulti/FastMulti and the bounds
// are anything other than 0 and 1.
// NOTE(review): brace lines and some branch bodies are missing from this
// listing; confirm the untouched parts against the full file.
1612 void addMulti(patInt i1, patInt i2) throws RegSyntax
1614 Pattern last, last2;
// Walk the chain that starts at p until 'last' is its final node.
1615 for (last = p; last != null && last.next != null; last = last.next)
// Empty chain or single-node chain: there is no predecessor to look for
// (the branch body is not shown here).
1619 if (last == null || last == p)
// Otherwise locate last2, the node whose next is 'last'.
1625 for (last2 = p; last2.next != last; last2 = last2.next)
// Bounds 0..1 applied to an existing multiplier do not wrap it again; they
// only set matchFewest on it (presumably the "?" suffix that makes a
// quantifier match as little as possible -- confirm).
1630 if (last instanceof Multi && i1.intValue() == 0 && i2.intValue() == 1)
1632 ((Multi) last).matchFewest = true;
1634 else if (last instanceof FastMulti && i1.intValue() == 0
1635 && i2.intValue() == 1)
1637 ((FastMulti) last).matchFewest = true;
1639 else if (last instanceof DotMulti && i1.intValue() == 0
1640 && i2.intValue() == 1)
1642 ((DotMulti) last).matchFewest = true;
// Any other bounds stacked on an existing multiplier are rejected.
1644 else if (last instanceof Multi || last instanceof DotMulti
1645 || last instanceof FastMulti)
1647 throw new RegSyntax("Syntax error.");
// No predecessor: the multiplier built from p replaces p itself.
1649 else if (last2 == null)
1651 p = mkMulti(i1, i2, p);
// Otherwise only the final node is wrapped and relinked after last2.
1655 last2.next = mkMulti(i1, i2, last);
// Chooses the multiplier implementation for p with bounds lo..hi:
//  - a DotMulti when p is an Any with no following element,
//  - a FastMulti when RegOpt.safe4fm(p) returns true,
//  - a plain Multi otherwise.
// NOTE(review): original line 1660 is not shown in this listing (possibly a
// throws clause) -- confirm the full signature.
1659 final static Pattern mkMulti(patInt lo, patInt hi, Pattern p)
1662 if (p instanceof Any && p.next == null)
1664 return (Pattern) new DotMulti(lo, hi);
1666 return RegOpt.safe4fm(p) ? (Pattern) new FastMulti(lo, hi, p)
1667 : (Pattern) new Multi(lo, hi, p);
1670 // process the bracket operator
// Builds a Bracket and adds one alternative (oneChar or Range) per element read
// from sp until the closing ']' is matched or the input ends.
// NOTE(review): many lines of this method are missing from this listing (the
// tests that pick a Bracket constructor, position increments, declarations of
// ret and d, the return), so the comments describe only what is visible.
1671 Pattern matchBracket(StrPos sp) throws RegSyntax
// The boolean argument presumably distinguishes a negated class from a plain
// one -- confirm against the Bracket constructor.
1676 ret = new Bracket(true);
1681 ret = new Bracket(false);
1685 // throw new RegSyntax
1686 RegSyntaxError.endItAll("Unmatched []");
1689 while (!sp.eos && !sp.match(']'))
// s1, s1_ and s2 are separate StrPos objects built from sp, so the range test
// below reads from them instead of from sp.
1691 StrPos s1 = new StrPos(sp);
1693 StrPos s1_ = new StrPos(s1);
// '-' on s1 with no ']' on s1_ is treated as a range from sp.c to s2.c.
1695 if (s1.match('-') && !s1_.match(']'))
1697 StrPos s2 = new StrPos(s1);
1701 ret.addOr(new Range(sp.c, s2.c));
// \Q ... \E : every character up to the \E escape is added as a literal.
1706 else if (sp.escMatch('Q'))
1709 while (!sp.escMatch('E'))
1711 ret.addOr(new oneChar(sp.c));
// \d : decimal digits.
1715 else if (sp.escMatch('d'))
1717 ret.addOr(new Range('0', '9'));
// \s : space (32), codes 8..10 (backspace, tab, newline) and carriage return (13).
1719 else if (sp.escMatch('s'))
1721 ret.addOr(new oneChar((char) 32));
1722 ret.addOr(new Range((char) 8, (char) 10));
1723 ret.addOr(new oneChar((char) 13));
// \w : ASCII letters, digits and underscore.
1725 else if (sp.escMatch('w'))
1727 ret.addOr(new Range('a', 'z'));
1728 ret.addOr(new Range('A', 'Z'));
1729 ret.addOr(new Range('0', '9'));
1730 ret.addOr(new oneChar('_'));
// \D : everything except '0' (48) .. '9' (57).
1732 else if (sp.escMatch('D'))
1734 ret.addOr(new Range((char) 0, (char) 47));
1735 ret.addOr(new Range((char) 58, (char) 65535));
// \S : everything except the five codes accepted by \s (8, 9, 10, 13, 32).
1737 else if (sp.escMatch('S'))
1739 ret.addOr(new Range((char) 0, (char) 7));
1740 ret.addOr(new Range((char) 11, (char) 12));
1741 ret.addOr(new Range((char) 14, (char) 31));
1742 ret.addOr(new Range((char) 33, (char) 65535));
// \W : codes outside 'A'..'Z' (65..90), '_' (95) and 'a'..'z' (97..122).
// NOTE(review): the first range 0..64 also covers the digits '0'..'9'
// (48..57), which \w above accepts, so this is not the exact complement of
// \w. An exact complement would split it into 0..47 and 58..64 -- confirm
// whether matching digits here is intended.
1744 else if (sp.escMatch('W'))
1746 ret.addOr(new Range((char) 0, (char) 64));
1747 ret.addOr(new Range((char) 91, (char) 94));
1748 ret.addOr(new oneChar((char) 96));
1749 ret.addOr(new Range((char) 123, (char) 65535));
// \xHH : taken only when next2Hex(sp) is true; the two digits are folded into
// d in base 16 and added as a single character.
1751 else if (sp.escMatch('x') && next2Hex(sp))
1754 int d = getHexDigit(sp);
1756 d = 16 * d + getHexDigit(sp);
1757 ret.addOr(new oneChar((char) d));
// \a = code 7 (bell), \f = 12 (form feed), \e = 27 (escape).
1759 else if (sp.escMatch('a'))
1761 ret.addOr(new oneChar((char) 7));
1763 else if (sp.escMatch('f'))
1765 ret.addOr(new oneChar((char) 12));
1767 else if (sp.escMatch('e'))
1769 ret.addOr(new oneChar((char) 27));
1771 else if (sp.escMatch('n'))
1773 ret.addOr(new oneChar('\n'));
1775 else if (sp.escMatch('t'))
1777 ret.addOr(new oneChar('\t'));
1779 else if (sp.escMatch('r'))
1781 ret.addOr(new oneChar('\r'));
// \cX : the character is looked up in Ctrl.cmap when sp.c is a valid index
// into that table; otherwise sp.c itself is added.
1783 else if (sp.escMatch('c'))
1786 if (sp.c < Ctrl.cmap.length)
1788 ret.addOr(new oneChar(Ctrl.cmap[sp.c]));
1792 ret.addOr(new oneChar(sp.c));
// Octal escape: digits are folded into d in base 8; sp2 is used to test
// isOctalDigit before a further digit is folded in.
1795 else if (isOctalString(sp))
1799 d = 8 * d + sp.c - '0';
1800 StrPos sp2 = new StrPos(sp);
1802 if (isOctalDigit(sp2, false))
1805 d = 8 * d + sp.c - '0';
1807 ret.addOr(new oneChar((char) d));
// Anything else is added as a literal character.
1811 ret.addOr(new oneChar(sp.c));
1819 * Converts the stored Pattern to a String -- this is a decompile. Note that
1820 * \t and \n will really print out here, Not just the two character
1821 * representations. Also be prepared to see some strange output if your
1822 * characters are not printable.
// Rebuilds a textual form of the compiled pattern in a StringBuffer, taking
// the escape character and the flag fields into account.
// NOTE(review): the statements inside most branches are missing from this
// listing; only the conditions are visible.
1824 public String toString()
// The leading 'false &&' makes this condition always false, so the null check
// never fires. Unless guarded in lines not shown, thePattern.toString() below
// would throw a NullPointerException when thePattern is null.
1826 if (false && thePattern == null)
1832 StringBuffer sb = new StringBuffer();
// Taken only when a non-default escape character is in use.
1833 if (esc != Pattern.ESC)
// Taken when any flag is set, '.' may match CR, or the pattern is optimized.
1839 if (gFlag || mFlag || !dotDoesntMatchCR || sFlag || ignoreCase
1840 || dontMatchInQuotes || optimized())
1851 if (sFlag || !dotDoesntMatchCR)
1855 if (dontMatchInQuotes)
1869 String patstr = thePattern.toString();
// The pattern text is produced with Pattern.ESC; convert it to the escape
// character this Regex was configured with.
1870 if (esc != Pattern.ESC)
1872 patstr = reEscape(patstr, Pattern.ESC, esc);
1875 return sb.toString();
1879 // Re-escape Pattern, allows us to use a different escape
// Scans s one character at a time and builds the converted text in a
// StringBuffer, treating oldEsc as the escape character of the input and
// newEsc as the escape character of the output.
// NOTE(review): several append statements and the early-return body are not
// present in this listing; the comments cover the visible conditions only.
1881 static String reEscape(String s, char oldEsc, char newEsc)
// Nothing to convert when both escape characters are the same.
1883 if (oldEsc == newEsc)
1888 StringBuffer sb = new StringBuffer();
1889 for (i = 0; i < s.length(); i++)
// An old escape that is not the final character: inspect the character it
// escapes.
1891 if (s.charAt(i) == oldEsc && i + 1 < s.length())
// The escaped character is itself the old escape.
1893 if (s.charAt(i + 1) == oldEsc)
// Otherwise the escaped character is carried over.
1900 sb.append(s.charAt(i + 1));
// A bare occurrence of the new escape character in the input.
1904 else if (s.charAt(i) == newEsc)
// Any other character is copied unchanged.
1911 sb.append(s.charAt(i));
1914 return sb.toString();
1918 * This method implements FilenameFilter, allowing one to use a Regex to
1919 * search through a directory using File.list. There is a FileRegex now that
1922 * @see com.stevesoft.pat.FileRegex
// FilenameFilter callback: dir is the directory being listed and s the
// candidate file name.
// NOTE(review): the method body is not present in this listing -- confirm how
// s is tested against the pattern in the full file.
1924 public boolean accept(File dir, String s)
1929 /** The version of this package */
// Returns a fixed, hard-coded release string.
1930 final static public String version()
1932 return "lgpl release 1.5.3";
1936 * Once this method is called, the state of variables ignoreCase and
1937 * dontMatchInQuotes should not be changed as the results will be
1938 * unpredictable. However, search and matchAt will run more quickly. Note that
1939 * you can check to see if the pattern has been optimized by calling the
1940 * optimized() method.
1942 * This method will attempt to rewrite your pattern in a way that makes it
1943 * faster (not all patterns execute at the same speed). In general,
1944 * "(?: ... )" will be faster than "( ... )" so if you don't need the
1945 * backreference, you should group using the former pattern.
1947 * It will also introduce new pattern elements that you can't get to
1948 * otherwise, for example if you have a large table of strings, i.e. the
1949 * months of the year "(January|February|...)" optimize() will make a
1950 * Hashtable that takes it to the next appropriate pattern element --
1951 * eliminating the need for a linear search.
1953 * @see com.stevesoft.pat.Regex#optimized
1954 * @see com.stevesoft.pat.Regex#ignoreCase
1955 * @see com.stevesoft.pat.Regex#dontMatchInQuotes
1956 * @see com.stevesoft.pat.Regex#matchAt
1957 * @see com.stevesoft.pat.Regex#search
// Rewrites thePattern through RegOpt.opt and installs a skipper obtained from
// Skip.findSkip. Assigning a non-null minMatch is what makes optimized()
// report true afterwards.
1959 public void optimize()
// Guard: already optimized, or there is no compiled pattern (the branch body,
// presumably an early return, is not shown in this listing).
1961 if (optimized() || thePattern == null)
// minMatch is fixed at 0; the call that would compute the real minimum is
// commented out.
1965 minMatch = new patInt(0); // thePattern.countMinChars();
1966 thePattern = RegOpt.opt(thePattern, ignoreCase, dontMatchInQuotes);
1967 skipper = Skip.findSkip(this);
1968 // RegOpt.setParents(this);
1975 * This function returns true if the optimize method has been called.
// The optimized state is not stored separately: a non-null minMatch is the
// marker, and optimize() is the method that assigns it.
1977 public boolean optimized()
1979 return minMatch != null;
1983 * A bit of syntactic sugar for those who want to make their code look more
1984 * perl-like. To use this initialize your Regex object by saying:
1987 * Regex r1 = Regex.perlCode("s/hello/goodbye/");
1988 * Regex r2 = Regex.perlCode("s'fish'frog'i");
1989 * Regex r3 = Regex.perlCode("m'hello'");
1992 * The i for ignoreCase is supported in this syntax, as well as m, s, and x.
1993 * The g flag is a bit of a special case.
1995 * If you wish to replace all occurrences of a pattern, you do not put a 'g' in
1996 * the perlCode, but call Regex's replaceAll method.
1998 * If you wish to simply and only do a search for r2's pattern, you can do
1999 * this by calling the searchFrom method repeatedly, or by calling
2000 * search repeatedly if the g flag is set.
2002 * Note: Currently perlCode does <em>not</em> support the (?e=#) syntax for
2003 * changing the escape character.
// Static factory: hands the Perl-style expression s to parsePerl.parse and
// returns the Regex it produces.
2006 public static Regex perlCode(String s)
2008 // this file is big enough, see parsePerl.java
2009 // for this function.
2010 return parsePerl.parse(s);
// The backslash character ('\'), written as an escaped char literal.
2013 static final char back_slash = '\\';
2016 * Checks to see if there are only literal and no special pattern elements in
// Inspects the compiled pattern starting at thePattern, testing elements for
// the oneChar and Skipped types.
// NOTE(review): the loop, the branch bodies and the return statements are not
// present in this listing -- confirm which element types count as literal.
2019 public boolean isLiteral()
2021 Pattern x = thePattern;
2024 if (x instanceof oneChar)
2028 else if (x instanceof Skipped)
2042 * You only need to know about this if you are inventing your own pattern
// Delegates to thePattern.countMinChars().
// NOTE(review): there is no null check here, while optimize() does test
// thePattern == null -- confirm callers only use this on a compiled pattern.
2045 public patInt countMinChars()
2047 return thePattern.countMinChars();
2051 * You only need to know about this if you are inventing your own pattern
// Delegates to thePattern.countMaxChars().
// NOTE(review): there is no null check here, while optimize() does test
// thePattern == null -- confirm callers only use this on a compiled pattern.
2054 public patInt countMaxChars()
2056 return thePattern.countMaxChars();
// Tests whether sp.c is a hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
// NOTE(review): the start of the boolean expression (the terms before the
// leading '&&') and the return are not present in this listing.
2059 boolean isHexDigit(StrPos sp)
2063 && ((sp.c >= '0' && sp.c <= '9')
2064 || (sp.c >= 'a' && sp.c <= 'f') || (sp.c >= 'A' && sp.c <= 'F'));
// Tests whether sp is positioned on an octal digit. Requires that input
// remains and that sp.dontMatch has the same value as 'first' (the XOR is
// negated), then checks the lower bound sp.c >= '0'.
// NOTE(review): the rest of the expression (presumably the upper bound) and
// the return are not present in this listing.
2068 boolean isOctalDigit(StrPos sp, boolean first)
2070 boolean r = !sp.eos && !(first ^ sp.dontMatch) && sp.c >= '0'
// Converts the hexadecimal digit at sp.c to its numeric value.
// No validation happens here: any character that is not '0'-'9' or 'a'-'f'
// falls through to the upper-case formula, so callers are expected to have
// checked the character first (see isHexDigit / next2Hex).
2075 int getHexDigit(StrPos sp)
// Decimal digit branch (its return statement is not shown in this listing).
2077 if (sp.c >= '0' && sp.c <= '9')
// Lower-case letters: 'a' maps to 10 ... 'f' maps to 15.
2081 if (sp.c >= 'a' && sp.c <= 'f')
2083 return sp.c - 'a' + 10;
// Everything else is treated as an upper-case letter: 'A' maps to 10.
2085 return sp.c - 'A' + 10;
// Looks ahead on sp2, a StrPos built from sp, and applies isHexDigit to it
// twice; sp itself is not passed to any call in the visible lines.
// NOTE(review): the statements between the two checks (presumably advancing
// sp2) and the return values are not present in this listing.
2088 boolean next2Hex(StrPos sp)
2090 StrPos sp2 = new StrPos(sp);
2092 if (!isHexDigit(sp2))
2097 if (!isHexDigit(sp2))
2104 boolean isOctalString(StrPos sp)
2106 if (!isOctalDigit(sp, true))
2110 StrPos sp2 = new StrPos(sp);
2112 if (!isOctalDigit(sp2, false))