2 // This software is now distributed according to
3 // the Lesser Gnu Public License. Please see
4 // http://www.gnu.org/copyleft/lesser.txt for
8 package com.stevesoft.pat;
10 import com.stevesoft.pat.wrap.StringWrap;
13 * Like Skip, but implements a <a
14 * href="http://www.dcc.uchile.cl/~rbaeza/handbook/algs/7/713b.srch.p.html">
15 * Boyer-Moore-Horspool</a> type search method that has been modified to be more
16 * like a "T-search" (see the Michael Tamm''s article in <i>C'T, magazin fuer
17 * computer und technic</i>, August 97 p 292). Yet another important source of
18 * information for me was the <a
19 * href="http://www.go2net.com/people/paulp/deep/1997/05/14/"> Deep Magic</a>
20 * article on string searching. As of this writing, I can beat String's indexOf
21 * method in many cases.
23 * @see com.stevesoft.pat.Skip
24 * @see com.stevesoft.pat.Skip2
26 public class SkipBMH extends Skip
28 // This number could be 256, but I think it's
29 // big enough. Note, it must be a power of 2.
30 final int MAX_CHAR = 64;
32 final char[] skip = new char[MAX_CHAR];
40 final boolean exact(char c)
42 return (ign && anyc(c)) || c == x;
45 final boolean anyc(char c)
47 return c == uc || c == lc || c == tc;
50 public SkipBMH(String pt, boolean ign)
55 public SkipBMH(String pt)
60 public SkipBMH(String pt, boolean ign, int offset)
62 super(pt, ign, offset);
63 for (int k = 0; k < MAX_CHAR; k++)
65 skip[k] = (char) src.length();
68 sm1 = src.length() - 1;
70 uc = CaseMgr.toUpperCase(x);
71 lc = CaseMgr.toLowerCase(x);
72 tc = CaseMgr.toTitleCase(x);
74 // We don't really want 65536 long arrays in skip[],
75 // so we mask of the higher bits. This can be combined
76 // with ignore case, so accounting for upper
77 // case costs us nothing extra.
78 for (int k = 0; k < src.length() - 1; k++)
80 char x_ = src.charAt(k);
83 char uc_ = CaseMgr.toUpperCase(x_);
84 char lc_ = CaseMgr.toLowerCase(x_);
85 char tc_ = CaseMgr.toTitleCase(x_);
86 skip[uc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
87 skip[lc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
88 skip[tc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
92 skip[x_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
96 // This trick can be found in the July issue of
97 // C-T magazine. This makes the method a type of
99 jump_ahead = src.length() - 1;
100 for (int k = 0; k < src.length() - 1; k++)
102 char y = src.charAt(sm1 - k - 1);
112 * Set to true if you only want to compare two of the characters in the
115 final public int searchRegion(String s, int start, int end)
117 return find(s, start, end);
120 final public int searchFrom(String s, int start)
122 return find(s, start, s.length());
125 final public int search(String s)
127 return find(s, 0, s.length());
130 public int find(String s, int start, int end)
132 start += offset + sm1;
133 int vend = min(s.length() - 1, end + sm1 + offset), k;
134 int vend1 = vend - jump_ahead;
137 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
139 // table look-up is expensive, avoid it if possible
140 if (anyc(s.charAt(k)))
142 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
144 return k - sm1 - offset;
149 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
151 // table look-up is expensive, avoid it if possible
152 if (anyc(s.charAt(k)))
154 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
156 return k - sm1 - offset;
168 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
170 // table look-up is expensive, avoid it if possible
171 if (x == s.charAt(k))
173 // if(src.regionMatches(0,s,k-sm1,sm1))
174 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
176 return k - sm1 - offset;
181 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
183 // table look-up is expensive, avoid it if possible
184 if (x == s.charAt(k))
186 // if(src.regionMatches(0,s,k-sm1,sm1))
187 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
189 return k - sm1 - offset;
203 public int find(StringLike s, int start, int end)
205 if (s instanceof StringWrap)
207 return find(s.toString(), start, end);
209 start += offset + sm1;
210 int vend = min(s.length() - 1, end + sm1 + offset), k;
211 int vend1 = vend - jump_ahead;
214 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
216 // table look-up is expensive, avoid it if possible
217 if (anyc(s.charAt(k)))
219 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
221 return k - sm1 - offset;
226 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
228 // table look-up is expensive, avoid it if possible
229 if (anyc(s.charAt(k)))
231 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
233 return k - sm1 - offset;
245 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
247 // table look-up is expensive, avoid it if possible
248 if (x == s.charAt(k))
250 // if(src.regionMatches(0,s,k-sm1,sm1))
251 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
253 return k - sm1 - offset;
258 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
260 // table look-up is expensive, avoid it if possible
261 if (x == s.charAt(k))
263 // if(src.regionMatches(0,s,k-sm1,sm1))
264 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
266 return k - sm1 - offset;