2 // This software is now distributed according to
\r
3 // the Lesser Gnu Public License. Please see
\r
4 // http://www.gnu.org/copyleft/lesser.txt for
\r
6 // -- Happy Computing!
\r
8 package com.stevesoft.pat;
\r
10 import com.stevesoft.pat.wrap.*;
\r
13 * Like Skip, but implements a <a
\r
14 * href="http://www.dcc.uchile.cl/~rbaeza/handbook/algs/7/713b.srch.p.html">
\r
15 * Boyer-Moore-Horspool</a> type search method that has been modified to be
\r
16 * more like a "T-search" (see the Michael Tamm''s article in <i>C'T, magazin
\r
17 * fuer computer und technic</i>, August 97 p 292). Yet another important
\r
18 * source of information for me was the <a
\r
19 * href="http://www.go2net.com/people/paulp/deep/1997/05/14/"> Deep Magic</a>
\r
20 * article on string searching. As of this writing, I can beat String's indexOf
\r
21 * method in many cases.
\r
23 * @see com.stevesoft.pat.Skip
\r
24 * @see com.stevesoft.pat.Skip2
\r
26 public class SkipBMH extends Skip
\r
28 // This number could be 256, but I think it's
\r
29 // big enough. Note, it must be a power of 2.
\r
30 final int MAX_CHAR = 64;
\r
32 final char[] skip = new char[MAX_CHAR];
\r
40 final boolean exact(char c)
\r
42 return (ign && anyc(c)) || c == x;
\r
45 final boolean anyc(char c)
\r
47 return c == uc || c == lc || c == tc;
\r
50 public SkipBMH(String pt, boolean ign)
\r
55 public SkipBMH(String pt)
\r
60 public SkipBMH(String pt, boolean ign, int offset)
\r
62 super(pt, ign, offset);
\r
63 for (int k = 0; k < MAX_CHAR; k++)
\r
65 skip[k] = (char) src.length();
\r
68 sm1 = src.length() - 1;
\r
69 x = src.charAt(sm1);
\r
70 uc = CaseMgr.toUpperCase(x);
\r
71 lc = CaseMgr.toLowerCase(x);
\r
72 tc = CaseMgr.toTitleCase(x);
\r
74 // We don't really want 65536 long arrays in skip[],
\r
75 // so we mask of the higher bits. This can be combined
\r
76 // with ignore case, so accounting for upper
\r
77 // case costs us nothing extra.
\r
78 for (int k = 0; k < src.length() - 1; k++)
\r
80 char x_ = src.charAt(k);
\r
83 char uc_ = CaseMgr.toUpperCase(x_);
\r
84 char lc_ = CaseMgr.toLowerCase(x_);
\r
85 char tc_ = CaseMgr.toTitleCase(x_);
\r
86 skip[uc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
\r
87 skip[lc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
\r
88 skip[tc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
\r
92 skip[x_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
\r
96 // This trick can be found in the July issue of
\r
97 // C-T magazine. This makes the method a type of
\r
99 jump_ahead = src.length() - 1;
\r
100 for (int k = 0; k < src.length() - 1; k++)
\r
102 char y = src.charAt(sm1 - k - 1);
\r
112 * Set to true if you only want to compare two of the characters in the
\r
115 final public int searchRegion(String s, int start, int end)
\r
117 return find(s, start, end);
\r
120 final public int searchFrom(String s, int start)
\r
122 return find(s, start, s.length());
\r
125 final public int search(String s)
\r
127 return find(s, 0, s.length());
\r
130 public int find(String s, int start, int end)
\r
132 start += offset + sm1;
\r
133 int vend = min(s.length() - 1, end + sm1 + offset), k;
\r
134 int vend1 = vend - jump_ahead;
\r
137 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
139 // table look-up is expensive, avoid it if possible
\r
140 if (anyc(s.charAt(k)))
\r
142 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
\r
144 return k - sm1 - offset;
\r
149 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
151 // table look-up is expensive, avoid it if possible
\r
152 if (anyc(s.charAt(k)))
\r
154 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
\r
156 return k - sm1 - offset;
\r
168 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
170 // table look-up is expensive, avoid it if possible
\r
171 if (x == s.charAt(k))
\r
173 // if(src.regionMatches(0,s,k-sm1,sm1))
\r
174 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
\r
176 return k - sm1 - offset;
\r
181 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
183 // table look-up is expensive, avoid it if possible
\r
184 if (x == s.charAt(k))
\r
186 // if(src.regionMatches(0,s,k-sm1,sm1))
\r
187 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
\r
189 return k - sm1 - offset;
\r
203 public int find(StringLike s, int start, int end)
\r
205 if (s instanceof StringWrap)
\r
207 return find(s.toString(), start, end);
\r
209 start += offset + sm1;
\r
210 int vend = min(s.length() - 1, end + sm1 + offset), k;
\r
211 int vend1 = vend - jump_ahead;
\r
214 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
216 // table look-up is expensive, avoid it if possible
\r
217 if (anyc(s.charAt(k)))
\r
219 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
\r
221 return k - sm1 - offset;
\r
226 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
228 // table look-up is expensive, avoid it if possible
\r
229 if (anyc(s.charAt(k)))
\r
231 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
\r
233 return k - sm1 - offset;
\r
245 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
247 // table look-up is expensive, avoid it if possible
\r
248 if (x == s.charAt(k))
\r
250 // if(src.regionMatches(0,s,k-sm1,sm1))
\r
251 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
\r
253 return k - sm1 - offset;
\r
258 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
260 // table look-up is expensive, avoid it if possible
\r
261 if (x == s.charAt(k))
\r
263 // if(src.regionMatches(0,s,k-sm1,sm1))
\r
264 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
\r
266 return k - sm1 - offset;
\r