2 // This software is now distributed according to
\r
3 // the Lesser Gnu Public License. Please see
\r
4 // http://www.gnu.org/copyleft/lesser.txt for
\r
6 // -- Happy Computing!
\r
8 package com.stevesoft.pat;
\r
10 import com.stevesoft.pat.wrap.*;
\r
12 /** Like Skip, but implements a
\r
13 <a href="http://www.dcc.uchile.cl/~rbaeza/handbook/algs/7/713b.srch.p.html">
\r
14 Boyer-Moore-Horspool</a> type search
\r
15 method that has been modified to be more like a "T-search" (see
\r
16 the Michael Tamm''s article in <i>C'T, magazin fuer computer und technic</i>, August 97
\r
17 p 292). Yet another important source of information for me was
\r
18 the <a href="http://www.go2net.com/people/paulp/deep/1997/05/14/">
\r
19 Deep Magic</a> article on string searching. As of this writing, I can
\r
20 beat String's indexOf method in many cases.
\r
21 @see com.stevesoft.pat.Skip
\r
22 @see com.stevesoft.pat.Skip2
\r
24 public class SkipBMH
\r
27 // This number could be 256, but I think it's
\r
28 // big enough. Note, it must be a power of 2.
\r
29 final int MAX_CHAR = 64;
\r
30 final char[] skip = new char[MAX_CHAR];
\r
34 final boolean exact(char c)
\r
36 return (ign && anyc(c)) || c == x;
\r
39 final boolean anyc(char c)
\r
41 return c == uc || c == lc || c == tc;
\r
44 public SkipBMH(String pt, boolean ign)
\r
49 public SkipBMH(String pt)
\r
54 public SkipBMH(String pt, boolean ign, int offset)
\r
56 super(pt, ign, offset);
\r
57 for (int k = 0; k < MAX_CHAR; k++)
\r
59 skip[k] = (char) src.length();
\r
62 sm1 = src.length() - 1;
\r
63 x = src.charAt(sm1);
\r
64 uc = CaseMgr.toUpperCase(x);
\r
65 lc = CaseMgr.toLowerCase(x);
\r
66 tc = CaseMgr.toTitleCase(x);
\r
68 // We don't really want 65536 long arrays in skip[],
\r
69 // so we mask of the higher bits. This can be combined
\r
70 // with ignore case, so accounting for upper
\r
71 // case costs us nothing extra.
\r
72 for (int k = 0; k < src.length() - 1; k++)
\r
74 char x_ = src.charAt(k);
\r
77 char uc_ = CaseMgr.toUpperCase(x_);
\r
78 char lc_ = CaseMgr.toLowerCase(x_);
\r
79 char tc_ = CaseMgr.toTitleCase(x_);
\r
80 skip[uc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
\r
81 skip[lc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
\r
82 skip[tc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
\r
86 skip[x_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1);
\r
90 // This trick can be found in the July issue of
\r
91 // C-T magazine. This makes the method a type of
\r
93 jump_ahead = src.length() - 1;
\r
94 for (int k = 0; k < src.length() - 1; k++)
\r
96 char y = src.charAt(sm1 - k - 1);
\r
105 /** Set to true if you only want to compare two of the
\r
106 characters in the String. */
\r
107 final public int searchRegion(String s, int start, int end)
\r
109 return find(s, start, end);
\r
112 final public int searchFrom(String s, int start)
\r
114 return find(s, start, s.length());
\r
117 final public int search(String s)
\r
119 return find(s, 0, s.length());
\r
122 public int find(String s, int start, int end)
\r
124 start += offset + sm1;
\r
125 int vend = min(s.length() - 1, end + sm1 + offset), k;
\r
126 int vend1 = vend - jump_ahead;
\r
129 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
131 // table look-up is expensive, avoid it if possible
\r
132 if (anyc(s.charAt(k)))
\r
134 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
\r
136 return k - sm1 - offset;
\r
141 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
143 // table look-up is expensive, avoid it if possible
\r
144 if (anyc(s.charAt(k)))
\r
146 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
\r
148 return k - sm1 - offset;
\r
160 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
162 // table look-up is expensive, avoid it if possible
\r
163 if (x == s.charAt(k))
\r
165 //if(src.regionMatches(0,s,k-sm1,sm1))
\r
166 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
\r
168 return k - sm1 - offset;
\r
173 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
175 // table look-up is expensive, avoid it if possible
\r
176 if (x == s.charAt(k))
\r
178 //if(src.regionMatches(0,s,k-sm1,sm1))
\r
179 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
\r
181 return k - sm1 - offset;
\r
195 public int find(StringLike s, int start, int end)
\r
197 if (s instanceof StringWrap)
\r
199 return find(s.toString(), start, end);
\r
201 start += offset + sm1;
\r
202 int vend = min(s.length() - 1, end + sm1 + offset), k;
\r
203 int vend1 = vend - jump_ahead;
\r
206 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
208 // table look-up is expensive, avoid it if possible
\r
209 if (anyc(s.charAt(k)))
\r
211 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
\r
213 return k - sm1 - offset;
\r
218 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
220 // table look-up is expensive, avoid it if possible
\r
221 if (anyc(s.charAt(k)))
\r
223 if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1))
\r
225 return k - sm1 - offset;
\r
237 for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
239 // table look-up is expensive, avoid it if possible
\r
240 if (x == s.charAt(k))
\r
242 //if(src.regionMatches(0,s,k-sm1,sm1))
\r
243 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
\r
245 return k - sm1 - offset;
\r
250 for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)])
\r
252 // table look-up is expensive, avoid it if possible
\r
253 if (x == s.charAt(k))
\r
255 //if(src.regionMatches(0,s,k-sm1,sm1))
\r
256 if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1))
\r
258 return k - sm1 - offset;
\r