2 // This software is now distributed according to
\r
3 // the Lesser Gnu Public License. Please see
\r
4 // http://www.gnu.org/copyleft/lesser.txt for
\r
6 // -- Happy Computing!
\r
8 package com.stevesoft.pat;
\r
10 import com.stevesoft.pat.wrap.StringWrap;
\r
12 /** Like Skip, but implements a
\r
13 <a href="http://www.dcc.uchile.cl/~rbaeza/handbook/algs/7/713b.srch.p.html">
\r
14 Boyer-Moore-Horspool</a> type search
\r
15 method that has been modified to be more like a "T-search" (see
\r
16 the Michael Tamm''s article in <i>C'T, magazin fuer computer und technic</i>, August 97
\r
17 p 292). Yet another important source of information for me was
\r
18 the <a href="http://www.go2net.com/people/paulp/deep/1997/05/14/">
\r
19 Deep Magic</a> article on string searching. As of this writing, I can
\r
20 beat String's indexOf method in many cases.
\r
21 @see com.stevesoft.pat.Skip
\r
22 @see com.stevesoft.pat.Skip2
\r
24 public class SkipBMH extends Skip {
\r
25 // This number could be 256, but I think it's
\r
26 // big enough. Note, it must be a power of 2.
\r
27 final int MAX_CHAR = 64;
\r
28 final char[] skip = new char[MAX_CHAR];
\r
32 final boolean exact(char c) {
\r
33 return (ign && anyc(c))||c==x;
\r
35 final boolean anyc(char c) {
\r
36 return c==uc||c==lc||c==tc;
\r
38 public SkipBMH(String pt,boolean ign) { this(pt,ign,0); }
\r
39 public SkipBMH(String pt) { this(pt,false,0); }
\r
40 public SkipBMH(String pt,boolean ign,int offset) {
\r
41 super(pt,ign,offset);
\r
42 for(int k=0;k<MAX_CHAR;k++)
\r
43 skip[k] = (char)src.length();
\r
45 sm1 = src.length()-1;
\r
46 x = src.charAt(sm1);
\r
47 uc=CaseMgr.toUpperCase(x);
\r
48 lc=CaseMgr.toLowerCase(x);
\r
49 tc=CaseMgr.toTitleCase(x);
\r
51 // We don't really want 65536 long arrays in skip[],
\r
52 // so we mask of the higher bits. This can be combined
\r
53 // with ignore case, so accounting for upper
\r
54 // case costs us nothing extra.
\r
55 for(int k=0;k<src.length()-1;k++) {
\r
56 char x_ = src.charAt(k);
\r
58 char uc_ = CaseMgr.toUpperCase(x_);
\r
59 char lc_ = CaseMgr.toLowerCase(x_);
\r
60 char tc_ = CaseMgr.toTitleCase(x_);
\r
61 skip[uc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1);
\r
62 skip[lc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1);
\r
63 skip[tc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1);
\r
65 skip[x_ & (MAX_CHAR-1)] = (char)(src.length()-k-1);
\r
68 // This trick can be found in the July issue of
\r
69 // C-T magazine. This makes the method a type of
\r
71 jump_ahead = src.length()-1;
\r
72 for(int k=0;k<src.length()-1;k++) {
\r
73 char y=src.charAt(sm1-k-1);
\r
80 /** Set to true if you only want to compare two of the
\r
81 characters in the String. */
\r
82 final public int searchRegion(String s,int start,int end) {
\r
83 return find(s,start,end);
\r
85 final public int searchFrom(String s,int start) {
\r
86 return find(s,start,s.length());
\r
88 final public int search(String s) { return find(s,0,s.length()); }
\r
89 public int find(String s,int start,int end) {
\r
90 start += offset+sm1;
\r
91 int vend = min(s.length()-1,end+sm1+offset),k;
\r
92 int vend1 = vend-jump_ahead;
\r
94 for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {
\r
95 // table look-up is expensive, avoid it if possible
\r
96 if( anyc(s.charAt(k)) ) {
\r
97 if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))
\r
98 return k-sm1-offset;
\r
102 for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {
\r
103 // table look-up is expensive, avoid it if possible
\r
104 if( anyc(s.charAt(k)) ) {
\r
105 if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))
\r
106 return k-sm1-offset;
\r
108 if(k > vend) return -1;
\r
112 for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {
\r
113 // table look-up is expensive, avoid it if possible
\r
114 if( x==s.charAt(k) ) {
\r
115 //if(src.regionMatches(0,s,k-sm1,sm1))
\r
116 if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))
\r
117 return k-sm1-offset;
\r
121 for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {
\r
122 // table look-up is expensive, avoid it if possible
\r
123 if( x==s.charAt(k) ) {
\r
124 //if(src.regionMatches(0,s,k-sm1,sm1))
\r
125 if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))
\r
126 return k-sm1-offset;
\r
128 if(k > vend) return -1;
\r
135 public int find(StringLike s,int start,int end) {
\r
136 if(s instanceof StringWrap)
\r
137 return find(s.toString(),start,end);
\r
138 start += offset+sm1;
\r
139 int vend = min(s.length()-1,end+sm1+offset),k;
\r
140 int vend1 = vend-jump_ahead;
\r
142 for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {
\r
143 // table look-up is expensive, avoid it if possible
\r
144 if( anyc(s.charAt(k)) ) {
\r
145 if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))
\r
146 return k-sm1-offset;
\r
150 for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {
\r
151 // table look-up is expensive, avoid it if possible
\r
152 if( anyc(s.charAt(k)) ) {
\r
153 if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))
\r
154 return k-sm1-offset;
\r
156 if(k > vend) return -1;
\r
160 for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {
\r
161 // table look-up is expensive, avoid it if possible
\r
162 if( x==s.charAt(k) ) {
\r
163 //if(src.regionMatches(0,s,k-sm1,sm1))
\r
164 if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))
\r
165 return k-sm1-offset;
\r
169 for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {
\r
170 // table look-up is expensive, avoid it if possible
\r
171 if( x==s.charAt(k) ) {
\r
172 //if(src.regionMatches(0,s,k-sm1,sm1))
\r
173 if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))
\r
174 return k-sm1-offset;
\r
176 if(k > vend) return -1;
\r