needed for applet search
[jalview.git] / src / com / stevesoft / pat / SkipBMH.java
1 //\r
2 // This software is now distributed according to\r
3 // the Lesser Gnu Public License.  Please see\r
4 // http://www.gnu.org/copyleft/lesser.txt for\r
5 // the details.\r
6 //    -- Happy Computing!\r
7 //\r
8 package com.stevesoft.pat;\r
9 \r
10 import com.stevesoft.pat.wrap.StringWrap;\r
11 \r
12 /** Like Skip, but implements a\r
13     <a href="http://www.dcc.uchile.cl/~rbaeza/handbook/algs/7/713b.srch.p.html">\r
14     Boyer-Moore-Horspool</a> type search\r
15     method that has been modified to be more like a "T-search" (see\r
16     the Michael Tamm''s article in <i>C'T, magazin fuer computer und technic</i>, August 97\r
17     p 292).  Yet another important source of information for me was\r
18     the <a href="http://www.go2net.com/people/paulp/deep/1997/05/14/">\r
19     Deep Magic</a> article on string searching.  As of this writing, I can\r
20     beat String's indexOf method in many cases.\r
21     @see com.stevesoft.pat.Skip\r
22     @see com.stevesoft.pat.Skip2\r
23     */\r
24 public class SkipBMH extends Skip {\r
25     // This number could be 256, but I think it's\r
26     // big enough.  Note, it must be a power of 2.\r
27     final int MAX_CHAR = 64;\r
28     final char[] skip = new char[MAX_CHAR];\r
29     int sm1;\r
30     int jump_ahead = 0;\r
31     char uc,lc,tc,x;\r
32     final boolean exact(char c) {\r
33         return (ign && anyc(c))||c==x;\r
34     }\r
35     final boolean anyc(char c) {\r
36         return c==uc||c==lc||c==tc;\r
37     }\r
38     public SkipBMH(String pt,boolean ign) { this(pt,ign,0); }\r
39     public SkipBMH(String pt) { this(pt,false,0); }\r
40     public SkipBMH(String pt,boolean ign,int offset) {\r
41         super(pt,ign,offset);\r
42         for(int k=0;k<MAX_CHAR;k++)\r
43             skip[k] = (char)src.length();\r
44 \r
45         sm1 = src.length()-1;\r
46         x = src.charAt(sm1);\r
47         uc=CaseMgr.toUpperCase(x);\r
48         lc=CaseMgr.toLowerCase(x);\r
49         tc=CaseMgr.toTitleCase(x);\r
50 \r
51         // We don't really want 65536 long arrays in skip[],\r
52         // so we mask of the higher bits.  This can be combined\r
53         // with ignore case, so accounting for upper\r
54         // case costs us nothing extra.\r
55         for(int k=0;k<src.length()-1;k++) {\r
56             char x_ = src.charAt(k);\r
57             if(ign) {\r
58                 char uc_ = CaseMgr.toUpperCase(x_);\r
59                 char lc_ = CaseMgr.toLowerCase(x_);\r
60                 char tc_ = CaseMgr.toTitleCase(x_);\r
61                 skip[uc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1);\r
62                 skip[lc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1);\r
63                 skip[tc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1);\r
64             } else\r
65                 skip[x_ & (MAX_CHAR-1)] = (char)(src.length()-k-1);\r
66         }\r
67 \r
68         // This trick can be found in the July issue of\r
69         // C-T magazine.  This makes the method a type of\r
70         // "T-search."\r
71         jump_ahead = src.length()-1;\r
72         for(int k=0;k<src.length()-1;k++) {\r
73             char y=src.charAt(sm1-k-1);\r
74             if(exact(y)) {\r
75                 jump_ahead = k;\r
76                 break;\r
77             }\r
78         }\r
79     }\r
80     /** Set to true if you only want to compare two of the\r
81         characters in the String. */\r
82     final public int searchRegion(String s,int start,int end) {\r
83         return find(s,start,end);\r
84     }\r
85     final public int searchFrom(String s,int start) {\r
86         return find(s,start,s.length());\r
87     }\r
88     final public int search(String s) { return find(s,0,s.length()); }\r
89     public int find(String s,int start,int end) {\r
90         start += offset+sm1;\r
91         int vend = min(s.length()-1,end+sm1+offset),k;\r
92         int vend1 = vend-jump_ahead;\r
93         if(ign) {\r
94             for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
95                 // table look-up is expensive, avoid it if possible\r
96                 if( anyc(s.charAt(k)) ) {\r
97                     if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))\r
98                         return k-sm1-offset;\r
99                     k += jump_ahead;\r
100                 }\r
101             }\r
102             for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
103                 // table look-up is expensive, avoid it if possible\r
104                 if( anyc(s.charAt(k)) ) {\r
105                     if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))\r
106                         return k-sm1-offset;\r
107                     k += jump_ahead;\r
108                     if(k > vend) return -1;\r
109                 }\r
110             }\r
111         } else {\r
112             for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
113                 // table look-up is expensive, avoid it if possible\r
114                 if( x==s.charAt(k) ) {\r
115                     //if(src.regionMatches(0,s,k-sm1,sm1))\r
116                     if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))\r
117                         return k-sm1-offset;\r
118                     k += jump_ahead;\r
119                 }\r
120             }\r
121             for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
122                 // table look-up is expensive, avoid it if possible\r
123                 if( x==s.charAt(k) ) {\r
124                     //if(src.regionMatches(0,s,k-sm1,sm1))\r
125                     if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))\r
126                         return k-sm1-offset;\r
127                     k += jump_ahead;\r
128                     if(k > vend) return -1;\r
129                 }\r
130             }\r
131         }\r
132 \r
133         return -1;\r
134     }\r
135     public int find(StringLike s,int start,int end) {\r
136         if(s instanceof StringWrap)\r
137           return find(s.toString(),start,end);\r
138         start += offset+sm1;\r
139         int vend = min(s.length()-1,end+sm1+offset),k;\r
140         int vend1 = vend-jump_ahead;\r
141         if(ign) {\r
142             for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
143                 // table look-up is expensive, avoid it if possible\r
144                 if( anyc(s.charAt(k)) ) {\r
145                     if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))\r
146                         return k-sm1-offset;\r
147                     k += jump_ahead;\r
148                 }\r
149             }\r
150             for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
151                 // table look-up is expensive, avoid it if possible\r
152                 if( anyc(s.charAt(k)) ) {\r
153                     if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))\r
154                         return k-sm1-offset;\r
155                     k += jump_ahead;\r
156                     if(k > vend) return -1;\r
157                 }\r
158             }\r
159         } else {\r
160             for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
161                 // table look-up is expensive, avoid it if possible\r
162                 if( x==s.charAt(k) ) {\r
163                     //if(src.regionMatches(0,s,k-sm1,sm1))\r
164                     if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))\r
165                         return k-sm1-offset;\r
166                     k += jump_ahead;\r
167                 }\r
168             }\r
169             for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
170                 // table look-up is expensive, avoid it if possible\r
171                 if( x==s.charAt(k) ) {\r
172                     //if(src.regionMatches(0,s,k-sm1,sm1))\r
173                     if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))\r
174                         return k-sm1-offset;\r
175                     k += jump_ahead;\r
176                     if(k > vend) return -1;\r
177                 }\r
178             }\r
179         }\r
180 \r
181         return -1;\r
182     }\r
183 }\r