// // This software is now distributed according to // the Lesser Gnu Public License. Please see // http://www.gnu.org/copyleft/lesser.txt for // the details. // -- Happy Computing! // package com.stevesoft.pat; import com.stevesoft.pat.wrap.*; /** Like Skip, but implements a Boyer-Moore-Horspool type search method that has been modified to be more like a "T-search" (see the Michael Tamm''s article in C'T, magazin fuer computer und technic, August 97 p 292). Yet another important source of information for me was the Deep Magic article on string searching. As of this writing, I can beat String's indexOf method in many cases. @see com.stevesoft.pat.Skip @see com.stevesoft.pat.Skip2 */ public class SkipBMH extends Skip { // This number could be 256, but I think it's // big enough. Note, it must be a power of 2. final int MAX_CHAR = 64; final char[] skip = new char[MAX_CHAR]; int sm1; int jump_ahead = 0; char uc, lc, tc, x; final boolean exact(char c) { return (ign && anyc(c)) || c == x; } final boolean anyc(char c) { return c == uc || c == lc || c == tc; } public SkipBMH(String pt, boolean ign) { this(pt, ign, 0); } public SkipBMH(String pt) { this(pt, false, 0); } public SkipBMH(String pt, boolean ign, int offset) { super(pt, ign, offset); for (int k = 0; k < MAX_CHAR; k++) { skip[k] = (char) src.length(); } sm1 = src.length() - 1; x = src.charAt(sm1); uc = CaseMgr.toUpperCase(x); lc = CaseMgr.toLowerCase(x); tc = CaseMgr.toTitleCase(x); // We don't really want 65536 long arrays in skip[], // so we mask of the higher bits. This can be combined // with ignore case, so accounting for upper // case costs us nothing extra. for (int k = 0; k < src.length() - 1; k++) { char x_ = src.charAt(k); if (ign) { char uc_ = CaseMgr.toUpperCase(x_); char lc_ = CaseMgr.toLowerCase(x_); char tc_ = CaseMgr.toTitleCase(x_); skip[uc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); skip[lc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); skip[tc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); } else { skip[x_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); } } // This trick can be found in the July issue of // C-T magazine. This makes the method a type of // "T-search." jump_ahead = src.length() - 1; for (int k = 0; k < src.length() - 1; k++) { char y = src.charAt(sm1 - k - 1); if (exact(y)) { jump_ahead = k; break; } } } /** Set to true if you only want to compare two of the characters in the String. */ final public int searchRegion(String s, int start, int end) { return find(s, start, end); } final public int searchFrom(String s, int start) { return find(s, start, s.length()); } final public int search(String s) { return find(s, 0, s.length()); } public int find(String s, int start, int end) { start += offset + sm1; int vend = min(s.length() - 1, end + sm1 + offset), k; int vend1 = vend - jump_ahead; if (ign) { for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) { // table look-up is expensive, avoid it if possible if (anyc(s.charAt(k))) { if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) { return k - sm1 - offset; } k += jump_ahead; } } for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) { // table look-up is expensive, avoid it if possible if (anyc(s.charAt(k))) { if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) { return k - sm1 - offset; } k += jump_ahead; if (k > vend) { return -1; } } } } else { for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) { // table look-up is expensive, avoid it if possible if (x == s.charAt(k)) { //if(src.regionMatches(0,s,k-sm1,sm1)) if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) { return k - sm1 - offset; } k += jump_ahead; } } for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) { // table look-up is expensive, avoid it if possible if (x == s.charAt(k)) { //if(src.regionMatches(0,s,k-sm1,sm1)) if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) { return k - sm1 - offset; } k += jump_ahead; if (k > vend) { return -1; } } } } return -1; } public int find(StringLike s, int start, int end) { if (s instanceof StringWrap) { return find(s.toString(), start, end); } start += offset + sm1; int vend = min(s.length() - 1, end + sm1 + offset), k; int vend1 = vend - jump_ahead; if (ign) { for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) { // table look-up is expensive, avoid it if possible if (anyc(s.charAt(k))) { if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) { return k - sm1 - offset; } k += jump_ahead; } } for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) { // table look-up is expensive, avoid it if possible if (anyc(s.charAt(k))) { if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) { return k - sm1 - offset; } k += jump_ahead; if (k > vend) { return -1; } } } } else { for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) { // table look-up is expensive, avoid it if possible if (x == s.charAt(k)) { //if(src.regionMatches(0,s,k-sm1,sm1)) if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) { return k - sm1 - offset; } k += jump_ahead; } } for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) { // table look-up is expensive, avoid it if possible if (x == s.charAt(k)) { //if(src.regionMatches(0,s,k-sm1,sm1)) if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) { return k - sm1 - offset; } k += jump_ahead; if (k > vend) { return -1; } } } } return -1; } }