X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fcom%2Fstevesoft%2Fpat%2FSkipBMH.java;fp=src%2Fcom%2Fstevesoft%2Fpat%2FSkipBMH.java;h=1397b9006099803b404f6d63d76116b9c4992515;hb=506d60f0e188723ddc91c26824b41ac7034df3fe;hp=632418e6cda6cc4d549af765f0fa5f6e4d288ab0;hpb=60f2d6c034560415fd0139c8bc7df0c19cae1186;p=jalview.git diff --git a/src/com/stevesoft/pat/SkipBMH.java b/src/com/stevesoft/pat/SkipBMH.java index 632418e..1397b90 100755 --- a/src/com/stevesoft/pat/SkipBMH.java +++ b/src/com/stevesoft/pat/SkipBMH.java @@ -1,271 +1,279 @@ -// -// This software is now distributed according to -// the Lesser Gnu Public License. Please see -// http://www.gnu.org/copyleft/lesser.txt for -// the details. -// -- Happy Computing! -// -package com.stevesoft.pat; - -import com.stevesoft.pat.wrap.*; - -/** Like Skip, but implements a - - Boyer-Moore-Horspool type search - method that has been modified to be more like a "T-search" (see - the Michael Tamm''s article in C'T, magazin fuer computer und technic, August 97 - p 292). Yet another important source of information for me was - the - Deep Magic article on string searching. As of this writing, I can - beat String's indexOf method in many cases. - @see com.stevesoft.pat.Skip - @see com.stevesoft.pat.Skip2 - */ -public class SkipBMH - extends Skip -{ - // This number could be 256, but I think it's - // big enough. Note, it must be a power of 2. - final int MAX_CHAR = 64; - final char[] skip = new char[MAX_CHAR]; - int sm1; - int jump_ahead = 0; - char uc, lc, tc, x; - final boolean exact(char c) - { - return (ign && anyc(c)) || c == x; - } - - final boolean anyc(char c) - { - return c == uc || c == lc || c == tc; - } - - public SkipBMH(String pt, boolean ign) - { - this(pt, ign, 0); - } - - public SkipBMH(String pt) - { - this(pt, false, 0); - } - - public SkipBMH(String pt, boolean ign, int offset) - { - super(pt, ign, offset); - for (int k = 0; k < MAX_CHAR; k++) - { - skip[k] = (char) src.length(); - } - - sm1 = src.length() - 1; - x = src.charAt(sm1); - uc = CaseMgr.toUpperCase(x); - lc = CaseMgr.toLowerCase(x); - tc = CaseMgr.toTitleCase(x); - - // We don't really want 65536 long arrays in skip[], - // so we mask of the higher bits. This can be combined - // with ignore case, so accounting for upper - // case costs us nothing extra. - for (int k = 0; k < src.length() - 1; k++) - { - char x_ = src.charAt(k); - if (ign) - { - char uc_ = CaseMgr.toUpperCase(x_); - char lc_ = CaseMgr.toLowerCase(x_); - char tc_ = CaseMgr.toTitleCase(x_); - skip[uc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); - skip[lc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); - skip[tc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); - } - else - { - skip[x_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); - } - } - - // This trick can be found in the July issue of - // C-T magazine. This makes the method a type of - // "T-search." - jump_ahead = src.length() - 1; - for (int k = 0; k < src.length() - 1; k++) - { - char y = src.charAt(sm1 - k - 1); - if (exact(y)) - { - jump_ahead = k; - break; - } - } - } - - /** Set to true if you only want to compare two of the - characters in the String. */ - final public int searchRegion(String s, int start, int end) - { - return find(s, start, end); - } - - final public int searchFrom(String s, int start) - { - return find(s, start, s.length()); - } - - final public int search(String s) - { - return find(s, 0, s.length()); - } - - public int find(String s, int start, int end) - { - start += offset + sm1; - int vend = min(s.length() - 1, end + sm1 + offset), k; - int vend1 = vend - jump_ahead; - if (ign) - { - for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) - { - // table look-up is expensive, avoid it if possible - if (anyc(s.charAt(k))) - { - if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) - { - return k - sm1 - offset; - } - k += jump_ahead; - } - } - for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) - { - // table look-up is expensive, avoid it if possible - if (anyc(s.charAt(k))) - { - if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) - { - return k - sm1 - offset; - } - k += jump_ahead; - if (k > vend) - { - return -1; - } - } - } - } - else - { - for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) - { - // table look-up is expensive, avoid it if possible - if (x == s.charAt(k)) - { - //if(src.regionMatches(0,s,k-sm1,sm1)) - if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) - { - return k - sm1 - offset; - } - k += jump_ahead; - } - } - for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) - { - // table look-up is expensive, avoid it if possible - if (x == s.charAt(k)) - { - //if(src.regionMatches(0,s,k-sm1,sm1)) - if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) - { - return k - sm1 - offset; - } - k += jump_ahead; - if (k > vend) - { - return -1; - } - } - } - } - - return -1; - } - - public int find(StringLike s, int start, int end) - { - if (s instanceof StringWrap) - { - return find(s.toString(), start, end); - } - start += offset + sm1; - int vend = min(s.length() - 1, end + sm1 + offset), k; - int vend1 = vend - jump_ahead; - if (ign) - { - for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) - { - // table look-up is expensive, avoid it if possible - if (anyc(s.charAt(k))) - { - if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) - { - return k - sm1 - offset; - } - k += jump_ahead; - } - } - for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) - { - // table look-up is expensive, avoid it if possible - if (anyc(s.charAt(k))) - { - if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) - { - return k - sm1 - offset; - } - k += jump_ahead; - if (k > vend) - { - return -1; - } - } - } - } - else - { - for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) - { - // table look-up is expensive, avoid it if possible - if (x == s.charAt(k)) - { - //if(src.regionMatches(0,s,k-sm1,sm1)) - if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) - { - return k - sm1 - offset; - } - k += jump_ahead; - } - } - for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) - { - // table look-up is expensive, avoid it if possible - if (x == s.charAt(k)) - { - //if(src.regionMatches(0,s,k-sm1,sm1)) - if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) - { - return k - sm1 - offset; - } - k += jump_ahead; - if (k > vend) - { - return -1; - } - } - } - } - - return -1; - } -} +// +// This software is now distributed according to +// the Lesser Gnu Public License. Please see +// http://www.gnu.org/copyleft/lesser.txt for +// the details. +// -- Happy Computing! +// +package com.stevesoft.pat; + +import com.stevesoft.pat.wrap.*; + +/** + * Like Skip, but implements a + * Boyer-Moore-Horspool type search method that has been modified to be + * more like a "T-search" (see the Michael Tamm''s article in C'T, magazin + * fuer computer und technic, August 97 p 292). Yet another important + * source of information for me was the Deep Magic + * article on string searching. As of this writing, I can beat String's indexOf + * method in many cases. + * + * @see com.stevesoft.pat.Skip + * @see com.stevesoft.pat.Skip2 + */ +public class SkipBMH extends Skip +{ + // This number could be 256, but I think it's + // big enough. Note, it must be a power of 2. + final int MAX_CHAR = 64; + + final char[] skip = new char[MAX_CHAR]; + + int sm1; + + int jump_ahead = 0; + + char uc, lc, tc, x; + + final boolean exact(char c) + { + return (ign && anyc(c)) || c == x; + } + + final boolean anyc(char c) + { + return c == uc || c == lc || c == tc; + } + + public SkipBMH(String pt, boolean ign) + { + this(pt, ign, 0); + } + + public SkipBMH(String pt) + { + this(pt, false, 0); + } + + public SkipBMH(String pt, boolean ign, int offset) + { + super(pt, ign, offset); + for (int k = 0; k < MAX_CHAR; k++) + { + skip[k] = (char) src.length(); + } + + sm1 = src.length() - 1; + x = src.charAt(sm1); + uc = CaseMgr.toUpperCase(x); + lc = CaseMgr.toLowerCase(x); + tc = CaseMgr.toTitleCase(x); + + // We don't really want 65536 long arrays in skip[], + // so we mask of the higher bits. This can be combined + // with ignore case, so accounting for upper + // case costs us nothing extra. + for (int k = 0; k < src.length() - 1; k++) + { + char x_ = src.charAt(k); + if (ign) + { + char uc_ = CaseMgr.toUpperCase(x_); + char lc_ = CaseMgr.toLowerCase(x_); + char tc_ = CaseMgr.toTitleCase(x_); + skip[uc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); + skip[lc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); + skip[tc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); + } + else + { + skip[x_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); + } + } + + // This trick can be found in the July issue of + // C-T magazine. This makes the method a type of + // "T-search." + jump_ahead = src.length() - 1; + for (int k = 0; k < src.length() - 1; k++) + { + char y = src.charAt(sm1 - k - 1); + if (exact(y)) + { + jump_ahead = k; + break; + } + } + } + + /** + * Set to true if you only want to compare two of the characters in the + * String. + */ + final public int searchRegion(String s, int start, int end) + { + return find(s, start, end); + } + + final public int searchFrom(String s, int start) + { + return find(s, start, s.length()); + } + + final public int search(String s) + { + return find(s, 0, s.length()); + } + + public int find(String s, int start, int end) + { + start += offset + sm1; + int vend = min(s.length() - 1, end + sm1 + offset), k; + int vend1 = vend - jump_ahead; + if (ign) + { + for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) + { + // table look-up is expensive, avoid it if possible + if (anyc(s.charAt(k))) + { + if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) + { + return k - sm1 - offset; + } + k += jump_ahead; + } + } + for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) + { + // table look-up is expensive, avoid it if possible + if (anyc(s.charAt(k))) + { + if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) + { + return k - sm1 - offset; + } + k += jump_ahead; + if (k > vend) + { + return -1; + } + } + } + } + else + { + for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) + { + // table look-up is expensive, avoid it if possible + if (x == s.charAt(k)) + { + // if(src.regionMatches(0,s,k-sm1,sm1)) + if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) + { + return k - sm1 - offset; + } + k += jump_ahead; + } + } + for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) + { + // table look-up is expensive, avoid it if possible + if (x == s.charAt(k)) + { + // if(src.regionMatches(0,s,k-sm1,sm1)) + if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) + { + return k - sm1 - offset; + } + k += jump_ahead; + if (k > vend) + { + return -1; + } + } + } + } + + return -1; + } + + public int find(StringLike s, int start, int end) + { + if (s instanceof StringWrap) + { + return find(s.toString(), start, end); + } + start += offset + sm1; + int vend = min(s.length() - 1, end + sm1 + offset), k; + int vend1 = vend - jump_ahead; + if (ign) + { + for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) + { + // table look-up is expensive, avoid it if possible + if (anyc(s.charAt(k))) + { + if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) + { + return k - sm1 - offset; + } + k += jump_ahead; + } + } + for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) + { + // table look-up is expensive, avoid it if possible + if (anyc(s.charAt(k))) + { + if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) + { + return k - sm1 - offset; + } + k += jump_ahead; + if (k > vend) + { + return -1; + } + } + } + } + else + { + for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) + { + // table look-up is expensive, avoid it if possible + if (x == s.charAt(k)) + { + // if(src.regionMatches(0,s,k-sm1,sm1)) + if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) + { + return k - sm1 - offset; + } + k += jump_ahead; + } + } + for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) + { + // table look-up is expensive, avoid it if possible + if (x == s.charAt(k)) + { + // if(src.regionMatches(0,s,k-sm1,sm1)) + if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) + { + return k - sm1 - offset; + } + k += jump_ahead; + if (k > vend) + { + return -1; + } + } + } + } + + return -1; + } +}