needed for applet search
[jalview.git] / src / com / stevesoft / pat / SkipBMH.java
diff --git a/src/com/stevesoft/pat/SkipBMH.java b/src/com/stevesoft/pat/SkipBMH.java
new file mode 100755 (executable)
index 0000000..4fbe381
--- /dev/null
@@ -0,0 +1,183 @@
+//\r
+// This software is now distributed according to\r
+// the Lesser Gnu Public License.  Please see\r
+// http://www.gnu.org/copyleft/lesser.txt for\r
+// the details.\r
+//    -- Happy Computing!\r
+//\r
+package com.stevesoft.pat;\r
+\r
+import com.stevesoft.pat.wrap.StringWrap;\r
+\r
+/** Like Skip, but implements a\r
+    <a href="http://www.dcc.uchile.cl/~rbaeza/handbook/algs/7/713b.srch.p.html">\r
+    Boyer-Moore-Horspool</a> type search\r
+    method that has been modified to be more like a "T-search" (see\r
+    the Michael Tamm''s article in <i>C'T, magazin fuer computer und technic</i>, August 97\r
+    p 292).  Yet another important source of information for me was\r
+    the <a href="http://www.go2net.com/people/paulp/deep/1997/05/14/">\r
+    Deep Magic</a> article on string searching.  As of this writing, I can\r
+    beat String's indexOf method in many cases.\r
+    @see com.stevesoft.pat.Skip\r
+    @see com.stevesoft.pat.Skip2\r
+    */\r
+public class SkipBMH extends Skip {\r
+    // This number could be 256, but I think it's\r
+    // big enough.  Note, it must be a power of 2.\r
+    final int MAX_CHAR = 64;\r
+    final char[] skip = new char[MAX_CHAR];\r
+    int sm1;\r
+    int jump_ahead = 0;\r
+    char uc,lc,tc,x;\r
+    final boolean exact(char c) {\r
+        return (ign && anyc(c))||c==x;\r
+    }\r
+    final boolean anyc(char c) {\r
+        return c==uc||c==lc||c==tc;\r
+    }\r
+    public SkipBMH(String pt,boolean ign) { this(pt,ign,0); }\r
+    public SkipBMH(String pt) { this(pt,false,0); }\r
+    public SkipBMH(String pt,boolean ign,int offset) {\r
+        super(pt,ign,offset);\r
+        for(int k=0;k<MAX_CHAR;k++)\r
+            skip[k] = (char)src.length();\r
+\r
+        sm1 = src.length()-1;\r
+        x = src.charAt(sm1);\r
+        uc=CaseMgr.toUpperCase(x);\r
+        lc=CaseMgr.toLowerCase(x);\r
+        tc=CaseMgr.toTitleCase(x);\r
+\r
+        // We don't really want 65536 long arrays in skip[],\r
+        // so we mask of the higher bits.  This can be combined\r
+        // with ignore case, so accounting for upper\r
+        // case costs us nothing extra.\r
+        for(int k=0;k<src.length()-1;k++) {\r
+            char x_ = src.charAt(k);\r
+            if(ign) {\r
+                char uc_ = CaseMgr.toUpperCase(x_);\r
+                char lc_ = CaseMgr.toLowerCase(x_);\r
+                char tc_ = CaseMgr.toTitleCase(x_);\r
+                skip[uc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1);\r
+                skip[lc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1);\r
+                skip[tc_ & (MAX_CHAR-1)]=(char)(src.length()-k-1);\r
+            } else\r
+                skip[x_ & (MAX_CHAR-1)] = (char)(src.length()-k-1);\r
+        }\r
+\r
+        // This trick can be found in the July issue of\r
+        // C-T magazine.  This makes the method a type of\r
+        // "T-search."\r
+        jump_ahead = src.length()-1;\r
+        for(int k=0;k<src.length()-1;k++) {\r
+            char y=src.charAt(sm1-k-1);\r
+            if(exact(y)) {\r
+                jump_ahead = k;\r
+                break;\r
+            }\r
+        }\r
+    }\r
+    /** Set to true if you only want to compare two of the\r
+        characters in the String. */\r
+    final public int searchRegion(String s,int start,int end) {\r
+        return find(s,start,end);\r
+    }\r
+    final public int searchFrom(String s,int start) {\r
+        return find(s,start,s.length());\r
+    }\r
+    final public int search(String s) { return find(s,0,s.length()); }\r
+    public int find(String s,int start,int end) {\r
+        start += offset+sm1;\r
+        int vend = min(s.length()-1,end+sm1+offset),k;\r
+        int vend1 = vend-jump_ahead;\r
+        if(ign) {\r
+            for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
+                // table look-up is expensive, avoid it if possible\r
+                if( anyc(s.charAt(k)) ) {\r
+                    if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))\r
+                        return k-sm1-offset;\r
+                    k += jump_ahead;\r
+                }\r
+            }\r
+            for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
+                // table look-up is expensive, avoid it if possible\r
+                if( anyc(s.charAt(k)) ) {\r
+                    if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))\r
+                        return k-sm1-offset;\r
+                    k += jump_ahead;\r
+                    if(k > vend) return -1;\r
+                }\r
+            }\r
+        } else {\r
+            for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
+                // table look-up is expensive, avoid it if possible\r
+                if( x==s.charAt(k) ) {\r
+                    //if(src.regionMatches(0,s,k-sm1,sm1))\r
+                    if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))\r
+                        return k-sm1-offset;\r
+                    k += jump_ahead;\r
+                }\r
+            }\r
+            for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
+                // table look-up is expensive, avoid it if possible\r
+                if( x==s.charAt(k) ) {\r
+                    //if(src.regionMatches(0,s,k-sm1,sm1))\r
+                    if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))\r
+                        return k-sm1-offset;\r
+                    k += jump_ahead;\r
+                    if(k > vend) return -1;\r
+                }\r
+            }\r
+        }\r
+\r
+        return -1;\r
+    }\r
+    public int find(StringLike s,int start,int end) {\r
+        if(s instanceof StringWrap)\r
+          return find(s.toString(),start,end);\r
+        start += offset+sm1;\r
+        int vend = min(s.length()-1,end+sm1+offset),k;\r
+        int vend1 = vend-jump_ahead;\r
+        if(ign) {\r
+            for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
+                // table look-up is expensive, avoid it if possible\r
+                if( anyc(s.charAt(k)) ) {\r
+                    if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))\r
+                        return k-sm1-offset;\r
+                    k += jump_ahead;\r
+                }\r
+            }\r
+            for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
+                // table look-up is expensive, avoid it if possible\r
+                if( anyc(s.charAt(k)) ) {\r
+                    if(CaseMgr.regionMatches(src,ign,0,s,k-sm1,sm1))\r
+                        return k-sm1-offset;\r
+                    k += jump_ahead;\r
+                    if(k > vend) return -1;\r
+                }\r
+            }\r
+        } else {\r
+            for(k=start; k <= vend1;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
+                // table look-up is expensive, avoid it if possible\r
+                if( x==s.charAt(k) ) {\r
+                    //if(src.regionMatches(0,s,k-sm1,sm1))\r
+                    if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))\r
+                        return k-sm1-offset;\r
+                    k += jump_ahead;\r
+                }\r
+            }\r
+            for(; k <= vend;k += skip[s.charAt(k) & (MAX_CHAR-1)] ) {\r
+                // table look-up is expensive, avoid it if possible\r
+                if( x==s.charAt(k) ) {\r
+                    //if(src.regionMatches(0,s,k-sm1,sm1))\r
+                    if(CaseMgr.regionMatches(src,false,0,s,k-sm1,sm1))\r
+                        return k-sm1-offset;\r
+                    k += jump_ahead;\r
+                    if(k > vend) return -1;\r
+                }\r
+            }\r
+        }\r
+\r
+        return -1;\r
+    }\r
+}\r