/***************************************************************** * SQUID - a library of functions for biological sequence analysis * Copyright (C) 1992-2002 Washington University School of Medicine * * This source code is freely distributed under the terms of the * GNU General Public License. See the files COPYRIGHT and LICENSE * for details. *****************************************************************/ /* rk.c (originally from rnabob's patsearch.c) * * Contains a compiler and a search engine for Rabin-Karp * based primary sequence pattern searching on encoded * sequences. * * See Sedgewick, _Algorithms_, for a general discussion of * the Rabin-Karp algorithm. See the rkcomp or rkexec man * pages for specific details. * * RCS $Id: rk.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: rk.c,v 1.2 1998/10/09 18:07:16 eddy Exp) */ #include #include #include #include "squid.h" /* seq encoding utilities and typedefs */ #include "rk.h" #ifdef MEMDEBUG #include "dbmalloc.h" #endif Hashseq rkcomp(char *probe) /* A,C,G,T/U, N probe string, 0-8 nt long */ { Hashseq hashprobe = 0; char coded[RK_HASHSIZE + 1]; int len; int i; /* check bounds violation on probe */ if ((len = strlen(probe)) > RK_HASHSIZE) return 0; /* encode the probe */ if (seqencode(coded, probe) == 0) return 0; /* pack the probe into a Hashseq */ for (i = 0; i < len; i++) { hashprobe <<= 4; hashprobe |= (Hashseq) coded[i]; } /* left adjust as needed */ for (; i < RK_HASHSIZE; i++) { hashprobe <<= 4; hashprobe |= (Hashseq) NTN; } /* return the compiled probe */ return hashprobe; } int rkseq(Hashseq hashprobe, /* up to 8 nt packed into the probe */ char *sequence) /* encoded sequence */ { long i; long pos = 0; Hashseq target = 0; /* initialize the target hashseq */ for (i = 0; i < RK_HASHSIZE; i++) { if (*(sequence + i) == NTEND) break; target <<= 4; target |= (Hashseq) (*(sequence + i)); } while (*(sequence + pos + RK_HASHSIZE -1) != NTEND) { #ifdef DEBUG printf("hashprobe: "); writehash(hashprobe); printf("\ttarget: "); writehash(target); printf("\nhashprobe & target: "); writehash(hashprobe & target); printf("\n"); #endif if ((hashprobe & target) == target) return ((int) pos); target <<= 4; target |= (Hashseq) (*(sequence + pos + RK_HASHSIZE)); pos++; } /* now we deal with an end effect */ for (i = 0; i < RK_HASHSIZE; i++) { target |= (Hashseq) NTN; if ((hashprobe & target) == target) return ((int) pos); target <<=4; pos++; } return(-1); } #ifdef DEBUG /* Debugging aids */ static void writehash(Hashseq hashseq) { int idx; int sym; if (hashseq/16) writehash(hashseq/16); sym = (int) (hashseq % 16); if (sym == 0) putchar('-'); else { for (idx = 0; sym != iupac[idx].code && idx < IUPACSYMNUM; idx++); if (idx > IUPACSYMNUM) printf("(%d)", sym); else putchar(iupac[idx].sym); } } #endif