/***************************************************************** * SQUID - a library of functions for biological sequence analysis * Copyright (C) 1992-2002 Washington University School of Medicine * * This source code is freely distributed under the terms of the * GNU General Public License. See the files COPYRIGHT and LICENSE * for details. *****************************************************************/ #include #include #include #include #include #include #include "squid.h" #include "ssi.h" #ifdef CLUSTALO #include #endif static sqd_uint32 v20magic = 0xf3f3e9b1; /* SSI 1.0: "ssi1" + 0x80808080 */ static sqd_uint32 v20swap = 0xb1e9f3f3; /* byteswapped */ static int read_i16(FILE *fp, sqd_uint16 *ret_result); static int read_i32(FILE *fp, sqd_uint32 *ret_result); static int read_i64(FILE *fp, sqd_uint64 *ret_result); static int read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset); static int write_i16(FILE *fp, sqd_uint16 n); static int write_i32(FILE *fp, sqd_uint32 n); static int write_i64(FILE *fp, sqd_uint64 n); static int write_offset(FILE *fp, SSIOFFSET *offset); static int binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base, sqd_uint32 recsize, sqd_uint32 maxidx); static int indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len, sqd_uint32 n); static void clear_ssifile(SSIFILE *sfp); static sqd_uint64 current_index_size(SSIINDEX *g); static int activate_external_sort(SSIINDEX *g); static int load_indexfile(SSIFILE *sfp); static int parse_pkey_info(char *buf, char mode, struct ssipkey_s *pkey); static int parse_skey_info(char *buf, struct ssiskey_s *skey); /* Function: SSIOpen() * Date: SRE, Sun Dec 31 12:40:03 2000 [St. Louis] * * Purpose: Opens the SSI index file {filename} and returns * a SSIFILE * stream thru {ret_sfp}. * The caller must eventually close this stream using * SSIClose(). More than one index file can be open * at once. * * Args: filename - full path to a SSI index file * * Returns: Returns 0 on success, nonzero on failure. */ int SSIOpen(char *filename, SSIFILE **ret_sfp) { SSIFILE *sfp = NULL; int status; if ((sfp = malloc(sizeof(SSIFILE))) == NULL) return SSI_ERR_MALLOC; if ((sfp->fp = fopen(filename, "rb")) == NULL) { free(sfp); return SSI_ERR_NOFILE; } status = load_indexfile(sfp); *ret_sfp = sfp; return status; } /* load_indexfile(): given a SSIFILE structure with an open and positioned * stream (fp) -- but no other data loaded -- read the next SSIFILE * in from disk. We use this routine without its SSIOpen() wrapper * as part of the external mergesort when creating large indices. */ static int load_indexfile(SSIFILE *sfp) { sqd_uint32 magic; sqd_uint16 i; /* counter over files */ int status; /* overall return status if an error is thrown */ status = SSI_ERR_BADFORMAT; /* default: almost every kind of error is a bad format error */ sfp->filename = NULL; sfp->fileformat = NULL; sfp->fileflags = NULL; sfp->bpl = NULL; sfp->rpl = NULL; sfp->nfiles = 0; if (! read_i32(sfp->fp, &magic)) {status = SSI_ERR_BADMAGIC; goto FAILURE; } if (magic != v20magic && magic != v20swap) {status = SSI_ERR_BADMAGIC; goto FAILURE; } if (! read_i32(sfp->fp, &(sfp->flags))) goto FAILURE; /* If we have 64-bit offsets, make sure we can deal with them. */ #ifndef HAS_64BIT_FILE_OFFSETS if ((sfp->flags & SSI_USE64_INDEX) || (sfp->flags & SSI_USE64)) { status = SSI_ERR_NO64BIT; goto FAILURE; } #endif sfp->imode = (sfp->flags & SSI_USE64_INDEX) ? SSI_OFFSET_I64 : SSI_OFFSET_I32; sfp->smode = (sfp->flags & SSI_USE64) ? SSI_OFFSET_I64 : SSI_OFFSET_I32; if (! read_i16(sfp->fp, &(sfp->nfiles))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->nprimary))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->nsecondary))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->flen))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->plen))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->slen))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->frecsize))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->precsize))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->srecsize))) goto FAILURE; if (! read_offset(sfp->fp, sfp->imode, &(sfp->foffset))) goto FAILURE; if (! read_offset(sfp->fp, sfp->imode, &(sfp->poffset))) goto FAILURE; if (! read_offset(sfp->fp, sfp->imode, &(sfp->soffset))) goto FAILURE; /* Read the file information and keep it. * We expect the number of files to be small, so reading it * once should be advantageous overall. If SSI ever had to * deal with large numbers of files, you'd probably want to * read file information on demand. */ if (sfp->nfiles == 0) goto FAILURE; if ((sfp->filename=malloc(sizeof(char *) *sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } for (i = 0; i < sfp->nfiles; i++) sfp->filename[i] = NULL; if ((sfp->fileformat=malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } if ((sfp->fileflags =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } if ((sfp->bpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } if ((sfp->rpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } for (i = 0; i < sfp->nfiles; i++) { /* We have to explicitly position, because header and file * records may expand in the future; frecsize and foffset * give us forwards compatibility. */ if (indexfile_position(sfp, &(sfp->foffset), sfp->frecsize, i) !=0) goto FAILURE; if ((sfp->filename[i] =malloc(sizeof(char)*sfp->flen)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } if (fread(sfp->filename[i],sizeof(char),sfp->flen, sfp->fp)!=sfp->flen) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->fileformat[i]))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->fileflags[i]))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->bpl[i]))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->rpl[i]))) goto FAILURE; } /* Success. Return 0. */ return 0; FAILURE: /* Failure: free the damaged structure, return status code. */ SSIClose(sfp); return status; } /* Function: SSIGetOffsetByName() * Date: SRE, Sun Dec 31 13:55:31 2000 [St. Louis] * * Purpose: Looks up the string {key} in the open index {sfp}. * {key} can be either a primary or secondary key. If {key} * is found, {*ret_fh} contains a unique handle on * the file that contains {key} (suitable for an SSIFileInfo() * call, or for comparison to the handle of the last file * that was opened for retrieval), and {offset} is filled * in with the offset in that file. * * Args: sfp - open index file * key - string to search for * ret_fh - RETURN: handle on file that key is in * ret_offset - RETURN: offset of the start of that key's record * * Returns: 0 on success. * non-zero on error. */ int SSIGetOffsetByName(SSIFILE *sfp, char *key, int *ret_fh, SSIOFFSET *ret_offset) { int status; sqd_uint16 fnum; /* Look in the primary keys. */ status = binary_search(sfp, key, sfp->plen, &(sfp->poffset), sfp->precsize, sfp->nprimary); if (status == 0) { /* We found it as a primary key; get our data & return. */ if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA; *ret_fh = (int) fnum; if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA; return 0; /* success! (we don't need the other key data) */ } else if (status == SSI_ERR_NO_SUCH_KEY) { /* Not in the primary keys? OK, try the secondary keys. */ if (sfp->nsecondary > 0) { char *pkey; status = binary_search(sfp, key, sfp->slen, &(sfp->soffset), sfp->srecsize, sfp->nsecondary); if (status != 0) return status; if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC; if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA; status = SSIGetOffsetByName(sfp, pkey, ret_fh, ret_offset); free(pkey); } return status; } else return status; /*NOTREACHED*/ } /* Function: SSIGetOffsetByNumber() * Date: SRE, Mon Jan 1 19:42:42 2001 [St. Louis] * * Purpose: Looks up primary key #{n} in the open index {sfp}. * {n} ranges from 0..nprimary-1. When key #{n} * is found, {*ret_fh} contains a unique * handle on the file that contains {key} (suitable * for an SSIFileInfo() call, or for comparison to * the handle of the last file that was opened for retrieval), * and {offset} is filled in with the offset in that file. * * Args: sfp - open index file * n - primary key number to retrieve. * ret_fh - RETURN: handle on file that key is in * ret_offset - RETURN: offset of the start of that key's record * * Returns: 0 on success. * non-zero on error. */ int SSIGetOffsetByNumber(SSIFILE *sfp, int n, int *ret_fh, SSIOFFSET *ret_offset) { sqd_uint16 fnum; char *pkey; if (n >= sfp->nprimary) return SSI_ERR_NO_SUCH_KEY; if (indexfile_position(sfp, &(sfp->poffset), sfp->precsize, n) != 0) return SSI_ERR_SEEK_FAILED; if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC; if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA; if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA; if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA; *ret_fh = fnum; free(pkey); return 0; } /* Function: SSIGetSubseqOffset() * Date: SRE, Mon Jan 1 19:49:31 2001 [St. Louis] * * Purpose: Implements SSI_FAST_SUBSEQ. * * Looks up a primary or secondary {key} in the open * index {sfp}. Asks for the nearest offset to a * subsequence starting at position {requested_start} * in the sequence (numbering the sequence 1..L). * If {key} is found, on return, {ret_fh} * contains a unique handle on the file that contains * {key} (suitable for an SSIFileInfo() call, or for * comparison to the handle of the last file that was * opened for retrieval); {record_offset} contains the * disk offset to the start of the record; {data_offset} * contains the disk offset either exactly at the requested * residue, or at the start of the line containing the * requested residue; {ret_actual_start} contains the * coordinate (1..L) of the first valid residue at or * after {data_offset}. {ret_actual_start} is <= * {requested_start}. * * Args: sfp - open index file * key - primary or secondary key to find * requested_start - residue we'd like to start at (1..L) * ret_fh - RETURN: handle for file the key is in * record_offset - RETURN: offset of entire record * data_offset - RETURN: offset of subseq (see above) * ret_actual_start- RETURN: coord (1..L) of residue at data_offset * * Returns: 0 on success, non-zero on failure. */ int SSIGetSubseqOffset(SSIFILE *sfp, char *key, int requested_start, int *ret_fh, SSIOFFSET *record_offset, SSIOFFSET *data_offset, int *ret_actual_start) { int status; sqd_uint32 len; int r, b, i, l; /* tmp variables for "clarity", to match docs */ /* Look up the key. Rely on the fact that SSIGetOffsetByName() * leaves the index file positioned at the rest of the data for this key. */ status = SSIGetOffsetByName(sfp, key, ret_fh, record_offset); if (status != 0) return status; /* Check that we're allowed to do subseq lookup on that file. */ if (! (sfp->fileflags[*ret_fh] & SSI_FAST_SUBSEQ)) return SSI_ERR_NO_SUBSEQS; /* Read the data we need for subseq lookup */ if (! read_offset(sfp->fp, sfp->smode, data_offset)) return SSI_ERR_NODATA; if (! read_i32(sfp->fp, &len)) return SSI_ERR_NODATA; /* Set up tmp variables for clarity of equations below, * and to make them match documentation (ssi-format.tex). */ r = sfp->rpl[*ret_fh]; /* residues per line */ b = sfp->bpl[*ret_fh]; /* bytes per line */ i = requested_start; /* start position 1..L */ l = (i-1)/r; /* data line # (0..) that the residue is on */ if (r == 0 || b == 0) return SSI_ERR_NO_SUBSEQS; if (i < 0 || i > len) return SSI_ERR_RANGE; /* When b = r+1, there's nothing but sequence on each data line (and the \0), * and we can find each residue precisely. */ if (b == r+1) { if (sfp->smode == SSI_OFFSET_I32) { data_offset->mode = SSI_OFFSET_I32; data_offset->off.i32 = data_offset->off.i32 + l*b + (i-1)%r; } else if (sfp->smode == SSI_OFFSET_I64) { data_offset->mode = SSI_OFFSET_I64; data_offset->off.i64 = data_offset->off.i64 + l*b + (i-1)%r; } *ret_actual_start = requested_start; } else { /* else, there's other stuff on seq lines, so the best * we can do easily is to position at start of relevant line. */ if (sfp->smode == SSI_OFFSET_I32) { data_offset->mode = SSI_OFFSET_I32; data_offset->off.i32 = data_offset->off.i32 + l*b; } else if (sfp->smode == SSI_OFFSET_I64) { data_offset->mode = SSI_OFFSET_I64; data_offset->off.i64 = data_offset->off.i64 + l*b; } /* yes, the eq below is = 1 + (i-1)/r*r but it's not = i. that's an integer /. */ *ret_actual_start = 1 + l*r; } return 0; } /* Function: SSISetFilePosition() * Date: SRE, Tue Jan 2 09:13:46 2001 [St. Louis] * * Purpose: Uses {offset} to sets the file position for {fp}, usually an * open sequence file, relative to the start of the file. * Hides the details of system-dependent shenanigans necessary for * file positioning in large (>2 GB) files. * * Behaves just like fseek(fp, offset, SEEK_SET) for 32 bit * offsets and <2 GB files. * * Warning: if all else fails, in desperation, it will try to * use fsetpos(). This requires making assumptions about fpos_t * that may be unwarranted... assumptions that ANSI C prohibits * me from making... though I believe the ./configure * script robustly tests whether I can play with fpos_t like this. * * Args: fp - file to position. * offset - SSI offset relative to file start. * * Returns: 0 on success, nonzero on error. */ int SSISetFilePosition(FILE *fp, SSIOFFSET *offset) { if (offset->mode == SSI_OFFSET_I32) { if (fseek(fp, offset->off.i32, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; } #ifndef HAS_64BIT_FILE_OFFSETS else return SSI_ERR_NO64BIT; #elif defined HAVE_FSEEKO && SIZEOF_OFF_T == 8 else if (fseeko(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; #elif defined HAVE_FSEEKO64 && SIZEOF_OFF64_T == 8 else if (fseeko64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; #elif defined HAVE_FSEEK64 else if (fseek64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; #elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8 else if (fsetpos(fp, &(offset->off.i64)) != 0) return SSI_ERR_SEEK_FAILED; #endif return 0; } /* Function: SSIFileInfo() * Date: SRE, Tue Jan 2 10:31:01 2001 [St. Louis] * * Purpose: Given a file number {fh} in an open index file * {sfp}, retrieve file name {ret_filename} and * the file format {ret_format}. * * {ret_filename} is a pointer to a string maintained * internally by {sfp}. It should not be free'd; * SSIClose(sfp) takes care of it. * * Args: sfp - open index file * fh - handle on file to look up * ret_filename - RETURN: name of file n * ret_format - RETURN: format of file n * * Returns: 0 on success, nonzero on failure. */ int SSIFileInfo(SSIFILE *sfp, int fh, char **ret_filename, int *ret_format) { if (fh < 0 || fh >= sfp->nfiles) return SSI_ERR_BADARG; *ret_filename = sfp->filename[fh]; *ret_format = sfp->fileformat[fh]; return 0; } /* Function: SSIClose() * Date: SRE, Sun Dec 31 14:56:37 2000 [St. Louis] * * Purpose: Close an open {SSIFILE *}. * * Args: sfp - index file to close. * * Returns: (void) */ void SSIClose(SSIFILE *sfp) { if (sfp != NULL) { clear_ssifile(sfp); if (sfp->fp != NULL) fclose(sfp->fp); free(sfp); } } /* clear_ssifile(): free the innards of SSIFILE, without * destroying the structure or closing the stream. */ static void clear_ssifile(SSIFILE *sfp) { int i; if (sfp->filename != NULL) { for (i = 0; i < sfp->nfiles; i++) if (sfp->filename[i] != NULL) free(sfp->filename[i]); free(sfp->filename); } if (sfp->fileformat != NULL) free(sfp->fileformat); if (sfp->fileflags != NULL) free(sfp->fileflags); if (sfp->bpl != NULL) free(sfp->bpl); if (sfp->rpl != NULL) free(sfp->rpl); } /* Function: SSIRecommendMode() * Date: SRE, Fri Feb 16 08:23:47 2001 [St. Louis] * * Purpose: Examines the file and determines whether it should be * indexed with large file support or not; returns * SSI_OFFSET_I32 for most files, SSI_OFFSET_I64 for large * files, or -1 on failure. * * Args: file - name of file to check for size * * Returns: -1 on failure (including case where file is too big) * SSI_OFFSET_I32 for most files (<= 2^31-1 bytes) * SSI_OFFSET_I64 for large files (> 2^31-1 bytes) */ int SSIRecommendMode(char *file) { #if HAVE_STAT64 struct stat64 s1; if (stat64(file, &s1) == 0) { if (s1.st_size <= 2146483647L) return SSI_OFFSET_I32; else return SSI_OFFSET_I64; } #else struct stat s2; if (stat(file, &s2) == 0) { if (s2.st_size <= 2146483647L) return SSI_OFFSET_I32; else return SSI_OFFSET_I64; } #endif return -1; } /* Function: SSICreateIndex() * Date: SRE, Tue Jan 2 11:23:25 2001 [St. Louis] * * Purpose: Creates and initializes a SSI index structure. * Sequence file offset type is specified by {mode}. * * Args: mode - SSI_OFFSET_I32 or SSI_OFFSET_I64, sequence file index mode. * * Returns: ptr to new index structure, or NULL on failure. * Caller is responsible for free'ing the returned * structure with SSIFreeIndex(). */ SSIINDEX * SSICreateIndex(int mode) { SSIINDEX *g; g = NULL; if ((g = malloc(sizeof(SSIINDEX))) == NULL) goto FAILURE; g->smode = mode; g->imode = SSI_OFFSET_I32; /* index always starts as 32-bit; may get upgraded later */ g->external = FALSE; g->max_ram = SSI_MAXRAM; #ifndef HAS_64BIT_FILE_OFFSETS if (mode == SSI_OFFSET_I64) Die("\ Can't create a 64-bit SSI index on this system, sorry;\n\ I don't have 64-bit file offset functions available.\n"); #endif g->filenames = NULL; g->fileformat = NULL; g->bpl = NULL; g->rpl = NULL; g->flen = 0; g->nfiles = 0; g->pkeys = NULL; g->plen = 0; g->nprimary = 0; g->ptmpfile = "tmp.ssi.1"; /* hardcoded, for now. */ g->ptmp = NULL; g->skeys = NULL; g->slen = 0; g->nsecondary = 0; g->stmpfile = "tmp.ssi.2"; /* hardcoded, for now. */ g->stmp = NULL; /* All mallocs must go after NULL initializations, because of the cleanup strategy; * we'll try to free anything non-NULL if a malloc fails. */ if ((g->filenames = malloc(sizeof(char *) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; if ((g->fileformat= malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; if ((g->bpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; if ((g->rpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; if ((g->pkeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE; if ((g->skeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE; return g; FAILURE: SSIFreeIndex(g); /* free the damaged structure */ return NULL; } /* Function: SSIGetFilePosition() * Date: SRE, Tue Jan 2 09:59:26 2001 [St. Louis] * * Purpose: Fills {ret_offset} with the current disk * offset of {fp}, relative to the start of the file. * {mode} is set to either SSI_OFFSET_I32 or * SSI_OFFSET_I64. If {mode} is _I32 (32 bit), just wraps * a call to ftell(); otherwise deals with system-dependent * details of 64-bit file offsets. * * Args: fp - open stream * mode - SSI_OFFSET_I32 or SSI_OFFSET_I64 * ret_offset - RETURN: file position * * Returns: 0 on success. nonzero on error. */ int SSIGetFilePosition(FILE *fp, int mode, SSIOFFSET *ret_offset) { if (mode == SSI_OFFSET_I32) { ret_offset->mode = SSI_OFFSET_I32; ret_offset->off.i32 = ftell(fp); if (ret_offset->off.i32 == -1) return SSI_ERR_TELL_FAILED; } else if (mode != SSI_OFFSET_I64) abort(); /* only happens on a coding error */ else { ret_offset->mode = SSI_OFFSET_I64; #ifndef HAS_64BIT_FILE_OFFSETS return SSI_ERR_NO64BIT; #elif defined HAVE_FTELLO && SIZEOF_OFF_T == 8 if ((ret_offset->off.i64 = ftello(fp)) == -1) return SSI_ERR_TELL_FAILED; #elif defined HAVE_FTELLO64 && SIZEOF_OFF64_T == 8 if ((ret_offset->off.i64 = ftello64(fp)) == -1) return SSI_ERR_TELL_FAILED; #elif defined HAVE_FTELL64 if ((ret_offset->off.i64 = ftell64(fp)) == -1) return SSI_ERR_TELL_FAILED; #elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8 if (fgetpos(fp, &(ret_offset->off.i64)) != 0) return SSI_ERR_TELL_FAILED; #endif } return 0; } /* Function: SSIAddFileToIndex() * Date: SRE, Tue Jan 2 12:54:36 2001 [St. Louis] * * Purpose: Adds the sequence file {filename}, which is known to * be in format {fmt}, to the index {g}. Creates and returns * a unique filehandle {fh} for then associating primary keys * with this file using SSIAddPrimaryKeyToIndex(). * * Args: g - active index * filename - file to add * fmt - format code for this file (e.g. SQFILE_FASTA) * ret_fh - RETURN: unique handle for this file * * Returns: 0 on success; nonzero on error. */ int SSIAddFileToIndex(SSIINDEX *g, char *filename, int fmt, int *ret_fh) { int n; if (g->nfiles >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES; n = strlen(filename); if ((n+1) > g->flen) g->flen = n+1; g->filenames[g->nfiles] = FileTail(filename, FALSE); g->fileformat[g->nfiles] = fmt; g->bpl[g->nfiles] = 0; g->rpl[g->nfiles] = 0; *ret_fh = g->nfiles; /* handle is simply = file number */ g->nfiles++; if (g->nfiles % SSI_FILE_BLOCK == 0) { g->filenames = realloc(g->filenames, sizeof(char *) * (g->nfiles+SSI_FILE_BLOCK)); if (g->filenames == NULL) return SSI_ERR_MALLOC; g->fileformat= realloc(g->fileformat, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK)); if (g->fileformat == NULL) return SSI_ERR_MALLOC; g->bpl = realloc(g->bpl, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK)); if (g->bpl == NULL) return SSI_ERR_MALLOC; g->rpl = realloc(g->rpl, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK)); if (g->rpl == NULL) return SSI_ERR_MALLOC; } return 0; } /* Function: SSISetFileForSubseq() * Date: SRE, Tue Jan 9 10:02:05 2001 [St. Louis] * * Purpose: Set SSI_FAST_SUBSEQ for the file indicated by * filehandle {fh} in the index {g}, setting * parameters {bpl} and {rpl} to the values given. * {bpl} is the number of bytes per sequence data line. * {rpl} is the number of residues per sequence data line. * Caller must be sure that {bpl} and {rpl} do not change * on any line of any sequence record in the file * (except for the last data line of each record). If * this is not the case in this file, SSI_FAST_SUBSEQ * will not work, and this routine should not be * called. * * Args: g - the active index * fh - handle for file to set SSI_FAST_SUBSEQ on * bpl - bytes per data line * rpl - residues per data line * * Returns: 0 on success; 1 on error. */ int SSISetFileForSubseq(SSIINDEX *g, int fh, int bpl, int rpl) { if (fh < 0 || fh >= g->nfiles) return SSI_ERR_BADARG; if (bpl <= 0 || rpl <= 0) return SSI_ERR_BADARG; g->bpl[fh] = bpl; g->rpl[fh] = rpl; return 0; } /* Function: SSIAddPrimaryKeyToIndex() * Date: SRE, Tue Jan 2 11:50:54 2001 [St. Louis] * * Purpose: Put primary key {key} in the index {g}, while telling * the index this primary key is in the file associated * with filehandle {fh} (returned by a previous call * to SSIAddFileToIndex()), and its record starts at * position {r_off} in the file. * * {d_off} and {L} are optional; they may be left unset * by passing NULL and 0, respectively. (If one is * provided, both must be provided.) If they are provided, * {d_off} gives the position of the first line of sequence * data in the record, and {L} gives the length of * the sequence in residues. They are used when * SSI_FAST_SUBSEQ is set for this file. If SSI_FAST_SUBSEQ * is not set for the file, {d_off} and {L} will be * ignored by the index reading API even if they are stored * by the index writing API, so it doesn't hurt for the * indexing program to provide them; typically they * won't know whether it's safe to set SSI_FAST_SUBSEQ * for the whole file until the whole file has been * read and every key has already been added to the index. * * Args: g - active index * key - primary key to add * fh - handle on file that this key's in * r_off - offset to start of record * d_off - offset to start of sequence data * L - length of sequence, or 0 * * Returns: 0 on success, nonzero on error. */ int SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int fh, SSIOFFSET *r_off, SSIOFFSET *d_off, int L) { int n; /* a string length */ if (fh >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES; if (g->nprimary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS; if (L > 0 && d_off == NULL) abort(); /* need both. */ /* Before adding the key: check how big our index is. * If it's getting too large, switch to external mode. */ if (!g->external && current_index_size(g) >= g->max_ram) if (activate_external_sort(g) != 0) return SSI_ERR_NOFILE; /* Update maximum pkey length, if needed. */ n = strlen(key); if ((n+1) > g->plen) g->plen = n+1; /* External mode? Simply append to disk... */ if (g->external) { if (g->smode == SSI_OFFSET_I32) { fprintf(g->ptmp, "%s\t%d\t%lu\t%lu\t%lu\n", key, fh, (unsigned long) r_off->off.i32, (unsigned long) (d_off == NULL? 0 : d_off->off.i32), (unsigned long) L); } else { #ifdef CLUSTALO fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", key, fh, (unsigned long long)r_off->off.i64, d_off == NULL? 0 : (unsigned long long) d_off->off.i64, (unsigned long) L); #else fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", key, fh, r_off->off.i64, d_off == NULL? 0 : d_off->off.i64, (unsigned long) L); #endif } g->nprimary++; return 0; } /* Else: internal mode, keep keys in memory... */ if ((g->pkeys[g->nprimary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC; g->pkeys[g->nprimary].fnum = (sqd_uint16) fh; g->pkeys[g->nprimary].r_off = *r_off; if (d_off != NULL && L > 0) { g->pkeys[g->nprimary].d_off = *d_off; g->pkeys[g->nprimary].len = L; } else { /* yeah, this looks stupid, but look: we have to give a valid looking, non-NULL d_off of some sort, or writes will fail. It's going to be unused anyway. */ g->pkeys[g->nprimary].d_off = *r_off; g->pkeys[g->nprimary].len = 0; } g->nprimary++; if (g->nprimary % SSI_KEY_BLOCK == 0) { g->pkeys = realloc(g->pkeys, sizeof(struct ssipkey_s) * (g->nprimary+SSI_KEY_BLOCK)); if (g->pkeys == NULL) return SSI_ERR_MALLOC; } return 0; } /* Function: SSIAddSecondaryKeyToIndex() * Date: SRE, Tue Jan 2 12:44:40 2001 [St. Louis] * * Purpose: Puts secondary key {key} in the index {g}, associating * it with primary key {pkey} that was previously * registered by SSIAddPrimaryKeyToIndex(). * * Args: g - active index * key - secondary key to add * pkey - primary key to associate this key with * * Returns: 0 on success, nonzero on failure. */ int SSIAddSecondaryKeyToIndex(SSIINDEX *g, char *key, char *pkey) { int n; /* a string length */ if (g->nsecondary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS; /* Before adding the key: check how big our index is. * If it's getting too large, switch to external mode. */ if (!g->external && current_index_size(g) >= g->max_ram) if (activate_external_sort(g) != 0) return SSI_ERR_NOFILE; /* Update maximum secondary key length, if necessary. */ n = strlen(key); if ((n+1) > g->slen) g->slen = n+1; /* if external mode: write info to disk. */ if (g->external) { fprintf(g->stmp, "%s\t%s\n", key, pkey); g->nsecondary++; return 0; } /* else, internal mode... store info in memory. */ if ((g->skeys[g->nsecondary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC; if ((g->skeys[g->nsecondary].pkey = sre_strdup(pkey, -1)) == NULL) return SSI_ERR_MALLOC; g->nsecondary++; if (g->nsecondary % SSI_KEY_BLOCK == 0) { g->skeys = realloc(g->skeys, sizeof(struct ssiskey_s) * (g->nsecondary+SSI_KEY_BLOCK)); if (g->skeys == NULL) return SSI_ERR_MALLOC; } return 0; } /* Function: SSIWriteIndex() * Date: SRE, Tue Jan 2 13:55:56 2001 [St. Louis] * * Purpose: Writes complete index {g} in SSI format to a * binary file {file}. Does all * the overhead of sorting the primary and secondary keys, * and maintaining the association of secondary keys * with primary keys during and after the sort. * * Args: file - file to write to * g - index to sort & write out. * * Returns: 0 on success, nonzero on error. */ /* needed for qsort() */ static int pkeysort(const void *k1, const void *k2) { struct ssipkey_s *key1; struct ssipkey_s *key2; key1 = (struct ssipkey_s *) k1; key2 = (struct ssipkey_s *) k2; return strcmp(key1->key, key2->key); } static int skeysort(const void *k1, const void *k2) { struct ssiskey_s *key1; struct ssiskey_s *key2; key1 = (struct ssiskey_s *) k1; key2 = (struct ssiskey_s *) k2; return strcmp(key1->key, key2->key); } int SSIWriteIndex(char *file, SSIINDEX *g) { FILE *fp; int status; int i; sqd_uint32 header_flags, file_flags; sqd_uint32 frecsize, precsize, srecsize; sqd_uint64 foffset, poffset, soffset; char *s, *s2; if ((fp = fopen(file,"wb")) == NULL) return SSI_ERR_NOFILE; status = 0; /* How big is the index? If it's going to be > 2GB, we need * to flip to 64-bit index mode. 2047 (instead of 2048) gives us * some slop room. * die'ing here is pretty brutal - if we flip to 64-bit index * mode, we hve 100's of millions of keys, so we've processed * a long time before reaching this point. Ah well. */ if (current_index_size(g) >= 2047) { g->imode = SSI_OFFSET_I64; #ifndef HAS_64BIT_FILE_OFFSETS Die("\ Can't switch to 64-bit SSI index mode on this system, sorry;\n\ I don't have 64-bit file offset functions available.\n"); #endif } /* Magic-looking numbers come from adding up sizes * of things in bytes */ frecsize = 16 + g->flen; precsize = (g->smode == SSI_OFFSET_I64) ? 22+g->plen : 14+g->plen; srecsize = g->slen + g->plen; header_flags = 0; if (g->smode == SSI_OFFSET_I64) header_flags |= SSI_USE64; if (g->imode == SSI_OFFSET_I64) header_flags |= SSI_USE64_INDEX; /* Magic-looking numbers again come from adding up sizes * of things in bytes */ foffset = (header_flags & SSI_USE64_INDEX) ? 66 : 54; poffset = foffset + frecsize*g->nfiles; soffset = poffset + precsize*g->nprimary; /* Sort the keys * If external mode, make system calls to UNIX/POSIX "sort" in place, then * open new sorted files for reading thru ptmp and stmp handles. * If internal mode, call qsort. * * Note that you'd better force a POSIX locale for the sort; else, * some silly distro (e.g. Mandrake Linux >=8.1) may have specified * LC_COLLATE=en_US, and this'll give a sort "bug" in which it doesn't * sort by byte order. */ if (g->external) { char cmd[1024]; fclose(g->ptmp); g->ptmp = NULL; sprintf(cmd, "env LC_ALL=POSIX sort -o %s %s\n", g->ptmpfile, g->ptmpfile); if ((status = system(cmd)) != 0) return SSI_ERR_EXTERNAL_SORT; if ((g->ptmp = fopen(g->ptmpfile, "r")) == NULL) return SSI_ERR_EXTERNAL_SORT; fclose(g->stmp); g->stmp = NULL; sprintf(cmd, "env LC_ALL=POSIX sort -o %s %s\n", g->stmpfile, g->stmpfile); if ((status = system(cmd)) != 0) return SSI_ERR_EXTERNAL_SORT; if ((g->stmp = fopen(g->stmpfile, "r")) == NULL) return SSI_ERR_EXTERNAL_SORT; } else { qsort((void *) g->pkeys, g->nprimary, sizeof(struct ssipkey_s), pkeysort); qsort((void *) g->skeys, g->nsecondary, sizeof(struct ssiskey_s), skeysort); } /* Write the header */ if (! write_i32(fp, v20magic)) return SSI_ERR_FWRITE; if (! write_i32(fp, header_flags)) return SSI_ERR_FWRITE; if (! write_i16(fp, g->nfiles)) return SSI_ERR_FWRITE; if (! write_i32(fp, g->nprimary)) return SSI_ERR_FWRITE; if (! write_i32(fp, g->nsecondary)) return SSI_ERR_FWRITE; if (! write_i32(fp, g->flen)) return SSI_ERR_FWRITE; if (! write_i32(fp, g->plen)) return SSI_ERR_FWRITE; if (! write_i32(fp, g->slen)) return SSI_ERR_FWRITE; if (! write_i32(fp, frecsize)) return SSI_ERR_FWRITE; if (! write_i32(fp, precsize)) return SSI_ERR_FWRITE; if (! write_i32(fp, srecsize)) return SSI_ERR_FWRITE; if (g->imode == SSI_OFFSET_I32) { if (! write_i32(fp, foffset)) return SSI_ERR_FWRITE; if (! write_i32(fp, poffset)) return SSI_ERR_FWRITE; if (! write_i32(fp, soffset)) return SSI_ERR_FWRITE; } else { if (! write_i64(fp, foffset)) return SSI_ERR_FWRITE; if (! write_i64(fp, poffset)) return SSI_ERR_FWRITE; if (! write_i64(fp, soffset)) return SSI_ERR_FWRITE; } /* The file section */ if ((s = malloc(sizeof(char) * g->flen)) == NULL) return SSI_ERR_MALLOC; for (i = 0; i < g->nfiles; i++) { file_flags = 0; if (g->bpl[i] > 0 && g->rpl[i] > 0) file_flags |= SSI_FAST_SUBSEQ; strcpy(s, g->filenames[i]); if (fwrite(s, sizeof(char), g->flen, fp) != g->flen) return SSI_ERR_FWRITE; if (! write_i32(fp, g->fileformat[i])) return SSI_ERR_FWRITE; if (! write_i32(fp, file_flags)) return SSI_ERR_FWRITE; if (! write_i32(fp, g->bpl[i])) return SSI_ERR_FWRITE; if (! write_i32(fp, g->rpl[i])) return SSI_ERR_FWRITE; } free(s); /* The primary key section */ if ((s = malloc(sizeof(char) * g->plen)) == NULL) return SSI_ERR_MALLOC; if (g->external) { char *buf = NULL; int buflen = 0; struct ssipkey_s pkey; for (i = 0; i < g->nprimary; i++) { if (sre_fgets(&buf, &buflen, g->ptmp) == NULL) return SSI_ERR_NODATA; if (parse_pkey_info(buf, g->smode, &pkey) != 0) return SSI_ERR_BADFORMAT; strcpy(s, pkey.key); if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE; if (! write_i16( fp, pkey.fnum)) return SSI_ERR_FWRITE; if (! write_offset(fp, &(pkey.r_off))) return SSI_ERR_FWRITE; if (! write_offset(fp, &(pkey.d_off))) return SSI_ERR_FWRITE; if (! write_i32( fp, pkey.len)) return SSI_ERR_FWRITE; } free(buf); } else { for (i = 0; i < g->nprimary; i++) { strcpy(s, g->pkeys[i].key); if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE; if (! write_i16( fp, g->pkeys[i].fnum)) return SSI_ERR_FWRITE; if (! write_offset(fp, &(g->pkeys[i].r_off))) return SSI_ERR_FWRITE; if (! write_offset(fp, &(g->pkeys[i].d_off))) return SSI_ERR_FWRITE; if (! write_i32( fp, g->pkeys[i].len)) return SSI_ERR_FWRITE; } } /* The secondary key section */ if (g->nsecondary > 0) { if ((s2 = malloc(sizeof(char) * g->slen)) == NULL) return SSI_ERR_MALLOC; if (g->external) { struct ssiskey_s skey; char *buf = NULL; int n = 0; for (i = 0; i < g->nsecondary; i++) { if (sre_fgets(&buf, &n, g->stmp) == NULL) return SSI_ERR_NODATA; if (parse_skey_info(buf, &skey) != 0) return SSI_ERR_BADFORMAT; strcpy(s2, skey.key); strcpy(s, skey.pkey); if (fwrite(s2, sizeof(char), g->slen, fp) != g->slen) return SSI_ERR_FWRITE; if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE; } free(buf); } else { for (i = 0; i < g->nsecondary; i++) { strcpy(s2, g->skeys[i].key); strcpy(s, g->skeys[i].pkey); if (fwrite(s2, sizeof(char), g->slen, fp) != g->slen) return SSI_ERR_FWRITE; if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE; } } free(s2); } free(s); fclose(fp); return status; } /* Function: SSIFreeIndex() * Date: SRE, Tue Jan 2 11:44:08 2001 [St. Louis] * * Purpose: Free an index structure {g}. * * Args: g - ptr to an open index. * * Returns: (void) */ void SSIFreeIndex(SSIINDEX *g) { int i; if (g != NULL) { if (g->external == FALSE) { for (i = 0; i < g->nprimary; i++) free(g->pkeys[i].key); for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].key); for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].pkey); if (g->pkeys != NULL) free(g->pkeys); if (g->skeys != NULL) free(g->skeys); } else { if (g->ptmp != NULL) fclose(g->ptmp); if (g->stmp != NULL) fclose(g->stmp); #if DEBUGLEVEL == 0 remove(g->ptmpfile); remove(g->stmpfile); #endif } for (i = 0; i < g->nfiles; i++) free(g->filenames[i]); if (g->filenames != NULL) free(g->filenames); if (g->fileformat != NULL) free(g->fileformat); if (g->bpl != NULL) free(g->bpl); if (g->rpl != NULL) free(g->rpl); free(g); } } /* Function: SSIErrorString() * Date: SRE, Tue Jan 2 10:38:10 2001 [St. Louis] * * Purpose: Returns a ptr to an internal string corresponding * to error {n}, a code returned from any of the * functions in the API that return non-zero on error. * * Args: n - error code * * Returns: ptr to an internal string. */ char * SSIErrorString(int n) { switch (n) { case SSI_ERR_OK: return "ok (no error)"; case SSI_ERR_NODATA: return "no data, fread() failed"; case SSI_ERR_NO_SUCH_KEY: return "no such key"; case SSI_ERR_MALLOC: return "out of memory, malloc() failed"; case SSI_ERR_NOFILE: return "file not found, fopen() failed"; case SSI_ERR_BADMAGIC: return "not a SSI file? (bad magic)"; case SSI_ERR_BADFORMAT: return "corrupt format? unexpected data"; case SSI_ERR_NO64BIT: return "no large file support for this system"; case SSI_ERR_SEEK_FAILED: return "failed to reposition on disk"; case SSI_ERR_TELL_FAILED: return "failed to get file position on disk"; case SSI_ERR_NO_SUBSEQS: return "no fast subseq support for this seqfile"; case SSI_ERR_RANGE: return "subseq start is out of range"; case SSI_ERR_BADARG: return "an argument is out of range"; case SSI_ERR_TOOMANY_FILES: return "number of files exceeds limit"; case SSI_ERR_TOOMANY_KEYS: return "number of keys exceeds limit"; case SSI_ERR_FWRITE: return "an fwrite() failed"; case SSI_ERR_EXTERNAL_SORT: return "some problem with external sorting"; default: return "unrecognized code"; } /*NOTREACHED*/ } static int read_i16(FILE *fp, sqd_uint16 *ret_result) { sqd_uint16 result; if (fread(&result, sizeof(sqd_uint16), 1, fp) != 1) return 0; *ret_result = sre_ntoh16(result); return 1; } static int write_i16(FILE *fp, sqd_uint16 n) { n = sre_hton16(n); if (fwrite(&n, sizeof(sqd_uint16), 1, fp) != 1) return 0; return 1; } static int read_i32(FILE *fp, sqd_uint32 *ret_result) { sqd_uint32 result; if (fread(&result, sizeof(sqd_uint32), 1, fp) != 1) return 0; *ret_result = sre_ntoh32(result); return 1; } static int write_i32(FILE *fp, sqd_uint32 n) { n = sre_hton32(n); if (fwrite(&n, sizeof(sqd_uint32), 1, fp) != 1) return 0; return 1; } static int read_i64(FILE *fp, sqd_uint64 *ret_result) { sqd_uint64 result; if (fread(&result, sizeof(sqd_uint64), 1, fp) != 1) return 0; *ret_result = sre_ntoh64(result); return 1; } static int write_i64(FILE *fp, sqd_uint64 n) { n = sre_hton64(n); if (fwrite(&n, sizeof(sqd_uint64), 1, fp) != 1) return 0; return 1; } static int read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset) { if (mode == SSI_OFFSET_I32) { ret_offset->mode = SSI_OFFSET_I32; if (! read_i32(fp, &(ret_offset->off.i32))) return 0; } else if (mode == SSI_OFFSET_I64) { ret_offset->mode = SSI_OFFSET_I64; if (! read_i64(fp, &(ret_offset->off.i64))) return 0; } else return 0; return 1; } static int write_offset(FILE *fp, SSIOFFSET *offset) { if (offset->mode == SSI_OFFSET_I32) return write_i32(fp, offset->off.i32); else if (offset->mode == SSI_OFFSET_I64) return write_i64(fp, offset->off.i64); else abort(); /*UNREACHED*/ return 1; /* silence bitchy compilers */ } static int parse_pkey_info(char *buf, char mode, struct ssipkey_s *pkey) { char *s, *tok; int n; s = buf; if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; pkey->key = tok; if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; pkey->fnum = (sqd_uint16) atoi(tok); if (mode == SSI_OFFSET_I32) { if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; pkey->r_off.mode = mode; pkey->r_off.off.i32 = (sqd_uint32) strtoul(tok, NULL, 10); if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; pkey->d_off.mode = mode; pkey->d_off.off.i32 = (sqd_uint32) strtoul(tok, NULL, 10); } #ifdef HAS_64BIT_FILE_OFFSETS else { if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; pkey->r_off.mode = mode; pkey->r_off.off.i64 = (sqd_uint64) strtoull(tok, NULL, 10); if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; pkey->d_off.mode = mode; pkey->d_off.off.i64 = (sqd_uint64) strtoull(tok, NULL, 10); } #else else { return SSI_ERR_NO64BIT; } #endif if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; pkey->len = (sqd_uint32) strtoul(tok, NULL, 10); return 0; } static int parse_skey_info(char *buf, struct ssiskey_s *skey) { char *s, *tok; int n; s = buf; if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; skey->key = tok; if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; skey->pkey = tok; return 0; } /* Function: binary_search() * Date: SRE, Sun Dec 31 16:05:03 2000 [St. Louis] * * Purpose: Find a key in a SSI index, by a binary search * in an alphabetically sorted list of keys. If successful, * return 0, and the index file is positioned to read * the rest of the data for that key. Else returns nonzero. * * Args: sfp - an open SSIFILE * key - key to find * klen - key length to allocate (plen or slen from sfp) * base - base offset (poffset or soffset) * recsize - size of each key record in bytes (precsize or srecsize) * maxidx - # of keys (nprimary or nsecondary) * * Returns: 0 on success, and leaves file positioned for reading remaining * data for the key. * Nonzero on failure: * SSI_ERR_NO_SUCH_KEY - that key's not in the index * SSI_ERR_MALLOC - a memory allocation failure * SSI_ERR_NODATA - an fread() failed */ static int binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base, sqd_uint32 recsize, sqd_uint32 maxidx) { char *name; sqd_uint32 left, right, mid; int cmp; int status; if (maxidx == 0) return SSI_ERR_NO_SUCH_KEY; /* special case: empty index */ if ((name = malloc (sizeof(char)*klen)) == NULL) return SSI_ERR_MALLOC; left = 0; right = maxidx-1; while (1) { /* A binary search: */ mid = (left+right) / 2; /* careful here. only works because we limit unsigned vars to signed ranges. */ if ((status = indexfile_position(sfp, base, recsize, mid)) != 0) { free(name); return status; } if (fread(name, sizeof(char), klen, sfp->fp) != klen) { free(name); return SSI_ERR_NODATA; } cmp = strcmp(name, key); if (cmp == 0) break; /* found it! */ else if (left >= right) /* oops, missed it; fail */ { free(name); return SSI_ERR_NO_SUCH_KEY; } else if (cmp < 0) left = mid+1; /* it's right of mid */ else if (cmp > 0) { if (mid == 0) { free(name); return SSI_ERR_NO_SUCH_KEY; } /* special case, beware */ else right = mid-1; /* it's left of mid */ } } free(name); return 0; /* and sfp->fp is positioned... */ } /* Function: indexfile_position() * Date: SRE, Mon Jan 1 19:32:49 2001 [St. Louis] * * Purpose: Position the open index file {sfp} at the start * of record {n} in a list of records that starts at * base offset {base}, where each record takes up {l} * bytes. (e.g. the position is byte (base + n*l)). * * Args: sfp - open SSIFILE * base - offset of record 0 (e.g. sfp->foffset) * len - size of each record in bytes (e.g. sfp->frecsize) * n - which record to get (e.g. 0..sfp->nfiles) * * Returns: 0 on success, non-zero on failure. */ static int indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len, sqd_uint32 n) { SSIOFFSET pos; int status; if (base->mode == SSI_OFFSET_I32) { pos.mode = SSI_OFFSET_I32; pos.off.i32 = base->off.i32 + n*len; } else if (base->mode == SSI_OFFSET_I64) { pos.mode = SSI_OFFSET_I64; pos.off.i64 = base->off.i64 + n*len; } else return 0; if ((status = SSISetFilePosition(sfp->fp, &pos)) != 0) return status; return 0; } /* Function: current_index_size() * Date: SRE, Tue Feb 20 18:23:30 2001 [St. Louis] * * Purpose: Calculates the size of the current index, * in megabytes. */ static sqd_uint64 current_index_size(SSIINDEX *g) { sqd_uint64 frecsize, precsize, srecsize; sqd_uint64 total; /* Magic-looking numbers come from adding up sizes * of things in bytes */ frecsize = 16 + g->flen; precsize = (g->smode == SSI_OFFSET_I64) ? 22+g->plen : 14+g->plen; srecsize = g->plen+g->slen; total = (66L + /* header size, if 64bit index offsets */ frecsize * g->nfiles + /* file section size */ precsize * g->nprimary + /* primary key section size */ srecsize * g->nsecondary) / /* secondary key section size */ 1048576L; return total; } /* Function: activate_external_sort() * Date: SRE, Mon Feb 4 09:08:08 2002 [St. Louis] * * Purpose: Switch to external sort mode. * Open file handles for external index files (ptmp, stmp). * Flush current index information to these files. * Free current memory, turn over control to the tmpfiles. * * Return: 0 on success; non-zero on failure. */ static int activate_external_sort(SSIINDEX *g) { int i; /* it's a bit late to be checking this, but... */ if (g->external) return 0; /* we already are external, fool */ if (FileExists(g->ptmpfile)) return 1; if (FileExists(g->stmpfile)) return 1; if ((g->ptmp = fopen(g->ptmpfile, "w")) == NULL) return 1; if ((g->stmp = fopen(g->stmpfile, "w")) == NULL) return 1; /* Flush the current indices. */ SQD_DPRINTF1(("Switching to external sort - flushing ssiindex to disk...\n")); for (i = 0; i < g->nprimary; i++) { if (g->smode == SSI_OFFSET_I32) { fprintf(g->ptmp, "%s\t%u\t%lu\t%lu\t%lu\n", g->pkeys[i].key, g->pkeys[i].fnum, (unsigned long) g->pkeys[i].r_off.off.i32, (unsigned long) g->pkeys[i].d_off.off.i32, (unsigned long) g->pkeys[i].len); } else { fprintf(g->ptmp, "%s\t%u\t%llu\t%llu\t%lu\n", g->pkeys[i].key, g->pkeys[i].fnum, (unsigned long long) g->pkeys[i].r_off.off.i64, (unsigned long long) g->pkeys[i].d_off.off.i64, (unsigned long) g->pkeys[i].len); } } for (i = 0; i < g->nsecondary; i++) fprintf(g->stmp, "%s\t%s\n", g->skeys[i].key, g->skeys[i].pkey); /* Free the memory now that we've flushed our lists to disk */ for (i = 0; i < g->nprimary; i++) free(g->pkeys[i].key); for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].key); for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].pkey); if (g->pkeys != NULL) free(g->pkeys); if (g->skeys != NULL) free(g->skeys); g->pkeys = NULL; g->skeys = NULL; /* Turn control over to external accumulation mode. */ g->external = TRUE; return 0; } /***************************************************************** * Debugging API *****************************************************************/ void SSIForceExternalSort(SSIINDEX *g) { if (activate_external_sort(g) != 0) Die("failed to turn external sorting on."); } /***************************************************************** * Test driving mode *****************************************************************/ #ifdef MUGGINS_LETS_ME_SLEEP /* Minimally: cc -g -Wall -o shiva -DDEBUGLEVEL=1 -DMUGGINS_LETS_ME_SLEEP ssi.c sqerror.c sre_string.c types.c sre_ctype.c sre_math.c file.c -lm */ int main(int argc, char **argv) { char name[32], accession[32]; SSIINDEX *ssi; int mode; SSIOFFSET r_off, d_off; FILE *ofp; int i; int fh; /* a file handle */ int status; /* return status from a SSI call */ mode = SSI_OFFSET_I32; if ((ssi = SSICreateIndex(mode)) == NULL) Die("Failed to allocate SSI index"); /* Generate two FASTA files, tmp.0 and tmp.1, and index them. */ if ((ofp = fopen("tmp.0", "w")) == NULL) Die("failed to open tmp.0"); if ((status = SSIAddFileToIndex(ssi, "tmp.0", SQFILE_FASTA, &fh)) != 0) Die("SSIAddFileToIndex() failed: %s", SSIErrorString(status)); for (i = 0; i < 10; i++) { if ((status = SSIGetFilePosition(ofp, mode, &r_off)) != 0) Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); sprintf(name, "seq%d", i); sprintf(accession, "ac%d", i); fprintf(ofp, ">%s [%s] Description? we don't need no steenking description.\n", name, accession); if ((status = SSIGetFilePosition(ofp, mode, &d_off)) != 0) Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); fprintf(ofp, "AAAAAAAAAA\n"); fprintf(ofp, "CCCCCCCCCC\n"); fprintf(ofp, "GGGGGGGGGG\n"); fprintf(ofp, "TTTTTTTTTT\n"); if ((status = SSIAddPrimaryKeyToIndex(ssi, name, fh, &r_off, &d_off, 40)) != 0) Die("SSIAddPrimaryKeyToIndex() failed: %s", SSIErrorString(status)); if ((status = SSIAddSecondaryKeyToIndex(ssi, accession, name)) != 0) Die("SSIAddSecondaryKeyToIndex() failed: %s", SSIErrorString(status)); } SSISetFileForSubseq(ssi, fh, 11, 10); fclose(ofp); if ((ofp = fopen("tmp.1", "w")) == NULL) Die("failed to open tmp.1"); if ((status = SSIAddFileToIndex(ssi, "tmp.1", SQFILE_FASTA, &fh)) != 0) Die("SSIAddFileToIndex() failed: %s", SSIErrorString(status)); for (i = 10; i < 20; i++) { if ((status = SSIGetFilePosition(ofp, mode, &r_off)) != 0) Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); sprintf(name, "seq%d", i); sprintf(accession, "ac%d", i); fprintf(ofp, ">%s [%s] i/o, i/o, it's off to disk we go.\n", name, accession); if ((status = SSIGetFilePosition(ofp, mode, &d_off)) != 0) Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); fprintf(ofp, "AAAAAAAAAA 10\n"); fprintf(ofp, "CCCCCCCCCC 20\n"); fprintf(ofp, "GGGGGGGGGG 30\n"); fprintf(ofp, "TTTTTTTTTT 40\n"); if ((status = SSIAddPrimaryKeyToIndex(ssi, name, fh, &r_off, &d_off, 40)) != 0) Die("SSIAddPrimaryKeyToIndex() failed: %s", SSIErrorString(status)); if ((status = SSIAddSecondaryKeyToIndex(ssi, accession, name)) != 0) Die("SSIAddSecondaryKeyToIndex() failed: %s", SSIErrorString(status)); } SSISetFileForSubseq(ssi, fh, 14, 10); fclose(ofp); /* Write the index to tmp.ssi */ if ((status = SSIWriteIndex("tmp.ssi", ssi)) != 0) Die("SSIWriteIndex() failed: %s", SSIErrorString(status)); SSIFreeIndex(ssi); /* Now reopen the index and run some tests. */ exit(0); } #endif /* test driving code */