1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
15 #include <sys/types.h>
20 static sqd_uint32 v20magic = 0xf3f3e9b1; /* SSI 1.0: "ssi1" + 0x80808080 */
21 static sqd_uint32 v20swap = 0xb1e9f3f3; /* byteswapped */
23 static int read_i16(FILE *fp, sqd_uint16 *ret_result);
24 static int read_i32(FILE *fp, sqd_uint32 *ret_result);
25 static int read_i64(FILE *fp, sqd_uint64 *ret_result);
26 static int read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset);
27 static int write_i16(FILE *fp, sqd_uint16 n);
28 static int write_i32(FILE *fp, sqd_uint32 n);
29 static int write_i64(FILE *fp, sqd_uint64 n);
30 static int write_offset(FILE *fp, SSIOFFSET *offset);
31 static int binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base,
32 sqd_uint32 recsize, sqd_uint32 maxidx);
33 static int indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len,
35 static void clear_ssifile(SSIFILE *sfp);
36 static int write_index(FILE *fp, SSIINDEX *g);
37 static int write_index_chunk(SSIINDEX *g);
38 static sqd_uint64 current_chunk_size(SSIINDEX *g);
39 static int load_indexfile(SSIFILE *sfp);
41 /* Function: SSIOpen()
42 * Date: SRE, Sun Dec 31 12:40:03 2000 [St. Louis]
44 * Purpose: Opens the SSI index file {filename} and returns
45 * a SSIFILE * stream thru {ret_sfp}.
46 * The caller must eventually close this stream using
47 * SSIClose(). More than one index file can be open
50 * Args: filename - full path to a SSI index file
52 * Returns: Returns 0 on success, nonzero on failure.
55 SSIOpen(char *filename, SSIFILE **ret_sfp)
59 if ((sfp = malloc(sizeof(SSIFILE))) == NULL) return SSI_ERR_MALLOC;
60 if ((sfp->fp = fopen(filename, "rb")) == NULL) return SSI_ERR_NOFILE;
61 status = load_indexfile(sfp);
65 /* load_indexfile(): given a SSIFILE structure with an open and positioned
66 * stream (fp) -- but no other data loaded -- read the next SSIFILE
67 * in from disk. We use this routine without its SSIOpen() wrapper
68 * as part of the external mergesort when creating large indices.
71 load_indexfile(SSIFILE *sfp)
74 sqd_uint16 i; /* counter over files */
75 int status; /* overall return status if an error is thrown */
77 status = SSI_ERR_BADFORMAT; /* default: almost every kind of error is a bad format error */
80 sfp->fileformat = NULL;
81 sfp->fileflags = NULL;
85 if (! read_i32(sfp->fp, &magic)) {status = SSI_ERR_BADMAGIC; goto FAILURE; }
86 if (magic != v20magic && magic != v20swap) {status = SSI_ERR_BADMAGIC; goto FAILURE; }
87 if (! read_i32(sfp->fp, &(sfp->flags))) goto FAILURE;
89 /* If we have 64-bit offsets, make sure we can deal with them.
91 #ifndef HAS_64BIT_FILE_OFFSETS
92 if ((sfp->flags & SSI_USE64_INDEX) ||
93 (sfp->flags & SSI_USE64))
94 { status = SSI_ERR_NO64BIT; goto FAILURE; }
97 sfp->imode = (sfp->flags & SSI_USE64_INDEX) ? SSI_OFFSET_I64 : SSI_OFFSET_I32;
98 sfp->smode = (sfp->flags & SSI_USE64) ? SSI_OFFSET_I64 : SSI_OFFSET_I32;
100 if (! read_i16(sfp->fp, &(sfp->nfiles))) goto FAILURE;
101 if (! read_i32(sfp->fp, &(sfp->nprimary))) goto FAILURE;
102 if (! read_i32(sfp->fp, &(sfp->nsecondary))) goto FAILURE;
103 if (! read_i32(sfp->fp, &(sfp->flen))) goto FAILURE;
104 if (! read_i32(sfp->fp, &(sfp->plen))) goto FAILURE;
105 if (! read_i32(sfp->fp, &(sfp->slen))) goto FAILURE;
106 if (! read_i32(sfp->fp, &(sfp->frecsize))) goto FAILURE;
107 if (! read_i32(sfp->fp, &(sfp->precsize))) goto FAILURE;
108 if (! read_i32(sfp->fp, &(sfp->srecsize))) goto FAILURE;
110 if (! read_offset(sfp->fp, sfp->imode, &(sfp->foffset))) goto FAILURE;
111 if (! read_offset(sfp->fp, sfp->imode, &(sfp->poffset))) goto FAILURE;
112 if (! read_offset(sfp->fp, sfp->imode, &(sfp->soffset))) goto FAILURE;
114 /* Read the file information and keep it.
115 * We expect the number of files to be small, so reading it
116 * once should be advantageous overall. If SSI ever had to
117 * deal with large numbers of files, you'd probably want to
118 * read file information on demand.
120 if (sfp->nfiles == 0) goto FAILURE;
121 if ((sfp->filename=malloc(sizeof(char *) *sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
122 for (i = 0; i < sfp->nfiles; i++) sfp->filename[i] = NULL;
123 if ((sfp->fileformat=malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
124 if ((sfp->fileflags =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
125 if ((sfp->bpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
126 if ((sfp->rpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
128 for (i = 0; i < sfp->nfiles; i++)
130 /* We have to explicitly position, because header and file
131 * records may expand in the future; frecsize and foffset
132 * give us forwards compatibility.
134 if (indexfile_position(sfp, &(sfp->foffset), sfp->frecsize, i) !=0) goto FAILURE;
135 if ((sfp->filename[i] =malloc(sizeof(char)*sfp->flen)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
136 if (fread(sfp->filename[i],sizeof(char),sfp->flen, sfp->fp)!=sfp->flen) goto FAILURE;
137 if (! read_i32(sfp->fp, &(sfp->fileformat[i]))) goto FAILURE;
138 if (! read_i32(sfp->fp, &(sfp->fileflags[i]))) goto FAILURE;
139 if (! read_i32(sfp->fp, &(sfp->bpl[i]))) goto FAILURE;
140 if (! read_i32(sfp->fp, &(sfp->rpl[i]))) goto FAILURE;
143 /* Success. Return 0.
148 /* Failure: free the damaged structure, return status code.
156 /* Function: SSIGetOffsetByName()
157 * Date: SRE, Sun Dec 31 13:55:31 2000 [St. Louis]
159 * Purpose: Looks up the string {key} in the open index {sfp}.
160 * {key} can be either a primary or secondary key. If {key}
161 * is found, {*ret_fh} contains a unique handle on
162 * the file that contains {key} (suitable for an SSIFileInfo()
163 * call, or for comparison to the handle of the last file
164 * that was opened for retrieval), and {offset} is filled
165 * in with the offset in that file.
167 * Args: sfp - open index file
168 * key - string to search for
169 * ret_fh - RETURN: handle on file that key is in
170 * ret_offset - RETURN: offset of the start of that key's record
172 * Returns: 0 on success.
176 SSIGetOffsetByName(SSIFILE *sfp, char *key, int *ret_fh,
177 SSIOFFSET *ret_offset)
182 /* Look in the primary keys.
184 status = binary_search(sfp, key, sfp->plen, &(sfp->poffset), sfp->precsize,
187 /* We found it as a primary key; get our data & return.
189 if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA;
190 *ret_fh = (int) fnum;
191 if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA;
193 return 0; /* success! (we don't need the other key data) */
194 } else if (status == SSI_ERR_NO_SUCH_KEY) {
195 /* Not in the primary keys? OK, try the secondary keys.
197 if (sfp->nsecondary > 0) {
199 status = binary_search(sfp, key, sfp->slen, &(sfp->soffset), sfp->srecsize,
201 if (status != 0) return status;
202 if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC;
203 if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA;
205 status = SSIGetOffsetByName(sfp, pkey, ret_fh, ret_offset);
210 } else return status;
214 /* Function: SSIGetOffsetByNumber()
215 * Date: SRE, Mon Jan 1 19:42:42 2001 [St. Louis]
217 * Purpose: Looks up primary key #{n} in the open index {sfp}.
218 * {n} ranges from 0..nprimary-1. When key #{n}
219 * is found, {*ret_fh} contains a unique
220 * handle on the file that contains {key} (suitable
221 * for an SSIFileInfo() call, or for comparison to
222 * the handle of the last file that was opened for retrieval),
223 * and {offset} is filled in with the offset in that file.
225 * Args: sfp - open index file
226 * n - primary key number to retrieve.
227 * ret_fh - RETURN: handle on file that key is in
228 * ret_offset - RETURN: offset of the start of that key's record
230 * Returns: 0 on success.
234 SSIGetOffsetByNumber(SSIFILE *sfp, int n, int *ret_fh, SSIOFFSET *ret_offset)
239 if (n >= sfp->nprimary) return SSI_ERR_NO_SUCH_KEY;
240 if (indexfile_position(sfp, &(sfp->poffset), sfp->precsize, n) != 0)
241 return SSI_ERR_SEEK_FAILED;
243 if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC;
244 if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA;
245 if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA;
246 if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA;
252 /* Function: SSIGetSubseqOffset()
253 * Date: SRE, Mon Jan 1 19:49:31 2001 [St. Louis]
255 * Purpose: Implements SSI_FAST_SUBSEQ.
257 * Looks up a primary or secondary {key} in the open
258 * index {sfp}. Asks for the nearest offset to a
259 * subsequence starting at position {requested_start}
260 * in the sequence (numbering the sequence 1..L).
261 * If {key} is found, on return, {ret_fh}
262 * contains a unique handle on the file that contains
263 * {key} (suitable for an SSIFileInfo() call, or for
264 * comparison to the handle of the last file that was
265 * opened for retrieval); {record_offset} contains the
266 * disk offset to the start of the record; {data_offset}
267 * contains the disk offset either exactly at the requested
268 * residue, or at the start of the line containing the
269 * requested residue; {ret_actual_start} contains the
270 * coordinate (1..L) of the first valid residue at or
271 * after {data_offset}. {ret_actual_start} is <=
274 * Args: sfp - open index file
275 * key - primary or secondary key to find
276 * requested_start - residue we'd like to start at (1..L)
277 * ret_fh - RETURN: handle for file the key is in
278 * record_offset - RETURN: offset of entire record
279 * data_offset - RETURN: offset of subseq (see above)
280 * ret_actual_start- RETURN: coord (1..L) of residue at data_offset
282 * Returns: 0 on success, non-zero on failure.
285 SSIGetSubseqOffset(SSIFILE *sfp, char *key, int requested_start,
286 int *ret_fh, SSIOFFSET *record_offset,
287 SSIOFFSET *data_offset, int *ret_actual_start)
291 int r, b, i, l; /* tmp variables for "clarity", to match docs */
293 /* Look up the key. Rely on the fact that SSIGetOffsetByName()
294 * leaves the index file positioned at the rest of the data for this key.
296 status = SSIGetOffsetByName(sfp, key, ret_fh, record_offset);
297 if (status != 0) return status;
299 /* Check that we're allowed to do subseq lookup on that file.
301 if (! (sfp->fileflags[*ret_fh] & SSI_FAST_SUBSEQ))
302 return SSI_ERR_NO_SUBSEQS;
304 /* Read the data we need for subseq lookup
306 if (! read_offset(sfp->fp, sfp->smode, data_offset)) return SSI_ERR_NODATA;
307 if (! read_i32(sfp->fp, &len)) return SSI_ERR_NODATA;
309 /* Set up tmp variables for clarity of equations below,
310 * and to make them match documentation (ssi-format.tex).
312 r = sfp->rpl[*ret_fh]; /* residues per line */
313 b = sfp->bpl[*ret_fh]; /* bytes per line */
314 i = requested_start; /* start position 1..L */
315 l = (i-1)/r; /* data line # (0..) that the residue is on */
316 if (r == 0 || b == 0) return SSI_ERR_NO_SUBSEQS;
317 if (i < 0 || i > len) return SSI_ERR_RANGE;
319 /* When b = r+1, there's nothing but sequence on each data line (and the \0),
320 * and we can find each residue precisely.
323 if (sfp->smode == SSI_OFFSET_I32) {
324 data_offset->mode = SSI_OFFSET_I32;
325 data_offset->off.i32 = data_offset->off.i32 + l*b + (i-1)%r;
326 } else if (sfp->smode == SSI_OFFSET_I64) {
327 data_offset->mode = SSI_OFFSET_I64;
328 data_offset->off.i64 = data_offset->off.i64 + l*b + (i-1)%r;
330 *ret_actual_start = requested_start;
332 /* else, there's other stuff on seq lines, so the best
333 * we can do easily is to position at start of relevant line.
335 if (sfp->smode == SSI_OFFSET_I32) {
336 data_offset->mode = SSI_OFFSET_I32;
337 data_offset->off.i32 = data_offset->off.i32 + l*b;
338 } else if (sfp->smode == SSI_OFFSET_I64) {
339 data_offset->mode = SSI_OFFSET_I64;
340 data_offset->off.i64 = data_offset->off.i64 + l*b;
342 /* yes, the eq below is = 1 + (i-1)/r*r but it's not = i. that's an integer /. */
343 *ret_actual_start = 1 + l*r;
348 /* Function: SSISetFilePosition()
349 * Date: SRE, Tue Jan 2 09:13:46 2001 [St. Louis]
351 * Purpose: Uses {offset} to sets the file position for {fp}, usually an
352 * open sequence file, relative to the start of the file.
353 * Hides the details of system-dependent shenanigans necessary for
354 * file positioning in large (>2 GB) files.
356 * Behaves just like fseek(fp, offset, SEEK_SET) for 32 bit
357 * offsets and <2 GB files.
359 * Warning: if all else fails, in desperation, it will try to
360 * use fsetpos(). This requires making assumptions about fpos_t
361 * that may be unwarranted... assumptions that ANSI C prohibits
362 * me from making... though I believe the ./configure
363 * script robustly tests whether I can play with fpos_t like this.
365 * Args: fp - file to position.
366 * offset - SSI offset relative to file start.
368 * Returns: 0 on success, nonzero on error.
371 SSISetFilePosition(FILE *fp, SSIOFFSET *offset)
373 if (offset->mode == SSI_OFFSET_I32) {
374 if (fseek(fp, offset->off.i32, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED;
376 #ifndef HAS_64BIT_FILE_OFFSETS
377 else return SSI_ERR_NO64BIT;
378 #elif defined HAVE_FSEEKO && SIZEOF_OFF_T == 8
379 else if (fseeko(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED;
380 #elif defined HAVE_FSEEKO64 && SIZEOF_OFF64_T == 8
381 else if (fseeko64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED;
382 #elif defined HAVE_FSEEK64
383 else if (fseek64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED;
384 #elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8
385 else if (fsetpos(fp, &(offset->off.i64)) != 0) return SSI_ERR_SEEK_FAILED;
391 /* Function: SSIFileInfo()
392 * Date: SRE, Tue Jan 2 10:31:01 2001 [St. Louis]
394 * Purpose: Given a file number {fh} in an open index file
395 * {sfp}, retrieve file name {ret_filename} and
396 * the file format {ret_format}.
398 * {ret_filename} is a pointer to a string maintained
399 * internally by {sfp}. It should not be free'd;
400 * SSIClose(sfp) takes care of it.
402 * Args: sfp - open index file
403 * fh - handle on file to look up
404 * ret_filename - RETURN: name of file n
405 * ret_format - RETURN: format of file n
407 * Returns: 0 on success, nonzero on failure.
410 SSIFileInfo(SSIFILE *sfp, int fh, char **ret_filename, int *ret_format)
412 if (fh < 0 || fh >= sfp->nfiles) return SSI_ERR_BADARG;
413 *ret_filename = sfp->filename[fh];
414 *ret_format = sfp->fileformat[fh];
418 /* Function: SSIClose()
419 * Date: SRE, Sun Dec 31 14:56:37 2000 [St. Louis]
421 * Purpose: Close an open {SSIFILE *}.
423 * Args: sfp - index file to close.
428 SSIClose(SSIFILE *sfp)
432 if (sfp->fp != NULL) fclose(sfp->fp);
436 /* clear_ssifile(): free the innards of SSIFILE, without
437 * destroying the structure or closing the stream.
440 clear_ssifile(SSIFILE *sfp)
444 if (sfp->filename != NULL) {
445 for (i = 0; i < sfp->nfiles; i++)
446 if (sfp->filename[i] != NULL) free(sfp->filename[i]);
449 if (sfp->fileformat != NULL) free(sfp->fileformat);
450 if (sfp->fileflags != NULL) free(sfp->fileflags);
451 if (sfp->bpl != NULL) free(sfp->bpl);
452 if (sfp->rpl != NULL) free(sfp->rpl);
456 /* Function: SSIRecommendMode()
457 * Date: SRE, Fri Feb 16 08:23:47 2001 [St. Louis]
459 * Purpose: Examines the file and determines whether it should be
460 * indexed with large file support or not; returns
461 * SSI_OFFSET_I32 for most files, SSI_OFFSET_I64 for large
462 * files, or -1 on failure.
464 * Args: file - name of file to check for size
466 * Returns: -1 on failure (including case where file is too big)
467 * SSI_OFFSET_I32 for most files (<= 2^31-1 bytes)
468 * SSI_OFFSET_I64 for large files (> 2^31-1 bytes)
471 SSIRecommendMode(char *file)
475 if (stat64(file, &s1) == 0) {
476 if (s1.st_size <= 2146483647L) return SSI_OFFSET_I32;
477 else return SSI_OFFSET_I64;
481 if (stat(file, &s2) == 0) {
482 if (s2.st_size <= 2146483647L) return SSI_OFFSET_I32;
483 else return SSI_OFFSET_I64;
490 /* Function: SSICreateIndex()
491 * Date: SRE, Tue Jan 2 11:23:25 2001 [St. Louis]
493 * Purpose: Creates and initializes a SSI index structure.
494 * Sequence file offset type is specified by {mode}.
496 * Args: mode - SSI_OFFSET_I32 or SSI_OFFSET_I64, sequence file index mode.
498 * Returns: ptr to new index structure, or NULL on failure.
499 * Caller is responsible for free'ing the returned
500 * structure with SSIFreeIndex().
503 SSICreateIndex(int mode)
508 if ((g = malloc(sizeof(SSIINDEX))) == NULL) goto FAILURE;
510 g->imode = SSI_OFFSET_I32; /* index always starts as 32-bit; may get upgraded later */
512 #ifndef HAS_64BIT_FILE_OFFSETS
513 if (mode == SSI_OFFSET_I64)
515 Can't create a 64-bit SSI index on this system, sorry;\n\
516 I don't have 64-bit file offset functions available.\n");
520 g->fileformat = NULL;
534 g->tot_secondary = 0;
538 g->chunkoffset = NULL;
541 /* temporarily disabled: sort-on-disk needs more thought! */
542 /* g->max_chunk_size= maxchunk; */
543 g->max_chunk_size = 999999;
545 /* All mallocs must go after NULL initializations, because of the cleanup strategy;
546 * we'll try to free anything non-NULL if a malloc fails.
548 /* This is temporarily disabled. Sort-on-disk needs more thought!
549 if ((g->tmpbase = sre_strdup(tmpfile, -1)) == NULL) goto FAILURE;
552 if ((g->filenames = malloc(sizeof(char *) * SSI_FILE_BLOCK)) == NULL) goto FAILURE;
553 if ((g->fileformat= malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE;
554 if ((g->bpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE;
555 if ((g->rpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE;
557 if ((g->pkeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE;
558 if ((g->skeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE;
563 SSIFreeIndex(g); /* free the damaged structure */
567 /* Function: SSIGetFilePosition()
568 * Date: SRE, Tue Jan 2 09:59:26 2001 [St. Louis]
570 * Purpose: Fills {ret_offset} with the current disk
571 * offset of {fp}, relative to the start of the file.
572 * {mode} is set to either SSI_OFFSET_I32 or
573 * SSI_OFFSET_I64. If {mode} is _I32 (32 bit), just wraps
574 * a call to ftell(); otherwise deals with system-dependent
575 * details of 64-bit file offsets.
577 * Args: fp - open stream
578 * mode - SSI_OFFSET_I32 or SSI_OFFSET_I64
579 * ret_offset - RETURN: file position
581 * Returns: 0 on success. nonzero on error.
584 SSIGetFilePosition(FILE *fp, int mode, SSIOFFSET *ret_offset)
586 if (mode == SSI_OFFSET_I32)
588 ret_offset->mode = SSI_OFFSET_I32;
589 ret_offset->off.i32 = ftell(fp);
590 if (ret_offset->off.i32 == -1) return SSI_ERR_TELL_FAILED;
592 else if (mode != SSI_OFFSET_I64) abort(); /* only happens on a coding error */
594 ret_offset->mode = SSI_OFFSET_I64;
595 #ifndef HAS_64BIT_FILE_OFFSETS
596 return SSI_ERR_NO64BIT;
597 #elif defined HAVE_FTELLO && SIZEOF_OFF_T == 8
598 if ((ret_offset->off.i64 = ftello(fp)) == -1) return SSI_ERR_TELL_FAILED;
599 #elif defined HAVE_FTELLO64 && SIZEOF_OFF64_T == 8
600 if ((ret_offset->off.i64 = ftello64(fp)) == -1) return SSI_ERR_TELL_FAILED;
601 #elif defined HAVE_FTELL64
602 if ((ret_offset->off.i64 = ftell64(fp)) == -1) return SSI_ERR_TELL_FAILED;
603 #elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8
604 if (fgetpos(fp, &(ret_offset->off.i64)) != 0) return SSI_ERR_TELL_FAILED;
610 /* Function: SSIAddFileToIndex()
611 * Date: SRE, Tue Jan 2 12:54:36 2001 [St. Louis]
613 * Purpose: Adds the sequence file {filename}, which is known to
614 * be in format {fmt}, to the index {g}. Creates and returns
615 * a unique filehandle {fh} for then associating primary keys
616 * with this file using SSIAddPrimaryKeyToIndex().
618 * Args: g - active index
619 * filename - file to add
620 * fmt - format code for this file (e.g. SQFILE_FASTA)
621 * ret_fh - RETURN: unique handle for this file
623 * Returns: 0 on success; nonzero on error.
626 SSIAddFileToIndex(SSIINDEX *g, char *filename, int fmt, int *ret_fh)
630 if (g->nfiles >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES;
632 n = strlen(filename);
633 if ((n+1) > g->flen) g->flen = n+1;
635 if ((g->filenames[g->nfiles] = sre_strdup(filename, n)) == NULL) return SSI_ERR_MALLOC;
636 g->fileformat[g->nfiles] = fmt;
637 g->bpl[g->nfiles] = 0;
638 g->rpl[g->nfiles] = 0;
639 *ret_fh = g->nfiles; /* handle is simply = file number */
642 if (g->nfiles % SSI_FILE_BLOCK == 0) {
643 g->filenames = realloc(g->filenames, sizeof(char *) * (g->nfiles+SSI_FILE_BLOCK));
644 if (g->filenames == NULL) return SSI_ERR_MALLOC;
645 g->fileformat= realloc(g->fileformat,sizeof(int) * (g->nfiles+SSI_FILE_BLOCK));
646 if (g->fileformat == NULL) return SSI_ERR_MALLOC;
647 g->bpl = realloc(g->fileformat,sizeof(int) * (g->nfiles+SSI_FILE_BLOCK));
648 if (g->bpl == NULL) return SSI_ERR_MALLOC;
649 g->rpl = realloc(g->fileformat,sizeof(int) * (g->nfiles+SSI_FILE_BLOCK));
650 if (g->rpl == NULL) return SSI_ERR_MALLOC;
656 /* Function: SSISetFileForSubseq()
657 * Date: SRE, Tue Jan 9 10:02:05 2001 [St. Louis]
659 * Purpose: Set SSI_FAST_SUBSEQ for the file indicated by
660 * filehandle {fh} in the index {g}, setting
661 * parameters {bpl} and {rpl} to the values given.
662 * {bpl} is the number of bytes per sequence data line.
663 * {rpl} is the number of residues per sequence data line.
664 * Caller must be sure that {bpl} and {rpl} do not change
665 * on any line of any sequence record in the file
666 * (except for the last data line of each record). If
667 * this is not the case in this file, SSI_FAST_SUBSEQ
668 * will not work, and this routine should not be
671 * Args: g - the active index
672 * fh - handle for file to set SSI_FAST_SUBSEQ on
673 * bpl - bytes per data line
674 * rpl - residues per data line
676 * Returns: 0 on success; 1 on error.
679 SSISetFileForSubseq(SSIINDEX *g, int fh, int bpl, int rpl)
681 if (fh < 0 || fh >= g->nfiles) return SSI_ERR_BADARG;
682 if (bpl <= 0 || rpl <= 0) return SSI_ERR_BADARG;
689 /* Function: SSIAddPrimaryKeyToIndex()
690 * Date: SRE, Tue Jan 2 11:50:54 2001 [St. Louis]
692 * Purpose: Put primary key {key} in the index {g}, while telling
693 * the index this primary key is in the file associated
694 * with filehandle {fh} (returned by a previous call
695 * to SSIAddFileToIndex()), and its record starts at
696 * position {r_off} in the file.
698 * {d_off} and {L} are optional; they may be left unset
699 * by passing NULL and 0, respectively. (If one is
700 * provided, both must be provided.) If they are provided,
701 * {d_off} gives the position of the first line of sequence
702 * data in the record, and {L} gives the length of
703 * the sequence in residues. They are used when
704 * SSI_FAST_SUBSEQ is set for this file. If SSI_FAST_SUBSEQ
705 * is not set for the file, {d_off} and {L} will be
706 * ignored by the index reading API even if they are stored
707 * by the index writing API, so it doesn't hurt for the
708 * indexing program to provide them; typically they
709 * won't know whether it's safe to set SSI_FAST_SUBSEQ
710 * for the whole file until the whole file has been
711 * read and every key has already been added to the index.
713 * Args: g - active index
714 * key - primary key to add
715 * fh - handle on file that this key's in
716 * r_off - offset to start of record
717 * d_off - offset to start of sequence data
718 * L - length of sequence, or 0
720 * Returns: 0 on success, nonzero on error.
723 SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int fh,
724 SSIOFFSET *r_off, SSIOFFSET *d_off, int L)
726 int n; /* a string length */
728 if (fh >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES;
729 if (g->nprimary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS;
730 if (L > 0 && d_off == NULL) abort(); /* need both. */
732 /* Before adding the key: check how big our chunk of
733 * index is. If it's getting too large, flush a chunk to disk tmpfile.
735 if (current_chunk_size(g) >= g->max_chunk_size) write_index_chunk(g);
738 if ((n+1) > g->plen) g->plen = n+1;
740 if ((g->pkeys[g->nprimary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC;
741 g->pkeys[g->nprimary].fnum = (sqd_uint16) fh;
742 g->pkeys[g->nprimary].r_off = *r_off;
743 if (d_off != NULL && L > 0) {
744 g->pkeys[g->nprimary].d_off = *d_off;
745 g->pkeys[g->nprimary].len = L;
747 /* yeah, this looks stupid, but look: we have to give a valid
748 looking, non-NULL d_off of some sort, or writes will fail.
749 It's going to be unused anyway. */
750 g->pkeys[g->nprimary].d_off = *r_off;
751 g->pkeys[g->nprimary].len = 0;
753 g->pkeys[g->nprimary].handle = g->nprimary;
756 if (g->nprimary % SSI_KEY_BLOCK == 0) {
757 g->pkeys = realloc(g->pkeys, sizeof(struct ssipkey_s) * (g->nprimary+SSI_KEY_BLOCK));
758 if (g->pkeys == NULL) return SSI_ERR_MALLOC;
765 /* Function: SSIAddSecondaryKeyToIndex()
766 * Date: SRE, Tue Jan 2 12:44:40 2001 [St. Louis]
768 * Purpose: Puts secondary key {key} in the index {g}, associating
769 * it with primary key {pkey} that was previously
770 * registered by SSIAddPrimaryKeyToIndex().
772 * Args: g - active index
773 * key - secondary key to add
774 * pkey - primary key to associate this key with
776 * Returns: 0 on success, 1 on failure.
779 SSIAddSecondaryKeyToIndex(SSIINDEX *g, char *key, char *pkey)
781 int n; /* a string length */
783 if (g->nsecondary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS;
786 if ((n+1) > g->slen) g->slen = n+1;
788 if ((g->skeys[g->nsecondary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC;
789 if ((g->skeys[g->nsecondary].pkey = sre_strdup(pkey, -1)) == NULL) return SSI_ERR_MALLOC;
792 if (g->nsecondary % SSI_KEY_BLOCK == 0) {
793 g->skeys = realloc(g->skeys, sizeof(struct ssiskey_s) * (g->nsecondary+SSI_KEY_BLOCK));
794 if (g->skeys == NULL) return SSI_ERR_MALLOC;
802 /* Function: SSIWriteIndex()
803 * Date: SRE, Tue Jan 2 13:55:56 2001 [St. Louis]
805 * Purpose: Writes complete index {g} in SSI format to a
806 * binary file {file}. Does all
807 * the overhead of sorting the primary and secondary keys,
808 * and maintaining the association of secondary keys
809 * with primary keys during and after the sort.
811 * Args: file - file to write to
812 * g - index to sort & write out.
814 * Returns: 0 on success, nonzero on error.
816 /* needed for qsort() */
818 pkeysort(const void *k1, const void *k2)
820 struct ssipkey_s *key1;
821 struct ssipkey_s *key2;
822 key1 = (struct ssipkey_s *) k1;
823 key2 = (struct ssipkey_s *) k2;
824 return strcmp(key1->key, key2->key);
827 skeysort(const void *k1, const void *k2)
829 struct ssiskey_s *key1;
830 struct ssiskey_s *key2;
831 key1 = (struct ssiskey_s *) k1;
832 key2 = (struct ssiskey_s *) k2;
833 return strcmp(key1->key, key2->key);
836 SSIWriteIndex(char *file, SSIINDEX *g)
841 /* Case 1. Simple: the whole index fit in memory; write it to disk,
845 if ((fp = fopen(file,"wb")) == NULL) return SSI_ERR_NOFILE;
846 status = write_index(fp, g);
848 g->tot_primary = g->nprimary;
849 g->tot_secondary = g->nsecondary;
853 /* Case 2. Ugly: the index is big (and possibly *really* big, necessitating
854 * 64-bit offsets in the index itself!); we had to write the index to a tmp
855 * file on disk. Flush the last chunk to disk; then mergesort the chunks
856 * until we have one chunk to rule them all, one chunk to bind them.
858 write_index_chunk(g); /* flush the last chunk. */
861 Die("oi, you haven't IMPLEMENTED the mergesort yet, dumbass.");
865 write_index(FILE *fp, SSIINDEX *g)
868 sqd_uint32 header_flags, file_flags;
869 sqd_uint32 frecsize, precsize, srecsize;
870 sqd_uint64 foffset, poffset, soffset;
873 /* Magic-looking numbers come from adding up sizes
876 frecsize = 16 + g->flen;
877 precsize = (g->smode == SSI_OFFSET_I64) ? 22+g->plen : 14+g->plen;
878 srecsize = g->slen + g->plen;
881 if (g->smode == SSI_OFFSET_I64) header_flags |= SSI_USE64;
882 if (g->imode == SSI_OFFSET_I64) header_flags |= SSI_USE64_INDEX;
884 /* Magic-looking numbers again come from adding up sizes
887 foffset = (header_flags & SSI_USE64_INDEX) ? 66 : 54;
888 poffset = foffset + frecsize*g->nfiles;
889 soffset = poffset + precsize*g->nprimary;
893 qsort((void *) g->pkeys, g->nprimary, sizeof(struct ssipkey_s), pkeysort);
894 qsort((void *) g->skeys, g->nsecondary, sizeof(struct ssiskey_s), skeysort);
898 if (! write_i32(fp, v20magic)) return SSI_ERR_FWRITE;
899 if (! write_i32(fp, header_flags)) return SSI_ERR_FWRITE;
900 if (! write_i16(fp, g->nfiles)) return SSI_ERR_FWRITE;
901 if (! write_i32(fp, g->nprimary)) return SSI_ERR_FWRITE;
902 if (! write_i32(fp, g->nsecondary)) return SSI_ERR_FWRITE;
903 if (! write_i32(fp, g->flen)) return SSI_ERR_FWRITE;
904 if (! write_i32(fp, g->plen)) return SSI_ERR_FWRITE;
905 if (! write_i32(fp, g->slen)) return SSI_ERR_FWRITE;
906 if (! write_i32(fp, frecsize)) return SSI_ERR_FWRITE;
907 if (! write_i32(fp, precsize)) return SSI_ERR_FWRITE;
908 if (! write_i32(fp, srecsize)) return SSI_ERR_FWRITE;
909 if (g->imode == SSI_OFFSET_I32) {
910 if (! write_i32(fp, foffset)) return SSI_ERR_FWRITE;
911 if (! write_i32(fp, poffset)) return SSI_ERR_FWRITE;
912 if (! write_i32(fp, soffset)) return SSI_ERR_FWRITE;
914 if (! write_i64(fp, foffset)) return SSI_ERR_FWRITE;
915 if (! write_i64(fp, poffset)) return SSI_ERR_FWRITE;
916 if (! write_i64(fp, soffset)) return SSI_ERR_FWRITE;
921 if ((s = malloc(sizeof(char) * g->flen)) == NULL) return SSI_ERR_MALLOC;
922 for (i = 0; i < g->nfiles; i++)
925 if (g->bpl[i] > 0 && g->rpl[i] > 0) file_flags |= SSI_FAST_SUBSEQ;
927 strcpy(s, g->filenames[i]);
928 if (fwrite(s, sizeof(char), g->flen, fp) != g->flen) return SSI_ERR_FWRITE;
929 if (! write_i32(fp, g->fileformat[i])) return SSI_ERR_FWRITE;
930 if (! write_i32(fp, file_flags)) return SSI_ERR_FWRITE;
931 if (! write_i32(fp, g->bpl[i])) return SSI_ERR_FWRITE;
932 if (! write_i32(fp, g->rpl[i])) return SSI_ERR_FWRITE;
936 /* The primary key section
938 if ((s = malloc(sizeof(char) * g->plen)) == NULL) return SSI_ERR_MALLOC;
939 for (i = 0; i < g->nprimary; i++)
941 strcpy(s, g->pkeys[i].key);
942 if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE;
943 if (! write_i16( fp, g->pkeys[i].fnum)) return SSI_ERR_FWRITE;
944 if (! write_offset(fp, &(g->pkeys[i].r_off))) return SSI_ERR_FWRITE;
945 if (! write_offset(fp, &(g->pkeys[i].d_off))) return SSI_ERR_FWRITE;
946 if (! write_i32( fp, g->pkeys[i].len)) return SSI_ERR_FWRITE;
949 /* The secondary key section
951 if (g->nsecondary > 0) {
952 if ((s2 = malloc(sizeof(char) * g->slen)) == NULL) return SSI_ERR_MALLOC;
953 for (i = 0; i < g->nsecondary; i++)
955 strcpy(s2, g->skeys[i].key);
956 strcpy(s, g->skeys[i].pkey);
957 if (fwrite(s2, sizeof(char), g->slen, fp) != g->slen) return SSI_ERR_FWRITE;
958 if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE;
967 write_index_chunk(SSIINDEX *g)
972 SQD_DPRINTF1(("Writing index chunk %d to disk... \n", g->nchunks));
974 /* Save the offset for each chunk in an array; remember how many
975 * chunks we put into the tmp file t1.
979 if ((t1file = sre_strdup(g->tmpbase, -1)) == NULL) goto FAILURE;
980 if (sre_strcat(&t1file, -1, ".t1", 3) < 0) goto FAILURE;
981 if ((g->t1 = fopen(t1file, "wb")) == NULL) return SSI_ERR_NOFILE;
984 if ((g->chunkoffset = malloc(sizeof(fpos_t))) == NULL) goto FAILURE;
986 if ((g->chunkoffset = realloc(g->chunkoffset, sizeof(fpos_t) * (g->nchunks+1))) == NULL) goto FAILURE;
988 if (fgetpos(g->t1, &(g->chunkoffset[g->nchunks])) != 0)
989 Die("Index file size has apparently exceeded system limitations, sorry.");
992 /* Sort and append this chunk of the index to the open tmp file t1
994 if ((status = write_index(g->t1, g)) != 0) return status;
995 g->tot_primary += g->nprimary;
996 g->tot_secondary += g->nsecondary;
998 /* Now, a partial free'ing of the index - clear the keys, but leave the files
1000 for (i = 0; i < g->nprimary; i++) free(g->pkeys[i].key);
1001 for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].key);
1002 for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].pkey);
1006 /* Reset the primary and secondary keys sections, in preparation
1007 * for accumulating more
1017 if ((g->pkeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE;
1018 if ((g->skeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE;
1023 return SSI_ERR_MALLOC;
1028 /* Function: SSIFreeIndex()
1029 * Date: SRE, Tue Jan 2 11:44:08 2001 [St. Louis]
1031 * Purpose: Free an index structure {g}.
1033 * Args: g - ptr to an open index.
1038 SSIFreeIndex(SSIINDEX *g)
1043 for (i = 0; i < g->nfiles; i++) free(g->filenames[i]);
1044 for (i = 0; i < g->nprimary; i++) free(g->pkeys[i].key);
1045 for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].key);
1046 for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].pkey);
1047 if (g->filenames != NULL) free(g->filenames);
1048 if (g->fileformat != NULL) free(g->fileformat);
1049 if (g->bpl != NULL) free(g->bpl);
1050 if (g->rpl != NULL) free(g->rpl);
1051 if (g->pkeys != NULL) free(g->pkeys);
1052 if (g->skeys != NULL) free(g->skeys);
1053 if (g->tmpbase != NULL) free(g->tmpbase);
1054 if (g->chunkoffset != NULL) free(g->chunkoffset);
1055 if (g->t1 != NULL) fclose(g->t1);
1061 /* Function: SSIErrorString()
1062 * Date: SRE, Tue Jan 2 10:38:10 2001 [St. Louis]
1064 * Purpose: Returns a ptr to an internal string corresponding
1065 * to error {n}, a code returned from any of the
1066 * functions in the API that return non-zero on error.
1068 * Args: n - error code
1070 * Returns: ptr to an internal string.
1073 SSIErrorString(int n)
1076 case SSI_ERR_OK: return "ok (no error)";
1077 case SSI_ERR_NODATA: return "no data, fread() failed";
1078 case SSI_ERR_NO_SUCH_KEY: return "no such key";
1079 case SSI_ERR_MALLOC: return "out of memory, malloc() failed";
1080 case SSI_ERR_NOFILE: return "file not found, fopen() failed";
1081 case SSI_ERR_BADMAGIC: return "not a SSI file? (bad magic)";
1082 case SSI_ERR_BADFORMAT: return "corrupt format? unexpected data";
1083 case SSI_ERR_NO64BIT: return "no large file support for this system";
1084 case SSI_ERR_SEEK_FAILED: return "failed to reposition on disk";
1085 case SSI_ERR_TELL_FAILED: return "failed to get file position on disk";
1086 case SSI_ERR_NO_SUBSEQS: return "no fast subseq support for this seqfile";
1087 case SSI_ERR_RANGE: return "subseq start is out of range";
1088 case SSI_ERR_BADARG: return "an argument is out of range";
1089 default: return "unrecognized code";
1095 read_i16(FILE *fp, sqd_uint16 *ret_result)
1098 if (fread(&result, sizeof(sqd_uint16), 1, fp) != 1) return 0;
1099 *ret_result = sre_ntoh16(result);
1103 write_i16(FILE *fp, sqd_uint16 n)
1106 if (fwrite(&n, sizeof(sqd_uint16), 1, fp) != 1) return 0;
1110 read_i32(FILE *fp, sqd_uint32 *ret_result)
1113 if (fread(&result, sizeof(sqd_uint32), 1, fp) != 1) return 0;
1114 *ret_result = sre_ntoh32(result);
1118 write_i32(FILE *fp, sqd_uint32 n)
1121 if (fwrite(&n, sizeof(sqd_uint32), 1, fp) != 1) return 0;
1125 read_i64(FILE *fp, sqd_uint64 *ret_result)
1128 if (fread(&result, sizeof(sqd_uint64), 1, fp) != 1) return 0;
1129 *ret_result = sre_ntoh64(result);
1133 write_i64(FILE *fp, sqd_uint64 n)
1136 if (fwrite(&n, sizeof(sqd_uint64), 1, fp) != 1) return 0;
1140 read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset)
1142 if (mode == SSI_OFFSET_I32) {
1143 ret_offset->mode = SSI_OFFSET_I32;
1144 if (! read_i32(fp, &(ret_offset->off.i32))) return 0;
1145 } else if (mode == SSI_OFFSET_I64) {
1146 ret_offset->mode = SSI_OFFSET_I64;
1147 if (! read_i64(fp, &(ret_offset->off.i64))) return 0;
1153 write_offset(FILE *fp, SSIOFFSET *offset)
1155 if (offset->mode == SSI_OFFSET_I32) return write_i32(fp, offset->off.i32);
1156 else if (offset->mode == SSI_OFFSET_I64) return write_i64(fp, offset->off.i64);
1159 return 1; /* silence bitchy compilers */
1163 /* Function: binary_search()
1164 * Date: SRE, Sun Dec 31 16:05:03 2000 [St. Louis]
1166 * Purpose: Find a key in a SSI index, by a binary search
1167 * in an alphabetically sorted list of keys. If successful,
1168 * return 0, and the index file is positioned to read
1169 * the rest of the data for that key. Else returns nonzero.
1171 * Args: sfp - an open SSIFILE
1173 * klen - key length to allocate (plen or slen from sfp)
1174 * base - base offset (poffset or soffset)
1175 * recsize - size of each key record in bytes (precsize or srecsize)
1176 * maxidx - # of keys (nprimary or nsecondary)
1178 * Returns: 0 on success, and leaves file positioned for reading remaining
1180 * Nonzero on failure:
1181 * SSI_ERR_NO_SUCH_KEY - that key's not in the index
1182 * SSI_ERR_MALLOC - a memory allocation failure
1183 * SSI_ERR_NODATA - an fread() failed
1186 binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base,
1187 sqd_uint32 recsize, sqd_uint32 maxidx)
1190 sqd_uint32 left, right, mid;
1194 if ((name = malloc (sizeof(char)*klen)) == NULL) return SSI_ERR_MALLOC;
1197 while (1) { /* A binary search: */
1198 mid = (left+right) / 2; /* careful here. only works because
1199 we limit unsigned vars to signed ranges. */
1200 if ((status = indexfile_position(sfp, base, recsize, mid)) != 0)
1201 { free(name); return status; }
1202 if (fread(name, sizeof(char), klen, sfp->fp) != klen)
1203 { free(name); return SSI_ERR_NODATA; }
1204 cmp = strcmp(name, key);
1205 if (cmp == 0) break; /* found it! */
1206 else if (left >= right) /* oops, missed it; fail */
1207 { free(name); return SSI_ERR_NO_SUCH_KEY; }
1208 else if (cmp < 0) left = mid+1; /* it's right of mid */
1209 else if (cmp > 0) right = mid-1; /* it's left of mid */
1212 return 0; /* and sfp->fp is positioned... */
1215 /* Function: indexfile_position()
1216 * Date: SRE, Mon Jan 1 19:32:49 2001 [St. Louis]
1218 * Purpose: Position the open index file {sfp} at the start
1219 * of record {n} in a list of records that starts at
1220 * base offset {base}, where each record takes up {l}
1221 * bytes. (e.g. the position is byte (base + n*l)).
1223 * Args: sfp - open SSIFILE
1224 * base - offset of record 0 (e.g. sfp->foffset)
1225 * len - size of each record in bytes (e.g. sfp->frecsize)
1226 * n - which record to get (e.g. 0..sfp->nfiles)
1228 * Returns: 0 on success, non-zero on failure.
1231 indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len, sqd_uint32 n)
1236 if (base->mode == SSI_OFFSET_I32) {
1237 pos.mode = SSI_OFFSET_I32;
1238 pos.off.i32 = base->off.i32 + n*len;
1239 } else if (base->mode == SSI_OFFSET_I64) {
1240 pos.mode = SSI_OFFSET_I64;
1241 pos.off.i64 = base->off.i64 + n*len;
1243 if ((status = SSISetFilePosition(sfp->fp, &pos)) != 0) return status;
1247 /* Function: current_chunk_size()
1248 * Date: SRE, Tue Feb 20 18:23:30 2001 [St. Louis]
1250 * Purpose: Calculates the size of the current indexfile chunk,
1254 current_chunk_size(SSIINDEX *g)
1256 sqd_uint64 frecsize, precsize, srecsize;
1259 /* Magic-looking numbers come from adding up sizes
1260 * of things in bytes
1262 frecsize = 16 + g->flen;
1263 precsize = (g->smode == SSI_OFFSET_I64) ? 22+g->plen : 14+g->plen;
1264 srecsize = g->plen+g->slen;
1265 total = (66L + /* header size, if 64bit index offsets */
1266 frecsize * g->nfiles + /* file section size */
1267 precsize * g->nprimary + /* primary key section size */
1268 srecsize * g->nsecondary) / /* secondary key section size */
1276 mergesort(SSIINDEX *g)
1278 char *infile; /* reading "tape" 1: source. */
1279 char *outfile; /* writing "tape" 2: destination. */
1280 SSIFILE *in1; /* on read, a chunk of the SSI file goes in an SSIFILE. */
1281 SSIFILE *in2; /* and chunk 2 goes in here. */
1282 FILE *outfp; /* where we're writing the merged data */
1283 int b; /* b, b+1 are current chunks we're merging from infile */
1284 char *k1, *k2; /* buffers full of keys to be merged from ch1, ch2 */
1285 sqd_uint32 base1, pos1, buflen1; /* buffered key input for ch1 */
1286 sqd_uint32 base2, pos2, buflen2; /* buffered key input for ch2 */
1292 /* create the tmp file names */
1293 if ((infile = sre_strdup(g->tmpbase, -1)) == NULL) return SSI_ERR_MALLOC;
1294 if (sre_strcat(&infile, -1, ".t1", 3) < 0) return SSI_ERR_MALLOC;
1295 if ((outfile = sre_strdup(g->tmpbase, -1)) == NULL) return SSI_ERR_MALLOC;
1296 if (sre_strcat(&outfile, -1, ".t2", 3) < 0) return SSI_ERR_MALLOC;
1297 /* allocate the SSIFILEs for reading chunks */
1298 if ((in1 = malloc(sizeof(SSIFILE))) == NULL) return SSI_ERR_MALLOC;
1299 if ((in2 = malloc(sizeof(SSIFILE))) == NULL) return SSI_ERR_MALLOC;
1301 /* Open infile for read; both chunks (in1 and in2) are read from this file,
1302 * from different file offsets kept in g->chunkoffset[]
1304 if ((in1->fp = fopen(infile, "rb")) == NULL) return SSI_ERR_NOFILE;
1306 if ((outfp = fopen(outfile, "wb")) == NULL) return SSI_ERR_NOFILE;
1308 for (b = 0; b+1 < g->nchunks; b+=2)
1310 if (fsetpos(in1->fp, &(g->chunkoffset[b])) > 0) return SSI_ERR_SEEK_FAILED;
1311 if (fsetpos(in2->fp, &(g->chunkoffset[b+1])) > 0) return SSI_ERR_SEEK_FAILED;
1313 if (status = load_indexfile(in1) > 0) return status;
1314 if (status = load_indexfile(in2) > 0) return status;
1316 merge_headers(g, in1, in2);
1317 write_index_header(outfp, g);
1319 /* Merge the primary key section;
1320 * do a buffered read of the pkeys from ch1 and ch2.
1323 if ((k1 = malloc(sizeof(char) * (maxbuf*in1->precsize))) == NULL) return SSI_ERR_MALLOC;
1324 if ((k2 = malloc(sizeof(char) * (maxbuf*in2->precsize))) == NULL) return SSI_ERR_MALLOC;
1325 base1 = pos1 = buflen1 = 0;
1326 base2 = pos2 = buflen2 = 0;
1327 while (base1+pos1 < ch1->nprimary || base2+pos2 < ch2->nprimary) {
1328 /* refill buffer for ch1? */
1329 if (pos1 == buflen1) {
1332 buflen1 = MIN(in1->nprimary - base1, maxbuf);
1334 if (fread(k1, sizeof(char), (buflen1*in1->precsize), in1->fp)
1335 < buflen1*in1->precsize)
1336 return SSI_ERR_NODATA;
1339 /* refill buffer for ch2? */
1340 if (pos2 == buflen2) {
1343 buflen2 = MIN(in2->nprimary - base2, maxbuf);
1345 if (fread(k2, sizeof(char), (buflen1*in2->precsize), in2->fp)
1346 < buflen2*in2->precsize)
1347 return SSI_ERR_NODATA;
1350 /* mergesort on keys; be careful of case where we're
1351 out of keys in either ch1 or ch2 */
1352 if (base2+pos2 == ch2->nprimary ||
1353 strcmp(k1+(pos1*in1->precsize), k2+(pos2*in2->precsize)))
1354 write_pkey(t3, &(pk1[pos1]), s);
1357 write_pkey(t3, &(pk2[pos2]), s);
1365 /* Merge the secondary keys; much like the primary key code above.
1368 if ((sk1 = malloc(sizeof(struct ssiskey_s) * maxbuf)) == NULL) return SSI_ERR_MALLOC;
1369 if ((sk2 = malloc(sizeof(struct ssiskey_s) * maxbuf)) == NULL) return SSI_ERR_MALLOC;
1370 if ((s = malloc(sizeof(char) * newch->slen)) == NULL) return SSI_ERR_MALLOC;
1371 base1 = pos1 = buflen1 = 0;
1372 base2 = pos2 = buflen2 = 0;
1373 while (base1+pos1 < ch1->nsecondary || base2+pos2 < ch2->nsecondary) {
1374 /* refill buffer for ch1? */
1375 if (pos1 == buflen1) {
1378 buflen1 = MIN(ch1->nsecondary - base1, maxbuf);
1379 if (buflen1 > 0) read_skeys(ch1->fp, sk1, buflen1);
1381 /* refill buffer for ch2? */
1382 if (pos2 == buflen2) {
1385 buflen2 = MIN(ch2->nsecondary - base2, maxbuf);
1386 if (buflen2 > 0) read_skeys(ch2->fp, sk2, buflen2);
1388 /* mergesort on keys; be careful of case where we're
1389 out of keys in either ch1 or ch2 */
1390 if (base2+pos2 == ch2->nsecondary || pkeysort(&(sk1[pos1]), &(sk2[pos2])) < 0) {
1391 write_skey(t3, &(pk1[pos1]), s);
1394 write_skey(t3, &(pk2[pos2]), s);
1405 /* clear ch1, ch2, in prep for loading new chunks */
1408 } /* end loop over chunks */
1414 #ifdef MUGGINS_LETS_ME_SLEEP /* test driving code. */
1416 cc -g -Wall -o shiva -D MUGGINS_LETS_ME_SLEEP ssi.c sqerror.c sre_string.c types.c sre_ctype.c sre_math.c -lm
1420 main(int argc, char **argv)
1422 char name[32], accession[32];
1425 SSIOFFSET r_off, d_off;
1428 int fh; /* a file handle */
1429 int status; /* return status from a SSI call */
1431 mode = SSI_OFFSET_I32;
1432 if ((ssi = SSICreateIndex(mode)) == NULL)
1433 Die("Failed to allocate SSI index");
1435 /* Generate two FASTA files, tmp.0 and tmp.1, and index them.
1437 if ((ofp = fopen("tmp.0", "w")) == NULL)
1438 Die("failed to open tmp.0");
1439 if ((status = SSIAddFileToIndex(ssi, "tmp.0", SQFILE_FASTA, &fh)) != 0)
1440 Die("SSIAddFileToIndex() failed: %s", SSIErrorString(status));
1441 for (i = 0; i < 10; i++) {
1442 if ((status = SSIGetFilePosition(ofp, mode, &r_off)) != 0)
1443 Die("SSIGetFilePosition() failed: %s", SSIErrorString(status));
1444 sprintf(name, "seq%d", i);
1445 sprintf(accession, "ac%d", i);
1446 fprintf(ofp, ">%s [%s] Description? we don't need no steenking description.\n",
1448 if ((status = SSIGetFilePosition(ofp, mode, &d_off)) != 0)
1449 Die("SSIGetFilePosition() failed: %s", SSIErrorString(status));
1450 fprintf(ofp, "AAAAAAAAAA\n");
1451 fprintf(ofp, "CCCCCCCCCC\n");
1452 fprintf(ofp, "GGGGGGGGGG\n");
1453 fprintf(ofp, "TTTTTTTTTT\n");
1455 if ((status = SSIAddPrimaryKeyToIndex(ssi, name, fh, &r_off, &d_off, 40)) != 0)
1456 Die("SSIAddPrimaryKeyToIndex() failed: %s", SSIErrorString(status));
1457 if ((status = SSIAddSecondaryKeyToIndex(ssi, accession, name)) != 0)
1458 Die("SSIAddSecondaryKeyToIndex() failed: %s", SSIErrorString(status));
1460 SSISetFileForSubseq(ssi, fh, 11, 10);
1463 if ((ofp = fopen("tmp.1", "w")) == NULL)
1464 Die("failed to open tmp.1");
1465 if ((status = SSIAddFileToIndex(ssi, "tmp.1", SQFILE_FASTA, &fh)) != 0)
1466 Die("SSIAddFileToIndex() failed: %s", SSIErrorString(status));
1467 for (i = 10; i < 20; i++) {
1468 if ((status = SSIGetFilePosition(ofp, mode, &r_off)) != 0)
1469 Die("SSIGetFilePosition() failed: %s", SSIErrorString(status));
1470 sprintf(name, "seq%d", i);
1471 sprintf(accession, "ac%d", i);
1472 fprintf(ofp, ">%s [%s] i/o, i/o, it's off to disk we go.\n",
1474 if ((status = SSIGetFilePosition(ofp, mode, &d_off)) != 0)
1475 Die("SSIGetFilePosition() failed: %s", SSIErrorString(status));
1476 fprintf(ofp, "AAAAAAAAAA 10\n");
1477 fprintf(ofp, "CCCCCCCCCC 20\n");
1478 fprintf(ofp, "GGGGGGGGGG 30\n");
1479 fprintf(ofp, "TTTTTTTTTT 40\n");
1481 if ((status = SSIAddPrimaryKeyToIndex(ssi, name, fh, &r_off, &d_off, 40)) != 0)
1482 Die("SSIAddPrimaryKeyToIndex() failed: %s", SSIErrorString(status));
1483 if ((status = SSIAddSecondaryKeyToIndex(ssi, accession, name)) != 0)
1484 Die("SSIAddSecondaryKeyToIndex() failed: %s", SSIErrorString(status));
1486 SSISetFileForSubseq(ssi, fh, 14, 10);
1489 /* Write the index to tmp.ssi
1491 if ((status = SSIWriteIndex("tmp.ssi", ssi)) != 0)
1492 Die("SSIWriteIndex() failed: %s", SSIErrorString(status));
1495 /* Now reopen the index and run some tests.
1501 #endif /* test driving code */