forester/archive/RIO/others/hmmer/squid/Docs/ssi-format.tex

   1 % SRE, Mon Dec 25 13:00:46 2000
   2
   3 \documentclass[12pt]{report}
   4 \usepackage{fullpage}
   5 \usepackage{times}
   6 \usepackage{epsfig}
   7 %\usepackage{html}               % From the LaTeX2html translator
   8 \usepackage{apalike}
   9 \setcounter{secnumdepth}{2}
  10
  11 \input{macros}
  12
  13 \begin{document}
  14 \bibliographystyle{apalike}
  15
  16 \section{SSI format}
  17
  18 SSI format (Sequence/Subsequence Index format) indexes flatfile
  19 databases by names and/or accessions, enabling fast retrieval.
  20
  21 An SSI index is a binary file that stores sequence names or accessions
  22 as \emph{keys} that it can look up rapidly. It differentiates between
  23 \emph{primary keys} and \emph{secondary keys}.  There is one and only
  24 one primary key per sequence. There can be more than one secondary key
  25 per sequence. Both primary and secondary keys must be unique
  26 identifiers (no two records have the same key). A program (like
  27 HMMER's distributed PVM implementation) that needs to step through
  28 each sequence one at a time can refer to the list of primary keys. A
  29 program solely concerned with flexible sequence retrieval (such as
  30 SQUID's \prog{sfetch}) might consult an SSI index with accessions as
  31 primary keys, and names as secondary keys.
  32
  33 A single SSI file can index multiple sequence data files. This allows
  34 indexing multifile databases (e.g. Genbank flatfile distributions).
  35
  36 The SSI format is relatively simple and may prove useful for other
  37 indexing tasks besides sequence names. HMMER uses SSI format to index
  38 HMM files.
  39
  40 \subsection{Special features of SSI}
  41
  42 SSI superceded 1994's GSI format after human genome sequence files
  43 started exceeding 2 GB filesystem limitations, and after problems in
  44 the HMMER PVM implementation had to be hacked around. SSI has the
  45 following additional features compared to GSI.
  46
  47 \begin{description}
  48 \item[Separate primary key section]
  49 Primary keys are set apart in a separate section, enabling programs to
  50 step through a guaranteed one-to-one mapping of keys to sequences.  A
  51 secondary key section adds many-to-one mapping of keys to sequences.
  52
  53 \item[Arbitrary filename and key lengths]
  54 File name lengths and key name lengths are effectively unlimited.
  55
  56 \item[64-bit indexing]
  57 For sequence files exceeding 2GB, on architectures that support 64-bit
  58 filesystems (such as IRIX, Solaris, Tru64 UNIX, FreeBSD...), SSI
  59 supports 64-bit indexing; depending on the system, file sizes may
  60 theoretically be allowed to range up to millions of terabytes.
  61
  62 \item[Fast subsequence extraction]
  63 SSI can be used to greatly accelerate \emph{subsequence} extraction
  64 from very long sequences (example: human chromosome contigs). The
  65 sequence file must meet certain formatting conditions for this to
  66 work; see below for details.
  67 \end{description}
  68
  69 \subsection{SSI API in SQUID}
  70
  71 \subsubsection{Functions for using a SSI index file:}
  72
  73 \begin{sreapi}
  74 \item[int SSIOpen(char *filename, SSIFILE **ret\_sfp)]
  75
  76 Opens the SSI index file \prog{filename} and returns a \prog{SSIFILE
  77 *} stream through \prog{ret\_sfp}. Returns 0 on success, nonzero on
  78 failure. The caller must eventually close this stream using
  79 \prog{SSIClose()}.  More than one index can be open at once.
  80
  81 Error codes:\\
  82 \begin{tabular}{ll}
  83 \prog{SSI\_ERR\_NOFILE}   & failed to open file; doesn't exist or not readable\\
  84 \prog{SSI\_ERR\_BADMAGIC} & not a SSI file \\
  85 \prog{SSI\_ERR\_NO64BIT}  & it has 64-bit offsets, and we can't support that\\
  86 \prog{SSI\_ERR\_FORMAT}   & file appears to be corrupted\\
  87 \prog{SSI\_ERR\_MALLOC}   & malloc failed \\
  88 \end{tabular}
  89
  90 \item[int SSIGetOffsetByName(SSIFILE *sfp, char *key, int *ret\_fh, SSIOFFSET *ret\_offset)]
  91
  92 Looks up the string \prog{key} in the open index \prog{sfp}.
  93 \prog{key} can be either a primary or secondary key. If \prog{key} is
  94 found, \prog{*ret\_fh} contains a unique handle on the file
  95 that contains {key} (suitable for an \prog{SSIFileInfo()} call, or for
  96 comparison to the handle of the last file that was opened for
  97 retrieval), and \prog{offset} is filled in with the offset in that
  98 file. Returns 0 on success, non-zero on error.
  99
 100 Error codes:\\
 101 \begin{tabular}{ll}
 102 \prog{SSI\_ERR\_NO\_SUCH\_KEY} & key not found \\
 103 \prog{SSI\_ERR\_NODATA}        & fread() failed, file appears to be corrupted\\
 104 \end{tabular}
 105
 106 \item[int SSIGetOffsetByNumber(SSIFILE *sfp, int nkey, int
 107 *ret\_fh, SSIOFFSET *offset)]
 108
 109 Retrieves information for primary key number \prog{nkey}.  \prog{nkey}
 110 ranges from 0..\prog{nprimary-1}. When the key is found,
 111 \prog{*ret\_fh} contains a unique handle on the file that
 112 contains {key} (suitable for an SSIFileInfo() call, or for comparison
 113 to the handle of the last file that was opened for retrieval), and
 114 \prog{offset} is filled in with the offset in that file. Returns 0 on
 115 success, non-zero on error.
 116
 117 Error codes:\\
 118 \begin{tabular}{ll}
 119 \prog{SSI\_ERR\_SEEK\_FAILED}  & failed to reposition in index file\\
 120 \prog{SSI\_ERR\_NO\_SUCH\_KEY} & key not found \\
 121 \prog{SSI\_ERR\_NODATA}        & fread() failed, file appears to be corrupted\\
 122 \end{tabular}
 123
 124 \item[int SSIGetSubseqOffset(SSIFILE *sfp, char *key, int
 125 requested\_start,  int *ret\_fh,
 126 SSIOFFSET *record\_offset, SSIOFFSET *data\_offset, int *ret\_actual\_start)]
 127
 128 Implements \prog{SSI\_FAST\_SUBSEQ}.
 129
 130 Looks up the string \prog{key} in the open index \prog{sfp}, and
 131 asks for the nearest offset to a subsequence starting at position
 132 \prog{requested\_start} in the sequence (numbering the sequence 1..L).
 133 \prog{key} can be either a primary or secondary key. If \prog{key} is
 134 found, \prog{*ret\_fh} contains a unique handle on the file that
 135 contains {key} (suitable for an SSIFileInfo() call, or for comparison
 136 to the handle of the last file that was opened for retrieval);
 137 \prog{record\_offset} contains the disk offset to the start of the
 138 record; \prog{data\_offset} contains the disk offset either exactly at
 139 the requested residue, or at the start of the line containing the
 140 requested residue; \prog{ret\_actual\_start} contains the coordinate
 141 (1..L) of the first valid residue at or after
 142 \prog{data\_offset}. \prog{ret\_actual\_start} is $\leq$
 143 \prog{requested\_start}.  Returns 0 on success, non-zero on failure.
 144
 145 Error codes:\\
 146 \begin{tabular}{ll}
 147 \prog{SSI\_ERR\_NO\_SUBSEQS}   & this file or key doesn't allow subseq lookup\\
 148 \prog{SSI\_ERR\_NO\_SUCH\_KEY} & key not found \\
 149 \prog{SSI\_ERR\_RANGE}         & the requested\_start is out of bounds\\
 150 \prog{SSI\_ERR\_NODATA}        & fread() failed, file appears to be corrupted\\
 151 \end{tabular}
 152
 153 \item[int SSISetFilePosition(FILE *fp, SSIOFFSET *offset]
 154
 155 Uses \prog{offset} to sets the file position for \prog{fp} (usually an
 156 open sequence file) relative to the start of the file.  Hides the
 157 details of system-dependent shenanigans necessary for file positioning
 158 in large ($>2$ GB) files. Behaves just like \prog{fseek(fp, offset,
 159 SEEK\_SET)} for 32 bit offsets and $<2$ GB files. Returns 0 on
 160 success, nonzero on error.
 161
 162 Error codes:\\
 163 \begin{tabular}{ll}
 164 \prog{SSI\_ERR\_SEEK\_FAILED}  & failed to reposition the file\\
 165 \end{tabular}
 166
 167 \item[int SSIFileInfo(SSIFILE *sfp, int fh, char **ret\_filename, int *ret\_format)]
 168
 169 Given a file handle \prog{fh} in an open index file \prog{sfp},
 170 retrieve file name \prog{ret\_filename} and the file format
 171 \prog{ret\_format}.  \prog{ret\_filename} is a pointer to a string
 172 maintained internally by \prog{sfp}. It should not be free'd;
 173 \prog{SSIClose(sfp)} will take care of it.
 174
 175 Error codes:\\
 176 \begin{tabular}{ll}
 177 \prog{SSI\_ERR\_BADARG}  & no such file n\\
 178 \end{tabular}
 179
 180 \item[void SSIClose(SSIFILE *sfp)]
 181
 182 Close an open \prog{SSIFILE *}.
 183 \end{sreapi}
 184
 185 \subsubsection{Skeleton example code for using a SSI index file:}
 186
 187 \small\begin{verbatim}
 188     SSIFILE   *sfp;
 189     FILE       *fp;
 190     int         fh;
 191     char       *seqfile;
 192     int         fmt;
 193     SSIOFFSET  offset;
 194
 195     SSIOpen(``foo.gsi'', &sfp);
 196
 197     /* Finding an entry by name
 198      * (by number, with SSIGetOffsetByNumber(), is analogous)
 199      */
 200     SSIGetOffsetByName(sfp, ``important_key'', &fh, &offset);
 201     SSIGetFileInfo(sfp, fh, &seqfile, &fmt);
 202     fp = fopen(seqfile, ``r''); /* more usually SeqfileOpen(), using fmt */
 203     SSIFilePosition(fp, &offset);
 204          /* read the entry from there, do whatever... */
 205     free(seqfile);
 206     fclose(fp);
 207
 208     SSIClose(sfp);
 209 \end{verbatim}\normalsize
 210
 211 \subsubsection{Functions for creating a SSI index file:}
 212
 213 \begin{sreapi}
 214 \item[int SSIRecommendMode(char *file)]
 215
 216 Examines the file and determines whether it should be indexed with
 217 large file support or not; returns \prog{SSI\_OFFSET\_I32} for most
 218 files, \prog{SSI\_OFFSET\_I64} for large files, or -1 on failure.
 219
 220 \item[SSIINDEX *SSICreateIndex(int mode)]
 221
 222 Creates and initializes a SSI index structure. Sequence file offset
 223 type to be used is specified by \prog{mode}, which may be either
 224 \prog{SSI\_OFFSET\_I32} or \prog{SSI\_OFFSET\_I64}.  Returns a
 225 pointer to the new structure, or NULL on failure. The caller must free
 226 this structure with \prog{SSIFreeIndex()} when done.
 227
 228 \item[int SSIGetFilePosition(FILE *fp, int mode, SSIOFFSET *ret\_offset)]
 229
 230 Fills \prog{ret\_offset} with the current disk offset of \prog{fp},
 231 relative to the start of the file.  {mode} is the type of offset to
 232 use; it must be either \prog{SSI\_OFFSET\_I32} or
 233 \prog{SSI\_OFFSET\_I64}. Returns 0 on success, non-zero on error.
 234
 235 Error codes:\\
 236 \begin{tabular}{ll}
 237 \prog{SSI\_ERR\_NO64BIT}       & 64-bit mode unsupported on this system\\
 238 \prog{SSI\_ERR\_TELL\_FAILED}  & failed to determine position in file\\
 239 \end{tabular}
 240
 241 \item[int SSIAddFileToIndex(SSIINDEX *g, char *filename, int fmt,
 242 int *ret\_fh)]
 243
 244 Adds the sequence file \prog{filename}, which is known to be in format
 245 \prog{fmt}, to the index \prog{g}. Creates and returns a unique
 246 filehandle \prog{ret\_fh} for associating primary keys with this file
 247 using \prog{SSIAddPrimaryKeyToIndex()}. Returns 0 on success, non-zero
 248 on failure.
 249
 250 Error codes:\\
 251 \begin{tabular}{ll}
 252 \prog{SSI\_ERR\_TOOMANY\_FILES}  & exceeded file number limit\\
 253 \prog{SSI\_ERR\_MALLOC}          & a malloc() failed\\
 254 \end{tabular}
 255
 256 \item[int SSISetFileForSubseq(SSIINDEX *g, int fh, int bpl, int rpl)]
 257
 258 Set \prog{SSI\_FAST\_SUBSEQ} for the file indicated by filehandle
 259 \prog{fh} in the index \prog{g}, setting parameters \prog{bpl} and
 260 \prog{rpl} to the values given. \prog{bpl} is the number of bytes per
 261 sequence data line.  \prog{rpl} is the number of residues per sequence
 262 data line.  Caller must be sure that \prog{bpl} and \prog{rpl} do not
 263 change on any line of any sequence record in the file (except for the
 264 last data line of each record). If this is not the case in this file,
 265 \prog{SSI\_FAST\_SUBSEQ} will not work, and this routine should not be
 266 called. Returns 0 on success, non-zero on failure.
 267
 268 \item[int SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int
 269 fh, SSIOFFSET *r\_off, SSIOFFSET *d\_off, int L)]
 270
 271 Puts a primary key \prog{key} in the index \prog{g}, while telling the
 272 index that this primary key is in the file associated with filehandle
 273 \prog{fh} and its record starts at position \prog{r\_off} in that
 274 file.
 275
 276 \prog{d\_off} and \prog{L} are optional; they may be left unset by
 277 passing NULL and 0, respectively. (If one is provided, both must be
 278 provided.)  If they are provided, \prog{d\_off} gives the position of
 279 the first line of sequence data in the record, and \prog{L} gives
 280 the length of the sequence in residues. They are used when
 281 \prog{SSI\_FAST\_SUBSEQ} is set for the sequence file. If
 282 \prog{SSI\_FAST\_SUBSEQ} is not set for the file, \prog{d\_off} and
 283 \prog{L} will be ignored even if they are available, so it doesn't
 284 hurt for the indexing program to provide them; typically it won't know
 285 whether it's safe to set \prog{SSI\_FAST\_SUBSEQ} for the whole file
 286 until the whole file has been read and every key has already been
 287 added to the index.
 288
 289 Through \prog{ret\_kh} it provides a ``handle'' - a unique
 290 identifier for the primary key - that any subsequent calls to
 291 \prog{SSIAddSecondaryKeyToIndex()} will use to associate one or more
 292 secondary keys with this primary key.
 293
 294 Returns 0 on success, non-zero on error.
 295
 296 Error codes:\\
 297 \begin{tabular}{ll}
 298 \prog{SSI\_ERR\_TOOMANY\_KEYS}  & exceeded primary key limit\\
 299 \prog{SSI\_ERR\_TOOMANY\_FILES} & filenum exceeds file limit\\
 300 \prog{SSI\_ERR\_MALLOC}         & a malloc() failed\\
 301 \end{tabular}
 302
 303
 304 \item[int SSIAddSecondaryKeyToIndex(SSIINDEX *g, char *key, char *pkey)]
 305
 306 Puts a secondary key \prog{key} in the index \prog{g}, associating it
 307 with a primary key \prog{pkey} that has already been added to the index
 308 by \prog{SSIAddPrimaryKeyToIndex()}.
 309 Returns 0 on success, non-zero on error.
 310
 311 Error codes:\\
 312 \begin{tabular}{ll}
 313 \prog{SSI\_ERR\_TOOMANY\_KEYS}  & exceeded secondary key limit\\
 314 \prog{SSI\_ERR\_MALLOC}         & a malloc() failed\\
 315 \end{tabular}
 316
 317
 318
 319 \item[int SSIWriteIndex(char *file, SSIINDEX *g)]
 320
 321 Writes complete index \prog{g} in SSI format to a binary file
 322 \prog{file}.  Does all overhead of sorting the primary and secondary
 323 keys, and maintaining the association of secondary keys with primary
 324 keys during and after the sort. Returns 0 on success, non-zero on
 325 error.
 326
 327 Error codes:\\
 328 \begin{tabular}{ll}
 329 \prog{SSI\_ERR\_NOFILE}  & an fopen() failed\\
 330 \prog{SSI\_ERR\_FWRITE}  & an fwrite() failed\\
 331 \prog{SSI\_ERR\_MALLOC}  & a malloc() failed\\
 332 \end{tabular}
 333
 334
 335 \item[void SSIFreeIndex(SSIINDEX *g)]
 336
 337 Free an index structure.
 338 \end{sreapi}
 339
 340
 341 \subsubsection{Other SSI functions:}
 342
 343 \begin{sreapi}
 344 \item[char *SSIErrorString(int n)]
 345
 346 Returns a pointer to an internal string corresponding to error
 347 \prog{n}, a return code from any of the functions in the API that
 348 return non-zero on error.
 349 \end{sreapi}
 350
 351 \subsection{Detailed specification of SSI binary format}
 352
 353 There are four sections to the SSI file:
 354 \begin{sreitems}{\textbf{Secondary keys}}
 355 \item[\textbf{Header}]
 356 Contains a magic number indicating GSI version number, and
 357 various information about the number and sizes of things in the index.
 358
 359 \item[\textbf{Files}]
 360 Contains one or more \emph{file records}, one per sequence file that's
 361 indexed. These contain information about the individual files.
 362
 363 \item[\textbf{Primary keys}]
 364 Contains one or more \emph{primary key records}, one per primary key.
 365
 366 \item[\textbf{Secondary keys}]
 367 Contains one or more \emph{secondary key records}, one per secondary key.
 368 \end{sreitems}
 369
 370 All numeric quantities are stored as unsigned integers of known size
 371 in network (bigendian) order, for maximum crossplatform portability of
 372 the index files. \prog{sqd\_uint16}, \prog{sqd\_uint32}, and
 373 \prog{sqd\_uint64} are typically typedef'd as \prog{unsigned short},
 374 \prog{unsigned int}, and \prog{unsigned long long} or \prog{unsigned
 375 long} at SQUID compile-time. Values may need to be cast to signed
 376 quantities, so only half of their dynamic range is valid
 377 (e.g. 0..32,767 for values of type \prog{sqd\_uint16};
 378 0..2,146,483,647 (2 billion) for \prog{sqd\_uint32}; and 0..9.22e18 (9
 379 million trillion) for \prog{sqd\_uint64}).  These typedef's are
 380 handled automatically by the \prog{./configure} script (see
 381 \prog{squidconf.h.in} before configuration, \prog{squidconf.h} after
 382 configuration). If necessary, \prog{./configure}'s guess can be
 383 overridden in \prog{squidconf.h} after configuration.
 384
 385 \subsubsection{Header section}
 386
 387 The header section contains:
 388
 389 \vspace{1em}
 390 \begin{tabular}{llrr}
 391 Variable          & Description                              & Bytes      & Type \\\hline
 392 \prog{magic}      & SSI version magic number.               &  4         & \prog{sqd\_uint32}\\
 393 \prog{flags}      & Optional behavior flags (see below)      &  4         & \prog{sqd\_uint32}\\
 394 \prog{nfiles}     & Number of files in file section.         &  2         & \prog{sqd\_uint16}\\
 395 \prog{nprimary}   & Number of primary keys.                  &  4         & \prog{sqd\_uint32}\\
 396 \prog{nsecondary} & Number of secondary keys.                &  4         & \prog{sqd\_uint32}\\
 397 \prog{flen}       & Length of filenames (incl. '\verb+\0+')         &  4         & \prog{sqd\_uint32}\\
 398 \prog{plen}       & Length of primary key names (incl. '\verb+\0+') &  4         & \prog{sqd\_uint32}\\
 399 \prog{slen}       & Length of sec. key names (incl. '\verb+\0+')    &  4         & \prog{sqd\_uint32}\\
 400 \prog{frecsize}   & \# of bytes in a file record             &  4         & \prog{sqd\_uint32}\\
 401 \prog{precsize}   & \# of bytes in a primary key record      &  4         & \prog{sqd\_uint32}\\
 402 \prog{srecsize}   & \# of bytes in a sec. key record         &  4         & \prog{sqd\_uint32}\\
 403 \prog{foffset}    & disk offset, start of file records       &  \dag         & \dag\\
 404 \prog{poffset}    & disk offset, start of primary key recs   &  \dag         & \dag\\
 405 \prog{soffset}    & disk offset, start of sec. key records   &  \dag         & \dag\\
 406 \end{tabular}
 407 \vspace{1em}
 408
 409 The optional behavior flags are:
 410
 411 \vspace{1em}
 412 \begin{tabular}{lll}
 413 Flag             & Value& Note\\ \hline
 414 \prog{SSI\_USE64}         & $1 \ll 0$ & Large sequence files; all key offsets 64 bit.\\
 415 \prog{SSI\_USE64\_INDEX}  & $1 \ll 1$ & Large index; GSI file itself uses 64-bit offsets.\\\hline
 416 \end{tabular}
 417 \vspace{1em}
 418
 419 The optional behavior flags define whether the SSI file uses large
 420 file (64-bit) offsets.  This issue is discussed in greater detail
 421 below (see ``Large sequence files and large indices''). Briefly: if
 422 \prog{SSI\_USE64} is set, the sequence file is large, and all sequence
 423 file offsets are 64-bit integers.  If \prog{SSI\_USE64\_INDEX} is
 424 set, the index file itself is large, and \prog{foffset},
 425 \prog{poffset}, and \prog{soffset} (that is, all offsets within the
 426 index file itself, indicated as \dag\ in the above table) are 64-bit
 427 integers. \footnote{In the current API it is not expected that
 428 \prog{SSI\_USE64\_INDEX} would ever be set. The current index-writing
 429 API keeps the entire index in RAM (it has to sort the keys), and would
 430 presumably have to be modified or replaced to be able to generate very
 431 large indices.}
 432
 433 The reason to explicitly record various record sizes (\prog{frecsize},
 434 \prog{precsize}, \prog{srecsize}) and index file positions
 435 (\prog{foffset}, \prog{poffset}, \prog{soffset}) is to allow future
 436 extendibility. More fields might be added without breaking older SSI
 437 parsers.  The format is meant to be both forwards- and
 438 backwards-compatible.
 439
 440 \subsubsection{File section}
 441
 442 The file section consists of \prog{nfiles} file records. Each record
 443 is \prog{frecsize} bytes long, and contains:
 444
 445 \vspace{1em}
 446 \begin{tabular}{llrr}
 447 Variable & Description                                       & Bytes & Type \\\hline
 448 \prog{filename} & Name of file (possibly including full path)       & \prog{flen} & char *\\
 449 \prog{format}   & Format code for file; see squid.h for definitions & 4    & \prog{sqd\_uint32} \\
 450 \prog{flags}    & Optional behavior flags                           & 4    & \prog{sqd\_uint32} \\
 451 \prog{bpl}      & Bytes per sequence data line                      & 4    & \prog{sqd\_uint32} \\
 452 \prog{rpl}      & Residues per sequence data line                   & 4    & \prog{sqd\_uint32} \\\hline
 453 \end{tabular}
 454 \vspace{1em}
 455
 456 When a SSI file is written, \prog{frecsize} is equal to the sum of
 457 the sizes above.  When a SSI file is read by a parser, it is possible
 458 that \prog{frecsize} is larger than the parser expects, if the parser
 459 is expecting an older version of the SSI format: additional fields
 460 may be present, which increases \prog{frecsize}. The parser will only
 461 try to understand the data up to the \prog{frecsize} it expected to
 462 see, but still knows the absolutely correct \prog{frecsize} for
 463 purposes of skipping around in the index file.
 464
 465 Normally the SSI index resides in the same directory as the sequence
 466 data file(s), so \prog{filename} is relative to the location of the
 467 SSI index. In the event this is not true, \prog{filename} can contain
 468 a full path.
 469
 470 \prog{format} is a SQUID sequence file format code; e.g. something like
 471 \prog{SQFILE\_FASTA} or \prog{MSAFILE\_STOCKHOLM}. These constants are defined
 472 in \prog{squid.h}.
 473
 474 Only one possible optional behavior flag is defined:
 475
 476 \vspace{1em}
 477 \begin{tabular}{lll}
 478 Flag             & Value& Note\\ \hline
 479 \prog{SSI\_FAST\_SUBSEQ} & $1 \ll 0$ & Fast subseq retrieval is possible for this file.\\\hline
 480 \end{tabular}
 481 \vspace{1em}
 482
 483 When \prog{SSI\_FAST\_SUBSEQ} is set, \prog{bpl} and \prog{rpl} are
 484 nonzero. They can be used to calculate the offset of subsequence
 485 positions in the data file. This is described in the optional behavior
 486 section below.
 487
 488 \subsubsection{Primary key section}
 489
 490 The primary key section consists of \prog{nprimary} records. Each
 491 record is \prog{precsize} bytes long, and contains:
 492
 493 \vspace{1em}
 494 \begin{tabular}{llrr}
 495 Variable   & Description                                 & Bytes      & Type \\\hline
 496 \prog{key}         & Key name (seq name, identifier, accession) & \prog{plen}& char *\\
 497 \prog{fnum}       & File number (0..nfiles-1)                   & 2          & \prog{sqd\_uint16}\\
 498 \prog{offset1}    & Offset to start of record                   & \ddag      & \ddag \\
 499 \prog{offset2}    & Offset to start of sequence data            & \ddag      & \ddag \\
 500 \prog{len}        & Length of data (e.g. seq length, residues)  & 4          & \prog{sqd\_uint32} \\\hline
 501 \end{tabular}
 502 \vspace{1em}
 503
 504 The offsets are sequence file offsets (indicated by \ddag). They are
 505 4 bytes of type \prog{sqd\_uint32} normally, 8 bytes of type
 506 \prog{sqd\_uint32} if \prog{SSI\_USE64} is set, and \prog{sizeof(fpos\_t)}
 507 bytes of type \prog{fpos\_t} if \prog{SSI\_FPOS\_T} is set.
 508
 509 \prog{offset2} and \prog{len} are only meaningful if \prog{SSI\_FAST\_SUBSEQ}
 510 is set on this key's file. \prog{offset2} gives the absolute disk
 511 position of line 0 in the sequence data. \prog{len} is necessary for
 512 bounds checking in a subsequence retrieval, to be sure we don't try to
 513 reposition the disk outside the valid data.
 514
 515 \subsubsection{Secondary key section}
 516
 517 The secondary key section consists of \prog{nsecondary} records. Each
 518 record is \prog{srecsize} bytes long, and contains:
 519
 520 \vspace{1em}
 521 \begin{tabular}{llrr}
 522 Variable   & Description                                   & Bytes      & Type \\\hline
 523 \prog{key}   & Key name (seq name, identifier, accession)  & \prog{slen}& char *\\
 524 \prog{pkey}  & Primary key                                 &
 525 \prog{plen}& char *\\\hline
 526 \end{tabular}
 527 \vspace{1em}
 528
 529 All data are kept with the primary key records. Secondary keys are
 530 simply translated to primary keys, then the primary key has to be
 531 looked up.
 532
 533 \subsection{Optional behaviors}
 534
 535 \subsubsection{Large sequence files and large indices: 64-bit operation}
 536
 537 Normally a SSI index file can be no larger than 2 GB, and can index
 538 sequence files that are no larger than 2 GB each. This is due to
 539 limitations in the ANSI C/POSIX standards, which were developed for
 540 32-bit operating systems and filesystems. Most modern operating
 541 systems allow larger 64-bit file sizes, but as far as I'm aware (Dec
 542 2000), there are no standard interfaces yet for working with positions
 543 (offsets) in large files. On many platforms, SSI can extend to full
 544 64-bit capabilities, but on some platforms, it cannot. To understand
 545 the limitations (of SSI, and possibly of my understanding) you need
 546 to understand some details about what's happening behind the SSI API
 547 and how I understand C API's to modern 64-bit OS's and hardware.
 548
 549 First, some information on ANSI C APIs for file positioning. ANSI C
 550 provides the portable functions \prog{fseek()} and \prog{ftell()} for
 551 manipulating simple offsets in a file. They store the offset in a
 552 \prog{long} (which ranges up to 2 GB). The Standard says we're allowed
 553 to do arithmetic on this value if the file is binary. ANSI C also
 554 provides \prog{fgetpos()} and \prog{fsetpos()} which store file
 555 positions in an opaque data type called \prog{fpos\_t}. Modern
 556 operating systems with large file support define \prog{fpos\_t} in a
 557 way that permits files $>$2 GB. However, \prog{fpos\_t} is an opaque
 558 type. It has two disadvantages compared to a simple arithmetic type
 559 like \prog{long}: first, we're not allowed to do arithmetic on it, and
 560 second, we can't store it in a binary file in an
 561 architecture-independent manner. We need both features for SSI,
 562 unfortunately. \footnote{Surely the professional C community has the
 563 same problem; does \emph{everyone} hack around \prog{fpos\_t}?}
 564
 565 Therefore we have to rely on system dependent features. Most operating
 566 systems provide a non-compliant library call that returns an
 567 arithmetic offset. Fully 64-bit systems typically give us a 64-bit
 568 \prog{off\_t} and functions \prog{ftello()}/\prog{fseeko()} that work
 569 with that offset.  Many systems provide a ``transitional interface''
 570 where all normally named functions are 32-bits, but specially named
 571 64-bit varieties are available: e.g. \prog{off\_t} is 32 bits, but
 572 \prog{off64\_t} is 64 bits and we have functions \prog{ftello64()} and
 573 \prog{fseeko64()}. Some systems provide a \prog{ftell64()} and
 574 \prog{fseek64()} that work on offsets of type \prog{long long}. Many
 575 popular systems may even provide more than one of these models,
 576 depending on compiler flags.
 577
 578 And, unfortunately, some systems provide none of these models (FreeBSD
 579 for example). There, we will exploit the fact that most systems
 580 (including FreeBSD) do in fact implement \prog{fpos\_t} as a simple
 581 arithmetic type, such as an \prog{off\_t}, so we can misuse it.
 582
 583 At compile time, SQUID's \prog{./configure} script tests for the
 584 system's capabilities for 64-bit file offsets, and configures a
 585 section in the \prog{squidconf.h} file. (The configuration includes a
 586 custom autoconf macro, \prog{SQ\_ARITHMETIC\_FPOS\_T()}, to test
 587 \prog{fpos\_t} and define \prog{ARITHMETIC\_FPOS\_T} if it is.)  Four
 588 possible 64-bit models are tested in the following order; if one of
 589 them is possible, it will be used, and the constant
 590 \prog{HAS\_64BIT\_FILE\_OFFSETS} is set.
 591
 592 \begin{enumerate}
 593 \item has \prog{ftello()}, \prog{fseeko()}; sizeof(\prog{off\_t}) $= 8$.
 594 \item has \prog{ftello64()}, \prog{fseeko64()}; sizeof(\prog{off64\_t}) $= 8$.
 595 \item has \prog{ftell64()}, \prog{fseek64()}
 596 \item \prog{fpos\_t} is an arithmetic 64-bit type; (mis)use
 597 \prog{fgetpos()}, \prog{fsetpos()}.
 598 \end{enumerate}
 599
 600
 601
 602 \subsubsection{Fast subsequence retrieval}
 603
 604 In some files (notably vertebrate chromosome contigs) the size of each
 605 sequence is large. It may be slow to extract a subsequence by first
 606 reading the whole sequence into memory -- or even prohibitive, if the
 607 sequence is so large that it can't be stored in memory.
 608
 609 If the sequence data file is very consistently formatted so that each
 610 line in each record (except the last one) is of the same length, in
 611 both bytes and residues, we can determine a disk offset of the start
 612 of any subsequence by direct calculation.
 613 For example, a simple well-formatted FASTA
 614 file with 50 residues per line would have 51 bytes per sequence line
 615 (counting the '\verb+\0+') (\prog{bpl}=51, \prog{rpl}=50). Position $i$ in a sequence
 616 $1..L$ will be on line $l = (i-1)/\mbox{\prog{rpl}}$, and line $l$ starts at
 617 disk offset $l * \mbox{\prog{bpl}}$ relative to the start of the sequence
 618 data. If there are no nonsequence characters in the data line except
 619 the terminal '\verb+\0+' (which is true iff \prog{bpl} = \prog{rpl}+1 and 1 residue = 1
 620 byte), position $i$ can be precisely found:
 621
 622 \[
 623 \mbox{relative offset of residue $i$} =
 624 \left((i-1)/\mbox{\prog{rpl}}\right)*\mbox{\prog{bpl}} + (i-1) \% \mbox{ \prog{rpl}}
 625 \]
 626
 627 Even for sequence data lines with extra characters (e.g. spaces,
 628 coordinates, whatever), fast subsequence retrieval is possible; a
 629 parser can be positioned at the beginning of the appropriate line $l$,
 630 which starts at residue $(l*\mbox{\prog{rpl}}) + 1$, and it can start reading
 631 from there (e.g. the line that $i$ is on) rather than the beginning of
 632 the whole sequence record.
 633
 634 The program that creates the index is responsible for determining if
 635 \prog{bpl} and \prog{rpl} are consistent throughout a file; if so, it
 636 may set the \prog{SSI\_FAST\_SUBSEQ} flag for the file. Then any record
 637 whose primary key carries the optional data offset (\prog(offset2))
 638 and sequence length data is available for subsequence position
 639 calculations by \prog{SSIGetSubseqOffset()}.
 640
 641 \end{document}