1 /* @configure_input@ */
2 /*****************************************************************
3 * HMMER - Biological sequence analysis with profile HMMs
4 * Copyright (C) 1992-1999 Washington University School of Medicine
7 * This source code is distributed under the terms of the
8 * GNU General Public License. See the files COPYING and LICENSE
10 *****************************************************************/
12 #ifndef SQUIDH_INCLUDED
13 #define SQUIDH_INCLUDED
16 * Header file for my library of sequence functions.
18 * CVS $Id: squid.h.in,v 1.1.1.1 2005/03/22 08:34:25 cmzmasek Exp $
24 #include <unistd.h> /* for sysconf() #define's */
28 #include <assert.h> /* for SQD_DASSERT1(), etc. */
31 #include "squidconf.h" /* #define's generated by ./configure script */
33 /*****************************************************************
34 * Integers of guaranteed size. (used for instance in gsi.c, gsi2.c)
35 * These are set by the ./configure script; if they show up as FIXME,
36 * they must be manually edited to appropriate type definitions. You
37 * do need 64-bit integers in the current code; email me if this
38 * prevents you from compiling SQUID and tell me your system (I don't
39 * know of any systems that don't have 64-bit integers these days).
40 *****************************************************************/
41 typedef @SQD_UINT16@ sqd_uint16;
42 typedef @SQD_UINT32@ sqd_uint32;
43 typedef @SQD_UINT64@ sqd_uint64;
45 #ifdef USE_HOST_BYTESWAP_FUNCTIONS
46 #include <sys/types.h> /* only for ntohl() and friends. */
47 #include <netinet/in.h> /* only for ntohl() and friends. */
/* Host/network byte-order helpers, mapped directly onto the system's
 * ntohs()/ntohl()/htons()/htonl().
 *
 * The expansions deliberately carry no trailing semicolon, so each
 * macro is an ordinary expression: it can be nested, compared, or
 * passed as an argument. Callers supply their own statement
 * terminator, exactly as they would for a function call.
 */
#define sre_ntoh16(x) ntohs(x)
#define sre_ntoh32(x) ntohl(x)
#define sre_hton16(x) htons(x)
#define sre_hton32(x) htonl(x)
52 #endif /* USE_HOST_BYTESWAP_FUNCTIONS */
54 /* Library version info is made available as a global to
55 * any interested program. These are defined in iupac.c
56 * with the other globals.
58 extern char squid_version[]; /* version number */
59 extern char squid_date[]; /* date of release */
60 extern int squid_errno; /* error codes */
64 /****************************************************
65 * Error codes returned by squid library functions (squid_errno)
66 ****************************************************/
68 #define SQERR_OK 0 /* no error */
69 #define SQERR_UNKNOWN 1 /* generic error, unidentified */
70 #define SQERR_NODATA 2 /* unexpectedly NULL stream */
71 #define SQERR_MEM 3 /* malloc or realloc failed */
72 #define SQERR_NOFILE 4 /* file not found */
73 #define SQERR_FORMAT 5 /* file format not recognized */
74 #define SQERR_PARAMETER 6 /* bad parameter passed to func */
75 #define SQERR_DIVZERO 7 /* error in sre_math.c */
76 #define SQERR_INCOMPAT 8 /* incompatible parameters */
77 #define SQERR_EOD 9 /* end-of-data (often normal) */
79 /****************************************************
80 * Single sequence information
81 ****************************************************/
82 #define SQINFO_NAMELEN 64
83 #define SQINFO_DESCLEN 128
86 int flags; /* what extra data are available */
87 char name[SQINFO_NAMELEN];/* up to 63 characters of name */
88 char id[SQINFO_NAMELEN]; /* up to 63 char of database identifier */
89 char acc[SQINFO_NAMELEN]; /* up to 63 char of database accession # */
90 char desc[SQINFO_DESCLEN];/* up to 127 char of description */
91 int len; /* length of this seq */
92 int start; /* (1..len) start position on source seq */
93 int stop; /* (1..len) end position on source seq */
94 int olen; /* original length of source seq */
95 int type; /* kRNA, kDNA, kAmino, or kOther */
96 char *ss; /* 0..len-1 secondary structure string */
97 char *sa; /* 0..len-1 % side chain surface access. */
99 typedef struct seqinfo_s SQINFO;
101 #define SQINFO_NAME (1 << 0)
102 #define SQINFO_ID (1 << 1)
103 #define SQINFO_ACC (1 << 2)
104 #define SQINFO_DESC (1 << 3)
105 #define SQINFO_START (1 << 4)
106 #define SQINFO_STOP (1 << 5)
107 #define SQINFO_LEN (1 << 6)
108 #define SQINFO_TYPE (1 << 7)
109 #define SQINFO_OLEN (1 << 8)
110 #define SQINFO_SS (1 << 9)
111 #define SQINFO_SA (1 << 10)
114 /****************************************************
115 * Sequence alphabet: see also iupac.c
116 ****************************************************/
117 /* IUPAC symbols defined globally in iupac.c */
119 char sym; /* character representation of the symbol */
120 char symcomp; /* complement of sym, as a regular char */
121 char code; /* binary representation of sym */
122 char comp; /* binary-encoded complement */
124 extern struct iupactype iupac[];
125 #define IUPACSYMNUM 17
127 extern char *stdcode1[]; /* 1-letter amino acid translation code */
128 extern char *stdcode3[]; /* 3-letter amino acid translation code */
129 extern float dnafq[]; /* nucleotide occurrence frequencies */
130 extern float aafq[]; /* amino acid occurrence frequencies */
131 extern char aa_alphabet[]; /* amino acid alphabet */
132 extern int aa_index[]; /* convert 0..19 indices to 0..26 */
134 /* valid symbols in IUPAC code */
135 #define NUCLEOTIDES "ACGTUNRYMKSWHBVDacgtunrymkswhbvd"
136 #define AMINO_ALPHABET "ACDEFGHIKLMNPQRSTVWY"
137 #define DNA_ALPHABET "ACGT"
138 #define RNA_ALPHABET "ACGU"
139 #define WHITESPACE " \t\n"
/* isgap(c): nonzero if character c is one of the gap symbols
 * ' ', '.', '_', '-' or '~'.
 * The argument is expanded up to five times, so do not pass an
 * expression with side effects (e.g. *s++).
 */
141 #define isgap(c) ((c) == ' ' || (c) == '.' || (c) == '_' || (c) == '-' || (c) == '~')
144 /****************************************************
145 * Sequence i/o: originally from Don Gilbert's readseq
146 ****************************************************/
147 #include "msa.h" /* for multiple sequence alignment support */
149 /* buffer size for reading in lines from sequence files*/
150 #define LINEBUFLEN 4096
152 /* sequence types parsed by Seqtype() */
153 /* note that these must match hmmAMINO and hmmNUCLEIC in HMMER */
154 #define kOtherSeq 0 /* hmmNOTSETYET */
156 #define kRNA 2 /* hmmNUCLEIC */
157 #define kAmino 3 /* hmmAMINO */
159 /* Unaligned sequence file formats recognized
160 * Coexists with definitions of multiple alignment formats in msa.h:
161 * >100 reserved for alignment formats
162 * <100 reserved for unaligned formats
163 * 0 reserved for unknown
165 * Some "legacy" formats are supported only when explicitly
166 * requested; not autodetected by SeqfileFormat().
168 * DON'T REASSIGN THESE CODES. They're written into
169 * GSI index files. You can use new ones, but reassigning
170 * the sense of old ones will break GSI indices.
171 * Alignment format codes were reassigned with the creation
172 * of msa.c, but before Stockholm format, there were no
173 * indexed alignment databases.
175 #define SQFILE_UNKNOWN 0 /* unknown format */
176 #define SQFILE_IG 1 /* Intelligenetics (!) */
177 #define SQFILE_GENBANK 2 /* GenBank flatfile */
178 /* 3 was A2M. Now an alignment format */
179 #define SQFILE_EMBL 4 /* EMBL or Swissprot flatfile */
180 #define SQFILE_GCG 5 /* GCG single sequence files */
181 #define SQFILE_STRIDER 6 /* MacStrider (!!) */
182 #define SQFILE_FASTA 7 /* FASTA format: default */
183 #define SQFILE_ZUKER 8 /* Zuker MFOLD format (legacy) */
184 #define SQFILE_IDRAW 9 /* Idraw-style PostScript (legacy) */
185 /* 10 was SELEX. Now alignment format */
186 /* 11 was MSF. Now alignment format */
187 #define SQFILE_PIR 12 /* PIR format */
188 #define SQFILE_RAW 13 /* raw sequence */
189 #define SQFILE_SQUID 14 /* my obsolete squid format */
190 /* 15 was kXPearson, extended FASTA; withdrawn */
191 #define SQFILE_GCGDATA 16 /* GCG data library file */
192 /* 17 was Clustal. Now alignment format*/
/* IsUnalignedFormat(fmt): true when fmt is nonzero (0 is the
 * "unknown" code) and below 100 (the range reserved for unaligned
 * formats).
 * fmt is expanded twice, so avoid arguments with side effects.
 * Note that the test has no lower bound: a negative fmt would also
 * be reported as an unaligned format.
 */
194 #define IsUnalignedFormat(fmt) ((fmt) && (fmt) < 100)
199 FILE *f; /* open file pointer */
200 char *fname; /* name of file; used for diagnostics */
201 int linenumber; /* what line are we on in the file */
203 char *buf; /* dynamically allocated sre_fgets() buffer */
204 int buflen; /* allocation length for buf */
206 int ssimode; /* SSI_OFFSET_I32 or SSI_OFFSET_I64 */
207 SSIOFFSET ssioffset; /* disk offset to last line read into buf */
208 SSIOFFSET r_off; /* offset to start of record */
209 SSIOFFSET d_off; /* offset to start of sequence data */
211 int rpl; /* residues per data line for this file; -1 if unset, 0 if invalid */
212 int lastrpl; /* rpl on last line seen */
213 int maxrpl; /* max rpl on any line of the file */
214 int bpl; /* bytes per data line; -1 if unset, 0 if invalid */
215 int lastbpl; /* bpl on last line seen */
216 int maxbpl; /* max bpl on any line of the file */
218 char *seq; /* growing sequence during parse */
219 SQINFO *sqinfo; /* name, id, etc, gathered during parse */
221 int seqlen; /* current sequence length */
222 int maxseq; /* current allocation length for seq */
224 int format; /* format of seqfile we're reading. */
225 int do_gzip; /* TRUE if f is a pipe from gzip -dc */
226 int do_stdin; /* TRUE if f is stdin */
228 /* An (important) hack for sequential access of multiple alignment files:
229 * we read the whole alignment in,
230 * and then copy it one sequence at a time into seq and sqinfo.
231 * It is active if msa is non NULL.
232 * msa->lastidx is reused/overloaded: used to keep track of what
233 * seq we'll return next.
234 * afp->format is the real format, while SQFILE->format is kMSA.
235 * Because we keep it in the SQFILE structure,
236 * ReadSeq() and friends are always reentrant for multiple seqfiles.
241 typedef struct ReadSeqVars SQFILE;
244 /****************************************************
245 * Cluster analysis and phylogenetic tree support
246 ****************************************************/
248 /* struct phylo_s - a phylogenetic tree
250 * For N sequences, there will generally be an array of 0..N-2
251 * phylo_s structures representing the nodes of a tree.
252 * [0] is the root. The indexes of left and
253 * right children are somewhat confusing so be careful. The
254 * indexes can have values of 0..2N-2. If they are 0..N-1, they
255 * represent pointers to individual sequences. If they are
256 * >= N, they represent pointers to a phylo_s structure
260 int parent; /* index of parent, N..2N-2, or -1 for root */
261 int left; /* index of one of the branches, 0..2N-2 */
262 int right; /* index of other branch, 0..2N-2 */
263 float diff; /* difference score between seqs */
264 float lblen; /* left branch length */
265 float rblen; /* right branch length */
266 char *is_in; /* 0..N-1 flag array, 1 if seq included */
267 int incnum; /* number of seqs included at this node */
271 /* Strategies for cluster analysis; cluster by mean distance,
272 * minimum distance, or maximum distance.
274 enum clust_strategy { CLUSTER_MEAN, CLUSTER_MAX, CLUSTER_MIN };
276 /****************************************************
277 * Generic data structure support
278 ****************************************************/
280 /* a struct intstack_s implements a pushdown stack for storing
285 struct intstack_s *nxt;
288 /****************************************************
289 * Binary nucleotide alphabet support
290 ****************************************************/
292 /* Binary encoding of the IUPAC code for nucleotides
294 * four-bit "word", permitting rapid degenerate matching
303 #define NTN 15 /* A|C|G|T */
304 #define NTR 10 /* A|G */
305 #define NTY 5 /* C|T */
306 #define NTM 12 /* A|C */
307 #define NTK 3 /* G|T */
308 #define NTS 6 /* C|G */
309 #define NTW 9 /* A|T */
310 #define NTH 13 /* A|C|T */
311 #define NTB 7 /* C|G|T */
312 #define NTV 14 /* A|C|G */
313 #define NTD 11 /* A|G|T */
314 #define NTGAP 16 /* GAP */
315 #define NTEND 0 /* null string terminator */
317 /* ntmatch(): bitwise comparison of two nuc's
318 * note that it's sensitive to the order;
319 * probe may be degenerate but target should not be
/* ntmatch(probe, target): nonzero when every bit set in target is
 * also set in probe.
 * Both arguments are fully parenthesized so that compound
 * expressions (for example two codes OR'ed together) are masked and
 * compared as a whole rather than being split by operator
 * precedence. target is expanded twice, so avoid arguments with
 * side effects.
 */
#define ntmatch(probe, target) (((probe) & (target)) == (target))
323 /****************************************************
324 * Support for a portable, flexible Getopt()
325 ****************************************************/
329 * Structure for declaring options to a main().
332 char *name; /* name of option, e.g. "--option1" or "-o" */
333 int single; /* TRUE if a single letter option */
334 int argtype; /* for typechecking, e.g. sqdARG_INT */
336 /* acceptable argtype's... */
337 #define sqdARG_NONE 0 /* no argument */
338 #define sqdARG_INT 1 /* something that atoi() can grok */
339 #define sqdARG_FLOAT 2 /* something that atof() can grok */
340 #define sqdARG_CHAR 3 /* require single character or digit */
341 #define sqdARG_STRING 4 /* anything goes */
343 /****************************************************
344 * Support for convenient Perl-y regexp matching
345 * See hsregexp.c for copyright notice: this code is derived
346 * from Henry Spencer's freely distributed regexp library.
347 ****************************************************/
350 typedef struct sqd_regexp {
351 char *startp[NSUBEXP];
353 char regstart; /* Internal use only. */
354 char reganch; /* Internal use only. */
355 char *regmust; /* Internal use only. */
356 int regmlen; /* Internal use only. */
357 char program[1]; /* Unwarranted chumminess with compiler. */
360 /* Strparse() defines and manages these.
361 * sqd_parse[0] contains the substring that matched the pattern.
362 * sqd_parse[1-9] contain substrings matched with ()'s.
364 extern char *sqd_parse[10];
366 /****************************************************
367 * Portable detection of multiprocessor # of CPUs.
368 * #include <unistd.h>
369 * long foo = SQD_NPROC;
370 * returns the number of available processors.
371 * if foo == -1, we failed.
372 ****************************************************/
374 /* Our problem here is that POSIX apparently doesn't specify
375 * a standard for how to get sysconf() to report the number of
376 * processors on-line. _SC_NPROCESSORS_ONLN is specified
377 * by SVR4.0MP. Thanks to W. Gish for help here.
380 #ifdef _SC_NPROCESSORS_ONLN /* Sun Solaris, Digital UNIX */
381 #define SQD_NPROC sysconf(_SC_NPROCESSORS_ONLN)
383 #ifdef _SC_NPROC_ONLN /* Silicon Graphics IRIX */
384 #define SQD_NPROC sysconf(_SC_NPROC_ONLN)
385 #else /* FreeBSD, Linux don't support getting ncpu via sysconf() */
390 /****************************************************
391 * Three levels of debugging printf's and assert's
392 * level 1: little impact on verbosity or performance
393 * level 2: moderate impact
394 * level 3: high impact
396 * SQD_DPRINTF3(("Matrix row %d col %d = %f\n", i, j, val));
397 * Note the double parentheses; these are important.
398 ****************************************************/
404 #if (DEBUGLEVEL >= 1)
405 #define SQD_DPRINTF1(x) printf x
406 #define SQD_DASSERT1(x) assert x
408 #define SQD_DPRINTF1(x)
409 #define SQD_DASSERT1(x)
411 #if (DEBUGLEVEL >= 2)
412 #define SQD_DPRINTF2(x) printf x
413 #define SQD_DASSERT2(x) assert x
415 #define SQD_DPRINTF2(x)
416 #define SQD_DASSERT2(x)
418 #if (DEBUGLEVEL >= 3)
419 #define SQD_DPRINTF3(x) printf x
420 #define SQD_DASSERT3(x) assert x
422 #define SQD_DPRINTF3(x)
423 #define SQD_DASSERT3(x)
426 /* PANIC is called for failures of Std C/POSIX functions,
427 * instead of my own functions. Panic() calls perror() and exits
430 #define PANIC Panic(__FILE__, __LINE__)
432 /* Malloc/realloc calls are wrapped
434 #define MallocOrDie(x) sre_malloc(__FILE__, __LINE__, (x))
435 #define ReallocOrDie(x,y) sre_realloc(__FILE__, __LINE__, (x), (y))
437 /****************************************************
438 * Miscellaneous macros and defines
439 ****************************************************/
/* CHOOSE(a): multiplies the result of sre_random() by a and
 * truncates the product to int.
 * NOTE(review): presumably sre_random() yields a value in [0,1), so
 * the result is an index 0..a-1 -- confirm against sre_random().
 */
441 #define CHOOSE(a) ((int) (sre_random() * (a)))
442 /* must declare swapfoo to use SWAP() */
/* SWAP(a,b): exchanges a and b through the caller-declared temporary
 * swapfoo. Both arguments must be assignable lvalues and are not
 * parenthesized in the expansion. The macro expands to a bare brace
 * block, not a single statement: writing "SWAP(a,b);" as the body of
 * an unbraced if that is followed by else will not compile, so brace
 * such call sites.
 */
443 #define SWAP(a,b) {swapfoo = b; b = a; a = swapfoo;}
/* ScalarsEqual(a,b): true when a and b differ by less than an
 * absolute tolerance of 1e-7. Relies on fabs(), so <math.h> must be
 * in scope at the point of use.
 */
444 #define ScalarsEqual(a,b) (fabs((a)-(b)) < 1e-7)
/* MIN(a,b): yields the smaller of a and b (b when they compare
 * equal). The selected argument is evaluated twice, so do not pass
 * expressions with side effects.
 */
447 #define MIN(a,b) (((a)<(b))?(a):(b))
/* MAX(a,b): yields the larger of a and b (b when they compare
 * equal). The selected argument is evaluated twice, so do not pass
 * expressions with side effects.
 */
450 #define MAX(a,b) (((a)>(b))?(a):(b))
453 /* For convenience and (one hopes) clarity in boolean tests:
462 /* Somewhere, there is a universe in which Unix vendors comply
463 * with the ANSI C standard. Unfortunately, it is not ours:
466 #define EXIT_SUCCESS 0
469 #define EXIT_FAILURE 1
472 #include "sqfuncs.h" /* squid function declarations */
473 #endif /* SQUIDH_INCLUDED */