1 /* Generated automatically from squid.h.in by configure. */
2 /*****************************************************************
3 * SQUID - a library of functions for biological sequence analysis
4 * Copyright (C) 1992-2002 Washington University School of Medicine
6 * This source code is freely distributed under the terms of the
7 * GNU General Public License. See the files COPYRIGHT and LICENSE
9 *****************************************************************/
11 #ifndef SQUIDH_INCLUDED
12 #define SQUIDH_INCLUDED
15 * Header file for my library of sequence functions.
17 * CVS $Id: squid.h.in,v 1.5 2002/10/09 14:26:09 eddy Exp)
23 #include <unistd.h> /* for sysconf() #define's */
27 #include <assert.h> /* for SQD_DASSERT1(), etc. */
30 /* include clustal's config.h */
35 #include "clustal-omega-config.h"
40 /* we don't want squidconf.h but our own config header. but, there are
41 * some checks, especially at the end of squidconf.h, that might be
42 * necessary for squid to work. They follow after the inclusion of
47 /* squidconf.h checks:
49 #if defined HAVE_NTOHL && defined HAVE_NTOHS && defined HAVE_HTONS && defined HAVE_HTONL
50 #define USE_HOST_BYTESWAP_FUNCTIONS 1
52 /* On 64-bit machines like Alphas, strtoull doesn't exist, strtoul will work
54 #if SIZEOF_UNSIGNED_LONG == 8 && defined HAVE_STRTOUL && ! defined HAVE_STRTOULL
55 #define strtoull strtoul
58 #if defined HAVE_FTELLO && defined HAVE_FSEEKO && SIZEOF_OFF_T == 8
59 #define HAS_64BIT_FILE_OFFSETS 1
60 #elif defined HAVE_FTELLO64 && defined HAVE_FSEEKO64 && SIZEOF_OFF64_T == 8
61 #define HAS_64BIT_FILE_OFFSETS 1
62 #elif defined HAVE_FTELL64 && defined HAVE_FSEEK64
63 #define HAS_64BIT_FILE_OFFSETS 1
64 #elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8
65 #define HAS_64BIT_FILE_OFFSETS 1
67 #undef HAS_64BIT_FILE_OFFSETS
70 /* The following check seems like nonsense to me (AW), therefore */
72 /* Stuff to work around Tru64 not having strtoull() -
73 * on systems with 64-bit longs, we can use strtoul()
76 #if ! defined HAVE_STRTOULL && SIZEOF_UNSIGNED_LONG == 8
77 #define strtoull strtoul
82 #include "squidconf.h" /* #define's generated by ./configure script */
85 /*****************************************************************
86 * Integers of guaranteed size. (used for instance in gsi.c, gsi2.c)
87 * These are set by the ./configure script; if they show up as FIXME,
88 * they must be manually edited to appropriate type definitions. You
89 * do need 64-bit integers in the current code; email me if this
90 * prevents you from compiling SQUID and tell me your system (I don't
91 * know of any systems that don't have 64-bit integers these days).
92 *****************************************************************/
/* Unsigned integer aliases used where a specific width is required.
 * As written they map directly onto the native short/int/long types, so
 * the advertised 16/32/64-bit widths hold only where those native types
 * have exactly those sizes.
 * NOTE(review): unsigned long is 64 bits wide on LP64 targets but not on
 * LLP64 or ILP32 ones -- confirm sqd_uint64 for every build target.
 */
93 typedef unsigned short sqd_uint16;
94 typedef unsigned int sqd_uint32;
95 typedef unsigned long sqd_uint64;
97 #ifdef USE_HOST_BYTESWAP_FUNCTIONS
98 #include <sys/types.h> /* only for ntohl() and friends. */
99 #include <netinet/in.h> /* only for ntohl() and friends. */
/* Byte-order conversion wrappers that forward to the host's
 * ntohs()/ntohl()/htons()/htonl().
 *
 * Each macro expands to a plain expression with no trailing semicolon,
 * so it can be nested inside a larger expression, passed as a function
 * argument, or used as the unbraced body of an if/else without leaving
 * a stray empty statement behind. Callers supply their own ';'.
 */
#define sre_ntoh16(x) ntohs(x)
#define sre_ntoh32(x) ntohl(x)
#define sre_hton16(x) htons(x)
#define sre_hton32(x) htonl(x)
104 #endif /* USE_HOST_BYTESWAP_FUNCTIONS */
106 /* Library version info is made available as a global to
107 * any interested program. These are defined in iupac.c
108 * with the other globals.
110 extern char squid_version[]; /* version number */
111 extern char squid_date[]; /* date of release */
112 extern int squid_errno; /* error codes */
116 /****************************************************
117 * Error codes returned by squid library functions (squid_errno)
118 ****************************************************/
120 #define SQERR_OK 0 /* no error */
121 #define SQERR_UNKNOWN 1 /* generic error, unidentified */
122 #define SQERR_NODATA 2 /* unexpectedly NULL stream */
123 #define SQERR_MEM 3 /* malloc or realloc failed */
124 #define SQERR_NOFILE 4 /* file not found */
125 #define SQERR_FORMAT 5 /* file format not recognized */
126 #define SQERR_PARAMETER 6 /* bad parameter passed to func */
127 #define SQERR_DIVZERO 7 /* error in sre_math.c */
128 #define SQERR_INCOMPAT 8 /* incompatible parameters */
129 #define SQERR_EOD 9 /* end-of-data (often normal) */
131 /****************************************************
132 * Single sequence information
133 ****************************************************/
134 #define SQINFO_NAMELEN 64
135 #define SQINFO_DESCLEN 128
138 int flags; /* what extra data are available */
139 char name[SQINFO_NAMELEN];/* up to 63 characters of name */
140 char id[SQINFO_NAMELEN]; /* up to 63 char of database identifier */
141 char acc[SQINFO_NAMELEN]; /* up to 63 char of database accession # */
142 char desc[SQINFO_DESCLEN];/* up to 127 char of description */
143 int len; /* length of this seq */
144 int start; /* (1..len) start position on source seq */
145 int stop; /* (1..len) end position on source seq */
146 int olen; /* original length of source seq */
147 int type; /* kRNA, kDNA, kAmino, or kOther */
148 char *ss; /* 0..len-1 secondary structure string */
149 char *sa; /* 0..len-1 % side chain surface access. */
151 typedef struct seqinfo_s SQINFO;
153 #define SQINFO_NAME (1 << 0)
154 #define SQINFO_ID (1 << 1)
155 #define SQINFO_ACC (1 << 2)
156 #define SQINFO_DESC (1 << 3)
157 #define SQINFO_START (1 << 4)
158 #define SQINFO_STOP (1 << 5)
159 #define SQINFO_LEN (1 << 6)
160 #define SQINFO_TYPE (1 << 7)
161 #define SQINFO_OLEN (1 << 8)
162 #define SQINFO_SS (1 << 9)
163 #define SQINFO_SA (1 << 10)
166 /****************************************************
167 * Sequence alphabet: see also iupac.c
168 ****************************************************/
169 /* IUPAC symbols defined globally in iupac.c */
171 char sym; /* character representation */
172 char symcomp; /* complement (regular char */
173 char code; /* my binary rep */
174 char comp; /* binary encoded complement */
176 extern struct iupactype iupac[];
177 #define IUPACSYMNUM 17
179 extern char *stdcode1[]; /* 1-letter amino acid translation code */
180 extern char *stdcode3[]; /* 3-letter amino acid translation code */
181 extern float dnafq[]; /* nucleotide occurrence frequencies */
182 extern float aafq[]; /* amino acid occurrence frequencies */
183 extern char aa_alphabet[]; /* amino acid alphabet */
184 extern int aa_index[]; /* convert 0..19 indices to 0..26 */
186 /* valid symbols in IUPAC code */
187 #define NUCLEOTIDES "ACGTUNRYMKSWHBVDacgtunrymkswhbvd"
188 #define AMINO_ALPHABET "ACDEFGHIKLMNPQRSTVWY"
189 #define DNA_ALPHABET "ACGT"
190 #define RNA_ALPHABET "ACGU"
191 #define WHITESPACE " \t\n"
/* isgap(c): true when c is one of the five characters this macro counts
 * as a gap: space, '.', '_', '-' or '~'.
 * The argument is expanded once per comparison, so pass an expression
 * that has no side effects.
 */
193 #define isgap(c) ((c) == ' ' || (c) == '.' || (c) == '_' || (c) == '-' || (c) == '~')
196 /****************************************************
197 * Sequence i/o: originally from Don Gilbert's readseq
198 ****************************************************/
199 #include "msa.h" /* for multiple sequence alignment support */
201 /* buffer size for reading in lines from sequence files*/
202 #define LINEBUFLEN 4096
204 /* sequence types parsed by Seqtype() */
205 /* note that these must match hmmAMINO and hmmNUCLEIC in HMMER */
206 #define kOtherSeq 0 /* hmmNOTSETYET */
208 #define kRNA 2 /* hmmNUCLEIC */
209 #define kAmino 3 /* hmmAMINO */
211 /* Unaligned sequence file formats recognized
212 * Coexists with definitions of multiple alignment formats in msa.h:
213 * >100 reserved for alignment formats
214 * <100 reserved for unaligned formats
215 * 0 reserved for unknown
217 * Some "legacy" formats are supported only when explicitly
218 * requested; not autodetected by SeqfileFormat().
220 * DON'T REASSIGN THESE CODES. They're written into
221 * GSI index files. You can use new ones, but reassigning
222 * the sense of old ones will break GSI indices.
223 * Alignment format codes were reassigned with the creation
224 * of msa.c, but before Stockholm format, there were no
225 * indexed alignment databases.
227 #define SQFILE_UNKNOWN 0 /* unknown format */
228 #define SQFILE_IG 1 /* Intelligenetics (!) */
229 #define SQFILE_GENBANK 2 /* GenBank flatfile */
230 /* 3 was A2M. Now an alignment format */
231 #define SQFILE_EMBL 4 /* EMBL or Swissprot flatfile */
232 #define SQFILE_GCG 5 /* GCG single sequence files */
233 #define SQFILE_STRIDER 6 /* MacStrider (!!) */
234 #define SQFILE_FASTA 7 /* FASTA format: default */
235 #define SQFILE_ZUKER 8 /* Zuker MFOLD format (legacy) */
236 #define SQFILE_IDRAW 9 /* Idraw-style PostScript (legacy) */
237 /* 10 was SELEX. Now alignment format */
238 /* 11 was MSF. Now alignment format */
239 #define SQFILE_PIR 12 /* PIR format */
240 #define SQFILE_RAW 13 /* raw sequence */
241 #define SQFILE_SQUID 14 /* my obsolete squid format */
242 /* 15 was kXPearson, extended FASTA; withdrawn */
243 #define SQFILE_GCGDATA 16 /* GCG data library file */
244 /* 17 was Clustal. Now alignment format*/
246 #define SQFILE_VIENNA 18 /* Vienna format: concatenated fasta */
/* IsUnalignedFormat(fmt): true when fmt is nonzero and less than 100.
 * As written, the test also accepts negative values of fmt.
 * fmt is expanded twice, so pass an expression without side effects.
 */
248 #define IsUnalignedFormat(fmt) ((fmt) && (fmt) < 100)
253 FILE *f; /* open file pointer */
254 char *fname; /* name of file; used for diagnostics */
255 int linenumber; /* what line are we on in the file */
257 char *buf; /* dynamically allocated sre_fgets() buffer */
258 int buflen; /* allocation length for buf */
260 int ssimode; /* SSI_OFFSET_I32 or SSI_OFFSET_I64 */
261 SSIOFFSET ssioffset; /* disk offset to last line read into buf */
262 SSIOFFSET r_off; /* offset to start of record */
263 SSIOFFSET d_off; /* offset to start of sequence data */
265 int rpl; /* residues per data line for this file; -1 if unset, 0 if invalid */
266 int lastrpl; /* rpl on last line seen */
267 int maxrpl; /* max rpl on any line of the file */
268 int bpl; /* bytes per data line; -1 if unset, 0 if invalid */
269 int lastbpl; /* bpl on last line seen */
270 int maxbpl; /* max bpl on any line of the file */
272 char *seq; /* growing sequence during parse */
273 SQINFO *sqinfo; /* name, id, etc, gathered during parse */
275 int seqlen; /* current sequence length */
276 int maxseq; /* current allocation length for seq */
278 int format; /* format of seqfile we're reading. */
279 int do_gzip; /* TRUE if f is a pipe from gzip -dc */
280 int do_stdin; /* TRUE if f is stdin */
282 /* An (important) hack for sequential access of multiple alignment files:
283 * we read the whole alignment in,
284 * and then copy it one sequence at a time into seq and sqinfo.
285 * It is active if msa is non NULL.
286 * msa->lastidx is reused/overloaded: used to keep track of what
287 * seq we'll return next.
288 * afp->format is the real format, while SQFILE->format is kMSA.
289 * Because we keep it in the SQFILE structure,
290 * ReadSeq() and friends are always reentrant for multiple seqfiles.
295 typedef struct ReadSeqVars SQFILE;
298 /****************************************************
299 * Cluster analysis and phylogenetic tree support
300 ****************************************************/
302 /* struct phylo_s - a phylogenetic tree
304 * For N sequences, there will generally be an array of 0..N-2
305 * phylo_s structures representing the nodes of a tree.
306 * [0] is the root. The indexes of left and
307 * right children are somewhat confusing so be careful. The
308 * indexes can have values of 0..2N-2. If they are 0..N-1, they
309 * represent pointers to individual sequences. If they are
310 * >= N, they represent pointers to a phylo_s structure
314 int parent; /* index of parent, N..2N-2, or -1 for root */
315 int left; /* index of one of the branches, 0..2N-2 */
316 int right; /* index of other branch, 0..2N-2 */
317 float diff; /* difference score between seqs */
318 float lblen; /* left branch length */
319 float rblen; /* right branch length */
320 char *is_in; /* 0..N-1 flag array, 1 if seq included */
321 int incnum; /* number of seqs included at this node */
325 /* Strategies for cluster analysis; cluster by mean distance,
326 * minimum distance, or maximum distance.
328 enum clust_strategy { CLUSTER_MEAN, CLUSTER_MAX, CLUSTER_MIN };
330 /****************************************************
331 * Generic data structure support
332 ****************************************************/
334 /* a struct intstack_s implements a pushdown stack for storing
339 struct intstack_s *nxt;
342 /****************************************************
343 * Binary nucleotide alphabet support
344 ****************************************************/
346 /* Binary encoding of the IUPAC code for nucleotides
348 * four-bit "word", permitting rapid degenerate matching
357 #define NTN 15 /* A|C|G|T */
358 #define NTR 10 /* A|G */
359 #define NTY 5 /* C|T */
360 #define NTM 12 /* A|C */
361 #define NTK 3 /* G|T */
362 #define NTS 6 /* C|G */
363 #define NTW 9 /* A|T */
364 #define NTH 13 /* A|C|T */
365 #define NTB 7 /* C|G|T */
366 #define NTV 14 /* A|C|G */
367 #define NTD 11 /* A|G|T */
368 #define NTGAP 16 /* GAP */
369 #define NTEND 0 /* null string terminator */
371 /* ntmatch(): bitwise comparison of two nuc's
372 * note that it's sensitive to the order;
373 * probe may be degenerate but target should not be
/* ntmatch(probe, target): nonzero when every bit set in target is also
 * set in probe, i.e. (probe & target) == target.
 *
 * Both parameters are parenthesized in the expansion so that compound
 * arguments such as ntmatch(a | b, t) group as the caller intended
 * instead of being re-associated by operator precedence.
 * target is expanded twice; avoid arguments with side effects.
 */
#define ntmatch(probe, target) (((probe) & (target)) == (target))
377 /****************************************************
378 * Support for a portable, flexible Getopt()
379 ****************************************************/
383 * Structure for declaring options to a main().
386 char *name; /* name of option, e.g. "--option1" or "-o" */
387 int single; /* TRUE if a single letter option */
388 int argtype; /* for typechecking, e.g. sqdARG_INT */
390 /* acceptable argtype's... */
391 #define sqdARG_NONE 0 /* no argument */
392 #define sqdARG_INT 1 /* something that atoi() can grok */
393 #define sqdARG_FLOAT 2 /* something that atof() can grok */
394 #define sqdARG_CHAR 3 /* require single character or digit */
395 #define sqdARG_STRING 4 /* anything goes */
397 /****************************************************
398 * Support for convenient Perl-y regexp matching
399 * See hsregexp.c for copyright notice: this code is derived
400 * from Henry Spencer's freely distributed regexp library.
401 ****************************************************/
404 typedef struct sqd_regexp {
405 char *startp[NSUBEXP];
407 char regstart; /* Internal use only. */
408 char reganch; /* Internal use only. */
409 char *regmust; /* Internal use only. */
410 int regmlen; /* Internal use only. */
411 char program[1]; /* Unwarranted chumminess with compiler. */
414 /* Strparse() defines and manages these.
415 * sqd_parse[0] contains the substring that matched the pattern.
416 * sqd_parse[1-9] contain substrings matched with ()'s.
418 extern char *sqd_parse[10];
420 /****************************************************
421 * Portable detection of multiprocessor # of CPUs.
422 * #include <unistd.h>
423 * long foo = SQD_NPROC;
424 * returns the number of available processors.
425 * if foo == -1, we failed.
426 ****************************************************/
428 /* Our problem here is that POSIX apparently doesn't specify
429 * a standard for how to get sysconf() to report the number of
430 * processors on-line. _SC_NPROCESSORS_ONLN is specified
431 * by SVR4.0MP. Thanks to W. Gish for help here.
434 #ifdef _SC_NPROCESSORS_ONLN /* Sun Solaris, Digital UNIX */
435 #define SQD_NPROC sysconf(_SC_NPROCESSORS_ONLN)
437 #ifdef _SC_NPROC_ONLN /* Silicon Graphics IRIX */
438 #define SQD_NPROC sysconf(_SC_NPROC_ONLN)
439 #else /* FreeBSD, Linux don't support getting ncpu via sysconf() */
444 /****************************************************
445 * Three levels of debugging printf's and assert's
446 * level 1: little impact on verbosity or performance
447 * level 2: moderate impact
448 * level 3: high impact
450 * SQD_DPRINTF3(("Matrix row %d col %d = %f\n", i, j, val));
451 * Note the double parentheses; these are important.
452 ****************************************************/
458 #if (DEBUGLEVEL >= 1)
459 #define SQD_DPRINTF1(x) printf x
460 #define SQD_DASSERT1(x) assert x
462 #define SQD_DPRINTF1(x)
463 #define SQD_DASSERT1(x)
465 #if (DEBUGLEVEL >= 2)
466 #define SQD_DPRINTF2(x) printf x
467 #define SQD_DASSERT2(x) assert x
469 #define SQD_DPRINTF2(x)
470 #define SQD_DASSERT2(x)
472 #if (DEBUGLEVEL >= 3)
473 #define SQD_DPRINTF3(x) printf x
474 #define SQD_DASSERT3(x) assert x
476 #define SQD_DPRINTF3(x)
477 #define SQD_DASSERT3(x)
480 /* PANIC is called for failures of Std C/POSIX functions,
481 * instead of my own functions. Panic() calls perror() and exits
484 #define PANIC Panic(__FILE__, __LINE__)
486 /* Malloc/realloc calls are wrapped
488 #define MallocOrDie(x) sre_malloc(__FILE__, __LINE__, (x))
489 #define ReallocOrDie(x,y) sre_realloc(__FILE__, __LINE__, (x), (y))
491 /****************************************************
492 * Miscellaneous macros and defines
493 ****************************************************/
495 #define SQDCONST_E 2.71828182845904523536028747135
496 #define SQDCONST_PI 3.14159265358979323846264338328
/* SWAP(a, b): exchange the values of a and b through a temporary named
 * swapfoo, which the caller must declare (with a compatible type) in the
 * enclosing scope before using the macro.
 *
 * The body is wrapped in do { } while (0) so that the expansion plus the
 * caller's trailing semicolon forms exactly one statement; this keeps
 * "if (cond) SWAP(x, y); else ..." well-formed. A trailing ';' is
 * therefore required at the call site.
 * Both arguments must be lvalues and each is expanded twice.
 */
#define SWAP(a,b) do { swapfoo = (b); (b) = (a); (a) = swapfoo; } while (0)
500 #define ScalarsEqual(a,b) (fabs((a)-(b)) < 1e-7)
503 #define MIN(a,b) (((a)<(b))?(a):(b))
506 #define MAX(a,b) (((a)>(b))?(a):(b))
509 /* For convenience and (one hopes) clarity in boolean tests:
518 /* Somewhere, there is a universe in which Unix vendors comply
519 * with the ANSI C standard. Unfortunately, it is not ours:
522 #define EXIT_SUCCESS 0
525 #define EXIT_FAILURE 1
528 #include "sqfuncs.h" /* squid function declarations */
529 #include "sre_random.h" /* random number generator and samplers */
530 #include "vectorops.h" /* vector operations */
531 #endif /* SQUIDH_INCLUDED */