1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
13 * Data structures used in HMMER.
14 * Also, a few miscellaneous macros and global variable declarations.
16 * RCS $Id: structs.h,v 1.1.1.1 2005/03/22 08:34:01 cmzmasek Exp $
19 #ifndef STRUCTSH_INCLUDED
20 #define STRUCTSH_INCLUDED
26 /* Miscellaneous math macros used in the package
28 #define sreLOG2(x) ((x) > 0 ? log(x) * 1.44269504 : -9999.)
29 #define sreEXP2(x) (exp((x) * 0.69314718 ))
30 #define SQR(x) ((x) * (x))
32 /* an idiom for determining a symbol's position in the array
33 * by pointer arithmetic.
34 * does no error checking, so caller must already be damned sure x is
35 * valid in the alphabet!
37 #define SYMIDX(x) (strchr(Alphabet, (x)) - Alphabet)
39 /* The symbol alphabet.
40 * Must deal with IUPAC degeneracies. Nondegenerate symbols
41 * come first in Alphabet[], followed by degenerate symbols.
42 * Nucleic alphabet also must deal with other common symbols
43 * like U (in RNA) and X (often misused for N).
45 * Nucleic: "ACGTUNRYMKSWHBVDX" size=4 iupac=17
46 * Amino: "ACDEFGHIKLMNPQRSTVWYBZX" size=20 iupac=23
48 * Parts of the code assume that the last symbol is a
49 * symbol for an unknown residue, i.e. 'X'.
51 * MAXCODE and MAXABET constants are defined in config.h
53 extern char Alphabet[MAXCODE]; /* "ACDEFGHIKLMNPQRSTVWYBZX" for example */
54 extern int Alphabet_type; /* hmmNUCLEIC or hmmAMINO */
55 extern int Alphabet_size; /* uniq alphabet size: 4 or 20 */
56 extern int Alphabet_iupac; /* total size of alphabet + IUPAC degen. */
57 extern char Degenerate[MAXCODE][MAXABET];
58 extern int DegenCount[MAXCODE];
59 #define hmmNOTSETYET 0
60 #define hmmNUCLEIC 2 /* compatibility with squid's kRNA */
61 #define hmmAMINO 3 /* compatibility with squid's kAmino */
63 /**********************************************************************
66 * Implementation of the new Plan7 HMM architecture.
67 * Fully probabilistic even for hmmsw, hmmls, and hmmfs;
68 * No insert->delete or delete->insert transitions;
69 * Improved structure layout.
71 * The strategy is to infiltrate plan7 code into HMMER in
72 * an evolutionary rather than revolutionary manner.
74 **********************************************************************/
76 /* Plan 7 construction strategies.
78 enum p7_construction {
79 P7_MAP_CONSTRUCTION, /* maximum a posteriori architecture */
80 P7_HAND_CONSTRUCTION, /* hand specified architecture */
81 P7_FAST_CONSTRUCTION /* fast ad hoc architecture */
84 /* Plan 7 parameter optimization strategies
87 P7_MAP_PARAM, /* standard maximum a posteriori */
88 P7_MD_PARAM, /* maximum discrimination */
89 P7_MRE_PARAM, /* maximum relative entropy */
90 P7_WMAP_PARAM /* ad hoc weighted MAP */
95 * Declaration of a Plan 7 profile-HMM.
98 /* Annotation on the model. A name is mandatory.
99 * Other fields are optional; whether they are present is
100 * flagged in the stateflags bit array.
102 * desc is only valid if PLAN7_DESC is set in flags.
103 * acc is only valid if PLAN7_ACC is set in flags.
104 * rf is only valid if PLAN7_RF is set in flags.
105 * cs is only valid if PLAN7_CS is set in flags.
106 * ca is only valid if PLAN7_CA is set in flags.
107 * map is only valid if PLAN7_MAP is set in flags.
109 char *name; /* name of the model +*/
110 char *acc; /* accession number of model (Pfam) +*/
111 char *desc; /* brief description of model +*/
112 char *rf; /* reference line from alignment 0..M +*/
113 char *cs; /* consensus structure line 0..M +*/
114 char *ca; /* consensus accessibility line 0..M */
115 char *comlog; /* command line(s) that built model +*/
116 int nseq; /* number of training sequences +*/
117 char *ctime; /* creation date +*/
118 int *map; /* map of alignment cols onto model 1..M+*/
119 int checksum; /* checksum of training sequences +*/
121 /* The following are annotations added to support work by Michael Asman,
122 * CGR Stockholm. They are not stored in model files; they are only
123 * used in model construction.
125 * #=GC X-PRM (PRT,PRI) annotation is picked up by hmmbuild and interpreted
126 * as specifying which mixture Dirichlet component to use. If these flags
127 * are non-NULL, the normal mixture Dirichlet code is bypassed, and a
128 * single specific Dirichlet is used at each position.
130 int *tpri; /* which transition mixture prior to use */
131 int *mpri; /* which match mixture prior to use */
132 int *ipri; /* which insert mixture prior to use */
134 /* Pfam-specific score cutoffs.
136 * ga1, ga2 are valid if PLAN7_GA is set in flags.
137 * tc1, tc2 are valid if PLAN7_TC is set in flags.
138 * nc1, nc2 are valid if PLAN7_NC is set in flags.
140 float ga1, ga2; /* per-seq/per-domain gathering thresholds (bits) +*/
141 float tc1, tc2; /* per-seq/per-domain trusted cutoff (bits) +*/
142 float nc1, nc2; /* per-seq/per-domain noise cutoff (bits) +*/
144 /* The main model in probability form: data-dependent probabilities.
145 * This is the core Krogh/Haussler model.
146 * Transition probabilities are usually accessed as a
147 * two-D array: hmm->t[k][TMM], for instance. They are allocated
148 * such that they can also be stepped through in 1D by pointer
149 * manipulations, for efficiency in DP algorithms.
151 int M; /* length of the model (# nodes) +*/
152 float **t; /* transition prob's. t[1..M-1][0..6] +*/
153 float **mat; /* match emissions. mat[1..M][0..19] +*/
154 float **ins; /* insert emissions. ins[1..M-1][0..19] +*/
155 float tbd1; /* B->D1 prob (data dependent) +*/
157 /* The unique states of Plan 7 in probability form.
158 * These are the algorithm-dependent, data-independent probabilities.
159 * Some parts of the code may briefly use a trick of copying tbd1
160 * into begin[0]; this makes it easy to call FChoose() or FNorm()
161 * on the resulting vector. However, in general begin[0] is not
164 float xt[4][2]; /* N,E,C,J extra states: 2 transitions +*/
165 float *begin; /* 1..M B->M state transitions +*/
166 float *end; /* 1..M M->E state transitions (!= a dist!) +*/
168 /* The null model probabilities.
170 float null[MAXABET]; /* "random sequence" emission prob's +*/
171 float p1; /* null model loop probability +*/
173 /* The model in log-odds score form.
174 * These are created from the probabilities by LogoddsifyHMM().
175 * By definition, null[] emission scores are all zero.
176 * Note that emission distributions are over 26 upper-case letters,
177 * not just the unambiguous protein or DNA alphabet: we
178 * precalculate the scores for all IUPAC degenerate symbols we
179 * may see. Non-IUPAC symbols simply have a -INFTY score.
180 * Note the reversed indexing on msc and isc -- for efficiency reasons.
182 * Only valid if PLAN7_HASBITS is set.
184 int **tsc; /* transition scores [1.M-1][0.6] -*/
185 int **msc; /* match emission scores [0.MAXCODE-1][1.M] -*/
186 int **isc; /* ins emission scores [0.MAXCODE-1][1.M-1] -*/
187 int xsc[4][2]; /* N,E,C,J transitions -*/
188 int *bsc; /* begin transitions [1.M] -*/
189 int *esc; /* end transitions [1.M] -*/
191 /* DNA translation scoring parameters
192 * For aligning protein Plan7 models to DNA sequence.
193 * Lookup value for a codon is calculated by pos1 * 16 + pos2 * 4 + pos3,
194 * where 'pos1' is the digitized value of the first nucleotide position;
195 * if any of the positions are ambiguous codes, lookup value 64 is used
196 * (which will generally have a score of zero)
198 * Only valid if PLAN7_HASDNA is set.
200 int **dnam; /* triplet match scores [0.64][1.M] -*/
201 int **dnai; /* triplet insert scores [0.64][1.M] -*/
202 int dna2; /* -1 frameshift, doublet emission, M or I -*/
203 int dna4; /* +1 frameshift, doublet emission, M or I -*/
205 /* P-value and E-value statistical parameters
206 * Only valid if PLAN7_STATS is set.
208 float mu; /* EVD mu +*/
209 float lambda; /* EVD lambda +*/
211 int flags; /* bit flags indicating state of HMM, valid data +*/
214 /* Flags for plan7->flags.
215 * Note: Some models have scores but no probabilities (for instance,
216 * after reading from an HMM save file). Other models have
217 * probabilities but no scores (for instance, during training
218 * or building). Since it costs time to convert either way,
219 * I use PLAN7_HASBITS and PLAN7_HASPROB flags to defer conversion
220 * until absolutely necessary. This means I have to be careful
221 * about keeping these flags set properly when I fiddle a model.
223 #define PLAN7_HASBITS (1<<0) /* raised if model has log-odds scores */
224 #define PLAN7_DESC (1<<1) /* raised if description exists */
225 #define PLAN7_RF (1<<2) /* raised if #RF annotation available */
226 #define PLAN7_CS (1<<3) /* raised if #CS annotation available */
227 #define PLAN7_XRAY (1<<4) /* raised if structural data available */
228 #define PLAN7_HASPROB (1<<5) /* raised if model has probabilities */
229 #define PLAN7_HASDNA (1<<6) /* raised if protein HMM->DNA seq params set*/
230 #define PLAN7_STATS (1<<7) /* raised if EVD parameters are available */
231 #define PLAN7_MAP (1<<8) /* raised if alignment map is available */
232 #define PLAN7_ACC (1<<9) /* raised if accession number is available */
233 #define PLAN7_GA (1<<10) /* raised if gathering thresholds available */
234 #define PLAN7_TC (1<<11) /* raised if trusted cutoffs available */
235 #define PLAN7_NC (1<<12) /* raised if noise cutoffs available */
236 #define PLAN7_CA (1<<13) /* raised if surface accessibility avail. */
238 /* Indices for special state types, I: used for dynamic programming xmx[][]
239 * mnemonic: eXtra Matrix for B state = XMB
247 /* Indices for special state types, II: used for hmm->xt[] indexing
248 * mnemonic: eXtra Transition for N state = XTN
255 /* Indices for Plan7 main model state transitions.
256 * Used for indexing hmm->t[k][]
257 * mnemonic: Transition from Match to Match = TMM
267 /* Indices for extra state transitions
268 * Used for indexing hmm->xt[][].
270 #define MOVE 0 /* trNB, trEC, trCT, trJB */
271 #define LOOP 1 /* trNN, trEJ, trCC, trJJ */
273 /* Declaration of Plan7 dynamic programming matrix structure.
276 int **xmx; /* special scores [0.1..N][BECJN] */
277 int **mmx; /* match scores [0.1..N][0.1..M] */
278 int **imx; /* insert scores [0.1..N][0.1..M-1.M] */
279 int **dmx; /* delete scores [0.1..N][0.1..M-1.M] */
282 /* Declaration of Plan7 shadow matrix structure.
283 * In general, allowed values are STM, STI, etc.
284 * However, E state has M possible sources, from 1..M match states;
285 * hence the esrc array.
288 char **xtb; /* special state traces [0.1..N][BECJN] */
289 char **mtb; /* match state traces [0.1..N][0.1..M] */
290 char **itb; /* insert state traces [0.1..N][0.1..M-1.M] */
291 char **dtb; /* delete state traces [0.1..N][0.1..M-1.M] */
292 int *esrc; /* E trace is special; must store a M state number 1..M */
295 /* Structure: HMMFILE
297 * Purpose: An open HMM file or HMM library. See hmmio.c.
300 FILE *f; /* pointer to file opened for reading */
301 SSIFILE *ssi; /* pointer to open SSI index, or NULL */
302 int (*parser)(struct hmmfile_s *, struct plan7_s **); /* parsing function */
303 int is_binary; /* TRUE if format is a binary one */
304 int byteswap; /* TRUE if binary and byteswapped */
306 /* Ewan (GeneWise) needs the input API to know the offset of each
307 * HMM on the disk, as it's being read. This might be enough
308 * support for him. hmmindex also uses this. Ewan, see
309 * HMMFilePositionByIndex() for an example of how to use this
310 * opaque offset type in the SSI API - the call you need
311 * is SSISetFilePosition().
313 int is_seekable; /* TRUE if we use offsets in this HMM file */
314 int mode; /* type of offset */
315 SSIOFFSET offset; /* Disk offset for beginning of the current HMM */
317 typedef struct hmmfile_s HMMFILE;
320 /* Plan 7 model state types
321 * used in traceback structure
335 /* Structure: p7trace_s
337 * Traceback structure for alignments of model to sequence.
338 * Each array in a trace_s is 0..tlen-1.
339 * Element 0 is always to STATE_S. Element tlen-1 is always to STATE_T.
342 int tlen; /* length of traceback */
343 char *statetype; /* state type used for alignment */
344 int *nodeidx; /* index of aligned node, 1..M (if M,D,I), or 0 */
345 int *pos; /* position in dsq, 1..L, or 0 if none */
348 /* Structure: p7prior_s
350 * Dirichlet priors on HMM parameters.
353 int strategy; /* PRI_DCHLET, etc. */
355 int tnum; /* number of transition Dirichlet mixtures */
356 float tq[MAXDCHLET]; /* probabilities of tnum components */
357 float t[MAXDCHLET][7]; /* transition terms per mix component */
359 int mnum; /* number of mat emission Dirichlet mixtures */
360 float mq[MAXDCHLET]; /* probabilities of mnum components */
361 float m[MAXDCHLET][MAXABET]; /* match emission terms per mix component */
363 int inum; /* number of insert emission Dirichlet mixes */
364 float iq[MAXDCHLET]; /* probabilities of inum components */
365 float i[MAXDCHLET][MAXABET]; /* insert emission terms */
367 #define PRI_DCHLET 0 /* simple or mixture Dirichlets */
368 #define PRI_PAM 1 /* PAM prior hack */
371 /**********************************************************************
372 * Other structures, not having to do with HMMs.
373 **********************************************************************/
375 /* Structure: histogram_s
377 * Keep a score histogram.
379 * The main implementation issue here is that the range of
380 * scores is unknown, and will go negative. histogram is
381 * a 0..max-min array that represents the range min..max.
382 * A given score is indexed in histogram array as score-min.
383 * The AddToHistogram() function deals with dynamically
384 * resizing the histogram array when necessary.
387 int *histogram; /* counts of hits */
388 int min; /* elem 0 of histogram == min */
389 int max; /* last elem of histogram == max */
390 int highscore; /* highest active elem has this score */
391 int lowscore; /* lowest active elem has this score */
392 int lumpsize; /* when resizing, overalloc by this */
393 int total; /* total # of hits counted */
395 float *expect; /* expected counts of hits */
396 int fit_type; /* flag indicating distribution type */
397 float param[3]; /* parameters used for fits */
398 float chisq; /* chi-squared val for goodness of fit*/
399 float chip; /* P value for chisquared */
401 #define HISTFIT_NONE 0 /* no fit done yet */
402 #define HISTFIT_EVD 1 /* fit type = extreme value dist */
403 #define HISTFIT_GAUSSIAN 2 /* fit type = Gaussian */
404 #define EVD_MU 0 /* EVD fit parameter mu */
405 #define EVD_LAMBDA 1 /* EVD fit parameter lambda */
406 #define EVD_WONKA 2 /* EVD fit fudge factor */
407 #define GAUSS_MEAN 0 /* Gaussian parameter mean */
408 #define GAUSS_SD 1 /* Gaussian parameter std. dev. */
410 /* Structure: fancyali_s
412 * Alignment of a hit to an HMM, for printing.
415 char *rfline; /* reference coord info */
416 char *csline; /* consensus structure info */
417 char *model; /* aligned query consensus sequence */
418 char *mline; /* "identities", conservation +'s, etc. */
419 char *aseq; /* aligned target sequence */
420 int len; /* length of strings */
421 char *query; /* name of query HMM */
422 char *target; /* name of target sequence */
423 int sqfrom; /* start position on sequence (1..L) */
424 int sqto; /* end position on sequence (1..L) */
429 * Info about a high-scoring database hit.
430 * We keep this info in memory, so we can output a
431 * sorted list of high hits at the end.
433 * sqfrom and sqto are the coordinates that will be shown
434 * in the results, not coords in arrays... therefore, reverse
435 * complements have sqfrom > sqto
438 double sortkey; /* number to sort by; big is better */
439 float score; /* score of the hit */
440 double pvalue; /* P-value of the hit */
441 float mothersc; /* score of whole sequence */
442 double motherp; /* P-value of whole sequence */
443 char *name; /* name of the target */
444 char *acc; /* accession of the target */
445 char *desc; /* description of the target */
446 int sqfrom; /* start position in seq (1..N) */
447 int sqto; /* end position in seq (1..N) */
448 int sqlen; /* length of sequence (N) */
449 int hmmfrom; /* start position in HMM (1..M) */
450 int hmmto; /* end position in HMM (1..M) */
451 int hmmlen; /* length of HMM (M) */
452 int domidx; /* index of this domain */
453 int ndom; /* total # of domains in this seq */
454 struct fancyali_s *ali; /* ptr to optional alignment info */
458 /* Structure: tophit_s
460 * Array of high scoring hits, suitable for efficient sorting
461 * when we prepare to output results. "hit" list is NULL and
462 * unavailable until after we do a sort.
465 struct hit_s **hit; /* array of ptrs to top scoring hits */
466 struct hit_s *unsrt; /* unsorted array */
467 int alloc; /* current allocation size */
468 int num; /* number of hits in list now */
469 int lump; /* allocation lumpsize */
472 /* struct threshold_s
473 * Contains score/evalue threshold settings.
475 * made first for hmmpfam:
476 * Since we're going to loop over all HMMs in a Pfam (or pfam-like)
477 * database in main_loop_{serial,pvm}, and we're going to
478 * allow autocutoffs using Pfam GA, NC, TC lines, we will need
479 * to reset those cutoffs with each HMM in turn. Therefore the
480 * main loops need to know whether they're supposed to be
481 * doing autocutoff. This amount of info was unwieldy enough
482 * to pass through the argument list that I put it
486 float globT; /* T parameter: keep only hits > globT bits */
487 double globE; /* E parameter: keep hits < globE E-value */
488 float domT; /* T parameter for individual domains */
489 double domE; /* E parameter for individual domains */
490 /* autosetting of cutoffs using Pfam annot: */
491 enum { CUT_NONE, CUT_GA, CUT_NC, CUT_TC } autocut;
492 int Z; /* nseq to base E value calculation on */
495 /**********************************************************
496 * PVM parallelization
497 **********************************************************/
502 #define HMMPVM_INIT 0 /* an initialization packet to all slaves */
503 #define HMMPVM_WORK 1 /* a work packet sent to a slave */
504 #define HMMPVM_RESULTS 2 /* a results packet sent back to master */
505 #define HMMPVM_TASK_TROUBLE 3 /* a notification of bad things in a slave task */
506 #define HMMPVM_HOST_TROUBLE 4 /* a notification of bad things in a PVM host */
511 #define HMMPVM_NO_HMMFILE 1
512 #define HMMPVM_NO_INDEX 2
513 #define HMMPVM_BAD_INIT 3 /* failed to initialize a slave somehow */
518 /**********************************************************
519 * Plan 9: obsolete HMMER1.x code. We still need these structures
520 * for reading old HMM files (e.g. backwards compatibility)
521 **********************************************************/
523 /* We define a "basic" state, which covers the basic match, insert, and
524 * delete states from the Haussler paper. Numbers are stored as
525 * pre-calculated negative logs.
528 float t[3]; /* state transitions to +1 M, +0 I, +1 D */
529 float p[MAXABET]; /* symbol emission probabilities */
532 /* A complete hidden Markov model
535 int M; /* length of the model */
536 struct basic_state *ins; /* insert states 0..M+1 */
537 struct basic_state *mat; /* match 0..M+1; 0 = BEGIN, M+1 = END */
538 struct basic_state *del; /* delete 0..M+1 */
540 float null[MAXABET]; /* the *suggested* null model */
542 /* Optional annotation on the HMM, taken from alignment
544 char *name; /* a name for the HMM */
545 char *ref; /* reference coords and annotation */
546 char *cs; /* consensus structure annotation */
547 float *xray; /* Structural annotation: xray[0..M+1][NINPUTS], indexed manually */
549 int flags; /* flags for what optional info is in HMM */
552 /* Flags for optional info in an HMM structure
554 #define HMM_REF (1<<0)
555 #define HMM_CS (1<<1)
556 #define HMM_XRAY (1<<2)
564 #endif /* STRUCTSH_INCLUDED */