1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * Configuration of the global symbol alphabet information.
13 * RCS $Id: alphabet.c,v 1.1.1.1 2005/03/22 08:34:08 cmzmasek Exp $
21 #endif /* HMMER_THREADS */
32 static void set_degenerate(char iupac, char *syms);
35 /* Function: DetermineAlphabet()
37 * Purpose: From a set of sequences (raw or aligned), make a good
38 * guess whether they're Nucleic, Amino, or something
39 * else, and set alphabet accordingly.
41 * If Alphabet_type is already set, that means our
42 * autodetection was overridden from the command line,
43 * and we just set the other globals accordingly.
46 DetermineAlphabet(char **rseqs, int nseq)
49 int other, nucleic, amino;
52 /* Autodetection of alphabet type.
55 other = nucleic = amino = 0;
56 for (idx = 0; idx < nseq; idx++) {
57 switch (Seqtype(rseqs[idx])) {
58 case kRNA: nucleic++; break;
59 case kDNA: nucleic++; break;
60 case kAmino: amino++; break;
61 case kOtherSeq: other++; break;
62 default: Die("No such alphabet type");
66 if (nucleic == nseq) type = hmmNUCLEIC;
67 else if (amino == nseq) type = hmmAMINO;
68 else if (nucleic > amino && nucleic > other) {
69 Warn("Looks like nucleic acid sequence, hope that's right");
72 else if (amino > nucleic && amino > other) {
73 Warn("Looks like amino acid sequence, hope that's right");
76 else Die("Sorry, I can't tell if that's protein or DNA");
78 /* Now set up the alphabet.
84 /* Function: SetAlphabet()
86 * Purpose: Set the alphabet globals, given an alphabet type
87 * of either hmmAMINO or hmmNUCLEIC.
94 pthread_mutex_t alphabet_lock; /* alphabet is global; must protect to be threadsafe */
95 int rtn; /* return code from pthreads */
97 if ((rtn = pthread_mutex_init(&alphabet_lock, NULL)) != 0)
98 Die("pthread_mutex_init FAILED; %s\n", strerror(rtn));
99 if ((rtn = pthread_mutex_lock(&alphabet_lock)) != 0)
100 Die("pthread_mutex_lock FAILED: %s\n", strerror(rtn));
103 /* Because the alphabet information is global, we must
104 * be careful to make this a thread-safe function. The mutex
105 * (above) takes care of that. But, indeed, it's also
106 * just good sense (and more efficient) to simply never
107 * allow resetting the alphabet. If type is Alphabet_type,
108 * silently return; else die with an alphabet mismatch
111 if (Alphabet_type != hmmNOTSETYET)
113 if (type != Alphabet_type)
114 Die("An alphabet type conflict occurred.\nYou probably mixed a DNA seq file with a protein model, or vice versa.");
117 if ((rtn = pthread_mutex_unlock(&alphabet_lock)) != 0)
118 Die("pthread_mutex_unlock failure: %s\n", strerror(rtn));
123 switch(type) { /* Alphabet is not a string - careful! */
125 Alphabet_type = type;
126 strncpy(Alphabet, "ACDEFGHIKLMNPQRSTVWYBZX", 23);
129 for (x = 0; x < Alphabet_iupac; x++) {
130 memset(Degenerate[x], 0, Alphabet_size);
132 for (x = 0; x < Alphabet_size; x++) {
133 Degenerate[x][x] = 1;
136 set_degenerate('B', "ND");
137 set_degenerate('Z', "QE");
138 set_degenerate('X', "ACDEFGHIKLMNPQRSTVWY");
141 Alphabet_type = type;
142 strncpy(Alphabet, "ACGTUNRYMKSWHBVDX", 17);
145 for (x = 0; x < Alphabet_iupac; x++) {
146 memset(Degenerate[x], 0, Alphabet_size);
148 for (x = 0; x < Alphabet_size; x++) {
149 Degenerate[x][x] = 1;
152 set_degenerate('U', "T");
153 set_degenerate('N', "ACGT");
154 set_degenerate('X', "ACGT");
155 set_degenerate('R', "AG");
156 set_degenerate('Y', "CT");
157 set_degenerate('M', "AC");
158 set_degenerate('K', "GT");
159 set_degenerate('S', "CG");
160 set_degenerate('W', "AT");
161 set_degenerate('H', "ACT");
162 set_degenerate('B', "CGT");
163 set_degenerate('V', "ACG");
164 set_degenerate('D', "AGT");
166 default: Die("No support for non-nucleic or protein alphabets");
170 if ((rtn = pthread_mutex_unlock(&alphabet_lock)) != 0)
171 Die("pthread_mutex_unlock failure: %s\n", strerror(rtn));
175 /* Function: SymbolIndex()
177 * Purpose: Convert a symbol to its index in Alphabet[].
178 * Bogus characters are converted to 'X'.
179 * More robust than the SYMIDX() macro but
183 SymbolIndex(char sym)
186 return ((s = strchr(Alphabet, (char) toupper((int) sym))) == NULL) ?
187 Alphabet_iupac-1 : s - Alphabet;
191 /* Function: DigitizeSequence()
193 * Purpose: Internal representation of a sequence in HMMER is
194 * as a char array. 1..L are the indices
195 * of seq symbols in Alphabet[]. 0,L+1 are sentinel
196 * bytes, set to be Alphabet_iupac -- i.e. one more
197 * than the maximum allowed index.
199 * Assumes that 'X', the fully degenerate character,
200 * is the last character in the allowed alphabet.
202 * Args: seq - sequence to be digitized (0..L-1)
203 * L - length of sequence
205 * Return: digitized sequence, dsq.
206 * dsq is allocated here and must be free'd by caller.
209 DigitizeSequence(char *seq, int L)
214 dsq = MallocOrDie (sizeof(char) * (L+2));
215 dsq[0] = dsq[L+1] = (char) Alphabet_iupac;
216 for (i = 1; i <= L; i++)
217 dsq[i] = SymbolIndex(seq[i-1]);
222 /* Function: DedigitizeSequence()
223 * Date: SRE, Tue Dec 16 10:39:19 1997 [StL]
225 * Purpose: Returns a 0..L-1 character string, converting the
226 * dsq back to the real alphabet.
229 DedigitizeSequence(char *dsq, int L)
234 seq = MallocOrDie(sizeof(char) * (L+1));
235 for (i = 0; i < L; i++)
236 seq[i] = Alphabet[(int) dsq[i+1]];
242 /* Function: DigitizeAlignment()
244 * Purpose: Given an alignment, return digitized unaligned
245 * sequence array. (Tracebacks are always relative
246 * to digitized unaligned seqs, even if they are
247 * faked from an existing alignment in modelmakers.c.)
249 * Args: msa - alignment to digitize
250 * ret_dsqs - RETURN: array of digitized unaligned sequences
253 * dsqs is alloced here. Free2DArray(dseqs, nseq).
256 DigitizeAlignment(MSA *msa, char ***ret_dsqs)
259 int idx; /* counter for sequences */
260 int dpos; /* position in digitized seq */
261 int apos; /* position in aligned seq */
263 dsq = (char **) MallocOrDie (sizeof(char *) * msa->nseq);
264 for (idx = 0; idx < msa->nseq; idx++) {
265 dsq[idx] = (char *) MallocOrDie (sizeof(char) * (msa->alen+2));
267 dsq[idx][0] = (char) Alphabet_iupac; /* sentinel byte at start */
269 for (apos = 0, dpos = 1; apos < msa->alen; apos++) {
270 if (! isgap(msa->aseq[idx][apos])) /* skip gaps */
271 dsq[idx][dpos++] = SymbolIndex(msa->aseq[idx][apos]);
273 dsq[idx][dpos] = (char) Alphabet_iupac; /* sentinel byte at end */
279 /* Function: P7CountSymbol()
281 * Purpose: Given a possibly degenerate symbol code, increment
282 * a symbol counter array (generally an emission
283 * probability vector in counts form) appropriately.
285 * Args: counters: vector to count into. [0..Alphabet_size-1]
286 * symidx: symbol index to count: [0..Alphabet_iupac-1]
287 * wt: weight to use for the count; often 1.0
292 P7CountSymbol(float *counters, char symidx, float wt)
296 if (symidx < Alphabet_size)
297 counters[(int) symidx] += wt;
299 for (x = 0; x < Alphabet_size; x++) {
300 if (Degenerate[(int) symidx][x])
301 counters[x] += wt / (float) DegenCount[(int) symidx];
306 /* Function: DefaultGeneticCode()
308 * Purpose: Configure aacode, mapping triplets to amino acids.
309 * Triplet index: AAA = 0, AAC = 1, ... UUU = 63.
310 * AA index: alphabetical: A=0,C=1... Y=19
312 * Uses the stdcode1[] global translation table from SQUID.
314 * Args: aacode - preallocated 0.63 array for genetic code
319 DefaultGeneticCode(int *aacode)
323 for (x = 0; x < 64; x++) {
324 if (*(stdcode1[x]) == '*') aacode[x] = -1;
325 else aacode[x] = SYMIDX(*(stdcode1[x]));
330 /* Function: DefaultCodonBias()
332 * Purpose: Configure a codonbias table, mapping triplets to
333 * probability of using the triplet for the amino acid
334 * it represents: P(triplet | aa).
335 * The default is to assume codons are used equiprobably.
337 * Args: codebias: 0..63 array of P(triplet|aa), preallocated.
342 DefaultCodonBias(float *codebias)
344 codebias[0] = 1./2.; /* AAA Lys 2 */
345 codebias[1] = 1./2.; /* AAC Asn 2 */
346 codebias[2] = 1./2.; /* AAG Lys 2 */
347 codebias[3] = 1./2.; /* AAU Asn 2 */
348 codebias[4] = 1./4.; /* ACA Thr 4 */
349 codebias[5] = 1./4.; /* ACC Thr 4 */
350 codebias[6] = 1./4.; /* ACG Thr 4 */
351 codebias[7] = 1./4.; /* ACU Thr 4 */
352 codebias[8] = 1./6.; /* AGA Ser 6 */
353 codebias[9] = 1./6.; /* AGC Arg 6 */
354 codebias[10] = 1./6.; /* AGG Ser 6 */
355 codebias[11] = 1./6.; /* AGU Arg 6 */
356 codebias[12] = 1./3.; /* AUA Ile 3 */
357 codebias[13] = 1./3.; /* AUC Ile 3 */
358 codebias[14] = 1.; /* AUG Met 1 */
359 codebias[15] = 1./3.; /* AUU Ile 3 */
360 codebias[16] = 1./2.; /* CAA Gln 2 */
361 codebias[17] = 1./2.; /* CAC His 2 */
362 codebias[18] = 1./2.; /* CAG Gln 2 */
363 codebias[19] = 1./2.; /* CAU His 2 */
364 codebias[20] = 1./4.; /* CCA Pro 4 */
365 codebias[21] = 1./4.; /* CCC Pro 4 */
366 codebias[22] = 1./4.; /* CCG Pro 4 */
367 codebias[23] = 1./4.; /* CCU Pro 4 */
368 codebias[24] = 1./6.; /* CGA Arg 6 */
369 codebias[25] = 1./6.; /* CGC Arg 6 */
370 codebias[26] = 1./6.; /* CGG Arg 6 */
371 codebias[27] = 1./6.; /* CGU Arg 6 */
372 codebias[28] = 1./6.; /* CUA Leu 6 */
373 codebias[29] = 1./6.; /* CUC Leu 6 */
374 codebias[30] = 1./6.; /* CUG Leu 6 */
375 codebias[31] = 1./6.; /* CUU Leu 6 */
376 codebias[32] = 1./2.; /* GAA Glu 2 */
377 codebias[33] = 1./2.; /* GAC Asp 2 */
378 codebias[34] = 1./2.; /* GAG Glu 2 */
379 codebias[35] = 1./2.; /* GAU Asp 2 */
380 codebias[36] = 1./4.; /* GCA Ala 4 */
381 codebias[37] = 1./4.; /* GCC Ala 4 */
382 codebias[38] = 1./4.; /* GCG Ala 4 */
383 codebias[39] = 1./4.; /* GCU Ala 4 */
384 codebias[40] = 1./4.; /* GGA Gly 4 */
385 codebias[41] = 1./4.; /* GGC Gly 4 */
386 codebias[42] = 1./4.; /* GGG Gly 4 */
387 codebias[43] = 1./4.; /* GGU Gly 4 */
388 codebias[44] = 1./4.; /* GUA Val 4 */
389 codebias[45] = 1./4.; /* GUC Val 4 */
390 codebias[46] = 1./4.; /* GUG Val 4 */
391 codebias[47] = 1./4.; /* GUU Val 4 */
392 codebias[48] = 0.; /* UAA och - */
393 codebias[49] = 1./2.; /* UAC Tyr 2 */
394 codebias[50] = 0.; /* UAG amb - */
395 codebias[51] = 1./2.; /* UAU Tyr 2 */
396 codebias[52] = 1./6.; /* UCA Ser 6 */
397 codebias[53] = 1./6.; /* UCC Ser 6 */
398 codebias[54] = 1./6.; /* UCG Ser 6 */
399 codebias[55] = 1./6.; /* UCU Ser 6 */
400 codebias[56] = 0.; /* UGA opa - */
401 codebias[57] = 1./2.; /* UGC Cys 2 */
402 codebias[58] = 1.; /* UGG Trp 1 */
403 codebias[59] = 1./2.; /* UGU Cys 2 */
404 codebias[60] = 1./6.; /* UUA Leu 6 */
405 codebias[61] = 1./2.; /* UUC Phe 2 */
406 codebias[62] = 1./6.; /* UUG Leu 6 */
407 codebias[63] = 1./2.; /* UUU Phe 2 */
412 /* Function: set_degenerate()
414 * Purpose: convenience function for setting up
415 * Degenerate[][] global for the alphabet.
418 set_degenerate(char iupac, char *syms)
420 DegenCount[strchr(Alphabet,iupac)-Alphabet] = strlen(syms);
422 Degenerate[strchr(Alphabet,iupac)-Alphabet]
423 [strchr(Alphabet,*syms)-Alphabet] = 1;