2 /* copyright (c) 1996, 1997, 1998, 1999 William R. Pearson and the
5 /* $Name: fa_34_26_5 $ - $Id: faatran.c,v 1.6 2007/04/02 18:08:11 wrp Exp $ */
7 /* faatran.c (aatran.c) translates from nt to aa, 1 char codes */
8 /* modified July 2, 1987 for all 6 frames */
9 /* 23 Jan 1991 fixed bug for short sequences */
11 /* this mapping is not alphabet independent */
21 1. The Standard Code (transl_table=1)
23 By default all transl_table in GenBank flatfiles are equal to id 1, and this
24 is not shown. When transl_table is not equal to id 1, it is shown as a
25 qualifier on the CDS feature.
/* Standard code: one amino acid letter per codon, 64 codons in T,C,A,G
   order for each base (see the Base1/Base2/Base3 rows).  aainit() selects
   it for tr_type 1 and as the default for any unlisted tr_type. */
29 char *AA1="FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
31 Starts = ---M---------------M---------------M----------------------------
32 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
33 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
34 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
36 2. The Vertebrate Mitochondrial Code (transl_table=2)
/* Selected by aainit() for tr_type 2.  Differs from AA1 at:
   TGA -> W, ATA -> M, AGA and AGG -> '*'. */
39 char *AA2 ="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG";
41 Starts = --------------------------------MMMM---------------M------------
42 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
43 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
44 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
46 3. The Yeast Mitochondrial Code (transl_table=3)
/* Selected by aainit() for tr_type 3.  Differs from AA1 at:
   TGA -> W, CTN (all four CT codons) -> T, ATA -> M. */
49 char *AA3 ="FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
51 Starts = -----------------------------------M----------------------------
52 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
53 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
54 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
56 4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the
57 Mycoplasma/Spiroplasma Code (transl_table=4)
/* Selected by aainit() for tr_type 4.  Differs from AA1 at: TGA -> W. */
60 char *AA4 ="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
62 Starts = --MM---------------M------------MMMM---------------M------------
63 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
64 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
65 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
67 5. The Invertebrate Mitochondrial Code (transl_table=5)
/* Selected by aainit() for tr_type 5.  Differs from AA1 at:
   TGA -> W, ATA -> M, AGA and AGG -> S. */
70 char *AA5 ="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG";
72 Starts = ---M----------------------------MMMM---------------M------------
73 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
74 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
75 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
77 6. The Ciliate, Dasycladacean and Hexamita Nuclear Code (transl_table=6)
/* Selected by aainit() for tr_type 6.  Differs from AA1 at:
   TAA and TAG -> Q. */
80 char *AA6 ="FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
82 Starts = -----------------------------------M----------------------------
83 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
84 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
85 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
87 9. The Echinoderm Mitochondrial Code (transl_table=9)
/* Echinoderm table (headed transl_table=9 above).  Note the naming: the
   variable is AA7 and aainit() selects it for tr_type 7, not 9.
   Differs from AA1 at: TGA -> W, AAA -> N, AGA and AGG -> S. */
90 char *AA7 ="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG";
92 Starts = -----------------------------------M----------------------------
93 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
94 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
95 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
97 10. The Euplotid Nuclear Code (transl_table=10)
/* Selected by aainit() for tr_type 10.  Differs from AA1 at: TGA -> C. */
100 char *AA10="FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
102 Starts = -----------------------------------M----------------------------
103 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
104 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
105 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
107 11. The Bacterial "Code" (transl_table=11)
/* Selected by aainit() for tr_type 11.  The amino acid string is identical
   to AA1; only the Starts row listed with it differs. */
110 char *AA11="FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
112 Starts = ---M---------------M------------MMMM---------------M------------
113 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
114 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
115 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
117 12. The Alternative Yeast Nuclear Code (transl_table=12)
/* Selected by aainit() for tr_type 12.  Differs from AA1 at: CTG -> S. */
120 char *AA12 ="FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
122 Starts = -------------------M---------------M----------------------------
123 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
124 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
125 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
127 13. The Ascidian Mitochondrial Code (transl_table=13)
/* Selected by aainit() for tr_type 13.  Differs from AA1 at:
   TGA -> W, ATA -> M, AGA and AGG -> G. */
130 char *AA13="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG";
132 Starts = -----------------------------------M----------------------------
133 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
134 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
135 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
137 14. The Flatworm Mitochondrial Code (transl_table=14)
/* Selected by aainit() for tr_type 14.  Differs from AA1 at:
   TAA -> Y, TGA -> W, AAA -> N, AGA and AGG -> S. */
140 char *AA14 ="FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG";
142 Starts = -----------------------------------M----------------------------
143 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
144 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
145 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
147 15. Blepharisma Nuclear Code (transl_table=15)
/* Selected by aainit() for tr_type 15.  Differs from AA1 at: TAG -> Q. */
150 char *AA15="FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
152 Starts = -----------------------------------M----------------------------
153 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
154 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
155 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
/* Selected by aainit() for tr_type 16 (the "Chlorophycean Mitochondrial"
   entry that follows).  Differs from AA1 at: TAG -> L. */
159 char *AA16 ="FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
162 name "Chlorophycean Mitochondrial" ,
163 sncbieaa "-----------------------------------M----------------------------"
164 -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
165 -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
166 -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
/* Selected by aainit() for tr_type 21 (the "Trematode Mitochondrial" entry
   that follows).  Differs from AA1 at:
   TGA -> W, ATA -> M, AAA -> N, AGA and AGG -> S. */
170 char *AA21 ="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG";
172 name "Trematode Mitochondrial" ,
174 sncbieaa "-----------------------------------M---------------M------------"
175 -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
176 -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
177 -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
/* Selected by aainit() for tr_type 22 (the "Scenedesmus obliquus
   Mitochondrial" entry that follows).  Differs from AA1 at:
   TCA -> '*', TAG -> L. */
181 char *AA22 ="FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
183 name "Scenedesmus obliquus Mitochondrial" ,
185 sncbieaa "-----------------------------------M----------------------------"
186 -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
187 -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
188 -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
/* Selected by aainit() for tr_type 23 (the "Thraustochytrium Mitochondrial"
   entry that follows).  Differs from AA1 at: TTA -> '*'. */
192 char *AA23 ="FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
194 name "Thraustochytrium Mitochondrial" ,
196 sncbieaa "--------------------------------M--M---------------M------------"
197 -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
198 -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
199 -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
/* Current codon -> amino acid letter map, indexed by
   (base1<<4) + (base2<<2) + base3 with A=0, C=1, G=2, T=3 (the numbering
   produced by imap[] in aainit()).  The initial contents are the standard
   code (AA1) in that index order; aainit() overwrites the entries with the
   selected table, and aagetmap() copies them out. */
203 static char aacmap[64]={
204 'K','N','K','N','T','T','T','T','R','S','R','S','I','I','M','I', /* first base A */
205 'Q','H','Q','H','P','P','P','P','R','R','R','R','L','L','L','L', /* first base C */
206 'E','D','E','D','A','A','A','A','G','G','G','G','V','V','V','V', /* first base G */
207 '*','Y','*','Y','S','S','S','S','*','C','W','C','L','F','L','F' /* first base T */
/* Integer translation maps, filled from aacmap through aascii[]:
   aamap[i] is the residue code for codon index i; aamapr[i] is the residue
   code for index (~i)&63, i.e. the codon with every base complemented
   (A<->T, C<->G under the A=0, C=1, G=2, T=3 numbering). */
210 static int aamap[64]; /* integer aa values */
211 static int aamapr[64]; /* reverse sequence map */
213 /* tnt is used only by aatran.c. It must be consistent with lascii and
214 the nt alphabet. It uses 3,3 because T and U are considered separately
/* Lookup from an encoded nucleotide to a base value 0..3.  The table has
   35 entries; entries 18-34 repeat the values of entries 1-17.
   NOTE(review): presumably indexed in the symbol order given in the comment
   accompanying snt[] ('-', A, C, G, T, U, R, Y, ...), with the second run
   covering a second copy of the alphabet -- confirm against the nt
   alphabet tables, which are not visible here. */
216 static int tnt[]={0,0,1,2,3,3,0,1,0,0,1,2,0,0,0,1,0,0,
217 0,1,2,3,3,0,1,0,0,1,2,0,0,0,1,0,0};
/* Gate for the diagnostic checks in aatran(): they run only when this is
   non-zero.  NOTE(review): the assignment is not visible in this listing
   (presumably made from the debug argument of aainit()) -- confirm. */
219 static int debug_set;
/*
 * aatran - translate an encoded nucleotide sequence into residue codes
 * for one reading frame.
 *
 *   ntseq  encoded nucleotide sequence; each element indexes nnp[]
 *   aaseq  output buffer for the translated residues
 *   maxs   used as the length of ntseq (the reverse frames start reading
 *          just below ntseq[maxs-(frame-3)])
 *   frame  0-2: read forward starting at ntseq[frame];
 *          3-5: read backward with offset frame-3
 *
 * NOTE(review): several statements of this function are not visible in
 * this listing (the remaining local declarations, the assignments of
 * nnp/aamp/aap, the loop headers, the third-base step, the store into
 * aaseq and the return).  The comments below describe only visible lines.
 */
222 aatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame)
226 const unsigned char *nts0;
228 register unsigned char *aap;
/* number of whole codons left after skipping the frame offset
   (frame for frames 0-2, frame-3 otherwise) */
230 iaa=nna=(maxs-(frame<3?frame:frame-3))/3;
/* forward frames: start at the frame offset and read with post-increment */
240 nts0 = &ntseq[frame];
/* first base supplies bits 5-4 of the codon index, second base bits 3-2 */
243 im = nnp[*nts0++]<<4;
244 im += nnp[*nts0++]<<2;
248 /* this check is included because of a bug in tfasty
249 which occurs only during the alignment process */
/* when debug_set and the looked-up value exceeds MAXUC: print the three
   bytes preceding nts0, the codon index and both map entries */
252 if (debug_set && aamp[im] > MAXUC) {
253 fprintf(stderr,"faatran: %d %d %d %d %d?%d\n",
254 *(nts0-3),*(nts0-2),*(nts0-1), im, aamp[im],aamap[im]);
256 /* this allows recovery, but should not be done frequently */
/* rebuild both integer maps from the letter map aacmap */
257 for (i=0; i<64; i++) {
258 aamap[i]=aascii[aacmap[i]];
259 aamapr[i]=aascii[aacmap[(~i)&63]];
/* reverse frames: start one element past the last base to be used; the
   bases are then fetched with pre-decrement */
268 nts0 = &ntseq[maxs-(frame-3)];
271 im = nnp[*--nts0]<<4;
272 im += nnp[*--nts0]<<2;
275 /* this check is included because of a bug in tfasty
276 which occurs only during the alignment process */
/* NOTE(review): in this branch nts0 moves downward, so the bytes at
   nts0-3 .. nts0-1 printed below lie below the codon just read (its bases
   are at nts0 and above) and, for the last codon, may lie before the start
   of ntseq.  The message also prints aamap[im] although this branch
   presumably translates through aamapr -- confirm and correct. */
279 if (debug_set && aamp[im] > MAXUC) {
280 fprintf(stderr,"faatran: %d %d %d %d %d?%d\n",
281 *(nts0-3),*(nts0-2),*(nts0-1), im, aamp[im],aamap[im]);
283 /* this allows recovery, but should not be done frequently */
/* rebuild both integer maps from the letter map aacmap */
284 for (i=0; i<64; i++) {
285 aamap[i]=aascii[aacmap[i]];
286 aamapr[i]=aascii[aacmap[(~i)&63]];
297 /* slower version that masks out NNN,XXX */
299 /* - A C G T U R Y M W S K D H V B N X */
/* 18-entry nucleotide lookup: the first ten entries hold base values 0..3,
   the last eight hold 4.  saatran() treats a looked-up value that is not
   below 4 as "no definite base".
   NOTE(review): presumably this is the table saatran() reads through nnp
   (the assignment is not visible in this listing).  It has 18 entries
   whereas tnt[] has 35; if codes above 17 can reach it the lookup would
   read past the end -- confirm. */
300 static int snt[]={0,0,1,2,3,3,0,1,0,0,4,4,4,4,4,4,4,4};
/*
 * saatran - translate an encoded nucleotide sequence into residue codes
 * for one reading frame, writing aascii['X'] for a codon flagged through
 * xflag instead of looking it up.
 *
 *   ntseq  encoded nucleotide sequence; each element indexes nnp[]
 *   aaseq  output buffer for the translated residues
 *   maxs   used as the length of ntseq (the reverse frames start reading
 *          just below ntseq[maxs-(frame-3)])
 *   frame  0-2: read forward starting at ntseq[frame];
 *          3-5: read backward with offset frame-3
 *
 * NOTE(review): several statements are not visible in this listing (the
 * assignments of nnp/aamp/aap, the loop headers, where xflag is cleared,
 * the else-branches for the second and third base, and the return).  The
 * comments below describe only visible lines.
 */
303 saatran(const unsigned char *ntseq,
304 unsigned char *aaseq, int maxs, int frame)
306 int iaa, im, it, nna, xflag;
308 const unsigned char *nts0;
310 register unsigned char *aap;
/* number of whole codons left after skipping the frame offset
   (frame for frames 0-2, frame-3 otherwise) */
312 iaa=nna=(maxs-(frame<3?frame:frame-3))/3;
/* forward frames: start at the frame offset and read with post-increment */
321 nts0 = &ntseq[frame];
/* a base whose lookup value is below 4 contributes its two bits to the
   codon index (first base bits 5-4, second bits 3-2, third bits 1-0);
   for the first base any other value sets xflag and zeroes the index */
325 if ((it=nnp[*nts0++])<4) {im = it<<4;}
326 else {xflag = 1; im=0;}
327 if ((it=nnp[*nts0++])<4) {im += it<<2;}
329 if ((it=nnp[*nts0++])<4) {im += it;}
/* emit X for a flagged codon, otherwise the mapped residue */
331 if (xflag) *aap++ = aascii['X'];
332 else *aap++ = aamp[im];
/* reverse frames: start one element past the last base to be used and
   fetch bases with pre-decrement; the same per-base test applies */
337 nts0 = &ntseq[maxs-(frame-3)];
341 if ((it=nnp[*--nts0]) < 4) im = it<<4;
342 else {xflag = 1; im=0;}
343 if ((it=nnp[*--nts0]) < 4) im += it<<2;
345 if ((it=nnp[*--nts0]) < 4) im += it;
347 if (xflag) *aap++ = aascii['X'];
348 else *aap++ = aamp[im];
/*
 * aainit - install the codon table selected by tr_type into aacmap and
 * rebuild the integer maps aamap / aamapr from it.
 *
 *   tr_type  table selector; the values listed in the switch below are
 *            recognized and every other value falls back to AA1.  Value 7
 *            selects AA7; there are no cases for 8, 9 or 17-20.
 *   debug    when non-zero, a header line and every codon whose letter
 *            changes are written to stderr
 *
 * NOTE(review): parts of this function are not visible in this listing
 * (remaining declarations, the switch header, the trailing arguments of
 * the per-codon fprintf, closing braces and any guard around the table
 * dump).  The comments below describe only visible lines.
 */
356 aainit(int tr_type, int debug)
/* imap converts a position in the T,C,A,G order used by the table strings
   into this file's base numbering: T->3, C->1, A->0, G->2 */
360 int imap[4]={3,1,0,2}, i0, i1, i2, ii;
366 /* need to put in a new translation table */
368 case 1: aasmap = AA1; break;
369 case 2: aasmap = AA2; break;
370 case 3: aasmap = AA3; break;
371 case 4: aasmap = AA4; break;
372 case 5: aasmap = AA5; break;
373 case 6: aasmap = AA6; break;
374 case 7: aasmap = AA7; break;
375 case 10: aasmap = AA10; break;
376 case 11: aasmap = AA11; break;
377 case 12: aasmap = AA12; break;
378 case 13: aasmap = AA13; break;
379 case 14: aasmap = AA14; break;
380 case 15: aasmap = AA15; break;
381 case 16: aasmap = AA16; break;
382 case 21: aasmap = AA21; break;
383 case 22: aasmap = AA22; break;
384 case 23: aasmap = AA23; break;
386 default: aasmap = AA1; break;
389 if (debug) fprintf(stderr," codon table: %d\n new old\n",tr_type);
/* walk the 64 codons in table-string order, consuming one letter of
   aasmap per codon and storing it at the re-numbered index ii */
390 for (i0 = 0; i0 < 4; i0++)
391 for (i1 = 0; i1 < 4; i1++)
392 for (i2 = 0; i2 < 4; i2++) {
393 ii = (imap[i0]<<4) + (imap[i1]<<2) + imap[i2];
/* the comparison is against the letter currently held in aacmap, i.e. the
   table left by the initializer or by an earlier aainit() call */
394 if (debug && aacmap[ii] != *aasmap)
395 fprintf(stderr," %c%c%c: %c - %c\n",
396 nt[imap[i0]+1],nt[imap[i1]+1],nt[imap[i2]+1],
398 aacmap[ii]= *aasmap++;
/* write the 64 letters of aacmap to stderr, 16 per line */
402 for (i=0; i<64; i++) {
403 fprintf(stderr,"'%c',",aacmap[i]);
404 if ((i%16)==15) fputc('\n',stderr);
/* rebuild the integer maps: forward map, and the map for the codon with
   every base complemented */
409 for (i=0; i<64; i++) {
410 aamap[i]=aascii[aacmap[i]];
411 aamapr[i]=aascii[aacmap[(~i)&63]];
/*
 * aagetmap - copy the first n letters of the current codon map aacmap
 * into the caller's buffer.
 *
 *   to  destination; must have room for n chars (no terminator is written
 *       by the visible loop)
 *   n   number of entries to copy
 *
 * NOTE(review): no check that n <= 64 (the size of aacmap) is visible, so
 * a larger n would read past the end of the table -- confirm callers.
 */
416 aagetmap(char *to, int n)
419 for (i=0; i<n; i++) to[i] = aacmap[i];