Next version of JABA
[jabaws.git] / binaries / src / muscle / alpha.cpp
1 #include "muscle.h"
2 #include <ctype.h>
3
4 /***
5 From Bioperl docs:
6 Extended DNA / RNA alphabet
7 ------------------------------------------
8 Symbol       Meaning      Nucleic Acid
9 ------------------------------------------
10     A            A           Adenine
11     C            C           Cytosine
12     G            G           Guanine
13     T            T           Thymine
14     U            U           Uracil
15     M          A or C
16     R          A or G
17     W          A or T
18     S          C or G
19     Y          C or T
20     K          G or T
21     V        A or C or G
22     H        A or C or T
23     D        A or G or T
24     B        C or G or T
25     X      G or A or T or C
26     N      G or A or T or C
27
28 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
29          Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
30 ***/
31
32 unsigned g_CharToLetter[MAX_CHAR];
33 unsigned g_CharToLetterEx[MAX_CHAR];
34
35 char g_LetterToChar[MAX_ALPHA];
36 char g_LetterExToChar[MAX_ALPHA_EX];
37
38 char g_UnalignChar[MAX_CHAR];
39 char g_AlignChar[MAX_CHAR];
40
41 bool g_IsWildcardChar[MAX_CHAR];
42 bool g_IsResidueChar[MAX_CHAR];
43
44 ALPHA g_Alpha = ALPHA_Undefined;
45 unsigned g_AlphaSize = 0;
46
47 #define Res(c, Letter)                                                                                          \
48         {                                                                                                                               \
49         const unsigned char Upper = (unsigned char) toupper(c);                 \
50         const unsigned char Lower = (unsigned char) tolower(c);                 \
51         g_CharToLetter[Upper] = Letter;                                                                 \
52         g_CharToLetter[Lower] = Letter;                                                                 \
53         g_CharToLetterEx[Upper] = Letter;                                                               \
54         g_CharToLetterEx[Lower] = Letter;                                                               \
55         g_LetterToChar[Letter] = Upper;                                                                 \
56         g_LetterExToChar[Letter] = Upper;                                                               \
57         g_IsResidueChar[Upper] = true;                                                                  \
58         g_IsResidueChar[Lower] = true;                                                                  \
59         g_AlignChar[Upper] = Upper;                                                                             \
60         g_AlignChar[Lower] = Upper;                                                                             \
61         g_UnalignChar[Upper] = Lower;                                                                   \
62         g_UnalignChar[Lower] = Lower;                                                                   \
63         }
64
65 #define Wild(c, Letter)                                                                                         \
66         {                                                                                                                               \
67         const unsigned char Upper = (unsigned char) toupper(c);                 \
68         const unsigned char Lower = (unsigned char) tolower(c);                 \
69         g_CharToLetterEx[Upper] = Letter;                                                               \
70         g_CharToLetterEx[Lower] = Letter;                                                               \
71         g_LetterExToChar[Letter] = Upper;                                                               \
72         g_IsResidueChar[Upper] = true;                                                                  \
73         g_IsResidueChar[Lower] = true;                                                                  \
74         g_AlignChar[Upper] = Upper;                                                                             \
75         g_AlignChar[Lower] = Upper;                                                                             \
76         g_UnalignChar[Upper] = Lower;                                                                   \
77         g_UnalignChar[Lower] = Lower;                                                                   \
78         g_IsWildcardChar[Lower] = true;                                                                 \
79         g_IsWildcardChar[Upper] = true;                                                                 \
80         }
81
82 static unsigned GetAlphaSize(ALPHA Alpha)
83         {
84         switch (Alpha)
85                 {
86         case ALPHA_Amino:
87                 return 20;
88
89         case ALPHA_RNA:
90         case ALPHA_DNA:
91                 return 4;
92                 }
93         Quit("Invalid Alpha=%d", Alpha);
94         return 0;
95         }
96
97 static void InitArrays()
98         {
99         memset(g_CharToLetter, 0xff, sizeof(g_CharToLetter));
100         memset(g_CharToLetterEx, 0xff, sizeof(g_CharToLetterEx));
101
102         memset(g_LetterToChar, '?', sizeof(g_LetterToChar));
103         memset(g_LetterExToChar, '?', sizeof(g_LetterExToChar));
104
105         memset(g_AlignChar, '?', sizeof(g_UnalignChar));
106         memset(g_UnalignChar, '?', sizeof(g_UnalignChar));
107
108         memset(g_IsWildcardChar, 0, sizeof(g_IsWildcardChar));
109         }
110
111 static void SetGapChar(char c)
112         {
113         unsigned char u = (unsigned char) c;
114
115         g_CharToLetterEx[u] = AX_GAP;
116         g_LetterExToChar[AX_GAP] = u;
117         g_AlignChar[u] = u;
118         g_UnalignChar[u] = u;
119         }
120
121 static void SetAlphaDNA()
122         {
123         Res('A', NX_A)
124         Res('C', NX_C)
125         Res('G', NX_G)
126         Res('T', NX_T)
127         Wild('M', NX_M)
128         Wild('R', NX_R)
129         Wild('W', NX_W)
130         Wild('S', NX_S)
131         Wild('Y', NX_Y)
132         Wild('K', NX_K)
133         Wild('V', NX_V)
134         Wild('H', NX_H)
135         Wild('D', NX_D)
136         Wild('B', NX_B)
137         Wild('X', NX_X)
138         Wild('N', NX_N)
139         }
140
141 static void SetAlphaRNA()
142         {
143         Res('A', NX_A)
144         Res('C', NX_C)
145         Res('G', NX_G)
146         Res('U', NX_U)
147         Res('T', NX_T)
148         Wild('M', NX_M)
149         Wild('R', NX_R)
150         Wild('W', NX_W)
151         Wild('S', NX_S)
152         Wild('Y', NX_Y)
153         Wild('K', NX_K)
154         Wild('V', NX_V)
155         Wild('H', NX_H)
156         Wild('D', NX_D)
157         Wild('B', NX_B)
158         Wild('X', NX_X)
159         Wild('N', NX_N)
160         }
161
162 static void SetAlphaAmino()
163         {
164         Res('A', AX_A)
165         Res('C', AX_C)
166         Res('D', AX_D)
167         Res('E', AX_E)
168         Res('F', AX_F)
169         Res('G', AX_G)
170         Res('H', AX_H)
171         Res('I', AX_I)
172         Res('K', AX_K)
173         Res('L', AX_L)
174         Res('M', AX_M)
175         Res('N', AX_N)
176         Res('P', AX_P)
177         Res('Q', AX_Q)
178         Res('R', AX_R)
179         Res('S', AX_S)
180         Res('T', AX_T)
181         Res('V', AX_V)
182         Res('W', AX_W)
183         Res('Y', AX_Y)
184
185         Wild('B', AX_B)
186         Wild('X', AX_X)
187         Wild('Z', AX_Z)
188         }
189
190 void SetAlpha(ALPHA Alpha)
191         {
192         InitArrays();
193
194         SetGapChar('.');
195         SetGapChar('-');
196
197         switch (Alpha)
198                 {
199         case ALPHA_Amino:
200                 SetAlphaAmino();
201                 break;
202
203         case ALPHA_DNA:
204                 SetAlphaDNA();
205
206         case ALPHA_RNA:
207                 SetAlphaRNA();
208                 break;
209
210         default:
211                 Quit("Invalid Alpha=%d", Alpha);
212                 }
213
214         g_AlphaSize = GetAlphaSize(Alpha);
215         g_Alpha = Alpha;
216
217         if (g_bVerbose)
218                 Log("Alphabet %s\n", ALPHAToStr(g_Alpha));
219         }
220
221 char GetWildcardChar()
222         {
223         switch (g_Alpha)
224                 {
225         case ALPHA_Amino:
226                 return 'X';
227
228         case ALPHA_DNA:
229         case ALPHA_RNA:
230                 return 'N';
231
232         default:
233                 Quit("Invalid Alpha=%d", g_Alpha);
234                 }
235         return '?';
236         }
237
238 bool IsNucleo(char c)
239         {
240         return strchr("ACGTURYNacgturyn", c) != 0;
241         }
242
243 bool IsDNA(char c)
244         {
245         return strchr("AGCTNagctn", c) != 0;
246         }
247
248 bool IsRNA(char c)
249         {
250         return strchr("AGCUNagcun", c) != 0;
251         }
252
253 static char InvalidLetters[256];
254 static int InvalidLetterCount = 0;
255
256 void ClearInvalidLetterWarning()
257         {
258         memset(InvalidLetters, 0, 256);
259         }
260
261 void InvalidLetterWarning(char c, char w)
262         {
263         InvalidLetters[(unsigned char) c] = 1;
264         ++InvalidLetterCount;
265         }
266
267 void ReportInvalidLetters()
268         {
269         if (0 == InvalidLetterCount)
270                 return;
271
272         char Str[257];
273         memset(Str, 0, 257);
274
275         int n = 0;
276         for (int i = 0; i < 256; ++i)
277                 {
278                 if (InvalidLetters[i])
279                         Str[n++] = (char) i;
280                 }
281         Warning("Assuming %s (see -seqtype option), invalid letters found: %s",
282           ALPHAToStr(g_Alpha), Str);
283         }