2 copyright (c) 1987 William R. Pearson
3 read in the alphabet and pam matrix data
4 designed for universal matcher
6 This version reads BLAST format (square) PAM files
9 /* $Name: fa_34_26_5 $ - $Id: apam.c,v 1.41 2007/03/31 18:47:20 wrp Exp $ */
24 extern void alloc_pam (int d1, int d2, struct pstruct *ppst);
27 pam_opts(char *smstr, struct pstruct *ppst) {
33 if ((bp=strchr(smstr,'-'))!=NULL) {
34 if (!strncmp(bp+1,"MS",2) || !strncmp(bp+1,"ms",2)) {
38 ppst->pamoff=atoi(bp+1);
42 else if ((bp=strchr(smstr,'+'))!=NULL) {
43 ppst->pamoff= -atoi(bp+1);
48 /* modified 13-Oct-2005 to accomodate assymetrical matrices */
51 initpam (char *mfname, struct pstruct *ppst)
58 int ess_tmp, max_val, min_val;
62 pam_opts(mfname, ppst);
64 if ((fmat = fopen (mfname, "r")) == NULL)
66 printf ("***WARNING*** cannot open scoring matrix file %s\n", mfname);
67 fprintf (stderr,"***WARNING*** cannot open scoring matrix file %s\n", mfname);
72 the size of the alphabet is determined in advance
77 ppst->nt_align = (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA);
80 look for alphabet line, skipping the comments
81 alphabet ends up in line[]
83 while (fgets (line, sizeof(line), fmat) != NULL && line[0]=='#');
85 /* decide whether this is a protein or DNA matrix */
86 if (ppst->nt_align) sascii = &nascii[0];
87 else sascii = &aascii[0];
90 re-initialize sascii[] for matrix alphabet
93 /* save ',' value used by FASTS/FASTM/FASTF */
94 ess_tmp = sascii[','];
96 /* clear out sascii */
97 for (i = 0; i <= AAMASK; i++) sascii[i] = NA;
99 /* set end of line stop */
100 sascii[0] = sascii['\r'] = sascii['\n'] = EL;
102 sascii[','] = ess_tmp;
104 /* read the alphabet - determine alphabet nsq */
106 for (i = 0, nsq = 1; line[i]; i++) {
107 if (line[i] == '*') have_es = 1;
108 if (line[i] > ' ') sq[nsq++] = toupper (line[i]);
113 /* set end of sequence stop */
114 fprintf(stderr,"sq[%d]: %s\n",nsq,sq+1);
116 /* initialize sascii */
117 for (iaa = 1; iaa <= nsq; iaa++) {
118 sascii[sq[iaa]] = iaa;
120 if (ppst->dnaseq==SEQT_DNA) {
121 sascii['U'] = sascii['T'];
122 sascii['u'] = sascii['t'];
124 else if (ppst->dnaseq==SEQT_RNA) {
125 sascii['T'] = sascii['U'];
126 sascii['t'] = sascii['u'];
130 finished with sascii[]
134 setup hnt (ambiguous nt hash) values
137 for (iaa = 1; iaa <= nsq; iaa++) {
140 if (ppst->nt_align) { /* DNA ambiguitities */
141 hsq[sascii['R']]=hsq[sascii['M']]=hsq[sascii['W']]=hsq[sascii['A']];
142 hsq[sascii['D']]=hsq[sascii['H']]=hsq[sascii['V']]=hsq[sascii['A']];
143 hsq[sascii['N']]=hsq[sascii['X']]=hsq[sascii['A']];
144 hsq[sascii['Y']]=hsq[sascii['S']]=hsq[sascii['B']]=hsq[sascii['C']];
145 hsq[sascii['K']]=hsq[sascii['G']];
147 else /* protein ambiguities */
148 if (ppst->dnaseq == SEQT_UNK || ppst->dnaseq == SEQT_PROT ||
149 (ppst->nsq >= 20 && ppst->nsq <= 24)) {
150 hsq[sascii['B']] = hsq[sascii['N']];
151 hsq[sascii['Z']] = hsq[sascii['E']];
152 hsq[sascii['X']] = hsq[sascii['A']];
154 /* here if non-DNA, non-protein sequence */
155 else ppst->dnaseq = SEQT_OTHER;
158 check for 2D pam - if not found, allocate it
161 if (!ppst->have_pam2) {
162 alloc_pam (MAXSQ, MAXSQ, ppst);
167 read the scoring matrix values
172 for (j=0; j < nsq; j++) ppst->pam2[0][0][j] = -BIGNUM;
173 for (iaa = 1; iaa <= nsq; iaa++) { /* read pam value line */
174 if (fgets(line,sizeof(line),fmat)==NULL) {
175 fprintf (stderr," error reading pam line: %s\n",line);
178 /* fprintf(stderr,"%d/%d %s",iaa,nsq,line); */
179 strtok(line," \t\n"); /* skip the letter (residue) */
180 ppst->pam2[0][i][0] = -BIGNUM;
181 for (j = 1; j <= nsq; j++) { /* iaa limits to triangle */
182 lp=strtok(NULL," \t\n"); /* get the number string */
183 pval=ppst->pam2[0][iaa][j]=atoi(lp); /* convert to integer */
184 if (pval > max_val) max_val = pval;
185 if (pval < min_val) min_val = pval;
194 for (j=1; j<=nsq; j++) ppst->pam2[0][nsq][j]= -1;
195 ppst->pam2[0][nsq][nsq]= max_val/2;
198 ppst->sqx[0]='\0'; /* initialize sqx[] */
199 for (i=1; i<= nsq; i++) {
200 ppst->sqx[i] = sq[i];
201 ppst->sqx[i+nsq] = tolower(sq[i]);
202 if (sascii[aa[i]] < NA && sq[i] >= 'A' && sq[i] <= 'Z')
203 sascii[aa[i] - 'A' + 'a'] = sascii[aa[i]]+nsq;
206 ppst->nsq = nsq; /* save new nsq */
207 ppst->nsqx = nsq*2; /* save new nsqx */
209 ppst->pam_h = max_val;
210 ppst->pam_l = min_val;
212 strncpy (ppst->pamfile, mfname, MAX_FN);
213 ppst->pamfile[MAX_FN-1]='\0';
216 strncat(ppst->pamfile,"-MS",MAX_FN-strlen(ppst->pamfile)-1);
218 ppst->pamfile[MAX_FN-1]='\0';
223 /* make a DNA scoring from +match/-mismatch values */
225 void mk_n_pam(int *arr,int siz, int mat, int mis)
228 /* current default match/mismatch values */
231 float f_val, f_scale;
233 f_scale = (float)(mat - mis)/(float)(max_mat - min_mis);
236 for (i = 0; i<nnt-1; i++)
237 for (j = 0; j <= i; j++ ) {
238 if (arr[k] == max_mat) arr[k] = mat;
239 else if (arr[k] == min_mis) arr[k] = mis;
240 else if (arr[k] != -1) {
241 f_val = (arr[k] - min_mis)*f_scale + 0.5;
242 arr[k] = f_val + mis;
257 struct std_pam_str std_pams[] = {
258 {"P120", "PAM120", apam120, 0.346574, -20, -3},
259 {"P250", "PAM250", apam250, 0.231049, -12, -2},
260 {"P10", "MD10", a_md10, 0.346574, -27, -4},
261 {"M10", "MD10", a_md10, 0.346574, -27, -4},
262 {"MD10", "MD10", a_md10, 0.346574, -27, -4},
263 {"P20", "MD20", a_md20, 0.346574, -26, -4},
264 {"M20", "MD20", a_md20, 0.346574, -26, -4},
265 {"MD20", "MD20", a_md20, 0.346574, -26, -4},
266 {"P40", "MD40", a_md40, 0.346574, -25, -4},
267 {"M40", "MD40", a_md40, 0.346574, -25, -4},
268 {"MD40", "MD40", a_md40, 0.346574, -25, -4},
269 {"BL50", "BL50", abl50, 0.231049, -12, -2},
270 {"BL62", "BL62", abl62, 0.346574, -8, -1},
271 {"BP62", "BL62", abl62, 0.346574, -12, -1},
272 {"BL80", "BL80", abl80, 0.346574, -12, -2},
273 {"\0", "\0", NULL, 0.0, 0, 0}
277 standard_pam(char *smstr, struct pstruct *ppst, int del_set, int gap_set) {
279 struct std_pam_str *std_pam_p;
281 pam_opts(smstr, ppst);
283 for (std_pam_p = std_pams; std_pam_p->abbrev[0]; std_pam_p++ ) {
284 if (strcmp(smstr,std_pam_p->abbrev)==0) {
285 pam = std_pam_p->pam;
286 strncpy(ppst->pamfile,std_pam_p->name,MAX_FN);
287 ppst->pamfile[MAX_FN-1]='\0';
289 strncat(ppst->pamfile,"-MS",MAX_FN-strlen(ppst->pamfile)-1);
291 ppst->pamfile[MAX_FN-1]='\0';
293 if (!del_set) ppst->gdelval = std_pam_p->gdel;
295 if (!del_set) ppst->gdelval = std_pam_p->gdel-std_pam_p->ggap;
297 if (!gap_set) ppst->ggapval = std_pam_p->ggap;
298 ppst->pamscale = std_pam_p->scale;
305 /* ESS must match uascii.h */
309 build_xascii(int *qascii, char *save_str) {
311 int comma_val, term_val;
312 int save_arr[MAX_SSTR];
314 comma_val = qascii[','];
315 term_val = qascii['*'];
317 /* preserve special characters */
318 for (i=0; i < MAX_SSTR && save_str[i]; i++ ) {
319 save_arr[i] = qascii[save_str[i]];
323 for (i=1; i<128; i++) {
326 /* range of values in aax, ntx is from 1..naax,nntx -
327 do not zero-out qascii[0] - 9 Oct 2002 */
329 for (i=1; i<naax; i++) {
330 qascii[aax[i]]=aax[i];
333 for (i=1; i<nntx; i++) {
334 qascii[ntx[i]]=ntx[i];
337 qascii['\n']=qascii['\r']=qascii[0] = EL;
339 qascii[','] = comma_val;
340 qascii['*'] = term_val;
342 for (i=0; i < max_save; i++) {
343 qascii[save_str[i]]=save_arr[i];
348 checks for lower case letters in *sq array;
349 if not present, map lowercase to upper
352 init_ascii(int is_ext, int *sascii, int is_dna) {
358 if (is_dna==SEQT_UNK) return;
360 term_char = sascii['*'];
362 if (is_dna==SEQT_DNA || is_dna == SEQT_RNA) {
367 else {sq = &nt[0]; nsq = nnt;}
370 if (is_ext) { sq = &aax[0]; nsq = naax; }
371 else {sq = &aa[0]; nsq = naa;}
375 /* initialize sascii from sq[], checking for lower-case letters */
377 for (isq = 1; isq <= nsq; isq++) {
378 sascii[sq[isq]] = isq;
379 if (sq[isq] >= 'a' && sq[isq] <= 'z') have_lc = 1;
382 /* no lower case letters in alphabet, map lower case to upper */
384 for (isq = 1; isq <= nsq; isq++) {
385 if (sq[isq] >= 'A' && sq[isq] <= 'Z') sascii[sq[isq]-'A'+'a'] = isq;
387 if (is_dna==1) sascii['u'] = sascii['t'];
390 sascii['*']=term_char;
393 print_pam(struct pstruct *ppst) {
397 fprintf(stderr," ext_sq_set: %d\n",ppst->ext_sq_set);
403 fprintf(stderr," sq[%d]: %s\n",nsq, sq);
405 if (ppst->ext_sq_set) {
409 fprintf(stderr," sq[%d]: %s\n",nsq, sq);
412 for (i=1; i<=nsq; i++) {
413 fprintf(stderr," %c:%c - %3d\n",sq[i], sq[i], ppst->pam2[ip][i][i]);