/* pam.c 19-June-86 copyright (c) 1987 William R. Pearson read in the alphabet and pam matrix data designed for universal matcher This version reads BLAST format (square) PAM files */ /* $Name: fa_34_26_5 $ - $Id: apam.c,v 1.41 2007/03/31 18:47:20 wrp Exp $ */ #include #include #include #include #include "defs.h" #include "param.h" #define XTERNAL #include "uascii.h" #include "upam.h" #undef XTERNAL extern void alloc_pam (int d1, int d2, struct pstruct *ppst); void pam_opts(char *smstr, struct pstruct *ppst) { char *bp; ppst->pam_ms = 0; ppst->pamoff = 0; if ((bp=strchr(smstr,'-'))!=NULL) { if (!strncmp(bp+1,"MS",2) || !strncmp(bp+1,"ms",2)) { ppst->pam_ms = 1; } else { ppst->pamoff=atoi(bp+1); } *bp = '\0'; } else if ((bp=strchr(smstr,'+'))!=NULL) { ppst->pamoff= -atoi(bp+1); *bp = '\0'; } } /* modified 13-Oct-2005 to accomodate assymetrical matrices */ int initpam (char *mfname, struct pstruct *ppst) { char line[512], *lp; int i, j, iaa, pval; int *hsq, nsq; int *sascii; char *sq; int ess_tmp, max_val, min_val; int have_es = 0; FILE *fmat; pam_opts(mfname, ppst); if ((fmat = fopen (mfname, "r")) == NULL) { printf ("***WARNING*** cannot open scoring matrix file %s\n", mfname); fprintf (stderr,"***WARNING*** cannot open scoring matrix file %s\n", mfname); return 0; } /* the size of the alphabet is determined in advance */ hsq = ppst->hsq; sq = ppst->sq; ppst->nt_align = (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA); /* look for alphabet line, skipping the comments alphabet ends up in line[] */ while (fgets (line, sizeof(line), fmat) != NULL && line[0]=='#'); /* decide whether this is a protein or DNA matrix */ if (ppst->nt_align) sascii = &nascii[0]; else sascii = &aascii[0]; /* re-initialize sascii[] for matrix alphabet */ /* save ',' value used by FASTS/FASTM/FASTF */ ess_tmp = sascii[',']; /* clear out sascii */ for (i = 0; i <= AAMASK; i++) sascii[i] = NA; /* set end of line stop */ sascii[0] = sascii['\r'] = sascii['\n'] = EL; sascii[','] = ess_tmp; /* read the alphabet - determine alphabet nsq */ sq[0] = '\0'; for (i = 0, nsq = 1; line[i]; i++) { if (line[i] == '*') have_es = 1; if (line[i] > ' ') sq[nsq++] = toupper (line[i]); } sq[nsq]='\0'; nsq--; /* set end of sequence stop */ fprintf(stderr,"sq[%d]: %s\n",nsq,sq+1); /* initialize sascii */ for (iaa = 1; iaa <= nsq; iaa++) { sascii[sq[iaa]] = iaa; } if (ppst->dnaseq==SEQT_DNA) { sascii['U'] = sascii['T']; sascii['u'] = sascii['t']; } else if (ppst->dnaseq==SEQT_RNA) { sascii['T'] = sascii['U']; sascii['t'] = sascii['u']; } /* finished with sascii[] */ /* setup hnt (ambiguous nt hash) values */ hsq[0] = 0; for (iaa = 1; iaa <= nsq; iaa++) { hsq[iaa]=iaa; } if (ppst->nt_align) { /* DNA ambiguitities */ hsq[sascii['R']]=hsq[sascii['M']]=hsq[sascii['W']]=hsq[sascii['A']]; hsq[sascii['D']]=hsq[sascii['H']]=hsq[sascii['V']]=hsq[sascii['A']]; hsq[sascii['N']]=hsq[sascii['X']]=hsq[sascii['A']]; hsq[sascii['Y']]=hsq[sascii['S']]=hsq[sascii['B']]=hsq[sascii['C']]; hsq[sascii['K']]=hsq[sascii['G']]; } else /* protein ambiguities */ if (ppst->dnaseq == SEQT_UNK || ppst->dnaseq == SEQT_PROT || (ppst->nsq >= 20 && ppst->nsq <= 24)) { hsq[sascii['B']] = hsq[sascii['N']]; hsq[sascii['Z']] = hsq[sascii['E']]; hsq[sascii['X']] = hsq[sascii['A']]; } /* here if non-DNA, non-protein sequence */ else ppst->dnaseq = SEQT_OTHER; /* check for 2D pam - if not found, allocate it */ if (!ppst->have_pam2) { alloc_pam (MAXSQ, MAXSQ, ppst); ppst->have_pam2 = 1; } /* read the scoring matrix values */ max_val = -1; min_val = 1; for (j=0; j < nsq; j++) ppst->pam2[0][0][j] = -BIGNUM; for (iaa = 1; iaa <= nsq; iaa++) { /* read pam value line */ if (fgets(line,sizeof(line),fmat)==NULL) { fprintf (stderr," error reading pam line: %s\n",line); exit (1); } /* fprintf(stderr,"%d/%d %s",iaa,nsq,line); */ strtok(line," \t\n"); /* skip the letter (residue) */ ppst->pam2[0][i][0] = -BIGNUM; for (j = 1; j <= nsq; j++) { /* iaa limits to triangle */ lp=strtok(NULL," \t\n"); /* get the number string */ pval=ppst->pam2[0][iaa][j]=atoi(lp); /* convert to integer */ if (pval > max_val) max_val = pval; if (pval < min_val) min_val = pval; } } if (have_es==0) { sascii['*']=nsq; nsq++; sq[nsq]='*'; sq[nsq+1]='\0'; for (j=1; j<=nsq; j++) ppst->pam2[0][nsq][j]= -1; ppst->pam2[0][nsq][nsq]= max_val/2; } ppst->sqx[0]='\0'; /* initialize sqx[] */ for (i=1; i<= nsq; i++) { ppst->sqx[i] = sq[i]; ppst->sqx[i+nsq] = tolower(sq[i]); if (sascii[aa[i]] < NA && sq[i] >= 'A' && sq[i] <= 'Z') sascii[aa[i] - 'A' + 'a'] = sascii[aa[i]]+nsq; } ppst->nsq = nsq; /* save new nsq */ ppst->nsqx = nsq*2; /* save new nsqx */ ppst->pam_h = max_val; ppst->pam_l = min_val; strncpy (ppst->pamfile, mfname, MAX_FN); ppst->pamfile[MAX_FN-1]='\0'; if (ppst->pam_ms) { strncat(ppst->pamfile,"-MS",MAX_FN-strlen(ppst->pamfile)-1); } ppst->pamfile[MAX_FN-1]='\0'; fclose (fmat); return 1; } /* make a DNA scoring from +match/-mismatch values */ void mk_n_pam(int *arr,int siz, int mat, int mis) { int i, j, k; /* current default match/mismatch values */ int max_mat = +5; int min_mis = -4; float f_val, f_scale; f_scale = (float)(mat - mis)/(float)(max_mat - min_mis); k = 0; for (i = 0; iabbrev[0]; std_pam_p++ ) { if (strcmp(smstr,std_pam_p->abbrev)==0) { pam = std_pam_p->pam; strncpy(ppst->pamfile,std_pam_p->name,MAX_FN); ppst->pamfile[MAX_FN-1]='\0'; if (ppst->pam_ms) { strncat(ppst->pamfile,"-MS",MAX_FN-strlen(ppst->pamfile)-1); } ppst->pamfile[MAX_FN-1]='\0'; #ifdef OLD_FASTA_GAP if (!del_set) ppst->gdelval = std_pam_p->gdel; #else if (!del_set) ppst->gdelval = std_pam_p->gdel-std_pam_p->ggap; #endif if (!gap_set) ppst->ggapval = std_pam_p->ggap; ppst->pamscale = std_pam_p->scale; return 1; } } return 0; } /* ESS must match uascii.h */ #define ESS 49 void build_xascii(int *qascii, char *save_str) { int i, max_save; int comma_val, term_val; int save_arr[MAX_SSTR]; comma_val = qascii[',']; term_val = qascii['*']; /* preserve special characters */ for (i=0; i < MAX_SSTR && save_str[i]; i++ ) { save_arr[i] = qascii[save_str[i]]; } max_save = i; for (i=1; i<128; i++) { qascii[i]=NA; } /* range of values in aax, ntx is from 1..naax,nntx - do not zero-out qascii[0] - 9 Oct 2002 */ for (i=1; i= 'a' && sq[isq] <= 'z') have_lc = 1; } /* no lower case letters in alphabet, map lower case to upper */ if (have_lc != 1) { for (isq = 1; isq <= nsq; isq++) { if (sq[isq] >= 'A' && sq[isq] <= 'Z') sascii[sq[isq]-'A'+'a'] = isq; } if (is_dna==1) sascii['u'] = sascii['t']; } sascii['*']=term_char; } print_pam(struct pstruct *ppst) { int i, nsq, ip; char *sq; fprintf(stderr," ext_sq_set: %d\n",ppst->ext_sq_set); nsq = ppst->nsq; ip = 0; sq = ppst->sq; fprintf(stderr," sq[%d]: %s\n",nsq, sq); if (ppst->ext_sq_set) { nsq = ppst->nsqx; ip = 1; sq = ppst->sqx; fprintf(stderr," sq[%d]: %s\n",nsq, sq); } for (i=1; i<=nsq; i++) { fprintf(stderr," %c:%c - %3d\n",sq[i], sq[i], ppst->pam2[ip][i][i]); } }