/* copyright (c) 1998, 1999 William R. Pearson and the U. of Virginia */ /* $Name: fa_34_26_5 $ - $Id: dropfx.c,v 1.68 2007/04/26 18:37:18 wrp Exp $ */ /* implements the fastx algorithm, see: W. R. Pearson, T. Wood, Z. Zhang, A W. Miller (1997) "Comparison of DNA sequences with protein sequences" Genomics 46:24-36 see dropnfa.c for better variable descriptions and comments */ /* 18-Sept-2006 - remove global variables used for alignment */ /* 22-June-2006 - correct incorrect alignment coordinates generated after pro_dna() on projected DNA region. */ /* 9-May-2003 -> 3.46 changed lx_band to use projected protein boundary end. this fixes some addressing issues on MacOSX, and speeds up alignment on very long proteins */ #include #include #include #include #include #include "defs.h" #include "param.h" #define XTERNAL #include "upam.h" /* this must be consistent with upam.h */ #define MAXHASH 32 #define NMAP MAXHASH+1 /* globals for fasta */ #define MAXWINDOW 64 #ifndef MAXSAV #define MAXSAV 10 #endif #ifndef ALLOCN0 static char *verstr="3.5 Sept 2006"; #else static char *verstr="3.5an0 May 2006"; #endif struct dstruct /* diagonal structure for saving current run */ { int score; /* hash score of current match */ int start; /* start of current match */ int stop; /* end of current match */ struct savestr *dmax; /* location in vmax[] where best score data saved */ }; struct savestr { int score; /* pam score with segment optimization */ int score0; /* pam score of best single segment */ int gscore; /* score from global match */ int dp; /* diagonal of match */ int start; /* start of match in lib seq */ int stop; /* end of match in lib seq */ }; struct swstr { int H, E;}; struct bdstr { int CC, DD, CP, DP;}; void savemax(); void kpsort(); struct sx_s {int C1, C2, C3, I1, I2, I3, flag; }; struct f_struct { struct dstruct *diag; struct savestr vmax[MAXSAV]; /* best matches saved for one sequence */ struct savestr *vptr[MAXSAV]; struct savestr *lowmax; int ndo; int noff; int hmask; /* hash constants */ int *pamh1; /* pam based array */ int *pamh2; /* pam based kfact array */ int *link, *harr; /* hash arrays */ int kshft; /* shift width */ int nsav, lowscor; /* number of saved runs, worst saved run */ #ifndef TFAST unsigned char *aa0x; /* contains translated codons 111222333*/ unsigned char *aa0y; /* contains translated codons 123123123*/ #else unsigned char *aa1x; /* contains translated codons 111222333 */ unsigned char *aa1y; /* contains translated codons 123123123 */ #endif struct sx_s *cur; int *waa0; int *waa1; int *res; int max_res; }; #define DROP_INTERN #include "drop_func.h" static int dmatchx(const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int hoff, int window, int **pam2, int gdelval, int ggapval, int gshift, struct f_struct *f_str); int shscore(unsigned char *aa0, int n0, int **pam2); int saatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame); int spam (const unsigned char *aa0, const unsigned char *aa1, struct savestr *dmax, int **pam2, struct f_struct *f_str); int sconn (struct savestr **v, int n,int cgap, int pgap, struct f_struct *f_str); int lx_band(const unsigned char *prot_seq, int len_prot, const unsigned char *dna_prot_seq, int len_dna_prot, int **pam_matrix, int gopen, int gext, int gshift, int start_diag, int width, struct f_struct *f_str); static void update_code(char *al_str, int al_str_max, int op, int op_cnt, char *op_char); extern void w_abort (char *p, char *p1); /* initialize for fasta */ void init_work (unsigned char *aa0, int n0, struct pstruct *ppst, struct f_struct **f_arg) { int mhv, phv; int hmax; int i0, hv; int pamfact; int btemp; struct f_struct *f_str; int ktup; /* word size examined */ int fact; /* factor used to scale ktup match value */ int kt1; /* ktup-1 */ int lkt; /* last ktup - initiall kt1, but can be increased for hsq >= NMAP */ int maxn0; int *pwaa; int i, j, q; struct swstr *ss, *r_ss; int *waa; int *res; int nsq, ip, *hsq; #ifndef TFAST int last_n0, itemp; unsigned char *fd, *fs, *aa0x, *aa0y, *aa0s; int n0x, n0x3; #endif if (ppst->ext_sq_set) { nsq = ppst->nsqx; ip = 1; hsq = ppst->hsqx; } else { nsq = ppst->nsq; ip = 0; hsq = ppst->hsq; } f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct)); btemp = 2 * ppst->param_u.fa.bestoff / 3 + n0 / ppst->param_u.fa.bestscale + ppst->param_u.fa.bkfact * (ppst->param_u.fa.bktup - ppst->param_u.fa.ktup); btemp = min (btemp, ppst->param_u.fa.bestmax); if (btemp > 3 * n0) btemp = 3 * shscore(aa0,n0,ppst->pam2[0]) / 5; ppst->param_u.fa.cgap = btemp + ppst->param_u.fa.bestoff / 3; if (ppst->param_u.fa.optcut_set != 1) #ifndef TFAST ppst->param_u.fa.optcut = (btemp*5)/4; #else ppst->param_u.fa.optcut = (btemp*4)/3; #endif #ifdef OLD_FASTA_GAP ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval; #else ppst->param_u.fa.pgap = ppst->gdelval + 2*ppst->ggapval; #endif pamfact = ppst->param_u.fa.pamfact; ktup = ppst->param_u.fa.ktup; fact = ppst->param_u.fa.scfact * ktup; if (pamfact == -1) pamfact = 0; else if (pamfact == -2) pamfact = 1; for (i0 = 1, mhv = -1; i0 <=nsq; i0++) if (hsq[i0] < NMAP && hsq[i0] > mhv) mhv = hsq[i0]; if (mhv <= 0) { fprintf (stderr, " maximum hsq <=0 %d\n", mhv); exit (1); } for (f_str->kshft = 0; mhv > 0; mhv /= 2) f_str->kshft++; /* kshft = 2; */ kt1 = ktup - 1; hv = 1; for (i0 = 0; i0 < ktup; i0++) { hv = hv << f_str->kshft; } hmax = hv; f_str->hmask = (hmax >> f_str->kshft) - 1; if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate hash array\n"); exit (1); } if ((f_str->pamh1 = (int *) calloc (nsq+1, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate pamh1 array\n"); exit (1); } if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate pamh2 array\n"); exit (1); } if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate hash link array"); exit (1); } #ifdef TFAST if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+2, sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+2); exit (1); } f_str->aa1x++; if ((f_str->aa1y =(unsigned char *)calloc((size_t)ppst->maxlen+2, sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa1y array %d\n", ppst->maxlen+2); exit (1); } f_str->aa1y++; #else /* FASTX */ maxn0 = n0 + 2; if ((aa0x =(unsigned char *)calloc((size_t)maxn0,sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa0x array %d\n", maxn0); exit (1); } aa0x++; f_str->aa0x = aa0x; if ((aa0y =(unsigned char *)calloc((size_t)maxn0,sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa0y array %d\n", maxn0); exit (1); } aa0y++; f_str->aa0y = aa0y; last_n0 = 0; for (itemp=0; itemp<3; itemp++) { n0x = saatran(aa0,&aa0x[last_n0],n0,itemp); /* for (i=0; iharr[i0] = -1; for (i0 = 0; i0 < n0; i0++) f_str->link[i0] = -1; /* encode the aa0 array */ phv = hv = 0; lkt = kt1; for (i0 = 0; i0 < min(lkt,n0); i0++) { if (hsq[aa0[i0]] >= NMAP) {hv=phv=0; lkt=i0+ktup; continue;} hv = (hv << f_str->kshft) + hsq[aa0[i0]]; phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup; } for (; i0 < n0; i0++) { if (hsq[aa0[i0]] >= NMAP) { hv=phv=0; lkt = i0+ktup; /* restart hv, phv calculation */ for (; (i0 < lkt || hsq[aa0[i0]]>=NMAP) && i0= NMAP) {hv=phv=0; lkt = i0+ktup; continue;} hv = (hv << f_str->kshft) + hsq[aa0[i0]]; phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup; } } if (i0 >= n0) break; hv = ((hv & f_str->hmask) << f_str->kshft) + hsq[aa0[i0]]; f_str->link[i0] = f_str->harr[hv]; f_str->harr[hv] = i0; if (pamfact) { f_str->pamh2[hv] = (phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup); /* this check should always be true, but just in case */ if (hsq[aa0[i0-kt1]]pam2[ip][aa0[i0 - kt1]][aa0[i0 - kt1]] * ktup; } else f_str->pamh2[hv] = fact * ktup; } #ifndef TFAST /* done hashing, now switch aa0, aa0x back */ fs = aa0; aa0 = aa0x; aa0x = fs; #endif /* this has been modified from 0..pamh1[i0] = ppst->pam2[ip][i0][i0] * ktup; else for (i0 = 1; i0 <= nsq; i0++) f_str->pamh1[i0] = fact; f_str->ndo = 0; /* used to save time on diagonals with long queries */ #ifndef ALLOCN0 if ((f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG, sizeof (struct dstruct)))==NULL) { fprintf (stderr," cannot allocate diagonal arrays: %ld\n", (long) MAXDIAG *sizeof (struct dstruct)); exit (1); }; #else if ((f_str->diag = (struct dstruct *) calloc ((size_t)n0, sizeof (struct dstruct)))==NULL) { fprintf (stderr," cannot allocate diagonal arrays: %ld\n", (long)n0*sizeof (struct dstruct)); exit (1); }; #endif if ((waa= (int *)malloc (sizeof(int)*(nsq+1)*n0)) == NULL) { fprintf(stderr,"cannot allocate waa struct %3d\n",nsq*n0); exit(1); } pwaa = waa; for (i=0; i<=nsq; i++) { for (j=0;jpam2[ip][i][aa0[j]]; pwaa++; } } f_str->waa0 = waa; if ((waa= (int *)malloc (sizeof(int)*(nsq+1)*n0)) == NULL) { fprintf(stderr,"cannot allocate waa struct %3d\n",nsq*n0); exit(1); } pwaa = waa; for (i=0; i<=nsq; i++) { for (j=0;jpam2[0][i][aa0[j]]; pwaa++; } } f_str->waa1 = waa; #ifndef TFAST maxn0 = max(2*n0,MIN_RES); #else /* maxn0 needs to be large enough to accomodate introns for TFASTX. For all other functions, it will be more reasonable. */ maxn0 = max(4*n0,MIN_RES); #endif if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) { fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0); exit(1); } f_str->res = res; f_str->max_res = maxn0; *f_arg = f_str; } /* pstring1 is a message to the manager, currently 512 */ /* pstring2 is the same information, but in a markx==10 format */ void get_param (struct pstruct *pstr, char *pstring1, char *pstring2) { #ifndef TFAST char *pg_str="FASTX"; #else char *pg_str="TFASTX"; #endif if (!pstr->param_u.fa.optflag) #ifdef OLD_FASTA_GAP sprintf (pstring1, "%s (%s) function [%s matrix (%d:%d:%d)%s] ktup: %d\n join: %d, gap-pen: %d/%d, shift: %d width: %3d",pg_str,verstr, #else sprintf (pstring1, "%s (%s) function [%s matrix (o=%d:%d:%d:%d)%s] ktup: %d\n join: %d, open/ext: %d/%d, shift: %d width: %3d",pg_str,verstr, #endif pstr->pamfile, pstr->pam_h,pstr->pam_l,pstr->pam_xx,pstr->pam_xm, (pstr->ext_sq_set) ? "xS":"\0", pstr->param_u.fa.ktup, pstr->param_u.fa.cgap, pstr->gdelval, pstr->ggapval, pstr->gshift, pstr->param_u.fa.optwid); else #ifdef OLD_FASTA_GAP sprintf (pstring1, "%s (%s) function [optimized, %s matrix (%d:%d:%d)%s] ktup: %d\n join: %d, opt: %d, gap-pen: %d/%d shift: %3d, width: %3d",pg_str,verstr, #else sprintf (pstring1, "%s (%s) function [optimized, %s matrix (o=%d:%d:%d:%d)%s] ktup: %d\n join: %d, opt: %d, open/ext: %d/%d shift: %3d, width: %3d",pg_str,verstr, #endif pstr->pamfile, pstr->pam_h,pstr->pam_l,pstr->pam_xx, pstr->pam_xm, (pstr->ext_sq_set) ? "xS":"\0", pstr->param_u.fa.ktup, pstr->param_u.fa.cgap, pstr->param_u.fa.optcut, pstr->gdelval, pstr->ggapval, pstr->gshift,pstr->param_u.fa.optwid); if (pstr->param_u.fa.iniflag) strcat(pstring1," init1"); /* if (pstr->zsflag==0) strcat(pstring1," not-scaled"); else if (pstr->zsflag==1) strcat(pstring1," reg.-scaled"); */ if (pstring2 != NULL) { #ifdef OLD_FASTA_GAP sprintf (pstring2, "; pg_name: %s\n; pg_ver: %s\n; pg_matrix: %s (%d:%d)%s\n\ ; pg_gap-pen: %d %d\n; pg_ktup: %d\n; pg_optcut: %d\n; pg_cgap: %d\n", #else sprintf (pstring2, "; pg_name: %s\n; pg_ver: %s\n; pg_matrix: %s (%d:%d)%s\n\ ; pg_open_ext: %d %d\n; pg_ktup: %d\n; pg_optcut: %d\n; pg_cgap: %d\n", #endif pg_str,verstr,pstr->pamfile, pstr->pam_h,pstr->pam_l, (pstr->ext_sq_set) ? "xS":"\0", pstr->gdelval, pstr->ggapval,pstr->param_u.fa.ktup,pstr->param_u.fa.optcut, pstr->param_u.fa.cgap); } } void close_work (const unsigned char *aa0, int n0, struct pstruct *ppst, struct f_struct **f_arg) { struct f_struct *f_str; f_str = *f_arg; if (f_str != NULL) { free(f_str->cur); #ifndef TFAST f_str->aa0y--; free(f_str->aa0y); f_str->aa0x--; free(f_str->aa0x); #else f_str->aa1y--; free(f_str->aa1y); f_str->aa1x--; free(f_str->aa1x); #endif free(f_str->res); free(f_str->waa1); free(f_str->waa0); free(f_str->diag); free(f_str->link); free(f_str->pamh2); free(f_str->pamh1); free(f_str->harr); free(f_str); *f_arg = NULL; } } void do_fastx (const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, struct pstruct *ppst, struct f_struct *f_str, struct rstruct *rst, int *hoff) { int nd; /* diagonal array size */ int lhval; int kfact; int i; int my_hoff; register struct dstruct *dptr; register int tscor; #ifndef ALLOCN0 register struct dstruct *diagp; #else register int dpos; int lposn0; #endif struct dstruct *dpmax; register int lpos; int tpos; struct savestr *vmptr; int scor, tmp; int im, ib, nsave; int ktup, kt1, *hsq, ip, lkt; #ifndef TFAST int n0x31, n0x32; n0x31 = (n0-2)/3; n0x32 = n0x31+1+(n0-n0x31-1)/2; #else const unsigned char *fs; unsigned char *fd; int n1x31, n1x32, last_n1, itemp; n1x31 = (n1-2)/3; n1x32 = n1x31+1+(n1-n1x31-1)/2; #endif if (ppst->ext_sq_set) { ip = 1; hsq = ppst->hsqx; } else { ip = 0; hsq = ppst->hsq; } ktup = ppst->param_u.fa.ktup; kt1 = ktup-1; if (n1 < ktup) { rst->score[0] = rst->score[1] = rst->score[2] = 0; return; } if (n0+n1+1 >= MAXDIAG) { fprintf(stderr,"n0,n1 too large: %d, %d\n",n0,n1); rst->score[0] = rst->score[1] = rst->score[2] = -1; return; } f_str->noff = n0 - 1; #ifdef ALLOCN0 nd = n0; #endif #ifndef ALLOCN0 nd = n0 + n1; #endif dpmax = &f_str->diag[nd]; for (dptr = &f_str->diag[f_str->ndo]; dptr < dpmax;) { dptr->stop = -1; dptr->dmax = NULL; dptr++->score = 0; } for (vmptr = f_str->vmax; vmptr < &f_str->vmax[MAXSAV]; vmptr++) vmptr->score = 0; f_str->lowmax = f_str->vmax; f_str->lowscor = 0; /* start hashing */ lhval = 0; lkt = kt1; for (lpos = 0; (lpos < lkt || hsq[aa1[lpos]]>=NMAP) && lpos=NMAP) { lhval = 0; lkt=lpos+ktup; continue; #ifdef ALLOCN0 /* reinitialize dptr */ dptr = &f_str->diag[lpos % nd]; dptr->stop = -1; dptr->dmax = NULL; dptr->score = 0; #endif } lhval = ((lhval & f_str->hmask) << f_str->kshft) + hsq[aa1[lpos]]; } #ifndef ALLOCN0 diagp = &f_str->diag[f_str->noff + lkt]; for (; lpos < n1; lpos++, diagp++) { /* if (hsq[aa1[lpos]]>=NMAP) {lhval = 0; continue;} */ if (hsq[aa1[lpos]]>=NMAP) { lpos++ ; diagp++; while (lpos < n1 && hsq[aa1[lpos]]>=NMAP) {lpos++; diagp++;} if (lpos >= n1) break; lhval = 0; } lhval = ((lhval & f_str->hmask) << f_str->kshft) + hsq[aa1[lpos]]; for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) { if ((tscor = (dptr = &diagp[-tpos])->stop) >= 0) { #else lposn0 = f_str->noff + lpos; for (; lpos < n1; lpos++, lposn0++) { if (hsq[aa1[lpos]]>=NMAP) {lhval = 0; goto loopl;} lhval = ((lhval & f_str->hmask) << f_str->kshft) + hsq[aa1[lpos]]; for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) { dpos = lposn0 - tpos; if ((tscor = (dptr = &f_str->diag[dpos % nd])->stop) >= 0) { #endif tscor += ktup; if ((tscor -= lpos) <= 0) { /* better to start over */ scor = dptr->score; if ((tscor += (kfact = f_str->pamh2[lhval])) < 0 && f_str->lowscor < scor) #ifdef ALLOCN0 savemax (dptr, dpos, f_str); #else savemax (dptr, f_str); #endif if ((tscor += scor) >= kfact) { dptr->score = tscor; dptr->stop = lpos; } else { dptr->score = kfact; dptr->start = (dptr->stop = lpos) - kt1; } } /* continue current run in diagonal */ else { dptr->score += f_str->pamh1[aa0[tpos]]; dptr->stop = lpos; } } else { dptr->score = f_str->pamh2[lhval]; dptr->start = (dptr->stop = lpos) - kt1; } } /* end tpos */ #ifdef ALLOCN0 /* reinitialize diag structure */ loopl: if ((dptr = &f_str->diag[lpos % nd])->score > f_str->lowscor) { savemax (dptr, lpos, f_str); } dptr->stop = -1; dptr->dmax = NULL; dptr->score = 0; #endif } /* end lpos */ #ifdef ALLOCN0 for (tpos = 0, dpos = f_str->noff + n1 - 1; tpos < n0; tpos++, dpos--) { if ((dptr = &f_str->diag[dpos % nd])->score > f_str->lowscor) savemax (dptr, dpos, f_str); } #else for (dptr = f_str->diag; dptr < dpmax;) { if (dptr->score > f_str->lowscor) savemax (dptr, f_str); dptr->stop = -1; dptr->dmax = NULL; dptr++->score = 0; } f_str->ndo = nd; #endif /* at this point all of the elements of aa1[lpos] have been searched for elements of aa0[tpos] with the results in diag[dpos] */ for (nsave = 0, vmptr = f_str->vmax; vmptr < &f_str->vmax[MAXSAV]; vmptr++) { /* fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n", f_str->noff+vmptr->start-vmptr->dp, f_str->noff+vmptr->stop-vmptr->dp, vmptr->start,vmptr->stop, vmptr->dp,vmptr->score); */ if (vmptr->score > 0) { vmptr->score = spam (aa0, aa1, vmptr, ppst->pam2[ip], f_str); f_str->vptr[nsave++] = vmptr; } } if (nsave <= 0) { rst->score[0] = rst->score[1] = rst->score[2] = 0; return; } #ifndef TFAST /* FASTX code here to modify the start, stop points for the three phases of the translated protein sequence */ /* fprintf(stderr,"n0x: %d; n0x31:%d; n0x32: %d\n",n0,n0x31,n0x32); for (ib=0; ibnoff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp, f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp, f_str->vptr[ib]->start,f_str->vptr[ib]->stop, f_str->vptr[ib]->dp,f_str->vptr[ib]->score); } fprintf(stderr,"---\n"); */ for (ib=0; ibnoff-f_str->vptr[ib]->dp+f_str->vptr[ib]->start >= n0x32) f_str->vptr[ib]->dp += n0x32; if (f_str->noff-f_str->vptr[ib]->dp +f_str->vptr[ib]->start >= n0x31) f_str->vptr[ib]->dp += n0x31; } /* for (ib=0; ibnoff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp, f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp, f_str->vptr[ib]->start,f_str->vptr[ib]->stop, f_str->vptr[ib]->dp,f_str->vptr[ib]->score); } */ #else /* TFASTX code here to modify the start, stop points for the three phases of the translated protein sequence TFASTX modifies library start points, rather than query start points */ /* fprintf(stderr,"n0: %d; noff: %d; n1: %d; n1x31: %d n1x32 %d\n",n0, f_str->noff,n1,n1x31,n1x32); for (ib=0; ibnoff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp, f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp, f_str->vptr[ib]->start,f_str->vptr[ib]->stop, f_str->vptr[ib]->dp,f_str->vptr[ib]->score); } fprintf(stderr,"---\n"); */ for (ib=0; ibvptr[ib]->start >= n1x32) { f_str->vptr[ib]->start -= n1x32; f_str->vptr[ib]->stop -= n1x32; f_str->vptr[ib]->dp -= n1x32; } if (f_str->vptr[ib]->start >= n1x31) { f_str->vptr[ib]->start -= n1x31; f_str->vptr[ib]->stop -= n1x31; f_str->vptr[ib]->dp -= n1x31; } } /* for (ib=0; ibnoff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp, f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp, f_str->vptr[ib]->start,f_str->vptr[ib]->stop, f_str->vptr[ib]->dp,f_str->vptr[ib]->score); } */ #endif /* TFASTX */ scor = sconn (f_str->vptr, nsave, ppst->param_u.fa.cgap, ppst->param_u.fa.pgap, f_str); for (vmptr=f_str->vptr[0],ib=1; ibvptr[ib]->score > vmptr->score) vmptr=f_str->vptr[ib]; /* kssort (f_str->vptr, nsave); */ rst->score[1] = vmptr->score; /* best single score - init1*/ rst->score[0] = max (scor, vmptr->score); /* initn */ rst->score[2] = rst->score[0]; /* initn */ my_hoff=f_str->noff - vmptr->dp; /* if (n1 > 5000) { fprintf(stderr," Long n1: %d\n",n1); } */ if (ppst->param_u.fa.optflag) { if (rst->score[0] > ppst->param_u.fa.optcut) { #ifndef TFAST rst->score[2] = dmatchx(aa0, n0,aa1,n1,my_hoff, ppst->param_u.fa.optwid, ppst->pam2[ip], ppst->gdelval,ppst->ggapval,ppst->gshift,f_str); #else /* TFASTX */ /* generate f_str->aa1y */ /* for (i=0; isq[aa1[i]],stderr); if (i%60==59) fputc('\n',stderr); } fprintf(stderr,"\n-----\n"); */ for (fs=aa1,itemp=0; itemp <3; itemp++,fs++) { for (fd= &f_str->aa1y[itemp]; *fs!=EOSEQ; fd += 3, fs++) *fd = *fs; *fd=EOSEQ; } /* for (i=0; isq[f_str->aa1y[i]],stderr); if (i%60==59) fputc('\n',stderr); } */ rst->score[2] = dmatchx(aa0, n0, aa1, n1, my_hoff=vmptr->dp-f_str->noff, ppst->param_u.fa.optwid, ppst->pam2[ip], ppst->gdelval,ppst->ggapval,ppst->gshift,f_str); #endif /* TFASTX */ } } *hoff = my_hoff; } /* returns rst.score[0] - initn rst.score[1] - init1 rst.score[2] - opt */ void do_work (const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int frame, struct pstruct *ppst, struct f_struct *f_str, int qr_flg, struct rstruct *rst) { int hoff; int last_n1, itx, itt, n10, i; #ifdef TFAST unsigned char *aa1x; /* aa0 has a protein sequence */ /* aa1 has a raw DNA sequence */ itt = frame; last_n1 = 0; aa1x = f_str->aa1x; for (itx= itt*3; itx< itt*3+3; itx++) { n10 = saatran(aa1,&aa1x[last_n1],n1,itx); /* fprintf(stderr," itt %d itx: %d\n",itt,itx); for (i=0; iaa1x[last_n1+i]]); if ((i%60)==59) fprintf(stderr,"\n"); } fprintf(stderr,"\n"); */ last_n1 += n10+1; } n10 = last_n1-1; #endif rst->score[0] = rst->score[1] = rst->score[2] = 0; rst->escore = 1.0; rst->segnum = rst->seglen = 1; #ifndef TFAST do_fastx (f_str->aa0x, n0, aa1, n1, ppst, f_str, rst, &hoff); #else /* tfastx */ do_fastx (aa0, n0, f_str->aa1x, n10, ppst, f_str, rst, &hoff); #endif } void do_opt (const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int frame, struct pstruct *ppst, struct f_struct *f_str, struct rstruct *rst) { int optflag, tscore, hoff; optflag = ppst->param_u.fa.optflag; ppst->param_u.fa.optflag = 1; #ifndef TFAST do_fastx (f_str->aa0x, n0, aa1, n1, ppst, f_str, rst, &hoff); #else do_fastx (aa0, n0, aa1, n1, ppst, f_str, rst, &hoff); #endif ppst->param_u.fa.optflag = optflag; } #ifdef ALLOCN0 void savemax (dptr, dpos, f_str) register struct dstruct *dptr; int dpos; struct f_struct *f_str; { register struct savestr *vmptr; register int i; #else void savemax (dptr, f_str) register struct dstruct *dptr; struct f_struct *f_str; { register int dpos; register struct savestr *vmptr; register int i; dpos = (int) (dptr - f_str->diag); #endif /* check to see if this is the continuation of a run that is already saved */ if ((vmptr = dptr->dmax) != NULL && vmptr->dp == dpos && vmptr->start == dptr->start) { vmptr->stop = dptr->stop; if ((i = dptr->score) <= vmptr->score) return; vmptr->score = i; if (vmptr != f_str->lowmax) return; } else { i = f_str->lowmax->score = dptr->score; f_str->lowmax->dp = dpos; f_str->lowmax->start = dptr->start; f_str->lowmax->stop = dptr->stop; dptr->dmax = f_str->lowmax; } for (vmptr = f_str->vmax; vmptr < &f_str->vmax[MAXSAV]; vmptr++) if (vmptr->score < i) { i = vmptr->score; f_str->lowmax = vmptr; } f_str->lowscor = i; } int spam (const unsigned char *aa0, const unsigned char *aa1, struct savestr *dmax, int **pam2, struct f_struct *f_str) { int lpos; int tot, mtot; struct { int start, stop, score; } curv, maxv; const unsigned char *aa0p, *aa1p; aa1p = &aa1[lpos = dmax->start]; aa0p = &aa0[lpos - dmax->dp + f_str->noff]; curv.start = lpos; tot = curv.score = maxv.score = 0; for (; lpos <= dmax->stop; lpos++) { tot += pam2[*aa0p++][*aa1p++]; if (tot > curv.score) { curv.stop = lpos; curv.score = tot; } else if (tot < 0) { if (curv.score > maxv.score) { maxv.start = curv.start; maxv.stop = curv.stop; maxv.score = curv.score; } tot = curv.score = 0; curv.start = lpos+1; } } if (curv.score > maxv.score) { maxv.start = curv.start; maxv.stop = curv.stop; maxv.score = curv.score; } /* if (maxv.start != dmax->start || maxv.stop != dmax->stop) printf(" new region: %3d %3d %3d %3d\n",maxv.start, dmax->start,maxv.stop,dmax->stop); */ dmax->start = maxv.start; dmax->stop = maxv.stop; return maxv.score; } #define XFACT 10 int sconn (struct savestr **v, int n, int cgap, int pgap, struct f_struct *f_str) { int i, si; struct slink { int score; struct savestr *vp; struct slink *next; } *start, *sl, *sj, *so, sarr[MAXSAV]; int lstart, tstart, plstop, ptstop; /* sort the score left to right in lib pos */ kpsort (v, n); start = NULL; /* for the remaining runs, see if they fit */ for (i = 0, si = 0; i < n; i++) { /* if the score is less than the gap penalty, it never helps */ if (v[i]->score < cgap) continue; lstart = v[i]->start; tstart = lstart - v[i]->dp + f_str->noff; /* put the run in the group */ sarr[si].vp = v[i]; sarr[si].score = v[i]->score; sarr[si].next = NULL; /* if it fits, then increase the score */ for (sl = start; sl != NULL; sl = sl->next) { plstop = sl->vp->stop; ptstop = plstop - sl->vp->dp + f_str->noff; if (plstop < lstart+XFACT && ptstop < tstart+XFACT) { sarr[si].score = sl->score + v[i]->score + pgap; break; } } /* now recalculate where the score fits */ if (start == NULL) start = &sarr[si]; else for (sj = start, so = NULL; sj != NULL; sj = sj->next) { if (sarr[si].score > sj->score) { sarr[si].next = sj; if (so != NULL) so->next = &sarr[si]; else start = &sarr[si]; break; } so = sj; } si++; } if (start != NULL) return (start->score); else return (0); } void kssort (v, n) struct savestr *v[]; int n; { int gap, i, j; struct savestr *tmp; for (gap = n / 2; gap > 0; gap /= 2) for (i = gap; i < n; i++) for (j = i - gap; j >= 0; j -= gap) { if (v[j]->score >= v[j + gap]->score) break; tmp = v[j]; v[j] = v[j + gap]; v[j + gap] = tmp; } } void kpsort (v, n) struct savestr *v[]; int n; { int gap, i, j; struct savestr *tmp; for (gap = n / 2; gap > 0; gap /= 2) for (i = gap; i < n; i++) for (j = i - gap; j >= 0; j -= gap) { if (v[j]->start <= v[j + gap]->start) break; tmp = v[j]; v[j] = v[j + gap]; v[j + gap] = tmp; } } static int dmatchx(const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int hoff, int window, int **pam2, int gdelval, int ggapval, int gshift, struct f_struct *f_str) { hoff -= window/2; #ifndef TFAST return lx_band(aa1,n1,f_str->aa0y,n0, pam2, #ifdef OLD_FASTA_GAP -(gdelval-ggapval), #else -gdelval, #endif -ggapval,-gshift, hoff,window,f_str); #else return lx_band(aa0,n0,f_str->aa1y,n1, pam2, #ifdef OLD_FASTA_GAP -(gdelval-ggapval), #else -gdelval, #endif -ggapval,-gshift, hoff,window,f_str); #endif } static void init_row(struct sx_s *row, int sp) { int i; for (i = 0; i < sp; i++) { row[i].C1 = row[i].I1 = 0; row[i].C2 = row[i].I2 = 0; row[i].C3 = row[i].I3 = 0; row[i].flag = 0; } } int lx_band(const unsigned char *prot_seq, /* array with protein sequence numbers*/ int len_prot, /* length of prot. seq */ const unsigned char *dna_prot_seq, /* translated DNA sequence numbers*/ int len_dna_prot, /* length trans. seq. */ int **pam_matrix, /* scoring matrix */ int gopen, int gext, /* gap open, gap extend penalties */ int gshift, /* frame-shift penalty */ int start_diag, /* start diagonal of band */ int width, /* width for band alignment */ struct f_struct *f_str) { void *ckalloc(); int i, j, bd, bd1, x1, sp, p1=0, p2=0, end_prot; int sc, del, best = 0, cd,ci, e1, e2, e3, cd1, cd2, cd3, f, gg; register int *wt; const unsigned char *dp; register struct sx_s *ap, *aq; sp = width+7; gg = gopen+gext; /* sp = sp/3; */ if (f_str->cur == NULL) f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp); init_row(f_str->cur, sp); /* if (start_diag %3 !=0) start_diag = start_diag/3-1; else start_diag = start_diag/3; */ /* if (width % 3 != 0) width = width/3+1; else width = width /3; */ /* currently, this code assumes that the DNA sequence is longer than the protein sequence. This is not always true. len_prot in the loop below should be decreased to the projection of the DNA on the protein */ x1 = start_diag; /* x1 = lower bound of DNA */ end_prot = max(0,-width-start_diag) + (len_dna_prot+5)/3 + width; end_prot = min(end_prot,len_prot); /* i counts through protein sequence, x1 through DNAp */ for (i = max(0, -width-start_diag), x1+=i; i < end_prot; i++, x1++) { bd = min(x1+width, len_dna_prot/3); /* upper bound of band */ bd1 = max(0,x1); /* lower bound of band */ wt = pam_matrix[prot_seq[i]]; del = 1-x1; /*adjustment*/ bd += del; bd1 +=del; ap = &f_str->cur[bd1]; aq = ap+1; e1 = f_str->cur[bd1-1].C3; e2 = ap->C1; cd1 = cd2= cd3= 0; for (dp = &dna_prot_seq[(bd1-del)*3]; ap < &f_str->cur[bd]; ap++) { sc = max(max(e1, (e3=ap->C2))-gshift, e2)+wt[*dp++]; if (cd1 > sc) sc = cd1; cd1 -= gext; if ((ci = aq->I1) > 0) { if (sc < ci) { ap->C1 = ci; ap->I1 = ci-gext;} else { ap->C1 = sc; sc -= gg; if (sc > 0) { if (sc > best) best =sc; if (cd1 < sc) cd1 = sc; ap->I1 = max(ci-gext, sc); } else ap->I1 = ci-gext; } } else { if (sc <= 0) { ap->I1 = ap->C1 = 0; } else { ap->C1 = sc; sc-=gg; if (sc >0) { if (sc > best) best =sc; if (cd1 < sc) cd1 = sc; ap->I1 = sc; } else ap->I1 = 0; } } sc = max(max(e2, (e1=ap->C3))-gshift, e3)+wt[*dp++]; if (cd2 > sc) sc = cd2; cd2 -= gext; if ((ci = aq->I2) > 0) { if (sc < ci) { ap->C2 = ci; ap->I2 = ci-gext;} else { ap->C2 = sc; sc -= gg; if (sc > 0) { if (sc > best) best =sc; if (cd2 < sc) cd2 = sc; ap->I2 = max(ci-gext, sc); } } } else { if (sc <= 0) { ap->I2 = ap->C2 = 0; } else { ap->C2 = sc; sc-=gg; if (sc >0) { if (sc > best) best =sc; if (cd2 < sc) cd2 = sc; ap->I2 = sc; } else ap->I2 = 0; } } sc = max(max(e3, (e2=aq->C1))-gshift, e1)+wt[*dp++]; if (cd3 > sc) sc = cd3; cd3 -= gext; if ((ci = aq++->I3) > 0) { if (sc < ci) { ap->C3 = ci; ap->I3 = ci-gext;} else { ap->C3 = sc; sc -= gg; if (sc > 0) { if (sc > best) best =sc; if (cd3 < sc) cd3 = sc; ap->I3 = max(ci-gext, sc); } } } else { if (sc <= 0) { ap->I3 = ap->C3 = 0; } else { ap->C3 = sc; sc-=gg; if (sc >0) { if (sc > best) best =sc; if (cd3 < sc) cd3 = sc; ap->I3 = sc; } else ap->I3 = 0; } } } } /* printf("The best score is %d\n", best); */ return best+gopen+gext; } /* ckalloc - allocate space; check for success */ void *ckalloc(size_t amount) { void *p; if ((p = (void *)malloc( (size_t)amount)) == NULL) w_abort("Ran out of memory.",""); return(p); } /* calculate the 100% identical score */ int shscore(unsigned char *aa0, int n0, int **pam2) { int i, sum; for (i=0,sum=0; imin0 = x; /* prot */ a_res->max0 = ex; /* prot */ a_res->min1 = y; /* DNA-prot */ a_res->max1 = ey; /* DNA-prot */ align = global(x, y, ex, ey, pam_matrix, gopen, gex, gshift, dna_prot_seq, prot_seq, 0, 0, &up, &down, &tp); alignment = a_res->res; /* from earlier version */ /* alignment[0] = x; */ /* start of alignment in prot */ /* alignment[1] = y; */ /* start of alignment in DNA */ for (ap = align, i= 0; ap; i++) { if (i < max_res) {alignment[i] = ap->l;} aq = ap->next; free(ap); ap = aq; } if (i >= max_res) { fprintf(stderr," alignment truncated: %d/%d\n", max_res,i); } up = &up[-3]; down = &down[-3]; tp = &tp[-3]; free(up); free(tp); free(down); /* free(st_up); */ /* moved into local align */ a_res->nres = i; /* i has the length of the alignment */ return score; } static void swap(void **a, void **b) { void *t; t = *a; *a = *b; *b = t; } /* local alignment find the best local alignment x and y is the starting position of the best local alignment and ex ey is the ending position */ static int local_align(int *x, int *y, int *ex, int *ey, int **wgts, int gop, int gext, int shift, unsigned char *dnap, int ld, unsigned char *pro, int lp, st_ptr up, st_ptr down) { int i, j, score, x1,x2,x3,x4, e1, e2 = 0, e3, sc, del, e, best = 0, *wt, cd, ci; state_ptr cur_st, last_st, cur_i_st; st_ptr cur, last; unsigned char *dp; int *st_up, *cur_d_st; /* Array rowiC store the best scores of alignment ending at a position Arrays rowiD, and rowiI store the best scores of alignment ending at a position with a deletion or insrtion Arrays sti stores the starting position of the best alignment whose score stored in the corresponding row array. The program stores two rows to complete the computation, same is for the global alignment routine. */ /* for fastx (but not tfastx), this could be moved into init_work(), and done only once */ st_up = (int *) ckalloc(sizeof(int)*(ld+10)); init_row2(st_up, ld+5); ld += 2; init_ROW(up, ld+1); /* set to zero */ init_ROW(down, ld+1); /* set to zero */ cur = up+1; last = down+1; /* for fastx (but not tfastx), these could be moved into init_work(), and done only once */ cur_st = (state_ptr) ckalloc(sizeof(state)*(ld+1)); last_st = (state_ptr) ckalloc(sizeof(state)*(ld+1)); cur_i_st = (state_ptr) ckalloc(sizeof(state)*(ld+1)); cur_d_st = st_up; dp = dnap-2; for (i = 0; i < lp; i++) { wt = &wgts[pro[i]][0]; for (j = 0; j < 2; j++) { cur_st[j].i = i+1; cur_st[j].j = j+1; } for (j = 2; j < ld; j++) { score = wt[dp[j]]; del = -1; if (j >= 3) { sc = -score; e3 = e2-shift; e2 = last[j-3].C; e1 = last[j-2].C-shift; if (e1 > sc) {sc = e1; del = 2;} if (e2 > sc) {sc = e2; del = 3;} if (e3 > sc) {sc = e3; del = 4;} } else { sc = e2 = 0; if (sc < -score) sc=-score; else del = 3; } sc += score; if (sc < (ci=last[j].I)) { sc = ci; del = 0; } if (sc < (cd=cur[j].D)) { sc = cd; del = 5; } cur[j].C = sc; e = sc - gop; if (e > cd) { cur[j+3].D = e-gext; cur_d_st[j+3] = 3; } else { cur[j+3].D = cd-gext; cur_d_st[j+3] = cur_d_st[j]+3; } switch(del) { case 5: e1 = cur_d_st[j]; cur_st[j].i = cur_st[j-e1].i; cur_st[j].j = cur_st[j-e1].j; break; case 0: cur_st[j].i = cur_i_st[j].i; cur_st[j].j = cur_i_st[j].j; break; case 2: case 3: case 4: if (i) { if (j-del >= 0) { cur_st[j].i = last_st[j-del].i; cur_st[j].j = last_st[j-del].j; } else { cur_st[j].i = i; cur_st[j].j = 0; } } else { cur_st[j].i = 0; cur_st[j].j = max(0, j-del+1); } break; case -1: cur_st[j].i = i+1; cur_st[j].j = j+1; break; } if (e > ci) { cur[j].I = e -gext; cur_i_st[j].i = cur_st[j].i; cur_i_st[j].j = cur_st[j].j; } else { cur[j].I = ci- gext; } if (sc > best) { x1 = cur_st[j].i; x2 = cur_st[j].j; best =sc; x3 = i; x4 = j; } } swap((void **)&last, (void **)&cur); swap((void **)&cur_st, (void **)&last_st); } /* printf("The best score is %d\n", best); */ *x = x1; *y = x2; *ex = x3; *ey = x4; free(cur_st); free(last_st); free(cur_i_st); free(st_up); return best; } /* Both global_up and global_down do linear space score only global alignments on subsequence pro[x]...pro[ex], and dna[y]...dna[ey]. global_up do the algorithm upwards, from row x towards row y. global_down do the algorithm downwards, from row y towards x. */ static void global_up(st_ptr *row1, st_ptr *row2, int x, int y, int ex, int ey, int **wgts, int gop, int gext, int shift, unsigned char *dnap, unsigned char *pro, int N) { int i, j, k, sc, e, e1, e2, e3, t, ci, cd, score, *wt; st_ptr cur, last; cur = *row1; last = *row2; sc = -gop-gext; for (j = 1; j <= ey-y+1; j++) { if (j % 3 == 0) {last[j].C = sc; sc -= gext; last[j].I = sc-gop;} else { last[j].I = last[j].C = -10000;} cur[j].I = -10000; } last[0].C = 0; cur[0].D = cur[1].D = cur[2].D = -10000; last[0].D = last[1].D = last[2].D = -10000; if (N) last[0].I = -gext; else last[0].I = -gop-gext; for (i = 1; i <= ex-x+1; i++) { wt = &wgts[pro[i+x-1]][0]; e2 = last[0].C; e1 = -10000; for (j = 0; j <= ey-y+1; j++) { t = j+y; sc = -10000; if (t < 3) score = -10000; else score = wt[dnap[t-3]]; if (j < 4) { if (j == 3) sc = e2; else if (j == 2) sc = e2-shift; } else { e3 = e2; e2 = e1; e1 = last[j-2].C; sc = max(max(e1, e3)-shift, e2); } sc += score; sc = max(sc, max(ci=last[j].I, cd = cur[j].D)); cur[j].C = sc; cur[j+3].D = max(cd, sc-gop)-gext; cur[j].I = max(ci, sc-gop)-gext; } swap((void **)&last, (void **)&cur); } for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I; if (*row1 != last) swap((void **)row1, (void **)row2); } static void global_down(st_ptr *row1, st_ptr *row2, int x, int y, int ex, int ey, int **wgts, int gop, int gext, int shift, unsigned char *dnap, unsigned char *pro, int N) { int i, j, k, sc, del, *tmp, e, t, e1,e2,e3, ci,cd, s1, s2, s3, *wt; st_ptr cur, last; cur = (*row1); last = *row2; sc = -gop-gext; for (j = ey-y; j >= 0; j--) { if ((ey-y+1-j) % 3) {last[j].C = sc; sc-=gext; last[j].I = sc-gop;} else last[j].I = last[j].C = -10000; } last[ey-y+1].C = 0; cur[ey-y+1].D = cur[ey-y].D = cur[ey-y-1].D = -10000; last[ey-y+1].D = last[ey-y].D = last[ey-y-1].D = -10000; if (N) last[ey-y+1].I = -gext; else last[ey-y+1].I = -gop-gext; for (i = ex-x; i >= 0; i--) { wt = &wgts[pro[i+x]][0]; e2 = last[ey-y+1].C; e1 = s2 = s3 = -10000; for (j = ey-y+1; j >= 0; j--) { t = j+y; s1 = wt[dnap[t-1]]; sc = -10000; if (t+3 > ey) { if (t+2==ey) sc = e2+s2; else if (t+1==ey) sc = e2-shift+s1; } else { e3 = e2; e2 = e1; e1 = last[j+2].C; sc = max(max(e1+s1, e3+s3)-shift, e2+s2); } if (sc < (cd= cur[j].D)) { sc = cd; cur[j-3].D = cd-gext; } else cur[j-3].D =max(cd, sc-gop)-gext; if (sc < (ci= last[j].I)) { sc = ci; del = 0; cur[j].I = ci - gext; } else cur[j].I = max(sc-gop,ci)-gext; cur[j].C = sc; s3 = s2; s2 = s1; } swap((void **)&last, (void **)&cur); } for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I; if (*row1 != last) swap((void **)row1, (void **)row2); } static void init_row2(int *row, int ld) { int i; for (i = 0; i < ld; i++) row[i] = 0; } static void init_ROW(st_ptr row, int ld) { int i; for (i = 0; i < ld; i++) row[i].I = row[i].D = row[i].C = 0; } static match_ptr combine(match_ptr x1, match_ptr x2, int st) { match_ptr x; if (x1 == NULL) return x2; for (x = x1; x->next; x = x->next); x->next = x2; if (st) { for (x = x2; x; x = x->next) { x->j++; if (x->l == 3 || x->l == 4) break; } x->l--; } return x1; } /* global use the two upwards and downwards score only linear space global alignment subroutine to recursively build the alignment. */ match_ptr global(int x, int y, int ex, int ey, int **wgts, int gop, int gext, int shift, unsigned char *dnap, unsigned char *pro, int N1, int N2, st_ptr *up_stp, st_ptr *dn_stp, st_ptr *tp_stp ) { int m; int m1, m2; match_ptr x1, x2, mm1, mm2; /*printf("%d %d %d %d\n", x,y, ex, ey);*/ /* if the space required is limited, we can do a quadratic space algorithm to find the alignment. */ if (ex <= x) { mm1 = NULL; mm2= NULL; for (m = y+3; m <= ey; m+=3) { x1 = (match_ptr) ckalloc(sizeof(match_node)); x1->l = 5; x1->next = mm1; if (mm1== NULL) mm2 = x1; mm1 = x1; } if (ex == x) { if ((ey-y) % 3 != 0) { x1 = (match_ptr) ckalloc(sizeof(match_node)); x1->l = ((ey-y) % 3) +1; x1->next = NULL; if (mm2) mm2->next = x1; else mm1 = x1; } else { if (mm2) mm2->l = 4; } } return mm1; } if (ey <= y) { mm1 = NULL; for (m = x; m <= ex; m++) { x1 = (match_ptr) ckalloc(sizeof(match_node)); x1->l = 0; x1->next = mm1; mm1 = x1; } return mm1; } if (ex -x < SGW1-1 && ey-y < SGW2-1) return small_global(x,y,ex,ey, wgts, gop, gext, shift, dnap, pro, N1, N2); m = (x+ex)/2; /* Do the score only global alignment from row x to row m, m is the middle row of x and ex. Store the information of row m in upC, upD, and upI. */ global_up(up_stp, tp_stp, x, y, m, ey, wgts, gop, gext, shift, dnap, pro, N1); /* Do the score only global alignment downwards from row ex to row m+1, store information of row m+1 in downC downI and downD */ global_down(dn_stp, tp_stp, m+1, y, ex, ey, wgts, gop, gext, shift, dnap, pro, N2); /* Use these information of row m and m+1, to find the crossing point of the best alignment with the middle row. The crossing point is given by m1 and m2. Then we recursively call global itself to compute alignments in two smaller regions found by the crossing point and combine the two alignments to form a whole alignment. Return that alignment. */ if (find_best(*up_stp, *dn_stp, &m1, &m2, ey-y+1, y, gop)) { x1 = global(x, y, m, m1, wgts, gop, gext, shift, dnap, pro, N1, 0, up_stp, dn_stp, tp_stp); x2 = global(m+1, m2, ex, ey, wgts, gop, gext, shift, dnap, pro, 0, N2, up_stp, dn_stp, tp_stp); if (m1 == m2) x1 = combine(x1,x2,1); else x1 = combine(x1, x2,0); } else { x1 = global(x, y, m-1, m1, wgts, gop, gext, shift, dnap, pro, N1, 1, up_stp, dn_stp, tp_stp); x2 = global(m+2, m2, ex, ey, wgts, gop, gext, shift, dnap, pro, 1, N2, up_stp, dn_stp, tp_stp); mm1 = (match_ptr) ckalloc(sizeof(match_node)); mm1->i = m; mm1->l = 0; mm1->j = m1; mm2 = (match_ptr) ckalloc(sizeof(match_node)); mm2->i = m+1; mm2->l = 0; mm2->j = m1; mm1->next = mm2; mm2->next = x2; x1 = combine(x1, mm1, 0); } return x1; } static int find_best(st_ptr up, st_ptr down, int *m1, int *m2, int ld, int y, int gop) { int i, best = -100000, j = 0, s1, s2, s3, s4, st; up++; for (i = 1; i < ld; i++) { s2 = up[i-1].C + down[i].C; s4 = up[i-1].I + down[i].I + gop; if (best < s2) { best = s2; j = i; st = 1; } if (best < s4) { best = s4; j = i; st = 0; } } *m1 = j-1+y; *m2 = j+y; /*printf("find best score =%d\n", best);*/ return st; } /* An alignment is represented as a linked list whose element is of type match_node. Each element represent an edge in the path of the alignment graph. The fields of match_node are l --- gives the type of the edge. i, j --- give the end position. */ static match_ptr small_global(int x, int y, int ex, int ey, int **wgts, int gop, int gext, int shift, unsigned char *dnap, unsigned char *pro, int N1, int N2) { static int C[SGW1+1][SGW2+1], st[SGW1+1][SGW2+1], D[SGW2+7], I[SGW2+1]; int i, j, e, sc, score, del, k, t, *wt, ci, cd; int *cI, *cD, *cC, *lC, *cst, e2, e3, e4; match_ptr mp, first; /*printf("small_global %d %d %d %d\n", x, y, ex, ey);*/ sc = -gop-gext; C[0][0] = 0; if (N1) I[0] = -gext; else I[0] = sc; for (j = 1; j <= ey-y+1; j++) { if (j % 3== 0) { C[0][j] = sc; sc -= gext; I[j] = sc-gop; } else I[j] = C[0][j] = -10000; st[0][j] = 5; } lC = &C[0][0]; cD = D; D[0] = D[1] = D[2] = -10000; cI = I; for (i = 1; i <= ex-x+1; i++) { cC = &C[i][0]; wt = &wgts[pro[i+x-1]][0]; cst = &st[i][0]; for (j = 0; j <=ey-y+1; j++) { sc = -10000; del = 0; ci = cI[j]; cd= cD[j]; t = j+y; if (t < 3) score = -10000; else score = wt[dnap[t-3]]; if (j >= 4) { e2 = lC[j-2]-shift; sc = lC[j-3]; e4 = lC[j-4]-shift; del = 3; if (e2 > sc) { sc = e2; del = 2;} if (e4 >= sc) { sc = e4; del = 4;} } else { if (j ==3) {sc= lC[0]; del = 3;} else if (j == 2) {sc = lC[0]-shift; del = 2;} } sc = sc+score; if (sc < ci) { sc = ci; del = 0; } if (sc <= cd) { sc = cd; del = 5; } cC[j] = sc; sc -= gop; if (sc < cd) { del += 10; cD[j+3] = cd - gext; } else cD[j+3] = sc -gext; if (sc < ci) { del += 20; cI[j] = ci-gext; } else cI[j] = sc-gext; *(cst++) = del; } lC = cC; } if (N2 && ci +gop > cC[ey-y+1]) { st[ex-x+1][ey-y+1] = 0; /*printf("small score = %d\n", ci+gop);*/ } /*else printf("small score =%d\n", cC[ey-y+1]);*/ first = NULL; e = 1; for (i = ex+1, j = ey+1; i > x || j > y; i--) { mp = (match_ptr) ckalloc(sizeof(match_node)); mp->i = i-1; k = (t=st[i-x][j-y])%10; mp->j = j-1; if (e == 5 && (t/10)%2 == 1) k = 5; if (e == 0 && (t/20)== 1) k = 0; if (k == 5) { j -= 3; i++; e=5;} else {j -= k;if (k==0) e= 0; else e = 1;} mp->l = k; mp->next = first; first = mp; } /* for (i = 0; i <= ex-x; i++) { for (j = 0; j <= ey-y; j++) printf("%d ", C[i][j]); printf("\n"); } */ return first; } #define XTERNAL #include "upam.h" extern void display_alig(a, dna, pro,length, ld) int *a; unsigned char *dna, *pro; int length, ld; { int len = 0, i, j, x, y, lines, k; static char line1[100], line2[100], line3[100], tmp[10] = " "; unsigned char *dna1, c1, c2, c3, *st; dna1 = ckalloc((size_t)ld); for (st = dna, i = 0; i < ld; i++, st++) dna1[i] = aa[*st]; line1[0] = line2[0] = line3[0] = '\0'; x= a[0]; y = a[1]-1; for (len = 0, j = 2, lines = 0; j < length; j++) { i = a[j]; /*printf("%d %d %d\n", i, len, b->j);*/ if (i > 0 && i < 5) tmp[i-2] = aa[pro[x++]]; if (i == 5) { i = 3; tmp[0] = tmp[1] = tmp[2] = '-'; if (a[j+1] == 2) tmp[2] = ' '; } if (i > 0) { strncpy(&line1[len], (const char *)&dna1[y], i); y+=i; } else {line1[len] = '-'; i = 1; tmp[0] = aa[pro[x++]];} strncpy(&line2[len], tmp, i); for (k = 0; k < i; k++) { if (tmp[k] != ' ' && tmp[k] != '-') { if (k == 2) tmp[k] = '\\'; else if (k == 1) tmp[k] = '|'; else tmp[k] = '/'; } else tmp[k] = ' '; } if (i == 1) tmp[0] = ' '; strncpy(&line3[len], tmp, i); tmp[0] = tmp[1] = tmp[2] = ' '; len += i; line1[len] = line2[len] =line3[len] = '\0'; if (len >= WIDTH) { printf("\n%5d", WIDTH*lines++); for (k = 10; k <= WIDTH; k+=10) printf(" . :"); if (k-5 < WIDTH) printf(" ."); c1 = line1[WIDTH]; c2 = line2[WIDTH]; c3 = line3[WIDTH]; line1[WIDTH] = line2[WIDTH] = line3[WIDTH] = '\0'; printf("\n %s\n %s\n %s\n", line1, line3, line2); line1[WIDTH] = c1; line2[WIDTH] = c2; line3[WIDTH] = c3; strncpy(line1, &line1[WIDTH], sizeof(line1)-1); strncpy(line2, &line2[WIDTH], sizeof(line2)-1); strncpy(line3, &line3[WIDTH], sizeof(line3)-1); len = len - WIDTH; } } printf("\n%5d", WIDTH*lines); for (k = 10; k < len; k+=10) printf(" . :"); if (k-5 < len) printf(" ."); printf("\n %s\n %s\n %s\n", line1, line3, line2); } /* alignment store the operation that align the protein and dna sequence. The code of the number in the array is as follows: 0: delete of an amino acid. 2: frame shift, 2 nucleotides match with an amino acid 3: match an amino acid with a codon 4: the other type of frame shift 5: delete of a codon Also the first two element of the array stores the starting point in the protein and dna sequences in the local alignment. Display looks like where WIDTH is assumed to be divisible by 10. 0 . : . : . : . : . : . : CCTATGATACTGGGATACTGGAACGTCCGCGGACTGACACACCCGATCCGCATGCTCCTG P M I L G Y W N V R G L T H P I R M L L 60 . : . : . : . : . : . : GAATACACAGACTCAAGCTATGATGAGAAGAGATACACCATGGGTGACGCTCCCGACTTT E Y T D S S Y D E K R Y T M G D A P D F */ /* fatal - print message and die */ void fatal(msg) char *msg; { fprintf(stderr, "%s\n", msg); exit(1); } int do_walign (const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int frame, struct pstruct *ppst, struct f_struct *f_str, struct a_res_str *a_res, int *have_ares) { int score; int i, last_n1, itemp, n10; int n_aa, n_nt, hoff, nt_min, nt_max, w_fact; unsigned char *fs, *fd; struct rstruct rst; int itx; #ifndef TFAST /* FASTX */ n_aa = n1; n_nt = n0; /* check for large differences in sequence length */ nt_min = 0; nt_max = n_nt; if (n_nt > 6 * n_aa) { /* find out where the diagonal is - get hoff hoff < 0 => seq0 is in the middle of seq1 */ do_fastx(f_str->aa0x, n0, aa1, n1, ppst, f_str, &rst, &hoff); if (rst.score[0] > 2 * rst.score[2]) {w_fact = 4;} else w_fact = 2; if (hoff > n_aa) { /* hoff > 0 => seq1 is in the middle of seq0 */ nt_min = max(0,(hoff-w_fact*n_aa)*3); nt_max = min((hoff+w_fact*n_aa)*3,n_nt); } else { nt_max = min(3*w_fact*n_aa,n_nt); } } a_res->res = f_str->res; score = pro_dna(aa1, n1, f_str->aa0y+nt_min, nt_max-nt_min, ppst->pam2[0], #ifdef OLD_FASTA_GAP -(ppst->gdelval - ppst->ggapval), #else -ppst->gdelval, #endif -ppst->ggapval, -ppst->gshift, f_str->max_res, a_res); /* correct for nt_min missing residues in alignment */ #else /* TFASTX */ /* for (i=0; isq[f_str->aa1x[i]],stderr); if (i%60==59) fputc('\n',stderr); } fprintf(stderr,"\n-----\n"); */ last_n1 = 0; for (itx=3*frame; itx<3+3*frame; itx++) { n10 = saatran(aa1,&f_str->aa1x[last_n1],n1,itx); /* for (i=0; iaa1x,itemp=0; itemp <3; itemp++,fs++) { for (fd= &f_str->aa1y[itemp]; *fs!=EOSEQ; fd += 3, fs++) *fd = *fs; *fd=EOSEQ; } /* for (i=0; isq[f_str->aa1y[i]],stderr); if (i%60==59) fputc('\n',stderr); } fprintf(stderr,"\n-----\n"); */ n_aa = n0; n_nt = n1; /* check for large differences in sequence length */ nt_min = 0; nt_max = n_nt; if (n_nt > 6 * n_aa) { /* find out where the diagonal is - get hoff hoff < 0 => seq0 is in the middle of seq1 */ do_fastx(aa0, n0, f_str->aa1x, n10, ppst, f_str, &rst, &hoff); if (rst.score[0] > 2 * rst.score[2]) {w_fact = 4;} else w_fact = 2; if ( hoff > n_aa) { /* hoff > 0 => seq1 is in the middle of seq0 */ nt_min = max(0,(hoff-w_fact*n_aa)*3); nt_max = min((hoff+w_fact*n_aa)*3,n_nt); } else { nt_max = min(3*w_fact*n_aa,n_nt); } } a_res->res = f_str->res; score = pro_dna(aa0, n0, f_str->aa1y+nt_min, nt_max-nt_min, ppst->pam2[0], #ifdef OLD_FASTA_GAP -(ppst->gdelval - ppst->ggapval), #else -ppst->gdelval, #endif -ppst->ggapval, -ppst->gshift, f_str->max_res, a_res); #endif /* TFASTX */ /* pro_dna always compares protein to DNA, and returns protein coordinates in a_res->min0,max0 */ a_res->min1 += nt_min; a_res->max1 += nt_min; /* display_alig(f_str->res,f_str->aa0y,aa1,*nres,n0); */ *have_ares = 1; return score; } /* aln_func_vals - set up aln.qlfact, qlrev, llfact, llmult, frame, llrev */ /* call from calcons, calc_id, calc_code */ void aln_func_vals(int frame, struct a_struct *aln) { #ifndef TFAST aln->llrev = 0; aln->llfact = 1; aln->llmult = 1; aln->qlfact = 3; aln->frame = 0; if (frame > 0) aln->qlrev = 1; else aln->qlrev = 0; #else /* TFASTX */ aln->qlfact = 1; aln->qlrev = 0; aln->llfact = 3; aln->llmult = 1; aln->frame = 0; if (frame > 0) aln->llrev = 1; else aln->llrev = 0; #endif /* TFASTX */ } /* this function is required for programs like tfastx/y/s that do translations on DNA sequences and save them in f_str->aa1?? */ void pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) { #ifdef TFAST int i, last_n1, itemp, n10; unsigned char *fs, *fd; int itx; last_n1 = 0; for (itx=3*frame; itx<3+3*frame; itx++) { n10 = saatran(aa1,&f_str->aa1x[last_n1],n1,itx); /* for (i=0; iaa1x,itemp=0; itemp <3; itemp++,fs++) { for (fd= &f_str->aa1y[itemp]; *fs!=EOSEQ; fd += 3, fs++) *fd = *fs; *fd=EOSEQ; } #endif } /* Alignment: store the operation that align the protein and dna sequence. The code of the number in the array is as follows: 0: delete of an amino acid. 2: frame shift, 2 nucleotides match with an amino acid 3: match an amino acid with a codon 4: the other type of frame shift 5: delete of a codon The first two elements of the array stores the starting point in the protein and dna sequences in the local alignment. */ #include "a_mark.h" int calcons(const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int *nc, struct a_struct *aln, struct a_res_str a_res, struct pstruct pst, char *seqc0, char *seqc1, char *seqca, struct f_struct *f_str) { int i0, i1, i, j; int lenc, not_c, itmp, ngap_p, ngap_d, nfs; char *sp0, *sp1, *spa, *sq; const unsigned char *ap0, *ap1; int *rp, *rpmax; if (pst.ext_sq_set) {sq = pst.sqx;} else {sq = pst.sq;} #ifndef TFAST /* FASTX */ aln->amin1 = aln->smin1 = a_res.min0; /* prot */ aln->amin0 = aln->smin0 = a_res.min1; /* DNA */ ap0 = f_str->aa0y; /* translated DNA */ ap1 = aa1; /* protein */ sp0 = seqc0; sp1 = seqc1; #else /* TFASTX */ aln->amin0 = aln->smin0 = a_res.min0; /* DNA */ aln->amin1 = aln->smin1 = a_res.min1; /* prot */ ap1 = aa0; /* protein */ ap0 = f_str->aa1y; /* translated DNA */ sp1 = seqc0; sp0 = seqc1; #endif rp = a_res.res; rpmax = rp+a_res.nres; spa = seqca; lenc = not_c = aln->nident = aln->nsim = ngap_p = ngap_d = nfs= 0; i0 = a_res.min1; i1 = a_res.min0; while (rp < rpmax) { /* fprintf(stderr,"%d %d %d (%c) %d (%c)\n" ,(int)(rp-res),*rp,i0,sq[ap0[i0]],i1,sq[ap1[i1]]); */ switch (*rp++) { case 0: /* aa insertion */ *sp0++ = '-'; *sp1++ = sq[ap1[i1++]]; *spa++ = M_DEL; lenc++; ngap_d++; break; case 2: /* -1 frameshift */ nfs++; *sp0++ = '/'; i0 -= 1; *sp1++ = '-'; *spa++ = M_DEL; not_c++; if ((itmp=pst.pam2[0][ap0[i0]][ap1[i1]])<0) { *spa = M_NEG; } else if (itmp == 0) { *spa = M_ZERO;} else {*spa = M_POS;} if (*spa == M_POS || *spa == M_ZERO) { aln->nsim++;} *sp0 = sq[ap0[i0]]; i0 += 3; *sp1 = sq[ap1[i1++]]; if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;} sp0++; sp1++; spa++; lenc++; break; case 3: /* codon/aa match */ if ((itmp=pst.pam2[0][ap0[i0]][ap1[i1]])<0) { *spa = M_NEG; } else if (itmp == 0) { *spa = M_ZERO;} else {*spa = M_POS;} if (*spa == M_POS || *spa == M_ZERO) { aln->nsim++;} *sp0 = sq[ap0[i0]]; i0 += 3; *sp1 = sq[ap1[i1++]]; if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;} sp0++; sp1++; spa++; lenc++; break; case 4: /* +1 frameshift */ nfs++; *sp0++ = '\\'; i0 += 1; *sp1++ = '-'; *spa++ = M_DEL; not_c++; if ((itmp=pst.pam2[0][ap0[i0]][ap1[i1]])<0) { *spa = M_NEG; } else if (itmp == 0) { *spa = M_ZERO;} else {*spa = M_POS;} if (*spa == M_POS || *spa == M_ZERO) { aln->nsim++;} *sp0 = sq[ap0[i0]]; i0 += 3; *sp1 = sq[ap1[i1++]]; if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;} sp0++; sp1++; spa++; lenc++; break; case 5: /* codon insertion */ *sp0++ = sq[ap0[i0]]; i0 += 3; *sp1++ = '-'; *spa++ = M_DEL; lenc++; ngap_p++; break; } } *spa = '\0'; #ifndef TFAST /* FASTX */ aln->amax0 = i0; aln->amax1 = i1; aln->ngap_q = ngap_d; aln->ngap_l = ngap_p; #else aln->amax1 = i0; aln->amax0 = i1; aln->amin1 = aln->smin1; aln->amin0 = aln->smin0; aln->ngap_q = ngap_p; aln->ngap_l = ngap_d; #endif aln->nfs = nfs; if (lenc < 0) lenc = 1; *nc = lenc; /* now we have the middle, get the right end */ return lenc+not_c; } int calcons_a(const unsigned char *aa0, unsigned char *aa0a, int n0, const unsigned char *aa1, int n1, int *nc, struct a_struct *aln, struct a_res_str a_res, struct pstruct pst, char *seqc0, char *seqc0a, char *seqc1, char *seqca, char *ann_arr, struct f_struct *f_str) { int i0, i1, i, j; int lenc, not_c, itmp, ngap_p, ngap_d, nfs; char *sp0, *sp0a, *sp1, *spa, *sq; const unsigned char *ap0, *ap1; int *rp, *rpmax; if (pst.ext_sq_set) {sq = pst.sqx;} else {sq = pst.sq;} #ifndef TFAST /* FASTX */ aln->amin1 = aln->smin1 = a_res.min0; /* prot */ aln->amin0 = aln->smin0 = a_res.min1; /* DNA */ ap0 = f_str->aa0y; /* translated DNA */ ap1 = aa1; /* protein */ #else /* TFASTX */ aln->amin0 = aln->smin0 = a_res.min0; /* DNA */ aln->amin1 = aln->smin1 = a_res.min1; /* prot */ ap1 = aa0; ap0 = f_str->aa1y; #endif rp = a_res.res; rpmax = &a_res.res[a_res.nres]; #ifndef TFAST sp0 = seqc0; sp1 = seqc1; #else sp1 = seqc0; sp0 = seqc1; #endif spa = seqca; sp0a = seqc0a; lenc = not_c = aln->nident = aln->nsim = ngap_p = ngap_d = nfs= 0; i0 = a_res.min1; i1 = a_res.min0; while (rp < rpmax) { /* fprintf(stderr,"%d %d %d (%c) %d (%c)\n" ,(int)(rp-res),*rp,i0,sq[ap0[i0]],i1,sq[ap1[i1]]); */ switch (*rp++) { case 0: /* aa insertion */ *sp0++ = '-'; *sp1++ = sq[ap1[i1++]]; *spa++ = M_DEL; *sp0a++ = ' '; lenc++; ngap_d++; break; case 2: /* -1 frameshift */ nfs++; *sp0++ = '/'; i0 -= 1; *sp1++ = '-'; *spa++ = M_DEL; *sp0a++ = ' '; not_c++; if ((itmp=pst.pam2[0][ap0[i0]][ap1[i1]])<0) { *spa = M_NEG; } else if (itmp == 0) { *spa = M_ZERO;} else {*spa = M_POS;} if (*spa == M_POS || *spa == M_ZERO) { aln->nsim++;} #ifndef TFAST *sp0a++ = ' '; #else *sp0a++ = ann_arr[aa0a[i1]]; #endif *sp0 = sq[ap0[i0]]; i0 += 3; *sp1 = sq[ap1[i1++]]; if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;} sp0++; sp1++; spa++; lenc++; break; case 3: /* codon/aa match */ if ((itmp=pst.pam2[0][ap0[i0]][ap1[i1]])<0) { *spa = M_NEG; } else if (itmp == 0) { *spa = M_ZERO;} else {*spa = M_POS;} if (*spa == M_POS || *spa == M_ZERO) { aln->nsim++;} #ifndef TFAST *sp0a++ = ' '; #else *sp0a++ = ann_arr[aa0a[i1]]; #endif *sp0 = sq[ap0[i0]]; i0 += 3; *sp1 = sq[ap1[i1++]]; if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;} sp0++; sp1++; spa++; lenc++; break; case 4: /* +1 frameshift */ nfs++; *sp0a++ = ' '; *sp0++ = '\\'; i0 += 1; *sp1++ = '-'; *spa++ = M_DEL; not_c++; if ((itmp=pst.pam2[0][ap0[i0]][ap1[i1]])<0) { *spa = M_NEG; } else if (itmp == 0) { *spa = M_ZERO;} else {*spa = M_POS;} if (*spa == M_POS || *spa == M_ZERO) { aln->nsim++;} *sp0 = sq[ap0[i0]]; i0 += 3; *sp1 = sq[ap1[i1++]]; if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;} sp0++; sp1++; spa++; lenc++; break; case 5: /* codon insertion */ *sp0a++ = ' '; *sp0++ = sq[ap0[i0]]; i0 += 3; *sp1++ = '-'; *spa++ = M_DEL; lenc++; ngap_p++; break; } } *sp0a = *spa = '\0'; #ifndef TFAST aln->amax0 = i0; aln->amax1 = i1; aln->ngap_q = ngap_d; aln->ngap_l = ngap_p; #else aln->amax1 = i0; aln->amax0 = i1; aln->ngap_q = ngap_p; aln->ngap_l = ngap_d; #endif aln->nfs = nfs; if (lenc < 0) lenc = 1; *nc = lenc; /* now we have the middle, get the right end */ return lenc+not_c; } /* build an array of match/ins/del - length strings */ int calc_code(const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, struct a_struct *aln, struct a_res_str a_res, struct pstruct pst, char *al_str, int al_str_n, struct f_struct *f_str) { int i0, i1, i, j; int lenc, not_c, itmp, ngap_p, ngap_d, nfs; char op_char[10]; int op, op_cnt; char sp0, sp1, *sq; const unsigned char *ap0, *ap1; int *rp, *rpmax; if (pst.ext_sq_set) {sq = pst.sqx;} else {sq = pst.sq;} #ifndef TFAST /* FASTX */ strncpy(op_char,"- /=\\+*",sizeof(op_char)); aln->amin1 = aln->smin1 = a_res.min0; /* prot */ aln->amin0 = aln->smin0 = a_res.min1; /* DNA */ ap0 = f_str->aa0y; ap1 = aa1; #else /* TFASTX */ strncpy(op_char,"+ /=\\-*",sizeof(op_char)); aln->amin0 = aln->smin0 = a_res.min0; /* DNA */ aln->amin1 = aln->smin1 = a_res.min1; /* prot */ ap1 = aa0; ap0 = f_str->aa1y; #endif rp = a_res.res; rpmax = &a_res.res[a_res.nres]; op_cnt = lenc = not_c = aln->nident = aln->nsim = ngap_p = ngap_d = nfs = 0; op = 3; /* code for a match - all alignments start with a match */ i0 = a_res.min1; i1 = a_res.min0; while (rp < rpmax) { switch (*rp++) { case 0: /* aa insertion */ if (op == 0) op_cnt++; else { update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char); op = 0; op_cnt = 1; } i1++; lenc++; ngap_d++; break; case 2: /* -1 frameshift */ if (pst.pam2[0][ap0[i0]][ap1[i1]]>=0) { aln->nsim++;} update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char); op = 2; op_cnt = 1; update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char); op = 3; op_cnt = 1; nfs++; i0 -= 1; not_c++; sp0 = sq[ap0[i0]]; i0 += 3; sp1 = sq[ap1[i1++]]; if (toupper(sp0) == toupper(sp1)) aln->nident++; lenc++; break; case 3: /* codon/aa match */ if (pst.pam2[0][ap0[i0]][ap1[i1]]>=0) { aln->nsim++;} sp0 = sq[ap0[i0]]; i0 += 3; sp1 = sq[ap1[i1++]]; if (toupper(sp0) == toupper(sp1)) aln->nident++; if (op == 3 || op == 6) { if (sp0 != '*' && sp1 != '*') { if (op == 6 ) { update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char); op_cnt = 1; op = 3; } else {op_cnt++;} } else { update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char); op_cnt = 1; op = 6; } } else { update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char); if (op == 2 || op == 4) op_cnt = 2; else op_cnt = 1; op = 3; } lenc++; break; case 4: /* +1 frameshift */ update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char); op = 4; op_cnt = 1; update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char); op = 3; op_cnt = 1; nfs++; i0 += 1; not_c++; if (pst.pam2[0][ap0[i0]][ap1[i1]]>=0) { aln->nsim++;} sp0 = sq[ap0[i0]]; i0 += 3; sp1 = sq[ap1[i1++]]; if (toupper(sp0) == toupper(sp1)) aln->nident++; lenc++; break; case 5: /* codon insertion */ if (op == 5) op_cnt++; else { update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char); op = 5; op_cnt = 1; } i0 += 3; lenc++; ngap_p++; break; } } update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char); #ifndef TFAST aln->amax0 = i0; aln->amax1 = i1; aln->ngap_q = ngap_d; aln->ngap_l = ngap_p; #else aln->amax1 = i0; aln->amax0 = i1; aln->ngap_q = ngap_p; aln->ngap_l = ngap_d; #endif aln->nfs = nfs; if (lenc < 0) lenc = 1; return lenc; } static void update_code(char *al_str, int al_str_max, int op, int op_cnt, char *op_char) { char tmp_cnt[20]; sprintf(tmp_cnt,"%c%d",op_char[op],op_cnt); strncat(al_str,tmp_cnt,al_str_max-1); al_str[al_str_max-1]='\0'; } int calc_id(const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, struct a_struct *aln, struct a_res_str a_res, struct pstruct pst, struct f_struct *f_str) { int i0, i1, i, j; int lenc, not_c, itmp, ngap_p, ngap_d, nfs; char sp0, sp1, *sq; const unsigned char *ap0, *ap1; int *rp, *rpmax; if (pst.ext_sq_set) {sq = pst.sqx;} else {sq = pst.sq;} #ifndef TFAST /* FASTX */ aln->amin1 = aln->smin1 = a_res.min0; /* prot */ aln->amin0 = aln->smin0 = a_res.min1; /* DNA */ ap0 = f_str->aa0y; ap1 = aa1; #else /* TFASTX */ aln->amin0 = aln->smin0 = a_res.min0; /* DNA */ aln->amin1 = aln->smin1 = a_res.min1; /* prot */ ap1 = aa0; ap0 = f_str->aa1y; #endif rp = a_res.res; rpmax = &a_res.res[a_res.nres]; lenc = not_c = aln->nident = aln->nsim = ngap_p = ngap_d = nfs = 0; i0 = a_res.min1; i1 = a_res.min0; while (rp < rpmax) { /* fprintf(stderr,"%d %d %d (%c) %d (%c)\n" ,(int)(rp-res),*rp,i0,sq[ap0[i0]],i1,sq[ap1[i1]]); */ switch (*rp++) { case 0: /* aa insertion */ i1++; lenc++; ngap_d++; break; case 2: /* -1 frameshift */ nfs++; i0 -= 1; not_c++; if (pst.pam2[0][ap0[i0]][ap1[i1]]>=0) { aln->nsim++;} sp0 = sq[ap0[i0]]; i0 += 3; sp1 = sq[ap1[i1++]]; if (toupper(sp0) == toupper(sp1)) aln->nident++; lenc++; break; case 3: /* codon/aa match */ if (pst.pam2[0][ap0[i0]][ap1[i1]]>=0) { aln->nsim++;} sp0 = sq[ap0[i0]]; i0 += 3; sp1 = sq[ap1[i1++]]; if (toupper(sp0) == toupper(sp1)) aln->nident++; lenc++; break; case 4: /* +1 frameshift */ nfs++; i0 += 1; not_c++; if (pst.pam2[0][ap0[i0]][ap1[i1]]>=0) { aln->nsim++;} sp0 = sq[ap0[i0]]; i0 += 3; sp1 = sq[ap1[i1++]]; if (toupper(sp0) == toupper(sp1)) aln->nident++; lenc++; break; case 5: /* codon insertion */ i0 += 3; lenc++; ngap_p++; break; } } #ifndef TFAST aln->amax0 = i0; aln->amax1 = i1; aln->ngap_q = ngap_d; aln->ngap_l = ngap_p; #else aln->amax1 = i0; aln->amax0 = i1; aln->ngap_q = ngap_p; aln->ngap_l = ngap_d; #endif aln->nfs = nfs; if (lenc < 0) lenc = 1; /* now we have the middle, get the right end */ return lenc; } #ifdef PCOMPLIB #include "p_mw.h" void update_params(struct qmng_str *qm_msg, struct pstruct *ppst) { ppst->n0 = qm_msg->n0; } #endif