2 /* copyright (c) 1998, 1999 William R. Pearson and the U. of Virginia */
4 /* $Name: fa_34_26_5 $ - $Id: dropfz2.c,v 1.57 2007/04/26 18:37:19 wrp Exp $ */
6 /* 18-Sept-2006 - removed static global variables for alignment */
8 /* 2002/06/23 finally correctly implement fix to translate 'N' to 'X' */
10 /* 1999/11/29 modification by Z. Zhang to translate DNA 'N' as 'X' */
12 /* implements an improved version of the fasty algorithm, see:
14 W. R. Pearson, T. Wood, Z. Zhang, A W. Miller (1997) "Comparison of
15 DNA sequences with protein sequences" Genomics 46:24-36
32 /* globals for fasta */
40 static char *verstr="3.5 Sept 2006";
42 static char *verstr="3.5an0 Sept 2006";
45 struct dstruct /* diagonal structure for saving current run */
47 int score; /* hash score of current match */
48 int start; /* start of current match */
49 int stop; /* end of current match */
50 struct savestr *dmax; /* location in vmax[] where best score data saved */
55 int score; /* pam score with segment optimization */
56 int score0; /* pam score of best single segment */
57 int gscore; /* score from global match */
58 int dp; /* diagonal of match */
59 int start; /* start of match in lib seq */
60 int stop; /* end of match in lib seq */
66 struct sx_s {int C1, C2, C3, I1, I2, I3, flag; };
68 struct wgt { int iii, ii, iv;};
69 struct wgtc {char c2, c3, c4, c5;};
71 typedef struct st_s { int C, I, D;} *st_ptr;
75 struct savestr vmax[MAXSAV]; /* best matches saved for one sequence */
76 struct savestr *vptr[MAXSAV];
77 struct savestr *lowmax;
80 int hmask; /* hash constants */
81 int *pamh1; /* pam based array */
82 int *pamh2; /* pam based kfact array */
83 int *link, *harr; /* hash arrays */
84 int kshft; /* shift width */
85 int nsav, lowscor; /* number of saved runs, worst saved run */
87 unsigned char *aa0x, *aa0v; /* aa0x - 111122223333 */
89 unsigned char *aa1x, *aa1v; /* aa1x - 111122223333 */
90 #endif /* aa1v - computed codons */
94 struct wgtc **weight_c;
102 #include "drop_func.h"
104 static int dmatchx(const unsigned char *aa0, int n0,
105 const unsigned char *aa1, int n1,
106 int hoff, int window,
107 int **pam2, int gdelval, int ggapval, int gshift,
108 struct f_struct *f_str);
110 int shscore(unsigned char *aa0, int n0, int **pam2);
111 int saatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame);
112 int spam (const unsigned char *aa0, const unsigned char *aa1,
113 struct savestr *dmax, int **pam2,
114 struct f_struct *f_str);
115 int sconn (struct savestr **v, int n,int cgap, int pgap, struct f_struct *f_str);
116 int lx_band(const unsigned char *prot_seq, int len_prot,
117 const unsigned char *dna_prot_seq, int len_dna_prot,
118 int **pam_matrix, int gopen, int gext,
119 int gshift, int start_diag, int width, struct f_struct *f_str);
120 static void update_code(char *al_str, int al_str_max, int op, int op_cnt, char *op_char);
121 extern void w_abort (char *p, char *p1);
122 extern void aagetmap(char *to, int n);
124 /* initialize for fasta */
125 /* modified 30-August-1999 by Zheng Zhang to work with an extended alphabet */
126 /* Assume naa=47, and wgts[47][23] matches both upper and lower case
127 amoino acids with another amino acid. And also assume the DNA letter
128 does not have upper/lower case difference. If you also allow DNA
129 sequence to be upper/lower case letters, more needs be changed. Not
130 only here, but also in the alignment code, the way that pack a codon
131 into a number between 0-63 need be changed. */
133 /* modified so that if **weightci==NULL, do not fiddle with characters */
136 init_weights(struct wgt ***weighti, struct wgtc ***weightci,
137 int **wgts, int gshift, int gsubs, int naa)
140 int aa, b, a, x, y, z;
143 struct wgtc **weightc;
145 int temp[49][64]; /*change*/
149 if ((*weighti=(struct wgt **)calloc((size_t)(naa+1),sizeof(struct wgt *)))
151 fprintf(stderr," cannot allocate weights array: %d\n",naa);
156 for (aa=0; aa <= naa; aa++) {
157 if ((weight[aa]=(struct wgt *)calloc((size_t)256,sizeof(struct wgt)))
159 fprintf(stderr," cannot allocate weight[]: %d/%d\n",aa,naa);
164 if (weightci !=NULL) {
165 if ((*weightci=(struct wgtc **)calloc((size_t)(naa+1),
166 sizeof(struct wgtc *)))==NULL) {
167 fprintf(stderr," cannot allocate weight_c array: %d\n",naa);
172 for (aa=0; aa <= naa; aa++) {
173 if ((weightc[aa]=(struct wgtc *)calloc((size_t)256,sizeof(struct wgtc)))
175 fprintf(stderr," cannot allocate weightc[]: %d/%d\n",aa,naa);
185 for (aa = 0; aa <= naa; aa++) { /* change*/
187 for (i = 0; i < 64; i++) { /* j iterates through the codons */
190 for (j = 0; j < 64; j++) { /* j iterates through the codons */
191 z = ((~i & j) | (i & ~j));
192 b = 0; /* score = 0 */
193 if (z % 4) b-= gsubs;
194 if (z /16) b-= gsubs;
195 if ((z /4) % 4) b -= gsubs;
196 b += wwt[aascii[aacmap[j]]]; /* add the match score for char j*/
198 x = b; /* x has the score */
199 y = j; /* y has the character */
202 /* if (y < 0 || y > 63) printf("%d %d %d %d ",aa, i, x, y); */
209 for (aa= 0; aa <= naa; aa++) {
211 for (i = 0; i < 256; i++) {
212 for (x=-100,b = 0; b < 4; b++) {
213 z = (i/ (1 << ((b+1)*2)))*(1<<(b*2))+(i%(1<<(b*2)));
214 if (x < (e=wwt[z])) {
216 if (do_wgtc) weightc[aa][i].c4 = aacmap[le[aa][z]];
219 weight[aa][i].iv=x-gshift;
220 weight[aa][i].iii = wwt[i%64];
223 weightc[aa][i].c5 = aacmap[le[aa][i%64]];
224 weightc[aa][i].c3 = aacmap[i%64];
227 for (y = -100, b = 0; b < 3; b++) {
228 z = ((x >> (b*2)) << (b*2+2)) + (x % (1 << (b*2)));
229 for (a = 0; a < 4; a++) {
230 if ((e =wwt[z+(a<<(b*2))]) > y) {
233 weightc[aa][i].c2 = aacmap[le[aa][z+(a<<(b*2))]];
237 weight[aa][i].ii = y-gshift;
241 for (aa = 0; aa <= naa; aa++) {
242 weight[aa][106].iii = wgts[aa][23]; /* is 23 the code for 'X'?*/
243 weight[aa][106].iv = weight[aa][106].ii = weight[aa][106].iii-gshift;
245 weightc[aa][106].c5 = weightc[aa][106].c4 = weightc[aa][106].c3
246 = weightc[aa][106].c2 = 'X';
252 free_weights(struct wgt ***weighti0, struct wgt ***weighti1,
253 struct wgtc ***weightci, int naa)
256 struct wgt **weight0;
257 struct wgt **weight1;
258 struct wgtc **weightc;
264 for (aa=0; aa <= naa; aa++) {free(weight0[aa]);}
265 for (aa=0; aa <= naa; aa++) {free(weight1[aa]);}
266 for (aa=0; aa <= naa; aa++) {free(weightc[aa]);}
274 pre_com(const unsigned char *aa0, int n0, unsigned char *aa0v)
277 dnav = (hnt[aa0[0]]<<2) + hnt[aa0[1]];
278 for (i=2; i<n0; i++) {
279 dnav = ((dnav<<2)+hnt[aa0[i]])&255;
280 if (aa0[i] == NT_N || aa0[i-1]==NT_N || aa0[i-2] == NT_N)
283 if (dnav == 106/*CGGG*/) dnav = 42/*AGGG*/;
290 pre_com_r(const unsigned char *aa0, int n0, unsigned char *aa0v)
293 dnav = (3-hnt[aa0[n0-1]]<<2) + 3-hnt[aa0[n0-2]];
294 for (i=2, ir=n0-3; i<n0; i++,ir--) {
295 dnav = ((dnav<<2)+3-hnt[aa0[ir]])&255;
296 if (aa0[ir] == NT_N || aa0[ir+1]==NT_N || aa0[ir+2] == NT_N)
299 if (dnav == 106) dnav = 42;
306 init_work (unsigned char *aa0, int n0,
307 struct pstruct *ppst,
308 struct f_struct **f_arg)
315 struct f_struct *f_str;
317 /* these used to be globals, but do not need to be */
318 int ktup, fact, kt1, lkt;
323 struct swstr *ss, *r_ss;
326 int nsq, ip, *hsq, naat;
328 int last_n0, itemp, dnav;
329 unsigned char *fd, *fs, *aa0x, *aa0v;
333 if (nt[NT_N] != 'N') {
334 fprintf(stderr," nt[NT_N] (%d) != 'X' (%c) - recompile\n",NT_N,nt[NT_N]);
338 if (ppst->ext_sq_set) {
339 nsq = ppst->nsqx; ip = 1;
343 nsq = ppst->nsq; ip = 0;
347 f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct));
349 btemp = 2 * ppst->param_u.fa.bestoff / 3 +
350 n0 / ppst->param_u.fa.bestscale +
351 ppst->param_u.fa.bkfact *
352 (ppst->param_u.fa.bktup - ppst->param_u.fa.ktup);
353 btemp = min (btemp, ppst->param_u.fa.bestmax);
354 if (btemp > 3 * n0) btemp = 3 * shscore(aa0,n0,ppst->pam2[0]) / 5;
356 ppst->param_u.fa.cgap = btemp + ppst->param_u.fa.bestoff / 3;
357 if (ppst->param_u.fa.optcut_set != 1)
359 ppst->param_u.fa.optcut = (btemp*5)/4;
361 ppst->param_u.fa.optcut = (btemp*4)/3;
365 ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval;
367 ppst->param_u.fa.pgap = ppst->gdelval + 2*ppst->ggapval;
369 pamfact = ppst->param_u.fa.pamfact;
370 ktup = ppst->param_u.fa.ktup;
371 fact = ppst->param_u.fa.scfact * ktup;
374 /* before hashing, we must set up some space and translate the sequence */
377 if ((aa0x =(unsigned char *)calloc((size_t)maxn0,
378 sizeof(unsigned char)))
380 fprintf (stderr, "cannot allocate aa0x array %d\n", maxn0);
387 if ((aa0v =(unsigned char *)calloc((size_t)maxn0,
388 sizeof(unsigned char)))
390 fprintf (stderr, "cannot allocate aa0v array %d\n", maxn0);
396 /* make a precomputed codon number series */
397 pre_com(aa0, n0, aa0v);
400 for (itemp=0; itemp<3; itemp++) {
401 n0x=saatran(aa0,&aa0x[last_n0],n0,itemp);
402 /* for (i=0; i<n0x; i++) {
403 fprintf(stderr,"%c",aa[aa0x[last_n0+i]]);
404 if ((i%60)==59) fprintf(stderr,"\n");
406 fprintf(stderr,"\n");
411 /* fprintf(stderr,"\n"); */
415 /* now switch aa0 and aa0x for hashing functions */
421 if (ppst->ext_sq_set) naat = MAXLC;
424 init_weights(&f_str->weight0, NULL,
425 ppst->pam2[ip],-ppst->gshift,-ppst->gsubs,naat);
426 init_weights(&f_str->weight1, &f_str->weight_c,
427 ppst->pam2[0],-ppst->gshift,-ppst->gsubs,naat);
431 else if (pamfact == -2)
434 for (i0 = 1, mhv = -1; i0 <= ppst->nsq; i0++)
435 if (hsq[i0] < NMAP && hsq[i0] > mhv)
440 fprintf (stderr, " maximum hsq <=0 %d\n", mhv);
444 for (f_str->kshft = 0; mhv > 0; mhv /= 2) f_str->kshft++;
449 for (i0 = 0; i0 < ktup; i0++)
450 hv = hv << f_str->kshft;
452 f_str->hmask = (hmax >> f_str->kshft) - 1;
454 if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) {
455 fprintf (stderr, " cannot allocate hash array\n");
458 if ((f_str->pamh1 = (int *) calloc (ppst->nsq+1, sizeof (int))) == NULL) {
459 fprintf (stderr, " cannot allocate pamh1 array\n");
462 if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) {
463 fprintf (stderr, " cannot allocate pamh2 array\n");
466 if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) {
467 fprintf (stderr, " cannot allocate hash link array");
471 for (i0 = 0; i0 < hmax; i0++)
472 f_str->harr[i0] = -1;
473 for (i0 = 0; i0 < n0; i0++)
474 f_str->link[i0] = -1;
476 /* encode the aa0 array */
479 for (i0 = 0; i0 < min(n0,lkt); i0++) {
480 if (hsq[aa0[i0]] >= NMAP) {
481 hv=phv=0; lkt = i0+ktup; continue;
483 hv = (hv << f_str->kshft) + ppst->hsq[aa0[i0]];
484 phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup;
487 for (; i0 < n0; i0++) {
488 if (hsq[aa0[i0]] >= NMAP) {
490 /* restart hv, phv calculation */
491 for (lkt = i0+kt1; (i0 < lkt || hsq[aa0[i0]]>=NMAP) && i0<n0; i0++) {
492 if (hsq[aa0[i0]] >= NMAP) {
497 hv = (hv << f_str->kshft) + hsq[aa0[i0]];
498 phv += ppst->pam2[ip][aa0[i0]][aa0[i0]]*ktup;
502 hv = ((hv & f_str->hmask) << f_str->kshft) + ppst->hsq[aa0[i0]];
503 f_str->link[i0] = f_str->harr[hv];
504 f_str->harr[hv] = i0;
506 f_str->pamh2[hv] = (phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup);
507 if (hsq[aa0[i0-kt1]] < NMAP)
508 phv -= ppst->pam2[ip][aa0[i0 - kt1]][aa0[i0 - kt1]] * ktup;
510 else f_str->pamh2[hv] = fact * ktup;
513 /* this has been modified from 0..<ppst->nsq to 1..<=ppst->nsq because the
514 pam2[0][0] is now undefined for consistency with blast
518 for (i0 = 1; i0 <= ppst->nsq; i0++)
519 f_str->pamh1[i0] = ppst->pam2[ip][i0][i0] * ktup;
521 for (i0 = 1; i0 <= ppst->nsq; i0++)
522 f_str->pamh1[i0] = fact;
524 f_str->ndo = 0; /* used to save time on diagonals with long queries */
528 if ((f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG,
529 sizeof (struct dstruct)))==NULL) {
530 fprintf (stderr," cannot allocate diagonal arrays: %lu\n",
531 MAXDIAG *sizeof (struct dstruct));
535 if ((f_str->diag = (struct dstruct *) calloc ((size_t)n0,
536 sizeof (struct dstruct)))==NULL) {
537 fprintf (stderr," cannot allocate diagonal arrays: %ld\n",
538 (long)n0*sizeof (struct dstruct));
544 /* done hashing, now switch aa0, aa0x back */
549 if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+4,
550 sizeof(unsigned char)))
552 fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+4);
557 if ((f_str->aa1v =(unsigned char *)calloc((size_t)ppst->maxlen+4,
558 sizeof(unsigned char))) == NULL) {
559 fprintf (stderr, "cannot allocate aa1v array %d\n", ppst->maxlen+4);
566 if ((waa= (int *)malloc (sizeof(int)*(ppst->nsq+1)*n0)) == NULL) {
567 fprintf(stderr,"cannot allocate waa struct %3d\n",ppst->nsq*n0);
572 for (i=0; i<=ppst->nsq; i++) {
573 for (j=0;j<n0; j++) {
574 *pwaa = ppst->pam2[ip][i][aa0[j]];
581 maxn0 = max(2*n0,MIN_RES);
583 maxn0 = max(4*n0,MIN_RES);
585 if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) {
586 fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0);
590 f_str->max_res = maxn0;
595 /* pstring1 is a message to the manager, currently 512 */
596 /* pstring2 is the same information, but in a markx==10 format */
598 get_param (struct pstruct *pstr, char *pstring1, char *pstring2)
601 char *pg_str="FASTY";
603 char *pg_str="TFASTY";
606 if (!pstr->param_u.fa.optflag)
608 sprintf (pstring1, "%s (%s) function [%s matrix (%d:%d)%s] ktup: %d\n join: %d, gap-pen: %d/%d, shift: %d subs: %d width: %3d",pg_str,verstr,
610 sprintf (pstring1, "%s (%s) function [%s matrix (%d:%d)%s] ktup: %d\n join: %d, open/ext: %d/%d, shift: %d subs: %d width: %3d",pg_str,verstr,
612 pstr->pamfile, pstr->pam_h,pstr->pam_l,
613 (pstr->ext_sq_set) ? "xS":"\0",
614 pstr->param_u.fa.ktup, pstr->param_u.fa.cgap,
615 pstr->gdelval, pstr->ggapval, pstr->gshift, pstr->gsubs,
616 pstr->param_u.fa.optwid);
619 sprintf (pstring1, "%s (%s) function [optimized, %s matrix (%d:%d)%s] ktup: %d\n join: %d, opt: %d, gap-pen: %3d/%3d shift: %3d, subs: %3d width: %3d",pg_str,verstr,
621 sprintf (pstring1, "%s (%s) function [optimized, %s matrix (%d:%d)%s] ktup: %d\n join: %d, opt: %d, open/ext: %3d/%3d shift: %3d, subs: %3d width: %3d",pg_str,verstr,
623 pstr->pamfile, pstr->pam_h,pstr->pam_l,
624 (pstr->ext_sq_set) ? "xS":"\0",
625 pstr->param_u.fa.ktup, pstr->param_u.fa.cgap,
626 pstr->param_u.fa.optcut, pstr->gdelval, pstr->ggapval,
627 pstr->gshift,pstr->gsubs,pstr->param_u.fa.optwid);
629 if (pstr->param_u.fa.iniflag) strcat(pstring1," init1");
631 if (pstr->zsflag==0) strcat(pstring1," not-scaled");
632 else if (pstr->zsflag==1) strcat(pstring1," reg.-scaled");
635 if (pstring2 != NULL) {
637 sprintf (pstring2, "; pg_name: %s\n; pg_ver: %s\n; pg_matrix: %s (%d:%d)%s\n\
638 ; pg_gap-pen: %d %d\n; pg_ktup: %d\n; pg_optcut: %d\n; pg_cgap: %d\n",
640 sprintf (pstring2, "; pg_name: %s\n; pg_ver: %s\n; pg_matrix: %s (%d:%d)%s\n\
641 ; pg_open-ext: %d %d\n; pg_ktup: %d\n; pg_optcut: %d\n; pg_cgap: %d\n",
643 pg_str,verstr,pstr->pamfile, pstr->pam_h,pstr->pam_l,
644 (pstr->ext_sq_set) ? "xS":"\0", pstr->gdelval,
645 pstr->ggapval,pstr->param_u.fa.ktup,pstr->param_u.fa.optcut,
646 pstr->param_u.fa.cgap);
651 close_work (const unsigned char *aa0, int n0,
652 struct pstruct *ppst,
653 struct f_struct **f_arg)
655 struct f_struct *f_str;
661 if (ppst->ext_sq_set) naat = MAXLC;
663 free_weights(&f_str->weight0,&f_str->weight1,&f_str->weight_c,naat);
688 void do_fasta (const unsigned char *aa0, int n0,
689 const unsigned char *aa1, int n1,
690 struct pstruct *ppst, struct f_struct *f_str,
691 struct rstruct *rst, int *hoff)
693 int nd; /* diagonal array size */
697 register struct dstruct *dptr;
702 register struct dstruct *diagp;
707 struct dstruct *dpmax;
710 struct savestr *vmptr;
713 int ktup, kt1, *hsq, ip, lkt;
717 n0x32 = n0x31+1+(n0-n0x31-1)/2;
719 unsigned char *fs, *fd;
720 int n1x31, n1x32, last_n1, itemp;
722 n1x32 = n1x31+1+(n1-n1x31-1)/2;
725 if (ppst->ext_sq_set) {
734 ktup = ppst->param_u.fa.ktup;
738 rst->score[0] = rst->score[1] = rst->score[2] = 0;
742 if (n0+n1+1 >= MAXDIAG) {
743 fprintf(stderr,"n0,n1 too large: %d, %d\n",n0,n1);
744 rst->score[0] = rst->score[1] = rst->score[2] = -1;
748 f_str->noff = n0 - 1;
758 dpmax = &f_str->diag[nd];
759 for (dptr = &f_str->diag[f_str->ndo]; dptr < dpmax;)
766 for (vmptr = f_str->vmax; vmptr < &f_str->vmax[MAXSAV]; vmptr++)
768 f_str->lowmax = f_str->vmax;
771 if (n1 > 1000 && aa1[0]==23 && aa1[100]==23 &&
772 aa1[1400]==23 && aa1[1401]!=23) {
780 for (lpos = 0; (lpos < lkt || hsq[aa1[lpos]]>=NMAP) && lpos<n1; lpos++) {
781 /* restart lhval calculation */
782 if (hsq[aa1[lpos]]>=NMAP) {
783 lhval = 0; lkt=lpos+ktup;
786 lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
790 diagp = &f_str->diag[f_str->noff + lkt];
791 for (; lpos < n1; lpos++, diagp++) {
792 if (hsq[aa1[lpos]]>=NMAP) {
794 while (lpos < n1 && hsq[aa1[lpos]]>=NMAP) {lpos++; diagp++;}
795 if (lpos >= n1) break;
798 lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
799 for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
800 if ((tscor = (dptr = &diagp[-tpos])->stop) >= 0) {
802 lposn0 = f_str->noff + lpos;
803 for (; lpos < n1; lpos++, lposn0++) {
804 if (hsq[aa1[lpos]]>=NMAP) {lhval = 0; goto loopl;}
805 lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
806 for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
807 dpos = lposn0 - tpos;
808 if ((tscor = (dptr = &f_str->diag[dpos % nd])->stop) >= 0) {
811 if ((tscor -= lpos) <= 0) {
813 if ((tscor += (kfact = f_str->pamh2[lhval])) < 0 && f_str->lowscor < scor)
815 savemax (dptr, dpos, f_str);
817 savemax (dptr, f_str);
819 if ((tscor += scor) >= kfact) {
825 dptr->start = (dptr->stop = lpos) - kt1;
829 dptr->score += f_str->pamh1[aa0[tpos]];
834 dptr->score = f_str->pamh2[lhval];
835 dptr->start = (dptr->stop = lpos) - kt1;
840 /* reinitialize diag structure */
842 if ((dptr = &f_str->diag[lpos % nd])->score > f_str->lowscor)
843 savemax (dptr, lpos, f_str);
851 for (tpos = 0, dpos = f_str->noff + n1 - 1; tpos < n0; tpos++, dpos--) {
852 if ((dptr = &f_str->diag[dpos % nd])->score > f_str->lowscor)
853 savemax (dptr, dpos, f_str);
856 for (dptr = f_str->diag; dptr < dpmax;) {
857 if (dptr->score > f_str->lowscor) savemax (dptr, f_str);
866 at this point all of the elements of aa1[lpos]
867 have been searched for elements of aa0[tpos]
868 with the results in diag[dpos]
872 fprintf(stderr,"n0: %d; noff: %d; n1: %d; n1x31: %d n1x32 %d\n",
873 n0, f_str->noff,n1,n1x31,n1x32);
876 for (nsave = 0, vmptr = f_str->vmax; vmptr < &f_str->vmax[MAXSAV]; vmptr++)
880 fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
881 f_str->noff+vmptr->start-vmptr->dp,
882 f_str->noff+vmptr->stop-vmptr->dp,
883 vmptr->start,vmptr->stop,
884 vmptr->dp,vmptr->score);
886 if (vmptr->score > 0) {
887 vmptr->score = spam (aa0, aa1, vmptr, ppst->pam2[0], f_str);
888 f_str->vptr[nsave++] = vmptr;
893 rst->score[0] = rst->score[1] = rst->score[2] = 0;
898 /* FASTX code here to modify the start, stop points for
899 the three phases of the translated protein sequence
903 fprintf(stderr,"n0x: %d; n0x31:%d; n0x32: %d\n",n0,n0x31,n0x32);
904 for (ib=0; ib<nsave; ib++) {
905 fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
906 f_str->noff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp,
907 f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp,
908 f_str->vptr[ib]->start,f_str->vptr[ib]->stop,
909 f_str->vptr[ib]->dp,f_str->vptr[ib]->score);
912 fprintf(stderr,"---\n");
915 for (ib=0; ib<nsave; ib++) {
916 if (f_str->noff-f_str->vptr[ib]->dp+f_str->vptr[ib]->start >= n0x32)
917 f_str->vptr[ib]->dp += n0x32;
918 if (f_str->noff-f_str->vptr[ib]->dp +f_str->vptr[ib]->start >= n0x31)
919 f_str->vptr[ib]->dp += n0x31;
923 for (ib=0; ib<nsave; ib++) {
924 fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
925 f_str->noff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp,
926 f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp,
927 f_str->vptr[ib]->start,f_str->vptr[ib]->stop,
928 f_str->vptr[ib]->dp,f_str->vptr[ib]->score);
932 /* TFAST code here to modify the start, stop points for
933 the three phases of the translated protein sequence
934 TFAST modifies library start points, rather than
939 fprintf(stderr,"n0: %d; noff: %d; n1: %d; n1x31: %d n1x32 %d\n",n0, f_str->noff,n1,n1x31,n1x32);
940 for (ib=0; ib<nsave; ib++) {
941 fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
942 f_str->noff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp,
943 f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp,
944 f_str->vptr[ib]->start,f_str->vptr[ib]->stop,
945 f_str->vptr[ib]->dp,f_str->vptr[ib]->score);
948 fprintf(stderr,"---\n");
951 for (ib=0; ib<nsave; ib++) {
952 if (f_str->vptr[ib]->start >= n1x32) {
953 f_str->vptr[ib]->start -= n1x32;
954 f_str->vptr[ib]->stop -= n1x32;
955 f_str->vptr[ib]->dp -= n1x32;
957 if (f_str->vptr[ib]->start >= n1x31) {
958 f_str->vptr[ib]->start -= n1x31;
959 f_str->vptr[ib]->stop -= n1x31;
960 f_str->vptr[ib]->dp -= n1x31;
965 for (ib=0; ib<nsave; ib++) {
966 fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
967 f_str->noff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp,
968 f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp,
969 f_str->vptr[ib]->start,f_str->vptr[ib]->stop,
970 f_str->vptr[ib]->dp,f_str->vptr[ib]->score);
976 scor = sconn (f_str->vptr, nsave, ppst->param_u.fa.cgap,
977 ppst->param_u.fa.pgap, f_str);
979 for (vmptr=f_str->vptr[0],ib=1; ib<nsave; ib++)
980 if (f_str->vptr[ib]->score > vmptr->score) vmptr=f_str->vptr[ib];
982 /* kssort (f_str->vptr, nsave); */
984 rst->score[1] = vmptr->score;
985 rst->score[0] = max (scor, vmptr->score);
986 rst->score[2] = rst->score[0]; /* initn */
988 if (ppst->param_u.fa.optflag) {
989 if (rst->score[0] > ppst->param_u.fa.optcut) {
991 rst->score[2] = dmatchx(aa0, n0,aa1,n1,*hoff=f_str->noff - vmptr->dp,
992 ppst->param_u.fa.optwid, ppst->pam2[0],
993 ppst->gdelval,ppst->ggapval,ppst->gshift,f_str);
995 /* generate f_str->aa1x */
997 for (i=0; i<n1; i++) {
998 fputc(ppst->sq[aa1[i]],stderr);
999 if (i%60==59) fputc('\n',stderr);
1001 fprintf(stderr,"\n-----\n");
1004 fprintf(stderr,"n1: %d, aa1x[n1]: %d; EOSEQ: %d\n",
1005 n1,f_str->aa1x[n1],EOSEQ);
1006 for (fs=aa1,itemp=0; itemp <3; itemp++,fs++) {
1007 for (fd= &f_str->aa1x[itemp]; *fs!=EOSEQ; fd += 3, fs++) *fd = *fs;
1008 fprintf(stderr,"fs stopped at: %d\n",(int)(fs-f_str->aa1x));
1013 for (i=0; i<n1; i++) {
1014 fputc(ppst->sq[f_str->aa1x[i]],stderr);
1015 if (i%60==59) fputc('\n',stderr);
1018 rst->score[2] = dmatchx(aa0, n0, aa1, n1, *hoff=vmptr->dp-f_str->noff,
1019 ppst->param_u.fa.optwid, ppst->pam2[0],
1020 ppst->gdelval,ppst->ggapval,ppst->gshift,f_str);
1026 void do_work (const unsigned char *aa0, int n0,
1027 const unsigned char *aa1, int n1,
1029 struct pstruct *ppst,
1030 struct f_struct *f_str,
1031 int qr_flg, struct rstruct *rst)
1034 int last_n1, itx, dnav, n10, i, ir;
1035 unsigned char *aa1x;
1038 rst->segnum = rst->seglen = 1;
1040 if (n1 < ppst->param_u.fa.ktup) {
1041 rst->score[0] = rst->score[1] = rst->score[2] = 0;
1046 do_fasta (f_str->aa0x, n0, aa1, n1, ppst, f_str, rst, &hoff);
1048 /* make a precomputed codon number series */
1051 pre_com(aa1, n1, f_str->aa1v);
1054 pre_com_r(aa1, n1, f_str->aa1v);
1057 /* make translated sequence */
1060 for (itx= frame*3; itx< frame*3+3; itx++) {
1061 n10 = saatran(aa1,&aa1x[last_n1],n1,itx);
1063 fprintf(stderr," itt %d frame: %d\n",itx,frame);
1064 for (i=0; i<n10; i++) {
1065 fprintf(stderr,"%c",aa[f_str->aa1x[last_n1+i]]);
1066 if ((i%60)==59) fprintf(stderr,"\n");
1068 fprintf(stderr,"\n");
1070 fprintf(stderr,"n10: %d aa1x[] %d last_n1: %d\n",n10,aa1x[last_n1+n10],
1077 do_fasta (aa0, n0, f_str->aa1x, n10, ppst, f_str, rst, &hoff);
1081 void do_opt (const unsigned char *aa0, int n0,
1082 const unsigned char *aa1, int n1,
1084 struct pstruct *ppst,
1085 struct f_struct *f_str,
1086 struct rstruct *rst)
1088 int optflag, tscore, hoff;
1090 optflag = ppst->param_u.fa.optflag;
1091 ppst->param_u.fa.optflag = 1;
1094 do_fasta (f_str->aa0x, n0, aa1, n1, ppst, f_str, rst, &hoff);
1096 do_fasta (aa0, n0, aa1, n1, ppst, f_str, rst, &hoff);
1099 ppst->param_u.fa.optflag = optflag;
1104 savemax (dptr, dpos, f_str)
1105 register struct dstruct *dptr;
1107 struct f_struct *f_str;
1109 register struct savestr *vmptr;
1114 savemax (dptr, f_str)
1115 register struct dstruct *dptr;
1116 struct f_struct *f_str;
1119 register struct savestr *vmptr;
1122 dpos = (int) (dptr - f_str->diag);
1126 /* check to see if this is the continuation of a run that is already saved */
1128 if ((vmptr = dptr->dmax) != NULL && vmptr->dp == dpos &&
1129 vmptr->start == dptr->start)
1131 vmptr->stop = dptr->stop;
1132 if ((i = dptr->score) <= vmptr->score)
1135 if (vmptr != f_str->lowmax)
1140 i = f_str->lowmax->score = dptr->score;
1141 f_str->lowmax->dp = dpos;
1142 f_str->lowmax->start = dptr->start;
1143 f_str->lowmax->stop = dptr->stop;
1144 dptr->dmax = f_str->lowmax;
1147 for (vmptr = f_str->vmax; vmptr < &f_str->vmax[MAXSAV]; vmptr++)
1148 if (vmptr->score < i)
1151 f_str->lowmax = vmptr;
1156 int spam (const unsigned char *aa0,
1157 const unsigned char *aa1,
1158 struct savestr *dmax, int **pam2,
1159 struct f_struct *f_str)
1164 int start, stop, score;
1166 const unsigned char *aa0p, *aa1p;
1168 aa1p = &aa1[lpos = dmax->start];
1169 aa0p = &aa0[lpos - dmax->dp + f_str->noff];
1172 tot = curv.score = maxv.score = 0;
1173 for (; lpos <= dmax->stop; lpos++) {
1174 tot += pam2[*aa0p++][*aa1p++];
1175 if (tot > curv.score) {
1180 if (curv.score > maxv.score) {
1181 maxv.start = curv.start;
1182 maxv.stop = curv.stop;
1183 maxv.score = curv.score;
1185 tot = curv.score = 0;
1186 curv.start = lpos+1;
1190 if (curv.score > maxv.score) {
1191 maxv.start = curv.start;
1192 maxv.stop = curv.stop;
1193 maxv.score = curv.score;
1196 /* if (maxv.start != dmax->start || maxv.stop != dmax->stop)
1197 printf(" new region: %3d %3d %3d %3d\n",maxv.start,
1198 dmax->start,maxv.stop,dmax->stop);
1200 dmax->start = maxv.start;
1201 dmax->stop = maxv.stop;
1208 int sconn (struct savestr **v, int n,
1209 int cgap, int pgap, struct f_struct *f_str)
1216 } *start, *sl, *sj, *so, sarr[MAXSAV];
1217 int lstart, tstart, plstop, ptstop;
1219 /* sort the score left to right in lib pos */
1225 /* for the remaining runs, see if they fit */
1227 for (i = 0, si = 0; i < n; i++)
1230 /* if the score is less than the gap penalty, it never helps */
1231 if (v[i]->score < cgap)
1233 lstart = v[i]->start;
1234 tstart = lstart - v[i]->dp + f_str->noff;
1236 /* put the run in the group */
1238 sarr[si].score = v[i]->score;
1239 sarr[si].next = NULL;
1241 /* if it fits, then increase the score */
1242 for (sl = start; sl != NULL; sl = sl->next)
1244 plstop = sl->vp->stop;
1245 ptstop = plstop - sl->vp->dp + f_str->noff;
1246 if (plstop < lstart+XFACT && ptstop < tstart+XFACT) {
1247 sarr[si].score = sl->score + v[i]->score + pgap;
1252 /* now recalculate where the score fits */
1256 for (sj = start, so = NULL; sj != NULL; sj = sj->next)
1258 if (sarr[si].score > sj->score)
1262 so->next = &sarr[si];
1273 return (start->score);
1280 struct savestr *v[];
1284 struct savestr *tmp;
1286 for (gap = n / 2; gap > 0; gap /= 2)
1287 for (i = gap; i < n; i++)
1288 for (j = i - gap; j >= 0; j -= gap)
1290 if (v[j]->score >= v[j + gap]->score)
1300 struct savestr *v[];
1304 struct savestr *tmp;
1306 for (gap = n / 2; gap > 0; gap /= 2)
1307 for (i = gap; i < n; i++)
1308 for (j = i - gap; j >= 0; j -= gap)
1310 if (v[j]->start <= v[j + gap]->start)
1319 dmatchx(const unsigned char *aa0, int n0,
1320 const unsigned char *aa1, int n1,
1321 int hoff, int window,
1322 int **pam2, int gdelval, int ggapval, int gshift,
1323 struct f_struct *f_str)
1329 return lx_band(aa1,n1,f_str->aa0v,n0-2,
1331 #ifdef OLD_FASTA_GAP
1332 -(gdelval - ggapval),
1339 return lx_band(aa0,n0,f_str->aa1v,n1-2,
1341 #ifdef OLD_FASTA_GAP
1342 -(gdelval - ggapval),
1352 init_row(struct sx_s *row, int sp) {
1354 for (i = 0; i < sp; i++) {
1355 row[i].C1 = row[i].I1 = 0;
1356 row[i].C2 = row[i].I2 = 0;
1357 row[i].C3 = row[i].I3 = 0;
1362 int lx_band(const unsigned char *prot_seq, /* array with protein sequence numbers*/
1363 int len_prot, /* length of prot. seq */
1364 const unsigned char *dna_prot_seq, /* translated DNA sequence numbers*/
1365 int len_dna_prot, /* length trans. seq. */
1366 int **pam_matrix, /* scoring matrix */
1367 int gopen, int gext, /* gap open, gap extend penalties */
1368 int gshift, /* frame-shift penalty */
1369 int start_diag, /* start diagonal of band */
1370 int width, /* width for band alignment */
1371 struct f_struct *f_str)
1374 int i, j, bd, bd1, x1, x2, sp, p1=0, p2=0, end_prot;
1375 struct sx_s *last, *tmp;
1376 int sc, del, best = 0, cd,ci, e1, e2, e3, cd1, cd2, cd3, f, gg;
1377 const unsigned char *dp;
1378 register struct sx_s *ap, *aq;
1379 struct wgt *wt, *ww;
1386 if (f_str->cur == NULL) {
1387 f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp);
1390 init_row(f_str->cur, sp);
1393 if (start_diag %3 !=0) start_diag = start_diag/3-1;
1394 else start_diag = start_diag/3;
1395 if (width % 3 != 0) width = width/3+1;
1396 else width = width /3;
1399 x1 = start_diag; /* x1 = lower bound of DNA */
1400 x2 = 1; /* the amount of position shift from last row*/
1402 end_prot = max(0,-width-start_diag) + (len_dna_prot+5)/3 + width;
1403 end_prot = min(end_prot,len_prot);
1405 /* i counts through protein sequence, x1 through DNAp */
1407 for (i = max(0, -width-start_diag), x1+=i; i < len_prot; i++, x1++) {
1408 bd = min(x1+width, (len_dna_prot+2)/3); /* upper bound of band */
1409 bd1 = max(0,x1); /* lower bound of band */
1410 wt = f_str->weight0[prot_seq[i]];
1411 del = 1-x1; /*adjustment*/
1415 ap = &f_str->cur[bd1]; aq = ap+1;
1416 e1 = f_str->cur[bd1-1].C3; e2 = ap->C1; cd1 = cd2= cd3= 0;
1417 for (dp = &dna_prot_seq[(bd1-del)*3]; ap < &f_str->cur[bd]; ap++) {
1418 ww = &wt[(unsigned char) *dp++];
1419 sc = max(max(e1+ww->iv, (e3=ap->C2)+ww->ii), e2+ww->iii);
1420 if (cd1 > sc) sc = cd1;
1422 if ((ci = aq->I1) > 0) {
1423 if (sc < ci) { ap->C1 = ci; ap->I1 = ci-gext;}
1428 if (sc > best) best =sc;
1429 if (cd1 < sc) cd1 = sc;
1430 ap->I1 = max(ci-gext, sc);
1431 } else ap->I1 = ci-gext;
1435 ap->I1 = ap->C1 = 0;
1437 ap->C1 = sc; sc-=gg;
1439 if (sc > best) best =sc;
1440 if (cd1 < sc) cd1 = sc;
1445 ww = &wt[(unsigned char) *dp++];
1446 sc = max(max(e2+ww->iv, (e1=ap->C3)+ww->ii), e3+ww->iii);
1447 if (cd2 > sc) sc = cd2;
1449 if ((ci = aq->I2) > 0) {
1450 if (sc < ci) { ap->C2 = ci; ap->I2 = ci-gext;}
1455 if (sc > best) best =sc;
1456 if (cd2 < sc) cd2 = sc;
1457 ap->I2 = max(ci-gext, sc);
1462 ap->I2 = ap->C2 = 0;
1464 ap->C2 = sc; sc-=gg;
1466 if (sc > best) best =sc;
1467 if (cd2 < sc) cd2 = sc;
1472 ww = &wt[(unsigned char)*dp++];
1473 sc = max(max(e3+ww->iv, (e2=aq->C1)+ww->ii), e1+ww->iii);
1474 if (cd3 > sc) sc = cd3;
1476 if ((ci = aq++->I3) > 0) {
1477 if (sc < ci) { ap->C3 = ci; ap->I3 = ci-gext;}
1482 if (sc > best) best =sc;
1483 if (cd3 < sc) cd3 = sc;
1484 ap->I3 = max(ci-gext, sc);
1489 ap->I3 = ap->C3 = 0;
1491 ap->C3 = sc; sc-=gg;
1493 if (sc > best) best =sc;
1494 if (cd3 < sc) cd3 = sc;
1501 /* printf("The best score is %d\n", best); */
1505 /* ckalloc - allocate space; check for success */
1506 void *ckalloc(size_t amount)
1510 if ((p = (void *)malloc( (size_t)amount)) == NULL)
1511 w_abort("Ran out of memory.","");
1515 /* calculate the 100% identical score */
1517 shscore(unsigned char *aa0, int n0, int **pam2)
1520 for (i=0,sum=0; i<n0; i++)
1521 sum += pam2[aa0[i]][aa0[i]];
1529 typedef struct mat *match_ptr;
1531 typedef struct mat {
1536 typedef struct { int i,j;} state;
1537 typedef state *state_ptr;
1541 static match_ptr small_global(), global();
1542 static int local_align(), find_best();
1543 static void init_row2(), init_ROW();
1546 pro_dna(const unsigned char *prot_seq, /* array with prot. seq. numbers*/
1547 int len_prot, /* length of prot. seq */
1548 const unsigned char *dna_prot_seq, /* trans. DNA seq. numbers*/
1549 int len_dna_prot, /* length trans. seq. */
1550 int **pam_matrix, /* scoring matrix */
1551 int gopen, int gext, /* gap open, gap extend penalties */
1552 int gshift, /* frame-shift penalty */
1553 struct f_struct *f_str,
1555 struct a_res_str *a_res) /* alignment info */
1557 match_ptr align, ap, aq;
1558 int x, y, ex, ey, i, score;
1561 f_str->up = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
1562 f_str->down = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
1563 f_str->tp = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
1565 /*local alignment find the best local alignment x and y
1566 is the starting position of the best local alignment
1567 and ex ey is the ending position */
1569 score= local_align(&x, &y, &ex, &ey,
1570 pam_matrix, gopen, gext,
1571 dna_prot_seq, len_dna_prot,
1572 prot_seq, len_prot, f_str);
1574 f_str->up += 3; f_str->down += 3; f_str->tp += 3;
1576 /* x, y - start in prot, dna_prot */
1577 a_res->min0 = x; /* prot */
1578 a_res->min1 = y; /* DNA */
1579 a_res->max0 = ex; /* prot */
1580 a_res->max1 = ey; /* DNA */
1582 align = global(x, y, ex, ey,
1583 pam_matrix, gopen, gext,
1584 dna_prot_seq, prot_seq,
1587 alignment = a_res->res;
1589 for (ap = align, i= 0; ap; i++) {
1590 if (i < max_res) alignment[i] = ap->l;
1591 aq = ap->next; free(ap); ap = aq;
1594 fprintf(stderr,"***alignment truncated: %d/%d***\n", max_res,i);
1596 /* up = &up[-3]; down = &down[-3]; tp = &tp[-3]; */
1597 free(&f_str->up[-3]); free(&f_str->tp[-3]); free(&f_str->down[-3]);
1604 swap(void **a, void **b)
1611 local alignment find the best local alignment x and y
1612 is the starting position of the best local alignment
1613 and ex ey is the ending position
1616 local_align(int *x, int *y, int *ex, int *ey,
1617 int **wgts, int gop, int gext,
1618 const unsigned char *dnap, int ld,
1619 const unsigned char *pro, int lp,
1620 struct f_struct *f_str)
1622 int i, j, score, x1,x2,x3,x4, e1 = 0, e2 = 0, e3,
1623 sc, del, e, best = 0, cd, ci, c;
1624 struct wgt *wt, *ww;
1625 state_ptr cur_st, last_st, cur_i_st;
1627 const unsigned char *dp;
1628 int *cur_d_st, *st_up;
1631 Array rowiC stores the best scores of alignment ending at a position
1632 Arrays rowiD and rowiI store the best scores of alignment ending
1633 at a position with a deletion or insrtion
1634 Arrays sti stores the starting position of the best alignment whose
1635 score stored in the corresponding row array.
1636 The program stores two rows to complete the computation, same is
1637 for the global alignment routine.
1641 st_up = (int *) ckalloc(sizeof(int)*(ld+10));
1642 init_row2(st_up, ld+5);
1646 init_ROW(f_str->up, ld+1);
1647 init_ROW(f_str->down, ld+1);
1649 last = f_str->down+1;
1651 cur_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
1652 last_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
1653 cur_i_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
1656 for (i = 0; i < lp; i++) {
1657 wt = f_str->weight1[pro[i]]; e2 =0; e1 = last[0].C;
1658 for (j = 0; j < 2; j++) {
1662 for (j = 2; j < ld; j++) {
1663 ww = &wt[(unsigned char) dp[j]];
1669 if ((e=e2+ww->iii) > sc) {sc = e; del = 3;}
1670 if ((e=e1+ww->ii) > sc) {sc = e; del = 2;}
1671 if ((e = e3+ww->iv) > sc) {sc = e; del = 4;}
1674 if (ww->iii > 0) {sc = ww->iii; del = 3;}
1676 if (sc < (ci=last[j].I)) {
1679 if (sc < (cd=cur[j].D)) {
1685 cur[j+3].D = e-gext;
1688 cur[j+3].D = cd-gext;
1689 cur_d_st[j+3] = cur_d_st[j]+3;
1694 cur_st[j].i = cur_st[j-c].i;
1695 cur_st[j].j = cur_st[j-c].j;
1698 cur_st[j].i = cur_i_st[j].i;
1699 cur_st[j].j = cur_i_st[j].j;
1706 cur_st[j].i = last_st[j-del].i;
1707 cur_st[j].j = last_st[j-del].j;
1714 cur_st[j].j = max(0, j-del+1);
1724 cur_i_st[j].i = cur_st[j].i;
1725 cur_i_st[j].j = cur_st[j].j;
1727 cur[j].I = ci- gext;
1737 swap((void *)&last, (void *)&cur);
1738 swap((void *)&cur_st, (void *)&last_st);
1740 /* printf("The best score is %d\n", best);*/
1741 *x = x1; *y = x2; *ex = x3; *ey = x4;
1742 free(cur_st); free(last_st); free(cur_i_st);
1748 Both global_up and global_down do linear space score only global
1749 alignments on subsequence pro[x]...pro[ex], and dna[y]...dna[ey].
1750 global_up do the algorithm upwards, from row x towards row y.
1751 global_down do the algorithm downwards, from row y towards x.
1755 global_up(st_ptr *row1, st_ptr *row2,
1756 int x, int y, int ex, int ey,
1757 int **wgts, int gop, int gext,
1758 unsigned char *dnap, unsigned char *pro,
1759 int N, struct f_struct *f_str)
1761 int i, j, k, sc, e, e1, e2, e3, t, ci, cd, score;
1762 struct wgt *wt, *ww;
1765 cur = *row1; last = *row2;
1767 for (j = 0; j <= ey-y+1; j++) {
1768 if (j % 3 == 0) {last[j].C = sc; sc -= gext; last[j].I = sc-gop;}
1769 else { last[j].I = last[j].C = -10000;}
1771 last[0].C = 0; cur[0].D = cur[1].D = cur[2].D = -10000;
1772 last[0].D = last[1].D = last[2].D = -10000;
1773 if (N) last[0].I = -gext;
1774 for (i = 1; i <= ex-x+1; i++) {
1775 wt = f_str->weight1[pro[i+x-1]]; e1 = -10000; e2 = last[0].C;
1776 for (j = 0; j <= ey-y+1; j++) {
1779 ww = &wt[(unsigned char) dnap[t-3]];
1783 } else if (j == 2) {
1789 sc = max(e2+ww->iii, max(e1+ww->ii, e3+ww->iv));
1791 sc = max(sc, max(ci=last[j].I, cd = cur[j].D));
1793 cur[j+3].D = max(cd, sc-gop)-gext;
1794 cur[j].I = max(ci, sc-gop)-gext;
1796 swap((void *)&last, (void *)&cur);
1798 /*printf("global up score =%d\n", last[ey-y+1].C);*/
1799 for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I;
1800 if (*row1 != last) swap((void *)row1, (void *)row2);
1804 global_down(st_ptr *row1, st_ptr *row2,
1805 int x, int y, int ex, int ey,
1806 int **wgts, int gop, int gext,
1807 unsigned char *dnap, unsigned char *pro,
1808 int N, struct f_struct *f_str)
1810 int i, j, k, sc, del, *tmp, e, t, e1,e2,e3, ci,cd, score;
1811 struct wgt *wt, *w1, *w2, *w3;
1814 cur = (*row1); last = *row2;
1816 for (j = ey-y+1; j >= 0; j--) {
1817 if ((ey-y+1-j) % 3) {last[j].C = sc; sc-=gext; last[j].I = sc-gop;}
1818 else last[j].I = last[j].C = -10000;
1822 if (N) last[ey-y+1].I = -gext;
1823 cur[ey-y+1].D = cur[ey-y].D = cur[ey-y-1].D = -10000;
1824 last[ey-y+1].D = last[ey-y].D = last[ey-y-1].D = -10000;
1825 for (i = ex-x; i >= 0; i--) {
1826 wt = f_str->weight1[pro[i+x]]; e2 = last[ey-y+1].C;
1828 w3 = &wt[(unsigned char) dnap[ey]];
1829 w2 = &wt[(unsigned char) dnap[ey-1]];
1830 for (j = ey-y+1; j >= 0; j--) {
1832 w1 = &wt[(unsigned char) dnap[t-1]];
1837 } else if (t+1 == ey) {
1843 sc = max(e2+w2->iii, max(e1+w1->ii,e3+w3->iv)) ;
1845 if (sc < (cd= cur[j].D)) {
1847 cur[j-3].D = cd-gext;
1848 } else cur[j-3].D =max(cd, sc-gop)-gext;
1849 if (sc < (ci= last[j].I)) {
1851 cur[j].I = ci - gext;
1852 } else cur[j].I = max(sc-gop,ci)-gext;
1856 swap((void *)&last, (void *)&cur);
1858 for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I;
1859 if (*row1 != last) swap((void *)row1, (void *)row2);
1863 init_row2(int *row, int ld) {
1865 for (i = 0; i < ld; i++) row[i] = 0;
1868 static void init_ROW(st_ptr row, int ld) {
1870 for (i = 0; i < ld; i++) row[i].I = row[i].D = row[i].C = 0;
1874 combine(match_ptr x1, match_ptr x2, int st) {
1877 if (x1 == NULL) return x2;
1878 for (x = x1; x->next; x = x->next);
1881 for (x = x2; x; x = x->next) {
1883 if (x->l == 3 || x->l == 4) break;
1891 global use the two upwards and downwards score only linear
1892 space global alignment subroutine to recursively build the
1897 global(int x, int y, int ex, int ey,
1898 int **wgts, int gop, int gext,
1899 unsigned char *dnap, unsigned char *pro, int N1, int N2,
1900 struct f_struct *f_str)
1904 match_ptr x1, x2, mm1, mm2;
1906 /*printf("%d %d %d %d %d %d\n", x,y, ex, ey, N1, N2);*/
1908 if the space required is limited, we can do a quadratic space
1909 algorithm to find the alignment.
1914 for (m = y+3; m <= ey; m+=3) {
1915 x1 = (match_ptr) ckalloc(sizeof(match_node));
1916 x1->l = 5; x1->next = mm1;
1917 if (mm1== NULL) mm2 = x1;
1921 if ((ey-y) % 3 != 0) {
1922 x1 = (match_ptr) ckalloc(sizeof(match_node));
1923 x1->l = ((ey-y) % 3) +1; x1->next = NULL;
1924 if (mm1) mm2->next = x1; else mm1 = x1;
1931 for (m = x; m <= ex; m++) {
1932 x1 = (match_ptr) ckalloc(sizeof(match_node));
1933 x1->l = 0; x1->next = mm1; mm1 = x1;
1937 if (ex -x < SGW1 && ey-y < SGW2)
1938 return small_global(x,y,ex,ey,wgts, gop, gext, dnap, pro, N1, N2,f_str);
1941 Do the score only global alignment from row x to row m, m is
1942 the middle row of x and ex. Store the information of row m in
1945 global_up(&f_str->up, &f_str->tp, x, y, m, ey,
1947 dnap, pro, N1, f_str);
1949 Do the score only global alignment downwards from row ex
1950 to row m+1, store information of row m+1 in downC downI and downD
1952 global_down(&f_str->down, &f_str->tp, m+1, y, ex, ey,
1954 dnap, pro, N2, f_str);
1957 Use this information for row m and m+1 to find the crossing
1958 point of the best alignment with the middle row. The crossing
1959 point is given by m1 and m2. Then we recursively call global
1960 itself to compute alignments in two smaller regions found by
1961 the crossing point and combine the two alignments to form a
1962 whole alignment. Return that alignment.
1964 if (find_best(f_str->up, f_str->down, &m1, &m2, ey-y+1, y, gop)) {
1965 x1 = global(x, y, m, m1, wgts, gop, gext, dnap, pro, N1, 0, f_str);
1966 x2 = global(m+1, m2, ex, ey, wgts, gop, gext, dnap, pro, 0, N2, f_str);
1967 if (m1 == m2) x1 = combine(x1,x2,1);
1968 else x1 = combine(x1, x2,0);
1970 x1 = global(x, y, m-1, m1, wgts, gop, gext, dnap, pro, N1, 1, f_str);
1971 x2 = global(m+2, m2, ex, ey, wgts, gop, gext, dnap, pro, 1, N2, f_str);
1972 mm1 = (match_ptr) ckalloc(sizeof(match_node));
1973 mm1->i = m; mm1->l = 0; mm1->j = m1;
1974 mm2 = (match_ptr) ckalloc(sizeof(match_node));
1975 mm2->i = m+1; mm2->l = 0; mm2->j = m1;
1976 mm1->next = mm2; mm2->next = x2;
1977 x1 = combine(x1, mm1, 0);
1983 find_best(st_ptr up, st_ptr down, int *m1, int *m2, int ld, int y, int gop) {
1985 int i, best = -1000, j = 0, s1, s2, s3, s4, st;
1987 for (i = 1; i < ld; i++) {
1988 s2 = up[i].C + down[i].C;
1989 s4 = up[i].I + down[i].I + gop;
1991 best = s2; j = i; st = 1;
1994 best = s4; j = i; st = 0;
1999 /*printf("score=%d\n", best);*/
2004 An alignment is represented as a linked list whose element
2005 is of type match_node. Each element represent an edge in the
2006 path of the alignment graph. The fields of match_node are
2007 l --- gives the type of the edge.
2008 i, j --- give the end position.
2012 small_global(int x, int y, int ex, int ey,
2013 int **wgts, int gop, int gext,
2014 unsigned char *dnap, unsigned char *pro,
2015 int N1, int N2, struct f_struct *f_str) {
2017 static int C[SGW1+1][SGW2+1], st[SGW1+1][SGW2+1], D[SGW2+7], I[SGW2+1];
2018 int i, j, e, sc, score, del, k, t, ci, cd;
2019 int *cI, *cD, *cC, *lC, *cst, e2, e3, e4;
2020 match_ptr mp, first;
2021 struct wgt *wt, *ww;
2023 /*printf("small_global %d %d %d %d\n", x, y, ex, ey);*/
2024 sc = -gop-gext; C[0][0] = 0;
2025 if (N1) I[0] = -gext; else I[0] = sc;
2027 for (j = 1; j <= ey-y+1; j++) {
2029 C[0][j] = sc; sc -= gext; I[j] = sc-gop;
2030 } else I[j] = C[0][j] = -10000;
2033 lC = &C[0][0]; cD = D; D[0] = D[1] = D[2] = -10000;
2035 for (i = 1; i <= ex-x+1; i++) {
2037 wt = f_str->weight1[pro[i+x-1]]; cst = &st[i][0];
2038 for (j = 0; j <=ey-y+1; j++) {
2042 ww = &wt[(unsigned char) dnap[t-3]];
2044 sc = lC[j-3]+ww->iii; e2 = lC[j-2]+ww->ii;
2045 e4 = lC[j-4]+ww->iv; del = 3;
2046 if (e2 > sc) { sc = e2; del = 2;}
2047 if (e4 >= sc) { sc = e4; del = 4;}
2050 sc = lC[0]+ww->iii; del =3;
2051 } else if (j == 2) {
2052 sc = lC[0]+ww->ii; del = 2;
2053 } else {sc = -10000; del = 0;}
2066 cD[j+3] = cd - gext;
2067 } else cD[j+3] = sc -gext;
2071 } else cI[j] = sc-gext;
2076 /*printf("small global score =%d\n", C[ex-x+1][ey-y+1]);*/
2077 if (N2 && cC[ey-y+1] < ci+gop) st[ex-x+1][ey-y+1] =0;
2078 first = NULL; e = 1;
2079 for (i = ex+1, j = ey+1; i > x || j > y; i--) {
2080 mp = (match_ptr) ckalloc(sizeof(match_node));
2082 k = (t=st[i-x][j-y])%10;
2084 if (e == 5 && (t/10)%2 == 1) k = 5;
2085 if (e == 0 && (t/20)== 1) k = 0;
2086 if (k == 5) { j -= 3; i++; e=5;}
2087 else {j -= k;if (k==0) e= 0; else e = 1;}
2093 /* for (i = 0; i <= ex-x; i++) {
2094 for (j = 0; j <= ey-y; j++)
2095 printf("%d ", C[i][j]);
2106 display_alig(int *a, unsigned char *dna, unsigned char *pro,
2107 int length, int ld, struct f_struct *f_str)
2109 int len = 0, i, j, x, y, lines, k, iaa;
2110 static char line1[100], line2[100], line3[100],
2112 char *dna1, c1, c2, c3;
2114 line1[0] = line2[0] = line3[0] = '\0'; x= a[0]; y = a[1]-3;
2116 printf("\n%5d\n%5d", y+3, x);
2117 for (len = 0, j = 2, lines = 0; j < length; j++) {
2123 line2[len] = aa[iaa=pro[x++]];
2124 line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c5;
2125 if (line1[len] != f_str->weight_c[iaa][(unsigned char) dna[y]].c3)
2126 line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
2132 line2[len] = aa[iaa=pro[x++]];
2133 line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c2;
2134 line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
2140 line2[len] = aa[iaa=pro[x++]];
2141 line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c4;
2142 line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
2146 line1[len] = f_str->weight_c[0][(unsigned char) dna[y]].c3;
2151 line2[len] = aa[pro[x++]];
2155 line1[len] = line2[len] = line3[len] = '\0';
2157 for (k = 10; k <= WIDTH; k+=10)
2159 if (k-5 < WIDTH) printf(" .");
2160 c1 = line1[WIDTH]; c2 = line2[WIDTH]; c3 = line3[WIDTH];
2161 line1[WIDTH] = line2[WIDTH] = line3[WIDTH] = '\0';
2162 printf("\n %s\n %s\n %s\n", line1, line3, line2);
2163 line1[WIDTH] = c1; line2[WIDTH] = c2;
2164 strncpy(line1, &line1[WIDTH], sizeof(line1)-1);
2165 strncpy(line2, &line2[WIDTH], sizeof(line2)-1);
2166 strncpy(line3, &line3[WIDTH], sizeof(line3)-1);
2168 printf("\n%5d\n%5d", y+3, x);
2171 for (k = 10; k < len; k+=10)
2173 if (k-5 < len) printf(" .");
2174 printf("\n %s\n %s\n %s\n", line1, line3, line2);
2178 /* alignment store the operation that align the protein and dna sequence.
2179 The code of the number in the array is as follows:
2180 0: delete of an amino acid.
2181 2: frame shift, 2 nucleotides match with an amino acid
2182 3: match an amino acid with a codon
2183 4: the other type of frame shift
2184 5: delete of a codon
2187 Also the first two element of the array stores the starting point
2188 in the protein and dna sequences in the local alignment.
2190 Display looks like where WIDTH is assumed to be divisible by 10.
2192 0 . : . : . : . : . : . :
2193 AACE/N\PLK\G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LWA\S\C\E/P\PRIRZ
2194 I S G S V F N R Q L A G S V F N R Q L A
2195 AACE P P-- G HK Y TWA A C E P P---- G HK Y TWA A C E P P----
2197 60 . : . : . : . : . : . :
2198 /G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LW
2199 G S V F N R Q L A G S V F N R Q L A G S V F
2200 G HK Y TWA A C E P P---- G HK Y TWA A C E P P---- G HK Y TW
2202 For frame shift, the middle row show the letter in the original sequence,
2203 and the letter in the top row is the amino acid that is chose by the
2204 alignment (translated codon chosen from 4 nucleotides, or 2+1).
2207 /* fatal - print message and die */
2212 fprintf(stderr, "%s\n", msg);
2216 int do_walign (const unsigned char *aa0, int n0,
2217 const unsigned char *aa1, int n1,
2219 struct pstruct *ppst,
2220 struct f_struct *f_str,
2221 struct a_res_str *a_res,
2225 int i, ir, last_n1, itemp, n10, itx, dnav;
2226 unsigned char *aa1x;
2228 a_res->res = f_str->res;
2231 score = pro_dna(aa1, n1, f_str->aa0v, n0-2, ppst->pam2[0],
2232 #ifdef OLD_FASTA_GAP
2233 -(ppst->gdelval - ppst->ggapval),
2239 f_str, f_str->max_res, a_res);
2240 /* display_alig(f_str->res,f_str->aa0v+2,aa1,*nres,n0-2,f_str); */
2243 /* make a precomputed codon number series */
2245 pre_com(aa1, n1, f_str->aa1v);
2247 else { /* must do things backwards */
2248 pre_com_r(aa1, n1, f_str->aa1v);
2251 /* make translated sequence */
2254 for (itx= frame*3; itx< frame*3+3; itx++) {
2255 n10 = saatran(aa1,&aa1x[last_n1],n1,itx);
2257 fprintf(stderr," itt %d itx: %d\n",itt,itx);
2258 for (i=0; i<n10; i++) {
2259 fprintf(stderr,"%c",aa[f_str->aa1x[last_n1+i]]);
2260 if ((i%60)==59) fprintf(stderr,"\n");
2262 fprintf(stderr,"\n");
2268 score = pro_dna(aa0, n0, f_str->aa1v, n1-2, ppst->pam2[0],
2269 #ifdef OLD_FASTA_GAP
2270 -(ppst->gdelval - ppst->ggapval),
2276 f_str, f_str->max_res, a_res);
2277 /* display_alig(f_str->res,f_str->aa0y,aa1,*nres,n0,f_str); */
2279 a_res->res = f_str->res;
2286 pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) {
2289 int i, last_n1, itemp, n10;
2290 unsigned char *fs, *fd;
2293 /* make a precomputed codon number series */
2295 pre_com(aa1, n1, f_str->aa1v);
2297 else { /* must do things backwards */
2298 pre_com_r(aa1, n1, f_str->aa1v);
2303 /* aln_func_vals - set up aln.qlfact, qlrev, llfact, llmult, frame, llrev */
2304 /* call from calcons, calc_id, calc_code */
2306 aln_func_vals(int frame, struct a_struct *aln) {
2314 if (frame > 0) aln->qlrev = 1;
2315 else aln->qlrev = 0;
2322 if (frame > 0) aln->llrev = 1;
2323 else aln->llrev = 0;
2327 #include "structs.h"
2330 int calcons(const unsigned char *aa0, int n0,
2331 const unsigned char *aa1, int n1,
2333 struct a_struct *aln,
2334 struct a_res_str a_res,
2336 char *seqc0, char *seqc1, char *seqca,
2337 struct f_struct *f_str)
2340 int lenc, not_c, itmp, ngap_p, ngap_d, nfs;
2341 char *sp0, *sp1, *spa, *sq;
2343 const unsigned char *ap0, *ap1;
2347 /* don't fill in the ends */
2351 rpmax = &res[a_res.nres]; /* end of alignment info */
2353 if (pst.ext_sq_set) {sq = pst.sqx;}
2356 /* res[0] has start of protein sequence */
2357 /* res[1] has start of translated DNA sequence */
2359 #ifndef TFAST /* FASTX */
2360 ap0 = f_str->aa0v; /* computed codons -> ap0*/
2361 ap1 = aa1; /* protein sequence -> ap1 */
2362 aln->smin1 = a_res.min0; /* start in protein sequence */
2363 aln->smin0= a_res.min1; /* start in DNA/codon sequence */
2365 ap0 = f_str->aa1v; /* computed codons -> ap0*/
2366 ap1 = aa0; /* protein sequence */
2367 aln->smin0 = a_res.min0; /* start in protein sequence */
2368 aln->smin1 = a_res.min1; /* start in codon sequence */
2371 rp = a_res.res; /* start of alignment info */
2373 /* now get the middle */
2376 sp0 = seqc0; /* sp0/seqc0 is codon sequence */
2377 sp1 = seqc1; /* sp1/seqc1 is protein sequence */
2379 sp1 = seqc0; /* sp1/seqc0 is protein sequence */
2380 sp0 = seqc1; /* sp0/seqc1 is codon sequence */
2383 lenc = not_c = aln->nident = aln->nsim = ngap_d = ngap_p = nfs = 0;
2384 i0 = a_res.min1-3; /* start of codon sequence */
2385 i1 = a_res.min0; /* start of protein sequence */
2387 while (rp < rpmax ) {
2391 *sp1 = sq[aap=ap1[i1++]];
2392 *sp0 = f_str->weight_c[aap][ap0[i0]].c5;
2394 if ((itmp=pst.pam2[0][aap][pascii[*sp0]])<0) { *spa = M_NEG; }
2395 else if (itmp == 0) { *spa = M_ZERO;}
2396 else {*spa = M_POS;}
2397 if (*spa == M_ZERO || *spa == M_POS) { aln->nsim++;}
2399 if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;}
2400 sp0++; sp1++; spa++;
2403 case 2: /* frame shift +2, then match */
2410 *sp1 = sq[aap=ap1[i1++]];
2411 *sp0 = f_str->weight_c[aap][ap0[i0]].c2;
2412 if ((itmp=pst.pam2[0][aap][pascii[*sp0]])<0) { *spa = M_NEG; }
2413 else if (itmp == 0) { *spa = M_ZERO;}
2414 else {*spa = M_POS;}
2416 if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;}
2417 sp0++; sp1++; spa++;
2420 case 4: /* frame shift, -1, then match */
2427 *sp1 = sq[aap=ap1[i1++]];
2428 *sp0 = f_str->weight_c[aap][ap0[i0]].c4;
2429 if ((itmp=pst.pam2[0][aap][pascii[*sp0]])<0) { *spa = M_NEG; }
2430 else if (itmp == 0) { *spa = M_ZERO;}
2431 else {*spa = M_POS;}
2433 if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;}
2434 sp0++; sp1++; spa++;
2437 case 5: /* insertion in 1 */
2439 *sp0++ = f_str->weight_c[0][ap0[i0]].c3;
2445 case 0: /* insertion in 0 */
2447 *sp1++ = sq[ap1[i1++]];
2458 aln->amax0 = i0+3; /* end of codon sequence */
2459 aln->amax1 = i1; /* end of protein sequence */
2460 aln->ngap_q = ngap_d;
2461 aln->ngap_l = ngap_p;
2463 aln->amax1 = i0+3; /* end of codon sequence */
2464 aln->amax0 = i1; /* end of protein sequence */
2465 aln->ngap_q = ngap_p;
2466 aln->ngap_l = ngap_d;
2469 aln->amin0 = aln->smin0;
2470 aln->amin1 = aln->smin1;
2472 if (lenc < 0) lenc = 1;
2475 /* now we have the middle, get the right end */
2480 int calcons_a(const unsigned char *aa0, unsigned char *aa0a, int n0,
2481 const unsigned char *aa1, int n1,
2483 struct a_struct *aln,
2484 struct a_res_str a_res,
2486 char *seqc0, char *seqc0a, char *seqc1, char *seqca,
2487 char *ann_arr, struct f_struct *f_str)
2490 int lenc, not_c, itmp, ngap_p, ngap_d, nfs;
2491 char *sp0, *sp0a, *sp1, *spa, *sq;
2493 const unsigned char *ap0, *ap1;
2496 /* don't fill in the ends */
2498 rpmax = &a_res.res[a_res.nres]; /* end of alignment info */
2500 if (pst.ext_sq_set) {sq = pst.sqx;}
2503 /* res[0] has start of protein sequence */
2504 /* res[1] has start of translated DNA sequence */
2507 ap0 = f_str->aa0v; /* computed codons -> ap0*/
2508 ap1 = aa1; /* protein sequence -> ap1 */
2509 aln->smin1 = a_res.min0; /* start in protein sequence */
2510 aln->smin0= a_res.min1; /* start in DNA/codon sequence */
2512 ap0 = f_str->aa1v; /* computed codons -> ap0*/
2513 ap1 = aa0; /* protein sequence */
2514 aln->smin0 = a_res.min0; /* start in protein sequence */
2515 aln->smin1 = a_res.min1; /* start in codon sequence */
2518 rp = a_res.res; /* start of alignment info */
2521 /* now get the middle */
2525 sp0 = seqc0; /* sp0/seqc0 is codon sequence */
2526 sp1 = seqc1; /* sp1/seqc1 is protein sequence */
2528 sp1 = seqc0; /* sp1/seqc0 is protein sequence */
2529 sp0 = seqc1; /* sp0/seqc1 is codon sequence */
2532 lenc = not_c = aln->nident = aln->nsim = ngap_d = ngap_p = nfs = 0;
2533 i0 = a_res.min1-3; /* start of codon sequence */
2534 i1 = a_res.min0; /* start of protein sequence */
2536 while (rp < rpmax ) {
2541 *sp1 = sq[aap=ap1[i1++]];
2542 *sp0 = f_str->weight_c[aap][ap0[i0]].c5;
2544 if ((itmp=pst.pam2[0][aap][pascii[*sp0]])<0) { *spa = M_NEG; }
2545 else if (itmp == 0) { *spa = M_ZERO;}
2546 else {*spa = M_POS;}
2547 if (*spa == M_ZERO || *spa == M_POS) { aln->nsim++;}
2549 if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;}
2550 sp0++; sp1++; spa++;
2553 case 2: /* frame shift +2, then match */
2565 *sp0a++ = ann_arr[aa0a[i1]];
2567 *sp1 = sq[aap=ap1[i1++]];
2568 *sp0 = f_str->weight_c[aap][ap0[i0]].c2;
2569 if ((itmp=pst.pam2[0][aap][pascii[*sp0]])<0) { *spa = M_NEG; }
2570 else if (itmp == 0) { *spa = M_ZERO;}
2571 else {*spa = M_POS;}
2572 if (*spa == M_ZERO || *spa == M_POS) { aln->nsim++;}
2574 if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;}
2575 sp0++; sp1++; spa++;
2578 case 4: /* frame shift, -1, then match */
2584 *sp0a++ = ann_arr[aa0a[i1]];
2590 *sp1 = sq[aap=ap1[i1++]];
2591 *sp0 = f_str->weight_c[aap][ap0[i0]].c4;
2592 if ((itmp=pst.pam2[0][aap][pascii[*sp0]])<0) { *spa = M_NEG; }
2593 else if (itmp == 0) { *spa = M_ZERO;}
2594 else {*spa = M_POS;}
2595 if (*spa == M_ZERO || *spa == M_POS) { aln->nsim++;}
2597 if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;}
2598 sp0++; sp1++; spa++;
2601 case 5: /* insertion in 1 */
2603 *sp0++ = f_str->weight_c[0][ap0[i0]].c3;
2610 case 0: /* insertion in 0 */
2615 *sp0a++ = ann_arr[aa0a[i1]];
2617 *sp1++ = sq[ap1[i1++]];
2625 *sp0a = *spa = '\0';
2628 aln->amax0 = i0+3; /* end of codon sequence */
2629 aln->amax1 = i1; /* end of protein sequence */
2630 aln->ngap_q = ngap_d;
2631 aln->ngap_l = ngap_p;
2633 aln->amax1 = i0+3; /* end of codon sequence */
2634 aln->amax0 = i1; /* end of protein sequence */
2635 aln->ngap_q = ngap_p;
2636 aln->ngap_l = ngap_d;
2639 aln->amin0 = aln->smin0;
2640 aln->amin1 = aln->smin1;
2642 if (lenc < 0) lenc = 1;
2645 /* now we have the middle, get the right end */
2651 update_code(char *al_str, int al_str_max, int op, int op_cnt, char *op_char) {
2655 sprintf(tmp_cnt,"%c%d",op_char[op],op_cnt);
2656 strncat(al_str,tmp_cnt,al_str_max);
2659 /* build an array of match/ins/del - length strings */
2660 int calc_code(const unsigned char *aa0, int n0,
2661 const unsigned char *aa1, int n1,
2662 struct a_struct *aln,
2663 struct a_res_str a_res,
2665 char *al_str, int al_str_n, struct f_struct *f_str)
2668 int lenc, not_c, itmp, ngap_d, ngap_p, nfs;
2670 char sp0, sp1, op_char[10];
2672 const unsigned char *ap0, *ap1;
2675 /* don't fill in the ends */
2678 strncpy(op_char,"- /=\\+*",sizeof(op_char));
2679 ap0 = f_str->aa0v; /* computed codons -> ap0*/
2680 ap1 = aa1; /* protein sequence -> ap1 */
2681 aln->smin1 = a_res.min0; /* start in protein sequence */
2682 aln->smin0= a_res.min1; /* start in DNA/codon sequence */
2684 strncpy(op_char,"+ /=\\-*",sizeof(op_char));
2685 ap0 = f_str->aa1v; /* computed codons -> ap0*/
2686 ap1 = aa0; /* protein sequence */
2687 aln->smin0 = a_res.min0; /* start in protein sequence */
2688 aln->smin1 = a_res.min1; /* start in codon sequence */
2691 rp = a_res.res; /* start of alignment info */
2692 rpmax = &a_res.res[a_res.nres]; /* end of alignment info */
2694 /* now get the middle */
2696 lenc = not_c = aln->nident = aln->nsim = ngap_d = ngap_p = nfs = 0;
2700 i0 = a_res.min1-3; /* start of codon sequence */
2701 i1 = a_res.min0; /* start of protein sequence */
2703 while (rp < rpmax ) {
2706 sp1 = pst.sq[aap=ap1[i1++]];
2708 sp0 = f_str->weight_c[aap][ap0[i0]].c5;
2709 if (pst.pam2[0][aap][pascii[sp0]]>=0) { aln->nsim++; }
2711 if (op == 3 || op == 6) {
2712 if (sp0 != '*' && sp1 != '*') {
2714 update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char);
2720 update_code(al_str, al_str_n-strlen(al_str),op, op_cnt,op_char);
2725 update_code(al_str, al_str_n-strlen(al_str),op, op_cnt, op_char);
2728 if (sp0 == sp1) aln->nident++;
2731 case 2: /* -1 frame shift */
2732 update_code(al_str, al_str_n-strlen(al_str),op, op_cnt, op_char);
2734 update_code(al_str, al_str_n-strlen(al_str),op, op_cnt, op_char);
2740 sp1 = pst.sq[aap=ap1[i1++]];
2741 sp0 = f_str->weight_c[aap][ap0[i0]].c2;
2742 if (pst.pam2[0][aap][pascii[sp0]]>=0) { aln->nsim++; }
2743 if (sp0 == sp1) aln->nident++;
2746 case 4: /* +1 frame shift */
2747 update_code(al_str, al_str_n-strlen(al_str),op, op_cnt, op_char);
2749 update_code(al_str, al_str_n-strlen(al_str),op, op_cnt, op_char);
2755 sp1 = pst.sq[aap=ap1[i1++]];
2756 sp0 = f_str->weight_c[aap][ap0[i0]].c4;
2757 if (pst.pam2[0][aap][pascii[sp0]]>=0) { aln->nsim++; }
2758 if (sp0 == sp1) aln->nident++;
2761 case 5: /* insert in 1 */
2762 if (op == 5) op_cnt++;
2764 update_code(al_str, al_str_n-strlen(al_str),op, op_cnt, op_char);
2772 case 0: /* insert in 0 */
2773 if (op == 0) op_cnt++;
2775 update_code(al_str, al_str_n-strlen(al_str),op, op_cnt, op_char);
2786 update_code(al_str, al_str_n-strlen(al_str),op, op_cnt, op_char);
2789 aln->amax0 = i0+3; /* end of codon sequence */
2790 aln->amax1 = i1; /* end of protein sequence */
2791 aln->ngap_q = ngap_d;
2792 aln->ngap_l = ngap_p;
2794 aln->amax1 = i0+3; /* end of codon sequence */
2795 aln->amax0 = i1; /* end of protein sequence */
2796 aln->ngap_q = ngap_p;
2797 aln->ngap_l = ngap_d;
2800 aln->amin0 = aln->smin0;
2801 aln->amin1 = aln->smin1;
2803 if (lenc < 0) lenc = 1;
2805 /* now we have the middle, get the right end */
2810 int calc_id(const unsigned char *aa0, int n0,
2811 const unsigned char *aa1, int n1,
2812 struct a_struct *aln,
2813 struct a_res_str a_res,
2815 struct f_struct *f_str)
2818 int lenc, not_c, itmp, ngap_d, ngap_p, nfs;
2821 const unsigned char *ap0, *ap1;
2824 /* don't fill in the ends */
2826 #ifndef TFAST /* FASTYZ */
2827 ap0 = f_str->aa0v; /* computed codons -> ap0*/
2828 ap1 = aa1; /* protein sequence -> ap1 */
2829 aln->smin1 = a_res.min0; /* start in protein sequence */
2830 aln->smin0 = a_res.min1; /* start in DNA/codon sequence */
2832 ap0 = f_str->aa1v; /* computed codons -> ap0*/
2833 ap1 = aa0; /* protein sequence */
2834 aln->smin0 = a_res.min0; /* start in protein sequence */
2835 aln->smin1 = a_res.min1; /* start in codon sequence */
2838 rp = a_res.res; /* start of alignment info */
2839 rpmax = &a_res.res[a_res.nres]; /* end of alignment info */
2841 /* now get the middle */
2843 lenc = not_c = aln->nident = aln->nsim = ngap_d = ngap_p = nfs = 0;
2844 i0 = a_res.min1-3; /* start of codon sequence */
2845 i1 = a_res.min0; /* start of protein sequence */
2847 while (rp < rpmax ) {
2851 sp1 = pst.sq[aap=ap1[i1++]];
2852 sp0 = f_str->weight_c[aap][ap0[i0]].c5;
2853 if (pst.pam2[0][aap][pascii[sp0]]>=0) { aln->nsim++; }
2854 if (sp0 == sp1) aln->nident++;
2861 sp1 = pst.sq[aap=ap1[i1++]];
2862 sp0 = f_str->weight_c[aap][ap0[i0]].c2;
2863 if (pst.pam2[0][aap][pascii[sp0]]>=0) { aln->nsim++; }
2864 if (sp0 == sp1) aln->nident++;
2871 sp1 = pst.sq[aap=ap1[i1++]];
2872 sp0 = f_str->weight_c[aap][ap0[i0]].c4;
2873 if (pst.pam2[0][aap][pascii[sp0]]>=0) { aln->nsim++; }
2874 if (sp0 == sp1) aln->nident++;
2891 aln->amax0 = i0+3; /* end of codon sequence */
2892 aln->amax1 = i1; /* end of protein sequence */
2893 aln->ngap_q = ngap_d;
2894 aln->ngap_l = ngap_p;
2896 aln->amax1 = i0+3; /* end of codon sequence */
2897 aln->amax0 = i1; /* end of protein sequence */
2898 aln->ngap_q = ngap_p;
2899 aln->ngap_l = ngap_d;
2902 aln->amin0 = aln->smin0;
2903 aln->amin1 = aln->smin1;
2905 if (lenc < 0) lenc = 1;
2907 /* now we have the middle, get the right end */
2915 update_params(struct qmng_str *qm_msg, struct pstruct *ppst)
2917 ppst->n0 = qm_msg->n0;