/* copyright (c) 1998, 1999 William R. Pearson and the U. of Virginia */ /* $Name: fa_34_26_5 $ - $Id: dropfs2.c,v 1.40 2007/02/26 21:56:59 wrp Exp $ */ /* changed to return 2.0, rather than -1.0, for failure */ /* Feb 4, 2005 - modifications to allow searches with ktup=2 for very long queries. This is a temporary solution to savemax(), spam() which do not preserve exact matches do_fasts() has been modified to allow higher maxsav for do_walign than for do_work (2*nsegs, 6*nsegs) */ /* this code implements the "fasts" algorithm, which compares a set of protein fragments to a protein sequence. Comma's are used to separate the sequence fragments, which need not be the same length. The expected input is: >mgstm1 MGDAPDFD, MILGYW, MLLEYTDS The fragments do not need to be in the correct order (which is presumably unknown from the peptide sequencing. */ #include #include #include #include #include #include "defs.h" #include "param.h" #include "tatstats.h" #define EOSEQ 0 #define ESS 49 #define NMAP_X 23 /* for 'X' */ #define NMAP_Z 24 /* for '*' */ #define MAXHASH 32 #define NMAP MAXHASH+1 static char *verstr="4.32 Feb 2007"; #define DROP_INTERN #include "drop_func.h" int shscore(const unsigned char *aa0, const int n0, int **pam2, int nsq); static void update_code(char *al_str, int al_str_max, int op, int op_cnt, int fnum); extern void aancpy(char *to, char *from, int count, struct pstruct pst); #ifdef TFAST extern int aatran(const unsigned char *ntseq, unsigned char *aaseq, const int maxs, const int frame); #endif void savemax(struct dstruct *, struct f_struct *, int maxsav, int exact,int t_end); int spam(const unsigned char *, const unsigned char *, int, struct savestr *, int **, struct f_struct *); int sconn(struct savestr **v, int nsave, struct f_struct *, struct rstruct *, struct pstruct *, const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int opt_prob); void kpsort(struct savestr **, int); void kssort(struct savestr **, int); /* sort by score */ int sconn_a(unsigned char *, int, const unsigned char *, int, struct f_struct *, struct a_res_str *, struct pstruct *); void kpsort(struct savestr **, int); /* initialize for fasta */ void init_work (unsigned char *aa0, const int n0, struct pstruct *ppst, struct f_struct **f_arg ) { int mhv, phv; int hmax, nsegs; int i0, ib, hv, old_hv; int pamfact; struct f_struct *f_str; /* these used to be globals, but do not need to be */ int ktup, fact, kt1; int maxn0; int stmp; /* temporary score */ int i, j, q; int tat_size; int *res; unsigned char *query; int k, l, m, n, N, length, index; double *tatprobptr; f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct)); ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval; ktup = ppst->param_u.fa.ktup; if ( ktup > 2 ) { ktup = ppst->param_u.fa.ktup = 2; } fact = ppst->param_u.fa.scfact; /* fasts3 cannot work with lowercase symbols as low complexity; thus, NMAP must be disabled; this depends on aascii['X'] */ if (ppst->hsq[NMAP_X] == NMAP ) {ppst->hsq[NMAP_X]=1;} if (ppst->hsq[NMAP_Z] == NMAP ) {ppst->hsq[NMAP_Z]=1;} /* this does not work in a threaded environment */ /* else {fprintf(stderr," cannot find 'X'==NMAP\n");} */ for (i0 = 1, mhv = -1; i0 <= ppst->nsq; i0++) if (ppst->hsq[i0] < NMAP && ppst->hsq[i0] > mhv) mhv = ppst->hsq[i0]; if (mhv <= 0) { fprintf (stderr, " maximum hsq <=0 %d\n", mhv); exit (1); } for (f_str->kshft = 0; mhv > 0; mhv /= 2) f_str->kshft++; /* kshft = 2; */ kt1 = ktup-1; hv = 1; for (i0 = 0; i0 < ktup; i0++) hv = hv << f_str->kshft; hmax = hv; f_str->hmask = (hmax >> f_str->kshft) - 1; if ((f_str->aa0t = (unsigned char *) calloc(n0+1, sizeof(char))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0t array; %d\n",n0+1); exit (1); } if ((f_str->aa0ti = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0ti array; %d\n",n0+1); exit (1); } if ((f_str->aa0b = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0b array; %d\n",n0+1); exit (1); } if ((f_str->aa0e = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0e array; %d\n",n0+1); exit (1); } if ((f_str->aa0i = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0i array; %d\n",n0+1); exit (1); } if ((f_str->aa0s = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0s array; %d\n",n0+1); exit (1); } if ((f_str->aa0l = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0l array; %d\n",n0+1); exit (1); } if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate hash array: hmax: %d hmask: %d\n", hmax, f_str->hmask); exit (1); } if ((f_str->pamh1 = (int *) calloc (ppst->nsq+1, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate pamh1 array\n"); exit (1); } if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate pamh2 array\n"); exit (1); } if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate hash link array"); exit (1); } /* for FASTS/FASTM, we want to know when we get to the end of a peptide, so we can ensure that we set the end and restart */ if ((f_str->l_end = (int *) calloc (n0, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate link end array"); exit (1); } for (i0 = 0; i0 < hmax; i0++) f_str->harr[i0] = -1; for (i0 = 0; i0 < n0; i0++) f_str->link[i0] = -1; for (i0 = 0; i0 < n0; i0++) f_str->l_end[i0] = 0; /* count the number of peptides */ nsegs = 1; for (i0 = 0; i0 < n0; i0++) { if (aa0[i0] == ESS || aa0[i0] == 0) nsegs++; } /* allocate space for peptides offsets, nm_u */ if ((f_str->nmoff = (int *)calloc(nsegs+1, sizeof(int)))==NULL) { fprintf(stderr, " cannot allocat nmoff array: %d\n", nsegs); exit(1); } if ((f_str->nm_u = (int *)calloc(nsegs+1, sizeof(int)))==NULL) { fprintf(stderr, " cannot allocat nm_u array: %d\n", nsegs); exit(1); } phv = hv = 0; f_str->nmoff[0] = 0; f_str->nm0 = 1; /* encode the aa0 array */ if (kt1 > 0) { hv = ppst->hsq[aa0[0]]; phv = ppst->pam2[0][aa0[0]][aa0[0]]; } for (i0=kt1 ; i0 < n0; i0++) { if (aa0[i0] == ESS || aa0[i0] == 0) { /* fprintf(stderr," converted %d to 0\n",aa0[i0]); */ aa0[i0] = EOSEQ; /* set ESS to 0 */ f_str->nmoff[f_str->nm0++] = i0+1; f_str->l_end[i0-1] = 1; phv = hv = 0; if (kt1 > 0) { i0++; hv = ppst->hsq[aa0[i0]]; phv = ppst->pam2[0][aa0[i0]][aa0[i0]]; } continue; } hv = ((hv & f_str->hmask) << f_str->kshft) + ppst->hsq[aa0[i0]]; f_str->link[i0] = f_str->harr[hv]; f_str->harr[hv] = i0; f_str->pamh2[hv] = (phv += ppst->pam2[0][aa0[i0]][aa0[i0]]); phv -= ppst->pam2[0][aa0[i0 - kt1]][aa0[i0 - kt1]]; } f_str->l_end[n0-1] = 1; f_str->nmoff[f_str->nm0] = n0+1; /* #ifdef DEBUG fprintf(stderr, ">>%s\n",qtitle); for (j=0; jnm0; j++) { for (i=f_str->nmoff[j]; i < f_str->nmoff[j+1]-1; i++) { fprintf(stderr,"%c",ppst->sq[aa0[i]]); } fprintf(stderr," %d\n",aa0[i]); } for (j=1; j<=ppst->nsq; j++) { fprintf(stderr, "%c %d\n", ppst->sq[j], f_str->harr[j]); } for (j=0; j<=n0; j++) { fprintf(stderr, "%c %d\n", ppst->sq[aa0[j]], f_str->link[j]); } #endif */ /* build an integer array of the max score that can be achieved from that position - use in savemax to mark some segments as fixed */ /* setup aa0b[], aa0e[], which specify the begining and end of each segment */ stmp = 0; q = -1; for (ib = i0 = 0; i0 < n0; i0++) { f_str->aa0l[i0] = i0 - q; if (aa0[i0]==EOSEQ) { f_str->aa0b[i0] = -1; f_str->aa0e[i0] = -1; f_str->aa0i[i0] = -1; f_str->aa0l[i0] = -1; q = i0; if (i0 > 0)f_str->aa0s[i0-1] = stmp; stmp = 0; ib++; } else { stmp += ppst->pam2[0][aa0[i0]][aa0[i0]]; } f_str->aa0b[i0] = f_str->nmoff[ib]; f_str->aa0e[i0] = f_str->nmoff[ib+1]-2; f_str->aa0i[i0] = ib; /* fprintf(stderr,"%2d %c: %2d %2d %2d\n",i0,ppst->sq[aa0[i0]], f_str->aa0b[i0],f_str->aa0e[i0],f_str->aa0i[i0]); */ } f_str->aa0s[n0-1]=stmp; /* save last best possible score */ /* maxsav - maximum number of peptide alignments saved in search */ /* maxsav_w - maximum number of peptide alignments saved in alignment */ f_str->maxsav = max(MAXSAV,2*f_str->nm0); f_str->maxsav_w = max(MAXSAV,6*f_str->nm0); if ((f_str->vmax = (struct savestr *) calloc(f_str->maxsav_w,sizeof(struct savestr)))==NULL) { fprintf(stderr, "Couldn't allocate vmax[%d].\n",f_str->maxsav_w); exit(1); } if ((f_str->vptr = (struct savestr **) calloc(f_str->maxsav_w,sizeof(struct savestr *)))==NULL) { fprintf(stderr, "Couldn't allocate vptr[%d].\n",f_str->maxsav_w); exit(1); } if ((f_str->sarr = (struct slink *) calloc(f_str->maxsav_w,sizeof(struct slink)))==NULL) { fprintf(stderr, "Couldn't allocate sarr[%d].\n",f_str->maxsav_w); exit(1); } /* Tatusov Statistics Setup */ /* initialize priors array. */ if((f_str->priors = (double *)calloc(ppst->nsq+1, sizeof(double))) == NULL) { fprintf(stderr, "Couldn't allocate priors array.\n"); exit(1); } calc_priors(f_str->priors, ppst, f_str, NULL, 0, ppst->pseudocts); /* pre-calculate the Tatusov probability array for each full segment */ if(ppst->zsflag >= 1 && ppst->zsflag <= 3 && f_str->nm0 <= 10) { tat_size = (1<nm0) -1; f_str->dotat = 1; f_str->tatprobs = (struct tat_str **) malloc((size_t)tat_size*sizeof(struct tat_str *)); if (f_str->tatprobs == NULL) { fprintf (stderr, " cannot allocate tatprobs array: %ld\n", tat_size * sizeof(struct tat_str *)); exit (1); } f_str->intprobs = (double **) malloc((size_t)tat_size * sizeof(double *)); if(f_str->intprobs == NULL) { fprintf(stderr, "Couldn't allocate intprobs array.\n"); exit(1); } for(k = 0, l = f_str->nm0 ; k < l ; k++) { query = &(aa0[f_str->nmoff[k]]); length = f_str->nmoff[k+1] - f_str->nmoff[k] - 1; /* this segment alone */ index = (1 << k) - 1; generate_tatprobs(query, 0, length - 1, f_str->priors, ppst->pam2[0], ppst->nsq, &(f_str->tatprobs[index]), NULL); /* integrate the probabilities */ N = f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore; tatprobptr = (double *) calloc(N+1, sizeof(double)); if(tatprobptr == NULL) { fprintf(stderr, "Couldn't calloc tatprobptr.\n"); exit(1); } f_str->intprobs[index] = tatprobptr; for (i = 0; i <= N ; i++ ) { tatprobptr[i] = f_str->tatprobs[index]->probs[i]; for (j = i + 1 ; j <= N ; j++ ) { tatprobptr[i] += f_str->tatprobs[index]->probs[j]; } } /* this segment built on top of all other subcombinations */ for(i = 0, j = (1 << k) - 1 ; i < j ; i++) { index = (1 << k) + i; generate_tatprobs(query, 0, length - 1, f_str->priors, ppst->pam2[0], ppst->nsq, &(f_str->tatprobs[index]), f_str->tatprobs[i]); /* integrate the probabilities */ N = f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore; tatprobptr = (double *) calloc(N+1, sizeof(double)); if(tatprobptr == NULL) { fprintf(stderr, "Couldn't calloc tatprobptr.\n"); exit(1); } f_str->intprobs[index] = tatprobptr; for (m = 0; m <= N ; m++ ) { tatprobptr[m] = f_str->tatprobs[index]->probs[m]; for (n = m + 1 ; n <= N ; n++ ) { tatprobptr[m] += f_str->tatprobs[index]->probs[n]; } } } } } else { f_str->dotat = 0; f_str->shuff_cnt = ppst->shuff_node; } /* End of Tatusov Statistics Setup */ /* for (i0=1; i0<=ppst->nsq; i0++) { fprintf(stderr," %c: %2d ",ppst->sq[i0],f_str->harr[i0]); hv = f_str->harr[i0]; while (hv >= 0) { fprintf(stderr," %2d",f_str->link[hv]); hv = f_str->link[hv]; } fprintf(stderr,"\n"); } */ /* this has been modified from 0..nsq to 1..<=ppst->nsq because the pam2[0][0] is now undefined for consistency with blast */ for (i0 = 1; i0 <= ppst->nsq; i0++) f_str->pamh1[i0] = ppst->pam2[0][i0][i0]; f_str->ndo = 0; f_str->noff = n0-1; if (f_str->diag==NULL) f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG, sizeof (struct dstruct)); if (f_str->diag == NULL) { fprintf (stderr, " cannot allocate diagonal arrays: %ld\n", (long) MAXDIAG * (long) (sizeof (struct dstruct))); exit (1); } #ifdef TFAST if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+2, sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+2); exit (1); } f_str->aa1x++; #endif maxn0 = max(3*n0/2,MIN_RES); if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) { fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0); exit(1); } f_str->res = res; f_str->max_res = maxn0; *f_arg = f_str; } /* pstring1 is a message to the manager, currently 512 */ /* pstring2 is the same information, but in a markx==10 format */ void get_param (struct pstruct *pstr, char *pstring1, char *pstring2) { #ifdef FASTS #ifndef TFAST char *pg_str="FASTS"; #else char *pg_str="TFASTS"; #endif #endif #ifdef FASTM #ifndef TFAST char *pg_str="FASTM"; #else char *pg_str="TFASTM"; #endif #endif sprintf (pstring1, "%s (%s) function [%s matrix (%d:%d)] ktup=%d",pg_str,verstr, pstr->pamfile, pstr->pam_h,pstr->pam_l, pstr->param_u.fa.ktup); if (pstr->param_u.fa.iniflag) strcat(pstring1," init1"); /* if (pstr->zsflag==0) strcat(pstring1," not-scaled"); else if (pstr->zsflag==1) strcat(pstring1," reg.-scaled"); */ if (pstring2 != NULL) { sprintf (pstring2, "; pg_name: %s\n; pg_ver: %s\n; pg_matrix: %s (%d:%d)\n\ ; pg_gap-pen: %d %d\n; pg_ktup: %d\n", pg_str,verstr,pstr->pamfile, pstr->pam_h,pstr->pam_l, pstr->gdelval, pstr->ggapval,pstr->param_u.fa.ktup); } } void close_work (const unsigned char *aa0, const int n0, struct pstruct *ppst, struct f_struct **f_arg) { struct f_struct *f_str; int i, j; f_str = *f_arg; if (f_str != NULL) { free(f_str->res); #ifdef TFAST free(f_str->aa1x - 1); /* because f_str->aa1x got ++'ed when allocated! */ #endif free(f_str->diag); free(f_str->l_end); free(f_str->link); free(f_str->pamh2); free(f_str->pamh1); free(f_str->harr); free(f_str->vmax); free(f_str->vptr); free(f_str->sarr); free(f_str->aa0i); free(f_str->aa0e); free(f_str->aa0b); free(f_str->aa0ti); free(f_str->aa0t); free(f_str->nmoff); free(f_str->nm_u); if(f_str->dotat) { for(i = 0, j = (1 << f_str->nm0) - 1 ; i < j ; i++) { free(f_str->tatprobs[i]->probs); free(f_str->tatprobs[i]); free(f_str->intprobs[i]); } free(f_str->tatprobs); free(f_str->intprobs); } free(f_str->priors); free(f_str); *f_arg = NULL; } } void do_fasts (const unsigned char *aa0, const int n0, const unsigned char *aa1, const int n1, struct pstruct *ppst, struct f_struct *f_str, struct rstruct *rst, int *hoff, int opt_prob, int maxsav) { int nd; /* diagonal array size */ int lhval; int kfact; register struct dstruct *dptr; register int tscor; register struct dstruct *diagp; struct dstruct *dpmax; register int lpos; int tpos; struct savestr *vmptr, *vmaxmax; int scor, tmp; int im, ib, nsave; int cmps (); /* comparison routine for ksort */ int ktup; int doffset; vmaxmax = &f_str->vmax[maxsav]; ktup = ppst->param_u.fa.ktup; if (n1 < ktup) { rst->score[0] = rst->score[1] = rst->score[2] = 0; rst->escore = 1.0; rst->segnum = 0; rst->seglen = 0; return; } if (n0+n1+1 >= MAXDIAG) { fprintf(stderr,"n0,n1 too large: %d, %d\n",n0,n1); rst->score[0] = rst->score[1] = rst->score[2] = -1; rst->escore = 2.0; rst->segnum = 0; rst->seglen = 0; return; } nd = n0 + n1; dpmax = &f_str->diag[nd]; for (dptr = &f_str->diag[f_str->ndo]; dptr < dpmax;) { dptr->stop = -1; dptr->dmax = NULL; dptr++->score = 0; } for (vmptr = f_str->vmax; vmptr < vmaxmax; vmptr++) { vmptr->score = 0; vmptr->exact = 0; } f_str->lowmax = f_str->vmax; f_str->lowscor = 0; /* start hashing */ diagp = &f_str->diag[f_str->noff]; for (lhval=lpos=0; lpos < n1; lpos++, diagp++) { if (ppst->hsq[aa1[lpos]]>=NMAP) { /* skip residue */ lpos++ ; diagp++; while (lpos < n1 && ppst->hsq[aa1[lpos]]>=NMAP) {lpos++; diagp++;} if (lpos >= n1) break; lhval = 0; } lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]]; for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) { dptr = &diagp[-tpos]; if (f_str->l_end[tpos]) { if (dptr->score + f_str->pamh1[aa0[tpos]] == f_str->aa0s[tpos]) { dptr->stop = lpos; dptr->score = f_str->aa0s[tpos]; savemax(dptr, f_str, maxsav, 1, tpos); dptr->dmax = NULL; } else if (dptr->score + f_str->pamh1[aa0[tpos]] > f_str->aa0s[tpos]) { /* fprintf(stderr,"exact match score too high: %d:%d %d < %d + %d - %d:%d - %d > %d\n", tpos, lpos, f_str->aa0s[tpos],dptr->score, f_str->pamh1[aa0[tpos]], dptr->start, dptr->stop, dptr->stop - dptr->start, f_str->aa0l[tpos]); */ dptr->stop = lpos; dptr->start = lpos - f_str->aa0l[tpos]; dptr->score = f_str->aa0s[tpos]; savemax(dptr, f_str, maxsav, 1, tpos); dptr->dmax = NULL; } } else if ((tscor = dptr->stop) >= 0) { tscor++; /* tscor is stop of current, increment it */ if ((tscor -= lpos) <= 0) { /* tscor, the end of the current match, is before lpos, so there is a mismatch - this is also the mismatch cost */ tscor *= 2; scor = dptr->score; /* save the run score on the diag */ if ((tscor += (kfact = f_str->pamh2[lhval])) < 0 && f_str->lowscor < scor) { /* if what we will get (tscor + kfact) is < 0 and the score is better than the worst savemax() score, save it */ savemax (dptr, f_str, maxsav,0,-1); } /* if extending is better than starting over, extend */ if ((tscor += scor) >= kfact) { dptr->score = tscor; dptr->stop = lpos; if (f_str->l_end[tpos]) { if (dptr->score == f_str->aa0s[tpos]) { savemax(dptr, f_str, maxsav,1,tpos); dptr->dmax = NULL; } else if (dptr->score > f_str->lowscor) savemax(dptr, f_str, maxsav,0,tpos); } } else { /* otherwise, start new */ dptr->score = kfact; dptr->start = dptr->stop = lpos; } } else { /* tscor is after lpos, so extend one residue */ dptr->score += f_str->pamh1[aa0[tpos]]; dptr->stop = lpos; if (f_str->l_end[tpos]) { if (dptr->score == f_str->aa0s[tpos]) { savemax(dptr, f_str, maxsav,1,tpos); dptr->dmax = NULL; } else if (dptr->score > f_str->lowscor) savemax(dptr, f_str, maxsav,0,tpos); } } } else { /* start new */ dptr->score = f_str->pamh2[lhval]; dptr->start = dptr->stop = lpos; } } /* end tpos */ } /* end lpos */ for (dptr = f_str->diag; dptr < dpmax;) { if (dptr->score > f_str->lowscor) savemax (dptr, f_str, maxsav,0,-1); dptr->stop = -1; dptr->dmax = NULL; dptr++->score = 0; } f_str->ndo = nd; /* at this point all of the elements of aa1[lpos] have been searched for elements of aa0[tpos] with the results in diag[dpos] */ for (nsave=0, vmptr=f_str->vmax; vmptr< vmaxmax; vmptr++) { if (vmptr->score > 0) { /* fprintf(stderr,"%c 0: %4d-%4d 1: %4d-%4d dp: %d score: %d", (vmptr->exact ? 'x' : ' '), f_str->noff+vmptr->start-vmptr->dp, f_str->noff+vmptr->stop-vmptr->dp, vmptr->start,vmptr->stop, vmptr->dp,vmptr->score); */ vmptr->score = spam (aa0, aa1, n1, vmptr, ppst->pam2[0], f_str); /* fprintf(stderr," sscore: %d %d-%d\n",vmptr->score,vmptr->start,vmptr->stop); */ if (vmptr->score > 0) f_str->vptr[nsave++] = vmptr; } } if (nsave <= 0) { rst->score[0] = rst->score[1] = rst->score[2] = 0; rst->escore = 1.0; rst->segnum = 0; rst->seglen = 0; f_str->nsave = 0; return; } /* fprintf(stderr,"n0: %d; n1: %d; noff: %d\n",n0,n1,f_str->noff); for (ib=0; ibvptr[ib]->exact ? 'x' : ' ', f_str->noff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp, f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp, f_str->vptr[ib]->start,f_str->vptr[ib]->stop, f_str->vptr[ib]->dp,f_str->vptr[ib]->score); } fprintf(stderr,"---\n"); */ kssort(f_str->vptr,nsave); /* make certain each seg is used only once */ for (ib=0; ibnm0; ib++) f_str->nm_u[ib]=0; for (ib=0; ib < nsave; ib++) { doffset = f_str->vptr[ib]->dp - f_str->noff; tpos=f_str->aa0i[f_str->vptr[ib]->start - doffset]; if (f_str->nm_u[tpos] == 0) { f_str->nm_u[tpos]=1; } else { f_str->vptr[ib]->score = -1; } } kssort(f_str->vptr,nsave); for (ib = nsave-1; ib >= 0; ib--) if (f_str->vptr[ib]->score > -1) break; nsave = ib+1; scor = sconn (f_str->vptr, nsave, f_str, rst, ppst, aa0, n0, aa1, n1, opt_prob); if (rst->escore < 0.0) rst->escore = 2.0; kssort(f_str->vptr,nsave); /* here we should use an nsave that is consistent with sconn and nm0 */ f_str->nsave = nsave; if (nsave > f_str->nm0) f_str->nsave = f_str->nm0; rst->score[1] = f_str->vptr[0]->score; rst->score[0] = rst->score[2] = max(scor, f_str->vptr[0]->score); } void do_work (const unsigned char *aa0, const int n0, const unsigned char *aa1, const int n1, int frame, struct pstruct *ppst, struct f_struct *f_str, int qr_flg, struct rstruct *rst) { int opt_prob; int hoff, n10, i; if (qr_flg==1 && f_str->shuff_cnt <= 0) { rst->escore = 2.0; rst->score[0]=rst->score[1]=rst->score[2]= -1; return; } if (f_str->dotat || ppst->zsflag == 4 || ppst->zsflag == 14 ) opt_prob=1; else opt_prob = 0; if (ppst->zsflag == 2 || ppst->zsflag == 12) opt_prob = 0; if (qr_flg) { opt_prob=1; /* if (frame==1) */ f_str->shuff_cnt--; } if (n1 < ppst->param_u.fa.ktup) { rst->score[0] = rst->score[1] = rst->score[2] = -1; rst->escore = 2.0; return; } #ifdef TFAST n10=aatran(aa1,f_str->aa1x,n1,frame); if (ppst->debug_lib) for (i=0; iaa1x[i]>ppst->nsq) { fprintf(stderr, "residue[%d/%d] %d range (%d)\n",i,n1, f_str->aa1x[i],ppst->nsq); f_str->aa1x[i]=0; n10=i-1; } do_fasts (aa0, n0, f_str->aa1x, n10, ppst, f_str, rst, &hoff, opt_prob, f_str->maxsav); #else /* FASTA */ do_fasts (aa0, n0, aa1, n1, ppst, f_str, rst, &hoff, opt_prob, f_str->maxsav); #endif rst->comp = rst->H = -1.0; } void do_opt (const unsigned char *aa0, const int n0, const unsigned char *aa1, const int n1, int frame, struct pstruct *ppst, struct f_struct *f_str, struct rstruct *rst) { int lag, tscore, hoff, n10; #ifdef TFAST n10=aatran(aa1,f_str->aa1x,n1,frame); do_fasts (aa0, n0, f_str->aa1x, n10, ppst, f_str, rst, &hoff, 1, f_str->maxsav); #else /* FASTA */ do_fasts(aa0,n0,aa1,n1,ppst,f_str,rst, &hoff, 1, f_str->maxsav); #endif } /* modify savemax() so that full length 100% matches are marked so that they cannot be removed - if we have a 100% match, mark "exact" modify savemax() to split alignments that include a comma */ /* savemax(dptr, f_str, maxsav) takes a current diagonal run (saved in dptr), and places it in the set of runs to be saved (in f_str->vmax[]) */ void savemax (struct dstruct *dptr, struct f_struct *f_str, int maxsav, int exact, int tpos) { register int dpos; /* position along the diagonal, -n0 .. n1 */ int i, j, lowj; register struct savestr *vmptr; struct savestr *vmaxmax; vmaxmax = &f_str->vmax[maxsav]; dpos = (int) (dptr - f_str->diag); /* current diagonal */ /* check to see if this is the continuation of a run that is already saved */ /* if we are at the end of the query, save it regardless */ /* if (t_end > 0 && t_end < dptr->stop - dptr->start) {return;} */ if ((vmptr = dptr->dmax) != NULL /* have an active run */ && vmptr->dp == dpos && /* on the correct diagonal */ vmptr->start == dptr->start) { /* and it starts at the same place */ vmptr->stop = dptr->stop; /* update the end of the match in vmax[] */ if (exact == 1) { /* fprintf(stderr,"have cont exact match: %d - %d:%d %d:%d = %d\n", dptr->score, dptr->start, dptr->stop, vmptr->start, vmptr->stop, dptr->stop - dptr->start+1); */ exact = 1; } /* if the score is worse, don't update, return - if the score gets bad enough, it will restart in the diagonal scan */ if ((i = dptr->score) <= vmptr->score) { return;} /* score is better, update */ vmptr->score = i; vmptr->exact = exact; /* if the score is not the worst, return */ if (vmptr != f_str->lowmax) { return;} } else { /* not a continuation */ /* save in the lowest place */ /* fprintf(stderr," Replacing: %d - %d:%d => %d - %d:%d", f_str->lowmax->score, f_str->lowmax->start, f_str->lowmax->stop, dptr->score, dptr->start, dptr->stop); */ vmptr = f_str->lowmax; /* if (exact == 1) { fprintf(stderr,"have new exact match: %d - %d:%d = %d\n", dptr->score, dptr->start, dptr->stop, dptr->stop - dptr->start+1); } */ vmptr->exact = exact; i = vmptr->score = dptr->score; /* 'i' is used as a bound */ vmptr->dp = dpos; vmptr->start = dptr->start; vmptr->stop = dptr->stop; dptr->dmax = vmptr; } /* rescan the list for the worst score */ for (vmptr = f_str->vmax; vmptr < &f_str->vmax[maxsav] ; vmptr++) { if (vmptr->score < i && !vmptr->exact) { i = vmptr->score; f_str->lowmax = vmptr; } } f_str->lowscor = i; } /* this version of spam scans the diagonal to find the best local score, then resets the boundaries for a global alignment and re-scans */ /* NOOVERHANG allows one to score any overhanging alignment as zero. Useful for SAGE alignments. Normally, one allows overhangs because of the possibility of partial sequences. */ #undef NOOVERHANG /* May, 2005 - spam() has an intesting bug that occurs when two peptides match in order, separated by one position (the comma). In this case, spam() splits the match, and only returns the better of the two matches. So, if spam splits an alignment at a comma, it needs the ability to insert the missing match. */ int spam (const unsigned char *aa0, const unsigned char *aa1,int n1, struct savestr *dmax, int **pam2, struct f_struct *f_str) { int lpos, doffset; int tot, mtot; struct { int start, stop, score; } curv, maxv; register const unsigned char *aa0p, *aa1p; doffset = dmax->dp - f_str->noff; curv.start = dmax->start; aa1p = &aa1[dmax->start]; aa0p = &aa0[dmax->start - doffset]; tot = curv.score = maxv.score = 0; for (lpos = dmax->start; lpos <= dmax->stop; lpos++) { tot += pam2[*aa0p++][*aa1p++]; if (tot > curv.score) { curv.stop = lpos; /* here, curv.stop is actually curv.max */ curv.score = tot; } else if (tot < 0) { if (curv.score > maxv.score) { maxv.start = curv.start; maxv.stop = curv.stop; maxv.score = curv.score; } tot = curv.score = 0; curv.start = lpos+1; } } if (curv.score > maxv.score) { maxv.start = curv.start; maxv.stop = curv.stop; maxv.score = curv.score; } if (maxv.score <= 0) return 0; /* now, reset the boundaries of the alignment using aa0b[] and aa0e[], which specify the residues that start and end the segment */ maxv.start = f_str->aa0b[maxv.stop-doffset] + doffset; if (maxv.start < 0) { maxv.start = 0; #ifdef NOOVERHANG return 0; #endif } maxv.stop = f_str->aa0e[maxv.stop-doffset] + doffset; if (maxv.stop > n1) { maxv.stop = n1-1; #ifdef NOOVERHANG return 0; #endif } aa1p = &aa1[lpos = maxv.start]; aa0p = &aa0[lpos - doffset]; for (tot=0; lpos <= maxv.stop; lpos++) { tot += pam2[*aa0p++][*aa1p++]; } maxv.score = tot; /* if (maxv.start != dmax->start || maxv.stop != dmax->stop) printf(" new region: %3d %3d %3d %3d\n",maxv.start, dmax->start,maxv.stop,dmax->stop); */ dmax->start = maxv.start; dmax->stop = maxv.stop; return maxv.score; } int sconn (struct savestr **v, int n, struct f_struct *f_str, struct rstruct *rst, struct pstruct *ppst, const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int opt_prob) { int i, si, cmpp (); struct slink *start, *sl, *sj, *so, *sarr; int lstart, ltmp, tstart, plstop, ptstop, ptstart, tstop; double tatprob; int dotat; sarr = f_str->sarr; /* sort the score left to right in lib pos */ kpsort (v, n); start = NULL; rst->score[0] = 0; rst->escore = 2.0; /* for the remaining runs, see if they fit */ /* lstart/lstop -> start/stop in library sequence tstart/tstop -> start/stop in query sequence plstart/plstop -> */ for (i = 0, si = 0; i < n; i++) { /* the segment is worth adding; find out where? */ lstart = v[i]->start; ltmp = v[i]->stop; tstart = lstart - v[i]->dp + f_str->noff; tstop = ltmp - v[i]->dp + f_str->noff; /* put the run in the group */ sarr[si].vp = v[i]; sarr[si].score = v[i]->score; sarr[si].next = NULL; sarr[si].prev = NULL; sarr[si].tat = NULL; /* opt_prob for FASTS only has to do with using aa1 for priors, i.e. we always calculate tatprobs for segments in FASTS (unlike FASTF) */ if(opt_prob) { sarr[si].tatprob = calc_tatusov(NULL, &sarr[si], aa0, n0, aa1, n1, ppst->pam2[0], ppst->nsq, f_str, ppst->pseudocts, opt_prob, ppst->zsflag); if (sarr[si].tatprob < 0.0) { fprintf(stderr," negative tatprob: %lg\n",sarr[si].tatprob); sarr[si].tatprob = 1.0; } sarr[si].tat = sarr[si].newtat; } /* if it fits, then increase the score start points to the highest scoring run -> next is the second highest, etc. put the segment into the highest scoring run that it fits into */ for (sl = start; sl != NULL; sl = sl->next) { ltmp = sl->vp->start; /* plstop -> previous lstop */ plstop = sl->vp->stop; /* ptstart -> previous t(query) start */ ptstart = ltmp - sl->vp->dp + f_str->noff; /* ptstop -> previous t(query) stop */ ptstop = plstop - sl->vp->dp + f_str->noff; #ifndef FASTM /* if the previous library stop is before the current library start */ if (plstop < lstart && ( ptstop < tstart || ptstart > tstop)) #else /* if the previous library stop is before the current library start */ if (plstop < lstart && ptstop < tstart) #endif { if(!opt_prob) { sarr[si].score = sl->score + v[i]->score; sarr[si].prev = sl; break; } else { tatprob = calc_tatusov(sl, &sarr[si], aa0, n0, aa1, n1, ppst->pam2[0], ppst->nsq, f_str, ppst->pseudocts, opt_prob, ppst->zsflag); /* if our tatprob gets worse when we add this, forget it */ if(tatprob > sarr[si].tatprob) { free(sarr[si].newtat->probs); /* get rid of new tat struct */ free(sarr[si].newtat); continue; /* reuse this sarr[si] */ } else { sarr[si].tatprob = tatprob; free(sarr[si].tat->probs); /* get rid of old tat struct */ free(sarr[si].tat); sarr[si].tat = sarr[si].newtat; sarr[si].prev = sl; sarr[si].score = sl->score + v[i]->score; /* fprintf(stderr,"sconn %d added %d/%d getting %d; si: %d, tat: %g\n", i,v[i]->start, v[i]->score,sarr[si].score,si, tatprob); */ break; } } } } /* now recalculate where the score fits */ if (start == NULL) start = &sarr[si]; else { if(!opt_prob) { for (sj = start, so = NULL; sj != NULL; sj = sj->next) { if (sarr[si].score > sj->score) { sarr[si].next = sj; if (so != NULL) so->next = &sarr[si]; else start = &sarr[si]; break; } so = sj; } } else { for (sj = start, so = NULL; sj != NULL; sj = sj->next) { if ( sarr[si].tatprob < sj->tatprob || ((sarr[si].tatprob == sj->tatprob) && sarr[si].score > sj->score) ) { sarr[si].next = sj; if (so != NULL) so->next = &sarr[si]; else start = &sarr[si]; break; } so = sj; } } } si++; } if(opt_prob) { for (i = 0 ; i < si ; i++) { free(sarr[i].tat->probs); free(sarr[i].tat); } } if (start != NULL) { if(opt_prob) { rst->escore = start->tatprob; } else { rst->escore = 2.0; } rst->segnum = rst->seglen = 0; for(sj = start ; sj != NULL; sj = sj->prev) { rst->segnum++; rst->seglen += sj->vp->stop - sj->vp->start + 1; } return (start->score); } else { rst->escore = 1.0; } rst->segnum = rst->seglen = 0; return (0); } void kssort (v, n) struct savestr *v[]; int n; { int gap, i, j; struct savestr *tmp; for (gap = n / 2; gap > 0; gap /= 2) for (i = gap; i < n; i++) for (j = i - gap; j >= 0; j -= gap) { if (v[j]->score >= v[j + gap]->score) break; tmp = v[j]; v[j] = v[j + gap]; v[j + gap] = tmp; } } void kpsort (v, n) struct savestr *v[]; int n; { int gap, i, j; struct savestr *tmp; for (gap = n / 2; gap > 0; gap /= 2) for (i = gap; i < n; i++) for (j = i - gap; j >= 0; j -= gap) { if (v[j]->start <= v[j + gap]->start) break; tmp = v[j]; v[j] = v[j + gap]; v[j + gap] = tmp; } } /* calculate the 100% identical score */ int shscore(const unsigned char *aa0, const int n0, int **pam2, int nsq) { int i, sum; for (i=0,sum=0; i 0; gap /= 2) for (i = gap; i < n; i++) for (j = i - gap; j >= 0; j -= gap) { if (v[j]->stop > v[j + gap]->stop) break; tmp = v[j]; v[j] = v[j + gap]; v[j + gap] = tmp; } } int do_walign (const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int frame, struct pstruct *ppst, struct f_struct *f_str, struct a_res_str *a_res, int *have_ares) { int hoff, n10; struct rstruct rst; int ib, i; unsigned char *aa0t; const unsigned char *aa1p; struct savestr *vmptr; #ifdef TFAST f_str->n10 = n10 = aatran(aa1,f_str->aa1x,n1,frame); aa1p = f_str->aa1x; #else n10 = n1; aa1p = aa1; #endif do_fasts(aa0, n0, aa1p, n10, ppst, f_str, &rst, &hoff, 1, f_str->maxsav_w); /* the alignment portion takes advantage of the information left over in f_str after do_fasts is done. in particular, it is easy to run a modified sconn() to produce the alignments. unfortunately, the alignment display routine wants to have things encoded as with bd_align and sw_align, so we need to do that. */ /* unnecessary; do_fasts just did this */ /* kssort(f_str->vptr,f_str->nsave); */ /* at some point, we want one best score for each of the segments */ for ( ; f_str->nsave > 0; f_str->nsave--) if (f_str->vptr[f_str->nsave-1]->score >0) break; if ((aa0t = (unsigned char *)calloc(n0+1,sizeof(unsigned char)))==NULL) { fprintf(stderr," cannot allocate aa0t %d\n",n0+1); exit(1); } /* copy aa0[] into f_str->aa0t[] */ for (i=0; iaa0t[i] = aa0t[i] = aa0[i]; f_str->aa0t[i] = aa0t[i] = '\0'; a_res->nres = sconn_a (aa0t,n0,aa1p,n10,f_str, a_res, ppst); free(aa0t); a_res->res = f_str->res; *have_ares = 0; return rst.score[0]; } /* this version of sconn is modified to provide alignment information */ /* in addition, it needs to know whether a segment has been used before */ /* sconn_a fills in the res[nres] array, but this is passed implicitly through f_str->res[f_str->nres] */ int sconn_a (unsigned char *aa0, int n0, const unsigned char *aa1, int n1, struct f_struct *f_str, struct a_res_str *a_res, struct pstruct *ppst) { int i, si, cmpp (), n; unsigned char *aa0p; int sx, dx, doff, *aa0tip; struct savestr **v; struct slink *start, *sl, *sj, *so, *sarr; int lstart, lstop, ltmp, plstart, tstart, plstop, ptstop, ptstart, tstop; int *res, nres, tres; double tatprob; /* sort the score left to right in lib pos */ v = f_str->vptr; n = f_str->nsave; sarr = f_str->sarr; /* set things up in case nothing fits */ if (n <=0 || v[0]->score <= 0) return 0; if (v[0]->score < 0) { sarr[0].vp = v[0]; sarr[0].score = v[0]->score; sarr[0].next = NULL; sarr[0].prev = NULL; start = &sarr[0]; } else { krsort (v, n); /* sort from left to right in library */ start = NULL; /* for each alignment, see if it fits */ for (i = 0, si = 0; i < n; i++) { /* if the score is less than the join threshold, skip it */ if (v[i]->score < 0) continue; lstart = v[i]->start; lstop = v[i]->stop; tstart = lstart - v[i]->dp + f_str->noff; tstop = lstop - v[i]->dp + f_str->noff; /* put the alignment in the group */ sarr[si].vp = v[i]; sarr[si].score = v[i]->score; sarr[si].next = NULL; sarr[si].prev = NULL; sarr[si].tat = NULL; sarr[si].tatprob = calc_tatusov(NULL, &sarr[si], aa0, n0, aa1, n1, ppst->pam2[0], ppst->nsq, f_str, ppst->pseudocts, 1, ppst->zsflag); sarr[si].tat = sarr[si].newtat; /* if it fits, then increase the score */ /* start points to a sorted (by total score) list of candidate overlaps */ for (sl = start; sl != NULL; sl = sl->next) { plstart = sl->vp->start; plstop = sl->vp->stop; ptstart = plstart - sl->vp->dp + f_str->noff; ptstop = plstop - sl->vp->dp + f_str->noff; #ifndef FASTM if (plstart > lstop && (ptstop < tstart || ptstart > tstop)) { #else if (plstop > lstart && ptstart > tstop) { #endif /* alignment always uses probabilistic scoring ... */ /* sarr[si].score = sl->score + v[i]->score; sarr[si].prev = sl; break; */ /* quit as soon as the alignment has been added */ tatprob = calc_tatusov(sl, &sarr[si], aa0, n0, aa1, n1, ppst->pam2[0], ppst->nsq, f_str, ppst->pseudocts, 1, ppst->zsflag); /* if our tatprob gets worse when we add this, forget it */ if(tatprob > sarr[si].tatprob) { free(sarr[si].newtat->probs); /* get rid of new tat struct */ free(sarr[si].newtat); continue; /* reuse this sarr[si] */ } else { sarr[si].tatprob = tatprob; free(sarr[si].tat->probs); /* get rid of old tat struct */ free(sarr[si].tat); sarr[si].tat = sarr[si].newtat; sarr[si].prev = sl; sarr[si].score = sl->score + v[i]->score; /* fprintf(stderr,"sconn %d added %d/%d getting %d; si: %d, tat: %g\n", i,v[i]->start, v[i]->score,sarr[si].score,si, tatprob); */ break; } } } /* now recalculate the list of best scores */ if (start == NULL) start = &sarr[si]; /* put the first one in the list */ else for (sj = start, so = NULL; sj != NULL; sj = sj->next) { /* if (sarr[si].score > sj->score) { */ /* new score better than old */ if ( sarr[si].tatprob < sj->tatprob || ((sarr[si].tatprob == sj->tatprob) && sarr[si].score > sj->score) ) { sarr[si].next = sj; /* next best after new score */ if (so != NULL) so->next = &sarr[si]; /* prev_best->next points to best */ else start = &sarr[si]; /* start points to best */ break; /* stop looking */ } so = sj; /* previous candidate best */ } si++; /* increment to next alignment */ } } for (i = 0 ; i < si ; i++) { free(sarr[i].tat->probs); free(sarr[i].tat); } res = f_str->res; tres = nres = 0; aa0p = aa0; aa0tip = f_str->aa0ti; /* point to temporary index */ a_res->min1 = start->vp->start; a_res->min0 = 0; for (sj = start; sj != NULL; sj = sj->prev ) { doff = (int)(aa0p-aa0) - (sj->vp->start-sj->vp->dp+f_str->noff); /* fprintf(stderr,"doff: %3d\n",doff); */ for (dx=sj->vp->start,sx=sj->vp->start-sj->vp->dp+f_str->noff; dx <= sj->vp->stop; dx++) { *aa0tip++ = f_str->aa0i[sx]; /* save index */ *aa0p++ = f_str->aa0t[sx++]; /* save sequence at index */ tres++; res[nres++] = 0; } sj->vp->dp -= doff; if (sj->prev != NULL) { if (sj->prev->vp->start - sj->vp->stop - 1 > 0 ) tres += res[nres++] = (sj->prev->vp->start - sj->vp->stop - 1); } /* fprintf(stderr,"t0: %3d, tx: %3d, l0: %3d, lx: %3d, dp: %3d noff: %3d, score: %3d\n", sj->vp->start - sj->vp->dp + f_str->noff, sj->vp->stop - sj->vp->dp + f_str->noff, sj->vp->start,sj->vp->stop,sj->vp->dp, f_str->noff,sj->vp->score); fprintf(stderr,"%3d - %3d: %3d\n", sj->vp->start,sj->vp->stop,sj->vp->score); */ a_res->max1 = sj->vp->stop+1; a_res->max0 = a_res->max1 - sj->vp->dp + f_str->noff; } /* fprintf(stderr,"(%3d - %3d):(%3d - %3d)\n", a_res->min0,a_res->max0,a_res->min1,a_res->max1); */ /* now replace f_str->aa0t with aa0 (f_str->aa0t is permanent, aa0 is not)*/ for (i=0; iaa0t[i] = aa0[i]; return tres; } /* for fasts (and fastf), pre_cons needs to set up f_str as well as do necessary translations - for right now, simply do do_walign */ void pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) { #ifdef TFAST f_str->n10=aatran(aa1,f_str->aa1x,n1,frame); #endif } /* aln_func_vals - set up aln.qlfact, qlrev, llfact, llmult, frame, llrev */ /* call from calcons, calc_id, calc_code */ void aln_func_vals(int frame, struct a_struct *aln) { #ifdef TFAST aln->qlrev = 0; aln->qlfact= 1; aln->llfact = aln->llmult = 3; if (frame > 3) aln->llrev = 1; else aln->llrev = 0; aln->frame = 0; #else /* FASTS */ aln->llfact = aln->llmult = aln->qlfact = 1; aln->llrev = aln->qlrev = 0; aln->frame = 0; #endif } #include "a_mark.h" int calcons(const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int *nc, struct a_struct *aln, struct a_res_str a_res, struct pstruct pst, char *seqc0, char *seqc1, char *seqca, struct f_struct *f_str) { int i0, i1, nn1, n0t; int op, lenc, len_gap, nd, ns, itmp; const unsigned char *aa1p; char *sp0, *sp1, *spa; int *rp; int mins, smins; #ifndef TFAST aa1p = aa1; nn1 = n1; #else aa1p = f_str->aa1x; nn1 = f_str->n10; #endif aln->amin0 = a_res.min0; aln->amin1 = a_res.min1; aln->amax0 = a_res.max0; aln->amax1 = a_res.max1; /* first fill in the ends */ n0 -= (f_str->nm0-1); if (min(a_res.min0,a_res.min1)llen || aln->showall==1) /* will we show all the start ?*/ if (a_res.min0>=a_res.min1) { /* aa0 extends more to left */ smins=0; if (aln->showall==1) mins=a_res.min0; else mins = min(a_res.min0,aln->llen/2); aancpy(seqc0,(char *)f_str->aa0t+a_res.min0-mins,mins,pst); aln->smin0 = a_res.min0-mins; if ((mins-a_res.min1)>0) { memset(seqc1,' ',mins-a_res.min1); aancpy(seqc1+mins-a_res.min1,(char *)aa1p,a_res.min1,pst); aln->smin1 = 0; } else { aancpy(seqc1,(char *)aa1p+a_res.min1-mins,mins,pst); aln->smin1 = a_res.min1-mins; } } else { smins=0; if (aln->showall == 1) mins=a_res.min1; else mins = min(a_res.min1,aln->llen/2); aancpy(seqc1,(char *)(aa1p+a_res.min1-mins),mins,pst); aln->smin1 = a_res.min1-mins; if ((mins-a_res.min0)>0) { memset(seqc0,' ',mins-a_res.min0); aancpy(seqc0+mins-a_res.min0,(char *)f_str->aa0t,a_res.min0,pst); aln->smin0 = 0; } else { aancpy(seqc0,(char *)f_str->aa0t+a_res.min0-mins,mins,pst); aln->smin0 = a_res.min0-mins; } } else { mins= min(aln->llen/2,min(a_res.min0,a_res.min1)); smins=mins; aln->smin0=a_res.min0; aln->smin1=a_res.min1; aancpy(seqc0,(char *)f_str->aa0t+a_res.min0-mins,mins,pst); aancpy(seqc1,(char *)aa1p+a_res.min1-mins,mins,pst); } memset(seqca,M_BLANK,mins); /* now get the middle */ spa = seqca+mins; sp0 = seqc0+mins; sp1 = seqc1+mins; rp = a_res.res; n0t = lenc = len_gap = aln->nident = aln->nsim = aln->ngap_q = aln->ngap_l = op = 0; i0 = a_res.min0; i1 = a_res.min1; /* op is the previous "match/insert" operator; *rp is the current operator or repeat count */ while (i0 < a_res.max0 || i1 < a_res.max1) { if (op == 0 && *rp == 0) { /* previous was match (or start), current is match */ op = *rp++; /* get the next match/insert operator */ /* get the alignment symbol */ if ((itmp=pst.pam2[0][f_str->aa0t[i0]][aa1p[i1]])<0) { *spa = M_NEG; } else if (itmp == 0) { *spa = M_ZERO;} else {*spa = M_POS;} if (*spa == M_ZERO || *spa == M_POS) { aln->nsim++;} *sp0 = pst.sq[f_str->aa0t[i0++]]; /* get the residues for the consensus */ *sp1 = pst.sq[aa1p[i1++]]; n0t++; lenc++; if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;} sp0++; sp1++; spa++; } else { /* either op != 0 (previous was insert) or *rp != 0 (current is insert) */ if (op==0) { op = *rp++;} /* previous was match, start insert */ /* previous was insert - count through gap */ *sp0++ = '-'; *sp1++ = pst.sq[aa1p[i1++]]; *spa++ = M_DEL; op--; len_gap++; lenc++; } } *spa = '\0'; *nc = lenc-len_gap; /* now we have the middle, get the right end */ ns = mins + lenc + aln->llen; ns -= (itmp = ns %aln->llen); if (itmp>aln->llen/2) ns += aln->llen; nd = ns - (mins+lenc); if (nd > max(n0t-a_res.max0,nn1-a_res.max1)) nd = max(n0t-a_res.max0,nn1-a_res.max1); if (aln->showall==1) { nd = max(n0t-a_res.max0,nn1-a_res.max1); /* reset for showall=1 */ /* get right end */ aancpy(seqc0+mins+lenc,(char *)f_str->aa0t+a_res.max0,n0t-a_res.max0,pst); aancpy(seqc1+mins+lenc,(char *)aa1p+a_res.max1,nn1-a_res.max1,pst); /* fill with blanks - this is required to use one 'nc' */ memset(seqc0+mins+lenc+n0t-a_res.max0,' ',nd-(n0t-a_res.max0)); memset(seqc1+mins+lenc+nn1-a_res.max1,' ',nd-(nn1-a_res.max1)); } else { if ((nd-(n0t-a_res.max0))>0) { aancpy(seqc0+mins+lenc,(char *)f_str->aa0t+a_res.max0, n0t-a_res.max0,pst); memset(seqc0+mins+lenc+n0t-a_res.max0,' ',nd-(n0t-a_res.max0)); } else aancpy(seqc0+mins+lenc,(char *)f_str->aa0t+a_res.max0,nd,pst); if ((nd-(nn1-a_res.max1))>0) { aancpy(seqc1+mins+lenc,(char *)aa1p+a_res.max1,nn1-a_res.max1,pst); memset(seqc1+mins+lenc+nn1-a_res.max1,' ',nd-(nn1-a_res.max1)); } else aancpy(seqc1+mins+lenc,(char *)aa1p+a_res.max1,nd,pst); } return mins+lenc+nd; } int calcons_a(const unsigned char *aa0, unsigned char *aa0a, int n0, const unsigned char *aa1, int n1, int *nc, struct a_struct *aln, struct a_res_str a_res, struct pstruct pst, char *seqc0, char *seqc0a, char *seqc1, char *seqca, char *ann_arr, struct f_struct *f_str) { int i0, i1, nn1, n0t; int op, lenc, len_gap, nd, ns, itmp, p_ac, fnum, o_fnum; const unsigned char *aa1p; unsigned char *aa0ap; char *sp0, *sp0a, *sp1, *spa; int *rp; int mins, smins; #ifndef TFAST aa1p = aa1; nn1 = n1; #else aa1p = f_str->aa1x; nn1 = f_str->n10; #endif aln->amin0 = a_res.min0; aln->amin1 = a_res.min1; aln->amax0 = a_res.max0; aln->amax1 = a_res.max1; /* first fill in the ends */ n0 -= (f_str->nm0-1); if (min(a_res.min0,a_res.min1)llen || aln->showall==1) /* will we show all the start ?*/ if (a_res.min0>=a_res.min1) { /* aa0 extends more to left */ smins=0; if (aln->showall==1) mins=a_res.min0; else mins = min(a_res.min0,aln->llen/2); aancpy(seqc0,(char *)f_str->aa0t+a_res.min0-mins,mins,pst); aln->smin0 = a_res.min0-mins; if ((mins-a_res.min1)>0) { memset(seqc1,' ',mins-a_res.min1); aancpy(seqc1+mins-a_res.min1,(char *)aa1p,a_res.min1,pst); aln->smin1 = 0; } else { aancpy(seqc1,(char *)aa1p+a_res.min1-mins,mins,pst); aln->smin1 = a_res.min1-mins; } } else { smins=0; if (aln->showall == 1) mins=a_res.min1; else mins = min(a_res.min1,aln->llen/2); aancpy(seqc1,(char *)(aa1p+a_res.min1-mins),mins,pst); aln->smin1 = a_res.min1-mins; if ((mins-a_res.min0)>0) { memset(seqc0,' ',mins-a_res.min0); aancpy(seqc0+mins-a_res.min0,(char *)f_str->aa0t,a_res.min0,pst); aln->smin0 = 0; } else { aancpy(seqc0,(char *)f_str->aa0t+a_res.min0-mins,mins,pst); aln->smin0 = a_res.min0-mins; } } else { mins= min(aln->llen/2,min(a_res.min0,a_res.min1)); smins=mins; aln->smin0=a_res.min0; aln->smin1=a_res.min1; aancpy(seqc0,(char *)f_str->aa0t+a_res.min0-mins,mins,pst); aancpy(seqc1,(char *)aa1p+a_res.min1-mins,mins,pst); } memset(seqca,M_BLANK,mins); memset(seqc0a,' ', mins); /* now get the middle */ spa = seqca+mins; sp0 = seqc0+mins; sp0a = seqc0a+mins; sp1 = seqc1+mins; rp = a_res.res; n0t=lenc=len_gap=aln->nident=aln->nsim=aln->ngap_q=aln->ngap_l=op=p_ac= 0; i0 = a_res.min0; i1 = a_res.min1; /* op is the previous "match/insert" operator; *rp is the current operator or repeat count */ o_fnum = f_str->aa0ti[i0]; aa0ap = &aa0a[f_str->nmoff[o_fnum]+i0]; while (i0 < a_res.max0 || i1 < a_res.max1) { fnum = f_str->aa0ti[i0]; if (op == 0 && *rp == 0) { /* previous was match (or start), current is match */ if (p_ac == 0) { /* previous code was a match */ if (fnum != o_fnum) { /* continuing a match, but with a different fragment */ aa0ap = &aa0a[f_str->nmoff[fnum]]; o_fnum = fnum; } } else { p_ac = 0; o_fnum = fnum = f_str->aa0ti[i0]; aa0ap = &aa0a[f_str->nmoff[fnum]]; } op = *rp++; /* get the next match/insert operator */ /* get the alignment symbol */ if ((itmp=pst.pam2[0][f_str->aa0t[i0]][aa1p[i1]])<0) { *spa = M_NEG; } else if (itmp == 0) { *spa = M_ZERO;} else {*spa = M_POS;} if (*spa == M_ZERO || *spa == M_POS) { aln->nsim++;} *sp0 = pst.sq[f_str->aa0t[i0++]]; /* get the residues for the consensus */ *sp0a++ = ann_arr[*aa0ap++]; *sp1 = pst.sq[aa1p[i1++]]; n0t++; lenc++; if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;} sp0++; sp1++; spa++; } else { /* either op != 0 (previous was insert) or *rp != 0 (current is insert) */ if (op==0) { op = *rp++;} /* previous was match, start insert */ /* previous was insert - count through gap */ if (p_ac != 1) { p_ac = 1; fnum = f_str->aa0ti[i0]; } *sp0++ = '-'; *sp1++ = pst.sq[aa1p[i1++]]; *spa++ = M_DEL; *sp0a++ = ' '; op--; len_gap++; lenc++; } } *sp0a = *spa = '\0'; *nc = lenc-len_gap; /* now we have the middle, get the right end */ ns = mins + lenc + aln->llen; ns -= (itmp = ns %aln->llen); if (itmp>aln->llen/2) ns += aln->llen; nd = ns - (mins+lenc); if (nd > max(n0t-a_res.max0,nn1-a_res.max1)) nd = max(n0t-a_res.max0,nn1-a_res.max1); if (aln->showall==1) { nd = max(n0t-a_res.max0,nn1-a_res.max1); /* reset for showall=1 */ /* get right end */ aancpy(seqc0+mins+lenc,(char *)f_str->aa0t+a_res.max0,n0t-a_res.max0,pst); aancpy(seqc1+mins+lenc,(char *)aa1p+a_res.max1,nn1-a_res.max1,pst); /* fill with blanks - this is required to use one 'nc' */ memset(seqc0+mins+lenc+n0t-a_res.max0,' ',nd-(n0t-a_res.max0)); memset(seqc1+mins+lenc+nn1-a_res.max1,' ',nd-(nn1-a_res.max1)); } else { if ((nd-(n0t-a_res.max0))>0) { aancpy(seqc0+mins+lenc,(char *)f_str->aa0t+a_res.max0, n0t-a_res.max0,pst); memset(seqc0+mins+lenc+n0t-a_res.max0,' ',nd-(n0t-a_res.max0)); } else aancpy(seqc0+mins+lenc,(char *)f_str->aa0t+a_res.max0,nd,pst); if ((nd-(nn1-a_res.max1))>0) { aancpy(seqc1+mins+lenc,(char *)aa1p+a_res.max1,nn1-a_res.max1,pst); memset(seqc1+mins+lenc+nn1-a_res.max1,' ',nd-(nn1-a_res.max1)); } else aancpy(seqc1+mins+lenc,(char *)aa1p+a_res.max1,nd,pst); } return mins+lenc+nd; } void aaptrshuffle(unsigned char *res, int n) { int i, j; unsigned char tmp; for( i = n; --i; ) { /* j = nrand(i); if (i == j) continue; */ /* shuffle */ j = (n - 1) - i; if (i <= j ) break; /* reverse */ tmp = res[i]; res[i] = res[j]; res[j] = tmp; } } void aa0shuffle(unsigned char *aa0, int n0, struct f_struct *f_str) { int i; int j; for(i = 0 ; i < f_str->nm0 ; i++) { /* for each fragment */ aaptrshuffle(&(aa0[f_str->nmoff[i]]), f_str->nmoff[i+1] - f_str->nmoff[i] - 1 ); } } /* build an array of match/ins/del - length strings */ int calc_code(const unsigned char *aa0, const int n0, const unsigned char *aa1, const int n1, struct a_struct *aln, struct a_res_str a_res, struct pstruct pst, char *al_str, int al_str_n, struct f_struct *f_str) { int i0, i1, nn1; int op, lenc, len_gap; int p_ac, op_cnt; const unsigned char *aa1p; char tmp_cnt[20]; char sp0, sp1, *sq; int *rp; int mins, smins; int o_fnum,fnum = 0; if (pst.ext_sq_set) {sq = pst.sqx;} else {sq = pst.sq;} #ifndef TFAST aa1p = aa1; nn1 = n1; #else aa1p = f_str->aa1x; nn1 = f_str->n10; #endif aln->amin0 = a_res.min0; aln->amin1 = a_res.min1; aln->amax0 = a_res.max0; aln->amax1 = a_res.max1; rp = a_res.res; lenc = len_gap =aln->nident=aln->nsim=aln->ngap_q=aln->ngap_l=aln->nfs=op=p_ac = 0; op_cnt = 0; i0 = a_res.min0; /* start in aa0 (f_str->aa0t) */ i1 = a_res.min1; /* start in aa1 */ tmp_cnt[0]='\0'; o_fnum = f_str->aa0ti[i0] + 1; /* fragment number */ while (i0 < a_res.max0 || i1 < a_res.max1) { fnum = f_str->aa0ti[i0]+1; if (op == 0 && *rp == 0) { /* previous was match, this is match */ if (p_ac == 0) { /* previous code was a match */ if (fnum == o_fnum) { op_cnt++;} else { /* continuing a match, but with a different fragment */ update_code(al_str,al_str_n-strlen(al_str), p_ac, op_cnt, o_fnum); o_fnum = fnum; op_cnt=1; } } else { update_code(al_str,al_str_n-strlen(al_str),p_ac,op_cnt,o_fnum); op_cnt = 1; p_ac = 0; o_fnum = fnum = f_str->aa0ti[i0] + 1; } op = *rp++; lenc++; if (pst.pam2[0][f_str->aa0t[i0]][aa1p[i1]]>=0) {aln->nsim++;} sp0 = pst.sq[f_str->aa0t[i0++]]; sp1 = pst.sq[aa1p[i1++]]; if (toupper(sp0) == toupper(sp1)) aln->nident++; } else { if (op==0) op = *rp++; if (p_ac == 1) { op_cnt++;} else { update_code(al_str,al_str_n - strlen(al_str),p_ac,op_cnt,o_fnum); op_cnt = 1; p_ac = 1; fnum = f_str->aa0ti[i0] + 1; } op--; lenc++; i1++; len_gap++; } } update_code(al_str,al_str_n - strlen(al_str),p_ac,op_cnt,o_fnum); return lenc - len_gap; } /* update_code(): if "op" == 0, this is the end of a match of length "op_cnt" involving fragment "fnum" otherwise, this is an insertion (op==1) or deletion (op==2) */ void update_code(char *al_str, int al_str_max, int op, int op_cnt, int fnum) { char op_char[4]={"=-+"}; char tmp_cnt[20]; if (op == 0) sprintf(tmp_cnt,"%c%d[%d]",op_char[op],op_cnt,fnum); else sprintf(tmp_cnt,"%c%d",op_char[op],op_cnt); strncat(al_str,tmp_cnt,al_str_max); } int calc_id(const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, struct a_struct *aln, struct a_res_str a_res, struct pstruct pst, struct f_struct *f_str) { int i0, i1, nn1; int op, lenc, len_gap; const unsigned char *aa1p; int sp0, sp1; int *rp; int mins, smins; #ifndef TFAST aa1p = aa1; nn1 = n1; #else aa1p = f_str->aa1x; nn1 = f_str->n10; #endif aln->amin0 = a_res.min0; aln->amin1 = a_res.min1; aln->amax0 = a_res.max0; aln->amax1 = a_res.max1; /* first fill in the ends */ n0 -= (f_str->nm0-1); /* now get the middle */ rp = a_res.res; lenc=len_gap=aln->nident=aln->nsim=aln->ngap_q = aln->ngap_l = aln->nfs = op = 0; i0 = a_res.min0; i1 = a_res.min1; while (i0 < a_res.max0 || i1 < a_res.max1) { if (op == 0 && *rp == 0) { op = *rp++; if (pst.pam2[0][f_str->aa0t[i0]][aa1p[i1]]>=0) {aln->nsim++;} sp0 = pst.sq[f_str->aa0t[i0++]]; sp1 = pst.sq[aa1p[i1++]]; lenc++; if (toupper(sp0) == toupper(sp1)) aln->nident++; } else { if (op==0) { op = *rp++;} i1++; op--; len_gap++; lenc++; } } return lenc-len_gap; } #ifdef PCOMPLIB #include "structs.h" #include "p_mw.h" void update_params(struct qmng_str *qm_msg, struct mngmsg *m_msg, struct pstruct *ppst) { m_msg->n0 = ppst->n0 = qm_msg->n0; m_msg->nm0 = qm_msg->nm0; m_msg->escore_flg = qm_msg->escore_flg; m_msg->qshuffle = qm_msg->qshuffle; } #endif