#include #include #include #include #include "defs.h" #include "param.h" #include "tatstats.h" #ifndef PCOMPLIB #include "mw.h" #else #include "p_mw.h" #endif /* calc_priors() - calculate frequencies of amino-acids, possibly with counts */ /* generate_tatprobs() - build the table of score probabilities if the sequences are not too long */ double det(double a11, double a12, double a13, double a21, double a22, double a23, double a31, double a32, double a33); double power(double r, int p) { double tr; int neg; if (r==0.0) return(p==0?1.0:0.0); if (neg = p<0) p = -p; tr = 1.0; while (p>0) { if (p & 1) tr *= r; p >>= 1; if (p) r *= r; } return((neg? 1.0/tr: tr)); } double factorial (int a, int b) { double res = 1.0; if(a == 0) { return 1.0; } while(a > b) { res *= (double) a; a--; } return res; } void calc_priors(double *priors, struct pstruct *ppst, struct f_struct *f_str, const unsigned char *aa1, int n1, int pseudocts) { long counts[25], sum; int i; if(n1 == 0 && f_str->priors[1] > 0.0) { for(i = 1 ; i <= ppst->nsq ; i++) { priors[i] = f_str->priors[i]; } return; } if(n1 == 0) { if (ppst->dnaseq==SEQT_PROT ) { /* Robinson & Robinson residue counts from Stephen Altschul */ counts[ 1] = 35155; /* A */ counts[ 2] = 23105; /* R */ counts[ 3] = 20212; /* N */ counts[ 4] = 24161; /* D */ counts[ 5] = 8669; /* C */ counts[ 6] = 19208; /* Q */ counts[ 7] = 28354; /* E */ counts[ 8] = 33229; /* G */ counts[ 9] = 9906; /* H */ counts[10] = 23161; /* I */ counts[11] = 40625; /* L */ counts[12] = 25872; /* K */ counts[13] = 10101; /* M */ counts[14] = 17367; /* F */ counts[15] = 23435; /* P */ counts[16] = 32070; /* S */ counts[17] = 26311; /* T */ counts[18] = 5990; /* W */ counts[19] = 14488; /* Y */ counts[20] = 29012; /* V */ counts[21] = 0; /* B */ counts[22] = 0; /* Z */ counts[23] = 0; /* X */ counts[24] = 0; /* * */ } else { /* SEQT_DNA */ counts[1] = 250; counts[2] = 250; counts[3] = 250; counts[4] = 250; for (i=5; i<=ppst->nsq; i++) counts[i]=0; } } else { memset(&counts[0], 0, sizeof(counts)); for(i = 0 ; i < n1 ; i++) { if(aa1[i] > ppst->nsq || aa1[i] < 1) continue; counts[aa1[i]]++; } } sum = 0; for(i = 1 ; i <= ppst->nsq ; i++) sum += counts[i]; for(i = 1 ; i <= ppst->nsq ; i++) { if(n1 == 0) { priors[i] = (double) counts[i] / (double) sum; } else { priors[i] = ( ((double) pseudocts * f_str->priors[i]) + (double) counts[i] ) / ( (double) sum + (double) pseudocts ); } } return; } int max_score(int *scores, int nsq) { int max, i; max = -BIGNUM; for ( i = 1 ; i <= nsq ; i++ ) { if (scores[i] > max) max = scores[i]; } return max; } int min_score(int *scores, int nsq) { int min, i; min = BIGNUM; for (i = 1 ; i <= nsq ; i++ ) { if (scores[i] < min) min = scores[i]; } return min; } double calc_tatusov ( struct slink *last, struct slink *this, const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int **pam2, int nsq, struct f_struct *f_str, int pseudocts, int do_opt, int zsflag ) { int i, is, j, k; double *priors, my_priors[MAXSQ], tatprob, left_tatprob, right_tatprob; unsigned char *query = NULL; int length, maxlength, sumlength, sumscore, tmp, seg; int start, stop; struct slink *sl; int N; double *tatprobsptr; #if defined(FASTS) || defined(FASTM) int index = 0; int notokay = 0; #endif struct tat_str *oldtat = NULL, *newtat = NULL; #if defined(FASTS) || defined(FASTM) start = this->vp->start - this->vp->dp + f_str->noff; stop = this->vp->stop - this->vp->dp + f_str->noff; tmp = stop - start + 1; #else /* FASTF alignments can also hang off the end of library sequences, but no query residues are used up in the process, but we have to keep track of which are */ tmp = 0; for(i = 0, j = 0 ; i < n0 ; i++) { if (this->vp->used[i] == 1) {tmp++; } } #endif sumlength = maxlength = length = tmp; seg = 1; sumscore = this->vp->score; #if defined(FASTS) || defined(FASTM) if(f_str->aa0b[start] == start && f_str->aa0e[stop] == stop) { index |= (1 << f_str->aa0i[start]); } else { notokay |= (1 << f_str->aa0i[start]); } #endif for(sl = last; sl != NULL ; sl = sl->prev) { #if defined(FASTS) || defined(FASTM) start = sl->vp->start - sl->vp->dp + f_str->noff; stop = sl->vp->stop - sl->vp->dp + f_str->noff; tmp = stop - start + 1; #else tmp = 0; for(i = 0, j = 0 ; i < n0 ; i++) { if(sl->vp->used[i] == 1) { tmp++; } } #endif sumlength += tmp; maxlength = tmp > maxlength ? tmp : maxlength; seg++; sumscore += sl->vp->score; #if defined(FASTS) || defined(FASTM) if(f_str->aa0b[start] == start && f_str->aa0e[stop] == stop) { index |= (1 << f_str->aa0i[start]); } else { notokay |= (1 << f_str->aa0i[start]); } #endif } tatprob = -1.0; #if defined(FASTS) || defined(FASTM) /* for T?FASTS, we try to use what we've precalculated: */ /* with z = 3, do_opt is true, but we can use precalculated - with all other z's we can use precalculated only if !do_opt */ if(!notokay && f_str->tatprobs != NULL) { /* create our own newtat and copy f_str's tat into it */ index--; newtat = (struct tat_str *) malloc(sizeof(struct tat_str)); if(newtat == NULL) { fprintf(stderr, "Couldn't calloc memory for newtat.\n"); exit(1); } memcpy(newtat, f_str->tatprobs[index], sizeof(struct tat_str)); newtat->probs = (double *) calloc(f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore + 1, sizeof(double)); if(newtat->probs == NULL) { fprintf(stderr, "Coudln't calloc memory for newtat->probs.\n"); exit(1); } memcpy(newtat->probs, f_str->tatprobs[index]->probs, (f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore + 1) * sizeof(double)); tatprob = f_str->intprobs[index][sumscore - f_str->tatprobs[index]->lowscore]; } else { /* we need to recalculate from scratch */ #endif /* for T?FASTF, we're always recalculating from scratch: */ query = (unsigned char *) calloc(length, sizeof(unsigned char)); if(query == NULL) { fprintf(stderr, "Couldn't calloc memory for query.\n"); exit(1); } #if defined(FASTS) || defined(FASTM) start = this->vp->start - this->vp->dp + f_str->noff; for(i = 0, j = 0 ; i < length ; i++) { query[j++] = aa0[start + i]; } #else for(i = 0, j = 0 ; i < n0 ; i++) { if (this->vp->used[i] == 1) {query[j++] = aa0[i];} } #endif /* calc_priors - not currently implemented for aa1 dependent */ /* if( (do_opt && zsflag == 2) || zsflag == 4 ) { priors = &my_priors[0]; calc_priors(priors, f_str, aa1, n1, pseudocts); } else { priors = f_str->priors; } */ priors = f_str->priors; oldtat = (last != NULL ? last->tat : NULL); generate_tatprobs(query, 0, length - 1, priors, pam2, nsq, &newtat, oldtat); free(query); #if defined(FASTS) || defined(FASTM) } /* close the FASTS-specific if-else from above */ #endif this->newtat = newtat; if(tatprob < 0.0) { /* hasn't been set by precalculated FASTS intprobs */ /* integrate probabilities >= sumscore */ tatprobsptr = newtat->probs; is = i = newtat->highscore - newtat->lowscore; N = sumscore - newtat->lowscore; right_tatprob = 0; for ( ; i >= N; i--) { right_tatprob += tatprobsptr[i]; } left_tatprob = tatprobsptr[0]; for (i = 1 ; i < N ; i++ ) { left_tatprob += tatprobsptr[i]; } if (right_tatprob < left_tatprob) {tatprob = right_tatprob;} else {tatprob = 1.0 - left_tatprob;} tatprob /= (right_tatprob+left_tatprob); } if (maxlength > 0) { n1 += 2 * (maxlength - 1); } #ifndef FASTM tatprob *= factorial(n1 - sumlength + seg, n1 - sumlength); #else tatprob *= power(n1 - sumlength,seg)/(1< 0.01) tatprob = 1.0 - exp(-tatprob); return tatprob; } void generate_tatprobs(const unsigned char *query, int begin, int end, double *priors, int **pam2, int nsq, struct tat_str **tatarg, struct tat_str *oldtat) { int i, j, k, l, m, n, N, highscore, lowscore; int *lowrange = NULL, *highrange = NULL; double *probs = NULL, *newprobs = NULL, *priorptr, tmp; struct tat_str *tatprobs = NULL; int *pamptr, *pamptrsave; if((tatprobs = (struct tat_str *) calloc(1, sizeof(struct tat_str)))==NULL) { fprintf(stderr, "Couldn't allocate individual tatprob struct.\n"); exit(1); } n = end - begin + 1; if ( (lowrange = (int *) calloc(n, sizeof(int))) == NULL ) { fprintf(stderr, "Couldn't allocate memory for lowrange.\n"); exit(1); } if ( (highrange = (int *) calloc(n, sizeof(int))) == NULL ) { fprintf(stderr, "Couldn't allocate memory for highrange.\n"); exit(1); } /* calculate the absolute highest and lowest score possible for this */ /* segment. Also, set the range we need to iterate over at each position */ /* in the query: */ if(oldtat == NULL) { highscore = lowscore = 0; } else { highscore = oldtat->highscore; lowscore = oldtat->lowscore; } for ( i = 0 ; i < n ; i++ ) { if (query[begin+i] == 0) break; highscore = (highrange[i] = highscore + max_score(pam2[query[begin + i]], nsq)); lowscore = (lowrange[i] = lowscore + min_score(pam2[query[begin + i]], nsq)); /* fprintf(stderr, "i: %d, max: %d, min: %d, high[i]: %d, low[i]: %d, high: %d, low: %d, char: %d\n", i, max_score(pam2[query[begin + i]], nsq), min_score(pam2[query[begin + i]], nsq), highrange[i], lowrange[i], highscore, lowscore, query[begin + i]); */ } /* allocate an array of probabilities for all possible scores */ /* i.e. if highest score possible is 50 and lowest score possible */ /* is -20, then there are 50 - (-20) + 1 = 71 possible different */ /* scores (including 0): */ N = highscore - lowscore; if ( (probs = (double *) calloc(N + 1, sizeof(double))) == NULL ) { fprintf(stderr, "Couldn't allocate probability matrix : %d.\n", N + 1); exit(1); } if(oldtat == NULL) { /* for the first position, iterate over the only possible scores, */ /* summing the priors for the amino acids that can yield each score. */ pamptr = pam2[query[begin]]; for ( i = 1 ; i <= nsq ; i++ ) { if(priors[i] > 0.0) { probs[(pamptr[i] - lowscore)] += priors[i]; } } } else { /* Need to copy the data out of oldtat->probs into probs */ memcpy( &probs[oldtat->lowscore - lowscore], oldtat->probs, (oldtat->highscore - oldtat->lowscore + 1) * sizeof(double)); } if ( (newprobs = (double *) calloc(N + 1, sizeof(double))) == NULL ) { fprintf(stderr, "Couldn't allocate newprobs matrix.\n"); exit(1); } /* now for each remaining residue in the segment ... */ for ( i = (oldtat == NULL ? 1 : 0) ; i < n ; i++ ) { pamptrsave = pam2[query[begin + i]]; /* ... calculate new probability distribution .... */ /* ... for each possible score (limited to current range) ... */ for ( j = lowrange[i] - lowscore, k = highrange[i] - lowscore ; j <= k ; j++ ) { tmp = 0.0; pamptr = &pamptrsave[1]; priorptr = &priors[1]; /* ... for each of the possible alignment scores at this position ... */ for ( l = 1 ; l <= nsq ; l++) { /* make sure we don't go past highest possible score, or past the lowest possible score; not sure why this can happen */ m = j - *pamptr++; if ( m <= N && m >= 0 ) { /* update the probability of getting score j: */ tmp += probs[m] * *priorptr++; } } newprobs[j] += tmp; } /* save the new set of probabilities, get rid of old; we don't necessarily have to copy/clear all N+1 slots, we could use high/low score boundaries -- not sure that's worth the effort. */ memcpy(probs, newprobs, (N + 1) * sizeof(double)); memset(newprobs, 0, (N + 1) * sizeof(double)); } free(newprobs); free(highrange); free(lowrange); tatprobs->probs = probs; /* tatprobs->intprobs = intprobs; */ tatprobs->lowscore = lowscore; tatprobs->highscore = highscore; *tatarg = tatprobs; }