16 /* calc_priors() - calculate frequencies of amino-acids, possibly with counts */
17 /* generate_tatprobs() - build the table of score probabilities if the
18 sequences are not too long */
21 det(double a11, double a12, double a13,
22 double a21, double a22, double a23,
23 double a31, double a32, double a33);
25 double power(double r, int p)
30 if (r==0.0) return(p==0?1.0:0.0);
31 if (neg = p<0) p = -p;
38 return((neg? 1.0/tr: tr));
42 factorial (int a, int b) {
46 if(a == 0) { return 1.0; }
57 calc_priors(double *priors,
59 struct f_struct *f_str,
60 const unsigned char *aa1, int n1,
66 if(n1 == 0 && f_str->priors[1] > 0.0) {
67 for(i = 1 ; i <= ppst->nsq ; i++) {
68 priors[i] = f_str->priors[i];
74 if (ppst->dnaseq==SEQT_PROT ) {
76 /* Robinson & Robinson residue counts from Stephen Altschul */
77 counts[ 1] = 35155; /* A */
78 counts[ 2] = 23105; /* R */
79 counts[ 3] = 20212; /* N */
80 counts[ 4] = 24161; /* D */
81 counts[ 5] = 8669; /* C */
82 counts[ 6] = 19208; /* Q */
83 counts[ 7] = 28354; /* E */
84 counts[ 8] = 33229; /* G */
85 counts[ 9] = 9906; /* H */
86 counts[10] = 23161; /* I */
87 counts[11] = 40625; /* L */
88 counts[12] = 25872; /* K */
89 counts[13] = 10101; /* M */
90 counts[14] = 17367; /* F */
91 counts[15] = 23435; /* P */
92 counts[16] = 32070; /* S */
93 counts[17] = 26311; /* T */
94 counts[18] = 5990; /* W */
95 counts[19] = 14488; /* Y */
96 counts[20] = 29012; /* V */
97 counts[21] = 0; /* B */
98 counts[22] = 0; /* Z */
99 counts[23] = 0; /* X */
100 counts[24] = 0; /* * */
102 else { /* SEQT_DNA */
107 for (i=5; i<=ppst->nsq; i++) counts[i]=0;
110 memset(&counts[0], 0, sizeof(counts));
112 for(i = 0 ; i < n1 ; i++) {
113 if(aa1[i] > ppst->nsq || aa1[i] < 1) continue;
119 for(i = 1 ; i <= ppst->nsq ; i++) sum += counts[i];
121 for(i = 1 ; i <= ppst->nsq ; i++) {
123 priors[i] = (double) counts[i] / (double) sum;
125 priors[i] = ( ((double) pseudocts * f_str->priors[i]) + (double) counts[i] ) / ( (double) sum + (double) pseudocts );
133 max_score(int *scores, int nsq) {
138 for ( i = 1 ; i <= nsq ; i++ ) {
139 if (scores[i] > max) max = scores[i];
146 min_score(int *scores, int nsq) {
151 for (i = 1 ; i <= nsq ; i++ ) {
152 if (scores[i] < min) min = scores[i];
159 calc_tatusov ( struct slink *last,
161 const unsigned char *aa0, int n0,
162 const unsigned char *aa1, int n1,
164 struct f_struct *f_str,
172 double *priors, my_priors[MAXSQ], tatprob, left_tatprob, right_tatprob;
173 unsigned char *query = NULL;
174 int length, maxlength, sumlength, sumscore, tmp, seg;
180 #if defined(FASTS) || defined(FASTM)
185 struct tat_str *oldtat = NULL, *newtat = NULL;
187 #if defined(FASTS) || defined(FASTM)
188 start = this->vp->start - this->vp->dp + f_str->noff;
189 stop = this->vp->stop - this->vp->dp + f_str->noff;
190 tmp = stop - start + 1;
193 FASTF alignments can also hang off the end of library sequences,
194 but no query residues are used up in the process, but we have to
195 keep track of which are
198 for(i = 0, j = 0 ; i < n0 ; i++) {
199 if (this->vp->used[i] == 1) {tmp++; }
203 sumlength = maxlength = length = tmp;
205 sumscore = this->vp->score;
207 #if defined(FASTS) || defined(FASTM)
208 if(f_str->aa0b[start] == start && f_str->aa0e[stop] == stop) {
209 index |= (1 << f_str->aa0i[start]);
211 notokay |= (1 << f_str->aa0i[start]);
215 for(sl = last; sl != NULL ; sl = sl->prev) {
217 #if defined(FASTS) || defined(FASTM)
218 start = sl->vp->start - sl->vp->dp + f_str->noff;
219 stop = sl->vp->stop - sl->vp->dp + f_str->noff;
220 tmp = stop - start + 1;
223 for(i = 0, j = 0 ; i < n0 ; i++) {
224 if(sl->vp->used[i] == 1) {
230 maxlength = tmp > maxlength ? tmp : maxlength;
232 sumscore += sl->vp->score;
234 #if defined(FASTS) || defined(FASTM)
235 if(f_str->aa0b[start] == start && f_str->aa0e[stop] == stop) {
236 index |= (1 << f_str->aa0i[start]);
238 notokay |= (1 << f_str->aa0i[start]);
246 #if defined(FASTS) || defined(FASTM)
248 /* for T?FASTS, we try to use what we've precalculated: */
250 /* with z = 3, do_opt is true, but we can use precalculated - with
251 all other z's we can use precalculated only if !do_opt */
252 if(!notokay && f_str->tatprobs != NULL) {
253 /* create our own newtat and copy f_str's tat into it */
256 newtat = (struct tat_str *) malloc(sizeof(struct tat_str));
258 fprintf(stderr, "Couldn't calloc memory for newtat.\n");
262 memcpy(newtat, f_str->tatprobs[index], sizeof(struct tat_str));
264 newtat->probs = (double *) calloc(f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore + 1, sizeof(double));
265 if(newtat->probs == NULL) {
266 fprintf(stderr, "Coudln't calloc memory for newtat->probs.\n");
270 memcpy(newtat->probs, f_str->tatprobs[index]->probs,
271 (f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore + 1) * sizeof(double));
274 tatprob = f_str->intprobs[index][sumscore - f_str->tatprobs[index]->lowscore];
276 } else { /* we need to recalculate from scratch */
279 /* for T?FASTF, we're always recalculating from scratch: */
281 query = (unsigned char *) calloc(length, sizeof(unsigned char));
283 fprintf(stderr, "Couldn't calloc memory for query.\n");
287 #if defined(FASTS) || defined(FASTM)
288 start = this->vp->start - this->vp->dp + f_str->noff;
289 for(i = 0, j = 0 ; i < length ; i++) {
290 query[j++] = aa0[start + i];
293 for(i = 0, j = 0 ; i < n0 ; i++) {
294 if (this->vp->used[i] == 1) {query[j++] = aa0[i];}
298 /* calc_priors - not currently implemented for aa1 dependent */
300 if( (do_opt && zsflag == 2) || zsflag == 4 ) {
301 priors = &my_priors[0];
302 calc_priors(priors, f_str, aa1, n1, pseudocts);
304 priors = f_str->priors;
308 priors = f_str->priors;
309 oldtat = (last != NULL ? last->tat : NULL);
311 generate_tatprobs(query, 0, length - 1, priors, pam2, nsq, &newtat, oldtat);
314 #if defined(FASTS) || defined(FASTM)
315 } /* close the FASTS-specific if-else from above */
318 this->newtat = newtat;
320 if(tatprob < 0.0) { /* hasn't been set by precalculated FASTS intprobs */
322 /* integrate probabilities >= sumscore */
323 tatprobsptr = newtat->probs;
325 is = i = newtat->highscore - newtat->lowscore;
326 N = sumscore - newtat->lowscore;
329 for ( ; i >= N; i--) {
330 right_tatprob += tatprobsptr[i];
333 left_tatprob = tatprobsptr[0];
334 for (i = 1 ; i < N ; i++ ) {
335 left_tatprob += tatprobsptr[i];
338 if (right_tatprob < left_tatprob) {tatprob = right_tatprob;}
339 else {tatprob = 1.0 - left_tatprob;}
341 tatprob /= (right_tatprob+left_tatprob);
345 n1 += 2 * (maxlength - 1);
349 tatprob *= factorial(n1 - sumlength + seg, n1 - sumlength);
351 tatprob *= power(n1 - sumlength,seg)/(1<<seg);
355 tatprob = 1.0 - exp(-tatprob);
361 generate_tatprobs(const unsigned char *query,
367 struct tat_str **tatarg,
368 struct tat_str *oldtat)
371 int i, j, k, l, m, n, N, highscore, lowscore;
372 int *lowrange = NULL, *highrange = NULL;
373 double *probs = NULL, *newprobs = NULL, *priorptr, tmp;
374 struct tat_str *tatprobs = NULL;
375 int *pamptr, *pamptrsave;
377 if((tatprobs = (struct tat_str *) calloc(1, sizeof(struct tat_str)))==NULL) {
378 fprintf(stderr, "Couldn't allocate individual tatprob struct.\n");
384 if ( (lowrange = (int *) calloc(n, sizeof(int))) == NULL ) {
385 fprintf(stderr, "Couldn't allocate memory for lowrange.\n");
389 if ( (highrange = (int *) calloc(n, sizeof(int))) == NULL ) {
390 fprintf(stderr, "Couldn't allocate memory for highrange.\n");
394 /* calculate the absolute highest and lowest score possible for this */
395 /* segment. Also, set the range we need to iterate over at each position */
398 highscore = lowscore = 0;
400 highscore = oldtat->highscore;
401 lowscore = oldtat->lowscore;
404 for ( i = 0 ; i < n ; i++ ) {
406 if (query[begin+i] == 0) break;
409 (highrange[i] = highscore + max_score(pam2[query[begin + i]], nsq));
412 (lowrange[i] = lowscore + min_score(pam2[query[begin + i]], nsq));
415 fprintf(stderr, "i: %d, max: %d, min: %d, high[i]: %d, low[i]: %d, high: %d, low: %d, char: %d\n",
417 max_score(pam2[query[begin + i]], nsq),
418 min_score(pam2[query[begin + i]], nsq),
419 highrange[i], lowrange[i],
420 highscore, lowscore, query[begin + i]);
424 /* allocate an array of probabilities for all possible scores */
425 /* i.e. if highest score possible is 50 and lowest score possible */
426 /* is -20, then there are 50 - (-20) + 1 = 71 possible different */
427 /* scores (including 0): */
428 N = highscore - lowscore;
429 if ( (probs = (double *) calloc(N + 1, sizeof(double))) == NULL ) {
430 fprintf(stderr, "Couldn't allocate probability matrix : %d.\n", N + 1);
435 /* for the first position, iterate over the only possible scores, */
436 /* summing the priors for the amino acids that can yield each score. */
437 pamptr = pam2[query[begin]];
438 for ( i = 1 ; i <= nsq ; i++ ) {
439 if(priors[i] > 0.0) {
440 probs[(pamptr[i] - lowscore)] += priors[i];
444 /* Need to copy the data out of oldtat->probs into probs */
445 memcpy( &probs[oldtat->lowscore - lowscore],
447 (oldtat->highscore - oldtat->lowscore + 1) * sizeof(double));
450 if ( (newprobs = (double *) calloc(N + 1, sizeof(double))) == NULL ) {
451 fprintf(stderr, "Couldn't allocate newprobs matrix.\n");
455 /* now for each remaining residue in the segment ... */
456 for ( i = (oldtat == NULL ? 1 : 0) ; i < n ; i++ ) {
458 pamptrsave = pam2[query[begin + i]];
460 /* ... calculate new probability distribution .... */
462 /* ... for each possible score (limited to current range) ... */
463 for ( j = lowrange[i] - lowscore,
464 k = highrange[i] - lowscore ;
469 pamptr = &pamptrsave[1];
470 priorptr = &priors[1];
471 /* ... for each of the possible alignment scores at this position ... */
476 /* make sure we don't go past highest possible score, or past
477 the lowest possible score; not sure why this can happen */
479 if ( m <= N && m >= 0 ) {
480 /* update the probability of getting score j: */
481 tmp += probs[m] * *priorptr++;
487 /* save the new set of probabilities, get rid of old; we don't
488 necessarily have to copy/clear all N+1 slots, we could use
489 high/low score boundaries -- not sure that's worth the
491 memcpy(probs, newprobs, (N + 1) * sizeof(double));
492 memset(newprobs, 0, (N + 1) * sizeof(double));
499 tatprobs->probs = probs;
500 /* tatprobs->intprobs = intprobs; */
501 tatprobs->lowscore = lowscore;
502 tatprobs->highscore = highscore;