3 /* $Name: fa_34_26_5 $ - $Id: scaleswt.c,v 1.21 2006/04/12 18:50:01 wrp Exp $ */
4 /* as of 24 Sept, 2000 - scaleswn uses no global variables */
7 copyright (c) 1995, 1996, 2000, 2002 William R. Pearson
9 This version is designed for fasts/f, which used Tatusov
10 probabilities for statistical estimates, but still needs a
11 quick-and-dirty linear regression fit to rank things
13 For comparisons that obey tatusov statistics, we try whenever
14 possible to provide accurate e_scores, rather than raw scores. As a
15 result, no lambda/K fitting is required; and process_hist() can be
16 called atthe very beginning of the search to initialize some of the
17 statistics structures and find_zp().
19 find_zp() must still return a valid z_score surrogate, as
20 comp_lib.c/p2_complib.c continue to use z_score's to rank hits, save
23 If e_score's cannot be calculated, the process_hist() provides
24 linear regression fitting for conventional z_score estimates.
51 #define MAX_SSCORE 300
53 #define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */
57 #define M_LN2 0.69314718055994530942
60 #define EULER_G 0.57721566490153286060
61 #define PI_SQRT6 1.28254983016186409554
64 #define M_SQRT2 1.41421356237
66 #define LN200 5.2983173666
67 #define ZS_MAX 400.0 /* used to prevent underflow on some machines */
68 #define TOLERANCE 1.0e-12
71 /* used by AVE_STATS, REG_STATS, REGI_STATS, REG2_STATS*/
73 double ngLambda, ngK, ngH;
74 double rho, rho_e, mu, mu_e, mean_var, var_e; /* ?_e:std. error of ? */
75 /* used by REG2_STATS */
76 double rho2, mu2, var_cutoff;
77 int n_trimmed; /* excluded because of high z-score */
78 int n1_trimmed, nb_trimmed, nb_tot; /* excluded because of bin */
79 double tat_a, tat_b, tat_c, spacefactor;
84 #define AVE_STATS 0 /* no length effect, only mean/variance */
85 double find_zt(int score, double escore, int len, double comp, struct rstat_str *);
87 double find_zn(int score, double escore, int len, double comp, struct rstat_str *);
89 double power(double, int);
91 void sortbesto(double *, int );
92 extern void sortbeste(struct beststr **bptr, int nbest);
94 int proc_hist_n(struct stat_str *sptr, int n,
95 struct pstruct pst, struct hist_str *histp, int do_trim,
98 #define REG_STATS 1 /* length-regression scaled */
99 double find_zr(int score, double escore, int len, double comp, struct rstat_str *);
101 int proc_hist_r(struct stat_str *sptr, int n,
102 struct pstruct pst, struct hist_str *histp,
103 int do_trim, struct rstat_str *rs);
105 double (*find_zp)(int score, double escore, int len, double comp,
106 struct rstat_str *) = &find_zr;
110 int max_score, min_score;
112 double *score_sums, *score2_sums;
114 int max_length, min_length, zero_s;
118 static void inithist(struct llen_str *, struct pstruct, int);
119 static void free_hist( struct llen_str *);
120 static void addhist(struct llen_str *, int, int, int);
121 static void prune_hist(struct llen_str *, int, int, int, long *);
122 void inithistz(int, struct hist_str *histp);
123 void addhistz(double zs, struct hist_str *histp);
125 static void fit_llen(struct llen_str *, struct rstat_str *);
126 static void fit_llens(struct llen_str *, struct rstat_str *);
128 void linreg(double *lny, double *x, double *lnx, int n,
129 double *a, double *b, double *c, int start);
131 double calc_spacefactor(const unsigned char *, int, int, int);
133 double det(double a11, double a12, double a13,
134 double a21, double a22, double a23,
135 double a31, double a32, double a33);
137 double factorial (int a, int b);
139 /* void set_db_size(int, struct db_str *, struct hist_str *); */
146 process_hist(struct stat_str *sptr, int nstats,
149 struct hist_str *histp,
150 struct rstat_str **rs_sp,
155 struct rstat_str *rs_s;
157 if (pst.zsflag < 0) {
162 if (*rs_sp == NULL) {
163 if ((rs_s=(struct rstat_str *)calloc(1,sizeof(struct rstat_str)))==NULL) {
164 fprintf(stderr," cannot allocate rs_snion: %ld\n",sizeof(struct rstat_str));
171 memset(rs_s,0,sizeof(struct rstat_str));
174 if (m_msg.escore_flg) {
176 inithistz(MAXHIST,histp);
181 fprintf(stderr," too few sequences for sampling: %d\n",nstats);
187 rs_s->ngLambda = m_msg.Lambda;
200 return proc_hist_r(sptr, nstats,pst, histp, do_trim, rs_s);
204 calc_thresh(struct pstruct pst, int nstats,
205 double Lambda, double K, double H, double *zstrim)
208 double ave_n1, tmp_score, z, l_fact;
210 if (pst.dnaseq == SEQT_DNA || pst.dnaseq == SEQT_RNA) {
219 /* max_hscore = MAX_SSCORE; */
220 /* mean expected for pst.n0 * 400 for protein, 5000 for DNA */
221 /* we want a number of offsets that is appropriate for the database size so
226 the calculation below sets a high-score threshold using an
227 ungapped lambda, but errs towards the high-score side by using
228 E()=0.001 and calculating with 0.70*lambda, which is the correct for
229 going from ungapped to -12/-2 gapped lambda with BLOSUM50
233 tmp_score = 0.01/((double)nstats*K*(double)pst.n0*ave_n1);
234 tmp_score = -log(tmp_score)/(Lambda*l_fact);
235 max_hscore = (int)(tmp_score+0.5);
237 z = 1.0/(double)nstats;
238 z = (log(z)+EULER_G)/(-PI_SQRT6);
243 *zstrim = 10.0*z+50.0;
248 proc_hist_r(struct stat_str *sptr, int nstats,
249 struct pstruct pst, struct hist_str *histp,
250 int do_trim, struct rstat_str *rs)
255 struct llen_str llen;
260 max_hscore = calc_thresh(pst, nstats, rs->ngLambda,
261 rs->ngK, rs->ngH, &ztrim);
263 inithist(&llen,pst,max_hscore);
264 f_string = &(histp->stat_info[0]);
266 for (i = 0; i<nstats; i++)
267 addhist(&llen,sptr[i].score,sptr[i].n1, max_hscore);
268 histp->entries = nstats - llen.zero_s;
270 if ((llen.max_score - llen.min_score) < 10) {
274 return proc_hist_n(sptr, nstats, pst, histp, do_trim, rs);
277 fit_llen(&llen, rs); /* now we have rho, mu, rho2, mu2, mean_var
278 to set the parameters for the histogram */
280 if (!llen.fit_flag) { /* the fit failed, fall back to proc_hist_n */
283 return proc_hist_n(sptr,nstats, pst, histp, do_trim, rs);
286 rs->n_trimmed= rs->n1_trimmed = rs->nb_trimmed = 0;
290 for (i = 0; i < nstats; i++) {
291 zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, rs);
292 if (zs < 20.0 || zs > ztrim) {
294 prune_hist(&llen,sptr[i].score,sptr[i].n1, max_hscore,
300 /* fprintf(stderr,"Z-trimmed %d entries with z > 5.0\n", rs->n_trimmed); */
302 if (llen.fit_flag) fit_llens(&llen, rs);
304 /* fprintf(stderr,"Bin-trimmed %d entries in %d bins\n", rs->n1_trimmed,rs->nb_trimmed); */
310 /* rst all the scores in the histogram */
312 if (pst.zsflag < 10) s_string[0]='\0';
313 else if (pst.zs_win > 0)
314 sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
315 else strncpy(s_string,"(shuffled)",sizeof(s_string));
317 inithistz(MAXHIST, histp);
319 sprintf(f_string,"%s Expectation_n fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f\n mean_var=%6.4f+/-%6.3f, 0's: %d Z-trim: %d B-trim: %d in %d/%d\n Lambda= %6.4f",
321 rs->rho*LN_FACT,sqrt(rs->rho_e),rs->mu,sqrt(rs->mu_e), rs->mean_var,sqrt(rs->var_e),
322 llen.zero_s, rs->n_trimmed, rs->n1_trimmed, rs->nb_trimmed, rs->nb_tot,
323 PI_SQRT6/sqrt(rs->mean_var));
329 proc_hist_n(struct stat_str *sptr, int nstats,
330 struct pstruct pst, struct hist_str *histp,
331 int do_trim, struct rstat_str *rs)
334 double s_score, s2_score, ssd;
340 f_string = &(histp->stat_info[0]);
341 /* db->entries = db->length = db->carry = 0; */
343 max_hscore = calc_thresh(pst, nstats, rs->ngLambda,
344 rs->ngK, rs->ngH, &ztrim);
346 s_score = s2_score = 0.0;
350 for ( j = 0, i = 0; i < nstats; i++) {
351 if (sptr[i].score > 0 && sptr[i].score <= max_hscore) {
352 s_score += (ssd=(double)sptr[i].score);
353 s2_score += ssd * ssd;
356 db->length += sptr[i].n1;
357 if (db->length > LONG_MAX) {
359 db->length -= LONG_MAX;
367 rs->mu = s_score/(double)j;
368 rs->mean_var = s2_score - (double)j * rs->mu * rs->mu;
369 rs->mean_var /= (double)(j-1);
376 if (rs->mean_var < 0.01) {
377 rs->mean_var = (rs->mu > 1.0) ? rs->mu: 1.0;
380 /* now remove some scores */
386 for (i=0; i< nstats; i++) {
387 if (sptr[i].n1 < 0) continue;
388 ssd = find_zn(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, rs);
389 if (ssd > ztrim || ssd < 20.0) {
390 /* fprintf(stderr,"removing %3d %3d %4.1f\n",
391 sptr[i].score, sptr[i].n1,ssd); */
398 sptr[i].n1 = -sptr[i].n1;
403 rs->mu = s_score/(double)j;
404 rs->mean_var = s2_score - (double)j * rs->mu * rs->mu;
405 rs->mean_var /= (double)(j-1);
412 if (rs->mean_var < 0.01) {
413 rs->mean_var = (rs->mu > 1.0) ? rs->mu: 1.0;
416 if (rs->n_trimmed < LHISTC) {
418 fprintf(stderr,"nprune %d at %d\n",nprune,nit);
424 if (pst.zsflag < 10) s_string[0]='\0';
425 else if (pst.zs_win > 0)
426 sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
427 else strncpy(s_string,"(shuffled)",sizeof(s_string));
429 sprintf(f_string,"%s unscaled statistics: mu= %6.4f var=%6.4f; Lambda= %6.4f",
430 s_string, rs->mu,rs->mean_var,PI_SQRT6/sqrt(rs->mean_var));
436 This routine calculates the maximum likelihood estimates for the
437 extreme value distribution exp(-exp(-(-x-a)/b)) using the formula
439 <lambda> = x_m - sum{ x[i] * exp (-x[i]<lambda>)}/sum{exp (-x[i]<lambda>)}
440 <a> = -<1/lambda> log ( (1/nlib) sum { exp(-x[i]/<lambda> } )
442 The <a> parameter can be transformed into and K
443 of the formula: 1 - exp ( - K m n exp ( - lambda S ))
444 using the transformation: 1 - exp ( -exp -(lambda S + log(K m n) ))
445 1 - exp ( -exp( - lambda ( S + log(K m n) / lambda))
447 a = log(K m n) / lambda
448 a lambda = log (K m n)
449 exp(a lambda) = K m n
450 but from above: a lambda = log (1/nlib sum{exp( -x[i]*lambda)})
451 so: K m n = (1/n sum{ exp( -x[i] *lambda)})
452 K = sum{}/(nlib m n )
457 alloc_hist(struct llen_str *llen)
460 max_llen = llen->max;
462 if (llen->hist == NULL) {
463 llen->hist = (int *)calloc((size_t)(max_llen+1),sizeof(int));
464 llen->score_sums = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
465 llen->score2_sums =(double *)calloc((size_t)(max_llen + 1),sizeof(double));
466 llen->score_var = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
469 for (i=0; i< max_llen+1; i++) {
471 llen->score_var[i] = llen->score_sums[i] = llen->score2_sums[i] = 0.0;
476 free_hist(struct llen_str *llen)
478 if (llen->hist!=NULL) {
479 free(llen->score_var);
480 free(llen->score2_sums);
481 free(llen->score_sums);
488 inithist(struct llen_str *llen, struct pstruct pst, int max_hscore)
490 llen->max = MAX_LLEN;
492 llen->max_score = -1;
493 llen->min_score=10000;
498 llen->min_length = 10000;
499 llen->max_length = 0;
503 addhist(struct llen_str *llen, int score, int length, int max_hscore)
508 if ( score<=0 || length < LENGTH_CUTOFF) {
514 if (score < llen->min_score) llen->min_score = score;
515 if (score > llen->max_score) llen->max_score = score;
517 if (length > llen->max_length) llen->max_length = length;
518 if (length < llen->min_length) llen->min_length = length;
519 if (score > max_hscore) score = max_hscore;
521 llength = (int)(LN_FACT*log((double)length)+0.5);
523 if (llength < 0 ) llength = 0;
524 if (llength > llen->max) llength = llen->max;
525 llen->hist[llength]++;
526 dscore = (double)score;
527 llen->score_sums[llength] += dscore;
528 llen->score2_sums[llength] += dscore * dscore;
532 db->length += length;
533 if (db->length > LONG_MAX) {db->carry++;db->length -= LONG_MAX;}
537 /* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */
541 inithistz(int mh, struct hist_str *histp )
545 histp->min_hist = 20;
546 histp->max_hist = 120;
548 histp->histint = (int)
549 ((double)(histp->max_hist - histp->min_hist + 2)/(double)mh+0.5);
551 ((double)(histp->max_hist - histp->min_hist + 2)/(double)histp->histint+0.5);
553 if (histp->hist_a==NULL) {
554 if ((histp->hist_a=(int *)calloc((size_t)histp->maxh,sizeof(int)))==
556 fprintf(stderr," cannot allocate %d for histogram\n",histp->maxh);
559 else histp->histflg = 1;
562 for (i=0; i<histp->maxh; i++) histp->hist_a[i]=0;
566 /* fasts/f will not show any histogram */
568 addhistz(double zs, struct hist_str *histp)
573 prune_hist(struct llen_str *llen, int score, int length, int max_hscore,
579 if (score <= 0 || length < LENGTH_CUTOFF) return;
581 if (score > max_hscore) score = max_hscore;
583 llength = (int)(LN_FACT*log((double)length)+0.5);
585 if (llength < 0 ) llength = 0;
586 if (llength > llen->max) llength = llen->max;
587 llen->hist[llength]--;
588 dscore = (double)score;
589 llen->score_sums[llength] -= dscore;
590 llen->score2_sums[llength] -= dscore * dscore;
594 if (length < db->length) db->length -= length;
595 else {db->carry--; db->length += (LONG_MAX - (unsigned long)length);}
599 /* fit_llen: no trimming
600 (1) regress scores vs log(n) using weighted variance
601 (2) calculate mean variance after length regression
605 fit_llen(struct llen_str *llen, struct rstat_str *pr)
611 double mean_x, mean_y, var_x, var_y, covar_xy;
612 double mean_y2, covar_xy2, var_y2, dllj;
614 double sum_x, sum_y, sum_x2, sum_xy, sum_v, delta, n_w;
616 /* now fit scores to best linear function of log(n), using
617 simple linear regression */
619 for (llen->min=0; llen->min < llen->max; llen->min++)
620 if (llen->hist[llen->min]) break;
623 for (n_size=0,j = llen->min; j < llen->max; j++) {
624 if (llen->hist[j] > 1) {
625 dllj = (double)llen->hist[j];
626 llen->score_var[j] = llen->score2_sums[j]/dllj
627 - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
628 llen->score_var[j] /= (double)(llen->hist[j]-1);
629 if (llen->score_var[j] <= 0.1 ) llen->score_var[j] = 0.1;
637 sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
638 for (j = llen->min; j < llen->max; j++)
639 if (llen->hist[j] > 1) {
641 dllj = (double)llen->hist[j];
642 n_w += dllj/llen->score_var[j];
643 sum_x += dllj * x / llen->score_var[j] ;
644 sum_y += llen->score_sums[j] / llen->score_var[j];
645 sum_x2 += dllj * x * x /llen->score_var[j];
646 sum_xy += x * llen->score_sums[j]/llen->score_var[j];
656 delta = n_w * sum_x2 - sum_x * sum_x;
658 pr->rho = (n_w * sum_xy - sum_x * sum_y)/delta;
659 pr->rho_e = n_w/delta;
660 pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;
661 pr->mu_e = sum_x2/delta;
671 delta = n_w * sum_x2 - sum_x * sum_x;
672 pr->rho = (n_w * sum_xy - sum_x * sum_y)/delta;
673 pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;
676 mean_x = mean_y = mean_y2 = 0.0;
678 covar_xy = covar_xy2 = 0.0;
680 for (j = llen->min; j <= llen->max; j++)
681 if (llen->hist[j] > 1 ) {
684 mean_x += (double)llen->hist[j] * x;
685 mean_y += llen->score_sums[j];
686 var_x += (double)llen->hist[j] * x * x;
687 var_y += llen->score2_sums[j];
688 covar_xy += x * llen->score_sums[j];
690 mean_x /= n; mean_y /= n;
691 var_x = var_x / n - mean_x * mean_x;
692 var_y = var_y / n - mean_y * mean_y;
694 covar_xy = covar_xy / n - mean_x * mean_y;
696 pr->rho = covar_xy / var_x;
697 pr->mu = mean_y - pr->rho * mean_x;
699 mean_y2 = covar_xy2 = var_y2 = 0.0;
700 for (j = llen->min; j <= llen->max; j++)
701 if (llen->hist[j] > 1) {
703 u = pr->rho * x + pr->mu;
704 y2 = llen->score2_sums[j] - 2.0 * llen->score_sums[j] * u + llen->hist[j] * u * u;
706 dllj = (double)llen->hist[j];
707 fprintf(stderr,"%.2f\t%d\t%g\t%g\n",x/LN_FACT,llen->hist[j],
708 llen->score_sums[j]/dllj,y2/dllj);
713 /* fprintf(stderr,"%6.1f %4d %8d %8d %7.2f %8.2f\n",
714 x,llen->hist[j],llen->score_sums[j],llen->score2_sums[j],u,y2); */
717 pr->mean_var = mean_y2 /= (double)n;
718 covar_xy2 = covar_xy2 / (double)n - mean_x * mean_y2;
720 if (pr->mean_var <= 0.01) {
722 pr->mean_var = (pr->mu > 1.0) ? pr->mu: 1.0;
726 fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var %.4f\n",
727 pr->rho*LN_FACT,pr->mu,pr->mean_var);
729 if (n > 1) pr->var_e = (var_y2/n - mean_y2 * mean_y2)/(n-1);
730 else pr->var_e = 0.0;
732 if (llen->fit_flag) {
733 pr->rho2 = covar_xy2 / var_x;
734 pr->mu2 = pr->mean_var - pr->rho2 * mean_x;
738 pr->mu2 = pr->mean_var;
742 z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
743 else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
744 if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
746 pr->var_cutoff = pr->rho2 * LN_FACT*log(z) + pr->mu2;
749 /* fit_llens: trim high variance bins
750 (1) regress scores vs log(n) using weighted variance
751 (2) regress residuals vs log(n)
752 (3) remove high variance bins
753 (4) calculate mean variance after length regression
757 fit_llens(struct llen_str *llen, struct rstat_str *pr)
761 double x, y, y2, u, u2, v, z;
762 double mean_x, mean_y, var_x, var_y, covar_xy;
763 double mean_y2, covar_xy2;
764 double mean_u2, mean_3u2, dllj;
765 double sum_x, sum_y, sum_x2, sum_xy, sum_v, delta, n_w;
767 /* now fit scores to best linear function of log(n), using
768 simple linear regression */
770 for (llen->min=0; llen->min < llen->max; llen->min++)
771 if (llen->hist[llen->min]) break;
774 for (j = llen->min; j < llen->max; j++) {
775 if (llen->hist[j] > 1) {
776 dllj = (double)llen->hist[j];
777 llen->score_var[j] = (double)llen->score2_sums[j]/dllj
778 - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
779 llen->score_var[j] /= (double)(llen->hist[j]-1);
780 if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
785 sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
786 for (j = llen->min; j < llen->max; j++)
787 if (llen->hist[j] > 1) {
789 dllj = (double)llen->hist[j];
790 n_w += dllj/llen->score_var[j];
791 sum_x += dllj * x / llen->score_var[j] ;
792 sum_y += llen->score_sums[j] / llen->score_var[j];
793 sum_x2 += dllj * x * x /llen->score_var[j];
794 sum_xy += x * llen->score_sums[j]/llen->score_var[j];
797 delta = n_w * sum_x2 - sum_x * sum_x;
798 pr->rho = (n_w * sum_xy - sum_x * sum_y)/delta;
799 pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;
801 /* printf(" rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */
804 mean_x = mean_y = mean_y2 = 0.0;
806 covar_xy = covar_xy2 = 0.0;
808 for (j = llen->min; j <= llen->max; j++)
809 if (llen->hist[j] > 1 ) {
812 dllj = (double)llen->hist[j];
814 mean_y += llen->score_sums[j];
815 var_x += dllj * x * x;
816 var_y += llen->score2_sums[j];
817 covar_xy += x * llen->score_sums[j];
819 mean_x /= n; mean_y /= n;
820 var_x = var_x / n - mean_x * mean_x;
821 var_y = var_y / n - mean_y * mean_y;
823 covar_xy = covar_xy / n - mean_x * mean_y;
824 /* pr->rho = covar_xy / var_x;
825 pr->mu = mean_y - pr->rho * mean_x;
828 mean_y2 = covar_xy2 = 0.0;
829 for (j = llen->min; j <= llen->max; j++)
830 if (llen->hist[j] > 1) {
832 u = pr->rho * x + pr->mu;
833 y2 = llen->score2_sums[j] - 2 * llen->score_sums[j] * u + llen->hist[j] * u * u;
839 covar_xy2 = covar_xy2 / n - mean_x * mean_y2;
840 pr->rho2 = covar_xy2 / var_x;
841 pr->mu2 = mean_y2 - pr->rho2 * mean_x;
844 z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
845 else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
846 if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
848 pr->var_cutoff = pr->rho2*LN_FACT*log(z) + pr->mu2;
850 /* fprintf(stderr,"\nminimum allowed predicted variance (%0.2f) at n = %.0f\n",
855 for ( j = llen->min; j < llen->max; j++) {
857 dllj = (double)llen->hist[j];
858 x = pr->rho * y + pr->mu;
859 v = pr->rho2 * y + pr->mu2;
860 if (v < pr->var_cutoff) v = pr->var_cutoff;
861 if (llen->hist[j]> 1) {
862 u2 = (llen->score2_sums[j] - 2 * x * llen->score_sums[j] + dllj * x * x) - v*dllj;
863 mean_u2 += llen->score_var[j] = u2*u2/(llen->hist[j]-1);
865 /* fprintf(stderr," %d (%d) u2: %.2f v*ll: %.2f %.2f\n",
866 j,llen->hist[j],u2,v*dllj,sqrt(llen->score_var[j])); */
868 else llen->score_var[j] = -1.0;
871 mean_u2 = sqrt(mean_u2/(double)n_u2);
872 /* fprintf(stderr," mean s.d.: %.2f\n",mean_u2); */
874 mean_3u2 = mean_u2*3.0;
876 for (j = llen->min; j < llen->max; j++) {
877 if (llen->hist[j] <= 1) continue;
878 if (sqrt(llen->score_var[j]) > mean_3u2) {
879 /* fprintf(stderr," removing %d %d %.2f\n",
880 j, (int)(exp((double)j/LN_FACT)-0.5),
881 sqrt(llen->score_var[j]));
884 pr->n1_trimmed += llen->hist[j];
892 /* REG_STATS - Z() from rho/mu/mean_var */
893 double find_zr(int score, double escore, int length, double comp,
894 struct rstat_str *rs)
898 if (score <= 0) return 0.0;
899 if ( length < LENGTH_CUTOFF) return 0.0;
901 log_len = LN_FACT*log((double)(length));
902 /* var = rs->rho2 * log_len + rs->mu2;
903 if (var < rs->var_cutoff) var = rs->var_cutoff;
906 z = ((double)score - rs->rho * log_len - rs->mu) / sqrt(rs->mean_var);
908 return (50.0 + z*10.0);
911 double find_zt(int score, double escore, int length, double comp,
912 struct rstat_str *rs)
914 if (escore > 0.0) return -log(escore)/M_LN2;
915 else return 744.440071/M_LN2;
918 double find_zn(int score, double escore, int length, double comp,
919 struct rstat_str *rs)
923 z = ((double)score - rs->mu) / sqrt(rs->mean_var);
925 return (50.0 + z*10.0);
928 /* computes E value for a given z value, assuming extreme value distribution */
930 z_to_E(double zs, long entries, struct db_str db)
934 /* if (db->entries < 5) return (double)db.entries; */
935 if (entries < 1) { n = db.entries;}
938 if (zs > ZS_MAX) return 0.0;
940 e = exp(-PI_SQRT6 * zs - EULER_G);
941 return n * (e > .01 ? 1.0 - exp(-e) : e);
950 /* this version assumes the probability is in the ->zscore variable,
951 which is provided by this file after last_scale()
955 zs_to_bit(double zs, int n0, int n1)
957 return zs+log((double)(n0*n1))/M_LN2 ;
960 /* computes E-value for a given z value, assuming extreme value distribution */
962 zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db)
966 /* if (db->entries < 5) return 0.0; */
968 if (zs > ZS_MAX ) return 0.0;
970 if (entries < 1) entries = db.entries;
972 if (dnaseq == SEQT_DNA || dnaseq == SEQT_RNA) {
973 k = (double)db.length /(double)n1;
974 if (db.carry > 0) { k *= (double)db.carry * (double)LONG_MAX;}
976 else k = (double)entries;
978 if (k < 1.0) k = 1.0;
981 if ( zs > 100.0) e = 0.0;
986 /* computes E-value for a given z value, assuming extreme value distribution */
988 E_to_zs(double E, long entries)
993 e = E/(double)entries;
996 z = (log(e)+EULER_G)/(-PI_SQRT6);
999 z = np_to_z(1.0-e,&error);
1001 if (!error) return z*10.0+50.0;
1006 /* computes 1.0 - E value for a given z value, assuming extreme value
1009 zs_to_Ec(double zs, long entries)
1013 if (entries < 5) return 0.0;
1015 z = (zs - 50.0)/10.0;
1017 if (z > ZS_MAX) return 1.0;
1019 e = exp(-PI_SQRT6 * z - EULER_G);
1020 return (double)entries * (e > .01 ? exp(-e) : 1.0 - e);
1025 double *v; int *s, n;
1031 for (gap=n/2; gap>0; gap/=2)
1032 for (i=gap; i<n; i++)
1033 for (j=i-gap; j>=0; j -= gap) {
1034 if (v[j] >= v[j+gap]) break;
1035 tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
1036 itmp = s[j]; s[j]=s[j+gap]; s[j+gap]=itmp;
1041 sort_escore(double *v, int n)
1046 for (gap=n/2; gap>0; gap/=2) {
1047 for (i=gap; i<n; i++) {
1048 for (j=i-gap; j>=0; j -= gap) {
1049 if (v[j] <= v[j+gap]) break;
1058 /* scale_tat - compute 'a', 'b', 'c' coefficients for scaling fasts/f
1060 5-May-2003 - also calculate index for high ties
1063 scale_tat(double *escore, int nstats,
1064 long db_entries, int do_trim,
1065 struct rstat_str *rs)
1068 double *x, *lnx, *lny;
1070 /* sort_escore(escore, nstats); */
1072 while (*escore<0.0) {escore++; nstats--; }
1074 x = (double *) calloc(nstats, sizeof(double));
1076 fprintf(stderr, "Couldn't calloc tatE/x\n");
1080 lnx = (double *) calloc(nstats,sizeof(double));
1082 fprintf(stderr, "Couldn't calloc tatE/lnx\n");
1086 lny = (double *) calloc(nstats,sizeof(double));
1088 fprintf(stderr, "Couldn't calloc tatE/lny\n");
1092 for(i = 0 ; i < nstats ; ) {
1094 lny[i] = log(escore[i]);
1096 for(j = i+1 ; j < nstats ; j++) {
1097 if(escore[j] != escore[i]) break;
1100 x[i] = ((((double)i + (double)(j - i - 1)/2.0)*(double)nstats/(double)db_entries)+1.0)/(double)nstats;
1103 for(k = i+1 ; k < j ; k++) {
1114 start = 0.05 * (double) nstats;
1115 start = start > 500 ? 500 : start;
1118 linreg(lny, x, lnx, nstats, &rs->tat_a, &rs->tat_b, &rs->tat_c, start);
1120 /* I have the coefficients I need - a, b, c; free arrays */
1126 /* calculate tie_j - the index below which all scores are considered
1129 rs->tie_j = 0.005 * db_entries;
1133 linreg(double *lny, double *x, double *lnx, int n,
1134 double *a, double *b, double *c, int start) {
1136 double yf1, yf2, yf3;
1137 double f1f1, f1f2, f1f3;
1143 yf1 = yf2 = yf3 = 0.0;
1144 f1f1 = f1f2 = f1f3 = f2f2 = f2f3 = f3f3 = 0.0;
1146 for(i = start; i < n; i++) {
1147 yf1 += lny[i] * lnx[i];
1148 yf2 += lny[i] * x[i];
1151 f1f1 += lnx[i] * lnx[i];
1152 f1f2 += lnx[i] * x[i];
1155 f2f2 += x[i] * x[i];
1161 delta = det(f1f1, f1f2, f1f3, f1f2, f2f2, f2f3, f1f3, f2f3, f3f3);
1163 *a = det(yf1, f1f2, f1f3, yf2, f2f2, f2f3, yf3, f2f3, f3f3) / delta;
1164 *b = det(f1f1, yf1, f1f3, f1f2, yf2, f2f3, f1f3, yf3, f3f3) / delta;
1165 *c = det(f1f1, f1f2, yf1, f1f2, f2f2, yf2, f1f3, f2f3, yf3) / delta;
1169 double det(double a11, double a12, double a13,
1170 double a21, double a22, double a23,
1171 double a31, double a32, double a33)
1175 result = a11 * (a22 * a33 - a32 * a23);
1176 result -= a12 * (a21 * a33 - a31 * a23);
1177 result += a13 * (a21 * a32 - a31 * a22);
1183 last_stats(const unsigned char *aa0, int n0,
1184 struct stat_str *sptr, int nstats,
1185 struct beststr **bestp_arr, int nbest,
1186 struct mngmsg m_msg, struct pstruct pst,
1187 struct hist_str *histp, struct rstat_str **rs_sp)
1190 int i, nobs, nobs_t, do_trim;
1192 struct rstat_str *rs_s;
1194 if (*rs_sp == NULL) {
1195 if ((rs_s=(struct rstat_str *)calloc(1,sizeof(struct rstat_str)))==NULL) {
1196 fprintf(stderr," cannot allocate rs_s: %ld\n",sizeof(struct rstat_str));
1205 sortbeste(bestp_arr,nbest);
1208 calc_spacefactor(aa0, n0, m_msg.nm0,pst.nsq);
1210 if (pst.zsflag >= 1 && pst.zsflag <= 4) {
1211 if (m_msg.escore_flg) {
1220 if ((obs_escore = (double *)calloc(nobs,sizeof(double)))==NULL) {
1221 fprintf(stderr," cannot allocate obs_escore[%d]\n",nbest);
1225 if (m_msg.escore_flg) {
1226 for (i=nobs=0; i<nbest; i++) {
1227 if (bestp_arr[i]->escore<= 1.00)
1228 obs_escore[nobs++]=bestp_arr[i]->escore;
1232 for (i=0; i<nbest; i++) {
1233 if (bestp_arr[i]->escore >= 0.99 &&
1234 bestp_arr[i]->escore <= 1.00)
1235 obs_escore[nobs++]=bestp_arr[i]->escore;
1238 db_entries = m_msg.db.entries;
1241 for (i=nobs=0; i<nstats; i++) {
1242 if (sptr[i].escore <= 1.00 ) obs_escore[nobs++]=sptr[i].escore;
1246 for (i=0; i<nstats; i++) {
1247 if (sptr[i].escore >= 0.99 &&
1248 sptr[i].escore <= 1.0) obs_escore[nobs++]=sptr[i].escore;
1252 /* db_entries = m_msg.db.entries;*/
1255 sortbesto(obs_escore,nobs);
1257 scale_tat(obs_escore,nobs,db_entries,do_trim,rs_s);
1259 sprintf(histp->stat_info,"scaled Tatusov statistics (%d): tat_a: %6.4f tat_b: %6.4f tat_c: %6.4f",
1260 nobs,rs_s->tat_a, rs_s->tat_b, rs_s->tat_c);
1264 sprintf(histp->stat_info,"Space_factor %.4g scaled statistics",
1271 histp->stat_info[0] = '\0';
1275 /* scale_scores() takes the best (real) scores and re-scales them;
1276 beststr bptr[] must be sorted */
1279 scale_scores(struct beststr **bptr, int nbest, struct db_str db,
1280 struct pstruct pst, struct rstat_str *rs)
1283 double obs, r_a, r_b, r_c;
1285 /* this scale function absolutely requires that the results be sorted
1286 before it is used */
1288 sortbeste(bptr,nbest);
1290 if (!rs->have_tat) {
1291 for (i=0; i<nbest; i++) {
1292 bptr[i]->escore *= rs->spacefactor;
1297 /* here if more than 1000 scores */
1299 r_a = rs->tat_a; r_b = rs->tat_b; r_c = rs->tat_c;
1301 /* the problem with scaletat is that the E() value is related to
1302 ones position in the list of top scores - thus, knowing the score
1303 is not enough - one must know the rank */
1305 for(i = 0 ; i < nbest ; ) {
1306 /* take the bottom 0.5%, and the ties, and treat them all the same */
1309 (j <= (0.005 * db.entries) || bptr[j]->escore == bptr[i]->escore)
1314 /* observed frequency */
1315 obs = ((double)i + ((double)(j - i - 1)/ 2.0) + 1.0)/(double)db.entries;
1317 /* make certain ties all have the same correction */
1318 for (k = i ; k < j ; k++) {
1319 bptr[k]->escore *= obs/exp(r_a*log(obs) + r_b*obs + r_c);
1325 for (i=0; i<nbest; i++) {
1326 if(bptr[i]->escore > 0.01)
1327 bptr[i]->escore = 1.0 - exp(-bptr[i]->escore);
1328 if (bptr[i]->escore > 0.0)
1329 bptr[i]->zscore = -log(bptr[i]->escore)/M_LN2;
1331 bptr[i]->zscore = 744.440071/M_LN2;
1332 bptr[i]->escore *= pst.zdb_size;
1336 double scale_one_score (int ipos, double escore,
1338 struct rstat_str *rs) {
1343 return escore * rs->spacefactor;
1345 if (ipos < rs->tie_j) ipos = rs->tie_j/2;
1347 a = rs->tat_a; b = rs->tat_b; c = rs->tat_c;
1349 obs = ((double)ipos + 1.0)/(double)db.entries;
1351 escore *= obs/exp(a*log(obs) + b*obs + c);
1356 double calc_spacefactor(const unsigned char *aa0, int n0,
1360 return pow(2.0, (double) nm0) - 1.0;
1363 int i, j, n, l, nr, bin, k;
1367 double tmp, result = 0.0;
1369 nmoff = (n0 - nm0 + 1)/nm0+1;
1371 counts = (int **) calloc(nsq, sizeof(int *));
1372 if(counts == NULL) {
1373 fprintf(stderr, "couldn't calloc counts array!\n");
1377 counts[0] = (int *) calloc(nsq * (nmoff - 1), sizeof(int));
1378 if(counts[0] == NULL) {
1379 fprintf(stderr, "couldn't calloc counts array!\n");
1383 for(i = 0 ; i < nsq ; i++) {
1384 counts[i] = counts[0] + (i * (nmoff - 1));
1387 for(i = 0 ; i < nm0 ; i++) {
1388 for(j = 0 ; j < (nmoff - 1) ; j++) {
1389 counts[ aa0[nmoff * i + j] ] [ j ] ++;
1393 factors = (int **) calloc(nm0 + 1, sizeof(int *));
1394 if(factors == NULL) {
1395 fprintf(stderr, "Couldn't calloc factors array!\n");
1399 factors[0] = (int *) calloc((nm0 + 1) * (nmoff - 1), sizeof(int));
1400 if(factors[0] == NULL) {
1401 fprintf(stderr, "Couldn't calloc factors array!\n");
1405 for(i = 0 ; i <= nm0 ; i++) {
1406 factors[i] = factors[0] + (i * (nmoff - 1));
1410 this algorithm was adapted from the GAP4 library's NrArrangement function:
1411 The GAP Group, GAP --- Groups, Algorithms, and Programming,
1412 Version 4.1; Aachen, St Andrews, 1999.
1413 (http://www-gap.dcs.st-and.ac.uk/ gap)
1416 /* calculate K factors for each column in query: */
1417 for(j = 0 ; j < (nmoff - 1) ; j++) {
1419 /* only one way to select 0 elements */
1422 /* for each of the possible elements in this column */
1423 for(n = 0 ; n < nsq ; n++) {
1425 /* if there aren't any of these, skip it */
1426 if(counts[n][j] == 0) { continue; }
1428 /* loop over the possible lengths of the arrangement: K..0 */
1429 for(l = nm0 ; l >= 0 ; l--) {
1434 compute the number of arrangements of length <l>
1435 using only the first <n> elements of <mset>
1437 for(i = 0, k = min(counts[n][j], l); i <= k ; i++) {
1440 add the number of arrangements of length <l>
1441 that consist of <l>-<i> of the first <n>-1 elements
1442 and <i> copies of the <n>th element
1444 nr += bin * factors[l-i][j];
1445 bin = (int) ((float) bin * (float) (l - i) / (float) (i + 1));
1454 for(i = 1 ; i <= nm0 ; i++) {
1456 for(j = 0 ; j < (nmoff - 1) ; j++) {
1457 tmp *= (double) factors[i][j];
1459 tmp /= factorial(i, 1);
1472 void sortbesto (double *obs, int nobs)
1476 int incs[16] = { 1391376, 463792, 198768, 86961, 33936,
1477 13776, 4592, 1968, 861, 336,
1478 112, 48, 21, 7, 3, 1 };
1480 for ( k = 0; k < 16; k++)
1481 for (gap = incs[k], i=gap; i < nobs; i++) {
1484 while ( j >= gap && obs[j-gap] > v) {
1485 obs[j] = obs[j - gap];