3 /* $Name: fa_34_26_5 $ - $Id: scaleswn.c,v 1.60 2007/04/26 18:32:48 wrp Exp $ */
5 /* as of 24 Sept, 2000 - scaleswn uses no global variables */
8 Provide statistical estimates using an extreme value distribution
10 copyright (c) 1995, 1996, 2000 William R. Pearson
12 This code provides multiple methods for scaling sequence
13 similarity scores to correct for length effects.
15 Currently, six methods are available:
17 pst.zsflag = 0 - no scaling (AVE_STATS)
18 pst.zsflag = 1 - regression-scaled scores (REG_STATS)
19 pst.zsflag = 2 - (revised) MLE Lmabda/K scaled scores (MLE_STATS)
20 pst.zsflag = 3 - scaling using Altschul's parameters (AG_STATS)
21 pst.zsflag = 4 - regression-scaled with iterative outlier removal (REGI_STATS)
22 pst.zsflag = 5 = like 1, but length scaled variance (REG2_STATS)
23 pst.zsflag = 6 = like 2, but uses lambda composition/scale (MLE2_STATS)
24 pst.zsflag = 11 = 10 + 1 - use random shuffles, method 1
47 #define MAX_SSCORE 300
49 #define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */
53 #define M_LN2 0.69314718055994530942
55 #define EULER_G 0.57721566490153286060
56 #define PI_SQRT6 1.28254983016186409554
59 #define M_SQRT2 1.41421356237
61 #define LN200 5.2983173666
62 #define ZS_MAX 400.0 /* used to prevent underflow on some machines */
63 #define TOLERANCE 1.0e-12
66 /* used by AVE_STATS, REG_STATS, REGI_STATS, REG2_STATS*/
68 double rho, rho_e, mu, mu_e, mean_var, var_e; /* ?_e:std. error of ? */
69 /* used by REG2_STATS */
70 double rho2, mu2, var_cutoff;
71 int n_trimmed; /* excluded because of high z-score */
72 int n1_trimmed, nb_trimmed, nb_tot; /* excluded because of bin */
75 /* used by AG_STATS, MLE_STATS */
77 double K, Lambda, H, a_n0f, a_n0;
80 /* used by MLE2_STATS */
81 struct mle2_stat_str {
83 double mle2_a0, mle2_a1, mle2_a2, mle2_b1;
84 double ave_comp, max_comp, ave_H;
88 double ngLambda, ngK, ngH;
91 struct ag_stat_str ag;
92 struct mle2_stat_str m2;
96 #define AVE_STATS 0 /* no length effect, only mean/variance */
97 double find_zn(int score, double escore, int len, double comp, struct pstat_str *);
99 int proc_hist_n(struct stat_str *sptr, int n,
100 struct pstruct pst, struct hist_str *histp, int do_trim,
103 #define REG_STATS 1 /* length-regression scaled */
104 #define REGI_STATS 4 /* length regression, iterative */
105 double find_zr(int score, double escore, int len, double comp, struct pstat_str *);
106 int proc_hist_r(struct stat_str *sptr, int n,
107 struct pstruct pst, struct hist_str *histp,
108 int do_trim, struct pstat_str *pu);
110 #define MLE_STATS 2 /* MLE for lambda, K */
111 double find_ze(int score, double escore, int len, double comp, struct pstat_str *);
112 int proc_hist_ml(struct stat_str *sptr, int n,
113 struct pstruct pst, struct hist_str *histp, int do_trim,
116 #define AG_STATS 3 /* Altschul-Gish parameters */
117 double find_za(int score, double escore, int len, double comp, struct pstat_str *);
118 int proc_hist_a(struct stat_str *sptr, int n,
119 struct pstruct pst, struct hist_str *histp, int do_trim,
122 #define REG2_STATS 5 /* length regression on mean + variance */
123 double find_zr2(int score, double escore, int len, double comp, struct pstat_str *);
124 int proc_hist_r2(struct stat_str *sptr, int n,
125 struct pstruct pst, struct hist_str *histp, int do_trim,
128 #define MLE2_STATS 6 /* MLE stats using comp(lambda) */
129 double find_ze2(int score, double escore, int length, double comp, struct pstat_str *);
130 int proc_hist_ml2(struct stat_str *sptr, int n,
131 struct pstruct pst, struct hist_str *histp, int do_trim,
136 double find_zl(int score, double escore, int len, double comp, struct pstat_str *);
137 int proc_hist_ln(struct stat_str *sptr, int n,
138 struct pstruct pst, struct hist_str *histp, int do_trim,
142 /* scaleswn.c local variables that belong in their own structure */
144 double (*find_zp)(int score, double escore, int len, double comp, struct pstat_str *) = &find_zr;
146 /* void s_sort (double **ptr, int nbest); */
147 void ss_sort ( int *sptr, int n);
151 int max_score, min_score;
153 double *score_sums, *score2_sums;
155 int max_length, min_length, zero_s;
159 static void inithist(struct llen_str *, struct pstruct, int);
160 static void free_hist( struct llen_str *);
161 static void addhist(struct llen_str *, int, int, int);
162 static void prune_hist(struct llen_str *, int, int, int, long *);
163 void inithistz(int, struct hist_str *histp);
164 void addhistz(double zs, struct hist_str *histp);
165 void addhistzp(double zs, struct hist_str *histp);
167 static void fit_llen(struct llen_str *, struct rstat_str *);
168 static void fit_llen2(struct llen_str *, struct rstat_str *);
169 static void fit_llens(struct llen_str *, struct rstat_str *);
171 extern void sortbeste(struct beststr **bptr, int nbest);
173 /* void set_db_size(int, struct db_str *, struct hist_str *); */
180 process_hist(struct stat_str *sptr, int nstats,
183 struct hist_str *histp,
184 struct pstat_str **ps_sp,
187 int zsflag, do_trim, i;
188 struct pstat_str *ps_s;
190 if (pst.zsflag < 0) {
195 if (*ps_sp == NULL) {
196 if ((ps_s=(struct pstat_str *)calloc(1,sizeof(struct pstat_str)))==NULL) {
197 fprintf(stderr," cannot allocate pstat_union: %ld\n",sizeof(struct pstat_str));
204 memset(ps_s,0,sizeof(struct pstat_str));
207 ps_s->ngLambda = m_msg.Lambda;
211 if (nstats < 10) pst.zsflag = AG_STATS;
218 tmpf=fopen("tmp_stats.res","w+");
219 for (i=0; i<nstats; i++) fprintf(tmpf,"%d\t%d\n",sptr[i].score,sptr[i].n1);
232 if (zsflag==LN_STATS) {
234 pst.zsflag = proc_hist_ln(sptr, nstats, histp, do_trim, ps_s);
237 if (zsflag==MLE_STATS) {
239 pst.zsflag = proc_hist_ml(sptr, nstats, pst, histp, do_trim, ps_s);
242 else if (zsflag==REG_STATS) {
244 pst.zsflag = proc_hist_r(sptr, nstats,pst, histp, do_trim, ps_s);
246 else if (zsflag==AG_STATS) {
248 pst.zsflag = proc_hist_a(sptr, nstats, pst, histp, do_trim, ps_s);
250 else if (zsflag==REGI_STATS) {
252 pst.zsflag = proc_hist_r2(sptr,nstats, pst, histp, do_trim, ps_s);
254 else if (zsflag==REG2_STATS) {
256 pst.zsflag = proc_hist_r(sptr,nstats,pst, histp, do_trim, ps_s);
258 #if !defined(TFAST) && !defined(FASTX)
259 else if (zsflag == MLE2_STATS) {
261 pst.zsflag = proc_hist_ml2(sptr, nstats, pst, histp, do_trim, ps_s);
264 else { /* AVE_STATS */
266 pst.zsflag = proc_hist_n(sptr,nstats, pst, histp, do_trim, ps_s);
270 histp->entries = nstats; /* db->entries = 0; */
271 inithistz(MAXHIST, histp);
272 for (i = 0; i < nstats; i++) {
273 if (sptr[i].n1 < 0) sptr[i].n1 = -sptr[i].n1;
274 addhistz(find_zp(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp,ps_s),
282 calc_thresh(struct pstruct pst, int nstats,
283 double Lambda, double K, double H, double *zstrim)
286 double ave_n1, tmp_score, z, l_fact;
288 if (pst.dnaseq == SEQT_DNA || pst.dnaseq == SEQT_RNA) {
297 /* max_hscore = MAX_SSCORE; */
298 /* mean expected for pst.n0 * 400 for protein, 5000 for DNA */
299 /* we want a number of offsets that is appropriate for the database size so
304 the calculation below sets a high-score threshold using an
305 ungapped lambda, but errs towards the high-score side by using
306 E()=0.001 and calculating with 0.70*lambda, which is the correct for
307 going from ungapped to -12/-2 gapped lambda with BLOSUM50
311 tmp_score = 0.01/((double)nstats*K*(double)pst.n0*ave_n1);
312 tmp_score = -log(tmp_score)/(Lambda*l_fact);
313 max_hscore = (int)(tmp_score+0.5);
315 z = 1.0/(double)nstats;
316 z = (log(z)+EULER_G)/(- PI_SQRT6);
321 *zstrim = 10.0*z+50.0;
326 proc_hist_r(struct stat_str *sptr, int nstats,
327 struct pstruct pst, struct hist_str *histp,
328 int do_trim, struct pstat_str *pu)
333 struct llen_str llen;
338 max_hscore = calc_thresh(pst, nstats, pu->ngLambda,
339 pu->ngK, pu->ngH, &ztrim);
341 inithist(&llen,pst,max_hscore);
343 f_string = &(histp->stat_info[0]);
345 for (i = 0; i<nstats; i++)
346 addhist(&llen,sptr[i].score,sptr[i].n1, max_hscore);
348 if ((llen.max_score - llen.min_score) < 10) {
352 return proc_hist_n(sptr, nstats, pst, histp, do_trim, pu);
355 fit_llen(&llen, &(pu->r_u.rg)); /* now we have rho, mu, rho2, mu2, mean_var
356 to set the parameters for the histogram */
358 if (!llen.fit_flag) { /* the fit failed, fall back to proc_hist_ml */
361 return proc_hist_ml(sptr,nstats, pst, histp, do_trim, pu);
364 pu->r_u.rg.n_trimmed= pu->r_u.rg.n1_trimmed = pu->r_u.rg.nb_trimmed = 0;
368 for (i = 0; i < nstats; i++) {
369 zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, pu);
370 if (zs < 20.0 || zs > ztrim) {
371 pu->r_u.rg.n_trimmed++;
372 prune_hist(&llen,sptr[i].score,sptr[i].n1, max_hscore,
378 /* fprintf(stderr,"Z-trimmed %d entries with z > 5.0\n", pu->r_u.rg.n_trimmed); */
380 if (llen.fit_flag) fit_llens(&llen, &(pu->r_u.rg));
382 /* fprintf(stderr,"Bin-trimmed %d entries in %d bins\n", pu->r_u.rg.n1_trimmed,pu->r_u.rg.nb_trimmed); */
387 /* put all the scores in the histogram */
389 if (pst.zsflag < 10) s_string[0]='\0';
390 else if (pst.zs_win > 0)
391 sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
392 else strncpy(s_string,"(shuffled)",sizeof(s_string));
394 if (pst.zsflag == REG2_STATS || pst.zsflag == 10+REG2_STATS)
395 sprintf(f_string,"%s Expectation_v fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f;\n rho2=%6.2f; mu2= %6.2f, 0's: %d Z-trim: %d B-trim: %d in %d/%d",
396 s_string, pu->r_u.rg.rho*LN_FACT,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e),
397 pu->r_u.rg.rho2,pu->r_u.rg.mu2,llen.zero_s,
398 pu->r_u.rg.n_trimmed, pu->r_u.rg.n1_trimmed, pu->r_u.rg.nb_trimmed, pu->r_u.rg.nb_tot);
400 sprintf(f_string,"%s Expectation_n fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f\n mean_var=%6.4f+/-%6.3f, 0's: %d Z-trim: %d B-trim: %d in %d/%d\n Lambda= %8.6f",
402 pu->r_u.rg.rho*LN_FACT,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e), pu->r_u.rg.mean_var,sqrt(pu->r_u.rg.var_e),
403 llen.zero_s, pu->r_u.rg.n_trimmed, pu->r_u.rg.n1_trimmed, pu->r_u.rg.nb_trimmed, pu->r_u.rg.nb_tot,
404 PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
410 proc_hist_r2(struct stat_str *sptr, int nstats,
411 struct pstruct pst, struct hist_str *histp,
412 int do_trim, struct pstat_str *pu)
414 int i, nit, nprune, max_hscore;
418 struct llen_str llen;
423 max_hscore = calc_thresh(pst, nstats, pu->ngLambda,
424 pu->ngK, pu->ngH, &ztrim);
426 inithist(&llen, pst,max_hscore);
427 f_string = &(histp->stat_info[0]);
429 for (i = 0; i<nstats; i++)
430 addhist(&llen,sptr[i].score,sptr[i].n1,max_hscore);
432 pu->r_u.rg.n_trimmed= pu->r_u.rg.n1_trimmed = pu->r_u.rg.nb_trimmed = 0;
433 if (do_trim) nit = 5;
438 fit_llen2(&llen, &(pu->r_u.rg));
440 for (i = 0; i < nstats; i++) {
441 if (sptr[i].n1 < 0) continue;
442 zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp,pu);
443 if (zs < 20.0 || zs > ztrim ) {
445 pu->r_u.rg.n_trimmed++;
446 prune_hist(&llen,sptr[i].score,sptr[i].n1,max_hscore,
448 sptr[i].n1 = -sptr[i].n1;
451 /* fprintf(stderr," %d Z-trimmed at %d\n",nprune,nit); */
452 if (nprune < LHISTC) { break; }
455 fit_llen(&llen, &(pu->r_u.rg));
459 if (pst.zsflag < 10) s_string[0]='\0';
460 else if (pst.zs_win > 0)
461 sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
462 else strncpy(s_string,"(shuffled)",sizeof(s_string));
464 sprintf(f_string,"%s Expectation_i fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f;\n mean_var=%6.4f+/-%6.3f 0's: %d Z-trim: %d N-it: %d\n Lambda= %8.6f",
466 pu->r_u.rg.rho*LN_FACT,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e),
467 pu->r_u.rg.mean_var,sqrt(pu->r_u.rg.var_e),llen.zero_s,pu->r_u.rg.n_trimmed, nit,
468 PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
472 /* this procedure implements Altschul's pre-calculated values for lambda, K */
474 #include "alt_parms.h"
477 look_p(struct alt_p parm[], int gap, int ext,
478 double *K, double *Lambda, double *H);
481 proc_hist_a(struct stat_str *sptr, int nstats,
482 struct pstruct pst, struct hist_str *histp,
483 int do_trim, struct pstat_str *pu)
488 int t_gdelval, t_ggapval;
491 t_gdelval = pst.gdelval;
492 t_ggapval = pst.ggapval;
494 t_gdelval = pst.gdelval+pst.ggapval;
495 t_ggapval = pst.ggapval;
498 f_string = &(histp->stat_info[0]);
500 if (strcmp(pst.pamfile,"BL50")==0 || strcmp(pst.pamfile,"BLOSUM50")==0)
501 r_v = look_p(bl50_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
502 else if (strcmp(pst.pamfile,"BL62")==0 || strcmp(pst.pamfile,"BLOSUM62")==0)
503 r_v = look_p(bl62_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
504 else if (strcmp(pst.pamfile,"BL80")==0 || strcmp(pst.pamfile,"BLOSUM80")==0)
505 r_v = look_p(bl80_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
506 else if (strcmp(pst.pamfile,"P250")==0)
507 r_v = look_p(p250_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
508 else if (strcmp(pst.pamfile,"P120")==0)
509 r_v = look_p(p120_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
510 else if (strcmp(pst.pamfile,"MD_10")==0)
511 r_v = look_p(md10_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
512 else if (strcmp(pst.pamfile,"MD_20")==0)
513 r_v = look_p(md20_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
514 else if (strcmp(pst.pamfile,"MD_40")==0)
515 r_v = look_p(md40_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
516 else if (strcmp(pst.pamfile,"DNA")==0 || strcmp(pst.pamfile,"+5/-4")==0)
517 r_v = look_p(nt54_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
518 else if (strcmp(pst.pamfile,"+3/-2")==0)
519 r_v = look_p(nt32_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
520 else if (strcmp(pst.pamfile,"+1/-3")==0)
521 r_v = look_p(nt13_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
524 pu->r_u.ag.Lambda = Lambda;
529 fprintf(stderr,"Parameters not available for: %s: %d/%d\n",
530 pst.pamfile,t_gdelval,t_ggapval);
533 return proc_hist_r(sptr, nstats,pst, histp, do_trim, pu);
537 fprintf(stderr," the parameters are: Lambda: %5.3f K: %5.3f H: %5.3f\n",
541 pu->r_u.ag.a_n0 = (double)pst.n0;
542 pu->r_u.ag.a_n0f = log (K * pu->r_u.ag.a_n0)/H;
544 sprintf(f_string,"Altschul/Gish params: n0: %d Lambda: %5.3f K: %5.3f H: %5.3f",
545 pst.n0,Lambda, K, H);
550 ag_parm(char *pamfile, int gdelval, int ggapval, struct pstat_str *pu)
555 if (strcmp(pamfile,"BL50")==0)
556 r_v = look_p(bl50_p,gdelval,ggapval,&K,&Lambda,&H);
557 else if (strcmp(pamfile,"BL62")==0)
558 r_v = look_p(bl62_p,gdelval,ggapval,&K,&Lambda,&H);
559 else if (strcmp(pamfile,"P250")==0)
560 r_v = look_p(p250_p,gdelval,ggapval,&K,&Lambda,&H);
561 else if (strcmp(pamfile,"P120")==0)
562 r_v = look_p(p120_p,gdelval,ggapval,&K,&Lambda,&H);
563 else if (strcmp(pamfile,"MD_10")==0)
564 r_v = look_p(md10_p,gdelval,ggapval,&K,&Lambda,&H);
565 else if (strcmp(pamfile,"MD_20")==0)
566 r_v = look_p(md20_p,gdelval,ggapval,&K,&Lambda,&H);
567 else if (strcmp(pamfile,"MD_40")==0)
568 r_v = look_p(md40_p,gdelval,ggapval,&K,&Lambda,&H);
569 else if (strcmp(pamfile,"DNA")==0 || strcmp(pamfile,"+5/-4")==0)
570 r_v = look_p(nt54_p,gdelval,ggapval, &K,&Lambda,&H);
571 else if (strcmp(pamfile,"+3/-2")==0)
572 r_v = look_p(nt32_p,gdelval,ggapval, &K,&Lambda,&H);
573 else if (strcmp(pamfile,"+1/-3")==0)
574 r_v = look_p(nt13_p,gdelval,ggapval, &K,&Lambda,&H);
578 pu->r_u.ag.Lambda = Lambda;
582 fprintf(stderr,"Parameters not available for: %s: %d/%d\n",
583 pamfile,gdelval,ggapval);
589 look_p(struct alt_p parm[], int gap, int ext,
590 double *K, double *Lambda, double *H)
597 if (gap > parm[1].gap) {
599 *Lambda = parm[0].Lambda;
604 for (i=1; parm[i].gap > 0; i++) {
605 if (parm[i].gap > gap) continue;
606 else if (parm[i].gap == gap && parm[i].ext > ext ) continue;
607 else if (parm[i].gap == gap && parm[i].ext == ext) {
609 *Lambda = parm[i].Lambda;
618 /* uncensored and censored maximum likelihood estimates developed
619 by Aaron Mackey based on a preprint from Sean Eddy */
621 int mle_cen (struct stat_str *, int, int, double, double *, double *);
624 proc_hist_ml(struct stat_str *sptr, int nstats,
625 struct pstruct pst, struct hist_str *histp,
626 int do_trim, struct pstat_str *pu)
632 f_string = &(histp->stat_info[0]);
633 pu->r_u.ag.a_n0 = (double)pst.n0;
635 if (pst.zsflag < 10) s_string[0]='\0';
636 else if (pst.zs_win > 0)
637 sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
638 else strncpy(s_string,"(shuffled)",sizeof(s_string));
641 if (mle_cen(sptr, nstats, pst.n0, 0.0, &pu->r_u.ag.Lambda, &pu->r_u.ag.K) == -1)
643 sprintf(f_string,"%s MLE statistics: Lambda= %6.4f; K=%6.4g",
644 s_string,pu->r_u.ag.Lambda,pu->r_u.ag.K);
647 if (nstats/20 > 1000) f_cen = 1000.0/(double)nstats;
649 if (mle_cen(sptr, nstats, pst.n0, f_cen, &pu->r_u.ag.Lambda, &pu->r_u.ag.K) == -1)
651 sprintf(f_string,"MLE_cen statistics: Lambda= %6.4f; K=%6.4g (cen=%d)",
652 pu->r_u.ag.Lambda,pu->r_u.ag.K,(int)((double)nstats*f_cen));
659 return proc_hist_n(sptr, nstats, pst, histp, do_trim, pu);
663 mle_cen2 (struct stat_str *, int, int, double, double *, double *, double *, double *);
667 proc_hist_ml2(struct stat_str *sptr, int nstats,
668 struct pstruct pst, struct hist_str *histp,
669 int do_trim, struct pstat_str *pu)
672 double f_cen, ave_lambda;
673 char s_string[128], ex_string[64];
676 f_string = &(histp->stat_info[0]);
677 pu->r_u.m2.a_n0 = (double)pst.n0;
679 if (pst.zsflag < 10) s_string[0]='\0';
680 else if (pst.zs_win > 0)
681 sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
682 else strncpy(s_string,"(shuffled)",sizeof(s_string));
684 pu->r_u.m2.ave_comp = 0.0;
685 pu->r_u.m2.max_comp = -1.0;
688 for (i=0; i<nstats; i++) {
689 if (sptr[i].comp > pu->r_u.m2.max_comp) pu->r_u.m2.max_comp = sptr[i].comp;
690 if (sptr[i].comp > 0.0) {
691 pu->r_u.m2.ave_comp += log(sptr[i].comp);
696 pu->r_u.m2.ave_comp /= (double)ns;
697 pu->r_u.m2.ave_comp = exp(pu->r_u.m2.ave_comp);
698 for (i=0; i<nstats; i++) if (sptr[i].comp < 0.0) {
699 sptr[i].comp = pu->r_u.m2.ave_comp;
703 sprintf(ex_string,"composition = -1 for %d sequences",nneg);
704 else ex_string[0]='\0';
707 if (mle_cen2(sptr, nstats, pst.n0, 0.0,
708 &pu->r_u.m2.mle2_a0, &pu->r_u.m2.mle2_a1,
709 &pu->r_u.m2.mle2_a2, &pu->r_u.m2.mle2_b1) == -1) goto bad_mle2;
710 ave_lambda = 1.0/(pu->r_u.m2.ave_comp*pu->r_u.m2.mle2_b1);
712 sprintf(f_string,"%s MLE-2 statistics: a0= %6.4f; a1=%6.4f; a2=%6.4f; b1=%6.4f\n ave Lamdba: %6.4f",
713 s_string, pu->r_u.m2.mle2_a0, pu->r_u.m2.mle2_a1, pu->r_u.m2.mle2_a2, pu->r_u.m2.mle2_b1,ave_lambda);
716 if (nstats/20 > 500) f_cen = 500.0/(double)nstats;
718 if (mle_cen2(sptr, nstats, pst.n0, f_cen, &pu->r_u.m2.mle2_a0, &pu->r_u.m2.mle2_a1, &pu->r_u.m2.mle2_a2, &pu->r_u.m2.mle2_b1)== -1) goto bad_mle2;
720 ave_lambda = 1.0/(pu->r_u.m2.ave_comp*pu->r_u.m2.mle2_b1);
722 sprintf(f_string,"%s MLE-2-cen statistics: a0= %6.4f; a1=%6.4f; a2=%6.4f; b1=%6.4f (cen=%d)\n ave Lambda:%6.4f",
723 s_string, pu->r_u.m2.mle2_a0, pu->r_u.m2.mle2_a1, pu->r_u.m2.mle2_a2, pu->r_u.m2.mle2_b1, (int)((double)nstats*f_cen),ave_lambda);
729 return proc_hist_n(sptr, nstats, pst, histp, do_trim, pu);
732 double first_deriv_cen(double lambda, struct stat_str *sptr,
734 double sumlenL, double cenL,
735 double sumlenH, double cenH);
737 double second_deriv_cen(double lambda, struct stat_str *sptr,
739 double sumlenL, double cenL,
740 double sumlenH, double cenH);
743 st_sort (struct stat_str *v, int n) {
747 for (gap = 1; gap < n/3; gap = 3*gap +1) ;
749 for (; gap > 0; gap = (gap-1)/3)
750 for (i = gap; i < n; i++)
751 for (j = i - gap; j >= 0; j -= gap) {
752 if (v[j].score <= v[j + gap].score) break;
755 v[j].score = v[j + gap].score;
756 v[j + gap].score = tmp;
759 v[j].n1 = v[j + gap].n1;
764 /* sptr[].score, sptr[].n1; sptr[] must be sorted
765 int n = total number of samples
766 int M = length of query
767 double fn = fraction of scores to be censored fn/2.0 from top, bottom
768 double *Lambda = Lambda estimate
769 double *K = K estimate
775 mle_cen(struct stat_str *sptr, int n, int M, double fc,
776 double *Lambda, double *K) {
778 double sumlenL, sumlenH, cenL, cenH;
779 double sum_s, sum2_s, mean_s, var_s, dtmp;
783 double deriv, deriv2, lambda, old_lambda, sum = 0.0;
785 int sumlenL, int sumlenghtsR = sum of low (Left), right (High) seqs.
786 int cenL, cenH = censoring score low, high
795 sum_s = sum2_s = 0.0;
796 for (i=start; i<stop; i++) {
797 sum_s += sptr[i].score;
799 dtmp = (double)(stop-start);
802 for (i=start; i<stop; i++) {
803 sum2_s += sptr[i].score * sptr[i].score;
805 var_s = sum2_s/(dtmp-1.0);
807 sumlenL = sumlenH = 0.0;
808 for (i=0; i<start; i++) sumlenL += (double)sptr[i].n1;
809 for (i=stop; i<n; i++) sumlenH += (double)sptr[i].n1;
812 cenL = (double)sptr[start].score;
813 cenH = (double)sptr[stop].score;
816 cenL = (double)sptr[start].score/2.0;
817 cenH = (double)sptr[start].score*2.0;
820 if (cenL >= cenH) return -1;
822 /* initial guess for lambda is 0.2 - this does not work for matrices
823 with very different scales */
825 lambda = PI_SQRT6/sqrt(var_s);
827 fprintf(stderr," Lambda initial estimate error: lambda: %6.4g; var_s: %6.4g\n",lambda,var_s);
832 deriv = first_deriv_cen(lambda, sptr, start, stop,
833 sumlenL, cenL, sumlenH, cenH);
834 /* (uncensored version)
835 first_deriv(lambda, &sptr[start], stop - start))
838 /* (uncensored version)
839 deriv2 = second_deriv(lambda, &sptr[start], stop-start);
841 deriv2 = second_deriv_cen(lambda, sptr, start, stop,
842 sumlenL, cenL, sumlenH, cenH);
845 if (lambda - deriv/deriv2 > 0.0) lambda = lambda - deriv/deriv2;
846 else lambda = lambda/2.0;
848 } while (fabs((lambda - old_lambda)/lambda) > TINY && nit < MAX_NIT);
850 /* fprintf(stderr," mle_cen nit: %d\n",nit); */
852 if (nit >= MAX_NIT) return -1;
854 for(i = start; i < stop ; i++) {
855 sum += (double) sptr[i].n1 * exp(- lambda * (double)sptr[i].score);
860 *K = (double)(stop-start)/((double)M*sum);
862 *K = (double)n/((double)M*
863 (sum+sumlenL*exp(-lambda*cenL)-sumlenH*exp(-lambda*cenH)));
869 first_deriv(double lambda, struct stat_str *sptr, int n) {
872 double sum = 0.0, sum1 = 0.0, sum2 = 0.0;
875 for(i = 0 ; i < n ; i++) {
876 s = (double)sptr[i].score;
877 l = (double)sptr[i].n1;
878 es = exp(-lambda * s );
884 return (1.0/lambda) - (sum/(double)n) + (sum1/sum2);
890 second_deriv(double lambda, struct stat_str *sptr, int n) {
891 double sum1 = 0.0, sum2 = 0.0, sum3 = 0.0;
895 for(i = 0 ; i < n ; i++) {
896 l = (double)sptr[i].n1;
897 s = (double)sptr[i].score;
898 es = exp(-lambda * s);
901 sum3 += l * s * s * es;
904 return ((sum1*sum1)/(sum2*sum2)) - (sum3/sum2) - (1.0/(lambda*lambda));
909 first_deriv_cen(double lambda, struct stat_str *sptr, int start, int stop,
910 double sumlenL, double cenL, double sumlenH, double cenH) {
912 double sum = 0.0, sum1 = 0.0, sum2 = 0.0;
915 for(i = start ; i < stop ; i++) {
916 s = (double)sptr[i].score;
917 l = (double)sptr[i].n1;
918 es = exp(-lambda * s );
924 sum1 += sumlenL*cenL*exp(-lambda*cenL) - sumlenH*cenH*exp(-lambda*cenH);
925 sum2 += sumlenL*exp(-lambda*cenL) - sumlenH*exp(-lambda*cenH);
927 return (1.0 / lambda) - (sum /(double)(stop-start)) + (sum1 / sum2);
931 second_deriv_cen(double lambda, struct stat_str *sptr, int start, int stop,
932 double sumlenL, double cenL, double sumlenH, double cenH) {
934 double sum1 = 0.0, sum2 = 0.0, sum3 = 0.0;
938 for(i = start ; i < stop ; i++) {
939 s = (double)sptr[i].score;
940 l = (double)sptr[i].n1;
941 es = exp(-lambda * s);
944 sum3 += l * s * s * es;
947 sum1 += sumlenL*cenL*exp(-lambda*cenL) - sumlenH*cenH*exp(-lambda*cenH);
948 sum2 += sumlenL*exp(-lambda * cenL) - sumlenH*exp(-lambda * cenH);
949 sum3 += sumlenL*cenL*cenL * exp(-lambda * cenL) -
950 sumlenH*cenH*cenH * exp(-lambda * cenH);
951 return ((sum1 * sum1) / (sum2 * sum2)) - (sum3 / sum2)
952 - (1.0 / (lambda * lambda));
955 double mle2_func(double *params,
957 struct stat_str *values,
958 int n, int start, int stop);
960 void simplex(double *fitparams,
963 double (*minfunc) (double *tryparams, double *consts,
964 struct stat_str *data, int ndata,
965 int start, int stop),
968 int ndata, int start, int stop
972 mle_cen2(struct stat_str *sptr, int n, int M, double fc,
973 double *a0, double *a1, double *a2, double *b1) {
975 double params[4], lambdas[4], consts[9];
976 double avglenL, avglenH, avgcompL, avgcompH, cenL, cenH;
986 /* choose arithmetic or geometic mean for compositions by appropriate commenting */
989 avglenL = avglenH = 0.0;
990 avgcompL = avgcompH = 0.0;
991 /* avgcompL = avgcompH = 1.0 */
992 for (i=0; i<start; i++) {
993 avglenL += (double)sptr[i].n1;
994 avgcompL += (double)sptr[i].comp;
995 /* avgcompL *= (double) sptr[i].comp; */
997 avglenL /= (double) start;
998 avgcompL /= (double) start;
999 /* avgcompL = pow(avgcompL, 1.0/(double) start); */
1001 for (i=stop; i<n; i++) {
1002 avglenH += (double)sptr[i].n1;
1003 avgcompH += (double)sptr[i].comp;
1004 /* avgcompH *= (double) sptr[i].comp; */
1006 avglenH /= (double) (n - stop);
1007 avgcompH /= (double) (n - stop);
1008 /* avgcompL = pow(avgcompL, 1.0/(double) (n - stop)); */
1010 cenL = (double)sptr[start].score;
1011 cenH = (double)sptr[stop].score;
1012 if (cenL >= cenH) return -1;
1015 avglenL = avglenH = cenL = cenH = 0.0;
1016 avgcompL = avgcompH = 1.0;
1030 consts[1] = (double) start;
1031 consts[2] = (double) stop;
1034 consts[5] = avglenL;
1035 consts[6] = avglenH;
1036 consts[7] = avgcompL;
1037 consts[8] = avgcompH;
1039 simplex(params, lambdas, 4,
1040 (double (*) (double *, double *, struct stat_str *, int, int, int) )mle2_func,
1041 consts, sptr, n, start, stop);
1051 double mle2_func(double *params,
1053 struct stat_str *values,
1054 int n, int start, int stop
1057 double a0, a1, a2, b1, M;
1058 double score, length, comp;
1059 double cenL, cenH, avglenL, avglenH, avgcompL, avgcompH;
1071 start = (int) consts[1];
1072 stop = (int) consts[2];
1076 avglenL = consts[5];
1077 avglenH = consts[6];
1078 avgcompL = consts[7];
1079 avgcompH = consts[8];
1085 y = -(cenL - (a0 + a1*avgcompL +a2*avgcompL*log(M*avglenL)))/(b1*avgcompL);
1086 L += (double) start * exp(y);
1089 for(i = start ; i < stop ; i++) {
1090 score = (double) values[i].score;
1091 length = (double) values[i].n1;
1092 comp = (double) values[i].comp;
1094 y = - (score - (a0 + a1*comp + a2 * comp * log(M*length))) / (b1*comp);
1096 L += -y + exp(y) + log(b1 * comp);
1100 y = -(cenH -(a0 + a1*avgcompH + a2*avgcompH*log(M*avglenH)))/(b1*avgcompH);
1101 L -= (double) (n - stop) * exp(y);
1106 /* Begin Nelder-Mead simplex code: */
1108 double evalfunc(double **param,
1113 double (*minfunc) (double *params, double *consts,
1114 struct stat_str *data, int ndata,
1115 int start, int stop),
1118 int ndata, int start, int stop,
1122 void simplex(double *fitparams,
1125 double (*minfunc) (double *tryparams, double *consts,
1126 struct stat_str *data, int ndata,
1127 int start, int stop),
1136 int i, j, ilo, ihi, inhi;
1137 double rtol, sum, tmp, ysave, ytry;
1138 double *psum, *vals, *ptry, **param;
1141 psum = (double *) calloc(nparam, sizeof(double));
1142 ptry = (double *) calloc(nparam, sizeof(double));
1144 vals = (double *) calloc(nparam + 1, sizeof(double));
1146 param = (double **) calloc(nparam + 1, sizeof(double *));
1147 param[0] = (double *) calloc((nparam + 1) * nparam, sizeof(double));
1148 for( i = 1 ; i < (nparam + 1) ; i++ ) {
1149 param[i] = param[0] + i * nparam;
1152 /* Get our N+1 initial parameter values for the simplex */
1154 for( i = 0 ; i < nparam ; i++ ) {
1155 param[0][i] = fitparams[i];
1158 for( i = 1 ; i < (nparam + 1) ; i++ ) {
1159 for( j = 0 ; j < nparam ; j++ ) {
1160 param[i][j] = fitparams[j] + lambda[j] * ( (i - 1) == j ? 1 : 0 );
1164 /* calculate initial values at the simplex nodes */
1166 for( i = 0 ; i < (nparam + 1) ; i++ ) {
1167 vals[i] = minfunc(param[i], consts, data, ndata, start, stop);
1170 /* Begin Nelder-Mead simplex algorithm from Numerical Recipes in C */
1172 for( j = 0 ; j < nparam ; j++ ) {
1173 for( sum = 0.0, i = 0 ; i < nparam + 1 ; i++ ) {
1182 determine which point is highest (ihi), next highest (inhi) and
1183 lowest (ilo) by looping over the points in the simplex
1187 /* ihi = vals[0] > vals[1] ? (inhi = 1, 0) : (inhi = 0, 1); */
1188 if(vals[0] > vals[1]) { ihi = 0; inhi = 1; }
1189 else { ihi = 1; inhi = 0; }
1191 for( i = 0 ; i < nparam + 1 ; i++) {
1192 if( vals[i] <= vals[ilo] ) ilo = i;
1193 if( vals[i] > vals[ihi] ) {
1196 } else if ( vals[i] > vals[inhi] && i != ihi ) inhi = i;
1199 /* Are we finished? */
1201 rtol = 2.0 * fabs(vals[ihi] - vals[ilo]) /
1202 (fabs(vals[ihi]) + fabs(vals[ilo]) + TINY);
1204 if( rtol < TOLERANCE ) {
1206 /* put the best value and best parameters into the first index */
1209 vals[0] = vals[ilo];
1212 for( i = 0 ; i < nparam ; i++ ) {
1214 param[0][i] = param[ilo][i];
1215 param[ilo][i] = tmp;
1218 /* et voila, c'est finis */
1222 /* Begin a new iteration */
1224 /* first, extrapolate by -1 through the face of the simplex across from ihi */
1226 ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
1227 data, ndata, start, stop, ihi, -1.0);
1229 if( ytry <= vals[ilo] ) {
1231 /* Good result, try additional extrapolation by 2 */
1233 ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
1234 data, ndata, start, stop, ihi, 2.0);
1236 } else if ( ytry >= vals[inhi] ) {
1238 /* no good, look for an intermediate lower point by contracting */
1241 ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
1242 data, ndata, start, stop, ihi, 0.5);
1244 if( ytry >= ysave ) {
1246 /* Still no good. Contract around lowest (best) point. */
1248 for( i = 0 ; i < nparam + 1 ; i++ ) {
1250 for ( j = 0 ; j < nparam ; j++ ) {
1251 param[i][j] = psum[j] = 0.5 * (param[i][j] + param[ilo][j]);
1253 vals[i] = minfunc(psum, consts, data, ndata, start, stop);
1258 for( j = 0 ; j < nparam ; j++ ) {
1259 for( sum = 0.0, i = 0 ; i < nparam + 1 ; i++ ) {
1269 for( i = 0 ; i < nparam ; i++ ) {
1270 fitparams[i] = param[0][i];
1284 double evalfunc(double **param,
1289 double (*minfunc)(double *tryparam, double *consts,
1290 struct stat_str *data, int ndata,
1291 int start, int stop),
1294 int ndata, int start, int stop,
1299 double fac1, fac2, ytry;
1302 fac1 = (1.0 - factor) / nparam;
1303 fac2 = fac1 - factor;
1305 for( j = 0 ; j < nparam ; j++ ) {
1306 ptry[j] = psum[j] * fac1 - param[ihi][j] * fac2;
1309 ytry = minfunc(ptry, consts, data, ndata, start, stop);
1311 if( ytry < vals[ihi] ) {
1313 for( j = 0 ; j < nparam ; j++ ) {
1314 psum[j] += ptry[j] - param[ihi][j];
1315 param[ihi][j] = ptry[j];
1322 /* end of Nelder-Mead simplex code */
1325 proc_hist_n(struct stat_str *sptr, int nstats,
1326 struct pstruct pst, struct hist_str *histp,
1327 int do_trim, struct pstat_str *pu)
1330 double s_score, s2_score, ssd, ztrim;
1331 int nit, max_hscore;
1335 f_string = &(histp->stat_info[0]);
1337 max_hscore = calc_thresh(pst, nstats, pu->ngLambda,
1338 pu->ngK, pu->ngH, &ztrim);
1340 s_score = s2_score = 0.0;
1342 for ( j = 0, i = 0; i < nstats; i++) {
1343 if (sptr[i].score > 0 && sptr[i].score <= max_hscore) {
1344 s_score += (ssd=(double)sptr[i].score);
1345 s2_score += ssd * ssd;
1351 pu->r_u.rg.mu = s_score/(double)j;
1352 pu->r_u.rg.mean_var = s2_score - (double)j * pu->r_u.rg.mu * pu->r_u.rg.mu;
1353 pu->r_u.rg.mean_var /= (double)(j-1);
1356 pu->r_u.rg.mu = 50.0;
1357 pu->r_u.rg.mean_var = 10.0;
1360 if (pu->r_u.rg.mean_var < 0.01) {
1361 pu->r_u.rg.mean_var = (pu->r_u.rg.mu > 1.0) ? pu->r_u.rg.mu: 1.0;
1364 /* now remove some scores */
1368 pu->r_u.rg.n_trimmed = 0;
1370 for (i=0; i< nstats; i++) {
1371 if (sptr[i].n1 < 0) continue;
1372 ssd = find_zn(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, pu);
1373 if (ssd > ztrim || ssd < 20.0) {
1374 /* fprintf(stderr,"removing %3d %3d %4.1f\n",
1375 sptr[i].score, sptr[i].n1,ssd); */
1376 ssd = sptr[i].score;
1378 s2_score -= ssd*ssd;
1380 pu->r_u.rg.n_trimmed++;
1382 sptr[i].n1 = -sptr[i].n1;
1387 pu->r_u.rg.mu = s_score/(double)j;
1388 pu->r_u.rg.mean_var = s2_score - (double)j * pu->r_u.rg.mu * pu->r_u.rg.mu;
1389 pu->r_u.rg.mean_var /= (double)(j-1);
1392 pu->r_u.rg.mu = 50.0;
1393 pu->r_u.rg.mean_var = 10.0;
1396 if (pu->r_u.rg.mean_var < 0.01) {
1397 pu->r_u.rg.mean_var = (pu->r_u.rg.mu > 1.0) ? pu->r_u.rg.mu: 1.0;
1400 if (pu->r_u.rg.n_trimmed < LHISTC) {
1402 fprintf(stderr,"nprune %d at %d\n",nprune,nit);
1408 if (pst.zsflag < 10) s_string[0]='\0';
1409 else if (pst.zs_win > 0)
1410 sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
1411 else strncpy(s_string,"(shuffled)",sizeof(s_string));
1413 sprintf(f_string,"%s unscaled statistics: mu= %6.4f var=%6.4f; Lambda= %6.4f",
1414 s_string, pu->r_u.rg.mu,pu->r_u.rg.mean_var,PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
1419 This routine calculates the maximum likelihood estimates for the
1420 extreme value distribution exp(-exp(-(-x-a)/b)) using the formula
1422 <lambda> = x_m - sum{ x[i] * exp (-x[i]<lambda>)}/sum{exp (-x[i]<lambda>)}
1423 <a> = -<1/lambda> log ( (1/nlib) sum { exp(-x[i]/<lambda> } )
1425 The <a> parameter can be transformed into and K
1426 of the formula: 1 - exp ( - K m n exp ( - lambda S ))
1427 using the transformation: 1 - exp ( -exp -(lambda S + log(K m n) ))
1428 1 - exp ( -exp( - lambda ( S + log(K m n) / lambda))
1430 a = log(K m n) / lambda
1431 a lambda = log (K m n)
1432 exp(a lambda) = K m n
1433 but from above: a lambda = log (1/nlib sum{exp( -x[i]*lambda)})
1434 so: K m n = (1/n sum{ exp( -x[i] *lambda)})
1435 K = sum{}/(nlib m n )
1440 alloc_hist(struct llen_str *llen)
1443 max_llen = llen->max;
1445 if (llen->hist == NULL) {
1446 llen->hist = (int *)calloc((size_t)(max_llen+1),sizeof(int));
1447 llen->score_sums = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
1448 llen->score2_sums =(double *)calloc((size_t)(max_llen + 1),sizeof(double));
1449 llen->score_var = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
1452 for (i=0; i< max_llen+1; i++) {
1454 llen->score_var[i] = llen->score_sums[i] = llen->score2_sums[i] = 0.0;
1459 free_hist(struct llen_str *llen)
1461 if (llen->hist!=NULL) {
1462 free(llen->score_var);
1463 free(llen->score2_sums);
1464 free(llen->score_sums);
1471 inithist(struct llen_str *llen, struct pstruct pst, int max_hscore)
1473 llen->max = MAX_LLEN;
1475 llen->max_score = -1;
1476 llen->min_score=10000;
1481 llen->min_length = 10000;
1482 llen->max_length = 0;
1486 addhist(struct llen_str *llen, int score, int length, int max_hscore)
1491 if ( score<=0 || length < LENGTH_CUTOFF) {
1492 llen->min_score = 0;
1497 if (score < llen->min_score) llen->min_score = score;
1498 if (score > llen->max_score) llen->max_score = score;
1500 if (length > llen->max_length) llen->max_length = length;
1501 if (length < llen->min_length) llen->min_length = length;
1502 if (score > max_hscore) score = max_hscore;
1504 llength = (int)(LN_FACT*log((double)length)+0.5);
1506 if (llength < 0 ) llength = 0;
1507 if (llength > llen->max) llength = llen->max;
1508 llen->hist[llength]++;
1509 dscore = (double)score;
1510 llen->score_sums[llength] += dscore;
1511 llen->score2_sums[llength] += dscore * dscore;
1514 /* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */
1517 inithistz(int mh, struct hist_str *histp )
1523 histp->min_hist = 20;
1524 histp->max_hist = 120;
1526 histp->histint = (int)
1527 ((double)(histp->max_hist - histp->min_hist + 2)/(double)mh+0.5);
1529 ((double)(histp->max_hist - histp->min_hist + 2)/(double)histp->histint+0.5);
1531 if (histp->hist_a==NULL) {
1532 if ((histp->hist_a=(int *)calloc((size_t)histp->maxh,sizeof(int)))==
1534 fprintf(stderr," cannot allocate %d for histogram\n",histp->maxh);
1537 else histp->histflg = 1;
1540 for (i=0; i<histp->maxh; i++) histp->hist_a[i]=0;
1545 static double nrv[100]={
1546 0.3098900570,-0.0313400923, 0.1131975903,-0.2832547606, 0.0073672659,
1547 0.2914489107, 0.4209306311,-0.4630181404, 0.3326537896, 0.0050140359,
1548 -0.1117435426,-0.2835630301, 0.2302997065,-0.3102716394, 0.0819894916,
1549 -0.1676455701,-0.3782225018,-0.3204509938,-0.3594969187,-0.0308950398,
1550 0.2922813812, 0.1337170751, 0.4666577031,-0.2917784349,-0.2438179916,
1551 0.3002301394, 0.0231147123, 0.5687927366,-0.2318208709,-0.1476839273,
1552 -0.0385043851,-0.1213476523, 0.1486341995, 0.1027917167, 0.1409192644,
1553 -0.3280652579, 0.4232041455, 0.0775993309, 0.1159071787, 0.2769424442,
1554 0.3197284751, 0.1507346903, 0.0028580909, 0.4825103412,-0.0496843610,
1555 -0.2754357656, 0.6021881753,-0.0816123956,-0.0899148991, 0.4847183201,
1556 0.2151621865,-0.4542246220, 0.0690709102, 0.2461894193, 0.2126042295,
1557 -0.0767060668, 0.4819746149, 0.3323031326, 0.0177600676, 0.1143185210,
1558 0.2653977455, 0.0921872958,-0.1330986718, 0.0412287716,-0.1691604748,
1559 -0.0529679078,-0.0194157955,-0.6117493924, 0.1199067932, 0.0210243193,
1560 -0.5832259838,-0.1685528664, 0.0008591271,-0.1120347822, 0.0839125069,
1561 -0.2787486831,-0.1937017962,-0.1915733940,-0.7888453635,-0.3316745163,
1562 0.1180885226,-0.3347001067,-0.2477492636,-0.2445697600, 0.0001342482,
1563 -0.0015759812,-0.1516473992,-0.5202267615, 0.2136975210, 0.2500423188,
1564 -0.2402926401,-0.1094186280,-0.0618869933,-0.0815221188, 0.2623337275,
1565 0.0219427302 -0.1774469919, 0.0828245026,-0.3271952808,-0.0632898028};
1568 addhistz(double zs, struct hist_str *histp)
1573 rv = nrv[histp->z_calls++ % 100];
1574 zi = (int)(zs + 0.5+rv );
1576 if ((zi >= 0) && (zi <= 120)) histp->entries++;
1578 if (zi < histp->min_hist) zi = histp->min_hist;
1579 if (zi > histp->max_hist) zi = histp->max_hist;
1581 ih = (zi - histp->min_hist)/histp->histint;
1583 histp->hist_a[ih]++;
1586 /* addhistzp() does not increase histp->entries since addhist did it already */
1589 addhistzp(double zs, struct hist_str *histp)
1594 rv = nrv[histp->z_calls++ %100];
1595 zi = (int)(zs + 0.5 + rv);
1597 if (zi < histp->min_hist) zi = histp->min_hist;
1598 if (zi > histp->max_hist) zi = histp->max_hist;
1600 ih = (zi - histp->min_hist)/histp->histint;
1602 histp->hist_a[ih]++;
1607 prune_hist(struct llen_str *llen, int score, int length, int max_hscore,
1613 if (score <= 0 || length < LENGTH_CUTOFF) return;
1615 if (score > max_hscore) score = max_hscore;
1617 llength = (int)(LN_FACT*log((double)length)+0.5);
1619 if (llength < 0 ) llength = 0;
1620 if (llength > llen->max) llength = llen->max;
1621 llen->hist[llength]--;
1622 dscore = (double)score;
1623 llen->score_sums[llength] -= dscore;
1624 llen->score2_sums[llength] -= dscore * dscore;
1626 /* (*entries)--; histp->entries is not yet initialized */
1629 /* fit_llen: no trimming
1630 (1) regress scores vs log(n) using weighted variance
1631 (2) calculate mean variance after length regression
1635 fit_llen(struct llen_str *llen, struct rstat_str *pr)
1641 double mean_x, mean_y, var_x, var_y, covar_xy;
1642 double mean_y2, covar_xy2, var_y2, dllj;
1644 double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
1646 /* now fit scores to best linear function of log(n), using
1647 simple linear regression */
1649 for (llen->min=0; llen->min < llen->max; llen->min++)
1650 if (llen->hist[llen->min]) break;
1653 for (n_size=0,j = llen->min; j < llen->max; j++) {
1654 if (llen->hist[j] > 1) {
1655 dllj = (double)llen->hist[j];
1656 llen->score_var[j] = llen->score2_sums[j]/dllj
1657 - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
1658 llen->score_var[j] /= (double)(llen->hist[j]-1);
1659 if (llen->score_var[j] <= 0.1 ) llen->score_var[j] = 0.1;
1664 pr->nb_tot = n_size;
1667 sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
1668 for (j = llen->min; j < llen->max; j++)
1669 if (llen->hist[j] > 1) {
1671 dllj = (double)llen->hist[j];
1672 n_w += dllj/llen->score_var[j];
1673 sum_x += dllj * x / llen->score_var[j] ;
1674 sum_y += llen->score_sums[j] / llen->score_var[j];
1675 sum_x2 += dllj * x * x /llen->score_var[j];
1676 sum_xy += x * llen->score_sums[j]/llen->score_var[j];
1686 det = n_w * sum_x2 - sum_x * sum_x;
1688 pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
1689 pr->rho_e = n_w/det;
1690 pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
1691 pr->mu_e = sum_x2/det;
1701 det = n_w * sum_x2 - sum_x * sum_x;
1702 pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
1703 pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
1706 mean_x = mean_y = mean_y2 = 0.0;
1707 var_x = var_y = 0.0;
1708 covar_xy = covar_xy2 = 0.0;
1710 for (j = llen->min; j <= llen->max; j++)
1711 if (llen->hist[j] > 1 ) {
1713 x = (double)j + 0.5;
1714 mean_x += (double)llen->hist[j] * x;
1715 mean_y += llen->score_sums[j];
1716 var_x += (double)llen->hist[j] * x * x;
1717 var_y += llen->score2_sums[j];
1718 covar_xy += x * llen->score_sums[j];
1720 mean_x /= n; mean_y /= n;
1721 var_x = var_x / n - mean_x * mean_x;
1722 var_y = var_y / n - mean_y * mean_y;
1724 covar_xy = covar_xy / n - mean_x * mean_y;
1726 pr->rho = covar_xy / var_x;
1727 pr->mu = mean_y - pr->rho * mean_x;
1729 mean_y2 = covar_xy2 = var_y2 = 0.0;
1730 for (j = llen->min; j <= llen->max; j++)
1731 if (llen->hist[j] > 1) {
1732 x = (double)j + 0.5;
1733 u = pr->rho * x + pr->mu;
1734 y2 = llen->score2_sums[j] - 2.0 * llen->score_sums[j] * u + llen->hist[j] * u * u;
1736 dllj = (double)llen->hist[j];
1737 fprintf(stderr,"%.2f\t%d\t%g\t%g\n",x/LN_FACT,llen->hist[j],
1738 llen->score_sums[j]/dllj,y2/dllj);
1742 covar_xy2 += x * y2;
1743 /* fprintf(stderr,"%6.1f %4d %8d %8d %7.2f %8.2f\n",
1744 x,llen->hist[j],llen->score_sums[j],llen->score2_sums[j],u,y2); */
1747 pr->mean_var = mean_y2 /= (double)n;
1748 covar_xy2 = covar_xy2 / (double)n - mean_x * mean_y2;
1750 if (pr->mean_var <= 0.01) {
1752 pr->mean_var = (pr->mu > 1.0) ? pr->mu: 1.0;
1756 fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var %.4f\n",
1757 pr->rho*LN_FACT,pr->mu,pr->mean_var);
1759 if (n > 1) pr->var_e = (var_y2/n - mean_y2 * mean_y2)/(n-1);
1760 else pr->var_e = 0.0;
1762 if (llen->fit_flag) {
1763 pr->rho2 = covar_xy2 / var_x;
1764 pr->mu2 = pr->mean_var - pr->rho2 * mean_x;
1768 pr->mu2 = pr->mean_var;
1771 if (pr->rho2 < 0.0 )
1772 z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
1773 else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
1774 if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
1776 pr->var_cutoff = pr->rho2 * LN_FACT*log(z) + pr->mu2;
1779 /* fit_llens: trim high variance bins
1780 (1) regress scores vs log(n) using weighted variance
1781 (2) regress residuals vs log(n)
1782 (3) remove high variance bins
1783 (4) calculate mean variance after length regression
1787 fit_llens(struct llen_str *llen, struct rstat_str *pr)
1791 double x, y, y2, u, u2, v, z;
1792 double mean_x, mean_y, var_x, var_y, covar_xy;
1793 double mean_y2, covar_xy2;
1794 double mean_u2, mean_3u2, dllj;
1795 double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
1797 /* now fit scores to best linear function of log(n), using
1798 simple linear regression */
1800 for (llen->min=0; llen->min < llen->max; llen->min++)
1801 if (llen->hist[llen->min]) break;
1804 for (j = llen->min; j < llen->max; j++) {
1805 if (llen->hist[j] > 1) {
1806 dllj = (double)llen->hist[j];
1807 llen->score_var[j] = (double)llen->score2_sums[j]/dllj
1808 - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
1809 llen->score_var[j] /= (double)(llen->hist[j]-1);
1810 if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
1815 sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
1816 for (j = llen->min; j < llen->max; j++)
1817 if (llen->hist[j] > 1) {
1819 dllj = (double)llen->hist[j];
1820 n_w += dllj/llen->score_var[j];
1821 sum_x += dllj * x / llen->score_var[j] ;
1822 sum_y += llen->score_sums[j] / llen->score_var[j];
1823 sum_x2 += dllj * x * x /llen->score_var[j];
1824 sum_xy += x * llen->score_sums[j]/llen->score_var[j];
1827 det = n_w * sum_x2 - sum_x * sum_x;
1828 pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
1829 pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
1831 /* printf(" rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */
1834 mean_x = mean_y = mean_y2 = 0.0;
1835 var_x = var_y = 0.0;
1836 covar_xy = covar_xy2 = 0.0;
1838 for (j = llen->min; j <= llen->max; j++)
1839 if (llen->hist[j] > 1 ) {
1841 x = (double)j + 0.5;
1842 dllj = (double)llen->hist[j];
1844 mean_y += llen->score_sums[j];
1845 var_x += dllj * x * x;
1846 var_y += llen->score2_sums[j];
1847 covar_xy += x * llen->score_sums[j];
1849 mean_x /= n; mean_y /= n;
1850 var_x = var_x / n - mean_x * mean_x;
1851 var_y = var_y / n - mean_y * mean_y;
1853 covar_xy = covar_xy / n - mean_x * mean_y;
1854 /* pr->rho = covar_xy / var_x;
1855 pr->mu = mean_y - pr->rho * mean_x;
1858 mean_y2 = covar_xy2 = 0.0;
1859 for (j = llen->min; j <= llen->max; j++)
1860 if (llen->hist[j] > 1) {
1861 x = (double)j + 0.5;
1862 u = pr->rho * x + pr->mu;
1863 y2 = llen->score2_sums[j] - 2 * llen->score_sums[j] * u + llen->hist[j] * u * u;
1865 covar_xy2 += x * y2;
1869 covar_xy2 = covar_xy2 / n - mean_x * mean_y2;
1870 pr->rho2 = covar_xy2 / var_x;
1871 pr->mu2 = mean_y2 - pr->rho2 * mean_x;
1873 if (pr->rho2 < 0.0 )
1874 z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
1875 else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
1876 if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
1878 pr->var_cutoff = pr->rho2*LN_FACT*log(z) + pr->mu2;
1880 /* fprintf(stderr,"\nminimum allowed predicted variance (%0.2f) at n = %.0f\n",
1885 for ( j = llen->min; j < llen->max; j++) {
1887 dllj = (double)llen->hist[j];
1888 x = pr->rho * y + pr->mu;
1889 v = pr->rho2 * y + pr->mu2;
1890 if (v < pr->var_cutoff) v = pr->var_cutoff;
1891 if (llen->hist[j]> 1) {
1892 u2 = (llen->score2_sums[j] - 2 * x * llen->score_sums[j] + dllj * x * x) - v*dllj;
1893 mean_u2 += llen->score_var[j] = u2*u2/(llen->hist[j]-1);
1895 /* fprintf(stderr," %d (%d) u2: %.2f v*ll: %.2f %.2f\n",
1896 j,llen->hist[j],u2,v*dllj,sqrt(llen->score_var[j])); */
1898 else llen->score_var[j] = -1.0;
1901 mean_u2 = sqrt(mean_u2/(double)n_u2);
1902 /* fprintf(stderr," mean s.d.: %.2f\n",mean_u2); */
1904 mean_3u2 = mean_u2*3.0;
1906 for (j = llen->min; j < llen->max; j++) {
1907 if (llen->hist[j] <= 1) continue;
1908 if (sqrt(llen->score_var[j]) > mean_3u2) {
1909 /* fprintf(stderr," removing %d %d %.2f\n",
1910 j, (int)(exp((double)j/LN_FACT)-0.5),
1911 sqrt(llen->score_var[j]));
1914 pr->n1_trimmed += llen->hist[j];
1921 struct s2str {double s; int n;};
1922 void s2_sort ( struct s2str *sptr, int n);
1925 fit_llen2(struct llen_str *llen, struct rstat_str *pr)
1928 int n, n_y2, llen_delta, llen_del05;
1931 double mean_x, mean_y, var_x, var_y, covar_xy;
1932 double mean_y2, covar_xy2;
1935 double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
1937 /* now fit scores to best linear function of log(n), using
1938 simple linear regression */
1940 for (llen->min=0; llen->min < llen->max; llen->min++)
1941 if (llen->hist[llen->min]) break;
1943 for ( ; llen->max > llen->min; llen->max--)
1944 if (llen->hist[llen->max]) break;
1946 for (n_size=0,j = llen->min; j < llen->max; j++) {
1947 if (llen->hist[j] > 1) {
1948 llen->score_var[j] = llen->score2_sums[j]/(double)llen->hist[j]
1949 - (llen->score_sums[j]/(double)llen->hist[j])
1950 * (llen->score_sums[j]/(double)llen->hist[j]);
1951 llen->score_var[j] /= (double)(llen->hist[j]-1);
1952 if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
1958 sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
1959 for (j = llen->min; j < llen->max; j++)
1960 if (llen->hist[j] > 1) {
1962 n_w += (double)llen->hist[j]/llen->score_var[j];
1963 sum_x += (double)llen->hist[j] * x / llen->score_var[j] ;
1964 sum_y += llen->score_sums[j] / llen->score_var[j];
1965 sum_x2 += (double)llen->hist[j] * x * x /llen->score_var[j];
1966 sum_xy += x * llen->score_sums[j]/llen->score_var[j];
1975 det = n_w * sum_x2 - sum_x * sum_x;
1977 pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
1978 pr->rho_e = n_w/det;
1979 pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
1980 pr->mu_e = sum_x2/det;
1989 det = n_w * sum_x2 - sum_x * sum_x;
1990 pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
1991 pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
1993 /* fprintf(stderr," rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */
1996 mean_x = mean_y = mean_y2 = 0.0;
1997 var_x = var_y = 0.0;
1998 covar_xy = covar_xy2 = 0.0;
2000 for (j = llen->min; j <= llen->max; j++)
2001 if (llen->hist[j] > 1 ) {
2003 x = (double)j + 0.5;
2004 mean_x += (double)llen->hist[j] * x;
2005 mean_y += llen->score_sums[j];
2006 var_x += (double)llen->hist[j] * x * x;
2007 var_y += llen->score2_sums[j];
2008 covar_xy += x * llen->score_sums[j];
2010 mean_x /= n; mean_y /= n;
2011 var_x = var_x / n - mean_x * mean_x;
2012 var_y = var_y / n - mean_y * mean_y;
2014 covar_xy = covar_xy / n - mean_x * mean_y;
2016 pr->rho = covar_xy / var_x;
2017 pr->mu = mean_y - pr->rho * mean_x;
2020 if ((ss2=(struct s2str *)calloc(llen->max+1,sizeof(struct s2str)))==NULL) {
2021 fprintf(stderr," cannot allocate ss2\n");
2027 for (j = llen->min; j <= llen->max; j++)
2028 if (llen->hist[j] > VHISTC) {
2030 n_y2 += ss2[j].n = llen->hist[j];
2031 x = (double)j + 0.5;
2032 u = pr->rho * x + pr->mu;
2033 ss2[j].s = y2 = llen->score2_sums[j] - 2*llen->score_sums[j]*u + llen->hist[j]*u*u;
2036 pr->mean_var = mean_y2/(double)n_y2;
2038 s2_sort(ss2+llen->min,llen->max-llen->min+1);
2040 /* fprintf(stderr,"llen->min: %d, max: %d\n",llen->min,llen->max); */
2042 for (j=llen->min; j<=llen->max; j++) {
2045 /* fprintf(stderr,"%d\t%d\t%.2f\t%.4f\n",
2046 j,ss2[j].n,ss2[j].s,ss2[j].s/ss2[j].n);
2051 llen_del05 = llen_delta/20;
2054 for (j = llen->min; j<llen->min+llen_del05; j++) {
2055 pr->n1_trimmed += ss2[j].n;
2058 for (j = llen->min+llen_del05; j <= llen->min+llen_delta-llen_del05; j++)
2060 mean_y2 += ss2[j].s;
2063 for (j = llen->min+llen_delta-llen_del05+1; j< llen->max; j++) {
2064 pr->n1_trimmed += ss2[j].n;
2069 if (n_y2 > 1) pr->mean_var = mean_y2/(double)n_y2;
2071 /* fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var: %.4f/%d\n",
2072 pr->rho*LN_FACT,pr->mu,pr->mean_var,n); */
2077 /* REG_STATS - Z() from rho/mu/mean_var */
2078 double find_zr(int score, double escore, int length, double comp, struct pstat_str *pu)
2082 if (score <= 0) return 0;
2083 if ( length < LENGTH_CUTOFF) return 0;
2085 log_len = LN_FACT*log((double)(length));
2086 /* var = pu->r_u.rg.rho2 * log_len + pu->r_u.rg.mu2;
2087 if (var < pu->r_u.rg.var_cutoff) var = pu->r_u.rg.var_cutoff;
2090 z = ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu) / sqrt(pu->r_u.rg.mean_var);
2092 return (50.0 + z*10.0);
2095 /* REG2_STATS Z() from rho/mu, rho2/mu2 */
2096 double find_zr2(int score, double escore, int length, double comp, struct pstat_str *pu)
2098 double log_len, var;
2101 if ( length < LENGTH_CUTOFF) return 0;
2103 log_len = LN_FACT*log((double)(length));
2105 var = pu->r_u.rg.rho2 * log_len + pu->r_u.rg.mu2;
2106 if (var < pu->r_u.rg.var_cutoff) var = pu->r_u.rg.mean_var;
2108 z = ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu) / sqrt(var);
2110 return (50.0 + z*10.0);
2114 /* LN_STATS - ln()-scaled mu, mean_var */
2115 double find_zl(int score, int length, double comp, struct pstat_str *pu)
2119 ls = (double)score*LN200/log((double)length);
2121 z = (ls - pu->r_u.rg.mu) / sqrt(pu->r_u.rg.mean_var);
2123 return (50.0 + z*10.0);
2127 /* MLE_STATS - Z() from MLE for lambda, K */
2129 find_ze(int score, double escore, int length, double comp, struct pstat_str *pu)
2131 double z, mp, np, a_n1;
2133 a_n1 = (double)length;
2135 mp = pu->r_u.ag.a_n0;
2138 if (np < 1.0) np = 1.0;
2139 if (mp < 1.0) mp = 1.0;
2141 z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * np * mp);
2146 return (50.0 + z*10.0);
2149 /* MLE2_STATS - Z() from MLE for mle_a0..2, mle_b1, length, comp */
2151 find_ze2(int score, double escore, int length, double comp, struct pstat_str *pu)
2153 double z, mp, np, a_n1;
2155 a_n1 = (double)length;
2157 if (comp <= 0.0) comp = pu->r_u.m2.ave_comp;
2159 /* avoid very biased comp estimates */
2160 /* comp = exp((4.0*log(comp)+log(pu->r_u.m2.ave_comp))/5.0); */
2162 mp = pu->r_u.m2.a_n0;
2165 if (np < 1.0) np = 1.0;
2166 if (mp < 1.0) mp = 1.0;
2168 z = (-(pu->r_u.m2.mle2_a0 + pu->r_u.m2.mle2_a1 * comp + pu->r_u.m2.mle2_a2 * comp * log(np * mp)) + score) / (pu->r_u.m2.mle2_b1 * comp);
2173 return (50.0 + z*10.0);
2176 /* AG_STATS - Altschul-Gish Lamdba, K */
2178 find_za(int score, double escore, int length, double comp, struct pstat_str *pu)
2180 double z, mp, np, a_n1, a_n1f;
2182 a_n1 = (double)length;
2183 a_n1f = log(a_n1)/pu->r_u.ag.H;
2185 mp = pu->r_u.ag.a_n0 - pu->r_u.ag.a_n0f - a_n1f;
2186 np = a_n1 - pu->r_u.ag.a_n0f - a_n1f;
2188 if (np < 1.0) np = 1.0;
2189 if (mp < 1.0) mp = 1.0;
2191 z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * np * mp);
2196 return (50.0 + z*10.0);
2199 double find_zn(int score, double escore, int length, double comp, struct pstat_str *pu)
2203 z = ((double)score - pu->r_u.rg.mu) / sqrt(pu->r_u.rg.mean_var);
2205 return (50.0 + z*10.0);
2208 /* computes E value for a given z value, assuming extreme value distribution */
2210 z_to_E(double zs, long entries, struct db_str db)
2214 /* if (db->entries < 5) return (double)db.entries; */
2215 if (entries < 1) { n = db.entries;}
2218 if (zs > ZS_MAX) return 0.0;
2221 e = exp(- PI_SQRT6 * zs - .577216);
2222 return n * (e > .01 ? 1.0 - exp(-e) : e);
2224 return n * erfc(zs/M_SQRT2)/2.0;
2233 /* if (db.entries < 5) return 0.0; */
2235 z = (zs - 50.0)/10.0;
2237 if (z > ZS_MAX) return 0.0;
2240 e = exp(- PI_SQRT6 * z - EULER_G);
2241 return (e > .01 ? 1.0 - exp(-e) : e);
2243 return erfc(zs/M_SQRT2)/2.0;
2248 zs_to_bit(double zs, int n0, int n1)
2250 double z, a_n0, a_n1;
2252 z = (zs - 50.0)/10.0;
2256 return (PI_SQRT6 * z + EULER_G + log(a_n0*a_n1))/M_LN2 ;
2259 /* computes E-value for a given z value, assuming extreme value distribution */
2261 zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db)
2265 /* if (db->entries < 5) return 0.0; */
2267 z = (zs - 50.0)/10.0;
2269 if (z > ZS_MAX ) return 0.0;
2271 if (entries < 1) entries = db.entries;
2273 if (dnaseq == SEQT_DNA || dnaseq == SEQT_RNA) {
2274 k = (double)db.length /(double)n1;
2276 k += ((double)db.carry * (double)LONG_MAX)/(double)n1;
2279 else k = (double)entries;
2281 if (k < 1.0) k = 1.0;
2287 return k * (e > .01 ? 1.0 - exp(-e) : e);
2289 return k * erfc(z/M_SQRT2)/2.0;
2294 double np_to_z(double, int *);
2297 /* computes E-value for a given z value, assuming extreme value distribution */
2299 E_to_zs(double E, long entries)
2304 e = E/(double)entries;
2307 z = (log(e)+EULER_G)/(- PI_SQRT6);
2310 z = np_to_z(1.0-e,&error);
2312 if (!error) return z*10.0+50.0;
2317 /* computes 1.0 - E value for a given z value, assuming extreme value
2320 zs_to_Ec(double zs, long entries)
2324 if (entries < 5) return 0.0;
2326 z = (zs - 50.0)/10.0;
2328 if (z > ZS_MAX) return 1.0;
2331 e = exp(- PI_SQRT6 * z - EULER_G);
2332 return (double)entries * (e > .01 ? exp(-e) : 1.0 - e);
2334 return (double)entries*erf(z/M_SQRT2)/2.0;
2338 /* calculate a threshold score, given an E() value and Lambda,K,H */
2341 E1_to_s(double e_val, int n0, int n1, struct pstat_str *pu) {
2342 double mp, np, a_n0, a_n0f, a_n1;
2347 a_n0f = log(pu->r_u.ag.K * a_n0 * a_n1)/pu->r_u.ag.H;
2352 if (np < 1.0) np = 1.0;
2353 if (mp < 1.0) mp = 1.0;
2355 score = (int)((log( pu->r_u.ag.K * mp * np) - log(e_val))/pu->r_u.ag.Lambda +0.5);
2356 if (score < 0) score = 0;
2360 /* no longer used; stat_str returned by process_hist
2362 summ_stats(char *s_str, struct pstat_str *pu)
2364 strcpy(s_str,f_string);
2370 double *v; int *s, n;
2376 for (gap=n/2; gap>0; gap/=2)
2377 for (i=gap; i<n; i++)
2378 for (j=i-gap; j>=0; j -= gap) {
2379 if (v[j] >= v[j+gap]) break;
2380 tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
2381 itmp = s[j]; s[j]=s[j+gap]; s[j+gap]=itmp;
2386 void s_sort (double **ptr, int nbest)
2391 for (gap = nbest/2; gap > 0; gap /= 2)
2392 for (i = gap; i < nbest; i++)
2393 for (j = i - gap; j >= 0; j-= gap) {
2394 if (*ptr[j] >= *ptr[j + gap]) break;
2396 ptr[j] = ptr[j + gap];
2402 void ss_sort (int *ptr, int n)
2407 for (gap = n/2; gap > 0; gap /= 2)
2408 for (i = gap; i < n; i++)
2409 for (j = i - gap; j >= 0; j-= gap) {
2410 if (ptr[j] >= ptr[j + gap]) break;
2412 ptr[j] = ptr[j + gap];
2418 void s2_sort (struct s2str *ptr, int n)
2423 for (gap = n/2; gap > 0; gap /= 2)
2424 for (i = gap; i < n; i++)
2425 for (j = i - gap; j >= 0; j-= gap) {
2426 if (ptr[j].s >= ptr[j + gap].s) break;
2429 ptr[j].s = ptr[j + gap].s;
2430 ptr[j].n = ptr[j + gap].n;
2431 ptr[j + gap].s = tmp.s;
2432 ptr[j + gap].n = tmp.n;
2436 void last_stats() {}
2439 scale_scores(struct beststr **bptr, int nbest, struct db_str db,
2440 struct pstruct pst, struct pstat_str *rs)
2445 if (pst.zsflag < 0 || pst.zsflag_f < 0) return;
2447 for (i=0; i<nbest; i++) {
2448 zscore = find_zp(bptr[i]->score[pst.score_ix], bptr[i]->escore,
2449 bptr[i]->n1,bptr[i]->comp,rs);
2450 bptr[i]->zscore = zscore;
2452 =zs_to_E(zscore,bptr[i]->n1,pst.dnaseq, pst.zdb_size,db);
2454 sortbeste(bptr,nbest);
2458 /* ALGORITHM AS241 APPL. STATIST. (1988) VOL. 37, NO. 3
2460 Produces the normal deviate Z corresponding to a given lower
2461 tail area of P; Z is accurate to about 1 part in 10**16.
2463 The hash sums below are the sums of the mantissas of the
2464 coefficients. They are included for use in checking
2468 double np_to_z(double p, int *fault) {
2470 double q, r, ppnd16;
2472 double zero = 0.0, one = 1.0, half = 0.5;
2473 double split1 = 0.425, split2 = 5.0;
2474 double const1 = 0.180625, const2 = 1.6;
2476 /* Coefficients for P close to 0.5 */
2478 double a0 = 3.3871328727963666080e0;
2479 double a1 = 1.3314166789178437745e+2;
2480 double a2 = 1.9715909503065514427e+3;
2481 double a3 = 1.3731693765509461125e+4;
2482 double a4 = 4.5921953931549871457e+4;
2483 double a5 = 6.7265770927008700853e+4;
2484 double a6 = 3.3430575583588128105e+4;
2485 double a7 = 2.5090809287301226727e+3;
2486 double b1 = 4.2313330701600911252e+1;
2487 double b2 = 6.8718700749205790830e+2;
2488 double b3 = 5.3941960214247511077e+3;
2489 double b4 = 2.1213794301586595867e+4;
2490 double b5 = 3.9307895800092710610e+4;
2491 double b6 = 2.8729085735721942674e+4;
2492 double b7 = 5.2264952788528545610e+3;
2494 double sum_ab= 55.8831928806149014439;
2496 Coefficients for P not close to 0, 0.5 or 1.
2499 double c0 = 1.42343711074968357734;
2500 double c1 = 4.63033784615654529590;
2501 double c2 = 5.76949722146069140550;
2502 double c3 = 3.64784832476320460504;
2503 double c4 = 1.27045825245236838258;
2504 double c5 = 2.41780725177450611770e-1;
2505 double c6 = 2.27238449892691845833e-2;
2506 double c7 = 7.74545014278341407640e-4;
2507 double d1 = 2.05319162663775882187;
2508 double d2 = 1.67638483018380384940;
2509 double d3 = 6.89767334985100004550e-1;
2510 double d4 = 1.48103976427480074590e-1;
2511 double d5 = 1.51986665636164571966e-2;
2512 double d6 = 5.47593808499534494600e-4;
2513 double d7 = 1.05075007164441684324e-9;
2515 double sum_cd=49.33206503301610289036;
2517 Coefficients for P near 0 or 1.
2519 double e0 = 6.65790464350110377720e0;
2520 double e1 = 5.46378491116411436990e0;
2521 double e2 = 1.78482653991729133580e0;
2522 double e3 = 2.96560571828504891230e-1;
2523 double e4 = 2.65321895265761230930e-2;
2524 double e5 = 1.24266094738807843860e-3;
2525 double e6 = 2.71155556874348757815e-5;
2526 double e7 = 2.01033439929228813265e-7;
2527 double f1 = 5.99832206555887937690e-1;
2528 double f2 = 1.36929880922735805310e-1;
2529 double f3 = 1.48753612908506148525e-2;
2530 double f4 = 7.86869131145613259100e-4;
2531 double f5 = 1.84631831751005468180e-5;
2532 double f6 = 1.42151175831644588870e-7;
2533 double f7 = 2.04426310338993978564e-15;
2535 double sum_ef=47.52583317549289671629;
2537 double sum_tmp = 0.0;
2540 sum_tmp = a0+a1+a2+a3+a4+a5+a6+a7+b1+b2+b3+b4+b5+b6+b7;
2541 if (fabs(sum_tmp - sum_ab) > 1e-12) {
2542 fprintf (stderr," sum_ab error: %lg %lg\n",sum_tmp,sum_ab);
2547 sum_tmp = c0+c1+c2+c3+c4+c5+c6+c7+d1+d2+d3+d4+d5+d6+d7;
2548 if (fabs(sum_tmp - sum_cd) > 1e-12) {
2549 fprintf (stderr," sum_cd error: %lg %lg\n",sum_tmp,sum_cd);
2553 sum_tmp = e0+e1+e2+e3+e4+e5+e6+e7+f1+f2+f3+f4+f5+f6+f7;
2554 if (fabs(sum_tmp - sum_ef) > 1e-12) {
2555 fprintf (stderr," sum_ef error: %lg %lg\n",sum_tmp,sum_ef);
2563 if (fabs(q) <= split1) {
2565 return q * (((((((a7 * r + a6) * r + a5) * r + a4) * r + a3)
2566 * r + a2) * r + a1) * r + a0) /
2567 (((((((b7 * r + b6) * r + b5) * r + b4) * r + b3)
2568 * r + b2) * r + b1) * r + one);
2571 r = (q < zero) ? p : one - p;
2579 ppnd16 = (((((((c7 * r + c6) * r + c5) * r + c4) * r + c3)
2580 * r + c2) * r + c1) * r + c0) /
2581 (((((((d7 * r + d6) * r + d5) * r + d4) * r + d3)
2582 * r + d2) * r + d1) * r + one);
2586 ppnd16 = (((((((e7 * r + e6) * r + e5) * r + e4) * r + e3)
2587 * r + e2) * r + e1) * r + e0) /
2588 (((((((f7 * r + f6) * r + f5) * r + f4) * r + f3)
2589 * r + f2) * r + f1) * r + one);
2591 if (q < zero) return -ppnd16;