binaries/src/fasta34/scaleswn.c

   1 /* scaleswn.c */
   2
   3 /* $Name: fa_34_26_5 $ - $Id: scaleswn.c,v 1.60 2007/04/26 18:32:48 wrp Exp $ */
   4
   5 /* as of 24 Sept, 2000 - scaleswn uses no global variables */
   6
   7 /*
   8         Provide statistical estimates using an extreme value distribution
   9
  10         copyright (c) 1995, 1996, 2000 William R. Pearson
  11
  12         This code provides multiple methods for scaling sequence
  13         similarity scores to correct for length effects.
  14
  15         Currently, six methods are available:
  16
  17         pst.zsflag = 0 - no scaling  (AVE_STATS)
  18         pst.zsflag = 1 - regression-scaled scores (REG_STATS)
  19         pst.zsflag = 2 - (revised) MLE Lmabda/K scaled scores (MLE_STATS)
  20         pst.zsflag = 3 - scaling using Altschul's parameters (AG_STATS)
  21         pst.zsflag = 4 - regression-scaled with iterative outlier removal (REGI_STATS)
  22         pst.zsflag = 5 = like 1, but length scaled variance (REG2_STATS)
  23         pst.zsflag = 6 = like 2, but uses lambda composition/scale (MLE2_STATS)
  24         pst.zsflag = 11 = 10 + 1 - use random shuffles, method 1
  25 */
  26
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <math.h>
  30 #include <string.h>
  31
  32 #include <limits.h>
  33
  34 #include "defs.h"
  35 #include "param.h"
  36 #include "structs.h"
  37 #ifndef PCOMPLIB
  38 #include "mw.h"
  39 #else
  40 #include "p_mw.h"
  41 #endif
  42
  43 #define MAXHIST 50
  44 #define MAX_LLEN 200
  45 #define LHISTC 5
  46 #define VHISTC 5
  47 #define MAX_SSCORE 300
  48
  49 #define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */
  50
  51 #define LN_FACT 10.0
  52 #ifndef M_LN2
  53 #define M_LN2 0.69314718055994530942
  54 #endif
  55 #define EULER_G 0.57721566490153286060
  56 #define PI_SQRT6 1.28254983016186409554
  57
  58 #ifndef M_SQRT2
  59 #define M_SQRT2 1.41421356237
  60 #endif
  61 #define LN200 5.2983173666
  62 #define ZS_MAX 400.0    /* used to prevent underflow on some machines */
  63 #define TOLERANCE 1.0e-12
  64 #define TINY 1.0e-6
  65
  66 /* used by AVE_STATS, REG_STATS, REGI_STATS, REG2_STATS*/
  67 struct rstat_str {
  68   double rho, rho_e, mu, mu_e, mean_var, var_e;  /* ?_e:std. error of ? */
  69 /* used by REG2_STATS */
  70   double rho2, mu2, var_cutoff;
  71   int n_trimmed; /* excluded because of high z-score */
  72   int n1_trimmed, nb_trimmed, nb_tot; /* excluded because of bin */
  73 };
  74
  75 /* used by AG_STATS, MLE_STATS */
  76 struct ag_stat_str {
  77   double K, Lambda, H, a_n0f, a_n0;
  78 };
  79
  80 /* used by MLE2_STATS */
  81 struct mle2_stat_str {
  82   double a_n0;
  83   double mle2_a0, mle2_a1, mle2_a2, mle2_b1;
  84   double ave_comp, max_comp, ave_H;
  85 };
  86
  87 struct pstat_str {
  88   double ngLambda, ngK, ngH;
  89   union {
  90     struct rstat_str rg;
  91     struct ag_stat_str ag;
  92     struct mle2_stat_str m2;
  93   } r_u;
  94 };
  95
  96 #define AVE_STATS 0     /* no length effect, only mean/variance */
  97 double find_zn(int score, double escore, int len, double comp, struct pstat_str *);
  98
  99 int proc_hist_n(struct stat_str *sptr, int n,
 100                 struct pstruct pst, struct hist_str *histp, int do_trim,
 101                 struct pstat_str *);
 102
 103 #define REG_STATS 1     /* length-regression scaled */
 104 #define REGI_STATS 4    /* length regression, iterative */
 105 double find_zr(int score, double escore, int len, double comp, struct pstat_str *);
 106 int proc_hist_r(struct stat_str *sptr, int n,
 107                 struct pstruct pst, struct hist_str *histp,
 108                 int do_trim, struct pstat_str *pu);
 109
 110 #define MLE_STATS 2     /* MLE for lambda, K */
 111 double find_ze(int score, double escore, int len, double comp, struct pstat_str *);
 112 int proc_hist_ml(struct stat_str *sptr, int n,
 113                  struct pstruct pst, struct hist_str *histp, int do_trim,
 114                  struct pstat_str *);
 115
 116 #define AG_STATS 3      /* Altschul-Gish parameters */
 117 double find_za(int score, double escore, int len, double comp, struct pstat_str *);
 118 int proc_hist_a(struct stat_str *sptr, int n,
 119                 struct pstruct pst, struct hist_str *histp, int do_trim,
 120                 struct pstat_str *);
 121
 122 #define REG2_STATS 5    /* length regression on mean + variance */
 123 double find_zr2(int score, double escore, int len, double comp, struct pstat_str *);
 124 int proc_hist_r2(struct stat_str *sptr, int n,
 125                  struct pstruct pst, struct hist_str *histp, int do_trim,
 126                  struct pstat_str *);
 127
 128 #define MLE2_STATS 6    /* MLE stats using comp(lambda) */
 129 double find_ze2(int score, double escore, int length, double comp, struct pstat_str *);
 130 int proc_hist_ml2(struct stat_str *sptr, int n,
 131                   struct pstruct pst, struct hist_str *histp, int do_trim,
 132                   struct pstat_str *);
 133
 134 #ifdef USE_LNSTATS
 135 #define LN_STATS 2
 136 double find_zl(int score, double escore, int len, double comp, struct pstat_str *);
 137 int proc_hist_ln(struct stat_str *sptr, int n,
 138                  struct pstruct pst, struct hist_str *histp, int do_trim,
 139                  struct pstat_str *);
 140 #endif
 141
 142 /* scaleswn.c local variables that belong in their own structure */
 143
 144 double (*find_zp)(int score, double escore, int len, double comp, struct pstat_str *) = &find_zr;
 145
 146 /* void s_sort (double **ptr, int nbest); */
 147 void ss_sort ( int *sptr, int n);
 148
 149 struct llen_str {
 150   int min, max;
 151   int max_score, min_score;
 152   int *hist;
 153   double *score_sums, *score2_sums;
 154   double *score_var;
 155   int max_length, min_length, zero_s;
 156   int fit_flag;
 157 };
 158
 159 static void inithist(struct llen_str *, struct pstruct, int);
 160 static void free_hist( struct llen_str *);
 161 static void addhist(struct llen_str *, int, int, int);
 162 static void prune_hist(struct llen_str *, int, int, int, long *);
 163 void inithistz(int, struct hist_str *histp);
 164 void addhistz(double zs, struct hist_str *histp);
 165 void addhistzp(double zs, struct hist_str *histp);
 166
 167 static void fit_llen(struct llen_str *, struct rstat_str *);
 168 static void fit_llen2(struct llen_str *, struct rstat_str *);
 169 static void fit_llens(struct llen_str *, struct rstat_str *);
 170
 171 extern void sortbeste(struct beststr **bptr, int nbest);
 172
 173 /* void set_db_size(int, struct db_str *, struct hist_str *); */
 174
 175 #ifdef DEBUG
 176 FILE *tmpf;
 177 #endif
 178
 179 int
 180 process_hist(struct stat_str *sptr, int nstats,
 181              struct mngmsg m_msg,
 182              struct pstruct pst,
 183              struct hist_str *histp,
 184              struct pstat_str **ps_sp,
 185              int do_hist)
 186 {
 187   int zsflag, do_trim, i;
 188   struct pstat_str *ps_s;
 189
 190   if (pst.zsflag < 0) {
 191     *ps_sp = NULL;
 192     return pst.zsflag;
 193   }
 194
 195   if (*ps_sp == NULL) {
 196     if ((ps_s=(struct pstat_str *)calloc(1,sizeof(struct pstat_str)))==NULL) {
 197       fprintf(stderr," cannot allocate pstat_union: %ld\n",sizeof(struct pstat_str));
 198       exit(1);
 199     }
 200     else *ps_sp = ps_s;
 201   }
 202   else {
 203     ps_s = *ps_sp;
 204     memset(ps_s,0,sizeof(struct pstat_str));
 205   }
 206
 207   ps_s->ngLambda = m_msg.Lambda;
 208   ps_s->ngK = m_msg.K;
 209   ps_s->ngH = m_msg.H;
 210
 211   if (nstats < 10) pst.zsflag = AG_STATS;
 212
 213   zsflag = pst.zsflag;
 214
 215 /*
 216 #ifdef DEBUG
 217   if (pst.debug_lib) {
 218     tmpf=fopen("tmp_stats.res","w+");
 219     for (i=0; i<nstats; i++) fprintf(tmpf,"%d\t%d\n",sptr[i].score,sptr[i].n1);
 220     fclose(tmpf);
 221   }
 222 #endif
 223 */
 224
 225   if (zsflag >= 10) {
 226     zsflag -= 10;
 227     do_trim = 0;
 228   }
 229   else do_trim = 1;
 230
 231 #ifdef USE_LNSCALE
 232   if (zsflag==LN_STATS) {
 233     find_zp = &find_zl;
 234     pst.zsflag = proc_hist_ln(sptr, nstats, histp, do_trim, ps_s);
 235   }
 236 #else
 237   if (zsflag==MLE_STATS) {
 238     find_zp = &find_ze;
 239     pst.zsflag = proc_hist_ml(sptr, nstats, pst, histp, do_trim, ps_s);
 240   }
 241 #endif
 242   else if (zsflag==REG_STATS) {
 243     find_zp = &find_zr;
 244     pst.zsflag = proc_hist_r(sptr, nstats,pst, histp, do_trim,  ps_s);
 245   }
 246   else if (zsflag==AG_STATS) {
 247     find_zp = &find_za;
 248     pst.zsflag = proc_hist_a(sptr, nstats, pst, histp, do_trim,  ps_s);
 249   }
 250   else if (zsflag==REGI_STATS) {
 251     find_zp = &find_zr;
 252     pst.zsflag = proc_hist_r2(sptr,nstats, pst, histp, do_trim,  ps_s);
 253   }
 254   else if (zsflag==REG2_STATS) {
 255     find_zp = &find_zr2;
 256     pst.zsflag = proc_hist_r(sptr,nstats,pst, histp, do_trim,  ps_s);
 257   }
 258 #if !defined(TFAST) && !defined(FASTX)
 259   else if (zsflag == MLE2_STATS) {
 260     find_zp = &find_ze2;
 261     pst.zsflag = proc_hist_ml2(sptr, nstats, pst, histp, do_trim,  ps_s);
 262   }
 263 #endif
 264   else {        /* AVE_STATS */
 265     find_zp = &find_zn;
 266     pst.zsflag = proc_hist_n(sptr,nstats, pst, histp, do_trim,  ps_s);
 267   }
 268
 269   if (!do_hist) {
 270     histp->entries = nstats; /* db->entries = 0; */
 271     inithistz(MAXHIST, histp);
 272     for (i = 0; i < nstats; i++) {
 273       if (sptr[i].n1 < 0) sptr[i].n1 = -sptr[i].n1;
 274       addhistz(find_zp(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp,ps_s),
 275                histp);
 276     }
 277   }
 278   return pst.zsflag;
 279 }
 280
 281 int
 282 calc_thresh(struct pstruct pst, int nstats,
 283             double Lambda, double K, double H, double *zstrim)
 284 {
 285   int max_hscore;
 286   double ave_n1, tmp_score, z, l_fact;
 287
 288   if (pst.dnaseq == SEQT_DNA || pst.dnaseq == SEQT_RNA) {
 289     ave_n1 = 5000.0;
 290     l_fact = 1.0;
 291   }
 292   else {
 293     ave_n1 = 400.0;
 294     l_fact = 0.7;
 295   }
 296
 297 /*  max_hscore = MAX_SSCORE; */
 298 /*  mean expected for pst.n0 * 400 for protein, 5000 for DNA */
 299 /*  we want a number of offsets that is appropriate for the database size so
 300     far (nstats)
 301 */
 302
 303 /*
 304   the calculation below sets a high-score threshold using an
 305   ungapped lambda, but errs towards the high-score side by using
 306   E()=0.001 and calculating with 0.70*lambda, which is the correct for
 307   going from ungapped to -12/-2 gapped lambda with BLOSUM50
 308 */
 309
 310 #ifndef NORMAL_DIST
 311   tmp_score = 0.01/((double)nstats*K*(double)pst.n0*ave_n1);
 312   tmp_score = -log(tmp_score)/(Lambda*l_fact);
 313   max_hscore = (int)(tmp_score+0.5);
 314
 315   z = 1.0/(double)nstats;
 316   z = (log(z)+EULER_G)/(- PI_SQRT6);
 317 #else
 318   max_hscore = 100;
 319   z = 5.0;
 320 #endif
 321   *zstrim = 10.0*z+50.0;
 322   return max_hscore;
 323 }
 324
 325 int
 326 proc_hist_r(struct stat_str *sptr, int nstats,
 327             struct pstruct pst, struct hist_str *histp,
 328             int do_trim, struct pstat_str *pu)
 329 {
 330   int i, max_hscore;
 331   double zs, ztrim;
 332   char s_string[128];
 333   struct llen_str llen;
 334   char *f_string;
 335   llen.fit_flag=1;
 336   llen.hist=NULL;
 337
 338   max_hscore = calc_thresh(pst, nstats, pu->ngLambda,
 339                            pu->ngK, pu->ngH, &ztrim);
 340
 341   inithist(&llen,pst,max_hscore);
 342
 343   f_string = &(histp->stat_info[0]);
 344
 345   for (i = 0; i<nstats; i++)
 346     addhist(&llen,sptr[i].score,sptr[i].n1, max_hscore);
 347
 348   if ((llen.max_score - llen.min_score) < 10) {
 349     free_hist(&llen);
 350     llen.fit_flag = 0;
 351     find_zp = &find_zn;
 352     return proc_hist_n(sptr, nstats, pst, histp, do_trim, pu);
 353   }
 354
 355   fit_llen(&llen, &(pu->r_u.rg)); /* now we have rho, mu, rho2, mu2, mean_var
 356                                  to set the parameters for the histogram */
 357
 358   if (!llen.fit_flag) { /* the fit failed, fall back to proc_hist_ml */
 359     free_hist(&llen);
 360     find_zp = &find_ze;
 361     return proc_hist_ml(sptr,nstats, pst, histp, do_trim, pu);
 362   }
 363
 364   pu->r_u.rg.n_trimmed= pu->r_u.rg.n1_trimmed = pu->r_u.rg.nb_trimmed = 0;
 365
 366   if (do_trim) {
 367     if (llen.fit_flag) {
 368       for (i = 0; i < nstats; i++) {
 369         zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, pu);
 370         if (zs < 20.0 || zs > ztrim) {
 371           pu->r_u.rg.n_trimmed++;
 372           prune_hist(&llen,sptr[i].score,sptr[i].n1, max_hscore,
 373                      &(histp->entries));
 374         }
 375       }
 376     }
 377
 378   /*  fprintf(stderr,"Z-trimmed %d entries with z > 5.0\n", pu->r_u.rg.n_trimmed); */
 379
 380     if (llen.fit_flag) fit_llens(&llen, &(pu->r_u.rg));
 381
 382   /*   fprintf(stderr,"Bin-trimmed %d entries in %d bins\n", pu->r_u.rg.n1_trimmed,pu->r_u.rg.nb_trimmed); */
 383   }
 384
 385   free_hist(&llen);
 386
 387   /* put all the scores in the histogram */
 388
 389   if (pst.zsflag < 10) s_string[0]='\0';
 390   else if (pst.zs_win > 0)
 391     sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
 392   else strncpy(s_string,"(shuffled)",sizeof(s_string));
 393
 394   if (pst.zsflag == REG2_STATS || pst.zsflag == 10+REG2_STATS)
 395     sprintf(f_string,"%s Expectation_v fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f;\n rho2=%6.2f; mu2= %6.2f, 0's: %d Z-trim: %d  B-trim: %d in %d/%d",
 396             s_string, pu->r_u.rg.rho*LN_FACT,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e),
 397             pu->r_u.rg.rho2,pu->r_u.rg.mu2,llen.zero_s,
 398             pu->r_u.rg.n_trimmed, pu->r_u.rg.n1_trimmed, pu->r_u.rg.nb_trimmed, pu->r_u.rg.nb_tot);
 399   else
 400     sprintf(f_string,"%s Expectation_n fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f\n mean_var=%6.4f+/-%6.3f, 0's: %d Z-trim: %d  B-trim: %d in %d/%d\n Lambda= %8.6f",
 401             s_string,
 402             pu->r_u.rg.rho*LN_FACT,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e), pu->r_u.rg.mean_var,sqrt(pu->r_u.rg.var_e),
 403             llen.zero_s, pu->r_u.rg.n_trimmed, pu->r_u.rg.n1_trimmed, pu->r_u.rg.nb_trimmed, pu->r_u.rg.nb_tot,
 404             PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
 405   return REG_STATS;
 406 }
 407
 408
 409 int
 410 proc_hist_r2(struct stat_str *sptr, int nstats,
 411              struct pstruct pst, struct hist_str *histp,
 412              int do_trim, struct pstat_str *pu)
 413 {
 414   int i, nit, nprune, max_hscore;
 415   double zs, ztrim;
 416   char s_string[128];
 417   char *f_string;
 418   struct llen_str llen;
 419
 420   llen.fit_flag=1;
 421   llen.hist=NULL;
 422
 423   max_hscore = calc_thresh(pst, nstats, pu->ngLambda,
 424                            pu->ngK, pu->ngH, &ztrim);
 425
 426   inithist(&llen, pst,max_hscore);
 427   f_string = &(histp->stat_info[0]);
 428
 429   for (i = 0; i<nstats; i++)
 430     addhist(&llen,sptr[i].score,sptr[i].n1,max_hscore);
 431
 432   pu->r_u.rg.n_trimmed= pu->r_u.rg.n1_trimmed = pu->r_u.rg.nb_trimmed = 0;
 433   if (do_trim) nit = 5;
 434   else nit = 0;
 435
 436   while (nit-- > 0) {
 437     nprune = 0;
 438     fit_llen2(&llen, &(pu->r_u.rg));
 439
 440     for (i = 0; i < nstats; i++) {
 441       if (sptr[i].n1 < 0) continue;
 442       zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp,pu);
 443       if (zs < 20.0 || zs > ztrim ) {
 444         nprune++;
 445         pu->r_u.rg.n_trimmed++;
 446         prune_hist(&llen,sptr[i].score,sptr[i].n1,max_hscore,
 447                    &(histp->entries));
 448         sptr[i].n1 = -sptr[i].n1;
 449       }
 450     }
 451     /*    fprintf(stderr," %d Z-trimmed at %d\n",nprune,nit); */
 452     if (nprune < LHISTC) { break; }
 453   }
 454
 455   fit_llen(&llen, &(pu->r_u.rg));
 456
 457   free_hist(&llen);
 458
 459   if (pst.zsflag < 10) s_string[0]='\0';
 460   else if (pst.zs_win > 0)
 461     sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
 462   else strncpy(s_string,"(shuffled)",sizeof(s_string));
 463
 464   sprintf(f_string,"%s Expectation_i fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f;\n mean_var=%6.4f+/-%6.3f 0's: %d Z-trim: %d N-it: %d\n Lambda= %8.6f",
 465           s_string,
 466           pu->r_u.rg.rho*LN_FACT,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e),
 467           pu->r_u.rg.mean_var,sqrt(pu->r_u.rg.var_e),llen.zero_s,pu->r_u.rg.n_trimmed, nit,
 468           PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
 469   return REGI_STATS;
 470 }
 471
 472 /* this procedure implements Altschul's pre-calculated values for lambda, K */
 473
 474 #include "alt_parms.h"
 475
 476 int
 477 look_p(struct alt_p parm[], int gap, int ext,
 478        double *K, double *Lambda, double *H);
 479
 480 int
 481 proc_hist_a(struct stat_str *sptr, int nstats,
 482             struct pstruct pst, struct hist_str *histp,
 483             int do_trim, struct pstat_str *pu)
 484 {
 485   double Lambda, K, H;
 486   char *f_string;
 487   int r_v;
 488   int t_gdelval, t_ggapval;
 489
 490 #ifdef OLD_FASTA_GAP
 491   t_gdelval = pst.gdelval;
 492   t_ggapval = pst.ggapval;
 493 #else
 494   t_gdelval = pst.gdelval+pst.ggapval;
 495   t_ggapval = pst.ggapval;
 496 #endif
 497
 498   f_string = &(histp->stat_info[0]);
 499
 500   if (strcmp(pst.pamfile,"BL50")==0 || strcmp(pst.pamfile,"BLOSUM50")==0)
 501       r_v = look_p(bl50_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
 502   else if (strcmp(pst.pamfile,"BL62")==0 || strcmp(pst.pamfile,"BLOSUM62")==0)
 503       r_v = look_p(bl62_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
 504   else if (strcmp(pst.pamfile,"BL80")==0 || strcmp(pst.pamfile,"BLOSUM80")==0)
 505       r_v = look_p(bl80_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
 506   else if (strcmp(pst.pamfile,"P250")==0)
 507       r_v = look_p(p250_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
 508   else if (strcmp(pst.pamfile,"P120")==0)
 509       r_v = look_p(p120_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
 510   else if (strcmp(pst.pamfile,"MD_10")==0)
 511       r_v = look_p(md10_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
 512   else if (strcmp(pst.pamfile,"MD_20")==0)
 513       r_v = look_p(md20_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
 514   else if (strcmp(pst.pamfile,"MD_40")==0)
 515       r_v = look_p(md40_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
 516   else if (strcmp(pst.pamfile,"DNA")==0 || strcmp(pst.pamfile,"+5/-4")==0)
 517       r_v = look_p(nt54_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
 518   else if (strcmp(pst.pamfile,"+3/-2")==0)
 519       r_v = look_p(nt32_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
 520   else if (strcmp(pst.pamfile,"+1/-3")==0)
 521       r_v = look_p(nt13_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
 522   else r_v = 0;
 523
 524   pu->r_u.ag.Lambda = Lambda;
 525   pu->r_u.ag.K = K;
 526   pu->r_u.ag.H = H;
 527
 528   if (r_v == 0) {
 529     fprintf(stderr,"Parameters not available for: %s: %d/%d\n",
 530             pst.pamfile,t_gdelval,t_ggapval);
 531
 532     find_zp = &find_zr;
 533     return proc_hist_r(sptr, nstats,pst, histp, do_trim, pu);
 534   }
 535
 536   /*
 537     fprintf(stderr," the parameters are: Lambda: %5.3f K: %5.3f H: %5.3f\n",
 538             Lambda, K, H);
 539             */
 540
 541     pu->r_u.ag.a_n0 = (double)pst.n0;
 542     pu->r_u.ag.a_n0f = log (K * pu->r_u.ag.a_n0)/H;
 543
 544     sprintf(f_string,"Altschul/Gish params: n0: %d Lambda: %5.3f K: %5.3f H: %5.3f",
 545             pst.n0,Lambda, K, H);
 546     return AG_STATS;
 547 }
 548
 549 int
 550 ag_parm(char *pamfile, int gdelval, int ggapval, struct pstat_str *pu)
 551 {
 552   double Lambda, K, H;
 553   int r_v;
 554
 555   if (strcmp(pamfile,"BL50")==0)
 556     r_v = look_p(bl50_p,gdelval,ggapval,&K,&Lambda,&H);
 557   else if (strcmp(pamfile,"BL62")==0)
 558       r_v = look_p(bl62_p,gdelval,ggapval,&K,&Lambda,&H);
 559   else if (strcmp(pamfile,"P250")==0)
 560       r_v = look_p(p250_p,gdelval,ggapval,&K,&Lambda,&H);
 561   else if (strcmp(pamfile,"P120")==0)
 562       r_v = look_p(p120_p,gdelval,ggapval,&K,&Lambda,&H);
 563   else if (strcmp(pamfile,"MD_10")==0)
 564       r_v = look_p(md10_p,gdelval,ggapval,&K,&Lambda,&H);
 565   else if (strcmp(pamfile,"MD_20")==0)
 566       r_v = look_p(md20_p,gdelval,ggapval,&K,&Lambda,&H);
 567   else if (strcmp(pamfile,"MD_40")==0)
 568       r_v = look_p(md40_p,gdelval,ggapval,&K,&Lambda,&H);
 569   else if (strcmp(pamfile,"DNA")==0 || strcmp(pamfile,"+5/-4")==0)
 570       r_v = look_p(nt54_p,gdelval,ggapval, &K,&Lambda,&H);
 571   else if (strcmp(pamfile,"+3/-2")==0)
 572       r_v = look_p(nt32_p,gdelval,ggapval, &K,&Lambda,&H);
 573   else if (strcmp(pamfile,"+1/-3")==0)
 574       r_v = look_p(nt13_p,gdelval,ggapval, &K,&Lambda,&H);
 575   else r_v = 0;
 576
 577   pu->r_u.ag.K = K;
 578   pu->r_u.ag.Lambda = Lambda;
 579   pu->r_u.ag.H = H;
 580
 581   if (r_v == 0) {
 582     fprintf(stderr,"Parameters not available for: %s: %d/%d\n",
 583             pamfile,gdelval,ggapval);
 584     }
 585   return r_v;
 586 }
 587
 588 int
 589 look_p(struct alt_p parm[], int gap, int ext,
 590        double *K, double *Lambda, double *H)
 591 {
 592   int i;
 593
 594   gap = -gap;
 595   ext = -ext;
 596
 597   if (gap > parm[1].gap) {
 598     *K = parm[0].K;
 599     *Lambda = parm[0].Lambda;
 600     *H = parm[0].H;
 601     return 1;
 602   }
 603
 604   for (i=1; parm[i].gap > 0; i++) {
 605     if (parm[i].gap > gap) continue;
 606     else if (parm[i].gap == gap && parm[i].ext > ext ) continue;
 607     else if (parm[i].gap == gap && parm[i].ext == ext) {
 608       *K = parm[i].K;
 609       *Lambda = parm[i].Lambda;
 610       *H = parm[i].H;
 611       return 1;
 612     }
 613     else break;
 614   }
 615   return 0;
 616 }
 617
 618 /* uncensored and censored maximum likelihood estimates developed
 619    by Aaron Mackey based on a preprint from Sean Eddy */
 620
 621 int mle_cen  (struct stat_str *, int, int, double, double *, double *);
 622
 623 int
 624 proc_hist_ml(struct stat_str *sptr, int nstats,
 625              struct pstruct pst, struct hist_str *histp,
 626              int do_trim, struct pstat_str *pu)
 627 {
 628   double f_cen;
 629   char s_string[128];
 630   char *f_string;
 631
 632   f_string = &(histp->stat_info[0]);
 633   pu->r_u.ag.a_n0 = (double)pst.n0;
 634
 635   if (pst.zsflag < 10) s_string[0]='\0';
 636   else if (pst.zs_win > 0)
 637     sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
 638   else strncpy(s_string,"(shuffled)",sizeof(s_string));
 639
 640   if (!do_trim) {
 641     if (mle_cen(sptr, nstats, pst.n0, 0.0, &pu->r_u.ag.Lambda, &pu->r_u.ag.K) == -1)
 642       goto bad_mle;
 643     sprintf(f_string,"%s MLE statistics: Lambda= %6.4f;  K=%6.4g",
 644             s_string,pu->r_u.ag.Lambda,pu->r_u.ag.K);
 645   }
 646   else {
 647     if (nstats/20 > 1000) f_cen = 1000.0/(double)nstats;
 648     else f_cen = 0.05;
 649     if (mle_cen(sptr, nstats, pst.n0, f_cen, &pu->r_u.ag.Lambda, &pu->r_u.ag.K) == -1)
 650       goto bad_mle;
 651     sprintf(f_string,"MLE_cen statistics: Lambda= %6.4f;  K=%6.4g (cen=%d)",
 652             pu->r_u.ag.Lambda,pu->r_u.ag.K,(int)((double)nstats*f_cen));
 653   }
 654
 655   return MLE_STATS;
 656  bad_mle:
 657   find_zp = &find_zn;
 658
 659   return proc_hist_n(sptr, nstats, pst, histp, do_trim, pu);
 660 }
 661
 662 int
 663 mle_cen2  (struct stat_str *, int, int, double, double *, double *, double *, double *);
 664
 665
 666 int
 667 proc_hist_ml2(struct stat_str *sptr, int nstats,
 668               struct pstruct pst, struct hist_str *histp,
 669               int do_trim, struct pstat_str *pu)
 670 {
 671   int i, ns=0, nneg=0;
 672   double f_cen, ave_lambda;
 673   char s_string[128], ex_string[64];
 674   char *f_string;
 675
 676   f_string = &(histp->stat_info[0]);
 677   pu->r_u.m2.a_n0 = (double)pst.n0;
 678
 679   if (pst.zsflag < 10) s_string[0]='\0';
 680   else if (pst.zs_win > 0)
 681     sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
 682   else strncpy(s_string,"(shuffled)",sizeof(s_string));
 683
 684   pu->r_u.m2.ave_comp = 0.0;
 685   pu->r_u.m2.max_comp = -1.0;
 686
 687   ns = nneg = 0;
 688   for (i=0; i<nstats; i++) {
 689     if (sptr[i].comp > pu->r_u.m2.max_comp) pu->r_u.m2.max_comp = sptr[i].comp;
 690     if (sptr[i].comp > 0.0) {
 691       pu->r_u.m2.ave_comp += log(sptr[i].comp);
 692       ns++;
 693     }
 694     else nneg++;
 695   }
 696   pu->r_u.m2.ave_comp /= (double)ns;
 697   pu->r_u.m2.ave_comp = exp(pu->r_u.m2.ave_comp);
 698   for (i=0; i<nstats; i++) if (sptr[i].comp < 0.0) {
 699     sptr[i].comp = pu->r_u.m2.ave_comp;
 700   }
 701
 702   if (nneg > 0)
 703     sprintf(ex_string,"composition = -1 for %d sequences",nneg);
 704   else ex_string[0]='\0';
 705
 706   if (!do_trim) {
 707     if (mle_cen2(sptr, nstats, pst.n0, 0.0,
 708              &pu->r_u.m2.mle2_a0, &pu->r_u.m2.mle2_a1,
 709              &pu->r_u.m2.mle2_a2, &pu->r_u.m2.mle2_b1) == -1) goto bad_mle2;
 710     ave_lambda = 1.0/(pu->r_u.m2.ave_comp*pu->r_u.m2.mle2_b1);
 711
 712     sprintf(f_string,"%s MLE-2 statistics: a0= %6.4f;  a1=%6.4f; a2=%6.4f; b1=%6.4f\n  ave Lamdba: %6.4f",
 713             s_string, pu->r_u.m2.mle2_a0, pu->r_u.m2.mle2_a1, pu->r_u.m2.mle2_a2, pu->r_u.m2.mle2_b1,ave_lambda);
 714   }
 715   else {
 716     if (nstats/20 > 500) f_cen = 500.0/(double)nstats;
 717     else f_cen = 0.05;
 718     if (mle_cen2(sptr, nstats, pst.n0, f_cen, &pu->r_u.m2.mle2_a0, &pu->r_u.m2.mle2_a1, &pu->r_u.m2.mle2_a2, &pu->r_u.m2.mle2_b1)== -1) goto bad_mle2;
 719
 720     ave_lambda = 1.0/(pu->r_u.m2.ave_comp*pu->r_u.m2.mle2_b1);
 721
 722     sprintf(f_string,"%s MLE-2-cen statistics: a0= %6.4f;  a1=%6.4f; a2=%6.4f; b1=%6.4f (cen=%d)\n  ave Lambda:%6.4f",
 723             s_string, pu->r_u.m2.mle2_a0, pu->r_u.m2.mle2_a1, pu->r_u.m2.mle2_a2, pu->r_u.m2.mle2_b1, (int)((double)nstats*f_cen),ave_lambda);
 724   }
 725
 726   return MLE2_STATS;
 727  bad_mle2:
 728   find_zp = &find_zn;
 729   return proc_hist_n(sptr, nstats, pst, histp, do_trim, pu);
 730 }
 731
 732 double first_deriv_cen(double lambda, struct stat_str *sptr,
 733                        int start, int stop,
 734                        double sumlenL, double cenL,
 735                        double sumlenH, double cenH);
 736
 737 double second_deriv_cen(double lambda, struct stat_str *sptr,
 738                         int start, int stop,
 739                         double sumlenL, double cenL,
 740                         double sumlenH, double cenH);
 741
 742 static void
 743 st_sort (struct stat_str *v, int n) {
 744    int gap, i, j;
 745    int tmp;
 746
 747    for (gap = 1; gap < n/3; gap = 3*gap +1) ;
 748
 749    for (; gap > 0; gap = (gap-1)/3)
 750       for (i = gap; i < n; i++)
 751          for (j = i - gap; j >= 0; j -= gap) {
 752            if (v[j].score <= v[j + gap].score) break;
 753
 754            tmp = v[j].score;
 755            v[j].score = v[j + gap].score;
 756            v[j + gap].score = tmp;
 757
 758            tmp = v[j].n1;
 759            v[j].n1 = v[j + gap].n1;
 760            v[j + gap].n1 = tmp;
 761          }
 762 }
 763
 764 /* sptr[].score, sptr[].n1; sptr[] must be sorted
 765    int n = total number of samples
 766    int M = length of query
 767    double fn = fraction of scores to be censored fn/2.0 from top, bottom
 768    double *Lambda = Lambda estimate
 769    double *K = K estimate
 770 */
 771
 772 #define MAX_NIT 100
 773
 774 int
 775 mle_cen(struct stat_str *sptr, int n, int M, double fc,
 776         double *Lambda, double *K) {
 777
 778   double sumlenL, sumlenH, cenL, cenH;
 779   double sum_s, sum2_s, mean_s, var_s, dtmp;
 780   int start, stop;
 781   int i, nf;
 782   int nit = 0;
 783   double deriv, deriv2, lambda, old_lambda, sum = 0.0;
 784   /*
 785    int sumlenL, int sumlenghtsR = sum of low (Left), right (High) seqs.
 786    int cenL, cenH = censoring score low, high
 787   */
 788
 789   nf = (fc/2.0) * n;
 790   start = nf;
 791   stop = n - nf;
 792
 793   st_sort(sptr,n);
 794
 795   sum_s = sum2_s = 0.0;
 796   for (i=start; i<stop; i++) {
 797     sum_s += sptr[i].score;
 798   }
 799   dtmp = (double)(stop-start);
 800   mean_s = sum_s/dtmp;
 801
 802   for (i=start; i<stop; i++) {
 803     sum2_s += sptr[i].score * sptr[i].score;
 804   }
 805   var_s = sum2_s/(dtmp-1.0);
 806
 807   sumlenL = sumlenH = 0.0;
 808   for (i=0; i<start; i++) sumlenL += (double)sptr[i].n1;
 809   for (i=stop; i<n; i++) sumlenH += (double)sptr[i].n1;
 810
 811   if (nf > 0) {
 812     cenL = (double)sptr[start].score;
 813     cenH = (double)sptr[stop].score;
 814   }
 815   else {
 816     cenL = (double)sptr[start].score/2.0;
 817     cenH = (double)sptr[start].score*2.0;
 818   }
 819
 820   if (cenL >= cenH) return -1;
 821
 822   /* initial guess for lambda is 0.2 - this does not work for matrices
 823      with very different scales */
 824   /*  lambda = 0.2; */
 825   lambda = PI_SQRT6/sqrt(var_s);
 826   if (lambda > 1.0) {
 827     fprintf(stderr," Lambda initial estimate error: lambda: %6.4g; var_s: %6.4g\n",lambda,var_s);
 828     lambda = 0.2;
 829   }
 830
 831   do {
 832     deriv = first_deriv_cen(lambda, sptr, start, stop,
 833                             sumlenL, cenL, sumlenH, cenH);
 834     /*   (uncensored version)
 835          first_deriv(lambda, &sptr[start], stop - start))
 836     */
 837
 838     /*  (uncensored version)
 839     deriv2 = second_deriv(lambda, &sptr[start], stop-start);
 840     */
 841     deriv2 = second_deriv_cen(lambda, sptr, start, stop,
 842                              sumlenL, cenL, sumlenH, cenH);
 843
 844     old_lambda = lambda;
 845     if (lambda - deriv/deriv2 > 0.0) lambda = lambda - deriv/deriv2;
 846     else lambda = lambda/2.0;
 847     nit++;
 848   } while (fabs((lambda - old_lambda)/lambda) > TINY && nit < MAX_NIT);
 849
 850   /*  fprintf(stderr," mle_cen nit: %d\n",nit); */
 851
 852   if (nit >= MAX_NIT) return -1;
 853
 854   for(i = start; i < stop ; i++) {
 855     sum += (double) sptr[i].n1 * exp(- lambda * (double)sptr[i].score);
 856   }
 857
 858   *Lambda = lambda;
 859   /*
 860   *K = (double)(stop-start)/((double)M*sum);
 861   */
 862   *K = (double)n/((double)M*
 863                   (sum+sumlenL*exp(-lambda*cenL)-sumlenH*exp(-lambda*cenH)));
 864   return 0;
 865 }
 866
 867 /*
 868 double
 869 first_deriv(double lambda, struct stat_str *sptr, int n) {
 870
 871   int i;
 872   double sum = 0.0, sum1 = 0.0, sum2 = 0.0;
 873   double s, l, es;
 874
 875   for(i = 0 ; i < n ; i++) {
 876     s = (double)sptr[i].score;
 877     l = (double)sptr[i].n1;
 878     es = exp(-lambda * s );
 879     sum += s;
 880     sum2 += l * es;
 881     sum1 += s * l * es;
 882   }
 883
 884   return (1.0/lambda) - (sum/(double)n) + (sum1/sum2);
 885 }
 886 */
 887
 888 /*
 889 double
 890 second_deriv(double lambda, struct stat_str *sptr, int n) {
 891   double sum1 = 0.0, sum2 = 0.0, sum3 = 0.0;
 892   double s, l, es;
 893   int i;
 894
 895   for(i = 0 ; i < n ; i++) {
 896     l = (double)sptr[i].n1;
 897     s = (double)sptr[i].score;
 898     es = exp(-lambda * s);
 899     sum2 += l * es;
 900     sum1 += l * s * es;
 901     sum3 += l * s * s * es;
 902   }
 903
 904   return ((sum1*sum1)/(sum2*sum2)) - (sum3/sum2) -  (1.0/(lambda*lambda));
 905 }
 906 */
 907
 908 double
 909 first_deriv_cen(double lambda, struct stat_str *sptr, int start, int stop,
 910                 double sumlenL, double cenL, double sumlenH, double cenH) {
 911   int i;
 912   double sum = 0.0, sum1 = 0.0, sum2 = 0.0;
 913   double s, l, es;
 914
 915   for(i = start ; i < stop ; i++) {
 916     s = (double)sptr[i].score;
 917     l = (double)sptr[i].n1;
 918     es = exp(-lambda * s );
 919     sum += s;
 920     sum2 += l * es;
 921     sum1 += s * l * es;
 922   }
 923
 924   sum1 += sumlenL*cenL*exp(-lambda*cenL) - sumlenH*cenH*exp(-lambda*cenH);
 925   sum2 += sumlenL*exp(-lambda*cenL) - sumlenH*exp(-lambda*cenH);
 926
 927   return (1.0 / lambda) - (sum /(double)(stop-start)) + (sum1 / sum2);
 928 }
 929
 930 double
 931 second_deriv_cen(double lambda, struct stat_str *sptr, int start, int stop,
 932                  double sumlenL, double cenL, double sumlenH, double cenH) {
 933
 934   double sum1 = 0.0, sum2 = 0.0, sum3 = 0.0;
 935   double s, l, es;
 936   int i;
 937
 938   for(i = start ; i < stop ; i++) {
 939     s = (double)sptr[i].score;
 940     l = (double)sptr[i].n1;
 941     es = exp(-lambda * s);
 942     sum2 += l * es;
 943     sum1 += l * s * es;
 944     sum3 += l * s * s * es;
 945   }
 946
 947   sum1 += sumlenL*cenL*exp(-lambda*cenL) - sumlenH*cenH*exp(-lambda*cenH);
 948   sum2 += sumlenL*exp(-lambda * cenL) -  sumlenH*exp(-lambda * cenH);
 949   sum3 += sumlenL*cenL*cenL * exp(-lambda * cenL) -
 950     sumlenH*cenH*cenH * exp(-lambda * cenH);
 951   return ((sum1 * sum1) / (sum2 * sum2)) - (sum3 / sum2)
 952     - (1.0 / (lambda * lambda));
 953 }
 954
 955 double mle2_func(double *params,
 956                  double *consts,
 957                  struct stat_str *values,
 958                  int n, int start, int stop);
 959
 960 void simplex(double *fitparams,
 961              double *lambda,
 962              int nparam,
 963              double (*minfunc) (double *tryparams, double *consts,
 964                                 struct stat_str *data, int ndata,
 965                                 int start, int stop),
 966              double *consts,
 967              void *data,
 968              int ndata, int start, int stop
 969              );
 970
 971 int
 972 mle_cen2(struct stat_str *sptr, int n, int M, double fc,
 973         double *a0, double *a1, double *a2, double *b1) {
 974
 975   double params[4], lambdas[4], consts[9];
 976   double avglenL, avglenH, avgcompL, avgcompH, cenL, cenH;
 977   int start, stop;
 978   int i, nf;
 979
 980   nf = (fc/2.0) * n;
 981   start = nf;
 982   stop = n - nf;
 983
 984   st_sort(sptr,n);
 985
 986   /* choose arithmetic or geometic mean for compositions by appropriate commenting */
 987
 988   if (nf > 0) {
 989     avglenL = avglenH = 0.0;
 990     avgcompL = avgcompH = 0.0;
 991     /* avgcompL = avgcompH = 1.0 */
 992     for (i=0; i<start; i++) {
 993       avglenL += (double)sptr[i].n1;
 994       avgcompL += (double)sptr[i].comp;
 995       /* avgcompL *= (double) sptr[i].comp; */
 996     }
 997     avglenL /= (double) start;
 998     avgcompL /= (double) start;
 999     /* avgcompL = pow(avgcompL, 1.0/(double) start); */
1000
1001     for (i=stop; i<n; i++) {
1002       avglenH += (double)sptr[i].n1;
1003       avgcompH += (double)sptr[i].comp;
1004       /* avgcompH *= (double) sptr[i].comp; */
1005     }
1006     avglenH /= (double) (n - stop);
1007     avgcompH /= (double) (n - stop);
1008     /* avgcompL = pow(avgcompL, 1.0/(double) (n - stop)); */
1009
1010     cenL = (double)sptr[start].score;
1011     cenH = (double)sptr[stop].score;
1012     if (cenL >= cenH) return -1;
1013   }
1014   else {
1015     avglenL = avglenH = cenL = cenH = 0.0;
1016     avgcompL = avgcompH = 1.0;
1017   }
1018
1019   params[0] = 10.0;
1020   params[1] = -10.0;
1021   params[2] = 1.0;
1022   params[3] = 1.0;
1023
1024   lambdas[0] = 1.0;
1025   lambdas[1] = 0.5;
1026   lambdas[2] = 0.1;
1027   lambdas[3] = 0.01;
1028
1029   consts[0] = M;
1030   consts[1] = (double) start;
1031   consts[2] = (double) stop;
1032   consts[3] = cenL;
1033   consts[4] = cenH;
1034   consts[5] = avglenL;
1035   consts[6] = avglenH;
1036   consts[7] = avgcompL;
1037   consts[8] = avgcompH;
1038
1039   simplex(params, lambdas, 4,
1040           (double (*) (double *, double *, struct stat_str *, int, int, int) )mle2_func,
1041           consts, sptr, n, start, stop);
1042
1043   *a0 = params[0];
1044   *a1 = params[1];
1045   *a2 = params[2];
1046   *b1 = params[3];
1047
1048   return 0;
1049 }
1050
1051 double mle2_func(double *params,
1052                  double *consts,
1053                  struct stat_str *values,
1054                  int n, int start, int stop
1055                  ) {
1056
1057   double a0, a1, a2, b1, M;
1058   double score, length, comp;
1059   double cenL, cenH, avglenL, avglenH, avgcompL, avgcompH;
1060   double L, y;
1061
1062   int i;
1063
1064   a0 = params[0];
1065   a1 = params[1];
1066   a2 = params[2];
1067   b1 = params[3];
1068
1069   M = consts[0];
1070   /*
1071   start = (int) consts[1];
1072   stop = (int) consts[2];
1073   */
1074   cenL = consts[3];
1075   cenH = consts[4];
1076   avglenL = consts[5];
1077   avglenH = consts[6];
1078   avgcompL = consts[7];
1079   avgcompH = consts[8];
1080
1081   L = 0;
1082   y = 0;
1083
1084   if (start > 0) {
1085     y = -(cenL - (a0 + a1*avgcompL +a2*avgcompL*log(M*avglenL)))/(b1*avgcompL);
1086     L += (double) start * exp(y);
1087   }
1088
1089   for(i = start ; i < stop ; i++) {
1090     score = (double) values[i].score;
1091     length = (double) values[i].n1;
1092     comp = (double) values[i].comp;
1093
1094     y = - (score - (a0 + a1*comp + a2 * comp * log(M*length))) / (b1*comp);
1095
1096     L += -y + exp(y) + log(b1 * comp);
1097   }
1098
1099   if (stop < n) {
1100     y = -(cenH -(a0 + a1*avgcompH + a2*avgcompH*log(M*avglenH)))/(b1*avgcompH);
1101     L -= (double) (n - stop) * exp(y);
1102   }
1103   return L;
1104 }
1105
1106 /* Begin Nelder-Mead simplex code: */
1107
1108 double evalfunc(double **param,
1109                 double *vals,
1110                 double *psums,
1111                 double *ptry,
1112                 int nparam,
1113                 double (*minfunc) (double *params, double *consts,
1114                                    struct stat_str *data, int ndata,
1115                                    int start, int stop),
1116                 double *consts,
1117                 void *data,
1118                 int ndata, int start, int stop,
1119                 int ihi,
1120                 double factor);
1121
1122 void simplex(double *fitparams,
1123              double *lambda,
1124              int nparam,
1125              double (*minfunc) (double *tryparams, double *consts,
1126                                 struct stat_str *data, int ndata,
1127                                 int start, int stop),
1128              double *consts,
1129              void *data,
1130              int ndata,
1131              int start,
1132              int stop
1133              )
1134 {
1135
1136   int i, j, ilo, ihi, inhi;
1137   double rtol, sum, tmp, ysave, ytry;
1138   double *psum, *vals, *ptry, **param;
1139
1140
1141   psum = (double *) calloc(nparam, sizeof(double));
1142   ptry = (double *) calloc(nparam, sizeof(double));
1143
1144   vals = (double *) calloc(nparam + 1, sizeof(double));
1145
1146   param = (double **) calloc(nparam + 1, sizeof(double *));
1147   param[0] = (double *) calloc((nparam + 1) * nparam, sizeof(double));
1148   for( i = 1 ; i < (nparam + 1) ; i++ ) {
1149     param[i] = param[0] + i * nparam;
1150   }
1151
1152   /* Get our N+1 initial parameter values for the simplex */
1153
1154   for( i = 0 ; i < nparam ; i++ ) {
1155     param[0][i] = fitparams[i];
1156   }
1157
1158   for( i = 1 ; i < (nparam + 1) ; i++ ) {
1159     for( j = 0 ; j < nparam ; j++ ) {
1160       param[i][j] = fitparams[j] + lambda[j] * ( (i - 1) == j ? 1 : 0 );
1161     }
1162   }
1163
1164   /* calculate initial values at the simplex nodes */
1165
1166   for( i = 0 ; i < (nparam + 1) ; i++ ) {
1167     vals[i] = minfunc(param[i], consts, data, ndata, start, stop);
1168   }
1169
1170   /* Begin Nelder-Mead simplex algorithm from Numerical Recipes in C */
1171
1172   for( j = 0 ; j < nparam ; j++ ) {
1173     for( sum = 0.0, i = 0 ; i < nparam + 1 ; i++ ) {
1174       sum += param[i][j];
1175     }
1176     psum[j] = sum;
1177   }
1178
1179
1180   while( 1 ) {
1181 /*
1182       determine which point is highest (ihi), next highest (inhi) and
1183       lowest (ilo) by looping over the points in the simplex
1184 */
1185     ilo = 0;
1186
1187 /*  ihi = vals[0] > vals[1] ? (inhi = 1, 0) : (inhi = 0, 1); */
1188     if(vals[0] > vals[1]) { ihi = 0; inhi = 1; }
1189     else { ihi = 1; inhi = 0; }
1190
1191     for( i = 0 ; i < nparam + 1 ; i++) {
1192       if( vals[i] <= vals[ilo] ) ilo = i;
1193       if( vals[i] > vals[ihi] ) {
1194         inhi = ihi;
1195         ihi = i;
1196       } else if ( vals[i] > vals[inhi] && i != ihi ) inhi = i;
1197     }
1198
1199     /* Are we finished? */
1200
1201     rtol = 2.0 * fabs(vals[ihi] - vals[ilo]) /
1202       (fabs(vals[ihi]) + fabs(vals[ilo]) + TINY);
1203
1204     if( rtol < TOLERANCE ) {
1205
1206 /* put the best value and best parameters into the first index */
1207
1208       tmp = vals[0];
1209       vals[0] = vals[ilo];
1210       vals[ilo] = tmp;
1211
1212       for( i = 0 ; i < nparam ; i++ ) {
1213         tmp = param[0][i];
1214         param[0][i] = param[ilo][i];
1215         param[ilo][i] = tmp;
1216       }
1217
1218       /* et voila, c'est finis */
1219       break;
1220     }
1221
1222     /* Begin a new iteration */
1223
1224     /* first, extrapolate by -1 through the face of the simplex across from ihi */
1225
1226     ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
1227                     data, ndata, start, stop, ihi, -1.0);
1228
1229     if( ytry <= vals[ilo] ) {
1230
1231       /* Good result, try additional extrapolation by 2 */
1232
1233       ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
1234                       data, ndata, start, stop, ihi, 2.0);
1235
1236     } else if ( ytry >= vals[inhi] ) {
1237
1238       /* no good, look for an intermediate lower point by contracting */
1239
1240       ysave = vals[ihi];
1241       ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
1242                       data, ndata, start, stop, ihi, 0.5);
1243
1244       if( ytry >= ysave ) {
1245
1246         /* Still no good.  Contract around lowest (best) point. */
1247
1248         for( i = 0 ; i < nparam + 1 ; i++ ) {
1249           if( i != ilo ) {
1250             for ( j = 0 ; j < nparam ; j++ ) {
1251               param[i][j] = psum[j] = 0.5 * (param[i][j] + param[ilo][j]);
1252             }
1253             vals[i] = minfunc(psum, consts, data, ndata, start, stop);
1254           }
1255         }
1256
1257
1258         for( j = 0 ; j < nparam ; j++ ) {
1259           for( sum = 0.0, i = 0 ; i < nparam + 1 ; i++ ) {
1260             sum += param[i][j];
1261           }
1262           psum[j] = sum;
1263         }
1264
1265       }
1266     }
1267   }
1268
1269   for( i = 0 ; i < nparam ; i++ ) {
1270     fitparams[i] = param[0][i];
1271   }
1272
1273   if (ptry!=NULL) {
1274     free(ptry);
1275     ptry=NULL;
1276   }
1277   free(param[0]);
1278   free(param);
1279   free(vals);
1280   free(psum);
1281 }
1282
1283
1284 double evalfunc(double **param,
1285                 double *vals,
1286                 double *psum,
1287                 double *ptry,
1288                 int nparam,
1289                 double (*minfunc)(double *tryparam, double *consts,
1290                                   struct stat_str *data, int ndata,
1291                                   int start, int stop),
1292                 double *consts,
1293                 void *data,
1294                 int ndata, int start, int stop,
1295                 int ihi,
1296                 double factor) {
1297
1298   int j;
1299   double fac1, fac2, ytry;
1300
1301
1302   fac1 = (1.0 - factor) / nparam;
1303   fac2 = fac1 - factor;
1304
1305   for( j = 0 ; j < nparam ; j++ ) {
1306     ptry[j] = psum[j] * fac1 - param[ihi][j] * fac2;
1307   }
1308
1309   ytry = minfunc(ptry, consts, data, ndata, start, stop);
1310
1311   if( ytry < vals[ihi] ) {
1312     vals[ihi] = ytry;
1313     for( j = 0 ; j < nparam ; j++ ) {
1314       psum[j] += ptry[j] - param[ihi][j];
1315       param[ihi][j] = ptry[j];
1316     }
1317   }
1318
1319   return ytry;
1320 }
1321
1322 /* end of Nelder-Mead simplex code */
1323
1324 int
1325 proc_hist_n(struct stat_str *sptr, int nstats,
1326             struct pstruct pst, struct hist_str *histp,
1327             int do_trim, struct pstat_str *pu)
1328 {
1329   int i, j;
1330   double s_score, s2_score, ssd, ztrim;
1331   int nit, max_hscore;
1332   char s_string[128];
1333   char *f_string;
1334
1335   f_string = &(histp->stat_info[0]);
1336
1337   max_hscore = calc_thresh(pst, nstats, pu->ngLambda,
1338                            pu->ngK, pu->ngH, &ztrim);
1339
1340   s_score = s2_score = 0.0;
1341
1342   for ( j = 0, i = 0; i < nstats; i++) {
1343     if (sptr[i].score > 0 && sptr[i].score <= max_hscore) {
1344       s_score += (ssd=(double)sptr[i].score);
1345       s2_score += ssd * ssd;
1346       j++;
1347     }
1348   }
1349
1350   if (j > 1 ) {
1351     pu->r_u.rg.mu = s_score/(double)j;
1352     pu->r_u.rg.mean_var = s2_score - (double)j * pu->r_u.rg.mu * pu->r_u.rg.mu;
1353     pu->r_u.rg.mean_var /= (double)(j-1);
1354   }
1355   else {
1356     pu->r_u.rg.mu = 50.0;
1357     pu->r_u.rg.mean_var = 10.0;
1358   }
1359
1360   if (pu->r_u.rg.mean_var < 0.01) {
1361     pu->r_u.rg.mean_var = (pu->r_u.rg.mu > 1.0) ? pu->r_u.rg.mu: 1.0;
1362   }
1363
1364   /* now remove some scores */
1365
1366   nit = 5;
1367   while (nit-- > 0) {
1368     pu->r_u.rg.n_trimmed = 0;
1369
1370     for (i=0; i< nstats; i++) {
1371       if (sptr[i].n1 < 0) continue;
1372       ssd = find_zn(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, pu);
1373       if (ssd > ztrim || ssd < 20.0) {
1374         /*      fprintf(stderr,"removing %3d %3d %4.1f\n",
1375                 sptr[i].score, sptr[i].n1,ssd); */
1376         ssd = sptr[i].score;
1377         s_score -= ssd;
1378         s2_score -= ssd*ssd;
1379         j--;
1380         pu->r_u.rg.n_trimmed++;
1381         histp->entries--;
1382         sptr[i].n1 = -sptr[i].n1;
1383       }
1384     }
1385
1386     if (j > 1 ) {
1387       pu->r_u.rg.mu = s_score/(double)j;
1388       pu->r_u.rg.mean_var = s2_score - (double)j * pu->r_u.rg.mu * pu->r_u.rg.mu;
1389       pu->r_u.rg.mean_var /= (double)(j-1);
1390     }
1391     else {
1392       pu->r_u.rg.mu = 50.0;
1393       pu->r_u.rg.mean_var = 10.0;
1394     }
1395
1396     if (pu->r_u.rg.mean_var < 0.01) {
1397       pu->r_u.rg.mean_var = (pu->r_u.rg.mu > 1.0) ? pu->r_u.rg.mu: 1.0;
1398     }
1399
1400     if (pu->r_u.rg.n_trimmed < LHISTC) {
1401       /*
1402         fprintf(stderr,"nprune %d at %d\n",nprune,nit);
1403         */
1404       break;
1405     }
1406   }
1407
1408   if (pst.zsflag < 10) s_string[0]='\0';
1409   else if (pst.zs_win > 0)
1410     sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
1411   else strncpy(s_string,"(shuffled)",sizeof(s_string));
1412
1413   sprintf(f_string,"%s unscaled statistics: mu= %6.4f  var=%6.4f; Lambda= %6.4f",
1414           s_string, pu->r_u.rg.mu,pu->r_u.rg.mean_var,PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
1415   return AVE_STATS;
1416 }
1417
1418 /*
1419 This routine calculates the maximum likelihood estimates for the
1420 extreme value distribution exp(-exp(-(-x-a)/b)) using the formula
1421
1422         <lambda> = x_m - sum{ x[i] * exp (-x[i]<lambda>)}/sum{exp (-x[i]<lambda>)}
1423         <a> = -<1/lambda> log ( (1/nlib) sum { exp(-x[i]/<lambda> } )
1424
1425         The <a> parameter can be transformed into and K
1426         of the formula: 1 - exp ( - K m n exp ( - lambda S ))
1427         using the transformation: 1 - exp ( -exp -(lambda S + log(K m n) ))
1428                         1 - exp ( -exp( - lambda ( S + log(K m n) / lambda))
1429
1430                         a = log(K m n) / lambda
1431                         a lambda = log (K m n)
1432                         exp(a lambda)  = K m n
1433          but from above: a lambda = log (1/nlib sum{exp( -x[i]*lambda)})
1434          so:            K m n = (1/n sum{ exp( -x[i] *lambda)})
1435                         K = sum{}/(nlib m n )
1436
1437 */
1438
1439 void
1440 alloc_hist(struct llen_str *llen)
1441 {
1442   int max_llen, i;
1443   max_llen = llen->max;
1444
1445   if (llen->hist == NULL) {
1446     llen->hist = (int *)calloc((size_t)(max_llen+1),sizeof(int));
1447     llen->score_sums = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
1448     llen->score2_sums =(double *)calloc((size_t)(max_llen + 1),sizeof(double));
1449     llen->score_var = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
1450   }
1451
1452   for (i=0; i< max_llen+1; i++) {
1453       llen->hist[i] = 0;
1454       llen->score_var[i] = llen->score_sums[i] = llen->score2_sums[i] = 0.0;
1455   }
1456 }
1457
1458 void
1459 free_hist(struct llen_str *llen)
1460 {
1461   if (llen->hist!=NULL) {
1462     free(llen->score_var);
1463     free(llen->score2_sums);
1464     free(llen->score_sums);
1465     free(llen->hist);
1466     llen->hist=NULL;
1467   }
1468 }
1469
1470 void
1471 inithist(struct llen_str *llen, struct pstruct pst, int max_hscore)
1472 {
1473   llen->max = MAX_LLEN;
1474
1475   llen->max_score = -1;
1476   llen->min_score=10000;
1477
1478   alloc_hist(llen);
1479
1480   llen->zero_s = 0;
1481   llen->min_length = 10000;
1482   llen->max_length = 0;
1483 }
1484
1485 void
1486 addhist(struct llen_str *llen, int score, int length, int max_hscore)
1487 {
1488   int llength;
1489   double dscore;
1490
1491   if ( score<=0 || length < LENGTH_CUTOFF) {
1492     llen->min_score = 0;
1493     llen->zero_s++;
1494     return;
1495   }
1496
1497   if (score < llen->min_score) llen->min_score = score;
1498   if (score > llen->max_score) llen->max_score = score;
1499
1500   if (length > llen->max_length) llen->max_length = length;
1501   if (length < llen->min_length) llen->min_length = length;
1502   if (score > max_hscore) score = max_hscore;
1503
1504   llength = (int)(LN_FACT*log((double)length)+0.5);
1505
1506   if (llength < 0 ) llength = 0;
1507   if (llength > llen->max) llength = llen->max;
1508   llen->hist[llength]++;
1509   dscore = (double)score;
1510   llen->score_sums[llength] += dscore;
1511   llen->score2_sums[llength] += dscore * dscore;
1512 }
1513
1514 /* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */
1515
1516 void
1517 inithistz(int mh, struct hist_str *histp )
1518 {
1519   int i;
1520
1521   histp->z_calls = 0;
1522
1523   histp->min_hist = 20;
1524   histp->max_hist = 120;
1525
1526   histp->histint = (int)
1527     ((double)(histp->max_hist - histp->min_hist + 2)/(double)mh+0.5);
1528   histp->maxh = (int)
1529     ((double)(histp->max_hist - histp->min_hist + 2)/(double)histp->histint+0.5);
1530
1531   if (histp->hist_a==NULL) {
1532     if ((histp->hist_a=(int *)calloc((size_t)histp->maxh,sizeof(int)))==
1533         NULL) {
1534       fprintf(stderr," cannot allocate %d for histogram\n",histp->maxh);
1535       histp->histflg = 0;
1536     }
1537     else histp->histflg = 1;
1538   }
1539   else {
1540     for (i=0; i<histp->maxh; i++) histp->hist_a[i]=0;
1541   }
1542   histp->entries = 0;
1543 }
1544
1545 static double nrv[100]={
1546   0.3098900570,-0.0313400923, 0.1131975903,-0.2832547606, 0.0073672659,
1547   0.2914489107, 0.4209306311,-0.4630181404, 0.3326537896, 0.0050140359,
1548  -0.1117435426,-0.2835630301, 0.2302997065,-0.3102716394, 0.0819894916,
1549  -0.1676455701,-0.3782225018,-0.3204509938,-0.3594969187,-0.0308950398,
1550   0.2922813812, 0.1337170751, 0.4666577031,-0.2917784349,-0.2438179916,
1551   0.3002301394, 0.0231147123, 0.5687927366,-0.2318208709,-0.1476839273,
1552  -0.0385043851,-0.1213476523, 0.1486341995, 0.1027917167, 0.1409192644,
1553  -0.3280652579, 0.4232041455, 0.0775993309, 0.1159071787, 0.2769424442,
1554   0.3197284751, 0.1507346903, 0.0028580909, 0.4825103412,-0.0496843610,
1555  -0.2754357656, 0.6021881753,-0.0816123956,-0.0899148991, 0.4847183201,
1556   0.2151621865,-0.4542246220, 0.0690709102, 0.2461894193, 0.2126042295,
1557  -0.0767060668, 0.4819746149, 0.3323031326, 0.0177600676, 0.1143185210,
1558   0.2653977455, 0.0921872958,-0.1330986718, 0.0412287716,-0.1691604748,
1559  -0.0529679078,-0.0194157955,-0.6117493924, 0.1199067932, 0.0210243193,
1560  -0.5832259838,-0.1685528664, 0.0008591271,-0.1120347822, 0.0839125069,
1561  -0.2787486831,-0.1937017962,-0.1915733940,-0.7888453635,-0.3316745163,
1562   0.1180885226,-0.3347001067,-0.2477492636,-0.2445697600, 0.0001342482,
1563  -0.0015759812,-0.1516473992,-0.5202267615, 0.2136975210, 0.2500423188,
1564  -0.2402926401,-0.1094186280,-0.0618869933,-0.0815221188, 0.2623337275,
1565   0.0219427302 -0.1774469919, 0.0828245026,-0.3271952808,-0.0632898028};
1566
1567 void
1568 addhistz(double zs, struct hist_str *histp)
1569 {
1570   int ih, zi;
1571   double rv;
1572
1573    rv = nrv[histp->z_calls++ % 100];
1574   zi = (int)(zs + 0.5+rv );
1575
1576   if ((zi >= 0) && (zi <= 120)) histp->entries++;
1577
1578   if (zi < histp->min_hist) zi = histp->min_hist;
1579   if (zi > histp->max_hist) zi = histp->max_hist;
1580
1581   ih = (zi - histp->min_hist)/histp->histint;
1582
1583   histp->hist_a[ih]++;
1584 }
1585
1586 /* addhistzp() does not increase histp->entries since addhist did it already */
1587 /*
1588 void
1589 addhistzp(double zs, struct hist_str *histp)
1590 {
1591   int ih, zi;
1592   double rv;
1593
1594   rv = nrv[histp->z_calls++ %100];
1595   zi = (int)(zs + 0.5 + rv);
1596
1597   if (zi < histp->min_hist) zi = histp->min_hist;
1598   if (zi > histp->max_hist) zi = histp->max_hist;
1599
1600   ih = (zi - histp->min_hist)/histp->histint;
1601
1602   histp->hist_a[ih]++;
1603 }
1604 */
1605
1606 void
1607 prune_hist(struct llen_str *llen, int score, int length, int max_hscore,
1608            long *entries)
1609 {
1610   int llength;
1611   double dscore;
1612
1613   if (score <= 0 || length < LENGTH_CUTOFF) return;
1614
1615   if (score > max_hscore) score = max_hscore;
1616
1617   llength = (int)(LN_FACT*log((double)length)+0.5);
1618
1619   if (llength < 0 ) llength = 0;
1620   if (llength > llen->max) llength = llen->max;
1621   llen->hist[llength]--;
1622   dscore = (double)score;
1623   llen->score_sums[llength] -= dscore;
1624   llen->score2_sums[llength] -= dscore * dscore;
1625
1626 /*  (*entries)--; histp->entries is not yet initialized */
1627 }
1628
1629 /* fit_llen: no trimming
1630    (1) regress scores vs log(n) using weighted variance
1631    (2) calculate mean variance after length regression
1632 */
1633
1634 void
1635 fit_llen(struct llen_str *llen, struct rstat_str *pr)
1636 {
1637   int j;
1638   int n;
1639   int n_size;
1640   double x, y2, u, z;
1641   double mean_x, mean_y, var_x, var_y, covar_xy;
1642   double mean_y2, covar_xy2, var_y2, dllj;
1643
1644   double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
1645
1646 /* now fit scores to best linear function of log(n), using
1647    simple linear regression */
1648
1649   for (llen->min=0; llen->min < llen->max; llen->min++)
1650     if (llen->hist[llen->min]) break;
1651   llen->min--;
1652
1653   for (n_size=0,j = llen->min; j < llen->max; j++) {
1654     if (llen->hist[j] > 1) {
1655       dllj = (double)llen->hist[j];
1656       llen->score_var[j] = llen->score2_sums[j]/dllj
1657         - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
1658       llen->score_var[j] /= (double)(llen->hist[j]-1);
1659       if (llen->score_var[j] <= 0.1 ) llen->score_var[j] = 0.1;
1660       n_size++;
1661     }
1662   }
1663
1664   pr->nb_tot = n_size;
1665
1666   n_w = 0.0;
1667   sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
1668   for (j = llen->min; j < llen->max; j++)
1669     if (llen->hist[j] > 1) {
1670       x = j + 0.5;
1671       dllj = (double)llen->hist[j];
1672       n_w += dllj/llen->score_var[j];
1673       sum_x +=   dllj * x / llen->score_var[j] ;
1674       sum_y += llen->score_sums[j] / llen->score_var[j];
1675       sum_x2 +=  dllj * x * x /llen->score_var[j];
1676       sum_xy +=  x * llen->score_sums[j]/llen->score_var[j];
1677     }
1678
1679   if (n_size < 5 ) {
1680     llen->fit_flag=0;
1681     pr->rho = 0;
1682     pr->mu = sum_y/n_w;
1683     return;
1684   }
1685   else {
1686     det = n_w * sum_x2 - sum_x * sum_x;
1687     if (det > 0.001) {
1688       pr->rho = (n_w * sum_xy  - sum_x * sum_y)/det;
1689       pr->rho_e = n_w/det;
1690       pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
1691       pr->mu_e = sum_x2/det;
1692     }
1693     else {
1694       llen->fit_flag = 0;
1695       pr->rho = 0;
1696       pr->mu = sum_y/n_w;
1697       return;
1698     }
1699   }
1700
1701   det = n_w * sum_x2 - sum_x * sum_x;
1702   pr->rho = (n_w * sum_xy  - sum_x * sum_y)/det;
1703   pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
1704
1705   n = 0;
1706   mean_x = mean_y = mean_y2 = 0.0;
1707   var_x = var_y = 0.0;
1708   covar_xy = covar_xy2 = 0.0;
1709
1710   for (j = llen->min; j <= llen->max; j++)
1711    if (llen->hist[j] > 1 ) {
1712       n += llen->hist[j];
1713       x = (double)j + 0.5;
1714       mean_x += (double)llen->hist[j] * x;
1715       mean_y += llen->score_sums[j];
1716       var_x += (double)llen->hist[j] * x * x;
1717       var_y += llen->score2_sums[j];
1718       covar_xy += x * llen->score_sums[j];
1719     }
1720   mean_x /= n; mean_y /= n;
1721   var_x = var_x / n - mean_x * mean_x;
1722   var_y = var_y / n - mean_y * mean_y;
1723
1724   covar_xy = covar_xy / n - mean_x * mean_y;
1725 /*
1726   pr->rho = covar_xy / var_x;
1727   pr->mu = mean_y - pr->rho * mean_x;
1728 */
1729   mean_y2 = covar_xy2 = var_y2 = 0.0;
1730   for (j = llen->min; j <= llen->max; j++)
1731     if (llen->hist[j] > 1) {
1732       x = (double)j + 0.5;
1733       u = pr->rho * x + pr->mu;
1734       y2 = llen->score2_sums[j] - 2.0 * llen->score_sums[j] * u + llen->hist[j] * u * u;
1735 /*
1736       dllj = (double)llen->hist[j];
1737       fprintf(stderr,"%.2f\t%d\t%g\t%g\n",x/LN_FACT,llen->hist[j],
1738               llen->score_sums[j]/dllj,y2/dllj);
1739 */
1740       mean_y2 += y2;
1741       var_y2 += y2 * y2;
1742       covar_xy2 += x * y2;
1743       /*      fprintf(stderr,"%6.1f %4d %8d %8d %7.2f %8.2f\n",
1744               x,llen->hist[j],llen->score_sums[j],llen->score2_sums[j],u,y2); */
1745     }
1746
1747   pr->mean_var = mean_y2 /= (double)n;
1748   covar_xy2 = covar_xy2 / (double)n - mean_x * mean_y2;
1749
1750   if (pr->mean_var <= 0.01) {
1751     llen->fit_flag = 0;
1752     pr->mean_var = (pr->mu > 1.0) ? pr->mu: 1.0;
1753   }
1754
1755   /*
1756   fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var %.4f\n",
1757           pr->rho*LN_FACT,pr->mu,pr->mean_var);
1758   */
1759   if (n > 1) pr->var_e = (var_y2/n - mean_y2 * mean_y2)/(n-1);
1760   else pr->var_e = 0.0;
1761
1762   if (llen->fit_flag) {
1763     pr->rho2 = covar_xy2 / var_x;
1764     pr->mu2 = pr->mean_var - pr->rho2 * mean_x;
1765   }
1766   else {
1767     pr->rho2 = 0;
1768     pr->mu2 = pr->mean_var;
1769   }
1770
1771   if (pr->rho2 < 0.0 )
1772     z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
1773   else z =  pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
1774   if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
1775
1776   pr->var_cutoff = pr->rho2 * LN_FACT*log(z) + pr->mu2;
1777 }
1778
1779 /* fit_llens: trim high variance bins
1780    (1) regress scores vs log(n) using weighted variance
1781    (2) regress residuals vs log(n)
1782    (3) remove high variance bins
1783    (4) calculate mean variance after length regression
1784 */
1785
1786 void
1787 fit_llens(struct llen_str *llen, struct rstat_str *pr)
1788 {
1789   int j;
1790   int n, n_u2;
1791   double x, y, y2, u, u2, v, z;
1792   double mean_x, mean_y, var_x, var_y, covar_xy;
1793   double mean_y2, covar_xy2;
1794   double mean_u2, mean_3u2, dllj;
1795   double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
1796
1797 /* now fit scores to best linear function of log(n), using
1798    simple linear regression */
1799
1800   for (llen->min=0; llen->min < llen->max; llen->min++)
1801     if (llen->hist[llen->min]) break;
1802   llen->min--;
1803
1804   for (j = llen->min; j < llen->max; j++) {
1805     if (llen->hist[j] > 1) {
1806       dllj = (double)llen->hist[j];
1807       llen->score_var[j] = (double)llen->score2_sums[j]/dllj
1808         - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
1809       llen->score_var[j] /= (double)(llen->hist[j]-1);
1810       if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
1811     }
1812   }
1813
1814   n_w = 0.0;
1815   sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
1816   for (j = llen->min; j < llen->max; j++)
1817     if (llen->hist[j] > 1) {
1818       x = j + 0.5;
1819       dllj = (double)llen->hist[j];
1820       n_w += dllj/llen->score_var[j];
1821       sum_x +=   dllj * x / llen->score_var[j] ;
1822       sum_y += llen->score_sums[j] / llen->score_var[j];
1823       sum_x2 +=  dllj * x * x /llen->score_var[j];
1824       sum_xy +=  x * llen->score_sums[j]/llen->score_var[j];
1825     }
1826
1827   det = n_w * sum_x2 - sum_x * sum_x;
1828   pr->rho = (n_w * sum_xy  - sum_x * sum_y)/det;
1829   pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
1830
1831 /* printf(" rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */
1832
1833   n = 0;
1834   mean_x = mean_y = mean_y2 = 0.0;
1835   var_x = var_y = 0.0;
1836   covar_xy = covar_xy2 = 0.0;
1837
1838   for (j = llen->min; j <= llen->max; j++)
1839     if (llen->hist[j] > 1 ) {
1840       n += llen->hist[j];
1841       x = (double)j + 0.5;
1842       dllj = (double)llen->hist[j];
1843       mean_x += dllj * x;
1844       mean_y += llen->score_sums[j];
1845       var_x += dllj * x * x;
1846       var_y += llen->score2_sums[j];
1847       covar_xy += x * llen->score_sums[j];
1848     }
1849   mean_x /= n; mean_y /= n;
1850   var_x = var_x / n - mean_x * mean_x;
1851   var_y = var_y / n - mean_y * mean_y;
1852
1853   covar_xy = covar_xy / n - mean_x * mean_y;
1854 /*  pr->rho = covar_xy / var_x;
1855   pr->mu = mean_y - pr->rho * mean_x;
1856 */
1857
1858   mean_y2 = covar_xy2 = 0.0;
1859   for (j = llen->min; j <= llen->max; j++)
1860     if (llen->hist[j] > 1) {
1861       x = (double)j + 0.5;
1862       u = pr->rho * x + pr->mu;
1863       y2 = llen->score2_sums[j] - 2 * llen->score_sums[j] * u + llen->hist[j] * u * u;
1864       mean_y2 += y2;
1865       covar_xy2 += x * y2;
1866     }
1867
1868   mean_y2 /= n;
1869   covar_xy2 = covar_xy2 / n - mean_x * mean_y2;
1870   pr->rho2 = covar_xy2 / var_x;
1871   pr->mu2 = mean_y2 - pr->rho2 * mean_x;
1872
1873   if (pr->rho2 < 0.0 )
1874     z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
1875   else z =  pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
1876   if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
1877
1878   pr->var_cutoff = pr->rho2*LN_FACT*log(z) + pr->mu2;
1879
1880 /*  fprintf(stderr,"\nminimum allowed predicted variance (%0.2f) at n = %.0f\n",
1881          pr->var_cutoff,z);
1882 */
1883   mean_u2 = 0.0;
1884   n_u2 = 0;
1885   for ( j = llen->min; j < llen->max; j++) {
1886     y = j+0.5;
1887     dllj = (double)llen->hist[j];
1888     x = pr->rho * y + pr->mu;
1889     v = pr->rho2 * y + pr->mu2;
1890     if (v < pr->var_cutoff) v = pr->var_cutoff;
1891     if (llen->hist[j]> 1) {
1892       u2 =  (llen->score2_sums[j] - 2 * x * llen->score_sums[j] + dllj * x * x) - v*dllj;
1893       mean_u2 += llen->score_var[j] = u2*u2/(llen->hist[j]-1);
1894       n_u2++;
1895       /*      fprintf(stderr," %d (%d) u2: %.2f v*ll: %.2f %.2f\n",
1896               j,llen->hist[j],u2,v*dllj,sqrt(llen->score_var[j])); */
1897     }
1898     else llen->score_var[j] = -1.0;
1899   }
1900
1901   mean_u2 = sqrt(mean_u2/(double)n_u2);
1902   /* fprintf(stderr," mean s.d.: %.2f\n",mean_u2); */
1903
1904   mean_3u2 = mean_u2*3.0;
1905
1906   for (j = llen->min; j < llen->max; j++) {
1907     if (llen->hist[j] <= 1) continue;
1908     if (sqrt(llen->score_var[j]) > mean_3u2) {
1909       /*      fprintf(stderr," removing %d %d %.2f\n",
1910              j, (int)(exp((double)j/LN_FACT)-0.5),
1911              sqrt(llen->score_var[j]));
1912              */
1913       pr->nb_trimmed++;
1914       pr->n1_trimmed += llen->hist[j];
1915       llen->hist[j] = 0;
1916     }
1917   }
1918   fit_llen(llen, pr);
1919 }
1920
1921 struct s2str {double s; int n;};
1922 void s2_sort ( struct s2str *sptr, int n);
1923
1924 void
1925 fit_llen2(struct llen_str *llen, struct rstat_str *pr)
1926 {
1927   int j;
1928   int n, n_y2, llen_delta, llen_del05;
1929   int n_size;
1930   double x, y2, u;
1931   double mean_x, mean_y, var_x, var_y, covar_xy;
1932   double mean_y2, covar_xy2;
1933   struct s2str *ss2;
1934
1935   double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
1936
1937 /* now fit scores to best linear function of log(n), using
1938    simple linear regression */
1939
1940   for (llen->min=0; llen->min < llen->max; llen->min++)
1941     if (llen->hist[llen->min]) break;
1942
1943   for ( ; llen->max > llen->min; llen->max--)
1944     if (llen->hist[llen->max]) break;
1945
1946   for (n_size=0,j = llen->min; j < llen->max; j++) {
1947     if (llen->hist[j] > 1) {
1948       llen->score_var[j] = llen->score2_sums[j]/(double)llen->hist[j]
1949         - (llen->score_sums[j]/(double)llen->hist[j])
1950         * (llen->score_sums[j]/(double)llen->hist[j]);
1951       llen->score_var[j] /= (double)(llen->hist[j]-1);
1952       if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
1953       n_size++;
1954     }
1955   }
1956
1957   n_w = 0.0;
1958   sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
1959   for (j = llen->min; j < llen->max; j++)
1960     if (llen->hist[j] > 1) {
1961       x = j + 0.5;
1962       n_w += (double)llen->hist[j]/llen->score_var[j];
1963       sum_x +=   (double)llen->hist[j] * x / llen->score_var[j] ;
1964       sum_y += llen->score_sums[j] / llen->score_var[j];
1965       sum_x2 +=  (double)llen->hist[j] * x * x /llen->score_var[j];
1966       sum_xy +=  x * llen->score_sums[j]/llen->score_var[j];
1967     }
1968
1969   if (n_size < 5 ) {
1970     llen->fit_flag=0;
1971     pr->rho = 0;
1972     pr->mu = sum_y/n_w;
1973   }
1974   else {
1975     det = n_w * sum_x2 - sum_x * sum_x;
1976     if (det > 0.001) {
1977       pr->rho = (n_w * sum_xy  - sum_x * sum_y)/det;
1978       pr->rho_e = n_w/det;
1979       pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
1980       pr->mu_e = sum_x2/det;
1981     }
1982     else {
1983       llen->fit_flag = 0;
1984       pr->rho = 0;
1985       pr->mu = sum_y/n_w;
1986     }
1987   }
1988
1989   det = n_w * sum_x2 - sum_x * sum_x;
1990   pr->rho = (n_w * sum_xy  - sum_x * sum_y)/det;
1991   pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
1992
1993 /*   fprintf(stderr," rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */
1994
1995   n = 0;
1996   mean_x = mean_y = mean_y2 = 0.0;
1997   var_x = var_y = 0.0;
1998   covar_xy = covar_xy2 = 0.0;
1999
2000   for (j = llen->min; j <= llen->max; j++)
2001     if (llen->hist[j] > 1 ) {
2002       n += llen->hist[j];
2003       x = (double)j + 0.5;
2004       mean_x += (double)llen->hist[j] * x;
2005       mean_y += llen->score_sums[j];
2006       var_x += (double)llen->hist[j] * x * x;
2007       var_y += llen->score2_sums[j];
2008       covar_xy += x * llen->score_sums[j];
2009     }
2010   mean_x /= n; mean_y /= n;
2011   var_x = var_x / n - mean_x * mean_x;
2012   var_y = var_y / n - mean_y * mean_y;
2013
2014   covar_xy = covar_xy / n - mean_x * mean_y;
2015 /*
2016   pr->rho = covar_xy / var_x;
2017   pr->mu = mean_y - pr->rho * mean_x;
2018 */
2019
2020   if ((ss2=(struct s2str *)calloc(llen->max+1,sizeof(struct s2str)))==NULL) {
2021     fprintf(stderr," cannot allocate ss2\n");
2022     return;
2023   }
2024
2025   mean_y2 = 0.0;
2026   n_y2 = n = 0;
2027   for (j = llen->min; j <= llen->max; j++)
2028     if (llen->hist[j] > VHISTC) {
2029       n++;
2030       n_y2 += ss2[j].n = llen->hist[j];
2031       x = (double)j + 0.5;
2032       u = pr->rho * x + pr->mu;
2033       ss2[j].s = y2 = llen->score2_sums[j] - 2*llen->score_sums[j]*u + llen->hist[j]*u*u;
2034       mean_y2 += y2;
2035     }
2036   pr->mean_var = mean_y2/(double)n_y2;
2037
2038   s2_sort(ss2+llen->min,llen->max-llen->min+1);
2039
2040   /*  fprintf(stderr,"llen->min: %d, max: %d\n",llen->min,llen->max); */
2041   llen_delta = 0;
2042   for (j=llen->min; j<=llen->max; j++) {
2043     if (ss2[j].n > 1) {
2044       llen_delta++;
2045 /*      fprintf(stderr,"%d\t%d\t%.2f\t%.4f\n",
2046               j,ss2[j].n,ss2[j].s,ss2[j].s/ss2[j].n);
2047 */
2048     }
2049   }
2050
2051   llen_del05 = llen_delta/20;
2052   mean_y2 = 0.0;
2053   n_y2 = 0;
2054   for (j = llen->min; j<llen->min+llen_del05; j++) {
2055     pr->n1_trimmed += ss2[j].n;
2056     pr->nb_trimmed++;
2057   }
2058   for (j = llen->min+llen_del05; j <= llen->min+llen_delta-llen_del05; j++)
2059     if (ss2[j].n > 1) {
2060       mean_y2 += ss2[j].s;
2061       n_y2 += ss2[j].n;
2062     }
2063   for (j = llen->min+llen_delta-llen_del05+1; j< llen->max; j++) {
2064     pr->n1_trimmed += ss2[j].n;
2065     pr->nb_trimmed++;
2066   }
2067
2068   free(ss2);
2069   if (n_y2 > 1) pr->mean_var = mean_y2/(double)n_y2;
2070
2071   /*    fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var: %.4f/%d\n",
2072           pr->rho*LN_FACT,pr->mu,pr->mean_var,n); */
2073
2074     pr->var_e = 0.0;
2075 }
2076
2077 /* REG_STATS - Z() from rho/mu/mean_var */
2078 double find_zr(int score, double escore, int length, double comp, struct pstat_str *pu)
2079 {
2080   double log_len, z;
2081
2082   if (score <= 0) return 0;
2083   if ( length < LENGTH_CUTOFF) return 0;
2084
2085   log_len = LN_FACT*log((double)(length));
2086 /*  var = pu->r_u.rg.rho2 * log_len + pu->r_u.rg.mu2;
2087   if (var < pu->r_u.rg.var_cutoff) var = pu->r_u.rg.var_cutoff;
2088 */
2089
2090   z = ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu) / sqrt(pu->r_u.rg.mean_var);
2091
2092   return (50.0 + z*10.0);
2093 }
2094
2095 /* REG2_STATS Z() from rho/mu, rho2/mu2 */
2096 double find_zr2(int score, double escore, int length, double comp, struct pstat_str *pu)
2097 {
2098   double log_len, var;
2099   double z;
2100
2101   if ( length < LENGTH_CUTOFF) return 0;
2102
2103   log_len = LN_FACT*log((double)(length));
2104
2105   var = pu->r_u.rg.rho2 * log_len + pu->r_u.rg.mu2;
2106   if (var < pu->r_u.rg.var_cutoff) var = pu->r_u.rg.mean_var;
2107
2108   z = ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu) / sqrt(var);
2109
2110   return (50.0 + z*10.0);
2111 }
2112
2113 #ifdef USE_LNSTATS
2114 /* LN_STATS - ln()-scaled mu, mean_var */
2115 double find_zl(int score, int length, double comp, struct pstat_str *pu)
2116 {
2117   double ls, z;
2118
2119   ls = (double)score*LN200/log((double)length);
2120
2121   z = (ls - pu->r_u.rg.mu) / sqrt(pu->r_u.rg.mean_var);
2122
2123   return (50.0 + z*10.0);
2124 }
2125 #endif
2126
2127 /* MLE_STATS - Z() from MLE for lambda, K */
2128 double
2129 find_ze(int score, double escore, int length, double comp, struct pstat_str *pu)
2130 {
2131   double z, mp, np, a_n1;
2132
2133   a_n1 = (double)length;
2134
2135   mp = pu->r_u.ag.a_n0;
2136   np = a_n1;
2137
2138   if (np < 1.0) np = 1.0;
2139   if (mp < 1.0) mp = 1.0;
2140
2141   z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * np * mp);
2142
2143   z = -z + EULER_G;
2144   z /= - PI_SQRT6;
2145
2146   return (50.0 + z*10.0);
2147 }
2148
2149 /* MLE2_STATS - Z() from MLE for mle_a0..2, mle_b1, length, comp */
2150 double
2151 find_ze2(int score, double escore, int length, double comp, struct pstat_str *pu)
2152 {
2153   double z, mp, np, a_n1;
2154
2155   a_n1 = (double)length;
2156
2157   if (comp <= 0.0) comp = pu->r_u.m2.ave_comp;
2158
2159   /* avoid very biased comp estimates */
2160   /* comp = exp((4.0*log(comp)+log(pu->r_u.m2.ave_comp))/5.0); */
2161
2162   mp = pu->r_u.m2.a_n0;
2163   np = a_n1;
2164
2165   if (np < 1.0) np = 1.0;
2166   if (mp < 1.0) mp = 1.0;
2167
2168   z = (-(pu->r_u.m2.mle2_a0 + pu->r_u.m2.mle2_a1 * comp + pu->r_u.m2.mle2_a2 * comp * log(np * mp)) + score) / (pu->r_u.m2.mle2_b1 * comp);
2169
2170   z = -z + EULER_G;
2171   z /= - PI_SQRT6;
2172
2173   return (50.0 + z*10.0);
2174 }
2175
2176 /* AG_STATS - Altschul-Gish Lamdba, K */
2177 double
2178 find_za(int score, double escore, int length, double comp, struct pstat_str *pu)
2179 {
2180   double z, mp, np, a_n1, a_n1f;
2181
2182   a_n1 = (double)length;
2183   a_n1f = log(a_n1)/pu->r_u.ag.H;
2184
2185   mp = pu->r_u.ag.a_n0 - pu->r_u.ag.a_n0f - a_n1f;
2186   np = a_n1 - pu->r_u.ag.a_n0f - a_n1f;
2187
2188   if (np < 1.0) np = 1.0;
2189   if (mp < 1.0) mp = 1.0;
2190
2191   z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * np * mp);
2192
2193   z = -z + EULER_G;
2194   z /= - PI_SQRT6;
2195
2196   return (50.0 + z*10.0);
2197 }
2198
2199 double find_zn(int score, double escore, int length, double comp, struct pstat_str *pu)
2200 {
2201   double z;
2202
2203   z = ((double)score - pu->r_u.rg.mu) / sqrt(pu->r_u.rg.mean_var);
2204
2205   return (50.0 + z*10.0);
2206 }
2207
2208 /* computes E value for a given z value, assuming extreme value distribution */
2209 double
2210 z_to_E(double zs, long entries, struct db_str db)
2211 {
2212   double e, n;
2213
2214   /*  if (db->entries < 5) return (double)db.entries; */
2215   if (entries < 1) { n = db.entries;}
2216   else {n = entries;}
2217
2218   if (zs > ZS_MAX) return 0.0;
2219
2220 #ifndef NORMAL_DIST
2221   e = exp(- PI_SQRT6 * zs - .577216);
2222   return n * (e > .01 ? 1.0 - exp(-e) : e);
2223 #else
2224   return n * erfc(zs/M_SQRT2)/2.0;
2225 #endif
2226 }
2227
2228 double
2229 zs_to_p(double zs)
2230 {
2231   double e, z;
2232
2233   /*  if (db.entries < 5) return 0.0; */
2234
2235   z = (zs - 50.0)/10.0;
2236
2237   if (z > ZS_MAX) return 0.0;
2238
2239 #ifndef NORMAL_DIST
2240   e = exp(- PI_SQRT6 * z - EULER_G);
2241   return (e > .01 ? 1.0 - exp(-e) : e);
2242 #else
2243   return erfc(zs/M_SQRT2)/2.0;
2244 #endif
2245 }
2246
2247 double
2248 zs_to_bit(double zs, int n0, int n1)
2249 {
2250   double z, a_n0, a_n1;
2251
2252   z = (zs - 50.0)/10.0;
2253   a_n0 = (double)n0;
2254   a_n1 = (double)n1;
2255
2256   return (PI_SQRT6 * z + EULER_G + log(a_n0*a_n1))/M_LN2 ;
2257 }
2258
2259 /* computes E-value for a given z value, assuming extreme value distribution */
2260 double
2261 zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db)
2262 {
2263   double e, z, k;
2264
2265   /*  if (db->entries < 5) return 0.0; */
2266
2267   z = (zs - 50.0)/10.0;
2268
2269   if (z > ZS_MAX ) return 0.0;
2270
2271   if (entries < 1) entries = db.entries;
2272
2273   if (dnaseq == SEQT_DNA || dnaseq == SEQT_RNA) {
2274     k = (double)db.length /(double)n1;
2275     if (db.carry > 0) {
2276       k += ((double)db.carry * (double)LONG_MAX)/(double)n1;
2277     }
2278   }
2279   else k = (double)entries;
2280
2281   if (k < 1.0) k = 1.0;
2282
2283 #ifndef NORMAL_DIST
2284   z *= PI_SQRT6;
2285   z += EULER_G;
2286   e = exp(-z);
2287   return k * (e > .01 ? 1.0 - exp(-e) : e);
2288 #else
2289   return k * erfc(z/M_SQRT2)/2.0;
2290 #endif
2291 }
2292
2293 #ifdef NORMAL_DIST
2294 double np_to_z(double, int *);
2295 #endif
2296
2297 /* computes E-value for a given z value, assuming extreme value distribution */
2298 double
2299 E_to_zs(double E, long entries)
2300 {
2301   double e, z;
2302   int error;
2303
2304   e = E/(double)entries;
2305
2306 #ifndef NORMAL_DIST
2307   z = (log(e)+EULER_G)/(- PI_SQRT6);
2308   return z*10.0+50.0;
2309 #else
2310   z = np_to_z(1.0-e,&error);
2311
2312   if (!error) return z*10.0+50.0;
2313   else return 0.0;
2314 #endif
2315 }
2316
2317 /* computes 1.0 - E value for a given z value, assuming extreme value
2318    distribution */
2319 double
2320 zs_to_Ec(double zs, long entries)
2321 {
2322   double e, z;
2323
2324   if (entries < 5) return 0.0;
2325
2326   z = (zs - 50.0)/10.0;
2327
2328   if (z > ZS_MAX) return 1.0;
2329
2330 #ifndef NORMAL_DIST
2331   e =  exp(- PI_SQRT6 * z - EULER_G);
2332   return (double)entries * (e > .01 ?  exp(-e) : 1.0 - e);
2333 #else
2334   return (double)entries*erf(z/M_SQRT2)/2.0;
2335 #endif
2336 }
2337
2338 /* calculate a threshold score, given an E() value and Lambda,K,H */
2339
2340 int
2341 E1_to_s(double e_val, int n0, int n1, struct pstat_str *pu) {
2342   double mp, np, a_n0, a_n0f, a_n1;
2343   int score;
2344
2345   a_n0 = (double)n0;
2346   a_n1 = (double)n1;
2347   a_n0f = log(pu->r_u.ag.K * a_n0 * a_n1)/pu->r_u.ag.H;
2348
2349   mp = a_n0 - a_n0f;
2350   np = a_n1 - a_n0f;
2351
2352   if (np < 1.0) np = 1.0;
2353   if (mp < 1.0) mp = 1.0;
2354
2355   score = (int)((log( pu->r_u.ag.K * mp * np) - log(e_val))/pu->r_u.ag.Lambda +0.5);
2356   if (score < 0) score = 0;
2357   return score;
2358 }
2359
2360 /* no longer used; stat_str returned by process_hist
2361 void
2362 summ_stats(char *s_str, struct pstat_str *pu)
2363 {
2364   strcpy(s_str,f_string);
2365 }
2366 */
2367
2368 void
2369 vsort(v,s,n)
2370         double *v; int *s, n;
2371 {
2372   int gap, i, j;
2373   double tmp;
2374   int itmp;
2375
2376   for (gap=n/2; gap>0; gap/=2)
2377     for (i=gap; i<n; i++)
2378       for (j=i-gap; j>=0; j -= gap) {
2379         if (v[j] >= v[j+gap]) break;
2380         tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
2381         itmp = s[j]; s[j]=s[j+gap]; s[j+gap]=itmp;
2382       }
2383 }
2384
2385 /*
2386 void s_sort (double **ptr, int nbest)
2387 {
2388   int gap, i, j;
2389   double *tmp;
2390
2391   for (gap = nbest/2; gap > 0; gap /= 2)
2392     for (i = gap; i < nbest; i++)
2393       for (j = i - gap; j >= 0; j-= gap) {
2394         if (*ptr[j] >= *ptr[j + gap]) break;
2395         tmp = ptr[j];
2396         ptr[j] = ptr[j + gap];
2397         ptr[j + gap] = tmp;
2398       }
2399 }
2400 */
2401
2402 void ss_sort (int *ptr, int n)
2403 {
2404   int gap, i, j;
2405   int tmp;
2406
2407   for (gap = n/2; gap > 0; gap /= 2)
2408     for (i = gap; i < n; i++)
2409       for (j = i - gap; j >= 0; j-= gap) {
2410         if (ptr[j] >= ptr[j + gap]) break;
2411         tmp = ptr[j];
2412         ptr[j] = ptr[j + gap];
2413         ptr[j + gap] = tmp;
2414       }
2415 }
2416
2417
2418 void s2_sort (struct s2str *ptr, int n)
2419 {
2420   int gap, i, j;
2421   struct s2str tmp;
2422
2423   for (gap = n/2; gap > 0; gap /= 2)
2424     for (i = gap; i < n; i++)
2425       for (j = i - gap; j >= 0; j-= gap) {
2426         if (ptr[j].s >= ptr[j + gap].s) break;
2427         tmp.s = ptr[j].s;
2428         tmp.n = ptr[j].n;
2429         ptr[j].s = ptr[j + gap].s;
2430         ptr[j].n = ptr[j + gap].n;
2431         ptr[j + gap].s = tmp.s;
2432         ptr[j + gap].n = tmp.n;
2433       }
2434 }
2435
2436 void last_stats() {}
2437
2438 void
2439 scale_scores(struct beststr **bptr, int nbest, struct db_str db,
2440              struct pstruct pst, struct pstat_str *rs)
2441 {
2442   int i;
2443   double zscore;
2444
2445   if (pst.zsflag < 0 || pst.zsflag_f < 0) return;
2446
2447   for (i=0; i<nbest; i++) {
2448     zscore = find_zp(bptr[i]->score[pst.score_ix], bptr[i]->escore,
2449                      bptr[i]->n1,bptr[i]->comp,rs);
2450     bptr[i]->zscore = zscore;
2451     bptr[i]->escore
2452       =zs_to_E(zscore,bptr[i]->n1,pst.dnaseq, pst.zdb_size,db);
2453   }
2454   sortbeste(bptr,nbest);
2455 }
2456
2457 #ifdef NORMAL_DIST
2458 /*     ALGORITHM AS241  APPL. STATIST. (1988) VOL. 37, NO. 3
2459
2460        Produces the normal deviate Z corresponding to a given lower
2461        tail area of P; Z is accurate to about 1 part in 10**16.
2462
2463        The hash sums below are the sums of the mantissas of the
2464        coefficients.   They are included for use in checking
2465        transcription.
2466 */
2467
2468 double np_to_z(double p, int *fault) {
2469
2470   double q, r, ppnd16;
2471
2472   double zero = 0.0, one = 1.0, half = 0.5;
2473   double split1 = 0.425, split2 = 5.0;
2474   double const1 = 0.180625, const2 = 1.6;
2475
2476 /*   Coefficients for P close to 0.5 */
2477
2478   double a0 = 3.3871328727963666080e0;
2479   double a1 = 1.3314166789178437745e+2;
2480   double a2 = 1.9715909503065514427e+3;
2481   double a3 = 1.3731693765509461125e+4;
2482   double a4 = 4.5921953931549871457e+4;
2483   double a5 = 6.7265770927008700853e+4;
2484   double a6 = 3.3430575583588128105e+4;
2485   double a7 = 2.5090809287301226727e+3;
2486   double b1 = 4.2313330701600911252e+1;
2487   double b2 = 6.8718700749205790830e+2;
2488   double b3 = 5.3941960214247511077e+3;
2489   double b4 = 2.1213794301586595867e+4;
2490   double b5 = 3.9307895800092710610e+4;
2491   double b6 = 2.8729085735721942674e+4;
2492   double b7 = 5.2264952788528545610e+3;
2493
2494   double sum_ab= 55.8831928806149014439;
2495 /*
2496   Coefficients for P not close to 0, 0.5 or 1.
2497 */
2498
2499   double c0 = 1.42343711074968357734;
2500   double c1 = 4.63033784615654529590;
2501   double c2 = 5.76949722146069140550;
2502   double c3 = 3.64784832476320460504;
2503   double c4 = 1.27045825245236838258;
2504   double c5 = 2.41780725177450611770e-1;
2505   double c6 = 2.27238449892691845833e-2;
2506   double c7 = 7.74545014278341407640e-4;
2507   double d1 = 2.05319162663775882187;
2508   double d2 = 1.67638483018380384940;
2509   double d3 = 6.89767334985100004550e-1;
2510   double d4 = 1.48103976427480074590e-1;
2511   double d5 = 1.51986665636164571966e-2;
2512   double d6 = 5.47593808499534494600e-4;
2513   double d7 = 1.05075007164441684324e-9;
2514
2515   double sum_cd=49.33206503301610289036;
2516 /*
2517        Coefficients for P near 0 or 1.
2518 */
2519   double e0 = 6.65790464350110377720e0;
2520   double e1 = 5.46378491116411436990e0;
2521   double e2 = 1.78482653991729133580e0;
2522   double e3 = 2.96560571828504891230e-1;
2523   double e4 = 2.65321895265761230930e-2;
2524   double e5 = 1.24266094738807843860e-3;
2525   double e6 = 2.71155556874348757815e-5;
2526   double e7 = 2.01033439929228813265e-7;
2527   double f1 = 5.99832206555887937690e-1;
2528   double f2 = 1.36929880922735805310e-1;
2529   double f3 = 1.48753612908506148525e-2;
2530   double f4 = 7.86869131145613259100e-4;
2531   double f5 = 1.84631831751005468180e-5;
2532   double f6 = 1.42151175831644588870e-7;
2533   double f7 = 2.04426310338993978564e-15;
2534
2535   double sum_ef=47.52583317549289671629;
2536
2537   double sum_tmp = 0.0;
2538
2539   /*
2540   sum_tmp = a0+a1+a2+a3+a4+a5+a6+a7+b1+b2+b3+b4+b5+b6+b7;
2541   if (fabs(sum_tmp - sum_ab) > 1e-12) {
2542     fprintf (stderr," sum_ab error: %lg %lg\n",sum_tmp,sum_ab);
2543     *fault = 1;
2544     return zero;
2545   }
2546
2547   sum_tmp = c0+c1+c2+c3+c4+c5+c6+c7+d1+d2+d3+d4+d5+d6+d7;
2548   if (fabs(sum_tmp - sum_cd) > 1e-12) {
2549     fprintf (stderr," sum_cd error: %lg %lg\n",sum_tmp,sum_cd);
2550     *fault = 1;
2551     return zero;
2552   }
2553   sum_tmp = e0+e1+e2+e3+e4+e5+e6+e7+f1+f2+f3+f4+f5+f6+f7;
2554   if (fabs(sum_tmp - sum_ef) > 1e-12) {
2555     fprintf (stderr," sum_ef error: %lg %lg\n",sum_tmp,sum_ef);
2556     *fault = 1;
2557     return zero;
2558   }
2559   */
2560
2561   *fault = 0;
2562   q = p - half;
2563   if (fabs(q) <= split1) {
2564     r = const1 - q * q;
2565     return q * (((((((a7 * r + a6) * r + a5) * r + a4) * r + a3)
2566                     * r + a2) * r + a1) * r + a0) /
2567       (((((((b7 * r + b6) * r + b5) * r + b4) * r + b3)
2568          * r + b2) * r + b1) * r + one);
2569   }
2570   else {
2571     r = (q < zero) ?  p : one - p;
2572     if (r <= zero) {
2573       *fault = 1;
2574       return zero;
2575     }
2576     r = sqrt(-log(r));
2577     if (r <= split2) {
2578       r -= const2;
2579       ppnd16 = (((((((c7 * r + c6) * r + c5) * r + c4) * r + c3)
2580                   * r + c2) * r + c1) * r + c0) /
2581         (((((((d7 * r + d6) * r + d5) * r + d4) * r + d3)
2582            * r + d2) * r + d1) * r + one);
2583     }
2584     else {
2585       r -= split2;
2586       ppnd16 = (((((((e7 * r + e6) * r + e5) * r + e4) * r + e3)
2587                   * r + e2) * r + e1) * r + e0) /
2588         (((((((f7 * r + f6) * r + f5) * r + f4) * r + f3)
2589            * r + f2) * r + f1) * r + one);
2590     }
2591     if (q < zero) return -ppnd16;
2592     else return ppnd16;
2593   }
2594 }
2595 #endif