binaries/src/fasta34/scaleswt.c

   1 /* scaleswt.c */
   2
   3 /* $Name: fa_34_26_5 $ - $Id: scaleswt.c,v 1.21 2006/04/12 18:50:01 wrp Exp $ */
   4 /* as of 24 Sept, 2000 - scaleswn uses no global variables */
   5
   6 /*
   7   copyright (c) 1995, 1996, 2000, 2002 William R. Pearson
   8
   9   This version is designed for fasts/f, which used Tatusov
  10   probabilities for statistical estimates, but still needs a
  11   quick-and-dirty linear regression fit to rank things
  12
  13   For comparisons that obey tatusov statistics, we try whenever
  14   possible to provide accurate e_scores, rather than raw scores.  As a
  15   result, no lambda/K fitting is required; and process_hist() can be
  16   called atthe very beginning of the search to initialize some of the
  17   statistics structures and find_zp().
  18
  19   find_zp() must still return a valid z_score surrogate, as
  20   comp_lib.c/p2_complib.c continue to use z_score's to rank hits, save
  21   the best, etc.
  22
  23   If e_score's cannot be calculated, the process_hist() provides
  24   linear regression fitting for conventional z_score estimates.
  25
  26 */
  27
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31
  32 #include <limits.h>
  33 #include <float.h>
  34 #include <math.h>
  35
  36 #include <limits.h>
  37
  38 #include "defs.h"
  39 #include "param.h"
  40 #include "structs.h"
  41 #ifndef PCOMPLIB
  42 #include "mw.h"
  43 #else
  44 #include "p_mw.h"
  45 #endif
  46
  47 #define MAXHIST 50
  48 #define MAX_LLEN 200
  49 #define LHISTC 5
  50 #define VHISTC 5
  51 #define MAX_SSCORE 300
  52
  53 #define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */
  54
  55 #define LN_FACT 10.0
  56 #ifndef M_LN2
  57 #define M_LN2 0.69314718055994530942
  58 #endif
  59
  60 #define EULER_G 0.57721566490153286060
  61 #define PI_SQRT6 1.28254983016186409554
  62
  63 #ifndef M_SQRT2
  64 #define M_SQRT2 1.41421356237
  65 #endif
  66 #define LN200 5.2983173666
  67 #define ZS_MAX 400.0    /* used to prevent underflow on some machines */
  68 #define TOLERANCE 1.0e-12
  69 #define TINY 1.0e-6
  70
  71 /* used by AVE_STATS, REG_STATS, REGI_STATS, REG2_STATS*/
  72 struct rstat_str {
  73   double ngLambda, ngK, ngH;
  74   double rho, rho_e, mu, mu_e, mean_var, var_e;  /* ?_e:std. error of ? */
  75 /* used by REG2_STATS */
  76   double rho2, mu2, var_cutoff;
  77   int n_trimmed; /* excluded because of high z-score */
  78   int n1_trimmed, nb_trimmed, nb_tot; /* excluded because of bin */
  79   double tat_a, tat_b, tat_c, spacefactor;
  80   int have_tat;
  81   int tie_j;
  82 };
  83
  84 #define AVE_STATS 0     /* no length effect, only mean/variance */
  85 double find_zt(int score, double escore, int len, double comp, struct rstat_str *);
  86
  87 double find_zn(int score, double escore, int len, double comp, struct rstat_str *);
  88
  89 double power(double, int);
  90
  91 void sortbesto(double *, int );
  92 extern void sortbeste(struct beststr **bptr, int nbest);
  93
  94 int proc_hist_n(struct stat_str *sptr, int n,
  95                  struct pstruct pst, struct hist_str *histp, int do_trim,
  96                  struct rstat_str *);
  97
  98 #define REG_STATS 1     /* length-regression scaled */
  99 double find_zr(int score, double escore, int len, double comp, struct rstat_str *);
 100
 101 int proc_hist_r(struct stat_str *sptr, int n,
 102                  struct pstruct pst, struct hist_str *histp,
 103                  int do_trim, struct rstat_str *rs);
 104
 105 double (*find_zp)(int score, double escore, int len, double comp,
 106                  struct rstat_str *) = &find_zr;
 107
 108 struct llen_str {
 109   int min, max;
 110   int max_score, min_score;
 111   int *hist;
 112   double *score_sums, *score2_sums;
 113   double *score_var;
 114   int max_length, min_length, zero_s;
 115   int fit_flag;
 116 };
 117
 118 static void inithist(struct llen_str *, struct pstruct, int);
 119 static void free_hist( struct llen_str *);
 120 static void addhist(struct llen_str *, int, int, int);
 121 static void prune_hist(struct llen_str *, int, int, int, long *);
 122 void inithistz(int, struct hist_str *histp);
 123 void addhistz(double zs, struct hist_str *histp);
 124
 125 static void fit_llen(struct llen_str *, struct rstat_str *);
 126 static void fit_llens(struct llen_str *, struct rstat_str *);
 127
 128 void linreg(double *lny, double *x, double *lnx, int n,
 129             double *a, double *b, double *c, int start);
 130
 131 double calc_spacefactor(const unsigned char *, int, int, int);
 132
 133 double det(double a11, double a12, double a13,
 134            double a21, double a22, double a23,
 135            double a31, double a32, double a33);
 136
 137 double factorial (int a, int b);
 138
 139 /* void set_db_size(int, struct db_str *, struct hist_str *); */
 140
 141 #ifdef DEBUG
 142 FILE *tmpf;
 143 #endif
 144
 145 int
 146 process_hist(struct stat_str *sptr, int nstats,
 147              struct mngmsg m_msg,
 148              struct pstruct pst,
 149              struct hist_str *histp,
 150              struct rstat_str **rs_sp,
 151              int do_hist
 152              )
 153 {
 154   int zsflag, do_trim;
 155   struct rstat_str *rs_s;
 156
 157   if (pst.zsflag < 0) {
 158     *rs_sp = NULL;
 159     return pst.zsflag;
 160   }
 161
 162   if (*rs_sp == NULL) {
 163     if ((rs_s=(struct rstat_str *)calloc(1,sizeof(struct rstat_str)))==NULL) {
 164       fprintf(stderr," cannot allocate rs_snion: %ld\n",sizeof(struct rstat_str));
 165       exit(1);
 166     }
 167     else *rs_sp = rs_s;
 168   }
 169   else {
 170     rs_s = *rs_sp;
 171     memset(rs_s,0,sizeof(struct rstat_str));
 172   }
 173
 174   if (m_msg.escore_flg) {
 175     find_zp = &find_zt;
 176     inithistz(MAXHIST,histp);
 177     return 1;
 178   }
 179
 180   if (nstats < 20) {
 181     fprintf(stderr," too few sequences for sampling: %d\n",nstats);
 182     free(rs_s);
 183     *rs_sp = NULL;
 184     return -1;
 185   }
 186
 187   rs_s->ngLambda = m_msg.Lambda;
 188   rs_s->ngK = m_msg.K;
 189   rs_s->ngH = m_msg.H;
 190
 191   zsflag = pst.zsflag;
 192
 193   if (zsflag >= 10) {
 194     zsflag -= 10;
 195     do_trim = 0;
 196   }
 197   else do_trim = 1;
 198
 199   find_zp = &find_zr;
 200   return proc_hist_r(sptr, nstats,pst, histp, do_trim,  rs_s);
 201 }
 202
 203 int
 204 calc_thresh(struct pstruct pst, int nstats,
 205             double Lambda, double K, double H, double *zstrim)
 206 {
 207   int max_hscore;
 208   double ave_n1, tmp_score, z, l_fact;
 209
 210   if (pst.dnaseq == SEQT_DNA || pst.dnaseq == SEQT_RNA) {
 211     ave_n1 = 5000.0;
 212     l_fact = 1.0;
 213   }
 214   else {
 215     ave_n1 = 400.0;
 216     l_fact = 0.7;
 217   }
 218
 219 /*  max_hscore = MAX_SSCORE; */
 220 /*  mean expected for pst.n0 * 400 for protein, 5000 for DNA */
 221 /*  we want a number of offsets that is appropriate for the database size so
 222     far (nstats)
 223 */
 224
 225 /*
 226   the calculation below sets a high-score threshold using an
 227   ungapped lambda, but errs towards the high-score side by using
 228   E()=0.001 and calculating with 0.70*lambda, which is the correct for
 229   going from ungapped to -12/-2 gapped lambda with BLOSUM50
 230 */
 231
 232 #ifndef NORMAL_DIST
 233   tmp_score = 0.01/((double)nstats*K*(double)pst.n0*ave_n1);
 234   tmp_score = -log(tmp_score)/(Lambda*l_fact);
 235   max_hscore = (int)(tmp_score+0.5);
 236
 237   z = 1.0/(double)nstats;
 238   z = (log(z)+EULER_G)/(-PI_SQRT6);
 239 #else
 240   max_hscore = 100;
 241   z = 5.0;
 242 #endif
 243   *zstrim = 10.0*z+50.0;
 244   return max_hscore;
 245 }
 246
 247 int
 248 proc_hist_r(struct stat_str *sptr, int nstats,
 249             struct pstruct pst, struct hist_str *histp,
 250             int do_trim, struct rstat_str *rs)
 251 {
 252   int i, max_hscore;
 253   double zs, ztrim;
 254   char s_string[128];
 255   struct llen_str llen;
 256   char *f_string;
 257   llen.fit_flag=1;
 258   llen.hist=NULL;
 259
 260   max_hscore = calc_thresh(pst, nstats, rs->ngLambda,
 261                            rs->ngK, rs->ngH, &ztrim);
 262
 263   inithist(&llen,pst,max_hscore);
 264   f_string = &(histp->stat_info[0]);
 265
 266   for (i = 0; i<nstats; i++)
 267     addhist(&llen,sptr[i].score,sptr[i].n1, max_hscore);
 268   histp->entries = nstats - llen.zero_s;
 269
 270   if ((llen.max_score - llen.min_score) < 10) {
 271     free_hist(&llen);
 272     llen.fit_flag = 0;
 273     find_zp = &find_zn;
 274     return proc_hist_n(sptr, nstats, pst, histp, do_trim, rs);
 275   }
 276
 277   fit_llen(&llen, rs); /* now we have rho, mu, rho2, mu2, mean_var
 278                           to set the parameters for the histogram */
 279
 280   if (!llen.fit_flag) { /* the fit failed, fall back to proc_hist_n */
 281     free_hist(&llen);
 282     find_zp = &find_zn;
 283     return proc_hist_n(sptr,nstats, pst, histp, do_trim, rs);
 284   }
 285
 286   rs->n_trimmed= rs->n1_trimmed = rs->nb_trimmed = 0;
 287
 288   if (do_trim) {
 289     if (llen.fit_flag) {
 290       for (i = 0; i < nstats; i++) {
 291         zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, rs);
 292         if (zs < 20.0 || zs > ztrim) {
 293           rs->n_trimmed++;
 294           prune_hist(&llen,sptr[i].score,sptr[i].n1, max_hscore,
 295                      &(histp->entries));
 296         }
 297       }
 298     }
 299
 300   /*  fprintf(stderr,"Z-trimmed %d entries with z > 5.0\n", rs->n_trimmed); */
 301
 302     if (llen.fit_flag) fit_llens(&llen, rs);
 303
 304   /*   fprintf(stderr,"Bin-trimmed %d entries in %d bins\n", rs->n1_trimmed,rs->nb_trimmed); */
 305   }
 306
 307
 308   free_hist(&llen);
 309
 310   /* rst all the scores in the histogram */
 311
 312   if (pst.zsflag < 10) s_string[0]='\0';
 313   else if (pst.zs_win > 0)
 314     sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
 315   else strncpy(s_string,"(shuffled)",sizeof(s_string));
 316
 317   inithistz(MAXHIST, histp);
 318
 319   sprintf(f_string,"%s Expectation_n fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f\n mean_var=%6.4f+/-%6.3f, 0's: %d Z-trim: %d  B-trim: %d in %d/%d\n Lambda= %6.4f",
 320           s_string,
 321           rs->rho*LN_FACT,sqrt(rs->rho_e),rs->mu,sqrt(rs->mu_e), rs->mean_var,sqrt(rs->var_e),
 322           llen.zero_s, rs->n_trimmed, rs->n1_trimmed, rs->nb_trimmed, rs->nb_tot,
 323           PI_SQRT6/sqrt(rs->mean_var));
 324     return REG_STATS;
 325 }
 326
 327
 328 int
 329 proc_hist_n(struct stat_str *sptr, int nstats,
 330             struct pstruct pst, struct hist_str *histp,
 331             int do_trim, struct rstat_str *rs)
 332 {
 333   int i, j;
 334   double s_score, s2_score, ssd;
 335   double ztrim;
 336   int nit, max_hscore;
 337   char s_string[128];
 338   char *f_string;
 339
 340   f_string = &(histp->stat_info[0]);
 341   /*   db->entries = db->length = db->carry = 0; */
 342
 343   max_hscore = calc_thresh(pst, nstats, rs->ngLambda,
 344                            rs->ngK, rs->ngH, &ztrim);
 345
 346   s_score = s2_score = 0.0;
 347
 348   histp->entries = 0;
 349
 350   for ( j = 0, i = 0; i < nstats; i++) {
 351     if (sptr[i].score > 0 && sptr[i].score <= max_hscore) {
 352       s_score += (ssd=(double)sptr[i].score);
 353       s2_score += ssd * ssd;
 354       histp->entries++;
 355       /*
 356       db->length += sptr[i].n1;
 357       if (db->length > LONG_MAX) {
 358         db->carry++;
 359         db->length -= LONG_MAX;
 360       }
 361       */
 362       j++;
 363     }
 364   }
 365
 366   if (j > 1 ) {
 367     rs->mu = s_score/(double)j;
 368     rs->mean_var = s2_score - (double)j * rs->mu * rs->mu;
 369     rs->mean_var /= (double)(j-1);
 370   }
 371   else {
 372     rs->mu = 50.0;
 373     rs->mean_var = 10.0;
 374   }
 375
 376   if (rs->mean_var < 0.01) {
 377     rs->mean_var = (rs->mu > 1.0) ? rs->mu: 1.0;
 378   }
 379
 380   /* now remove some scores */
 381
 382   nit = 5;
 383   while (nit-- > 0) {
 384     rs->n_trimmed = 0;
 385
 386     for (i=0; i< nstats; i++) {
 387       if (sptr[i].n1 < 0) continue;
 388       ssd = find_zn(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, rs);
 389       if (ssd > ztrim || ssd < 20.0) {
 390         /*      fprintf(stderr,"removing %3d %3d %4.1f\n",
 391                 sptr[i].score, sptr[i].n1,ssd); */
 392         ssd = sptr[i].score;
 393         s_score -= ssd;
 394         s2_score -= ssd*ssd;
 395         j--;
 396         rs->n_trimmed++;
 397         histp->entries--;
 398         sptr[i].n1 = -sptr[i].n1;
 399       }
 400     }
 401
 402     if (j > 1 ) {
 403       rs->mu = s_score/(double)j;
 404       rs->mean_var = s2_score - (double)j * rs->mu * rs->mu;
 405       rs->mean_var /= (double)(j-1);
 406     }
 407     else {
 408       rs->mu = 50.0;
 409       rs->mean_var = 10.0;
 410     }
 411
 412     if (rs->mean_var < 0.01) {
 413       rs->mean_var = (rs->mu > 1.0) ? rs->mu: 1.0;
 414     }
 415
 416     if (rs->n_trimmed < LHISTC) {
 417       /*
 418         fprintf(stderr,"nprune %d at %d\n",nprune,nit);
 419         */
 420       break;
 421     }
 422   }
 423
 424   if (pst.zsflag < 10) s_string[0]='\0';
 425   else if (pst.zs_win > 0)
 426     sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
 427   else strncpy(s_string,"(shuffled)",sizeof(s_string));
 428
 429   sprintf(f_string,"%s unscaled statistics: mu= %6.4f  var=%6.4f; Lambda= %6.4f",
 430           s_string, rs->mu,rs->mean_var,PI_SQRT6/sqrt(rs->mean_var));
 431   return AVE_STATS;
 432 }
 433
 434
 435 /*
 436 This routine calculates the maximum likelihood estimates for the
 437 extreme value distribution exp(-exp(-(-x-a)/b)) using the formula
 438
 439         <lambda> = x_m - sum{ x[i] * exp (-x[i]<lambda>)}/sum{exp (-x[i]<lambda>)}
 440         <a> = -<1/lambda> log ( (1/nlib) sum { exp(-x[i]/<lambda> } )
 441
 442         The <a> parameter can be transformed into and K
 443         of the formula: 1 - exp ( - K m n exp ( - lambda S ))
 444         using the transformation: 1 - exp ( -exp -(lambda S + log(K m n) ))
 445                         1 - exp ( -exp( - lambda ( S + log(K m n) / lambda))
 446
 447                         a = log(K m n) / lambda
 448                         a lambda = log (K m n)
 449                         exp(a lambda)  = K m n
 450          but from above: a lambda = log (1/nlib sum{exp( -x[i]*lambda)})
 451          so:            K m n = (1/n sum{ exp( -x[i] *lambda)})
 452                         K = sum{}/(nlib m n )
 453
 454 */
 455
 456 void
 457 alloc_hist(struct llen_str *llen)
 458 {
 459   int max_llen, i;
 460   max_llen = llen->max;
 461
 462   if (llen->hist == NULL) {
 463     llen->hist = (int *)calloc((size_t)(max_llen+1),sizeof(int));
 464     llen->score_sums = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
 465     llen->score2_sums =(double *)calloc((size_t)(max_llen + 1),sizeof(double));
 466     llen->score_var = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
 467   }
 468
 469   for (i=0; i< max_llen+1; i++) {
 470       llen->hist[i] = 0;
 471       llen->score_var[i] = llen->score_sums[i] = llen->score2_sums[i] = 0.0;
 472   }
 473 }
 474
 475 void
 476 free_hist(struct llen_str *llen)
 477 {
 478   if (llen->hist!=NULL) {
 479     free(llen->score_var);
 480     free(llen->score2_sums);
 481     free(llen->score_sums);
 482     free(llen->hist);
 483     llen->hist=NULL;
 484   }
 485 }
 486
 487 void
 488 inithist(struct llen_str *llen, struct pstruct pst, int max_hscore)
 489 {
 490   llen->max = MAX_LLEN;
 491
 492   llen->max_score = -1;
 493   llen->min_score=10000;
 494
 495   alloc_hist(llen);
 496
 497   llen->zero_s = 0;
 498   llen->min_length = 10000;
 499   llen->max_length = 0;
 500 }
 501
 502 void
 503 addhist(struct llen_str *llen, int score, int length, int max_hscore)
 504 {
 505   int llength;
 506   double dscore;
 507
 508   if ( score<=0 || length < LENGTH_CUTOFF) {
 509     llen->min_score = 0;
 510     llen->zero_s++;
 511     return ;
 512   }
 513
 514   if (score < llen->min_score) llen->min_score = score;
 515   if (score > llen->max_score) llen->max_score = score;
 516
 517   if (length > llen->max_length) llen->max_length = length;
 518   if (length < llen->min_length) llen->min_length = length;
 519   if (score > max_hscore) score = max_hscore;
 520
 521   llength = (int)(LN_FACT*log((double)length)+0.5);
 522
 523   if (llength < 0 ) llength = 0;
 524   if (llength > llen->max) llength = llen->max;
 525   llen->hist[llength]++;
 526   dscore = (double)score;
 527   llen->score_sums[llength] += dscore;
 528   llen->score2_sums[llength] += dscore * dscore;
 529
 530   /*
 531   db->entries++;
 532   db->length += length;
 533   if (db->length > LONG_MAX) {db->carry++;db->length -= LONG_MAX;}
 534   */
 535 }
 536
 537 /* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */
 538
 539
 540 void
 541 inithistz(int mh, struct hist_str *histp )
 542 {
 543   int i;
 544
 545   histp->min_hist = 20;
 546   histp->max_hist = 120;
 547
 548   histp->histint = (int)
 549     ((double)(histp->max_hist - histp->min_hist + 2)/(double)mh+0.5);
 550   histp->maxh = (int)
 551     ((double)(histp->max_hist - histp->min_hist + 2)/(double)histp->histint+0.5);
 552
 553   if (histp->hist_a==NULL) {
 554     if ((histp->hist_a=(int *)calloc((size_t)histp->maxh,sizeof(int)))==
 555         NULL) {
 556       fprintf(stderr," cannot allocate %d for histogram\n",histp->maxh);
 557       histp->histflg = 0;
 558     }
 559     else histp->histflg = 1;
 560   }
 561   else {
 562     for (i=0; i<histp->maxh; i++) histp->hist_a[i]=0;
 563   }
 564 }
 565
 566 /* fasts/f will not show any histogram */
 567 void
 568 addhistz(double zs, struct hist_str *histp)
 569 {
 570 }
 571
 572 void
 573 prune_hist(struct llen_str *llen, int score, int length, int max_hscore,
 574            long *entries)
 575 {
 576   int llength;
 577   double dscore;
 578
 579   if (score <= 0 || length < LENGTH_CUTOFF) return;
 580
 581   if (score > max_hscore) score = max_hscore;
 582
 583   llength = (int)(LN_FACT*log((double)length)+0.5);
 584
 585   if (llength < 0 ) llength = 0;
 586   if (llength > llen->max) llength = llen->max;
 587   llen->hist[llength]--;
 588   dscore = (double)score;
 589   llen->score_sums[llength] -= dscore;
 590   llen->score2_sums[llength] -= dscore * dscore;
 591
 592   (*entries)--;
 593   /*
 594   if (length < db->length) db->length -= length;
 595   else {db->carry--; db->length += (LONG_MAX - (unsigned long)length);}
 596   */
 597 }
 598
 599 /* fit_llen: no trimming
 600    (1) regress scores vs log(n) using weighted variance
 601    (2) calculate mean variance after length regression
 602 */
 603
 604 void
 605 fit_llen(struct llen_str *llen, struct rstat_str *pr)
 606 {
 607   int j;
 608   int n;
 609   int n_size;
 610   double x, y2, u, z;
 611   double mean_x, mean_y, var_x, var_y, covar_xy;
 612   double mean_y2, covar_xy2, var_y2, dllj;
 613
 614   double sum_x, sum_y, sum_x2, sum_xy, sum_v, delta, n_w;
 615
 616 /* now fit scores to best linear function of log(n), using
 617    simple linear regression */
 618
 619   for (llen->min=0; llen->min < llen->max; llen->min++)
 620     if (llen->hist[llen->min]) break;
 621   llen->min--;
 622
 623   for (n_size=0,j = llen->min; j < llen->max; j++) {
 624     if (llen->hist[j] > 1) {
 625       dllj = (double)llen->hist[j];
 626       llen->score_var[j] = llen->score2_sums[j]/dllj
 627         - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
 628       llen->score_var[j] /= (double)(llen->hist[j]-1);
 629       if (llen->score_var[j] <= 0.1 ) llen->score_var[j] = 0.1;
 630       n_size++;
 631     }
 632   }
 633
 634   pr->nb_tot = n_size;
 635
 636   n_w = 0.0;
 637   sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
 638   for (j = llen->min; j < llen->max; j++)
 639     if (llen->hist[j] > 1) {
 640       x = j + 0.5;
 641       dllj = (double)llen->hist[j];
 642       n_w += dllj/llen->score_var[j];
 643       sum_x +=   dllj * x / llen->score_var[j] ;
 644       sum_y += llen->score_sums[j] / llen->score_var[j];
 645       sum_x2 +=  dllj * x * x /llen->score_var[j];
 646       sum_xy +=  x * llen->score_sums[j]/llen->score_var[j];
 647     }
 648
 649   if (n_size < 5 ) {
 650     llen->fit_flag=0;
 651     pr->rho = 0;
 652     pr->mu = sum_y/n_w;
 653     return;
 654   }
 655   else {
 656     delta = n_w * sum_x2 - sum_x * sum_x;
 657     if (delta > 0.001) {
 658       pr->rho = (n_w * sum_xy  - sum_x * sum_y)/delta;
 659       pr->rho_e = n_w/delta;
 660       pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;
 661       pr->mu_e = sum_x2/delta;
 662     }
 663     else {
 664       llen->fit_flag = 0;
 665       pr->rho = 0;
 666       pr->mu = sum_y/n_w;
 667       return;
 668     }
 669   }
 670
 671   delta = n_w * sum_x2 - sum_x * sum_x;
 672   pr->rho = (n_w * sum_xy  - sum_x * sum_y)/delta;
 673   pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;
 674
 675   n = 0;
 676   mean_x = mean_y = mean_y2 = 0.0;
 677   var_x = var_y = 0.0;
 678   covar_xy = covar_xy2 = 0.0;
 679
 680   for (j = llen->min; j <= llen->max; j++)
 681    if (llen->hist[j] > 1 ) {
 682       n += llen->hist[j];
 683       x = (double)j + 0.5;
 684       mean_x += (double)llen->hist[j] * x;
 685       mean_y += llen->score_sums[j];
 686       var_x += (double)llen->hist[j] * x * x;
 687       var_y += llen->score2_sums[j];
 688       covar_xy += x * llen->score_sums[j];
 689     }
 690   mean_x /= n; mean_y /= n;
 691   var_x = var_x / n - mean_x * mean_x;
 692   var_y = var_y / n - mean_y * mean_y;
 693
 694   covar_xy = covar_xy / n - mean_x * mean_y;
 695 /*
 696   pr->rho = covar_xy / var_x;
 697   pr->mu = mean_y - pr->rho * mean_x;
 698 */
 699   mean_y2 = covar_xy2 = var_y2 = 0.0;
 700   for (j = llen->min; j <= llen->max; j++)
 701     if (llen->hist[j] > 1) {
 702       x = (double)j + 0.5;
 703       u = pr->rho * x + pr->mu;
 704       y2 = llen->score2_sums[j] - 2.0 * llen->score_sums[j] * u + llen->hist[j] * u * u;
 705 /*
 706       dllj = (double)llen->hist[j];
 707       fprintf(stderr,"%.2f\t%d\t%g\t%g\n",x/LN_FACT,llen->hist[j],
 708               llen->score_sums[j]/dllj,y2/dllj);
 709 */
 710       mean_y2 += y2;
 711       var_y2 += y2 * y2;
 712       covar_xy2 += x * y2;
 713       /*      fprintf(stderr,"%6.1f %4d %8d %8d %7.2f %8.2f\n",
 714               x,llen->hist[j],llen->score_sums[j],llen->score2_sums[j],u,y2); */
 715     }
 716
 717   pr->mean_var = mean_y2 /= (double)n;
 718   covar_xy2 = covar_xy2 / (double)n - mean_x * mean_y2;
 719
 720   if (pr->mean_var <= 0.01) {
 721     llen->fit_flag = 0;
 722     pr->mean_var = (pr->mu > 1.0) ? pr->mu: 1.0;
 723   }
 724
 725   /*
 726   fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var %.4f\n",
 727           pr->rho*LN_FACT,pr->mu,pr->mean_var);
 728   */
 729   if (n > 1) pr->var_e = (var_y2/n - mean_y2 * mean_y2)/(n-1);
 730   else pr->var_e = 0.0;
 731
 732   if (llen->fit_flag) {
 733     pr->rho2 = covar_xy2 / var_x;
 734     pr->mu2 = pr->mean_var - pr->rho2 * mean_x;
 735   }
 736   else {
 737     pr->rho2 = 0;
 738     pr->mu2 = pr->mean_var;
 739   }
 740
 741   if (pr->rho2 < 0.0 )
 742     z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
 743   else z =  pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
 744   if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
 745
 746   pr->var_cutoff = pr->rho2 * LN_FACT*log(z) + pr->mu2;
 747 }
 748
 749 /* fit_llens: trim high variance bins
 750    (1) regress scores vs log(n) using weighted variance
 751    (2) regress residuals vs log(n)
 752    (3) remove high variance bins
 753    (4) calculate mean variance after length regression
 754 */
 755
 756 void
 757 fit_llens(struct llen_str *llen, struct rstat_str *pr)
 758 {
 759   int j;
 760   int n, n_u2;
 761   double x, y, y2, u, u2, v, z;
 762   double mean_x, mean_y, var_x, var_y, covar_xy;
 763   double mean_y2, covar_xy2;
 764   double mean_u2, mean_3u2, dllj;
 765   double sum_x, sum_y, sum_x2, sum_xy, sum_v, delta, n_w;
 766
 767 /* now fit scores to best linear function of log(n), using
 768    simple linear regression */
 769
 770   for (llen->min=0; llen->min < llen->max; llen->min++)
 771     if (llen->hist[llen->min]) break;
 772   llen->min--;
 773
 774   for (j = llen->min; j < llen->max; j++) {
 775     if (llen->hist[j] > 1) {
 776       dllj = (double)llen->hist[j];
 777       llen->score_var[j] = (double)llen->score2_sums[j]/dllj
 778         - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
 779       llen->score_var[j] /= (double)(llen->hist[j]-1);
 780       if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
 781     }
 782   }
 783
 784   n_w = 0.0;
 785   sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
 786   for (j = llen->min; j < llen->max; j++)
 787     if (llen->hist[j] > 1) {
 788       x = j + 0.5;
 789       dllj = (double)llen->hist[j];
 790       n_w += dllj/llen->score_var[j];
 791       sum_x +=   dllj * x / llen->score_var[j] ;
 792       sum_y += llen->score_sums[j] / llen->score_var[j];
 793       sum_x2 +=  dllj * x * x /llen->score_var[j];
 794       sum_xy +=  x * llen->score_sums[j]/llen->score_var[j];
 795     }
 796
 797   delta = n_w * sum_x2 - sum_x * sum_x;
 798   pr->rho = (n_w * sum_xy  - sum_x * sum_y)/delta;
 799   pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;
 800
 801 /* printf(" rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */
 802
 803   n = 0;
 804   mean_x = mean_y = mean_y2 = 0.0;
 805   var_x = var_y = 0.0;
 806   covar_xy = covar_xy2 = 0.0;
 807
 808   for (j = llen->min; j <= llen->max; j++)
 809     if (llen->hist[j] > 1 ) {
 810       n += llen->hist[j];
 811       x = (double)j + 0.5;
 812       dllj = (double)llen->hist[j];
 813       mean_x += dllj * x;
 814       mean_y += llen->score_sums[j];
 815       var_x += dllj * x * x;
 816       var_y += llen->score2_sums[j];
 817       covar_xy += x * llen->score_sums[j];
 818     }
 819   mean_x /= n; mean_y /= n;
 820   var_x = var_x / n - mean_x * mean_x;
 821   var_y = var_y / n - mean_y * mean_y;
 822
 823   covar_xy = covar_xy / n - mean_x * mean_y;
 824 /*  pr->rho = covar_xy / var_x;
 825   pr->mu = mean_y - pr->rho * mean_x;
 826 */
 827
 828   mean_y2 = covar_xy2 = 0.0;
 829   for (j = llen->min; j <= llen->max; j++)
 830     if (llen->hist[j] > 1) {
 831       x = (double)j + 0.5;
 832       u = pr->rho * x + pr->mu;
 833       y2 = llen->score2_sums[j] - 2 * llen->score_sums[j] * u + llen->hist[j] * u * u;
 834       mean_y2 += y2;
 835       covar_xy2 += x * y2;
 836     }
 837
 838   mean_y2 /= n;
 839   covar_xy2 = covar_xy2 / n - mean_x * mean_y2;
 840   pr->rho2 = covar_xy2 / var_x;
 841   pr->mu2 = mean_y2 - pr->rho2 * mean_x;
 842
 843   if (pr->rho2 < 0.0 )
 844     z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
 845   else z =  pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
 846   if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
 847
 848   pr->var_cutoff = pr->rho2*LN_FACT*log(z) + pr->mu2;
 849
 850 /*  fprintf(stderr,"\nminimum allowed predicted variance (%0.2f) at n = %.0f\n",
 851          pr->var_cutoff,z);
 852 */
 853   mean_u2 = 0.0;
 854   n_u2 = 0;
 855   for ( j = llen->min; j < llen->max; j++) {
 856     y = j+0.5;
 857     dllj = (double)llen->hist[j];
 858     x = pr->rho * y + pr->mu;
 859     v = pr->rho2 * y + pr->mu2;
 860     if (v < pr->var_cutoff) v = pr->var_cutoff;
 861     if (llen->hist[j]> 1) {
 862       u2 =  (llen->score2_sums[j] - 2 * x * llen->score_sums[j] + dllj * x * x) - v*dllj;
 863       mean_u2 += llen->score_var[j] = u2*u2/(llen->hist[j]-1);
 864       n_u2++;
 865       /*      fprintf(stderr," %d (%d) u2: %.2f v*ll: %.2f %.2f\n",
 866               j,llen->hist[j],u2,v*dllj,sqrt(llen->score_var[j])); */
 867     }
 868     else llen->score_var[j] = -1.0;
 869   }
 870
 871   mean_u2 = sqrt(mean_u2/(double)n_u2);
 872   /* fprintf(stderr," mean s.d.: %.2f\n",mean_u2); */
 873
 874   mean_3u2 = mean_u2*3.0;
 875
 876   for (j = llen->min; j < llen->max; j++) {
 877     if (llen->hist[j] <= 1) continue;
 878     if (sqrt(llen->score_var[j]) > mean_3u2) {
 879       /*      fprintf(stderr," removing %d %d %.2f\n",
 880              j, (int)(exp((double)j/LN_FACT)-0.5),
 881              sqrt(llen->score_var[j]));
 882              */
 883       pr->nb_trimmed++;
 884       pr->n1_trimmed += llen->hist[j];
 885       llen->hist[j] = 0;
 886     }
 887   }
 888   fit_llen(llen, pr);
 889 }
 890
 891
 892 /* REG_STATS - Z() from rho/mu/mean_var */
 893 double find_zr(int score, double escore, int length, double comp,
 894               struct rstat_str *rs)
 895 {
 896   double log_len, z;
 897
 898   if (score <= 0) return 0.0;
 899   if ( length < LENGTH_CUTOFF) return 0.0;
 900
 901   log_len = LN_FACT*log((double)(length));
 902 /*  var = rs->rho2 * log_len + rs->mu2;
 903   if (var < rs->var_cutoff) var = rs->var_cutoff;
 904 */
 905
 906   z = ((double)score - rs->rho * log_len - rs->mu) / sqrt(rs->mean_var);
 907
 908   return (50.0 + z*10.0);
 909 }
 910
 911 double find_zt(int score, double escore, int length, double comp,
 912               struct rstat_str *rs)
 913 {
 914   if (escore > 0.0) return -log(escore)/M_LN2;
 915   else return 744.440071/M_LN2;
 916 }
 917
 918 double find_zn(int score, double escore, int length, double comp,
 919               struct rstat_str *rs)
 920 {
 921   double z;
 922
 923   z = ((double)score - rs->mu) / sqrt(rs->mean_var);
 924
 925   return (50.0 + z*10.0);
 926 }
 927
 928 /* computes E value for a given z value, assuming extreme value distribution */
 929 double
 930 z_to_E(double zs, long entries, struct db_str db)
 931 {
 932   double e, n;
 933
 934   /*  if (db->entries < 5) return (double)db.entries; */
 935   if (entries < 1) { n = db.entries;}
 936   else {n = entries;}
 937
 938   if (zs > ZS_MAX) return 0.0;
 939
 940   e = exp(-PI_SQRT6 * zs - EULER_G);
 941   return n * (e > .01 ? 1.0 - exp(-e) : e);
 942 }
 943
 944 double
 945 zs_to_p(double zs)
 946 {
 947   return zs;
 948 }
 949
 950 /* this version assumes the probability is in the ->zscore variable,
 951    which is provided by this file after last_scale()
 952 */
 953
 954 double
 955 zs_to_bit(double zs, int n0, int n1)
 956 {
 957   return zs+log((double)(n0*n1))/M_LN2 ;
 958 }
 959
 960 /* computes E-value for a given z value, assuming extreme value distribution */
 961 double
 962 zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db)
 963 {
 964   double e, z, k;
 965
 966   /*  if (db->entries < 5) return 0.0; */
 967
 968   if (zs > ZS_MAX ) return 0.0;
 969
 970   if (entries < 1) entries = db.entries;
 971
 972   if (dnaseq == SEQT_DNA || dnaseq == SEQT_RNA) {
 973     k = (double)db.length /(double)n1;
 974     if (db.carry > 0) { k *= (double)db.carry * (double)LONG_MAX;}
 975   }
 976   else k = (double)entries;
 977
 978   if (k < 1.0) k = 1.0;
 979
 980   zs *= M_LN2;
 981   if ( zs > 100.0) e = 0.0;
 982   else e =  exp(-zs);
 983   return k * e;
 984 }
 985
 986 /* computes E-value for a given z value, assuming extreme value distribution */
 987 double
 988 E_to_zs(double E, long entries)
 989 {
 990   double e, z;
 991   int error;
 992
 993   e = E/(double)entries;
 994
 995 #ifndef NORMAL_DIST
 996   z = (log(e)+EULER_G)/(-PI_SQRT6);
 997   return z*10.0+50.0;
 998 #else
 999   z = np_to_z(1.0-e,&error);
1000
1001   if (!error) return z*10.0+50.0;
1002   else return 0.0;
1003 #endif
1004 }
1005
1006 /* computes 1.0 - E value for a given z value, assuming extreme value
1007    distribution */
1008 double
1009 zs_to_Ec(double zs, long entries)
1010 {
1011   double e, z;
1012
1013   if (entries < 5) return 0.0;
1014
1015   z = (zs - 50.0)/10.0;
1016
1017   if (z > ZS_MAX) return 1.0;
1018
1019   e =  exp(-PI_SQRT6 * z - EULER_G);
1020   return (double)entries * (e > .01 ?  exp(-e) : 1.0 - e);
1021 }
1022
1023 void
1024 vsort(v,s,n)
1025         double *v; int *s, n;
1026 {
1027   int gap, i, j;
1028   double tmp;
1029   int itmp;
1030
1031   for (gap=n/2; gap>0; gap/=2)
1032     for (i=gap; i<n; i++)
1033       for (j=i-gap; j>=0; j -= gap) {
1034         if (v[j] >= v[j+gap]) break;
1035         tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
1036         itmp = s[j]; s[j]=s[j+gap]; s[j+gap]=itmp;
1037       }
1038 }
1039
1040 void
1041 sort_escore(double *v, int n)
1042 {
1043   int gap, i, j;
1044   double dtmp;
1045
1046   for (gap=n/2; gap>0; gap/=2) {
1047     for (i=gap; i<n; i++) {
1048       for (j=i-gap; j>=0; j -= gap) {
1049         if (v[j] <= v[j+gap]) break;
1050         dtmp = v[j];
1051         v[j] = v[j+gap];
1052         v[j+gap] = dtmp;
1053       }
1054     }
1055   }
1056 }
1057
1058 /* scale_tat - compute 'a', 'b', 'c' coefficients for scaling fasts/f
1059    escores
1060    5-May-2003 - also calculate index for high ties
1061 */
1062 void
1063 scale_tat(double *escore, int nstats,
1064           long db_entries, int do_trim,
1065           struct rstat_str *rs)
1066 {
1067   int i, j, k, start;
1068   double *x, *lnx, *lny;
1069
1070   /*   sort_escore(escore, nstats); */
1071
1072   while (*escore<0.0) {escore++; nstats--; }
1073
1074   x = (double *) calloc(nstats, sizeof(double));
1075   if(x == NULL) {
1076     fprintf(stderr, "Couldn't calloc tatE/x\n");
1077     exit(1);
1078   }
1079
1080   lnx = (double *) calloc(nstats,sizeof(double));
1081   if(lnx == NULL) {
1082     fprintf(stderr, "Couldn't calloc tatE/lnx\n");
1083     exit(1);
1084   }
1085
1086   lny = (double *) calloc(nstats,sizeof(double));
1087   if(lny == NULL) {
1088     fprintf(stderr, "Couldn't calloc tatE/lny\n");
1089     exit(1);
1090   }
1091
1092   for(i = 0 ; i < nstats ; ) {
1093
1094     lny[i] = log(escore[i]);
1095
1096     for(j = i+1 ; j < nstats ; j++) {
1097         if(escore[j] != escore[i]) break;
1098     }
1099
1100     x[i] = ((((double)i + (double)(j - i - 1)/2.0)*(double)nstats/(double)db_entries)+1.0)/(double)nstats;
1101     lnx[i] = log(x[i]);
1102
1103     for(k = i+1 ; k < j ; k++) {
1104       lny[k]=lny[i];
1105       x[k] = x[i];
1106       lnx[k]=lnx[i];
1107     }
1108     i = k;
1109   }
1110
1111   if (!do_trim) {
1112     start = 0;
1113   } else {
1114     start = 0.05 * (double) nstats;
1115     start = start > 500 ? 500 : start;
1116   }
1117
1118   linreg(lny, x, lnx, nstats, &rs->tat_a, &rs->tat_b, &rs->tat_c, start);
1119
1120   /* I have the coefficients I need - a, b, c; free arrays */
1121
1122   free(lny);
1123   free(lnx);
1124   free(x);
1125
1126   /* calculate tie_j - the index below which all scores are considered
1127      positional ties */
1128
1129   rs->tie_j = 0.005 * db_entries;
1130 }
1131
1132 void
1133 linreg(double *lny, double *x, double *lnx, int n,
1134        double *a, double *b, double *c, int start) {
1135
1136   double yf1, yf2, yf3;
1137   double f1f1, f1f2, f1f3;
1138   double f2f2, f2f3;
1139   double f3f3, delta;
1140
1141   int i;
1142
1143   yf1 = yf2 = yf3 = 0.0;
1144   f1f1 = f1f2 = f1f3 = f2f2 = f2f3 = f3f3 = 0.0;
1145
1146   for(i = start; i < n; i++) {
1147     yf1 += lny[i] * lnx[i];
1148     yf2 += lny[i] * x[i];
1149     yf3 += lny[i];
1150
1151     f1f1 += lnx[i] * lnx[i];
1152     f1f2 += lnx[i] * x[i];
1153     f1f3 += lnx[i];
1154
1155     f2f2 += x[i] * x[i];
1156     f2f3 += x[i];
1157
1158     f3f3 += 1.0;
1159   }
1160
1161   delta = det(f1f1, f1f2, f1f3, f1f2, f2f2, f2f3, f1f3, f2f3, f3f3);
1162
1163   *a = det(yf1, f1f2, f1f3, yf2, f2f2, f2f3, yf3, f2f3, f3f3) / delta;
1164   *b = det(f1f1, yf1, f1f3, f1f2, yf2, f2f3, f1f3, yf3, f3f3) / delta;
1165   *c = det(f1f1, f1f2, yf1, f1f2, f2f2, yf2, f1f3, f2f3, yf3) / delta;
1166
1167 }
1168
1169 double det(double a11, double a12, double a13,
1170            double a21, double a22, double a23,
1171            double a31, double a32, double a33)
1172 {
1173   double result;
1174
1175   result = a11 * (a22 * a33 - a32 * a23);
1176   result -= a12 * (a21 * a33 - a31 * a23);
1177   result += a13 * (a21 * a32 - a31 * a22);
1178
1179   return result;
1180 }
1181
1182 void
1183 last_stats(const unsigned char *aa0, int n0,
1184            struct stat_str *sptr, int nstats,
1185            struct beststr **bestp_arr, int nbest,
1186            struct mngmsg m_msg, struct pstruct pst,
1187            struct hist_str *histp, struct rstat_str **rs_sp)
1188 {
1189   double *obs_escore;
1190   int i, nobs, nobs_t, do_trim;
1191   long db_entries;
1192   struct rstat_str *rs_s;
1193
1194   if (*rs_sp == NULL) {
1195     if ((rs_s=(struct rstat_str *)calloc(1,sizeof(struct rstat_str)))==NULL) {
1196       fprintf(stderr," cannot allocate rs_s: %ld\n",sizeof(struct rstat_str));
1197       exit(1);
1198     }
1199     else *rs_sp = rs_s;
1200   }
1201   else rs_s = *rs_sp;
1202
1203   histp->entries = 0;
1204
1205   sortbeste(bestp_arr,nbest);
1206
1207   rs_s->spacefactor =
1208     calc_spacefactor(aa0, n0, m_msg.nm0,pst.nsq);
1209
1210   if (pst.zsflag >= 1 && pst.zsflag <= 4) {
1211     if (m_msg.escore_flg) {
1212       nobs = nbest;
1213       do_trim = 1;
1214     }
1215     else {
1216       nobs = nstats;
1217       do_trim = 0;
1218     }
1219
1220     if ((obs_escore = (double *)calloc(nobs,sizeof(double)))==NULL) {
1221       fprintf(stderr," cannot allocate obs_escore[%d]\n",nbest);
1222       exit(1);
1223     }
1224
1225     if (m_msg.escore_flg) {
1226       for (i=nobs=0; i<nbest; i++) {
1227         if (bestp_arr[i]->escore<= 1.00)
1228           obs_escore[nobs++]=bestp_arr[i]->escore;
1229       }
1230       /*
1231       nobs_t = nobs;
1232       for (i=0; i<nbest; i++) {
1233         if (bestp_arr[i]->escore >= 0.99 &&
1234             bestp_arr[i]->escore <= 1.00)
1235           obs_escore[nobs++]=bestp_arr[i]->escore;
1236       }
1237       */
1238       db_entries = m_msg.db.entries;
1239     }
1240     else {
1241       for (i=nobs=0; i<nstats; i++) {
1242         if (sptr[i].escore <= 1.00 ) obs_escore[nobs++]=sptr[i].escore;
1243       }
1244       /*
1245       nobs_t = nobs;
1246       for (i=0; i<nstats; i++) {
1247         if (sptr[i].escore >= 0.99 &&
1248             sptr[i].escore <= 1.0) obs_escore[nobs++]=sptr[i].escore;
1249       }
1250       */
1251       db_entries = nobs;
1252 /*    db_entries = m_msg.db.entries;*/
1253     }
1254
1255     sortbesto(obs_escore,nobs);
1256     if (nobs > 100) {
1257       scale_tat(obs_escore,nobs,db_entries,do_trim,rs_s);
1258       rs_s->have_tat=1;
1259       sprintf(histp->stat_info,"scaled Tatusov statistics (%d): tat_a: %6.4f tat_b: %6.4f tat_c: %6.4f",
1260               nobs,rs_s->tat_a, rs_s->tat_b, rs_s->tat_c);
1261     }
1262     else {
1263       rs_s->have_tat=0;
1264       sprintf(histp->stat_info,"Space_factor %.4g scaled statistics",
1265             rs_s->spacefactor);
1266     }
1267     free(obs_escore);
1268   }
1269   else {
1270     rs_s->have_tat=0;
1271     histp->stat_info[0] = '\0';
1272   }
1273 }
1274
1275 /* scale_scores() takes the best (real) scores and re-scales them;
1276    beststr bptr[] must be sorted */
1277
1278 void
1279 scale_scores(struct beststr **bptr, int nbest, struct db_str db,
1280              struct pstruct pst, struct rstat_str *rs)
1281 {
1282   int i, j, k;
1283   double obs, r_a, r_b, r_c;
1284
1285   /* this scale function absolutely requires that the results be sorted
1286      before it is used */
1287
1288   sortbeste(bptr,nbest);
1289
1290   if (!rs->have_tat) {
1291     for (i=0; i<nbest; i++) {
1292       bptr[i]->escore *= rs->spacefactor;
1293     }
1294   }
1295   else {
1296
1297     /* here if more than 1000 scores */
1298
1299     r_a = rs->tat_a; r_b = rs->tat_b; r_c = rs->tat_c;
1300
1301   /* the problem with scaletat is that the E() value is related to
1302      ones position in the list of top scores - thus, knowing the score
1303      is not enough - one must know the rank */
1304
1305     for(i = 0 ; i < nbest ; ) {
1306       /* take the bottom 0.5%, and the ties, and treat them all the same */
1307       j = i + 1;
1308       while (j< nbest &&
1309              (j <= (0.005 * db.entries) || bptr[j]->escore == bptr[i]->escore)
1310              ) {
1311         j++;
1312       }
1313
1314       /* observed frequency */
1315       obs = ((double)i + ((double)(j - i - 1)/ 2.0) + 1.0)/(double)db.entries;
1316
1317       /* make certain ties all have the same correction */
1318       for (k = i ; k < j ; k++) {
1319         bptr[k]->escore *= obs/exp(r_a*log(obs) + r_b*obs + r_c);
1320       }
1321       i = k;
1322     }
1323   }
1324
1325   for (i=0; i<nbest; i++) {
1326     if(bptr[i]->escore > 0.01)
1327       bptr[i]->escore = 1.0 - exp(-bptr[i]->escore);
1328     if (bptr[i]->escore > 0.0)
1329       bptr[i]->zscore = -log(bptr[i]->escore)/M_LN2;
1330     else
1331       bptr[i]->zscore = 744.440071/M_LN2;
1332     bptr[i]->escore *= pst.zdb_size;
1333   }
1334 }
1335
1336 double scale_one_score (int ipos, double escore,
1337                         struct db_str db,
1338                         struct rstat_str *rs) {
1339   double obs;
1340   double a, b, c;
1341
1342   if (!rs->have_tat)
1343     return escore * rs->spacefactor;
1344
1345   if (ipos < rs->tie_j) ipos = rs->tie_j/2;
1346
1347   a = rs->tat_a; b = rs->tat_b; c = rs->tat_c;
1348
1349   obs = ((double)ipos + 1.0)/(double)db.entries;
1350
1351   escore *= obs/exp(a*log(obs) + b*obs + c);
1352
1353   return escore;
1354 }
1355
1356 double calc_spacefactor(const unsigned char *aa0, int n0,
1357                         int nm0, int nsq) {
1358
1359 #if !defined(FASTF)
1360   return pow(2.0, (double) nm0) - 1.0;
1361 #else
1362
1363   int i, j, n, l, nr, bin, k;
1364   int nmoff;
1365   int **counts;
1366   int **factors;
1367   double tmp, result = 0.0;
1368
1369   nmoff = (n0 - nm0 + 1)/nm0+1;
1370
1371   counts = (int **) calloc(nsq, sizeof(int *));
1372   if(counts == NULL) {
1373     fprintf(stderr, "couldn't calloc counts array!\n");
1374     exit(1);
1375   }
1376
1377   counts[0] = (int *) calloc(nsq * (nmoff - 1), sizeof(int));
1378   if(counts[0] == NULL) {
1379     fprintf(stderr, "couldn't calloc counts array!\n");
1380     exit(1);
1381   }
1382
1383   for(i = 0 ; i < nsq ; i++) {
1384     counts[i] = counts[0] + (i * (nmoff - 1));
1385   }
1386
1387   for(i = 0 ; i < nm0 ; i++) {
1388     for(j = 0 ; j < (nmoff - 1) ; j++) {
1389       counts[ aa0[nmoff * i + j] ] [ j ] ++;
1390     }
1391   }
1392
1393   factors = (int **) calloc(nm0 + 1, sizeof(int *));
1394   if(factors == NULL) {
1395     fprintf(stderr, "Couldn't calloc factors array!\n");
1396     exit(1);
1397   }
1398
1399   factors[0] = (int *) calloc((nm0 + 1) * (nmoff - 1), sizeof(int));
1400   if(factors[0] == NULL) {
1401     fprintf(stderr, "Couldn't calloc factors array!\n");
1402     exit(1);
1403   }
1404
1405   for(i = 0 ; i <= nm0 ; i++) {
1406     factors[i] = factors[0] + (i * (nmoff - 1));
1407   }
1408
1409   /*
1410     this algorithm was adapted from the GAP4 library's NrArrangement function:
1411     The GAP Group, GAP --- Groups, Algorithms, and Programming,
1412     Version 4.1; Aachen, St Andrews, 1999.
1413     (http://www-gap.dcs.st-and.ac.uk/ gap)
1414   */
1415
1416   /* calculate K factors for each column in query: */
1417   for(j = 0 ; j < (nmoff - 1) ; j++) {
1418
1419     /* only one way to select 0 elements */
1420     factors[0][j] = 1;
1421
1422     /* for each of the possible elements in this column */
1423     for(n = 0 ; n < nsq ; n++) {
1424
1425       /* if there aren't any of these, skip it */
1426       if(counts[n][j] == 0) { continue; }
1427
1428       /* loop over the possible lengths of the arrangement: K..0 */
1429       for(l = nm0 ; l >= 0 ; l--) {
1430         nr = 0;
1431         bin = 1;
1432
1433         /*
1434           compute the number of arrangements of length <l>
1435           using only the first <n> elements of <mset>
1436         */
1437         for(i = 0, k = min(counts[n][j], l); i <= k ; i++) {
1438
1439           /*
1440              add the number of arrangements of length <l>
1441              that consist of <l>-<i> of the first <n>-1 elements
1442              and <i> copies of the <n>th element
1443           */
1444           nr += bin * factors[l-i][j];
1445           bin = (int) ((float) bin * (float) (l - i) / (float) (i + 1));
1446         }
1447
1448         factors[l][j] = nr;
1449       }
1450     }
1451   }
1452
1453   result = 0.0;
1454   for(i = 1 ; i <= nm0 ; i++) {
1455     tmp = 1.0;
1456     for(j = 0 ; j < (nmoff - 1) ; j++) {
1457       tmp *= (double) factors[i][j];
1458     }
1459     tmp /= factorial(i, 1);
1460     result += tmp;
1461   }
1462
1463   free(counts[0]);
1464   free(counts);
1465   free(factors[0]);
1466   free(factors);
1467
1468   return result;
1469 #endif
1470 }
1471
1472 void sortbesto (double *obs, int nobs)
1473 {
1474   int gap, i, j, k;
1475   double v;
1476   int incs[16] = { 1391376, 463792, 198768, 86961, 33936,
1477                    13776, 4592, 1968, 861, 336,
1478                    112, 48, 21, 7, 3, 1 };
1479
1480   for ( k = 0; k < 16; k++)
1481     for (gap = incs[k], i=gap; i < nobs; i++) {
1482       v = obs[i];
1483       j = i;
1484       while ( j >= gap && obs[j-gap] > v) {
1485         obs[j] = obs[j - gap];
1486         j -= gap;
1487       }
1488       obs[j] = v;
1489     }
1490 }