/* scaleswt.c */

/* $Name: fa_34_26_5 $ - $Id: scaleswt.c,v 1.21 2006/04/12 18:50:01 wrp Exp $ */
/* as of 24 Sept, 2000 - scaleswn uses no global variables */

/*
  copyright (c) 1995, 1996, 2000, 2002 William R. Pearson

  This version is designed for fasts/f, which used Tatusov
  probabilities for statistical estimates, but still needs a
  quick-and-dirty linear regression fit to rank things

  For comparisons that obey tatusov statistics, we try whenever
  possible to provide accurate e_scores, rather than raw scores.  As a
  result, no lambda/K fitting is required; and process_hist() can be
  called atthe very beginning of the search to initialize some of the
  statistics structures and find_zp().

  find_zp() must still return a valid z_score surrogate, as
  comp_lib.c/p2_complib.c continue to use z_score's to rank hits, save
  the best, etc.

  If e_score's cannot be calculated, the process_hist() provides
  linear regression fitting for conventional z_score estimates.

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <limits.h>
#include <float.h>
#include <math.h>

#include <limits.h>

#include "defs.h"
#include "param.h"
#include "structs.h"
#ifndef PCOMPLIB
#include "mw.h"
#else
#include "p_mw.h"
#endif

#define MAXHIST 50
#define MAX_LLEN 200
#define LHISTC 5
#define VHISTC 5
#define MAX_SSCORE 300

#define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */

#define LN_FACT 10.0
#ifndef M_LN2
#define M_LN2 0.69314718055994530942
#endif

#define EULER_G 0.57721566490153286060
#define PI_SQRT6 1.28254983016186409554

#ifndef M_SQRT2
#define M_SQRT2 1.41421356237
#endif
#define LN200 5.2983173666
#define ZS_MAX 400.0	/* used to prevent underflow on some machines */
#define TOLERANCE 1.0e-12
#define TINY 1.0e-6

/* used by AVE_STATS, REG_STATS, REGI_STATS, REG2_STATS*/
struct rstat_str {
  double ngLambda, ngK, ngH;
  double rho, rho_e, mu, mu_e, mean_var, var_e;  /* ?_e:std. error of ? */
/* used by REG2_STATS */
  double rho2, mu2, var_cutoff;
  int n_trimmed; /* excluded because of high z-score */
  int n1_trimmed, nb_trimmed, nb_tot; /* excluded because of bin */
  double tat_a, tat_b, tat_c, spacefactor;
  int have_tat;
  int tie_j;
};

#define AVE_STATS 0	/* no length effect, only mean/variance */
double find_zt(int score, double escore, int len, double comp, struct rstat_str *);

double find_zn(int score, double escore, int len, double comp, struct rstat_str *);

double power(double, int);

void sortbesto(double *, int );
extern void sortbeste(struct beststr **bptr, int nbest);

int proc_hist_n(struct stat_str *sptr, int n, 
		 struct pstruct pst, struct hist_str *histp, int do_trim,
		 struct rstat_str *);

#define REG_STATS 1	/* length-regression scaled */
double find_zr(int score, double escore, int len, double comp, struct rstat_str *);

int proc_hist_r(struct stat_str *sptr, int n,
		 struct pstruct pst, struct hist_str *histp,
		 int do_trim, struct rstat_str *rs);

double (*find_zp)(int score, double escore, int len, double comp,
		 struct rstat_str *) = &find_zr;

struct llen_str {
  int min, max;
  int max_score, min_score;
  int *hist;
  double *score_sums, *score2_sums;
  double *score_var;
  int max_length, min_length, zero_s;
  int fit_flag;
};

static void inithist(struct llen_str *, struct pstruct, int);
static void free_hist( struct llen_str *);
static void addhist(struct llen_str *, int, int, int);
static void prune_hist(struct llen_str *, int, int, int, long *);
void inithistz(int, struct hist_str *histp);
void addhistz(double zs, struct hist_str *histp);

static void fit_llen(struct llen_str *, struct rstat_str *);
static void fit_llens(struct llen_str *, struct rstat_str *);

void linreg(double *lny, double *x, double *lnx, int n,
	    double *a, double *b, double *c, int start);

double calc_spacefactor(const unsigned char *, int, int, int);

double det(double a11, double a12, double a13,
	   double a21, double a22, double a23,
	   double a31, double a32, double a33);

double factorial (int a, int b);

/* void set_db_size(int, struct db_str *, struct hist_str *); */

#ifdef DEBUG
FILE *tmpf;
#endif

int
process_hist(struct stat_str *sptr, int nstats, 
	     struct mngmsg m_msg,
	     struct pstruct pst,
	     struct hist_str *histp,
	     struct rstat_str **rs_sp,
	     int do_hist
	     )
{
  int zsflag, do_trim;
  struct rstat_str *rs_s;

  if (pst.zsflag < 0) {
    *rs_sp = NULL;
    return pst.zsflag;
  }

  if (*rs_sp == NULL) {
    if ((rs_s=(struct rstat_str *)calloc(1,sizeof(struct rstat_str)))==NULL) {
      fprintf(stderr," cannot allocate rs_snion: %ld\n",sizeof(struct rstat_str));
      exit(1);
    }
    else *rs_sp = rs_s;
  }
  else {
    rs_s = *rs_sp;
    memset(rs_s,0,sizeof(struct rstat_str));
  }

  if (m_msg.escore_flg) {
    find_zp = &find_zt;
    inithistz(MAXHIST,histp);
    return 1;
  }

  if (nstats < 20) {
    fprintf(stderr," too few sequences for sampling: %d\n",nstats);
    free(rs_s);
    *rs_sp = NULL;
    return -1;
  }

  rs_s->ngLambda = m_msg.Lambda;
  rs_s->ngK = m_msg.K;
  rs_s->ngH = m_msg.H;

  zsflag = pst.zsflag;

  if (zsflag >= 10) {
    zsflag -= 10;
    do_trim = 0;
  }
  else do_trim = 1;

  find_zp = &find_zr;
  return proc_hist_r(sptr, nstats,pst, histp, do_trim,  rs_s);
}

int
calc_thresh(struct pstruct pst, int nstats, 
	    double Lambda, double K, double H, double *zstrim)
{
  int max_hscore;
  double ave_n1, tmp_score, z, l_fact;

  if (pst.dnaseq == SEQT_DNA || pst.dnaseq == SEQT_RNA) {
    ave_n1 = 5000.0;
    l_fact = 1.0;
  }
  else {
    ave_n1 = 400.0;
    l_fact = 0.7;
  }

/*  max_hscore = MAX_SSCORE; */
/*  mean expected for pst.n0 * 400 for protein, 5000 for DNA */
/*  we want a number of offsets that is appropriate for the database size so
    far (nstats)
*/

/*
  the calculation below sets a high-score threshold using an
  ungapped lambda, but errs towards the high-score side by using
  E()=0.001 and calculating with 0.70*lambda, which is the correct for
  going from ungapped to -12/-2 gapped lambda with BLOSUM50
*/

#ifndef NORMAL_DIST
  tmp_score = 0.01/((double)nstats*K*(double)pst.n0*ave_n1);
  tmp_score = -log(tmp_score)/(Lambda*l_fact);
  max_hscore = (int)(tmp_score+0.5);

  z = 1.0/(double)nstats;
  z = (log(z)+EULER_G)/(-PI_SQRT6);
#else
  max_hscore = 100;
  z = 5.0;
#endif
  *zstrim = 10.0*z+50.0;
  return max_hscore;
}

int
proc_hist_r(struct stat_str *sptr, int nstats,
	    struct pstruct pst, struct hist_str *histp,
	    int do_trim, struct rstat_str *rs)
{
  int i, max_hscore;
  double zs, ztrim;
  char s_string[128];
  struct llen_str llen;
  char *f_string;
  llen.fit_flag=1;
  llen.hist=NULL;

  max_hscore = calc_thresh(pst, nstats, rs->ngLambda,
			   rs->ngK, rs->ngH, &ztrim);

  inithist(&llen,pst,max_hscore);
  f_string = &(histp->stat_info[0]);

  for (i = 0; i<nstats; i++)
    addhist(&llen,sptr[i].score,sptr[i].n1, max_hscore);
  histp->entries = nstats - llen.zero_s;

  if ((llen.max_score - llen.min_score) < 10) {
    free_hist(&llen);
    llen.fit_flag = 0;
    find_zp = &find_zn;
    return proc_hist_n(sptr, nstats, pst, histp, do_trim, rs);
  }

  fit_llen(&llen, rs); /* now we have rho, mu, rho2, mu2, mean_var
			  to set the parameters for the histogram */

  if (!llen.fit_flag) {	/* the fit failed, fall back to proc_hist_n */
    free_hist(&llen);
    find_zp = &find_zn;
    return proc_hist_n(sptr,nstats, pst, histp, do_trim, rs);
  }

  rs->n_trimmed= rs->n1_trimmed = rs->nb_trimmed = 0;

  if (do_trim) {
    if (llen.fit_flag) {
      for (i = 0; i < nstats; i++) {
	zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, rs);
	if (zs < 20.0 || zs > ztrim) {
	  rs->n_trimmed++;
	  prune_hist(&llen,sptr[i].score,sptr[i].n1, max_hscore,
		     &(histp->entries));
	}
      }
    }

  /*  fprintf(stderr,"Z-trimmed %d entries with z > 5.0\n", rs->n_trimmed); */

    if (llen.fit_flag) fit_llens(&llen, rs);

  /*   fprintf(stderr,"Bin-trimmed %d entries in %d bins\n", rs->n1_trimmed,rs->nb_trimmed); */
  }


  free_hist(&llen);

  /* rst all the scores in the histogram */

  if (pst.zsflag < 10) s_string[0]='\0';
  else if (pst.zs_win > 0)
    sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
  else strncpy(s_string,"(shuffled)",sizeof(s_string));

  inithistz(MAXHIST, histp);

  sprintf(f_string,"%s Expectation_n fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f\n mean_var=%6.4f+/-%6.3f, 0's: %d Z-trim: %d  B-trim: %d in %d/%d\n Lambda= %6.4f",
	  s_string,
	  rs->rho*LN_FACT,sqrt(rs->rho_e),rs->mu,sqrt(rs->mu_e), rs->mean_var,sqrt(rs->var_e),
	  llen.zero_s, rs->n_trimmed, rs->n1_trimmed, rs->nb_trimmed, rs->nb_tot,
	  PI_SQRT6/sqrt(rs->mean_var));
    return REG_STATS;
}


int
proc_hist_n(struct stat_str *sptr, int nstats,
	    struct pstruct pst, struct hist_str *histp,
	    int do_trim, struct rstat_str *rs)
{
  int i, j;
  double s_score, s2_score, ssd;
  double ztrim;
  int nit, max_hscore;
  char s_string[128];
  char *f_string;

  f_string = &(histp->stat_info[0]);
  /*   db->entries = db->length = db->carry = 0; */

  max_hscore = calc_thresh(pst, nstats, rs->ngLambda,
			   rs->ngK, rs->ngH, &ztrim);

  s_score = s2_score = 0.0;

  histp->entries = 0;

  for ( j = 0, i = 0; i < nstats; i++) {
    if (sptr[i].score > 0 && sptr[i].score <= max_hscore) {
      s_score += (ssd=(double)sptr[i].score);
      s2_score += ssd * ssd;
      histp->entries++;
      /* 
      db->length += sptr[i].n1;
      if (db->length > LONG_MAX) {
	db->carry++;
	db->length -= LONG_MAX;
      }
      */
      j++;
    }
  }

  if (j > 1 ) {
    rs->mu = s_score/(double)j;
    rs->mean_var = s2_score - (double)j * rs->mu * rs->mu;
    rs->mean_var /= (double)(j-1);
  }
  else {
    rs->mu = 50.0;
    rs->mean_var = 10.0;
  }
  
  if (rs->mean_var < 0.01) {
    rs->mean_var = (rs->mu > 1.0) ? rs->mu: 1.0;
  }

  /* now remove some scores */

  nit = 5;
  while (nit-- > 0) {
    rs->n_trimmed = 0;

    for (i=0; i< nstats; i++) {
      if (sptr[i].n1 < 0) continue;
      ssd = find_zn(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, rs);
      if (ssd > ztrim || ssd < 20.0) {
	/*      fprintf(stderr,"removing %3d %3d %4.1f\n",
		sptr[i].score, sptr[i].n1,ssd); */
	ssd = sptr[i].score;
	s_score -= ssd;
	s2_score -= ssd*ssd;
	j--;
	rs->n_trimmed++;
	histp->entries--;
	sptr[i].n1 = -sptr[i].n1;
      }
    }

    if (j > 1 ) {
      rs->mu = s_score/(double)j;
      rs->mean_var = s2_score - (double)j * rs->mu * rs->mu;
      rs->mean_var /= (double)(j-1);
    }
    else {
      rs->mu = 50.0;
      rs->mean_var = 10.0;
    }

    if (rs->mean_var < 0.01) {
      rs->mean_var = (rs->mu > 1.0) ? rs->mu: 1.0;
    }

    if (rs->n_trimmed < LHISTC) {
      /*
	fprintf(stderr,"nprune %d at %d\n",nprune,nit);
	*/
      break;
    }
  }

  if (pst.zsflag < 10) s_string[0]='\0';
  else if (pst.zs_win > 0)
    sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
  else strncpy(s_string,"(shuffled)",sizeof(s_string));

  sprintf(f_string,"%s unscaled statistics: mu= %6.4f  var=%6.4f; Lambda= %6.4f",
	  s_string, rs->mu,rs->mean_var,PI_SQRT6/sqrt(rs->mean_var));
  return AVE_STATS;
}


/*
This routine calculates the maximum likelihood estimates for the
extreme value distribution exp(-exp(-(-x-a)/b)) using the formula

	<lambda> = x_m - sum{ x[i] * exp (-x[i]<lambda>)}/sum{exp (-x[i]<lambda>)}
	<a> = -<1/lambda> log ( (1/nlib) sum { exp(-x[i]/<lambda> } )

	The <a> parameter can be transformed into and K
	of the formula: 1 - exp ( - K m n exp ( - lambda S ))
	using the transformation: 1 - exp ( -exp -(lambda S + log(K m n) ))
			1 - exp ( -exp( - lambda ( S + log(K m n) / lambda))

			a = log(K m n) / lambda
			a lambda = log (K m n)
			exp(a lambda)  = K m n 
	 but from above: a lambda = log (1/nlib sum{exp( -x[i]*lambda)})
	 so:            K m n = (1/n sum{ exp( -x[i] *lambda)})
			K = sum{}/(nlib m n )

*/

void
alloc_hist(struct llen_str *llen)
{
  int max_llen, i;
  max_llen = llen->max;

  if (llen->hist == NULL) {
    llen->hist = (int *)calloc((size_t)(max_llen+1),sizeof(int));
    llen->score_sums = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
    llen->score2_sums =(double *)calloc((size_t)(max_llen + 1),sizeof(double));
    llen->score_var = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
  }

  for (i=0; i< max_llen+1; i++) {
      llen->hist[i] = 0;
      llen->score_var[i] = llen->score_sums[i] = llen->score2_sums[i] = 0.0;
  }
}
  
void
free_hist(struct llen_str *llen)
{
  if (llen->hist!=NULL) {
    free(llen->score_var);
    free(llen->score2_sums);
    free(llen->score_sums);
    free(llen->hist);
    llen->hist=NULL;
  }
}

void
inithist(struct llen_str *llen, struct pstruct pst, int max_hscore)
{
  llen->max = MAX_LLEN;

  llen->max_score = -1;
  llen->min_score=10000;

  alloc_hist(llen);

  llen->zero_s = 0;
  llen->min_length = 10000;
  llen->max_length = 0;
}

void
addhist(struct llen_str *llen, int score, int length, int max_hscore)
{
  int llength; 
  double dscore;

  if ( score<=0 || length < LENGTH_CUTOFF) {
    llen->min_score = 0;
    llen->zero_s++;
    return ;
  }

  if (score < llen->min_score) llen->min_score = score;
  if (score > llen->max_score) llen->max_score = score;

  if (length > llen->max_length) llen->max_length = length;
  if (length < llen->min_length) llen->min_length = length;
  if (score > max_hscore) score = max_hscore;

  llength = (int)(LN_FACT*log((double)length)+0.5);

  if (llength < 0 ) llength = 0;
  if (llength > llen->max) llength = llen->max;
  llen->hist[llength]++;
  dscore = (double)score;
  llen->score_sums[llength] += dscore;
  llen->score2_sums[llength] += dscore * dscore;

  /*
  db->entries++;
  db->length += length;
  if (db->length > LONG_MAX) {db->carry++;db->length -= LONG_MAX;}
  */
}

/* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */


void
inithistz(int mh, struct hist_str *histp )
{
  int i;

  histp->min_hist = 20;
  histp->max_hist = 120;

  histp->histint = (int)
    ((double)(histp->max_hist - histp->min_hist + 2)/(double)mh+0.5);
  histp->maxh = (int)
    ((double)(histp->max_hist - histp->min_hist + 2)/(double)histp->histint+0.5);

  if (histp->hist_a==NULL) {
    if ((histp->hist_a=(int *)calloc((size_t)histp->maxh,sizeof(int)))==
	NULL) {
      fprintf(stderr," cannot allocate %d for histogram\n",histp->maxh);
      histp->histflg = 0;
    }
    else histp->histflg = 1;
  }
  else {
    for (i=0; i<histp->maxh; i++) histp->hist_a[i]=0;
  }
}

/* fasts/f will not show any histogram */
void
addhistz(double zs, struct hist_str *histp)
{
}

void
prune_hist(struct llen_str *llen, int score, int length, int max_hscore,
	   long *entries)
{
  int llength;
  double dscore;

  if (score <= 0 || length < LENGTH_CUTOFF) return;

  if (score > max_hscore) score = max_hscore;

  llength = (int)(LN_FACT*log((double)length)+0.5);

  if (llength < 0 ) llength = 0;
  if (llength > llen->max) llength = llen->max;
  llen->hist[llength]--;
  dscore = (double)score;
  llen->score_sums[llength] -= dscore;
  llen->score2_sums[llength] -= dscore * dscore;

  (*entries)--;
  /*
  if (length < db->length) db->length -= length;
  else {db->carry--; db->length += (LONG_MAX - (unsigned long)length);}
  */
}  

/* fit_llen: no trimming
   (1) regress scores vs log(n) using weighted variance
   (2) calculate mean variance after length regression
*/

void
fit_llen(struct llen_str *llen, struct rstat_str *pr)
{
  int j;
  int n;
  int n_size;
  double x, y2, u, z;
  double mean_x, mean_y, var_x, var_y, covar_xy;
  double mean_y2, covar_xy2, var_y2, dllj;

  double sum_x, sum_y, sum_x2, sum_xy, sum_v, delta, n_w;
  
/* now fit scores to best linear function of log(n), using
   simple linear regression */
  
  for (llen->min=0; llen->min < llen->max; llen->min++)
    if (llen->hist[llen->min]) break;
  llen->min--;

  for (n_size=0,j = llen->min; j < llen->max; j++) {
    if (llen->hist[j] > 1) {
      dllj = (double)llen->hist[j];
      llen->score_var[j] = llen->score2_sums[j]/dllj
	- (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
      llen->score_var[j] /= (double)(llen->hist[j]-1);
      if (llen->score_var[j] <= 0.1 ) llen->score_var[j] = 0.1;
      n_size++;
    }
  }

  pr->nb_tot = n_size;

  n_w = 0.0;
  sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
  for (j = llen->min; j < llen->max; j++)
    if (llen->hist[j] > 1) {
      x = j + 0.5;
      dllj = (double)llen->hist[j];
      n_w += dllj/llen->score_var[j];
      sum_x +=   dllj * x / llen->score_var[j] ;
      sum_y += llen->score_sums[j] / llen->score_var[j];
      sum_x2 +=  dllj * x * x /llen->score_var[j];
      sum_xy +=  x * llen->score_sums[j]/llen->score_var[j];
    }

  if (n_size < 5 ) {
    llen->fit_flag=0;
    pr->rho = 0;
    pr->mu = sum_y/n_w;
    return;
  }
  else {
    delta = n_w * sum_x2 - sum_x * sum_x;
    if (delta > 0.001) {
      pr->rho = (n_w * sum_xy  - sum_x * sum_y)/delta;
      pr->rho_e = n_w/delta;
      pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;
      pr->mu_e = sum_x2/delta;
    }
    else {
      llen->fit_flag = 0;
      pr->rho = 0;
      pr->mu = sum_y/n_w;
      return;
    }
  }

  delta = n_w * sum_x2 - sum_x * sum_x;
  pr->rho = (n_w * sum_xy  - sum_x * sum_y)/delta;
  pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;

  n = 0;
  mean_x = mean_y = mean_y2 = 0.0;
  var_x = var_y = 0.0;
  covar_xy = covar_xy2 = 0.0;

  for (j = llen->min; j <= llen->max; j++) 
   if (llen->hist[j] > 1 ) {
      n += llen->hist[j];
      x = (double)j + 0.5;
      mean_x += (double)llen->hist[j] * x;
      mean_y += llen->score_sums[j];
      var_x += (double)llen->hist[j] * x * x;
      var_y += llen->score2_sums[j];
      covar_xy += x * llen->score_sums[j];
    }
  mean_x /= n; mean_y /= n;
  var_x = var_x / n - mean_x * mean_x;
  var_y = var_y / n - mean_y * mean_y;
  
  covar_xy = covar_xy / n - mean_x * mean_y;
/*
  pr->rho = covar_xy / var_x;
  pr->mu = mean_y - pr->rho * mean_x;
*/
  mean_y2 = covar_xy2 = var_y2 = 0.0;
  for (j = llen->min; j <= llen->max; j++) 
    if (llen->hist[j] > 1) {
      x = (double)j + 0.5;
      u = pr->rho * x + pr->mu;
      y2 = llen->score2_sums[j] - 2.0 * llen->score_sums[j] * u + llen->hist[j] * u * u;
/*
      dllj = (double)llen->hist[j];
      fprintf(stderr,"%.2f\t%d\t%g\t%g\n",x/LN_FACT,llen->hist[j],
	      llen->score_sums[j]/dllj,y2/dllj);
*/
      mean_y2 += y2;
      var_y2 += y2 * y2;
      covar_xy2 += x * y2;
      /*      fprintf(stderr,"%6.1f %4d %8d %8d %7.2f %8.2f\n",
	      x,llen->hist[j],llen->score_sums[j],llen->score2_sums[j],u,y2); */
    }
  
  pr->mean_var = mean_y2 /= (double)n;
  covar_xy2 = covar_xy2 / (double)n - mean_x * mean_y2;

  if (pr->mean_var <= 0.01) {
    llen->fit_flag = 0;
    pr->mean_var = (pr->mu > 1.0) ? pr->mu: 1.0;
  }

  /*
  fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var %.4f\n",
	  pr->rho*LN_FACT,pr->mu,pr->mean_var);
  */
  if (n > 1) pr->var_e = (var_y2/n - mean_y2 * mean_y2)/(n-1);
  else pr->var_e = 0.0;

  if (llen->fit_flag) {
    pr->rho2 = covar_xy2 / var_x;
    pr->mu2 = pr->mean_var - pr->rho2 * mean_x;
  }
  else {
    pr->rho2 = 0;
    pr->mu2 = pr->mean_var;
  }

  if (pr->rho2 < 0.0 )
    z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
  else z =  pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
  if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;

  pr->var_cutoff = pr->rho2 * LN_FACT*log(z) + pr->mu2;
}

/* fit_llens: trim high variance bins
   (1) regress scores vs log(n) using weighted variance
   (2) regress residuals vs log(n)
   (3) remove high variance bins
   (4) calculate mean variance after length regression
*/

void
fit_llens(struct llen_str *llen, struct rstat_str *pr)
{
  int j;
  int n, n_u2;
  double x, y, y2, u, u2, v, z;
  double mean_x, mean_y, var_x, var_y, covar_xy;
  double mean_y2, covar_xy2;
  double mean_u2, mean_3u2, dllj;
  double sum_x, sum_y, sum_x2, sum_xy, sum_v, delta, n_w;

/* now fit scores to best linear function of log(n), using
   simple linear regression */
  
  for (llen->min=0; llen->min < llen->max; llen->min++)
    if (llen->hist[llen->min]) break;
  llen->min--;

  for (j = llen->min; j < llen->max; j++) {
    if (llen->hist[j] > 1) {
      dllj = (double)llen->hist[j];
      llen->score_var[j] = (double)llen->score2_sums[j]/dllj
	- (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
      llen->score_var[j] /= (double)(llen->hist[j]-1);
      if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
    }
  }
	  
  n_w = 0.0;
  sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
  for (j = llen->min; j < llen->max; j++)
    if (llen->hist[j] > 1) {
      x = j + 0.5;
      dllj = (double)llen->hist[j];
      n_w += dllj/llen->score_var[j];
      sum_x +=   dllj * x / llen->score_var[j] ;
      sum_y += llen->score_sums[j] / llen->score_var[j];
      sum_x2 +=  dllj * x * x /llen->score_var[j];
      sum_xy +=  x * llen->score_sums[j]/llen->score_var[j];
    }

  delta = n_w * sum_x2 - sum_x * sum_x;
  pr->rho = (n_w * sum_xy  - sum_x * sum_y)/delta;
  pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;

/* printf(" rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */

  n = 0;
  mean_x = mean_y = mean_y2 = 0.0;
  var_x = var_y = 0.0;
  covar_xy = covar_xy2 = 0.0;

  for (j = llen->min; j <= llen->max; j++) 
    if (llen->hist[j] > 1 ) {
      n += llen->hist[j];
      x = (double)j + 0.5;
      dllj = (double)llen->hist[j];
      mean_x += dllj * x;
      mean_y += llen->score_sums[j];
      var_x += dllj * x * x;
      var_y += llen->score2_sums[j];
      covar_xy += x * llen->score_sums[j];
    }
  mean_x /= n; mean_y /= n;
  var_x = var_x / n - mean_x * mean_x;
  var_y = var_y / n - mean_y * mean_y;
  
  covar_xy = covar_xy / n - mean_x * mean_y;
/*  pr->rho = covar_xy / var_x;
  pr->mu = mean_y - pr->rho * mean_x;
*/

  mean_y2 = covar_xy2 = 0.0;
  for (j = llen->min; j <= llen->max; j++) 
    if (llen->hist[j] > 1) {
      x = (double)j + 0.5;
      u = pr->rho * x + pr->mu;
      y2 = llen->score2_sums[j] - 2 * llen->score_sums[j] * u + llen->hist[j] * u * u;
      mean_y2 += y2;
      covar_xy2 += x * y2;
    }
  
  mean_y2 /= n;
  covar_xy2 = covar_xy2 / n - mean_x * mean_y2;
  pr->rho2 = covar_xy2 / var_x;
  pr->mu2 = mean_y2 - pr->rho2 * mean_x;

  if (pr->rho2 < 0.0 )
    z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
  else z =  pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
  if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;

  pr->var_cutoff = pr->rho2*LN_FACT*log(z) + pr->mu2;

/*  fprintf(stderr,"\nminimum allowed predicted variance (%0.2f) at n = %.0f\n",
	 pr->var_cutoff,z);
*/
  mean_u2 = 0.0;
  n_u2 = 0;
  for ( j = llen->min; j < llen->max; j++) {
    y = j+0.5;
    dllj = (double)llen->hist[j];
    x = pr->rho * y + pr->mu;
    v = pr->rho2 * y + pr->mu2;
    if (v < pr->var_cutoff) v = pr->var_cutoff;
    if (llen->hist[j]> 1) {
      u2 =  (llen->score2_sums[j] - 2 * x * llen->score_sums[j] + dllj * x * x) - v*dllj;
      mean_u2 += llen->score_var[j] = u2*u2/(llen->hist[j]-1);
      n_u2++;
      /*      fprintf(stderr," %d (%d) u2: %.2f v*ll: %.2f %.2f\n",
	      j,llen->hist[j],u2,v*dllj,sqrt(llen->score_var[j])); */
    }
    else llen->score_var[j] = -1.0;
  }

  mean_u2 = sqrt(mean_u2/(double)n_u2);
  /* fprintf(stderr," mean s.d.: %.2f\n",mean_u2); */

  mean_3u2 = mean_u2*3.0;

  for (j = llen->min; j < llen->max; j++) {
    if (llen->hist[j] <= 1) continue;
    if (sqrt(llen->score_var[j]) > mean_3u2) {
      /*      fprintf(stderr," removing %d %d %.2f\n",
	     j, (int)(exp((double)j/LN_FACT)-0.5),
	     sqrt(llen->score_var[j]));
	     */
      pr->nb_trimmed++;
      pr->n1_trimmed += llen->hist[j];
      llen->hist[j] = 0;
    }
  }
  fit_llen(llen, pr);
}


/* REG_STATS - Z() from rho/mu/mean_var */
double find_zr(int score, double escore, int length, double comp, 
	      struct rstat_str *rs)
{
  double log_len, z;
  
  if (score <= 0) return 0.0;
  if ( length < LENGTH_CUTOFF) return 0.0;

  log_len = LN_FACT*log((double)(length));
/*  var = rs->rho2 * log_len + rs->mu2;
  if (var < rs->var_cutoff) var = rs->var_cutoff;
*/

  z = ((double)score - rs->rho * log_len - rs->mu) / sqrt(rs->mean_var);

  return (50.0 + z*10.0);
}

double find_zt(int score, double escore, int length, double comp, 
	      struct rstat_str *rs)
{
  if (escore > 0.0) return -log(escore)/M_LN2;
  else return 744.440071/M_LN2;
}

double find_zn(int score, double escore, int length, double comp,
	      struct rstat_str *rs)
{
  double z;
  
  z = ((double)score - rs->mu) / sqrt(rs->mean_var);

  return (50.0 + z*10.0);
}

/* computes E value for a given z value, assuming extreme value distribution */
double
z_to_E(double zs, long entries, struct db_str db)
{
  double e, n;

  /*  if (db->entries < 5) return (double)db.entries; */
  if (entries < 1) { n = db.entries;}
  else {n = entries;}

  if (zs > ZS_MAX) return 0.0;

  e = exp(-PI_SQRT6 * zs - EULER_G);
  return n * (e > .01 ? 1.0 - exp(-e) : e);
}

double
zs_to_p(double zs)
{
  return zs;
}

/* this version assumes the probability is in the ->zscore variable,
   which is provided by this file after last_scale()
*/

double
zs_to_bit(double zs, int n0, int n1)
{
  return zs+log((double)(n0*n1))/M_LN2 ;
}

/* computes E-value for a given z value, assuming extreme value distribution */
double
zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db)
{
  double e, z, k;

  /*  if (db->entries < 5) return 0.0; */

  if (zs > ZS_MAX ) return 0.0;

  if (entries < 1) entries = db.entries;

  if (dnaseq == SEQT_DNA || dnaseq == SEQT_RNA) {
    k = (double)db.length /(double)n1;
    if (db.carry > 0) { k *= (double)db.carry * (double)LONG_MAX;}
  }
  else k = (double)entries;

  if (k < 1.0) k = 1.0;

  zs *= M_LN2;
  if ( zs > 100.0) e = 0.0;
  else e =  exp(-zs);
  return k * e;
}

/* computes E-value for a given z value, assuming extreme value distribution */
double
E_to_zs(double E, long entries)
{
  double e, z;
  int error;

  e = E/(double)entries;

#ifndef NORMAL_DIST
  z = (log(e)+EULER_G)/(-PI_SQRT6);
  return z*10.0+50.0;
#else
  z = np_to_z(1.0-e,&error);

  if (!error) return z*10.0+50.0;
  else return 0.0;
#endif
}

/* computes 1.0 - E value for a given z value, assuming extreme value
   distribution */
double
zs_to_Ec(double zs, long entries)
{
  double e, z;

  if (entries < 5) return 0.0;

  z = (zs - 50.0)/10.0;

  if (z > ZS_MAX) return 1.0;

  e =  exp(-PI_SQRT6 * z - EULER_G);
  return (double)entries * (e > .01 ?  exp(-e) : 1.0 - e);
}

void
vsort(v,s,n)
	double *v; int *s, n;
{
  int gap, i, j;
  double tmp;
  int itmp;
	
  for (gap=n/2; gap>0; gap/=2)
    for (i=gap; i<n; i++)
      for (j=i-gap; j>=0; j -= gap) {
	if (v[j] >= v[j+gap]) break;
	tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
	itmp = s[j]; s[j]=s[j+gap]; s[j+gap]=itmp;
      }
}

void
sort_escore(double *v, int n)
{
  int gap, i, j;
  double dtmp;
	
  for (gap=n/2; gap>0; gap/=2) {
    for (i=gap; i<n; i++) {
      for (j=i-gap; j>=0; j -= gap) {
	if (v[j] <= v[j+gap]) break;
	dtmp = v[j];
	v[j] = v[j+gap];
	v[j+gap] = dtmp;
      }
    }
  }
}

/* scale_tat - compute 'a', 'b', 'c' coefficients for scaling fasts/f
   escores 
   5-May-2003 - also calculate index for high ties
*/
void
scale_tat(double *escore, int nstats,
	  long db_entries, int do_trim,
	  struct rstat_str *rs)
{
  int i, j, k, start;
  double *x, *lnx, *lny;

  /*   sort_escore(escore, nstats); */

  while (*escore<0.0) {escore++; nstats--; }

  x = (double *) calloc(nstats, sizeof(double));
  if(x == NULL) {
    fprintf(stderr, "Couldn't calloc tatE/x\n");
    exit(1);
  }

  lnx = (double *) calloc(nstats,sizeof(double));
  if(lnx == NULL) {
    fprintf(stderr, "Couldn't calloc tatE/lnx\n");
    exit(1);
  }
  
  lny = (double *) calloc(nstats,sizeof(double));
  if(lny == NULL) {
    fprintf(stderr, "Couldn't calloc tatE/lny\n");
    exit(1);
  }
  
  for(i = 0 ; i < nstats ; ) {

    lny[i] = log(escore[i]);

    for(j = i+1 ; j < nstats ; j++) {
	if(escore[j] != escore[i]) break;
    }

    x[i] = ((((double)i + (double)(j - i - 1)/2.0)*(double)nstats/(double)db_entries)+1.0)/(double)nstats;
    lnx[i] = log(x[i]);

    for(k = i+1 ; k < j ; k++) {
      lny[k]=lny[i];
      x[k] = x[i];
      lnx[k]=lnx[i];
    }
    i = k;
  }

  if (!do_trim) {
    start = 0;
  } else {
    start = 0.05 * (double) nstats;
    start = start > 500 ? 500 : start;
  }

  linreg(lny, x, lnx, nstats, &rs->tat_a, &rs->tat_b, &rs->tat_c, start);

  /* I have the coefficients I need - a, b, c; free arrays */

  free(lny);
  free(lnx);
  free(x);

  /* calculate tie_j - the index below which all scores are considered
     positional ties */

  rs->tie_j = 0.005 * db_entries;
}

void
linreg(double *lny, double *x, double *lnx, int n,
       double *a, double *b, double *c, int start) {

  double yf1, yf2, yf3;
  double f1f1, f1f2, f1f3;
  double f2f2, f2f3;
  double f3f3, delta;

  int i;

  yf1 = yf2 = yf3 = 0.0;
  f1f1 = f1f2 = f1f3 = f2f2 = f2f3 = f3f3 = 0.0;

  for(i = start; i < n; i++) {
    yf1 += lny[i] * lnx[i];
    yf2 += lny[i] * x[i];
    yf3 += lny[i];

    f1f1 += lnx[i] * lnx[i];
    f1f2 += lnx[i] * x[i];
    f1f3 += lnx[i];

    f2f2 += x[i] * x[i];
    f2f3 += x[i];

    f3f3 += 1.0;
  }

  delta = det(f1f1, f1f2, f1f3, f1f2, f2f2, f2f3, f1f3, f2f3, f3f3);

  *a = det(yf1, f1f2, f1f3, yf2, f2f2, f2f3, yf3, f2f3, f3f3) / delta;
  *b = det(f1f1, yf1, f1f3, f1f2, yf2, f2f3, f1f3, yf3, f3f3) / delta;
  *c = det(f1f1, f1f2, yf1, f1f2, f2f2, yf2, f1f3, f2f3, yf3) / delta;

}

double det(double a11, double a12, double a13,
	   double a21, double a22, double a23,
	   double a31, double a32, double a33)
{
  double result;

  result = a11 * (a22 * a33 - a32 * a23);
  result -= a12 * (a21 * a33 - a31 * a23);
  result += a13 * (a21 * a32 - a31 * a22);
    
  return result;
}

void
last_stats(const unsigned char *aa0, int n0,
	   struct stat_str *sptr, int nstats,
	   struct beststr **bestp_arr, int nbest,
	   struct mngmsg m_msg, struct pstruct pst,
	   struct hist_str *histp, struct rstat_str **rs_sp)
{
  double *obs_escore;
  int i, nobs, nobs_t, do_trim;
  long db_entries;
  struct rstat_str *rs_s;

  if (*rs_sp == NULL) {
    if ((rs_s=(struct rstat_str *)calloc(1,sizeof(struct rstat_str)))==NULL) {
      fprintf(stderr," cannot allocate rs_s: %ld\n",sizeof(struct rstat_str));
      exit(1);
    }
    else *rs_sp = rs_s;
  }
  else rs_s = *rs_sp;
    
  histp->entries = 0;

  sortbeste(bestp_arr,nbest);

  rs_s->spacefactor = 
    calc_spacefactor(aa0, n0, m_msg.nm0,pst.nsq);

  if (pst.zsflag >= 1 && pst.zsflag <= 4) {
    if (m_msg.escore_flg) {
      nobs = nbest;
      do_trim = 1;
    }
    else {
      nobs = nstats;
      do_trim = 0;
    }

    if ((obs_escore = (double *)calloc(nobs,sizeof(double)))==NULL) {
      fprintf(stderr," cannot allocate obs_escore[%d]\n",nbest);
      exit(1);
    }

    if (m_msg.escore_flg) {
      for (i=nobs=0; i<nbest; i++) {
	if (bestp_arr[i]->escore<= 1.00)
	  obs_escore[nobs++]=bestp_arr[i]->escore;
      }
      /*
      nobs_t = nobs;
      for (i=0; i<nbest; i++) {
	if (bestp_arr[i]->escore >= 0.99 &&
	    bestp_arr[i]->escore <= 1.00)
	  obs_escore[nobs++]=bestp_arr[i]->escore;
      }
      */
      db_entries = m_msg.db.entries;
    }
    else {
      for (i=nobs=0; i<nstats; i++) {
	if (sptr[i].escore <= 1.00 ) obs_escore[nobs++]=sptr[i].escore;
      }
      /*
      nobs_t = nobs;
      for (i=0; i<nstats; i++) {
	if (sptr[i].escore >= 0.99 &&
	    sptr[i].escore <= 1.0) obs_escore[nobs++]=sptr[i].escore;
      }
      */
      db_entries = nobs;
/*    db_entries = m_msg.db.entries;*/
    }

    sortbesto(obs_escore,nobs);
    if (nobs > 100) {
      scale_tat(obs_escore,nobs,db_entries,do_trim,rs_s);
      rs_s->have_tat=1;
      sprintf(histp->stat_info,"scaled Tatusov statistics (%d): tat_a: %6.4f tat_b: %6.4f tat_c: %6.4f",
	      nobs,rs_s->tat_a, rs_s->tat_b, rs_s->tat_c);
    }
    else {
      rs_s->have_tat=0;
      sprintf(histp->stat_info,"Space_factor %.4g scaled statistics",
	    rs_s->spacefactor);
    }
    free(obs_escore);
  }
  else {
    rs_s->have_tat=0;
    histp->stat_info[0] = '\0';
  }
}

/* scale_scores() takes the best (real) scores and re-scales them;
   beststr bptr[] must be sorted */

void
scale_scores(struct beststr **bptr, int nbest, struct db_str db,
	     struct pstruct pst, struct rstat_str *rs)
{
  int i, j, k;
  double obs, r_a, r_b, r_c;

  /* this scale function absolutely requires that the results be sorted
     before it is used */

  sortbeste(bptr,nbest);

  if (!rs->have_tat) {
    for (i=0; i<nbest; i++) {
      bptr[i]->escore *= rs->spacefactor;
    }
  }
  else {

    /* here if more than 1000 scores */

    r_a = rs->tat_a; r_b = rs->tat_b; r_c = rs->tat_c;

  /* the problem with scaletat is that the E() value is related to
     ones position in the list of top scores - thus, knowing the score
     is not enough - one must know the rank */

    for(i = 0 ; i < nbest ; ) {
      /* take the bottom 0.5%, and the ties, and treat them all the same */
      j = i + 1;
      while (j< nbest && 
	     (j <= (0.005 * db.entries) || bptr[j]->escore == bptr[i]->escore)
	     ) {
	j++;
      }

      /* observed frequency */
      obs = ((double)i + ((double)(j - i - 1)/ 2.0) + 1.0)/(double)db.entries;

      /* make certain ties all have the same correction */
      for (k = i ; k < j ; k++) {
	bptr[k]->escore *= obs/exp(r_a*log(obs) + r_b*obs + r_c);
      }
      i = k;
    }
  }

  for (i=0; i<nbest; i++) {
    if(bptr[i]->escore > 0.01)
      bptr[i]->escore = 1.0 - exp(-bptr[i]->escore);
    if (bptr[i]->escore > 0.0)
      bptr[i]->zscore = -log(bptr[i]->escore)/M_LN2;
    else 
      bptr[i]->zscore = 744.440071/M_LN2;
    bptr[i]->escore *= pst.zdb_size;
  }
}

double scale_one_score (int ipos, double escore,
                        struct db_str db,
                        struct rstat_str *rs) {
  double obs;
  double a, b, c;

  if (!rs->have_tat)
    return escore * rs->spacefactor;

  if (ipos < rs->tie_j) ipos = rs->tie_j/2;

  a = rs->tat_a; b = rs->tat_b; c = rs->tat_c;

  obs = ((double)ipos + 1.0)/(double)db.entries;

  escore *= obs/exp(a*log(obs) + b*obs + c);

  return escore;
}

double calc_spacefactor(const unsigned char *aa0, int n0,
			int nm0, int nsq) {

#if !defined(FASTF)
  return pow(2.0, (double) nm0) - 1.0;
#else

  int i, j, n, l, nr, bin, k;
  int nmoff;
  int **counts;
  int **factors;
  double tmp, result = 0.0;

  nmoff = (n0 - nm0 + 1)/nm0+1;

  counts = (int **) calloc(nsq, sizeof(int *));
  if(counts == NULL) {
    fprintf(stderr, "couldn't calloc counts array!\n");
    exit(1);
  }

  counts[0] = (int *) calloc(nsq * (nmoff - 1), sizeof(int));
  if(counts[0] == NULL) {
    fprintf(stderr, "couldn't calloc counts array!\n");
    exit(1);
  }

  for(i = 0 ; i < nsq ; i++) {
    counts[i] = counts[0] + (i * (nmoff - 1));
  }

  for(i = 0 ; i < nm0 ; i++) {
    for(j = 0 ; j < (nmoff - 1) ; j++) {
      counts[ aa0[nmoff * i + j] ] [ j ] ++;
    }
  }

  factors = (int **) calloc(nm0 + 1, sizeof(int *));
  if(factors == NULL) {
    fprintf(stderr, "Couldn't calloc factors array!\n");
    exit(1);
  }

  factors[0] = (int *) calloc((nm0 + 1) * (nmoff - 1), sizeof(int));
  if(factors[0] == NULL) {
    fprintf(stderr, "Couldn't calloc factors array!\n");
    exit(1);
  }

  for(i = 0 ; i <= nm0 ; i++) {
    factors[i] = factors[0] + (i * (nmoff - 1));
  }

  /*
    this algorithm was adapted from the GAP4 library's NrArrangement function:
    The GAP Group, GAP --- Groups, Algorithms, and Programming,
    Version 4.1; Aachen, St Andrews, 1999.
    (http://www-gap.dcs.st-and.ac.uk/ gap)
  */

  /* calculate K factors for each column in query: */
  for(j = 0 ; j < (nmoff - 1) ; j++) {

    /* only one way to select 0 elements */
    factors[0][j] = 1;

    /* for each of the possible elements in this column */
    for(n = 0 ; n < nsq ; n++) {

      /* if there aren't any of these, skip it */
      if(counts[n][j] == 0) { continue; }

      /* loop over the possible lengths of the arrangement: K..0 */
      for(l = nm0 ; l >= 0 ; l--) {
	nr = 0;
	bin = 1;

	/*
	  compute the number of arrangements of length <l>
	  using only the first <n> elements of <mset>
	*/
	for(i = 0, k = min(counts[n][j], l); i <= k ; i++) {

	  /* 
	     add the number of arrangements of length <l>
	     that consist of <l>-<i> of the first <n>-1 elements
	     and <i> copies of the <n>th element
	  */
	  nr += bin * factors[l-i][j];
	  bin = (int) ((float) bin * (float) (l - i) / (float) (i + 1));
	}

	factors[l][j] = nr;
      }
    }
  }

  result = 0.0;
  for(i = 1 ; i <= nm0 ; i++) {
    tmp = 1.0;
    for(j = 0 ; j < (nmoff - 1) ; j++) {
      tmp *= (double) factors[i][j];
    }
    tmp /= factorial(i, 1);
    result += tmp;
  }
  
  free(counts[0]);
  free(counts);
  free(factors[0]);
  free(factors);

  return result;
#endif
}

void sortbesto (double *obs, int nobs)
{
  int gap, i, j, k;
  double v;
  int incs[16] = { 1391376, 463792, 198768, 86961, 33936,
		   13776, 4592, 1968, 861, 336, 
		   112, 48, 21, 7, 3, 1 };

  for ( k = 0; k < 16; k++)
    for (gap = incs[k], i=gap; i < nobs; i++) {
      v = obs[i];
      j = i;
      while ( j >= gap && obs[j-gap] > v) {
	obs[j] = obs[j - gap];
	j -= gap;
      }
      obs[j] = v;
    }
}