+++ /dev/null
-/* scaleswn.c */
-
-/* $Name: fa_34_26_5 $ - $Id: scaleswn.c,v 1.60 2007/04/26 18:32:48 wrp Exp $ */
-
-/* as of 24 Sept, 2000 - scaleswn uses no global variables */
-
-/*
- Provide statistical estimates using an extreme value distribution
-
- copyright (c) 1995, 1996, 2000 William R. Pearson
-
- This code provides multiple methods for scaling sequence
- similarity scores to correct for length effects.
-
- Currently, six methods are available:
-
- pst.zsflag = 0 - no scaling (AVE_STATS)
- pst.zsflag = 1 - regression-scaled scores (REG_STATS)
- pst.zsflag = 2 - (revised) MLE Lmabda/K scaled scores (MLE_STATS)
- pst.zsflag = 3 - scaling using Altschul's parameters (AG_STATS)
- pst.zsflag = 4 - regression-scaled with iterative outlier removal (REGI_STATS)
- pst.zsflag = 5 = like 1, but length scaled variance (REG2_STATS)
- pst.zsflag = 6 = like 2, but uses lambda composition/scale (MLE2_STATS)
- pst.zsflag = 11 = 10 + 1 - use random shuffles, method 1
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-
-#include <limits.h>
-
-#include "defs.h"
-#include "param.h"
-#include "structs.h"
-#ifndef PCOMPLIB
-#include "mw.h"
-#else
-#include "p_mw.h"
-#endif
-
-#define MAXHIST 50
-#define MAX_LLEN 200
-#define LHISTC 5
-#define VHISTC 5
-#define MAX_SSCORE 300
-
-#define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */
-
-#define LN_FACT 10.0
-#ifndef M_LN2
-#define M_LN2 0.69314718055994530942
-#endif
-#define EULER_G 0.57721566490153286060
-#define PI_SQRT6 1.28254983016186409554
-
-#ifndef M_SQRT2
-#define M_SQRT2 1.41421356237
-#endif
-#define LN200 5.2983173666
-#define ZS_MAX 400.0 /* used to prevent underflow on some machines */
-#define TOLERANCE 1.0e-12
-#define TINY 1.0e-6
-
-/* used by AVE_STATS, REG_STATS, REGI_STATS, REG2_STATS*/
-struct rstat_str {
- double rho, rho_e, mu, mu_e, mean_var, var_e; /* ?_e:std. error of ? */
-/* used by REG2_STATS */
- double rho2, mu2, var_cutoff;
- int n_trimmed; /* excluded because of high z-score */
- int n1_trimmed, nb_trimmed, nb_tot; /* excluded because of bin */
-};
-
-/* used by AG_STATS, MLE_STATS */
-struct ag_stat_str {
- double K, Lambda, H, a_n0f, a_n0;
-};
-
-/* used by MLE2_STATS */
-struct mle2_stat_str {
- double a_n0;
- double mle2_a0, mle2_a1, mle2_a2, mle2_b1;
- double ave_comp, max_comp, ave_H;
-};
-
-struct pstat_str {
- double ngLambda, ngK, ngH;
- union {
- struct rstat_str rg;
- struct ag_stat_str ag;
- struct mle2_stat_str m2;
- } r_u;
-};
-
-#define AVE_STATS 0 /* no length effect, only mean/variance */
-double find_zn(int score, double escore, int len, double comp, struct pstat_str *);
-
-int proc_hist_n(struct stat_str *sptr, int n,
- struct pstruct pst, struct hist_str *histp, int do_trim,
- struct pstat_str *);
-
-#define REG_STATS 1 /* length-regression scaled */
-#define REGI_STATS 4 /* length regression, iterative */
-double find_zr(int score, double escore, int len, double comp, struct pstat_str *);
-int proc_hist_r(struct stat_str *sptr, int n,
- struct pstruct pst, struct hist_str *histp,
- int do_trim, struct pstat_str *pu);
-
-#define MLE_STATS 2 /* MLE for lambda, K */
-double find_ze(int score, double escore, int len, double comp, struct pstat_str *);
-int proc_hist_ml(struct stat_str *sptr, int n,
- struct pstruct pst, struct hist_str *histp, int do_trim,
- struct pstat_str *);
-
-#define AG_STATS 3 /* Altschul-Gish parameters */
-double find_za(int score, double escore, int len, double comp, struct pstat_str *);
-int proc_hist_a(struct stat_str *sptr, int n,
- struct pstruct pst, struct hist_str *histp, int do_trim,
- struct pstat_str *);
-
-#define REG2_STATS 5 /* length regression on mean + variance */
-double find_zr2(int score, double escore, int len, double comp, struct pstat_str *);
-int proc_hist_r2(struct stat_str *sptr, int n,
- struct pstruct pst, struct hist_str *histp, int do_trim,
- struct pstat_str *);
-
-#define MLE2_STATS 6 /* MLE stats using comp(lambda) */
-double find_ze2(int score, double escore, int length, double comp, struct pstat_str *);
-int proc_hist_ml2(struct stat_str *sptr, int n,
- struct pstruct pst, struct hist_str *histp, int do_trim,
- struct pstat_str *);
-
-#ifdef USE_LNSTATS
-#define LN_STATS 2
-double find_zl(int score, double escore, int len, double comp, struct pstat_str *);
-int proc_hist_ln(struct stat_str *sptr, int n,
- struct pstruct pst, struct hist_str *histp, int do_trim,
- struct pstat_str *);
-#endif
-
-/* scaleswn.c local variables that belong in their own structure */
-
-double (*find_zp)(int score, double escore, int len, double comp, struct pstat_str *) = &find_zr;
-
-/* void s_sort (double **ptr, int nbest); */
-void ss_sort ( int *sptr, int n);
-
-struct llen_str {
- int min, max;
- int max_score, min_score;
- int *hist;
- double *score_sums, *score2_sums;
- double *score_var;
- int max_length, min_length, zero_s;
- int fit_flag;
-};
-
-static void inithist(struct llen_str *, struct pstruct, int);
-static void free_hist( struct llen_str *);
-static void addhist(struct llen_str *, int, int, int);
-static void prune_hist(struct llen_str *, int, int, int, long *);
-void inithistz(int, struct hist_str *histp);
-void addhistz(double zs, struct hist_str *histp);
-void addhistzp(double zs, struct hist_str *histp);
-
-static void fit_llen(struct llen_str *, struct rstat_str *);
-static void fit_llen2(struct llen_str *, struct rstat_str *);
-static void fit_llens(struct llen_str *, struct rstat_str *);
-
-extern void sortbeste(struct beststr **bptr, int nbest);
-
-/* void set_db_size(int, struct db_str *, struct hist_str *); */
-
-#ifdef DEBUG
-FILE *tmpf;
-#endif
-
-int
-process_hist(struct stat_str *sptr, int nstats,
- struct mngmsg m_msg,
- struct pstruct pst,
- struct hist_str *histp,
- struct pstat_str **ps_sp,
- int do_hist)
-{
- int zsflag, do_trim, i;
- struct pstat_str *ps_s;
-
- if (pst.zsflag < 0) {
- *ps_sp = NULL;
- return pst.zsflag;
- }
-
- if (*ps_sp == NULL) {
- if ((ps_s=(struct pstat_str *)calloc(1,sizeof(struct pstat_str)))==NULL) {
- fprintf(stderr," cannot allocate pstat_union: %ld\n",sizeof(struct pstat_str));
- exit(1);
- }
- else *ps_sp = ps_s;
- }
- else {
- ps_s = *ps_sp;
- memset(ps_s,0,sizeof(struct pstat_str));
- }
-
- ps_s->ngLambda = m_msg.Lambda;
- ps_s->ngK = m_msg.K;
- ps_s->ngH = m_msg.H;
-
- if (nstats < 10) pst.zsflag = AG_STATS;
-
- zsflag = pst.zsflag;
-
-/*
-#ifdef DEBUG
- if (pst.debug_lib) {
- tmpf=fopen("tmp_stats.res","w+");
- for (i=0; i<nstats; i++) fprintf(tmpf,"%d\t%d\n",sptr[i].score,sptr[i].n1);
- fclose(tmpf);
- }
-#endif
-*/
-
- if (zsflag >= 10) {
- zsflag -= 10;
- do_trim = 0;
- }
- else do_trim = 1;
-
-#ifdef USE_LNSCALE
- if (zsflag==LN_STATS) {
- find_zp = &find_zl;
- pst.zsflag = proc_hist_ln(sptr, nstats, histp, do_trim, ps_s);
- }
-#else
- if (zsflag==MLE_STATS) {
- find_zp = &find_ze;
- pst.zsflag = proc_hist_ml(sptr, nstats, pst, histp, do_trim, ps_s);
- }
-#endif
- else if (zsflag==REG_STATS) {
- find_zp = &find_zr;
- pst.zsflag = proc_hist_r(sptr, nstats,pst, histp, do_trim, ps_s);
- }
- else if (zsflag==AG_STATS) {
- find_zp = &find_za;
- pst.zsflag = proc_hist_a(sptr, nstats, pst, histp, do_trim, ps_s);
- }
- else if (zsflag==REGI_STATS) {
- find_zp = &find_zr;
- pst.zsflag = proc_hist_r2(sptr,nstats, pst, histp, do_trim, ps_s);
- }
- else if (zsflag==REG2_STATS) {
- find_zp = &find_zr2;
- pst.zsflag = proc_hist_r(sptr,nstats,pst, histp, do_trim, ps_s);
- }
-#if !defined(TFAST) && !defined(FASTX)
- else if (zsflag == MLE2_STATS) {
- find_zp = &find_ze2;
- pst.zsflag = proc_hist_ml2(sptr, nstats, pst, histp, do_trim, ps_s);
- }
-#endif
- else { /* AVE_STATS */
- find_zp = &find_zn;
- pst.zsflag = proc_hist_n(sptr,nstats, pst, histp, do_trim, ps_s);
- }
-
- if (!do_hist) {
- histp->entries = nstats; /* db->entries = 0; */
- inithistz(MAXHIST, histp);
- for (i = 0; i < nstats; i++) {
- if (sptr[i].n1 < 0) sptr[i].n1 = -sptr[i].n1;
- addhistz(find_zp(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp,ps_s),
- histp);
- }
- }
- return pst.zsflag;
-}
-
-int
-calc_thresh(struct pstruct pst, int nstats,
- double Lambda, double K, double H, double *zstrim)
-{
- int max_hscore;
- double ave_n1, tmp_score, z, l_fact;
-
- if (pst.dnaseq == SEQT_DNA || pst.dnaseq == SEQT_RNA) {
- ave_n1 = 5000.0;
- l_fact = 1.0;
- }
- else {
- ave_n1 = 400.0;
- l_fact = 0.7;
- }
-
-/* max_hscore = MAX_SSCORE; */
-/* mean expected for pst.n0 * 400 for protein, 5000 for DNA */
-/* we want a number of offsets that is appropriate for the database size so
- far (nstats)
-*/
-
-/*
- the calculation below sets a high-score threshold using an
- ungapped lambda, but errs towards the high-score side by using
- E()=0.001 and calculating with 0.70*lambda, which is the correct for
- going from ungapped to -12/-2 gapped lambda with BLOSUM50
-*/
-
-#ifndef NORMAL_DIST
- tmp_score = 0.01/((double)nstats*K*(double)pst.n0*ave_n1);
- tmp_score = -log(tmp_score)/(Lambda*l_fact);
- max_hscore = (int)(tmp_score+0.5);
-
- z = 1.0/(double)nstats;
- z = (log(z)+EULER_G)/(- PI_SQRT6);
-#else
- max_hscore = 100;
- z = 5.0;
-#endif
- *zstrim = 10.0*z+50.0;
- return max_hscore;
-}
-
-int
-proc_hist_r(struct stat_str *sptr, int nstats,
- struct pstruct pst, struct hist_str *histp,
- int do_trim, struct pstat_str *pu)
-{
- int i, max_hscore;
- double zs, ztrim;
- char s_string[128];
- struct llen_str llen;
- char *f_string;
- llen.fit_flag=1;
- llen.hist=NULL;
-
- max_hscore = calc_thresh(pst, nstats, pu->ngLambda,
- pu->ngK, pu->ngH, &ztrim);
-
- inithist(&llen,pst,max_hscore);
-
- f_string = &(histp->stat_info[0]);
-
- for (i = 0; i<nstats; i++)
- addhist(&llen,sptr[i].score,sptr[i].n1, max_hscore);
-
- if ((llen.max_score - llen.min_score) < 10) {
- free_hist(&llen);
- llen.fit_flag = 0;
- find_zp = &find_zn;
- return proc_hist_n(sptr, nstats, pst, histp, do_trim, pu);
- }
-
- fit_llen(&llen, &(pu->r_u.rg)); /* now we have rho, mu, rho2, mu2, mean_var
- to set the parameters for the histogram */
-
- if (!llen.fit_flag) { /* the fit failed, fall back to proc_hist_ml */
- free_hist(&llen);
- find_zp = &find_ze;
- return proc_hist_ml(sptr,nstats, pst, histp, do_trim, pu);
- }
-
- pu->r_u.rg.n_trimmed= pu->r_u.rg.n1_trimmed = pu->r_u.rg.nb_trimmed = 0;
-
- if (do_trim) {
- if (llen.fit_flag) {
- for (i = 0; i < nstats; i++) {
- zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, pu);
- if (zs < 20.0 || zs > ztrim) {
- pu->r_u.rg.n_trimmed++;
- prune_hist(&llen,sptr[i].score,sptr[i].n1, max_hscore,
- &(histp->entries));
- }
- }
- }
-
- /* fprintf(stderr,"Z-trimmed %d entries with z > 5.0\n", pu->r_u.rg.n_trimmed); */
-
- if (llen.fit_flag) fit_llens(&llen, &(pu->r_u.rg));
-
- /* fprintf(stderr,"Bin-trimmed %d entries in %d bins\n", pu->r_u.rg.n1_trimmed,pu->r_u.rg.nb_trimmed); */
- }
-
- free_hist(&llen);
-
- /* put all the scores in the histogram */
-
- if (pst.zsflag < 10) s_string[0]='\0';
- else if (pst.zs_win > 0)
- sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
- else strncpy(s_string,"(shuffled)",sizeof(s_string));
-
- if (pst.zsflag == REG2_STATS || pst.zsflag == 10+REG2_STATS)
- sprintf(f_string,"%s Expectation_v fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f;\n rho2=%6.2f; mu2= %6.2f, 0's: %d Z-trim: %d B-trim: %d in %d/%d",
- s_string, pu->r_u.rg.rho*LN_FACT,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e),
- pu->r_u.rg.rho2,pu->r_u.rg.mu2,llen.zero_s,
- pu->r_u.rg.n_trimmed, pu->r_u.rg.n1_trimmed, pu->r_u.rg.nb_trimmed, pu->r_u.rg.nb_tot);
- else
- sprintf(f_string,"%s Expectation_n fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f\n mean_var=%6.4f+/-%6.3f, 0's: %d Z-trim: %d B-trim: %d in %d/%d\n Lambda= %8.6f",
- s_string,
- pu->r_u.rg.rho*LN_FACT,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e), pu->r_u.rg.mean_var,sqrt(pu->r_u.rg.var_e),
- llen.zero_s, pu->r_u.rg.n_trimmed, pu->r_u.rg.n1_trimmed, pu->r_u.rg.nb_trimmed, pu->r_u.rg.nb_tot,
- PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
- return REG_STATS;
-}
-
-
-int
-proc_hist_r2(struct stat_str *sptr, int nstats,
- struct pstruct pst, struct hist_str *histp,
- int do_trim, struct pstat_str *pu)
-{
- int i, nit, nprune, max_hscore;
- double zs, ztrim;
- char s_string[128];
- char *f_string;
- struct llen_str llen;
-
- llen.fit_flag=1;
- llen.hist=NULL;
-
- max_hscore = calc_thresh(pst, nstats, pu->ngLambda,
- pu->ngK, pu->ngH, &ztrim);
-
- inithist(&llen, pst,max_hscore);
- f_string = &(histp->stat_info[0]);
-
- for (i = 0; i<nstats; i++)
- addhist(&llen,sptr[i].score,sptr[i].n1,max_hscore);
-
- pu->r_u.rg.n_trimmed= pu->r_u.rg.n1_trimmed = pu->r_u.rg.nb_trimmed = 0;
- if (do_trim) nit = 5;
- else nit = 0;
-
- while (nit-- > 0) {
- nprune = 0;
- fit_llen2(&llen, &(pu->r_u.rg));
-
- for (i = 0; i < nstats; i++) {
- if (sptr[i].n1 < 0) continue;
- zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp,pu);
- if (zs < 20.0 || zs > ztrim ) {
- nprune++;
- pu->r_u.rg.n_trimmed++;
- prune_hist(&llen,sptr[i].score,sptr[i].n1,max_hscore,
- &(histp->entries));
- sptr[i].n1 = -sptr[i].n1;
- }
- }
- /* fprintf(stderr," %d Z-trimmed at %d\n",nprune,nit); */
- if (nprune < LHISTC) { break; }
- }
-
- fit_llen(&llen, &(pu->r_u.rg));
-
- free_hist(&llen);
-
- if (pst.zsflag < 10) s_string[0]='\0';
- else if (pst.zs_win > 0)
- sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
- else strncpy(s_string,"(shuffled)",sizeof(s_string));
-
- sprintf(f_string,"%s Expectation_i fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f;\n mean_var=%6.4f+/-%6.3f 0's: %d Z-trim: %d N-it: %d\n Lambda= %8.6f",
- s_string,
- pu->r_u.rg.rho*LN_FACT,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e),
- pu->r_u.rg.mean_var,sqrt(pu->r_u.rg.var_e),llen.zero_s,pu->r_u.rg.n_trimmed, nit,
- PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
- return REGI_STATS;
-}
-
-/* this procedure implements Altschul's pre-calculated values for lambda, K */
-
-#include "alt_parms.h"
-
-int
-look_p(struct alt_p parm[], int gap, int ext,
- double *K, double *Lambda, double *H);
-
-int
-proc_hist_a(struct stat_str *sptr, int nstats,
- struct pstruct pst, struct hist_str *histp,
- int do_trim, struct pstat_str *pu)
-{
- double Lambda, K, H;
- char *f_string;
- int r_v;
- int t_gdelval, t_ggapval;
-
-#ifdef OLD_FASTA_GAP
- t_gdelval = pst.gdelval;
- t_ggapval = pst.ggapval;
-#else
- t_gdelval = pst.gdelval+pst.ggapval;
- t_ggapval = pst.ggapval;
-#endif
-
- f_string = &(histp->stat_info[0]);
-
- if (strcmp(pst.pamfile,"BL50")==0 || strcmp(pst.pamfile,"BLOSUM50")==0)
- r_v = look_p(bl50_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
- else if (strcmp(pst.pamfile,"BL62")==0 || strcmp(pst.pamfile,"BLOSUM62")==0)
- r_v = look_p(bl62_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
- else if (strcmp(pst.pamfile,"BL80")==0 || strcmp(pst.pamfile,"BLOSUM80")==0)
- r_v = look_p(bl80_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
- else if (strcmp(pst.pamfile,"P250")==0)
- r_v = look_p(p250_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
- else if (strcmp(pst.pamfile,"P120")==0)
- r_v = look_p(p120_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
- else if (strcmp(pst.pamfile,"MD_10")==0)
- r_v = look_p(md10_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
- else if (strcmp(pst.pamfile,"MD_20")==0)
- r_v = look_p(md20_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
- else if (strcmp(pst.pamfile,"MD_40")==0)
- r_v = look_p(md40_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
- else if (strcmp(pst.pamfile,"DNA")==0 || strcmp(pst.pamfile,"+5/-4")==0)
- r_v = look_p(nt54_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
- else if (strcmp(pst.pamfile,"+3/-2")==0)
- r_v = look_p(nt32_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
- else if (strcmp(pst.pamfile,"+1/-3")==0)
- r_v = look_p(nt13_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
- else r_v = 0;
-
- pu->r_u.ag.Lambda = Lambda;
- pu->r_u.ag.K = K;
- pu->r_u.ag.H = H;
-
- if (r_v == 0) {
- fprintf(stderr,"Parameters not available for: %s: %d/%d\n",
- pst.pamfile,t_gdelval,t_ggapval);
-
- find_zp = &find_zr;
- return proc_hist_r(sptr, nstats,pst, histp, do_trim, pu);
- }
-
- /*
- fprintf(stderr," the parameters are: Lambda: %5.3f K: %5.3f H: %5.3f\n",
- Lambda, K, H);
- */
-
- pu->r_u.ag.a_n0 = (double)pst.n0;
- pu->r_u.ag.a_n0f = log (K * pu->r_u.ag.a_n0)/H;
-
- sprintf(f_string,"Altschul/Gish params: n0: %d Lambda: %5.3f K: %5.3f H: %5.3f",
- pst.n0,Lambda, K, H);
- return AG_STATS;
-}
-
-int
-ag_parm(char *pamfile, int gdelval, int ggapval, struct pstat_str *pu)
-{
- double Lambda, K, H;
- int r_v;
-
- if (strcmp(pamfile,"BL50")==0)
- r_v = look_p(bl50_p,gdelval,ggapval,&K,&Lambda,&H);
- else if (strcmp(pamfile,"BL62")==0)
- r_v = look_p(bl62_p,gdelval,ggapval,&K,&Lambda,&H);
- else if (strcmp(pamfile,"P250")==0)
- r_v = look_p(p250_p,gdelval,ggapval,&K,&Lambda,&H);
- else if (strcmp(pamfile,"P120")==0)
- r_v = look_p(p120_p,gdelval,ggapval,&K,&Lambda,&H);
- else if (strcmp(pamfile,"MD_10")==0)
- r_v = look_p(md10_p,gdelval,ggapval,&K,&Lambda,&H);
- else if (strcmp(pamfile,"MD_20")==0)
- r_v = look_p(md20_p,gdelval,ggapval,&K,&Lambda,&H);
- else if (strcmp(pamfile,"MD_40")==0)
- r_v = look_p(md40_p,gdelval,ggapval,&K,&Lambda,&H);
- else if (strcmp(pamfile,"DNA")==0 || strcmp(pamfile,"+5/-4")==0)
- r_v = look_p(nt54_p,gdelval,ggapval, &K,&Lambda,&H);
- else if (strcmp(pamfile,"+3/-2")==0)
- r_v = look_p(nt32_p,gdelval,ggapval, &K,&Lambda,&H);
- else if (strcmp(pamfile,"+1/-3")==0)
- r_v = look_p(nt13_p,gdelval,ggapval, &K,&Lambda,&H);
- else r_v = 0;
-
- pu->r_u.ag.K = K;
- pu->r_u.ag.Lambda = Lambda;
- pu->r_u.ag.H = H;
-
- if (r_v == 0) {
- fprintf(stderr,"Parameters not available for: %s: %d/%d\n",
- pamfile,gdelval,ggapval);
- }
- return r_v;
-}
-
-int
-look_p(struct alt_p parm[], int gap, int ext,
- double *K, double *Lambda, double *H)
-{
- int i;
-
- gap = -gap;
- ext = -ext;
-
- if (gap > parm[1].gap) {
- *K = parm[0].K;
- *Lambda = parm[0].Lambda;
- *H = parm[0].H;
- return 1;
- }
-
- for (i=1; parm[i].gap > 0; i++) {
- if (parm[i].gap > gap) continue;
- else if (parm[i].gap == gap && parm[i].ext > ext ) continue;
- else if (parm[i].gap == gap && parm[i].ext == ext) {
- *K = parm[i].K;
- *Lambda = parm[i].Lambda;
- *H = parm[i].H;
- return 1;
- }
- else break;
- }
- return 0;
-}
-
-/* uncensored and censored maximum likelihood estimates developed
- by Aaron Mackey based on a preprint from Sean Eddy */
-
-int mle_cen (struct stat_str *, int, int, double, double *, double *);
-
-int
-proc_hist_ml(struct stat_str *sptr, int nstats,
- struct pstruct pst, struct hist_str *histp,
- int do_trim, struct pstat_str *pu)
-{
- double f_cen;
- char s_string[128];
- char *f_string;
-
- f_string = &(histp->stat_info[0]);
- pu->r_u.ag.a_n0 = (double)pst.n0;
-
- if (pst.zsflag < 10) s_string[0]='\0';
- else if (pst.zs_win > 0)
- sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
- else strncpy(s_string,"(shuffled)",sizeof(s_string));
-
- if (!do_trim) {
- if (mle_cen(sptr, nstats, pst.n0, 0.0, &pu->r_u.ag.Lambda, &pu->r_u.ag.K) == -1)
- goto bad_mle;
- sprintf(f_string,"%s MLE statistics: Lambda= %6.4f; K=%6.4g",
- s_string,pu->r_u.ag.Lambda,pu->r_u.ag.K);
- }
- else {
- if (nstats/20 > 1000) f_cen = 1000.0/(double)nstats;
- else f_cen = 0.05;
- if (mle_cen(sptr, nstats, pst.n0, f_cen, &pu->r_u.ag.Lambda, &pu->r_u.ag.K) == -1)
- goto bad_mle;
- sprintf(f_string,"MLE_cen statistics: Lambda= %6.4f; K=%6.4g (cen=%d)",
- pu->r_u.ag.Lambda,pu->r_u.ag.K,(int)((double)nstats*f_cen));
- }
-
- return MLE_STATS;
- bad_mle:
- find_zp = &find_zn;
-
- return proc_hist_n(sptr, nstats, pst, histp, do_trim, pu);
-}
-
-int
-mle_cen2 (struct stat_str *, int, int, double, double *, double *, double *, double *);
-
-
-int
-proc_hist_ml2(struct stat_str *sptr, int nstats,
- struct pstruct pst, struct hist_str *histp,
- int do_trim, struct pstat_str *pu)
-{
- int i, ns=0, nneg=0;
- double f_cen, ave_lambda;
- char s_string[128], ex_string[64];
- char *f_string;
-
- f_string = &(histp->stat_info[0]);
- pu->r_u.m2.a_n0 = (double)pst.n0;
-
- if (pst.zsflag < 10) s_string[0]='\0';
- else if (pst.zs_win > 0)
- sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
- else strncpy(s_string,"(shuffled)",sizeof(s_string));
-
- pu->r_u.m2.ave_comp = 0.0;
- pu->r_u.m2.max_comp = -1.0;
-
- ns = nneg = 0;
- for (i=0; i<nstats; i++) {
- if (sptr[i].comp > pu->r_u.m2.max_comp) pu->r_u.m2.max_comp = sptr[i].comp;
- if (sptr[i].comp > 0.0) {
- pu->r_u.m2.ave_comp += log(sptr[i].comp);
- ns++;
- }
- else nneg++;
- }
- pu->r_u.m2.ave_comp /= (double)ns;
- pu->r_u.m2.ave_comp = exp(pu->r_u.m2.ave_comp);
- for (i=0; i<nstats; i++) if (sptr[i].comp < 0.0) {
- sptr[i].comp = pu->r_u.m2.ave_comp;
- }
-
- if (nneg > 0)
- sprintf(ex_string,"composition = -1 for %d sequences",nneg);
- else ex_string[0]='\0';
-
- if (!do_trim) {
- if (mle_cen2(sptr, nstats, pst.n0, 0.0,
- &pu->r_u.m2.mle2_a0, &pu->r_u.m2.mle2_a1,
- &pu->r_u.m2.mle2_a2, &pu->r_u.m2.mle2_b1) == -1) goto bad_mle2;
- ave_lambda = 1.0/(pu->r_u.m2.ave_comp*pu->r_u.m2.mle2_b1);
-
- sprintf(f_string,"%s MLE-2 statistics: a0= %6.4f; a1=%6.4f; a2=%6.4f; b1=%6.4f\n ave Lamdba: %6.4f",
- s_string, pu->r_u.m2.mle2_a0, pu->r_u.m2.mle2_a1, pu->r_u.m2.mle2_a2, pu->r_u.m2.mle2_b1,ave_lambda);
- }
- else {
- if (nstats/20 > 500) f_cen = 500.0/(double)nstats;
- else f_cen = 0.05;
- if (mle_cen2(sptr, nstats, pst.n0, f_cen, &pu->r_u.m2.mle2_a0, &pu->r_u.m2.mle2_a1, &pu->r_u.m2.mle2_a2, &pu->r_u.m2.mle2_b1)== -1) goto bad_mle2;
-
- ave_lambda = 1.0/(pu->r_u.m2.ave_comp*pu->r_u.m2.mle2_b1);
-
- sprintf(f_string,"%s MLE-2-cen statistics: a0= %6.4f; a1=%6.4f; a2=%6.4f; b1=%6.4f (cen=%d)\n ave Lambda:%6.4f",
- s_string, pu->r_u.m2.mle2_a0, pu->r_u.m2.mle2_a1, pu->r_u.m2.mle2_a2, pu->r_u.m2.mle2_b1, (int)((double)nstats*f_cen),ave_lambda);
- }
-
- return MLE2_STATS;
- bad_mle2:
- find_zp = &find_zn;
- return proc_hist_n(sptr, nstats, pst, histp, do_trim, pu);
-}
-
-double first_deriv_cen(double lambda, struct stat_str *sptr,
- int start, int stop,
- double sumlenL, double cenL,
- double sumlenH, double cenH);
-
-double second_deriv_cen(double lambda, struct stat_str *sptr,
- int start, int stop,
- double sumlenL, double cenL,
- double sumlenH, double cenH);
-
-static void
-st_sort (struct stat_str *v, int n) {
- int gap, i, j;
- int tmp;
-
- for (gap = 1; gap < n/3; gap = 3*gap +1) ;
-
- for (; gap > 0; gap = (gap-1)/3)
- for (i = gap; i < n; i++)
- for (j = i - gap; j >= 0; j -= gap) {
- if (v[j].score <= v[j + gap].score) break;
-
- tmp = v[j].score;
- v[j].score = v[j + gap].score;
- v[j + gap].score = tmp;
-
- tmp = v[j].n1;
- v[j].n1 = v[j + gap].n1;
- v[j + gap].n1 = tmp;
- }
-}
-
-/* sptr[].score, sptr[].n1; sptr[] must be sorted
- int n = total number of samples
- int M = length of query
- double fn = fraction of scores to be censored fn/2.0 from top, bottom
- double *Lambda = Lambda estimate
- double *K = K estimate
-*/
-
-#define MAX_NIT 100
-
-int
-mle_cen(struct stat_str *sptr, int n, int M, double fc,
- double *Lambda, double *K) {
-
- double sumlenL, sumlenH, cenL, cenH;
- double sum_s, sum2_s, mean_s, var_s, dtmp;
- int start, stop;
- int i, nf;
- int nit = 0;
- double deriv, deriv2, lambda, old_lambda, sum = 0.0;
- /*
- int sumlenL, int sumlenghtsR = sum of low (Left), right (High) seqs.
- int cenL, cenH = censoring score low, high
- */
-
- nf = (fc/2.0) * n;
- start = nf;
- stop = n - nf;
-
- st_sort(sptr,n);
-
- sum_s = sum2_s = 0.0;
- for (i=start; i<stop; i++) {
- sum_s += sptr[i].score;
- }
- dtmp = (double)(stop-start);
- mean_s = sum_s/dtmp;
-
- for (i=start; i<stop; i++) {
- sum2_s += sptr[i].score * sptr[i].score;
- }
- var_s = sum2_s/(dtmp-1.0);
-
- sumlenL = sumlenH = 0.0;
- for (i=0; i<start; i++) sumlenL += (double)sptr[i].n1;
- for (i=stop; i<n; i++) sumlenH += (double)sptr[i].n1;
-
- if (nf > 0) {
- cenL = (double)sptr[start].score;
- cenH = (double)sptr[stop].score;
- }
- else {
- cenL = (double)sptr[start].score/2.0;
- cenH = (double)sptr[start].score*2.0;
- }
-
- if (cenL >= cenH) return -1;
-
- /* initial guess for lambda is 0.2 - this does not work for matrices
- with very different scales */
- /* lambda = 0.2; */
- lambda = PI_SQRT6/sqrt(var_s);
- if (lambda > 1.0) {
- fprintf(stderr," Lambda initial estimate error: lambda: %6.4g; var_s: %6.4g\n",lambda,var_s);
- lambda = 0.2;
- }
-
- do {
- deriv = first_deriv_cen(lambda, sptr, start, stop,
- sumlenL, cenL, sumlenH, cenH);
- /* (uncensored version)
- first_deriv(lambda, &sptr[start], stop - start))
- */
-
- /* (uncensored version)
- deriv2 = second_deriv(lambda, &sptr[start], stop-start);
- */
- deriv2 = second_deriv_cen(lambda, sptr, start, stop,
- sumlenL, cenL, sumlenH, cenH);
-
- old_lambda = lambda;
- if (lambda - deriv/deriv2 > 0.0) lambda = lambda - deriv/deriv2;
- else lambda = lambda/2.0;
- nit++;
- } while (fabs((lambda - old_lambda)/lambda) > TINY && nit < MAX_NIT);
-
- /* fprintf(stderr," mle_cen nit: %d\n",nit); */
-
- if (nit >= MAX_NIT) return -1;
-
- for(i = start; i < stop ; i++) {
- sum += (double) sptr[i].n1 * exp(- lambda * (double)sptr[i].score);
- }
-
- *Lambda = lambda;
- /*
- *K = (double)(stop-start)/((double)M*sum);
- */
- *K = (double)n/((double)M*
- (sum+sumlenL*exp(-lambda*cenL)-sumlenH*exp(-lambda*cenH)));
- return 0;
-}
-
-/*
-double
-first_deriv(double lambda, struct stat_str *sptr, int n) {
-
- int i;
- double sum = 0.0, sum1 = 0.0, sum2 = 0.0;
- double s, l, es;
-
- for(i = 0 ; i < n ; i++) {
- s = (double)sptr[i].score;
- l = (double)sptr[i].n1;
- es = exp(-lambda * s );
- sum += s;
- sum2 += l * es;
- sum1 += s * l * es;
- }
-
- return (1.0/lambda) - (sum/(double)n) + (sum1/sum2);
-}
-*/
-
-/*
-double
-second_deriv(double lambda, struct stat_str *sptr, int n) {
- double sum1 = 0.0, sum2 = 0.0, sum3 = 0.0;
- double s, l, es;
- int i;
-
- for(i = 0 ; i < n ; i++) {
- l = (double)sptr[i].n1;
- s = (double)sptr[i].score;
- es = exp(-lambda * s);
- sum2 += l * es;
- sum1 += l * s * es;
- sum3 += l * s * s * es;
- }
-
- return ((sum1*sum1)/(sum2*sum2)) - (sum3/sum2) - (1.0/(lambda*lambda));
-}
-*/
-
-double
-first_deriv_cen(double lambda, struct stat_str *sptr, int start, int stop,
- double sumlenL, double cenL, double sumlenH, double cenH) {
- int i;
- double sum = 0.0, sum1 = 0.0, sum2 = 0.0;
- double s, l, es;
-
- for(i = start ; i < stop ; i++) {
- s = (double)sptr[i].score;
- l = (double)sptr[i].n1;
- es = exp(-lambda * s );
- sum += s;
- sum2 += l * es;
- sum1 += s * l * es;
- }
-
- sum1 += sumlenL*cenL*exp(-lambda*cenL) - sumlenH*cenH*exp(-lambda*cenH);
- sum2 += sumlenL*exp(-lambda*cenL) - sumlenH*exp(-lambda*cenH);
-
- return (1.0 / lambda) - (sum /(double)(stop-start)) + (sum1 / sum2);
-}
-
-double
-second_deriv_cen(double lambda, struct stat_str *sptr, int start, int stop,
- double sumlenL, double cenL, double sumlenH, double cenH) {
-
- double sum1 = 0.0, sum2 = 0.0, sum3 = 0.0;
- double s, l, es;
- int i;
-
- for(i = start ; i < stop ; i++) {
- s = (double)sptr[i].score;
- l = (double)sptr[i].n1;
- es = exp(-lambda * s);
- sum2 += l * es;
- sum1 += l * s * es;
- sum3 += l * s * s * es;
- }
-
- sum1 += sumlenL*cenL*exp(-lambda*cenL) - sumlenH*cenH*exp(-lambda*cenH);
- sum2 += sumlenL*exp(-lambda * cenL) - sumlenH*exp(-lambda * cenH);
- sum3 += sumlenL*cenL*cenL * exp(-lambda * cenL) -
- sumlenH*cenH*cenH * exp(-lambda * cenH);
- return ((sum1 * sum1) / (sum2 * sum2)) - (sum3 / sum2)
- - (1.0 / (lambda * lambda));
-}
-
-double mle2_func(double *params,
- double *consts,
- struct stat_str *values,
- int n, int start, int stop);
-
-void simplex(double *fitparams,
- double *lambda,
- int nparam,
- double (*minfunc) (double *tryparams, double *consts,
- struct stat_str *data, int ndata,
- int start, int stop),
- double *consts,
- void *data,
- int ndata, int start, int stop
- );
-
-int
-mle_cen2(struct stat_str *sptr, int n, int M, double fc,
- double *a0, double *a1, double *a2, double *b1) {
-
- double params[4], lambdas[4], consts[9];
- double avglenL, avglenH, avgcompL, avgcompH, cenL, cenH;
- int start, stop;
- int i, nf;
-
- nf = (fc/2.0) * n;
- start = nf;
- stop = n - nf;
-
- st_sort(sptr,n);
-
- /* choose arithmetic or geometic mean for compositions by appropriate commenting */
-
- if (nf > 0) {
- avglenL = avglenH = 0.0;
- avgcompL = avgcompH = 0.0;
- /* avgcompL = avgcompH = 1.0 */
- for (i=0; i<start; i++) {
- avglenL += (double)sptr[i].n1;
- avgcompL += (double)sptr[i].comp;
- /* avgcompL *= (double) sptr[i].comp; */
- }
- avglenL /= (double) start;
- avgcompL /= (double) start;
- /* avgcompL = pow(avgcompL, 1.0/(double) start); */
-
- for (i=stop; i<n; i++) {
- avglenH += (double)sptr[i].n1;
- avgcompH += (double)sptr[i].comp;
- /* avgcompH *= (double) sptr[i].comp; */
- }
- avglenH /= (double) (n - stop);
- avgcompH /= (double) (n - stop);
- /* avgcompL = pow(avgcompL, 1.0/(double) (n - stop)); */
-
- cenL = (double)sptr[start].score;
- cenH = (double)sptr[stop].score;
- if (cenL >= cenH) return -1;
- }
- else {
- avglenL = avglenH = cenL = cenH = 0.0;
- avgcompL = avgcompH = 1.0;
- }
-
- params[0] = 10.0;
- params[1] = -10.0;
- params[2] = 1.0;
- params[3] = 1.0;
-
- lambdas[0] = 1.0;
- lambdas[1] = 0.5;
- lambdas[2] = 0.1;
- lambdas[3] = 0.01;
-
- consts[0] = M;
- consts[1] = (double) start;
- consts[2] = (double) stop;
- consts[3] = cenL;
- consts[4] = cenH;
- consts[5] = avglenL;
- consts[6] = avglenH;
- consts[7] = avgcompL;
- consts[8] = avgcompH;
-
- simplex(params, lambdas, 4,
- (double (*) (double *, double *, struct stat_str *, int, int, int) )mle2_func,
- consts, sptr, n, start, stop);
-
- *a0 = params[0];
- *a1 = params[1];
- *a2 = params[2];
- *b1 = params[3];
-
- return 0;
-}
-
-double mle2_func(double *params,
- double *consts,
- struct stat_str *values,
- int n, int start, int stop
- ) {
-
- double a0, a1, a2, b1, M;
- double score, length, comp;
- double cenL, cenH, avglenL, avglenH, avgcompL, avgcompH;
- double L, y;
-
- int i;
-
- a0 = params[0];
- a1 = params[1];
- a2 = params[2];
- b1 = params[3];
-
- M = consts[0];
- /*
- start = (int) consts[1];
- stop = (int) consts[2];
- */
- cenL = consts[3];
- cenH = consts[4];
- avglenL = consts[5];
- avglenH = consts[6];
- avgcompL = consts[7];
- avgcompH = consts[8];
-
- L = 0;
- y = 0;
-
- if (start > 0) {
- y = -(cenL - (a0 + a1*avgcompL +a2*avgcompL*log(M*avglenL)))/(b1*avgcompL);
- L += (double) start * exp(y);
- }
-
- for(i = start ; i < stop ; i++) {
- score = (double) values[i].score;
- length = (double) values[i].n1;
- comp = (double) values[i].comp;
-
- y = - (score - (a0 + a1*comp + a2 * comp * log(M*length))) / (b1*comp);
-
- L += -y + exp(y) + log(b1 * comp);
- }
-
- if (stop < n) {
- y = -(cenH -(a0 + a1*avgcompH + a2*avgcompH*log(M*avglenH)))/(b1*avgcompH);
- L -= (double) (n - stop) * exp(y);
- }
- return L;
-}
-
-/* Begin Nelder-Mead simplex code: */
-
-double evalfunc(double **param,
- double *vals,
- double *psums,
- double *ptry,
- int nparam,
- double (*minfunc) (double *params, double *consts,
- struct stat_str *data, int ndata,
- int start, int stop),
- double *consts,
- void *data,
- int ndata, int start, int stop,
- int ihi,
- double factor);
-
-void simplex(double *fitparams,
- double *lambda,
- int nparam,
- double (*minfunc) (double *tryparams, double *consts,
- struct stat_str *data, int ndata,
- int start, int stop),
- double *consts,
- void *data,
- int ndata,
- int start,
- int stop
- )
-{
-
- int i, j, ilo, ihi, inhi;
- double rtol, sum, tmp, ysave, ytry;
- double *psum, *vals, *ptry, **param;
-
-
- psum = (double *) calloc(nparam, sizeof(double));
- ptry = (double *) calloc(nparam, sizeof(double));
-
- vals = (double *) calloc(nparam + 1, sizeof(double));
-
- param = (double **) calloc(nparam + 1, sizeof(double *));
- param[0] = (double *) calloc((nparam + 1) * nparam, sizeof(double));
- for( i = 1 ; i < (nparam + 1) ; i++ ) {
- param[i] = param[0] + i * nparam;
- }
-
- /* Get our N+1 initial parameter values for the simplex */
-
- for( i = 0 ; i < nparam ; i++ ) {
- param[0][i] = fitparams[i];
- }
-
- for( i = 1 ; i < (nparam + 1) ; i++ ) {
- for( j = 0 ; j < nparam ; j++ ) {
- param[i][j] = fitparams[j] + lambda[j] * ( (i - 1) == j ? 1 : 0 );
- }
- }
-
- /* calculate initial values at the simplex nodes */
-
- for( i = 0 ; i < (nparam + 1) ; i++ ) {
- vals[i] = minfunc(param[i], consts, data, ndata, start, stop);
- }
-
- /* Begin Nelder-Mead simplex algorithm from Numerical Recipes in C */
-
- for( j = 0 ; j < nparam ; j++ ) {
- for( sum = 0.0, i = 0 ; i < nparam + 1 ; i++ ) {
- sum += param[i][j];
- }
- psum[j] = sum;
- }
-
-
- while( 1 ) {
-/*
- determine which point is highest (ihi), next highest (inhi) and
- lowest (ilo) by looping over the points in the simplex
-*/
- ilo = 0;
-
-/* ihi = vals[0] > vals[1] ? (inhi = 1, 0) : (inhi = 0, 1); */
- if(vals[0] > vals[1]) { ihi = 0; inhi = 1; }
- else { ihi = 1; inhi = 0; }
-
- for( i = 0 ; i < nparam + 1 ; i++) {
- if( vals[i] <= vals[ilo] ) ilo = i;
- if( vals[i] > vals[ihi] ) {
- inhi = ihi;
- ihi = i;
- } else if ( vals[i] > vals[inhi] && i != ihi ) inhi = i;
- }
-
- /* Are we finished? */
-
- rtol = 2.0 * fabs(vals[ihi] - vals[ilo]) /
- (fabs(vals[ihi]) + fabs(vals[ilo]) + TINY);
-
- if( rtol < TOLERANCE ) {
-
-/* put the best value and best parameters into the first index */
-
- tmp = vals[0];
- vals[0] = vals[ilo];
- vals[ilo] = tmp;
-
- for( i = 0 ; i < nparam ; i++ ) {
- tmp = param[0][i];
- param[0][i] = param[ilo][i];
- param[ilo][i] = tmp;
- }
-
- /* et voila, c'est finis */
- break;
- }
-
- /* Begin a new iteration */
-
- /* first, extrapolate by -1 through the face of the simplex across from ihi */
-
- ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
- data, ndata, start, stop, ihi, -1.0);
-
- if( ytry <= vals[ilo] ) {
-
- /* Good result, try additional extrapolation by 2 */
-
- ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
- data, ndata, start, stop, ihi, 2.0);
-
- } else if ( ytry >= vals[inhi] ) {
-
- /* no good, look for an intermediate lower point by contracting */
-
- ysave = vals[ihi];
- ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
- data, ndata, start, stop, ihi, 0.5);
-
- if( ytry >= ysave ) {
-
- /* Still no good. Contract around lowest (best) point. */
-
- for( i = 0 ; i < nparam + 1 ; i++ ) {
- if( i != ilo ) {
- for ( j = 0 ; j < nparam ; j++ ) {
- param[i][j] = psum[j] = 0.5 * (param[i][j] + param[ilo][j]);
- }
- vals[i] = minfunc(psum, consts, data, ndata, start, stop);
- }
- }
-
-
- for( j = 0 ; j < nparam ; j++ ) {
- for( sum = 0.0, i = 0 ; i < nparam + 1 ; i++ ) {
- sum += param[i][j];
- }
- psum[j] = sum;
- }
-
- }
- }
- }
-
- for( i = 0 ; i < nparam ; i++ ) {
- fitparams[i] = param[0][i];
- }
-
- if (ptry!=NULL) {
- free(ptry);
- ptry=NULL;
- }
- free(param[0]);
- free(param);
- free(vals);
- free(psum);
-}
-
-
-double evalfunc(double **param,
- double *vals,
- double *psum,
- double *ptry,
- int nparam,
- double (*minfunc)(double *tryparam, double *consts,
- struct stat_str *data, int ndata,
- int start, int stop),
- double *consts,
- void *data,
- int ndata, int start, int stop,
- int ihi,
- double factor) {
-
- int j;
- double fac1, fac2, ytry;
-
-
- fac1 = (1.0 - factor) / nparam;
- fac2 = fac1 - factor;
-
- for( j = 0 ; j < nparam ; j++ ) {
- ptry[j] = psum[j] * fac1 - param[ihi][j] * fac2;
- }
-
- ytry = minfunc(ptry, consts, data, ndata, start, stop);
-
- if( ytry < vals[ihi] ) {
- vals[ihi] = ytry;
- for( j = 0 ; j < nparam ; j++ ) {
- psum[j] += ptry[j] - param[ihi][j];
- param[ihi][j] = ptry[j];
- }
- }
-
- return ytry;
-}
-
-/* end of Nelder-Mead simplex code */
-
-int
-proc_hist_n(struct stat_str *sptr, int nstats,
- struct pstruct pst, struct hist_str *histp,
- int do_trim, struct pstat_str *pu)
-{
- int i, j;
- double s_score, s2_score, ssd, ztrim;
- int nit, max_hscore;
- char s_string[128];
- char *f_string;
-
- f_string = &(histp->stat_info[0]);
-
- max_hscore = calc_thresh(pst, nstats, pu->ngLambda,
- pu->ngK, pu->ngH, &ztrim);
-
- s_score = s2_score = 0.0;
-
- for ( j = 0, i = 0; i < nstats; i++) {
- if (sptr[i].score > 0 && sptr[i].score <= max_hscore) {
- s_score += (ssd=(double)sptr[i].score);
- s2_score += ssd * ssd;
- j++;
- }
- }
-
- if (j > 1 ) {
- pu->r_u.rg.mu = s_score/(double)j;
- pu->r_u.rg.mean_var = s2_score - (double)j * pu->r_u.rg.mu * pu->r_u.rg.mu;
- pu->r_u.rg.mean_var /= (double)(j-1);
- }
- else {
- pu->r_u.rg.mu = 50.0;
- pu->r_u.rg.mean_var = 10.0;
- }
-
- if (pu->r_u.rg.mean_var < 0.01) {
- pu->r_u.rg.mean_var = (pu->r_u.rg.mu > 1.0) ? pu->r_u.rg.mu: 1.0;
- }
-
- /* now remove some scores */
-
- nit = 5;
- while (nit-- > 0) {
- pu->r_u.rg.n_trimmed = 0;
-
- for (i=0; i< nstats; i++) {
- if (sptr[i].n1 < 0) continue;
- ssd = find_zn(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, pu);
- if (ssd > ztrim || ssd < 20.0) {
- /* fprintf(stderr,"removing %3d %3d %4.1f\n",
- sptr[i].score, sptr[i].n1,ssd); */
- ssd = sptr[i].score;
- s_score -= ssd;
- s2_score -= ssd*ssd;
- j--;
- pu->r_u.rg.n_trimmed++;
- histp->entries--;
- sptr[i].n1 = -sptr[i].n1;
- }
- }
-
- if (j > 1 ) {
- pu->r_u.rg.mu = s_score/(double)j;
- pu->r_u.rg.mean_var = s2_score - (double)j * pu->r_u.rg.mu * pu->r_u.rg.mu;
- pu->r_u.rg.mean_var /= (double)(j-1);
- }
- else {
- pu->r_u.rg.mu = 50.0;
- pu->r_u.rg.mean_var = 10.0;
- }
-
- if (pu->r_u.rg.mean_var < 0.01) {
- pu->r_u.rg.mean_var = (pu->r_u.rg.mu > 1.0) ? pu->r_u.rg.mu: 1.0;
- }
-
- if (pu->r_u.rg.n_trimmed < LHISTC) {
- /*
- fprintf(stderr,"nprune %d at %d\n",nprune,nit);
- */
- break;
- }
- }
-
- if (pst.zsflag < 10) s_string[0]='\0';
- else if (pst.zs_win > 0)
- sprintf(s_string,"(shuffled, win: %d)",pst.zs_win);
- else strncpy(s_string,"(shuffled)",sizeof(s_string));
-
- sprintf(f_string,"%s unscaled statistics: mu= %6.4f var=%6.4f; Lambda= %6.4f",
- s_string, pu->r_u.rg.mu,pu->r_u.rg.mean_var,PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
- return AVE_STATS;
-}
-
-/*
-This routine calculates the maximum likelihood estimates for the
-extreme value distribution exp(-exp(-(-x-a)/b)) using the formula
-
- <lambda> = x_m - sum{ x[i] * exp (-x[i]<lambda>)}/sum{exp (-x[i]<lambda>)}
- <a> = -<1/lambda> log ( (1/nlib) sum { exp(-x[i]/<lambda> } )
-
- The <a> parameter can be transformed into and K
- of the formula: 1 - exp ( - K m n exp ( - lambda S ))
- using the transformation: 1 - exp ( -exp -(lambda S + log(K m n) ))
- 1 - exp ( -exp( - lambda ( S + log(K m n) / lambda))
-
- a = log(K m n) / lambda
- a lambda = log (K m n)
- exp(a lambda) = K m n
- but from above: a lambda = log (1/nlib sum{exp( -x[i]*lambda)})
- so: K m n = (1/n sum{ exp( -x[i] *lambda)})
- K = sum{}/(nlib m n )
-
-*/
-
-void
-alloc_hist(struct llen_str *llen)
-{
- int max_llen, i;
- max_llen = llen->max;
-
- if (llen->hist == NULL) {
- llen->hist = (int *)calloc((size_t)(max_llen+1),sizeof(int));
- llen->score_sums = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
- llen->score2_sums =(double *)calloc((size_t)(max_llen + 1),sizeof(double));
- llen->score_var = (double *)calloc((size_t)(max_llen + 1),sizeof(double));
- }
-
- for (i=0; i< max_llen+1; i++) {
- llen->hist[i] = 0;
- llen->score_var[i] = llen->score_sums[i] = llen->score2_sums[i] = 0.0;
- }
-}
-
-void
-free_hist(struct llen_str *llen)
-{
- if (llen->hist!=NULL) {
- free(llen->score_var);
- free(llen->score2_sums);
- free(llen->score_sums);
- free(llen->hist);
- llen->hist=NULL;
- }
-}
-
-void
-inithist(struct llen_str *llen, struct pstruct pst, int max_hscore)
-{
- llen->max = MAX_LLEN;
-
- llen->max_score = -1;
- llen->min_score=10000;
-
- alloc_hist(llen);
-
- llen->zero_s = 0;
- llen->min_length = 10000;
- llen->max_length = 0;
-}
-
-void
-addhist(struct llen_str *llen, int score, int length, int max_hscore)
-{
- int llength;
- double dscore;
-
- if ( score<=0 || length < LENGTH_CUTOFF) {
- llen->min_score = 0;
- llen->zero_s++;
- return;
- }
-
- if (score < llen->min_score) llen->min_score = score;
- if (score > llen->max_score) llen->max_score = score;
-
- if (length > llen->max_length) llen->max_length = length;
- if (length < llen->min_length) llen->min_length = length;
- if (score > max_hscore) score = max_hscore;
-
- llength = (int)(LN_FACT*log((double)length)+0.5);
-
- if (llength < 0 ) llength = 0;
- if (llength > llen->max) llength = llen->max;
- llen->hist[llength]++;
- dscore = (double)score;
- llen->score_sums[llength] += dscore;
- llen->score2_sums[llength] += dscore * dscore;
-}
-
-/* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */
-
-void
-inithistz(int mh, struct hist_str *histp )
-{
- int i;
-
- histp->z_calls = 0;
-
- histp->min_hist = 20;
- histp->max_hist = 120;
-
- histp->histint = (int)
- ((double)(histp->max_hist - histp->min_hist + 2)/(double)mh+0.5);
- histp->maxh = (int)
- ((double)(histp->max_hist - histp->min_hist + 2)/(double)histp->histint+0.5);
-
- if (histp->hist_a==NULL) {
- if ((histp->hist_a=(int *)calloc((size_t)histp->maxh,sizeof(int)))==
- NULL) {
- fprintf(stderr," cannot allocate %d for histogram\n",histp->maxh);
- histp->histflg = 0;
- }
- else histp->histflg = 1;
- }
- else {
- for (i=0; i<histp->maxh; i++) histp->hist_a[i]=0;
- }
- histp->entries = 0;
-}
-
-static double nrv[100]={
- 0.3098900570,-0.0313400923, 0.1131975903,-0.2832547606, 0.0073672659,
- 0.2914489107, 0.4209306311,-0.4630181404, 0.3326537896, 0.0050140359,
- -0.1117435426,-0.2835630301, 0.2302997065,-0.3102716394, 0.0819894916,
- -0.1676455701,-0.3782225018,-0.3204509938,-0.3594969187,-0.0308950398,
- 0.2922813812, 0.1337170751, 0.4666577031,-0.2917784349,-0.2438179916,
- 0.3002301394, 0.0231147123, 0.5687927366,-0.2318208709,-0.1476839273,
- -0.0385043851,-0.1213476523, 0.1486341995, 0.1027917167, 0.1409192644,
- -0.3280652579, 0.4232041455, 0.0775993309, 0.1159071787, 0.2769424442,
- 0.3197284751, 0.1507346903, 0.0028580909, 0.4825103412,-0.0496843610,
- -0.2754357656, 0.6021881753,-0.0816123956,-0.0899148991, 0.4847183201,
- 0.2151621865,-0.4542246220, 0.0690709102, 0.2461894193, 0.2126042295,
- -0.0767060668, 0.4819746149, 0.3323031326, 0.0177600676, 0.1143185210,
- 0.2653977455, 0.0921872958,-0.1330986718, 0.0412287716,-0.1691604748,
- -0.0529679078,-0.0194157955,-0.6117493924, 0.1199067932, 0.0210243193,
- -0.5832259838,-0.1685528664, 0.0008591271,-0.1120347822, 0.0839125069,
- -0.2787486831,-0.1937017962,-0.1915733940,-0.7888453635,-0.3316745163,
- 0.1180885226,-0.3347001067,-0.2477492636,-0.2445697600, 0.0001342482,
- -0.0015759812,-0.1516473992,-0.5202267615, 0.2136975210, 0.2500423188,
- -0.2402926401,-0.1094186280,-0.0618869933,-0.0815221188, 0.2623337275,
- 0.0219427302 -0.1774469919, 0.0828245026,-0.3271952808,-0.0632898028};
-
-void
-addhistz(double zs, struct hist_str *histp)
-{
- int ih, zi;
- double rv;
-
- rv = nrv[histp->z_calls++ % 100];
- zi = (int)(zs + 0.5+rv );
-
- if ((zi >= 0) && (zi <= 120)) histp->entries++;
-
- if (zi < histp->min_hist) zi = histp->min_hist;
- if (zi > histp->max_hist) zi = histp->max_hist;
-
- ih = (zi - histp->min_hist)/histp->histint;
-
- histp->hist_a[ih]++;
-}
-
-/* addhistzp() does not increase histp->entries since addhist did it already */
-/*
-void
-addhistzp(double zs, struct hist_str *histp)
-{
- int ih, zi;
- double rv;
-
- rv = nrv[histp->z_calls++ %100];
- zi = (int)(zs + 0.5 + rv);
-
- if (zi < histp->min_hist) zi = histp->min_hist;
- if (zi > histp->max_hist) zi = histp->max_hist;
-
- ih = (zi - histp->min_hist)/histp->histint;
-
- histp->hist_a[ih]++;
-}
-*/
-
-void
-prune_hist(struct llen_str *llen, int score, int length, int max_hscore,
- long *entries)
-{
- int llength;
- double dscore;
-
- if (score <= 0 || length < LENGTH_CUTOFF) return;
-
- if (score > max_hscore) score = max_hscore;
-
- llength = (int)(LN_FACT*log((double)length)+0.5);
-
- if (llength < 0 ) llength = 0;
- if (llength > llen->max) llength = llen->max;
- llen->hist[llength]--;
- dscore = (double)score;
- llen->score_sums[llength] -= dscore;
- llen->score2_sums[llength] -= dscore * dscore;
-
-/* (*entries)--; histp->entries is not yet initialized */
-}
-
-/* fit_llen: no trimming
- (1) regress scores vs log(n) using weighted variance
- (2) calculate mean variance after length regression
-*/
-
-void
-fit_llen(struct llen_str *llen, struct rstat_str *pr)
-{
- int j;
- int n;
- int n_size;
- double x, y2, u, z;
- double mean_x, mean_y, var_x, var_y, covar_xy;
- double mean_y2, covar_xy2, var_y2, dllj;
-
- double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
-
-/* now fit scores to best linear function of log(n), using
- simple linear regression */
-
- for (llen->min=0; llen->min < llen->max; llen->min++)
- if (llen->hist[llen->min]) break;
- llen->min--;
-
- for (n_size=0,j = llen->min; j < llen->max; j++) {
- if (llen->hist[j] > 1) {
- dllj = (double)llen->hist[j];
- llen->score_var[j] = llen->score2_sums[j]/dllj
- - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
- llen->score_var[j] /= (double)(llen->hist[j]-1);
- if (llen->score_var[j] <= 0.1 ) llen->score_var[j] = 0.1;
- n_size++;
- }
- }
-
- pr->nb_tot = n_size;
-
- n_w = 0.0;
- sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
- for (j = llen->min; j < llen->max; j++)
- if (llen->hist[j] > 1) {
- x = j + 0.5;
- dllj = (double)llen->hist[j];
- n_w += dllj/llen->score_var[j];
- sum_x += dllj * x / llen->score_var[j] ;
- sum_y += llen->score_sums[j] / llen->score_var[j];
- sum_x2 += dllj * x * x /llen->score_var[j];
- sum_xy += x * llen->score_sums[j]/llen->score_var[j];
- }
-
- if (n_size < 5 ) {
- llen->fit_flag=0;
- pr->rho = 0;
- pr->mu = sum_y/n_w;
- return;
- }
- else {
- det = n_w * sum_x2 - sum_x * sum_x;
- if (det > 0.001) {
- pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
- pr->rho_e = n_w/det;
- pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
- pr->mu_e = sum_x2/det;
- }
- else {
- llen->fit_flag = 0;
- pr->rho = 0;
- pr->mu = sum_y/n_w;
- return;
- }
- }
-
- det = n_w * sum_x2 - sum_x * sum_x;
- pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
- pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
-
- n = 0;
- mean_x = mean_y = mean_y2 = 0.0;
- var_x = var_y = 0.0;
- covar_xy = covar_xy2 = 0.0;
-
- for (j = llen->min; j <= llen->max; j++)
- if (llen->hist[j] > 1 ) {
- n += llen->hist[j];
- x = (double)j + 0.5;
- mean_x += (double)llen->hist[j] * x;
- mean_y += llen->score_sums[j];
- var_x += (double)llen->hist[j] * x * x;
- var_y += llen->score2_sums[j];
- covar_xy += x * llen->score_sums[j];
- }
- mean_x /= n; mean_y /= n;
- var_x = var_x / n - mean_x * mean_x;
- var_y = var_y / n - mean_y * mean_y;
-
- covar_xy = covar_xy / n - mean_x * mean_y;
-/*
- pr->rho = covar_xy / var_x;
- pr->mu = mean_y - pr->rho * mean_x;
-*/
- mean_y2 = covar_xy2 = var_y2 = 0.0;
- for (j = llen->min; j <= llen->max; j++)
- if (llen->hist[j] > 1) {
- x = (double)j + 0.5;
- u = pr->rho * x + pr->mu;
- y2 = llen->score2_sums[j] - 2.0 * llen->score_sums[j] * u + llen->hist[j] * u * u;
-/*
- dllj = (double)llen->hist[j];
- fprintf(stderr,"%.2f\t%d\t%g\t%g\n",x/LN_FACT,llen->hist[j],
- llen->score_sums[j]/dllj,y2/dllj);
-*/
- mean_y2 += y2;
- var_y2 += y2 * y2;
- covar_xy2 += x * y2;
- /* fprintf(stderr,"%6.1f %4d %8d %8d %7.2f %8.2f\n",
- x,llen->hist[j],llen->score_sums[j],llen->score2_sums[j],u,y2); */
- }
-
- pr->mean_var = mean_y2 /= (double)n;
- covar_xy2 = covar_xy2 / (double)n - mean_x * mean_y2;
-
- if (pr->mean_var <= 0.01) {
- llen->fit_flag = 0;
- pr->mean_var = (pr->mu > 1.0) ? pr->mu: 1.0;
- }
-
- /*
- fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var %.4f\n",
- pr->rho*LN_FACT,pr->mu,pr->mean_var);
- */
- if (n > 1) pr->var_e = (var_y2/n - mean_y2 * mean_y2)/(n-1);
- else pr->var_e = 0.0;
-
- if (llen->fit_flag) {
- pr->rho2 = covar_xy2 / var_x;
- pr->mu2 = pr->mean_var - pr->rho2 * mean_x;
- }
- else {
- pr->rho2 = 0;
- pr->mu2 = pr->mean_var;
- }
-
- if (pr->rho2 < 0.0 )
- z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
- else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
- if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
-
- pr->var_cutoff = pr->rho2 * LN_FACT*log(z) + pr->mu2;
-}
-
-/* fit_llens: trim high variance bins
- (1) regress scores vs log(n) using weighted variance
- (2) regress residuals vs log(n)
- (3) remove high variance bins
- (4) calculate mean variance after length regression
-*/
-
-void
-fit_llens(struct llen_str *llen, struct rstat_str *pr)
-{
- int j;
- int n, n_u2;
- double x, y, y2, u, u2, v, z;
- double mean_x, mean_y, var_x, var_y, covar_xy;
- double mean_y2, covar_xy2;
- double mean_u2, mean_3u2, dllj;
- double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
-
-/* now fit scores to best linear function of log(n), using
- simple linear regression */
-
- for (llen->min=0; llen->min < llen->max; llen->min++)
- if (llen->hist[llen->min]) break;
- llen->min--;
-
- for (j = llen->min; j < llen->max; j++) {
- if (llen->hist[j] > 1) {
- dllj = (double)llen->hist[j];
- llen->score_var[j] = (double)llen->score2_sums[j]/dllj
- - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
- llen->score_var[j] /= (double)(llen->hist[j]-1);
- if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
- }
- }
-
- n_w = 0.0;
- sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
- for (j = llen->min; j < llen->max; j++)
- if (llen->hist[j] > 1) {
- x = j + 0.5;
- dllj = (double)llen->hist[j];
- n_w += dllj/llen->score_var[j];
- sum_x += dllj * x / llen->score_var[j] ;
- sum_y += llen->score_sums[j] / llen->score_var[j];
- sum_x2 += dllj * x * x /llen->score_var[j];
- sum_xy += x * llen->score_sums[j]/llen->score_var[j];
- }
-
- det = n_w * sum_x2 - sum_x * sum_x;
- pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
- pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
-
-/* printf(" rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */
-
- n = 0;
- mean_x = mean_y = mean_y2 = 0.0;
- var_x = var_y = 0.0;
- covar_xy = covar_xy2 = 0.0;
-
- for (j = llen->min; j <= llen->max; j++)
- if (llen->hist[j] > 1 ) {
- n += llen->hist[j];
- x = (double)j + 0.5;
- dllj = (double)llen->hist[j];
- mean_x += dllj * x;
- mean_y += llen->score_sums[j];
- var_x += dllj * x * x;
- var_y += llen->score2_sums[j];
- covar_xy += x * llen->score_sums[j];
- }
- mean_x /= n; mean_y /= n;
- var_x = var_x / n - mean_x * mean_x;
- var_y = var_y / n - mean_y * mean_y;
-
- covar_xy = covar_xy / n - mean_x * mean_y;
-/* pr->rho = covar_xy / var_x;
- pr->mu = mean_y - pr->rho * mean_x;
-*/
-
- mean_y2 = covar_xy2 = 0.0;
- for (j = llen->min; j <= llen->max; j++)
- if (llen->hist[j] > 1) {
- x = (double)j + 0.5;
- u = pr->rho * x + pr->mu;
- y2 = llen->score2_sums[j] - 2 * llen->score_sums[j] * u + llen->hist[j] * u * u;
- mean_y2 += y2;
- covar_xy2 += x * y2;
- }
-
- mean_y2 /= n;
- covar_xy2 = covar_xy2 / n - mean_x * mean_y2;
- pr->rho2 = covar_xy2 / var_x;
- pr->mu2 = mean_y2 - pr->rho2 * mean_x;
-
- if (pr->rho2 < 0.0 )
- z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
- else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
- if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
-
- pr->var_cutoff = pr->rho2*LN_FACT*log(z) + pr->mu2;
-
-/* fprintf(stderr,"\nminimum allowed predicted variance (%0.2f) at n = %.0f\n",
- pr->var_cutoff,z);
-*/
- mean_u2 = 0.0;
- n_u2 = 0;
- for ( j = llen->min; j < llen->max; j++) {
- y = j+0.5;
- dllj = (double)llen->hist[j];
- x = pr->rho * y + pr->mu;
- v = pr->rho2 * y + pr->mu2;
- if (v < pr->var_cutoff) v = pr->var_cutoff;
- if (llen->hist[j]> 1) {
- u2 = (llen->score2_sums[j] - 2 * x * llen->score_sums[j] + dllj * x * x) - v*dllj;
- mean_u2 += llen->score_var[j] = u2*u2/(llen->hist[j]-1);
- n_u2++;
- /* fprintf(stderr," %d (%d) u2: %.2f v*ll: %.2f %.2f\n",
- j,llen->hist[j],u2,v*dllj,sqrt(llen->score_var[j])); */
- }
- else llen->score_var[j] = -1.0;
- }
-
- mean_u2 = sqrt(mean_u2/(double)n_u2);
- /* fprintf(stderr," mean s.d.: %.2f\n",mean_u2); */
-
- mean_3u2 = mean_u2*3.0;
-
- for (j = llen->min; j < llen->max; j++) {
- if (llen->hist[j] <= 1) continue;
- if (sqrt(llen->score_var[j]) > mean_3u2) {
- /* fprintf(stderr," removing %d %d %.2f\n",
- j, (int)(exp((double)j/LN_FACT)-0.5),
- sqrt(llen->score_var[j]));
- */
- pr->nb_trimmed++;
- pr->n1_trimmed += llen->hist[j];
- llen->hist[j] = 0;
- }
- }
- fit_llen(llen, pr);
-}
-
-struct s2str {double s; int n;};
-void s2_sort ( struct s2str *sptr, int n);
-
-void
-fit_llen2(struct llen_str *llen, struct rstat_str *pr)
-{
- int j;
- int n, n_y2, llen_delta, llen_del05;
- int n_size;
- double x, y2, u;
- double mean_x, mean_y, var_x, var_y, covar_xy;
- double mean_y2, covar_xy2;
- struct s2str *ss2;
-
- double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
-
-/* now fit scores to best linear function of log(n), using
- simple linear regression */
-
- for (llen->min=0; llen->min < llen->max; llen->min++)
- if (llen->hist[llen->min]) break;
-
- for ( ; llen->max > llen->min; llen->max--)
- if (llen->hist[llen->max]) break;
-
- for (n_size=0,j = llen->min; j < llen->max; j++) {
- if (llen->hist[j] > 1) {
- llen->score_var[j] = llen->score2_sums[j]/(double)llen->hist[j]
- - (llen->score_sums[j]/(double)llen->hist[j])
- * (llen->score_sums[j]/(double)llen->hist[j]);
- llen->score_var[j] /= (double)(llen->hist[j]-1);
- if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
- n_size++;
- }
- }
-
- n_w = 0.0;
- sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
- for (j = llen->min; j < llen->max; j++)
- if (llen->hist[j] > 1) {
- x = j + 0.5;
- n_w += (double)llen->hist[j]/llen->score_var[j];
- sum_x += (double)llen->hist[j] * x / llen->score_var[j] ;
- sum_y += llen->score_sums[j] / llen->score_var[j];
- sum_x2 += (double)llen->hist[j] * x * x /llen->score_var[j];
- sum_xy += x * llen->score_sums[j]/llen->score_var[j];
- }
-
- if (n_size < 5 ) {
- llen->fit_flag=0;
- pr->rho = 0;
- pr->mu = sum_y/n_w;
- }
- else {
- det = n_w * sum_x2 - sum_x * sum_x;
- if (det > 0.001) {
- pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
- pr->rho_e = n_w/det;
- pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
- pr->mu_e = sum_x2/det;
- }
- else {
- llen->fit_flag = 0;
- pr->rho = 0;
- pr->mu = sum_y/n_w;
- }
- }
-
- det = n_w * sum_x2 - sum_x * sum_x;
- pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
- pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
-
-/* fprintf(stderr," rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */
-
- n = 0;
- mean_x = mean_y = mean_y2 = 0.0;
- var_x = var_y = 0.0;
- covar_xy = covar_xy2 = 0.0;
-
- for (j = llen->min; j <= llen->max; j++)
- if (llen->hist[j] > 1 ) {
- n += llen->hist[j];
- x = (double)j + 0.5;
- mean_x += (double)llen->hist[j] * x;
- mean_y += llen->score_sums[j];
- var_x += (double)llen->hist[j] * x * x;
- var_y += llen->score2_sums[j];
- covar_xy += x * llen->score_sums[j];
- }
- mean_x /= n; mean_y /= n;
- var_x = var_x / n - mean_x * mean_x;
- var_y = var_y / n - mean_y * mean_y;
-
- covar_xy = covar_xy / n - mean_x * mean_y;
-/*
- pr->rho = covar_xy / var_x;
- pr->mu = mean_y - pr->rho * mean_x;
-*/
-
- if ((ss2=(struct s2str *)calloc(llen->max+1,sizeof(struct s2str)))==NULL) {
- fprintf(stderr," cannot allocate ss2\n");
- return;
- }
-
- mean_y2 = 0.0;
- n_y2 = n = 0;
- for (j = llen->min; j <= llen->max; j++)
- if (llen->hist[j] > VHISTC) {
- n++;
- n_y2 += ss2[j].n = llen->hist[j];
- x = (double)j + 0.5;
- u = pr->rho * x + pr->mu;
- ss2[j].s = y2 = llen->score2_sums[j] - 2*llen->score_sums[j]*u + llen->hist[j]*u*u;
- mean_y2 += y2;
- }
- pr->mean_var = mean_y2/(double)n_y2;
-
- s2_sort(ss2+llen->min,llen->max-llen->min+1);
-
- /* fprintf(stderr,"llen->min: %d, max: %d\n",llen->min,llen->max); */
- llen_delta = 0;
- for (j=llen->min; j<=llen->max; j++) {
- if (ss2[j].n > 1) {
- llen_delta++;
-/* fprintf(stderr,"%d\t%d\t%.2f\t%.4f\n",
- j,ss2[j].n,ss2[j].s,ss2[j].s/ss2[j].n);
-*/
- }
- }
-
- llen_del05 = llen_delta/20;
- mean_y2 = 0.0;
- n_y2 = 0;
- for (j = llen->min; j<llen->min+llen_del05; j++) {
- pr->n1_trimmed += ss2[j].n;
- pr->nb_trimmed++;
- }
- for (j = llen->min+llen_del05; j <= llen->min+llen_delta-llen_del05; j++)
- if (ss2[j].n > 1) {
- mean_y2 += ss2[j].s;
- n_y2 += ss2[j].n;
- }
- for (j = llen->min+llen_delta-llen_del05+1; j< llen->max; j++) {
- pr->n1_trimmed += ss2[j].n;
- pr->nb_trimmed++;
- }
-
- free(ss2);
- if (n_y2 > 1) pr->mean_var = mean_y2/(double)n_y2;
-
- /* fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var: %.4f/%d\n",
- pr->rho*LN_FACT,pr->mu,pr->mean_var,n); */
-
- pr->var_e = 0.0;
-}
-
-/* REG_STATS - Z() from rho/mu/mean_var */
-double find_zr(int score, double escore, int length, double comp, struct pstat_str *pu)
-{
- double log_len, z;
-
- if (score <= 0) return 0;
- if ( length < LENGTH_CUTOFF) return 0;
-
- log_len = LN_FACT*log((double)(length));
-/* var = pu->r_u.rg.rho2 * log_len + pu->r_u.rg.mu2;
- if (var < pu->r_u.rg.var_cutoff) var = pu->r_u.rg.var_cutoff;
-*/
-
- z = ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu) / sqrt(pu->r_u.rg.mean_var);
-
- return (50.0 + z*10.0);
-}
-
-/* REG2_STATS Z() from rho/mu, rho2/mu2 */
-double find_zr2(int score, double escore, int length, double comp, struct pstat_str *pu)
-{
- double log_len, var;
- double z;
-
- if ( length < LENGTH_CUTOFF) return 0;
-
- log_len = LN_FACT*log((double)(length));
-
- var = pu->r_u.rg.rho2 * log_len + pu->r_u.rg.mu2;
- if (var < pu->r_u.rg.var_cutoff) var = pu->r_u.rg.mean_var;
-
- z = ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu) / sqrt(var);
-
- return (50.0 + z*10.0);
-}
-
-#ifdef USE_LNSTATS
-/* LN_STATS - ln()-scaled mu, mean_var */
-double find_zl(int score, int length, double comp, struct pstat_str *pu)
-{
- double ls, z;
-
- ls = (double)score*LN200/log((double)length);
-
- z = (ls - pu->r_u.rg.mu) / sqrt(pu->r_u.rg.mean_var);
-
- return (50.0 + z*10.0);
-}
-#endif
-
-/* MLE_STATS - Z() from MLE for lambda, K */
-double
-find_ze(int score, double escore, int length, double comp, struct pstat_str *pu)
-{
- double z, mp, np, a_n1;
-
- a_n1 = (double)length;
-
- mp = pu->r_u.ag.a_n0;
- np = a_n1;
-
- if (np < 1.0) np = 1.0;
- if (mp < 1.0) mp = 1.0;
-
- z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * np * mp);
-
- z = -z + EULER_G;
- z /= - PI_SQRT6;
-
- return (50.0 + z*10.0);
-}
-
-/* MLE2_STATS - Z() from MLE for mle_a0..2, mle_b1, length, comp */
-double
-find_ze2(int score, double escore, int length, double comp, struct pstat_str *pu)
-{
- double z, mp, np, a_n1;
-
- a_n1 = (double)length;
-
- if (comp <= 0.0) comp = pu->r_u.m2.ave_comp;
-
- /* avoid very biased comp estimates */
- /* comp = exp((4.0*log(comp)+log(pu->r_u.m2.ave_comp))/5.0); */
-
- mp = pu->r_u.m2.a_n0;
- np = a_n1;
-
- if (np < 1.0) np = 1.0;
- if (mp < 1.0) mp = 1.0;
-
- z = (-(pu->r_u.m2.mle2_a0 + pu->r_u.m2.mle2_a1 * comp + pu->r_u.m2.mle2_a2 * comp * log(np * mp)) + score) / (pu->r_u.m2.mle2_b1 * comp);
-
- z = -z + EULER_G;
- z /= - PI_SQRT6;
-
- return (50.0 + z*10.0);
-}
-
-/* AG_STATS - Altschul-Gish Lamdba, K */
-double
-find_za(int score, double escore, int length, double comp, struct pstat_str *pu)
-{
- double z, mp, np, a_n1, a_n1f;
-
- a_n1 = (double)length;
- a_n1f = log(a_n1)/pu->r_u.ag.H;
-
- mp = pu->r_u.ag.a_n0 - pu->r_u.ag.a_n0f - a_n1f;
- np = a_n1 - pu->r_u.ag.a_n0f - a_n1f;
-
- if (np < 1.0) np = 1.0;
- if (mp < 1.0) mp = 1.0;
-
- z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * np * mp);
-
- z = -z + EULER_G;
- z /= - PI_SQRT6;
-
- return (50.0 + z*10.0);
-}
-
-double find_zn(int score, double escore, int length, double comp, struct pstat_str *pu)
-{
- double z;
-
- z = ((double)score - pu->r_u.rg.mu) / sqrt(pu->r_u.rg.mean_var);
-
- return (50.0 + z*10.0);
-}
-
-/* computes E value for a given z value, assuming extreme value distribution */
-double
-z_to_E(double zs, long entries, struct db_str db)
-{
- double e, n;
-
- /* if (db->entries < 5) return (double)db.entries; */
- if (entries < 1) { n = db.entries;}
- else {n = entries;}
-
- if (zs > ZS_MAX) return 0.0;
-
-#ifndef NORMAL_DIST
- e = exp(- PI_SQRT6 * zs - .577216);
- return n * (e > .01 ? 1.0 - exp(-e) : e);
-#else
- return n * erfc(zs/M_SQRT2)/2.0;
-#endif
-}
-
-double
-zs_to_p(double zs)
-{
- double e, z;
-
- /* if (db.entries < 5) return 0.0; */
-
- z = (zs - 50.0)/10.0;
-
- if (z > ZS_MAX) return 0.0;
-
-#ifndef NORMAL_DIST
- e = exp(- PI_SQRT6 * z - EULER_G);
- return (e > .01 ? 1.0 - exp(-e) : e);
-#else
- return erfc(zs/M_SQRT2)/2.0;
-#endif
-}
-
-double
-zs_to_bit(double zs, int n0, int n1)
-{
- double z, a_n0, a_n1;
-
- z = (zs - 50.0)/10.0;
- a_n0 = (double)n0;
- a_n1 = (double)n1;
-
- return (PI_SQRT6 * z + EULER_G + log(a_n0*a_n1))/M_LN2 ;
-}
-
-/* computes E-value for a given z value, assuming extreme value distribution */
-double
-zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db)
-{
- double e, z, k;
-
- /* if (db->entries < 5) return 0.0; */
-
- z = (zs - 50.0)/10.0;
-
- if (z > ZS_MAX ) return 0.0;
-
- if (entries < 1) entries = db.entries;
-
- if (dnaseq == SEQT_DNA || dnaseq == SEQT_RNA) {
- k = (double)db.length /(double)n1;
- if (db.carry > 0) {
- k += ((double)db.carry * (double)LONG_MAX)/(double)n1;
- }
- }
- else k = (double)entries;
-
- if (k < 1.0) k = 1.0;
-
-#ifndef NORMAL_DIST
- z *= PI_SQRT6;
- z += EULER_G;
- e = exp(-z);
- return k * (e > .01 ? 1.0 - exp(-e) : e);
-#else
- return k * erfc(z/M_SQRT2)/2.0;
-#endif
-}
-
-#ifdef NORMAL_DIST
-double np_to_z(double, int *);
-#endif
-
-/* computes E-value for a given z value, assuming extreme value distribution */
-double
-E_to_zs(double E, long entries)
-{
- double e, z;
- int error;
-
- e = E/(double)entries;
-
-#ifndef NORMAL_DIST
- z = (log(e)+EULER_G)/(- PI_SQRT6);
- return z*10.0+50.0;
-#else
- z = np_to_z(1.0-e,&error);
-
- if (!error) return z*10.0+50.0;
- else return 0.0;
-#endif
-}
-
-/* computes 1.0 - E value for a given z value, assuming extreme value
- distribution */
-double
-zs_to_Ec(double zs, long entries)
-{
- double e, z;
-
- if (entries < 5) return 0.0;
-
- z = (zs - 50.0)/10.0;
-
- if (z > ZS_MAX) return 1.0;
-
-#ifndef NORMAL_DIST
- e = exp(- PI_SQRT6 * z - EULER_G);
- return (double)entries * (e > .01 ? exp(-e) : 1.0 - e);
-#else
- return (double)entries*erf(z/M_SQRT2)/2.0;
-#endif
-}
-
-/* calculate a threshold score, given an E() value and Lambda,K,H */
-
-int
-E1_to_s(double e_val, int n0, int n1, struct pstat_str *pu) {
- double mp, np, a_n0, a_n0f, a_n1;
- int score;
-
- a_n0 = (double)n0;
- a_n1 = (double)n1;
- a_n0f = log(pu->r_u.ag.K * a_n0 * a_n1)/pu->r_u.ag.H;
-
- mp = a_n0 - a_n0f;
- np = a_n1 - a_n0f;
-
- if (np < 1.0) np = 1.0;
- if (mp < 1.0) mp = 1.0;
-
- score = (int)((log( pu->r_u.ag.K * mp * np) - log(e_val))/pu->r_u.ag.Lambda +0.5);
- if (score < 0) score = 0;
- return score;
-}
-
-/* no longer used; stat_str returned by process_hist
-void
-summ_stats(char *s_str, struct pstat_str *pu)
-{
- strcpy(s_str,f_string);
-}
-*/
-
-void
-vsort(v,s,n)
- double *v; int *s, n;
-{
- int gap, i, j;
- double tmp;
- int itmp;
-
- for (gap=n/2; gap>0; gap/=2)
- for (i=gap; i<n; i++)
- for (j=i-gap; j>=0; j -= gap) {
- if (v[j] >= v[j+gap]) break;
- tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
- itmp = s[j]; s[j]=s[j+gap]; s[j+gap]=itmp;
- }
-}
-
-/*
-void s_sort (double **ptr, int nbest)
-{
- int gap, i, j;
- double *tmp;
-
- for (gap = nbest/2; gap > 0; gap /= 2)
- for (i = gap; i < nbest; i++)
- for (j = i - gap; j >= 0; j-= gap) {
- if (*ptr[j] >= *ptr[j + gap]) break;
- tmp = ptr[j];
- ptr[j] = ptr[j + gap];
- ptr[j + gap] = tmp;
- }
-}
-*/
-
-void ss_sort (int *ptr, int n)
-{
- int gap, i, j;
- int tmp;
-
- for (gap = n/2; gap > 0; gap /= 2)
- for (i = gap; i < n; i++)
- for (j = i - gap; j >= 0; j-= gap) {
- if (ptr[j] >= ptr[j + gap]) break;
- tmp = ptr[j];
- ptr[j] = ptr[j + gap];
- ptr[j + gap] = tmp;
- }
-}
-
-
-void s2_sort (struct s2str *ptr, int n)
-{
- int gap, i, j;
- struct s2str tmp;
-
- for (gap = n/2; gap > 0; gap /= 2)
- for (i = gap; i < n; i++)
- for (j = i - gap; j >= 0; j-= gap) {
- if (ptr[j].s >= ptr[j + gap].s) break;
- tmp.s = ptr[j].s;
- tmp.n = ptr[j].n;
- ptr[j].s = ptr[j + gap].s;
- ptr[j].n = ptr[j + gap].n;
- ptr[j + gap].s = tmp.s;
- ptr[j + gap].n = tmp.n;
- }
-}
-
-void last_stats() {}
-
-void
-scale_scores(struct beststr **bptr, int nbest, struct db_str db,
- struct pstruct pst, struct pstat_str *rs)
-{
- int i;
- double zscore;
-
- if (pst.zsflag < 0 || pst.zsflag_f < 0) return;
-
- for (i=0; i<nbest; i++) {
- zscore = find_zp(bptr[i]->score[pst.score_ix], bptr[i]->escore,
- bptr[i]->n1,bptr[i]->comp,rs);
- bptr[i]->zscore = zscore;
- bptr[i]->escore
- =zs_to_E(zscore,bptr[i]->n1,pst.dnaseq, pst.zdb_size,db);
- }
- sortbeste(bptr,nbest);
-}
-
-#ifdef NORMAL_DIST
-/* ALGORITHM AS241 APPL. STATIST. (1988) VOL. 37, NO. 3
-
- Produces the normal deviate Z corresponding to a given lower
- tail area of P; Z is accurate to about 1 part in 10**16.
-
- The hash sums below are the sums of the mantissas of the
- coefficients. They are included for use in checking
- transcription.
-*/
-
-double np_to_z(double p, int *fault) {
-
- double q, r, ppnd16;
-
- double zero = 0.0, one = 1.0, half = 0.5;
- double split1 = 0.425, split2 = 5.0;
- double const1 = 0.180625, const2 = 1.6;
-
-/* Coefficients for P close to 0.5 */
-
- double a0 = 3.3871328727963666080e0;
- double a1 = 1.3314166789178437745e+2;
- double a2 = 1.9715909503065514427e+3;
- double a3 = 1.3731693765509461125e+4;
- double a4 = 4.5921953931549871457e+4;
- double a5 = 6.7265770927008700853e+4;
- double a6 = 3.3430575583588128105e+4;
- double a7 = 2.5090809287301226727e+3;
- double b1 = 4.2313330701600911252e+1;
- double b2 = 6.8718700749205790830e+2;
- double b3 = 5.3941960214247511077e+3;
- double b4 = 2.1213794301586595867e+4;
- double b5 = 3.9307895800092710610e+4;
- double b6 = 2.8729085735721942674e+4;
- double b7 = 5.2264952788528545610e+3;
-
- double sum_ab= 55.8831928806149014439;
-/*
- Coefficients for P not close to 0, 0.5 or 1.
-*/
-
- double c0 = 1.42343711074968357734;
- double c1 = 4.63033784615654529590;
- double c2 = 5.76949722146069140550;
- double c3 = 3.64784832476320460504;
- double c4 = 1.27045825245236838258;
- double c5 = 2.41780725177450611770e-1;
- double c6 = 2.27238449892691845833e-2;
- double c7 = 7.74545014278341407640e-4;
- double d1 = 2.05319162663775882187;
- double d2 = 1.67638483018380384940;
- double d3 = 6.89767334985100004550e-1;
- double d4 = 1.48103976427480074590e-1;
- double d5 = 1.51986665636164571966e-2;
- double d6 = 5.47593808499534494600e-4;
- double d7 = 1.05075007164441684324e-9;
-
- double sum_cd=49.33206503301610289036;
-/*
- Coefficients for P near 0 or 1.
-*/
- double e0 = 6.65790464350110377720e0;
- double e1 = 5.46378491116411436990e0;
- double e2 = 1.78482653991729133580e0;
- double e3 = 2.96560571828504891230e-1;
- double e4 = 2.65321895265761230930e-2;
- double e5 = 1.24266094738807843860e-3;
- double e6 = 2.71155556874348757815e-5;
- double e7 = 2.01033439929228813265e-7;
- double f1 = 5.99832206555887937690e-1;
- double f2 = 1.36929880922735805310e-1;
- double f3 = 1.48753612908506148525e-2;
- double f4 = 7.86869131145613259100e-4;
- double f5 = 1.84631831751005468180e-5;
- double f6 = 1.42151175831644588870e-7;
- double f7 = 2.04426310338993978564e-15;
-
- double sum_ef=47.52583317549289671629;
-
- double sum_tmp = 0.0;
-
- /*
- sum_tmp = a0+a1+a2+a3+a4+a5+a6+a7+b1+b2+b3+b4+b5+b6+b7;
- if (fabs(sum_tmp - sum_ab) > 1e-12) {
- fprintf (stderr," sum_ab error: %lg %lg\n",sum_tmp,sum_ab);
- *fault = 1;
- return zero;
- }
-
- sum_tmp = c0+c1+c2+c3+c4+c5+c6+c7+d1+d2+d3+d4+d5+d6+d7;
- if (fabs(sum_tmp - sum_cd) > 1e-12) {
- fprintf (stderr," sum_cd error: %lg %lg\n",sum_tmp,sum_cd);
- *fault = 1;
- return zero;
- }
- sum_tmp = e0+e1+e2+e3+e4+e5+e6+e7+f1+f2+f3+f4+f5+f6+f7;
- if (fabs(sum_tmp - sum_ef) > 1e-12) {
- fprintf (stderr," sum_ef error: %lg %lg\n",sum_tmp,sum_ef);
- *fault = 1;
- return zero;
- }
- */
-
- *fault = 0;
- q = p - half;
- if (fabs(q) <= split1) {
- r = const1 - q * q;
- return q * (((((((a7 * r + a6) * r + a5) * r + a4) * r + a3)
- * r + a2) * r + a1) * r + a0) /
- (((((((b7 * r + b6) * r + b5) * r + b4) * r + b3)
- * r + b2) * r + b1) * r + one);
- }
- else {
- r = (q < zero) ? p : one - p;
- if (r <= zero) {
- *fault = 1;
- return zero;
- }
- r = sqrt(-log(r));
- if (r <= split2) {
- r -= const2;
- ppnd16 = (((((((c7 * r + c6) * r + c5) * r + c4) * r + c3)
- * r + c2) * r + c1) * r + c0) /
- (((((((d7 * r + d6) * r + d5) * r + d4) * r + d3)
- * r + d2) * r + d1) * r + one);
- }
- else {
- r -= split2;
- ppnd16 = (((((((e7 * r + e6) * r + e5) * r + e4) * r + e3)
- * r + e2) * r + e1) * r + e0) /
- (((((((f7 * r + f6) * r + f5) * r + f4) * r + f3)
- * r + f2) * r + f1) * r + one);
- }
- if (q < zero) return -ppnd16;
- else return ppnd16;
- }
-}
-#endif