--- /dev/null
+/**************** Statistical Significance Parameter Subroutine ****************
+
+ $Name: fa_34_26_5 $ - $Id: karlin.c,v 1.18 2006/06/01 16:05:30 wrp Exp $
+
+ Version 1.0 February 2, 1990
+ Version 2.0 March 18, 1993
+
+ Program by: Stephen Altschul
+
+ Address: National Center for Biotechnology Information
+ National Library of Medicine
+ National Institutes of Health
+ Bethesda, MD 20894
+
+ Internet: altschul@ncbi.nlm.nih.gov
+
+ See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
+ Significance of Molecular Sequence Features by Using General Scoring
+ Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.
+
+ Computes the parameters lambda and K for use in calculating the
+ statistical significance of high-scoring segments or subalignments.
+
+ The scoring scheme must be integer valued. A positive score must be
+ possible, but the expected (mean) score must be negative.
+
+ A program that calls this routine must provide the value of the lowest
+ possible score, the value of the greatest possible score, and a pointer
+ to an array of probabilities for the occurence of all scores between
+ these two extreme scores. For example, if score -2 occurs with
+ probability 0.7, score 0 occurs with probability 0.1, and score 3
+ occurs with probability 0.2, then the subroutine must be called with
+ low = -2, high = 3, and pr pointing to the array of values
+ { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide
+ pointers to lambda and K; the subroutine will then calculate the values
+ of these two parameters. In this example, lambda=0.330 and K=0.154.
+
+ The parameters lambda and K can be used as follows. Suppose we are
+ given a length N random sequence of independent letters. Associated
+ with each letter is a score, and the probabilities of the letters
+ determine the probability for each score. Let S be the aggregate score
+ of the highest scoring contiguous segment of this sequence. Then if N
+ is sufficiently large (greater than 100), the following bound on the
+ probability that S is greater than or equal to x applies:
+
+ P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ].
+
+ In other words, the p-value for this segment can be written as
+ 1-exp[-KN*exp(-lambda*S)].
+
+ This formula can be applied to pairwise sequence comparison by assigning
+ scores to pairs of letters (e.g. amino acids), and by replacing N in the
+ formula with N*M, where N and M are the lengths of the two sequences
+ being compared.
+
+ In addition, letting y = KN*exp(-lambda*S), the p-value for finding m
+ distinct segments all with score >= S is given by:
+
+ 2 m-1 -y
+ 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e
+
+ Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
+ as the previous formula.
+
+*******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#define MAXIT 25 /* Maximum number of iterations used in calculating lambda */
+#define NMAP_X 23
+#define NMAP 33
+
+#define TINY 1e-6
+
+/* first build a residue map to automatically put residues in score bins */
+
+#include "defs.h"
+#include "param.h"
+
+/* initialize the Karlin frequency, probability arrays using
+ a specific query sequence */
+
+int karlin(int , int, double *, double *, double *);
+static int karlin_k(int , int , double *, double *, double *, double *);
+
+void init_karlin(const unsigned char *aa0, int n0, struct pstruct *ppst,
+ double *aa0_f, double **kp)
+{
+ int kar_nsq, kar_range, kar_min, kar_max;
+
+ const unsigned char *aa0p;
+ int i;
+ int r_cnt[NMAP+1];
+ double fn0, *kar_p;
+
+ kar_range = ppst->pam_h - ppst->pam_l + 1;
+ if (*kp == NULL) {
+ if ((kar_p=(double *)calloc(kar_range+1,sizeof(double)))==NULL) {
+ fprintf(stderr," cannot allocate kar_p array: %d\n",kar_range+1);
+ exit(1);
+ }
+ *kp = kar_p;
+ }
+ kar_nsq = ppst->nsq; /* alphabet size */
+ kar_min = ppst->pam_l; /* low pam value */
+ kar_max = ppst->pam_h; /* high pam value */
+
+ /* must have at least 1 residue of each type */
+ r_cnt[NMAP]=0;
+ for (i=1; i<=kar_nsq; i++) r_cnt[i]=1;
+
+ fn0 = 100.0/(double)(n0+kar_nsq); /* weight of each residue */
+
+ aa0p = aa0;
+ /* increment residue count for each residue in query sequence */
+ while (*aa0p) r_cnt[ppst->hsqx[*aa0p++]]++;
+
+ /* map all unmapped residues to 'X' */
+ r_cnt[NMAP_X] += r_cnt[NMAP];
+
+ for (i=1; i<=kar_nsq; i++) aa0_f[i] = fn0*(double)r_cnt[i];
+}
+
+double nt_f[] = {0.0, 0.25, 0.25, 0.25, 0.25 };
+
+/* Robinson and Robinson frequencies */
+double aa_f[] = {
+/* NULL */ 0.00,
+/* A */ 0.0780474700897585,
+/* R */ 0.0512953149316987,
+/* N */ 0.0448725775979007,
+/* D */ 0.0536397361638076,
+/* C */ 0.0192460110427568,
+/* Q */ 0.0426436013507063,
+/* E */ 0.0629485981204668,
+/* G */ 0.0737715654561964,
+/* H */ 0.0219922696262025,
+/* I */ 0.0514196403000682,
+/* L */ 0.090191394464413,
+/* K */ 0.0574383201866657,
+/* M */ 0.0224251883196316,
+/* F */ 0.0385564048655621,
+/* P */ 0.0520279465667327,
+/* S */ 0.0711984743501224,
+/* T */ 0.0584129422708473,
+/* W */ 0.013298374223799,
+/* Y */ 0.0321647488738564,
+/* V */ 0.0644094211988074};
+
+/* initialize the Karlin frequency, probability arrays using
+ an "average" composition (average length if n0 <=0) */
+
+void
+init_karlin_a(struct pstruct *ppst, double *aa0_f, double **kp)
+{
+ int kar_nsq, kar_range;
+
+ int i;
+ double fn0, *kar_p;
+
+ kar_range = ppst->pam_h - ppst->pam_l + 1;
+ if (*kp == NULL) {
+ if ((kar_p=(double *)calloc(kar_range+1,sizeof(double)))==NULL) {
+ fprintf(stderr," cannot allocate kar_p array: %d\n",kar_range+1);
+ exit(1);
+ }
+ *kp = kar_p;
+ }
+
+ if (ppst->nt_align) {
+ kar_nsq = 4;
+ for (i=1; i<=kar_nsq; i++) aa0_f[i] = nt_f[i];
+ }
+ else if (ppst->dnaseq==SEQT_PROT || ppst->dnaseq == SEQT_UNK) {
+ kar_nsq = 20;
+ for (i=1; i<=kar_nsq; i++) aa0_f[i] = aa_f[i];
+ }
+ else {
+ kar_nsq = ppst->nsq;
+ fn0 = 1.0/(double)(kar_nsq-1);
+ for (i=1; i< kar_nsq; i++) aa0_f[i] = fn0;
+ aa0_f[kar_nsq]=0.0;
+ }
+
+}
+
+/* calculate set up karlin() to calculate Lambda, K, by calculating
+ aa1 frequencies */
+int
+do_karlin(const unsigned char *aa1, int n1,
+ int **pam2, struct pstruct *ppst,
+ double *aa0_f, double *kar_p, double *lambda, double *H)
+{
+ register unsigned const char *aap;
+ int kar_range, kar_min, kar_max, kar_nsq;
+ int r_cnt[NMAP+1];
+ double aa1_f[NMAP];
+ double fn1, kar_tot;
+ int i, j;
+
+ kar_nsq = ppst->nsq;
+ kar_min = ppst->pam_l;
+ kar_max = ppst->pam_h;
+ kar_range = kar_max - kar_min + 1;
+
+ r_cnt[NMAP]=0;
+ for (i=1; i<=kar_nsq; i++) r_cnt[i]=1;
+
+ /* residue counts */
+
+ aap=aa1;
+ while (*aap) r_cnt[ppst->hsqx[*aap++]]++;
+
+ r_cnt[NMAP_X] += r_cnt[NMAP];
+
+ /* residue frequencies */
+ fn1 = 100.0/(double)(n1+kar_nsq);
+ for (i=1; i<=kar_nsq; i++) aa1_f[i]= fn1*(double)r_cnt[i];
+
+ for (i=0; i<=kar_range; i++) kar_p[i] = 0.0;
+
+ for (i=1; i<=kar_nsq; i++) {
+ for (j=1; j<=kar_nsq; j++)
+ kar_p[pam2[i][j]-kar_min] += aa0_f[i]*aa1_f[j];
+ }
+
+ kar_tot = 0.0;
+ for (i=0; i<=kar_range; i++) kar_tot += kar_p[i];
+ if (kar_tot <= 0.00001) return 0;
+
+ for (i=0; i<=kar_range; i++) kar_p[i] /= kar_tot;
+
+ return karlin(kar_min, kar_max, kar_p, lambda, H);
+}
+
+int
+do_karlin_a(int **pam2, struct pstruct *ppst,
+ double *aa0_f, double *kar_p, double *lambda, double *K, double *H)
+{
+ double *aa1fp;
+ int kar_range, kar_min, kar_max, kar_nsq;
+ double aa1_f[NMAP];
+ double fn1, kar_tot;
+ int i, j;
+
+ kar_min = ppst->pam_l;
+ kar_max = ppst->pam_h;
+ kar_range = kar_max - kar_min + 1;
+
+ kar_tot = 0.0;
+ if (ppst->nt_align ) {
+ kar_nsq = 4;
+ aa1fp = nt_f;
+ for (i=1; i<=kar_nsq; i++) {kar_tot += aa1fp[i];}
+ for (i=1; i<=kar_nsq; i++) {aa1_f[i]= aa1fp[i]/kar_tot;}
+ }
+ else if (!ppst->nt_align) {
+ kar_nsq = 20;
+ aa1fp = aa_f;
+ for (i=1; i<=kar_nsq; i++) {kar_tot += aa1fp[i];}
+ for (i=1; i<=kar_nsq; i++) {aa1_f[i]= aa1fp[i]/kar_tot;}
+ }
+ else {
+ kar_nsq = ppst->nsq;
+ fn1 = 1.0/(double)(kar_nsq-1);
+ for (i=1; i< kar_nsq; i++) aa1_f[i] = fn1;
+ aa1_f[kar_nsq]=0.0;
+ }
+
+ for (i=0; i<=kar_range; i++) kar_p[i] = 0.0;
+
+ for (i=1; i<=kar_nsq; i++) {
+ for (j=1; j<kar_nsq; j++)
+ kar_p[pam2[i][j]-kar_min] += aa0_f[i]*aa1_f[j];
+ }
+
+ kar_tot = 0.0;
+ for (i=0; i<=kar_range; i++) kar_tot += kar_p[i];
+ if (kar_tot <= 0.00001) return 0;
+
+ for (i=0; i<=kar_range; i++) kar_p[i] /= kar_tot;
+
+ return karlin_k(kar_min, kar_max, kar_p, lambda, K, H);
+}
+
+/* take a array of letters and pam information and get *lambda, *H */
+int
+karlin(int low, /* Lowest score (must be negative) */
+ int high, /* Highest score (must be positive) */
+ double *pr, /* Probabilities for various scores */
+ double *lambda_p, /* Pointer to parameter lambda */
+ double *H_p) /* Pointer to parameter H */
+{
+ int i,range, nit;
+ double up,new,sum,av,beta,ftemp;
+ double lambda;
+ double *p,*ptr1;
+
+ /* Calculate the parameter lambda */
+
+ p = pr;
+ range = high-low;
+
+ /* check for E() < 0.0 */
+ sum = 0;
+ ptr1 = pr;
+ for (i=low; i <= high ; i++) sum += i* (*ptr1++);
+ if (sum >= 0.0) {
+#ifdef DEBUG
+ fprintf(stderr," (karlin lambda) non-negative expected score: %.4lg\n",
+ sum);
+#endif
+ return 0;
+ }
+
+ /* up is upper bound on lambda */
+ up=0.5;
+ do {
+ up *= 2.0;
+ ptr1=p;
+
+ beta=exp(up);
+
+ ftemp=exp(up*(low-1));
+ sum = 0.0;
+ for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
+ }
+ while (sum<1.0);
+
+ /* avoid overflow from very large lambda*S */
+/*
+ do {
+ up /= 2.0;
+ ptr1=p;
+ beta=exp(up);
+
+ ftemp=exp(up*(low-1));
+ sum = 0.0;
+ for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
+ } while (sum > 2.0);
+
+ up *= 2.0;
+*/ /* we moved past, now back up */
+
+ /* for (lambda=j=0;j<25;++j) { */
+ lambda = 0.0;
+ nit = 0;
+ while ( nit++ < MAXIT ) {
+ new = (lambda+up)/2.0;
+ beta = exp(new);
+ ftemp = exp(new*(low-1));
+ ptr1=p;
+ sum = 0.0;
+ /* multiply by exp(new) for each score */
+ for (i=0;i<=range;++i) sum+= *ptr1++ * (ftemp*=beta);
+
+ if (sum > 1.0 + TINY) up=new;
+ else {
+ if ( fabs(lambda - new) < TINY ) goto done;
+ lambda = new;
+ }
+ }
+
+ if (lambda <= 1e-10) {
+ lambda = -1.0;
+ return 0;
+ }
+
+ done:
+ *lambda_p = lambda;
+
+ /* Calculate the parameter K */
+
+ ptr1=p;
+ ftemp=exp(lambda*(low-1));
+ for (av=0.0, i=low; i<=high; ++i)
+ av+= *ptr1++ *i*(ftemp*=beta);
+ *H_p= lambda*av;
+
+ return 1; /* Parameters calculated successfully */
+}
+
+static int a_gcd (int, int);
+
+/* take a array of letters and pam information and get *lambda, *K, *H */
+static int
+karlin_k(int low, /* Lowest score (must be negative) */
+ int high, /* Highest score (must be positive) */
+ double *pr, /* Probabilities for various scores */
+ double *lambda_p, /* Pointer to parameter lambda */
+ double *K_p,
+ double *H_p) /* Pointer to parameter H */
+{
+ int i,j,range,lo,hi,first,last, nit;
+ double up,new,sum,Sum,av,beta,oldsum,ratio,ftemp;
+ double lambda;
+ double *p,*P,*ptrP,*ptr1,*ptr2;
+
+ /* Calculate the parameter lambda */
+
+ p = pr;
+ range = high-low;
+
+ /* check for E() < 0.0 */
+ sum = 0;
+ ptr1 = pr;
+ for (i=low; i <= high ; i++) sum += i* (*ptr1++);
+ if (sum >= 0.0) {
+#ifdef DEBUG
+ fprintf(stderr," (karlin lambda) non-negative expected score: %.4lg\n",
+ sum);
+#endif
+ return 0;
+ }
+
+ /* up is upper bound on lambda */
+ up=0.5;
+ do {
+ up *= 2.0;
+ ptr1=p;
+
+ beta=exp(up);
+
+ ftemp=exp(up*(low-1));
+ sum = 0.0;
+ for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
+ }
+ while (sum<1.0);
+
+ /* avoid overflow from very large lambda*S */
+ /*
+ do {
+ up /= 2.0;
+ ptr1=p;
+ beta=exp(up);
+
+ ftemp=exp(up*(low-1));
+ sum = 0.0;
+ for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
+ } while (sum > 2.0);
+
+ up *= 2.0;
+ */
+ /* we moved past, now back up */
+
+ /* for (lambda=j=0;j<25;++j) { */
+ lambda = 0.0;
+ nit = 0;
+ while ( nit++ < MAXIT ) {
+ new = (lambda+up)/2.0;
+ beta = exp(new);
+ ftemp = exp(new*(low-1));
+ ptr1=p;
+ sum = 0.0;
+ /* multiply by exp(new) for each score */
+ for (i=0;i<=range;++i) sum+= *ptr1++ * (ftemp*=beta);
+
+ if (sum > 1.0 + TINY) up=new;
+ else {
+ if ( fabs(lambda - new) < TINY ) goto done;
+ lambda = new;
+ }
+ }
+
+ if (lambda <= 1e-10) {
+ lambda = -1.0;
+ return 0;
+ }
+
+ done:
+ *lambda_p = lambda;
+
+ /* Calculate the parameter H */
+
+ ptr1=p;
+ ftemp=exp(lambda*(low-1));
+ for (av=0.0, i=low; i<=high; ++i) av+= *ptr1++ *i*(ftemp*=beta);
+ *H_p= lambda*av;
+
+ /* Calculate the pamameter K */
+ Sum=lo=hi=0;
+ P= (double *) calloc(MAXIT*range+1,sizeof(double));
+ for (*P=sum=oldsum=j=1;j<=MAXIT && sum>0.001;Sum+=sum/=j++) {
+ first=last=range;
+ for (ptrP=P+(hi+=high)-(lo+=low); ptrP>=P; *ptrP-- =sum) {
+ ptr1=ptrP-first;
+ ptr2=p+first;
+ for (sum=0,i=first; i<=last; ++i) sum += *ptr1-- * *ptr2++;
+ if (first) --first;
+ if (ptrP-P<=range) --last;
+ }
+ ftemp=exp(lambda*(lo-1));
+ for (sum=0,i=lo;i;++i) sum+= *++ptrP * (ftemp*=beta);
+ for (;i<=hi;++i) sum+= *++ptrP;
+ ratio=sum/oldsum;
+ oldsum=sum;
+ }
+ for (;j<=200;Sum+=oldsum/j++) oldsum*=ratio;
+ for (i=low;!p[i-low];++i);
+ for (j= -i;i<high && j>1;) if (p[++i-low]) j=a_gcd(j,i);
+ *K_p = (j*exp(-2*Sum))/(av*(1.0-exp(- lambda*j)));
+ free(P);
+
+ return 1; /* Parameters calculated successfully */
+}
+
+int
+a_gcd(int a, int b)
+{
+ int c;
+
+ if (b<0) b= -b;
+ if (b>a) { c=a; a=b; b=c; }
+ for (;b;b=c) { c=a%b; a=b; }
+ return a;
+}
+