1 /**************** Statistical Significance Parameter Subroutine ****************
3 $Name: fa_34_26_5 $ - $Id: karlin.c,v 1.18 2006/06/01 16:05:30 wrp Exp $
5 Version 1.0 February 2, 1990
6 Version 2.0 March 18, 1993
8 Program by: Stephen Altschul
10 Address: National Center for Biotechnology Information
11 National Library of Medicine
12 National Institutes of Health
15 Internet: altschul@ncbi.nlm.nih.gov
17 See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
18 Significance of Molecular Sequence Features by Using General Scoring
19 Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.
21 Computes the parameters lambda and K for use in calculating the
22 statistical significance of high-scoring segments or subalignments.
24 The scoring scheme must be integer valued. A positive score must be
25 possible, but the expected (mean) score must be negative.
27 A program that calls this routine must provide the value of the lowest
28 possible score, the value of the greatest possible score, and a pointer
29 to an array of probabilities for the occurence of all scores between
30 these two extreme scores. For example, if score -2 occurs with
31 probability 0.7, score 0 occurs with probability 0.1, and score 3
32 occurs with probability 0.2, then the subroutine must be called with
33 low = -2, high = 3, and pr pointing to the array of values
34 { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide
35 pointers to lambda and K; the subroutine will then calculate the values
36 of these two parameters. In this example, lambda=0.330 and K=0.154.
38 The parameters lambda and K can be used as follows. Suppose we are
39 given a length N random sequence of independent letters. Associated
40 with each letter is a score, and the probabilities of the letters
41 determine the probability for each score. Let S be the aggregate score
42 of the highest scoring contiguous segment of this sequence. Then if N
43 is sufficiently large (greater than 100), the following bound on the
44 probability that S is greater than or equal to x applies:
46 P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ].
48 In other words, the p-value for this segment can be written as
49 1-exp[-KN*exp(-lambda*S)].
51 This formula can be applied to pairwise sequence comparison by assigning
52 scores to pairs of letters (e.g. amino acids), and by replacing N in the
53 formula with N*M, where N and M are the lengths of the two sequences
56 In addition, letting y = KN*exp(-lambda*S), the p-value for finding m
57 distinct segments all with score >= S is given by:
60 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e
62 Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
63 as the previous formula.
65 *******************************************************************************/
71 #define MAXIT 25 /* Maximum number of iterations used in calculating lambda */
77 /* first build a residue map to automatically put residues in score bins */
82 /* initialize the Karlin frequency, probability arrays using
83 a specific query sequence */
85 int karlin(int , int, double *, double *, double *);
86 static int karlin_k(int , int , double *, double *, double *, double *);
88 void init_karlin(const unsigned char *aa0, int n0, struct pstruct *ppst,
89 double *aa0_f, double **kp)
91 int kar_nsq, kar_range, kar_min, kar_max;
93 const unsigned char *aa0p;
98 kar_range = ppst->pam_h - ppst->pam_l + 1;
100 if ((kar_p=(double *)calloc(kar_range+1,sizeof(double)))==NULL) {
101 fprintf(stderr," cannot allocate kar_p array: %d\n",kar_range+1);
106 kar_nsq = ppst->nsq; /* alphabet size */
107 kar_min = ppst->pam_l; /* low pam value */
108 kar_max = ppst->pam_h; /* high pam value */
110 /* must have at least 1 residue of each type */
112 for (i=1; i<=kar_nsq; i++) r_cnt[i]=1;
114 fn0 = 100.0/(double)(n0+kar_nsq); /* weight of each residue */
117 /* increment residue count for each residue in query sequence */
118 while (*aa0p) r_cnt[ppst->hsqx[*aa0p++]]++;
120 /* map all unmapped residues to 'X' */
121 r_cnt[NMAP_X] += r_cnt[NMAP];
123 for (i=1; i<=kar_nsq; i++) aa0_f[i] = fn0*(double)r_cnt[i];
126 double nt_f[] = {0.0, 0.25, 0.25, 0.25, 0.25 };
128 /* Robinson and Robinson frequencies */
131 /* A */ 0.0780474700897585,
132 /* R */ 0.0512953149316987,
133 /* N */ 0.0448725775979007,
134 /* D */ 0.0536397361638076,
135 /* C */ 0.0192460110427568,
136 /* Q */ 0.0426436013507063,
137 /* E */ 0.0629485981204668,
138 /* G */ 0.0737715654561964,
139 /* H */ 0.0219922696262025,
140 /* I */ 0.0514196403000682,
141 /* L */ 0.090191394464413,
142 /* K */ 0.0574383201866657,
143 /* M */ 0.0224251883196316,
144 /* F */ 0.0385564048655621,
145 /* P */ 0.0520279465667327,
146 /* S */ 0.0711984743501224,
147 /* T */ 0.0584129422708473,
148 /* W */ 0.013298374223799,
149 /* Y */ 0.0321647488738564,
150 /* V */ 0.0644094211988074};
152 /* initialize the Karlin frequency, probability arrays using
153 an "average" composition (average length if n0 <=0) */
156 init_karlin_a(struct pstruct *ppst, double *aa0_f, double **kp)
158 int kar_nsq, kar_range;
163 kar_range = ppst->pam_h - ppst->pam_l + 1;
165 if ((kar_p=(double *)calloc(kar_range+1,sizeof(double)))==NULL) {
166 fprintf(stderr," cannot allocate kar_p array: %d\n",kar_range+1);
172 if (ppst->nt_align) {
174 for (i=1; i<=kar_nsq; i++) aa0_f[i] = nt_f[i];
176 else if (ppst->dnaseq==SEQT_PROT || ppst->dnaseq == SEQT_UNK) {
178 for (i=1; i<=kar_nsq; i++) aa0_f[i] = aa_f[i];
182 fn0 = 1.0/(double)(kar_nsq-1);
183 for (i=1; i< kar_nsq; i++) aa0_f[i] = fn0;
189 /* calculate set up karlin() to calculate Lambda, K, by calculating
192 do_karlin(const unsigned char *aa1, int n1,
193 int **pam2, struct pstruct *ppst,
194 double *aa0_f, double *kar_p, double *lambda, double *H)
196 register unsigned const char *aap;
197 int kar_range, kar_min, kar_max, kar_nsq;
204 kar_min = ppst->pam_l;
205 kar_max = ppst->pam_h;
206 kar_range = kar_max - kar_min + 1;
209 for (i=1; i<=kar_nsq; i++) r_cnt[i]=1;
214 while (*aap) r_cnt[ppst->hsqx[*aap++]]++;
216 r_cnt[NMAP_X] += r_cnt[NMAP];
218 /* residue frequencies */
219 fn1 = 100.0/(double)(n1+kar_nsq);
220 for (i=1; i<=kar_nsq; i++) aa1_f[i]= fn1*(double)r_cnt[i];
222 for (i=0; i<=kar_range; i++) kar_p[i] = 0.0;
224 for (i=1; i<=kar_nsq; i++) {
225 for (j=1; j<=kar_nsq; j++)
226 kar_p[pam2[i][j]-kar_min] += aa0_f[i]*aa1_f[j];
230 for (i=0; i<=kar_range; i++) kar_tot += kar_p[i];
231 if (kar_tot <= 0.00001) return 0;
233 for (i=0; i<=kar_range; i++) kar_p[i] /= kar_tot;
235 return karlin(kar_min, kar_max, kar_p, lambda, H);
239 do_karlin_a(int **pam2, struct pstruct *ppst,
240 double *aa0_f, double *kar_p, double *lambda, double *K, double *H)
243 int kar_range, kar_min, kar_max, kar_nsq;
248 kar_min = ppst->pam_l;
249 kar_max = ppst->pam_h;
250 kar_range = kar_max - kar_min + 1;
253 if (ppst->nt_align ) {
256 for (i=1; i<=kar_nsq; i++) {kar_tot += aa1fp[i];}
257 for (i=1; i<=kar_nsq; i++) {aa1_f[i]= aa1fp[i]/kar_tot;}
259 else if (!ppst->nt_align) {
262 for (i=1; i<=kar_nsq; i++) {kar_tot += aa1fp[i];}
263 for (i=1; i<=kar_nsq; i++) {aa1_f[i]= aa1fp[i]/kar_tot;}
267 fn1 = 1.0/(double)(kar_nsq-1);
268 for (i=1; i< kar_nsq; i++) aa1_f[i] = fn1;
272 for (i=0; i<=kar_range; i++) kar_p[i] = 0.0;
274 for (i=1; i<=kar_nsq; i++) {
275 for (j=1; j<kar_nsq; j++)
276 kar_p[pam2[i][j]-kar_min] += aa0_f[i]*aa1_f[j];
280 for (i=0; i<=kar_range; i++) kar_tot += kar_p[i];
281 if (kar_tot <= 0.00001) return 0;
283 for (i=0; i<=kar_range; i++) kar_p[i] /= kar_tot;
285 return karlin_k(kar_min, kar_max, kar_p, lambda, K, H);
288 /* take a array of letters and pam information and get *lambda, *H */
290 karlin(int low, /* Lowest score (must be negative) */
291 int high, /* Highest score (must be positive) */
292 double *pr, /* Probabilities for various scores */
293 double *lambda_p, /* Pointer to parameter lambda */
294 double *H_p) /* Pointer to parameter H */
297 double up,new,sum,av,beta,ftemp;
301 /* Calculate the parameter lambda */
306 /* check for E() < 0.0 */
309 for (i=low; i <= high ; i++) sum += i* (*ptr1++);
312 fprintf(stderr," (karlin lambda) non-negative expected score: %.4lg\n",
318 /* up is upper bound on lambda */
326 ftemp=exp(up*(low-1));
328 for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
332 /* avoid overflow from very large lambda*S */
339 ftemp=exp(up*(low-1));
341 for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
345 */ /* we moved past, now back up */
347 /* for (lambda=j=0;j<25;++j) { */
350 while ( nit++ < MAXIT ) {
351 new = (lambda+up)/2.0;
353 ftemp = exp(new*(low-1));
356 /* multiply by exp(new) for each score */
357 for (i=0;i<=range;++i) sum+= *ptr1++ * (ftemp*=beta);
359 if (sum > 1.0 + TINY) up=new;
361 if ( fabs(lambda - new) < TINY ) goto done;
366 if (lambda <= 1e-10) {
374 /* Calculate the parameter K */
377 ftemp=exp(lambda*(low-1));
378 for (av=0.0, i=low; i<=high; ++i)
379 av+= *ptr1++ *i*(ftemp*=beta);
382 return 1; /* Parameters calculated successfully */
385 static int a_gcd (int, int);
387 /* take a array of letters and pam information and get *lambda, *K, *H */
389 karlin_k(int low, /* Lowest score (must be negative) */
390 int high, /* Highest score (must be positive) */
391 double *pr, /* Probabilities for various scores */
392 double *lambda_p, /* Pointer to parameter lambda */
394 double *H_p) /* Pointer to parameter H */
396 int i,j,range,lo,hi,first,last, nit;
397 double up,new,sum,Sum,av,beta,oldsum,ratio,ftemp;
399 double *p,*P,*ptrP,*ptr1,*ptr2;
401 /* Calculate the parameter lambda */
406 /* check for E() < 0.0 */
409 for (i=low; i <= high ; i++) sum += i* (*ptr1++);
412 fprintf(stderr," (karlin lambda) non-negative expected score: %.4lg\n",
418 /* up is upper bound on lambda */
426 ftemp=exp(up*(low-1));
428 for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
432 /* avoid overflow from very large lambda*S */
439 ftemp=exp(up*(low-1));
441 for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
446 /* we moved past, now back up */
448 /* for (lambda=j=0;j<25;++j) { */
451 while ( nit++ < MAXIT ) {
452 new = (lambda+up)/2.0;
454 ftemp = exp(new*(low-1));
457 /* multiply by exp(new) for each score */
458 for (i=0;i<=range;++i) sum+= *ptr1++ * (ftemp*=beta);
460 if (sum > 1.0 + TINY) up=new;
462 if ( fabs(lambda - new) < TINY ) goto done;
467 if (lambda <= 1e-10) {
475 /* Calculate the parameter H */
478 ftemp=exp(lambda*(low-1));
479 for (av=0.0, i=low; i<=high; ++i) av+= *ptr1++ *i*(ftemp*=beta);
482 /* Calculate the pamameter K */
484 P= (double *) calloc(MAXIT*range+1,sizeof(double));
485 for (*P=sum=oldsum=j=1;j<=MAXIT && sum>0.001;Sum+=sum/=j++) {
487 for (ptrP=P+(hi+=high)-(lo+=low); ptrP>=P; *ptrP-- =sum) {
490 for (sum=0,i=first; i<=last; ++i) sum += *ptr1-- * *ptr2++;
492 if (ptrP-P<=range) --last;
494 ftemp=exp(lambda*(lo-1));
495 for (sum=0,i=lo;i;++i) sum+= *++ptrP * (ftemp*=beta);
496 for (;i<=hi;++i) sum+= *++ptrP;
500 for (;j<=200;Sum+=oldsum/j++) oldsum*=ratio;
501 for (i=low;!p[i-low];++i);
502 for (j= -i;i<high && j>1;) if (p[++i-low]) j=a_gcd(j,i);
503 *K_p = (j*exp(-2*Sum))/(av*(1.0-exp(- lambda*j)));
506 return 1; /* Parameters calculated successfully */
515 if (b>a) { c=a; a=b; b=c; }
516 for (;b;b=c) { c=a%b; a=b; }