JPRED-2 Add sources of all binaries (except alscript) to Git
[jpred.git] / sources / multicoil / stats.c
1 #include <math.h>
2 #include <stdio.h>
3 #include "scconst.h"
4 #include "scio.h"
5
6
7 /** This computes 1-erf(x) with fractional error everywhere less than **/
8 /** 1.2 * 10^-7.  ***/
9 /** I changed things from "float" to "double" from the numerical methods **/
10 /** in C code.  **/
11
12 double erfcc(x)
13 double x;
14 {
15         double t,z,ans;
16
17         z=fabs(x);
18         t=1.0/(1.0+0.5*z);
19         ans=t*exp(-z*z-1.26551223+t*(1.00002368+t*(0.37409196+t*(0.09678418+
20                 t*(-0.18628806+t*(0.27886807+t*(-1.13520398+t*(1.48851587+
21                 t*(-0.82215223+t*0.17087277)))))))));
22         return  x >= 0.0 ? ans : 2.0-ans;
23 }
24
25
26
27
28 double erf(double x)
29 {
30   return(1-erfcc(x));
31 }
32
33
34 void printweight(long pfreqs[AANUM][POSNUM],
35                long pfreqp[AANUM][AANUM][POSNUM][POSNUM],
36                double gprobs[NUM_RES_TYPE],
37                double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM],
38                double weights[AANUM][POSNUM],
39                double weightp[AANUM][AANUM][POSNUM][POSNUM],
40                long totals[POSNUM],
41                long totalp[POSNUM][POSNUM],  FILE *weight_file,
42                  double pprobs[NUM_RES_TYPE][POSNUM],
43                  double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
44                  double bonnies_pprobs[NUM_RES_TYPE][POSNUM],
45                  double bonnies_pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM])
46 {
47   int reg1, reg2, res1, res2;
48   double expect_prob, quantity,value;
49   long Num_samples;
50   int alert=0;
51
52   fprintf(weight_file,"\n\n SINGLE WEIGHTS \n\n");
53   fprintf(weight_file,"         "); 
54               /* 9 spaces to get register to line up in columns*/
55   for(reg1=0; reg1<POSNUM; reg1++)  {
56     fprintf(weight_file, "\t%c", 'a' + reg1);
57   }
58
59   fprintf(weight_file,"\n");
60
61   for(res1=0; res1<AANUM; res1++) {
62 /* First print out the single weights. */    
63     fprintf(weight_file,"\n\n%c, weight",numaa(res1));
64     for(reg1=0; reg1<POSNUM; reg1++)  
65       fprintf(weight_file,"\t%.2lf", weights[res1][reg1]);
66
67 /*  Now print out the value that led to that weight. **/
68     fprintf(weight_file,"\n%c, values",numaa(res1));
69     for(reg1=0; reg1<POSNUM; reg1++) {
70       Num_samples = totals[reg1];
71       expect_prob = exp(gprobs[res1]);
72       quantity = ((double)pfreqs[res1][reg1]- (double)Num_samples*expect_prob);
73       value= fabs(quantity)/sqrt(Num_samples*expect_prob*(1-expect_prob));
74       fprintf(weight_file,"\t%.2lf", value);
75     }
76
77
78 /* Now print out the number of samples. */
79     fprintf(weight_file,"\n%c, N     ",numaa(res1));
80     for(reg1=0; reg1<POSNUM; reg1++) {
81       Num_samples = totals[reg1];
82       fprintf(weight_file,"\t%.2ld", Num_samples);
83     }
84 /* Now print out the gprob values. */
85     fprintf(weight_file,"\n%c, gprobs",numaa(res1));
86     for(reg1=0; reg1<POSNUM; reg1++) {
87       expect_prob = exp(gprobs[res1]);
88       fprintf(weight_file,"\t%.2lf", expect_prob);
89     }
90 /* Now print out the pprob values. */
91     fprintf(weight_file,"\n%c, pprobs",numaa(res1));
92     for(reg1=0; reg1<POSNUM; reg1++) {
93       fprintf(weight_file,"\t%.2lf", exp(pprobs[res1][reg1]));
94     }
95 /* Now print out bonnie's pprob values. */
96     fprintf(weight_file,"\n%c, bprobs",numaa(res1));
97     for(reg1=0; reg1<POSNUM; reg1++) {
98       fprintf(weight_file,"\t%.2lf", exp(bonnies_pprobs[res1][reg1]));
99     }
100 /* Now print out the num of pos samples values. */
101     fprintf(weight_file,"\n%c, NumPos",numaa(res1));
102     for(reg1=0; reg1<POSNUM; reg1++) {
103       fprintf(weight_file,"\t%.2ld", pfreqs[res1][reg1]);
104     }
105
106   }
107
108
109   fprintf(weight_file,"\n\n\nPAIR WEIGHTS");
110
111   for(reg1=0; reg1<POSNUM; reg1++)  {
112     fprintf(weight_file,"\n\n");
113       for(reg2=0; reg2<POSNUM; reg2++) 
114         fprintf(weight_file, "\t%c,%c", 'a' + reg1,'a' + reg2);
115     for(res1=0; res1<AANUM; res1++) {
116       for(res2=0; res2<AANUM; res2++) {
117         fprintf(weight_file,"\n\n%c,%c          ",numaa(res1),numaa(res2));
118         for(reg2=0; reg2<POSNUM; reg2++)  
119           fprintf(weight_file,"\t%.2lf", 
120                   weightp[res1][res2][reg1][reg2]);
121
122
123 /* Now print out the pprob values. */
124         fprintf(weight_file,"\n%c,%c log_pprob",numaa(res1),numaa(res2));
125         for(reg2=0; reg2<POSNUM; reg2++) {
126           fprintf(weight_file,"\t%.2lf", pprobp[res1][res2][reg1][reg2]);
127         }
128 /* Now print out bonnie's pprob values. */
129         fprintf(weight_file,"\n%c,%c log_bprob",numaa(res1),numaa(res2));
130         for(reg2=0; reg2<POSNUM; reg2++) {
131           fprintf(weight_file,"\t%.2lf", 
132                   bonnies_pprobp[res1][res2][reg1][reg2]);
133         }
134
135 /* Now print out gprob values. */
136         fprintf(weight_file,"\n%c,%c log_gprob",numaa(res1),numaa(res2));
137         for(reg2=0; reg2<POSNUM; reg2++) {
138           fprintf(weight_file,"\t%.2lf", 
139                   gprobp[res1][res2][(reg2-reg1)%POSNUM]);
140         }
141
142 /* Now print out the count on the amino acids. */
143         alert=0;
144         fprintf(weight_file,"\n%c,%c",numaa(res1),numaa(res2));
145         for(reg2=0; reg2<POSNUM; reg2++) {
146           fprintf(weight_file,"\t%.2ld", 
147                   pfreqp[res1][res2][reg1][reg2]);
148           if (pfreqp[res1][res2][reg1][reg2]<=1)
149             alert=1;
150         }
151         if (alert) fprintf(weight_file, "\nALERT: 0-1 frequency");
152
153
154       }
155     }
156   }
157 }
158
159
160
161
162
163 void printfreq_prob(long pfreqs[AANUM][POSNUM],
164                long pfreqp[AANUM][AANUM][POSNUM][POSNUM],
165                FILE *weight_file,
166                double pprobs[NUM_RES_TYPE][POSNUM],
167                double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM])
168 {
169   int reg1, reg2, res1, res2;
170   double expect_prob, quantity,value;
171   long Num_samples;
172   int alert=0;
173
174   fprintf(weight_file,"\n\n SINGLE WEIGHTS \n\n");
175   fprintf(weight_file,"         "); 
176               /* 9 spaces to get register to line up in columns*/
177   for(reg1=0; reg1<POSNUM; reg1++)  {
178     fprintf(weight_file, "\t%c", 'a' + reg1);
179   }
180
181   fprintf(weight_file,"\n");
182
183   for(res1=0; res1<AANUM; res1++) {
184     /* Now print out the pprob values. */
185     fprintf(weight_file,"\n%c, pprobs",numaa(res1));
186     for(reg1=0; reg1<POSNUM; reg1++) {
187       fprintf(weight_file,"\t%.2lf", exp(pprobs[res1][reg1]));
188     }
189
190 /* Now print out the num of pos samples values. */
191     fprintf(weight_file,"\n%c, NumPos",numaa(res1));
192     for(reg1=0; reg1<POSNUM; reg1++) {
193       fprintf(weight_file,"\t%.2ld", pfreqs[res1][reg1]);
194     }
195
196 }
197
198
199   fprintf(weight_file,"\n\n\nPAIR WEIGHTS");
200
201   for(reg1=0; reg1<POSNUM; reg1++)  {
202     fprintf(weight_file,"\n\n");
203       for(reg2=0; reg2<POSNUM; reg2++) 
204         fprintf(weight_file, "\t%c,%c", 'a' + reg1,'a' + reg2);
205     for(res1=0; res1<AANUM; res1++) {
206       for(res2=0; res2<AANUM; res2++) {
207         fprintf(weight_file,"\n\n%c,%c          ",numaa(res1),numaa(res2));
208
209 /* Now print out the pprob values. */
210         fprintf(weight_file,"\n%c,%c log_pprob",numaa(res1),numaa(res2));
211         for(reg2=0; reg2<POSNUM; reg2++) {
212           fprintf(weight_file,"\t%.2lf", pprobp[res1][res2][reg1][reg2]);
213         }
214 /* Now print out the count on the amino acids. */
215         alert=0;
216         fprintf(weight_file,"\n%c,%c",numaa(res1),numaa(res2));
217         for(reg2=0; reg2<POSNUM; reg2++) {
218           fprintf(weight_file,"\t%.2ld", 
219                   pfreqp[res1][res2][reg1][reg2]);
220           if (pfreqp[res1][res2][reg1][reg2]<=1)
221             alert=1;
222         }
223         if (alert) fprintf(weight_file, "\nALERT: 0-1 frequency");
224
225
226
227       }
228     }
229   }
230 }
231
232
233 /* Let x_i be 1 if kth sample at given positions give current amino acids.*/
234 /* Then by central limit thm  (sum(x_i)  *Np)/sqrt(Np(1-p)) is normal.   */
235 int calcweight(long pfreqs[AANUM][POSNUM],
236                long pfreqp[AANUM][AANUM][POSNUM][POSNUM],
237                double gprobs[NUM_RES_TYPE],
238                double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM],
239                double weights[AANUM][POSNUM],
240                double weightp[AANUM][AANUM][POSNUM][POSNUM],
241                long totals[POSNUM],
242                long totalp[POSNUM][POSNUM],  FILE *weight_file)
243 {
244
245   int res1, res2, pos1, pos2;
246   long Num_samples;
247   double value;
248   double  expect_prob;
249   double  quantity;
250
251
252   for (pos1=0; pos1<POSNUM; pos1++) {
253     Num_samples = totals[pos1];
254     for  (res1= 0; res1 < AANUM; res1++) {
255       expect_prob = exp(gprobs[res1]);
256       quantity = ((double)pfreqs[res1][pos1]- (double)Num_samples*expect_prob);
257       value= fabs(quantity)/sqrt(Num_samples*expect_prob*(1-expect_prob));
258       weights[res1][pos1] = erf(value/sqrt(2));
259       if (weights[res1][pos1]>1) weights[res1][pos1]=1;
260     }
261   }
262
263       
264   for (pos1=0; pos1<POSNUM; pos1++) 
265     for (pos2=0; pos2<POSNUM; pos2++) {
266       Num_samples = totalp[pos1][pos2];
267       for  (res1= 0; res1 < AANUM; res1++) 
268         for  (res2= 0; res2 < AANUM; res2++) {
269               expect_prob = exp(gprobp[res1][res2][(res2 -res1)%POSNUM]);
270               quantity = ((double)pfreqp[res1][res2][pos1][pos2] -
271                          (double)Num_samples*expect_prob); 
272               value= fabs(quantity)/
273                          sqrt(Num_samples*expect_prob*(1-expect_prob));
274               weightp[res1][res2][pos1][pos2] = erf(value/sqrt(2));
275               if (weightp[res1][res2][pos1][pos2] >1)
276                    weightp[res1][res2][pos1][pos2]=1;
277
278             }
279     }
280
281 /******* Now done in PairCoilData in sc2seq_interface.c
282   printweight(pfreqs,pfreqp,gprobs,gprobp,weights, weightp, totals,totalp,weight_file);
283 *********/
284 }
285
286
287
288 /** Give the combination table a wt. that is the max of the wt. in the **/
289 /** other two tables.   i.e. if it  is significant in one table, it should **/
290 /** be significant in the combination??? Could just average???             **/
291
292
293 void combine_many_weight(int current_table)
294 {
295   int reg1, reg2, res1, res2;
296 /*********
297   extern double many_weights[MAX_TABLE_NUMBER][AANUM][POSNUM];
298   extern double many_weightp[MAX_TABLE_NUMBER][AANUM][AANUM][POSNUM][POSNUM];
299 **********/
300   double many_weights[MAX_TABLE_NUMBER][AANUM][POSNUM];
301   double many_weightp[MAX_TABLE_NUMBER][AANUM][AANUM][POSNUM][POSNUM];
302
303   for (res1=0; res1<AANUM; res1++)
304     for (reg1=0; reg1<POSNUM; reg1++) {
305       if (many_weights[0][res1][reg1] > many_weights[1][res1][reg1])
306         many_weights[current_table][res1][reg1] = 
307                              many_weights[0][res1][reg1];
308       else many_weights[current_table][res1][reg1] = 
309                               many_weights[1][res1][reg1];
310
311       for (res2=0; res2<AANUM; res2++)
312         for (reg2=0; reg2<POSNUM; reg2++)
313           if (many_weightp[0][res1][res2][reg1][reg2] > 
314                 many_weightp[1][res1][res2][reg1][reg2])
315             many_weightp[current_table][res1][res2][reg1][reg2] = 
316               many_weightp[0][res1][res2][reg1][reg2]; 
317             else  many_weightp[current_table][res1][res2][reg1][reg2] = 
318                                  many_weightp[1][res1][res2][reg1][reg2];
319     }
320 }
321
322
323
324 /** Note that these weights should all be <= 1....Maybe even sum to 1 if ***/
325 /** want to use them is a weighted avg.  But in "geometric averaging" don't */
326 /** have to sum to 1. Note that the bigger the sum though, the smaller **/
327 /** the output prob.   **/
328 void calc_distance_weights(double dist_weight[POSNUM])
329 {
330   int i;
331
332   dist_weight[0]= .3;
333   dist_weight[1]= .1;
334   dist_weight[2]= .3;
335   dist_weight[3]= .3;
336   dist_weight[4]= .2;
337   dist_weight[5]= .05;
338   dist_weight[6]= .1;
339   
340   for(i=0; i<POSNUM; i++)
341     dist_weight[i]= 1;
342 }
343   
344   
345
346 double hard_cutoff(double x, double cutoff)
347 {
348   double result;
349
350   if (x>cutoff) return(1);
351   else return(0);
352 }
353
354 /*** Want a sigma that maps 0 to 0 and 1 to 1.  ***/
355 /*** middle is the value that things should have things above it be mapped */
356 /*** towards 1, and below it mapped  towards 0.  **/
357 /*** Use 1+middle since want 0 mapped to 0 and middle to middle.  **/
358 /*** The larger power is, the more values are mapped towards the **/
359 /*** extremes.                                                   **/
360 double sigma(double x, int power, double middle)
361 {
362   double result;
363
364   if (power ==-1)  /** signal to do hardcutoff **/
365     return(hard_cutoff(x,middle));
366
367   if (power == 0)  /** Just do nothing. **/
368     return(x);
369
370   if (x<=middle)
371     result= pow(1+middle,pow(x/middle,power)) - 1;
372   else
373     result= 2-pow(2-middle,pow( (1 - x)/(1-middle),power)); 
374
375   return(result);
376 }
377