JPRED-2 Add sources of all binaries (except alscript) to Git
[jpred.git] / sources / multicoil / stats.c
diff --git a/sources/multicoil/stats.c b/sources/multicoil/stats.c
new file mode 100644 (file)
index 0000000..a1dc1fc
--- /dev/null
@@ -0,0 +1,377 @@
+#include <math.h>
+#include <stdio.h>
+#include "scconst.h"
+#include "scio.h"
+
+
+/** This computes 1-erf(x) with fractional error everywhere less than **/
+/** 1.2 * 10^-7.  ***/
+/** I changed things from "float" to "double" from the numerical methods **/
+/** in C code.  **/
+
+double erfcc(x)
+double x;
+{
+       double t,z,ans;
+
+       z=fabs(x);
+       t=1.0/(1.0+0.5*z);
+       ans=t*exp(-z*z-1.26551223+t*(1.00002368+t*(0.37409196+t*(0.09678418+
+               t*(-0.18628806+t*(0.27886807+t*(-1.13520398+t*(1.48851587+
+               t*(-0.82215223+t*0.17087277)))))))));
+       return  x >= 0.0 ? ans : 2.0-ans;
+}
+
+
+
+
+double erf(double x)
+{
+  return(1-erfcc(x));
+}
+
+
+void printweight(long pfreqs[AANUM][POSNUM],
+               long pfreqp[AANUM][AANUM][POSNUM][POSNUM],
+               double gprobs[NUM_RES_TYPE],
+               double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM],
+               double weights[AANUM][POSNUM],
+               double weightp[AANUM][AANUM][POSNUM][POSNUM],
+               long totals[POSNUM],
+               long totalp[POSNUM][POSNUM],  FILE *weight_file,
+                 double pprobs[NUM_RES_TYPE][POSNUM],
+                 double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
+                 double bonnies_pprobs[NUM_RES_TYPE][POSNUM],
+                 double bonnies_pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM])
+{
+  int reg1, reg2, res1, res2;
+  double expect_prob, quantity,value;
+  long Num_samples;
+  int alert=0;
+
+  fprintf(weight_file,"\n\n SINGLE WEIGHTS \n\n");
+  fprintf(weight_file,"         "); 
+              /* 9 spaces to get register to line up in columns*/
+  for(reg1=0; reg1<POSNUM; reg1++)  {
+    fprintf(weight_file, "\t%c", 'a' + reg1);
+  }
+
+  fprintf(weight_file,"\n");
+
+  for(res1=0; res1<AANUM; res1++) {
+/* First print out the single weights. */    
+    fprintf(weight_file,"\n\n%c, weight",numaa(res1));
+    for(reg1=0; reg1<POSNUM; reg1++)  
+      fprintf(weight_file,"\t%.2lf", weights[res1][reg1]);
+
+/*  Now print out the value that led to that weight. **/
+    fprintf(weight_file,"\n%c, values",numaa(res1));
+    for(reg1=0; reg1<POSNUM; reg1++) {
+      Num_samples = totals[reg1];
+      expect_prob = exp(gprobs[res1]);
+      quantity = ((double)pfreqs[res1][reg1]- (double)Num_samples*expect_prob);
+      value= fabs(quantity)/sqrt(Num_samples*expect_prob*(1-expect_prob));
+      fprintf(weight_file,"\t%.2lf", value);
+    }
+
+
+/* Now print out the number of samples. */
+    fprintf(weight_file,"\n%c, N     ",numaa(res1));
+    for(reg1=0; reg1<POSNUM; reg1++) {
+      Num_samples = totals[reg1];
+      fprintf(weight_file,"\t%.2ld", Num_samples);
+    }
+/* Now print out the gprob values. */
+    fprintf(weight_file,"\n%c, gprobs",numaa(res1));
+    for(reg1=0; reg1<POSNUM; reg1++) {
+      expect_prob = exp(gprobs[res1]);
+      fprintf(weight_file,"\t%.2lf", expect_prob);
+    }
+/* Now print out the pprob values. */
+    fprintf(weight_file,"\n%c, pprobs",numaa(res1));
+    for(reg1=0; reg1<POSNUM; reg1++) {
+      fprintf(weight_file,"\t%.2lf", exp(pprobs[res1][reg1]));
+    }
+/* Now print out bonnie's pprob values. */
+    fprintf(weight_file,"\n%c, bprobs",numaa(res1));
+    for(reg1=0; reg1<POSNUM; reg1++) {
+      fprintf(weight_file,"\t%.2lf", exp(bonnies_pprobs[res1][reg1]));
+    }
+/* Now print out the num of pos samples values. */
+    fprintf(weight_file,"\n%c, NumPos",numaa(res1));
+    for(reg1=0; reg1<POSNUM; reg1++) {
+      fprintf(weight_file,"\t%.2ld", pfreqs[res1][reg1]);
+    }
+
+  }
+
+
+  fprintf(weight_file,"\n\n\nPAIR WEIGHTS");
+
+  for(reg1=0; reg1<POSNUM; reg1++)  {
+    fprintf(weight_file,"\n\n");
+      for(reg2=0; reg2<POSNUM; reg2++) 
+        fprintf(weight_file, "\t%c,%c", 'a' + reg1,'a' + reg2);
+    for(res1=0; res1<AANUM; res1++) {
+      for(res2=0; res2<AANUM; res2++) {
+        fprintf(weight_file,"\n\n%c,%c          ",numaa(res1),numaa(res2));
+        for(reg2=0; reg2<POSNUM; reg2++)  
+          fprintf(weight_file,"\t%.2lf", 
+                  weightp[res1][res2][reg1][reg2]);
+
+
+/* Now print out the pprob values. */
+        fprintf(weight_file,"\n%c,%c log_pprob",numaa(res1),numaa(res2));
+        for(reg2=0; reg2<POSNUM; reg2++) {
+          fprintf(weight_file,"\t%.2lf", pprobp[res1][res2][reg1][reg2]);
+        }
+/* Now print out bonnie's pprob values. */
+        fprintf(weight_file,"\n%c,%c log_bprob",numaa(res1),numaa(res2));
+        for(reg2=0; reg2<POSNUM; reg2++) {
+          fprintf(weight_file,"\t%.2lf", 
+                  bonnies_pprobp[res1][res2][reg1][reg2]);
+        }
+
+/* Now print out gprob values. */
+        fprintf(weight_file,"\n%c,%c log_gprob",numaa(res1),numaa(res2));
+        for(reg2=0; reg2<POSNUM; reg2++) {
+          fprintf(weight_file,"\t%.2lf", 
+                  gprobp[res1][res2][(reg2-reg1)%POSNUM]);
+        }
+
+/* Now print out the count on the amino acids. */
+        alert=0;
+        fprintf(weight_file,"\n%c,%c",numaa(res1),numaa(res2));
+        for(reg2=0; reg2<POSNUM; reg2++) {
+          fprintf(weight_file,"\t%.2ld", 
+                  pfreqp[res1][res2][reg1][reg2]);
+          if (pfreqp[res1][res2][reg1][reg2]<=1)
+            alert=1;
+        }
+        if (alert) fprintf(weight_file, "\nALERT: 0-1 frequency");
+
+
+      }
+    }
+  }
+}
+
+
+
+
+
+void printfreq_prob(long pfreqs[AANUM][POSNUM],
+              long pfreqp[AANUM][AANUM][POSNUM][POSNUM],
+              FILE *weight_file,
+              double pprobs[NUM_RES_TYPE][POSNUM],
+              double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM])
+{
+  int reg1, reg2, res1, res2;
+  double expect_prob, quantity,value;
+  long Num_samples;
+  int alert=0;
+
+  fprintf(weight_file,"\n\n SINGLE WEIGHTS \n\n");
+  fprintf(weight_file,"         "); 
+              /* 9 spaces to get register to line up in columns*/
+  for(reg1=0; reg1<POSNUM; reg1++)  {
+    fprintf(weight_file, "\t%c", 'a' + reg1);
+  }
+
+  fprintf(weight_file,"\n");
+
+  for(res1=0; res1<AANUM; res1++) {
+    /* Now print out the pprob values. */
+    fprintf(weight_file,"\n%c, pprobs",numaa(res1));
+    for(reg1=0; reg1<POSNUM; reg1++) {
+      fprintf(weight_file,"\t%.2lf", exp(pprobs[res1][reg1]));
+    }
+
+/* Now print out the num of pos samples values. */
+    fprintf(weight_file,"\n%c, NumPos",numaa(res1));
+    for(reg1=0; reg1<POSNUM; reg1++) {
+      fprintf(weight_file,"\t%.2ld", pfreqs[res1][reg1]);
+    }
+
+}
+
+
+  fprintf(weight_file,"\n\n\nPAIR WEIGHTS");
+
+  for(reg1=0; reg1<POSNUM; reg1++)  {
+    fprintf(weight_file,"\n\n");
+      for(reg2=0; reg2<POSNUM; reg2++) 
+       fprintf(weight_file, "\t%c,%c", 'a' + reg1,'a' + reg2);
+    for(res1=0; res1<AANUM; res1++) {
+      for(res2=0; res2<AANUM; res2++) {
+       fprintf(weight_file,"\n\n%c,%c          ",numaa(res1),numaa(res2));
+
+/* Now print out the pprob values. */
+       fprintf(weight_file,"\n%c,%c log_pprob",numaa(res1),numaa(res2));
+       for(reg2=0; reg2<POSNUM; reg2++) {
+         fprintf(weight_file,"\t%.2lf", pprobp[res1][res2][reg1][reg2]);
+       }
+/* Now print out the count on the amino acids. */
+        alert=0;
+        fprintf(weight_file,"\n%c,%c",numaa(res1),numaa(res2));
+        for(reg2=0; reg2<POSNUM; reg2++) {
+          fprintf(weight_file,"\t%.2ld", 
+                  pfreqp[res1][res2][reg1][reg2]);
+          if (pfreqp[res1][res2][reg1][reg2]<=1)
+            alert=1;
+        }
+        if (alert) fprintf(weight_file, "\nALERT: 0-1 frequency");
+
+
+
+      }
+    }
+  }
+}
+
+
+/* Let x_i be 1 if kth sample at given positions give current amino acids.*/
+/* Then by central limit thm  (sum(x_i)  *Np)/sqrt(Np(1-p)) is normal.   */
+int calcweight(long pfreqs[AANUM][POSNUM],
+              long pfreqp[AANUM][AANUM][POSNUM][POSNUM],
+              double gprobs[NUM_RES_TYPE],
+              double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM],
+              double weights[AANUM][POSNUM],
+              double weightp[AANUM][AANUM][POSNUM][POSNUM],
+              long totals[POSNUM],
+              long totalp[POSNUM][POSNUM],  FILE *weight_file)
+{
+
+  int res1, res2, pos1, pos2;
+  long Num_samples;
+  double value;
+  double  expect_prob;
+  double  quantity;
+
+
+  for (pos1=0; pos1<POSNUM; pos1++) {
+    Num_samples = totals[pos1];
+    for  (res1= 0; res1 < AANUM; res1++) {
+      expect_prob = exp(gprobs[res1]);
+      quantity = ((double)pfreqs[res1][pos1]- (double)Num_samples*expect_prob);
+      value= fabs(quantity)/sqrt(Num_samples*expect_prob*(1-expect_prob));
+      weights[res1][pos1] = erf(value/sqrt(2));
+      if (weights[res1][pos1]>1) weights[res1][pos1]=1;
+    }
+  }
+
+      
+  for (pos1=0; pos1<POSNUM; pos1++) 
+    for (pos2=0; pos2<POSNUM; pos2++) {
+      Num_samples = totalp[pos1][pos2];
+      for  (res1= 0; res1 < AANUM; res1++) 
+       for  (res2= 0; res2 < AANUM; res2++) {
+             expect_prob = exp(gprobp[res1][res2][(res2 -res1)%POSNUM]);
+             quantity = ((double)pfreqp[res1][res2][pos1][pos2] -
+                        (double)Num_samples*expect_prob); 
+             value= fabs(quantity)/
+                        sqrt(Num_samples*expect_prob*(1-expect_prob));
+             weightp[res1][res2][pos1][pos2] = erf(value/sqrt(2));
+             if (weightp[res1][res2][pos1][pos2] >1)
+                   weightp[res1][res2][pos1][pos2]=1;
+
+           }
+    }
+
+/******* Now done in PairCoilData in sc2seq_interface.c
+  printweight(pfreqs,pfreqp,gprobs,gprobp,weights, weightp, totals,totalp,weight_file);
+*********/
+}
+
+
+
+/** Give the combination table a wt. that is the max of the wt. in the **/
+/** other two tables.   i.e. if it  is significant in one table, it should **/
+/** be significant in the combination??? Could just average???             **/
+
+
+void combine_many_weight(int current_table)
+{
+  int reg1, reg2, res1, res2;
+/*********
+  extern double many_weights[MAX_TABLE_NUMBER][AANUM][POSNUM];
+  extern double many_weightp[MAX_TABLE_NUMBER][AANUM][AANUM][POSNUM][POSNUM];
+**********/
+  double many_weights[MAX_TABLE_NUMBER][AANUM][POSNUM];
+  double many_weightp[MAX_TABLE_NUMBER][AANUM][AANUM][POSNUM][POSNUM];
+
+  for (res1=0; res1<AANUM; res1++)
+    for (reg1=0; reg1<POSNUM; reg1++) {
+      if (many_weights[0][res1][reg1] > many_weights[1][res1][reg1])
+       many_weights[current_table][res1][reg1] = 
+                            many_weights[0][res1][reg1];
+      else many_weights[current_table][res1][reg1] = 
+                             many_weights[1][res1][reg1];
+
+      for (res2=0; res2<AANUM; res2++)
+       for (reg2=0; reg2<POSNUM; reg2++)
+         if (many_weightp[0][res1][res2][reg1][reg2] > 
+               many_weightp[1][res1][res2][reg1][reg2])
+           many_weightp[current_table][res1][res2][reg1][reg2] = 
+             many_weightp[0][res1][res2][reg1][reg2]; 
+           else  many_weightp[current_table][res1][res2][reg1][reg2] = 
+                                many_weightp[1][res1][res2][reg1][reg2];
+    }
+}
+
+
+
+/** Note that these weights should all be <= 1....Maybe even sum to 1 if ***/
+/** want to use them is a weighted avg.  But in "geometric averaging" don't */
+/** have to sum to 1. Note that the bigger the sum though, the smaller **/
+/** the output prob.   **/
+void calc_distance_weights(double dist_weight[POSNUM])
+{
+  int i;
+
+  dist_weight[0]= .3;
+  dist_weight[1]= .1;
+  dist_weight[2]= .3;
+  dist_weight[3]= .3;
+  dist_weight[4]= .2;
+  dist_weight[5]= .05;
+  dist_weight[6]= .1;
+  
+  for(i=0; i<POSNUM; i++)
+    dist_weight[i]= 1;
+}
+  
+  
+
+double hard_cutoff(double x, double cutoff)
+{
+  double result;
+
+  if (x>cutoff) return(1);
+  else return(0);
+}
+
+/*** Want a sigma that maps 0 to 0 and 1 to 1.  ***/
+/*** middle is the value that things should have things above it be mapped */
+/*** towards 1, and below it mapped  towards 0.  **/
+/*** Use 1+middle since want 0 mapped to 0 and middle to middle.  **/
+/*** The larger power is, the more values are mapped towards the **/
+/*** extremes.                                                   **/
+double sigma(double x, int power, double middle)
+{
+  double result;
+
+  if (power ==-1)  /** signal to do hardcutoff **/
+    return(hard_cutoff(x,middle));
+
+  if (power == 0)  /** Just do nothing. **/
+    return(x);
+
+  if (x<=middle)
+    result= pow(1+middle,pow(x/middle,power)) - 1;
+  else
+    result= 2-pow(2-middle,pow( (1 - x)/(1-middle),power)); 
+
+  return(result);
+}
+