JPRED-2 Add sources of all binaries (except alscript) to Git
[jpred.git] / sources / multicoil / estimate_probabilities.c
diff --git a/sources/multicoil/estimate_probabilities.c b/sources/multicoil/estimate_probabilities.c
new file mode 100644 (file)
index 0000000..4d86185
--- /dev/null
@@ -0,0 +1,478 @@
+#include <stdio.h>
+#include <math.h>
+#include "sc.h"
+#include "scscore.h"
+#include "scconst.h"
+#include "options.h"
+#include "switches.h"
+#include "stats.h"
+
+/** The following are defined in scscore.c ***/
+extern double gprobs[NUM_RES_TYPE], gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM];
+
+/*** #define SCALE0         5   ***/
+/*** Now an extern variable ***/      /* scaling value for 0 probs */
+extern double SCALE0;
+extern double gprobs[NUM_RES_TYPE], gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM];
+
+/* Takes the log of a double */
+double getdlog(double num)
+{
+  if (num)
+    return log(num);
+  else
+    return LOG0VAL;
+}
+
+/* Takes the log of a long int */
+double getlog(long num)
+{
+  if (num) {
+    return log((double) num);
+  }
+  else
+    return LOG0VAL;
+}
+
+
+/**********************************************************************/
+
+
+void estimate_database_probs2(int tab, 
+        long freqs[AANUM][POSNUM], long totals[POSNUM], 
+        double many_pprobs[MAX_TABLE_NUMBER][NUM_RES_TYPE][POSNUM],  
+        long freqp[AANUM][AANUM][POSNUM][POSNUM], long totalp[POSNUM][POSNUM],
+        double many_pprobp[MAX_TABLE_NUMBER]
+                            [NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
+        int ProlineFreeWin)
+{
+  extern double SCALE0;
+  extern double scale0s[MAX_TABLE_NUMBER],scale0p[MAX_TABLE_NUMBER];
+
+
+  SCALE0=  scale0s[tab];
+  calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin);
+
+  SCALE0=  scale0p[tab];
+  calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab], 
+              ProlineFreeWin);
+}
+
+
+
+
+/*************************************************************************/
+
+/****************************************************************************/
+/** If the file fgin == NULL so is not set in config file, use uniform dist. */
+
+int setgprob_to_uniform(double gprobs[NUM_RES_TYPE],
+                       double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM])
+{
+  int res1, res2, dist;
+
+  for (res1=0; res1<AANUM; res1++) {
+    gprobs[res1] = 1/AANUM;
+
+    for (res2=0; res2<AANUM; res2++)
+      for(dist=0; dist<POSNUM; dist++)
+       gprobp[res1][res2][dist] = 1/(AANUM*AANUM);
+  }
+}
+
+/****************************************************************************/
+
+
+/* Calculates log of genbnk probabilities for a residue. */
+int calcgprobs(long freqs[AANUM], long totals, 
+               double probs[NUM_RES_TYPE])
+{
+  int i;
+  double tal;
+  
+  tal = getlog(totals);
+  for (i = 0; i < AANUM; i++) {
+    probs[i] = getlog(freqs[i]) - tal;
+  }
+  return 1;
+}
+
+
+
+
+/* Calculates log of genbnk probabilities of pairs of residues at distance k.*/
+int calcgprobp(long freqp[AANUM][AANUM][POSNUM], 
+               long totalp[POSNUM],
+               double probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM])
+{
+  int i, j, k;
+  double tal;
+  
+  for (k = 0; k < POSNUM; k++) {
+    tal = getlog(totalp[k]);
+    for (i = 0; i < AANUM; i++)
+      for (j = 0; j < AANUM; j++)
+        probp[i][j][k] = getlog(freqp[i][j][k]) - tal;
+  }
+}
+
+/*****************************************************************************/
+
+int calc_weird_gprob_sum(double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM], double gprobs[NUM_RES_TYPE])
+{
+  int res, pos;
+
+  gprobs[Glutamix] = gprobs[Glutamine] + gprobs[GlutamicAcid];
+  gprobs[Asparagix] = gprobs[Asparagine] + gprobs[AsparticAcid];
+  gprobs[AnyRes] = 1/AANUM;     /* AnyRes means amino acid unknown.... */
+
+  for (pos =0; pos<POSNUM; pos++) {
+    for (res =0; res<=AANUM; res++) {  
+      gprobp[res][AnyRes][pos] = gprobs[res];    /* AnyRes means res unknown */
+      gprobp[AnyRes][res][pos] = gprobs[res];    /* so this makes sense. */
+
+      gprobp[Glutamix][res][pos] = gprobp[Glutamine][res][pos]
+                                        + gprobp[GlutamicAcid][res][pos];
+
+      gprobp[res][Glutamix][pos] = gprobp[res][Glutamine][pos]
+                                        + gprobp[res][GlutamicAcid][pos];
+
+      gprobp[Asparagix][res][pos] = gprobp[Asparagine][res][pos]
+                                        + gprobp[AsparticAcid][res][pos];
+
+      gprobp[res][Asparagix][pos] = gprobp[res][Asparagine][pos]
+                                        + gprobp[res][AsparticAcid][pos];
+      
+    }
+
+
+    gprobp[Glutamix][Glutamix][pos] = gprobp[Glutamix][Glutamine][pos] +
+                                      gprobp[Glutamix][GlutamicAcid][pos];
+    gprobp[Asparagix][Asparagix][pos] = gprobp[Asparagix][Asparagine][pos] +
+                                      gprobp[Asparagix][AsparticAcid][pos];
+    gprobp[Glutamix][Asparagix][pos] = gprobp[Glutamix][Asparagine][pos] +
+                                      gprobp[Glutamix][AsparticAcid][pos];
+    gprobp[Asparagix][Glutamix][pos] = gprobp[Asparagix][Glutamine][pos] +
+                                      gprobp[Asparagix][GlutamicAcid][pos];
+  }
+
+}
+
+int calc_weird_gprob_avg(double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM], double gprobs[NUM_RES_TYPE])
+{
+  int res, pos;
+
+
+  gprobs[Glutamix] = getdlog(exp(gprobs[Glutamine]) + 
+                             exp(gprobs[GlutamicAcid])/2);
+  gprobs[Asparagix] = getdlog(exp(gprobs[Asparagine]) + 
+                              exp(gprobs[AsparticAcid])/2);
+  gprobs[AnyRes] = 1/AANUM;     /* AnyRes means amino acid unknown.... */
+
+
+  for (pos =0; pos<POSNUM; pos++) {
+    for (res =0; res<=AANUM; res++) {  
+      gprobp[res][AnyRes][pos] = gprobs[res];    /* AnyRes means res unknown */
+      gprobp[AnyRes][res][pos] = gprobs[res];    /* so this makes sense. */
+
+      gprobp[Glutamix][res][pos] = getdlog(exp(gprobp[Glutamine][res][pos])
+                                     + exp(gprobp[GlutamicAcid][res][pos])/2);
+
+      gprobp[res][Glutamix][pos] = getdlog(exp(gprobp[res][Glutamine][pos])
+                                     + exp(gprobp[res][GlutamicAcid][pos])/2);
+
+      gprobp[Asparagix][res][pos] = getdlog(exp(gprobp[Asparagine][res][pos])
+                                     + exp(gprobp[AsparticAcid][res][pos])/2);
+
+      gprobp[res][Asparagix][pos] = getdlog(exp(gprobp[res][Asparagine][pos])
+                                    + exp(gprobp[res][AsparticAcid][pos])/2);
+
+
+      
+    }
+
+
+    gprobp[Glutamix][Glutamix][pos] = 
+        getdlog(exp(gprobp[Glutamix][Glutamine][pos]) +
+                         exp(gprobp[Glutamix][GlutamicAcid][pos])/2);
+    gprobp[Asparagix][Asparagix][pos] = 
+        getdlog(exp(gprobp[Asparagix][Asparagine][pos]) +
+                    exp(gprobp[Asparagix][AsparticAcid][pos])/2);
+    gprobp[Glutamix][Asparagix][pos] = 
+        getdlog(exp(gprobp[Glutamix][Asparagine][pos]) +
+                                 exp(gprobp[Glutamix][AsparticAcid][pos])/2);
+    gprobp[Asparagix][Glutamix][pos] = 
+        getdlog(exp(gprobp[Asparagix][Glutamine][pos]) +
+                                 exp(gprobp[Asparagix][GlutamicAcid][pos])/2);
+  }
+
+}
+
+
+
+/****************************************************************************/
+
+
+
+/****************************************************************************/
+/* Calculates table of logs of single residue probabilities in position k   */
+/* in the positives.  The probability is initially set naiviely to          */
+/* freq(Res)(k)/total(k).  Then probability mass is redistributed to the    */
+/* zero frequency residues by setting their probability to 1/5*total(k).    */
+/* Note that this is the same thing as assuming that due to finite sampling,*/
+/* a zero frequency residue really has frequency 1/5.  (So this probability */
+/* is at most 1/5 of the minimum nonzero probability).  Prolines are        */
+/* treated as actual zeros since they are known coil breakers.              */
+/* The probabilities are then normalized.                                   */
+
+int calcpprobs(long freqs[AANUM][POSNUM], long totals[POSNUM], 
+               double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin)
+{
+  int i, k;
+  long tal=0;  /* total num of aa's in a given pos */
+  double mass;   /* prob mass for a given pos */
+  double prior_freq;
+  int pos_array_entry1=0;
+
+    
+  /* Calc probs for each aa i in each position k */
+  for (k = 0; k < POSNUM; k++) {
+    tal = totals[k];
+    mass = 0;
+    for (i = 0; i < AANUM; i++) {
+      prior_freq=0;
+
+      if (ProlineFreeWin && (i == Proline))
+        probs[i][k] = LOG0VAL;   /* 0 out P's */
+      else if (freqs[i][k])    
+                           /*  Handle the non-zero freq. naively.  */
+        mass += probs[i][k] = ( prior_freq + (double) freqs[i][k]) / tal; 
+      else {                  /* Now handle the freq 0 case specially. */
+          double tmp1, tmp2;
+         tmp1 = (prior_freq + 1.0 /SCALE0)/tal;   /* 1/5total(k)   */  
+          tmp2 = 1.0/AANUM;              /* Pretty much useless here
+                                            but useful for analogy in
+                                            computation of pair prob.  */
+      /* Take pr[i][k] to be min(tmp1, tmp2)--- usually tmp1.*/
+          mass += probs[i][k] = MIN(tmp1,tmp2);
+        }
+    }
+    /* Normalize all probs by the mass = sum of probs, and take log */
+    for (i = 0; i < AANUM; i++) 
+      if ((i != Proline) || (!ProlineFreeWin))   
+        probs[i][k] = getdlog(probs[i][k]/mass);
+  }
+  
+  return 1;
+}
+
+
+/***************************************************************/
+
+
+/* Calculates table of the logs of the probs of pairs in the positives.      */
+/* If the frequency is nonzero, just naively compute freq/total(k,l).        */
+/* For zero freq., deal with them analagously to zero freq. singles, so their*/
+/* final probability is no more than 1/5 of the smallest non-zero prob.      */
+/* To do this Pr(r1,r2,k,l)= min{1/400, 1/5total(k,l), Pr(r1,k)*Pr(r2,l)},   */
+/* where the last term is an upper bound acheived if the two residue,position*/
+/* were independent.                                                         */
+int calcpprobp(long freqp[AANUM][AANUM][POSNUM][POSNUM], 
+               long totalp[POSNUM][POSNUM],
+               double probs[NUM_RES_TYPE][POSNUM],
+               double probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
+               int ProlineFreeWin)
+{
+  int i, j, k, l;
+  long tal=0;  /* total num of aa's in given pair of positions */
+  double mass; /* prob mass for given pair of pos */
+  double prior_freq;
+
+  
+  /* Calc probp for each aa pair in each pair of positions k,l */
+
+  for (k = 0; k < POSNUM; k++)
+    for (l = 0; l < POSNUM; l++) {
+      tal = totalp[k][l];
+      mass = 0;
+      for (i = 0; i < AANUM; i++)
+        for (j = 0; j < AANUM; j++) {
+         prior_freq=0;
+
+          if (ProlineFreeWin && (i == Proline || j == Proline))
+              probp[i][j][k][l] = LOG0VAL;    /* 0 out pairs with P in either*/
+                                              /* position                    */
+          else if (freqp[i][j][k][l]) 
+            mass += probp[i][j][k][l] = 
+                          (prior_freq + (double) freqp[i][j][k][l]) / tal;  
+          else {     /* freq 0 pairs are handled as min of 3 quanitities. */
+            double tmp1, tmp2, tmp3, tmp4;
+            tmp1 = (prior_freq + 1.0 /SCALE0) / tal;  
+                                            /* 1/5total(res1,res2,k,l) */
+            tmp2 = 1.0/(AANUM*AANUM);     /* 1/400                   */
+            tmp3 = exp(probs[i][k]) * exp(probs[j][l]);  
+                          /* product of single probs Pr(r1,k)*Pr(r2,l)*/
+
+            /* Take min(tmp1, tmp2, tmp3), 3 upper bounds on probp */  
+            tmp4 = MIN(tmp1,tmp2);
+            mass += probp[i][j][k][l] = MIN(tmp3,tmp4);
+          }
+       }
+      
+      /* Normalize all probp by the mass = sum of probp, and take log */
+      for (i = 0; i < AANUM; i++)
+        for (j = 0; j < AANUM; j++)
+          if ( (i != Proline && j != Proline) || !ProlineFreeWin)
+            probp[i][j][k][l] = getdlog(probp[i][j][k][l] / mass);
+    }
+}
+
+
+
+
+
+/****************************************************************************/
+int calc_weird_pprob_sum(double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM], double pprobs[NUM_RES_TYPE][POSNUM])
+{
+  int res, pos1,pos2;
+  
+
+  for (pos1 =0 ; pos1<POSNUM; pos1++) {
+    pprobs[Glutamix][pos1] = 
+           pprobs[Glutamine][pos1] + pprobs[GlutamicAcid][pos1];
+    pprobs[Asparagix][pos1] = 
+           pprobs[Asparagine][pos1] + pprobs[AsparticAcid][pos1];
+
+    pprobs[AnyRes][pos1] = 1/AANUM;   /* AnyRes means amino acid unknown.... */
+  }
+
+
+
+  for (pos1 =0; pos1<POSNUM; pos1++) {
+    for (pos2 =0; pos2<POSNUM; pos2++) {
+
+      for (res =0; res<=AANUM; res++) {  
+        pprobp[res][AnyRes][pos1][pos2] = 
+          pprobs[res][pos1];    /* AnyRes means res unknown */
+        pprobp[AnyRes][res][pos1][pos2] = 
+          pprobs[res][pos2];    /* so this makes sense. */
+
+        pprobp[Glutamix][res][pos1][pos2] = pprobp[Glutamine][res][pos1][pos2]
+                                       + pprobp[GlutamicAcid][res][pos1][pos2];
+
+        pprobp[res][Glutamix][pos1][pos2] = pprobp[res][Glutamine][pos1][pos2]
+                                       + pprobp[res][GlutamicAcid][pos1][pos2];
+
+        pprobp[Asparagix][res][pos1][pos2]= pprobp[Asparagine][res][pos1][pos2]
+                                       + pprobp[AsparticAcid][res][pos1][pos2];
+
+        pprobp[res][Asparagix][pos1][pos2] =pprobp[res][Asparagine][pos1][pos2]
+                                       + pprobp[res][AsparticAcid][pos1][pos2];
+      
+      }
+
+    
+      pprobp[Glutamix][Glutamix][pos1][pos2] = 
+            pprobp[Glutamix][Glutamine][pos1][pos2] +
+              pprobp[Glutamix][GlutamicAcid][pos1][pos2];
+      pprobp[Asparagix][Asparagix][pos1][pos2] = 
+            pprobp[Asparagix][Asparagine][pos1][pos2] +
+              pprobp[Asparagix][AsparticAcid][pos1][pos2];
+      pprobp[Glutamix][Asparagix][pos1][pos2] = 
+            pprobp[Glutamix][Asparagine][pos1][pos2] +
+              pprobp[Glutamix][AsparticAcid][pos1][pos2];
+      pprobp[Asparagix][Glutamix][pos1][pos2] = 
+            pprobp[Asparagix][Glutamine][pos1][pos2] +
+              pprobp[Asparagix][GlutamicAcid][pos1][pos2];
+    }
+  }
+
+
+}
+
+
+
+
+
+int calc_weird_pprob_avg(double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM], double pprobs[NUM_RES_TYPE][POSNUM])
+{
+  int res, pos1,pos2;
+  
+
+  for (pos1 =0 ; pos1<POSNUM; pos1++) {
+    pprobs[Glutamix][pos1] = 
+       getdlog(exp(pprobs[Glutamine][pos1]) + 
+               exp(pprobs[GlutamicAcid][pos1])/2);
+    pprobs[Asparagix][pos1] = 
+       getdlog(exp(pprobs[Asparagine][pos1]) + 
+               exp(pprobs[AsparticAcid][pos1])/2);
+
+    pprobs[AnyRes][pos1] = 1/AANUM;   /* AnyRes means amino acid unknown.... */
+  }
+
+
+
+  for (pos1 =0; pos1<POSNUM; pos1++) {
+    for (pos2 =0; pos2<POSNUM; pos2++) {
+
+      for (res =0; res<=AANUM; res++) {  
+        pprobp[res][AnyRes][pos1][pos2] = 
+          pprobs[res][pos1];    /* AnyRes means res unknown */
+        pprobp[AnyRes][res][pos1][pos2] = 
+          pprobs[res][pos2];    /* so this makes sense. */
+
+        pprobp[Glutamix][res][pos1][pos2] = 
+          getdlog(exp(pprobp[Glutamine][res][pos1][pos2])
+                              + exp(pprobp[GlutamicAcid][res][pos1][pos2])/2);
+
+        pprobp[res][Glutamix][pos1][pos2] = 
+          getdlog(exp(pprobp[res][Glutamine][pos1][pos2])
+                              + exp(pprobp[res][GlutamicAcid][pos1][pos2])/2);
+
+        pprobp[Asparagix][res][pos1][pos2]=
+          getdlog(exp(pprobp[Asparagine][res][pos1][pos2])
+                           + exp(pprobp[AsparticAcid][res][pos1][pos2])/2);
+
+        pprobp[res][Asparagix][pos1][pos2]=
+          getdlog(exp(pprobp[res][Asparagine][pos1][pos2])
+                            + exp(pprobp[res][AsparticAcid][pos1][pos2])/2);
+      
+      }
+
+    
+      pprobp[Glutamix][Glutamix][pos1][pos2] = 
+            getdlog(exp(pprobp[Glutamix][Glutamine][pos1][pos2]) +
+              exp(pprobp[Glutamix][GlutamicAcid][pos1][pos2])/2);
+      pprobp[Asparagix][Asparagix][pos1][pos2] = 
+            getdlog(exp(pprobp[Asparagix][Asparagine][pos1][pos2]) +
+              exp(pprobp[Asparagix][AsparticAcid][pos1][pos2])/2);
+      pprobp[Glutamix][Asparagix][pos1][pos2] = 
+            getdlog(exp(pprobp[Glutamix][Asparagine][pos1][pos2]) +
+              exp(pprobp[Glutamix][AsparticAcid][pos1][pos2])/2);
+      pprobp[Asparagix][Glutamix][pos1][pos2] = 
+            getdlog(exp(pprobp[Asparagix][Glutamine][pos1][pos2]) +
+              exp(pprobp[Asparagix][GlutamicAcid][pos1][pos2])/2);
+    }
+  }
+
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+