JPRED-2 Add sources of all binaries (except alscript) to Git
[jpred.git] / sources / multicoil / estimate_probabilities_cuts.c
diff --git a/sources/multicoil/estimate_probabilities_cuts.c b/sources/multicoil/estimate_probabilities_cuts.c
new file mode 100644 (file)
index 0000000..c07a59c
--- /dev/null
@@ -0,0 +1,357 @@
+changed
+    calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin, 
+              dimers_for_zero_freqs, many_pprobs[0], prior_freq_single,
+              structural_pos);
+to
+    calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin, 
+              many_pprobs[0]);
+and
+
+  calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab], 
+              ProlineFreeWin, dimers_for_zero_freqp, many_pprobp[0],
+              prior_freq_pair, structural_pos);
+to
+  calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab], 
+              ProlineFreeWin);
+
+cut all the other calcpprob_... stuff (i.e. good_turing, priors,etc.)
+
+
+
+Cut all this junk out of estimate_database_probs also.
+
+
+
+/************************ Second Round ********************************/
+Cut
+/** The following are defined in scscore.c ***/
+extern double prior_probs[NUM_RES_TYPE][POSNUM], 
+        prior_probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM];
+
+
+
+int in_list(int reg,int structural_pos_list[POSNUM+1])
+{
+  int i=0;
+
+  while (1) {
+    if (structural_pos_list[i] == -1) return 0;  /*End of list and not found */
+    else if (reg == structural_pos_list[i]) return 1;
+    else i++;
+  }
+
+}
+
+
+void set_unimportant_pos_to_genbnk(double pprobs[NUM_RES_TYPE][POSNUM], 
+                   double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
+                   double gprobs[NUM_RES_TYPE],
+                   double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM],
+                   int structural_pos[POSNUM+1] )
+{
+  int reg1=0, reg2=0, res1, res2, dist;
+  int pos_array_entry1=0;
+  int pos_array_entry2=0;
+  double prob1, prob2;
+
+  for (reg1=0; reg1 < POSNUM; reg1++) {
+    if (!in_list(reg1, structural_pos)) 
+      for (res1=0; res1<NUM_RES_TYPE; res1++) 
+       pprobs[res1][reg1] = gprobs[res1];
+
+    for (reg2=0; reg2<POSNUM; reg2++)  
+      if (!in_list(reg1, structural_pos) || 
+         !in_list(reg2, structural_pos)) {
+       dist = (reg2-reg1-1+POSNUM) %POSNUM;
+
+       for (res1=0; res1<NUM_RES_TYPE; res1++)
+         for (res2=0; res2<NUM_RES_TYPE; res2++)
+           pprobp[res1][res2][reg1][reg2]=
+             gprobp[res1][res2][dist];
+      }
+  }
+}
+     
+/**********************************************************************/
+
+
+cut
+
+void estimate_database_probs(int tab, 
+         long freqs[AANUM][POSNUM], long totals[POSNUM], 
+         double many_pprobs[MAX_TABLE_NUMBER][NUM_RES_TYPE][POSNUM],  
+         long freqp[AANUM][AANUM][POSNUM][POSNUM], long totalp[POSNUM][POSNUM],
+         double many_pprobp[MAX_TABLE_NUMBER]
+                             [NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
+         int ProlineFreeWin, int which_priors, int which_priorp,
+         double prior_freq_single, double prior_freq_pair,
+         int good_turing_fixed_disc, int structural_pos[POSNUM+1])
+{
+  extern double SCALE0;
+  extern double scale0s[MAX_TABLE_NUMBER],scale0p[MAX_TABLE_NUMBER];
+  int dimers_for_zero_freqs=0, dimers_for_zero_freqp=0;
+
+
+/** which_prior = -2 is a signal that only want to add prior freq to **/
+/** events which occur fewer times in trimer table than would expect **/
+/** using dimer prob of events (so the events that are likely undersampled **/
+
+  if (tab==1) {
+    if (which_priors==-1)
+      dimers_for_zero_freqs=1;
+    else if (which_priors == -2)
+      dimers_for_zero_freqs = 2;
+    if (which_priorp==-1)
+      dimers_for_zero_freqp=1;
+    else if (which_priorp == -2)
+      dimers_for_zero_freqp = 2;
+  }
+  
+
+
+  SCALE0=  scale0s[tab];
+  if (good_turing_fixed_disc==1)  /** Do good_turing **/
+    calcpprobs_good_turing(freqs, totals, many_pprobs[tab], ProlineFreeWin,
+                           structural_pos);
+  else if (good_turing_fixed_disc ==2)  
+                            /*Do fixed discount with SCALE0 as discount. **/
+    calcpprobs_fixed_disc(freqs, totals, many_pprobs[tab], ProlineFreeWin,
+                          structural_pos);
+  else if (good_turing_fixed_disc ==3)  /** Do modified good_turing. **/
+    calcpprobs_mod_good_turing(freqs,totals, many_pprobs[tab], ProlineFreeWin,
+                               structural_pos);
+  else if ((which_priors <= 0) || (tab !=1)) /* 0 means do old prob stuff. **/
+    calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin, 
+               dimers_for_zero_freqs, many_pprobs[0], prior_freq_single,
+               structural_pos);
+  else 
+    calc_posterior_pprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin,
+                            many_pprobs[0], which_priors, structural_pos);
+
+
+
+  SCALE0=  scale0p[tab];
+  if (good_turing_fixed_disc==1)
+    calcpprobp_good_turing(freqp, totalp, many_pprobs[tab], many_pprobp[tab], 
+               ProlineFreeWin, which_priorp, structural_pos);
+  else if (good_turing_fixed_disc ==2) /* Do fixed_disc with discount SCALE0 */
+    calcpprobp_fixed_disc(freqp, totalp, many_pprobs[tab], many_pprobp[tab], 
+               ProlineFreeWin, which_priorp, structural_pos);
+  else if (good_turing_fixed_disc ==3)  /** Do modified good_turing. **/
+    calcpprobp_mod_good_turing(freqp, totalp, many_pprobs[tab], 
+                               many_pprobp[tab],ProlineFreeWin, which_priorp,
+                               structural_pos);
+
+  else if ((which_priorp <= 0) || (tab != 1)) 
+    calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab], 
+               ProlineFreeWin, dimers_for_zero_freqp, many_pprobp[0],
+               prior_freq_pair, structural_pos);
+  else
+    calc_posterior_pprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab], 
+                            ProlineFreeWin, many_pprobp[0], which_priorp,
+                          structural_pos);
+
+
+}
+
+
+
+
+
+
+changed
+int calcpprobs(long freqs[AANUM][POSNUM], long totals[POSNUM], 
+               double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin, 
+              double dimer_probs[NUM_RES_TYPE][POSNUM])
+
+to
+int calcpprobs(long freqs[AANUM][POSNUM], long totals[POSNUM], 
+               double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin)
+and from that function cut:
+
+prior_freq_single=0,    int prior_check_dimers=0;
+and replaced
+
+  while ( (k = structural_pos[pos_array_entry1]) != -1) {
+    pos_array_entry1++;
+with
+  for (k = 0; k < POSNUM; k++) {
+
+and replaced
+      if (prior_freq_single >= 0) prior_freq = prior_freq_single;
+      else prior_freq = -prior_freq_single * exp(dimer_probs[i][k]);
+               /** The maginitude of prior_freq_single scales things. **/
+                /** Since dimer_probs is a prob. function, we are adding a  **/
+                /** TOTAL of -prior_freq_single to the frequency.      **/
+               /** That all gets normalized when divide by mass.      **/
+by prior_freq =0;
+
+
+
+in calcpprobp()  
+cut
+int pos_array_entry1=0;
+  int pos_array_entry2=0;
+
+and replaced
+
+  while ( (k = structural_pos[pos_array_entry1]) != -1) {
+    pos_array_entry1++;
+    pos_array_entry2=0;
+    while ( (l = structural_pos[pos_array_entry2]) != -1) {
+      pos_array_entry2++;
+
+with 
+
+  for (k = 0; k < POSNUM; k++)
+    for (l = 0; l < POSNUM; l++) {
+
+
+cut
+double prior_freq_pair=0;
+
+replaced
+         if (prior_freq_pair >= 0) prior_freq = prior_freq_pair;
+         else prior_freq = -prior_freq_pair * exp(dimer_probp[i][j][k][l]);
+               /** The maginitude of prior_freq_single scales things. **/
+                /** Since dimer_probp is a prob. function, we are adding a  **/
+                /** TOTAL of -prior_freq_single to the frequency.      **/
+               /** That all gets normalized when divide by mass.      **/
+
+by 
+prior_freq =0;
+
+
+
+CUT
+
+void account_for_single_under_sampling(double *predictedfreq)
+{
+  if (*predictedfreq<=.2){
+   /* fprintf(stderr,"\nAccounted for a 0 freq that was %lf",*predictedfreq); */    *predictedfreq = .2;
+  }
+}
+
+
+int account_for_pair_under_sampling(double *predictedfreq)
+{
+  if (*predictedfreq<=.2) {
+        *predictedfreq = .2;
+        return(1);
+  }
+  else return(0);
+
+}
+
+
+
+int calc_weighted_pprobs(long freqs[AANUM][POSNUM], long totals[POSNUM], 
+               double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin,
+                         double weights[AANUM][POSNUM],
+                         double gprobs[NUM_RES_TYPE],
+                         double over_undersample[AANUM][POSNUM])
+{
+  int i, k;
+  long tal=0;  /* total num of aa's in a given pos */
+  double mass;   /* prob mass for a given pos */
+  double predicted_freq[AANUM][POSNUM];
+  double sample_error;
+  
+  /* Calc probs for each aa i in each position k */
+  for (k = 0; k < POSNUM; k++) {
+    tal = totals[k];
+    mass = 0;
+    for (i = 0; i < AANUM; i++)
+      if (ProlineFreeWin && (i == Proline))
+        probs[i][k] = LOG0VAL;   /* 0 out P's */
+      else {
+        if (over_undersample == NULL) 
+          sample_error=1;
+        else sample_error = over_undersample[i][k];
+
+      
+        predicted_freq[i][k] = (double)totals[k]*(1.0-weights[i][k])*
+          exp(gprobs[i]) +
+          weights[i][k]*(double)freqs[i][k]/sample_error;
+
+        account_for_single_under_sampling(&predicted_freq[i][k]);
+
+
+/*      fprintf(stderr,"\n Predicted freq=%lf, seen freq=%ld", predicted_freq[i][k], freqs[i][k]);
+        if (predicted_freq[i][k]<0) fprintf(stderr,"\ni=%d,k=%d,Weight = %lf,totals=%ld, gprob=%lf, sample_error=%lf ", i,k,weights[i][k],totals[k],gprobs[i],sample_error);
+*/
+        if (predicted_freq[i][k]>0) {
+          mass += predicted_freq[i][k];
+/*        fprintf(stderr,"\nAdded to mass for %d, %d",i,k); */
+        }
+      }
+
+    /* Normalize all probs by the mass = sum of probs, and take log */
+    for (i = 0; i < AANUM; i++) 
+      if ((i != Proline) || (!ProlineFreeWin))  
+        if (predicted_freq[i][k]>0) 
+          probs[i][k] = getdlog(predicted_freq[i][k]/mass);
+        else 
+          probs[i][k] = LOG0VAL;
+  }
+  
+  return 1;
+}
+
+int calc_weighted_pprobp(long freqp[AANUM][AANUM][POSNUM][POSNUM], 
+               long totalp[POSNUM][POSNUM],
+               double probs[NUM_RES_TYPE][POSNUM],
+               double probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
+              int ProlineFreeWin, double weightp[AANUM][AANUM][POSNUM][POSNUM],
+              double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM],
+              double over_undersample[AANUM][AANUM][POSNUM][POSNUM])
+{
+  int i, j, k, l;
+  long tal=0;  /* total num of aa's in given pair of positions */
+  double mass; /* prob mass for given pair of pos */
+  double predicted_freq[AANUM][AANUM][POSNUM][POSNUM];
+  double sample_error;
+  int dist;     
+  /* Calc probp for each aa pair in each pair of positions k,l */
+  for (k = 0; k < POSNUM; k++)
+    for (l = 0; l < POSNUM; l++) {
+      tal = totalp[k][l];
+      mass = 0;
+      for (i = 0; i < AANUM; i++)
+        for (j = 0; j < AANUM; j++)
+          if (ProlineFreeWin && (i == Proline || j == Proline))
+              probp[i][j][k][l] = LOG0VAL;    /* 0 out pairs with P in either*/
+                                              /* position                    */
+          else {
+            if (over_undersample == NULL)
+              sample_error =1;
+            else sample_error = over_undersample[i][j][k][l];
+            dist = (l-k) %POSNUM;
+
+            predicted_freq[i][j][k][l] = (double)totalp[k][l]*
+              (1-weightp[i][j][k][l])*exp(gprobp[i][j][dist])
+                + weightp[i][j][k][l]*
+                (double)freqp[i][j][k][l]/sample_error;
+            
+            account_for_pair_under_sampling(&predicted_freq[i][j][k][l]);
+/**********
+               fprintf(stderr,"  0 freq changed to %lf for %c,%c,%c,%c",
+                          predicted_freq[i][j][k][l], 'a'+k,'a'+l,numaa(i),
+                          numaa(j));
+**********/
+
+            if (predicted_freq[i][j][k][l]>0) 
+              mass += predicted_freq[i][j][k][l];
+          }
+
+      /* Normalize all probp by the mass = sum of probp, and take log */
+      for (i = 0; i < AANUM; i++)
+        for (j = 0; j < AANUM; j++)
+          if ( (i != Proline && j != Proline) || !ProlineFreeWin)
+            if (predicted_freq[i][j][k][l]>0)
+              probp[i][j][k][l] = getdlog(predicted_freq[i][j][k][l] / mass);
+            else 
+              probp[i][j][k][l] = LOG0VAL;
+    }
+}