--- /dev/null
+#include <stdio.h>
+#include <math.h>
+#include "sc.h"
+#include "scscore.h"
+#include "scconst.h"
+#include "options.h"
+#include "switches.h"
+#include "stats.h"
+
+/** The following are defined in scscore.c ***/
+extern double gprobs[NUM_RES_TYPE], gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM];
+
+/*** #define SCALE0 5 ***/
+/*** Now an extern variable ***/ /* scaling value for 0 probs */
+extern double SCALE0;
+extern double gprobs[NUM_RES_TYPE], gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM];
+
+/* Takes the log of a double */
+double getdlog(double num)
+{
+ if (num)
+ return log(num);
+ else
+ return LOG0VAL;
+}
+
+/* Takes the log of a long int */
+double getlog(long num)
+{
+ if (num) {
+ return log((double) num);
+ }
+ else
+ return LOG0VAL;
+}
+
+
+/**********************************************************************/
+
+
+void estimate_database_probs2(int tab,
+ long freqs[AANUM][POSNUM], long totals[POSNUM],
+ double many_pprobs[MAX_TABLE_NUMBER][NUM_RES_TYPE][POSNUM],
+ long freqp[AANUM][AANUM][POSNUM][POSNUM], long totalp[POSNUM][POSNUM],
+ double many_pprobp[MAX_TABLE_NUMBER]
+ [NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
+ int ProlineFreeWin)
+{
+ extern double SCALE0;
+ extern double scale0s[MAX_TABLE_NUMBER],scale0p[MAX_TABLE_NUMBER];
+
+
+ SCALE0= scale0s[tab];
+ calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin);
+
+ SCALE0= scale0p[tab];
+ calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
+ ProlineFreeWin);
+}
+
+
+
+
+/*************************************************************************/
+
+/****************************************************************************/
+/** If the file fgin == NULL so is not set in config file, use uniform dist. */
+
+int setgprob_to_uniform(double gprobs[NUM_RES_TYPE],
+ double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM])
+{
+ int res1, res2, dist;
+
+ for (res1=0; res1<AANUM; res1++) {
+ gprobs[res1] = 1/AANUM;
+
+ for (res2=0; res2<AANUM; res2++)
+ for(dist=0; dist<POSNUM; dist++)
+ gprobp[res1][res2][dist] = 1/(AANUM*AANUM);
+ }
+}
+
+/****************************************************************************/
+
+
+/* Calculates log of genbnk probabilities for a residue. */
+int calcgprobs(long freqs[AANUM], long totals,
+ double probs[NUM_RES_TYPE])
+{
+ int i;
+ double tal;
+
+ tal = getlog(totals);
+ for (i = 0; i < AANUM; i++) {
+ probs[i] = getlog(freqs[i]) - tal;
+ }
+ return 1;
+}
+
+
+
+
+/* Calculates log of genbnk probabilities of pairs of residues at distance k.*/
+int calcgprobp(long freqp[AANUM][AANUM][POSNUM],
+ long totalp[POSNUM],
+ double probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM])
+{
+ int i, j, k;
+ double tal;
+
+ for (k = 0; k < POSNUM; k++) {
+ tal = getlog(totalp[k]);
+ for (i = 0; i < AANUM; i++)
+ for (j = 0; j < AANUM; j++)
+ probp[i][j][k] = getlog(freqp[i][j][k]) - tal;
+ }
+}
+
+/*****************************************************************************/
+
+int calc_weird_gprob_sum(double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM], double gprobs[NUM_RES_TYPE])
+{
+ int res, pos;
+
+ gprobs[Glutamix] = gprobs[Glutamine] + gprobs[GlutamicAcid];
+ gprobs[Asparagix] = gprobs[Asparagine] + gprobs[AsparticAcid];
+ gprobs[AnyRes] = 1/AANUM; /* AnyRes means amino acid unknown.... */
+
+
+ for (pos =0; pos<POSNUM; pos++) {
+ for (res =0; res<=AANUM; res++) {
+ gprobp[res][AnyRes][pos] = gprobs[res]; /* AnyRes means res unknown */
+ gprobp[AnyRes][res][pos] = gprobs[res]; /* so this makes sense. */
+
+ gprobp[Glutamix][res][pos] = gprobp[Glutamine][res][pos]
+ + gprobp[GlutamicAcid][res][pos];
+
+ gprobp[res][Glutamix][pos] = gprobp[res][Glutamine][pos]
+ + gprobp[res][GlutamicAcid][pos];
+
+ gprobp[Asparagix][res][pos] = gprobp[Asparagine][res][pos]
+ + gprobp[AsparticAcid][res][pos];
+
+ gprobp[res][Asparagix][pos] = gprobp[res][Asparagine][pos]
+ + gprobp[res][AsparticAcid][pos];
+
+ }
+
+
+ gprobp[Glutamix][Glutamix][pos] = gprobp[Glutamix][Glutamine][pos] +
+ gprobp[Glutamix][GlutamicAcid][pos];
+ gprobp[Asparagix][Asparagix][pos] = gprobp[Asparagix][Asparagine][pos] +
+ gprobp[Asparagix][AsparticAcid][pos];
+ gprobp[Glutamix][Asparagix][pos] = gprobp[Glutamix][Asparagine][pos] +
+ gprobp[Glutamix][AsparticAcid][pos];
+ gprobp[Asparagix][Glutamix][pos] = gprobp[Asparagix][Glutamine][pos] +
+ gprobp[Asparagix][GlutamicAcid][pos];
+ }
+
+}
+
+int calc_weird_gprob_avg(double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM], double gprobs[NUM_RES_TYPE])
+{
+ int res, pos;
+
+
+ gprobs[Glutamix] = getdlog(exp(gprobs[Glutamine]) +
+ exp(gprobs[GlutamicAcid])/2);
+ gprobs[Asparagix] = getdlog(exp(gprobs[Asparagine]) +
+ exp(gprobs[AsparticAcid])/2);
+ gprobs[AnyRes] = 1/AANUM; /* AnyRes means amino acid unknown.... */
+
+
+
+ for (pos =0; pos<POSNUM; pos++) {
+ for (res =0; res<=AANUM; res++) {
+ gprobp[res][AnyRes][pos] = gprobs[res]; /* AnyRes means res unknown */
+ gprobp[AnyRes][res][pos] = gprobs[res]; /* so this makes sense. */
+
+ gprobp[Glutamix][res][pos] = getdlog(exp(gprobp[Glutamine][res][pos])
+ + exp(gprobp[GlutamicAcid][res][pos])/2);
+
+ gprobp[res][Glutamix][pos] = getdlog(exp(gprobp[res][Glutamine][pos])
+ + exp(gprobp[res][GlutamicAcid][pos])/2);
+
+ gprobp[Asparagix][res][pos] = getdlog(exp(gprobp[Asparagine][res][pos])
+ + exp(gprobp[AsparticAcid][res][pos])/2);
+
+ gprobp[res][Asparagix][pos] = getdlog(exp(gprobp[res][Asparagine][pos])
+ + exp(gprobp[res][AsparticAcid][pos])/2);
+
+
+
+ }
+
+
+ gprobp[Glutamix][Glutamix][pos] =
+ getdlog(exp(gprobp[Glutamix][Glutamine][pos]) +
+ exp(gprobp[Glutamix][GlutamicAcid][pos])/2);
+ gprobp[Asparagix][Asparagix][pos] =
+ getdlog(exp(gprobp[Asparagix][Asparagine][pos]) +
+ exp(gprobp[Asparagix][AsparticAcid][pos])/2);
+ gprobp[Glutamix][Asparagix][pos] =
+ getdlog(exp(gprobp[Glutamix][Asparagine][pos]) +
+ exp(gprobp[Glutamix][AsparticAcid][pos])/2);
+ gprobp[Asparagix][Glutamix][pos] =
+ getdlog(exp(gprobp[Asparagix][Glutamine][pos]) +
+ exp(gprobp[Asparagix][GlutamicAcid][pos])/2);
+ }
+
+}
+
+
+
+/****************************************************************************/
+
+
+
+/****************************************************************************/
+/* Calculates table of logs of single residue probabilities in position k */
+/* in the positives. The probability is initially set naiviely to */
+/* freq(Res)(k)/total(k). Then probability mass is redistributed to the */
+/* zero frequency residues by setting their probability to 1/5*total(k). */
+/* Note that this is the same thing as assuming that due to finite sampling,*/
+/* a zero frequency residue really has frequency 1/5. (So this probability */
+/* is at most 1/5 of the minimum nonzero probability). Prolines are */
+/* treated as actual zeros since they are known coil breakers. */
+/* The probabilities are then normalized. */
+
+int calcpprobs(long freqs[AANUM][POSNUM], long totals[POSNUM],
+ double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin)
+{
+ int i, k;
+ long tal=0; /* total num of aa's in a given pos */
+ double mass; /* prob mass for a given pos */
+ double prior_freq;
+ int pos_array_entry1=0;
+
+
+ /* Calc probs for each aa i in each position k */
+ for (k = 0; k < POSNUM; k++) {
+ tal = totals[k];
+ mass = 0;
+ for (i = 0; i < AANUM; i++) {
+ prior_freq=0;
+
+ if (ProlineFreeWin && (i == Proline))
+ probs[i][k] = LOG0VAL; /* 0 out P's */
+ else if (freqs[i][k])
+ /* Handle the non-zero freq. naively. */
+ mass += probs[i][k] = ( prior_freq + (double) freqs[i][k]) / tal;
+ else { /* Now handle the freq 0 case specially. */
+ double tmp1, tmp2;
+ tmp1 = (prior_freq + 1.0 /SCALE0)/tal; /* 1/5total(k) */
+ tmp2 = 1.0/AANUM; /* Pretty much useless here
+ but useful for analogy in
+ computation of pair prob. */
+ /* Take pr[i][k] to be min(tmp1, tmp2)--- usually tmp1.*/
+ mass += probs[i][k] = MIN(tmp1,tmp2);
+ }
+ }
+ /* Normalize all probs by the mass = sum of probs, and take log */
+ for (i = 0; i < AANUM; i++)
+ if ((i != Proline) || (!ProlineFreeWin))
+ probs[i][k] = getdlog(probs[i][k]/mass);
+ }
+
+ return 1;
+}
+
+
+/***************************************************************/
+
+
+/* Calculates table of the logs of the probs of pairs in the positives. */
+/* If the frequency is nonzero, just naively compute freq/total(k,l). */
+/* For zero freq., deal with them analagously to zero freq. singles, so their*/
+/* final probability is no more than 1/5 of the smallest non-zero prob. */
+/* To do this Pr(r1,r2,k,l)= min{1/400, 1/5total(k,l), Pr(r1,k)*Pr(r2,l)}, */
+/* where the last term is an upper bound acheived if the two residue,position*/
+/* were independent. */
+int calcpprobp(long freqp[AANUM][AANUM][POSNUM][POSNUM],
+ long totalp[POSNUM][POSNUM],
+ double probs[NUM_RES_TYPE][POSNUM],
+ double probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
+ int ProlineFreeWin)
+{
+ int i, j, k, l;
+ long tal=0; /* total num of aa's in given pair of positions */
+ double mass; /* prob mass for given pair of pos */
+ double prior_freq;
+
+
+ /* Calc probp for each aa pair in each pair of positions k,l */
+
+ for (k = 0; k < POSNUM; k++)
+ for (l = 0; l < POSNUM; l++) {
+ tal = totalp[k][l];
+ mass = 0;
+ for (i = 0; i < AANUM; i++)
+ for (j = 0; j < AANUM; j++) {
+ prior_freq=0;
+
+ if (ProlineFreeWin && (i == Proline || j == Proline))
+ probp[i][j][k][l] = LOG0VAL; /* 0 out pairs with P in either*/
+ /* position */
+ else if (freqp[i][j][k][l])
+ mass += probp[i][j][k][l] =
+ (prior_freq + (double) freqp[i][j][k][l]) / tal;
+ else { /* freq 0 pairs are handled as min of 3 quanitities. */
+ double tmp1, tmp2, tmp3, tmp4;
+ tmp1 = (prior_freq + 1.0 /SCALE0) / tal;
+ /* 1/5total(res1,res2,k,l) */
+ tmp2 = 1.0/(AANUM*AANUM); /* 1/400 */
+ tmp3 = exp(probs[i][k]) * exp(probs[j][l]);
+ /* product of single probs Pr(r1,k)*Pr(r2,l)*/
+
+ /* Take min(tmp1, tmp2, tmp3), 3 upper bounds on probp */
+ tmp4 = MIN(tmp1,tmp2);
+ mass += probp[i][j][k][l] = MIN(tmp3,tmp4);
+ }
+ }
+
+ /* Normalize all probp by the mass = sum of probp, and take log */
+ for (i = 0; i < AANUM; i++)
+ for (j = 0; j < AANUM; j++)
+ if ( (i != Proline && j != Proline) || !ProlineFreeWin)
+ probp[i][j][k][l] = getdlog(probp[i][j][k][l] / mass);
+ }
+}
+
+
+
+
+
+/****************************************************************************/
+int calc_weird_pprob_sum(double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM], double pprobs[NUM_RES_TYPE][POSNUM])
+{
+ int res, pos1,pos2;
+
+
+ for (pos1 =0 ; pos1<POSNUM; pos1++) {
+ pprobs[Glutamix][pos1] =
+ pprobs[Glutamine][pos1] + pprobs[GlutamicAcid][pos1];
+ pprobs[Asparagix][pos1] =
+ pprobs[Asparagine][pos1] + pprobs[AsparticAcid][pos1];
+
+ pprobs[AnyRes][pos1] = 1/AANUM; /* AnyRes means amino acid unknown.... */
+ }
+
+
+
+
+ for (pos1 =0; pos1<POSNUM; pos1++) {
+ for (pos2 =0; pos2<POSNUM; pos2++) {
+
+ for (res =0; res<=AANUM; res++) {
+ pprobp[res][AnyRes][pos1][pos2] =
+ pprobs[res][pos1]; /* AnyRes means res unknown */
+ pprobp[AnyRes][res][pos1][pos2] =
+ pprobs[res][pos2]; /* so this makes sense. */
+
+ pprobp[Glutamix][res][pos1][pos2] = pprobp[Glutamine][res][pos1][pos2]
+ + pprobp[GlutamicAcid][res][pos1][pos2];
+
+ pprobp[res][Glutamix][pos1][pos2] = pprobp[res][Glutamine][pos1][pos2]
+ + pprobp[res][GlutamicAcid][pos1][pos2];
+
+ pprobp[Asparagix][res][pos1][pos2]= pprobp[Asparagine][res][pos1][pos2]
+ + pprobp[AsparticAcid][res][pos1][pos2];
+
+ pprobp[res][Asparagix][pos1][pos2] =pprobp[res][Asparagine][pos1][pos2]
+ + pprobp[res][AsparticAcid][pos1][pos2];
+
+ }
+
+
+ pprobp[Glutamix][Glutamix][pos1][pos2] =
+ pprobp[Glutamix][Glutamine][pos1][pos2] +
+ pprobp[Glutamix][GlutamicAcid][pos1][pos2];
+ pprobp[Asparagix][Asparagix][pos1][pos2] =
+ pprobp[Asparagix][Asparagine][pos1][pos2] +
+ pprobp[Asparagix][AsparticAcid][pos1][pos2];
+ pprobp[Glutamix][Asparagix][pos1][pos2] =
+ pprobp[Glutamix][Asparagine][pos1][pos2] +
+ pprobp[Glutamix][AsparticAcid][pos1][pos2];
+ pprobp[Asparagix][Glutamix][pos1][pos2] =
+ pprobp[Asparagix][Glutamine][pos1][pos2] +
+ pprobp[Asparagix][GlutamicAcid][pos1][pos2];
+ }
+ }
+
+
+}
+
+
+
+
+
+int calc_weird_pprob_avg(double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM], double pprobs[NUM_RES_TYPE][POSNUM])
+{
+ int res, pos1,pos2;
+
+
+ for (pos1 =0 ; pos1<POSNUM; pos1++) {
+ pprobs[Glutamix][pos1] =
+ getdlog(exp(pprobs[Glutamine][pos1]) +
+ exp(pprobs[GlutamicAcid][pos1])/2);
+ pprobs[Asparagix][pos1] =
+ getdlog(exp(pprobs[Asparagine][pos1]) +
+ exp(pprobs[AsparticAcid][pos1])/2);
+
+ pprobs[AnyRes][pos1] = 1/AANUM; /* AnyRes means amino acid unknown.... */
+ }
+
+
+
+
+ for (pos1 =0; pos1<POSNUM; pos1++) {
+ for (pos2 =0; pos2<POSNUM; pos2++) {
+
+ for (res =0; res<=AANUM; res++) {
+ pprobp[res][AnyRes][pos1][pos2] =
+ pprobs[res][pos1]; /* AnyRes means res unknown */
+ pprobp[AnyRes][res][pos1][pos2] =
+ pprobs[res][pos2]; /* so this makes sense. */
+
+ pprobp[Glutamix][res][pos1][pos2] =
+ getdlog(exp(pprobp[Glutamine][res][pos1][pos2])
+ + exp(pprobp[GlutamicAcid][res][pos1][pos2])/2);
+
+ pprobp[res][Glutamix][pos1][pos2] =
+ getdlog(exp(pprobp[res][Glutamine][pos1][pos2])
+ + exp(pprobp[res][GlutamicAcid][pos1][pos2])/2);
+
+ pprobp[Asparagix][res][pos1][pos2]=
+ getdlog(exp(pprobp[Asparagine][res][pos1][pos2])
+ + exp(pprobp[AsparticAcid][res][pos1][pos2])/2);
+
+ pprobp[res][Asparagix][pos1][pos2]=
+ getdlog(exp(pprobp[res][Asparagine][pos1][pos2])
+ + exp(pprobp[res][AsparticAcid][pos1][pos2])/2);
+
+ }
+
+
+ pprobp[Glutamix][Glutamix][pos1][pos2] =
+ getdlog(exp(pprobp[Glutamix][Glutamine][pos1][pos2]) +
+ exp(pprobp[Glutamix][GlutamicAcid][pos1][pos2])/2);
+ pprobp[Asparagix][Asparagix][pos1][pos2] =
+ getdlog(exp(pprobp[Asparagix][Asparagine][pos1][pos2]) +
+ exp(pprobp[Asparagix][AsparticAcid][pos1][pos2])/2);
+ pprobp[Glutamix][Asparagix][pos1][pos2] =
+ getdlog(exp(pprobp[Glutamix][Asparagine][pos1][pos2]) +
+ exp(pprobp[Glutamix][AsparticAcid][pos1][pos2])/2);
+ pprobp[Asparagix][Glutamix][pos1][pos2] =
+ getdlog(exp(pprobp[Asparagix][Glutamine][pos1][pos2]) +
+ exp(pprobp[Asparagix][GlutamicAcid][pos1][pos2])/2);
+ }
+ }
+
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+