--- /dev/null
+changed
+ calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin,
+ dimers_for_zero_freqs, many_pprobs[0], prior_freq_single,
+ structural_pos);
+to
+ calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin,
+ many_pprobs[0]);
+and
+
+ calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
+ ProlineFreeWin, dimers_for_zero_freqp, many_pprobp[0],
+ prior_freq_pair, structural_pos);
+to
+ calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
+ ProlineFreeWin);
+
+cut all the other calcpprob_... stuff (i.e. good_turing, priors,etc.)
+
+
+
+Cut all this junk out of estimate_database_probs also.
+
+
+
+/************************ Second Round ********************************/
+Cut
+/** The following are defined in scscore.c ***/
+extern double prior_probs[NUM_RES_TYPE][POSNUM],
+ prior_probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM];
+
+
+
+int in_list(int reg,int structural_pos_list[POSNUM+1])
+{
+ int i=0;
+
+ while (1) {
+ if (structural_pos_list[i] == -1) return 0; /*End of list and not found */
+ else if (reg == structural_pos_list[i]) return 1;
+ else i++;
+ }
+
+}
+
+
+void set_unimportant_pos_to_genbnk(double pprobs[NUM_RES_TYPE][POSNUM],
+ double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
+ double gprobs[NUM_RES_TYPE],
+ double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM],
+ int structural_pos[POSNUM+1] )
+{
+ int reg1=0, reg2=0, res1, res2, dist;
+ int pos_array_entry1=0;
+ int pos_array_entry2=0;
+ double prob1, prob2;
+
+ for (reg1=0; reg1 < POSNUM; reg1++) {
+ if (!in_list(reg1, structural_pos))
+ for (res1=0; res1<NUM_RES_TYPE; res1++)
+ pprobs[res1][reg1] = gprobs[res1];
+
+ for (reg2=0; reg2<POSNUM; reg2++)
+ if (!in_list(reg1, structural_pos) ||
+ !in_list(reg2, structural_pos)) {
+ dist = (reg2-reg1-1+POSNUM) %POSNUM;
+
+ for (res1=0; res1<NUM_RES_TYPE; res1++)
+ for (res2=0; res2<NUM_RES_TYPE; res2++)
+ pprobp[res1][res2][reg1][reg2]=
+ gprobp[res1][res2][dist];
+ }
+ }
+}
+
+/**********************************************************************/
+
+
+cut
+
+void estimate_database_probs(int tab,
+ long freqs[AANUM][POSNUM], long totals[POSNUM],
+ double many_pprobs[MAX_TABLE_NUMBER][NUM_RES_TYPE][POSNUM],
+ long freqp[AANUM][AANUM][POSNUM][POSNUM], long totalp[POSNUM][POSNUM],
+ double many_pprobp[MAX_TABLE_NUMBER]
+ [NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
+ int ProlineFreeWin, int which_priors, int which_priorp,
+ double prior_freq_single, double prior_freq_pair,
+ int good_turing_fixed_disc, int structural_pos[POSNUM+1])
+{
+ extern double SCALE0;
+ extern double scale0s[MAX_TABLE_NUMBER],scale0p[MAX_TABLE_NUMBER];
+ int dimers_for_zero_freqs=0, dimers_for_zero_freqp=0;
+
+
+/** which_prior = -2 is a signal that only want to add prior freq to **/
+/** events which occur fewer times in trimer table than would expect **/
+/** using dimer prob of events (so the events that are likely undersampled **/
+
+ if (tab==1) {
+ if (which_priors==-1)
+ dimers_for_zero_freqs=1;
+ else if (which_priors == -2)
+ dimers_for_zero_freqs = 2;
+ if (which_priorp==-1)
+ dimers_for_zero_freqp=1;
+ else if (which_priorp == -2)
+ dimers_for_zero_freqp = 2;
+ }
+
+
+
+ SCALE0= scale0s[tab];
+ if (good_turing_fixed_disc==1) /** Do good_turing **/
+ calcpprobs_good_turing(freqs, totals, many_pprobs[tab], ProlineFreeWin,
+ structural_pos);
+ else if (good_turing_fixed_disc ==2)
+ /*Do fixed discount with SCALE0 as discount. **/
+ calcpprobs_fixed_disc(freqs, totals, many_pprobs[tab], ProlineFreeWin,
+ structural_pos);
+ else if (good_turing_fixed_disc ==3) /** Do modified good_turing. **/
+ calcpprobs_mod_good_turing(freqs,totals, many_pprobs[tab], ProlineFreeWin,
+ structural_pos);
+ else if ((which_priors <= 0) || (tab !=1)) /* 0 means do old prob stuff. **/
+ calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin,
+ dimers_for_zero_freqs, many_pprobs[0], prior_freq_single,
+ structural_pos);
+ else
+ calc_posterior_pprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin,
+ many_pprobs[0], which_priors, structural_pos);
+
+
+
+ SCALE0= scale0p[tab];
+ if (good_turing_fixed_disc==1)
+ calcpprobp_good_turing(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
+ ProlineFreeWin, which_priorp, structural_pos);
+ else if (good_turing_fixed_disc ==2) /* Do fixed_disc with discount SCALE0 */
+ calcpprobp_fixed_disc(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
+ ProlineFreeWin, which_priorp, structural_pos);
+ else if (good_turing_fixed_disc ==3) /** Do modified good_turing. **/
+ calcpprobp_mod_good_turing(freqp, totalp, many_pprobs[tab],
+ many_pprobp[tab],ProlineFreeWin, which_priorp,
+ structural_pos);
+
+ else if ((which_priorp <= 0) || (tab != 1))
+ calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
+ ProlineFreeWin, dimers_for_zero_freqp, many_pprobp[0],
+ prior_freq_pair, structural_pos);
+ else
+ calc_posterior_pprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
+ ProlineFreeWin, many_pprobp[0], which_priorp,
+ structural_pos);
+
+
+}
+
+
+
+
+
+
+changed
+int calcpprobs(long freqs[AANUM][POSNUM], long totals[POSNUM],
+ double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin,
+ double dimer_probs[NUM_RES_TYPE][POSNUM])
+
+to
+int calcpprobs(long freqs[AANUM][POSNUM], long totals[POSNUM],
+ double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin)
+and from that function cut:
+
+prior_freq_single=0, int prior_check_dimers=0;
+and replaced
+
+ while ( (k = structural_pos[pos_array_entry1]) != -1) {
+ pos_array_entry1++;
+with
+ for (k = 0; k < POSNUM; k++) {
+
+and replaced
+ if (prior_freq_single >= 0) prior_freq = prior_freq_single;
+ else prior_freq = -prior_freq_single * exp(dimer_probs[i][k]);
+ /** The maginitude of prior_freq_single scales things. **/
+ /** Since dimer_probs is a prob. function, we are adding a **/
+ /** TOTAL of -prior_freq_single to the frequency. **/
+ /** That all gets normalized when divide by mass. **/
+by prior_freq =0;
+
+
+
+in calcpprobp()
+cut
+int pos_array_entry1=0;
+ int pos_array_entry2=0;
+
+and replaced
+
+ while ( (k = structural_pos[pos_array_entry1]) != -1) {
+ pos_array_entry1++;
+ pos_array_entry2=0;
+ while ( (l = structural_pos[pos_array_entry2]) != -1) {
+ pos_array_entry2++;
+
+with
+
+ for (k = 0; k < POSNUM; k++)
+ for (l = 0; l < POSNUM; l++) {
+
+
+cut
+double prior_freq_pair=0;
+
+replaced
+ if (prior_freq_pair >= 0) prior_freq = prior_freq_pair;
+ else prior_freq = -prior_freq_pair * exp(dimer_probp[i][j][k][l]);
+ /** The maginitude of prior_freq_single scales things. **/
+ /** Since dimer_probp is a prob. function, we are adding a **/
+ /** TOTAL of -prior_freq_single to the frequency. **/
+ /** That all gets normalized when divide by mass. **/
+
+by
+prior_freq =0;
+
+
+
+CUT
+
+void account_for_single_under_sampling(double *predictedfreq)
+{
+ if (*predictedfreq<=.2){
+ /* fprintf(stderr,"\nAccounted for a 0 freq that was %lf",*predictedfreq); */ *predictedfreq = .2;
+ }
+}
+
+
+int account_for_pair_under_sampling(double *predictedfreq)
+{
+ if (*predictedfreq<=.2) {
+ *predictedfreq = .2;
+ return(1);
+ }
+ else return(0);
+
+}
+
+
+
+int calc_weighted_pprobs(long freqs[AANUM][POSNUM], long totals[POSNUM],
+ double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin,
+ double weights[AANUM][POSNUM],
+ double gprobs[NUM_RES_TYPE],
+ double over_undersample[AANUM][POSNUM])
+{
+ int i, k;
+ long tal=0; /* total num of aa's in a given pos */
+ double mass; /* prob mass for a given pos */
+ double predicted_freq[AANUM][POSNUM];
+ double sample_error;
+
+ /* Calc probs for each aa i in each position k */
+ for (k = 0; k < POSNUM; k++) {
+ tal = totals[k];
+ mass = 0;
+ for (i = 0; i < AANUM; i++)
+ if (ProlineFreeWin && (i == Proline))
+ probs[i][k] = LOG0VAL; /* 0 out P's */
+ else {
+ if (over_undersample == NULL)
+ sample_error=1;
+ else sample_error = over_undersample[i][k];
+
+
+ predicted_freq[i][k] = (double)totals[k]*(1.0-weights[i][k])*
+ exp(gprobs[i]) +
+ weights[i][k]*(double)freqs[i][k]/sample_error;
+
+ account_for_single_under_sampling(&predicted_freq[i][k]);
+
+
+/* fprintf(stderr,"\n Predicted freq=%lf, seen freq=%ld", predicted_freq[i][k], freqs[i][k]);
+ if (predicted_freq[i][k]<0) fprintf(stderr,"\ni=%d,k=%d,Weight = %lf,totals=%ld, gprob=%lf, sample_error=%lf ", i,k,weights[i][k],totals[k],gprobs[i],sample_error);
+*/
+ if (predicted_freq[i][k]>0) {
+ mass += predicted_freq[i][k];
+/* fprintf(stderr,"\nAdded to mass for %d, %d",i,k); */
+ }
+ }
+
+ /* Normalize all probs by the mass = sum of probs, and take log */
+ for (i = 0; i < AANUM; i++)
+ if ((i != Proline) || (!ProlineFreeWin))
+ if (predicted_freq[i][k]>0)
+ probs[i][k] = getdlog(predicted_freq[i][k]/mass);
+ else
+ probs[i][k] = LOG0VAL;
+ }
+
+ return 1;
+}
+
+int calc_weighted_pprobp(long freqp[AANUM][AANUM][POSNUM][POSNUM],
+ long totalp[POSNUM][POSNUM],
+ double probs[NUM_RES_TYPE][POSNUM],
+ double probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
+ int ProlineFreeWin, double weightp[AANUM][AANUM][POSNUM][POSNUM],
+ double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM],
+ double over_undersample[AANUM][AANUM][POSNUM][POSNUM])
+{
+ int i, j, k, l;
+ long tal=0; /* total num of aa's in given pair of positions */
+ double mass; /* prob mass for given pair of pos */
+ double predicted_freq[AANUM][AANUM][POSNUM][POSNUM];
+ double sample_error;
+ int dist;
+
+ /* Calc probp for each aa pair in each pair of positions k,l */
+ for (k = 0; k < POSNUM; k++)
+ for (l = 0; l < POSNUM; l++) {
+ tal = totalp[k][l];
+ mass = 0;
+ for (i = 0; i < AANUM; i++)
+ for (j = 0; j < AANUM; j++)
+ if (ProlineFreeWin && (i == Proline || j == Proline))
+ probp[i][j][k][l] = LOG0VAL; /* 0 out pairs with P in either*/
+ /* position */
+ else {
+ if (over_undersample == NULL)
+ sample_error =1;
+ else sample_error = over_undersample[i][j][k][l];
+ dist = (l-k) %POSNUM;
+
+ predicted_freq[i][j][k][l] = (double)totalp[k][l]*
+ (1-weightp[i][j][k][l])*exp(gprobp[i][j][dist])
+ + weightp[i][j][k][l]*
+ (double)freqp[i][j][k][l]/sample_error;
+
+ account_for_pair_under_sampling(&predicted_freq[i][j][k][l]);
+/**********
+ fprintf(stderr," 0 freq changed to %lf for %c,%c,%c,%c",
+ predicted_freq[i][j][k][l], 'a'+k,'a'+l,numaa(i),
+ numaa(j));
+**********/
+
+ if (predicted_freq[i][j][k][l]>0)
+ mass += predicted_freq[i][j][k][l];
+ }
+
+ /* Normalize all probp by the mass = sum of probp, and take log */
+ for (i = 0; i < AANUM; i++)
+ for (j = 0; j < AANUM; j++)
+ if ( (i != Proline && j != Proline) || !ProlineFreeWin)
+ if (predicted_freq[i][j][k][l]>0)
+ probp[i][j][k][l] = getdlog(predicted_freq[i][j][k][l] / mass);
+ else
+ probp[i][j][k][l] = LOG0VAL;
+ }
+}