2 calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin,
3 dimers_for_zero_freqs, many_pprobs[0], prior_freq_single,
6 calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin,
10 calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
11 ProlineFreeWin, dimers_for_zero_freqp, many_pprobp[0],
12 prior_freq_pair, structural_pos);
14 calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
17 cut all the other calcpprob_... stuff (i.e. good_turing, priors,etc.)
21 Cut all this junk out of estimate_database_probs also.
25 /************************ Second Round ********************************/
27 /** The following are defined in scscore.c ***/
28 extern double prior_probs[NUM_RES_TYPE][POSNUM],
29 prior_probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM];
33 int in_list(int reg,int structural_pos_list[POSNUM+1])
38 if (structural_pos_list[i] == -1) return 0; /*End of list and not found */
39 else if (reg == structural_pos_list[i]) return 1;
46 void set_unimportant_pos_to_genbnk(double pprobs[NUM_RES_TYPE][POSNUM],
47 double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
48 double gprobs[NUM_RES_TYPE],
49 double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM],
50 int structural_pos[POSNUM+1] )
52 int reg1=0, reg2=0, res1, res2, dist;
53 int pos_array_entry1=0;
54 int pos_array_entry2=0;
57 for (reg1=0; reg1 < POSNUM; reg1++) {
58 if (!in_list(reg1, structural_pos))
59 for (res1=0; res1<NUM_RES_TYPE; res1++)
60 pprobs[res1][reg1] = gprobs[res1];
62 for (reg2=0; reg2<POSNUM; reg2++)
63 if (!in_list(reg1, structural_pos) ||
64 !in_list(reg2, structural_pos)) {
65 dist = (reg2-reg1-1+POSNUM) %POSNUM;
67 for (res1=0; res1<NUM_RES_TYPE; res1++)
68 for (res2=0; res2<NUM_RES_TYPE; res2++)
69 pprobp[res1][res2][reg1][reg2]=
70 gprobp[res1][res2][dist];
75 /**********************************************************************/
80 void estimate_database_probs(int tab,
81 long freqs[AANUM][POSNUM], long totals[POSNUM],
82 double many_pprobs[MAX_TABLE_NUMBER][NUM_RES_TYPE][POSNUM],
83 long freqp[AANUM][AANUM][POSNUM][POSNUM], long totalp[POSNUM][POSNUM],
84 double many_pprobp[MAX_TABLE_NUMBER]
85 [NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
86 int ProlineFreeWin, int which_priors, int which_priorp,
87 double prior_freq_single, double prior_freq_pair,
88 int good_turing_fixed_disc, int structural_pos[POSNUM+1])
91 extern double scale0s[MAX_TABLE_NUMBER],scale0p[MAX_TABLE_NUMBER];
92 int dimers_for_zero_freqs=0, dimers_for_zero_freqp=0;
95 /** which_prior = -2 is a signal that only want to add prior freq to **/
96 /** events which occur fewer times in trimer table than would expect **/
97 /** using dimer prob of events (so the events that are likely undersampled **/
100 if (which_priors==-1)
101 dimers_for_zero_freqs=1;
102 else if (which_priors == -2)
103 dimers_for_zero_freqs = 2;
104 if (which_priorp==-1)
105 dimers_for_zero_freqp=1;
106 else if (which_priorp == -2)
107 dimers_for_zero_freqp = 2;
112 SCALE0= scale0s[tab];
113 if (good_turing_fixed_disc==1) /** Do good_turing **/
114 calcpprobs_good_turing(freqs, totals, many_pprobs[tab], ProlineFreeWin,
116 else if (good_turing_fixed_disc ==2)
117 /*Do fixed discount with SCALE0 as discount. **/
118 calcpprobs_fixed_disc(freqs, totals, many_pprobs[tab], ProlineFreeWin,
120 else if (good_turing_fixed_disc ==3) /** Do modified good_turing. **/
121 calcpprobs_mod_good_turing(freqs,totals, many_pprobs[tab], ProlineFreeWin,
123 else if ((which_priors <= 0) || (tab !=1)) /* 0 means do old prob stuff. **/
124 calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin,
125 dimers_for_zero_freqs, many_pprobs[0], prior_freq_single,
128 calc_posterior_pprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin,
129 many_pprobs[0], which_priors, structural_pos);
133 SCALE0= scale0p[tab];
134 if (good_turing_fixed_disc==1)
135 calcpprobp_good_turing(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
136 ProlineFreeWin, which_priorp, structural_pos);
137 else if (good_turing_fixed_disc ==2) /* Do fixed_disc with discount SCALE0 */
138 calcpprobp_fixed_disc(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
139 ProlineFreeWin, which_priorp, structural_pos);
140 else if (good_turing_fixed_disc ==3) /** Do modified good_turing. **/
141 calcpprobp_mod_good_turing(freqp, totalp, many_pprobs[tab],
142 many_pprobp[tab],ProlineFreeWin, which_priorp,
145 else if ((which_priorp <= 0) || (tab != 1))
146 calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
147 ProlineFreeWin, dimers_for_zero_freqp, many_pprobp[0],
148 prior_freq_pair, structural_pos);
150 calc_posterior_pprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
151 ProlineFreeWin, many_pprobp[0], which_priorp,
163 int calcpprobs(long freqs[AANUM][POSNUM], long totals[POSNUM],
164 double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin,
165 double dimer_probs[NUM_RES_TYPE][POSNUM])
168 int calcpprobs(long freqs[AANUM][POSNUM], long totals[POSNUM],
169 double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin)
170 and from that function cut:
172 prior_freq_single=0, int prior_check_dimers=0;
175 while ( (k = structural_pos[pos_array_entry1]) != -1) {
178 for (k = 0; k < POSNUM; k++) {
181 if (prior_freq_single >= 0) prior_freq = prior_freq_single;
182 else prior_freq = -prior_freq_single * exp(dimer_probs[i][k]);
183 /** The maginitude of prior_freq_single scales things. **/
184 /** Since dimer_probs is a prob. function, we are adding a **/
185 /** TOTAL of -prior_freq_single to the frequency. **/
186 /** That all gets normalized when divide by mass. **/
193 int pos_array_entry1=0;
194 int pos_array_entry2=0;
198 while ( (k = structural_pos[pos_array_entry1]) != -1) {
201 while ( (l = structural_pos[pos_array_entry2]) != -1) {
206 for (k = 0; k < POSNUM; k++)
207 for (l = 0; l < POSNUM; l++) {
211 double prior_freq_pair=0;
214 if (prior_freq_pair >= 0) prior_freq = prior_freq_pair;
215 else prior_freq = -prior_freq_pair * exp(dimer_probp[i][j][k][l]);
216 /** The maginitude of prior_freq_single scales things. **/
217 /** Since dimer_probp is a prob. function, we are adding a **/
218 /** TOTAL of -prior_freq_single to the frequency. **/
219 /** That all gets normalized when divide by mass. **/
228 void account_for_single_under_sampling(double *predictedfreq)
230 if (*predictedfreq<=.2){
231 /* fprintf(stderr,"\nAccounted for a 0 freq that was %lf",*predictedfreq); */ *predictedfreq = .2;
236 int account_for_pair_under_sampling(double *predictedfreq)
238 if (*predictedfreq<=.2) {
248 int calc_weighted_pprobs(long freqs[AANUM][POSNUM], long totals[POSNUM],
249 double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin,
250 double weights[AANUM][POSNUM],
251 double gprobs[NUM_RES_TYPE],
252 double over_undersample[AANUM][POSNUM])
255 long tal=0; /* total num of aa's in a given pos */
256 double mass; /* prob mass for a given pos */
257 double predicted_freq[AANUM][POSNUM];
260 /* Calc probs for each aa i in each position k */
261 for (k = 0; k < POSNUM; k++) {
264 for (i = 0; i < AANUM; i++)
265 if (ProlineFreeWin && (i == Proline))
266 probs[i][k] = LOG0VAL; /* 0 out P's */
268 if (over_undersample == NULL)
270 else sample_error = over_undersample[i][k];
273 predicted_freq[i][k] = (double)totals[k]*(1.0-weights[i][k])*
275 weights[i][k]*(double)freqs[i][k]/sample_error;
277 account_for_single_under_sampling(&predicted_freq[i][k]);
280 /* fprintf(stderr,"\n Predicted freq=%lf, seen freq=%ld", predicted_freq[i][k], freqs[i][k]);
281 if (predicted_freq[i][k]<0) fprintf(stderr,"\ni=%d,k=%d,Weight = %lf,totals=%ld, gprob=%lf, sample_error=%lf ", i,k,weights[i][k],totals[k],gprobs[i],sample_error);
283 if (predicted_freq[i][k]>0) {
284 mass += predicted_freq[i][k];
285 /* fprintf(stderr,"\nAdded to mass for %d, %d",i,k); */
289 /* Normalize all probs by the mass = sum of probs, and take log */
290 for (i = 0; i < AANUM; i++)
291 if ((i != Proline) || (!ProlineFreeWin))
292 if (predicted_freq[i][k]>0)
293 probs[i][k] = getdlog(predicted_freq[i][k]/mass);
295 probs[i][k] = LOG0VAL;
301 int calc_weighted_pprobp(long freqp[AANUM][AANUM][POSNUM][POSNUM],
302 long totalp[POSNUM][POSNUM],
303 double probs[NUM_RES_TYPE][POSNUM],
304 double probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
305 int ProlineFreeWin, double weightp[AANUM][AANUM][POSNUM][POSNUM],
306 double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM],
307 double over_undersample[AANUM][AANUM][POSNUM][POSNUM])
310 long tal=0; /* total num of aa's in given pair of positions */
311 double mass; /* prob mass for given pair of pos */
312 double predicted_freq[AANUM][AANUM][POSNUM][POSNUM];
316 /* Calc probp for each aa pair in each pair of positions k,l */
317 for (k = 0; k < POSNUM; k++)
318 for (l = 0; l < POSNUM; l++) {
321 for (i = 0; i < AANUM; i++)
322 for (j = 0; j < AANUM; j++)
323 if (ProlineFreeWin && (i == Proline || j == Proline))
324 probp[i][j][k][l] = LOG0VAL; /* 0 out pairs with P in either*/
327 if (over_undersample == NULL)
329 else sample_error = over_undersample[i][j][k][l];
330 dist = (l-k) %POSNUM;
332 predicted_freq[i][j][k][l] = (double)totalp[k][l]*
333 (1-weightp[i][j][k][l])*exp(gprobp[i][j][dist])
334 + weightp[i][j][k][l]*
335 (double)freqp[i][j][k][l]/sample_error;
337 account_for_pair_under_sampling(&predicted_freq[i][j][k][l]);
339 fprintf(stderr," 0 freq changed to %lf for %c,%c,%c,%c",
340 predicted_freq[i][j][k][l], 'a'+k,'a'+l,numaa(i),
344 if (predicted_freq[i][j][k][l]>0)
345 mass += predicted_freq[i][j][k][l];
348 /* Normalize all probp by the mass = sum of probp, and take log */
349 for (i = 0; i < AANUM; i++)
350 for (j = 0; j < AANUM; j++)
351 if ( (i != Proline && j != Proline) || !ProlineFreeWin)
352 if (predicted_freq[i][j][k][l]>0)
353 probp[i][j][k][l] = getdlog(predicted_freq[i][j][k][l] / mass);
355 probp[i][j][k][l] = LOG0VAL;