10 /** The following are defined in scscore.c ***/
11 extern double gprobs[NUM_RES_TYPE], gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM];
13 /*** #define SCALE0 5 ***/
14 /*** Now an extern variable ***/ /* scaling value for 0 probs */
16 extern double gprobs[NUM_RES_TYPE], gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM];
18 /* Takes the log of a double */
19 double getdlog(double num)
27 /* Takes the log of a long int */
28 double getlog(long num)
31 return log((double) num);
38 /**********************************************************************/
41 void estimate_database_probs2(int tab,
42 long freqs[AANUM][POSNUM], long totals[POSNUM],
43 double many_pprobs[MAX_TABLE_NUMBER][NUM_RES_TYPE][POSNUM],
44 long freqp[AANUM][AANUM][POSNUM][POSNUM], long totalp[POSNUM][POSNUM],
45 double many_pprobp[MAX_TABLE_NUMBER]
46 [NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
50 extern double scale0s[MAX_TABLE_NUMBER],scale0p[MAX_TABLE_NUMBER];
54 calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin);
57 calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab],
64 /*************************************************************************/
66 /****************************************************************************/
67 /** If the file fgin == NULL so is not set in config file, use uniform dist. */
69 int setgprob_to_uniform(double gprobs[NUM_RES_TYPE],
70 double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM])
74 for (res1=0; res1<AANUM; res1++) {
75 gprobs[res1] = 1/AANUM;
77 for (res2=0; res2<AANUM; res2++)
78 for(dist=0; dist<POSNUM; dist++)
79 gprobp[res1][res2][dist] = 1/(AANUM*AANUM);
83 /****************************************************************************/
86 /* Calculates log of genbnk probabilities for a residue. */
87 int calcgprobs(long freqs[AANUM], long totals,
88 double probs[NUM_RES_TYPE])
94 for (i = 0; i < AANUM; i++) {
95 probs[i] = getlog(freqs[i]) - tal;
103 /* Calculates log of genbnk probabilities of pairs of residues at distance k.*/
104 int calcgprobp(long freqp[AANUM][AANUM][POSNUM],
106 double probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM])
111 for (k = 0; k < POSNUM; k++) {
112 tal = getlog(totalp[k]);
113 for (i = 0; i < AANUM; i++)
114 for (j = 0; j < AANUM; j++)
115 probp[i][j][k] = getlog(freqp[i][j][k]) - tal;
119 /*****************************************************************************/
121 int calc_weird_gprob_sum(double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM], double gprobs[NUM_RES_TYPE])
125 gprobs[Glutamix] = gprobs[Glutamine] + gprobs[GlutamicAcid];
126 gprobs[Asparagix] = gprobs[Asparagine] + gprobs[AsparticAcid];
127 gprobs[AnyRes] = 1/AANUM; /* AnyRes means amino acid unknown.... */
130 for (pos =0; pos<POSNUM; pos++) {
131 for (res =0; res<=AANUM; res++) {
132 gprobp[res][AnyRes][pos] = gprobs[res]; /* AnyRes means res unknown */
133 gprobp[AnyRes][res][pos] = gprobs[res]; /* so this makes sense. */
135 gprobp[Glutamix][res][pos] = gprobp[Glutamine][res][pos]
136 + gprobp[GlutamicAcid][res][pos];
138 gprobp[res][Glutamix][pos] = gprobp[res][Glutamine][pos]
139 + gprobp[res][GlutamicAcid][pos];
141 gprobp[Asparagix][res][pos] = gprobp[Asparagine][res][pos]
142 + gprobp[AsparticAcid][res][pos];
144 gprobp[res][Asparagix][pos] = gprobp[res][Asparagine][pos]
145 + gprobp[res][AsparticAcid][pos];
150 gprobp[Glutamix][Glutamix][pos] = gprobp[Glutamix][Glutamine][pos] +
151 gprobp[Glutamix][GlutamicAcid][pos];
152 gprobp[Asparagix][Asparagix][pos] = gprobp[Asparagix][Asparagine][pos] +
153 gprobp[Asparagix][AsparticAcid][pos];
154 gprobp[Glutamix][Asparagix][pos] = gprobp[Glutamix][Asparagine][pos] +
155 gprobp[Glutamix][AsparticAcid][pos];
156 gprobp[Asparagix][Glutamix][pos] = gprobp[Asparagix][Glutamine][pos] +
157 gprobp[Asparagix][GlutamicAcid][pos];
162 int calc_weird_gprob_avg(double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM], double gprobs[NUM_RES_TYPE])
167 gprobs[Glutamix] = getdlog(exp(gprobs[Glutamine]) +
168 exp(gprobs[GlutamicAcid])/2);
169 gprobs[Asparagix] = getdlog(exp(gprobs[Asparagine]) +
170 exp(gprobs[AsparticAcid])/2);
171 gprobs[AnyRes] = 1/AANUM; /* AnyRes means amino acid unknown.... */
175 for (pos =0; pos<POSNUM; pos++) {
176 for (res =0; res<=AANUM; res++) {
177 gprobp[res][AnyRes][pos] = gprobs[res]; /* AnyRes means res unknown */
178 gprobp[AnyRes][res][pos] = gprobs[res]; /* so this makes sense. */
180 gprobp[Glutamix][res][pos] = getdlog(exp(gprobp[Glutamine][res][pos])
181 + exp(gprobp[GlutamicAcid][res][pos])/2);
183 gprobp[res][Glutamix][pos] = getdlog(exp(gprobp[res][Glutamine][pos])
184 + exp(gprobp[res][GlutamicAcid][pos])/2);
186 gprobp[Asparagix][res][pos] = getdlog(exp(gprobp[Asparagine][res][pos])
187 + exp(gprobp[AsparticAcid][res][pos])/2);
189 gprobp[res][Asparagix][pos] = getdlog(exp(gprobp[res][Asparagine][pos])
190 + exp(gprobp[res][AsparticAcid][pos])/2);
197 gprobp[Glutamix][Glutamix][pos] =
198 getdlog(exp(gprobp[Glutamix][Glutamine][pos]) +
199 exp(gprobp[Glutamix][GlutamicAcid][pos])/2);
200 gprobp[Asparagix][Asparagix][pos] =
201 getdlog(exp(gprobp[Asparagix][Asparagine][pos]) +
202 exp(gprobp[Asparagix][AsparticAcid][pos])/2);
203 gprobp[Glutamix][Asparagix][pos] =
204 getdlog(exp(gprobp[Glutamix][Asparagine][pos]) +
205 exp(gprobp[Glutamix][AsparticAcid][pos])/2);
206 gprobp[Asparagix][Glutamix][pos] =
207 getdlog(exp(gprobp[Asparagix][Glutamine][pos]) +
208 exp(gprobp[Asparagix][GlutamicAcid][pos])/2);
215 /****************************************************************************/
219 /****************************************************************************/
220 /* Calculates table of logs of single residue probabilities in position k */
221 /* in the positives. The probability is initially set naiviely to */
222 /* freq(Res)(k)/total(k). Then probability mass is redistributed to the */
223 /* zero frequency residues by setting their probability to 1/5*total(k). */
224 /* Note that this is the same thing as assuming that due to finite sampling,*/
225 /* a zero frequency residue really has frequency 1/5. (So this probability */
226 /* is at most 1/5 of the minimum nonzero probability). Prolines are */
227 /* treated as actual zeros since they are known coil breakers. */
228 /* The probabilities are then normalized. */
230 int calcpprobs(long freqs[AANUM][POSNUM], long totals[POSNUM],
231 double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin)
234 long tal=0; /* total num of aa's in a given pos */
235 double mass; /* prob mass for a given pos */
237 int pos_array_entry1=0;
240 /* Calc probs for each aa i in each position k */
241 for (k = 0; k < POSNUM; k++) {
244 for (i = 0; i < AANUM; i++) {
247 if (ProlineFreeWin && (i == Proline))
248 probs[i][k] = LOG0VAL; /* 0 out P's */
249 else if (freqs[i][k])
250 /* Handle the non-zero freq. naively. */
251 mass += probs[i][k] = ( prior_freq + (double) freqs[i][k]) / tal;
252 else { /* Now handle the freq 0 case specially. */
254 tmp1 = (prior_freq + 1.0 /SCALE0)/tal; /* 1/5total(k) */
255 tmp2 = 1.0/AANUM; /* Pretty much useless here
256 but useful for analogy in
257 computation of pair prob. */
258 /* Take pr[i][k] to be min(tmp1, tmp2)--- usually tmp1.*/
259 mass += probs[i][k] = MIN(tmp1,tmp2);
262 /* Normalize all probs by the mass = sum of probs, and take log */
263 for (i = 0; i < AANUM; i++)
264 if ((i != Proline) || (!ProlineFreeWin))
265 probs[i][k] = getdlog(probs[i][k]/mass);
272 /***************************************************************/
275 /* Calculates table of the logs of the probs of pairs in the positives. */
276 /* If the frequency is nonzero, just naively compute freq/total(k,l). */
277 /* For zero freq., deal with them analagously to zero freq. singles, so their*/
278 /* final probability is no more than 1/5 of the smallest non-zero prob. */
279 /* To do this Pr(r1,r2,k,l)= min{1/400, 1/5total(k,l), Pr(r1,k)*Pr(r2,l)}, */
280 /* where the last term is an upper bound acheived if the two residue,position*/
281 /* were independent. */
282 int calcpprobp(long freqp[AANUM][AANUM][POSNUM][POSNUM],
283 long totalp[POSNUM][POSNUM],
284 double probs[NUM_RES_TYPE][POSNUM],
285 double probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM],
289 long tal=0; /* total num of aa's in given pair of positions */
290 double mass; /* prob mass for given pair of pos */
294 /* Calc probp for each aa pair in each pair of positions k,l */
296 for (k = 0; k < POSNUM; k++)
297 for (l = 0; l < POSNUM; l++) {
300 for (i = 0; i < AANUM; i++)
301 for (j = 0; j < AANUM; j++) {
304 if (ProlineFreeWin && (i == Proline || j == Proline))
305 probp[i][j][k][l] = LOG0VAL; /* 0 out pairs with P in either*/
307 else if (freqp[i][j][k][l])
308 mass += probp[i][j][k][l] =
309 (prior_freq + (double) freqp[i][j][k][l]) / tal;
310 else { /* freq 0 pairs are handled as min of 3 quanitities. */
311 double tmp1, tmp2, tmp3, tmp4;
312 tmp1 = (prior_freq + 1.0 /SCALE0) / tal;
313 /* 1/5total(res1,res2,k,l) */
314 tmp2 = 1.0/(AANUM*AANUM); /* 1/400 */
315 tmp3 = exp(probs[i][k]) * exp(probs[j][l]);
316 /* product of single probs Pr(r1,k)*Pr(r2,l)*/
318 /* Take min(tmp1, tmp2, tmp3), 3 upper bounds on probp */
319 tmp4 = MIN(tmp1,tmp2);
320 mass += probp[i][j][k][l] = MIN(tmp3,tmp4);
324 /* Normalize all probp by the mass = sum of probp, and take log */
325 for (i = 0; i < AANUM; i++)
326 for (j = 0; j < AANUM; j++)
327 if ( (i != Proline && j != Proline) || !ProlineFreeWin)
328 probp[i][j][k][l] = getdlog(probp[i][j][k][l] / mass);
336 /****************************************************************************/
337 int calc_weird_pprob_sum(double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM], double pprobs[NUM_RES_TYPE][POSNUM])
342 for (pos1 =0 ; pos1<POSNUM; pos1++) {
343 pprobs[Glutamix][pos1] =
344 pprobs[Glutamine][pos1] + pprobs[GlutamicAcid][pos1];
345 pprobs[Asparagix][pos1] =
346 pprobs[Asparagine][pos1] + pprobs[AsparticAcid][pos1];
348 pprobs[AnyRes][pos1] = 1/AANUM; /* AnyRes means amino acid unknown.... */
354 for (pos1 =0; pos1<POSNUM; pos1++) {
355 for (pos2 =0; pos2<POSNUM; pos2++) {
357 for (res =0; res<=AANUM; res++) {
358 pprobp[res][AnyRes][pos1][pos2] =
359 pprobs[res][pos1]; /* AnyRes means res unknown */
360 pprobp[AnyRes][res][pos1][pos2] =
361 pprobs[res][pos2]; /* so this makes sense. */
363 pprobp[Glutamix][res][pos1][pos2] = pprobp[Glutamine][res][pos1][pos2]
364 + pprobp[GlutamicAcid][res][pos1][pos2];
366 pprobp[res][Glutamix][pos1][pos2] = pprobp[res][Glutamine][pos1][pos2]
367 + pprobp[res][GlutamicAcid][pos1][pos2];
369 pprobp[Asparagix][res][pos1][pos2]= pprobp[Asparagine][res][pos1][pos2]
370 + pprobp[AsparticAcid][res][pos1][pos2];
372 pprobp[res][Asparagix][pos1][pos2] =pprobp[res][Asparagine][pos1][pos2]
373 + pprobp[res][AsparticAcid][pos1][pos2];
378 pprobp[Glutamix][Glutamix][pos1][pos2] =
379 pprobp[Glutamix][Glutamine][pos1][pos2] +
380 pprobp[Glutamix][GlutamicAcid][pos1][pos2];
381 pprobp[Asparagix][Asparagix][pos1][pos2] =
382 pprobp[Asparagix][Asparagine][pos1][pos2] +
383 pprobp[Asparagix][AsparticAcid][pos1][pos2];
384 pprobp[Glutamix][Asparagix][pos1][pos2] =
385 pprobp[Glutamix][Asparagine][pos1][pos2] +
386 pprobp[Glutamix][AsparticAcid][pos1][pos2];
387 pprobp[Asparagix][Glutamix][pos1][pos2] =
388 pprobp[Asparagix][Glutamine][pos1][pos2] +
389 pprobp[Asparagix][GlutamicAcid][pos1][pos2];
400 int calc_weird_pprob_avg(double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM], double pprobs[NUM_RES_TYPE][POSNUM])
405 for (pos1 =0 ; pos1<POSNUM; pos1++) {
406 pprobs[Glutamix][pos1] =
407 getdlog(exp(pprobs[Glutamine][pos1]) +
408 exp(pprobs[GlutamicAcid][pos1])/2);
409 pprobs[Asparagix][pos1] =
410 getdlog(exp(pprobs[Asparagine][pos1]) +
411 exp(pprobs[AsparticAcid][pos1])/2);
413 pprobs[AnyRes][pos1] = 1/AANUM; /* AnyRes means amino acid unknown.... */
419 for (pos1 =0; pos1<POSNUM; pos1++) {
420 for (pos2 =0; pos2<POSNUM; pos2++) {
422 for (res =0; res<=AANUM; res++) {
423 pprobp[res][AnyRes][pos1][pos2] =
424 pprobs[res][pos1]; /* AnyRes means res unknown */
425 pprobp[AnyRes][res][pos1][pos2] =
426 pprobs[res][pos2]; /* so this makes sense. */
428 pprobp[Glutamix][res][pos1][pos2] =
429 getdlog(exp(pprobp[Glutamine][res][pos1][pos2])
430 + exp(pprobp[GlutamicAcid][res][pos1][pos2])/2);
432 pprobp[res][Glutamix][pos1][pos2] =
433 getdlog(exp(pprobp[res][Glutamine][pos1][pos2])
434 + exp(pprobp[res][GlutamicAcid][pos1][pos2])/2);
436 pprobp[Asparagix][res][pos1][pos2]=
437 getdlog(exp(pprobp[Asparagine][res][pos1][pos2])
438 + exp(pprobp[AsparticAcid][res][pos1][pos2])/2);
440 pprobp[res][Asparagix][pos1][pos2]=
441 getdlog(exp(pprobp[res][Asparagine][pos1][pos2])
442 + exp(pprobp[res][AsparticAcid][pos1][pos2])/2);
447 pprobp[Glutamix][Glutamix][pos1][pos2] =
448 getdlog(exp(pprobp[Glutamix][Glutamine][pos1][pos2]) +
449 exp(pprobp[Glutamix][GlutamicAcid][pos1][pos2])/2);
450 pprobp[Asparagix][Asparagix][pos1][pos2] =
451 getdlog(exp(pprobp[Asparagix][Asparagine][pos1][pos2]) +
452 exp(pprobp[Asparagix][AsparticAcid][pos1][pos2])/2);
453 pprobp[Glutamix][Asparagix][pos1][pos2] =
454 getdlog(exp(pprobp[Glutamix][Asparagine][pos1][pos2]) +
455 exp(pprobp[Glutamix][AsparticAcid][pos1][pos2])/2);
456 pprobp[Asparagix][Glutamix][pos1][pos2] =
457 getdlog(exp(pprobp[Asparagix][Glutamine][pos1][pos2]) +
458 exp(pprobp[Asparagix][GlutamicAcid][pos1][pos2])/2);