X-Git-Url: http://source.jalview.org/gitweb/?p=jpred.git;a=blobdiff_plain;f=sources%2Fmulticoil%2Festimate_probabilities_cuts.c;fp=sources%2Fmulticoil%2Festimate_probabilities_cuts.c;h=c07a59c98e6df625b648a4c73661b81767339745;hp=0000000000000000000000000000000000000000;hb=9aa768067094f24f46f273077f867348e6143711;hpb=eb3001dc41bf6cd46e20fd13fe3efbe9dedf6013 diff --git a/sources/multicoil/estimate_probabilities_cuts.c b/sources/multicoil/estimate_probabilities_cuts.c new file mode 100644 index 0000000..c07a59c --- /dev/null +++ b/sources/multicoil/estimate_probabilities_cuts.c @@ -0,0 +1,357 @@ +changed + calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin, + dimers_for_zero_freqs, many_pprobs[0], prior_freq_single, + structural_pos); +to + calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin, + many_pprobs[0]); +and + + calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab], + ProlineFreeWin, dimers_for_zero_freqp, many_pprobp[0], + prior_freq_pair, structural_pos); +to + calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab], + ProlineFreeWin); + +cut all the other calcpprob_... stuff (i.e. good_turing, priors,etc.) + + + +Cut all this junk out of estimate_database_probs also. + + + +/************************ Second Round ********************************/ +Cut +/** The following are defined in scscore.c ***/ +extern double prior_probs[NUM_RES_TYPE][POSNUM], + prior_probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM]; + + + +int in_list(int reg,int structural_pos_list[POSNUM+1]) +{ + int i=0; + + while (1) { + if (structural_pos_list[i] == -1) return 0; /*End of list and not found */ + else if (reg == structural_pos_list[i]) return 1; + else i++; + } + +} + + +void set_unimportant_pos_to_genbnk(double pprobs[NUM_RES_TYPE][POSNUM], + double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM], + double gprobs[NUM_RES_TYPE], + double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM], + int structural_pos[POSNUM+1] ) +{ + int reg1=0, reg2=0, res1, res2, dist; + int pos_array_entry1=0; + int pos_array_entry2=0; + double prob1, prob2; + + for (reg1=0; reg1 < POSNUM; reg1++) { + if (!in_list(reg1, structural_pos)) + for (res1=0; res1= 0) prior_freq = prior_freq_single; + else prior_freq = -prior_freq_single * exp(dimer_probs[i][k]); + /** The maginitude of prior_freq_single scales things. **/ + /** Since dimer_probs is a prob. function, we are adding a **/ + /** TOTAL of -prior_freq_single to the frequency. **/ + /** That all gets normalized when divide by mass. **/ +by prior_freq =0; + + + +in calcpprobp() +cut +int pos_array_entry1=0; + int pos_array_entry2=0; + +and replaced + + while ( (k = structural_pos[pos_array_entry1]) != -1) { + pos_array_entry1++; + pos_array_entry2=0; + while ( (l = structural_pos[pos_array_entry2]) != -1) { + pos_array_entry2++; + +with + + for (k = 0; k < POSNUM; k++) + for (l = 0; l < POSNUM; l++) { + + +cut +double prior_freq_pair=0; + +replaced + if (prior_freq_pair >= 0) prior_freq = prior_freq_pair; + else prior_freq = -prior_freq_pair * exp(dimer_probp[i][j][k][l]); + /** The maginitude of prior_freq_single scales things. **/ + /** Since dimer_probp is a prob. function, we are adding a **/ + /** TOTAL of -prior_freq_single to the frequency. **/ + /** That all gets normalized when divide by mass. **/ + +by +prior_freq =0; + + + +CUT + +void account_for_single_under_sampling(double *predictedfreq) +{ + if (*predictedfreq<=.2){ + /* fprintf(stderr,"\nAccounted for a 0 freq that was %lf",*predictedfreq); */ *predictedfreq = .2; + } +} + + +int account_for_pair_under_sampling(double *predictedfreq) +{ + if (*predictedfreq<=.2) { + *predictedfreq = .2; + return(1); + } + else return(0); + +} + + + +int calc_weighted_pprobs(long freqs[AANUM][POSNUM], long totals[POSNUM], + double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin, + double weights[AANUM][POSNUM], + double gprobs[NUM_RES_TYPE], + double over_undersample[AANUM][POSNUM]) +{ + int i, k; + long tal=0; /* total num of aa's in a given pos */ + double mass; /* prob mass for a given pos */ + double predicted_freq[AANUM][POSNUM]; + double sample_error; + + /* Calc probs for each aa i in each position k */ + for (k = 0; k < POSNUM; k++) { + tal = totals[k]; + mass = 0; + for (i = 0; i < AANUM; i++) + if (ProlineFreeWin && (i == Proline)) + probs[i][k] = LOG0VAL; /* 0 out P's */ + else { + if (over_undersample == NULL) + sample_error=1; + else sample_error = over_undersample[i][k]; + + + predicted_freq[i][k] = (double)totals[k]*(1.0-weights[i][k])* + exp(gprobs[i]) + + weights[i][k]*(double)freqs[i][k]/sample_error; + + account_for_single_under_sampling(&predicted_freq[i][k]); + + +/* fprintf(stderr,"\n Predicted freq=%lf, seen freq=%ld", predicted_freq[i][k], freqs[i][k]); + if (predicted_freq[i][k]<0) fprintf(stderr,"\ni=%d,k=%d,Weight = %lf,totals=%ld, gprob=%lf, sample_error=%lf ", i,k,weights[i][k],totals[k],gprobs[i],sample_error); +*/ + if (predicted_freq[i][k]>0) { + mass += predicted_freq[i][k]; +/* fprintf(stderr,"\nAdded to mass for %d, %d",i,k); */ + } + } + + /* Normalize all probs by the mass = sum of probs, and take log */ + for (i = 0; i < AANUM; i++) + if ((i != Proline) || (!ProlineFreeWin)) + if (predicted_freq[i][k]>0) + probs[i][k] = getdlog(predicted_freq[i][k]/mass); + else + probs[i][k] = LOG0VAL; + } + + return 1; +} + +int calc_weighted_pprobp(long freqp[AANUM][AANUM][POSNUM][POSNUM], + long totalp[POSNUM][POSNUM], + double probs[NUM_RES_TYPE][POSNUM], + double probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM], + int ProlineFreeWin, double weightp[AANUM][AANUM][POSNUM][POSNUM], + double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM], + double over_undersample[AANUM][AANUM][POSNUM][POSNUM]) +{ + int i, j, k, l; + long tal=0; /* total num of aa's in given pair of positions */ + double mass; /* prob mass for given pair of pos */ + double predicted_freq[AANUM][AANUM][POSNUM][POSNUM]; + double sample_error; + int dist; + + /* Calc probp for each aa pair in each pair of positions k,l */ + for (k = 0; k < POSNUM; k++) + for (l = 0; l < POSNUM; l++) { + tal = totalp[k][l]; + mass = 0; + for (i = 0; i < AANUM; i++) + for (j = 0; j < AANUM; j++) + if (ProlineFreeWin && (i == Proline || j == Proline)) + probp[i][j][k][l] = LOG0VAL; /* 0 out pairs with P in either*/ + /* position */ + else { + if (over_undersample == NULL) + sample_error =1; + else sample_error = over_undersample[i][j][k][l]; + dist = (l-k) %POSNUM; + + predicted_freq[i][j][k][l] = (double)totalp[k][l]* + (1-weightp[i][j][k][l])*exp(gprobp[i][j][dist]) + + weightp[i][j][k][l]* + (double)freqp[i][j][k][l]/sample_error; + + account_for_pair_under_sampling(&predicted_freq[i][j][k][l]); +/********** + fprintf(stderr," 0 freq changed to %lf for %c,%c,%c,%c", + predicted_freq[i][j][k][l], 'a'+k,'a'+l,numaa(i), + numaa(j)); +**********/ + + if (predicted_freq[i][j][k][l]>0) + mass += predicted_freq[i][j][k][l]; + } + + /* Normalize all probp by the mass = sum of probp, and take log */ + for (i = 0; i < AANUM; i++) + for (j = 0; j < AANUM; j++) + if ( (i != Proline && j != Proline) || !ProlineFreeWin) + if (predicted_freq[i][j][k][l]>0) + probp[i][j][k][l] = getdlog(predicted_freq[i][j][k][l] / mass); + else + probp[i][j][k][l] = LOG0VAL; + } +}