changed calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin, dimers_for_zero_freqs, many_pprobs[0], prior_freq_single, structural_pos); to calcpprobs(freqs, totals, many_pprobs[tab], ProlineFreeWin, many_pprobs[0]); and calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab], ProlineFreeWin, dimers_for_zero_freqp, many_pprobp[0], prior_freq_pair, structural_pos); to calcpprobp(freqp, totalp, many_pprobs[tab], many_pprobp[tab], ProlineFreeWin); cut all the other calcpprob_... stuff (i.e. good_turing, priors,etc.) Cut all this junk out of estimate_database_probs also. /************************ Second Round ********************************/ Cut /** The following are defined in scscore.c ***/ extern double prior_probs[NUM_RES_TYPE][POSNUM], prior_probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM]; int in_list(int reg,int structural_pos_list[POSNUM+1]) { int i=0; while (1) { if (structural_pos_list[i] == -1) return 0; /*End of list and not found */ else if (reg == structural_pos_list[i]) return 1; else i++; } } void set_unimportant_pos_to_genbnk(double pprobs[NUM_RES_TYPE][POSNUM], double pprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM], double gprobs[NUM_RES_TYPE], double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM], int structural_pos[POSNUM+1] ) { int reg1=0, reg2=0, res1, res2, dist; int pos_array_entry1=0; int pos_array_entry2=0; double prob1, prob2; for (reg1=0; reg1 < POSNUM; reg1++) { if (!in_list(reg1, structural_pos)) for (res1=0; res1= 0) prior_freq = prior_freq_single; else prior_freq = -prior_freq_single * exp(dimer_probs[i][k]); /** The maginitude of prior_freq_single scales things. **/ /** Since dimer_probs is a prob. function, we are adding a **/ /** TOTAL of -prior_freq_single to the frequency. **/ /** That all gets normalized when divide by mass. **/ by prior_freq =0; in calcpprobp() cut int pos_array_entry1=0; int pos_array_entry2=0; and replaced while ( (k = structural_pos[pos_array_entry1]) != -1) { pos_array_entry1++; pos_array_entry2=0; while ( (l = structural_pos[pos_array_entry2]) != -1) { pos_array_entry2++; with for (k = 0; k < POSNUM; k++) for (l = 0; l < POSNUM; l++) { cut double prior_freq_pair=0; replaced if (prior_freq_pair >= 0) prior_freq = prior_freq_pair; else prior_freq = -prior_freq_pair * exp(dimer_probp[i][j][k][l]); /** The maginitude of prior_freq_single scales things. **/ /** Since dimer_probp is a prob. function, we are adding a **/ /** TOTAL of -prior_freq_single to the frequency. **/ /** That all gets normalized when divide by mass. **/ by prior_freq =0; CUT void account_for_single_under_sampling(double *predictedfreq) { if (*predictedfreq<=.2){ /* fprintf(stderr,"\nAccounted for a 0 freq that was %lf",*predictedfreq); */ *predictedfreq = .2; } } int account_for_pair_under_sampling(double *predictedfreq) { if (*predictedfreq<=.2) { *predictedfreq = .2; return(1); } else return(0); } int calc_weighted_pprobs(long freqs[AANUM][POSNUM], long totals[POSNUM], double probs[NUM_RES_TYPE][POSNUM], int ProlineFreeWin, double weights[AANUM][POSNUM], double gprobs[NUM_RES_TYPE], double over_undersample[AANUM][POSNUM]) { int i, k; long tal=0; /* total num of aa's in a given pos */ double mass; /* prob mass for a given pos */ double predicted_freq[AANUM][POSNUM]; double sample_error; /* Calc probs for each aa i in each position k */ for (k = 0; k < POSNUM; k++) { tal = totals[k]; mass = 0; for (i = 0; i < AANUM; i++) if (ProlineFreeWin && (i == Proline)) probs[i][k] = LOG0VAL; /* 0 out P's */ else { if (over_undersample == NULL) sample_error=1; else sample_error = over_undersample[i][k]; predicted_freq[i][k] = (double)totals[k]*(1.0-weights[i][k])* exp(gprobs[i]) + weights[i][k]*(double)freqs[i][k]/sample_error; account_for_single_under_sampling(&predicted_freq[i][k]); /* fprintf(stderr,"\n Predicted freq=%lf, seen freq=%ld", predicted_freq[i][k], freqs[i][k]); if (predicted_freq[i][k]<0) fprintf(stderr,"\ni=%d,k=%d,Weight = %lf,totals=%ld, gprob=%lf, sample_error=%lf ", i,k,weights[i][k],totals[k],gprobs[i],sample_error); */ if (predicted_freq[i][k]>0) { mass += predicted_freq[i][k]; /* fprintf(stderr,"\nAdded to mass for %d, %d",i,k); */ } } /* Normalize all probs by the mass = sum of probs, and take log */ for (i = 0; i < AANUM; i++) if ((i != Proline) || (!ProlineFreeWin)) if (predicted_freq[i][k]>0) probs[i][k] = getdlog(predicted_freq[i][k]/mass); else probs[i][k] = LOG0VAL; } return 1; } int calc_weighted_pprobp(long freqp[AANUM][AANUM][POSNUM][POSNUM], long totalp[POSNUM][POSNUM], double probs[NUM_RES_TYPE][POSNUM], double probp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM], int ProlineFreeWin, double weightp[AANUM][AANUM][POSNUM][POSNUM], double gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM], double over_undersample[AANUM][AANUM][POSNUM][POSNUM]) { int i, j, k, l; long tal=0; /* total num of aa's in given pair of positions */ double mass; /* prob mass for given pair of pos */ double predicted_freq[AANUM][AANUM][POSNUM][POSNUM]; double sample_error; int dist; /* Calc probp for each aa pair in each pair of positions k,l */ for (k = 0; k < POSNUM; k++) for (l = 0; l < POSNUM; l++) { tal = totalp[k][l]; mass = 0; for (i = 0; i < AANUM; i++) for (j = 0; j < AANUM; j++) if (ProlineFreeWin && (i == Proline || j == Proline)) probp[i][j][k][l] = LOG0VAL; /* 0 out pairs with P in either*/ /* position */ else { if (over_undersample == NULL) sample_error =1; else sample_error = over_undersample[i][j][k][l]; dist = (l-k) %POSNUM; predicted_freq[i][j][k][l] = (double)totalp[k][l]* (1-weightp[i][j][k][l])*exp(gprobp[i][j][dist]) + weightp[i][j][k][l]* (double)freqp[i][j][k][l]/sample_error; account_for_pair_under_sampling(&predicted_freq[i][j][k][l]); /********** fprintf(stderr," 0 freq changed to %lf for %c,%c,%c,%c", predicted_freq[i][j][k][l], 'a'+k,'a'+l,numaa(i), numaa(j)); **********/ if (predicted_freq[i][j][k][l]>0) mass += predicted_freq[i][j][k][l]; } /* Normalize all probp by the mass = sum of probp, and take log */ for (i = 0; i < AANUM; i++) for (j = 0; j < AANUM; j++) if ( (i != Proline && j != Proline) || !ProlineFreeWin) if (predicted_freq[i][j][k][l]>0) probp[i][j][k][l] = getdlog(predicted_freq[i][j][k][l] / mass); else probp[i][j][k][l] = LOG0VAL; } }