10 date 2000.02.03.20.43.36; author alex_c; state Exp;
24 @/* Bonnie Berger, David Wilson and Theodore Tonchev 1992 */
25 /* Modified by Bonnie Berger and Ethan Wolf 1995. */
26 /* Commented by Ethan Wolf 1995. */
29 /*************************************************************************
30 * scscore.c contains functions to compute the probabilities for genbnk *
31 * and the positives. The probabilities are computed for genbnk *
32 * directly from the frequencies. For the positives, the 0 frequency *
33 * residues (and residue pairs) other than Prolines are given a non-zero*
34 * probability that is no more than 1/5 of the minimum probability of *
35 * a term with non-zero frequency. This accounts for sampling errors. *
37 * In addition, the PairCoil scoring algorithm is implemented here to *
38 * score subsequences obtained by the function subsequence_advance() *
39 * To obtain this score, residue "propensities" are first calculated *
40 * for each residue in every possible offset. Then each window of *
41 * length PAIRWINDOW is given a score which is the maximum of the sum *
42 * of the residue propensities in that window over all possible window *
43 * offsets. (Note that if a PAIRWINDOW does not fit in the subsequence,*
44 * then the max size window is used, as long as it is at least size *
45 * min(WINDOW,MINWINDOW). *
46 * The subsequence is then given the max score over all the windows *
47 * containe d in it and it is given the offset of that window. *
49 * Additionally, code for doing STOCK scoring is contained here, based *
50 * only on single residue probabilities and windows of size STOCKWINDOW.*
51 *************************************************************************/
64 /* #define Two_to_Three_Ratio 15 */ /* Make it an extern double instead. */
65 /* This guesses for Pr[x in C_2]/Pr[x in C_3] */
68 /** WINDOW is the size of the WINDOW that is currently being scored. *
69 * It is normally size PAIRWINDOW, but changing its value allows *
70 * the computation of scores on shorter sequences (like size 28). */
71 /*** int WINDOW = PAIRWINDOW; ***/
72 extern int WINDOW; /* Now WINDOW is defined in interface.c so can read */
73 /* it in in get_defaults(). **/
75 extern double (*pprobs)[POSNUM];
76 extern double (*pprobp)[NUM_RES_TYPE][POSNUM][POSNUM];
77 extern double gprobs[NUM_RES_TYPE], gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM];
82 /****************************************************************************/
83 /*** For now just average, but should really do weighted avg, or something more complex. ***/
84 double combine_dist_scores(double window_scores[POSNUM][MAXSEQLEN], int i,
85 int functnumber, char lib[MAXFUNCTNUM], int end,
91 for (f=0; f<functnumber; f++)
92 if (!end_effects || ((i+lib[f]+1) < end)) {
93 score +=window_scores[lib[f]][i];
98 return(score/number_dist);
100 return(window_scores[lib[0]][i]);
103 /***********************************************************************/
107 /* These are relative frequences, and logs need to be taken of them. */
108 /* They are used when computing stocks scores when STOCKSTOCK is defined */
109 /* in scconst.h. These frequencies correspond to the frequencies used */
110 /* by stock in the program that they distribute. */
111 static double relative[20][7] ={
112 {3.167, 0.297, 0.398, 3.902, 0.585, 0.501, 0.483},
113 {2.597, 0.098, 0.345, 0.894, 0.514, 0.471, 0.431},
114 {1.665, 0.403, 0.386, 0.949, 0.211, 0.342, 0.360},
115 {2.240, 0.307, 0.480, 1.409, 0.541, 0.772, 0.663},
116 {0.531, 0.076, 0.403, 0.662, 0.189, 0.106, 0.013},
117 {1.417, 0.090, 0.122, 1.659, 0.190, 0.130, 0.155},
118 {0.045, 0.275, 0.578, 0.216, 0.211, 0.426, 0.156},
119 {1.297, 1.551, 1.084, 2.612, 0.377, 1.248, 0.877},
120 {1.375, 2.639, 1.763, 0.191, 1.815, 1.961, 2.795},
121 {0.659, 1.163, 1.210, 0.031, 1.358, 1.937, 1.798},
122 {0.347, 0.275, 0.679, 0.395, 0.294, 0.579, 0.213},
123 {0.262, 3.496, 3.108, 0.998, 5.685, 2.494, 3.048},
124 {0.030, 2.352, 2.268, 0.237, 0.663, 1.620, 1.448},
125 {0.179, 2.114, 1.778, 0.631, 2.550, 1.578, 2.526},
126 {0.835, 1.475, 1.534, 0.039, 1.722, 2.456, 2.280},
127 {0.382, 0.583, 1.052, 0.419, 0.525, 0.916, 0.628},
128 {0.169, 0.702, 0.955, 0.654, 0.791, 0.843, 0.647},
129 {0.824, 0.022, 0.308, 0.152, 0.180, 0.156, 0.044},
130 {0.240, 0.000, 0.000, 0.456, 0.019, 0.000, 0.000},
131 {0.000, 0.008, 0.000, 0.013, 0.000, 0.000, 0.000}};
133 /* Takes logs of relative frequencies for singles */
134 initialize_relative ()
137 for (aa=0; aa<AANUM; ++aa)
138 for (off=0; off<POSNUM; ++off)
139 relative[aa][off] = (relative[aa][off]) ? log(relative[aa][off]) : LOG0VAL;
142 /****************************************************************************/
144 /* Advances past prolines and higher numbered residues in a subsequence, */
145 /* in order to return the biggest subsequence from the start position that */
146 /* contains no bad residues (>=AANUM-1). Here P is assumed to have number */
147 /* AANUM-1 and the higher numbered residues correspond to X,Z,B. */
148 /* st is the position of the last bad residue read (i.e. the previous value */
149 /* for en). The subsequence returned to be scored goes from st to en-1. */
150 /* If the end of the sequence is reached en=seqlen (seqlen-1 is the last */
151 /* residue in the sequence). */
152 /* If ProlineFreeWindows are not used, then prolines will NOT terminate */
153 /* the subsequence, so regions which include prolines will be scored. */
154 int subsequence_advance (int *st, int *en, char seq[], int seqlen,
159 if (ProlineFreeWin) {
160 /* Assumes that Proline == AANUM-1, i.e. last residue in list */
161 /* Read past "bad" residues P (X,Z,B are now okay). */
162 /* Used to be seq[start] >=AANUM-1 when X,Z,B weren't okay. */
163 for (start = *en; (start < seqlen) && (seq[start] == Proline); start++);
165 /* Read in next contiguous subsequence of "good" residues to score on. */
166 /* Used to be seq[end] < AANUM -1 */
167 for (end = start; (end < seqlen) && (seq[end] != Proline) &&
168 (seq[end]<Undefined); end++);
171 /* Used to Read past "bad" residues X,Z,B by seq[start]>=AANUM. */
172 for (start = *en; (start < seqlen) && (seq[start] >= Undefined); start++);
176 /* Read in next subsequence of "good" residues (including P). */
177 /* Used to be seq[end] <AANUM */
178 for (end = start; (end < seqlen) && (seq[end] < Undefined); end++);
181 if (start >= seqlen) return 0;
182 /* [st,en-1] now gives a subsequence to be scored. */
188 /****************************************************************************/
190 /* Takes a position and an offset value for position 0 and returns the offset
191 value for that position. */
193 int getoffs(int pos, int offs)
195 return ((pos+offs)%POSNUM);
199 /** The right propensity is defined as a function of the normalized prob. **/
200 /** terms (pprobp[R1][R2][p2-offs][p2]-gprobp[R1][R2][off]) **/
201 /** minus by (pprobs[R1][p2-offs]-gprobs[R1]). **/
202 /** i.e. the log of (normalized pair prob.)/(normalized single prob.). **/
203 double rpropensity_term(int x, int y, int xoffs, int yoffs,
204 char dist, char seq[])
208 /* First the pair probability normalized by genbnk. */
209 lnprob = pprobp[seq[x]][seq[y]][xoffs][yoffs]
210 - gprobp[seq[x]][seq[y]][dist];
212 /* Then the singles probability normalized by genbnk are included.*/
213 lnprob -= pprobs[seq[x]][xoffs] - gprobs[seq[x]];
220 /*********************************************************************/
221 /** This function is used to calculate the differentiator based on **/
222 /** paircoil algorithm, and is called in get_respropl from */
223 /** get_winscores from scseqadj. **/
225 double lprop_diff_term(int x, int y, int xoffs, int yoffs,
226 char dist, char seq[])
229 extern double many_pprobs[MAX_TABLE_NUMBER][NUM_RES_TYPE][POSNUM],
230 many_pprobp[MAX_TABLE_NUMBER][NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM];
233 /* First the difference in the logs of the probabilities. */
234 lnprob = many_pprobp[0][seq[x]][seq[y]][xoffs][yoffs] -
235 many_pprobp[1][seq[x]][seq[y]][xoffs][yoffs];
236 /* Then the singles probabilities are included.*/
237 lnprob -= many_pprobs[0][seq[y]][yoffs] - many_pprobs[1][seq[y]][yoffs];
242 /***************************************************************************/
245 /** The left propensity terms are similar, but divided by rightmost singles**/
246 /** giving (pprobp[R1][R2][p1][p1+offs]-gprobp[R1][R2][off]) **/
247 /** minus by (pprobs[R2][p1+offs]-gprobs[R2]). **/
248 double lpropensity_term(int x, int y, int xoffs, int yoffs,
249 char dist, char seq[])
253 /*** Note: The weightp and weights stuff is statistical weighting **/
254 /*** based on whether the differences between pos file prob and genbnk **/
255 /*** residues are really significant enough to be useful in the score. **/
256 /*** sigma is defined in stats.c in order to make the weights sharper. **/
259 /* First the pair probability normalized by genbnk. */
260 lnprob = pprobp[seq[x]][seq[y]][xoffs][yoffs]
261 - gprobp[seq[x]][seq[y]][dist];
263 /* Then the singles probability normalized by genbnk are included.*/
264 lnprob -= pprobs[seq[y]][yoffs] - gprobs[seq[y]];
270 /****************************************************************************/
271 /**For the following, SCOREACCUMULATE is defined in options.h to take the */
272 /**min or to take the average of functnum terms. */
273 /****************************************************************************/
275 /****************************************************************************/
276 /** These "r" functions score the sequence from right to left and are */
277 /** comparable to the corresponding "l" functions defined later. */
278 /****************************************************************************/
281 /* Function to compute right residue propensity at the start of subsequence, */
282 /* when there is no residue at distance lib[f]. Uses the pairs that are */
283 /* available and uses singles probabilities if there are no pairs. */
284 /* maxfval is the maximum of the distances lib[f], so any residue within */
285 /* maxfval from the start of the subsequence must have its propensity */
286 /* computed by this function. */
287 getr_sc_en (double respropr[], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int maxfval)
290 int f, i, cnt; /* cnt counts the number of lib[f] distances that stay */
291 /* within the subsequence, and so can be included in */
292 /* the propensity calculation. */
293 int x, y, xoffs, yoffs;
295 for (y = st+maxfval; y >= st; y--) {
296 yoffs = getoffs(y,offs);
297 for (f = 0, cnt=0; f < functnum; f++)
298 if ((x = y-lib[f]-1) >= st) { /* If the pair residue x is in the */
299 cnt++; /* subseq, then can include that pair. */
300 xoffs = getoffs(x,offs);
302 lnprob= rpropensity_term(x, y, xoffs, yoffs, lib[f], seq);
305 respropr[y] += lnprob; /** Will need to divide by cnt. **/
308 if (respropr[y] > lnprob) respropr[y] = lnprob;
312 if (!(cnt)) /* no pairs so do singles */
313 respropr[y] = pprobs[seq[y]][yoffs] - gprobs[seq[y]];
316 respropr[y] /= cnt; /* divide by number of pairs done to average them */
324 /* Computes right residue propensities for each position between st and en, */
325 /* going right to left (respropr), and computing the geometric mean of the */
326 /* paritial propensity terms at distance d of the position when AvgDist */
327 /* defined, and the min of these terms when MinDist is defined. */
329 get_respropr (double respropr[], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM])
332 int f, i, maxfval = 0;
333 int x, y, xoffs, yoffs;
335 for (f = 0; f < functnum; f++)
336 if (maxfval < lib[f]) maxfval = lib[f];
338 for (i = st; i < en; i++)
339 respropr[i] = SCOREIDENTITY; /* 0 for avg; HUGE_VAL for min */
341 /* Store residue propensity of each position in respropr */
342 for (f = 0; f < functnum; f++)
343 for (y = en-1, x = y-lib[f]-1; y >= st+maxfval+1; x--, y--) {
344 /* lib dist's off by 1 */
345 xoffs = getoffs(x,offs);
346 yoffs = getoffs(y,offs);
348 lnprob= rpropensity_term(x, y, xoffs, yoffs, lib[f], seq);
350 /* Incorporate new lnprob by taking function over it and old one */
351 SCOREACCUMULATE(respropr[y],lnprob); /* This averages functnum terms */
352 /* or takes their min depending */
353 /* on if AvgDis or MinDist def'd*/
355 /* If can't do all pairs, do as many pairs as possible, or singles */
356 getr_sc_en(respropr,seq,st,en,offs,lib,maxfval);
360 /****************************************************************************/
361 /****************************************************************************/
362 /****************************************************************************/
363 /*****Left to right versions of propensities for scoring. Currently **/
364 /**** scoring is left to right. **/
365 /****************************************************************************/
367 /* Computes the left residue propensities for the positions towards the end of
368 the subsequence where it is not possible to compute all the pair scores.
369 For each such position, all the pairs that can be done are done, and if
370 no pairs can be done, the singles are done. st and en-1 are the first
371 and last postion in the subsequence considered, offs is the frame, lib
372 is the distance-1 values, and maxfval is the max of the distances lib[f].
373 Any residue within maxfval of the end must have its propensity computed by
376 get_sc_en (double respropl[], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int maxfval, int differentiator)
379 int f, i, cnt; /* cnt counts the number of pair combos possible at a pos.*/
380 int x, y, xoffs, yoffs;
383 for (x = en-maxfval-1; x < en; x++) {
384 xoffs = getoffs(x,offs);
385 for (f = 0, cnt=0; f < functnum; f++)
386 if ((y = x+lib[f]+1) < en) { /* If the pair residue y is in the */
387 cnt++; /* sequence, then can do that pair */
388 yoffs = getoffs(y,offs);
391 lnprob = lprop_diff_term(x,y,xoffs,yoffs,lib[f],seq);
392 else /* The normal paircoil score term. */
393 lnprob= lpropensity_term(x, y, xoffs, yoffs, lib[f], seq);
396 respropl[x] += lnprob; /* Divide by cnt after go all distances.*/
399 if (respropl[x] > lnprob) respropl[x] = lnprob;
405 if (!(cnt)) /* no pairs so do singles */
406 if (differentiator) {
407 extern double many_pprobs[MAX_TABLE_NUMBER][NUM_RES_TYPE][POSNUM],
408 many_pprobp[MAX_TABLE_NUMBER][NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM];
410 respropl[x] = many_pprobs[0][seq[x]][xoffs] - many_pprobs[1][seq[x]][xoffs];
413 respropl[x] = pprobs[seq[x]][xoffs] - gprobs[seq[x]];
419 respropl[x] /= cnt; /* Average by number of do-able pairs. */
426 /* Computes left residue propensities for each position between st and en,
427 * going left to right. */
428 get_bonnie_respropl (double respropl[], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int differentiator)
431 int f, i, maxfval = 0;
432 int x, y, xoffs, yoffs;
434 for (f = 0; f < functnum; f++)
435 if (maxfval < lib[f]) maxfval = lib[f];
438 for (i = st; i < en; i++)
439 respropl[i] = SCOREIDENTITY; /* 0 for avg; HUGE_VAL for min */
441 /* Store residue propensity of position x in respropl[x] */
442 for (f = 0; f < functnum; f++) {
443 for (x = st, y = x+lib[f]+1; x < en-maxfval-1; x++, y++) {
444 /* lib dist's off by 1 */
445 xoffs = getoffs(x,offs);
446 yoffs = getoffs(y,offs);
449 lnprob = lprop_diff_term(x,y,xoffs,yoffs,lib[f],seq);
451 else /* The normal paircoil score term. */
452 lnprob= lpropensity_term(x, y, xoffs, yoffs, lib[f], seq);
456 /* Incorporate new lnprob by averaging or taking min of it and the */
457 /* terms corresponding to other distances, depending on options.h. */
458 SCOREACCUMULATE(respropl[x],lnprob);
461 /* If can't do all pairs because of end of subsequence, do all the */
462 /* distances that stay within the subsequence for x. */
463 get_sc_en(respropl,seq,st,en,offs,lib,maxfval, differentiator);
468 get_respropl (double respropl[POSNUM][MAXSEQLEN], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int differentiator)
471 int f, i, maxfval = 0;
472 int x, y, xoffs, yoffs;
474 for (f = 0; f < functnum; f++) {
475 if (maxfval < lib[f]) maxfval = lib[f];
478 /**** Don't need to initialize since not accumulateing over distances***
479 for (i = st; i < en; i++)
480 respropl[lib[f]][i] = SCOREIDENTITY;
482 /* 0 for avg; HUGE_VAL for min */
487 /* Store residue propensity of position x in respropl[x] */
488 for (f = 0; f < functnum; f++) {
489 for (x = st, y = x+lib[f]+1; x < en-lib[f]-1; x++, y++) {
490 /* lib dist's off by 1 */
491 xoffs = getoffs(x,offs);
492 yoffs = getoffs(y,offs);
495 lnprob = lprop_diff_term(x,y,xoffs,yoffs,lib[f],seq);
497 else /* The normal paircoil score term. */
498 lnprob= lpropensity_term(x, y, xoffs, yoffs, lib[f], seq);
502 /* Incorporate new lnprob by averaging or taking min of it and the */
503 /* terms corresponding to other distances, depending on options.h. */
504 respropl[lib[f]][x] = lnprob;
507 while (x<en) { /* Deal with residues too close to end. */
508 /* Note that since we have a propensity */
509 /* for each distance, when we get to end*/
510 /* we don't want to use singles prob. since */
511 /* they were already used as the y value above */
512 /* in lpropensity_term. Setting this to 0 */
513 /* is like assuming it is random from genbnk. */
514 respropl[lib[f]][x] = 0;
521 /* If can't do all pairs because of end of subsequence, do all the */
522 /* distances that stay within the subsequence for x. */
523 /***** If just computing for each dist, then end is dealt with above. ***/
525 get_sc_en(respropl,seq,st,en,offs,lib,maxfval, differentiator);
531 /*****************Ends Propensity computations.***************************/
532 /*************************************************************************/
533 /* Computes all window scores between st and en for a given offset
534 * and returns them in winscores.
535 * A winscore is the sum of the residue propensities of residues in that
536 * window, given that offset.
537 * A left residue propensity is the relative pair frequency minus the
538 * relative single frequency of the rightmost residue.
539 * lib contains the values dist-1.
540 * SCOREIDENTITY returns 0 if taking geometric mean and HUGEVAL if min.
541 * SCOREACCUMULATE(a,b) either averages the logs
542 * (i.e. takes the geometric mean) of the contribution of b into a
543 * or it stores min of a and b in a, depending on if AvgDist or MinDist
544 * is definied in options.h. */
546 /** Note that windows of size PAIRWINDOW are used if possible. If a
547 * subsequence is shorter than PAIRWINDOW, but as long as MINWINDOW,
548 * then it is scored using a window the size of the subsequence
549 * (for more details see the function scseqadj). */
551 get_bonnie_winscores (double winscores[], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int differentiator) /* Differentiator tells it to do the */
552 /* trimer-dimer diff method. */
554 extern double init_class_prob[MAX_CLASS_NUMBER];
556 double scorel/*,scorer*/;
558 /* residue propensities going left to right */
559 double respropl[MAXSEQLEN]/*,respropr[MAXSEQLEN]*/;
562 /* Get residue propensitites going left to right (respropl) */
563 get_bonnie_respropl(respropl,seq,st,en,offs,lib, differentiator);
564 /* get_respropr(respropr,seq,st,en,offs,lib); */
567 /* Compute window scores in linear time */
569 /* Init score to be sum of first WINDOW-1 */
570 for (scorel=/*scorer=*/0, i = st; i < st + WINDOW-1; i++) {
571 scorel += respropl[i];
572 /* scorer += respropr[i]; */
576 /* Fill in winscores */
577 for (i = st, j = st + WINDOW-1; j < en; ++i, ++j) {
578 /* Add the new right side */
579 scorel += respropl[j];
580 /* scorer += respropr[j]; */
583 /* store window score at position i. A few options are given. */
584 /* Currently scoring is left to right, but functions of the scores */
585 /* in both directions can be taken. */
587 if (differentiator) { /** The probability estimate should actually be **/
588 /** the formula in the #else..... **/
590 winscores[i] = scorel; /** Just compute one term in the formula. **/
593 winscores[i] = -getdlog(init_class_prob[0]/init_class_prob[1]
598 winscores[i] = scorel;
602 /**************************/
603 /* winscores[i] = scorer; */
604 /* winscores[i] = (scorel+scorer)/2; */
605 /* winscores[i] = MIN(scorel,scorer);
606 /***************************/
608 /* Take away the left side */
609 scorel -= respropl[i];
610 /* scorer -= respropr[i]; */
614 /************THIS CORRECTS THE BUG ****/
615 for (i= en - (WINDOW-1); i<en; i++) {
616 if (differentiator) {
618 winscores[i] = scorel; /** Just compute one term in the formula. **/
620 winscores[i] = -getdlog(init_class_prob[0]/init_class_prob[1]
625 winscores[i] = scorel;
627 scorel -= respropl[i];
635 /* Computes all window scores between st and en for a given offset
636 * and returns them in winscores.
637 * A winscore is the sum of the residue propensities of residues in that
638 * window, given that offset.
639 * A left residue propensity is the relative pair frequency minus the
640 * relative single frequency of the rightmost residue.
641 * lib contains the values dist-1.
642 * Some things defined in options.h. */
644 /** Note that windows of size PAIRWINDOW are used if possible. If a
645 * subsequence is shorter than PAIRWINDOW, but as long as MINWINDOW,
646 * then it is scored using a window the size of the subsequence
647 * (for more details see the function scseqadj). */
649 get_winscores (double winscores[POSNUM][MAXSEQLEN], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int differentiator) /* Differentiator tells it to do the */
650 /* trimer-dimer diff method. */
652 extern double init_class_prob[MAX_CLASS_NUMBER];
655 double scorel/*,scorer*/;
657 /* residue propensities going left to right */
658 double respropl[POSNUM][MAXSEQLEN]/*,respropr[MAXSEQLEN]*/;
661 /* Get residue propensitites going left to right (respropl) */
662 get_respropl(respropl,seq,st,en,offs,lib, differentiator);
663 /* get_respropr(respropr,seq,st,en,offs,lib); */
666 /* Compute window scores in linear time */
667 for (f = 0; f < functnum; f++) {
669 /* Init score to be sum of first WINDOW-1 */
670 for (scorel=/*scorer=*/0, i = st; i < st + WINDOW-1; i++) {
671 scorel += respropl[lib[f]][i];
672 /* scorer += respropr[i]; */
676 /* Fill in winscores */
677 for (i = st, j = st + WINDOW-1; j < en; ++i, ++j) {
678 /* Add the new right side */
679 scorel += respropl[lib[f]][j];
680 /* scorer += respropr[j]; */
683 /* store window score at position i. A few options are given. */
684 /* Currently scoring is left to right, but functions of the scores */
685 /* in both directions can be taken.
687 if (differentiator) {
689 winscores[lib[f]][i] = scorel; /* Just compute one term in formula. */
691 winscores[lib[f]][i] = -getdlog(init_class_prob[0]/init_class_prob[1]
696 winscores[lib[f]][i] = scorel;
698 /**************************/
699 /* winscores[i] = scorer; */
700 /* winscores[i] = (scorel+scorer)/2; */
701 /* winscores[i] = MIN(scorel,scorer);
702 /***************************/
704 /* Take away the left side */
705 scorel -= respropl[lib[f]][i];
706 /* scorer -= respropr[i]; */
710 /************THIS CORRECTS THE BUG ****/
711 for (i= en - (WINDOW-1); i<en; i++) {
712 if (differentiator) {
714 winscores[lib[f]][i] = scorel; /* Just compute one term in formula.*/
716 winscores[lib[f]][i] = -getdlog(init_class_prob[0]/init_class_prob[1]
721 winscores[lib[f]][i] = scorel;
723 scorel -= respropl[lib[f]][i];
730 /*************************************************************************/
731 /************** The following begins the scoring for PairCoil ****/
732 /*************************************************************************/
735 /* Returns 0 if sequence too short, otherwise returns 1.
737 ** When this is called in calcseq of sc2seq.c
738 * The max window scores for residue i in offset off are stored in
739 * scs[i][off] and the max offset score is in sc2[i].
741 * max_over_windows() is a dynamic program to find the max window score
742 * containing a residue, and is in the file dyn.c.
744 * avg_over_windows() computes the average window score by using prefix
745 * computations to compute the log of the avg of exp(window_scores),
746 * and is also in dyn.c.
748 * differentiator =1 says use the trimer-dimer diff method.
750 ** NOTE that WINDOW is set to PAIRWINDOW in calcseq in sc2seq.c.
751 ** PAIRWINDOW and STOCKWINDOW and MINWINDOW are defined in scconst.h. */
754 int scseqadj(char lib[],
755 char seq[], int seqlen,
756 double scores[MAXSEQLEN][POSNUM],
757 char offsets[MAXSEQLEN],
758 double *maxscore, int mode, int differentiator)
760 { int OLDWINDOW; /* temp var for const default window length */
762 /* double winscores[POSNUM][MAXSEQLEN], scorebuff[POSNUM][MAXSEQLEN]; */
763 double **winscores,**scorebuff;
764 double bonnie_winscores[MAXSEQLEN], bonnie_scorebuff[MAXSEQLEN];
765 double minwinscores[MAXSEQLEN];
768 int ProlineFreeWin = mode & PROLINE_FREE_MODE;
770 winscores=(double**)calloc(POSNUM,sizeof(double*));
772 printf("Failed to allocate winscores\n");exit(1);
774 scorebuff=(double**)calloc(POSNUM,sizeof(double*));
776 printf("Failed to allocate scorebuff\n");exit(1);
779 for (i=0;i<POSNUM;i++){
780 winscores[i]=(double*)calloc(MAXSEQLEN,sizeof(double));
781 if(winscores[i]==NULL){
782 printf("Failed to allocate winscores[%i]",i);exit(1);
784 scorebuff[i]=(double*)calloc(MAXSEQLEN,sizeof(double));
785 if(scorebuff[i]==NULL){
786 printf("Failed to allocate scorebuff[%i]",i);exit(1);
790 if (seqlen <MIN(WINDOW,MINWINDOW)) { /* 0 out score for too short seq. */
791 for (i=0; i<seqlen; i++) {
793 for(j=0; j<POSNUM; j++)
794 scores[i][j]=-HUGE_VAL;
796 *maxscore = -HUGE_VAL;
800 for (j = 0; j < POSNUM; j++)
801 for (i = 0; i < seqlen; scores[i++][j] = -HUGE_VAL);
802 for (i = 0; i < seqlen; offsets[i++] = 0);
806 /* Get next subseq (not containing P's if ProlineFreeWindows defined in */
807 /* options.h. If a window of size WINDOW=PAIRWINDOW fits in the subseq */
808 /* then that window is scored. Otherwise, if a window of size MINWINDOW */
809 /* fits in the subseq, then that window is scored. Otherwise, the subseq */
812 for (en=0; subsequence_advance(&st,&en,seq,seqlen,ProlineFreeWin);) {
813 if (en - st <MIN(WINDOW,MINWINDOW)) continue;
814 /* Don't score short regions. */
818 if (en - st <WINDOW) /* Change window size for subseq length between */
819 WINDOW = en - st; /* MINWINDOW and PAIRWINDOW. */
821 for (offs = 0; offs < POSNUM; offs++) {
822 #ifdef BONNIE_END_EFFECTS
823 get_bonnie_winscores(bonnie_winscores,seq,st,en,offs,lib,
826 get_winscores(winscores,seq,st,en,offs,lib, differentiator);
829 /* Compute residue scores */
830 if ((en-st)>=WINDOW) {
832 if (!differentiator) {
833 #ifdef BONNIE_END_EFFECTS
834 max_over_windows(&(bonnie_scorebuff[st]),&(bonnie_winscores[st]),
836 /** The score of a residue is its max window */
837 /** score over windows containing it for PAIRCOIL */
838 for (i=st; i<en; ++i)
839 scores[i][offs] = bonnie_scorebuff[i];
844 if (mode & MAX_WINDOW_BEFORE_COMBINE_MODE) {
845 for (f=0; f<functnum; f++)
846 max_over_windows(&(scorebuff[lib[f]][st]),&
847 (winscores[lib[f]][st]),en-st,WINDOW);
848 /** The score of a residue is its max window */
849 /** score over windows containing it for PAIRCOIL */
850 for (i=st; i<en; ++i)
851 scores[i][offs] = combine_dist_scores(scorebuff,i,
854 else { /* Do score like done in PNAS paper. Note that end */
855 /* effects need to be dealt with in combine_dist_scores. */
856 for (i=st; i<en; i++) /* Put the combined score in winscores[0]*/
857 winscores[0][i] = combine_dist_scores(winscores,i,
859 /* Put the maxscore in scorebuff[0] */
860 max_over_windows(&(scorebuff[0][st]),&(winscores[0][st]),
863 for (i=st; i<en; ++i)
864 scores[i][offs] = scorebuff[0][i];
871 else { /* Don't do max over window for differentiator.... */
873 #ifdef BONNIE_END_EFFECTS
874 if (differentiator == 2) { /* get max score **/
875 max_over_windows(&(bonnie_scorebuff[st]),&(bonnie_winscores[st]),
877 for (i=st; i<en; ++i)
878 scores[i][offs] = bonnie_scorebuff[i];
880 else if (differentiator ==3) { /* get min score **/
881 min_over_windows(&(bonnie_scorebuff[st]),&(bonnie_winscores[st]),
883 for (i=st; i<en; ++i)
884 scores[i][offs] = bonnie_scorebuff[i];
886 else if (differentiator ==1) /* Win score for win starting at i */
887 for (i=st; i<en; i++)
888 scores[i][offs] = bonnie_winscores[i];
890 for (i=st; i<en; i++) /* Put the combined score in winscores[0]*/
891 winscores[0][i] = combine_dist_scores(winscores,i,
894 if (differentiator == 2) { /* get max score **/
895 max_over_windows(&(scorebuff[0][st]),&(winscores[0][st]),
897 for (i=st; i<en; ++i)
898 scores[i][offs] = scorebuff[0][i];
900 else if (differentiator ==3){ /* get min score **/
901 min_over_windows(&(scorebuff[0][st]),&(winscores[0][st]),
903 for (i=st; i<en; ++i)
904 scores[i][offs] = scorebuff[0][i];
906 else if (differentiator==1) /* Win score for win starting at i */
907 for (i=st; i<en; i++)
909 combine_dist_scores(winscores,i,functnum,lib,en,0);
917 /* Store the frame which maximizes the score of residue i in offset[i] */
918 for (i=st; i<en; ++i) {
919 double localmax = -HUGE_VAL;
920 for (offs=0; offs<POSNUM; ++offs)
921 if (localmax < scores[i][offs]) {
923 localmax = scores[i][offs];
926 gmax = localmax; /* maintain max score in subseq */
928 WINDOW = OLDWINDOW; /* restore default value (PAIRWINDOW) to WINDOW */
934 *maxscore = exp(gmax/WINDOW);
936 for (i = 0; i < seqlen; i++)
937 for (offs=0; offs<POSNUM; ++offs)
938 scores[i][offs] = exp(scores[i][offs]/WINDOW);
949 /*************************************************************************/
950 /***** Ends the PairCoil scoring section *****/
951 /*************************************************************************/
956 /*************************************************************************/
957 /*************************************************************************/
958 /*** The following compute the NewCoil stock score. *****/
959 /*************************************************************************/
962 /* Fill in window scores - used by scorestock()*/
963 /**** WINDOW is set to STOCKWINDOW in calcseq in sc2seq.c ***/
964 singles_windows (double winscores[], char seq[], int st, int en, int offs,
970 /* Init score to be sum of first WINDOW-1 */
971 for (score=0, i = st; i < st + WINDOW -1; i++) {
972 xoffs = (i + offs) % POSNUM;
975 score += relative[seq[i]][xoffs]; /** Defined in scscore.c to compute */
976 /* singles prob from our their table */
978 score += pprobs[seq[i]][xoffs] -gprobs[seq[i]];
981 /* Fill in winscores */
983 for (i = st, j = st + WINDOW-1; j < en; ++i, ++j) {
984 /* Add the new right side */
985 xoffs = (j + offs) % POSNUM;
987 score += relative[seq[j]][xoffs]; /** Defined in scscore.c to compute */
988 /* singles prob from our their table */
990 score += pprobs[seq[j]][xoffs] -gprobs[seq[j]]; /* Computes from our */
992 winscores[i] = score;
993 /* Take away the left side */
994 xoffs = (i + offs) % POSNUM;
996 score -= relative[seq[i]][xoffs]; /** Defined in scscore.c to compute */
997 /* singles prob from our their table */
999 score -= pprobs[seq[i]][xoffs] -gprobs[seq[i]]; /* Computes from our */
1007 /* Called from calcseq in sc2seq.c. Scores using the Newcoils algorithm. */
1008 /*** WINDOW is set to STOCKWINDOW in calcseq ***/
1009 int scorestock (char seq[MAXSEQLEN], int seqlen,
1010 double scores[MAXSEQLEN][POSNUM],
1011 char offsets[MAXSEQLEN],
1012 double *maxscore, int ProlineFreeWin, int stockstock)
1015 double winscores[MAXSEQLEN], scorebuff[MAXSEQLEN];
1019 if (seqlen < WINDOW) return 0;
1021 /* Initialize scores to very negative */
1022 for (j = 0; j < POSNUM; j++)
1023 for (i = 0; i < seqlen; scores[i++][j] = -HUGE_VAL);
1024 for (i = 0; i < seqlen; offsets[i++] = 0);
1027 /* Get first subsequence (en=0), do loop, then get next subseq until get */
1028 /* to end of sequence. */
1029 for (en=0; subsequence_advance(&st,&en,seq,seqlen,ProlineFreeWin);) {
1030 if (en - st < WINDOW) continue; /* Don't score short sequences, just */
1031 /* en loop and get next subseq. */
1033 for (offs = 0; offs < POSNUM; offs++) {
1034 singles_windows(winscores,seq,st,en,offs,stockstock);
1035 /* Get the score from stocks or our table for each */
1036 /* window if offset is offs, where score is */
1037 /* attributed to first residue in the window. */
1039 max_over_windows(&(scorebuff[st]),&(winscores[st]),en-st,WINDOW);
1040 /** The score of a residue is its max score */
1041 /** over windows containing it in offset off. */
1042 /** Temporarily stor this in scorebuff[i]. */
1043 for (i=st; i<en; ++i)
1044 scores[i][offs] = scorebuff[i];
1046 for (i=st; i<en; ++i) { /* Find the best scoring offset in the coil*/
1047 double localmax = -HUGE_VAL;
1048 for (offs=0; offs<POSNUM; ++offs)
1049 if (scores[i][offs]>localmax) {
1051 localmax = scores[i][offs];
1053 if (localmax > gmax)
1060 *maxscore = exp(gmax/WINDOW);
1062 for (i = 0; i < seqlen; i++)
1063 for (offs=0; offs<POSNUM; ++offs)
1064 scores[i][offs] = exp(scores[i][offs]/WINDOW);
1069 /*************************************************************************/
1070 /***** Ends the Stock scoring section *****/
1071 /*************************************************************************/
1077 int score_differentiator_window(char lib[MAXFUNCTNUM],
1078 char *window_residues, int windowlen,
1079 double window_score[POSNUM],
1080 int mode, int differentiator)
1083 double respropl[MAXSEQLEN];
1085 for (off=0; off<POSNUM; off++) {
1086 window_score[off] = 0;
1087 get_bonnie_respropl(respropl,window_residues,0,windowlen,off,lib, differentiator);
1088 for (i=0;i<windowlen; i++) {
1089 window_score[off] += respropl[i];