--- /dev/null
+head 1.1;
+access;
+symbols;
+locks
+ alex_c:1.1; strict;
+comment @ * @;
+
+
+1.1
+date 2000.02.03.20.43.36; author alex_c; state Exp;
+branches;
+next ;
+
+
+desc
+@@
+
+
+1.1
+log
+@Initial revision
+@
+text
+@/* Bonnie Berger, David Wilson and Theodore Tonchev 1992 */
+/* Modified by Bonnie Berger and Ethan Wolf 1995. */
+/* Commented by Ethan Wolf 1995. */
+/* C Code File */
+
+/*************************************************************************
+ * scscore.c contains functions to compute the probabilities for genbnk *
+ * and the positives. The probabilities are computed for genbnk *
+ * directly from the frequencies. For the positives, the 0 frequency *
+ * residues (and residue pairs) other than Prolines are given a non-zero*
+ * probability that is no more than 1/5 of the minimum probability of *
+ * a term with non-zero frequency. This accounts for sampling errors. *
+ *
+ * In addition, the PairCoil scoring algorithm is implemented here to *
+ * score subsequences obtained by the function subsequence_advance() *
+ * To obtain this score, residue "propensities" are first calculated *
+ * for each residue in every possible offset. Then each window of *
+ * length PAIRWINDOW is given a score which is the maximum of the sum *
+ * of the residue propensities in that window over all possible window *
+ * offsets. (Note that if a PAIRWINDOW does not fit in the subsequence,*
+ * then the max size window is used, as long as it is at least size *
+ * min(WINDOW,MINWINDOW). *
+ * The subsequence is then given the max score over all the windows *
+ * containe d in it and it is given the offset of that window. *
+ *
+ * Additionally, code for doing STOCK scoring is contained here, based *
+ * only on single residue probabilities and windows of size STOCKWINDOW.*
+*************************************************************************/
+
+
+#include <stdio.h>
+#include <math.h>
+#include "sc.h"
+#include "scscore.h"
+#include "scconst.h"
+#include "options.h"
+#include "switches.h"
+#include "stats.h"
+/*#include "scio.h"*/
+
+/* #define Two_to_Three_Ratio 15 */ /* Make it an extern double instead. */
+ /* This guesses for Pr[x in C_2]/Pr[x in C_3] */
+
+
+/** WINDOW is the size of the WINDOW that is currently being scored. *
+ * It is normally size PAIRWINDOW, but changing its value allows *
+ * the computation of scores on shorter sequences (like size 28). */
+/*** int WINDOW = PAIRWINDOW; ***/
+extern int WINDOW; /* Now WINDOW is defined in interface.c so can read */
+ /* it in in get_defaults(). **/
+
+extern double (*pprobs)[POSNUM];
+extern double (*pprobp)[NUM_RES_TYPE][POSNUM][POSNUM];
+extern double gprobs[NUM_RES_TYPE], gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM];
+extern int functnum;
+
+
+
+/****************************************************************************/
+/*** For now just average, but should really do weighted avg, or something more complex. ***/
+double combine_dist_scores(double window_scores[POSNUM][MAXSEQLEN], int i,
+ int functnumber, char lib[MAXFUNCTNUM], int end,
+ int end_effects)
+{
+ int f, number_dist=0;
+ double score=0;
+
+ for (f=0; f<functnumber; f++)
+ if (!end_effects || ((i+lib[f]+1) < end)) {
+ score +=window_scores[lib[f]][i];
+ number_dist++;
+ }
+
+ if (number_dist>0)
+ return(score/number_dist);
+ else
+ return(window_scores[lib[0]][i]);
+}
+
+/***********************************************************************/
+
+
+
+/* These are relative frequences, and logs need to be taken of them. */
+/* They are used when computing stocks scores when STOCKSTOCK is defined */
+/* in scconst.h. These frequencies correspond to the frequencies used */
+/* by stock in the program that they distribute. */
+static double relative[20][7] ={
+ {3.167, 0.297, 0.398, 3.902, 0.585, 0.501, 0.483},
+ {2.597, 0.098, 0.345, 0.894, 0.514, 0.471, 0.431},
+ {1.665, 0.403, 0.386, 0.949, 0.211, 0.342, 0.360},
+ {2.240, 0.307, 0.480, 1.409, 0.541, 0.772, 0.663},
+ {0.531, 0.076, 0.403, 0.662, 0.189, 0.106, 0.013},
+ {1.417, 0.090, 0.122, 1.659, 0.190, 0.130, 0.155},
+ {0.045, 0.275, 0.578, 0.216, 0.211, 0.426, 0.156},
+ {1.297, 1.551, 1.084, 2.612, 0.377, 1.248, 0.877},
+ {1.375, 2.639, 1.763, 0.191, 1.815, 1.961, 2.795},
+ {0.659, 1.163, 1.210, 0.031, 1.358, 1.937, 1.798},
+ {0.347, 0.275, 0.679, 0.395, 0.294, 0.579, 0.213},
+ {0.262, 3.496, 3.108, 0.998, 5.685, 2.494, 3.048},
+ {0.030, 2.352, 2.268, 0.237, 0.663, 1.620, 1.448},
+ {0.179, 2.114, 1.778, 0.631, 2.550, 1.578, 2.526},
+ {0.835, 1.475, 1.534, 0.039, 1.722, 2.456, 2.280},
+ {0.382, 0.583, 1.052, 0.419, 0.525, 0.916, 0.628},
+ {0.169, 0.702, 0.955, 0.654, 0.791, 0.843, 0.647},
+ {0.824, 0.022, 0.308, 0.152, 0.180, 0.156, 0.044},
+ {0.240, 0.000, 0.000, 0.456, 0.019, 0.000, 0.000},
+ {0.000, 0.008, 0.000, 0.013, 0.000, 0.000, 0.000}};
+
+/* Takes logs of relative frequencies for singles */
+initialize_relative ()
+{
+ int aa, off;
+ for (aa=0; aa<AANUM; ++aa)
+ for (off=0; off<POSNUM; ++off)
+ relative[aa][off] = (relative[aa][off]) ? log(relative[aa][off]) : LOG0VAL;
+}
+
+/****************************************************************************/
+
+/* Advances past prolines and higher numbered residues in a subsequence, */
+/* in order to return the biggest subsequence from the start position that */
+/* contains no bad residues (>=AANUM-1). Here P is assumed to have number */
+/* AANUM-1 and the higher numbered residues correspond to X,Z,B. */
+/* st is the position of the last bad residue read (i.e. the previous value */
+/* for en). The subsequence returned to be scored goes from st to en-1. */
+/* If the end of the sequence is reached en=seqlen (seqlen-1 is the last */
+/* residue in the sequence). */
+/* If ProlineFreeWindows are not used, then prolines will NOT terminate */
+/* the subsequence, so regions which include prolines will be scored. */
+int subsequence_advance (int *st, int *en, char seq[], int seqlen,
+ int ProlineFreeWin)
+{
+ int start, end;
+
+ if (ProlineFreeWin) {
+ /* Assumes that Proline == AANUM-1, i.e. last residue in list */
+ /* Read past "bad" residues P (X,Z,B are now okay). */
+ /* Used to be seq[start] >=AANUM-1 when X,Z,B weren't okay. */
+ for (start = *en; (start < seqlen) && (seq[start] == Proline); start++);
+
+ /* Read in next contiguous subsequence of "good" residues to score on. */
+ /* Used to be seq[end] < AANUM -1 */
+ for (end = start; (end < seqlen) && (seq[end] != Proline) &&
+ (seq[end]<Undefined); end++);
+ }
+ else {
+ /* Used to Read past "bad" residues X,Z,B by seq[start]>=AANUM. */
+ for (start = *en; (start < seqlen) && (seq[start] >= Undefined); start++);
+
+
+
+ /* Read in next subsequence of "good" residues (including P). */
+ /* Used to be seq[end] <AANUM */
+ for (end = start; (end < seqlen) && (seq[end] < Undefined); end++);
+ }
+
+ if (start >= seqlen) return 0;
+ /* [st,en-1] now gives a subsequence to be scored. */
+ *st = start;
+ *en = end;
+ return 1;
+}
+
+/****************************************************************************/
+
+/* Takes a position and an offset value for position 0 and returns the offset
+value for that position. */
+
+int getoffs(int pos, int offs)
+{
+return ((pos+offs)%POSNUM);
+}
+
+
+/** The right propensity is defined as a function of the normalized prob. **/
+/** terms (pprobp[R1][R2][p2-offs][p2]-gprobp[R1][R2][off]) **/
+/** minus by (pprobs[R1][p2-offs]-gprobs[R1]). **/
+/** i.e. the log of (normalized pair prob.)/(normalized single prob.). **/
+double rpropensity_term(int x, int y, int xoffs, int yoffs,
+ char dist, char seq[])
+{
+ double lnprob=0;
+
+ /* First the pair probability normalized by genbnk. */
+ lnprob = pprobp[seq[x]][seq[y]][xoffs][yoffs]
+ - gprobp[seq[x]][seq[y]][dist];
+
+ /* Then the singles probability normalized by genbnk are included.*/
+ lnprob -= pprobs[seq[x]][xoffs] - gprobs[seq[x]];
+
+ return(lnprob);
+}
+
+
+
+/*********************************************************************/
+/** This function is used to calculate the differentiator based on **/
+/** paircoil algorithm, and is called in get_respropl from */
+/** get_winscores from scseqadj. **/
+
+double lprop_diff_term(int x, int y, int xoffs, int yoffs,
+ char dist, char seq[])
+{
+ double lnprob=0;
+ extern double many_pprobs[MAX_TABLE_NUMBER][NUM_RES_TYPE][POSNUM],
+ many_pprobp[MAX_TABLE_NUMBER][NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM];
+
+
+ /* First the difference in the logs of the probabilities. */
+ lnprob = many_pprobp[0][seq[x]][seq[y]][xoffs][yoffs] -
+ many_pprobp[1][seq[x]][seq[y]][xoffs][yoffs];
+ /* Then the singles probabilities are included.*/
+ lnprob -= many_pprobs[0][seq[y]][yoffs] - many_pprobs[1][seq[y]][yoffs];
+
+ return(lnprob);
+}
+
+/***************************************************************************/
+
+
+/** The left propensity terms are similar, but divided by rightmost singles**/
+/** giving (pprobp[R1][R2][p1][p1+offs]-gprobp[R1][R2][off]) **/
+/** minus by (pprobs[R2][p1+offs]-gprobs[R2]). **/
+double lpropensity_term(int x, int y, int xoffs, int yoffs,
+ char dist, char seq[])
+{
+ double lnprob=0;
+
+ /*** Note: The weightp and weights stuff is statistical weighting **/
+ /*** based on whether the differences between pos file prob and genbnk **/
+ /*** residues are really significant enough to be useful in the score. **/
+ /*** sigma is defined in stats.c in order to make the weights sharper. **/
+
+
+ /* First the pair probability normalized by genbnk. */
+ lnprob = pprobp[seq[x]][seq[y]][xoffs][yoffs]
+ - gprobp[seq[x]][seq[y]][dist];
+
+ /* Then the singles probability normalized by genbnk are included.*/
+ lnprob -= pprobs[seq[y]][yoffs] - gprobs[seq[y]];
+
+ return(lnprob);
+}
+
+
+/****************************************************************************/
+/**For the following, SCOREACCUMULATE is defined in options.h to take the */
+/**min or to take the average of functnum terms. */
+/****************************************************************************/
+
+/****************************************************************************/
+/** These "r" functions score the sequence from right to left and are */
+/** comparable to the corresponding "l" functions defined later. */
+/****************************************************************************/
+
+
+/* Function to compute right residue propensity at the start of subsequence, */
+/* when there is no residue at distance lib[f]. Uses the pairs that are */
+/* available and uses singles probabilities if there are no pairs. */
+/* maxfval is the maximum of the distances lib[f], so any residue within */
+/* maxfval from the start of the subsequence must have its propensity */
+/* computed by this function. */
+getr_sc_en (double respropr[], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int maxfval)
+{
+ double lnprob=0;
+ int f, i, cnt; /* cnt counts the number of lib[f] distances that stay */
+ /* within the subsequence, and so can be included in */
+ /* the propensity calculation. */
+ int x, y, xoffs, yoffs;
+
+ for (y = st+maxfval; y >= st; y--) {
+ yoffs = getoffs(y,offs);
+ for (f = 0, cnt=0; f < functnum; f++)
+ if ((x = y-lib[f]-1) >= st) { /* If the pair residue x is in the */
+ cnt++; /* subseq, then can include that pair. */
+ xoffs = getoffs(x,offs);
+
+ lnprob= rpropensity_term(x, y, xoffs, yoffs, lib[f], seq);
+
+ #ifdef AvgDist
+ respropr[y] += lnprob; /** Will need to divide by cnt. **/
+ #endif
+ #ifdef MinDist
+ if (respropr[y] > lnprob) respropr[y] = lnprob;
+ #endif
+ }
+
+ if (!(cnt)) /* no pairs so do singles */
+ respropr[y] = pprobs[seq[y]][yoffs] - gprobs[seq[y]];
+ else {
+ #ifdef AvgDist
+ respropr[y] /= cnt; /* divide by number of pairs done to average them */
+ #endif
+ }
+ }
+}
+
+
+
+/* Computes right residue propensities for each position between st and en, */
+/* going right to left (respropr), and computing the geometric mean of the */
+/* paritial propensity terms at distance d of the position when AvgDist */
+/* defined, and the min of these terms when MinDist is defined. */
+
+get_respropr (double respropr[], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM])
+{
+ double lnprob=0;
+ int f, i, maxfval = 0;
+ int x, y, xoffs, yoffs;
+
+ for (f = 0; f < functnum; f++)
+ if (maxfval < lib[f]) maxfval = lib[f];
+
+ for (i = st; i < en; i++)
+ respropr[i] = SCOREIDENTITY; /* 0 for avg; HUGE_VAL for min */
+
+ /* Store residue propensity of each position in respropr */
+ for (f = 0; f < functnum; f++)
+ for (y = en-1, x = y-lib[f]-1; y >= st+maxfval+1; x--, y--) {
+ /* lib dist's off by 1 */
+ xoffs = getoffs(x,offs);
+ yoffs = getoffs(y,offs);
+
+ lnprob= rpropensity_term(x, y, xoffs, yoffs, lib[f], seq);
+
+ /* Incorporate new lnprob by taking function over it and old one */
+ SCOREACCUMULATE(respropr[y],lnprob); /* This averages functnum terms */
+ /* or takes their min depending */
+ /* on if AvgDis or MinDist def'd*/
+ }
+ /* If can't do all pairs, do as many pairs as possible, or singles */
+ getr_sc_en(respropr,seq,st,en,offs,lib,maxfval);
+}
+
+
+/****************************************************************************/
+/****************************************************************************/
+/****************************************************************************/
+/*****Left to right versions of propensities for scoring. Currently **/
+/**** scoring is left to right. **/
+/****************************************************************************/
+
+/* Computes the left residue propensities for the positions towards the end of
+the subsequence where it is not possible to compute all the pair scores.
+For each such position, all the pairs that can be done are done, and if
+no pairs can be done, the singles are done. st and en-1 are the first
+and last postion in the subsequence considered, offs is the frame, lib
+is the distance-1 values, and maxfval is the max of the distances lib[f].
+Any residue within maxfval of the end must have its propensity computed by
+this function. */
+
+get_sc_en (double respropl[], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int maxfval, int differentiator)
+{
+ double lnprob=0;
+ int f, i, cnt; /* cnt counts the number of pair combos possible at a pos.*/
+ int x, y, xoffs, yoffs;
+
+
+ for (x = en-maxfval-1; x < en; x++) {
+ xoffs = getoffs(x,offs);
+ for (f = 0, cnt=0; f < functnum; f++)
+ if ((y = x+lib[f]+1) < en) { /* If the pair residue y is in the */
+ cnt++; /* sequence, then can do that pair */
+ yoffs = getoffs(y,offs);
+
+ if (differentiator)
+ lnprob = lprop_diff_term(x,y,xoffs,yoffs,lib[f],seq);
+ else /* The normal paircoil score term. */
+ lnprob= lpropensity_term(x, y, xoffs, yoffs, lib[f], seq);
+
+ #ifdef AvgDist
+ respropl[x] += lnprob; /* Divide by cnt after go all distances.*/
+ #endif
+ #ifdef MinDist
+ if (respropl[x] > lnprob) respropl[x] = lnprob;
+ #endif
+ }
+
+
+
+ if (!(cnt)) /* no pairs so do singles */
+ if (differentiator) {
+ extern double many_pprobs[MAX_TABLE_NUMBER][NUM_RES_TYPE][POSNUM],
+ many_pprobp[MAX_TABLE_NUMBER][NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM];
+
+ respropl[x] = many_pprobs[0][seq[x]][xoffs] - many_pprobs[1][seq[x]][xoffs];
+ }
+ else {
+ respropl[x] = pprobs[seq[x]][xoffs] - gprobs[seq[x]];
+
+ }
+
+ else
+ #ifdef AvgDist
+ respropl[x] /= cnt; /* Average by number of do-able pairs. */
+ #endif
+
+ }
+}
+
+
+/* Computes left residue propensities for each position between st and en,
+ * going left to right. */
+get_bonnie_respropl (double respropl[], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int differentiator)
+{
+ double lnprob=0;
+ int f, i, maxfval = 0;
+ int x, y, xoffs, yoffs;
+
+ for (f = 0; f < functnum; f++)
+ if (maxfval < lib[f]) maxfval = lib[f];
+
+
+ for (i = st; i < en; i++)
+ respropl[i] = SCOREIDENTITY; /* 0 for avg; HUGE_VAL for min */
+
+ /* Store residue propensity of position x in respropl[x] */
+ for (f = 0; f < functnum; f++) {
+ for (x = st, y = x+lib[f]+1; x < en-maxfval-1; x++, y++) {
+ /* lib dist's off by 1 */
+ xoffs = getoffs(x,offs);
+ yoffs = getoffs(y,offs);
+
+ if (differentiator)
+ lnprob = lprop_diff_term(x,y,xoffs,yoffs,lib[f],seq);
+
+ else /* The normal paircoil score term. */
+ lnprob= lpropensity_term(x, y, xoffs, yoffs, lib[f], seq);
+
+
+
+ /* Incorporate new lnprob by averaging or taking min of it and the */
+ /* terms corresponding to other distances, depending on options.h. */
+ SCOREACCUMULATE(respropl[x],lnprob);
+ }
+ }
+ /* If can't do all pairs because of end of subsequence, do all the */
+ /* distances that stay within the subsequence for x. */
+ get_sc_en(respropl,seq,st,en,offs,lib,maxfval, differentiator);
+}
+
+
+
+get_respropl (double respropl[POSNUM][MAXSEQLEN], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int differentiator)
+{
+ double lnprob=0;
+ int f, i, maxfval = 0;
+ int x, y, xoffs, yoffs;
+
+ for (f = 0; f < functnum; f++) {
+ if (maxfval < lib[f]) maxfval = lib[f];
+
+
+/**** Don't need to initialize since not accumulateing over distances***
+ for (i = st; i < en; i++)
+ respropl[lib[f]][i] = SCOREIDENTITY;
+***/
+ /* 0 for avg; HUGE_VAL for min */
+
+ }
+
+
+ /* Store residue propensity of position x in respropl[x] */
+ for (f = 0; f < functnum; f++) {
+ for (x = st, y = x+lib[f]+1; x < en-lib[f]-1; x++, y++) {
+ /* lib dist's off by 1 */
+ xoffs = getoffs(x,offs);
+ yoffs = getoffs(y,offs);
+
+ if (differentiator)
+ lnprob = lprop_diff_term(x,y,xoffs,yoffs,lib[f],seq);
+
+ else /* The normal paircoil score term. */
+ lnprob= lpropensity_term(x, y, xoffs, yoffs, lib[f], seq);
+
+
+
+ /* Incorporate new lnprob by averaging or taking min of it and the */
+ /* terms corresponding to other distances, depending on options.h. */
+ respropl[lib[f]][x] = lnprob;
+ }
+
+ while (x<en) { /* Deal with residues too close to end. */
+ /* Note that since we have a propensity */
+ /* for each distance, when we get to end*/
+ /* we don't want to use singles prob. since */
+ /* they were already used as the y value above */
+ /* in lpropensity_term. Setting this to 0 */
+ /* is like assuming it is random from genbnk. */
+ respropl[lib[f]][x] = 0;
+ x++;
+ }
+
+ }
+
+
+ /* If can't do all pairs because of end of subsequence, do all the */
+ /* distances that stay within the subsequence for x. */
+/***** If just computing for each dist, then end is dealt with above. ***/
+/****
+ get_sc_en(respropl,seq,st,en,offs,lib,maxfval, differentiator);
+****/
+}
+
+
+
+/*****************Ends Propensity computations.***************************/
+/*************************************************************************/
+/* Computes all window scores between st and en for a given offset
+ * and returns them in winscores.
+ * A winscore is the sum of the residue propensities of residues in that
+ * window, given that offset.
+ * A left residue propensity is the relative pair frequency minus the
+ * relative single frequency of the rightmost residue.
+ * lib contains the values dist-1.
+ * SCOREIDENTITY returns 0 if taking geometric mean and HUGEVAL if min.
+ * SCOREACCUMULATE(a,b) either averages the logs
+ * (i.e. takes the geometric mean) of the contribution of b into a
+ * or it stores min of a and b in a, depending on if AvgDist or MinDist
+ * is definied in options.h. */
+
+/** Note that windows of size PAIRWINDOW are used if possible. If a
+ * subsequence is shorter than PAIRWINDOW, but as long as MINWINDOW,
+ * then it is scored using a window the size of the subsequence
+ * (for more details see the function scseqadj). */
+
+get_bonnie_winscores (double winscores[], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int differentiator) /* Differentiator tells it to do the */
+ /* trimer-dimer diff method. */
+{
+ extern double init_class_prob[MAX_CLASS_NUMBER];
+
+ double scorel/*,scorer*/;
+ int i, j;
+ /* residue propensities going left to right */
+ double respropl[MAXSEQLEN]/*,respropr[MAXSEQLEN]*/;
+
+
+/* Get residue propensitites going left to right (respropl) */
+ get_bonnie_respropl(respropl,seq,st,en,offs,lib, differentiator);
+ /* get_respropr(respropr,seq,st,en,offs,lib); */
+
+
+/* Compute window scores in linear time */
+
+ /* Init score to be sum of first WINDOW-1 */
+ for (scorel=/*scorer=*/0, i = st; i < st + WINDOW-1; i++) {
+ scorel += respropl[i];
+ /* scorer += respropr[i]; */
+ }
+
+
+ /* Fill in winscores */
+ for (i = st, j = st + WINDOW-1; j < en; ++i, ++j) {
+ /* Add the new right side */
+ scorel += respropl[j];
+ /* scorer += respropr[j]; */
+
+
+ /* store window score at position i. A few options are given. */
+ /* Currently scoring is left to right, but functions of the scores */
+ /* in both directions can be taken. */
+
+ if (differentiator) { /** The probability estimate should actually be **/
+ /** the formula in the #else..... **/
+#ifdef NOT_PROB
+ winscores[i] = scorel; /** Just compute one term in the formula. **/
+
+#else
+ winscores[i] = -getdlog(init_class_prob[0]/init_class_prob[1]
+ * exp(scorel) + 1);
+#endif
+ }
+ else
+ winscores[i] = scorel;
+
+
+
+ /**************************/
+ /* winscores[i] = scorer; */
+ /* winscores[i] = (scorel+scorer)/2; */
+ /* winscores[i] = MIN(scorel,scorer);
+ /***************************/
+
+ /* Take away the left side */
+ scorel -= respropl[i];
+ /* scorer -= respropr[i]; */
+ }
+
+
+/************THIS CORRECTS THE BUG ****/
+ for (i= en - (WINDOW-1); i<en; i++) {
+ if (differentiator) {
+#ifdef NOT_PROB
+ winscores[i] = scorel; /** Just compute one term in the formula. **/
+#else
+ winscores[i] = -getdlog(init_class_prob[0]/init_class_prob[1]
+ * exp(scorel) + 1);
+#endif
+ }
+ else
+ winscores[i] = scorel;
+
+ scorel -= respropl[i];
+ }
+
+}
+
+
+
+
+/* Computes all window scores between st and en for a given offset
+ * and returns them in winscores.
+ * A winscore is the sum of the residue propensities of residues in that
+ * window, given that offset.
+ * A left residue propensity is the relative pair frequency minus the
+ * relative single frequency of the rightmost residue.
+ * lib contains the values dist-1.
+ * Some things defined in options.h. */
+
+/** Note that windows of size PAIRWINDOW are used if possible. If a
+ * subsequence is shorter than PAIRWINDOW, but as long as MINWINDOW,
+ * then it is scored using a window the size of the subsequence
+ * (for more details see the function scseqadj). */
+
+get_winscores (double winscores[POSNUM][MAXSEQLEN], char seq[], int st, int en, int offs, char lib[MAXFUNCTNUM], int differentiator) /* Differentiator tells it to do the */
+ /* trimer-dimer diff method. */
+{
+ extern double init_class_prob[MAX_CLASS_NUMBER];
+ int f;
+
+ double scorel/*,scorer*/;
+ int i, j;
+ /* residue propensities going left to right */
+ double respropl[POSNUM][MAXSEQLEN]/*,respropr[MAXSEQLEN]*/;
+
+
+/* Get residue propensitites going left to right (respropl) */
+ get_respropl(respropl,seq,st,en,offs,lib, differentiator);
+ /* get_respropr(respropr,seq,st,en,offs,lib); */
+
+
+/* Compute window scores in linear time */
+ for (f = 0; f < functnum; f++) {
+
+ /* Init score to be sum of first WINDOW-1 */
+ for (scorel=/*scorer=*/0, i = st; i < st + WINDOW-1; i++) {
+ scorel += respropl[lib[f]][i];
+ /* scorer += respropr[i]; */
+ }
+
+
+ /* Fill in winscores */
+ for (i = st, j = st + WINDOW-1; j < en; ++i, ++j) {
+ /* Add the new right side */
+ scorel += respropl[lib[f]][j];
+ /* scorer += respropr[j]; */
+
+
+ /* store window score at position i. A few options are given. */
+ /* Currently scoring is left to right, but functions of the scores */
+ /* in both directions can be taken.
+ */
+ if (differentiator) {
+#ifdef NOT_PROB
+ winscores[lib[f]][i] = scorel; /* Just compute one term in formula. */
+#else
+ winscores[lib[f]][i] = -getdlog(init_class_prob[0]/init_class_prob[1]
+ * exp(scorel) + 1);
+#endif
+ }
+ else
+ winscores[lib[f]][i] = scorel;
+
+ /**************************/
+ /* winscores[i] = scorer; */
+ /* winscores[i] = (scorel+scorer)/2; */
+ /* winscores[i] = MIN(scorel,scorer);
+ /***************************/
+
+ /* Take away the left side */
+ scorel -= respropl[lib[f]][i];
+ /* scorer -= respropr[i]; */
+ }
+
+
+/************THIS CORRECTS THE BUG ****/
+ for (i= en - (WINDOW-1); i<en; i++) {
+ if (differentiator) {
+#ifdef NOT_PROB
+ winscores[lib[f]][i] = scorel; /* Just compute one term in formula.*/
+#else
+ winscores[lib[f]][i] = -getdlog(init_class_prob[0]/init_class_prob[1]
+ * exp(scorel) + 1);
+#endif
+ }
+ else
+ winscores[lib[f]][i] = scorel;
+
+ scorel -= respropl[lib[f]][i];
+ }
+ }
+}
+
+
+
+/*************************************************************************/
+/************** The following begins the scoring for PairCoil ****/
+/*************************************************************************/
+
+
+/* Returns 0 if sequence too short, otherwise returns 1.
+ *
+ ** When this is called in calcseq of sc2seq.c
+ * The max window scores for residue i in offset off are stored in
+ * scs[i][off] and the max offset score is in sc2[i].
+
+ * max_over_windows() is a dynamic program to find the max window score
+ * containing a residue, and is in the file dyn.c.
+
+ * avg_over_windows() computes the average window score by using prefix
+ * computations to compute the log of the avg of exp(window_scores),
+ * and is also in dyn.c.
+
+ * differentiator =1 says use the trimer-dimer diff method.
+
+ ** NOTE that WINDOW is set to PAIRWINDOW in calcseq in sc2seq.c.
+ ** PAIRWINDOW and STOCKWINDOW and MINWINDOW are defined in scconst.h. */
+
+
+int scseqadj(char lib[],
+ char seq[], int seqlen,
+ double scores[MAXSEQLEN][POSNUM],
+ char offsets[MAXSEQLEN],
+ double *maxscore, int mode, int differentiator)
+
+{ int OLDWINDOW; /* temp var for const default window length */
+ double score, gmax;
+ /* double winscores[POSNUM][MAXSEQLEN], scorebuff[POSNUM][MAXSEQLEN]; */
+ double **winscores,**scorebuff;
+ double bonnie_winscores[MAXSEQLEN], bonnie_scorebuff[MAXSEQLEN];
+ double minwinscores[MAXSEQLEN];
+ int i, j, f;
+ int st, en, offs;
+ int ProlineFreeWin = mode & PROLINE_FREE_MODE;
+
+ winscores=(double**)calloc(POSNUM,sizeof(double*));
+ if(winscores==NULL){
+ printf("Failed to allocate winscores\n");exit(1);
+ }
+ scorebuff=(double**)calloc(POSNUM,sizeof(double*));
+ if(scorebuff==NULL){
+ printf("Failed to allocate scorebuff\n");exit(1);
+ }
+
+ for (i=0;i<POSNUM;i++){
+ winscores[i]=(double*)calloc(MAXSEQLEN,sizeof(double));
+ if(winscores[i]==NULL){
+ printf("Failed to allocate winscores[%i]",i);exit(1);
+ }
+ scorebuff[i]=(double*)calloc(MAXSEQLEN,sizeof(double));
+ if(scorebuff[i]==NULL){
+ printf("Failed to allocate scorebuff[%i]",i);exit(1);
+ }
+ }
+
+ if (seqlen <MIN(WINDOW,MINWINDOW)) { /* 0 out score for too short seq. */
+ for (i=0; i<seqlen; i++) {
+ offsets[i]=0;
+ for(j=0; j<POSNUM; j++)
+ scores[i][j]=-HUGE_VAL;
+ }
+ *maxscore = -HUGE_VAL;
+ return 0;
+ }
+
+ for (j = 0; j < POSNUM; j++)
+ for (i = 0; i < seqlen; scores[i++][j] = -HUGE_VAL);
+ for (i = 0; i < seqlen; offsets[i++] = 0);
+ gmax = -HUGE_VAL;
+
+
+ /* Get next subseq (not containing P's if ProlineFreeWindows defined in */
+ /* options.h. If a window of size WINDOW=PAIRWINDOW fits in the subseq */
+ /* then that window is scored. Otherwise, if a window of size MINWINDOW */
+ /* fits in the subseq, then that window is scored. Otherwise, the subseq */
+ /* is skipped. */
+
+ for (en=0; subsequence_advance(&st,&en,seq,seqlen,ProlineFreeWin);) {
+ if (en - st <MIN(WINDOW,MINWINDOW)) continue;
+ /* Don't score short regions. */
+ OLDWINDOW = WINDOW;
+
+
+ if (en - st <WINDOW) /* Change window size for subseq length between */
+ WINDOW = en - st; /* MINWINDOW and PAIRWINDOW. */
+
+ for (offs = 0; offs < POSNUM; offs++) {
+ #ifdef BONNIE_END_EFFECTS
+ get_bonnie_winscores(bonnie_winscores,seq,st,en,offs,lib,
+ differentiator);
+ #else
+ get_winscores(winscores,seq,st,en,offs,lib, differentiator);
+ #endif
+
+ /* Compute residue scores */
+ if ((en-st)>=WINDOW) {
+
+ if (!differentiator) {
+ #ifdef BONNIE_END_EFFECTS
+ max_over_windows(&(bonnie_scorebuff[st]),&(bonnie_winscores[st]),
+ en-st,WINDOW);
+ /** The score of a residue is its max window */
+ /** score over windows containing it for PAIRCOIL */
+ for (i=st; i<en; ++i)
+ scores[i][offs] = bonnie_scorebuff[i];
+
+
+ #else
+
+ if (mode & MAX_WINDOW_BEFORE_COMBINE_MODE) {
+ for (f=0; f<functnum; f++)
+ max_over_windows(&(scorebuff[lib[f]][st]),&
+ (winscores[lib[f]][st]),en-st,WINDOW);
+ /** The score of a residue is its max window */
+ /** score over windows containing it for PAIRCOIL */
+ for (i=st; i<en; ++i)
+ scores[i][offs] = combine_dist_scores(scorebuff,i,
+ functnum,lib,en,0);
+ }
+ else { /* Do score like done in PNAS paper. Note that end */
+ /* effects need to be dealt with in combine_dist_scores. */
+ for (i=st; i<en; i++) /* Put the combined score in winscores[0]*/
+ winscores[0][i] = combine_dist_scores(winscores,i,
+ functnum,lib,en,0);
+ /* Put the maxscore in scorebuff[0] */
+ max_over_windows(&(scorebuff[0][st]),&(winscores[0][st]),
+ en-st,WINDOW);
+
+ for (i=st; i<en; ++i)
+ scores[i][offs] = scorebuff[0][i];
+ }
+
+ #endif
+
+ }
+
+ else { /* Don't do max over window for differentiator.... */
+
+ #ifdef BONNIE_END_EFFECTS
+ if (differentiator == 2) { /* get max score **/
+ max_over_windows(&(bonnie_scorebuff[st]),&(bonnie_winscores[st]),
+ en-st,WINDOW);
+ for (i=st; i<en; ++i)
+ scores[i][offs] = bonnie_scorebuff[i];
+ }
+ else if (differentiator ==3) { /* get min score **/
+ min_over_windows(&(bonnie_scorebuff[st]),&(bonnie_winscores[st]),
+ en-st,WINDOW);
+ for (i=st; i<en; ++i)
+ scores[i][offs] = bonnie_scorebuff[i];
+ }
+ else if (differentiator ==1) /* Win score for win starting at i */
+ for (i=st; i<en; i++)
+ scores[i][offs] = bonnie_winscores[i];
+ #else
+ for (i=st; i<en; i++) /* Put the combined score in winscores[0]*/
+ winscores[0][i] = combine_dist_scores(winscores,i,
+ functnum,lib,en,0);
+
+ if (differentiator == 2) { /* get max score **/
+ max_over_windows(&(scorebuff[0][st]),&(winscores[0][st]),
+ en-st,WINDOW);
+ for (i=st; i<en; ++i)
+ scores[i][offs] = scorebuff[0][i];
+ }
+ else if (differentiator ==3){ /* get min score **/
+ min_over_windows(&(scorebuff[0][st]),&(winscores[0][st]),
+ en-st,WINDOW);
+ for (i=st; i<en; ++i)
+ scores[i][offs] = scorebuff[0][i];
+ }
+ else if (differentiator==1) /* Win score for win starting at i */
+ for (i=st; i<en; i++)
+ scores[i][offs] =
+ combine_dist_scores(winscores,i,functnum,lib,en,0);
+
+ #endif
+ }
+
+ }
+ }
+
+ /* Store the frame which maximizes the score of residue i in offset[i] */
+ for (i=st; i<en; ++i) {
+ double localmax = -HUGE_VAL;
+ for (offs=0; offs<POSNUM; ++offs)
+ if (localmax < scores[i][offs]) {
+ offsets[i] = offs;
+ localmax = scores[i][offs];
+ }
+ if (gmax < localmax)
+ gmax = localmax; /* maintain max score in subseq */
+ }
+ WINDOW = OLDWINDOW; /* restore default value (PAIRWINDOW) to WINDOW */
+ }
+
+#ifdef LOGSCALE
+ *maxscore = gmax;
+#else
+ *maxscore = exp(gmax/WINDOW);
+
+ for (i = 0; i < seqlen; i++)
+ for (offs=0; offs<POSNUM; ++offs)
+ scores[i][offs] = exp(scores[i][offs]/WINDOW);
+#endif
+
+
+ return 1;
+
+}
+
+
+
+
+/*************************************************************************/
+/***** Ends the PairCoil scoring section *****/
+/*************************************************************************/
+
+
+
+
+/*************************************************************************/
+/*************************************************************************/
+/*** The following compute the NewCoil stock score. *****/
+/*************************************************************************/
+
+
+/* Fill in window scores - used by scorestock()*/
+/**** WINDOW is set to STOCKWINDOW in calcseq in sc2seq.c ***/
+singles_windows (double winscores[], char seq[], int st, int en, int offs,
+ int stockstock)
+{
+ double score;
+ int i,j;
+ int xoffs;
+ /* Init score to be sum of first WINDOW-1 */
+ for (score=0, i = st; i < st + WINDOW -1; i++) {
+ xoffs = (i + offs) % POSNUM;
+
+ if (stockstock)
+ score += relative[seq[i]][xoffs]; /** Defined in scscore.c to compute */
+ /* singles prob from our their table */
+ else
+ score += pprobs[seq[i]][xoffs] -gprobs[seq[i]];
+ }
+
+ /* Fill in winscores */
+
+ for (i = st, j = st + WINDOW-1; j < en; ++i, ++j) {
+ /* Add the new right side */
+ xoffs = (j + offs) % POSNUM;
+ if (stockstock)
+ score += relative[seq[j]][xoffs]; /** Defined in scscore.c to compute */
+ /* singles prob from our their table */
+ else
+ score += pprobs[seq[j]][xoffs] -gprobs[seq[j]]; /* Computes from our */
+ /* table. */
+ winscores[i] = score;
+ /* Take away the left side */
+ xoffs = (i + offs) % POSNUM;
+ if (stockstock)
+ score -= relative[seq[i]][xoffs]; /** Defined in scscore.c to compute */
+ /* singles prob from our their table */
+ else
+ score -= pprobs[seq[i]][xoffs] -gprobs[seq[i]]; /* Computes from our */
+ /* table. */
+ }
+}
+
+
+
+
+/* Called from calcseq in sc2seq.c. Scores using the Newcoils algorithm. */
+/*** WINDOW is set to STOCKWINDOW in calcseq ***/
+int scorestock (char seq[MAXSEQLEN], int seqlen,
+ double scores[MAXSEQLEN][POSNUM],
+ char offsets[MAXSEQLEN],
+ double *maxscore, int ProlineFreeWin, int stockstock)
+{
+ double score, gmax;
+ double winscores[MAXSEQLEN], scorebuff[MAXSEQLEN];
+ int i, j;
+ int st, en, offs;
+
+ if (seqlen < WINDOW) return 0;
+
+ /* Initialize scores to very negative */
+ for (j = 0; j < POSNUM; j++)
+ for (i = 0; i < seqlen; scores[i++][j] = -HUGE_VAL);
+ for (i = 0; i < seqlen; offsets[i++] = 0);
+ gmax = -HUGE_VAL;
+
+/* Get first subsequence (en=0), do loop, then get next subseq until get */
+/* to end of sequence. */
+ for (en=0; subsequence_advance(&st,&en,seq,seqlen,ProlineFreeWin);) {
+ if (en - st < WINDOW) continue; /* Don't score short sequences, just */
+ /* en loop and get next subseq. */
+
+ for (offs = 0; offs < POSNUM; offs++) {
+ singles_windows(winscores,seq,st,en,offs,stockstock);
+ /* Get the score from stocks or our table for each */
+ /* window if offset is offs, where score is */
+ /* attributed to first residue in the window. */
+
+ max_over_windows(&(scorebuff[st]),&(winscores[st]),en-st,WINDOW);
+ /** The score of a residue is its max score */
+ /** over windows containing it in offset off. */
+ /** Temporarily stor this in scorebuff[i]. */
+ for (i=st; i<en; ++i)
+ scores[i][offs] = scorebuff[i];
+ }
+ for (i=st; i<en; ++i) { /* Find the best scoring offset in the coil*/
+ double localmax = -HUGE_VAL;
+ for (offs=0; offs<POSNUM; ++offs)
+ if (scores[i][offs]>localmax) {
+ offsets[i] = offs;
+ localmax = scores[i][offs];
+ }
+ if (localmax > gmax)
+ gmax = localmax;
+ }
+ }
+#ifdef LOGSCALE
+ *maxscore = gmax;
+#else
+ *maxscore = exp(gmax/WINDOW);
+
+ for (i = 0; i < seqlen; i++)
+ for (offs=0; offs<POSNUM; ++offs)
+ scores[i][offs] = exp(scores[i][offs]/WINDOW);
+#endif
+ return 1;
+}
+
+/*************************************************************************/
+/***** Ends the Stock scoring section *****/
+/*************************************************************************/
+
+
+
+
+
+int score_differentiator_window(char lib[MAXFUNCTNUM],
+ char *window_residues, int windowlen,
+ double window_score[POSNUM],
+ int mode, int differentiator)
+{
+ int off, i;
+ double respropl[MAXSEQLEN];
+
+ for (off=0; off<POSNUM; off++) {
+ window_score[off] = 0;
+ get_bonnie_respropl(respropl,window_residues,0,windowlen,off,lib, differentiator);
+ for (i=0;i<windowlen; i++) {
+ window_score[off] += respropl[i];
+ }
+ }
+}
+@