1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * SRE, Tue Nov 18 10:12:28 1997
14 * Sequence masking routines; corrections for biased composition
17 * The Claverie/States XNU code is not used by default because I
18 * consider X'ing out sequence to be too black/white and too
19 * aggressive, but it's available as an option.
21 * The Wooton/Federhen SEG code was studied, but deemed too
22 * nonportable to include; it would've suffered the same drawback
25 * The TraceScoreCorrection() code is the default.
27 * RCS $Id: masks.c,v 1.1.1.1 2005/03/22 08:34:02 cmzmasek Exp $
43 /* The PAM120 score matrix, in HMMER's AMINO_ALPHABET alphabetic order
45 static int xpam120[23][23] = {
46 { 3, -3, 0, 0, -4, 1, -3, -1, -2, -3, -2, -1, 1, -1, -3, 1, 1, 0, -7, -4, 1, 0, 0 },
47 {-3, 9, -7, -7, -6, -4, -4, -3, -7, -7, -6, -5, -4, -7, -4, 0, -3, -3, -8, -1, -4, -6, 0 },
48 { 0, -7, 5, 3, -7, 0, 0, -3, -1, -5, -4, 2, -3, 1, -3, 0, -1, -3, -8, -5, 5, 3, 0 },
49 { 0, -7, 3, 5, -7, -1, -1, -3, -1, -4, -3, 1, -2, 2, -3, -1, -2, -3, -8, -5, 3, 5, 0 },
50 {-4, -6, -7, -7, 8, -5, -3, 0, -7, 0, -1, -4, -5, -6, -5, -3, -4, -3, -1, 4, -4, -5, 0 },
51 { 1, -4, 0, -1, -5, 5, -4, -4, -3, -5, -4, 0, -2, -3, -4, 1, -1, -2, -8, -6, 1, -1, 0 },
52 {-3, -4, 0, -1, -3, -4, 7, -4, -2, -3, -4, 2, -1, 3, 1, -2, -3, -3, -3, -1, 2, 2, 0 },
53 {-1, -3, -3, -3, 0, -4, -4, 6, -3, 1, 1, -2, -3, -3, -2, -2, 0, 3, -6, -2, -2, -2, 0 },
54 {-2, -7, -1, -1, -7, -3, -2, -3, 5, -4, 0, 1, -2, 0, 2, -1, -1, -4, -5, -5, 1, 0, 0 },
55 {-3, -7, -5, -4, 0, -5, -3, 1, -4, 5, 3, -4, -3, -2, -4, -4, -3, 1, -3, -2, -3, -2, 0 },
56 {-2, -6, -4, -3, -1, -4, -4, 1, 0, 3, 8, -3, -3, -1, -1, -2, -1, 1, -6, -4, -3, -1, 0 },
57 {-1, -5, 2, 1, -4, 0, 2, -2, 1, -4, -3, 4, -2, 0, -1, 1, 0, -3, -4, -2, 4, 1, 0 },
58 { 1, -4, -3, -2, -5, -2, -1, -3, -2, -3, -3, -2, 6, 0, -1, 1, -1, -2, -7, -6, -1, 0, 0 },
59 {-1, -7, 1, 2, -6, -3, 3, -3, 0, -2, -1, 0, 0, 6, 1, -2, -2, -3, -6, -5, 1, 5, 0 },
60 {-3, -4, -3, -3, -5, -4, 1, -2, 2, -4, -1, -1, -1, 1, 6, -1, -2, -3, 1, -5, -1, 0, 0 },
61 { 1, 0, 0, -1, -3, 1, -2, -2, -1, -4, -2, 1, 1, -2, -1, 3, 2, -2, -2, -3, 1, 0, 0 },
62 { 1, -3, -1, -2, -4, -1, -3, 0, -1, -3, -1, 0, -1, -2, -2, 2, 4, 0, -6, -3, 1, -1, 0 },
63 { 0, -3, -3, -3, -3, -2, -3, 3, -4, 1, 1, -3, -2, -3, -3, -2, 0, 5, -8, -3, -2, -2, 0 },
64 {-7, -8, -8, -8, -1, -8, -3, -6, -5, -3, -6, -4, -7, -6, 1, -2, -6, -8, 12, -2, -5, -6, 0 },
65 {-4, -1, -5, -5, 4, -6, -1, -2, -5, -2, -4, -2, -6, -5, -5, -3, -3, -3, -2, 8, -2, -4, 0 },
66 { 1, -4, 5, 3, -4, 1, 2, -2, 1, -3, -3, 4, -1, 1, -1, 1, 1, -2, -5, -2, 6, 4, 0 },
67 { 0, -6, 3, 5, -5, -1, 2, -2, 0, -2, -1, 1, 0, 5, 0, 0, -1, -2, -6, -4, 4, 6, 0 },
68 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
73 * Date: 18 Nov 1997 [StL]
75 * Purpose: x-out of repetitive sequence. XNU tends to be
76 * good at x'ing out short period tandem repeats.
78 * Note: Apply /only/ to protein sequence.
80 * Args: dsq: 1..len digitized sequence
83 * Return: number of characters x'ed out.
86 XNU(char *dsq, int len)
88 int i,k,off,sum,beg,end,top;
91 int noff = 4; /* maximum search offset */
95 double lambda = 0.346574;
100 if (len == 0) return 0;
102 hit = MallocOrDie(sizeof(int) * (len+1));
103 for (i=1; i<=len; i++) hit[i]=0;
106 ** Determine the score cutoff so that pcut will be the fraction
107 ** of random sequence eliminated assuming lambda, K, and H are
108 ** characteristic of the database as a whole
110 s0 = - log( pcut*H / (noff*K) ) / lambda;
111 if (s0>0) topcut = floor(s0 + log(s0)/lambda + 0.5);
113 fallcut = (int)log(K/0.001)/lambda;
115 for (off=mcut; off<=noff; off++) {
120 for (i=off+1; i<=len; i++) {
121 sum += xpam120[(int) dsq[i]][(int) dsq[i-off]];
126 if (top>=topcut && top-sum>fallcut) {
127 for (k=beg; k<=end; k++)
128 hit[k] = hit[k-off] = 1;
131 } else if (top-sum>fallcut) {
141 for (k=beg; k<=end; k++)
142 hit[k] = hit[k-off] = 1;
146 /* Now mask off detected repeats
148 for (i=1; i<=len; i++)
149 if (hit[i]) { xnum++; dsq[i] = Alphabet_iupac-1;} /* e.g. 'X' */
156 /* Function: TraceScoreCorrection()
157 * Date: Sun Dec 21 12:05:47 1997 [StL]
159 * Purpose: Calculate a correction (in integer log_2 odds) to be
160 * applied to a sequence, using a second null model,
161 * based on a traceback. M/I emissions are corrected;
162 * C/N/J are not -- as if the nonmatching part and
163 * matching part were each generated by the best null model.
164 * The null model is constructed /post hoc/ as the
165 * average over all the M,I distributions used by the trace.
167 * Return: the log_2-odds score correction.
170 TraceScoreCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq)
172 float p[MAXABET]; /* null model distribution */
173 int sc[MAXCODE]; /* null model scores */
178 /* Set up model: average over the emission distributions of
179 * all M, I states that appear in the trace. Ad hoc? Sure, you betcha.
181 FSet(p, Alphabet_size, 0.0);
182 for (tpos = 0; tpos < tr->tlen; tpos++)
183 if (tr->statetype[tpos] == STM)
184 FAdd(p, hmm->mat[tr->nodeidx[tpos]], Alphabet_size);
185 else if (tr->statetype[tpos] == STI)
186 FAdd(p, hmm->ins[tr->nodeidx[tpos]], Alphabet_size);
187 FNorm(p, Alphabet_size);
189 for (x = 0; x < Alphabet_size; x++)
190 sc[x] = Prob2Score(p[x], hmm->null[x]);
191 /* could avoid this chunk if we knew
192 we didn't need any degenerate char scores */
193 for (x = Alphabet_size; x < Alphabet_iupac; x++)
194 sc[x] = DegenerateSymbolScore(p, hmm->null, x);
197 /* Score all the M,I state emissions that appear in the trace.
200 for (tpos = 0; tpos < tr->tlen; tpos++)
201 if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STI)
202 score += sc[(int) dsq[tr->pos[tpos]]];
204 /* Apply an ad hoc 8 bit fudge factor penalty;
205 * interpreted as a prior, saying that the second null model is
206 * 1/2^8 (1/256) as likely as the standard null model
208 score -= 8 * INTSCALE;
210 /* Return the correction to the bit score.
212 return Scorify(ILogsum(0, score));
216 /* THE FOLLOWING CODE IS IN DEVELOPMENT.
217 * it is commented out of the current release deliberately.
218 * If you activate it, I'm not responsible for the consequences.
220 #if MICHAEL_JORDAN_BUYS_THE_PACERS
221 /* Function: NewTraceScoreCorrection()
222 * Date: Wed Feb 17 14:32:45 1999 [StL]
224 * Purpose: Calculate a correction (in integer log_2 odds) to be
225 * applied to a sequence, using a second null model,
226 * based on sequence endpoints. M/I emissions are corrected;
227 * C/N/J are not -- as if the nonmatching part and
228 * matching part were each generated by the best null model.
229 * Each null model is constructed /post hoc/ from the
230 * sequence composition of each matching domain (e.g.
231 * a null2 model is constructed for each domain in a
234 * Constraints on the construction of this function include:
235 * 1) Paracel hardware can't deal with trace-dependent
236 * null2 models. Original implementation of
237 * TraceScoreCorrection() was dependent on traceback
238 * and could not be reproduced on GeneMatcher.
239 * GeneMatcher may be able to deal w/ sequence endpoint
240 * dependent rescoring, though.
241 * Although this function looks like it's trace-
242 * dependent (because it's being passed a p7trace_s
243 * structure), it's really not; only the sequence
244 * endpoints are being used.
246 * 2) It is desirable that for multihit traces,
247 * per-domain scores sum to the per-sequence score.
248 * Otherwise people see this as a "bug" (cf.
249 * bug #2, David Kerk, NRC). HMMER calculates the
250 * per-domain scores by going through a separate
251 * TraceScore() call for each one and separately
252 * correcting them with TraceScoreCorrection(),
253 * so we have to do each domain in a full trace
254 * by a similar mechanism -- even if this means that
255 * we're adopting a very dubiously post hoc
258 * Return: the log_2-odds score correction.
261 NewTraceScoreCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq)
263 float ct[MAXABET]; /* counts of observed residues */
264 float p[MAXABET]; /* null2 model distribution (also counts) */
265 float sc[MAXCODE]; /* null2 model scores (as floats not int) */
269 int score; /* tmp score for real HMM, integer logodds */
270 float hmmscore; /* score for real HMM for this domain */
271 float null2score; /* score for null2 model for this domain */
274 float totscore; /* overall score for trace */
275 float maxscore; /* best score so far for single domain */
276 int in_domain; /* flag for whether we're counting this domain */
277 int sym; /* digitized symbol in dsq */
278 int ndom; /* number of domains counted towards score */
280 int nsym; /* number of symbols in this alignment */
286 for (tpos = 0; tpos < tr->tlen; tpos++)
288 /* detect start of domain; start at N or J */
289 if (tpos < tr->tlen-1 && tr->statetype[tpos+1] == STB)
291 FCopy(ct, hmm->null, Alphabet_size); /* simple Dirichlet prior */
297 /* Count stuff in domain starting with N->B or J->B transition */
299 sym = (int) dsq[tr->pos[tpos]];
301 /* count emitted symbols in domain */
302 if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STI)
304 P7CountSymbol(ct, sym, 1.0);
308 /* score emitted symbols in domain towards HMM */
309 if (tr->statetype[tpos] == STM)
310 score += hmm->msc[sym][tr->nodeidx[tpos]];
311 else if (tr->statetype[tpos] == STI)
312 score += hmm->isc[sym][tr->nodeidx[tpos]];
313 /* score transitions in domain towards HMM */
314 score += TransitionScoreLookup(hmm,
315 tr->statetype[tpos], tr->nodeidx[tpos],
316 tr->statetype[tpos+1], tr->nodeidx[tpos+1]);
320 if (tr->statetype[tpos] == STE) /* done w/ a domain; calc its score */
322 /* convert counts to null2 prob distribution */
323 FCopy(p, ct, Alphabet_size);
324 FNorm(p, Alphabet_size);
325 /* Convert probs to log-odds_e scores */
326 /* p can't be zero, because of prior */
327 for (x = 0; x < Alphabet_size; x++)
328 sc[x] = log(p[x] / hmm->null[x]);
329 /* null2 score = counts \dot scores */
330 null2score = FDot(ct, sc, Alphabet_size);
332 printf("NSYM = %d NULL2 = %.1f\n", nsym, null2score);
334 /* Apply an ad hoc 12 bit fudge factor penalty, per domain.
335 * Interpreted probabilistically, saying that there's about
336 * a 1/256 probability to transition into the second null model.
340 /* Now correct score1 using the null2 score.
341 * If it's still > 0, add it to accumulated score.
343 hmmscore = Scorify(score);
344 hmmscore -= 1.44269504 * LogSum(0, null2score);
345 if (hmmscore > 0.) { totscore += hmmscore; ndom++; }
346 if (hmmscore > maxscore) maxscore = hmmscore;
352 /* Single domain special case.
354 if (ndom == 0) totscore = maxscore;
356 /* Return the correction to the bit score
358 return (P7TraceScore(hmm, dsq, tr) - totscore);
364 SantaCruzCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq)
366 return 0.0; /* UNFINISHED CODE */