forester/archive/RIO/others/hmmer/src/masks.c

   1 /************************************************************
   2  * HMMER - Biological sequence analysis with profile HMMs
   3  * Copyright (C) 1992-1999 Washington University School of Medicine
   4  * All Rights Reserved
   5  *
   6  *     This source code is distributed under the terms of the
   7  *     GNU General Public License. See the files COPYING and LICENSE
   8  *     for details.
   9  ************************************************************/
  10
  11 /* masks.c
  12  * SRE, Tue Nov 18 10:12:28 1997
  13  *
  14  * Sequence masking routines; corrections for biased composition
  15  * target sequences.
  16  *
  17  * The Claverie/States XNU code is not used by default because I
  18  * consider X'ing out sequence to be too black/white and too
  19  * aggressive, but it's available as an option.
  20  *
  21  * The Wooton/Federhen SEG code was studied, but deemed too
  22  * nonportable to include; it would've suffered the same drawback
  23  * as XNU.
  24  *
  25  * The TraceScoreCorrection() code is the default.
  26  *
  27  * RCS $Id: masks.c,v 1.1.1.1 2005/03/22 08:34:02 cmzmasek Exp $
  28  */
  29
  30 #include <stdio.h>
  31 #include <math.h>
  32 #include <float.h>
  33
  34 #include "squid.h"
  35 #include "config.h"
  36 #include "structs.h"
  37 #include "funcs.h"
  38
  39 #ifdef MEMDEBUG
  40 #include "dbmalloc.h"
  41 #endif
  42
  43 /* The PAM120 score matrix, in HMMER's AMINO_ALPHABET alphabetic order
  44  */
  45 static int xpam120[23][23] = {
  46   { 3, -3,  0,  0, -4,  1, -3, -1, -2, -3, -2, -1,  1, -1, -3,  1,  1,  0, -7, -4,  1,  0,  0 },
  47   {-3,  9, -7, -7, -6, -4, -4, -3, -7, -7, -6, -5, -4, -7, -4,  0, -3, -3, -8, -1, -4, -6,  0 },
  48   { 0, -7,  5,  3, -7,  0,  0, -3, -1, -5, -4,  2, -3,  1, -3,  0, -1, -3, -8, -5,  5,  3,  0 },
  49   { 0, -7,  3,  5, -7, -1, -1, -3, -1, -4, -3,  1, -2,  2, -3, -1, -2, -3, -8, -5,  3,  5,  0 },
  50   {-4, -6, -7, -7,  8, -5, -3,  0, -7,  0, -1, -4, -5, -6, -5, -3, -4, -3, -1,  4, -4, -5,  0 },
  51   { 1, -4,  0, -1, -5,  5, -4, -4, -3, -5, -4,  0, -2, -3, -4,  1, -1, -2, -8, -6,  1, -1,  0 },
  52   {-3, -4,  0, -1, -3, -4,  7, -4, -2, -3, -4,  2, -1,  3,  1, -2, -3, -3, -3, -1,  2,  2,  0 },
  53   {-1, -3, -3, -3,  0, -4, -4,  6, -3,  1,  1, -2, -3, -3, -2, -2,  0,  3, -6, -2, -2, -2,  0 },
  54   {-2, -7, -1, -1, -7, -3, -2, -3,  5, -4,  0,  1, -2,  0,  2, -1, -1, -4, -5, -5,  1,  0,  0 },
  55   {-3, -7, -5, -4,  0, -5, -3,  1, -4,  5,  3, -4, -3, -2, -4, -4, -3,  1, -3, -2, -3, -2,  0 },
  56   {-2, -6, -4, -3, -1, -4, -4,  1,  0,  3,  8, -3, -3, -1, -1, -2, -1,  1, -6, -4, -3, -1,  0 },
  57   {-1, -5,  2,  1, -4,  0,  2, -2,  1, -4, -3,  4, -2,  0, -1,  1,  0, -3, -4, -2,  4,  1,  0 },
  58   { 1, -4, -3, -2, -5, -2, -1, -3, -2, -3, -3, -2,  6,  0, -1,  1, -1, -2, -7, -6, -1,  0,  0 },
  59   {-1, -7,  1,  2, -6, -3,  3, -3,  0, -2, -1,  0,  0,  6,  1, -2, -2, -3, -6, -5,  1,  5,  0 },
  60   {-3, -4, -3, -3, -5, -4,  1, -2,  2, -4, -1, -1, -1,  1,  6, -1, -2, -3,  1, -5, -1,  0,  0 },
  61   { 1,  0,  0, -1, -3,  1, -2, -2, -1, -4, -2,  1,  1, -2, -1,  3,  2, -2, -2, -3,  1,  0,  0 },
  62   { 1, -3, -1, -2, -4, -1, -3,  0, -1, -3, -1,  0, -1, -2, -2,  2,  4,  0, -6, -3,  1, -1,  0 },
  63   { 0, -3, -3, -3, -3, -2, -3,  3, -4,  1,  1, -3, -2, -3, -3, -2,  0,  5, -8, -3, -2, -2,  0 },
  64   {-7, -8, -8, -8, -1, -8, -3, -6, -5, -3, -6, -4, -7, -6,  1, -2, -6, -8, 12, -2, -5, -6,  0 },
  65   {-4, -1, -5, -5,  4, -6, -1, -2, -5, -2, -4, -2, -6, -5, -5, -3, -3, -3, -2,  8, -2, -4,  0 },
  66   { 1, -4,  5,  3, -4,  1,  2, -2,  1, -3, -3,  4, -1,  1, -1,  1,  1, -2, -5, -2,  6,  4,  0 },
  67   { 0, -6,  3,  5, -5, -1,  2, -2,  0, -2, -1,  1,  0,  5,  0,  0, -1, -2, -6, -4,  4,  6,  0 },
  68   { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
  69 };
  70
  71
  72 /* Function: XNU()
  73  * Date:     18 Nov 1997 [StL]
  74  *
  75  * Purpose:  x-out of repetitive sequence. XNU tends to be
  76  *           good at x'ing out short period tandem repeats.
  77  *
  78  * Note:     Apply /only/ to protein sequence.
  79  *
  80  * Args:     dsq: 1..len digitized sequence
  81  *           len: length of dsq
  82  *
  83  * Return:   number of characters x'ed out.
  84  */
  85 int
  86 XNU(char *dsq, int len)
  87 {
  88   int    i,k,off,sum,beg,end,top;
  89   int    topcut,fallcut;
  90   double s0;
  91   int    noff = 4;              /* maximum search offset */
  92   int    mcut = 1;
  93   double pcut = 0.01;
  94   int   *hit;
  95   double lambda = 0.346574;
  96   double K      = 0.2;
  97   double H      = 0.664;
  98   int    xnum   = 0;
  99
 100   if (len == 0) return 0;
 101
 102   hit = MallocOrDie(sizeof(int) * (len+1));
 103   for (i=1; i<=len; i++) hit[i]=0;
 104
 105   /*
 106   ** Determine the score cutoff so that pcut will be the fraction
 107   ** of random sequence eliminated assuming lambda, K, and H are
 108   ** characteristic of the database as a whole
 109   */
 110   s0 = - log( pcut*H / (noff*K) ) / lambda;
 111   if (s0>0) topcut = floor(s0 + log(s0)/lambda + 0.5);
 112   else topcut = 0;
 113   fallcut = (int)log(K/0.001)/lambda;
 114
 115   for (off=mcut; off<=noff; off++) {
 116     sum=top=0;
 117     beg=off;
 118     end=0;
 119
 120     for (i=off+1; i<=len; i++) {
 121       sum += xpam120[(int) dsq[i]][(int) dsq[i-off]];
 122       if (sum>top) {
 123         top=sum;
 124         end=i;
 125       }
 126       if (top>=topcut && top-sum>fallcut) {
 127         for (k=beg; k<=end; k++)
 128           hit[k] = hit[k-off] = 1;
 129         sum=top=0;
 130         beg=end=i+1;
 131       } else if (top-sum>fallcut) {
 132         sum=top=0;
 133         beg=end=i+1;
 134       }
 135       if (sum<0) {
 136         beg=end=i+1;
 137         sum=top=0;
 138       }
 139     }
 140     if (top>=topcut) {
 141       for (k=beg; k<=end; k++)
 142         hit[k] = hit[k-off] = 1;
 143     }
 144   }
 145
 146   /* Now mask off detected repeats
 147    */
 148   for (i=1; i<=len; i++)
 149     if (hit[i]) { xnum++; dsq[i] = Alphabet_iupac-1;} /* e.g. 'X' */
 150
 151   free(hit);
 152   return xnum;
 153 }
 154
 155
 156 /* Function: TraceScoreCorrection()
 157  * Date:     Sun Dec 21 12:05:47 1997 [StL]
 158  *
 159  * Purpose:  Calculate a correction (in integer log_2 odds) to be
 160  *           applied to a sequence, using a second null model,
 161  *           based on a traceback. M/I emissions are corrected;
 162  *           C/N/J are not -- as if the nonmatching part and
 163  *           matching part were each generated by the best null model.
 164  *           The null model is constructed /post hoc/ as the
 165  *           average over all the M,I distributions used by the trace.
 166  *
 167  * Return:   the log_2-odds score correction.
 168  */
 169 float
 170 TraceScoreCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq)
 171 {
 172   float p[MAXABET];             /* null model distribution */
 173   int   sc[MAXCODE];            /* null model scores       */
 174   int   x;
 175   int   tpos;
 176   int   score;
 177
 178   /* Set up model: average over the emission distributions of
 179    * all M, I states that appear in the trace. Ad hoc? Sure, you betcha.
 180    */
 181   FSet(p, Alphabet_size, 0.0);
 182   for (tpos = 0; tpos < tr->tlen; tpos++)
 183      if      (tr->statetype[tpos] == STM)
 184        FAdd(p, hmm->mat[tr->nodeidx[tpos]], Alphabet_size);
 185      else if (tr->statetype[tpos] == STI)
 186        FAdd(p, hmm->ins[tr->nodeidx[tpos]], Alphabet_size);
 187   FNorm(p, Alphabet_size);
 188
 189   for (x = 0; x < Alphabet_size; x++)
 190     sc[x] = Prob2Score(p[x], hmm->null[x]);
 191                                 /* could avoid this chunk if we knew
 192                                    we didn't need any degenerate char scores */
 193   for (x = Alphabet_size; x < Alphabet_iupac; x++)
 194     sc[x] = DegenerateSymbolScore(p, hmm->null, x);
 195
 196
 197   /* Score all the M,I state emissions that appear in the trace.
 198    */
 199    score = 0;
 200    for (tpos = 0; tpos < tr->tlen; tpos++)
 201      if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STI)
 202        score += sc[(int) dsq[tr->pos[tpos]]];
 203
 204    /* Apply an ad hoc 8 bit fudge factor penalty;
 205     * interpreted as a prior, saying that the second null model is
 206     * 1/2^8 (1/256) as likely as the standard null model
 207     */
 208    score -= 8 * INTSCALE;
 209
 210    /* Return the correction to the bit score.
 211     */
 212    return Scorify(ILogsum(0, score));
 213 }
 214
 215
 216 /* THE FOLLOWING CODE IS IN DEVELOPMENT.
 217  * it is commented out of the current release deliberately.
 218  * If you activate it, I'm not responsible for the consequences.
 219  */
 220 #if MICHAEL_JORDAN_BUYS_THE_PACERS
 221 /* Function: NewTraceScoreCorrection()
 222  * Date:     Wed Feb 17 14:32:45 1999 [StL]
 223  *
 224  * Purpose:  Calculate a correction (in integer log_2 odds) to be
 225  *           applied to a sequence, using a second null model,
 226  *           based on sequence endpoints. M/I emissions are corrected;
 227  *           C/N/J are not -- as if the nonmatching part and
 228  *           matching part were each generated by the best null model.
 229  *           Each null model is constructed /post hoc/ from the
 230  *           sequence composition of each matching domain (e.g.
 231  *           a null2 model is constructed for each domain in a
 232  *           multihit trace).
 233  *
 234  *           Constraints on the construction of this function include:
 235  *            1) Paracel hardware can't deal with trace-dependent
 236  *               null2 models. Original implementation of
 237  *               TraceScoreCorrection() was dependent on traceback
 238  *               and could not be reproduced on GeneMatcher.
 239  *               GeneMatcher may be able to deal w/ sequence endpoint
 240  *               dependent rescoring, though.
 241  *               Although this function looks like it's trace-
 242  *               dependent (because it's being passed a p7trace_s
 243  *               structure), it's really not; only the sequence
 244  *               endpoints are being used.
 245  *
 246  *            2) It is desirable that for multihit traces,
 247  *               per-domain scores sum to the per-sequence score.
 248  *               Otherwise people see this as a "bug" (cf.
 249  *               bug #2, David Kerk, NRC). HMMER calculates the
 250  *               per-domain scores by going through a separate
 251  *               TraceScore() call for each one and separately
 252  *               correcting them with TraceScoreCorrection(),
 253  *               so we have to do each domain in a full trace
 254  *               by a similar mechanism -- even if this means that
 255  *               we're adopting a very dubiously post hoc
 256  *               null model.
 257  *
 258  * Return:   the log_2-odds score correction.
 259  */
 260 float
 261 NewTraceScoreCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq)
 262 {
 263   float ct[MAXABET];            /* counts of observed residues            */
 264   float p[MAXABET];             /* null2 model distribution (also counts) */
 265   float sc[MAXCODE];            /* null2 model scores (as floats not int) */
 266
 267   int   x;
 268   int   tpos;
 269   int   score;                  /* tmp score for real HMM, integer logodds */
 270   float hmmscore;               /* score for real HMM for this domain */
 271   float null2score;             /* score for null2 model for this domain */
 272
 273
 274   float totscore;               /* overall score for trace */
 275   float maxscore;               /* best score so far for single domain */
 276   int   in_domain;              /* flag for whether we're counting this domain */
 277   int   sym;                    /* digitized symbol in dsq */
 278   int   ndom;                   /* number of domains counted towards score */
 279
 280   int   nsym;                   /* number of symbols in this alignment */
 281
 282   totscore  = 0.;
 283   maxscore  = -FLT_MAX;
 284   in_domain = FALSE;
 285   ndom      = 0;
 286   for (tpos = 0; tpos < tr->tlen; tpos++)
 287     {
 288                                 /* detect start of domain; start at N or J */
 289       if (tpos < tr->tlen-1 && tr->statetype[tpos+1] == STB)
 290         {
 291           FCopy(ct, hmm->null, Alphabet_size);  /* simple Dirichlet prior */
 292           score      = 0;
 293           null2score = 0.;
 294           nsym       = 0;
 295           in_domain  = TRUE;
 296         }
 297                 /* Count stuff in domain starting with N->B or J->B transition */
 298       if (in_domain) {
 299         sym = (int) dsq[tr->pos[tpos]];
 300
 301                                 /* count emitted symbols in domain */
 302         if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STI)
 303           {
 304             P7CountSymbol(ct, sym, 1.0);
 305             nsym++;
 306           }
 307
 308                                 /* score emitted symbols in domain towards HMM */
 309         if (tr->statetype[tpos] == STM)
 310           score += hmm->msc[sym][tr->nodeidx[tpos]];
 311         else if (tr->statetype[tpos] == STI)
 312           score += hmm->isc[sym][tr->nodeidx[tpos]];
 313                                 /* score transitions in domain towards HMM */
 314         score += TransitionScoreLookup(hmm,
 315                                        tr->statetype[tpos], tr->nodeidx[tpos],
 316                                        tr->statetype[tpos+1], tr->nodeidx[tpos+1]);
 317       }
 318
 319
 320       if (tr->statetype[tpos] == STE) /* done w/ a domain; calc its score */
 321         {
 322                                       /* convert counts to null2 prob distribution */
 323           FCopy(p, ct, Alphabet_size);
 324           FNorm(p, Alphabet_size);
 325                                       /* Convert probs to log-odds_e scores */
 326                                       /* p can't be zero, because of prior  */
 327           for (x = 0; x < Alphabet_size; x++)
 328             sc[x] = log(p[x] / hmm->null[x]);
 329                                       /* null2 score = counts \dot scores */
 330           null2score = FDot(ct, sc, Alphabet_size);
 331
 332           printf("NSYM = %d   NULL2 = %.1f\n", nsym, null2score);
 333
 334           /* Apply an ad hoc 12 bit fudge factor penalty, per domain.
 335            * Interpreted probabilistically, saying that there's about
 336            * a 1/256 probability to transition into the second null model.
 337            */
 338           null2score  -= 12.;
 339
 340           /* Now correct score1 using the null2 score.
 341            * If it's still > 0, add it to accumulated score.
 342            */
 343           hmmscore  = Scorify(score);
 344           hmmscore -= 1.44269504 * LogSum(0, null2score);
 345           if (hmmscore > 0.) { totscore += hmmscore; ndom++; }
 346           if (hmmscore > maxscore) maxscore = hmmscore;
 347
 348           in_domain = FALSE;
 349         }
 350     }
 351
 352   /* Single domain special case.
 353    */
 354   if (ndom == 0) totscore = maxscore;
 355
 356    /* Return the correction to the bit score
 357     */
 358    return (P7TraceScore(hmm, dsq, tr) - totscore);
 359 }
 360 #endif /*0*/
 361
 362
 363 float
 364 SantaCruzCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq)
 365 {
 366   return 0.0;                   /* UNFINISHED CODE */
 367 }