forester/archive/RIO/others/hmmer/src/emulation.c

   1 /************************************************************
   2  * HMMER - Biological sequence analysis with profile HMMs
   3  * Copyright (C) 1992-1999 Washington University School of Medicine
   4  * All Rights Reserved
   5  *
   6  *     This source code is distributed under the terms of the
   7  *     GNU General Public License. See the files COPYING and LICENSE
   8  *     for details.
   9  ************************************************************/
  10
  11 /* emulation.c
  12  * SRE, Wed Jan 21 07:50:01 1998
  13  *
  14  * Interfaces between HMMER and other software packages.
  15  *
  16  * RCS $Id: emulation.c,v 1.1.1.1 2005/03/22 08:34:01 cmzmasek Exp $
  17  */
  18
  19 #include <stdio.h>
  20 #include <string.h>
  21
  22 #include "squid.h"
  23 #include "config.h"
  24 #include "structs.h"
  25 #include "funcs.h"
  26 #include "version.h"
  27
  28
  29 /* Function: WriteProfile()
  30  * Date:     SRE, Wed Jan 21 07:58:09 1998 [St. Louis]
  31  *
  32  * Purpose:  Given an HMM, write a GCG profile .prf file as
  33  *           output. Based on examination of Michael Gribskov's Fortran
  34  *           source in GCG 9.1; on reverse engineering
  35  *           by examination of GCG 9.1 output from "profilemake"
  36  *           and how the .prf file is used by "profilesearch";
  37  *           and on the GCG 9.0 documentation.
  38  *
  39  *           See notes 28 Jan 98 for detail; in brief, the conversion goes like:
  40  *
  41  *           PROF(i,k) = match score         =  msc(i,k) + TMM(k-1)
  42  *
  43  *           GAP(k)    = cost per insertion  =  TMI(k-1) + TIM(k-1) - TMM(k-1) - TII(k-1)
  44  *           LEN(k)    = cost per inserted x =  TII(k-1)
  45  *
  46  *           QGAP(k)   = cost per deletion   =  TDM(k-1) + TMD(unknown) - TMM(k-1) - TDD(k-1)
  47  *           QLEN(k)   = cost per deleted k  =  TDD(k-1)
  48  *
  49  *           Note that GCG affine gaps are GAP + n * LEN;
  50  *           HMMER affine gaps count (n-1) * gap-extend, thus an
  51  *           extra TII gets taken away from GAP (and TDD from QGAP),
  52  *           since GCG will charge it.
  53  *
  54  *           Also note how the TMM transitions, which have no equivalent
  55  *           in a profile, get smuggled in OK.
  56  *
  57  *           Also note that GCG charges gaps using the profile position
  58  *           /after/ the gap, not preceding the gap as HMMER does.
  59  *
  60  *           Also note the TMD(unknown) in the QGAP calculation. HMMER
  61  *           distinguishes between gap-open and gap-close, but GCG does not,
  62  *           so there is a fundamental incompatibility here. Here
  63  *           we use an upper (best-scoring, minimum-cost) bound.
  64  *
  65  *           And finally note that GCG's implementation forces GAP=QGAP and
  66  *           LEN=QLEN. Here, we upper bound again. Compugen's implementation
  67  *           allows an "extended profile" format which distinguishes between
  68  *           the two.
  69  *
  70  *           The upper bound approach to these scores means that a
  71  *           score given by an emulated profile is an upper bound: the HMMER
  72  *           score (for a single Smith/Waterman style local alignment)
  73  *           cannot be better than this. This is intentional, so that
  74  *           the Compugen BIC can be used for rapid prefiltering of
  75  *           the database.
  76  *
  77  *           To get a close approximation of hmmsw scores, call
  78  *           profilesearch as
  79  *                profilesearch -noave -nonor -gap 10 -len 1
  80  *           On the Compugen BIC, using extended profiles, you want:
  81  *                om -model=xsw.model -gapop=10 -gapext=1 -qgapop=10 -qgapext=1 -noave -nonor
  82  *
  83  * Args:     fp      - open FILE to write to (or stdout, possibly)
  84  *           hmm     - the HMM to write
  85  *           do_xsw  - TRUE to write Compugen's experimental extended profile format
  86  *
  87  * Returns:  (void)
  88  */
  89 void
  90 WriteProfile(FILE *fp, struct plan7_s *hmm, int do_xsw)
  91 {
  92   int k;                        /* position in model      */
  93   int x;                        /* symbol index           */
  94   int sc;                       /* a score to print       */
  95   float nx;                     /* expected # of symbol x */
  96   int gap, len, qgap, qlen;     /* penalties to charge    */
  97
  98   P7Logoddsify(hmm, TRUE);
  99
 100   /* GCG can't deal with long profiles. Their limit is 1000
 101    * positions. However, Compugen can. Therefore we warn,
 102    * but don't die.
 103    */
 104   if (hmm->M > 1000 && !do_xsw)
 105     Warn("Profile %s will have more than 1000 positions. GCG won't read it; Compugen will.",
 106          hmm->name);
 107
 108   /* Header information.
 109    * GCG will look for sequence type and length of model.
 110    * Other than this, nothing is parsed until we get to the
 111    * Cons line that has a ".." on it.
 112    * Lines that begin with "!" are comments.
 113    */
 114   if (Alphabet_type == hmmAMINO)        fprintf(fp, "!!AA_PROFILE 1.0\n");
 115   else if (Alphabet_type == hmmNUCLEIC) fprintf(fp, "!!NA_PROFILE 1.0\n");
 116   else Die("No support for profiles with non-biological alphabets");
 117
 118   if (Alphabet_type == hmmAMINO)        fprintf(fp, "(Peptide) ");
 119   else if (Alphabet_type == hmmNUCLEIC) fprintf(fp, "(Nucleotide) ");
 120   fprintf(fp, "HMMCONVERT v%s Length: %d %s|%s|%s\n",
 121           RELEASE, hmm->M, hmm->name,
 122           hmm->flags & PLAN7_ACC ? hmm->acc : "",
 123           hmm->flags & PLAN7_DESC ? hmm->desc : "");
 124
 125   /* Insert some HMMER-specific commentary
 126    */
 127   if (do_xsw)
 128     {
 129       fprintf(fp, "   Profile converted from a profile HMM using HMMER v%s emulation.\n", RELEASE);
 130       fprintf(fp, "   Compugen XSW extended profile format.\n");
 131       fprintf(fp, "   Use -model=xsw.model -nonor -noave -gapop=10 -gapext=1 -qgapop=10 -qgapext=1\n");
 132       fprintf(fp, "      with om on the Compugen BIC to get the closest approximation to HMMER bit scores.\n");
 133       fprintf(fp, "   WARNING: There is a loss of information in this conversion.\n");
 134       fprintf(fp, "      Neither the scores nor even the rank order of hits will be precisely\n");
 135       fprintf(fp, "      preserved in a comparison of HMMER hmmsearch to GCG profilesearch.\n");
 136       fprintf(fp, "      The profile score is an approximation of the (single-hit) HMMER score.\n\n");
 137     }
 138   else
 139     {
 140       fprintf(fp, "   Profile converted from a profile HMM using HMMER v%s emulation.\n", RELEASE);
 141       fprintf(fp, "   Use -nonor -noave -gap=10 -len=1 with profilesearch and friends\n");
 142       fprintf(fp, "      to get the closest approximation to HMMER bit scores.\n");
 143       fprintf(fp, "   WARNING: There is a loss of information in this conversion.\n");
 144       fprintf(fp, "      Neither the scores nor even the rank order of hits will be precisely\n");
 145       fprintf(fp, "      preserved in a comparison of HMMER hmmsearch to GCG profilesearch.\n");
 146       fprintf(fp, "      The profile score is an approximation of the (single-hit) HMMER score.\n\n");
 147     }
 148
 149
 150   /* Do the CONS line, which gives the valid IUPAC symbols and their order
 151    */
 152   fprintf(fp, "Cons");
 153   for (x = 0; x < Alphabet_iupac; x++)
 154     fprintf(fp, "    %c ", Alphabet[x]);
 155   if (do_xsw)
 156     fprintf(fp, "  Gap   Len  QGap  Qlen ..\n");
 157   else
 158     fprintf(fp, "  Gap   Len ..\n");
 159
 160   /* Now, the profile; for each position in the HMM, write a line of profile.
 161    */
 162   for (k = 1; k <= hmm->M; k++)
 163     {
 164                                 /* GCG adds some indexing as comments */
 165       if ((k-1)%10 == 0 && k > 10)
 166         fprintf(fp, "! %d\n", k);
 167
 168                                 /* find consensus residue by max prob */
 169       x = FMax(hmm->mat[k], Alphabet_size);
 170       fprintf(fp, " %c  ", Alphabet[x]);
 171                                 /* generate emission score profile;
 172                                  * Profiles are scaled by a factor of 100
 173                                  */
 174       for (x = 0; x < Alphabet_iupac; x++)
 175         {
 176           sc = hmm->msc[x][k];
 177           if (k < hmm->M) sc += hmm->tsc[k][TMM];
 178           sc = sc * 100 / INTSCALE;
 179           fprintf(fp, "%5d ", sc);
 180         }
 181                                 /* Generate gap open, gap extend penalties;
 182                                    note we will force profilesearch to weights of 10, 1,
 183                                    and that GCG profile values are percentages
 184                                    of these base penalties, 0..100.*/
 185                                 /* gap open (insertion)*/
 186       if (k > 1)
 187         {
 188           gap = -1 * (hmm->tsc[k-1][TMI] + hmm->tsc[k-1][TIM] - hmm->tsc[k-1][TMM] - hmm->tsc[k-1][TII]);
 189           gap = gap * 100 / (10.0 * INTSCALE);
 190         }
 191       else gap = 100;           /* doesn't matter because GAP_1 is never used */
 192
 193                                 /* gap extend (insertion)*/
 194       if (k > 1)
 195         {
 196           len = -1 * hmm->tsc[k-1][TII];
 197           len = len * 100 / (1.0 * INTSCALE);
 198         }
 199       else len = 100;           /* again, doesn't matter because LEN_1 is never used */
 200
 201                                 /* gap open (deletion) */
 202       if (k > 1)
 203         {
 204           qgap = -1 * (hmm->tsc[k-1][TDM] + hmm->tsc[k-1][TMD] - hmm->tsc[k-1][TMM] - hmm->tsc[k-1][TDD]);
 205           qgap = qgap * 100 / (10.0 * INTSCALE);
 206         }
 207       else qgap = 100;
 208                                 /* gap extend (deletion) */
 209       if (k > 1)
 210         {
 211           qlen = -1 * hmm->tsc[k-1][TDD];
 212           qlen = qlen * 100 / (1.0 * INTSCALE);
 213         }
 214       else qlen = 100;
 215
 216
 217       if (do_xsw)
 218         fprintf(fp, "%5d %5d %5d %5d\n", gap, len, qgap, qlen);
 219       else
 220         fprintf(fp, "%5d %5d\n", gap, len); /* assume insertions >= deletions */
 221     }
 222
 223   /* The final line of the profile is a count of the observed
 224    * residues in the training sequences. This information is not
 225    * available in an HMM, and I'm not sure that GCG ever uses it.
 226    * Approximate it by calculating a /very/ rough expectation.
 227    */
 228   fprintf(fp, " *  ");
 229   for (x = 0; x < Alphabet_size; x++)
 230     {
 231       nx = 0.0;
 232       for (k = 1; k <= hmm->M; k++)
 233         nx += hmm->mat[k][x];
 234       nx *= hmm->nseq;
 235       fprintf(fp, "%5d ", (int) nx);
 236     }
 237   for (; x < Alphabet_iupac; x++)
 238       fprintf(fp, "%5d ", 0);
 239   fprintf(fp, "\n");
 240   return;
 241 }
 242