1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * SRE, Wed Jan 21 07:50:01 1998
14 * Interfaces between HMMER and other software packages.
16 * RCS $Id: emulation.c,v 1.1.1.1 2005/03/22 08:34:01 cmzmasek Exp $
29 /* Function: WriteProfile()
30 * Date: SRE, Wed Jan 21 07:58:09 1998 [St. Louis]
32 * Purpose: Given an HMM, write a GCG profile .prf file as
33 * output. Based on examination of Michael Gribskov's Fortran
34 * source in GCG 9.1; on reverse engineering
35 * by examination of GCG 9.1 output from "profilemake"
36 * and how the .prf file is used by "profilesearch";
37 * and on the GCG 9.0 documentation.
39 * See notes 28 Jan 98 for detail; in brief, the conversion goes like:
41 * PROF(i,k) = match score = msc(i,k) + TMM(k-1)
43 * GAP(k) = cost per insertion = TMI(k-1) + TIM(k-1) - TMM(k-1) - TII(k-1)
44 * LEN(k) = cost per inserted x = TII(k-1)
46 * QGAP(k) = cost per deletion = TDM(k-1) + TMD(unknown) - TMM(k-1) - TDD(k-1)
47 * QLEN(k) = cost per deleted k = TDD(k-1)
49 * Note that GCG affine gaps are GAP + n * LEN;
50 * HMMER affine gaps count (n-1) * gap-extend, thus an
51 * extra TII gets taken away from GAP (and TDD from QGAP),
52 * since GCG will charge it.
54 * Also note how the TMM transitions, which have no equivalent
55 * in a profile, get smuggled in OK.
57 * Also note that GCG charges gaps using the profile position
58 * /after/ the gap, not preceding the gap as HMMER does.
60 * Also note the TMD(unknown) in the QGAP calculation. HMMER
61 * distinguishes between gap-open and gap-close, but GCG does not,
62 * so there is a fundamental incompatibility here. Here
63 * we use an upper (best-scoring, minimum-cost) bound.
65 * And finally note that GCG's implementation forces GAP=QGAP and
66 * LEN=QLEN. Here, we upper bound again. Compugen's implementation
67 * allows an "extended profile" format which distinguishes between
70 * The upper bound approach to these scores means that a
71 * score given by an emulated profile is an upper bound: the HMMER
72 * score (for a single Smith/Waterman style local alignment)
73 * cannot be better than this. This is intentional, so that
74 * the Compugen BIC can be used for rapid prefiltering of
77 * To get a close approximation of hmmsw scores, call
79 * profilesearch -noave -nonor -gap 10 -len 1
80 * On the Compugen BIC, using extended profiles, you want:
81 * om -model=xsw.model -gapop=10 -gapext=1 -qgapop=10 -qgapext=1 -noave -nonor
83 * Args: fp - open FILE to write to (or stdout, possibly)
84 * hmm - the HMM to write
85 * do_xsw - TRUE to write Compugen's experimental extended profile format
90 WriteProfile(FILE *fp, struct plan7_s *hmm, int do_xsw)
92 int k; /* position in model */
93 int x; /* symbol index */
94 int sc; /* a score to print */
95 float nx; /* expected # of symbol x */
96 int gap, len, qgap, qlen; /* penalties to charge */
98 P7Logoddsify(hmm, TRUE);
100 /* GCG can't deal with long profiles. Their limit is 1000
101 * positions. However, Compugen can. Therefore we warn,
104 if (hmm->M > 1000 && !do_xsw)
105 Warn("Profile %s will have more than 1000 positions. GCG won't read it; Compugen will.",
108 /* Header information.
109 * GCG will look for sequence type and length of model.
110 * Other than this, nothing is parsed until we get to the
111 * Cons line that has a ".." on it.
112 * Lines that begin with "!" are comments.
114 if (Alphabet_type == hmmAMINO) fprintf(fp, "!!AA_PROFILE 1.0\n");
115 else if (Alphabet_type == hmmNUCLEIC) fprintf(fp, "!!NA_PROFILE 1.0\n");
116 else Die("No support for profiles with non-biological alphabets");
118 if (Alphabet_type == hmmAMINO) fprintf(fp, "(Peptide) ");
119 else if (Alphabet_type == hmmNUCLEIC) fprintf(fp, "(Nucleotide) ");
120 fprintf(fp, "HMMCONVERT v%s Length: %d %s|%s|%s\n",
121 RELEASE, hmm->M, hmm->name,
122 hmm->flags & PLAN7_ACC ? hmm->acc : "",
123 hmm->flags & PLAN7_DESC ? hmm->desc : "");
125 /* Insert some HMMER-specific commentary
129 fprintf(fp, " Profile converted from a profile HMM using HMMER v%s emulation.\n", RELEASE);
130 fprintf(fp, " Compugen XSW extended profile format.\n");
131 fprintf(fp, " Use -model=xsw.model -nonor -noave -gapop=10 -gapext=1 -qgapop=10 -qgapext=1\n");
132 fprintf(fp, " with om on the Compugen BIC to get the closest approximation to HMMER bit scores.\n");
133 fprintf(fp, " WARNING: There is a loss of information in this conversion.\n");
134 fprintf(fp, " Neither the scores nor even the rank order of hits will be precisely\n");
135 fprintf(fp, " preserved in a comparison of HMMER hmmsearch to GCG profilesearch.\n");
136 fprintf(fp, " The profile score is an approximation of the (single-hit) HMMER score.\n\n");
140 fprintf(fp, " Profile converted from a profile HMM using HMMER v%s emulation.\n", RELEASE);
141 fprintf(fp, " Use -nonor -noave -gap=10 -len=1 with profilesearch and friends\n");
142 fprintf(fp, " to get the closest approximation to HMMER bit scores.\n");
143 fprintf(fp, " WARNING: There is a loss of information in this conversion.\n");
144 fprintf(fp, " Neither the scores nor even the rank order of hits will be precisely\n");
145 fprintf(fp, " preserved in a comparison of HMMER hmmsearch to GCG profilesearch.\n");
146 fprintf(fp, " The profile score is an approximation of the (single-hit) HMMER score.\n\n");
150 /* Do the CONS line, which gives the valid IUPAC symbols and their order
153 for (x = 0; x < Alphabet_iupac; x++)
154 fprintf(fp, " %c ", Alphabet[x]);
156 fprintf(fp, " Gap Len QGap Qlen ..\n");
158 fprintf(fp, " Gap Len ..\n");
160 /* Now, the profile; for each position in the HMM, write a line of profile.
162 for (k = 1; k <= hmm->M; k++)
164 /* GCG adds some indexing as comments */
165 if ((k-1)%10 == 0 && k > 10)
166 fprintf(fp, "! %d\n", k);
168 /* find consensus residue by max prob */
169 x = FMax(hmm->mat[k], Alphabet_size);
170 fprintf(fp, " %c ", Alphabet[x]);
171 /* generate emission score profile;
172 * Profiles are scaled by a factor of 100
174 for (x = 0; x < Alphabet_iupac; x++)
177 if (k < hmm->M) sc += hmm->tsc[k][TMM];
178 sc = sc * 100 / INTSCALE;
179 fprintf(fp, "%5d ", sc);
181 /* Generate gap open, gap extend penalties;
182 note we will force profilesearch to weights of 10, 1,
183 and that GCG profile values are percentages
184 of these base penalties, 0..100.*/
185 /* gap open (insertion)*/
188 gap = -1 * (hmm->tsc[k-1][TMI] + hmm->tsc[k-1][TIM] - hmm->tsc[k-1][TMM] - hmm->tsc[k-1][TII]);
189 gap = gap * 100 / (10.0 * INTSCALE);
191 else gap = 100; /* doesn't matter because GAP_1 is never used */
193 /* gap extend (insertion)*/
196 len = -1 * hmm->tsc[k-1][TII];
197 len = len * 100 / (1.0 * INTSCALE);
199 else len = 100; /* again, doesn't matter because LEN_1 is never used */
201 /* gap open (deletion) */
204 qgap = -1 * (hmm->tsc[k-1][TDM] + hmm->tsc[k-1][TMD] - hmm->tsc[k-1][TMM] - hmm->tsc[k-1][TDD]);
205 qgap = qgap * 100 / (10.0 * INTSCALE);
208 /* gap extend (deletion) */
211 qlen = -1 * hmm->tsc[k-1][TDD];
212 qlen = qlen * 100 / (1.0 * INTSCALE);
218 fprintf(fp, "%5d %5d %5d %5d\n", gap, len, qgap, qlen);
220 fprintf(fp, "%5d %5d\n", gap, len); /* assume insertions >= deletions */
223 /* The final line of the profile is a count of the observed
224 * residues in the training sequences. This information is not
225 * available in an HMM, and I'm not sure that GCG ever uses it.
226 * Approximate it by calculating a /very/ rough expectation.
229 for (x = 0; x < Alphabet_size; x++)
232 for (k = 1; k <= hmm->M; k++)
233 nx += hmm->mat[k][x];
235 fprintf(fp, "%5d ", (int) nx);
237 for (; x < Alphabet_iupac; x++)
238 fprintf(fp, "%5d ", 0);