1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * SRE, Sun Mar 8 14:11:24 1998 [St. Louis]
14 * main() for generating sequences from an HMM
15 * CVS $Id: hmmemit.c,v 1.1.1.1 2005/03/22 08:34:09 cmzmasek Exp $
22 #include "structs.h" /* data structures, macros, #define's */
23 #include "config.h" /* compile-time configuration constants */
24 #include "funcs.h" /* function declarations */
25 #include "globals.h" /* alphabet global variables */
26 #include "squid.h" /* general sequence analysis library */
27 #include "msa.h" /* squid's multiple sequence i/o */
29 static char banner[] = "hmmemit - generate sequences from a profile HMM";
31 static char usage[] = "\
32 Usage: hmmemit [-options] <hmm file>\n\
33 Available options are:\n\
34 -a : write generated sequences as an alignment, not FASTA\n\
35 -c : generate a single \"consensus\" sequence\n\
36 -h : help; print brief help on version and usage\n\
37 -n <n> : emit <n> sequences (default 10)\n\
38 -o <f> : save sequences in file <f>\n\
39 -q : quiet - suppress verbose banner\n\
42 static char experts[] = "\
43 --seed <n> : set random number seed to <n>\n\
46 static struct opt_s OPTIONS[] = {
47 { "-a", TRUE, sqdARG_NONE },
48 { "-c", TRUE, sqdARG_NONE },
49 { "-h", TRUE, sqdARG_NONE },
50 { "-n", TRUE, sqdARG_INT},
51 { "-o", TRUE, sqdARG_STRING},
52 { "-q", TRUE, sqdARG_NONE},
53 { "--seed", FALSE, sqdARG_INT},
55 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
58 main(int argc, char **argv)
60 char *hmmfile; /* file to read HMMs from */
61 HMMFILE *hmmfp; /* opened hmmfile for reading */
62 struct plan7_s *hmm; /* HMM to generate from */
63 FILE *fp; /* output file handle */
64 int L; /* length of a sequence */
65 int i; /* counter over sequences */
66 int nhmm; /* counter over HMMs */
68 char *ofile; /* output sequence file */
69 int nseq; /* number of seqs to sample */
70 int seed; /* random number generator seed */
71 int be_quiet; /* TRUE to silence header/footer */
72 int do_alignment;/* TRUE to output in aligned format */
73 int do_consensus;/* TRUE to do a single consensus seq */
75 char *optname; /* name of option found by Getopt() */
76 char *optarg; /* argument found by Getopt() */
77 int optind; /* index in argv[] */
79 /***********************************************
81 ***********************************************/
84 seed = time ((time_t *) NULL);
90 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
91 &optind, &optname, &optarg)) {
92 if (strcmp(optname, "-a") == 0) do_alignment = TRUE;
93 else if (strcmp(optname, "-c") == 0) do_consensus = TRUE;
94 else if (strcmp(optname, "-n") == 0) nseq = atoi(optarg);
95 else if (strcmp(optname, "-o") == 0) ofile = optarg;
96 else if (strcmp(optname, "-q") == 0) be_quiet = TRUE;
97 else if (strcmp(optname, "--seed") == 0) seed = atoi(optarg);
98 else if (strcmp(optname, "-h") == 0)
100 Banner(stdout, banner);
106 if (argc - optind != 1)
107 Die("Incorrect number of arguments.\n%s\n", usage);
109 hmmfile = argv[optind++];
113 if (do_alignment && do_consensus)
114 Die("Sorry, -a and -c are incompatible.\nUsage:\n%s", usage);
115 if (nseq != 10 && do_consensus)
116 Warn("-c (consensus) overrides -n (# of sampled seqs)");
118 /***********************************************
119 * Open HMM file (might be in HMMERDB or current directory).
120 * Open output file, if needed.
121 ***********************************************/
123 if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
124 Die("Failed to open HMM file %s\n%s", hmmfile, usage);
126 if (ofile == NULL) fp = stdout;
128 if ((fp = fopen(ofile, "w")) == NULL)
129 Die("Failed to open output file %s for writing", ofile);
132 /***********************************************
133 * Show the options banner
134 ***********************************************/
138 Banner(stdout, banner);
139 printf("HMM file: %s\n", hmmfile);
140 if (! do_consensus) {
141 printf("Number of seqs: %d\n", nseq);
142 printf("Random seed: %d\n", seed);
144 printf("Generating consensus sequence.\n");
146 printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n");
149 /***********************************************
150 * For every HMM in the file, do some emission.
151 ***********************************************/
154 while (HMMFileRead(hmmfp, &hmm)) {
156 Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile);
158 /* Configure the HMM to shut off N,J,C emission: so we
159 * do a simple single pass through the model.
161 Plan7NakedConfig(hmm);
162 Plan7Renormalize(hmm);
164 /***********************************************
166 * If we're generating an alignment, we have to collect
167 * all our traces, then output. If we're generating unaligned
168 * sequences, we can emit one at a time.
169 ***********************************************/
174 SQINFO sqinfo; /* info about sequence (name/desc) */
176 EmitConsensusSequence(hmm, &seq, NULL, &L, NULL);
177 strcpy(sqinfo.name, hmm->name);
178 strcpy(sqinfo.desc, "profile HMM generated consensus sequence [hmmemit]");
181 sqinfo.flags = SQINFO_NAME | SQINFO_DESC | SQINFO_LEN;
183 WriteSeq(fp, SQFILE_FASTA, seq, &sqinfo);
186 else if (do_alignment)
188 struct p7trace_s **tr; /* traces for aligned sequences */
189 char **dsq; /* digitized sequences */
190 SQINFO *sqinfo; /* info about sequences (name/desc) */
191 MSA *msa; /* alignment */
194 dsq = MallocOrDie(sizeof(char *) * nseq);
195 tr = MallocOrDie(sizeof(struct p7trace_s *) * nseq);
196 sqinfo = MallocOrDie(sizeof(SQINFO) * nseq);
197 wgt = MallocOrDie(sizeof(float) * nseq);
198 FSet(wgt, nseq, 1.0);
200 for (i = 0; i < nseq; i++)
202 EmitSequence(hmm, &(dsq[i]), &L, &(tr[i]));
203 sprintf(sqinfo[i].name, "seq%d", i+1);
205 sqinfo[i].flags = SQINFO_NAME | SQINFO_LEN;
208 msa = P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, FALSE);
209 msa->name = sre_strdup(hmm->name, -1);
210 msa->desc = sre_strdup("Synthetic sequence alignment generated by hmmemit", -1);
212 /* Output the alignment */
213 WriteStockholm(fp, msa);
217 for (i = 0; i < nseq; i++)
228 else /* unaligned sequence output */
230 struct p7trace_s *tr; /* generated trace */
231 char *dsq; /* digitized sequence */
232 char *seq; /* alphabetic sequence */
233 SQINFO sqinfo; /* info about sequence (name/len) */
235 for (i = 0; i < nseq; i++)
237 EmitSequence(hmm, &dsq, &L, &tr);
238 sprintf(sqinfo.name, "%s-%d", hmm->name, i+1);
240 sqinfo.flags = SQINFO_NAME | SQINFO_LEN;
242 seq = DedigitizeSequence(dsq, L);
244 WriteSeq(fp, SQFILE_FASTA, seq, &sqinfo);
255 /* We're done; clean up and exit.
258 Die("Failed to read any HMMs from %s\n", hmmfile);
261 if (!be_quiet) printf("Output saved in file %s\n", ofile);