1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * SRE, Thu Dec 18 16:05:29 1997 [St. Louis]
14 * main() for aligning a set of sequences to an HMM.
15 * RCS $Id: hmmalign.c,v 1.1.1.1 2005/03/22 08:34:00 cmzmasek Exp $
21 #include "structs.h" /* data structures, macros, #define's */
22 #include "config.h" /* compile-time configuration constants */
23 #include "funcs.h" /* function declarations */
24 #include "globals.h" /* alphabet global variables */
25 #include "squid.h" /* general sequence analysis library */
26 #include "msa.h" /* squid's multiple alignment i/o */
28 static char banner[] = "hmmalign - align sequences to an HMM profile";
30 static char usage[] = "\
31 Usage: hmmalign [-options] <hmm file> <sequence file>\n\
32 Available options are:\n\
33 -h : help; print brief help on version and usage\n\
34 -m : only print symbols aligned to match states\n\
35 -o <f> : save alignment in file <f> in SELEX format\n\
36 -q : quiet - suppress verbose banner\n\
39 static char experts[] = "\
40 --informat <s>: sequence file is in format <s>, not FASTA\n\
41 --mapali <f> : include alignment in file <f> using map in HMM\n\
42 --withali <f> : include alignment to (fixed) alignment in file <f>\n\
45 static struct opt_s OPTIONS[] = {
46 { "-h", TRUE, sqdARG_NONE },
47 { "-m", TRUE, sqdARG_NONE } ,
48 { "-o", TRUE, sqdARG_STRING },
49 { "-q", TRUE, sqdARG_NONE },
50 { "--informat",FALSE, sqdARG_STRING },
51 { "--mapali", FALSE, sqdARG_STRING },
52 { "--withali", FALSE, sqdARG_STRING },
54 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
56 static void include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped,
57 char ***rseq, char ***dsq, SQINFO **sqinfo,
58 struct p7trace_s ***tr, int *nseq);
61 main(int argc, char **argv)
63 char *hmmfile; /* file to read HMMs from */
64 HMMFILE *hmmfp; /* opened hmmfile for reading */
65 struct plan7_s *hmm; /* HMM to align to */
66 char *seqfile; /* file to read target sequence from */
67 int format; /* format of seqfile */
68 char **rseq; /* raw, unaligned sequences */
69 SQINFO *sqinfo; /* info associated with sequences */
70 char **dsq; /* digitized raw sequences */
71 int nseq; /* number of sequences */
72 float *wgt; /* weights to assign to alignment */
73 MSA *msa; /* alignment that's created */
75 struct p7trace_s **tr; /* traces for aligned sequences */
77 char *optname; /* name of option found by Getopt() */
78 char *optarg; /* argument found by Getopt() */
79 int optind; /* index in argv[] */
80 int be_quiet; /* TRUE to suppress verbose banner */
81 int matchonly; /* TRUE to show only match state syms */
82 char *outfile; /* optional alignment output file */
83 FILE *ofp; /* handle on alignment output file */
84 char *withali; /* name of additional alignment file to align */
85 char *mapali; /* name of additional alignment file to map */
87 /***********************************************
89 ***********************************************/
91 format = SQFILE_UNKNOWN; /* default: autodetect format */
98 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
99 &optind, &optname, &optarg)) {
100 if (strcmp(optname, "-m") == 0) matchonly= TRUE;
101 else if (strcmp(optname, "-o") == 0) outfile = optarg;
102 else if (strcmp(optname, "-q") == 0) be_quiet = TRUE;
103 else if (strcmp(optname, "--mapali") == 0) mapali = optarg;
104 else if (strcmp(optname, "--withali") == 0) withali = optarg;
105 else if (strcmp(optname, "--informat") == 0) {
106 format = String2SeqfileFormat(optarg);
107 if (format == SQFILE_UNKNOWN)
108 Die("unrecognized sequence file format \"%s\"", optarg);
110 else if (strcmp(optname, "-h") == 0)
112 Banner(stdout, banner);
118 if (argc - optind != 2)
119 Die("Incorrect number of arguments.\n%s\n", usage);
121 hmmfile = argv[optind++];
122 seqfile = argv[optind++];
124 /***********************************************
125 * Open HMM file (might be in HMMERDB or current directory).
126 * Read a single HMM from it.
128 * Currently hmmalign disallows the J state and
129 * only allows one domain per sequence. To preserve
130 * the S/W entry information, the J state is explicitly
131 * disallowed, rather than calling a Plan7*Config() function.
132 * this is a workaround in 2.1 for the 2.0.x "yo!" bug.
133 ***********************************************/
135 if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
136 Die("Failed to open HMM file %s\n%s", hmmfile, usage);
137 if (!HMMFileRead(hmmfp, &hmm))
138 Die("Failed to read any HMMs from %s\n", hmmfile);
141 Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile);
142 hmm->xt[XTE][MOVE] = 1.; /* only 1 domain/sequence ("global" alignment) */
143 hmm->xt[XTE][LOOP] = 0.;
144 P7Logoddsify(hmm, TRUE);
145 /* do we have the map we might need? */
146 if (mapali != NULL && ! (hmm->flags & PLAN7_MAP))
147 Die("HMMER: HMM file %s has no map; you can't use --mapali.", hmmfile);
149 /***********************************************
150 * Open sequence file in current directory.
151 * Read all seqs from it.
152 ***********************************************/
154 if (! ReadMultipleRseqs(seqfile, format, &rseq, &sqinfo, &nseq))
155 Die("Failed to read any sequences from file %s", seqfile);
157 /***********************************************
159 ***********************************************/
163 Banner(stdout, banner);
164 printf( "HMM file: %s\n", hmmfile);
165 printf( "Sequence file: %s\n", seqfile);
166 printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n");
169 /***********************************************
171 ***********************************************/
173 /* Allocations and initializations.
175 dsq = MallocOrDie(sizeof(char *) * nseq);
176 tr = MallocOrDie(sizeof(struct p7trace_s *) * nseq);
178 /* Align each sequence to the model, collect traces
180 for (i = 0; i < nseq; i++)
182 dsq[i] = DigitizeSequence(rseq[i], sqinfo[i].len);
184 if (P7ViterbiSize(sqinfo[i].len, hmm->M) <= RAMLIMIT)
185 (void) P7Viterbi(dsq[i], sqinfo[i].len, hmm, &(tr[i]));
187 (void) P7SmallViterbi(dsq[i], sqinfo[i].len, hmm, &(tr[i]));
190 /* Include an aligned alignment, if desired.
193 include_alignment(mapali, hmm, TRUE, &rseq, &dsq, &sqinfo, &tr, &nseq);
195 include_alignment(withali, hmm, FALSE, &rseq, &dsq, &sqinfo, &tr, &nseq);
197 /* Turn traces into a multiple alignment
199 wgt = MallocOrDie(sizeof(float) * nseq);
200 FSet(wgt, nseq, 1.0);
201 msa = P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, matchonly);
203 /***********************************************
204 * Output the alignment
205 ***********************************************/
207 if (outfile != NULL && (ofp = fopen(outfile, "w")) != NULL)
209 WriteStockholm(ofp, msa);
210 printf("Alignment saved in file %s\n", outfile);
214 WriteStockholm(stdout, msa);
216 /***********************************************
218 ***********************************************/
220 for (i = 0; i < nseq; i++)
223 FreeSequence(rseq[i], &(sqinfo[i]));
239 /* Function: include_alignment()
240 * Date: SRE, Sun Jul 5 15:25:13 1998 [St. Louis]
242 * Purpose: Given the name of a multiple alignment file,
243 * align that alignment to the HMM, and add traces
244 * to an existing array of traces. If do_mapped
245 * is TRUE, we use the HMM's map file. If not,
246 * we use P7ViterbiAlignAlignment().
248 * Args: seqfile - name of alignment file
249 * hmm - model to align to
250 * do_mapped- TRUE if we're to use the HMM's alignment map
251 * rsq - RETURN: array of rseqs to add to
252 * dsq - RETURN: array of dsq to add to
253 * sqinfo - RETURN: array of SQINFO to add to
254 * tr - RETURN: array of traces to add to
255 * nseq - RETURN: number of seqs
257 * Returns: new, realloc'ed arrays for rsq, dsq, sqinfo, tr; nseq is
258 * increased to nseq+ainfo.nseq.
261 include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped,
262 char ***rsq, char ***dsq, SQINFO **sqinfo,
263 struct p7trace_s ***tr, int *nseq)
265 int format; /* format of alignment file */
266 MSA *msa; /* alignment to align to */
268 SQINFO *newinfo; /* sqinfo array from msa */
271 int idx; /* counter over aseqs */
272 struct p7trace_s *master; /* master trace */
273 struct p7trace_s **addtr; /* individual traces for aseq */
275 format = MSAFILE_UNKNOWN; /* invoke Babelfish */
276 if ((afp = MSAFileOpen(seqfile, format, NULL)) == NULL)
277 Die("Alignment file %s could not be opened for reading", seqfile);
278 if ((msa = MSAFileRead(afp)) == NULL)
279 Die("Failed to read an alignment from %s\n", seqfile);
281 for (idx = 0; idx < msa->nseq; idx++)
282 s2upper(msa->aseq[idx]);
283 newinfo = MSAToSqinfo(msa);
285 /* Verify checksums before mapping */
286 if (do_mapped && GCGMultchecksum(msa->aseq, msa->nseq) != hmm->checksum)
287 Die("The checksums for alignment file %s and the HMM alignment map don't match.",
289 /* Get a master trace */
290 if (do_mapped) master = MasterTraceFromMap(hmm->map, hmm->M, msa->alen);
291 else master = P7ViterbiAlignAlignment(msa, hmm);
293 /* convert to individual traces */
294 ImposeMasterTrace(msa->aseq, msa->nseq, master, &addtr);
295 /* add those traces to existing ones */
296 *tr = MergeTraceArrays(*tr, *nseq, addtr, msa->nseq);
298 /* additional bookkeeping: add to dsq, sqinfo */
299 *rsq = ReallocOrDie((*rsq), sizeof(char *) * (*nseq + msa->nseq));
300 DealignAseqs(msa->aseq, msa->nseq, &newrseq);
301 for (idx = *nseq; idx < *nseq + msa->nseq; idx++)
302 (*rsq)[idx] = newrseq[idx - (*nseq)];
305 *dsq = ReallocOrDie((*dsq), sizeof(char *) * (*nseq + msa->nseq));
306 DigitizeAlignment(msa, &newdsq);
307 for (idx = *nseq; idx < *nseq + msa->nseq; idx++)
308 (*dsq)[idx] = newdsq[idx - (*nseq)];
310 /* unnecessarily complex, but I can't be bothered... */
311 *sqinfo = ReallocOrDie((*sqinfo), sizeof(SQINFO) * (*nseq + msa->nseq));
312 for (idx = *nseq; idx < *nseq + msa->nseq; idx++)
313 SeqinfoCopy(&((*sqinfo)[idx]), &(newinfo[idx - (*nseq)]));
315 *nseq = *nseq + msa->nseq;