1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * SRE, Fri May 28 15:46:41 1999
14 * Reading/writing of Stockholm format multiple sequence alignments.
19 * FILE *fp; -- opened for write with fopen()
20 * MSAFILE *afp; -- opened for read with MSAFileOpen()
22 * while ((msa = ReadStockholm(afp)) != NULL)
24 * WriteStockholm(fp, msa);
28 * RCS $Id: stockholm.c,v 1.1.1.1 2005/03/22 08:34:30 cmzmasek Exp $
35 static int parse_gf(MSA *msa, char *buf);
36 static int parse_gs(MSA *msa, char *buf);
37 static int parse_gc(MSA *msa, char *buf);
38 static int parse_gr(MSA *msa, char *buf);
39 static int parse_comment(MSA *msa, char *buf);
40 static int parse_sequence(MSA *msa, char *buf);
41 static void actually_write_stockholm(FILE *fp, MSA *msa, int cpl);
43 #ifdef TESTDRIVE_STOCKHOLM
44 /*****************************************************************
45 * stockholm.c test driver:
46 * cc -DTESTDRIVE_STOCKHOLM -g -O2 -Wall -o test stockholm.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm
50 main(int argc, char **argv)
58 if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
59 Die("Couldn't open %s\n", file);
61 while ((msa = ReadStockholm(afp)) != NULL)
63 WriteStockholm(stdout, msa);
70 /******************************************************************/
71 #endif /* testdriver */
74 /* Function: ReadStockholm()
75 * Date: SRE, Fri May 21 17:33:10 1999 [St. Louis]
77 * Purpose: Parse the next alignment from an open Stockholm
78 * format alignment file. Return the alignment, or
79 * NULL if there are no more alignments in the file.
81 * Args: afp - open alignment file
83 * Returns: MSA * - an alignment object.
84 * caller responsible for an MSAFree()
85 * NULL if no more alignments
88 * Will Die() here with a (potentially) useful message
89 * if a parsing error occurs
92 ReadStockholm(MSAFILE *afp)
98 if (feof(afp->f)) return NULL;
100 /* Initialize allocation of the MSA.
102 msa = MSAAlloc(10, 0);
104 /* Check the magic Stockholm header line.
105 * We have to skip blank lines here, else we perceive
106 * trailing blank lines in a file as a format error when
107 * reading in multi-record mode.
110 if ((s = MSAFileGetLine(afp)) == NULL) {
114 } while (IsBlankline(s));
116 if (strncmp(s, "# STOCKHOLM 1.", 14) != 0)
118 File %s doesn't appear to be in Stockholm format.\n\
119 Assuming there isn't some other problem with your file (it is an\n\
120 alignment file, right?), please either:\n\
121 a) use the Babelfish format autotranslator option (-B, usually);\n\
122 b) specify the file's format with the --informat option; or\n\
123 a) reformat the alignment to Stockholm format.\n",
126 /* Read the alignment file one line at a time.
128 while ((s = MSAFileGetLine(afp)) != NULL)
130 while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */
133 if (strncmp(s, "#=GF", 4) == 0) status = parse_gf(msa, s);
134 else if (strncmp(s, "#=GS", 4) == 0) status = parse_gs(msa, s);
135 else if (strncmp(s, "#=GC", 4) == 0) status = parse_gc(msa, s);
136 else if (strncmp(s, "#=GR", 4) == 0) status = parse_gr(msa, s);
137 else status = parse_comment(msa, s);
139 else if (strncmp(s, "//", 2) == 0) break;
140 else if (*s == '\n') continue;
141 else status = parse_sequence(msa, s);
144 Die("Stockholm format parse error: line %d of file %s while reading alignment %s",
145 afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
148 if (s == NULL && msa->nseq != 0)
149 Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);
151 if (s == NULL && msa->nseq == 0) {
152 /* probably just some junk at end of file */
162 /* Function: WriteStockholm()
163 * Date: SRE, Mon May 31 19:15:22 1999 [St. Louis]
165 * Purpose: Write an alignment in standard multi-block
166 * Stockholm format to an open file. A wrapper
167 * for actually_write_stockholm().
169 * Args: fp - file that's open for writing
170 * msa - alignment to write
175 WriteStockholm(FILE *fp, MSA *msa)
177 actually_write_stockholm(fp, msa, 50); /* 50 char per block */
180 /* Function: WriteStockholmOneBlock()
181 * Date: SRE, Mon May 31 19:15:22 1999 [St. Louis]
183 * Purpose: Write an alignment in Pfam's single-block
184 * Stockholm format to an open file. A wrapper
185 * for actually_write_stockholm().
187 * Args: fp - file that's open for writing
188 * msa - alignment to write
193 WriteStockholmOneBlock(FILE *fp, MSA *msa)
195 actually_write_stockholm(fp, msa, msa->alen); /* one big block */
199 /* Function: actually_write_stockholm()
200 * Date: SRE, Fri May 21 17:39:22 1999 [St. Louis]
202 * Purpose: Write an alignment in Stockholm format to
203 * an open file. This is the function that actually
204 * does the work. The API's WriteStockholm()
205 * and WriteStockholmOneBlock() are wrappers.
207 * Args: fp - file that's open for writing
208 * msa - alignment to write
209 * cpl - characters to write per line in alignment block
214 actually_write_stockholm(FILE *fp, MSA *msa, int cpl)
219 int typewidth = 0; /* markup tags are up to 5 chars long */
220 int markupwidth = 0; /* #=GR, #=GC are four char wide + 1 space */
225 /* Figure out how much space we need for name + markup
226 * to keep the alignment in register. Required by Stockholm
227 * spec, even though our Stockholm parser doesn't care (Erik's does).
230 for (i = 0; i < msa->nseq; i++)
231 if ((len = strlen(msa->sqname[i])) > namewidth)
234 /* Figure out how much space we need for markup tags
235 * markupwidth = always 4 if we're doing markup: strlen("#=GR")
236 * typewidth = longest markup tag
238 if (msa->ss != NULL) { markupwidth = 4; typewidth = 2; }
239 if (msa->sa != NULL) { markupwidth = 4; typewidth = 2; }
240 for (i = 0; i < msa->ngr; i++)
241 if ((len = strlen(msa->gr_tag[i])) > typewidth) typewidth = len;
243 if (msa->rf != NULL) { markupwidth = 4; if (typewidth < 2) typewidth = 2; }
244 if (msa->ss_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
245 if (msa->sa_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
246 for (i = 0; i < msa->ngc; i++)
247 if ((len = strlen(msa->gc_tag[i])) > typewidth) typewidth = len;
250 /* Magic Stockholm header
252 fprintf(fp, "# STOCKHOLM 1.0\n");
254 /* Free text comments
256 for (i = 0; i < msa->ncomment; i++)
257 fprintf(fp, "# %s\n", msa->comment[i]);
258 if (msa->ncomment > 0) fprintf(fp, "\n");
260 /* GF section: per-file annotation
262 if (msa->name != NULL) fprintf(fp, "#=GF ID %s\n", msa->name);
263 if (msa->acc != NULL) fprintf(fp, "#=GF AC %s\n", msa->acc);
264 if (msa->desc != NULL) fprintf(fp, "#=GF DE %s\n", msa->desc);
265 if (msa->au != NULL) fprintf(fp, "#=GF AU %s\n", msa->au);
266 if (msa->flags & MSA_SET_GA) fprintf(fp, "#=GF GA %.1f %.1f\n", msa->ga1, msa->ga2);
267 if (msa->flags & MSA_SET_NC) fprintf(fp, "#=GF TC %.1f %.1f\n", msa->nc1, msa->nc2);
268 if (msa->flags & MSA_SET_TC) fprintf(fp, "#=GF TC %.1f %.1f\n", msa->tc1, msa->tc2);
269 for (i = 0; i < msa->ngf; i++)
270 fprintf(fp, "#=GF %-5s %s\n", msa->gf_tag[i], msa->gf[i]);
274 /* GS section: per-sequence annotation
276 if (msa->flags & MSA_SET_WGT)
278 for (i = 0; i < msa->nseq; i++)
279 fprintf(fp, "#=GS %-*.*s WT %.2f\n", namewidth, namewidth, msa->sqname[i], msa->wgt[i]);
282 if (msa->sqacc != NULL)
284 for (i = 0; i < msa->nseq; i++)
285 if (msa->sqacc[i] != NULL)
286 fprintf(fp, "#=GS %-*.*s AC %s\n", namewidth, namewidth, msa->sqname[i], msa->sqacc[i]);
289 if (msa->sqdesc != NULL)
291 for (i = 0; i < msa->nseq; i++)
292 if (msa->sqdesc[i] != NULL)
293 fprintf(fp, "#=GS %*.*s DE %s\n", namewidth, namewidth, msa->sqname[i], msa->sqdesc[i]);
296 for (i = 0; i < msa->ngs; i++)
298 /* Multiannotated GS tags are possible; for example,
299 * #=GS foo DR PDB; 1xxx;
300 * #=GS foo DR PDB; 2yyy;
301 * These are stored, for example, as:
302 * msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;"
303 * and must be decomposed.
305 for (j = 0; j < msa->nseq; j++)
306 if (msa->gs[i][j] != NULL)
309 while ((tok = sre_strtok(&s, "\n", NULL)) != NULL)
310 fprintf(fp, "#=GS %*.*s %5s %s\n", namewidth, namewidth,
311 msa->sqname[j], msa->gs_tag[i], tok);
316 /* Alignment section:
317 * contains aligned sequence, #=GR annotation, and #=GC annotation
319 for (currpos = 0; currpos < msa->alen; currpos += cpl)
321 if (currpos > 0) fprintf(fp, "\n");
322 for (i = 0; i < msa->nseq; i++)
324 strncpy(buf, msa->aseq[i] + currpos, cpl);
326 fprintf(fp, "%-*.*s %s\n", namewidth+typewidth+markupwidth, namewidth+typewidth+markupwidth,
327 msa->sqname[i], buf);
329 if (msa->ss != NULL && msa->ss[i] != NULL) {
330 strncpy(buf, msa->ss[i] + currpos, cpl);
332 fprintf(fp, "#=GR %-*.*s SS %s\n", namewidth, namewidth, msa->sqname[i], buf);
334 if (msa->sa != NULL && msa->sa[i] != NULL) {
335 strncpy(buf, msa->sa[i] + currpos, cpl);
337 fprintf(fp, "#=GR %-*.*s SA %s\n", namewidth, namewidth, msa->sqname[i], buf);
339 for (j = 0; j < msa->ngr; j++)
340 if (msa->gr[j][i] != NULL) {
341 strncpy(buf, msa->gr[j][i] + currpos, cpl);
343 fprintf(fp, "#=GR %-*.*s %5s %s\n",
344 namewidth, namewidth, msa->sqname[i], msa->gr_tag[j], buf);
347 if (msa->ss_cons != NULL) {
348 strncpy(buf, msa->ss_cons + currpos, cpl);
350 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SS_cons", buf);
353 if (msa->sa_cons != NULL) {
354 strncpy(buf, msa->sa_cons + currpos, cpl);
356 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SA_cons", buf);
359 if (msa->rf != NULL) {
360 strncpy(buf, msa->rf + currpos, cpl);
362 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "RF", buf);
364 for (j = 0; j < msa->ngc; j++) {
365 strncpy(buf, msa->gc[j] + currpos, cpl);
367 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth,
368 msa->gc_tag[j], buf);
378 /* Format of a GF line:
379 * #=GF <featurename> <text>
382 parse_gf(MSA *msa, char *buf)
390 if ((gf = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
391 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
392 if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
393 while (*text && (*text == ' ' || *text == '\t')) text++;
395 if (strcmp(featurename, "ID") == 0)
396 msa->name = sre_strdup(text, -1);
397 else if (strcmp(featurename, "AC") == 0)
398 msa->acc = sre_strdup(text, -1);
399 else if (strcmp(featurename, "DE") == 0)
400 msa->desc = sre_strdup(text, -1);
401 else if (strcmp(featurename, "AU") == 0)
402 msa->au = sre_strdup(text, -1);
403 else if (strcmp(featurename, "GA") == 0)
406 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
407 msa->ga1 = atof(text);
408 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
409 msa->ga2 = atof(text);
410 msa->flags |= MSA_SET_GA;
412 else if (strcmp(featurename, "NC") == 0)
415 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
416 msa->nc1 = atof(text);
417 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
418 msa->nc2 = atof(text);
419 msa->flags |= MSA_SET_NC;
421 else if (strcmp(featurename, "TC") == 0)
424 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
425 msa->tc1 = atof(text);
426 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
427 msa->tc2 = atof(text);
428 msa->flags |= MSA_SET_TC;
431 MSAAddGF(msa, featurename, text);
437 /* Format of a GS line:
438 * #=GS <seqname> <featurename> <text>
441 parse_gs(MSA *msa, char *buf)
451 if ((gs = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
452 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
453 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
454 if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
455 while (*text && (*text == ' ' || *text == '\t')) text++;
457 /* GS usually follows another GS; guess lastidx+1
459 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
460 msa->lastidx = seqidx;
462 if (strcmp(featurename, "WT") == 0)
464 msa->wgt[seqidx] = atof(text);
465 msa->flags |= MSA_SET_WGT;
468 else if (strcmp(featurename, "AC") == 0)
469 MSASetSeqAccession(msa, seqidx, text);
471 else if (strcmp(featurename, "DE") == 0)
472 MSASetSeqDescription(msa, seqidx, text);
475 MSAAddGS(msa, featurename, seqidx, text);
480 /* Format of a GC line:
481 * #=GC <featurename> <text>
484 parse_gc(MSA *msa, char *buf)
493 if ((gc = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
494 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
495 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
497 if (strcmp(featurename, "SS_cons") == 0)
498 sre_strcat(&(msa->ss_cons), -1, text, len);
499 else if (strcmp(featurename, "SA_cons") == 0)
500 sre_strcat(&(msa->sa_cons), -1, text, len);
501 else if (strcmp(featurename, "RF") == 0)
502 sre_strcat(&(msa->rf), -1, text, len);
504 MSAAppendGC(msa, featurename, text);
509 /* Format of a GR line:
510 * #=GR <seqname> <featurename> <text>
513 parse_gr(MSA *msa, char *buf)
525 if ((gr = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
526 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
527 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
528 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
530 /* GR usually follows sequence it refers to; guess msa->lastidx */
531 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx);
532 msa->lastidx = seqidx;
534 if (strcmp(featurename, "SS") == 0)
538 msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc);
539 msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc);
540 for (j = 0; j < msa->nseqalloc; j++)
546 msa->sslen[seqidx] = sre_strcat(&(msa->ss[seqidx]), msa->sslen[seqidx], text, len);
548 else if (strcmp(featurename, "SA") == 0)
552 msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc);
553 msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc);
554 for (j = 0; j < msa->nseqalloc; j++)
560 msa->salen[seqidx] = sre_strcat(&(msa->sa[seqidx]), msa->salen[seqidx], text, len);
563 MSAAppendGR(msa, featurename, seqidx, text);
569 /* comments are simply stored verbatim, not parsed
572 parse_comment(MSA *msa, char *buf)
577 s = buf + 1; /* skip leading '#' */
578 if (*s == '\n') { *s = '\0'; comment = s; } /* deal with blank comment */
579 else if ((comment = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
581 MSAAddComment(msa, comment);
586 parse_sequence(MSA *msa, char *buf)
595 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
596 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
598 /* seq usually follows another seq; guess msa->lastidx +1 */
599 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
600 msa->lastidx = seqidx;
602 msa->sqlen[seqidx] = sre_strcat(&(msa->aseq[seqidx]), msa->sqlen[seqidx], text, len);