1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
8 *****************************************************************/
11 * SRE, Fri May 28 15:46:41 1999
13 * Reading/writing of Stockholm format multiple sequence alignments.
18 * FILE *fp; -- opened for write with fopen()
19 * MSAFILE *afp; -- opened for read with MSAFileOpen()
21 * while ((msa = ReadStockholm(afp)) != NULL)
23 * WriteStockholm(fp, msa);
27 * RCS $Id: stockholm.c 297 2014-10-31 13:02:37Z fabian $ (Original squid RCS Id: stockholm.c,v 1.7 2002/10/12 04:40:36 eddy Exp)
34 static int parse_gf(MSA *msa, char *buf);
35 static int parse_gs(MSA *msa, char *buf);
36 static int parse_gc(MSA *msa, char *buf);
37 static int parse_gr(MSA *msa, char *buf);
38 static int parse_comment(MSA *msa, char *buf);
39 static int parse_sequence(MSA *msa, char *buf);
40 static void actually_write_stockholm(FILE *fp, MSA *msa, int cpl);
42 #ifdef TESTDRIVE_STOCKHOLM
43 /*****************************************************************
44 * stockholm.c test driver:
45 * cc -DTESTDRIVE_STOCKHOLM -g -O2 -Wall -o test stockholm.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm
49 main(int argc, char **argv)
57 if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
58 Die("Couldn't open %s\n", file);
60 while ((msa = ReadStockholm(afp)) != NULL)
62 WriteStockholm(stdout, msa);
69 /******************************************************************/
70 #endif /* testdriver */
73 /* Function: ReadStockholm()
74 * Date: SRE, Fri May 21 17:33:10 1999 [St. Louis]
76 * Purpose: Parse the next alignment from an open Stockholm
77 * format alignment file. Return the alignment, or
78 * NULL if there are no more alignments in the file.
80 * Args: afp - open alignment file
82 * Returns: MSA * - an alignment object.
83 * caller responsible for an MSAFree()
84 * NULL if no more alignments
87 * Will Die() here with a (potentially) useful message
88 * if a parsing error occurs
91 ReadStockholm(MSAFILE *afp)
97 if (feof(afp->f)) return NULL;
99 /* Initialize allocation of the MSA.
101 msa = MSAAlloc(10, 0);
103 /* Check the magic Stockholm header line.
104 * We have to skip blank lines here, else we perceive
105 * trailing blank lines in a file as a format error when
106 * reading in multi-record mode.
109 if ((s = MSAFileGetLine(afp)) == NULL) {
113 } while (IsBlankline(s));
115 if (strncmp(s, "# STOCKHOLM 1.", 14) != 0)
117 File %s doesn't appear to be in Stockholm format.\n\
118 Assuming there isn't some other problem with your file (it is an\n\
119 alignment file, right?), please either:\n\
120 a) use the Babelfish format autotranslator option (-B, usually);\n\
121 b) specify the file's format with the --informat option; or\n\
122 a) reformat the alignment to Stockholm format.\n",
125 /* Read the alignment file one line at a time.
127 while ((s = MSAFileGetLine(afp)) != NULL)
129 while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */
132 if (strncmp(s, "#=GF", 4) == 0) status = parse_gf(msa, s);
133 else if (strncmp(s, "#=GS", 4) == 0) status = parse_gs(msa, s);
134 else if (strncmp(s, "#=GC", 4) == 0) status = parse_gc(msa, s);
135 else if (strncmp(s, "#=GR", 4) == 0) status = parse_gr(msa, s);
136 else status = parse_comment(msa, s);
138 else if (strncmp(s, "//", 2) == 0) break;
139 else if (*s == '\n') continue;
140 else status = parse_sequence(msa, s);
143 Die("Stockholm format parse error: line %d of file %s while reading alignment %s",
144 afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
147 if (s == NULL && msa->nseq != 0)
148 Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);
150 if (s == NULL && msa->nseq == 0) {
151 /* probably just some junk at end of file */
160 /* Function: ReadDublin()
163 * Purpose: Parse the next sequences from an open Dublin
164 * format file. Return the sequences, or
165 * NULL if there are no more sequences in the file.
167 * Args: afp - open alignment file
169 * Returns: MSA * - an alignment object.
170 * caller responsible for an MSAFree()
171 * NULL if no more alignments
174 * Will Die() here with a (potentially) useful message
175 * if a parsing error occurs
178 ReadDublin(MSAFILE *afp)
184 if (feof(afp->f)) return NULL;
186 /* Initialize allocation of the MSA.
188 msa = MSAAlloc(10, 0);
190 /* Check the magic Dublin header line.
191 * We have to skip blank lines here, else we perceive
192 * trailing blank lines in a file as a format error when
193 * reading in multi-record mode.
196 if ((s = MSAFileGetLine(afp)) == NULL) {
200 } while (IsBlankline(s));
202 if (strncmp(s, "# DUBLIN 1.", 11) != 0)
204 File %s doesn't appear to be in Dublin format.\n",
207 /* Read the alignment file one line at a time.
209 while ((s = MSAFileGetLine(afp)) != NULL)
211 while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */
214 if (strncmp(s, "#=GF", 4) == 0) status = parse_gf(msa, s);
215 else if (strncmp(s, "#=GS", 4) == 0) status = parse_gs(msa, s);
216 else if (strncmp(s, "#=GC", 4) == 0) status = parse_gc(msa, s);
217 else if (strncmp(s, "#=GR", 4) == 0) status = parse_gr(msa, s);
218 else status = parse_comment(msa, s);
220 else if (strncmp(s, "//", 2) == 0) break;
221 else if (*s == '\n') continue;
222 else status = parse_sequence(msa, s);
225 Die("Dublin format parse error: line %d of file %s while reading alignment %s",
226 afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
229 if (s == NULL && msa->nseq != 0)
230 Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);
232 if (s == NULL && msa->nseq == 0) {
233 /* probably just some junk at end of file */
238 MSAVerifyParseDub(msa);
241 } /* this is the end of ReadDublin() */
244 /* Function: WriteStockholm()
245 * Date: SRE, Mon May 31 19:15:22 1999 [St. Louis]
247 * Purpose: Write an alignment in standard multi-block
248 * Stockholm format to an open file. A wrapper
249 * for actually_write_stockholm().
251 * Args: fp - file that's open for writing
252 * msa - alignment to write
257 WriteStockholm(FILE *fp, MSA *msa)
259 actually_write_stockholm(fp, msa, 50); /* 50 char per block */
262 /* Function: WriteStockholmOneBlock()
263 * Date: SRE, Mon May 31 19:15:22 1999 [St. Louis]
265 * Purpose: Write an alignment in Pfam's single-block
266 * Stockholm format to an open file. A wrapper
267 * for actually_write_stockholm().
269 * Args: fp - file that's open for writing
270 * msa - alignment to write
275 WriteStockholmOneBlock(FILE *fp, MSA *msa)
277 actually_write_stockholm(fp, msa, msa->alen); /* one big block */
281 /* Function: actually_write_stockholm()
282 * Date: SRE, Fri May 21 17:39:22 1999 [St. Louis]
284 * Purpose: Write an alignment in Stockholm format to
285 * an open file. This is the function that actually
286 * does the work. The API's WriteStockholm()
287 * and WriteStockholmOneBlock() are wrappers.
289 * Args: fp - file that's open for writing
290 * msa - alignment to write
291 * cpl - characters to write per line in alignment block
296 actually_write_stockholm(FILE *fp, MSA *msa, int cpl)
301 int typewidth = 0; /* markup tags are up to 5 chars long */
302 int markupwidth = 0; /* #=GR, #=GC are four char wide + 1 space */
307 /* Figure out how much space we need for name + markup
308 * to keep the alignment in register. Required by Stockholm
309 * spec, even though our Stockholm parser doesn't care (Erik's does).
312 for (i = 0; i < msa->nseq; i++)
313 if ((len = strlen(msa->sqname[i])) > namewidth)
316 /* Figure out how much space we need for markup tags
317 * markupwidth = always 4 if we're doing markup: strlen("#=GR")
318 * typewidth = longest markup tag
320 if (msa->ss != NULL) { markupwidth = 4; typewidth = 2; }
321 if (msa->sa != NULL) { markupwidth = 4; typewidth = 2; }
322 for (i = 0; i < msa->ngr; i++)
323 if ((len = strlen(msa->gr_tag[i])) > typewidth) typewidth = len;
325 if (msa->rf != NULL) { markupwidth = 4; if (typewidth < 2) typewidth = 2; }
326 if (msa->ss_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
327 if (msa->sa_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
328 for (i = 0; i < msa->ngc; i++)
329 if ((len = strlen(msa->gc_tag[i])) > typewidth) typewidth = len;
331 buf = MallocOrDie(sizeof(char) * (cpl+namewidth+typewidth+markupwidth+61));
333 /* Magic Stockholm header
335 fprintf(fp, "# STOCKHOLM 1.0\n");
337 /* Free text comments
339 for (i = 0; i < msa->ncomment; i++)
340 fprintf(fp, "# %s\n", msa->comment[i]);
341 if (msa->ncomment > 0) fprintf(fp, "\n");
343 /* GF section: per-file annotation
345 if (msa->name != NULL) fprintf(fp, "#=GF ID %s\n", msa->name);
346 if (msa->acc != NULL) fprintf(fp, "#=GF AC %s\n", msa->acc);
347 if (msa->desc != NULL) fprintf(fp, "#=GF DE %s\n", msa->desc);
348 if (msa->au != NULL) fprintf(fp, "#=GF AU %s\n", msa->au);
350 /* Thresholds are hacky. Pfam has two. Rfam has one.
352 if (msa->cutoff_is_set[MSA_CUTOFF_GA1] && msa->cutoff_is_set[MSA_CUTOFF_GA2])
353 fprintf(fp, "#=GF GA %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_GA1], msa->cutoff[MSA_CUTOFF_GA2]);
354 else if (msa->cutoff_is_set[MSA_CUTOFF_GA1])
355 fprintf(fp, "#=GF GA %.1f\n", msa->cutoff[MSA_CUTOFF_GA1]);
356 if (msa->cutoff_is_set[MSA_CUTOFF_NC1] && msa->cutoff_is_set[MSA_CUTOFF_NC2])
357 fprintf(fp, "#=GF NC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_NC1], msa->cutoff[MSA_CUTOFF_NC2]);
358 else if (msa->cutoff_is_set[MSA_CUTOFF_NC1])
359 fprintf(fp, "#=GF NC %.1f\n", msa->cutoff[MSA_CUTOFF_NC1]);
360 if (msa->cutoff_is_set[MSA_CUTOFF_TC1] && msa->cutoff_is_set[MSA_CUTOFF_TC2])
361 fprintf(fp, "#=GF TC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_TC1], msa->cutoff[MSA_CUTOFF_TC2]);
362 else if (msa->cutoff_is_set[MSA_CUTOFF_TC1])
363 fprintf(fp, "#=GF TC %.1f\n", msa->cutoff[MSA_CUTOFF_TC1]);
365 for (i = 0; i < msa->ngf; i++)
366 fprintf(fp, "#=GF %-5s %s\n", msa->gf_tag[i], msa->gf[i]);
370 /* GS section: per-sequence annotation
372 if (msa->flags & MSA_SET_WGT)
374 for (i = 0; i < msa->nseq; i++)
375 fprintf(fp, "#=GS %-*.*s WT %.2f\n", namewidth, namewidth, msa->sqname[i], msa->wgt[i]);
378 if (msa->sqacc != NULL)
380 for (i = 0; i < msa->nseq; i++)
381 if (msa->sqacc[i] != NULL)
382 fprintf(fp, "#=GS %-*.*s AC %s\n", namewidth, namewidth, msa->sqname[i], msa->sqacc[i]);
385 if (msa->sqdesc != NULL)
387 for (i = 0; i < msa->nseq; i++)
388 if (msa->sqdesc[i] != NULL)
389 fprintf(fp, "#=GS %*.*s DE %s\n", namewidth, namewidth, msa->sqname[i], msa->sqdesc[i]);
392 for (i = 0; i < msa->ngs; i++)
394 /* Multiannotated GS tags are possible; for example,
395 * #=GS foo DR PDB; 1xxx;
396 * #=GS foo DR PDB; 2yyy;
397 * These are stored, for example, as:
398 * msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;"
399 * and must be decomposed.
401 for (j = 0; j < msa->nseq; j++)
402 if (msa->gs[i][j] != NULL)
405 while ((tok = sre_strtok(&s, "\n", NULL)) != NULL)
406 fprintf(fp, "#=GS %*.*s %5s %s\n", namewidth, namewidth,
407 msa->sqname[j], msa->gs_tag[i], tok);
412 /* Alignment section:
413 * contains aligned sequence, #=GR annotation, and #=GC annotation
415 for (currpos = 0; currpos < msa->alen; currpos += cpl)
417 if (currpos > 0) fprintf(fp, "\n");
418 for (i = 0; i < msa->nseq; i++)
420 strncpy(buf, msa->aseq[i] + currpos, cpl);
422 fprintf(fp, "%-*.*s %s\n", namewidth+typewidth+markupwidth, namewidth+typewidth+markupwidth,
423 msa->sqname[i], buf);
425 if (msa->ss != NULL && msa->ss[i] != NULL) {
426 strncpy(buf, msa->ss[i] + currpos, cpl);
428 fprintf(fp, "#=GR %-*.*s SS %s\n", namewidth, namewidth, msa->sqname[i], buf);
430 if (msa->sa != NULL && msa->sa[i] != NULL) {
431 strncpy(buf, msa->sa[i] + currpos, cpl);
433 fprintf(fp, "#=GR %-*.*s SA %s\n", namewidth, namewidth, msa->sqname[i], buf);
435 for (j = 0; j < msa->ngr; j++)
436 if (msa->gr[j][i] != NULL) {
437 strncpy(buf, msa->gr[j][i] + currpos, cpl);
439 fprintf(fp, "#=GR %-*.*s %5s %s\n",
440 namewidth, namewidth, msa->sqname[i], msa->gr_tag[j], buf);
443 if (msa->ss_cons != NULL) {
444 strncpy(buf, msa->ss_cons + currpos, cpl);
446 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SS_cons", buf);
449 if (msa->sa_cons != NULL) {
450 strncpy(buf, msa->sa_cons + currpos, cpl);
452 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SA_cons", buf);
455 if (msa->rf != NULL) {
456 strncpy(buf, msa->rf + currpos, cpl);
458 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "RF", buf);
460 for (j = 0; j < msa->ngc; j++) {
461 strncpy(buf, msa->gc[j] + currpos, cpl);
463 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth,
464 msa->gc_tag[j], buf);
475 /* Format of a GF line:
476 * #=GF <featurename> <text>
479 parse_gf(MSA *msa, char *buf)
487 if ((gf = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
488 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
489 if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
490 while (*text && (*text == ' ' || *text == '\t')) text++;
492 if (strcmp(featurename, "ID") == 0)
493 msa->name = sre_strdup(text, -1);
494 else if (strcmp(featurename, "AC") == 0)
495 msa->acc = sre_strdup(text, -1);
496 else if (strcmp(featurename, "DE") == 0)
497 msa->desc = sre_strdup(text, -1);
498 else if (strcmp(featurename, "AU") == 0)
499 msa->au = sre_strdup(text, -1);
500 else if (strcmp(featurename, "GA") == 0)
501 { /* Pfam has GA1, GA2. Rfam just has GA1. */
503 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
504 msa->cutoff[MSA_CUTOFF_GA1] = atof(text);
505 msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
506 if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
507 msa->cutoff[MSA_CUTOFF_GA2] = atof(text);
508 msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
511 else if (strcmp(featurename, "NC") == 0)
514 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
515 msa->cutoff[MSA_CUTOFF_NC1] = atof(text);
516 msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
517 if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
518 msa->cutoff[MSA_CUTOFF_NC2] = atof(text);
519 msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
522 else if (strcmp(featurename, "TC") == 0)
525 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
526 msa->cutoff[MSA_CUTOFF_TC1] = atof(text);
527 msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
528 if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
529 msa->cutoff[MSA_CUTOFF_TC2] = atof(text);
530 msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
534 MSAAddGF(msa, featurename, text);
540 /* Format of a GS line:
541 * #=GS <seqname> <featurename> <text>
544 parse_gs(MSA *msa, char *buf)
554 if ((gs = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
555 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
556 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
557 if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
558 while (*text && (*text == ' ' || *text == '\t')) text++;
560 /* GS usually follows another GS; guess lastidx+1
562 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
563 msa->lastidx = seqidx;
565 if (strcmp(featurename, "WT") == 0)
567 msa->wgt[seqidx] = atof(text);
568 msa->flags |= MSA_SET_WGT;
571 else if (strcmp(featurename, "AC") == 0)
572 MSASetSeqAccession(msa, seqidx, text);
574 else if (strcmp(featurename, "DE") == 0)
575 MSASetSeqDescription(msa, seqidx, text);
578 MSAAddGS(msa, featurename, seqidx, text);
583 /* Format of a GC line:
584 * #=GC <featurename> <text>
587 parse_gc(MSA *msa, char *buf)
596 if ((gc = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
597 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
598 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
600 if (strcmp(featurename, "SS_cons") == 0)
601 sre_strcat(&(msa->ss_cons), -1, text, len);
602 else if (strcmp(featurename, "SA_cons") == 0)
603 sre_strcat(&(msa->sa_cons), -1, text, len);
604 else if (strcmp(featurename, "RF") == 0)
605 sre_strcat(&(msa->rf), -1, text, len);
607 MSAAppendGC(msa, featurename, text);
612 /* Format of a GR line:
613 * #=GR <seqname> <featurename> <text>
616 parse_gr(MSA *msa, char *buf)
628 if ((gr = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
629 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
630 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
631 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
633 /* GR usually follows sequence it refers to; guess msa->lastidx */
634 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx);
635 msa->lastidx = seqidx;
637 if (strcmp(featurename, "SS") == 0)
641 msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc);
642 msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc);
643 for (j = 0; j < msa->nseqalloc; j++)
649 msa->sslen[seqidx] = sre_strcat(&(msa->ss[seqidx]), msa->sslen[seqidx], text, len);
651 else if (strcmp(featurename, "SA") == 0)
655 msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc);
656 msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc);
657 for (j = 0; j < msa->nseqalloc; j++)
663 msa->salen[seqidx] = sre_strcat(&(msa->sa[seqidx]), msa->salen[seqidx], text, len);
665 else if (strcmp(featurename, "CO") == 0)
669 msa->co = MallocOrDie(sizeof(char *) * msa->nseqalloc);
670 msa->colen = MallocOrDie(sizeof(int) * msa->nseqalloc);
671 for (j = 0; j < msa->nseqalloc; j++)
677 msa->colen[seqidx] = sre_strcat(&(msa->co[seqidx]), msa->colen[seqidx], text, len);
680 MSAAppendGR(msa, featurename, seqidx, text);
686 /* comments are simply stored verbatim, not parsed
689 parse_comment(MSA *msa, char *buf)
694 s = buf + 1; /* skip leading '#' */
695 if (*s == '\n') { *s = '\0'; comment = s; } /* deal with blank comment */
696 else if ((comment = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
698 MSAAddComment(msa, comment);
703 parse_sequence(MSA *msa, char *buf)
712 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
713 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
715 /* seq usually follows another seq; guess msa->lastidx +1 */
716 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
717 msa->lastidx = seqidx;
719 msa->sqlen[seqidx] = sre_strcat(&(msa->aseq[seqidx]), msa->sqlen[seqidx], text, len);