+++ /dev/null
-/*****************************************************************
- * SQUID - a library of functions for biological sequence analysis
- * Copyright (C) 1992-2002 Washington University School of Medicine
- *
- * This source code is freely distributed under the terms of the
- * GNU General Public License. See the files COPYRIGHT and LICENSE
- * for details.
- *****************************************************************/
-
-/* stockholm.c
- * SRE, Fri May 28 15:46:41 1999
- *
- * Reading/writing of Stockholm format multiple sequence alignments.
- *
- * example of API:
- *
- * MSA *msa;
- * FILE *fp; -- opened for write with fopen()
- * MSAFILE *afp; -- opened for read with MSAFileOpen()
- *
- * while ((msa = ReadStockholm(afp)) != NULL)
- * {
- * WriteStockholm(fp, msa);
- * MSAFree(msa);
- * }
- *
- * RCS $Id: stockholm.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: stockholm.c,v 1.7 2002/10/12 04:40:36 eddy Exp)
- */
-#include <stdio.h>
-#include <string.h>
-#include "squid.h"
-#include "msa.h"
-
-static int parse_gf(MSA *msa, char *buf);
-static int parse_gs(MSA *msa, char *buf);
-static int parse_gc(MSA *msa, char *buf);
-static int parse_gr(MSA *msa, char *buf);
-static int parse_comment(MSA *msa, char *buf);
-static int parse_sequence(MSA *msa, char *buf);
-static void actually_write_stockholm(FILE *fp, MSA *msa, int cpl);
-
-#ifdef TESTDRIVE_STOCKHOLM
-/*****************************************************************
- * stockholm.c test driver:
- * cc -DTESTDRIVE_STOCKHOLM -g -O2 -Wall -o test stockholm.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm
- *
- */
-int
-main(int argc, char **argv)
-{
- MSAFILE *afp;
- MSA *msa;
- char *file;
-
- file = argv[1];
-
- if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
- Die("Couldn't open %s\n", file);
-
- while ((msa = ReadStockholm(afp)) != NULL)
- {
- WriteStockholm(stdout, msa);
- MSAFree(msa);
- }
-
- MSAFileClose(afp);
- exit(0);
-}
-/******************************************************************/
-#endif /* testdriver */
-
-
-/* Function: ReadStockholm()
- * Date: SRE, Fri May 21 17:33:10 1999 [St. Louis]
- *
- * Purpose: Parse the next alignment from an open Stockholm
- * format alignment file. Return the alignment, or
- * NULL if there are no more alignments in the file.
- *
- * Args: afp - open alignment file
- *
- * Returns: MSA * - an alignment object.
- * caller responsible for an MSAFree()
- * NULL if no more alignments
- *
- * Diagnostics:
- * Will Die() here with a (potentially) useful message
- * if a parsing error occurs
- */
-MSA *
-ReadStockholm(MSAFILE *afp)
-{
- MSA *msa;
- char *s;
- int status;
-
- if (feof(afp->f)) return NULL;
-
- /* Initialize allocation of the MSA.
- */
- msa = MSAAlloc(10, 0);
-
- /* Check the magic Stockholm header line.
- * We have to skip blank lines here, else we perceive
- * trailing blank lines in a file as a format error when
- * reading in multi-record mode.
- */
- do {
- if ((s = MSAFileGetLine(afp)) == NULL) {
- MSAFree(msa);
- return NULL;
- }
- } while (IsBlankline(s));
-
- if (strncmp(s, "# STOCKHOLM 1.", 14) != 0)
- Die("\
-File %s doesn't appear to be in Stockholm format.\n\
-Assuming there isn't some other problem with your file (it is an\n\
-alignment file, right?), please either:\n\
- a) use the Babelfish format autotranslator option (-B, usually);\n\
- b) specify the file's format with the --informat option; or\n\
- a) reformat the alignment to Stockholm format.\n",
- afp->fname);
-
- /* Read the alignment file one line at a time.
- */
- while ((s = MSAFileGetLine(afp)) != NULL)
- {
- while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */
-
- if (*s == '#') {
- if (strncmp(s, "#=GF", 4) == 0) status = parse_gf(msa, s);
- else if (strncmp(s, "#=GS", 4) == 0) status = parse_gs(msa, s);
- else if (strncmp(s, "#=GC", 4) == 0) status = parse_gc(msa, s);
- else if (strncmp(s, "#=GR", 4) == 0) status = parse_gr(msa, s);
- else status = parse_comment(msa, s);
- }
- else if (strncmp(s, "//", 2) == 0) break;
- else if (*s == '\n') continue;
- else status = parse_sequence(msa, s);
-
- if (status == 0)
- Die("Stockholm format parse error: line %d of file %s while reading alignment %s",
- afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
- }
-
- if (s == NULL && msa->nseq != 0)
- Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);
-
- if (s == NULL && msa->nseq == 0) {
- /* probably just some junk at end of file */
- MSAFree(msa);
- return NULL;
- }
-
- MSAVerifyParse(msa);
- return msa;
-}
-
-
-/* Function: WriteStockholm()
- * Date: SRE, Mon May 31 19:15:22 1999 [St. Louis]
- *
- * Purpose: Write an alignment in standard multi-block
- * Stockholm format to an open file. A wrapper
- * for actually_write_stockholm().
- *
- * Args: fp - file that's open for writing
- * msa - alignment to write
- *
- * Returns: (void)
- */
-void
-WriteStockholm(FILE *fp, MSA *msa)
-{
- actually_write_stockholm(fp, msa, 50); /* 50 char per block */
-}
-
-/* Function: WriteStockholmOneBlock()
- * Date: SRE, Mon May 31 19:15:22 1999 [St. Louis]
- *
- * Purpose: Write an alignment in Pfam's single-block
- * Stockholm format to an open file. A wrapper
- * for actually_write_stockholm().
- *
- * Args: fp - file that's open for writing
- * msa - alignment to write
- *
- * Returns: (void)
- */
-void
-WriteStockholmOneBlock(FILE *fp, MSA *msa)
-{
- actually_write_stockholm(fp, msa, msa->alen); /* one big block */
-}
-
-
-/* Function: actually_write_stockholm()
- * Date: SRE, Fri May 21 17:39:22 1999 [St. Louis]
- *
- * Purpose: Write an alignment in Stockholm format to
- * an open file. This is the function that actually
- * does the work. The API's WriteStockholm()
- * and WriteStockholmOneBlock() are wrappers.
- *
- * Args: fp - file that's open for writing
- * msa - alignment to write
- * cpl - characters to write per line in alignment block
- *
- * Returns: (void)
- */
-static void
-actually_write_stockholm(FILE *fp, MSA *msa, int cpl)
-{
- int i, j;
- int len = 0;
- int namewidth;
- int typewidth = 0; /* markup tags are up to 5 chars long */
- int markupwidth = 0; /* #=GR, #=GC are four char wide + 1 space */
- char *buf;
- int currpos;
- char *s, *tok;
-
- /* Figure out how much space we need for name + markup
- * to keep the alignment in register. Required by Stockholm
- * spec, even though our Stockholm parser doesn't care (Erik's does).
- */
- namewidth = 0;
- for (i = 0; i < msa->nseq; i++)
- if ((len = strlen(msa->sqname[i])) > namewidth)
- namewidth = len;
-
- /* Figure out how much space we need for markup tags
- * markupwidth = always 4 if we're doing markup: strlen("#=GR")
- * typewidth = longest markup tag
- */
- if (msa->ss != NULL) { markupwidth = 4; typewidth = 2; }
- if (msa->sa != NULL) { markupwidth = 4; typewidth = 2; }
- for (i = 0; i < msa->ngr; i++)
- if ((len = strlen(msa->gr_tag[i])) > typewidth) typewidth = len;
-
- if (msa->rf != NULL) { markupwidth = 4; if (typewidth < 2) typewidth = 2; }
- if (msa->ss_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
- if (msa->sa_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
- for (i = 0; i < msa->ngc; i++)
- if ((len = strlen(msa->gc_tag[i])) > typewidth) typewidth = len;
-
- buf = MallocOrDie(sizeof(char) * (cpl+namewidth+typewidth+markupwidth+61));
-
- /* Magic Stockholm header
- */
- fprintf(fp, "# STOCKHOLM 1.0\n");
-
- /* Free text comments
- */
- for (i = 0; i < msa->ncomment; i++)
- fprintf(fp, "# %s\n", msa->comment[i]);
- if (msa->ncomment > 0) fprintf(fp, "\n");
-
- /* GF section: per-file annotation
- */
- if (msa->name != NULL) fprintf(fp, "#=GF ID %s\n", msa->name);
- if (msa->acc != NULL) fprintf(fp, "#=GF AC %s\n", msa->acc);
- if (msa->desc != NULL) fprintf(fp, "#=GF DE %s\n", msa->desc);
- if (msa->au != NULL) fprintf(fp, "#=GF AU %s\n", msa->au);
-
- /* Thresholds are hacky. Pfam has two. Rfam has one.
- */
- if (msa->cutoff_is_set[MSA_CUTOFF_GA1] && msa->cutoff_is_set[MSA_CUTOFF_GA2])
- fprintf(fp, "#=GF GA %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_GA1], msa->cutoff[MSA_CUTOFF_GA2]);
- else if (msa->cutoff_is_set[MSA_CUTOFF_GA1])
- fprintf(fp, "#=GF GA %.1f\n", msa->cutoff[MSA_CUTOFF_GA1]);
- if (msa->cutoff_is_set[MSA_CUTOFF_NC1] && msa->cutoff_is_set[MSA_CUTOFF_NC2])
- fprintf(fp, "#=GF NC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_NC1], msa->cutoff[MSA_CUTOFF_NC2]);
- else if (msa->cutoff_is_set[MSA_CUTOFF_NC1])
- fprintf(fp, "#=GF NC %.1f\n", msa->cutoff[MSA_CUTOFF_NC1]);
- if (msa->cutoff_is_set[MSA_CUTOFF_TC1] && msa->cutoff_is_set[MSA_CUTOFF_TC2])
- fprintf(fp, "#=GF TC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_TC1], msa->cutoff[MSA_CUTOFF_TC2]);
- else if (msa->cutoff_is_set[MSA_CUTOFF_TC1])
- fprintf(fp, "#=GF TC %.1f\n", msa->cutoff[MSA_CUTOFF_TC1]);
-
- for (i = 0; i < msa->ngf; i++)
- fprintf(fp, "#=GF %-5s %s\n", msa->gf_tag[i], msa->gf[i]);
- fprintf(fp, "\n");
-
-
- /* GS section: per-sequence annotation
- */
- if (msa->flags & MSA_SET_WGT)
- {
- for (i = 0; i < msa->nseq; i++)
- fprintf(fp, "#=GS %-*.*s WT %.2f\n", namewidth, namewidth, msa->sqname[i], msa->wgt[i]);
- fprintf(fp, "\n");
- }
- if (msa->sqacc != NULL)
- {
- for (i = 0; i < msa->nseq; i++)
- if (msa->sqacc[i] != NULL)
- fprintf(fp, "#=GS %-*.*s AC %s\n", namewidth, namewidth, msa->sqname[i], msa->sqacc[i]);
- fprintf(fp, "\n");
- }
- if (msa->sqdesc != NULL)
- {
- for (i = 0; i < msa->nseq; i++)
- if (msa->sqdesc[i] != NULL)
- fprintf(fp, "#=GS %*.*s DE %s\n", namewidth, namewidth, msa->sqname[i], msa->sqdesc[i]);
- fprintf(fp, "\n");
- }
- for (i = 0; i < msa->ngs; i++)
- {
- /* Multiannotated GS tags are possible; for example,
- * #=GS foo DR PDB; 1xxx;
- * #=GS foo DR PDB; 2yyy;
- * These are stored, for example, as:
- * msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;"
- * and must be decomposed.
- */
- for (j = 0; j < msa->nseq; j++)
- if (msa->gs[i][j] != NULL)
- {
- s = msa->gs[i][j];
- while ((tok = sre_strtok(&s, "\n", NULL)) != NULL)
- fprintf(fp, "#=GS %*.*s %5s %s\n", namewidth, namewidth,
- msa->sqname[j], msa->gs_tag[i], tok);
- }
- fprintf(fp, "\n");
- }
-
- /* Alignment section:
- * contains aligned sequence, #=GR annotation, and #=GC annotation
- */
- for (currpos = 0; currpos < msa->alen; currpos += cpl)
- {
- if (currpos > 0) fprintf(fp, "\n");
- for (i = 0; i < msa->nseq; i++)
- {
- strncpy(buf, msa->aseq[i] + currpos, cpl);
- buf[cpl] = '\0';
- fprintf(fp, "%-*.*s %s\n", namewidth+typewidth+markupwidth, namewidth+typewidth+markupwidth,
- msa->sqname[i], buf);
-
- if (msa->ss != NULL && msa->ss[i] != NULL) {
- strncpy(buf, msa->ss[i] + currpos, cpl);
- buf[cpl] = '\0';
- fprintf(fp, "#=GR %-*.*s SS %s\n", namewidth, namewidth, msa->sqname[i], buf);
- }
- if (msa->sa != NULL && msa->sa[i] != NULL) {
- strncpy(buf, msa->sa[i] + currpos, cpl);
- buf[cpl] = '\0';
- fprintf(fp, "#=GR %-*.*s SA %s\n", namewidth, namewidth, msa->sqname[i], buf);
- }
- for (j = 0; j < msa->ngr; j++)
- if (msa->gr[j][i] != NULL) {
- strncpy(buf, msa->gr[j][i] + currpos, cpl);
- buf[cpl] = '\0';
- fprintf(fp, "#=GR %-*.*s %5s %s\n",
- namewidth, namewidth, msa->sqname[i], msa->gr_tag[j], buf);
- }
- }
- if (msa->ss_cons != NULL) {
- strncpy(buf, msa->ss_cons + currpos, cpl);
- buf[cpl] = '\0';
- fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SS_cons", buf);
- }
-
- if (msa->sa_cons != NULL) {
- strncpy(buf, msa->sa_cons + currpos, cpl);
- buf[cpl] = '\0';
- fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SA_cons", buf);
- }
-
- if (msa->rf != NULL) {
- strncpy(buf, msa->rf + currpos, cpl);
- buf[cpl] = '\0';
- fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "RF", buf);
- }
- for (j = 0; j < msa->ngc; j++) {
- strncpy(buf, msa->gc[j] + currpos, cpl);
- buf[cpl] = '\0';
- fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth,
- msa->gc_tag[j], buf);
- }
- }
- fprintf(fp, "//\n");
- free(buf);
-}
-
-
-
-
-
-/* Format of a GF line:
- * #=GF <featurename> <text>
- */
-static int
-parse_gf(MSA *msa, char *buf)
-{
- char *gf;
- char *featurename;
- char *text;
- char *s;
-
- s = buf;
- if ((gf = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
- while (*text && (*text == ' ' || *text == '\t')) text++;
-
- if (strcmp(featurename, "ID") == 0)
- msa->name = sre_strdup(text, -1);
- else if (strcmp(featurename, "AC") == 0)
- msa->acc = sre_strdup(text, -1);
- else if (strcmp(featurename, "DE") == 0)
- msa->desc = sre_strdup(text, -1);
- else if (strcmp(featurename, "AU") == 0)
- msa->au = sre_strdup(text, -1);
- else if (strcmp(featurename, "GA") == 0)
- { /* Pfam has GA1, GA2. Rfam just has GA1. */
- s = text;
- if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- msa->cutoff[MSA_CUTOFF_GA1] = atof(text);
- msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
- if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
- msa->cutoff[MSA_CUTOFF_GA2] = atof(text);
- msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
- }
- }
- else if (strcmp(featurename, "NC") == 0)
- {
- s = text;
- if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- msa->cutoff[MSA_CUTOFF_NC1] = atof(text);
- msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
- if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
- msa->cutoff[MSA_CUTOFF_NC2] = atof(text);
- msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
- }
- }
- else if (strcmp(featurename, "TC") == 0)
- {
- s = text;
- if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- msa->cutoff[MSA_CUTOFF_TC1] = atof(text);
- msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
- if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
- msa->cutoff[MSA_CUTOFF_TC2] = atof(text);
- msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
- }
- }
- else
- MSAAddGF(msa, featurename, text);
-
- return 1;
-}
-
-
-/* Format of a GS line:
- * #=GS <seqname> <featurename> <text>
- */
-static int
-parse_gs(MSA *msa, char *buf)
-{
- char *gs;
- char *seqname;
- char *featurename;
- char *text;
- int seqidx;
- char *s;
-
- s = buf;
- if ((gs = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
- while (*text && (*text == ' ' || *text == '\t')) text++;
-
- /* GS usually follows another GS; guess lastidx+1
- */
- seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
- msa->lastidx = seqidx;
-
- if (strcmp(featurename, "WT") == 0)
- {
- msa->wgt[seqidx] = atof(text);
- msa->flags |= MSA_SET_WGT;
- }
-
- else if (strcmp(featurename, "AC") == 0)
- MSASetSeqAccession(msa, seqidx, text);
-
- else if (strcmp(featurename, "DE") == 0)
- MSASetSeqDescription(msa, seqidx, text);
-
- else
- MSAAddGS(msa, featurename, seqidx, text);
-
- return 1;
-}
-
-/* Format of a GC line:
- * #=GC <featurename> <text>
- */
-static int
-parse_gc(MSA *msa, char *buf)
-{
- char *gc;
- char *featurename;
- char *text;
- char *s;
- int len;
-
- s = buf;
- if ((gc = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
-
- if (strcmp(featurename, "SS_cons") == 0)
- sre_strcat(&(msa->ss_cons), -1, text, len);
- else if (strcmp(featurename, "SA_cons") == 0)
- sre_strcat(&(msa->sa_cons), -1, text, len);
- else if (strcmp(featurename, "RF") == 0)
- sre_strcat(&(msa->rf), -1, text, len);
- else
- MSAAppendGC(msa, featurename, text);
-
- return 1;
-}
-
-/* Format of a GR line:
- * #=GR <seqname> <featurename> <text>
- */
-static int
-parse_gr(MSA *msa, char *buf)
-{
- char *gr;
- char *seqname;
- char *featurename;
- char *text;
- int seqidx;
- int len;
- int j;
- char *s;
-
- s = buf;
- if ((gr = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
-
- /* GR usually follows sequence it refers to; guess msa->lastidx */
- seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx);
- msa->lastidx = seqidx;
-
- if (strcmp(featurename, "SS") == 0)
- {
- if (msa->ss == NULL)
- {
- msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc);
- msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc);
- for (j = 0; j < msa->nseqalloc; j++)
- {
- msa->ss[j] = NULL;
- msa->sslen[j] = 0;
- }
- }
- msa->sslen[seqidx] = sre_strcat(&(msa->ss[seqidx]), msa->sslen[seqidx], text, len);
- }
- else if (strcmp(featurename, "SA") == 0)
- {
- if (msa->sa == NULL)
- {
- msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc);
- msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc);
- for (j = 0; j < msa->nseqalloc; j++)
- {
- msa->sa[j] = NULL;
- msa->salen[j] = 0;
- }
- }
- msa->salen[seqidx] = sre_strcat(&(msa->sa[seqidx]), msa->salen[seqidx], text, len);
- }
- else
- MSAAppendGR(msa, featurename, seqidx, text);
-
- return 1;
-}
-
-
-/* comments are simply stored verbatim, not parsed
- */
-static int
-parse_comment(MSA *msa, char *buf)
-{
- char *s;
- char *comment;
-
- s = buf + 1; /* skip leading '#' */
- if (*s == '\n') { *s = '\0'; comment = s; } /* deal with blank comment */
- else if ((comment = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
-
- MSAAddComment(msa, comment);
- return 1;
-}
-
-static int
-parse_sequence(MSA *msa, char *buf)
-{
- char *s;
- char *seqname;
- char *text;
- int seqidx;
- int len;
-
- s = buf;
- if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
- if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
-
- /* seq usually follows another seq; guess msa->lastidx +1 */
- seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
- msa->lastidx = seqidx;
-
- msa->sqlen[seqidx] = sre_strcat(&(msa->aseq[seqidx]), msa->sqlen[seqidx], text, len);
- return 1;
-}
-
-
-