--- /dev/null
+/*****************************************************************
+ * SQUID - a library of functions for biological sequence analysis
+ * Copyright (C) 1992-2002 Washington University School of Medicine
+ *
+ * This source code is freely distributed under the terms of the
+ * GNU General Public License. See the files COPYRIGHT and LICENSE
+ * for details.
+ *****************************************************************/
+
+/* msf.c
+ * SRE, Sun Jul 11 16:17:32 1993
+ *
+ * Import/export of GCG MSF multiple sequence alignment
+ * formatted files. Designed using format specifications
+ * kindly provided by Steve Smith of Genetics Computer Group.
+ *
+ * RCS $Id: msf.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: msf.c,v 1.4 2001/04/23 00:35:33 eddy Exp)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <time.h>
+#include "squid.h"
+#include "msa.h"
+
+#ifdef TESTDRIVE_MSF
+/*****************************************************************
+ * msf.c test driver:
+ * cc -DTESTDRIVE_MSF -g -O2 -Wall -o test msf.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c sqio.c alignio.c selex.c interleaved.c types.c -lm
+ *
+ */
+int
+main(int argc, char **argv)
+{
+ MSAFILE *afp;
+ MSA *msa;
+ char *file;
+
+ file = argv[1];
+
+ if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
+ Die("Couldn't open %s\n", file);
+
+ while ((msa = ReadMSF(afp)) != NULL)
+ {
+ WriteMSF(stdout, msa);
+ MSAFree(msa);
+ }
+
+ MSAFileClose(afp);
+ exit(0);
+}
+/******************************************************************/
+#endif /* testdrive_msf */
+
+
+
+/* Function: ReadMSF()
+ * Date: SRE, Tue Jun 1 08:07:22 1999 [St. Louis]
+ *
+ * Purpose: Parse an alignment read from an open MSF format
+ * alignment file. (MSF is a single-alignment format.)
+ * Return the alignment, or NULL if we've already
+ * read the alignment.
+ *
+ * Args: afp - open alignment file
+ *
+ * Returns: MSA * - an alignment object
+ * caller responsible for an MSAFree()
+ * NULL if no more alignments
+ *
+ * Diagnostics:
+ * Will Die() here with a (potentially) useful message
+ * if a parsing error occurs.
+ */
+MSA *
+ReadMSF(MSAFILE *afp)
+{
+ MSA *msa;
+ char *s;
+ int alleged_alen;
+ int alleged_type;
+ int alleged_checksum;
+ char *tok;
+ char *sp;
+ int slen;
+ int sqidx;
+ char *name;
+ char *seq;
+
+ if (feof(afp->f)) return NULL;
+ if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
+
+ /* The first line is the header.
+ * This is a new-ish GCG feature. Don't count on it, so
+ * we can be a bit more tolerant towards non-GCG software
+ * generating "MSF" files.
+ */
+ msa = MSAAlloc(10, 0);
+ if (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
+ msa->type = kAmino;
+ if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
+ } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
+ msa->type = kRNA;
+ if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
+ }
+
+ /* Now we're in the free text comment section of the MSF file.
+ * It ends when we see the "MSF: Type: Check: .." line.
+ * This line must be present.
+ */
+ do
+ {
+ if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
+ Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
+ {
+ alleged_alen = atoi(sqd_parse[0]);
+ switch (*(sqd_parse[1])) {
+ case 'N' : alleged_type = kRNA; break;
+ case 'P' : alleged_type = kAmino; break;
+ case 'X' : alleged_type = kOtherSeq; break;
+ default : alleged_type = kOtherSeq;
+ }
+ alleged_checksum = atoi(sqd_parse[3]);
+ if (msa->type == kOtherSeq) msa->type = alleged_type;
+ break; /* we're done with comment section. */
+ }
+ if (! IsBlankline(s))
+ MSAAddComment(msa, s);
+ } while ((s = MSAFileGetLine(afp)) != NULL);
+
+ /* Now we're in the name section.
+ * GCG has a relatively poorly documented feature: only sequences that
+ * appear in this list will be read from the alignment section. Commenting
+ * out sequences in the name list (by preceding them with "!") is
+ * allowed as a means of manually defining subsets of sequences in
+ * the alignment section. We can support this feature reasonably
+ * easily because of the hash table for names in the MSA: we
+ * only add names to the hash table when we see 'em in the name section.
+ */
+ while ((s = MSAFileGetLine(afp)) != NULL)
+ {
+ while ((*s == ' ' || *s == '\t') && *s) s++; /* skip leading whitespace */
+
+ if (*s == '\n') continue; /* skip blank lines */
+ else if (*s == '!') MSAAddComment(msa, s);
+ else if ((sp = strstr(s, "Name:")) != NULL)
+ {
+ /* We take the name and the weigh, and that's it */
+ sp += 5;
+ tok = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
+ sqidx = GKIStoreKey(msa->index, tok);
+ if (sqidx >= msa->nseqalloc) MSAExpand(msa);
+ msa->sqname[sqidx] = sre_strdup(tok, slen);
+ msa->nseq++;
+
+ if ((sp = strstr(sp, "Weight:")) == NULL)
+ Die("No Weight: on line %d for %s in name section of MSF file %s\n",
+ afp->linenumber, msa->sqname[sqidx], afp->fname);
+ sp += 7;
+ tok = sre_strtok(&sp, " \t", &slen);
+ msa->wgt[sqidx] = atof(tok);
+ msa->flags |= MSA_SET_WGT;
+ }
+ else if (strncmp(s, "//", 2) == 0)
+ break;
+ else
+ {
+ Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
+ afp->linenumber, afp->fname, s);
+ squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
+ return NULL;
+ }
+
+ }
+
+ /* And now we're in the sequence section.
+ * As discussed above, if we haven't seen a sequence name, then we
+ * don't include the sequence in the alignment.
+ * Also, watch out for coordinate-only lines.
+ */
+ while ((s = MSAFileGetLine(afp)) != NULL)
+ {
+ sp = s;
+ if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
+ if ((seq = sre_strtok(&sp, "\n", &slen)) == NULL) continue;
+
+ /* The test for a coord line: digits starting both fields
+ */
+ if (isdigit(*name) && isdigit(*seq))
+ continue;
+
+ /* It's not blank, and it's not a coord line: must be sequence
+ */
+ sqidx = GKIKeyIndex(msa->index, name);
+ if (sqidx < 0) continue; /* not a sequence we recognize */
+
+ msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
+ }
+
+ /* We've left blanks in the aseqs; take them back out.
+ */
+ for (sqidx = 0; sqidx < msa->nseq; sqidx++)
+ {
+ if (msa->aseq[sqidx] == NULL)
+ Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);
+
+ for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
+ {
+ if (*s == ' ' || *s == '\t') {
+ msa->sqlen[sqidx]--;
+ } else {
+ *sp = *s;
+ sp++;
+ }
+ }
+ *sp = '\0';
+ }
+
+ MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */
+ return msa;
+}
+
+
+/* Function: WriteMSF()
+ * Date: SRE, Mon May 31 11:25:18 1999 [St. Louis]
+ *
+ * Purpose: Write an alignment in MSF format to an open file.
+ *
+ * Args: fp - file that's open for writing.
+ * msa - alignment to write.
+ *
+ * Note that msa->type, usually optional, must be
+ * set for WriteMSF to work. If it isn't, a fatal
+ * error is generated.
+ *
+ * Returns: (void)
+ */
+void
+WriteMSF(FILE *fp, MSA *msa)
+{
+ time_t now; /* current time as a time_t */
+ char date[64]; /* today's date in GCG's format "October 3, 1996 15:57" */
+ char **gcg_aseq; /* aligned sequences with gaps converted to GCG format */
+ char **gcg_sqname; /* sequence names with GCG-valid character sets */
+ int idx; /* counter for sequences */
+ char *s; /* pointer into sqname or seq */
+ int len; /* tmp variable for name lengths */
+ int namelen; /* maximum name length used */
+ int pos; /* position counter */
+ char buffer[51]; /* buffer for writing seq */
+ int i; /* another position counter */
+
+ /*****************************************************************
+ * Make copies of sequence names and sequences.
+ * GCG recommends that name characters should only contain
+ * alphanumeric characters, -, or _
+ * Some GCG and GCG-compatible software is sensitive to this.
+ * We silently convert all other characters to '_'.
+ *
+ * For sequences, GCG allows only ~ and . for gaps.
+ * Otherwise, everthing is interpreted as a residue;
+ * so squid's IUPAC-restricted chars are fine. ~ means
+ * an external gap. . means an internal gap.
+ *****************************************************************/
+
+ /* make copies that we can edit */
+ gcg_aseq = MallocOrDie(sizeof(char *) * msa->nseq);
+ gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq);
+ for (idx = 0; idx < msa->nseq; idx++)
+ {
+ gcg_aseq[idx] = sre_strdup(msa->aseq[idx], msa->alen);
+ gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1);
+ }
+ /* alter names as needed */
+ for (idx = 0; idx < msa->nseq; idx++)
+ for (s = gcg_sqname[idx]; *s != '\0'; s++)
+ if (! isalnum((int) *s) && *s != '-' && *s != '_')
+ *s = '_';
+ /* alter gap chars in seq */
+ for (idx = 0; idx < msa->nseq; idx++)
+ {
+ for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++)
+ *s = '~';
+ for (; *s != '\0'; s++)
+ if (isgap(*s)) *s = '.';
+ for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--)
+ gcg_aseq[idx][pos] = '~';
+ }
+ /* calculate max namelen used */
+ namelen = 0;
+ for (idx = 0; idx < msa->nseq; idx++)
+ if ((len = strlen(msa->sqname[idx])) > namelen)
+ namelen = len;
+
+ /*****************************************************
+ * Write the MSF header
+ *****************************************************/
+ /* required file type line */
+ if (msa->type == kOtherSeq)
+ msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq);
+
+ if (msa->type == kRNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
+ else if (msa->type == kDNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
+ else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n");
+ else if (msa->type == kOtherSeq)
+ Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n");
+ else
+ Die("Invalid sequence type %d in WriteMSF()\n", msa->type);
+
+ /* free text comments */
+ if (msa->ncomment > 0)
+ {
+ for (idx = 0; idx < msa->ncomment; idx++)
+ fprintf(fp, "%s\n", msa->comment[idx]);
+ fprintf(fp, "\n");
+ }
+ /* required checksum line */
+ now = time(NULL);
+ if (strftime(date, 64, "%B %d, %Y %H:%M", localtime(&now)) == 0)
+ Die("What time is it on earth? strftime() failed in WriteMSF().\n");
+ fprintf(fp, " %s MSF: %d Type: %c %s Check: %d ..\n",
+ msa->name != NULL ? msa->name : "squid.msf",
+ msa->alen,
+ msa->type == kRNA ? 'N' : 'P',
+ date,
+ GCGMultchecksum(gcg_aseq, msa->nseq));
+ fprintf(fp, "\n");
+
+ /*****************************************************
+ * Names/weights section
+ *****************************************************/
+
+ for (idx = 0; idx < msa->nseq; idx++)
+ {
+ fprintf(fp, " Name: %-*.*s Len: %5d Check: %4d Weight: %.2f\n",
+ namelen, namelen,
+ gcg_sqname[idx],
+ msa->alen,
+ GCGchecksum(gcg_aseq[idx], msa->alen),
+ msa->wgt[idx]);
+ }
+ fprintf(fp, "\n");
+ fprintf(fp, "//\n");
+
+ /*****************************************************
+ * Write the sequences
+ *****************************************************/
+
+ for (pos = 0; pos < msa->alen; pos += 50)
+ {
+ fprintf(fp, "\n"); /* Blank line between sequence blocks */
+
+ /* Coordinate line */
+ len = (pos + 50) > msa->alen ? msa->alen - pos : 50;
+ if (len > 10)
+ fprintf(fp, "%*s %-6d%*s%6d\n", namelen, "",
+ pos+1,
+ len + ((len-1)/10) - 12, "",
+ pos + len);
+ else
+ fprintf(fp, "%*s %-6d\n", namelen, "", pos+1);
+
+ for (idx = 0; idx < msa->nseq; idx++)
+ {
+ fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]);
+ /* get next line's worth of 50 from seq */
+ strncpy(buffer, gcg_aseq[idx] + pos, 50);
+ buffer[50] = '\0';
+ /* draw the sequence line */
+ for (i = 0; i < len; i++)
+ {
+ if (! (i % 10)) fputc(' ', fp);
+ fputc(buffer[i], fp);
+ }
+ fputc('\n', fp);
+ }
+ }
+
+ Free2DArray((void **) gcg_aseq, msa->nseq);
+ Free2DArray((void **) gcg_sqname, msa->nseq);
+ return;
+}
+
+
+