+++ /dev/null
-/*****************************************************************
- * SQUID - a library of functions for biological sequence analysis
- * Copyright (C) 1992-2002 Washington University School of Medicine
- *
- * This source code is freely distributed under the terms of the
- * GNU General Public License. See the files COPYRIGHT and LICENSE
- * for details.
- *****************************************************************/
-
-/* msf.c
- * SRE, Sun Jul 11 16:17:32 1993
- *
- * Import/export of GCG MSF multiple sequence alignment
- * formatted files. Designed using format specifications
- * kindly provided by Steve Smith of Genetics Computer Group.
- *
- * RCS $Id: msf.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: msf.c,v 1.4 2001/04/23 00:35:33 eddy Exp)
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <time.h>
-#include "squid.h"
-#include "msa.h"
-
-#ifdef TESTDRIVE_MSF
-/*****************************************************************
- * msf.c test driver:
- * cc -DTESTDRIVE_MSF -g -O2 -Wall -o test msf.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c sqio.c alignio.c selex.c interleaved.c types.c -lm
- *
- */
-int
-main(int argc, char **argv)
-{
- MSAFILE *afp;
- MSA *msa;
- char *file;
-
- file = argv[1];
-
- if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
- Die("Couldn't open %s\n", file);
-
- while ((msa = ReadMSF(afp)) != NULL)
- {
- WriteMSF(stdout, msa);
- MSAFree(msa);
- }
-
- MSAFileClose(afp);
- exit(0);
-}
-/******************************************************************/
-#endif /* testdrive_msf */
-
-
-
-/* Function: ReadMSF()
- * Date: SRE, Tue Jun 1 08:07:22 1999 [St. Louis]
- *
- * Purpose: Parse an alignment read from an open MSF format
- * alignment file. (MSF is a single-alignment format.)
- * Return the alignment, or NULL if we've already
- * read the alignment.
- *
- * Args: afp - open alignment file
- *
- * Returns: MSA * - an alignment object
- * caller responsible for an MSAFree()
- * NULL if no more alignments
- *
- * Diagnostics:
- * Will Die() here with a (potentially) useful message
- * if a parsing error occurs.
- */
-MSA *
-ReadMSF(MSAFILE *afp)
-{
- MSA *msa;
- char *s;
- int alleged_alen;
- int alleged_type;
- int alleged_checksum;
- char *tok;
- char *sp;
- int slen;
- int sqidx;
- char *name;
- char *seq;
-
- if (feof(afp->f)) return NULL;
- if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
-
- /* The first line is the header.
- * This is a new-ish GCG feature. Don't count on it, so
- * we can be a bit more tolerant towards non-GCG software
- * generating "MSF" files.
- */
- msa = MSAAlloc(10, 0);
- if (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
- msa->type = kAmino;
- if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
- } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
- msa->type = kRNA;
- if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
- }
-
- /* Now we're in the free text comment section of the MSF file.
- * It ends when we see the "MSF: Type: Check: .." line.
- * This line must be present.
- */
- do
- {
- if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
- Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
- {
- alleged_alen = atoi(sqd_parse[0]);
- switch (*(sqd_parse[1])) {
- case 'N' : alleged_type = kRNA; break;
- case 'P' : alleged_type = kAmino; break;
- case 'X' : alleged_type = kOtherSeq; break;
- default : alleged_type = kOtherSeq;
- }
- alleged_checksum = atoi(sqd_parse[3]);
- if (msa->type == kOtherSeq) msa->type = alleged_type;
- break; /* we're done with comment section. */
- }
- if (! IsBlankline(s))
- MSAAddComment(msa, s);
- } while ((s = MSAFileGetLine(afp)) != NULL);
-
- /* Now we're in the name section.
- * GCG has a relatively poorly documented feature: only sequences that
- * appear in this list will be read from the alignment section. Commenting
- * out sequences in the name list (by preceding them with "!") is
- * allowed as a means of manually defining subsets of sequences in
- * the alignment section. We can support this feature reasonably
- * easily because of the hash table for names in the MSA: we
- * only add names to the hash table when we see 'em in the name section.
- */
- while ((s = MSAFileGetLine(afp)) != NULL)
- {
- while ((*s == ' ' || *s == '\t') && *s) s++; /* skip leading whitespace */
-
- if (*s == '\n') continue; /* skip blank lines */
- else if (*s == '!') MSAAddComment(msa, s);
- else if ((sp = strstr(s, "Name:")) != NULL)
- {
- /* We take the name and the weigh, and that's it */
- sp += 5;
- tok = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
- sqidx = GKIStoreKey(msa->index, tok);
- if (sqidx >= msa->nseqalloc) MSAExpand(msa);
- msa->sqname[sqidx] = sre_strdup(tok, slen);
- msa->nseq++;
-
- if ((sp = strstr(sp, "Weight:")) == NULL)
- Die("No Weight: on line %d for %s in name section of MSF file %s\n",
- afp->linenumber, msa->sqname[sqidx], afp->fname);
- sp += 7;
- tok = sre_strtok(&sp, " \t", &slen);
- msa->wgt[sqidx] = atof(tok);
- msa->flags |= MSA_SET_WGT;
- }
- else if (strncmp(s, "//", 2) == 0)
- break;
- else
- {
- Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
- afp->linenumber, afp->fname, s);
- squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
- return NULL;
- }
-
- }
-
- /* And now we're in the sequence section.
- * As discussed above, if we haven't seen a sequence name, then we
- * don't include the sequence in the alignment.
- * Also, watch out for coordinate-only lines.
- */
- while ((s = MSAFileGetLine(afp)) != NULL)
- {
- sp = s;
- if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
- if ((seq = sre_strtok(&sp, "\n", &slen)) == NULL) continue;
-
- /* The test for a coord line: digits starting both fields
- */
- if (isdigit(*name) && isdigit(*seq))
- continue;
-
- /* It's not blank, and it's not a coord line: must be sequence
- */
- sqidx = GKIKeyIndex(msa->index, name);
- if (sqidx < 0) continue; /* not a sequence we recognize */
-
- msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
- }
-
- /* We've left blanks in the aseqs; take them back out.
- */
- for (sqidx = 0; sqidx < msa->nseq; sqidx++)
- {
- if (msa->aseq[sqidx] == NULL)
- Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);
-
- for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
- {
- if (*s == ' ' || *s == '\t') {
- msa->sqlen[sqidx]--;
- } else {
- *sp = *s;
- sp++;
- }
- }
- *sp = '\0';
- }
-
- MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */
- return msa;
-}
-
-
-/* Function: WriteMSF()
- * Date: SRE, Mon May 31 11:25:18 1999 [St. Louis]
- *
- * Purpose: Write an alignment in MSF format to an open file.
- *
- * Args: fp - file that's open for writing.
- * msa - alignment to write.
- *
- * Note that msa->type, usually optional, must be
- * set for WriteMSF to work. If it isn't, a fatal
- * error is generated.
- *
- * Returns: (void)
- */
-void
-WriteMSF(FILE *fp, MSA *msa)
-{
- time_t now; /* current time as a time_t */
- char date[64]; /* today's date in GCG's format "October 3, 1996 15:57" */
- char **gcg_aseq; /* aligned sequences with gaps converted to GCG format */
- char **gcg_sqname; /* sequence names with GCG-valid character sets */
- int idx; /* counter for sequences */
- char *s; /* pointer into sqname or seq */
- int len; /* tmp variable for name lengths */
- int namelen; /* maximum name length used */
- int pos; /* position counter */
- char buffer[51]; /* buffer for writing seq */
- int i; /* another position counter */
-
- /*****************************************************************
- * Make copies of sequence names and sequences.
- * GCG recommends that name characters should only contain
- * alphanumeric characters, -, or _
- * Some GCG and GCG-compatible software is sensitive to this.
- * We silently convert all other characters to '_'.
- *
- * For sequences, GCG allows only ~ and . for gaps.
- * Otherwise, everthing is interpreted as a residue;
- * so squid's IUPAC-restricted chars are fine. ~ means
- * an external gap. . means an internal gap.
- *****************************************************************/
-
- /* make copies that we can edit */
- gcg_aseq = MallocOrDie(sizeof(char *) * msa->nseq);
- gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq);
- for (idx = 0; idx < msa->nseq; idx++)
- {
- gcg_aseq[idx] = sre_strdup(msa->aseq[idx], msa->alen);
- gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1);
- }
- /* alter names as needed */
- for (idx = 0; idx < msa->nseq; idx++)
- for (s = gcg_sqname[idx]; *s != '\0'; s++)
- if (! isalnum((int) *s) && *s != '-' && *s != '_')
- *s = '_';
- /* alter gap chars in seq */
- for (idx = 0; idx < msa->nseq; idx++)
- {
- for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++)
- *s = '~';
- for (; *s != '\0'; s++)
- if (isgap(*s)) *s = '.';
- for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--)
- gcg_aseq[idx][pos] = '~';
- }
- /* calculate max namelen used */
- namelen = 0;
- for (idx = 0; idx < msa->nseq; idx++)
- if ((len = strlen(msa->sqname[idx])) > namelen)
- namelen = len;
-
- /*****************************************************
- * Write the MSF header
- *****************************************************/
- /* required file type line */
- if (msa->type == kOtherSeq)
- msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq);
-
- if (msa->type == kRNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
- else if (msa->type == kDNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
- else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n");
- else if (msa->type == kOtherSeq)
- Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n");
- else
- Die("Invalid sequence type %d in WriteMSF()\n", msa->type);
-
- /* free text comments */
- if (msa->ncomment > 0)
- {
- for (idx = 0; idx < msa->ncomment; idx++)
- fprintf(fp, "%s\n", msa->comment[idx]);
- fprintf(fp, "\n");
- }
- /* required checksum line */
- now = time(NULL);
- if (strftime(date, 64, "%B %d, %Y %H:%M", localtime(&now)) == 0)
- Die("What time is it on earth? strftime() failed in WriteMSF().\n");
- fprintf(fp, " %s MSF: %d Type: %c %s Check: %d ..\n",
- msa->name != NULL ? msa->name : "squid.msf",
- msa->alen,
- msa->type == kRNA ? 'N' : 'P',
- date,
- GCGMultchecksum(gcg_aseq, msa->nseq));
- fprintf(fp, "\n");
-
- /*****************************************************
- * Names/weights section
- *****************************************************/
-
- for (idx = 0; idx < msa->nseq; idx++)
- {
- fprintf(fp, " Name: %-*.*s Len: %5d Check: %4d Weight: %.2f\n",
- namelen, namelen,
- gcg_sqname[idx],
- msa->alen,
- GCGchecksum(gcg_aseq[idx], msa->alen),
- msa->wgt[idx]);
- }
- fprintf(fp, "\n");
- fprintf(fp, "//\n");
-
- /*****************************************************
- * Write the sequences
- *****************************************************/
-
- for (pos = 0; pos < msa->alen; pos += 50)
- {
- fprintf(fp, "\n"); /* Blank line between sequence blocks */
-
- /* Coordinate line */
- len = (pos + 50) > msa->alen ? msa->alen - pos : 50;
- if (len > 10)
- fprintf(fp, "%*s %-6d%*s%6d\n", namelen, "",
- pos+1,
- len + ((len-1)/10) - 12, "",
- pos + len);
- else
- fprintf(fp, "%*s %-6d\n", namelen, "", pos+1);
-
- for (idx = 0; idx < msa->nseq; idx++)
- {
- fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]);
- /* get next line's worth of 50 from seq */
- strncpy(buffer, gcg_aseq[idx] + pos, 50);
- buffer[50] = '\0';
- /* draw the sequence line */
- for (i = 0; i < len; i++)
- {
- if (! (i % 10)) fputc(' ', fp);
- fputc(buffer[i], fp);
- }
- fputc('\n', fp);
- }
- }
-
- Free2DArray((void **) gcg_aseq, msa->nseq);
- Free2DArray((void **) gcg_sqname, msa->nseq);
- return;
-}
-
-
-