+++ /dev/null
-/*****************************************************************
- * SQUID - a library of functions for biological sequence analysis
- * Copyright (C) 1992-2002 Washington University School of Medicine
- *
- * This source code is freely distributed under the terms of the
- * GNU General Public License. See the files COPYRIGHT and LICENSE
- * for details.
- *****************************************************************/
-
-/* File: sqio.c
- * From: ureadseq.c in Don Gilbert's sequence i/o package
- *
- * Reads and writes nucleic/protein sequence in various
- * formats. Data files may have multiple sequences.
- *
- * Heavily modified from READSEQ package
- * Copyright (C) 1990 by D.G. Gilbert
- * Biology Dept., Indiana University, Bloomington, IN 47405
- * email: gilbertd@bio.indiana.edu
- * Thanks Don!
- *
- * SRE: Modifications as noted. Fri Jul 3 09:44:54 1992
- * Packaged for squid, Thu Oct 1 10:07:11 1992
- * ANSI conversion in full swing, Mon Jul 12 12:22:21 1993
- *
- * CVS $Id: sqio.c,v 1.29 2002/08/26 23:10:52 eddy Exp)
- *
- *****************************************************************
- * Basic API for single sequence reading:
- *
- * SQFILE *sqfp;
- * char *seqfile;
- * int format; - see squid.h for formats; example: SQFILE_FASTA
- * char *seq;
- * SQINFO sqinfo;
- *
- * if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL)
- * Die("Failed to open sequence database file %s\n%s\n", seqfile, usage);
- * while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) {
- * do_stuff;
- * FreeSequence(seq, &sqinfo);
- * }
- * SeqfileClose(sqfp);
- *
- *****************************************************************
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-
-#ifndef SEEK_SET
-#include <unistd.h>
-#endif
-
-#include "squid.h"
-#include "msa.h"
-#include "ssi.h"
-
-static void SeqfileGetLine(SQFILE *V);
-
-#define kStartLength 500
-
-static char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*";
-static char *primenuc = "ACGTUN";
-static char *protonly = "EFIPQZ";
-
-static SQFILE *seqfile_open(char *filename, int format, char *env, int ssimode);
-
-/* Function: SeqfileOpen()
- *
- * Purpose : Open a sequence database file and prepare for reading
- * sequentially.
- *
- * Args: filename - name of file to open
- * format - format of file
- * env - environment variable for path (e.g. BLASTDB)
- * ssimode - -1, SSI_OFFSET_I32, or SSI_OFFSET_I64
- *
- * Returns opened SQFILE ptr, or NULL on failure.
- */
-SQFILE *
-SeqfileOpen(char *filename, int format, char *env)
-{
- return seqfile_open(filename, format, env, -1);
-}
-SQFILE *
-SeqfileOpenForIndexing(char *filename, int format, char *env, int ssimode)
-{
- return seqfile_open(filename, format, env, ssimode);
-}
-static SQFILE *
-seqfile_open(char *filename, int format, char *env, int ssimode)
-{
- SQFILE *dbfp;
-
- dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE));
-
- dbfp->ssimode = ssimode;
- dbfp->rpl = -1; /* flag meaning "unset" */
- dbfp->lastrpl = 0;
- dbfp->maxrpl = 0;
- dbfp->bpl = -1; /* flag meaning "unset" */
- dbfp->lastbpl = 0;
- dbfp->maxbpl = 0;
-
- /* Open our file handle.
- * Three possibilities:
- * 1. normal file open
- * 2. filename = "-"; read from stdin
- * 3. filename = "*.gz"; read thru pipe from gzip
- * If we're reading from stdin or a pipe, we can't reliably
- * back up, so we can't do two-pass parsers like the interleaved alignment
- * formats.
- */
- if (strcmp(filename, "-") == 0)
- {
- dbfp->f = stdin;
- dbfp->do_stdin = TRUE;
- dbfp->do_gzip = FALSE;
- dbfp->fname = sre_strdup("[STDIN]", -1);
- }
-#ifndef SRE_STRICT_ANSI
- /* popen(), pclose() aren't portable to non-POSIX systems; disable */
- else if (Strparse("^.*\\.gz$", filename, 0))
- {
- char cmd[256];
-
- /* Note that popen() will return "successfully"
- * if file doesn't exist, because gzip works fine
- * and prints an error! So we have to check for
- * existence of file ourself.
- */
- if (! FileExists(filename))
- Die("%s: file does not exist", filename);
-
- if (strlen(filename) + strlen("gzip -dc ") >= 256)
- Die("filename > 255 char in SeqfileOpen()");
- sprintf(cmd, "gzip -dc %s", filename);
- if ((dbfp->f = popen(cmd, "r")) == NULL)
- return NULL;
-
- dbfp->do_stdin = FALSE;
- dbfp->do_gzip = TRUE;
- dbfp->fname = sre_strdup(filename, -1);
- }
-#endif /*SRE_STRICT_ANSI*/
- else
- {
- if ((dbfp->f = fopen(filename, "r")) == NULL &&
- (dbfp->f = EnvFileOpen(filename, env, NULL)) == NULL)
- return NULL;
-
- dbfp->do_stdin = FALSE;
- dbfp->do_gzip = FALSE;
- dbfp->fname = sre_strdup(filename, -1);
- }
-
-
- /* Invoke autodetection if we haven't already been told what
- * to expect.
- */
- if (format == SQFILE_UNKNOWN)
- {
- if (dbfp->do_stdin == TRUE || dbfp->do_gzip)
- Die("Can't autodetect sequence file format from a stdin or gzip pipe");
- format = SeqfileFormat(dbfp->f);
- if (format == SQFILE_UNKNOWN)
- Die("Can't determine format of sequence file %s", dbfp->fname);
- }
-
- /* The hack for sequential access of an interleaved alignment file:
- * read the alignment in, we'll copy sequences out one at a time.
- */
- dbfp->msa = NULL;
- dbfp->afp = NULL;
- dbfp->format = format;
- dbfp->linenumber = 0;
- dbfp->buf = NULL;
- dbfp->buflen = 0;
- if (IsAlignmentFormat(format))
- {
- /* We'll be reading from the MSA interface. Copy our data
- * to the MSA afp's structure.
- */
- dbfp->afp = MallocOrDie(sizeof(MSAFILE));
- dbfp->afp->f = dbfp->f; /* just a ptr, don't close */
- dbfp->afp->do_stdin = dbfp->do_stdin;
- dbfp->afp->do_gzip = dbfp->do_gzip;
- dbfp->afp->fname = dbfp->fname; /* just a ptr, don't free */
- dbfp->afp->format = dbfp->format; /* e.g. format */
- dbfp->afp->linenumber = dbfp->linenumber; /* e.g. 0 */
- dbfp->afp->buf = NULL;
- dbfp->afp->buflen = 0;
-
- if ((dbfp->msa = MSAFileRead(dbfp->afp)) == NULL)
- Die("Failed to read any alignment data from file %s", dbfp->fname);
- /* hack: overload/reuse msa->lastidx; indicates
- next seq to return upon a ReadSeq() call */
- dbfp->msa->lastidx = 0;
-
- return dbfp;
- }
-
- /* Load the first line.
- */
- SeqfileGetLine(dbfp);
- return dbfp;
-}
-
-/* Function: SeqfilePosition()
- *
- * Purpose: Move to a particular offset in a seqfile.
- * Will not work on alignment files.
- */
-void
-SeqfilePosition(SQFILE *sqfp, SSIOFFSET *offset)
-{
- if (sqfp->do_stdin || sqfp->do_gzip || IsAlignmentFormat(sqfp->format))
- Die("SeqfilePosition() failed: in a nonrewindable data file or stream");
-
- if (SSISetFilePosition(sqfp->f, offset) != 0)
- Die("SSISetFilePosition failed, but that shouldn't happen.");
- SeqfileGetLine(sqfp);
-}
-
-
-/* Function: SeqfileRewind()
- *
- * Purpose: Set a sequence file back to the first sequence.
- *
- * Won't work on alignment files. Although it would
- * seem that it could (just set msa->lastidx back to 0),
- * that'll fail on "multiple multiple" alignment file formats
- * (e.g. Stockholm).
- */
-void
-SeqfileRewind(SQFILE *sqfp)
-{
- if (sqfp->do_stdin || sqfp->do_gzip)
- Die("SeqfileRewind() failed: in a nonrewindable data file or stream");
-
- rewind(sqfp->f);
- SeqfileGetLine(sqfp);
-}
-
-/* Function: SeqfileLineParameters()
- * Date: SRE, Thu Feb 15 17:00:41 2001 [St. Louis]
- *
- * Purpose: After all the sequences have been read from the file,
- * but before closing it, retrieve overall bytes-per-line and
- * residues-per-line info. If non-zero, these mean that
- * the file contains homogeneous sequence line lengths (except
- * the last line in each record).
- *
- * If either of bpl or rpl is determined to be inhomogeneous,
- * both are returned as 0.
- *
- * Args: *sqfp - an open but fully read sequence file
- * ret_bpl - RETURN: bytes per line, or 0 if inhomogeneous
- * ret_rpl - RETURN: residues per line, or 0 if inhomogenous.
- *
- * Returns: void
- */
-void
-SeqfileLineParameters(SQFILE *V, int *ret_bpl, int *ret_rpl)
-{
- if (V->rpl > 0 && V->maxrpl == V->rpl &&
- V->bpl > 0 && V->maxbpl == V->bpl) {
- *ret_bpl = V->bpl;
- *ret_rpl = V->rpl;
- } else {
- *ret_bpl = 0;
- *ret_rpl = 0;
- }
-}
-
-
-void
-SeqfileClose(SQFILE *sqfp)
-{
- /* note: don't test for sqfp->msa being NULL. Now that
- * we're holding afp open and allowing access to multi-MSA
- * databases (e.g. Stockholm format, Pfam), msa ends
- * up being NULL when we run out of alignments.
- */
- if (sqfp->afp != NULL) {
- if (sqfp->msa != NULL) MSAFree(sqfp->msa);
- if (sqfp->afp->buf != NULL) free(sqfp->afp->buf);
- free(sqfp->afp);
- }
-#ifndef SRE_STRICT_ANSI /* gunzip functionality only on POSIX systems */
- if (sqfp->do_gzip) pclose(sqfp->f);
-#endif
- else if (! sqfp->do_stdin) fclose(sqfp->f);
- if (sqfp->buf != NULL) free(sqfp->buf);
- if (sqfp->fname != NULL) free(sqfp->fname);
- free(sqfp);
-}
-
-
-/* Function: SeqfileGetLine()
- * Date: SRE, Tue Jun 22 09:15:49 1999 [Sanger Centre]
- *
- * Purpose: read a line from a sequence file into V->buf
- * If the fgets() is NULL, sets V->buf[0] to '\0'.
- *
- * Args: V
- *
- * Returns: void
- */
-static void
-SeqfileGetLine(SQFILE *V)
-{
- if (V->ssimode >= 0)
- if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->ssioffset)))
- Die("SSIGetFilePosition() failed");
- if (sre_fgets(&(V->buf), &(V->buflen), V->f) == NULL)
- *(V->buf) = '\0';
- V->linenumber++;
-}
-
-
-void
-FreeSequence(char *seq, SQINFO *sqinfo)
-{
- if (seq != NULL) free(seq); /* FS, r244, here is potential problem in profile/profile */
- if (sqinfo->flags & SQINFO_SS){
- if (NULL != sqinfo->ss){ /* FS, r244 -> r245 */
- free(sqinfo->ss);
- }
- }
- if (sqinfo->flags & SQINFO_SA){
- if (NULL != sqinfo->sa){ /* FS, r244 -> r245 */
- free(sqinfo->sa);
- }
- }
-}
-
-int
-SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag)
-{
- int len;
- int pos;
-
- /* silently ignore NULL. */
- if (sptr == NULL) return 1;
-
- while (*sptr == ' ') sptr++; /* ignore leading whitespace */
- for (pos = strlen(sptr)-1; pos >= 0; pos--)
- if (! isspace((int) sptr[pos])) break;
- sptr[pos+1] = '\0'; /* ignore trailing whitespace */
-
- switch (flag) {
- case SQINFO_NAME:
- if (*sptr != '-')
- {
- strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1);
- sqinfo->name[SQINFO_NAMELEN-1] = '\0';
- sqinfo->flags |= SQINFO_NAME;
- }
- break;
-
- case SQINFO_ID:
- if (*sptr != '-')
- {
- strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1);
- sqinfo->id[SQINFO_NAMELEN-1] = '\0';
- sqinfo->flags |= SQINFO_ID;
- }
- break;
-
- case SQINFO_ACC:
- if (*sptr != '-')
- {
- strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1);
- sqinfo->acc[SQINFO_NAMELEN-1] = '\0';
- sqinfo->flags |= SQINFO_ACC;
- }
- break;
-
- case SQINFO_DESC:
- if (*sptr != '-')
- {
- if (sqinfo->flags & SQINFO_DESC) /* append? */
- {
- len = strlen(sqinfo->desc);
- if (len < SQINFO_DESCLEN-2) /* is there room? */
- {
- strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++;
- strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len);
- }
- }
- else /* else copy */
- strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1);
- sqinfo->desc[SQINFO_DESCLEN-1] = '\0';
- sqinfo->flags |= SQINFO_DESC;
- }
- break;
-
- case SQINFO_START:
- if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
- sqinfo->start = atoi(sptr);
- if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START;
- break;
-
- case SQINFO_STOP:
- if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
- sqinfo->stop = atoi(sptr);
- if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP;
- break;
-
- case SQINFO_OLEN:
- if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
- sqinfo->olen = atoi(sptr);
- if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN;
- break;
-
- default:
- Die("Invalid flag %d to SetSeqinfoString()", flag);
- }
- return 1;
-}
-
-void
-SeqinfoCopy(SQINFO *sq1, SQINFO *sq2)
-{
- sq1->flags = sq2->flags;
- if (sq2->flags & SQINFO_NAME) strcpy(sq1->name, sq2->name);
- if (sq2->flags & SQINFO_ID) strcpy(sq1->id, sq2->id);
- if (sq2->flags & SQINFO_ACC) strcpy(sq1->acc, sq2->acc);
- if (sq2->flags & SQINFO_DESC) strcpy(sq1->desc, sq2->desc);
- if (sq2->flags & SQINFO_LEN) sq1->len = sq2->len;
- if (sq2->flags & SQINFO_START) sq1->start = sq2->start;
- if (sq2->flags & SQINFO_STOP) sq1->stop = sq2->stop;
- if (sq2->flags & SQINFO_OLEN) sq1->olen = sq2->olen;
- if (sq2->flags & SQINFO_TYPE) sq1->type = sq2->type;
- if (sq2->flags & SQINFO_SS) sq1->ss = Strdup(sq2->ss);
- if (sq2->flags & SQINFO_SA) sq1->sa = Strdup(sq2->sa);
-}
-
-/* Function: ToDNA()
- *
- * Purpose: Convert a sequence to DNA.
- * U --> T
- */
-void
-ToDNA(char *seq)
-{
- for (; *seq != '\0'; seq++)
- {
- if (*seq == 'U') *seq = 'T';
- else if (*seq == 'u') *seq = 't';
- }
-}
-
-/* Function: ToRNA()
- *
- * Purpose: Convert a sequence to RNA.
- * T --> U
- */
-void
-ToRNA(char *seq)
-{
- for (; *seq != '\0'; seq++)
- {
- if (*seq == 'T') *seq = 'U';
- else if (*seq == 't') *seq = 'u';
- }
-}
-
-
-/* Function: ToIUPAC()
- *
- * Purpose: Convert X's, o's, other junk in a nucleic acid sequence to N's,
- * to comply with IUPAC code. If is_aseq is TRUE, will allow gap
- * characters though, so we can call ToIUPAC() on aligned seqs.
- *
- * NUCLEOTIDES is defined in squid.h as:
- * "ACGTUNRYMKSWHBVDacgtunrymkswhbvd"
- * gap chars allowed by isgap() are defined in squid.h as:
- * " ._-~"
- *
- * WU-BLAST's pressdb will
- * choke on X's, for instance, necessitating conversion
- * of certain genome centers' data.
- */
-void
-ToIUPAC(char *seq, int is_aseq)
-{
- if (is_aseq) {
- for (; *seq != '\0'; seq++)
- if (strchr(NUCLEOTIDES, *seq) == NULL && ! isgap(*seq)) *seq = 'N';
- } else {
- for (; *seq != '\0'; seq++)
- if (strchr(NUCLEOTIDES, *seq) == NULL) *seq = 'N';
- }
-}
-
-
-/* Function: addseq()
- *
- * Purpose: Add a line of sequence to the growing string in V.
- *
- * In the seven supported unaligned formats, all sequence
- * lines may contain whitespace that must be filtered out;
- * four formats (PIR, EMBL, Genbank, GCG) include coordinates
- * that must be filtered out. Thus an (!isdigit && !isspace)
- * test on each character before we accept it.
- */
-static void
-addseq(char *s, struct ReadSeqVars *V)
-{
- char *s0;
- char *sq;
- int rpl; /* valid residues per line */
- int bpl; /* characters per line */
-
- if (V->ssimode == -1)
- { /* Normal mode: keeping the seq */
- /* Make sure we have enough room. We know that s is <= buflen,
- * so just make sure we've got room for a whole new buflen worth
- * of sequence.
- */
- if (V->seqlen + V->buflen > V->maxseq) {
- V->maxseq += MAX(V->buflen, kStartLength);
- V->seq = ReallocOrDie (V->seq, V->maxseq+1);
- }
-
- sq = V->seq + V->seqlen;
- while (*s != 0) {
-#ifdef CLUSTALO
- if (! isdigit((int) *s) && ! isspace((int) *s) && isprint((int) *s)) {
-#else
- if (! isdigit((int) *s) && ! isspace((int) *s)) {
-#endif
- *sq = *s;
- sq++;
- }
- s++;
- }
- V->seqlen = sq - V->seq;
- }
- else /* else: indexing mode, discard the seq */
- {
- s0 = s;
- rpl = 0;
- while (*s != 0) {
- if (! isdigit((int) *s) && ! isspace((int) *s)) {
- rpl++;
- }
- s++;
- }
- V->seqlen += rpl;
- bpl = s - s0;
-
- /* Keep track of the global rpl, bpl for the file.
- * This is overly complicated because we have to
- * allow the last line of each record (e.g. the last addseq() call
- * on each sequence) to have a different length - and sometimes
- * we'll have one-line sequence records, too. Thus we only
- * do something with the global V->rpl when we have *passed over*
- * a line - we keep the last line's rpl in last_rpl. And because
- * a file might consist entirely of single-line records, we keep
- * a third guy, maxrpl, that tells us the maximum rpl of any line
- * in the file. If we reach the end of file and rpl is still unset,
- * we'll set it to maxrpl. If we reach eof and rpl is set, but is
- * less than maxrpl, that's a weird case where a last line in some
- * record is longer than every other line.
- */
- if (V->rpl != 0) { /* 0 means we already know rpl is invalid */
- if (V->lastrpl > 0) { /* we're on something that's not the first line */
- if (V->rpl > 0 && V->lastrpl != V->rpl) V->rpl = 0;
- else if (V->rpl == -1) V->rpl = V->lastrpl;
- }
- V->lastrpl = rpl;
- if (rpl > V->maxrpl) V->maxrpl = rpl; /* make sure we check max length of final lines */
- }
- if (V->bpl != 0) { /* 0 means we already know bpl is invalid */
- if (V->lastbpl > 0) { /* we're on something that's not the first line */
- if (V->bpl > 0 && V->lastbpl != V->bpl) V->bpl = 0;
- else if (V->bpl == -1) V->bpl = V->lastbpl;
- }
- V->lastbpl = bpl;
- if (bpl > V->maxbpl) V->maxbpl = bpl; /* make sure we check max length of final lines */
- }
- } /* end of indexing mode of addseq(). */
-
-}
-
-static void
-readLoop(int addfirst, int (*endTest)(char *,int *), struct ReadSeqVars *V)
-{
- int addend = 0;
- int done = 0;
-
- V->seqlen = 0;
- V->lastrpl = V->lastbpl = 0;
- if (addfirst) {
- if (V->ssimode >= 0) V->d_off = V->ssioffset;
- addseq(V->buf, V);
- } else if (V->ssimode >= 0)
- if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->d_off)))
- Die("SSIGetFilePosition() failed");
-
- do {
- SeqfileGetLine(V);
- /* feof() alone is a bug; files not necessarily \n terminated */
- if (*(V->buf) == '\0' && feof(V->f))
- done = TRUE;
- done |= (*endTest)(V->buf, &addend);
- if (addend || !done)
- addseq(V->buf, V);
- } while (!done);
-}
-
-
-static int
-endPIR(char *s, int *addend)
-{
- *addend = 0;
- if ((strncmp(s, "///", 3) == 0) ||
- (strncmp(s, "ENTRY", 5) == 0))
- return 1;
- else
- return 0;
-}
-
-static void
-readPIR(struct ReadSeqVars *V)
-{
- char *sptr;
- /* load first line of entry */
- while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) {
- SeqfileGetLine(V);
- }
- if (feof(V->f)) return;
- if (V->ssimode >= 0) V->r_off = V->ssioffset;
-
- if ((sptr = strtok(V->buf + 15, "\n\t ")) != NULL)
- {
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
- }
- do {
- SeqfileGetLine(V);
- if (!feof(V->f) && strncmp(V->buf, "TITLE", 5) == 0)
- SetSeqinfoString(V->sqinfo, V->buf+15, SQINFO_DESC);
- else if (!feof(V->f) && strncmp(V->buf, "ACCESSION", 9) == 0)
- {
- if ((sptr = strtok(V->buf+15, " \t\n")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
- }
- } while (! feof(V->f) && (strncmp(V->buf,"SEQUENCE", 8) != 0));
- SeqfileGetLine(V); /* skip next line, coords */
-
- readLoop(0, endPIR, V);
-
- /* reading a real PIR-CODATA database file, we keep the source coords
- */
- V->sqinfo->start = 1;
- V->sqinfo->stop = V->seqlen;
- V->sqinfo->olen = V->seqlen;
- V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
-
- /* get next line
- */
- while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) {
- SeqfileGetLine(V);
- }
-}
-
-
-
-static int
-endIG(char *s, int *addend)
-{
- *addend = 1; /* 1 or 2 occur in line w/ bases */
- return((strchr(s,'1')!=NULL) || (strchr(s,'2')!=NULL));
-}
-
-static void
-readIG(struct ReadSeqVars *V)
-{
- char *nm;
- /* position past ';' comments */
- do {
- SeqfileGetLine(V);
- } while (! (feof(V->f) || ((*V->buf != 0) && (*V->buf != ';')) ));
-
- if (!feof(V->f))
- {
- if ((nm = strtok(V->buf, "\n\t ")) != NULL)
- SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
-
- readLoop(0, endIG, V);
- }
-
- while (!(feof(V->f) || ((*V->buf != '\0') && (*V->buf == ';'))))
- SeqfileGetLine(V);
-}
-
-static int
-endStrider(char *s, int *addend)
-{
- *addend = 0;
- return (strstr( s, "//") != NULL);
-}
-
-static void
-readStrider(struct ReadSeqVars *V)
-{
- char *nm;
-
- while ((!feof(V->f)) && (*V->buf == ';'))
- {
- if (strncmp(V->buf,"; DNA sequence", 14) == 0)
- {
- if ((nm = strtok(V->buf+16, ",\n\t ")) != NULL)
- SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
- }
- SeqfileGetLine(V);
- }
-
- if (! feof(V->f))
- readLoop(1, endStrider, V);
-
- /* load next line
- */
- while ((!feof(V->f)) && (*V->buf != ';'))
- SeqfileGetLine(V);
-}
-
-
-static int
-endGB(char *s, int *addend)
-{
- *addend = 0;
- return ((strstr(s,"//") != NULL) || (strstr(s,"LOCUS") == s));
-}
-
-static void
-readGenBank(struct ReadSeqVars *V)
-{
- char *sptr;
- int in_definition;
-
- /* We'll map three genbank identifiers onto names:
- * LOCUS -> sqinfo.name
- * ACCESSION -> sqinfo.acc [primary accession only]
- * VERSION -> sqinfo.id
- * We don't currently store the GI number, or secondary accessions.
- */
- while (strncmp(V->buf, "LOCUS", 5) != 0) {
- SeqfileGetLine(V);
- }
- if (V->ssimode >= 0) V->r_off = V->ssioffset;
-
- if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
-
- in_definition = FALSE;
- while (! feof(V->f))
- {
- SeqfileGetLine(V);
- if (! feof(V->f) && strstr(V->buf, "DEFINITION") == V->buf)
- {
- if ((sptr = strtok(V->buf+12, "\n")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
- in_definition = TRUE;
- }
- else if (! feof(V->f) && strstr(V->buf, "ACCESSION") == V->buf)
- {
- if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
- in_definition = FALSE;
- }
- else if (! feof(V->f) && strstr(V->buf, "VERSION") == V->buf)
- {
- if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
- in_definition = FALSE;
- }
- else if (strncmp(V->buf,"ORIGIN", 6) != 0)
- {
- if (in_definition)
- SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC);
- }
- else
- break;
- }
-
- readLoop(0, endGB, V);
-
- /* reading a real GenBank database file, we keep the source coords
- */
- V->sqinfo->start = 1;
- V->sqinfo->stop = V->seqlen;
- V->sqinfo->olen = V->seqlen;
- V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
-
-
- while (!(feof(V->f) || ((*V->buf!=0) && (strstr(V->buf,"LOCUS") == V->buf))))
- SeqfileGetLine(V);
- /* SRE: V->s now holds "//", so sequential
- reads are wedged: fixed Tue Jul 13 1993 */
- while (!feof(V->f) && strstr(V->buf, "LOCUS ") != V->buf)
- SeqfileGetLine(V);
-}
-
-static int
-endGCGdata(char *s, int *addend)
-{
- *addend = 0;
- return (*s == '>');
-}
-
-static void
-readGCGdata(struct ReadSeqVars *V)
-{
- int binary = FALSE; /* whether data are binary or not */
- int blen = 0; /* length of binary sequence */
-
- /* first line contains ">>>>" followed by name */
- if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->buf, 2))
- {
- binary = TRUE;
- SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
- blen = atoi(sqd_parse[2]);
- }
- else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->buf, 1))
- SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
- else
- Die("bogus GCGdata format? %s", V->buf);
-
- /* second line contains free text description */
- SeqfileGetLine(V);
- SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC);
-
- if (binary) {
- /* allocate for blen characters +3... (allow for 3 bytes of slop) */
- if (blen >= V->maxseq) {
- V->maxseq = blen;
- if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL)
- Die("malloc failed");
- }
- /* read (blen+3)/4 bytes from file */
- if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4))
- Die("fread failed");
- V->seqlen = blen;
- /* convert binary code to seq */
- GCGBinaryToSequence(V->seq, blen);
- }
- else readLoop(0, endGCGdata, V);
-
- while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>'))))
- SeqfileGetLine(V);
-}
-
-static int
-endPearson(char *s, int *addend)
-{
- *addend = 0;
- return(*s == '>');
-}
-
-static void
-readPearson(struct ReadSeqVars *V)
-{
- char *sptr;
-
- if (V->ssimode >= 0) V->r_off = V->ssioffset;
-
- if (*V->buf != '>')
- Die("\
-File %s does not appear to be in FASTA format at line %d.\n\
-You may want to specify the file format on the command line.\n\
-Usually this is done with an option --informat <fmt>.\n",
- V->fname, V->linenumber);
-
- if ((sptr = strtok(V->buf+1, "\n\t ")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
- if ((sptr = strtok(NULL, "\n")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
-
- readLoop(0, endPearson, V);
-
- while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>')))) {
- SeqfileGetLine(V);
- }
-}
-
-
-static int
-endEMBL(char *s, int *addend)
-{
- *addend = 0;
- /* Some people (Berlin 5S rRNA database, f'r instance) use
- * an extended EMBL format that attaches extra data after
- * the sequence -- watch out for that. We use the fact that
- * real EMBL sequence lines begin with five spaces.
- *
- * We can use this as the sole end test because readEMBL() will
- * advance to the next ID line before starting to read again.
- */
- return (strncmp(s," ",5) != 0);
-/* return ((strstr(s,"//") != NULL) || (strstr(s,"ID ") == s)); */
-}
-
-static void
-readEMBL(struct ReadSeqVars *V)
-{
- char *sptr;
-
- /* make sure we have first line */
- while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) {
- SeqfileGetLine(V);
- }
- if (V->ssimode >= 0) V->r_off = V->ssioffset;
-
- if ((sptr = strtok(V->buf+5, "\n\t ")) != NULL)
- {
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
- }
-
- do {
- SeqfileGetLine(V);
- if (!feof(V->f) && strstr(V->buf, "AC ") == V->buf)
- {
- if ((sptr = strtok(V->buf+5, "; \t\n")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
- }
- else if (!feof(V->f) && strstr(V->buf, "DE ") == V->buf)
- {
- if ((sptr = strtok(V->buf+5, "\n")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
- }
- } while (! feof(V->f) && strncmp(V->buf,"SQ",2) != 0);
-
- readLoop(0, endEMBL, V);
-
- /* Hack for Staden experiment files: convert - to N
- */
- if (V->ssimode == -1) /* if we're in ssi mode, we're not keeping the seq */
- for (sptr = V->seq; *sptr != '\0'; sptr++)
- if (*sptr == '-') *sptr = 'N';
-
- /* reading a real EMBL database file, we keep the source coords
- */
- V->sqinfo->start = 1;
- V->sqinfo->stop = V->seqlen;
- V->sqinfo->olen = V->seqlen;
- V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
-
- /* load next record's ID line */
- while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) {
- SeqfileGetLine(V);
- }
-
-}
-
-
-static int
-endZuker(char *s, int *addend)
-{
- *addend = 0;
- return( *s == '(' );
-}
-
-static void
-readZuker(struct ReadSeqVars *V)
-{
- char *sptr;
-
- SeqfileGetLine(V); /*s == "seqLen seqid string..."*/
-
- if ((sptr = strtok(V->buf+6, " \t\n")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
-
- if ((sptr = strtok(NULL, "\n")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
-
- readLoop(0, endZuker, V);
-
- while (!(feof(V->f) | ((*V->buf != '\0') & (*V->buf == '('))))
- SeqfileGetLine(V);
-}
-
-static void
-readUWGCG(struct ReadSeqVars *V)
-{
- char *si;
- char *sptr;
- int done;
-
- V->seqlen = 0;
-
- /*writeseq: " %s Length: %d (today) Check: %d ..\n" */
- /*drop above or ".." from id*/
- if ((si = strstr(V->buf," Length: ")) != NULL) *si = 0;
- else if ((si = strstr(V->buf,"..")) != NULL) *si = 0;
-
- if ((sptr = strtok(V->buf, "\n\t ")) != NULL)
- SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
-
- do {
- done = feof(V->f);
- SeqfileGetLine(V);
- if (! done) addseq(V->buf, V);
- } while (!done);
-}
-
-
-/* Function: ReadSeq()
- *
- * Purpose: Read next sequence from an open database file.
- * Return the sequence and associated info.
- *
- * Args: fp - open sequence database file pointer
- * format - format of the file (previously determined
- * by call to SeqfileFormat()).
- * Currently unused, since we carry it in V.
- * ret_seq - RETURN: sequence
- * sqinfo - RETURN: filled in w/ other information
- *
- * Limitations: uses squid_errno, so it's not threadsafe.
- *
- * Return: 1 on success, 0 on failure.
- * ret_seq and some field of sqinfo are allocated here,
- * The preferred call mechanism to properly free the memory is:
- *
- * SQINFO sqinfo;
- * char *seq;
- *
- * ReadSeq(fp, format, &seq, &sqinfo);
- * ... do something...
- * FreeSequence(seq, &sqinfo);
- */
-int
-ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo)
-{
- int gotuw;
-
- squid_errno = SQERR_OK;
-
- /* Here's the hack for sequential access of sequences from
- * the multiple sequence alignment formats
- */
- if (IsAlignmentFormat(V->format))
- {
- if (V->msa->lastidx >= V->msa->nseq)
- { /* out of data. try to read another alignment */
- MSAFree(V->msa);
- if ((V->msa = MSAFileRead(V->afp)) == NULL)
- return 0;
- V->msa->lastidx = 0;
- }
- /* copy and dealign the appropriate aligned seq */
-/* AW: stopping squid from dealigning sequences and corresponding info */
-#ifdef CLUSTALO
- V->seq = sre_strdup(V->msa->aseq[V->msa->lastidx], V->msa->alen);
-#else
- MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
- V->msa->aseq[V->msa->lastidx], &(V->seq));
-#endif
- V->seqlen = strlen(V->seq);
-
- /* Extract sqinfo stuff for this sequence from the msa.
- * Tedious; code that should be cleaned.
- */
- sqinfo->flags = 0;
- if (V->msa->sqname[V->msa->lastidx] != NULL)
- SetSeqinfoString(sqinfo, V->msa->sqname[V->msa->lastidx], SQINFO_NAME);
- if (V->msa->sqacc != NULL && V->msa->sqacc[V->msa->lastidx] != NULL)
- SetSeqinfoString(sqinfo, V->msa->sqacc[V->msa->lastidx], SQINFO_ACC);
- if (V->msa->sqdesc != NULL && V->msa->sqdesc[V->msa->lastidx] != NULL)
- SetSeqinfoString(sqinfo, V->msa->sqdesc[V->msa->lastidx], SQINFO_DESC);
- if (V->msa->ss != NULL && V->msa->ss[V->msa->lastidx] != NULL) {
-/* AW: stopping squid from dealigning sequences and corresponding info */
-#ifdef CLUSTALO
- sqinfo->ss = sre_strdup(V->msa->ss[V->msa->lastidx], V->msa->alen);
-#else
- MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
- V->msa->ss[V->msa->lastidx], &(sqinfo->ss));
-#endif
- sqinfo->flags |= SQINFO_SS;
- }
- if (V->msa->sa != NULL && V->msa->sa[V->msa->lastidx] != NULL) {
-/* AW: stopping squid from dealigning sequences and corresponding info */
-#ifdef CLUSTALO
- sqinfo->sa = sre_strdup(V->msa->sa[V->msa->lastidx], V->msa->alen);
-#else
- MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
- V->msa->sa[V->msa->lastidx], &(sqinfo->sa));
-#endif
- sqinfo->flags |= SQINFO_SA;
- }
- V->msa->lastidx++;
- }
- else {
- if (feof(V->f)) return 0;
-
- if (V->ssimode == -1) { /* normal mode */
- V->seq = (char*) calloc (kStartLength+1, sizeof(char));
- V->maxseq = kStartLength;
- } else { /* index mode: discarding seq */
- V->seq = NULL;
- V->maxseq = 0;
- }
- V->seqlen = 0;
- V->sqinfo = sqinfo;
- V->sqinfo->flags = 0;
-
- switch (V->format) {
- case SQFILE_IG : readIG(V); break;
- case SQFILE_STRIDER : readStrider(V); break;
- case SQFILE_GENBANK : readGenBank(V); break;
- case SQFILE_FASTA : readPearson(V); break;
- case SQFILE_EMBL : readEMBL(V); break;
- case SQFILE_ZUKER : readZuker(V); break;
- case SQFILE_PIR : readPIR(V); break;
- case SQFILE_GCGDATA : readGCGdata(V); break;
-
- case SQFILE_GCG :
- do { /* skip leading comments on GCG file */
- gotuw = (strstr(V->buf,"..") != NULL);
- if (gotuw) readUWGCG(V);
- SeqfileGetLine(V);
- } while (! feof(V->f));
- break;
-
- case SQFILE_IDRAW: /* SRE: no attempt to read idraw postscript */
- default:
- squid_errno = SQERR_FORMAT;
- free(V->seq);
- return 0;
- }
- if (V->seq != NULL) /* (it can be NULL in indexing mode) */
- V->seq[V->seqlen] = 0; /* stick a string terminator on it */
- }
-
- /* Cleanup
- */
- sqinfo->len = V->seqlen;
- sqinfo->flags |= SQINFO_LEN;
- *ret_seq = V->seq;
- if (squid_errno == SQERR_OK) return 1; else return 0;
-}
-
-/* Function: SeqfileFormat()
- * Date: SRE, Tue Jun 22 10:58:58 1999 [Sanger Centre]
- *
- * Purpose: Determine format of an open file.
- * Returns format code.
- * Rewinds the file.
- *
- * Autodetects the following unaligned formats:
- * SQFILE_FASTA
- * SQFILE_GENBANK
- * SQFILE_EMBL
- * SQFILE_GCG
- * SQFILE_GCGDATA
- * SQFILE_PIR
- * Also autodetects the following alignment formats:
- * MSAFILE_STOCKHOLM
- * MSAFILE_MSF
- * MSAFILE_CLUSTAL
- * MSAFILE_SELEX
- * MSAFILE_PHYLIP
- *
- * Can't autodetect MSAFILE_A2M, calls it SQFILE_FASTA.
- * MSAFileFormat() does the opposite.
- *
- * Args: sfp - open SQFILE
- *
- * Return: format code, or SQFILE_UNKNOWN if unrecognized
- */
-int
-SeqfileFormat(FILE *fp)
-{
- char *buf;
- int len;
- int fmt = SQFILE_UNKNOWN;
- int ndataline;
- char *bufcpy, *s, *s1, *s2;
- int has_junk;
-
- buf = NULL;
- len = 0;
- ndataline = 0;
- has_junk = FALSE;
- while (sre_fgets(&buf, &len, fp) != NULL)
- {
- if (IsBlankline(buf)) continue;
-
- /* Well-behaved formats identify themselves in first nonblank line.
- */
- if (ndataline == 0)
- {
- if (strncmp(buf, ">>>>", 4) == 0 && strstr(buf, "Len: "))
- { fmt = SQFILE_GCGDATA; goto DONE; }
-
- if (buf[0] == '>')
- { fmt = SQFILE_FASTA; goto DONE; }
-
- if (strncmp(buf, "!!AA_SEQUENCE", 13) == 0 ||
- strncmp(buf, "!!NA_SEQUENCE", 13) == 0)
- { fmt = SQFILE_GCG; goto DONE; }
-
- if (strncmp(buf, "# STOCKHOLM 1.", 14) == 0)
- { fmt = MSAFILE_STOCKHOLM; goto DONE; }
-
- if (strncmp(buf, "CLUSTAL", 7) == 0 &&
- strstr(buf, "multiple sequence alignment") != NULL)
- { fmt = MSAFILE_CLUSTAL; goto DONE; }
-
- if (strncmp(buf, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0 ||
- strncmp(buf, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0)
- { fmt = MSAFILE_MSF; goto DONE; }
-
- /* PHYLIP id: also just a good bet */
- bufcpy = sre_strdup(buf, -1);
- s = bufcpy;
- if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) != NULL &&
- (s2 = sre_strtok(&s, WHITESPACE, NULL)) != NULL &&
- IsInt(s1) &&
- IsInt(s2))
- { free(bufcpy); fmt = MSAFILE_PHYLIP; goto DONE; }
- free(bufcpy);
- }
-
- /* We trust that other formats identify themselves soon.
- */
- /* dead giveaways for extended SELEX */
- if (strncmp(buf, "#=AU", 4) == 0 ||
- strncmp(buf, "#=ID", 4) == 0 ||
- strncmp(buf, "#=AC", 4) == 0 ||
- strncmp(buf, "#=DE", 4) == 0 ||
- strncmp(buf, "#=GA", 4) == 0 ||
- strncmp(buf, "#=TC", 4) == 0 ||
- strncmp(buf, "#=NC", 4) == 0 ||
- strncmp(buf, "#=SQ", 4) == 0 ||
- strncmp(buf, "#=SS", 4) == 0 ||
- strncmp(buf, "#=CS", 4) == 0 ||
- strncmp(buf, "#=RF", 4) == 0)
- { fmt = MSAFILE_SELEX; goto DONE; }
-
- if (strncmp(buf, "///", 3) == 0 || strncmp(buf, "ENTRY ", 6) == 0)
- { fmt = SQFILE_PIR; goto DONE; }
-
- /* a ha, diagnostic of an (old) MSF file */
- if ((strstr(buf, "..") != NULL) &&
- (strstr(buf, "MSF:") != NULL) &&
- (strstr(buf, "Check:")!= NULL))
- { fmt = MSAFILE_MSF; goto DONE; }
-
- /* unaligned GCG (must follow MSF test!) */
- if (strstr(buf, " Check: ") != NULL && strstr(buf, "..") != NULL)
- { fmt = SQFILE_GCG; goto DONE; }
-
- if (strncmp(buf,"LOCUS ",6) == 0 || strncmp(buf,"ORIGIN ",6) == 0)
- { fmt = SQFILE_GENBANK; goto DONE; }
-
- if (strncmp(buf,"ID ",5) == 0 || strncmp(buf,"SQ ",5) == 0)
- { fmt = SQFILE_EMBL; goto DONE; }
-
- /* But past here, we're being desperate. A simple SELEX file is
- * very difficult to detect; we can only try to disprove it.
- */
- s = buf;
- if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; /* skip blank lines */
- if (strchr("#%", *s1) != NULL) continue; /* skip comment lines */
-
- /* Disproof 1. Noncomment, nonblank lines in a SELEX file
- * must have at least two space-delimited fields (name/seq)
- */
- if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL)
- has_junk = TRUE;
-
- /* Disproof 2.
- * The sequence field should look like a sequence.
- */
- if (s2 != NULL && Seqtype(s2) == kOtherSeq)
- has_junk = TRUE;
-
- ndataline++;
- if (ndataline == 300) break; /* only look at first 300 lines */
- }
-
- if (ndataline == 0)
- Die("Sequence file contains no data");
-
- /* If we've made it this far, we've run out of data, but there
- * was at least one line of it; check if we've
- * disproven SELEX. If not, cross our fingers, pray, and guess SELEX.
- */
- if (has_junk == TRUE) fmt = SQFILE_UNKNOWN;
- else fmt = MSAFILE_SELEX;
-
- DONE:
- if (buf != NULL) free(buf);
- rewind(fp);
- return fmt;
-}
-
-/* Function: GCGBinaryToSequence()
- *
- * Purpose: Convert a GCG 2BIT binary string to DNA sequence.
- * 0 = C 1 = T 2 = A 3 = G
- * 4 nts/byte
- *
- * Args: seq - binary sequence. Converted in place to DNA.
- * len - length of DNA. binary is (len+3)/4 bytes
- */
-int
-GCGBinaryToSequence(char *seq, int len)
-{
- int bpos; /* position in binary */
- int spos; /* position in sequence */
- char twobit;
- int i;
-
- for (bpos = (len-1)/4; bpos >= 0; bpos--)
- {
- twobit = seq[bpos];
- spos = bpos*4;
-
- for (i = 3; i >= 0; i--)
- {
- switch (twobit & 0x3) {
- case 0: seq[spos+i] = 'C'; break;
- case 1: seq[spos+i] = 'T'; break;
- case 2: seq[spos+i] = 'A'; break;
- case 3: seq[spos+i] = 'G'; break;
- }
- twobit = twobit >> 2;
- }
- }
- seq[len] = '\0';
- return 1;
-}
-
-
-/* Function: GCGchecksum()
- * Date: SRE, Mon May 31 11:13:21 1999 [St. Louis]
- *
- * Purpose: Calculate a GCG checksum for a sequence.
- * Code provided by Steve Smith of Genetics
- * Computer Group.
- *
- * Args: seq - sequence to calculate checksum for.
- * may contain gap symbols.
- * len - length of sequence (usually known,
- * so save a strlen() call)
- *
- * Returns: GCG checksum.
- */
-int
-GCGchecksum(char *seq, int len)
-{
- int i; /* position in sequence */
- int chk = 0; /* calculated checksum */
-
- for (i = 0; i < len; i++)
- chk = (chk + (i % 57 + 1) * (sre_toupper((int) seq[i]))) % 10000;
- return chk;
-}
-
-
-/* Function: GCGMultchecksum()
- *
- * Purpose: GCG checksum for a multiple alignment: sum of
- * individual sequence checksums (including their
- * gap characters) modulo 10000.
- *
- * Implemented using spec provided by Steve Smith of
- * Genetics Computer Group.
- *
- * Args: seqs - sequences to be checksummed; aligned or not
- * nseq - number of sequences
- *
- * Return: the checksum, a number between 0 and 9999
- */
-int
-GCGMultchecksum(char **seqs, int nseq)
-{
- int chk = 0;
- int idx;
-
- for (idx = 0; idx < nseq; idx++)
- chk = (chk + GCGchecksum(seqs[idx], strlen(seqs[idx]))) % 10000;
- return chk;
-}
-
-
-
-
-/* Function: Seqtype()
- *
- * Purpose: Returns a (very good) guess about type of sequence:
- * kDNA, kRNA, kAmino, or kOtherSeq.
- *
- * Modified from, and replaces, Gilbert getseqtype().
- */
-int
-Seqtype(char *seq)
-{
- int saw; /* how many non-gap characters I saw */
- char c;
- int po = 0; /* count of protein-only */
- int nt = 0; /* count of t's */
- int nu = 0; /* count of u's */
- int na = 0; /* count of nucleotides */
- int aa = 0; /* count of amino acids */
- int no = 0; /* count of others */
-
- /* Look at the first 300 non-gap characters
- */
-
-#ifdef CLUSTALO
- /* VGGNGDDYLSGGTGNDTL is recognized as unknown using squid's default
- * approach.
- * We change it to the following:
-
- * 1. counting: ignore gaps and not alpha characters. if protein-only then
- * count as such (po). otherwise decide if amino-acid (aa) or nucleic-acid
- * (na) or unknown (no)
- *
- * 2. determine type: if we saw more unknown than aa or na, return unknown.
- * if encountered protein-only return protein-only. otherwise decide based
- * on majority. (if aa==na return na)
- */
- for (saw = 0; *seq != '\0' && saw < 300; seq++) {
- c = sre_toupper((int) *seq);
- int unknown = 1;
-
- if (isgap(c) || ! isalpha((int) c)) {
- continue;
- }
-
- if (strchr(protonly, c)) {
- po++;
- unknown = 0;
- }
-
- if (strchr(aminos,c)) {
- aa++;
- unknown = 0;
- }
-
- if (strchr(primenuc,c)) {
- na++;
- unknown = 0;
-
- if (c == 'T')
- nt++;
- else if (c == 'U')
- nu++;
- }
-
- if (unknown) {
- no ++;
- }
-
- saw++;
- }
-
- if (no > aa && no > na)
- return kOtherSeq;
-
- if (po > 0 || aa>na)
- return kAmino;
-
- if (na >= aa) {
- if (nu > nt)
- return kRNA;
- else
- return kDNA;
- }
-
- return kOtherSeq;
-
-
-#else
- for (saw = 0; *seq != '\0' && saw < 300; seq++)
- {
- c = sre_toupper((int) *seq);
- if (! isgap(c))
- {
- if (strchr(protonly, c)) po++;
- else if (strchr(primenuc,c)) {
- na++;
- if (c == 'T') nt++;
- else if (c == 'U') nu++;
- }
- else if (strchr(aminos,c)) aa++;
- else if (isalpha((int) c)) no++;
- saw++;
- }
- }
-
- if (no > 0) return kOtherSeq;
- else if (po > 0) return kAmino;
- else if (na > aa) {
- if (nu > nt) return kRNA;
- else return kDNA;
- }
- else return kAmino; /* ooooh. risky. */
-#endif
-
-}
-
-
-/* Function: GuessAlignmentSeqtype()
- * Date: SRE, Wed Jul 7 09:42:34 1999 [St. Louis]
- *
- * Purpose: Try to guess whether an alignment is protein
- * or nucleic acid; return a code for the
- * type (kRNA, kDNA, or kAmino).
- *
- * Args: aseq - array of aligned sequences. (Could also
- * be an rseq unaligned sequence array)
- * nseq - number of aseqs
- *
- * Returns: kRNA, kDNA, kAmino;
- * kOtherSeq if inconsistency is detected.
- */
-int
-GuessAlignmentSeqtype(char **aseq, int nseq)
-{
- int idx;
- int nrna = 0;
- int ndna = 0;
- int namino = 0;
- int nother = 0;
-
- for (idx = 0; idx < nseq; idx++)
- switch (Seqtype(aseq[idx])) {
- case kRNA: nrna++; break;
- case kDNA: ndna++; break;
- case kAmino: namino++; break;
- default: nother++;
- }
-
- /* Unambiguous decisions:
- */
- if (nother) return kOtherSeq;
- if (namino == nseq) return kAmino;
- if (ndna == nseq) return kDNA;
- if (nrna == nseq) return kRNA;
-
- /* Ambiguous decisions:
- */
- if (namino == 0) return kRNA; /* it's nucleic acid, but seems mixed RNA/DNA */
- return kAmino; /* some amino acid seen; others probably short seqs, some
- of which may be entirely ACGT (ala,cys,gly,thr). We
- could be a little more sophisticated: U would be a giveaway
- that we're not in protein seqs */
-}
-
-/* Function: WriteSimpleFASTA()
- * Date: SRE, Tue Nov 16 18:06:00 1999 [St. Louis]
- *
- * Purpose: Just write a FASTA format sequence to a file;
- * minimal interface, mostly for quick and dirty programs.
- *
- * Args: fp - open file handle (stdout, possibly)
- * seq - sequence to output
- * name - name for the sequence
- * desc - optional description line, or NULL.
- *
- * Returns: void
- */
-void
-WriteSimpleFASTA(FILE *fp, char *seq, char *name, char *desc)
-{
- char buf[61];
- int len;
- int pos;
-
- len = strlen(seq);
- buf[60] = '\0';
- fprintf(fp, ">%s %s\n", name, desc != NULL ? desc : "");
- for (pos = 0; pos < len; pos += 60)
- {
- strncpy(buf, seq+pos, 60);
- fprintf(fp, "%s\n", buf);
- }
-}
-
-int
-WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo)
-{
- int numline = 0;
- int lines = 0, spacer = 0, width = 50, tab = 0;
- int i, j, l, l1, ibase;
- char endstr[10];
- char s[100]; /* buffer for sequence */
- char ss[100]; /* buffer for structure */
- int checksum = 0;
- int seqlen;
- int which_case; /* 0 = do nothing. 1 = upper case. 2 = lower case */
- int dostruc; /* TRUE to print structure lines*/
-
- which_case = 0;
- dostruc = FALSE;
- seqlen = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq);
-
- if (IsAlignmentFormat(outform))
- Die("Tried to write an aligned format with WriteSeq() -- bad, bad.");
-
-
- strcpy( endstr,"");
- l1 = 0;
- checksum = GCGchecksum(seq, seqlen);
-
- switch (outform) {
- case SQFILE_UNKNOWN: /* no header, just sequence */
- strcpy(endstr,"\n"); /* end w/ extra blank line */
- break;
-
- case SQFILE_GENBANK:
- fprintf(outf,"LOCUS %s %d bp\n",
- sqinfo->name, seqlen);
- fprintf(outf,"ACCESSION %s\n",
- (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : ".");
- fprintf(outf,"DEFINITION %s\n",
- (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ".");
- fprintf(outf,"VERSION %s\n",
- (sqinfo->flags & SQINFO_ID) ? sqinfo->id : ".");
- fprintf(outf,"ORIGIN \n");
- spacer = 11;
- numline = 1;
- strcpy(endstr, "\n//");
- break;
-
- case SQFILE_GCGDATA:
- fprintf(outf, ">>>>%s 9/95 ASCII Len: %d\n", sqinfo->name, seqlen);
- fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
- break;
-
- case SQFILE_PIR:
- fprintf(outf, "ENTRY %s\n",
- (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
- fprintf(outf, "TITLE %s\n",
- (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
- fprintf(outf, "ACCESSION %s\n",
- (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
- fprintf(outf, "SUMMARY #Length %d #Checksum %d\n",
- sqinfo->len, checksum);
- fprintf(outf, "SEQUENCE\n");
- fprintf(outf, " 5 10 15 20 25 30\n");
- spacer = 2; /* spaces after every residue */
- numline = 1; /* number lines w/ coords */
- width = 30; /* 30 aa per line */
- strcpy(endstr, "\n///");
- break;
-
- case SQFILE_SQUID:
- fprintf(outf, "NAM %s\n", sqinfo->name);
- if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN))
- fprintf(outf, "SRC %s %s %d..%d::%d\n",
- (sqinfo->flags & SQINFO_ID) ? sqinfo->id : "-",
- (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-",
- (sqinfo->flags & SQINFO_START) ? sqinfo->start : 0,
- (sqinfo->flags & SQINFO_STOP) ? sqinfo->stop : 0,
- (sqinfo->flags & SQINFO_OLEN) ? sqinfo->olen : 0);
- if (sqinfo->flags & SQINFO_DESC)
- fprintf(outf, "DES %s\n", sqinfo->desc);
- if (sqinfo->flags & SQINFO_SS)
- {
- fprintf(outf, "SEQ +SS\n");
- dostruc = TRUE; /* print structure lines too */
- }
- else
- fprintf(outf, "SEQ\n");
- numline = 1; /* number seq lines w/ coords */
- strcpy(endstr, "\n++");
- break;
-
- case SQFILE_EMBL:
- fprintf(outf,"ID %s\n",
- (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
- fprintf(outf,"AC %s\n",
- (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
- fprintf(outf,"DE %s\n",
- (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
- fprintf(outf,"SQ %d BP\n", seqlen);
- strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
- tab = 5; /** added 31jan91 */
- spacer = 11; /** added 31jan91 */
- break;
-
- case SQFILE_GCG:
- fprintf(outf,"%s\n", sqinfo->name);
- if (sqinfo->flags & SQINFO_ACC)
- fprintf(outf,"ACCESSION %s\n", sqinfo->acc);
- if (sqinfo->flags & SQINFO_DESC)
- fprintf(outf,"DEFINITION %s\n", sqinfo->desc);
- fprintf(outf," %s Length: %d (today) Check: %d ..\n",
- sqinfo->name, seqlen, checksum);
- spacer = 11;
- numline = 1;
- strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */
- break;
-
- case SQFILE_STRIDER: /* ?? map ?*/
- fprintf(outf,"; ### from DNA Strider ;-)\n");
- fprintf(outf,"; DNA sequence %s, %d bases, %d checksum.\n;\n",
- sqinfo->name, seqlen, checksum);
- strcpy(endstr, "\n//");
- break;
-
- /* SRE: Don had Zuker default to Pearson, which is not
- intuitive or helpful, since Zuker's MFOLD can't read
- Pearson format. More useful to use kIG */
- case SQFILE_ZUKER:
- which_case = 1; /* MFOLD requires upper case. */
- /*FALLTHRU*/
- case SQFILE_IG:
- fprintf(outf,";%s %s\n",
- sqinfo->name,
- (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
- fprintf(outf,"%s\n", sqinfo->name);
- strcpy(endstr,"1"); /* == linear dna */
- break;
-
- case SQFILE_RAW: /* Raw: no header at all. */
- break;
-
- default :
- case SQFILE_FASTA:
- fprintf(outf,">%s %s\n", sqinfo->name,
- (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
- break;
- }
-
- if (which_case == 1) s2upper(seq);
- if (which_case == 2) s2lower(seq);
-
-
- width = MIN(width,100);
- for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) {
- if (l1 < 0) l1 = 0;
- else if (l1 == 0) {
- if (numline) fprintf(outf,"%8d ",ibase);
- for (j=0; j<tab; j++) fputc(' ',outf);
- }
- if ((spacer != 0) && ((l+1) % spacer == 1))
- { s[l] = ' '; ss[l] = ' '; l++; }
- s[l] = seq[i];
- ss[l] = (sqinfo->flags & SQINFO_SS) ? sqinfo->ss[i] : '.';
- l++; i++;
- l1++; /* don't count spaces for width*/
- if (l1 == width || i == seqlen) {
- s[l] = ss[l] = '\0';
- l = 0; l1 = 0;
- if (dostruc)
- {
- fprintf(outf, "%s\n", s);
- if (numline) fprintf(outf," ");
- for (j=0; j<tab; j++) fputc(' ',outf);
- if (i == seqlen) fprintf(outf,"%s%s\n",ss,endstr);
- else fprintf(outf,"%s\n",ss);
- }
- else
- {
- if (i == seqlen) fprintf(outf,"%s%s\n",s,endstr);
- else fprintf(outf,"%s\n",s);
- }
- lines++;
- ibase = i+1;
- }
- }
- return lines;
-}
-
-
-/* Function: ReadMultipleRseqs()
- *
- * Purpose: Open a data file and
- * parse it into an array of rseqs (raw, unaligned
- * sequences).
- *
- * Caller is responsible for free'ing memory allocated
- * to ret_rseqs, ret_weights, and ret_names.
- *
- * Weights are currently only supported for MSF format.
- * Sequences read from all other formats will be assigned
- * weights of 1.0. If the caller isn't interested in
- * weights, it passes NULL as ret_weights.
- *
- * Returns 1 on success. Returns 0 on failure and sets
- * squid_errno to indicate the cause.
- */
-int
-ReadMultipleRseqs(char *seqfile,
- int fformat,
- char ***ret_rseqs,
- SQINFO **ret_sqinfo,
- int *ret_num)
-{
- SQINFO *sqinfo; /* array of sequence optional info */
- SQFILE *dbfp; /* open ptr for sequential access of file */
- char **rseqs; /* sequence array */
- int numalloced; /* num of seqs currently alloced for */
- int num;
-
-
- num = 0;
- numalloced = 16;
- rseqs = (char **) MallocOrDie (numalloced * sizeof(char *));
- sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO));
- if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0;
-
- while (ReadSeq(dbfp, dbfp->format, &rseqs[num], &(sqinfo[num])))
- {
- num++;
- if (num == numalloced) /* more seqs coming, alloc more room */
- {
- numalloced += 16;
- rseqs = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *));
- sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO));
- }
- }
- SeqfileClose(dbfp);
-
- *ret_rseqs = rseqs;
- *ret_sqinfo = sqinfo;
- *ret_num = num;
- return 1;
-}
-
-
-/* Function: String2SeqfileFormat()
- * Date: SRE, Sun Jun 27 15:25:54 1999 [TW 723 over Canadian Shield]
- *
- * Purpose: Convert a string (e.g. from command line option arg)
- * to a format code. Case insensitive. Return
- * MSAFILE_UNKNOWN/SQFILE_UNKNOWN if string is bad.
- * Uses codes defined in squid.h (unaligned formats) and
- * msa.h (aligned formats).
- *
- * Args: s - string to convert; e.g. "stockholm"
- *
- * Returns: format code; e.g. MSAFILE_STOCKHOLM
- */
-int
-String2SeqfileFormat(char *s)
-{
- char *s2;
- int code = SQFILE_UNKNOWN;
-
- if (s == NULL) return SQFILE_UNKNOWN;
- s2 = sre_strdup(s, -1);
- s2upper(s2);
-
- if (strcmp(s2, "FASTA") == 0) code = SQFILE_FASTA;
-#ifdef CLUSTALO
- if (strcmp(s2, "FA") == 0) code = SQFILE_FASTA;
- else if (strcmp(s2, "VIENNA") == 0) code = SQFILE_VIENNA;
- else if (strcmp(s2, "VIE") == 0) code = SQFILE_VIENNA;
-#endif
- else if (strcmp(s2, "GENBANK") == 0) code = SQFILE_GENBANK;
-#ifdef CLUSTALO
- else if (strcmp(s2, "GB") == 0) code = SQFILE_GENBANK;
-#endif
- else if (strcmp(s2, "EMBL") == 0) code = SQFILE_EMBL;
- else if (strcmp(s2, "GCG") == 0) code = SQFILE_GCG;
- else if (strcmp(s2, "GCGDATA") == 0) code = SQFILE_GCGDATA;
- else if (strcmp(s2, "RAW") == 0) code = SQFILE_RAW;
- else if (strcmp(s2, "IG") == 0) code = SQFILE_IG;
- else if (strcmp(s2, "STRIDER") == 0) code = SQFILE_STRIDER;
- else if (strcmp(s2, "IDRAW") == 0) code = SQFILE_IDRAW;
- else if (strcmp(s2, "ZUKER") == 0) code = SQFILE_ZUKER;
- else if (strcmp(s2, "PIR") == 0) code = SQFILE_PIR;
- else if (strcmp(s2, "SQUID") == 0) code = SQFILE_SQUID;
- else if (strcmp(s2, "STOCKHOLM") == 0) code = MSAFILE_STOCKHOLM;
-#ifdef CLUSTALO
- else if (strcmp(s2, "ST") == 0) code = MSAFILE_STOCKHOLM;
- else if (strcmp(s2, "STK") == 0) code = MSAFILE_STOCKHOLM;
-#endif
- else if (strcmp(s2, "SELEX") == 0) code = MSAFILE_SELEX;
- else if (strcmp(s2, "MSF") == 0) code = MSAFILE_MSF;
- else if (strcmp(s2, "CLUSTAL") == 0) code = MSAFILE_CLUSTAL;
-#ifdef CLUSTALO
- else if (strcmp(s2, "CLU") == 0) code = MSAFILE_CLUSTAL;
-#endif
- else if (strcmp(s2, "A2M") == 0) code = MSAFILE_A2M;
- else if (strcmp(s2, "PHYLIP") == 0) code = MSAFILE_PHYLIP;
-#ifdef CLUSTALO
- else if (strcmp(s2, "PHY") == 0) code = MSAFILE_PHYLIP;
-#endif
- else if (strcmp(s2, "EPS") == 0) code = MSAFILE_EPS;
-#ifdef CLUSTALO
- else code = SQFILE_UNKNOWN;
-#endif
- free(s2);
- return code;
-}
-char *
-SeqfileFormat2String(int code)
-{
- switch (code) {
- case SQFILE_UNKNOWN: return "unknown";
- case SQFILE_FASTA: return "FASTA";
-#ifdef CLUSTALO
- case SQFILE_VIENNA: return "Vienna";
-#endif
- case SQFILE_GENBANK: return "Genbank";
- case SQFILE_EMBL: return "EMBL";
- case SQFILE_GCG: return "GCG";
- case SQFILE_GCGDATA: return "GCG data library";
- case SQFILE_RAW: return "raw";
- case SQFILE_IG: return "Intelligenetics";
- case SQFILE_STRIDER: return "MacStrider";
- case SQFILE_IDRAW: return "Idraw Postscript";
- case SQFILE_ZUKER: return "Zuker";
- case SQFILE_PIR: return "PIR";
- case SQFILE_SQUID: return "SQUID";
- case MSAFILE_STOCKHOLM: return "Stockholm";
- case MSAFILE_SELEX: return "SELEX";
- case MSAFILE_MSF: return "MSF";
- case MSAFILE_CLUSTAL: return "Clustal";
- case MSAFILE_A2M: return "a2m";
- case MSAFILE_PHYLIP: return "Phylip";
- case MSAFILE_EPS: return "EPS";
- default:
- Die("Bad code passed to MSAFormat2String()");
- }
- /*NOTREACHED*/
- return NULL;
-}
-
-
-/* Function: MSAToSqinfo()
- * Date: SRE, Tue Jul 20 14:36:56 1999 [St. Louis]
- *
- * Purpose: Take an MSA and generate a SQINFO array suitable
- * for use in annotating the unaligned sequences.
- * Return the array.
- *
- * Permanent temporary code. sqinfo was poorly designed.
- * it must eventually be replaced, but the odds
- * of this happening soon are nil, so I have to deal.
- *
- * Args: msa - the alignment
- *
- * Returns: ptr to allocated sqinfo array.
- * Freeing is ghastly: free in each individual sqinfo[i]
- * with FreeSequence(NULL, &(sqinfo[i])), then
- * free(sqinfo).
- */
-SQINFO *
-MSAToSqinfo(MSA *msa)
-{
- int idx;
- SQINFO *sqinfo;
-
- sqinfo = MallocOrDie(sizeof(SQINFO) * msa->nseq);
-
- for (idx = 0; idx < msa->nseq; idx++)
- {
- sqinfo[idx].flags = 0;
- SetSeqinfoString(&(sqinfo[idx]),
- msa->sqname[idx], SQINFO_NAME);
- SetSeqinfoString(&(sqinfo[idx]),
- MSAGetSeqAccession(msa, idx), SQINFO_ACC);
- SetSeqinfoString(&(sqinfo[idx]),
- MSAGetSeqDescription(msa, idx), SQINFO_DESC);
-
- if (msa->ss != NULL && msa->ss[idx] != NULL) {
- MakeDealignedString(msa->aseq[idx], msa->alen,
- msa->ss[idx], &(sqinfo[idx].ss));
- sqinfo[idx].flags |= SQINFO_SS;
- }
-
- if (msa->sa != NULL && msa->sa[idx] != NULL) {
- MakeDealignedString(msa->aseq[idx], msa->alen,
- msa->sa[idx], &(sqinfo[idx].sa));
- sqinfo[idx].flags |= SQINFO_SA;
- }
-
- sqinfo[idx].len = DealignedLength(msa->aseq[idx]);
- sqinfo[idx].flags |= SQINFO_LEN;
- }
- return sqinfo;
-}
-
-
-
-/* cc -o sqio_test -DA_QUIET_DAY -L. sqio.c -lsquid */
-#ifdef A_QUIET_DAY
-#include "ssi.h"
-int
-main(int argc, char **argv)
-{
- FILE *fp;
- char *filename;
- char *buf;
- int len;
- int mode = 3;
- SSIOFFSET off;
-
- filename = argv[1];
-
- if (mode == 1) {
- buf = malloc(sizeof(char) * 256);
- if ((fp = fopen(filename, "r")) == NULL)
- Die("open of %s failed", filename);
- while (fgets(buf, 255, fp) != NULL)
- ;
- fclose(fp);
- free(buf);
- } else if (mode == 2) {
- if ((fp = fopen(filename, "r")) == NULL)
- Die("open of %s failed", filename);
- buf = NULL; len = 0;
- while (sre_fgets(&buf, &len, fp) != NULL)
- SSIGetFilePosition(fp, SSI_OFFSET_I32, &off);
- fclose(fp);
- free(buf);
- } else if (mode == 3) {
- SQFILE *dbfp;
- SQINFO info;
-
- if ((dbfp = SeqfileOpen(filename, SQFILE_FASTA, NULL)) == NULL)
- Die("open of %s failed", filename);
- while (ReadSeq(dbfp, dbfp->format, &buf, &info)) {
- SSIGetFilePosition(dbfp->f, SSI_OFFSET_I32, &off);
- FreeSequence(buf, &info);
- }
- SeqfileClose(dbfp);
- }
-
-}
-
-
-#endif