1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * From: ureadseq.c in Don Gilbert's sequence i/o package
14 * Reads and writes nucleic/protein sequence in various
15 * formats. Data files may have multiple sequences.
17 * Heavily modified from READSEQ package
18 * Copyright (C) 1990 by D.G. Gilbert
19 * Biology Dept., Indiana University, Bloomington, IN 47405
20 * email: gilbertd@bio.indiana.edu
23 * SRE: Modifications as noted. Fri Jul 3 09:44:54 1992
24 * Packaged for squid, Thu Oct 1 10:07:11 1992
25 * ANSI conversion in full swing, Mon Jul 12 12:22:21 1993
27 * CVS $Id: sqio.c,v 1.1.1.1 2005/03/22 08:34:29 cmzmasek Exp $
29 *****************************************************************
30 * Basic API for single sequence reading:
34 * int format; - see squid.h for formats; example: SQFILE_FASTA
38 * if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL)
39 * Die("Failed to open sequence database file %s\n%s\n", seqfile, usage);
40 * while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) {
42 * FreeSequence(seq, &sqinfo);
46 *****************************************************************
62 static void SeqfileGetLine(SQFILE *V);
64 #define kStartLength 500
66 static char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*";
67 static char *primenuc = "ACGTUN";
68 static char *protonly = "EFIPQZ";
70 static SQFILE *seqfile_open(char *filename, int format, char *env, int ssimode);
72 /* Function: SeqfileOpen()
74 * Purpose : Open a sequence database file and prepare for reading
77 * Args: filename - name of file to open
78 * format - format of file
79 * env - environment variable for path (e.g. BLASTDB)
80 * ssimode - -1, SSI_OFFSET_I32, or SSI_OFFSET_I64
82 * Returns opened SQFILE ptr, or NULL on failure.
85 SeqfileOpen(char *filename, int format, char *env)
87 return seqfile_open(filename, format, env, -1);
90 SeqfileOpenForIndexing(char *filename, int format, char *env, int ssimode)
92 return seqfile_open(filename, format, env, ssimode);
95 seqfile_open(char *filename, int format, char *env, int ssimode)
99 dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE));
101 dbfp->ssimode = ssimode;
102 dbfp->rpl = -1; /* flag meaning "unset" */
105 dbfp->bpl = -1; /* flag meaning "unset" */
109 /* Open our file handle.
110 * Three possibilities:
111 * 1. normal file open
112 * 2. filename = "-"; read from stdin
113 * 3. filename = "*.gz"; read thru pipe from gzip
114 * If we're reading from stdin or a pipe, we can't reliably
115 * back up, so we can't do two-pass parsers like the interleaved alignment
118 if (strcmp(filename, "-") == 0)
121 dbfp->do_stdin = TRUE;
122 dbfp->do_gzip = FALSE;
123 dbfp->fname = sre_strdup("[STDIN]", -1);
125 #ifndef SRE_STRICT_ANSI
126 /* popen(), pclose() aren't portable to non-POSIX systems; disable */
127 else if (Strparse("^.*\\.gz$", filename, 0))
131 /* Note that popen() will return "successfully"
132 * if file doesn't exist, because gzip works fine
133 * and prints an error! So we have to check for
134 * existence of file ourself.
136 if (! FileExists(filename))
137 Die("%s: file does not exist", filename);
139 if (strlen(filename) + strlen("gzip -dc ") >= 256)
140 Die("filename > 255 char in SeqfileOpen()");
141 sprintf(cmd, "gzip -dc %s", filename);
142 if ((dbfp->f = popen(cmd, "r")) == NULL)
145 dbfp->do_stdin = FALSE;
146 dbfp->do_gzip = TRUE;
147 dbfp->fname = sre_strdup(filename, -1);
149 #endif /*SRE_STRICT_ANSI*/
152 if ((dbfp->f = fopen(filename, "r")) == NULL &&
153 (dbfp->f = EnvFileOpen(filename, env, NULL)) == NULL)
156 dbfp->do_stdin = FALSE;
157 dbfp->do_gzip = FALSE;
158 dbfp->fname = sre_strdup(filename, -1);
162 /* Invoke autodetection if we haven't already been told what
165 if (format == SQFILE_UNKNOWN)
167 if (dbfp->do_stdin == TRUE || dbfp->do_gzip)
168 Die("Can't autodetect sequence file format from a stdin or gzip pipe");
169 format = SeqfileFormat(dbfp->f);
170 if (format == SQFILE_UNKNOWN)
171 Die("Can't determine format of sequence file %s", dbfp->fname);
174 /* The hack for sequential access of an interleaved alignment file:
175 * read the alignment in, we'll copy sequences out one at a time.
179 dbfp->format = format;
180 dbfp->linenumber = 0;
183 if (IsAlignmentFormat(format))
185 /* We'll be reading from the MSA interface. Copy our data
186 * to the MSA afp's structure.
188 dbfp->afp = MallocOrDie(sizeof(MSAFILE));
189 dbfp->afp->f = dbfp->f; /* just a ptr, don't close */
190 dbfp->afp->do_stdin = dbfp->do_stdin;
191 dbfp->afp->do_gzip = dbfp->do_gzip;
192 dbfp->afp->fname = dbfp->fname; /* just a ptr, don't free */
193 dbfp->afp->format = dbfp->format; /* e.g. format */
194 dbfp->afp->linenumber = dbfp->linenumber; /* e.g. 0 */
195 dbfp->afp->buf = NULL;
196 dbfp->afp->buflen = 0;
198 if ((dbfp->msa = MSAFileRead(dbfp->afp)) == NULL)
199 Die("Failed to read any alignment data from file %s", dbfp->fname);
200 /* hack: overload/reuse msa->lastidx; indicates
201 next seq to return upon a ReadSeq() call */
202 dbfp->msa->lastidx = 0;
207 /* Load the first line.
209 SeqfileGetLine(dbfp);
213 /* Function: SeqfilePosition()
215 * Purpose: Move to a particular offset in a seqfile.
216 * Will not work on alignment files.
219 SeqfilePosition(SQFILE *sqfp, SSIOFFSET *offset)
221 if (sqfp->do_stdin || sqfp->do_gzip || IsAlignmentFormat(sqfp->format))
222 Die("SeqfilePosition() failed: in a nonrewindable data file or stream");
224 if (SSISetFilePosition(sqfp->f, offset) != 0)
225 Die("SSISetFilePosition failed, but that shouldn't happen.");
226 SeqfileGetLine(sqfp);
230 /* Function: SeqfileRewind()
232 * Purpose: Set a sequence file back to the first sequence.
234 * Won't work on alignment files. Although it would
235 * seem that it could (just set msa->lastidx back to 0),
236 * that'll fail on "multiple multiple" alignment file formats
240 SeqfileRewind(SQFILE *sqfp)
242 if (sqfp->do_stdin || sqfp->do_gzip)
243 Die("SeqfileRewind() failed: in a nonrewindable data file or stream");
246 SeqfileGetLine(sqfp);
249 /* Function: SeqfileLineParameters()
250 * Date: SRE, Thu Feb 15 17:00:41 2001 [St. Louis]
252 * Purpose: After all the sequences have been read from the file,
253 * but before closing it, retrieve overall bytes-per-line and
254 * residues-per-line info. If non-zero, these mean that
255 * the file contains homogeneous sequence line lengths (except
256 * the last line in each record).
258 * If either of bpl or rpl is determined to be inhomogeneous,
259 * both are returned as 0.
261 * Args: *sqfp - an open but fully read sequence file
262 * ret_bpl - RETURN: bytes per line, or 0 if inhomogeneous
263 * ret_rpl - RETURN: residues per line, or 0 if inhomogenous.
268 SeqfileLineParameters(SQFILE *V, int *ret_bpl, int *ret_rpl)
270 if (V->rpl > 0 && V->maxrpl == V->rpl &&
271 V->bpl > 0 && V->maxbpl == V->bpl) {
282 SeqfileClose(SQFILE *sqfp)
284 /* note: don't test for sqfp->msa being NULL. Now that
285 * we're holding afp open and allowing access to multi-MSA
286 * databases (e.g. Stockholm format, Pfam), msa ends
287 * up being NULL when we run out of alignments.
289 if (sqfp->afp != NULL) {
290 if (sqfp->msa != NULL) MSAFree(sqfp->msa);
291 if (sqfp->afp->buf != NULL) free(sqfp->afp->buf);
294 #ifndef SRE_STRICT_ANSI /* gunzip functionality only on POSIX systems */
295 if (sqfp->do_gzip) pclose(sqfp->f);
297 else if (! sqfp->do_stdin) fclose(sqfp->f);
298 if (sqfp->buf != NULL) free(sqfp->buf);
299 if (sqfp->fname != NULL) free(sqfp->fname);
304 /* Function: SeqfileGetLine()
305 * Date: SRE, Tue Jun 22 09:15:49 1999 [Sanger Centre]
307 * Purpose: read a line from a sequence file into V->buf
308 * If the fgets() is NULL, sets V->buf[0] to '\0'.
315 SeqfileGetLine(SQFILE *V)
318 if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->ssioffset)))
319 Die("SSIGetFilePosition() failed");
320 if (sre_fgets(&(V->buf), &(V->buflen), V->f) == NULL)
327 FreeSequence(char *seq, SQINFO *sqinfo)
329 if (seq != NULL) free(seq);
330 if (sqinfo->flags & SQINFO_SS) free(sqinfo->ss);
331 if (sqinfo->flags & SQINFO_SA) free(sqinfo->sa);
335 SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag)
340 /* silently ignore NULL. */
341 if (sptr == NULL) return 1;
343 while (*sptr == ' ') sptr++; /* ignore leading whitespace */
344 for (pos = strlen(sptr)-1; pos >= 0; pos--)
345 if (! isspace((int) sptr[pos])) break;
346 sptr[pos+1] = '\0'; /* ignore trailing whitespace */
352 strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1);
353 sqinfo->name[SQINFO_NAMELEN-1] = '\0';
354 sqinfo->flags |= SQINFO_NAME;
361 strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1);
362 sqinfo->id[SQINFO_NAMELEN-1] = '\0';
363 sqinfo->flags |= SQINFO_ID;
370 strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1);
371 sqinfo->acc[SQINFO_NAMELEN-1] = '\0';
372 sqinfo->flags |= SQINFO_ACC;
379 if (sqinfo->flags & SQINFO_DESC) /* append? */
381 len = strlen(sqinfo->desc);
382 if (len < SQINFO_DESCLEN-2) /* is there room? */
384 strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++;
385 strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len);
389 strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1);
390 sqinfo->desc[SQINFO_DESCLEN-1] = '\0';
391 sqinfo->flags |= SQINFO_DESC;
396 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
397 sqinfo->start = atoi(sptr);
398 if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START;
402 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
403 sqinfo->stop = atoi(sptr);
404 if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP;
408 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
409 sqinfo->olen = atoi(sptr);
410 if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN;
414 Die("Invalid flag %d to SetSeqinfoString()", flag);
420 SeqinfoCopy(SQINFO *sq1, SQINFO *sq2)
422 sq1->flags = sq2->flags;
423 if (sq2->flags & SQINFO_NAME) strcpy(sq1->name, sq2->name);
424 if (sq2->flags & SQINFO_ID) strcpy(sq1->id, sq2->id);
425 if (sq2->flags & SQINFO_ACC) strcpy(sq1->acc, sq2->acc);
426 if (sq2->flags & SQINFO_DESC) strcpy(sq1->desc, sq2->desc);
427 if (sq2->flags & SQINFO_LEN) sq1->len = sq2->len;
428 if (sq2->flags & SQINFO_START) sq1->start = sq2->start;
429 if (sq2->flags & SQINFO_STOP) sq1->stop = sq2->stop;
430 if (sq2->flags & SQINFO_OLEN) sq1->olen = sq2->olen;
431 if (sq2->flags & SQINFO_TYPE) sq1->type = sq2->type;
432 if (sq2->flags & SQINFO_SS) sq1->ss = Strdup(sq2->ss);
433 if (sq2->flags & SQINFO_SA) sq1->sa = Strdup(sq2->sa);
438 * Purpose: Convert a sequence to DNA.
444 for (; *seq != '\0'; seq++)
446 if (*seq == 'U') *seq = 'T';
447 else if (*seq == 'u') *seq = 't';
453 * Purpose: Convert a sequence to RNA.
459 for (; *seq != '\0'; seq++)
461 if (*seq == 'T') *seq = 'U';
462 else if (*seq == 't') *seq = 'u';
467 /* Function: ToIUPAC()
469 * Purpose: Convert X's, o's, other junk in a nucleic acid sequence to N's,
470 * to comply with IUPAC code. Does allow gap characters
471 * though, so we can call ToIUPAC() on aligned seqs.
473 * WU-BLAST's pressdb will
474 * choke on X's, for instance, necessitating conversion
475 * of certain genome centers' data.
480 for (; *seq != '\0'; seq++)
481 if (strchr(NUCLEOTIDES, *seq) == NULL && ! isgap(*seq)) *seq = 'N';
485 /* Function: addseq()
487 * Purpose: Add a line of sequence to the growing string in V.
488 * Skip all nonalphabetic characters in the input string:
489 * in particular, spaces and digits (coordinates). This
490 * allows us to generically read sequence data from most
494 addseq(char *s, struct ReadSeqVars *V)
498 int rpl; /* valid residues per line */
499 int bpl; /* characters per line */
501 if (V->ssimode == -1)
502 { /* Normal mode: keeping the seq */
503 /* Make sure we have enough room. We know that s is <= buflen,
504 * so just make sure we've got room for a whole new buflen worth
507 if (V->seqlen + V->buflen > V->maxseq) {
508 V->maxseq += MAX(V->buflen, kStartLength);
509 V->seq = ReallocOrDie (V->seq, V->maxseq+1);
513 sq = V->seq + V->seqlen;
515 if (isalpha((int) *s)) {
521 V->seqlen = sq - V->seq;
523 else /* else: indexing mode, discard the seq */
528 if (isalpha((int) *s)) rpl++;
534 /* Keep track of the global rpl, bpl for the file.
535 * This is overly complicated because we have to
536 * allow the last line of each record (e.g. the last addseq() call
537 * on each sequence) to have a different length - and sometimes
538 * we'll have one-line sequence records, too. Thus we only
539 * do something with the global V->rpl when we have *passed over*
540 * a line - we keep the last line's rpl in last_rpl. And because
541 * a file might consist entirely of single-line records, we keep
542 * a third guy, maxrpl, that tells us the maximum rpl of any line
543 * in the file. If we reach the end of file and rpl is still unset,
544 * we'll set it to maxrpl. If we reach eof and rpl is set, but is
545 * less than maxrpl, that's a weird case where a last line in some
546 * record is longer than every other line.
548 if (V->rpl != 0) { /* 0 means we already know rpl is invalid */
549 if (V->lastrpl > 0) { /* we're on something that's not the first line */
550 if (V->rpl > 0 && V->lastrpl != V->rpl) V->rpl = 0;
551 else if (V->rpl == -1) V->rpl = V->lastrpl;
554 if (rpl > V->maxrpl) V->maxrpl = rpl; /* make sure we check max length of final lines */
556 if (V->bpl != 0) { /* 0 means we already know bpl is invalid */
557 if (V->lastbpl > 0) { /* we're on something that's not the first line */
558 if (V->bpl > 0 && V->lastbpl != V->bpl) V->bpl = 0;
559 else if (V->bpl == -1) V->bpl = V->lastbpl;
562 if (bpl > V->maxbpl) V->maxbpl = bpl; /* make sure we check max length of final lines */
564 } /* end of indexing mode of addseq(). */
569 readLoop(int addfirst, int (*endTest)(char *,int *), struct ReadSeqVars *V)
575 V->lastrpl = V->lastbpl = 0;
577 if (V->ssimode >= 0) V->d_off = V->ssioffset;
579 } else if (V->ssimode >= 0)
580 if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->d_off)))
581 Die("SSIGetFilePosition() failed");
585 /* feof() alone is a bug; files not necessarily \n terminated */
586 if (*(V->buf) == '\0' && feof(V->f))
588 done |= (*endTest)(V->buf, &addend);
596 endPIR(char *s, int *addend)
599 if ((strncmp(s, "///", 3) == 0) ||
600 (strncmp(s, "ENTRY", 5) == 0))
607 readPIR(struct ReadSeqVars *V)
610 /* load first line of entry */
611 while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) {
614 if (feof(V->f)) return;
615 if (V->ssimode >= 0) V->r_off = V->ssioffset;
617 if ((sptr = strtok(V->buf + 15, "\n\t ")) != NULL)
619 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
620 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
624 if (!feof(V->f) && strncmp(V->buf, "TITLE", 5) == 0)
625 SetSeqinfoString(V->sqinfo, V->buf+15, SQINFO_DESC);
626 else if (!feof(V->f) && strncmp(V->buf, "ACCESSION", 9) == 0)
628 if ((sptr = strtok(V->buf+15, " \t\n")) != NULL)
629 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
631 } while (! feof(V->f) && (strncmp(V->buf,"SEQUENCE", 8) != 0));
632 SeqfileGetLine(V); /* skip next line, coords */
634 readLoop(0, endPIR, V);
636 /* reading a real PIR-CODATA database file, we keep the source coords
638 V->sqinfo->start = 1;
639 V->sqinfo->stop = V->seqlen;
640 V->sqinfo->olen = V->seqlen;
641 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
645 while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) {
653 endIG(char *s, int *addend)
655 *addend = 1; /* 1 or 2 occur in line w/ bases */
656 return((strchr(s,'1')!=NULL) || (strchr(s,'2')!=NULL));
660 readIG(struct ReadSeqVars *V)
663 /* position past ';' comments */
666 } while (! (feof(V->f) || ((*V->buf != 0) && (*V->buf != ';')) ));
670 if ((nm = strtok(V->buf, "\n\t ")) != NULL)
671 SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
673 readLoop(0, endIG, V);
676 while (!(feof(V->f) || ((*V->buf != '\0') && (*V->buf == ';'))))
681 endStrider(char *s, int *addend)
684 return (strstr( s, "//") != NULL);
688 readStrider(struct ReadSeqVars *V)
692 while ((!feof(V->f)) && (*V->buf == ';'))
694 if (strncmp(V->buf,"; DNA sequence", 14) == 0)
696 if ((nm = strtok(V->buf+16, ",\n\t ")) != NULL)
697 SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
703 readLoop(1, endStrider, V);
707 while ((!feof(V->f)) && (*V->buf != ';'))
713 endGB(char *s, int *addend)
716 return ((strstr(s,"//") != NULL) || (strstr(s,"LOCUS") == s));
720 readGenBank(struct ReadSeqVars *V)
725 while (strncmp(V->buf, "LOCUS", 5) != 0) {
728 if (V->ssimode >= 0) V->r_off = V->ssioffset;
730 if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL)
732 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
733 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
736 in_definition = FALSE;
740 if (! feof(V->f) && strstr(V->buf, "DEFINITION") == V->buf)
742 if ((sptr = strtok(V->buf+12, "\n")) != NULL)
743 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
744 in_definition = TRUE;
746 else if (! feof(V->f) && strstr(V->buf, "ACCESSION") == V->buf)
748 if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL)
749 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
750 in_definition = FALSE;
752 else if (strncmp(V->buf,"ORIGIN", 6) != 0)
755 SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC);
761 readLoop(0, endGB, V);
763 /* reading a real GenBank database file, we keep the source coords
765 V->sqinfo->start = 1;
766 V->sqinfo->stop = V->seqlen;
767 V->sqinfo->olen = V->seqlen;
768 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
771 while (!(feof(V->f) || ((*V->buf!=0) && (strstr(V->buf,"LOCUS") == V->buf))))
773 /* SRE: V->s now holds "//", so sequential
774 reads are wedged: fixed Tue Jul 13 1993 */
775 while (!feof(V->f) && strstr(V->buf, "LOCUS ") != V->buf)
780 endGCGdata(char *s, int *addend)
787 readGCGdata(struct ReadSeqVars *V)
789 int binary = FALSE; /* whether data are binary or not */
790 int blen = 0; /* length of binary sequence */
792 /* first line contains ">>>>" followed by name */
793 if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->buf, 2))
796 SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
797 blen = atoi(sqd_parse[2]);
799 else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->buf, 1))
800 SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
802 Die("bogus GCGdata format? %s", V->buf);
804 /* second line contains free text description */
806 SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC);
809 /* allocate for blen characters +3... (allow for 3 bytes of slop) */
810 if (blen >= V->maxseq) {
812 if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL)
813 Die("malloc failed");
815 /* read (blen+3)/4 bytes from file */
816 if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4))
819 /* convert binary code to seq */
820 GCGBinaryToSequence(V->seq, blen);
822 else readLoop(0, endGCGdata, V);
824 while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>'))))
829 endPearson(char *s, int *addend)
836 readPearson(struct ReadSeqVars *V)
840 if (V->ssimode >= 0) V->r_off = V->ssioffset;
844 File %s does not appear to be in FASTA format at line %d.\n\
845 You may want to invoke the Babelfish to autodetect your file's format.\n\
846 Usually this is done with a -B option.\n",
847 V->fname, V->linenumber);
849 if ((sptr = strtok(V->buf+1, "\n\t ")) != NULL)
850 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
851 if ((sptr = strtok(NULL, "\n")) != NULL)
852 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
854 readLoop(0, endPearson, V);
856 while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>')))) {
863 endEMBL(char *s, int *addend)
866 /* Some people (Berlin 5S rRNA database, f'r instance) use
867 * an extended EMBL format that attaches extra data after
868 * the sequence -- watch out for that. We use the fact that
869 * real EMBL sequence lines begin with five spaces.
871 * We can use this as the sole end test because readEMBL() will
872 * advance to the next ID line before starting to read again.
874 return (strncmp(s," ",5) != 0);
875 /* return ((strstr(s,"//") != NULL) || (strstr(s,"ID ") == s)); */
879 readEMBL(struct ReadSeqVars *V)
883 /* make sure we have first line */
884 while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) {
887 if (V->ssimode >= 0) V->r_off = V->ssioffset;
889 if ((sptr = strtok(V->buf+5, "\n\t ")) != NULL)
891 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
892 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
897 if (!feof(V->f) && strstr(V->buf, "AC ") == V->buf)
899 if ((sptr = strtok(V->buf+5, "; \t\n")) != NULL)
900 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
902 else if (!feof(V->f) && strstr(V->buf, "DE ") == V->buf)
904 if ((sptr = strtok(V->buf+5, "\n")) != NULL)
905 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
907 } while (! feof(V->f) && strncmp(V->buf,"SQ",2) != 0);
909 readLoop(0, endEMBL, V);
911 /* Hack for Staden experiment files: convert - to N
913 if (V->ssimode == -1) /* if we're in ssi mode, we're not keeping the seq */
914 for (sptr = V->seq; *sptr != '\0'; sptr++)
915 if (*sptr == '-') *sptr = 'N';
917 /* reading a real EMBL database file, we keep the source coords
919 V->sqinfo->start = 1;
920 V->sqinfo->stop = V->seqlen;
921 V->sqinfo->olen = V->seqlen;
922 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
924 /* load next record's ID line */
925 while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) {
933 endZuker(char *s, int *addend)
940 readZuker(struct ReadSeqVars *V)
944 SeqfileGetLine(V); /*s == "seqLen seqid string..."*/
946 if ((sptr = strtok(V->buf+6, " \t\n")) != NULL)
947 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
949 if ((sptr = strtok(NULL, "\n")) != NULL)
950 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
952 readLoop(0, endZuker, V);
954 while (!(feof(V->f) | ((*V->buf != '\0') & (*V->buf == '('))))
959 readUWGCG(struct ReadSeqVars *V)
967 /*writeseq: " %s Length: %d (today) Check: %d ..\n" */
968 /*drop above or ".." from id*/
969 if ((si = strstr(V->buf," Length: ")) != NULL) *si = 0;
970 else if ((si = strstr(V->buf,"..")) != NULL) *si = 0;
972 if ((sptr = strtok(V->buf, "\n\t ")) != NULL)
973 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
978 if (! done) addseq(V->buf, V);
983 /* Function: ReadSeq()
985 * Purpose: Read next sequence from an open database file.
986 * Return the sequence and associated info.
988 * Args: fp - open sequence database file pointer
989 * format - format of the file (previously determined
990 * by call to SeqfileFormat()).
991 * Currently unused, since we carry it in V.
992 * ret_seq - RETURN: sequence
993 * sqinfo - RETURN: filled in w/ other information
995 * Limitations: uses squid_errno, so it's not threadsafe.
997 * Return: 1 on success, 0 on failure.
998 * ret_seq and some field of sqinfo are allocated here,
999 * The preferred call mechanism to properly free the memory is:
1004 * ReadSeq(fp, format, &seq, &sqinfo);
1005 * ... do something...
1006 * FreeSequence(seq, &sqinfo);
1009 ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo)
1013 squid_errno = SQERR_OK;
1015 /* Here's the hack for sequential access of sequences from
1016 * the multiple sequence alignment formats
1018 if (IsAlignmentFormat(V->format))
1020 if (V->msa->lastidx >= V->msa->nseq)
1021 { /* out of data. try to read another alignment */
1023 if ((V->msa = MSAFileRead(V->afp)) == NULL)
1025 V->msa->lastidx = 0;
1027 /* copy and dealign the appropriate aligned seq */
1028 MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
1029 V->msa->aseq[V->msa->lastidx], &(V->seq));
1030 V->seqlen = strlen(V->seq);
1032 /* Extract sqinfo stuff for this sequence from the msa.
1033 * Tedious; code that should be cleaned.
1036 if (V->msa->sqname[V->msa->lastidx] != NULL)
1037 SetSeqinfoString(sqinfo, V->msa->sqname[V->msa->lastidx], SQINFO_NAME);
1038 if (V->msa->sqacc != NULL && V->msa->sqacc[V->msa->lastidx] != NULL)
1039 SetSeqinfoString(sqinfo, V->msa->sqacc[V->msa->lastidx], SQINFO_ACC);
1040 if (V->msa->sqdesc != NULL && V->msa->sqdesc[V->msa->lastidx] != NULL)
1041 SetSeqinfoString(sqinfo, V->msa->sqdesc[V->msa->lastidx], SQINFO_DESC);
1042 if (V->msa->ss != NULL && V->msa->ss[V->msa->lastidx] != NULL) {
1043 MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
1044 V->msa->ss[V->msa->lastidx], &(sqinfo->ss));
1045 sqinfo->flags |= SQINFO_SS;
1047 if (V->msa->sa != NULL && V->msa->sa[V->msa->lastidx] != NULL) {
1048 MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
1049 V->msa->sa[V->msa->lastidx], &(sqinfo->sa));
1050 sqinfo->flags |= SQINFO_SA;
1055 if (feof(V->f)) return 0;
1057 if (V->ssimode == -1) { /* normal mode */
1058 V->seq = (char*) calloc (kStartLength+1, sizeof(char));
1059 V->maxseq = kStartLength;
1060 } else { /* index mode: discarding seq */
1066 V->sqinfo->flags = 0;
1068 switch (V->format) {
1069 case SQFILE_IG : readIG(V); break;
1070 case SQFILE_STRIDER : readStrider(V); break;
1071 case SQFILE_GENBANK : readGenBank(V); break;
1072 case SQFILE_FASTA : readPearson(V); break;
1073 case SQFILE_EMBL : readEMBL(V); break;
1074 case SQFILE_ZUKER : readZuker(V); break;
1075 case SQFILE_PIR : readPIR(V); break;
1076 case SQFILE_GCGDATA : readGCGdata(V); break;
1079 do { /* skip leading comments on GCG file */
1080 gotuw = (strstr(V->buf,"..") != NULL);
1081 if (gotuw) readUWGCG(V);
1083 } while (! feof(V->f));
1086 case SQFILE_IDRAW: /* SRE: no attempt to read idraw postscript */
1088 squid_errno = SQERR_FORMAT;
1092 if (V->seq != NULL) /* (it can be NULL in indexing mode) */
1093 V->seq[V->seqlen] = 0; /* stick a string terminator on it */
1098 sqinfo->len = V->seqlen;
1099 sqinfo->flags |= SQINFO_LEN;
1101 if (squid_errno == SQERR_OK) return 1; else return 0;
1104 /* Function: SeqfileFormat()
1105 * Date: SRE, Tue Jun 22 10:58:58 1999 [Sanger Centre]
1107 * Purpose: Determine format of an open file.
1108 * Returns format code.
1111 * Autodetects the following unaligned formats:
1118 * Also autodetects the following alignment formats:
1125 * Can't autodetect MSAFILE_A2M, calls it SQFILE_FASTA.
1126 * MSAFileFormat() does the opposite.
1128 * Args: sfp - open SQFILE
1130 * Return: format code, or SQFILE_UNKNOWN if unrecognized
1133 SeqfileFormat(FILE *fp)
1137 int fmt = SQFILE_UNKNOWN;
1139 char *bufcpy, *s, *s1, *s2;
1146 while (sre_fgets(&buf, &len, fp) != NULL)
1148 if (IsBlankline(buf)) continue;
1150 /* Well-behaved formats identify themselves in first nonblank line.
1154 if (strncmp(buf, ">>>>", 4) == 0 && strstr(buf, "Len: "))
1155 { fmt = SQFILE_GCGDATA; goto DONE; }
1158 { fmt = SQFILE_FASTA; goto DONE; }
1160 if (strncmp(buf, "!!AA_SEQUENCE", 13) == 0 ||
1161 strncmp(buf, "!!NA_SEQUENCE", 13) == 0)
1162 { fmt = SQFILE_GCG; goto DONE; }
1164 if (strncmp(buf, "# STOCKHOLM 1.", 14) == 0)
1165 { fmt = MSAFILE_STOCKHOLM; goto DONE; }
1167 if (strncmp(buf, "CLUSTAL", 7) == 0 &&
1168 strstr(buf, "multiple sequence alignment") != NULL)
1169 { fmt = MSAFILE_CLUSTAL; goto DONE; }
1171 if (strncmp(buf, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0 ||
1172 strncmp(buf, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0)
1173 { fmt = MSAFILE_MSF; goto DONE; }
1175 /* PHYLIP id: also just a good bet */
1176 bufcpy = sre_strdup(buf, -1);
1178 if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) != NULL &&
1179 (s2 = sre_strtok(&s, WHITESPACE, NULL)) != NULL &&
1182 { free(bufcpy); fmt = MSAFILE_PHYLIP; goto DONE; }
1186 /* We trust that other formats identify themselves soon.
1188 /* dead giveaways for extended SELEX */
1189 if (strncmp(buf, "#=AU", 4) == 0 ||
1190 strncmp(buf, "#=ID", 4) == 0 ||
1191 strncmp(buf, "#=AC", 4) == 0 ||
1192 strncmp(buf, "#=DE", 4) == 0 ||
1193 strncmp(buf, "#=GA", 4) == 0 ||
1194 strncmp(buf, "#=TC", 4) == 0 ||
1195 strncmp(buf, "#=NC", 4) == 0 ||
1196 strncmp(buf, "#=SQ", 4) == 0 ||
1197 strncmp(buf, "#=SS", 4) == 0 ||
1198 strncmp(buf, "#=CS", 4) == 0 ||
1199 strncmp(buf, "#=RF", 4) == 0)
1200 { fmt = MSAFILE_SELEX; goto DONE; }
1202 if (strncmp(buf, "///", 3) == 0 || strncmp(buf, "ENTRY ", 6) == 0)
1203 { fmt = SQFILE_PIR; goto DONE; }
1205 /* a ha, diagnostic of an (old) MSF file */
1206 if ((strstr(buf, "..") != NULL) &&
1207 (strstr(buf, "MSF:") != NULL) &&
1208 (strstr(buf, "Check:")!= NULL))
1209 { fmt = MSAFILE_MSF; goto DONE; }
1211 /* unaligned GCG (must follow MSF test!) */
1212 if (strstr(buf, " Check: ") != NULL && strstr(buf, "..") != NULL)
1213 { fmt = SQFILE_GCG; goto DONE; }
1215 if (strncmp(buf,"LOCUS ",6) == 0 || strncmp(buf,"ORIGIN ",6) == 0)
1216 { fmt = SQFILE_GENBANK; goto DONE; }
1218 if (strncmp(buf,"ID ",5) == 0 || strncmp(buf,"SQ ",5) == 0)
1219 { fmt = SQFILE_EMBL; goto DONE; }
1221 /* But past here, we're being desperate. A simple SELEX file is
1222 * very difficult to detect; we can only try to disprove it.
1225 if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; /* skip blank lines */
1226 if (strchr("#%", *s1) != NULL) continue; /* skip comment lines */
1228 /* Disproof 1. Noncomment, nonblank lines in a SELEX file
1229 * must have at least two space-delimited fields (name/seq)
1231 if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL)
1235 * The sequence field should look like a sequence.
1237 if (s2 != NULL && Seqtype(s2) == kOtherSeq)
1241 if (ndataline == 300) break; /* only look at first 300 lines */
1245 Die("Sequence file contains no data");
1247 /* If we've made it this far, we've run out of data, but there
1248 * was at least one line of it; check if we've
1249 * disproven SELEX. If not, cross our fingers, pray, and guess SELEX.
1251 if (has_junk == TRUE) fmt = SQFILE_UNKNOWN;
1252 else fmt = MSAFILE_SELEX;
1255 if (buf != NULL) free(buf);
1260 /* Function: GCGBinaryToSequence()
1262 * Purpose: Convert a GCG 2BIT binary string to DNA sequence.
1263 * 0 = C 1 = T 2 = A 3 = G
1266 * Args: seq - binary sequence. Converted in place to DNA.
1267 * len - length of DNA. binary is (len+3)/4 bytes
1270 GCGBinaryToSequence(char *seq, int len)
1272 int bpos; /* position in binary */
1273 int spos; /* position in sequence */
1277 for (bpos = (len-1)/4; bpos >= 0; bpos--)
1282 for (i = 3; i >= 0; i--)
1284 switch (twobit & 0x3) {
1285 case 0: seq[spos+i] = 'C'; break;
1286 case 1: seq[spos+i] = 'T'; break;
1287 case 2: seq[spos+i] = 'A'; break;
1288 case 3: seq[spos+i] = 'G'; break;
1290 twobit = twobit >> 2;
1298 /* Function: GCGchecksum()
1299 * Date: SRE, Mon May 31 11:13:21 1999 [St. Louis]
1301 * Purpose: Calculate a GCG checksum for a sequence.
1302 * Code provided by Steve Smith of Genetics
1305 * Args: seq - sequence to calculate checksum for.
1306 * may contain gap symbols.
1307 * len - length of sequence (usually known,
1308 * so save a strlen() call)
1310 * Returns: GCG checksum.
1313 GCGchecksum(char *seq, int len)
1315 int i; /* position in sequence */
1316 int chk = 0; /* calculated checksum */
1318 for (i = 0; i < len; i++)
1319 chk = (chk + (i % 57 + 1) * (sre_toupper((int) seq[i]))) % 10000;
1324 /* Function: GCGMultchecksum()
1326 * Purpose: GCG checksum for a multiple alignment: sum of
1327 * individual sequence checksums (including their
1328 * gap characters) modulo 10000.
1330 * Implemented using spec provided by Steve Smith of
1331 * Genetics Computer Group.
1333 * Args: seqs - sequences to be checksummed; aligned or not
1334 * nseq - number of sequences
1336 * Return: the checksum, a number between 0 and 9999
1339 GCGMultchecksum(char **seqs, int nseq)
1344 for (idx = 0; idx < nseq; idx++)
1345 chk = (chk + GCGchecksum(seqs[idx], strlen(seqs[idx]))) % 10000;
1352 /* Function: Seqtype()
1354 * Purpose: Returns a (very good) guess about type of sequence:
1355 * kDNA, kRNA, kAmino, or kOtherSeq.
1357 * Modified from, and replaces, Gilbert getseqtype().
1362 int saw; /* how many non-gap characters I saw */
1364 int po = 0; /* count of protein-only */
1365 int nt = 0; /* count of t's */
1366 int nu = 0; /* count of u's */
1367 int na = 0; /* count of nucleotides */
1368 int aa = 0; /* count of amino acids */
1369 int no = 0; /* count of others */
1371 /* Look at the first 300 non-gap characters
1373 for (saw = 0; *seq != '\0' && saw < 300; seq++)
1375 c = sre_toupper((int) *seq);
1378 if (strchr(protonly, c)) po++;
1379 else if (strchr(primenuc,c)) {
1382 else if (c == 'U') nu++;
1384 else if (strchr(aminos,c)) aa++;
1385 else if (isalpha((int) c)) no++;
1390 if (no > 0) return kOtherSeq;
1391 else if (po > 0) return kAmino;
1393 if (nu > nt) return kRNA;
1396 else return kAmino; /* ooooh. risky. */
1400 /* Function: GuessAlignmentSeqtype()
1401 * Date: SRE, Wed Jul 7 09:42:34 1999 [St. Louis]
1403 * Purpose: Try to guess whether an alignment is protein
1404 * or nucleic acid; return a code for the
1405 * type (kRNA, kDNA, or kAmino).
1407 * Args: aseq - array of aligned sequences. (Could also
1408 * be an rseq unaligned sequence array)
1409 * nseq - number of aseqs
1411 * Returns: kRNA, kDNA, kAmino;
1412 * kOtherSeq if inconsistency is detected.
1415 GuessAlignmentSeqtype(char **aseq, int nseq)
1423 for (idx = 0; idx < nseq; idx++)
1424 switch (Seqtype(aseq[idx])) {
1425 case kRNA: nrna++; break;
1426 case kDNA: ndna++; break;
1427 case kAmino: namino++; break;
1431 /* Unambiguous decisions:
1433 if (nother) return kOtherSeq;
1434 if (namino == nseq) return kAmino;
1435 if (ndna == nseq) return kDNA;
1436 if (nrna == nseq) return kRNA;
1438 /* Ambiguous decisions:
1440 if (namino == 0) return kRNA; /* it's nucleic acid, but seems mixed RNA/DNA */
1441 return kAmino; /* some amino acid seen; others probably short seqs, some
1442 of which may be entirely ACGT (ala,cys,gly,thr). We
1443 could be a little more sophisticated: U would be a giveaway
1444 that we're not in protein seqs */
1447 /* Function: WriteSimpleFASTA()
1448 * Date: SRE, Tue Nov 16 18:06:00 1999 [St. Louis]
1450 * Purpose: Just write a FASTA format sequence to a file;
1451 * minimal interface, mostly for quick and dirty programs.
1453 * Args: fp - open file handle (stdout, possibly)
1454 * seq - sequence to output
1455 * name - name for the sequence
1456 * desc - optional description line, or NULL.
1461 WriteSimpleFASTA(FILE *fp, char *seq, char *name, char *desc)
1469 fprintf(fp, ">%s %s\n", name, desc != NULL ? desc : "");
1470 for (pos = 0; pos < len; pos += 60)
1472 strncpy(buf, seq+pos, 60);
1473 fprintf(fp, "%s\n", buf);
1478 WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo)
1481 int lines = 0, spacer = 0, width = 50, tab = 0;
1482 int i, j, l, l1, ibase;
1484 char s[100]; /* buffer for sequence */
1485 char ss[100]; /* buffer for structure */
1488 int which_case; /* 0 = do nothing. 1 = upper case. 2 = lower case */
1489 int dostruc; /* TRUE to print structure lines*/
1493 seqlen = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq);
1495 if (IsAlignmentFormat(outform))
1496 Die("Tried to write an aligned format with WriteSeq() -- bad, bad.");
1501 checksum = GCGchecksum(seq, seqlen);
1504 case SQFILE_UNKNOWN: /* no header, just sequence */
1505 strcpy(endstr,"\n"); /* end w/ extra blank line */
1508 case SQFILE_GENBANK:
1509 fprintf(outf,"LOCUS %s %d bp\n",
1510 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name,
1512 fprintf(outf,"DEFINITION %s\n",
1513 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1514 fprintf(outf,"ACCESSION %s\n",
1515 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
1516 fprintf(outf,"ORIGIN \n");
1519 strcpy(endstr, "\n//");
1522 case SQFILE_GCGDATA:
1523 fprintf(outf, ">>>>%s 9/95 ASCII Len: %d\n", sqinfo->name, seqlen);
1524 fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1528 fprintf(outf, "ENTRY %s\n",
1529 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
1530 fprintf(outf, "TITLE %s\n",
1531 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1532 fprintf(outf, "ACCESSION %s\n",
1533 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
1534 fprintf(outf, "SUMMARY #Length %d #Checksum %d\n",
1535 sqinfo->len, checksum);
1536 fprintf(outf, "SEQUENCE\n");
1537 fprintf(outf, " 5 10 15 20 25 30\n");
1538 spacer = 2; /* spaces after every residue */
1539 numline = 1; /* number lines w/ coords */
1540 width = 30; /* 30 aa per line */
1541 strcpy(endstr, "\n///");
1545 fprintf(outf, "NAM %s\n", sqinfo->name);
1546 if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN))
1547 fprintf(outf, "SRC %s %s %d..%d::%d\n",
1548 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : "-",
1549 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-",
1550 (sqinfo->flags & SQINFO_START) ? sqinfo->start : 0,
1551 (sqinfo->flags & SQINFO_STOP) ? sqinfo->stop : 0,
1552 (sqinfo->flags & SQINFO_OLEN) ? sqinfo->olen : 0);
1553 if (sqinfo->flags & SQINFO_DESC)
1554 fprintf(outf, "DES %s\n", sqinfo->desc);
1555 if (sqinfo->flags & SQINFO_SS)
1557 fprintf(outf, "SEQ +SS\n");
1558 dostruc = TRUE; /* print structure lines too */
1561 fprintf(outf, "SEQ\n");
1562 numline = 1; /* number seq lines w/ coords */
1563 strcpy(endstr, "\n++");
1567 fprintf(outf,"ID %s\n",
1568 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
1569 fprintf(outf,"AC %s\n",
1570 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
1571 fprintf(outf,"DE %s\n",
1572 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1573 fprintf(outf,"SQ %d BP\n", seqlen);
1574 strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
1575 tab = 5; /** added 31jan91 */
1576 spacer = 11; /** added 31jan91 */
1580 fprintf(outf,"%s\n", sqinfo->name);
1581 if (sqinfo->flags & SQINFO_ACC)
1582 fprintf(outf,"ACCESSION %s\n", sqinfo->acc);
1583 if (sqinfo->flags & SQINFO_DESC)
1584 fprintf(outf,"DEFINITION %s\n", sqinfo->desc);
1585 fprintf(outf," %s Length: %d (today) Check: %d ..\n",
1586 sqinfo->name, seqlen, checksum);
1589 strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */
1592 case SQFILE_STRIDER: /* ?? map ?*/
1593 fprintf(outf,"; ### from DNA Strider ;-)\n");
1594 fprintf(outf,"; DNA sequence %s, %d bases, %d checksum.\n;\n",
1595 sqinfo->name, seqlen, checksum);
1596 strcpy(endstr, "\n//");
1599 /* SRE: Don had Zuker default to Pearson, which is not
1600 intuitive or helpful, since Zuker's MFOLD can't read
1601 Pearson format. More useful to use kIG */
1603 which_case = 1; /* MFOLD requires upper case. */
1606 fprintf(outf,";%s %s\n",
1608 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
1609 fprintf(outf,"%s\n", sqinfo->name);
1610 strcpy(endstr,"1"); /* == linear dna */
1613 case SQFILE_RAW: /* Raw: no header at all. */
1618 fprintf(outf,">%s %s\n", sqinfo->name,
1619 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
1623 if (which_case == 1) s2upper(seq);
1624 if (which_case == 2) s2lower(seq);
1627 width = MIN(width,100);
1628 for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) {
1631 if (numline) fprintf(outf,"%8d ",ibase);
1632 for (j=0; j<tab; j++) fputc(' ',outf);
1634 if ((spacer != 0) && ((l+1) % spacer == 1))
1635 { s[l] = ' '; ss[l] = ' '; l++; }
1637 ss[l] = (sqinfo->flags & SQINFO_SS) ? sqinfo->ss[i] : '.';
1639 l1++; /* don't count spaces for width*/
1640 if (l1 == width || i == seqlen) {
1641 s[l] = ss[l] = '\0';
1645 fprintf(outf, "%s\n", s);
1646 if (numline) fprintf(outf," ");
1647 for (j=0; j<tab; j++) fputc(' ',outf);
1648 if (i == seqlen) fprintf(outf,"%s%s\n",ss,endstr);
1649 else fprintf(outf,"%s\n",ss);
1653 if (i == seqlen) fprintf(outf,"%s%s\n",s,endstr);
1654 else fprintf(outf,"%s\n",s);
1664 /* Function: ReadMultipleRseqs()
1666 * Purpose: Open a data file and
1667 * parse it into an array of rseqs (raw, unaligned
1670 * Caller is responsible for free'ing memory allocated
1671 * to ret_rseqs, ret_weights, and ret_names.
1673 * Weights are currently only supported for MSF format.
1674 * Sequences read from all other formats will be assigned
1675 * weights of 1.0. If the caller isn't interested in
1676 * weights, it passes NULL as ret_weights.
1678 * Returns 1 on success. Returns 0 on failure and sets
1679 * squid_errno to indicate the cause.
1682 ReadMultipleRseqs(char *seqfile,
1685 SQINFO **ret_sqinfo,
1688 SQINFO *sqinfo; /* array of sequence optional info */
1689 SQFILE *dbfp; /* open ptr for sequential access of file */
1690 char **rseqs; /* sequence array */
1691 int numalloced; /* num of seqs currently alloced for */
1697 rseqs = (char **) MallocOrDie (numalloced * sizeof(char *));
1698 sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO));
1699 if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0;
1701 while (ReadSeq(dbfp, dbfp->format, &rseqs[num], &(sqinfo[num])))
1704 if (num == numalloced) /* more seqs coming, alloc more room */
1707 rseqs = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *));
1708 sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO));
1714 *ret_sqinfo = sqinfo;
1720 /* Function: String2SeqfileFormat()
1721 * Date: SRE, Sun Jun 27 15:25:54 1999 [TW 723 over Canadian Shield]
1723 * Purpose: Convert a string (e.g. from command line option arg)
1724 * to a format code. Case insensitive. Return
1725 * MSAFILE_UNKNOWN/SQFILE_UNKNOWN if string is bad.
1726 * Uses codes defined in squid.h (unaligned formats) and
1727 * msa.h (aligned formats).
1729 * Args: s - string to convert; e.g. "stockholm"
1731 * Returns: format code; e.g. MSAFILE_STOCKHOLM
1734 String2SeqfileFormat(char *s)
1737 int code = SQFILE_UNKNOWN;
1739 if (s == NULL) return SQFILE_UNKNOWN;
1740 s2 = sre_strdup(s, -1);
1743 if (strcmp(s2, "FASTA") == 0) code = SQFILE_FASTA;
1744 else if (strcmp(s2, "GENBANK") == 0) code = SQFILE_GENBANK;
1745 else if (strcmp(s2, "EMBL") == 0) code = SQFILE_EMBL;
1746 else if (strcmp(s2, "GCG") == 0) code = SQFILE_GCG;
1747 else if (strcmp(s2, "GCGDATA") == 0) code = SQFILE_GCGDATA;
1748 else if (strcmp(s2, "RAW") == 0) code = SQFILE_RAW;
1749 else if (strcmp(s2, "IG") == 0) code = SQFILE_IG;
1750 else if (strcmp(s2, "STRIDER") == 0) code = SQFILE_STRIDER;
1751 else if (strcmp(s2, "IDRAW") == 0) code = SQFILE_IDRAW;
1752 else if (strcmp(s2, "ZUKER") == 0) code = SQFILE_ZUKER;
1753 else if (strcmp(s2, "PIR") == 0) code = SQFILE_PIR;
1754 else if (strcmp(s2, "SQUID") == 0) code = SQFILE_SQUID;
1755 else if (strcmp(s2, "STOCKHOLM") == 0) code = MSAFILE_STOCKHOLM;
1756 else if (strcmp(s2, "SELEX") == 0) code = MSAFILE_SELEX;
1757 else if (strcmp(s2, "MSF") == 0) code = MSAFILE_MSF;
1758 else if (strcmp(s2, "CLUSTAL") == 0) code = MSAFILE_CLUSTAL;
1759 else if (strcmp(s2, "A2M") == 0) code = MSAFILE_A2M;
1760 else if (strcmp(s2, "PHYLIP") == 0) code = MSAFILE_PHYLIP;
1761 else if (strcmp(s2, "EPS") == 0) code = MSAFILE_EPS;
1767 SeqfileFormat2String(int code)
1770 case SQFILE_UNKNOWN: return "unknown";
1771 case SQFILE_FASTA: return "FASTA";
1772 case SQFILE_GENBANK: return "Genbank";
1773 case SQFILE_EMBL: return "EMBL";
1774 case SQFILE_GCG: return "GCG";
1775 case SQFILE_GCGDATA: return "GCG data library";
1776 case SQFILE_RAW: return "raw";
1777 case SQFILE_IG: return "Intelligenetics";
1778 case SQFILE_STRIDER: return "MacStrider";
1779 case SQFILE_IDRAW: return "Idraw Postscript";
1780 case SQFILE_ZUKER: return "Zuker";
1781 case SQFILE_PIR: return "PIR";
1782 case SQFILE_SQUID: return "SQUID";
1783 case MSAFILE_STOCKHOLM: return "Stockholm";
1784 case MSAFILE_SELEX: return "SELEX";
1785 case MSAFILE_MSF: return "MSF";
1786 case MSAFILE_CLUSTAL: return "Clustal";
1787 case MSAFILE_A2M: return "a2m";
1788 case MSAFILE_PHYLIP: return "Phylip";
1789 case MSAFILE_EPS: return "EPS";
1791 Die("Bad code passed to MSAFormat2String()");
1798 /* Function: MSAToSqinfo()
1799 * Date: SRE, Tue Jul 20 14:36:56 1999 [St. Louis]
1801 * Purpose: Take an MSA and generate a SQINFO array suitable
1802 * for use in annotating the unaligned sequences.
1805 * Permanent temporary code. sqinfo was poorly designed.
1806 * it must eventually be replaced, but the odds
1807 * of this happening soon are nil, so I have to deal.
1809 * Args: msa - the alignment
1811 * Returns: ptr to allocated sqinfo array.
1812 * Freeing is ghastly: free in each individual sqinfo[i]
1813 * with FreeSequence(NULL, &(sqinfo[i])), then
1817 MSAToSqinfo(MSA *msa)
1822 sqinfo = MallocOrDie(sizeof(SQINFO) * msa->nseq);
1824 for (idx = 0; idx < msa->nseq; idx++)
1826 sqinfo[idx].flags = 0;
1827 SetSeqinfoString(&(sqinfo[idx]),
1828 msa->sqname[idx], SQINFO_NAME);
1829 SetSeqinfoString(&(sqinfo[idx]),
1830 MSAGetSeqAccession(msa, idx), SQINFO_ACC);
1831 SetSeqinfoString(&(sqinfo[idx]),
1832 MSAGetSeqDescription(msa, idx), SQINFO_DESC);
1834 if (msa->ss != NULL && msa->ss[idx] != NULL) {
1835 MakeDealignedString(msa->aseq[idx], msa->alen,
1836 msa->ss[idx], &(sqinfo[idx].ss));
1837 sqinfo[idx].flags |= SQINFO_SS;
1840 if (msa->sa != NULL && msa->sa[idx] != NULL) {
1841 MakeDealignedString(msa->aseq[idx], msa->alen,
1842 msa->sa[idx], &(sqinfo[idx].sa));
1843 sqinfo[idx].flags |= SQINFO_SA;
1846 sqinfo[idx].len = DealignedLength(msa->aseq[idx]);
1847 sqinfo[idx].flags |= SQINFO_LEN;
1854 /* cc -o sqio_test -DA_QUIET_DAY -L. sqio.c -lsquid */
1858 main(int argc, char **argv)
1870 buf = malloc(sizeof(char) * 256);
1871 if ((fp = fopen(filename, "r")) == NULL)
1872 Die("open of %s failed", filename);
1873 while (fgets(buf, 255, fp) != NULL)
1877 } else if (mode == 2) {
1878 if ((fp = fopen(filename, "r")) == NULL)
1879 Die("open of %s failed", filename);
1880 buf = NULL; len = 0;
1881 while (sre_fgets(&buf, &len, fp) != NULL)
1882 SSIGetFilePosition(fp, SSI_OFFSET_I32, &off);
1885 } else if (mode == 3) {
1889 if ((dbfp = SeqfileOpen(filename, SQFILE_FASTA, NULL)) == NULL)
1890 Die("open of %s failed", filename);
1891 while (ReadSeq(dbfp, dbfp->format, &buf, &info)) {
1892 SSIGetFilePosition(dbfp->f, SSI_OFFSET_I32, &off);
1893 FreeSequence(buf, &info);