--- /dev/null
+/* File: ureadseq.c
+ *
+ * Reads and writes nucleic/protein sequence in various
+ * formats. Data files may have multiple sequences.
+ *
+ * Copyright 1990 by d.g.gilbert
+ * biology dept., indiana university, bloomington, in 47405
+ * e-mail: gilbertd@bio.indiana.edu
+ *
+ * This program may be freely copied and used by anyone.
+ * Developers are encourged to incorporate parts in their
+ * programs, rather than devise their own private sequence
+ * format.
+ *
+ * This should compile and run with any ANSI C compiler.
+ *
+ * Alexander Sherstnev, 18/11/2013
+ * renamed getline to locgetline
+ *
+ */
+
+
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+
+#define UREADSEQ_G
+#include "ureadseq.h"
+
+#pragma segment ureadseq
+
+
+int Strcasecmp(const char *a, const char *b) /* from Nlm_StrICmp */
+{
+ int diff, done;
+ if (a == b) return 0;
+ done = 0;
+ while (! done) {
+ diff = to_upper(*a) - to_upper(*b);
+ if (diff) return diff;
+ if (*a == '\0') done = 1;
+ else { a++; b++; }
+ }
+ return 0;
+}
+
+int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */
+{
+ int diff, done;
+ if (a == b) return 0;
+ done = 0;
+ while (! done) {
+ diff = to_upper(*a) - to_upper(*b);
+ if (diff) return diff;
+ if (*a == '\0') done = 1;
+ else {
+ a++; b++; maxn--;
+ if (! maxn) done = 1;
+ }
+ }
+ return 0;
+}
+
+
+
+
+
+#ifndef Local
+# define Local static /* local functions */
+#endif
+
+#define kStartLength 500
+
+const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*";
+const char *primenuc = "ACGTU";
+const char *protonly = "EFIPQZ";
+
+const char kNocountsymbols[5] = "_.-?";
+const char stdsymbols[6] = "_.-*?";
+const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\";
+static const char *seqsymbols = allsymbols;
+
+const char nummask[11] = "0123456789";
+const char nonummask[11] = "~!@#$%^&*(";
+
+/*
+ use general form of isseqchar -- all chars + symbols.
+ no formats except nbrf (?) use symbols in data area as
+ anything other than sequence chars.
+*/
+
+
+
+ /* Local variables for readSeq: */
+struct ReadSeqVars {
+ short choice, err, nseq;
+ long seqlen, maxseq, seqlencount;
+ short topnseq;
+ long topseqlen;
+ const char *fname;
+ char *seq, *seqid, matchchar;
+ boolean allDone, done, filestart, addit;
+ FILE *f;
+ long linestart;
+ char s[256], *sp;
+
+ int (*isseqchar)();
+ /* int (*isseqchar)(int c); << sgi cc hates (int c) */
+};
+
+
+
+int isSeqChar(int c)
+{
+ return (isalpha(c) || strchr(seqsymbols,c));
+}
+
+int isSeqNumChar(int c)
+{
+ return (isalnum(c) || strchr(seqsymbols,c));
+}
+
+
+int isAnyChar(int c)
+{
+ return isascii(c); /* wrap in case isascii is macro */
+}
+
+Local void readline(FILE *f, char *s, long *linestart)
+{
+ char *cp;
+
+ *linestart= ftell(f);
+ if (NULL == fgets(s, 256, f))
+ *s = 0;
+ else {
+ cp = strchr(s, '\n');
+ if (cp != NULL) *cp = 0;
+ }
+}
+
+Local void locgetline(struct ReadSeqVars *V)
+{
+ readline(V->f, V->s, &V->linestart);
+}
+
+Local void ungetline(struct ReadSeqVars *V)
+{
+ fseek(V->f, V->linestart, 0);
+}
+
+
+Local void addseq(char *s, struct ReadSeqVars *V)
+{
+ char *ptr;
+
+ if (V->addit) while (*s != 0) {
+ if ((V->isseqchar)(*s)) {
+ if (V->seqlen >= V->maxseq) {
+ V->maxseq += kStartLength;
+ ptr = (char*) realloc(V->seq, V->maxseq+1);
+ if (ptr==NULL) {
+ V->err = eMemFull;
+ return;
+ }
+ else V->seq = ptr;
+ }
+ V->seq[(V->seqlen)++] = *s;
+ }
+ s++;
+ }
+}
+
+Local void countseq(char *s, struct ReadSeqVars *V)
+ /* this must count all valid seq chars, for some formats (paup-sequential) even
+ if we are skipping seq... */
+{
+ while (*s != 0) {
+ if ((V->isseqchar)(*s)) {
+ (V->seqlencount)++;
+ }
+ s++;
+ }
+}
+
+
+Local void addinfo(char *s, struct ReadSeqVars *V)
+{
+ char s2[256], *si;
+ boolean saveadd;
+
+ si = s2;
+ while (*s == ' ') s++;
+ sprintf(si, " %d) %s\n", V->nseq, s);
+
+ saveadd = V->addit;
+ V->addit = true;
+ V->isseqchar = isAnyChar;
+ addseq( si, V);
+ V->addit = saveadd;
+ V->isseqchar = isSeqChar;
+}
+
+
+
+
+Local void readLoop(short margin, boolean addfirst,
+ boolean (*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V),
+ struct ReadSeqVars *V)
+{
+ boolean addend = false;
+ boolean ungetend = false;
+
+ V->nseq++;
+ if (V->choice == kListSequences) V->addit = false;
+ else V->addit = (V->nseq == V->choice);
+ if (V->addit) V->seqlen = 0;
+
+ if (addfirst) addseq(V->s, V);
+ do {
+ locgetline(V);
+ V->done = feof(V->f);
+ V->done |= (*endTest)( &addend, &ungetend, V);
+ if (V->addit && (addend || !V->done) && (strlen(V->s) > margin)) {
+ addseq( (V->s)+margin, V);
+ }
+ } while (!V->done);
+
+ if (V->choice == kListSequences) addinfo(V->seqid, V);
+ else {
+ V->allDone = (V->nseq >= V->choice);
+ if (V->allDone && ungetend) ungetline(V);
+ }
+}
+
+
+
+Local boolean endIG( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
+{
+ *addend = true; /* 1 or 2 occur in line w/ bases */
+ *ungetend= false;
+ return((strchr(V->s,'1')!=NULL) || (strchr(V->s,'2')!=NULL));
+}
+
+Local void readIG(struct ReadSeqVars *V)
+{
+/* 18Aug92: new IG format -- ^L between sequences in place of ";" */
+ char *si;
+
+ while (!V->allDone) {
+ do {
+ locgetline(V);
+ for (si= V->s; *si != 0 && *si < ' '; si++) *si= ' '; /* drop controls */
+ if (*si == 0) *V->s= 0; /* chop line to empty */
+ } while (! (feof(V->f) || ((*V->s != 0) && (*V->s != ';') ) ));
+ if (feof(V->f))
+ V->allDone = true;
+ else {
+ strcpy(V->seqid, V->s);
+ readLoop(0, false, endIG, V);
+ }
+ }
+}
+
+
+
+Local boolean endStrider( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
+{
+ *addend = false;
+ *ungetend= false;
+ return (strstr( V->s, "//") != NULL);
+}
+
+Local void readStrider(struct ReadSeqVars *V)
+{ /* ? only 1 seq/file ? */
+
+ while (!V->allDone) {
+ locgetline(V);
+ if (strstr(V->s,"; DNA sequence ") == V->s)
+ strcpy(V->seqid, (V->s)+16);
+ else
+ strcpy(V->seqid, (V->s)+1);
+ while ((!feof(V->f)) && (*V->s == ';')) {
+ locgetline(V);
+ }
+ if (feof(V->f)) V->allDone = true;
+ else readLoop(0, true, endStrider, V);
+ }
+}
+
+
+Local boolean endPIR( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
+{
+ *addend = false;
+ *ungetend= (strstr(V->s,"ENTRY") == V->s);
+ return ((strstr(V->s,"///") != NULL) || *ungetend);
+}
+
+Local void readPIR(struct ReadSeqVars *V)
+{ /*PIR -- many seqs/file */
+
+ while (!V->allDone) {
+ while (! (feof(V->f) || strstr(V->s,"ENTRY") || strstr(V->s,"SEQUENCE")) )
+ locgetline(V);
+ strcpy(V->seqid, (V->s)+16);
+ while (! (feof(V->f) || strstr(V->s,"SEQUENCE") == V->s))
+ locgetline(V);
+ readLoop(0, false, endPIR, V);
+
+ if (!V->allDone) {
+ while (! (feof(V->f) || ((*V->s != 0)
+ && (strstr( V->s,"ENTRY") == V->s))))
+ locgetline(V);
+ }
+ if (feof(V->f)) V->allDone = true;
+ }
+}
+
+
+Local boolean endGB( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
+{
+ *addend = false;
+ *ungetend= (strstr(V->s,"LOCUS") == V->s);
+ return ((strstr(V->s,"//") != NULL) || *ungetend);
+}
+
+Local void readGenBank(struct ReadSeqVars *V)
+{ /*GenBank -- many seqs/file */
+
+ while (!V->allDone) {
+ strcpy(V->seqid, (V->s)+12);
+ while (! (feof(V->f) || strstr(V->s,"ORIGIN") == V->s))
+ locgetline(V);
+ readLoop(0, false, endGB, V);
+
+ if (!V->allDone) {
+ while (! (feof(V->f) || ((*V->s != 0)
+ && (strstr( V->s,"LOCUS") == V->s))))
+ locgetline(V);
+ }
+ if (feof(V->f)) V->allDone = true;
+ }
+}
+
+
+Local boolean endNBRF( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
+{
+ char *a;
+
+ if ((a = strchr(V->s, '*')) != NULL) { /* end of 1st seq */
+ /* "*" can be valid base symbol, drop it here */
+ *a = 0;
+ *addend = true;
+ *ungetend= false;
+ return(true);
+ }
+ else if (*V->s == '>') { /* start of next seq */
+ *addend = false;
+ *ungetend= true;
+ return(true);
+ }
+ else
+ return(false);
+}
+
+Local void readNBRF(struct ReadSeqVars *V)
+{
+ while (!V->allDone) {
+ strcpy(V->seqid, (V->s)+4);
+ locgetline(V); /*skip title-junk line*/
+ readLoop(0, false, endNBRF, V);
+ if (!V->allDone) {
+ while (!(feof(V->f) || (*V->s != 0 && *V->s == '>')))
+ locgetline(V);
+ }
+ if (feof(V->f)) V->allDone = true;
+ }
+}
+
+
+
+Local boolean endPearson( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
+{
+ *addend = false;
+ *ungetend= true;
+ return(*V->s == '>');
+}
+
+Local void readPearson(struct ReadSeqVars *V)
+{
+ while (!V->allDone) {
+ strcpy(V->seqid, (V->s)+1);
+ readLoop(0, false, endPearson, V);
+ if (!V->allDone) {
+ while (!(feof(V->f) || ((*V->s != 0) && (*V->s == '>'))))
+ locgetline(V);
+ }
+ if (feof(V->f)) V->allDone = true;
+ }
+}
+
+
+
+Local boolean endEMBL( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
+{
+ *addend = false;
+ *ungetend= (strstr(V->s,"ID ") == V->s);
+ return ((strstr(V->s,"//") != NULL) || *ungetend);
+}
+
+Local void readEMBL(struct ReadSeqVars *V)
+{
+ while (!V->allDone) {
+ strcpy(V->seqid, (V->s)+5);
+ do {
+ locgetline(V);
+ } while (!(feof(V->f) | (strstr(V->s,"SQ ") == V->s)));
+
+ readLoop(0, false, endEMBL, V);
+ if (!V->allDone) {
+ while (!(feof(V->f) |
+ ((*V->s != '\0') & (strstr(V->s,"ID ") == V->s))))
+ locgetline(V);
+ }
+ if (feof(V->f)) V->allDone = true;
+ }
+}
+
+
+
+Local boolean endZuker( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
+{
+ *addend = false;
+ *ungetend= true;
+ return( *V->s == '(' );
+}
+
+Local void readZuker(struct ReadSeqVars *V)
+{
+ /*! 1st string is Zuker's Fortran format */
+
+ while (!V->allDone) {
+ locgetline(V); /*s == "seqLen seqid string..."*/
+ strcpy(V->seqid, (V->s)+6);
+ readLoop(0, false, endZuker, V);
+ if (!V->allDone) {
+ while (!(feof(V->f) |
+ ((*V->s != '\0') & (*V->s == '('))))
+ locgetline(V);
+ }
+ if (feof(V->f)) V->allDone = true;
+ }
+}
+
+
+
+Local boolean endFitch( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
+{
+ /* this is a somewhat shaky end,
+ 1st char of line is non-blank for seq. title
+ */
+ *addend = false;
+ *ungetend= true;
+ return( *V->s != ' ' );
+}
+
+Local void readFitch(struct ReadSeqVars *V)
+{
+ boolean first;
+
+ first = true;
+ while (!V->allDone) {
+ if (!first) strcpy(V->seqid, V->s);
+ readLoop(0, first, endFitch, V);
+ if (feof(V->f)) V->allDone = true;
+ first = false;
+ }
+}
+
+
+Local void readPlain(struct ReadSeqVars *V)
+{
+ V->nseq++;
+ V->addit = (V->choice > 0);
+ if (V->addit) V->seqlen = 0;
+ addseq(V->seqid, V); /*from above..*/
+ if (V->fname!=NULL) sprintf(V->seqid, "%s [Unknown form]", V->fname);
+ else sprintf(V->seqid, " [Unknown form]");
+ do {
+ addseq(V->s, V);
+ V->done = feof(V->f);
+ locgetline(V);
+ } while (!V->done);
+ if (V->choice == kListSequences) addinfo(V->seqid, V);
+ V->allDone = true;
+}
+
+
+Local void readUWGCG(struct ReadSeqVars *V)
+{
+/*
+10nov91: Reading GCG files casued duplication of last line when
+ EOF followed that line !!!
+ fix: locgetline now sets *V->s = 0
+*/
+ char *si;
+
+ V->nseq++;
+ V->addit = (V->choice > 0);
+ if (V->addit) V->seqlen = 0;
+ strcpy(V->seqid, V->s);
+ /*writeseq: " %s Length: %d (today) Check: %d ..\n" */
+ /*drop above or ".." from id*/
+ if (si = strstr(V->seqid," Length: ")) *si = 0;
+ else if (si = strstr(V->seqid,"..")) *si = 0;
+ do {
+ V->done = feof(V->f);
+ locgetline(V);
+ if (!V->done) addseq((V->s), V);
+ } while (!V->done);
+ if (V->choice == kListSequences) addinfo(V->seqid, V);
+ V->allDone = true;
+}
+
+
+Local void readOlsen(struct ReadSeqVars *V)
+{ /* G. Olsen /print output from multiple sequence editor */
+
+ char *si, *sj, *sk, *sm, sid[40], snum[20];
+ boolean indata = false;
+ int snumlen;
+
+ V->addit = (V->choice > 0);
+ if (V->addit) V->seqlen = 0;
+ rewind(V->f); V->nseq= 0;
+ do {
+ locgetline(V);
+ V->done = feof(V->f);
+
+ if (V->done && !(*V->s)) break;
+ else if (indata) {
+ if ( (si= strstr(V->s, sid))
+ /* && (strstr(V->s, snum) == si - snumlen - 1) ) { */
+ && (sm= strstr(V->s, snum)) && (sm < si - snumlen) ) {
+
+ /* Spaces are valid alignment data !! */
+/* 17Oct91: Error, the left margin is 21 not 22! */
+/* dropped some nucs up to now -- my example file was right shifted ! */
+/* variable right id margin, drop id-2 spaces at end */
+/*
+ VMS CC COMPILER (VAXC031) mess up:
+ -- Index of 21 is chopping 1st nuc on VMS systems Only!
+ Byte-for-byte same ame rnasep.olsen sequence file !
+*/
+
+ /* si = (V->s)+21; < was this before VMS CC wasted my time */
+ si += 10; /* use strstr index plus offset to outfox VMS CC bug */
+
+ if (sk = strstr(si, sid)) *(sk-2) = 0;
+ for (sk = si; *sk != 0; sk++) {
+ if (*sk == ' ') *sk = '.';
+ /* 18aug92: !! some olsen masks are NUMBERS !! which addseq eats */
+ else if (isdigit(*sk)) *sk= nonummask[*sk - '0'];
+ }
+
+ addseq(si, V);
+ }
+ }
+
+ else if (sk = strstr(V->s, "): ")) { /* seq info header line */
+ /* 18aug92: correct for diff seqs w/ same name -- use number, e.g. */
+ /* 3 (Agr.tume): agrobacterium.prna 18-JUN-1987 16:12 */
+ /* 328 (Agr.tume): agrobacterium.prna XYZ 19-DEC-1992 */
+ (V->nseq)++;
+ si = 1 + strchr(V->s,'(');
+ *sk = ' ';
+ if (V->choice == kListSequences) addinfo( si, V);
+ else if (V->nseq == V->choice) {
+ strcpy(V->seqid, si);
+ sj = strchr(V->seqid, ':');
+ while (*(--sj) == ' ') ;
+ while (--sj != V->seqid) { if (*sj == ' ') *sj = '_'; }
+
+ *sk = 0;
+ while (*(--sk) == ' ') *sk = 0;
+ strcpy(sid, si);
+
+ si= V->s;
+ while ((*si <= ' ') && (*si != 0)) si++;
+ snumlen=0;
+ while (si[snumlen] > ' ' && snumlen<20)
+ { snum[snumlen]= si[snumlen]; snumlen++; }
+ snum[snumlen]= 0;
+ }
+
+ }
+
+ else if (strstr(V->s,"identity: Data:")) {
+ indata = true;
+ if (V->choice == kListSequences) V->done = true;
+ }
+
+ } while (!V->done);
+
+ V->allDone = true;
+} /*readOlsen*/
+
+
+Local void readMSF(struct ReadSeqVars *V)
+{ /* gcg's MSF, mult. sequence format, interleaved ! */
+
+ char *si, *sj, sid[128];
+ boolean indata = false;
+ int atseq= 0, iline= 0;
+
+ V->addit = (V->choice > 0);
+ if (V->addit) V->seqlen = 0;
+ rewind(V->f); V->nseq= 0;
+ do {
+ locgetline(V);
+ V->done = feof(V->f);
+
+ if (V->done && !(*V->s)) break;
+ else if (indata) {
+ /*somename ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet */
+ /* E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs */
+
+ si= V->s;
+ skipwhitespace(si);
+ /* for (sj= si; isalnum(*sj); sj++) ; bug -- cdelwiche uses "-", "_" and others in names*/
+ for (sj= si; *sj > ' '; sj++) ;
+ *sj= 0;
+ if ( *si ) {
+ if ( (0==strcmp(si, sid)) ) {
+ addseq(sj+1, V);
+ }
+ iline++;
+ }
+ }
+
+ else if (NULL != (si = strstr(V->s, "Name: "))) { /* seq info header line */
+ /* Name: somename Len: 100 Check: 7009 Weight: 1.00 */
+
+ (V->nseq)++;
+ si += 6;
+ if (V->choice == kListSequences) addinfo( si, V);
+ else if (V->nseq == V->choice) {
+ strcpy(V->seqid, si);
+ si = V->seqid;
+ skipwhitespace(si);
+ /* for (sj= si; isalnum(*sj); sj++) ; -- bug */
+ for (sj= si; *sj > ' '; sj++) ;
+ *sj= 0;
+ strcpy(sid, si);
+ }
+ }
+
+ else if ( strstr(V->s,"//") /*== V->s*/ ) {
+ indata = true;
+ iline= 0;
+ if (V->choice == kListSequences) V->done = true;
+ }
+
+ } while (!V->done);
+
+
+ V->allDone = true;
+} /*readMSF*/
+
+
+
+Local void readPAUPinterleaved(struct ReadSeqVars *V)
+{ /* PAUP mult. sequence format, interleaved or sequential! */
+
+ char *si, *sj, *send, sid[40], sid1[40], saveseq[255];
+ boolean first = true, indata = false, domatch;
+ int atseq= 0, iline= 0, ifmc, saveseqlen=0;
+
+#define fixmatchchar(s) { \
+ for (ifmc=0; ifmc<saveseqlen; ifmc++) \
+ if (s[ifmc] == V->matchchar) s[ifmc]= saveseq[ifmc]; }
+
+ V->addit = (V->choice > 0);
+ V->seqlencount = 0;
+ if (V->addit) V->seqlen = 0;
+ /* rewind(V->f); V->nseq= 0; << do in caller !*/
+ indata= true; /* call here after we find "matrix" */
+ domatch= (V->matchchar > 0);
+
+ do {
+ locgetline(V);
+ V->done = feof(V->f);
+
+ if (V->done && !(*V->s)) break;
+ else if (indata) {
+ /* [ 1 1 1 ]*/
+ /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
+ /* chimp ................a.t. .c.................a ..........*/
+ /* !! need to correct for V->matchchar */
+ si= V->s;
+ skipwhitespace(si);
+ if (strchr(si,';')) indata= false;
+
+ if (isalnum(*si)) {
+ /* valid data line starts w/ a left-justified seq name in columns [0..8] */
+ if (first) {
+ (V->nseq)++;
+ if (V->nseq >= V->topnseq) first= false;
+ for (sj = si; isalnum(*sj); sj++) ;
+ send= sj;
+ skipwhitespace(sj);
+ if (V->choice == kListSequences) {
+ *send= 0;
+ addinfo( si, V);
+ }
+ else if (V->nseq == V->choice) {
+ if (domatch) {
+ if (V->nseq == 1) { strcpy( saveseq, sj); saveseqlen= strlen(saveseq); }
+ else fixmatchchar( sj);
+ }
+ addseq(sj, V);
+ *send= 0;
+ strcpy(V->seqid, si);
+ strcpy(sid, si);
+ if (V->nseq == 1) strcpy(sid1, sid);
+ }
+ }
+
+ else if ( (strstr(si, sid) == si) ){
+ while (isalnum(*si)) si++;
+ skipwhitespace(si);
+ if (domatch) {
+ if (V->nseq == 1) { strcpy( saveseq, si); saveseqlen= strlen(saveseq); }
+ else fixmatchchar( si);
+ }
+ addseq(si, V);
+ }
+
+ else if (domatch && (strstr(si, sid1) == si)) {
+ strcpy( saveseq, si);
+ saveseqlen= strlen(saveseq);
+ }
+
+ iline++;
+ }
+ }
+
+ else if ( strstr(V->s,"matrix") ) {
+ indata = true;
+ iline= 0;
+ if (V->choice == kListSequences) V->done = true;
+ }
+
+ } while (!V->done);
+
+ V->allDone = true;
+} /*readPAUPinterleaved*/
+
+
+
+Local void readPAUPsequential(struct ReadSeqVars *V)
+{ /* PAUP mult. sequence format, interleaved or sequential! */
+ char *si, *sj;
+ boolean atname = true, indata = false;
+
+ V->addit = (V->choice > 0);
+ if (V->addit) V->seqlen = 0;
+ V->seqlencount = 0;
+ /* rewind(V->f); V->nseq= 0; << do in caller !*/
+ indata= true; /* call here after we find "matrix" */
+ do {
+ locgetline(V);
+ V->done = feof(V->f);
+
+ if (V->done && !(*V->s)) break;
+ else if (indata) {
+ /* [ 1 1 1 ]*/
+ /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
+ /* aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
+ /* chimp ................a.t. .c.................a ..........*/
+ /* ................a.t. .c.................a ..........*/
+
+ si= V->s;
+ skipwhitespace(si);
+ if (strchr(si,';')) indata= false;
+ if (isalnum(*si)) {
+ /* valid data line starts w/ a left-justified seq name in columns [0..8] */
+ if (atname) {
+ (V->nseq)++;
+ V->seqlencount = 0;
+ atname= false;
+ sj= si+1;
+ while (isalnum(*sj)) sj++;
+ if (V->choice == kListSequences) {
+ /* !! we must count bases to know when topseqlen is reached ! */
+ countseq(sj, V);
+ if (V->seqlencount >= V->topseqlen) atname= true;
+ *sj= 0;
+ addinfo( si, V);
+ }
+ else if (V->nseq == V->choice) {
+ addseq(sj, V);
+ V->seqlencount= V->seqlen;
+ if (V->seqlencount >= V->topseqlen) atname= true;
+ *sj= 0;
+ strcpy(V->seqid, si);
+ }
+ else {
+ countseq(sj, V);
+ if (V->seqlencount >= V->topseqlen) atname= true;
+ }
+ }
+
+ else if (V->nseq == V->choice) {
+ addseq(V->s, V);
+ V->seqlencount= V->seqlen;
+ if (V->seqlencount >= V->topseqlen) atname= true;
+ }
+ else {
+ countseq(V->s, V);
+ if (V->seqlencount >= V->topseqlen) atname= true;
+ }
+ }
+ }
+
+ else if ( strstr(V->s,"matrix") ) {
+ indata = true;
+ atname= true;
+ if (V->choice == kListSequences) V->done = true;
+ }
+
+ } while (!V->done);
+
+ V->allDone = true;
+} /*readPAUPsequential*/
+
+
+Local void readPhylipInterleaved(struct ReadSeqVars *V)
+{
+ char *si, *sj;
+ boolean first = true;
+ int iline= 0;
+
+ V->addit = (V->choice > 0);
+ if (V->addit) V->seqlen = 0;
+ V->seqlencount = 0;
+ /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); << topnseq == 0 !!! bad scan !! */
+ si= V->s;
+ skipwhitespace(si);
+ V->topnseq= atoi(si);
+ while (isdigit(*si)) si++;
+ skipwhitespace(si);
+ V->topseqlen= atol(si);
+ /* fprintf(stderr,"Phylip-ileaf: topnseq=%d topseqlen=%d\n",V->topnseq, V->topseqlen); */
+
+ do {
+ locgetline(V);
+ V->done = feof(V->f);
+
+ if (V->done && !(*V->s)) break;
+ si= V->s;
+ skipwhitespace(si);
+ if (*si != 0) {
+
+ if (first) { /* collect seq names + seq, as fprintf(outf,"%-10s ",seqname); */
+ (V->nseq)++;
+ if (V->nseq >= V->topnseq) first= false;
+ sj= V->s+10; /* past name, start of data */
+ if (V->choice == kListSequences) {
+ *sj= 0;
+ addinfo( si, V);
+ }
+ else if (V->nseq == V->choice) {
+ addseq(sj, V);
+ *sj= 0;
+ strcpy(V->seqid, si);
+ }
+ }
+ else if ( iline % V->nseq == V->choice -1 ) {
+ addseq(si, V);
+ }
+ iline++;
+ }
+ } while (!V->done);
+
+ V->allDone = true;
+} /*readPhylipInterleaved*/
+
+
+
+Local boolean endPhylipSequential( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
+{
+ *addend = false;
+ *ungetend= false;
+ countseq( V->s, V);
+ return V->seqlencount >= V->topseqlen;
+}
+
+Local void readPhylipSequential(struct ReadSeqVars *V)
+{
+ short i;
+ char *si;
+ /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); < ? bad sscan ? */
+ si= V->s;
+ skipwhitespace(si);
+ V->topnseq= atoi(si);
+ while (isdigit(*si)) si++;
+ skipwhitespace(si);
+ V->topseqlen= atol(si);
+ locgetline(V);
+ while (!V->allDone) {
+ V->seqlencount= 0;
+ strncpy(V->seqid, (V->s), 10);
+ V->seqid[10]= 0;
+ for (i=0; i<10 && V->s[i]; i++) V->s[i]= ' ';
+ readLoop(0, true, endPhylipSequential, V);
+ if (feof(V->f)) V->allDone = true;
+ }
+}
+
+
+
+
+Local void readSeqMain(
+ struct ReadSeqVars *V,
+ const long skiplines_,
+ const short format_)
+{
+#define tolowerstr(s) { long Itlwr, Ntlwr= strlen(s); \
+ for (Itlwr=0; Itlwr<Ntlwr; Itlwr++) s[Itlwr]= to_lower(s[Itlwr]); }
+
+ boolean gotuw;
+ long l;
+
+ V->linestart= 0;
+ V->matchchar= 0;
+ if (V->f == NULL)
+ V->err = eFileNotFound;
+ else {
+
+ for (l = skiplines_; l > 0; l--) locgetline( V);
+
+ do {
+ locgetline( V);
+ for (l= strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) ;
+ } while ((l == 0) && !feof(V->f));
+
+ if (feof(V->f)) V->err = eNoData;
+
+ else switch (format_) {
+ case kPlain : readPlain(V); break;
+ case kIG : readIG(V); break;
+ case kStrider: readStrider(V); break;
+ case kGenBank: readGenBank(V); break;
+ case kPIR : readPIR(V); break;
+ case kNBRF : readNBRF(V); break;
+ case kPearson: readPearson(V); break;
+ case kEMBL : readEMBL(V); break;
+ case kZuker : readZuker(V); break;
+ case kOlsen : readOlsen(V); break;
+ case kMSF : readMSF(V); break;
+
+ case kPAUP : {
+ boolean done= false;
+ boolean interleaved= false;
+ char *cp;
+ /* rewind(V->f); V->nseq= 0; ?? assume it is at top ?? skiplines ... */
+ while (!done) {
+ locgetline( V);
+ tolowerstr( V->s);
+ if (strstr( V->s, "matrix")) done= true;
+ if (strstr( V->s, "interleav")) interleaved= true;
+ if (NULL != (cp=strstr( V->s, "ntax=")) ) V->topnseq= atoi(cp+5);
+ if (NULL != (cp=strstr( V->s, "nchar=")) ) V->topseqlen= atoi(cp+6);
+ if (NULL != (cp=strstr( V->s, "matchchar=")) ) {
+ cp += 10;
+ if (*cp=='\'') cp++;
+ else if (*cp=='"') cp++;
+ V->matchchar= *cp;
+ }
+ }
+ if (interleaved) readPAUPinterleaved(V);
+ else readPAUPsequential(V);
+ }
+ break;
+
+ /* kPhylip: ! can't determine in middle of file which type it is...*/
+ /* test for interleave or sequential and use Phylip4(ileave) or Phylip2 */
+ case kPhylip2:
+ readPhylipSequential(V);
+ break;
+ case kPhylip4: /* == kPhylip3 */
+ readPhylipInterleaved(V);
+ break;
+
+ default:
+ V->err = eUnknownFormat;
+ break;
+
+ case kFitch :
+ strcpy(V->seqid, V->s); locgetline(V);
+ readFitch(V);
+ break;
+
+ case kGCG:
+ do {
+ gotuw = (strstr(V->s,"..") != NULL);
+ if (gotuw) readUWGCG(V);
+ locgetline(V);
+ } while (!(feof(V->f) || V->allDone));
+ break;
+ }
+ }
+
+ V->filestart= false;
+ V->seq[V->seqlen] = 0; /* stick a string terminator on it */
+}
+
+
+char *readSeqFp(
+ const short whichEntry_, /* index to sequence in file */
+ FILE *fp_, /* pointer to open seq file */
+ const long skiplines_,
+ const short format_, /* sequence file format */
+ long *seqlen_, /* return seq size */
+ short *nseq_, /* number of seqs in file, for listSeqs() */
+ short *error_, /* return error */
+ char *seqid_) /* return seq name/info */
+{
+ struct ReadSeqVars V;
+
+ if (format_ < kMinFormat || format_ > kMaxFormat) {
+ *error_ = eUnknownFormat;
+ *seqlen_ = 0;
+ return NULL;
+ }
+
+ V.choice = whichEntry_;
+ V.fname = NULL; /* don't know */
+ V.seq = (char*) calloc(1, kStartLength+1);
+ V.maxseq = kStartLength;
+ V.seqlen = 0;
+ V.seqid = seqid_;
+
+ V.f = fp_;
+ V.filestart= (ftell( fp_) == 0);
+ /* !! in sequential read, must remove current seq position from choice/whichEntry_ counter !! ... */
+ if (V.filestart) V.nseq = 0;
+ else V.nseq= *nseq_; /* track where we are in file...*/
+
+ *V.seqid = '\0';
+ V.err = 0;
+ V.nseq = 0;
+ V.isseqchar = isSeqChar;
+ if (V.choice == kListSequences) ; /* leave as is */
+ else if (V.choice <= 0) V.choice = 1; /* default ?? */
+ V.addit = (V.choice > 0);
+ V.allDone = false;
+
+ readSeqMain(&V, skiplines_, format_);
+
+ *error_ = V.err;
+ *seqlen_ = V.seqlen;
+ *nseq_ = V.nseq;
+ return V.seq;
+}
+
+char *readSeq(
+ const short whichEntry_, /* index to sequence in file */
+ const char *filename_, /* file name */
+ const long skiplines_,
+ const short format_, /* sequence file format */
+ long *seqlen_, /* return seq size */
+ short *nseq_, /* number of seqs in file, for listSeqs() */
+ short *error_, /* return error */
+ char *seqid_) /* return seq name/info */
+{
+ struct ReadSeqVars V;
+
+ if (format_ < kMinFormat || format_ > kMaxFormat) {
+ *error_ = eUnknownFormat;
+ *seqlen_ = 0;
+ return NULL;
+ }
+
+ V.choice = whichEntry_;
+ V.fname = filename_; /* don't need to copy string, just ptr to it */
+ V.seq = (char*) calloc(1, kStartLength+1);
+ V.maxseq = kStartLength;
+ V.seqlen = 0;
+ V.seqid = seqid_;
+
+ V.f = NULL;
+ *V.seqid = '\0';
+ V.err = 0;
+ V.nseq = 0;
+ V.isseqchar = isSeqChar;
+ if (V.choice == kListSequences) ; /* leave as is */
+ else if (V.choice <= 0) V.choice = 1; /* default ?? */
+ V.addit = (V.choice > 0);
+ V.allDone = false;
+
+ V.f = fopen(V.fname, "r");
+ V.filestart= true;
+
+ readSeqMain(&V, skiplines_, format_);
+
+ if (V.f != NULL) fclose(V.f);
+ *error_ = V.err;
+ *seqlen_ = V.seqlen;
+ *nseq_ = V.nseq;
+ return V.seq;
+}
+
+
+
+
+
+char *listSeqs(
+ const char *filename_, /* file name */
+ const long skiplines_,
+ const short format_, /* sequence file format */
+ short *nseq_, /* number of seqs in file, for listSeqs() */
+ short *error_) /* return error */
+{
+ char seqid[256];
+ long seqlen;
+
+ return readSeq( kListSequences, filename_, skiplines_, format_,
+ &seqlen, nseq_, error_, seqid);
+}
+
+
+
+
+short seqFileFormat( /* return sequence format number, see ureadseq.h */
+ const char *filename,
+ long *skiplines, /* return #lines to skip any junk like mail header */
+ short *error) /* return any error value or 0 */
+{
+ FILE *fseq;
+ short format;
+
+ fseq = fopen(filename, "r");
+ format= seqFileFormatFp( fseq, skiplines, error);
+ if (fseq!=NULL) fclose(fseq);
+ return format;
+}
+
+short seqFileFormatFp(
+ FILE *fseq,
+ long *skiplines, /* return #lines to skip any junk like mail header */
+ short *error) /* return any error value or 0 */
+{
+ boolean foundDNA= false, foundIG= false, foundStrider= false,
+ foundGB= false, foundPIR= false, foundEMBL= false, foundNBRF= false,
+ foundPearson= false, foundFitch= false, foundPhylip= false, foundZuker= false,
+ gotolsen= false, gotpaup = false, gotasn1 = false, gotuw= false, gotMSF= false,
+ isfitch= false, isphylip= false, done= false;
+ short format= kUnknown;
+ int nlines= 0, k, splen= 0, otherlines= 0, aminolines= 0, dnalines= 0;
+ char sp[256];
+ long linestart=0;
+ int maxlines2check=500;
+
+#define ReadOneLine(sp) \
+ { done |= (feof(fseq)); \
+ readline( fseq, sp, &linestart); \
+ if (!done) { splen = strlen(sp); ++nlines; } }
+
+ *skiplines = 0;
+ *error = 0;
+ if (fseq == NULL) { *error = eFileNotFound; return kNoformat; }
+
+ while ( !done ) {
+ ReadOneLine(sp);
+
+ /* check for mailer head & skip past if found */
+ if (nlines < 4 && !done) {
+ if ((strstr(sp,"From ") == sp) || (strstr(sp,"Received:") == sp)) {
+ do {
+ /* skip all lines until find one blank line */
+ ReadOneLine(sp);
+ if (!done) for (k=0; (k<splen) && (sp[k]==' '); k++) ;
+ } while ((!done) && (k < splen));
+ *skiplines = nlines; /* !? do we want #lines or #bytes ?? */
+ }
+ }
+
+ if (sp==NULL || *sp==0)
+ ; /* nada */
+
+ /* high probability identities: */
+
+ else if ( strstr(sp,"MSF:") && strstr(sp,"Type:") && strstr(sp,"Check:") )
+ gotMSF= true;
+
+ else if ((strstr(sp,"..") != NULL) && (strstr(sp,"Check:") != NULL))
+ gotuw= true;
+
+ else if (strstr(sp,"identity: Data:") != NULL)
+ gotolsen= true;
+
+ else if ( strstr(sp,"::=") &&
+ (strstr(sp,"Bioseq") || /* Bioseq or Bioseq-set */
+ strstr(sp,"Seq-entry") ||
+ strstr(sp,"Seq-submit") ) ) /* can we read submit format? */
+ gotasn1= true;
+
+ else if ( strstr(sp,"#NEXUS") == sp )
+ gotpaup= true;
+
+ /* uncertain identities: */
+
+ else if (*sp ==';') {
+ if (strstr(sp,"Strider") !=NULL) foundStrider= true;
+ else foundIG= true;
+ }
+
+ else if (strstr(sp,"LOCUS") == sp)
+ foundGB= true;
+ else if (strstr(sp,"ORIGIN") == sp)
+ foundGB= true;
+
+ else if (strstr(sp,"ENTRY ") == sp) /* ? also (strcmp(sp,"\\\\\\")==0) */
+ foundPIR= true;
+ else if (strstr(sp,"SEQUENCE") == sp)
+ foundPIR= true;
+
+ else if (*sp == '>') {
+ if (sp[3] == ';') foundNBRF= true;
+ else foundPearson= true;
+ }
+
+ else if (strstr(sp,"ID ") == sp)
+ foundEMBL= true;
+ else if (strstr(sp,"SQ ") == sp)
+ foundEMBL= true;
+
+ else if (*sp == '(')
+ foundZuker= true;
+
+ else {
+ if (nlines - *skiplines == 1) {
+ int ispp= 0, ilen= 0;
+ sscanf( sp, "%d%d", &ispp, &ilen);
+ if (ispp > 0 && ilen > 0) isphylip= true;
+ }
+ else if (isphylip && nlines - *skiplines == 2) {
+ int tseq;
+ tseq= getseqtype(sp+10, strlen(sp+10));
+ if ( isalpha(*sp) /* 1st letter in 2nd line must be of a name */
+ && (tseq != kOtherSeq)) /* sequence section must be okay */
+ foundPhylip= true;
+ }
+
+ for (k=0, isfitch= true; isfitch & (k < splen); k++) {
+ if (k % 4 == 0) isfitch &= (sp[k] == ' ');
+ else isfitch &= (sp[k] != ' ');
+ }
+ if (isfitch & (splen > 20)) foundFitch= true;
+
+ /* kRNA && kDNA are fairly certain...*/
+ switch (getseqtype( sp, splen)) {
+ case kOtherSeq: otherlines++; break;
+ case kAmino : if (splen>20) aminolines++; break;
+ case kDNA :
+ case kRNA : if (splen>20) dnalines++; break;
+ case kNucleic : break; /* not much info ? */
+ }
+
+ }
+
+ /* pretty certain */
+ if (gotolsen) {
+ format= kOlsen;
+ done= true;
+ }
+ else if (gotMSF) {
+ format= kMSF;
+ done= true;
+ }
+ else if (gotasn1) {
+ /* !! we need to look further and return kASNseqentry | kASNseqset */
+ /*
+ seqentry key is Seq-entry ::=
+ seqset key is Bioseq-set ::=
+ ?? can't read these yet w/ ncbi tools ??
+ Seq-submit ::=
+ Bioseq ::= << fails both bioseq-seq and seq-entry parsers !
+ */
+ if (strstr(sp,"Bioseq-set")) format= kASNseqset;
+ else if (strstr(sp,"Seq-entry")) format= kASNseqentry;
+ else format= kASN1; /* other form, we can't yet read... */
+ done= true;
+ }
+ else if (gotpaup) {
+ format= kPAUP;
+ done= true;
+ }
+
+ else if (gotuw) {
+ if (foundIG) format= kIG; /* a TOIG file from GCG for certain */
+ else format= kGCG;
+ done= true;
+ }
+
+ else if ((dnalines > 1) || done || (nlines > maxlines2check)) {
+ /* decide on most likely format */
+ /* multichar idents: */
+ if (foundStrider) format= kStrider;
+ else if (foundGB) format= kGenBank;
+ else if (foundPIR) format= kPIR;
+ else if (foundEMBL) format= kEMBL;
+ else if (foundNBRF) format= kNBRF;
+ /* single char idents: */
+ else if (foundIG) format= kIG;
+ else if (foundPearson) format= kPearson;
+ else if (foundZuker) format= kZuker;
+ /* digit ident: */
+ else if (foundPhylip) format= kPhylip;
+ /* spacing ident: */
+ else if (foundFitch) format= kFitch;
+ /* no format chars: */
+ else if (otherlines > 0) format= kUnknown;
+ else if (dnalines > 1) format= kPlain;
+ else if (aminolines > 1) format= kPlain;
+ else format= kUnknown;
+
+ done= true;
+ }
+
+ /* need this for possible long header in olsen format */
+ else if (strstr(sp,"): ") != NULL)
+ maxlines2check++;
+ }
+
+ if (format == kPhylip) {
+ /* check for interleaved or sequential -- really messy */
+ int tname, tseq;
+ long i, j, nspp= 0, nlen= 0, ilen, leaf= 0, seq= 0;
+ char *ps;
+
+ rewind(fseq);
+ for (i=0; i < *skiplines; i++) ReadOneLine(sp);
+ nlines= 0;
+ ReadOneLine(sp);
+ sscanf( sp, "%d%d", &nspp, &nlen);
+ ReadOneLine(sp); /* 1st seq line */
+ for (ps= sp+10, ilen=0; *ps!=0; ps++) if (isprint(*ps)) ilen++;
+
+ for (i= 1; i<nspp; i++) {
+ ReadOneLine(sp);
+
+ tseq= getseqtype(sp+10, strlen(sp+10));
+ tname= getseqtype(sp, 10);
+ for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
+ for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;
+
+ /* find probable interleaf or sequential ... */
+ if (j>=9) seq += 10; /* pretty certain not ileaf */
+ else {
+ if (tseq != tname) leaf++; else seq++;
+ if (tname == kDNA || tname == kRNA) seq++; else leaf++;
+ }
+
+ if (ilen <= nlen && j<9) {
+ if (tname == kOtherSeq) leaf += 10;
+ else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++;
+ }
+ else if (ilen > nlen) {
+ ilen= 0;
+ }
+ }
+ for ( nspp *= 2 ; i<nspp; i++) { /* this should be only bases if interleaf */
+ ReadOneLine(sp);
+
+ tseq= getseqtype(sp+10, strlen(sp+10));
+ tname= getseqtype(sp, 10);
+ for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;
+ for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
+ if (j<9) {
+ if (tname == kOtherSeq) seq += 10;
+ if (tseq != tname) seq++; else leaf++;
+ if (tname == kDNA || tname == kRNA) leaf++; else seq++;
+ }
+ if (ilen > nlen) {
+ if (j>9) leaf += 10; /* must be a name here for sequent */
+ else if (tname == kOtherSeq) seq += 10;
+ ilen= 0;
+ }
+ }
+
+ if (leaf > seq) format= kPhylip4;
+ else format= kPhylip2;
+ }
+
+ return(format);
+#undef ReadOneLine
+} /* SeqFileFormat */
+
+
+
+
+unsigned long GCGchecksum( const char *seq, const long seqlen, unsigned long *checktotal)
+/* GCGchecksum */
+{
+ register long i, check = 0, count = 0;
+
+ for (i = 0; i < seqlen; i++) {
+ count++;
+ check += count * to_upper(seq[i]);
+ if (count == 57) count = 0;
+ }
+ check %= 10000;
+ *checktotal += check;
+ *checktotal %= 10000;
+ return check;
+}
+
+/* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) */
+const unsigned long crctab[] = {
+ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
+ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
+ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
+ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
+ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
+ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
+ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
+ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
+ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
+ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
+ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
+ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
+ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
+ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
+ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
+ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
+ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
+ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
+ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
+ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
+ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
+ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
+ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
+ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
+ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
+ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
+ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
+ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
+ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
+ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
+ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
+ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
+ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
+ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
+ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
+ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
+ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
+ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
+ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
+ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
+ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
+ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
+ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
+ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
+ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
+ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
+ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
+ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
+ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
+ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
+ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
+ 0x2d02ef8dL
+};
+
+unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal)
+/*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source */
+{
+ register unsigned long c = 0xffffffffL;
+ register long n = seqlen;
+
+ while (n--) {
+ c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8);
+ seq++; /* fixed aug'98 finally */
+ }
+ c= c ^ 0xffffffffL;
+ *checktotal += c;
+ return c;
+}
+
+
+
+
+short getseqtype( const char *seq, const long seqlen)
+{ /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? */
+ char c;
+ short i, maxtest;
+ short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0;
+
+ maxtest = min(300, seqlen);
+ for (i = 0; i < maxtest; i++) {
+ c = to_upper(seq[i]);
+ if (strchr(protonly, c)) po++;
+ else if (strchr(primenuc,c)) {
+ na++;
+ if (c == 'T') nt++;
+ else if (c == 'U') nu++;
+ }
+ else if (strchr(aminos,c)) aa++;
+ else if (strchr(seqsymbols,c)) ns++;
+ else if (isalpha(c)) no++;
+ }
+
+ if ((no > 0) || (po+aa+na == 0)) return kOtherSeq;
+ /* ?? test for probability of kOtherSeq ?, e.g.,
+ else if (po+aa+na / maxtest < 0.70) return kOtherSeq;
+ */
+ else if (po > 0) return kAmino;
+ else if (aa == 0) {
+ if (nu > nt) return kRNA;
+ else return kDNA;
+ }
+ else if (na > aa) return kNucleic;
+ else return kAmino;
+} /* getseqtype */
+
+
+char* compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen)
+{
+ register char *a, *b;
+ register long i;
+ char *newseq;
+
+ *newlen= 0;
+ if (!seq) return NULL;
+ newseq = (char*) malloc(seqlen+1);
+ if (!newseq) return NULL;
+ for (a= (char*)seq, b=newseq, i=0; *a!=0; a++)
+ if (*a != gapc) {
+ *b++= *a;
+ i++;
+ }
+ *b= '\0';
+ newseq = (char*) realloc(newseq, i+1);
+ *newlen= i;
+ return newseq;
+}
+
+
+
+/***
+char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \
+{\\fonttbl\
+ {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\
+ {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\
+ {\\f5\\froman Times;}{\\f6\\froman Palatino;}\
+ {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\
+{\\stylesheet\
+ {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\
+ {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\
+ {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}";
+
+char *rtftail = "}";
+****/
+
+short writeSeq(FILE *outf, const char *seq, const long seqlen,
+ const short outform, const char *seqid)
+/* dump sequence to standard output */
+{
+ const short kSpaceAll = -9;
+#define kMaxseqwidth 250
+
+ boolean baseonlynum= false; /* nocountsymbols -- only count true bases, not "-" */
+ short numline = 0; /* only true if we are writing seq number line (for interleave) */
+ boolean numright = false, numleft = false;
+ boolean nameright = false, nameleft = false;
+ short namewidth = 8, numwidth = 8;
+ short spacer = 0, width = 50, tab = 0;
+ /* new parameters: width, spacer, those above... */
+
+ short linesout = 0, seqtype = kNucleic;
+ long i, j, l, l1, ibase;
+ char idword[31], endstr[10];
+ char seqnamestore[128], *seqname = seqnamestore;
+ char s[kMaxseqwidth], *cp;
+ char nameform[10], numform[10], nocountsymbols[10];
+ unsigned long checksum = 0, checktotal = 0;
+
+ gPretty.atseq++;
+ skipwhitespace(seqid);
+ l = min(128, strlen(seqid));
+ strncpy( seqnamestore, seqid, l);
+ seqname[l] = 0;
+
+ sscanf( seqname, "%30s", idword);
+ sprintf(numform, "%d", seqlen);
+ numwidth= strlen(numform)+1;
+ nameform[0]= '\0';
+
+ if (strstr(seqname,"checksum") != NULL) {
+ cp = strstr(seqname,"bases");
+ if (cp!=NULL) {
+ for ( ; (cp!=seqname) && (*cp!=','); cp--) ;
+ if (cp!=seqname) *cp=0;
+ }
+ }
+
+ strcpy( endstr,"");
+ l1 = 0;
+
+ if (outform == kGCG || outform == kMSF)
+ checksum = GCGchecksum(seq, seqlen, &checktotal);
+ else
+ checksum = seqchecksum(seq, seqlen, &checktotal);
+
+ switch (outform) {
+
+ case kPlain:
+ case kUnknown: /* no header, just sequence */
+ strcpy(endstr,"\n"); /* end w/ extra blank line */
+ break;
+
+ case kOlsen: /* Olsen seq. editor takes plain nucs OR Genbank */
+ case kGenBank:
+ fprintf(outf,"LOCUS %s %d bp\n", idword, seqlen);
+ fprintf(outf,"DEFINITION %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
+ /* fprintf(outf,"ACCESSION %s\n", accnum); */
+ fprintf(outf,"ORIGIN \n");
+ spacer = 11;
+ numleft = true;
+ numwidth = 8; /* dgg. 1Feb93, patch for GDE fail to read short numwidth */
+ strcpy(endstr, "\n//");
+ linesout += 4;
+ break;
+
+ case kPIR:
+ /* somewhat like genbank... \\\*/
+ /* fprintf(outf,"\\\\\\\n"); << only at top of file, not each entry... */
+ fprintf(outf,"ENTRY %s \n", idword);
+ fprintf(outf,"TITLE %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
+ /* fprintf(outf,"ACCESSION %s\n", accnum); */
+ fprintf(outf,"SEQUENCE \n");
+ numwidth = 7;
+ width= 30;
+ spacer = kSpaceAll;
+ numleft = true;
+ strcpy(endstr, "\n///");
+ /* run a top number line for PIR */
+ for (j=0; j<numwidth; j++) fputc(' ',outf);
+ for (j= 5; j<=width; j += 5) fprintf(outf,"%10d",j);
+ fputc('\n',outf);
+ linesout += 5;
+ break;
+
+ case kNBRF:
+ if (getseqtype(seq, seqlen) == kAmino)
+ fprintf(outf,">P1;%s\n", idword);
+ else
+ fprintf(outf,">DL;%s\n", idword);
+ fprintf(outf,"%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
+ spacer = 11;
+ strcpy(endstr,"*\n");
+ linesout += 3;
+ break;
+
+ case kEMBL:
+ fprintf(outf,"ID %s\n", idword);
+ /* fprintf(outf,"AC %s\n", accnum); */
+ fprintf(outf,"DE %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
+ fprintf(outf,"SQ %d BP\n", seqlen);
+ strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
+ tab = 4; /** added 31jan91 */
+ spacer = 11; /** added 31jan91 */
+ width = 60;
+ linesout += 4;
+ break;
+
+ case kGCG:
+ fprintf(outf,"%s\n", seqname);
+ /* fprintf(outf,"ACCESSION %s\n", accnum); */
+ fprintf(outf," %s Length: %d (today) Check: %d ..\n", idword, seqlen, checksum);
+ spacer = 11;
+ numleft = true;
+ strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */
+ linesout += 3;
+ break;
+
+ case kStrider: /* ?? map ?*/
+ fprintf(outf,"; ### from DNA Strider ;-)\n");
+ fprintf(outf,"; DNA sequence %s, %d bases, %X checksum.\n;\n", seqname, seqlen, checksum);
+ strcpy(endstr, "\n//");
+ linesout += 3;
+ break;
+
+ case kFitch:
+ fprintf(outf,"%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
+ spacer = 4;
+ width = 60;
+ linesout += 1;
+ break;
+
+ case kPhylip2:
+ case kPhylip4:
+ /* this is version 3.2/3.4 -- simplest way to write
+ version 3.3 is to write as version 3.2, then
+ re-read file and interleave the species lines */
+ if (strlen(idword)>10) idword[10] = 0;
+ fprintf(outf,"%-10s ",idword);
+ l1 = -1;
+ tab = 12;
+ spacer = 11;
+ break;
+
+ case kASN1:
+ seqtype= getseqtype(seq, seqlen);
+ switch (seqtype) {
+ case kDNA : cp= "dna"; break;
+ case kRNA : cp= "rna"; break;
+ case kNucleic : cp= "na"; break;
+ case kAmino : cp= "aa"; break;
+ case kOtherSeq: cp= "not-set"; break;
+ }
+ fprintf(outf," seq {\n");
+ fprintf(outf," id { local id %d },\n", gPretty.atseq);
+ fprintf(outf," descr { title \"%s\" },\n", seqid);
+ fprintf(outf," inst {\n");
+ fprintf(outf," repr raw, mol %s, length %d, topology linear,\n", cp, seqlen);
+ fprintf(outf," seq-data\n");
+ if (seqtype == kAmino)
+ fprintf(outf," iupacaa \"");
+ else
+ fprintf(outf," iupacna \"");
+ l1 = 17;
+ spacer = 0;
+ width = 78;
+ tab = 0;
+ strcpy(endstr,"\"\n } } ,");
+ linesout += 7;
+ break;
+
+ case kPAUP:
+ nameleft= true;
+ namewidth = 9;
+ spacer = 21;
+ width = 100;
+ tab = 0; /* 1; */
+ /* strcpy(endstr,";\nend;"); << this is end of all seqs.. */
+ /* do a header comment line for paup */
+ fprintf(outf,"[Name: %-16s Len:%6d Check: %8X]\n", idword, seqlen, checksum);
+ linesout += 1;
+ break;
+
+ case kPretty:
+ numline= gPretty.numline;
+ baseonlynum= gPretty.baseonlynum;
+ namewidth = gPretty.namewidth;
+ numright = gPretty.numright;
+ numleft = gPretty.numleft;
+ nameright = gPretty.nameright;
+ nameleft = gPretty.nameleft;
+ spacer = gPretty.spacer + 1;
+ width = gPretty.seqwidth;
+ tab = gPretty.tab;
+ /* also add rtf formatting w/ font, size, style */
+ if (gPretty.nametop) {
+ fprintf(outf,"Name: %-16s Len:%6d Check: %8X\n", idword, seqlen, checksum);
+ linesout++;
+ }
+ break;
+
+ case kMSF:
+ fprintf(outf," Name: %-16s Len:%6d Check: %5d Weight: 1.00\n",
+ idword, seqlen, checksum);
+ linesout++;
+ nameleft= true;
+ namewidth= 15; /* need MAX namewidth here... */
+ sprintf(nameform, "%%+%ds ",namewidth);
+ spacer = 11;
+ width = 50;
+ tab = 0; /* 1; */
+ break;
+
+ case kIG:
+ fprintf(outf,";%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
+ fprintf(outf,"%s\n", idword);
+ strcpy(endstr,"1"); /* == linear dna */
+ linesout += 2;
+ break;
+
+ default :
+ case kZuker: /* don't attempt Zuker's ftn format */
+ case kPearson:
+ fprintf(outf,">%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
+ linesout += 1;
+ break;
+ }
+
+ if (*nameform==0) sprintf(nameform, "%%%d.%ds ",namewidth,namewidth);
+ if (numline) sprintf(numform, "%%%ds ",numwidth);
+ else sprintf(numform, "%%%dd ",numwidth);
+ strcpy( nocountsymbols, kNocountsymbols);
+ if (baseonlynum) {
+ if (strchr(nocountsymbols,gPretty.gapchar)==NULL) {
+ strcat(nocountsymbols," ");
+ nocountsymbols[strlen(nocountsymbols)-1]= gPretty.gapchar;
+ }
+ if (gPretty.domatch && (cp=strchr(nocountsymbols,gPretty.matchchar))!=NULL) {
+ *cp= ' ';
+ }
+ }
+
+ if (numline) {
+ *idword= 0;
+ }
+
+ width = min(width,kMaxseqwidth);
+ for (i=0, l=0, ibase = 1; i < seqlen; ) {
+
+ if (l1 < 0) l1 = 0;
+ else if (l1 == 0) {
+ if (nameleft) fprintf(outf, nameform, idword);
+ if (numleft) { if (numline) fprintf(outf, numform, "");
+ else fprintf(outf, numform, ibase);}
+ for (j=0; j<tab; j++) fputc(' ',outf);
+ }
+
+ l1++; /* don't count spaces for width*/
+ if (numline) {
+ if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) {
+ if (numline==1) fputc(' ',outf);
+ s[l++] = ' ';
+ }
+ if (l1 % 10 == 1 || l1 == width) {
+ if (numline==1) fprintf(outf,"%-9d ",i+1);
+ s[l++]= '|'; /* == put a number here */
+ }
+ else s[l++]= ' ';
+ i++;
+ }
+
+ else {
+ if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
+ s[l++] = ' ';
+ if (!baseonlynum) ibase++;
+ else if (0==strchr(nocountsymbols,seq[i])) ibase++;
+ s[l++] = seq[i++];
+ }
+
+ if (l1 == width || i == seqlen) {
+ if (outform==kPretty) for ( ; l1<width; l1++) {
+ if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
+ s[l++] = ' ';
+ s[l++]=' '; /* pad w/ blanks */
+ }
+ s[l] = '\0';
+ l = 0; l1 = 0;
+
+ if (numline) {
+ if (numline==2) fprintf(outf,"%s",s); /* finish numberline ! and | */
+ }
+ else {
+ if (i == seqlen) fprintf(outf,"%s%s",s,endstr);
+ else fprintf(outf,"%s",s);
+ if (numright || nameright) fputc(' ',outf);
+ if (numright) fprintf(outf,numform, ibase-1);
+ if (nameright) fprintf(outf, nameform,idword);
+ }
+ fputc('\n',outf);
+ linesout++;
+ }
+ }
+ return linesout;
+} /*writeSeq*/
+
+
+
+/* End file: ureadseq.c */