3 * Reads and writes nucleic/protein sequence in various
4 * formats. Data files may have multiple sequences.
6 * Copyright 1990 by d.g.gilbert
7 * biology dept., indiana university, bloomington, in 47405
8 * e-mail: gilbertd@bio.indiana.edu
10 * This program may be freely copied and used by anyone.
11 * Developers are encourged to incorporate parts in their
12 * programs, rather than devise their own private sequence
15 * This should compile and run with any ANSI C compiler.
28 #pragma segment ureadseq
31 int Strcasecmp(const char *a, const char *b) /* from Nlm_StrICmp */
37 diff = to_upper(*a) - to_upper(*b);
38 if (diff) return diff;
39 if (*a == '\0') done = 1;
45 int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */
51 diff = to_upper(*a) - to_upper(*b);
52 if (diff) return diff;
53 if (*a == '\0') done = 1;
67 # define Local static /* local functions */
70 #define kStartLength 500
72 const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*";
73 const char *primenuc = "ACGTU";
74 const char *protonly = "EFIPQZ";
76 const char kNocountsymbols[5] = "_.-?";
77 const char stdsymbols[6] = "_.-*?";
78 const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\";
79 static const char *seqsymbols = allsymbols;
81 const char nummask[11] = "0123456789";
82 const char nonummask[11] = "~!@#$%^&*(";
85 use general form of isseqchar -- all chars + symbols.
86 no formats except nbrf (?) use symbols in data area as
87 anything other than sequence chars.
92 /* Local variables for readSeq: */
94 short choice, err, nseq;
95 long seqlen, maxseq, seqlencount;
99 char *seq, *seqid, matchchar;
100 boolean allDone, done, filestart, addit;
106 /* int (*isseqchar)(int c); << sgi cc hates (int c) */
113 return (isalpha(c) || strchr(seqsymbols,c));
116 int isSeqNumChar(int c)
118 return (isalnum(c) || strchr(seqsymbols,c));
124 return isascii(c); /* wrap in case isascii is macro */
127 Local void readline(FILE *f, char *s, long *linestart)
131 *linestart= ftell(f);
132 if (NULL == fgets(s, 256, f))
135 cp = strchr(s, '\n');
136 if (cp != NULL) *cp = 0;
140 Local void getline(struct ReadSeqVars *V)
142 readline(V->f, V->s, &V->linestart);
145 Local void ungetline(struct ReadSeqVars *V)
147 fseek(V->f, V->linestart, 0);
151 Local void addseq(char *s, struct ReadSeqVars *V)
155 if (V->addit) while (*s != 0) {
156 if ((V->isseqchar)(*s)) {
157 if (V->seqlen >= V->maxseq) {
158 V->maxseq += kStartLength;
159 ptr = (char*) realloc(V->seq, V->maxseq+1);
166 V->seq[(V->seqlen)++] = *s;
172 Local void countseq(char *s, struct ReadSeqVars *V)
173 /* this must count all valid seq chars, for some formats (paup-sequential) even
174 if we are skipping seq... */
177 if ((V->isseqchar)(*s)) {
185 Local void addinfo(char *s, struct ReadSeqVars *V)
191 while (*s == ' ') s++;
192 sprintf(si, " %d) %s\n", V->nseq, s);
196 V->isseqchar = isAnyChar;
199 V->isseqchar = isSeqChar;
205 Local void readLoop(short margin, boolean addfirst,
206 boolean (*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V),
207 struct ReadSeqVars *V)
209 boolean addend = false;
210 boolean ungetend = false;
213 if (V->choice == kListSequences) V->addit = false;
214 else V->addit = (V->nseq == V->choice);
215 if (V->addit) V->seqlen = 0;
217 if (addfirst) addseq(V->s, V);
220 V->done = feof(V->f);
221 V->done |= (*endTest)( &addend, &ungetend, V);
222 if (V->addit && (addend || !V->done) && (strlen(V->s) > margin)) {
223 addseq( (V->s)+margin, V);
227 if (V->choice == kListSequences) addinfo(V->seqid, V);
229 V->allDone = (V->nseq >= V->choice);
230 if (V->allDone && ungetend) ungetline(V);
236 Local boolean endIG( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
238 *addend = true; /* 1 or 2 occur in line w/ bases */
240 return((strchr(V->s,'1')!=NULL) || (strchr(V->s,'2')!=NULL));
243 Local void readIG(struct ReadSeqVars *V)
245 /* 18Aug92: new IG format -- ^L between sequences in place of ";" */
248 while (!V->allDone) {
251 for (si= V->s; *si != 0 && *si < ' '; si++) *si= ' '; /* drop controls */
252 if (*si == 0) *V->s= 0; /* chop line to empty */
253 } while (! (feof(V->f) || ((*V->s != 0) && (*V->s != ';') ) ));
257 strcpy(V->seqid, V->s);
258 readLoop(0, false, endIG, V);
265 Local boolean endStrider( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
269 return (strstr( V->s, "//") != NULL);
272 Local void readStrider(struct ReadSeqVars *V)
273 { /* ? only 1 seq/file ? */
275 while (!V->allDone) {
277 if (strstr(V->s,"; DNA sequence ") == V->s)
278 strcpy(V->seqid, (V->s)+16);
280 strcpy(V->seqid, (V->s)+1);
281 while ((!feof(V->f)) && (*V->s == ';')) {
284 if (feof(V->f)) V->allDone = true;
285 else readLoop(0, true, endStrider, V);
290 Local boolean endPIR( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
293 *ungetend= (strstr(V->s,"ENTRY") == V->s);
294 return ((strstr(V->s,"///") != NULL) || *ungetend);
297 Local void readPIR(struct ReadSeqVars *V)
298 { /*PIR -- many seqs/file */
300 while (!V->allDone) {
301 while (! (feof(V->f) || strstr(V->s,"ENTRY") || strstr(V->s,"SEQUENCE")) )
303 strcpy(V->seqid, (V->s)+16);
304 while (! (feof(V->f) || strstr(V->s,"SEQUENCE") == V->s))
306 readLoop(0, false, endPIR, V);
309 while (! (feof(V->f) || ((*V->s != 0)
310 && (strstr( V->s,"ENTRY") == V->s))))
313 if (feof(V->f)) V->allDone = true;
318 Local boolean endGB( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
321 *ungetend= (strstr(V->s,"LOCUS") == V->s);
322 return ((strstr(V->s,"//") != NULL) || *ungetend);
325 Local void readGenBank(struct ReadSeqVars *V)
326 { /*GenBank -- many seqs/file */
328 while (!V->allDone) {
329 strcpy(V->seqid, (V->s)+12);
330 while (! (feof(V->f) || strstr(V->s,"ORIGIN") == V->s))
332 readLoop(0, false, endGB, V);
335 while (! (feof(V->f) || ((*V->s != 0)
336 && (strstr( V->s,"LOCUS") == V->s))))
339 if (feof(V->f)) V->allDone = true;
344 Local boolean endNBRF( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
348 if ((a = strchr(V->s, '*')) != NULL) { /* end of 1st seq */
349 /* "*" can be valid base symbol, drop it here */
355 else if (*V->s == '>') { /* start of next seq */
364 Local void readNBRF(struct ReadSeqVars *V)
366 while (!V->allDone) {
367 strcpy(V->seqid, (V->s)+4);
368 getline(V); /*skip title-junk line*/
369 readLoop(0, false, endNBRF, V);
371 while (!(feof(V->f) || (*V->s != 0 && *V->s == '>')))
374 if (feof(V->f)) V->allDone = true;
380 Local boolean endPearson( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
384 return(*V->s == '>');
387 Local void readPearson(struct ReadSeqVars *V)
389 while (!V->allDone) {
390 strcpy(V->seqid, (V->s)+1);
391 readLoop(0, false, endPearson, V);
393 while (!(feof(V->f) || ((*V->s != 0) && (*V->s == '>'))))
396 if (feof(V->f)) V->allDone = true;
402 Local boolean endEMBL( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
405 *ungetend= (strstr(V->s,"ID ") == V->s);
406 return ((strstr(V->s,"//") != NULL) || *ungetend);
409 Local void readEMBL(struct ReadSeqVars *V)
411 while (!V->allDone) {
412 strcpy(V->seqid, (V->s)+5);
415 } while (!(feof(V->f) | (strstr(V->s,"SQ ") == V->s)));
417 readLoop(0, false, endEMBL, V);
419 while (!(feof(V->f) |
420 ((*V->s != '\0') & (strstr(V->s,"ID ") == V->s))))
423 if (feof(V->f)) V->allDone = true;
429 Local boolean endZuker( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
433 return( *V->s == '(' );
436 Local void readZuker(struct ReadSeqVars *V)
438 /*! 1st string is Zuker's Fortran format */
440 while (!V->allDone) {
441 getline(V); /*s == "seqLen seqid string..."*/
442 strcpy(V->seqid, (V->s)+6);
443 readLoop(0, false, endZuker, V);
445 while (!(feof(V->f) |
446 ((*V->s != '\0') & (*V->s == '('))))
449 if (feof(V->f)) V->allDone = true;
455 Local boolean endFitch( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
457 /* this is a somewhat shaky end,
458 1st char of line is non-blank for seq. title
462 return( *V->s != ' ' );
465 Local void readFitch(struct ReadSeqVars *V)
470 while (!V->allDone) {
471 if (!first) strcpy(V->seqid, V->s);
472 readLoop(0, first, endFitch, V);
473 if (feof(V->f)) V->allDone = true;
479 Local void readPlain(struct ReadSeqVars *V)
482 V->addit = (V->choice > 0);
483 if (V->addit) V->seqlen = 0;
484 addseq(V->seqid, V); /*from above..*/
485 if (V->fname!=NULL) sprintf(V->seqid, "%s [Unknown form]", V->fname);
486 else sprintf(V->seqid, " [Unknown form]");
489 V->done = feof(V->f);
492 if (V->choice == kListSequences) addinfo(V->seqid, V);
497 Local void readUWGCG(struct ReadSeqVars *V)
500 10nov91: Reading GCG files casued duplication of last line when
501 EOF followed that line !!!
502 fix: getline now sets *V->s = 0
507 V->addit = (V->choice > 0);
508 if (V->addit) V->seqlen = 0;
509 strcpy(V->seqid, V->s);
510 /*writeseq: " %s Length: %d (today) Check: %d ..\n" */
511 /*drop above or ".." from id*/
512 if (si = strstr(V->seqid," Length: ")) *si = 0;
513 else if (si = strstr(V->seqid,"..")) *si = 0;
515 V->done = feof(V->f);
517 if (!V->done) addseq((V->s), V);
519 if (V->choice == kListSequences) addinfo(V->seqid, V);
524 Local void readOlsen(struct ReadSeqVars *V)
525 { /* G. Olsen /print output from multiple sequence editor */
527 char *si, *sj, *sk, *sm, sid[40], snum[20];
528 boolean indata = false;
531 V->addit = (V->choice > 0);
532 if (V->addit) V->seqlen = 0;
533 rewind(V->f); V->nseq= 0;
536 V->done = feof(V->f);
538 if (V->done && !(*V->s)) break;
540 if ( (si= strstr(V->s, sid))
541 /* && (strstr(V->s, snum) == si - snumlen - 1) ) { */
542 && (sm= strstr(V->s, snum)) && (sm < si - snumlen) ) {
544 /* Spaces are valid alignment data !! */
545 /* 17Oct91: Error, the left margin is 21 not 22! */
546 /* dropped some nucs up to now -- my example file was right shifted ! */
547 /* variable right id margin, drop id-2 spaces at end */
549 VMS CC COMPILER (VAXC031) mess up:
550 -- Index of 21 is chopping 1st nuc on VMS systems Only!
551 Byte-for-byte same ame rnasep.olsen sequence file !
554 /* si = (V->s)+21; < was this before VMS CC wasted my time */
555 si += 10; /* use strstr index plus offset to outfox VMS CC bug */
557 if (sk = strstr(si, sid)) *(sk-2) = 0;
558 for (sk = si; *sk != 0; sk++) {
559 if (*sk == ' ') *sk = '.';
560 /* 18aug92: !! some olsen masks are NUMBERS !! which addseq eats */
561 else if (isdigit(*sk)) *sk= nonummask[*sk - '0'];
568 else if (sk = strstr(V->s, "): ")) { /* seq info header line */
569 /* 18aug92: correct for diff seqs w/ same name -- use number, e.g. */
570 /* 3 (Agr.tume): agrobacterium.prna 18-JUN-1987 16:12 */
571 /* 328 (Agr.tume): agrobacterium.prna XYZ 19-DEC-1992 */
573 si = 1 + strchr(V->s,'(');
575 if (V->choice == kListSequences) addinfo( si, V);
576 else if (V->nseq == V->choice) {
577 strcpy(V->seqid, si);
578 sj = strchr(V->seqid, ':');
579 while (*(--sj) == ' ') ;
580 while (--sj != V->seqid) { if (*sj == ' ') *sj = '_'; }
583 while (*(--sk) == ' ') *sk = 0;
587 while ((*si <= ' ') && (*si != 0)) si++;
589 while (si[snumlen] > ' ' && snumlen<20)
590 { snum[snumlen]= si[snumlen]; snumlen++; }
596 else if (strstr(V->s,"identity: Data:")) {
598 if (V->choice == kListSequences) V->done = true;
607 Local void readMSF(struct ReadSeqVars *V)
608 { /* gcg's MSF, mult. sequence format, interleaved ! */
610 char *si, *sj, sid[128];
611 boolean indata = false;
612 int atseq= 0, iline= 0;
614 V->addit = (V->choice > 0);
615 if (V->addit) V->seqlen = 0;
616 rewind(V->f); V->nseq= 0;
619 V->done = feof(V->f);
621 if (V->done && !(*V->s)) break;
623 /*somename ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet */
624 /* E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs */
628 /* for (sj= si; isalnum(*sj); sj++) ; bug -- cdelwiche uses "-", "_" and others in names*/
629 for (sj= si; *sj > ' '; sj++) ;
632 if ( (0==strcmp(si, sid)) ) {
639 else if (NULL != (si = strstr(V->s, "Name: "))) { /* seq info header line */
640 /* Name: somename Len: 100 Check: 7009 Weight: 1.00 */
644 if (V->choice == kListSequences) addinfo( si, V);
645 else if (V->nseq == V->choice) {
646 strcpy(V->seqid, si);
649 /* for (sj= si; isalnum(*sj); sj++) ; -- bug */
650 for (sj= si; *sj > ' '; sj++) ;
656 else if ( strstr(V->s,"//") /*== V->s*/ ) {
659 if (V->choice == kListSequences) V->done = true;
670 Local void readPAUPinterleaved(struct ReadSeqVars *V)
671 { /* PAUP mult. sequence format, interleaved or sequential! */
673 char *si, *sj, *send, sid[40], sid1[40], saveseq[255];
674 boolean first = true, indata = false, domatch;
675 int atseq= 0, iline= 0, ifmc, saveseqlen=0;
677 #define fixmatchchar(s) { \
678 for (ifmc=0; ifmc<saveseqlen; ifmc++) \
679 if (s[ifmc] == V->matchchar) s[ifmc]= saveseq[ifmc]; }
681 V->addit = (V->choice > 0);
683 if (V->addit) V->seqlen = 0;
684 /* rewind(V->f); V->nseq= 0; << do in caller !*/
685 indata= true; /* call here after we find "matrix" */
686 domatch= (V->matchchar > 0);
690 V->done = feof(V->f);
692 if (V->done && !(*V->s)) break;
695 /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
696 /* chimp ................a.t. .c.................a ..........*/
697 /* !! need to correct for V->matchchar */
700 if (strchr(si,';')) indata= false;
703 /* valid data line starts w/ a left-justified seq name in columns [0..8] */
706 if (V->nseq >= V->topnseq) first= false;
707 for (sj = si; isalnum(*sj); sj++) ;
710 if (V->choice == kListSequences) {
714 else if (V->nseq == V->choice) {
716 if (V->nseq == 1) { strcpy( saveseq, sj); saveseqlen= strlen(saveseq); }
717 else fixmatchchar( sj);
721 strcpy(V->seqid, si);
723 if (V->nseq == 1) strcpy(sid1, sid);
727 else if ( (strstr(si, sid) == si) ){
728 while (isalnum(*si)) si++;
731 if (V->nseq == 1) { strcpy( saveseq, si); saveseqlen= strlen(saveseq); }
732 else fixmatchchar( si);
737 else if (domatch && (strstr(si, sid1) == si)) {
738 strcpy( saveseq, si);
739 saveseqlen= strlen(saveseq);
746 else if ( strstr(V->s,"matrix") ) {
749 if (V->choice == kListSequences) V->done = true;
755 } /*readPAUPinterleaved*/
759 Local void readPAUPsequential(struct ReadSeqVars *V)
760 { /* PAUP mult. sequence format, interleaved or sequential! */
762 boolean atname = true, indata = false;
764 V->addit = (V->choice > 0);
765 if (V->addit) V->seqlen = 0;
767 /* rewind(V->f); V->nseq= 0; << do in caller !*/
768 indata= true; /* call here after we find "matrix" */
771 V->done = feof(V->f);
773 if (V->done && !(*V->s)) break;
776 /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
777 /* aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
778 /* chimp ................a.t. .c.................a ..........*/
779 /* ................a.t. .c.................a ..........*/
783 if (strchr(si,';')) indata= false;
785 /* valid data line starts w/ a left-justified seq name in columns [0..8] */
791 while (isalnum(*sj)) sj++;
792 if (V->choice == kListSequences) {
793 /* !! we must count bases to know when topseqlen is reached ! */
795 if (V->seqlencount >= V->topseqlen) atname= true;
799 else if (V->nseq == V->choice) {
801 V->seqlencount= V->seqlen;
802 if (V->seqlencount >= V->topseqlen) atname= true;
804 strcpy(V->seqid, si);
808 if (V->seqlencount >= V->topseqlen) atname= true;
812 else if (V->nseq == V->choice) {
814 V->seqlencount= V->seqlen;
815 if (V->seqlencount >= V->topseqlen) atname= true;
819 if (V->seqlencount >= V->topseqlen) atname= true;
824 else if ( strstr(V->s,"matrix") ) {
827 if (V->choice == kListSequences) V->done = true;
833 } /*readPAUPsequential*/
836 Local void readPhylipInterleaved(struct ReadSeqVars *V)
839 boolean first = true;
842 V->addit = (V->choice > 0);
843 if (V->addit) V->seqlen = 0;
845 /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); << topnseq == 0 !!! bad scan !! */
848 V->topnseq= atoi(si);
849 while (isdigit(*si)) si++;
851 V->topseqlen= atol(si);
852 /* fprintf(stderr,"Phylip-ileaf: topnseq=%d topseqlen=%d\n",V->topnseq, V->topseqlen); */
856 V->done = feof(V->f);
858 if (V->done && !(*V->s)) break;
863 if (first) { /* collect seq names + seq, as fprintf(outf,"%-10s ",seqname); */
865 if (V->nseq >= V->topnseq) first= false;
866 sj= V->s+10; /* past name, start of data */
867 if (V->choice == kListSequences) {
871 else if (V->nseq == V->choice) {
874 strcpy(V->seqid, si);
877 else if ( iline % V->nseq == V->choice -1 ) {
885 } /*readPhylipInterleaved*/
889 Local boolean endPhylipSequential( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
894 return V->seqlencount >= V->topseqlen;
897 Local void readPhylipSequential(struct ReadSeqVars *V)
901 /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); < ? bad sscan ? */
904 V->topnseq= atoi(si);
905 while (isdigit(*si)) si++;
907 V->topseqlen= atol(si);
909 while (!V->allDone) {
911 strncpy(V->seqid, (V->s), 10);
913 for (i=0; i<10 && V->s[i]; i++) V->s[i]= ' ';
914 readLoop(0, true, endPhylipSequential, V);
915 if (feof(V->f)) V->allDone = true;
922 Local void readSeqMain(
923 struct ReadSeqVars *V,
924 const long skiplines_,
927 #define tolowerstr(s) { long Itlwr, Ntlwr= strlen(s); \
928 for (Itlwr=0; Itlwr<Ntlwr; Itlwr++) s[Itlwr]= to_lower(s[Itlwr]); }
936 V->err = eFileNotFound;
939 for (l = skiplines_; l > 0; l--) getline( V);
943 for (l= strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) ;
944 } while ((l == 0) && !feof(V->f));
946 if (feof(V->f)) V->err = eNoData;
948 else switch (format_) {
949 case kPlain : readPlain(V); break;
950 case kIG : readIG(V); break;
951 case kStrider: readStrider(V); break;
952 case kGenBank: readGenBank(V); break;
953 case kPIR : readPIR(V); break;
954 case kNBRF : readNBRF(V); break;
955 case kPearson: readPearson(V); break;
956 case kEMBL : readEMBL(V); break;
957 case kZuker : readZuker(V); break;
958 case kOlsen : readOlsen(V); break;
959 case kMSF : readMSF(V); break;
963 boolean interleaved= false;
965 /* rewind(V->f); V->nseq= 0; ?? assume it is at top ?? skiplines ... */
969 if (strstr( V->s, "matrix")) done= true;
970 if (strstr( V->s, "interleav")) interleaved= true;
971 if (NULL != (cp=strstr( V->s, "ntax=")) ) V->topnseq= atoi(cp+5);
972 if (NULL != (cp=strstr( V->s, "nchar=")) ) V->topseqlen= atoi(cp+6);
973 if (NULL != (cp=strstr( V->s, "matchchar=")) ) {
976 else if (*cp=='"') cp++;
980 if (interleaved) readPAUPinterleaved(V);
981 else readPAUPsequential(V);
985 /* kPhylip: ! can't determine in middle of file which type it is...*/
986 /* test for interleave or sequential and use Phylip4(ileave) or Phylip2 */
988 readPhylipSequential(V);
990 case kPhylip4: /* == kPhylip3 */
991 readPhylipInterleaved(V);
995 V->err = eUnknownFormat;
999 strcpy(V->seqid, V->s); getline(V);
1005 gotuw = (strstr(V->s,"..") != NULL);
1006 if (gotuw) readUWGCG(V);
1008 } while (!(feof(V->f) || V->allDone));
1013 V->filestart= false;
1014 V->seq[V->seqlen] = 0; /* stick a string terminator on it */
1019 const short whichEntry_, /* index to sequence in file */
1020 FILE *fp_, /* pointer to open seq file */
1021 const long skiplines_,
1022 const short format_, /* sequence file format */
1023 long *seqlen_, /* return seq size */
1024 short *nseq_, /* number of seqs in file, for listSeqs() */
1025 short *error_, /* return error */
1026 char *seqid_) /* return seq name/info */
1028 struct ReadSeqVars V;
1030 if (format_ < kMinFormat || format_ > kMaxFormat) {
1031 *error_ = eUnknownFormat;
1036 V.choice = whichEntry_;
1037 V.fname = NULL; /* don't know */
1038 V.seq = (char*) calloc(1, kStartLength+1);
1039 V.maxseq = kStartLength;
1044 V.filestart= (ftell( fp_) == 0);
1045 /* !! in sequential read, must remove current seq position from choice/whichEntry_ counter !! ... */
1046 if (V.filestart) V.nseq = 0;
1047 else V.nseq= *nseq_; /* track where we are in file...*/
1052 V.isseqchar = isSeqChar;
1053 if (V.choice == kListSequences) ; /* leave as is */
1054 else if (V.choice <= 0) V.choice = 1; /* default ?? */
1055 V.addit = (V.choice > 0);
1058 readSeqMain(&V, skiplines_, format_);
1061 *seqlen_ = V.seqlen;
1067 const short whichEntry_, /* index to sequence in file */
1068 const char *filename_, /* file name */
1069 const long skiplines_,
1070 const short format_, /* sequence file format */
1071 long *seqlen_, /* return seq size */
1072 short *nseq_, /* number of seqs in file, for listSeqs() */
1073 short *error_, /* return error */
1074 char *seqid_) /* return seq name/info */
1076 struct ReadSeqVars V;
1078 if (format_ < kMinFormat || format_ > kMaxFormat) {
1079 *error_ = eUnknownFormat;
1084 V.choice = whichEntry_;
1085 V.fname = filename_; /* don't need to copy string, just ptr to it */
1086 V.seq = (char*) calloc(1, kStartLength+1);
1087 V.maxseq = kStartLength;
1095 V.isseqchar = isSeqChar;
1096 if (V.choice == kListSequences) ; /* leave as is */
1097 else if (V.choice <= 0) V.choice = 1; /* default ?? */
1098 V.addit = (V.choice > 0);
1101 V.f = fopen(V.fname, "r");
1104 readSeqMain(&V, skiplines_, format_);
1106 if (V.f != NULL) fclose(V.f);
1108 *seqlen_ = V.seqlen;
1118 const char *filename_, /* file name */
1119 const long skiplines_,
1120 const short format_, /* sequence file format */
1121 short *nseq_, /* number of seqs in file, for listSeqs() */
1122 short *error_) /* return error */
1127 return readSeq( kListSequences, filename_, skiplines_, format_,
1128 &seqlen, nseq_, error_, seqid);
1134 short seqFileFormat( /* return sequence format number, see ureadseq.h */
1135 const char *filename,
1136 long *skiplines, /* return #lines to skip any junk like mail header */
1137 short *error) /* return any error value or 0 */
1142 fseq = fopen(filename, "r");
1143 format= seqFileFormatFp( fseq, skiplines, error);
1144 if (fseq!=NULL) fclose(fseq);
1148 short seqFileFormatFp(
1150 long *skiplines, /* return #lines to skip any junk like mail header */
1151 short *error) /* return any error value or 0 */
1153 boolean foundDNA= false, foundIG= false, foundStrider= false,
1154 foundGB= false, foundPIR= false, foundEMBL= false, foundNBRF= false,
1155 foundPearson= false, foundFitch= false, foundPhylip= false, foundZuker= false,
1156 gotolsen= false, gotpaup = false, gotasn1 = false, gotuw= false, gotMSF= false,
1157 isfitch= false, isphylip= false, done= false;
1158 short format= kUnknown;
1159 int nlines= 0, k, splen= 0, otherlines= 0, aminolines= 0, dnalines= 0;
1162 int maxlines2check=500;
1164 #define ReadOneLine(sp) \
1165 { done |= (feof(fseq)); \
1166 readline( fseq, sp, &linestart); \
1167 if (!done) { splen = strlen(sp); ++nlines; } }
1171 if (fseq == NULL) { *error = eFileNotFound; return kNoformat; }
1176 /* check for mailer head & skip past if found */
1177 if (nlines < 4 && !done) {
1178 if ((strstr(sp,"From ") == sp) || (strstr(sp,"Received:") == sp)) {
1180 /* skip all lines until find one blank line */
1182 if (!done) for (k=0; (k<splen) && (sp[k]==' '); k++) ;
1183 } while ((!done) && (k < splen));
1184 *skiplines = nlines; /* !? do we want #lines or #bytes ?? */
1188 if (sp==NULL || *sp==0)
1191 /* high probability identities: */
1193 else if ( strstr(sp,"MSF:") && strstr(sp,"Type:") && strstr(sp,"Check:") )
1196 else if ((strstr(sp,"..") != NULL) && (strstr(sp,"Check:") != NULL))
1199 else if (strstr(sp,"identity: Data:") != NULL)
1202 else if ( strstr(sp,"::=") &&
1203 (strstr(sp,"Bioseq") || /* Bioseq or Bioseq-set */
1204 strstr(sp,"Seq-entry") ||
1205 strstr(sp,"Seq-submit") ) ) /* can we read submit format? */
1208 else if ( strstr(sp,"#NEXUS") == sp )
1211 /* uncertain identities: */
1213 else if (*sp ==';') {
1214 if (strstr(sp,"Strider") !=NULL) foundStrider= true;
1218 else if (strstr(sp,"LOCUS") == sp)
1220 else if (strstr(sp,"ORIGIN") == sp)
1223 else if (strstr(sp,"ENTRY ") == sp) /* ? also (strcmp(sp,"\\\\\\")==0) */
1225 else if (strstr(sp,"SEQUENCE") == sp)
1228 else if (*sp == '>') {
1229 if (sp[3] == ';') foundNBRF= true;
1230 else foundPearson= true;
1233 else if (strstr(sp,"ID ") == sp)
1235 else if (strstr(sp,"SQ ") == sp)
1238 else if (*sp == '(')
1242 if (nlines - *skiplines == 1) {
1243 int ispp= 0, ilen= 0;
1244 sscanf( sp, "%d%d", &ispp, &ilen);
1245 if (ispp > 0 && ilen > 0) isphylip= true;
1247 else if (isphylip && nlines - *skiplines == 2) {
1249 tseq= getseqtype(sp+10, strlen(sp+10));
1250 if ( isalpha(*sp) /* 1st letter in 2nd line must be of a name */
1251 && (tseq != kOtherSeq)) /* sequence section must be okay */
1255 for (k=0, isfitch= true; isfitch & (k < splen); k++) {
1256 if (k % 4 == 0) isfitch &= (sp[k] == ' ');
1257 else isfitch &= (sp[k] != ' ');
1259 if (isfitch & (splen > 20)) foundFitch= true;
1261 /* kRNA && kDNA are fairly certain...*/
1262 switch (getseqtype( sp, splen)) {
1263 case kOtherSeq: otherlines++; break;
1264 case kAmino : if (splen>20) aminolines++; break;
1266 case kRNA : if (splen>20) dnalines++; break;
1267 case kNucleic : break; /* not much info ? */
1272 /* pretty certain */
1282 /* !! we need to look further and return kASNseqentry | kASNseqset */
1284 seqentry key is Seq-entry ::=
1285 seqset key is Bioseq-set ::=
1286 ?? can't read these yet w/ ncbi tools ??
1288 Bioseq ::= << fails both bioseq-seq and seq-entry parsers !
1290 if (strstr(sp,"Bioseq-set")) format= kASNseqset;
1291 else if (strstr(sp,"Seq-entry")) format= kASNseqentry;
1292 else format= kASN1; /* other form, we can't yet read... */
1301 if (foundIG) format= kIG; /* a TOIG file from GCG for certain */
1306 else if ((dnalines > 1) || done || (nlines > maxlines2check)) {
1307 /* decide on most likely format */
1308 /* multichar idents: */
1309 if (foundStrider) format= kStrider;
1310 else if (foundGB) format= kGenBank;
1311 else if (foundPIR) format= kPIR;
1312 else if (foundEMBL) format= kEMBL;
1313 else if (foundNBRF) format= kNBRF;
1314 /* single char idents: */
1315 else if (foundIG) format= kIG;
1316 else if (foundPearson) format= kPearson;
1317 else if (foundZuker) format= kZuker;
1319 else if (foundPhylip) format= kPhylip;
1320 /* spacing ident: */
1321 else if (foundFitch) format= kFitch;
1322 /* no format chars: */
1323 else if (otherlines > 0) format= kUnknown;
1324 else if (dnalines > 1) format= kPlain;
1325 else if (aminolines > 1) format= kPlain;
1326 else format= kUnknown;
1331 /* need this for possible long header in olsen format */
1332 else if (strstr(sp,"): ") != NULL)
1336 if (format == kPhylip) {
1337 /* check for interleaved or sequential -- really messy */
1339 long i, j, nspp= 0, nlen= 0, ilen, leaf= 0, seq= 0;
1343 for (i=0; i < *skiplines; i++) ReadOneLine(sp);
1346 sscanf( sp, "%d%d", &nspp, &nlen);
1347 ReadOneLine(sp); /* 1st seq line */
1348 for (ps= sp+10, ilen=0; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1350 for (i= 1; i<nspp; i++) {
1353 tseq= getseqtype(sp+10, strlen(sp+10));
1354 tname= getseqtype(sp, 10);
1355 for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
1356 for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1358 /* find probable interleaf or sequential ... */
1359 if (j>=9) seq += 10; /* pretty certain not ileaf */
1361 if (tseq != tname) leaf++; else seq++;
1362 if (tname == kDNA || tname == kRNA) seq++; else leaf++;
1365 if (ilen <= nlen && j<9) {
1366 if (tname == kOtherSeq) leaf += 10;
1367 else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++;
1369 else if (ilen > nlen) {
1373 for ( nspp *= 2 ; i<nspp; i++) { /* this should be only bases if interleaf */
1376 tseq= getseqtype(sp+10, strlen(sp+10));
1377 tname= getseqtype(sp, 10);
1378 for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1379 for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
1381 if (tname == kOtherSeq) seq += 10;
1382 if (tseq != tname) seq++; else leaf++;
1383 if (tname == kDNA || tname == kRNA) leaf++; else seq++;
1386 if (j>9) leaf += 10; /* must be a name here for sequent */
1387 else if (tname == kOtherSeq) seq += 10;
1392 if (leaf > seq) format= kPhylip4;
1393 else format= kPhylip2;
1398 } /* SeqFileFormat */
1403 unsigned long GCGchecksum( const char *seq, const long seqlen, unsigned long *checktotal)
1406 register long i, check = 0, count = 0;
1408 for (i = 0; i < seqlen; i++) {
1410 check += count * to_upper(seq[i]);
1411 if (count == 57) count = 0;
1414 *checktotal += check;
1415 *checktotal %= 10000;
1419 /* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) */
1420 const unsigned long crctab[] = {
1421 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
1422 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
1423 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
1424 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
1425 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
1426 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
1427 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
1428 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
1429 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
1430 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
1431 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
1432 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
1433 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
1434 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
1435 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
1436 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
1437 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
1438 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
1439 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
1440 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
1441 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
1442 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
1443 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
1444 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
1445 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
1446 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
1447 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
1448 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
1449 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
1450 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
1451 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
1452 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
1453 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
1454 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
1455 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
1456 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
1457 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
1458 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
1459 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
1460 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
1461 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
1462 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
1463 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
1464 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
1465 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
1466 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
1467 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
1468 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
1469 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
1470 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
1471 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
1475 unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal)
1476 /*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source */
1478 register unsigned long c = 0xffffffffL;
1479 register long n = seqlen;
1482 c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8);
1483 seq++; /* fixed aug'98 finally */
1493 short getseqtype( const char *seq, const long seqlen)
1494 { /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? */
1497 short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0;
1499 maxtest = min(300, seqlen);
1500 for (i = 0; i < maxtest; i++) {
1501 c = to_upper(seq[i]);
1502 if (strchr(protonly, c)) po++;
1503 else if (strchr(primenuc,c)) {
1506 else if (c == 'U') nu++;
1508 else if (strchr(aminos,c)) aa++;
1509 else if (strchr(seqsymbols,c)) ns++;
1510 else if (isalpha(c)) no++;
1513 if ((no > 0) || (po+aa+na == 0)) return kOtherSeq;
1514 /* ?? test for probability of kOtherSeq ?, e.g.,
1515 else if (po+aa+na / maxtest < 0.70) return kOtherSeq;
1517 else if (po > 0) return kAmino;
1519 if (nu > nt) return kRNA;
1522 else if (na > aa) return kNucleic;
1527 char* compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen)
1529 register char *a, *b;
1534 if (!seq) return NULL;
1535 newseq = (char*) malloc(seqlen+1);
1536 if (!newseq) return NULL;
1537 for (a= (char*)seq, b=newseq, i=0; *a!=0; a++)
1543 newseq = (char*) realloc(newseq, i+1);
1551 char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \
1553 {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\
1554 {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\
1555 {\\f5\\froman Times;}{\\f6\\froman Palatino;}\
1556 {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\
1558 {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\
1559 {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\
1560 {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}";
1562 char *rtftail = "}";
1565 short writeSeq(FILE *outf, const char *seq, const long seqlen,
1566 const short outform, const char *seqid)
1567 /* dump sequence to standard output */
1569 const short kSpaceAll = -9;
1570 #define kMaxseqwidth 250
1572 boolean baseonlynum= false; /* nocountsymbols -- only count true bases, not "-" */
1573 short numline = 0; /* only true if we are writing seq number line (for interleave) */
1574 boolean numright = false, numleft = false;
1575 boolean nameright = false, nameleft = false;
1576 short namewidth = 8, numwidth = 8;
1577 short spacer = 0, width = 50, tab = 0;
1578 /* new parameters: width, spacer, those above... */
1580 short linesout = 0, seqtype = kNucleic;
1581 long i, j, l, l1, ibase;
1582 char idword[31], endstr[15];
1583 char seqnamestore[128], *seqname = seqnamestore;
1584 char s[kMaxseqwidth], *cp;
1585 char nameform[10], numform[10], nocountsymbols[10];
1586 unsigned long checksum = 0, checktotal = 0;
1589 skipwhitespace(seqid);
1590 l = min(128, strlen(seqid));
1591 strncpy( seqnamestore, seqid, l);
1594 sscanf( seqname, "%30s", idword);
1595 sprintf(numform, "%d", seqlen);
1596 numwidth= strlen(numform)+1;
1599 if (strstr(seqname,"checksum") != NULL) {
1600 cp = strstr(seqname,"bases");
1602 for ( ; (cp!=seqname) && (*cp!=','); cp--) ;
1603 if (cp!=seqname) *cp=0;
1610 if (outform == kGCG || outform == kMSF)
1611 checksum = GCGchecksum(seq, seqlen, &checktotal);
1613 checksum = seqchecksum(seq, seqlen, &checktotal);
1618 case kUnknown: /* no header, just sequence */
1619 strcpy(endstr,"\n"); /* end w/ extra blank line */
1622 case kOlsen: /* Olsen seq. editor takes plain nucs OR Genbank */
1624 fprintf(outf,"LOCUS %s %d bp\n", idword, seqlen);
1625 fprintf(outf,"DEFINITION %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1626 /* fprintf(outf,"ACCESSION %s\n", accnum); */
1627 fprintf(outf,"ORIGIN \n");
1630 numwidth = 8; /* dgg. 1Feb93, patch for GDE fail to read short numwidth */
1631 strcpy(endstr, "\n//");
1636 /* somewhat like genbank... \\\*/
1637 /* fprintf(outf,"\\\\\\\n"); << only at top of file, not each entry... */
1638 fprintf(outf,"ENTRY %s \n", idword);
1639 fprintf(outf,"TITLE %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1640 /* fprintf(outf,"ACCESSION %s\n", accnum); */
1641 fprintf(outf,"SEQUENCE \n");
1646 strcpy(endstr, "\n///");
1647 /* run a top number line for PIR */
1648 for (j=0; j<numwidth; j++) fputc(' ',outf);
1649 for (j= 5; j<=width; j += 5) fprintf(outf,"%10d",j);
1655 if (getseqtype(seq, seqlen) == kAmino)
1656 fprintf(outf,">P1;%s\n", idword);
1658 fprintf(outf,">DL;%s\n", idword);
1659 fprintf(outf,"%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1661 strcpy(endstr,"*\n");
1666 fprintf(outf,"ID %s\n", idword);
1667 /* fprintf(outf,"AC %s\n", accnum); */
1668 fprintf(outf,"DE %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1669 fprintf(outf,"SQ %d BP\n", seqlen);
1670 strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
1671 tab = 4; /** added 31jan91 */
1672 spacer = 11; /** added 31jan91 */
1678 fprintf(outf,"%s\n", seqname);
1679 /* fprintf(outf,"ACCESSION %s\n", accnum); */
1680 fprintf(outf," %s Length: %d (today) Check: %d ..\n", idword, seqlen, checksum);
1683 strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */
1687 case kStrider: /* ?? map ?*/
1688 fprintf(outf,"; ### from DNA Strider ;-)\n");
1689 fprintf(outf,"; DNA sequence %s, %d bases, %X checksum.\n;\n", seqname, seqlen, checksum);
1690 strcpy(endstr, "\n//");
1695 fprintf(outf,"%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1703 /* this is version 3.2/3.4 -- simplest way to write
1704 version 3.3 is to write as version 3.2, then
1705 re-read file and interleave the species lines */
1706 if (strlen(idword)>10) idword[10] = 0;
1707 fprintf(outf,"%-10s ",idword);
1714 seqtype= getseqtype(seq, seqlen);
1716 case kDNA : cp= "dna"; break;
1717 case kRNA : cp= "rna"; break;
1718 case kNucleic : cp= "na"; break;
1719 case kAmino : cp= "aa"; break;
1720 case kOtherSeq: cp= "not-set"; break;
1722 fprintf(outf," seq {\n");
1723 fprintf(outf," id { local id %d },\n", gPretty.atseq);
1724 fprintf(outf," descr { title \"%s\" },\n", seqid);
1725 fprintf(outf," inst {\n");
1726 fprintf(outf," repr raw, mol %s, length %d, topology linear,\n", cp, seqlen);
1727 fprintf(outf," seq-data\n");
1728 if (seqtype == kAmino)
1729 fprintf(outf," iupacaa \"");
1731 fprintf(outf," iupacna \"");
1736 strcpy(endstr,"\"\n } } ,");
1746 /* strcpy(endstr,";\nend;"); << this is end of all seqs.. */
1747 /* do a header comment line for paup */
1748 fprintf(outf,"[Name: %-16s Len:%6d Check: %8X]\n", idword, seqlen, checksum);
1753 numline= gPretty.numline;
1754 baseonlynum= gPretty.baseonlynum;
1755 namewidth = gPretty.namewidth;
1756 numright = gPretty.numright;
1757 numleft = gPretty.numleft;
1758 nameright = gPretty.nameright;
1759 nameleft = gPretty.nameleft;
1760 spacer = gPretty.spacer + 1;
1761 width = gPretty.seqwidth;
1763 /* also add rtf formatting w/ font, size, style */
1764 if (gPretty.nametop) {
1765 fprintf(outf,"Name: %-16s Len:%6d Check: %8X\n", idword, seqlen, checksum);
1771 fprintf(outf," Name: %-16s Len:%6d Check: %5d Weight: 1.00\n",
1772 idword, seqlen, checksum);
1775 namewidth= 15; /* need MAX namewidth here... */
1776 sprintf(nameform, "%%+%ds ",namewidth);
1783 fprintf(outf,";%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1784 fprintf(outf,"%s\n", idword);
1785 strcpy(endstr,"1"); /* == linear dna */
1790 case kZuker: /* don't attempt Zuker's ftn format */
1792 fprintf(outf,">%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1797 if (*nameform==0) sprintf(nameform, "%%%d.%ds ",namewidth,namewidth);
1798 if (numline) sprintf(numform, "%%%ds ",numwidth);
1799 else sprintf(numform, "%%%dd ",numwidth);
1800 strcpy( nocountsymbols, kNocountsymbols);
1802 if (strchr(nocountsymbols,gPretty.gapchar)==NULL) {
1803 strcat(nocountsymbols," ");
1804 nocountsymbols[strlen(nocountsymbols)-1]= gPretty.gapchar;
1806 if (gPretty.domatch && (cp=strchr(nocountsymbols,gPretty.matchchar))!=NULL) {
1815 width = min(width,kMaxseqwidth);
1816 for (i=0, l=0, ibase = 1; i < seqlen; ) {
1820 if (nameleft) fprintf(outf, nameform, idword);
1821 if (numleft) { if (numline) fprintf(outf, numform, "");
1822 else fprintf(outf, numform, ibase);}
1823 for (j=0; j<tab; j++) fputc(' ',outf);
1826 l1++; /* don't count spaces for width*/
1828 if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) {
1829 if (numline==1) fputc(' ',outf);
1832 if (l1 % 10 == 1 || l1 == width) {
1833 if (numline==1) fprintf(outf,"%-9d ",i+1);
1834 s[l++]= '|'; /* == put a number here */
1841 if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
1843 if (!baseonlynum) ibase++;
1844 else if (0==strchr(nocountsymbols,seq[i])) ibase++;
1848 if (l1 == width || i == seqlen) {
1849 if (outform==kPretty) for ( ; l1<width; l1++) {
1850 if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
1852 s[l++]=' '; /* pad w/ blanks */
1858 if (numline==2) fprintf(outf,"%s",s); /* finish numberline ! and | */
1861 if (i == seqlen) fprintf(outf,"%s%s",s,endstr);
1862 else fprintf(outf,"%s",s);
1863 if (numright || nameright) fputc(' ',outf);
1864 if (numright) fprintf(outf,numform, ibase-1);
1865 if (nameright) fprintf(outf, nameform,idword);
1876 /* End file: ureadseq.c */