3 * Reads and writes nucleic/protein sequence in various
4 * formats. Data files may have multiple sequences.
6 * Copyright 1990 by d.g.gilbert
7 * biology dept., indiana university, bloomington, in 47405
8 * e-mail: gilbertd@bio.indiana.edu
10 * This program may be freely copied and used by anyone.
11 * Developers are encourged to incorporate parts in their
12 * programs, rather than devise their own private sequence
15 * This should compile and run with any ANSI C compiler.
17 * Alexander Sherstnev, 18/11/2013
18 * renamed getline to locgetline
30 #pragma segment ureadseq
33 int Strcasecmp(const char *a, const char *b) /* from Nlm_StrICmp */
39 diff = to_upper(*a) - to_upper(*b);
40 if (diff) return diff;
41 if (*a == '\0') done = 1;
47 int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */
53 diff = to_upper(*a) - to_upper(*b);
54 if (diff) return diff;
55 if (*a == '\0') done = 1;
69 # define Local static /* local functions */
72 #define kStartLength 500
74 const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*";
75 const char *primenuc = "ACGTU";
76 const char *protonly = "EFIPQZ";
78 const char kNocountsymbols[5] = "_.-?";
79 const char stdsymbols[6] = "_.-*?";
80 const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\";
81 static const char *seqsymbols = allsymbols;
83 const char nummask[11] = "0123456789";
84 const char nonummask[11] = "~!@#$%^&*(";
87 use general form of isseqchar -- all chars + symbols.
88 no formats except nbrf (?) use symbols in data area as
89 anything other than sequence chars.
94 /* Local variables for readSeq: */
96 short choice, err, nseq;
97 long seqlen, maxseq, seqlencount;
101 char *seq, *seqid, matchchar;
102 boolean allDone, done, filestart, addit;
108 /* int (*isseqchar)(int c); << sgi cc hates (int c) */
115 return (isalpha(c) || strchr(seqsymbols,c));
118 int isSeqNumChar(int c)
120 return (isalnum(c) || strchr(seqsymbols,c));
126 return isascii(c); /* wrap in case isascii is macro */
129 Local void readline(FILE *f, char *s, long *linestart)
133 *linestart= ftell(f);
134 if (NULL == fgets(s, 256, f))
137 cp = strchr(s, '\n');
138 if (cp != NULL) *cp = 0;
142 Local void locgetline(struct ReadSeqVars *V)
144 readline(V->f, V->s, &V->linestart);
147 Local void ungetline(struct ReadSeqVars *V)
149 fseek(V->f, V->linestart, 0);
153 Local void addseq(char *s, struct ReadSeqVars *V)
157 if (V->addit) while (*s != 0) {
158 if ((V->isseqchar)(*s)) {
159 if (V->seqlen >= V->maxseq) {
160 V->maxseq += kStartLength;
161 ptr = (char*) realloc(V->seq, V->maxseq+1);
168 V->seq[(V->seqlen)++] = *s;
174 Local void countseq(char *s, struct ReadSeqVars *V)
175 /* this must count all valid seq chars, for some formats (paup-sequential) even
176 if we are skipping seq... */
179 if ((V->isseqchar)(*s)) {
187 Local void addinfo(char *s, struct ReadSeqVars *V)
193 while (*s == ' ') s++;
194 sprintf(si, " %d) %s\n", V->nseq, s);
198 V->isseqchar = isAnyChar;
201 V->isseqchar = isSeqChar;
207 Local void readLoop(short margin, boolean addfirst,
208 boolean (*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V),
209 struct ReadSeqVars *V)
211 boolean addend = false;
212 boolean ungetend = false;
215 if (V->choice == kListSequences) V->addit = false;
216 else V->addit = (V->nseq == V->choice);
217 if (V->addit) V->seqlen = 0;
219 if (addfirst) addseq(V->s, V);
222 V->done = feof(V->f);
223 V->done |= (*endTest)( &addend, &ungetend, V);
224 if (V->addit && (addend || !V->done) && (strlen(V->s) > margin)) {
225 addseq( (V->s)+margin, V);
229 if (V->choice == kListSequences) addinfo(V->seqid, V);
231 V->allDone = (V->nseq >= V->choice);
232 if (V->allDone && ungetend) ungetline(V);
238 Local boolean endIG( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
240 *addend = true; /* 1 or 2 occur in line w/ bases */
242 return((strchr(V->s,'1')!=NULL) || (strchr(V->s,'2')!=NULL));
245 Local void readIG(struct ReadSeqVars *V)
247 /* 18Aug92: new IG format -- ^L between sequences in place of ";" */
250 while (!V->allDone) {
253 for (si= V->s; *si != 0 && *si < ' '; si++) *si= ' '; /* drop controls */
254 if (*si == 0) *V->s= 0; /* chop line to empty */
255 } while (! (feof(V->f) || ((*V->s != 0) && (*V->s != ';') ) ));
259 strcpy(V->seqid, V->s);
260 readLoop(0, false, endIG, V);
267 Local boolean endStrider( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
271 return (strstr( V->s, "//") != NULL);
274 Local void readStrider(struct ReadSeqVars *V)
275 { /* ? only 1 seq/file ? */
277 while (!V->allDone) {
279 if (strstr(V->s,"; DNA sequence ") == V->s)
280 strcpy(V->seqid, (V->s)+16);
282 strcpy(V->seqid, (V->s)+1);
283 while ((!feof(V->f)) && (*V->s == ';')) {
286 if (feof(V->f)) V->allDone = true;
287 else readLoop(0, true, endStrider, V);
292 Local boolean endPIR( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
295 *ungetend= (strstr(V->s,"ENTRY") == V->s);
296 return ((strstr(V->s,"///") != NULL) || *ungetend);
299 Local void readPIR(struct ReadSeqVars *V)
300 { /*PIR -- many seqs/file */
302 while (!V->allDone) {
303 while (! (feof(V->f) || strstr(V->s,"ENTRY") || strstr(V->s,"SEQUENCE")) )
305 strcpy(V->seqid, (V->s)+16);
306 while (! (feof(V->f) || strstr(V->s,"SEQUENCE") == V->s))
308 readLoop(0, false, endPIR, V);
311 while (! (feof(V->f) || ((*V->s != 0)
312 && (strstr( V->s,"ENTRY") == V->s))))
315 if (feof(V->f)) V->allDone = true;
320 Local boolean endGB( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
323 *ungetend= (strstr(V->s,"LOCUS") == V->s);
324 return ((strstr(V->s,"//") != NULL) || *ungetend);
327 Local void readGenBank(struct ReadSeqVars *V)
328 { /*GenBank -- many seqs/file */
330 while (!V->allDone) {
331 strcpy(V->seqid, (V->s)+12);
332 while (! (feof(V->f) || strstr(V->s,"ORIGIN") == V->s))
334 readLoop(0, false, endGB, V);
337 while (! (feof(V->f) || ((*V->s != 0)
338 && (strstr( V->s,"LOCUS") == V->s))))
341 if (feof(V->f)) V->allDone = true;
346 Local boolean endNBRF( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
350 if ((a = strchr(V->s, '*')) != NULL) { /* end of 1st seq */
351 /* "*" can be valid base symbol, drop it here */
357 else if (*V->s == '>') { /* start of next seq */
366 Local void readNBRF(struct ReadSeqVars *V)
368 while (!V->allDone) {
369 strcpy(V->seqid, (V->s)+4);
370 locgetline(V); /*skip title-junk line*/
371 readLoop(0, false, endNBRF, V);
373 while (!(feof(V->f) || (*V->s != 0 && *V->s == '>')))
376 if (feof(V->f)) V->allDone = true;
382 Local boolean endPearson( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
386 return(*V->s == '>');
389 Local void readPearson(struct ReadSeqVars *V)
391 while (!V->allDone) {
392 strcpy(V->seqid, (V->s)+1);
393 readLoop(0, false, endPearson, V);
395 while (!(feof(V->f) || ((*V->s != 0) && (*V->s == '>'))))
398 if (feof(V->f)) V->allDone = true;
404 Local boolean endEMBL( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
407 *ungetend= (strstr(V->s,"ID ") == V->s);
408 return ((strstr(V->s,"//") != NULL) || *ungetend);
411 Local void readEMBL(struct ReadSeqVars *V)
413 while (!V->allDone) {
414 strcpy(V->seqid, (V->s)+5);
417 } while (!(feof(V->f) | (strstr(V->s,"SQ ") == V->s)));
419 readLoop(0, false, endEMBL, V);
421 while (!(feof(V->f) |
422 ((*V->s != '\0') & (strstr(V->s,"ID ") == V->s))))
425 if (feof(V->f)) V->allDone = true;
431 Local boolean endZuker( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
435 return( *V->s == '(' );
438 Local void readZuker(struct ReadSeqVars *V)
440 /*! 1st string is Zuker's Fortran format */
442 while (!V->allDone) {
443 locgetline(V); /*s == "seqLen seqid string..."*/
444 strcpy(V->seqid, (V->s)+6);
445 readLoop(0, false, endZuker, V);
447 while (!(feof(V->f) |
448 ((*V->s != '\0') & (*V->s == '('))))
451 if (feof(V->f)) V->allDone = true;
457 Local boolean endFitch( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
459 /* this is a somewhat shaky end,
460 1st char of line is non-blank for seq. title
464 return( *V->s != ' ' );
467 Local void readFitch(struct ReadSeqVars *V)
472 while (!V->allDone) {
473 if (!first) strcpy(V->seqid, V->s);
474 readLoop(0, first, endFitch, V);
475 if (feof(V->f)) V->allDone = true;
481 Local void readPlain(struct ReadSeqVars *V)
484 V->addit = (V->choice > 0);
485 if (V->addit) V->seqlen = 0;
486 addseq(V->seqid, V); /*from above..*/
487 if (V->fname!=NULL) sprintf(V->seqid, "%s [Unknown form]", V->fname);
488 else sprintf(V->seqid, " [Unknown form]");
491 V->done = feof(V->f);
494 if (V->choice == kListSequences) addinfo(V->seqid, V);
499 Local void readUWGCG(struct ReadSeqVars *V)
502 10nov91: Reading GCG files casued duplication of last line when
503 EOF followed that line !!!
504 fix: locgetline now sets *V->s = 0
509 V->addit = (V->choice > 0);
510 if (V->addit) V->seqlen = 0;
511 strcpy(V->seqid, V->s);
512 /*writeseq: " %s Length: %d (today) Check: %d ..\n" */
513 /*drop above or ".." from id*/
514 if (si = strstr(V->seqid," Length: ")) *si = 0;
515 else if (si = strstr(V->seqid,"..")) *si = 0;
517 V->done = feof(V->f);
519 if (!V->done) addseq((V->s), V);
521 if (V->choice == kListSequences) addinfo(V->seqid, V);
526 Local void readOlsen(struct ReadSeqVars *V)
527 { /* G. Olsen /print output from multiple sequence editor */
529 char *si, *sj, *sk, *sm, sid[40], snum[20];
530 boolean indata = false;
533 V->addit = (V->choice > 0);
534 if (V->addit) V->seqlen = 0;
535 rewind(V->f); V->nseq= 0;
538 V->done = feof(V->f);
540 if (V->done && !(*V->s)) break;
542 if ( (si= strstr(V->s, sid))
543 /* && (strstr(V->s, snum) == si - snumlen - 1) ) { */
544 && (sm= strstr(V->s, snum)) && (sm < si - snumlen) ) {
546 /* Spaces are valid alignment data !! */
547 /* 17Oct91: Error, the left margin is 21 not 22! */
548 /* dropped some nucs up to now -- my example file was right shifted ! */
549 /* variable right id margin, drop id-2 spaces at end */
551 VMS CC COMPILER (VAXC031) mess up:
552 -- Index of 21 is chopping 1st nuc on VMS systems Only!
553 Byte-for-byte same ame rnasep.olsen sequence file !
556 /* si = (V->s)+21; < was this before VMS CC wasted my time */
557 si += 10; /* use strstr index plus offset to outfox VMS CC bug */
559 if (sk = strstr(si, sid)) *(sk-2) = 0;
560 for (sk = si; *sk != 0; sk++) {
561 if (*sk == ' ') *sk = '.';
562 /* 18aug92: !! some olsen masks are NUMBERS !! which addseq eats */
563 else if (isdigit(*sk)) *sk= nonummask[*sk - '0'];
570 else if (sk = strstr(V->s, "): ")) { /* seq info header line */
571 /* 18aug92: correct for diff seqs w/ same name -- use number, e.g. */
572 /* 3 (Agr.tume): agrobacterium.prna 18-JUN-1987 16:12 */
573 /* 328 (Agr.tume): agrobacterium.prna XYZ 19-DEC-1992 */
575 si = 1 + strchr(V->s,'(');
577 if (V->choice == kListSequences) addinfo( si, V);
578 else if (V->nseq == V->choice) {
579 strcpy(V->seqid, si);
580 sj = strchr(V->seqid, ':');
581 while (*(--sj) == ' ') ;
582 while (--sj != V->seqid) { if (*sj == ' ') *sj = '_'; }
585 while (*(--sk) == ' ') *sk = 0;
589 while ((*si <= ' ') && (*si != 0)) si++;
591 while (si[snumlen] > ' ' && snumlen<20)
592 { snum[snumlen]= si[snumlen]; snumlen++; }
598 else if (strstr(V->s,"identity: Data:")) {
600 if (V->choice == kListSequences) V->done = true;
609 Local void readMSF(struct ReadSeqVars *V)
610 { /* gcg's MSF, mult. sequence format, interleaved ! */
612 char *si, *sj, sid[128];
613 boolean indata = false;
614 int atseq= 0, iline= 0;
616 V->addit = (V->choice > 0);
617 if (V->addit) V->seqlen = 0;
618 rewind(V->f); V->nseq= 0;
621 V->done = feof(V->f);
623 if (V->done && !(*V->s)) break;
625 /*somename ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet */
626 /* E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs */
630 /* for (sj= si; isalnum(*sj); sj++) ; bug -- cdelwiche uses "-", "_" and others in names*/
631 for (sj= si; *sj > ' '; sj++) ;
634 if ( (0==strcmp(si, sid)) ) {
641 else if (NULL != (si = strstr(V->s, "Name: "))) { /* seq info header line */
642 /* Name: somename Len: 100 Check: 7009 Weight: 1.00 */
646 if (V->choice == kListSequences) addinfo( si, V);
647 else if (V->nseq == V->choice) {
648 strcpy(V->seqid, si);
651 /* for (sj= si; isalnum(*sj); sj++) ; -- bug */
652 for (sj= si; *sj > ' '; sj++) ;
658 else if ( strstr(V->s,"//") /*== V->s*/ ) {
661 if (V->choice == kListSequences) V->done = true;
672 Local void readPAUPinterleaved(struct ReadSeqVars *V)
673 { /* PAUP mult. sequence format, interleaved or sequential! */
675 char *si, *sj, *send, sid[40], sid1[40], saveseq[255];
676 boolean first = true, indata = false, domatch;
677 int atseq= 0, iline= 0, ifmc, saveseqlen=0;
679 #define fixmatchchar(s) { \
680 for (ifmc=0; ifmc<saveseqlen; ifmc++) \
681 if (s[ifmc] == V->matchchar) s[ifmc]= saveseq[ifmc]; }
683 V->addit = (V->choice > 0);
685 if (V->addit) V->seqlen = 0;
686 /* rewind(V->f); V->nseq= 0; << do in caller !*/
687 indata= true; /* call here after we find "matrix" */
688 domatch= (V->matchchar > 0);
692 V->done = feof(V->f);
694 if (V->done && !(*V->s)) break;
697 /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
698 /* chimp ................a.t. .c.................a ..........*/
699 /* !! need to correct for V->matchchar */
702 if (strchr(si,';')) indata= false;
705 /* valid data line starts w/ a left-justified seq name in columns [0..8] */
708 if (V->nseq >= V->topnseq) first= false;
709 for (sj = si; isalnum(*sj); sj++) ;
712 if (V->choice == kListSequences) {
716 else if (V->nseq == V->choice) {
718 if (V->nseq == 1) { strcpy( saveseq, sj); saveseqlen= strlen(saveseq); }
719 else fixmatchchar( sj);
723 strcpy(V->seqid, si);
725 if (V->nseq == 1) strcpy(sid1, sid);
729 else if ( (strstr(si, sid) == si) ){
730 while (isalnum(*si)) si++;
733 if (V->nseq == 1) { strcpy( saveseq, si); saveseqlen= strlen(saveseq); }
734 else fixmatchchar( si);
739 else if (domatch && (strstr(si, sid1) == si)) {
740 strcpy( saveseq, si);
741 saveseqlen= strlen(saveseq);
748 else if ( strstr(V->s,"matrix") ) {
751 if (V->choice == kListSequences) V->done = true;
757 } /*readPAUPinterleaved*/
761 Local void readPAUPsequential(struct ReadSeqVars *V)
762 { /* PAUP mult. sequence format, interleaved or sequential! */
764 boolean atname = true, indata = false;
766 V->addit = (V->choice > 0);
767 if (V->addit) V->seqlen = 0;
769 /* rewind(V->f); V->nseq= 0; << do in caller !*/
770 indata= true; /* call here after we find "matrix" */
773 V->done = feof(V->f);
775 if (V->done && !(*V->s)) break;
778 /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
779 /* aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
780 /* chimp ................a.t. .c.................a ..........*/
781 /* ................a.t. .c.................a ..........*/
785 if (strchr(si,';')) indata= false;
787 /* valid data line starts w/ a left-justified seq name in columns [0..8] */
793 while (isalnum(*sj)) sj++;
794 if (V->choice == kListSequences) {
795 /* !! we must count bases to know when topseqlen is reached ! */
797 if (V->seqlencount >= V->topseqlen) atname= true;
801 else if (V->nseq == V->choice) {
803 V->seqlencount= V->seqlen;
804 if (V->seqlencount >= V->topseqlen) atname= true;
806 strcpy(V->seqid, si);
810 if (V->seqlencount >= V->topseqlen) atname= true;
814 else if (V->nseq == V->choice) {
816 V->seqlencount= V->seqlen;
817 if (V->seqlencount >= V->topseqlen) atname= true;
821 if (V->seqlencount >= V->topseqlen) atname= true;
826 else if ( strstr(V->s,"matrix") ) {
829 if (V->choice == kListSequences) V->done = true;
835 } /*readPAUPsequential*/
838 Local void readPhylipInterleaved(struct ReadSeqVars *V)
841 boolean first = true;
844 V->addit = (V->choice > 0);
845 if (V->addit) V->seqlen = 0;
847 /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); << topnseq == 0 !!! bad scan !! */
850 V->topnseq= atoi(si);
851 while (isdigit(*si)) si++;
853 V->topseqlen= atol(si);
854 /* fprintf(stderr,"Phylip-ileaf: topnseq=%d topseqlen=%d\n",V->topnseq, V->topseqlen); */
858 V->done = feof(V->f);
860 if (V->done && !(*V->s)) break;
865 if (first) { /* collect seq names + seq, as fprintf(outf,"%-10s ",seqname); */
867 if (V->nseq >= V->topnseq) first= false;
868 sj= V->s+10; /* past name, start of data */
869 if (V->choice == kListSequences) {
873 else if (V->nseq == V->choice) {
876 strcpy(V->seqid, si);
879 else if ( iline % V->nseq == V->choice -1 ) {
887 } /*readPhylipInterleaved*/
891 Local boolean endPhylipSequential( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
896 return V->seqlencount >= V->topseqlen;
899 Local void readPhylipSequential(struct ReadSeqVars *V)
903 /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); < ? bad sscan ? */
906 V->topnseq= atoi(si);
907 while (isdigit(*si)) si++;
909 V->topseqlen= atol(si);
911 while (!V->allDone) {
913 strncpy(V->seqid, (V->s), 10);
915 for (i=0; i<10 && V->s[i]; i++) V->s[i]= ' ';
916 readLoop(0, true, endPhylipSequential, V);
917 if (feof(V->f)) V->allDone = true;
924 Local void readSeqMain(
925 struct ReadSeqVars *V,
926 const long skiplines_,
929 #define tolowerstr(s) { long Itlwr, Ntlwr= strlen(s); \
930 for (Itlwr=0; Itlwr<Ntlwr; Itlwr++) s[Itlwr]= to_lower(s[Itlwr]); }
938 V->err = eFileNotFound;
941 for (l = skiplines_; l > 0; l--) locgetline( V);
945 for (l= strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) ;
946 } while ((l == 0) && !feof(V->f));
948 if (feof(V->f)) V->err = eNoData;
950 else switch (format_) {
951 case kPlain : readPlain(V); break;
952 case kIG : readIG(V); break;
953 case kStrider: readStrider(V); break;
954 case kGenBank: readGenBank(V); break;
955 case kPIR : readPIR(V); break;
956 case kNBRF : readNBRF(V); break;
957 case kPearson: readPearson(V); break;
958 case kEMBL : readEMBL(V); break;
959 case kZuker : readZuker(V); break;
960 case kOlsen : readOlsen(V); break;
961 case kMSF : readMSF(V); break;
965 boolean interleaved= false;
967 /* rewind(V->f); V->nseq= 0; ?? assume it is at top ?? skiplines ... */
971 if (strstr( V->s, "matrix")) done= true;
972 if (strstr( V->s, "interleav")) interleaved= true;
973 if (NULL != (cp=strstr( V->s, "ntax=")) ) V->topnseq= atoi(cp+5);
974 if (NULL != (cp=strstr( V->s, "nchar=")) ) V->topseqlen= atoi(cp+6);
975 if (NULL != (cp=strstr( V->s, "matchchar=")) ) {
978 else if (*cp=='"') cp++;
982 if (interleaved) readPAUPinterleaved(V);
983 else readPAUPsequential(V);
987 /* kPhylip: ! can't determine in middle of file which type it is...*/
988 /* test for interleave or sequential and use Phylip4(ileave) or Phylip2 */
990 readPhylipSequential(V);
992 case kPhylip4: /* == kPhylip3 */
993 readPhylipInterleaved(V);
997 V->err = eUnknownFormat;
1001 strcpy(V->seqid, V->s); locgetline(V);
1007 gotuw = (strstr(V->s,"..") != NULL);
1008 if (gotuw) readUWGCG(V);
1010 } while (!(feof(V->f) || V->allDone));
1015 V->filestart= false;
1016 V->seq[V->seqlen] = 0; /* stick a string terminator on it */
1021 const short whichEntry_, /* index to sequence in file */
1022 FILE *fp_, /* pointer to open seq file */
1023 const long skiplines_,
1024 const short format_, /* sequence file format */
1025 long *seqlen_, /* return seq size */
1026 short *nseq_, /* number of seqs in file, for listSeqs() */
1027 short *error_, /* return error */
1028 char *seqid_) /* return seq name/info */
1030 struct ReadSeqVars V;
1032 if (format_ < kMinFormat || format_ > kMaxFormat) {
1033 *error_ = eUnknownFormat;
1038 V.choice = whichEntry_;
1039 V.fname = NULL; /* don't know */
1040 V.seq = (char*) calloc(1, kStartLength+1);
1041 V.maxseq = kStartLength;
1046 V.filestart= (ftell( fp_) == 0);
1047 /* !! in sequential read, must remove current seq position from choice/whichEntry_ counter !! ... */
1048 if (V.filestart) V.nseq = 0;
1049 else V.nseq= *nseq_; /* track where we are in file...*/
1054 V.isseqchar = isSeqChar;
1055 if (V.choice == kListSequences) ; /* leave as is */
1056 else if (V.choice <= 0) V.choice = 1; /* default ?? */
1057 V.addit = (V.choice > 0);
1060 readSeqMain(&V, skiplines_, format_);
1063 *seqlen_ = V.seqlen;
1069 const short whichEntry_, /* index to sequence in file */
1070 const char *filename_, /* file name */
1071 const long skiplines_,
1072 const short format_, /* sequence file format */
1073 long *seqlen_, /* return seq size */
1074 short *nseq_, /* number of seqs in file, for listSeqs() */
1075 short *error_, /* return error */
1076 char *seqid_) /* return seq name/info */
1078 struct ReadSeqVars V;
1080 if (format_ < kMinFormat || format_ > kMaxFormat) {
1081 *error_ = eUnknownFormat;
1086 V.choice = whichEntry_;
1087 V.fname = filename_; /* don't need to copy string, just ptr to it */
1088 V.seq = (char*) calloc(1, kStartLength+1);
1089 V.maxseq = kStartLength;
1097 V.isseqchar = isSeqChar;
1098 if (V.choice == kListSequences) ; /* leave as is */
1099 else if (V.choice <= 0) V.choice = 1; /* default ?? */
1100 V.addit = (V.choice > 0);
1103 V.f = fopen(V.fname, "r");
1106 readSeqMain(&V, skiplines_, format_);
1108 if (V.f != NULL) fclose(V.f);
1110 *seqlen_ = V.seqlen;
1120 const char *filename_, /* file name */
1121 const long skiplines_,
1122 const short format_, /* sequence file format */
1123 short *nseq_, /* number of seqs in file, for listSeqs() */
1124 short *error_) /* return error */
1129 return readSeq( kListSequences, filename_, skiplines_, format_,
1130 &seqlen, nseq_, error_, seqid);
1136 short seqFileFormat( /* return sequence format number, see ureadseq.h */
1137 const char *filename,
1138 long *skiplines, /* return #lines to skip any junk like mail header */
1139 short *error) /* return any error value or 0 */
1144 fseq = fopen(filename, "r");
1145 format= seqFileFormatFp( fseq, skiplines, error);
1146 if (fseq!=NULL) fclose(fseq);
1150 short seqFileFormatFp(
1152 long *skiplines, /* return #lines to skip any junk like mail header */
1153 short *error) /* return any error value or 0 */
1155 boolean foundDNA= false, foundIG= false, foundStrider= false,
1156 foundGB= false, foundPIR= false, foundEMBL= false, foundNBRF= false,
1157 foundPearson= false, foundFitch= false, foundPhylip= false, foundZuker= false,
1158 gotolsen= false, gotpaup = false, gotasn1 = false, gotuw= false, gotMSF= false,
1159 isfitch= false, isphylip= false, done= false;
1160 short format= kUnknown;
1161 int nlines= 0, k, splen= 0, otherlines= 0, aminolines= 0, dnalines= 0;
1164 int maxlines2check=500;
1166 #define ReadOneLine(sp) \
1167 { done |= (feof(fseq)); \
1168 readline( fseq, sp, &linestart); \
1169 if (!done) { splen = strlen(sp); ++nlines; } }
1173 if (fseq == NULL) { *error = eFileNotFound; return kNoformat; }
1178 /* check for mailer head & skip past if found */
1179 if (nlines < 4 && !done) {
1180 if ((strstr(sp,"From ") == sp) || (strstr(sp,"Received:") == sp)) {
1182 /* skip all lines until find one blank line */
1184 if (!done) for (k=0; (k<splen) && (sp[k]==' '); k++) ;
1185 } while ((!done) && (k < splen));
1186 *skiplines = nlines; /* !? do we want #lines or #bytes ?? */
1190 if (sp==NULL || *sp==0)
1193 /* high probability identities: */
1195 else if ( strstr(sp,"MSF:") && strstr(sp,"Type:") && strstr(sp,"Check:") )
1198 else if ((strstr(sp,"..") != NULL) && (strstr(sp,"Check:") != NULL))
1201 else if (strstr(sp,"identity: Data:") != NULL)
1204 else if ( strstr(sp,"::=") &&
1205 (strstr(sp,"Bioseq") || /* Bioseq or Bioseq-set */
1206 strstr(sp,"Seq-entry") ||
1207 strstr(sp,"Seq-submit") ) ) /* can we read submit format? */
1210 else if ( strstr(sp,"#NEXUS") == sp )
1213 /* uncertain identities: */
1215 else if (*sp ==';') {
1216 if (strstr(sp,"Strider") !=NULL) foundStrider= true;
1220 else if (strstr(sp,"LOCUS") == sp)
1222 else if (strstr(sp,"ORIGIN") == sp)
1225 else if (strstr(sp,"ENTRY ") == sp) /* ? also (strcmp(sp,"\\\\\\")==0) */
1227 else if (strstr(sp,"SEQUENCE") == sp)
1230 else if (*sp == '>') {
1231 if (sp[3] == ';') foundNBRF= true;
1232 else foundPearson= true;
1235 else if (strstr(sp,"ID ") == sp)
1237 else if (strstr(sp,"SQ ") == sp)
1240 else if (*sp == '(')
1244 if (nlines - *skiplines == 1) {
1245 int ispp= 0, ilen= 0;
1246 sscanf( sp, "%d%d", &ispp, &ilen);
1247 if (ispp > 0 && ilen > 0) isphylip= true;
1249 else if (isphylip && nlines - *skiplines == 2) {
1251 tseq= getseqtype(sp+10, strlen(sp+10));
1252 if ( isalpha(*sp) /* 1st letter in 2nd line must be of a name */
1253 && (tseq != kOtherSeq)) /* sequence section must be okay */
1257 for (k=0, isfitch= true; isfitch & (k < splen); k++) {
1258 if (k % 4 == 0) isfitch &= (sp[k] == ' ');
1259 else isfitch &= (sp[k] != ' ');
1261 if (isfitch & (splen > 20)) foundFitch= true;
1263 /* kRNA && kDNA are fairly certain...*/
1264 switch (getseqtype( sp, splen)) {
1265 case kOtherSeq: otherlines++; break;
1266 case kAmino : if (splen>20) aminolines++; break;
1268 case kRNA : if (splen>20) dnalines++; break;
1269 case kNucleic : break; /* not much info ? */
1274 /* pretty certain */
1284 /* !! we need to look further and return kASNseqentry | kASNseqset */
1286 seqentry key is Seq-entry ::=
1287 seqset key is Bioseq-set ::=
1288 ?? can't read these yet w/ ncbi tools ??
1290 Bioseq ::= << fails both bioseq-seq and seq-entry parsers !
1292 if (strstr(sp,"Bioseq-set")) format= kASNseqset;
1293 else if (strstr(sp,"Seq-entry")) format= kASNseqentry;
1294 else format= kASN1; /* other form, we can't yet read... */
1303 if (foundIG) format= kIG; /* a TOIG file from GCG for certain */
1308 else if ((dnalines > 1) || done || (nlines > maxlines2check)) {
1309 /* decide on most likely format */
1310 /* multichar idents: */
1311 if (foundStrider) format= kStrider;
1312 else if (foundGB) format= kGenBank;
1313 else if (foundPIR) format= kPIR;
1314 else if (foundEMBL) format= kEMBL;
1315 else if (foundNBRF) format= kNBRF;
1316 /* single char idents: */
1317 else if (foundIG) format= kIG;
1318 else if (foundPearson) format= kPearson;
1319 else if (foundZuker) format= kZuker;
1321 else if (foundPhylip) format= kPhylip;
1322 /* spacing ident: */
1323 else if (foundFitch) format= kFitch;
1324 /* no format chars: */
1325 else if (otherlines > 0) format= kUnknown;
1326 else if (dnalines > 1) format= kPlain;
1327 else if (aminolines > 1) format= kPlain;
1328 else format= kUnknown;
1333 /* need this for possible long header in olsen format */
1334 else if (strstr(sp,"): ") != NULL)
1338 if (format == kPhylip) {
1339 /* check for interleaved or sequential -- really messy */
1341 long i, j, nspp= 0, nlen= 0, ilen, leaf= 0, seq= 0;
1345 for (i=0; i < *skiplines; i++) ReadOneLine(sp);
1348 sscanf( sp, "%d%d", &nspp, &nlen);
1349 ReadOneLine(sp); /* 1st seq line */
1350 for (ps= sp+10, ilen=0; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1352 for (i= 1; i<nspp; i++) {
1355 tseq= getseqtype(sp+10, strlen(sp+10));
1356 tname= getseqtype(sp, 10);
1357 for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
1358 for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1360 /* find probable interleaf or sequential ... */
1361 if (j>=9) seq += 10; /* pretty certain not ileaf */
1363 if (tseq != tname) leaf++; else seq++;
1364 if (tname == kDNA || tname == kRNA) seq++; else leaf++;
1367 if (ilen <= nlen && j<9) {
1368 if (tname == kOtherSeq) leaf += 10;
1369 else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++;
1371 else if (ilen > nlen) {
1375 for ( nspp *= 2 ; i<nspp; i++) { /* this should be only bases if interleaf */
1378 tseq= getseqtype(sp+10, strlen(sp+10));
1379 tname= getseqtype(sp, 10);
1380 for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1381 for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
1383 if (tname == kOtherSeq) seq += 10;
1384 if (tseq != tname) seq++; else leaf++;
1385 if (tname == kDNA || tname == kRNA) leaf++; else seq++;
1388 if (j>9) leaf += 10; /* must be a name here for sequent */
1389 else if (tname == kOtherSeq) seq += 10;
1394 if (leaf > seq) format= kPhylip4;
1395 else format= kPhylip2;
1400 } /* SeqFileFormat */
1405 unsigned long GCGchecksum( const char *seq, const long seqlen, unsigned long *checktotal)
1408 register long i, check = 0, count = 0;
1410 for (i = 0; i < seqlen; i++) {
1412 check += count * to_upper(seq[i]);
1413 if (count == 57) count = 0;
1416 *checktotal += check;
1417 *checktotal %= 10000;
1421 /* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) */
1422 const unsigned long crctab[] = {
1423 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
1424 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
1425 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
1426 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
1427 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
1428 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
1429 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
1430 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
1431 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
1432 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
1433 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
1434 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
1435 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
1436 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
1437 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
1438 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
1439 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
1440 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
1441 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
1442 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
1443 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
1444 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
1445 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
1446 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
1447 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
1448 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
1449 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
1450 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
1451 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
1452 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
1453 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
1454 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
1455 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
1456 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
1457 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
1458 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
1459 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
1460 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
1461 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
1462 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
1463 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
1464 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
1465 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
1466 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
1467 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
1468 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
1469 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
1470 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
1471 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
1472 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
1473 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
1477 unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal)
1478 /*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source */
1480 register unsigned long c = 0xffffffffL;
1481 register long n = seqlen;
1484 c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8);
1485 seq++; /* fixed aug'98 finally */
1495 short getseqtype( const char *seq, const long seqlen)
1496 { /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? */
1499 short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0;
1501 maxtest = min(300, seqlen);
1502 for (i = 0; i < maxtest; i++) {
1503 c = to_upper(seq[i]);
1504 if (strchr(protonly, c)) po++;
1505 else if (strchr(primenuc,c)) {
1508 else if (c == 'U') nu++;
1510 else if (strchr(aminos,c)) aa++;
1511 else if (strchr(seqsymbols,c)) ns++;
1512 else if (isalpha(c)) no++;
1515 if ((no > 0) || (po+aa+na == 0)) return kOtherSeq;
1516 /* ?? test for probability of kOtherSeq ?, e.g.,
1517 else if (po+aa+na / maxtest < 0.70) return kOtherSeq;
1519 else if (po > 0) return kAmino;
1521 if (nu > nt) return kRNA;
1524 else if (na > aa) return kNucleic;
1529 char* compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen)
1531 register char *a, *b;
1536 if (!seq) return NULL;
1537 newseq = (char*) malloc(seqlen+1);
1538 if (!newseq) return NULL;
1539 for (a= (char*)seq, b=newseq, i=0; *a!=0; a++)
1545 newseq = (char*) realloc(newseq, i+1);
1553 char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \
1555 {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\
1556 {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\
1557 {\\f5\\froman Times;}{\\f6\\froman Palatino;}\
1558 {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\
1560 {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\
1561 {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\
1562 {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}";
1564 char *rtftail = "}";
1567 short writeSeq(FILE *outf, const char *seq, const long seqlen,
1568 const short outform, const char *seqid)
1569 /* dump sequence to standard output */
1571 const short kSpaceAll = -9;
1572 #define kMaxseqwidth 250
1574 boolean baseonlynum= false; /* nocountsymbols -- only count true bases, not "-" */
1575 short numline = 0; /* only true if we are writing seq number line (for interleave) */
1576 boolean numright = false, numleft = false;
1577 boolean nameright = false, nameleft = false;
1578 short namewidth = 8, numwidth = 8;
1579 short spacer = 0, width = 50, tab = 0;
1580 /* new parameters: width, spacer, those above... */
1582 short linesout = 0, seqtype = kNucleic;
1583 long i, j, l, l1, ibase;
1584 char idword[31], endstr[10];
1585 char seqnamestore[128], *seqname = seqnamestore;
1586 char s[kMaxseqwidth], *cp;
1587 char nameform[10], numform[10], nocountsymbols[10];
1588 unsigned long checksum = 0, checktotal = 0;
1591 skipwhitespace(seqid);
1592 l = min(128, strlen(seqid));
1593 strncpy( seqnamestore, seqid, l);
1596 sscanf( seqname, "%30s", idword);
1597 sprintf(numform, "%d", seqlen);
1598 numwidth= strlen(numform)+1;
1601 if (strstr(seqname,"checksum") != NULL) {
1602 cp = strstr(seqname,"bases");
1604 for ( ; (cp!=seqname) && (*cp!=','); cp--) ;
1605 if (cp!=seqname) *cp=0;
1612 if (outform == kGCG || outform == kMSF)
1613 checksum = GCGchecksum(seq, seqlen, &checktotal);
1615 checksum = seqchecksum(seq, seqlen, &checktotal);
1620 case kUnknown: /* no header, just sequence */
1621 strcpy(endstr,"\n"); /* end w/ extra blank line */
1624 case kOlsen: /* Olsen seq. editor takes plain nucs OR Genbank */
1626 fprintf(outf,"LOCUS %s %d bp\n", idword, seqlen);
1627 fprintf(outf,"DEFINITION %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1628 /* fprintf(outf,"ACCESSION %s\n", accnum); */
1629 fprintf(outf,"ORIGIN \n");
1632 numwidth = 8; /* dgg. 1Feb93, patch for GDE fail to read short numwidth */
1633 strcpy(endstr, "\n//");
1638 /* somewhat like genbank... \\\*/
1639 /* fprintf(outf,"\\\\\\\n"); << only at top of file, not each entry... */
1640 fprintf(outf,"ENTRY %s \n", idword);
1641 fprintf(outf,"TITLE %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1642 /* fprintf(outf,"ACCESSION %s\n", accnum); */
1643 fprintf(outf,"SEQUENCE \n");
1648 strcpy(endstr, "\n///");
1649 /* run a top number line for PIR */
1650 for (j=0; j<numwidth; j++) fputc(' ',outf);
1651 for (j= 5; j<=width; j += 5) fprintf(outf,"%10d",j);
1657 if (getseqtype(seq, seqlen) == kAmino)
1658 fprintf(outf,">P1;%s\n", idword);
1660 fprintf(outf,">DL;%s\n", idword);
1661 fprintf(outf,"%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1663 strcpy(endstr,"*\n");
1668 fprintf(outf,"ID %s\n", idword);
1669 /* fprintf(outf,"AC %s\n", accnum); */
1670 fprintf(outf,"DE %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1671 fprintf(outf,"SQ %d BP\n", seqlen);
1672 strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
1673 tab = 4; /** added 31jan91 */
1674 spacer = 11; /** added 31jan91 */
1680 fprintf(outf,"%s\n", seqname);
1681 /* fprintf(outf,"ACCESSION %s\n", accnum); */
1682 fprintf(outf," %s Length: %d (today) Check: %d ..\n", idword, seqlen, checksum);
1685 strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */
1689 case kStrider: /* ?? map ?*/
1690 fprintf(outf,"; ### from DNA Strider ;-)\n");
1691 fprintf(outf,"; DNA sequence %s, %d bases, %X checksum.\n;\n", seqname, seqlen, checksum);
1692 strcpy(endstr, "\n//");
1697 fprintf(outf,"%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1705 /* this is version 3.2/3.4 -- simplest way to write
1706 version 3.3 is to write as version 3.2, then
1707 re-read file and interleave the species lines */
1708 if (strlen(idword)>10) idword[10] = 0;
1709 fprintf(outf,"%-10s ",idword);
1716 seqtype= getseqtype(seq, seqlen);
1718 case kDNA : cp= "dna"; break;
1719 case kRNA : cp= "rna"; break;
1720 case kNucleic : cp= "na"; break;
1721 case kAmino : cp= "aa"; break;
1722 case kOtherSeq: cp= "not-set"; break;
1724 fprintf(outf," seq {\n");
1725 fprintf(outf," id { local id %d },\n", gPretty.atseq);
1726 fprintf(outf," descr { title \"%s\" },\n", seqid);
1727 fprintf(outf," inst {\n");
1728 fprintf(outf," repr raw, mol %s, length %d, topology linear,\n", cp, seqlen);
1729 fprintf(outf," seq-data\n");
1730 if (seqtype == kAmino)
1731 fprintf(outf," iupacaa \"");
1733 fprintf(outf," iupacna \"");
1738 strcpy(endstr,"\"\n } } ,");
1748 /* strcpy(endstr,";\nend;"); << this is end of all seqs.. */
1749 /* do a header comment line for paup */
1750 fprintf(outf,"[Name: %-16s Len:%6d Check: %8X]\n", idword, seqlen, checksum);
1755 numline= gPretty.numline;
1756 baseonlynum= gPretty.baseonlynum;
1757 namewidth = gPretty.namewidth;
1758 numright = gPretty.numright;
1759 numleft = gPretty.numleft;
1760 nameright = gPretty.nameright;
1761 nameleft = gPretty.nameleft;
1762 spacer = gPretty.spacer + 1;
1763 width = gPretty.seqwidth;
1765 /* also add rtf formatting w/ font, size, style */
1766 if (gPretty.nametop) {
1767 fprintf(outf,"Name: %-16s Len:%6d Check: %8X\n", idword, seqlen, checksum);
1773 fprintf(outf," Name: %-16s Len:%6d Check: %5d Weight: 1.00\n",
1774 idword, seqlen, checksum);
1777 namewidth= 15; /* need MAX namewidth here... */
1778 sprintf(nameform, "%%+%ds ",namewidth);
1785 fprintf(outf,";%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1786 fprintf(outf,"%s\n", idword);
1787 strcpy(endstr,"1"); /* == linear dna */
1792 case kZuker: /* don't attempt Zuker's ftn format */
1794 fprintf(outf,">%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
1799 if (*nameform==0) sprintf(nameform, "%%%d.%ds ",namewidth,namewidth);
1800 if (numline) sprintf(numform, "%%%ds ",numwidth);
1801 else sprintf(numform, "%%%dd ",numwidth);
1802 strcpy( nocountsymbols, kNocountsymbols);
1804 if (strchr(nocountsymbols,gPretty.gapchar)==NULL) {
1805 strcat(nocountsymbols," ");
1806 nocountsymbols[strlen(nocountsymbols)-1]= gPretty.gapchar;
1808 if (gPretty.domatch && (cp=strchr(nocountsymbols,gPretty.matchchar))!=NULL) {
1817 width = min(width,kMaxseqwidth);
1818 for (i=0, l=0, ibase = 1; i < seqlen; ) {
1822 if (nameleft) fprintf(outf, nameform, idword);
1823 if (numleft) { if (numline) fprintf(outf, numform, "");
1824 else fprintf(outf, numform, ibase);}
1825 for (j=0; j<tab; j++) fputc(' ',outf);
1828 l1++; /* don't count spaces for width*/
1830 if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) {
1831 if (numline==1) fputc(' ',outf);
1834 if (l1 % 10 == 1 || l1 == width) {
1835 if (numline==1) fprintf(outf,"%-9d ",i+1);
1836 s[l++]= '|'; /* == put a number here */
1843 if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
1845 if (!baseonlynum) ibase++;
1846 else if (0==strchr(nocountsymbols,seq[i])) ibase++;
1850 if (l1 == width || i == seqlen) {
1851 if (outform==kPretty) for ( ; l1<width; l1++) {
1852 if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
1854 s[l++]=' '; /* pad w/ blanks */
1860 if (numline==2) fprintf(outf,"%s",s); /* finish numberline ! and | */
1863 if (i == seqlen) fprintf(outf,"%s%s",s,endstr);
1864 else fprintf(outf,"%s",s);
1865 if (numright || nameright) fputc(' ',outf);
1866 if (numright) fprintf(outf,numform, ibase-1);
1867 if (nameright) fprintf(outf, nameform,idword);
1878 /* End file: ureadseq.c */