sources/readseq/Formats

   1 ||||||||||| ReadSeq supported formats   (revised 30Dec92)
   2 --------------------------------------------------------
   3
   4     -f[ormat=]Name Format name for output:
   5          1. IG/Stanford           10. Olsen (in-only)
   6          2. GenBank/GB            11. Phylip3.2
   7          3. NBRF                  12. Phylip
   8          4. EMBL                  13. Plain/Raw
   9          5. GCG                   14. PIR/CODATA
  10          6. DNAStrider            15. MSF
  11          7. Fitch                 16. ASN.1
  12          8. Pearson/Fasta         17. PAUP
  13          9. Zuker (in-only)       18. Pretty (out-only)
  14
  15 In general, output supports only minimal subsets of each format
  16 needed for sequence data exchanges.  Features, descriptions
  17 and other format-unique information are discarded.
  18
  19 For users of the Olsen multi-sequence editor (VMS): the Olsen format
  20 here is produced with the print command:
  21   print/out=some.file
  22 Use Genbank output from readseq to produce a format that this
  23 editor can read, and use the command
  24   load/genbank some.file
  25 Dan Davison has a VMS program that will convert to/from the
  26 Olsen native binary data format.  E-mail davison@uh.edu
  27
  28 Warning: Phylip format input is now supported (30Dec92); however, the
  29 auto-detection of Phylip format is very probabilistic and messy,
  30 especially distinguishing sequential from interleaved versions. It
  31 is not recommended that one use readseq to convert files from Phylip
  32 format to others unless essential.
  33
  34
  35
  36 ||||||||||| ReadSeq usage             (revised 11Nov91)
  37 --------------------------------------------------------
  38
  39 A. determine file format:
  40
  41         short skiplines;  /* result: number of header lines to skip (or 0) */
  42         short error;      /* error result or 0 */
  43         short format;     /* resulting format code, see ureadseq.h */
  44         char  *filename   = "Mysequence.file";
  45
  46         format = seqFileFormat( filename, &skiplines, &error);
  47         if (error!=0) fail;
  48
  49 B. read number and list of sequences (optional)
  50         short numseqs;    /* resulting number of sequences found in file */
  51         char  *seqlist;   /* list of sequence names, newline separated, 0 terminated */
  52
  53         seqlist = listSeqs( filename, skiplines, format, &numseqs, &error);
  54         if (error==0)  display (seqlist);
  55         free( seqlist);
  56
  57 C.  read individual sequences as desired
  58         short seqIndex;   /* sequence index #, or == kListSeqs for listSeqs equivalent */
  59         long  seqlen;     /* length of seq */
  60         char  seqid[256]; /* sequence name */
  61         char  *seq;       /* sequence, 0 terminated, free when done */
  62
  63         seq = readSeq( seqIndex, filename, skiplines, format,
  64                       &seqlen, &numseqs, &error, seqid);
  65         if (error==0) manipulate(seq);
  66         free(seq);
  67
  68 D. write sequences as desired
  69         int nlines;     /* number of lines of sequence written */
  70         FILE* fout;     /* open file pointer (stdout or other) */
  71         short outform;  /* output format, see ureadseq.h */
  72
  73         nlines = writeSeq( fout, seq, seqlen, format, outform, seqid);
  74
  75
  76 Note (30Dec92): There is various processing done by the main program (in readseq.c),
  77   rather than just in the subroutines (in ureadseq.c).  Especially for interleaved
  78   output formats, the writeSeq subroutine does not handle interleaving, nor some of
  79   the formatting at the top and end of output files.  While seqFileFormat, listSeqs,
  80   and readSeq subroutines are fairly self-contained, the writeSeq depends a lot on
  81   auxiliary processing.  At some point, this may be revised so writeSeq is self-
  82   contained.
  83
  84 Note 2: The NCBI toolkit (ftp from ncbi.nlm.nih.gov) is needed for the ASN.1 format
  85   reading (see ureadasn.c).  A bastard (but workable I hope) ASN.1 format is written
  86   by writeSeq alone.
  87
  88
  89
  90 |||||||||||  sequence formats....
  91 ---------------------------------------------------
  92
  93 stanford/IG
  94 ;comments
  95 ;...
  96 seq1 info
  97 abcd...
  98 efgh1 (or 2 = terminator)
  99 ;another seq
 100 ;....
 101 seq2 info
 102 abcd...1
 103 --- for e.g. ----
 104 ;     Dro5s-T.Seq  Length: 120  April 6, 1989  21:22  Check: 9487  ..
 105 dro5stseq
 106 GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG
 107 GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1
 108
 109 ;  TOIG of: Dro5srna.Seq  check: 9487  from: 1  to: 120
 110 ---------------------------------------------------
 111
 112 Genbank:
 113 LOCUS    seq1 ID..
 114 ...
 115 ORIGIN ...
 116 123456789abcdefg....(1st 9 columns are formatting)
 117      hijkl...
 118 //         (end of sequence)
 119 LOCUS     seq2 ID ..
 120 ...
 121 ORIGIN
 122       abcd...
 123 //
 124 ---------------------------------------------------
 125
 126 NBRF format: (from uwgcg ToNBRF)
 127 >DL;DRO5SRNA
 128 Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA
 129
 130       51  AAUUAAGCAG CGUCGCGGGC GGUUAGUACU UAGAUGGGGG ACCGCUUGGG
 131      101  AACACCGCGU GUUGUUGGCC U
 132
 133 ---------------------------------------------------
 134
 135 EMBL format
 136 ID345 seq1 id   (the 345 are spaces)
 137 ... other info
 138 SQ345Sequence   (the 3,4,5 are spaces)
 139 abcd...
 140 hijk...
 141 //              (! this is proper end string: 12Oct90)
 142 ID    seq2 id
 143 ...
 144 SQ   Sequence
 145 abcd...
 146 ...
 147 //
 148 ---------------------------------------------------
 149
 150 UW GCG Format:
 151 comments of any form, up to ".." signal
 152 signal line has seq id, and " Check: ####   .."
 153 only 1 seq/file
 154
 155 -- e.g. --- (GCG from GenBank)
 156 LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987
 157     ... much more ...
 158 ORIGIN      1 bp upstream of EcoRI site; chromosome BK9 region 69A1.
 159
 160 INVERTEBRATE:DROEST6  Length: 1819  January 9, 1989  16:48  Check: 8008  ..
 161
 162        1  GAATTCGCCG GAGTGAGGAG CAACATGAAC TACGTGGGAC TGGGACTTAT
 163
 164       51  CATTGTGCTG AGCTGCCTTT GGCTCGGTTC GAACGCGAGT GATACAGATG
 165
 166
 167 ---------------------------------------------------
 168
 169 DNAStrider (Mac) = modified Stanford:
 170 ; ### from DNA Strider  Friday, April 7, 1989   11:04:24 PM
 171 ; DNA sequence  pBR322   4363  b.p. complete sequence
 172 ;
 173 abcd...
 174 efgh
 175 //  (end of sequence)
 176 ---------------------------------------------------
 177
 178 Fitch format:
 179 Dro5srna.Seq
 180  GCC AAC GAC CAU ACC ACG CUG AAU ACA UCG GUU CUC GUC CGA UCA CCG AAA UUA AGC AGC
 181  GUC GCG GGC GGU UAG UAC UUA GAU GGG GGA CCG CUU GGG AAC ACC GCG UGU UGU UGG CCU
 182 Droest6.Seq
 183  GAA TTC GCC GGA GTG AGG AGC AAC ATG AAC TAC GTG GGA CTG GGA CTT ATC ATT GTG CTG
 184  AGC TGC CTT TGG CTC GGT TCG AAC GCG AGT GAT ACA GAT GAC CCT CTG TTG GTG CAG CTG
 185 ---------------------------------------------------
 186
 187 W.Pearson/Fasta format:
 188 >BOVPRL GenBank entry BOVPRL from omam file.  907 nucleotides.
 189 TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT
 190
 191 ---------------------------------------------------
 192 Phylip version 3.2 format (e.g., DNAML):
 193
 194    5   13 YF                (# seqs, #bases, YF)
 195 Alpha     AACGTGGCCAAAT
 196           aaaagggccc...  (continued sp. alpha)
 197 Beta      AAGGTCGCCAAAC
 198           aaaagggccc...  (continued sp. beta)
 199 Gamma     CATTTCGTCACAA
 200           aaaagggccc...  (continued sp. Gamma)
 201 1234567890^-- bases must start in col 11, and run 'til #bases
 202         (spaces & newlines are okay)
 203 ---------------------------------------------------
 204 Phylip version 3.3 format (e.g., DNAML):
 205
 206   5    42  YF             (# seqs, #bases, YF)
 207 Turkey    AAGCTNGGGC ATTTCAGGGT
 208 Salmo gairAAGCCTTGGC AGTGCAGGGT
 209 H. SapiensACCGGTTGGC CGTTCAGGGT
 210 Chimp     AAACCCTTGC CGTTACGCTT
 211 Gorilla   AAACCCTTGC CGGTACGCTT
 212 1234567890^-- bases must start in col 11
 213   !! this version interleaves the species -- contrary to
 214      all other output formats.
 215
 216 GAGCCCGGGC AATACAGGGT AT
 217 GAGCCGTGGC CGGGCACGGT AT
 218 ACAGGTTGGC CGTTCAGGGT AA
 219 AAACCGAGGC CGGGACACTC AT
 220 AAACCATTGC CGGTACGCTT AA
 221
 222 ---------------------------------------------------
 223 Phylip version 3.4 format (e.g., DNAML)
 224 -- Both Interleaved and sequential are permitted
 225
 226    5   13                (# seqs, #bases)
 227 Alpha     AACGTGGCCAAAT
 228           aaaagggccc...  (continued sp. alpha)
 229 Beta      AAGGTCGCCAAAC
 230           aaaagggccc...  (continued sp. beta)
 231 Gamma     CATTTCGTCACAA
 232           aaaagggccc...  (continued sp. Gamma)
 233 1234567890^-- bases must start in col 11, and run 'til #bases
 234         (spaces, newlines and numbers are ignored)
 235
 236 ---------------------------------------------------
 237 Gary Olsen (multiple) sequence editor /print format:
 238
 239 !---------------------
 240 !17Oct91 -- error in original copy of olsen /print format, shifted right 1 space
 241 ! here is correct copy:
 242   301  40 Tb.thiop  CGCAGCGAAA----------GCUNUGCUAAUACCGCAUA-CGnCCUG-----------------------------------------------------  Tb.thiop
 243 123456789012345678901
 244   301  42 Rhc.purp  CGUAGCGAAA----------GUUACGCUAAUACCGCAUA-UUCUGUG-----------------------------------------------------  Rhc.purp
 245
 246   301  44 Rhc.gela  nnngnCGAAA----------GCCGGAUUAAUACCGCAUA-CGACCUA-----------------------------------------------------  Rhc.gela
 247 !---------------------
 248
 249  RNase P RNA components.  on 20-FEB-90 17:23:58
 250
 251     1 (E.c. pr ):  Base pairing in Escherichia coli RNase P RNA.
 252     2 (chrom   ):  Chromatium
 253       :
 254    12 (B.brevis):  Bacillus brevis RNase P RNA, B. James.
 255    13 ( 90% con):   90% conserved
 256    14 (100% con):  100% conserved
 257    15 (gram+ pr):  pairing
 258
 259 1
 260  RNase P RNA components.  on 20-FEB-90 17:23:58
 261
 262  Posi-   Sequence
 263  tion:   identity:   Data:
 264
 265      1   1 E.c. pr      <<<<<<<<<< {{{{{{{{<<:<<<<<<<<<<^<<<<<<====>>>>  E.c. pr
 266      1   2 chrom        GGAGUCGGCCAGACAGUCGCUUCCGUCCU------------------  chrom
 267             :
 268      1  12 B.brevis  AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCU-------------  B.brevis
 269 1234567890123456789012 <! this should be 21 not 22,
 270 ! this example must be inset on left by 1 space from olsen /print files !
 271      1  13  90% con           G  C G  A  CGC GC               -    -      90% con
 272      1  14 100% con                G  A  CGC                             100% con
 273      1  15 gram+ pr     <<<<<<<<<< {{{{{{{{<<<<<<<<<<<<<===============  gram+ pr
 274
 275     60   1 E.c. pr   >>>>>>^>>^>>>>:>>    <<<^<<<< {{{{{                 E.c. pr
 276     60   2 chrom     -----GGUG-ACGGGGGAGGAAAGUCCGG-GCUCCAU-------------  chrom
 277     :       :
 278     60  10 B.stearo  ----UU-CG-GCCGUAGAGGAAAGUCCAUGCUCGCACGGUGCUGAGAUGC  B.stearo
 279
 280
 281 ---------------------------------------------------
 282   GCG MSF format
 283 Title line
 284
 285 picorna.msf  MSF: 100  Type: P  January 17, 1991  17:53  Check: 541
 286 ..
 287 Name: Cb3              Len:   100  Check: 7009  Weight:  1.00
 288 Name: E                Len:   100  Check:   60  Weight:  1.00
 289
 290 //
 291
 292    1                                                   50
 293 Cb3  ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet
 294   E  gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs
 295
 296    51                                                 100
 297
 298 Cb3  ghtsqvvpgd tmqtrhvkny hsrsestien flcrsacvyf teykn.....
 299   E  ...spi.gaf tvks...... gs.lesgfap .fsngtc.pn sviltpgpqf
 300
 301 ---------------------------------------------------
 302      PIR format
 303 This is NBRF-PIR MAILSERVER version 1.45
 304 Command-> get PIR3:A31391
 305 \\\
 306 ENTRY           A31391       #Type Protein
 307 TITLE           *Esterase-6 - Fruit fly (Drosophila melanogaster)
 308
 309 DATE            03-Aug-1992 #Sequence 03-Aug-1992 #Text 03-Aug-1992
 310 PLACEMENT          0.0    0.0    0.0    0.0    0.0
 311 COMMENT         *This entry is not verified.
 312 SOURCE          Drosophila melanogaster
 313
 314 REFERENCE
 315    #Authors     Cooke P.H., Oakeshott J.G.
 316    #Citation    submitted to GenBank, April 1989
 317    #Reference-number A31391
 318    #Accession   A31391
 319    #Cross-reference GB:J04167
 320
 321 SUMMARY       #Molecular-weight 61125  #Length 544  #Checksum  1679
 322 SEQUENCE
 323                 5        10        15        20        25        30
 324       1 M N Y V G L G L I I V L S C L W L G S N A S D T D D P L L V
 325      31 Q L P Q G K L R G R D N G S Y Y S Y E S I P Y A E P P T G D
 326      61 L R F E A P E P Y K Q K W S D I F D A T K T P V A C L Q W D
 327      91 Q F T P G A N K L V G E E D C L T V S V Y K P K N S K R N S
 328     121 F P V V A H I H G G A F M F G A A W Q N G H E N V M R E G K
 329     151 F I L V K I S Y R L G P L G F V S T G D R D L P G N Y G L K
 330     181 D Q R L A L K W I K Q N I A S F G G E P Q N V L L V G H S A
 331     211 G G A S V H L Q M L R E D F G Q L A R A A F S F S G N A L D
 332     241 P W V I Q K G A R G R A F E L G R N V G C E S A E D S T S L
 333     271 K K C L K S K P A S E L V T A V R K F L I F S Y V P F A P F
 334     301 S P V L E P S D A P D A I I T Q D P R D V I K S G K F G Q V
 335     331 P W A V S Y V T E D G G Y N A A L L L K E R K S G I V I D D
 336     361 L N E R W L E L A P Y L L F Y R D T K T K K D M D D Y S R K
 337     391 I K Q E Y I G N Q R F D I E S Y S E L Q R L F T D I L F K N
 338     421 S T Q E S L D L H R K Y G K S P A Y A Y V Y D N P A E K G I
 339     451 A Q V L A N R T D Y D F G T V H G D D Y F L I F E N F V R D
 340     481 V E M R P D E Q I I S R N F I N M L A D F A S S D N G S L K
 341     511 Y G E C D F K D N V G S E K F Q L L A I Y I D G C Q N R Q H
 342     541 V E F P
 343 ///
 344 \\\
 345 ---------------------------------------------------
 346 PAUP format:
 347 The NEXUS Format
 348
 349 Every block starts with "BEGIN blockname;" and ends with "END;".
 350 Each block is composed of one or more statements, each
 351 terminated by a semicolon (;).
 352
 353 Comments may be included in NEXUS files by enclosing them within
 354 square brackets, as in "[This is a comment]."
 355
 356 NEXUS-conforming files are identified by a "#NEXUS" directive at
 357 the very beginning of the file (line 1, column 1).  If the
 358 #NEXUS is omitted PAUP issues a warning but continues
 359 processing.
 360
 361 NEXUS files are entirely free-format.  Blanks, tabs, and
 362 newlines may be placed anywhere in the file.  Unless RESPECTCASE
 363 is requested, commands and data may be entered in upper case,
 364 lower case, or a mixture of upper and lower case.
 365
 366 The following conventions are used in the syntax descriptions of
 367 the various blocks.  Upper-case items are entered exactly as
 368 shown.  Lower-case items inside of angle brackets -- e.g., <x>
 369 -- represent items to be substituted by the user.  Items inside
 370 of square brackets -- e.g., [X] -- are optional.  Items inside
 371 of curly braces and separated by vertical bars -- e.g.,  { X | Y
 372 | Z } -- are mutually exclusive options.
 373
 374
 375 The DATA Block
 376
 377 The DATA block contains the data matrix and other associated
 378 information.  Its syntax is:
 379
 380 BEGIN DATA;
 381 DIMENSIONS NTAX=<number of taxa> NCHAR=<number of characters>;
 382   [ FORMAT  [ MISSING=<missing-symbol> ]
 383         [ LABELPOS={ LEFT | RIGHT } ]
 384         [ SYMBOLS="<symbols-list>" ]
 385         [ INTERLEAVE ]
 386         [ MATCHCHAR=<match-symbol> ]
 387         [ EQUATE="<symbol>=<expansion> [<symbol>=<expansion>...]" ]
 388         [ TRANSPOSE ]
 389         [ RESPECTCASE ]
 390         [ DATATYPE = { STANDARD | DNA | RNA | PROTEIN } ]; ]
 391         [ OPTIONS [ IGNORE={ INVAR | UNINFORM } ]
 392         [ MSTAXA = { UNCERTAIN | POLYMORPH | VARIABLE } ]
 393         [ ZAP = "<list of zapped characters>" ] ; ]
 394   [ CHARLABELS <label_1> <label_2> ... <label_NCHAR> ; ]
 395   [ TAXLABELS <label1_1> <label1_2> <label1_NTAX> ; ]
 396   [ STATELABELS <currently ignored by PAUP> ; ]
 397   MATRIX <data-matrix> ;
 398   END;
 399
 400 --- example PAUP file
 401
 402 #NEXUS
 403
 404 [!Brown et al. (1982) primate mitochondrial DNA]
 405
 406 begin data;
 407   dimensions ntax=5 nchar=896;
 408   format datatype=dna matchchar=. interleave missing='-';
 409   matrix
 410 [                              2                    4                    6            8                    ]
 411 [         1                    1                    1                    1            1                    ]
 412 human     aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcctcattactatt ctgcctagcaaactcaaact acgaacgcactcacagtcgc
 413 chimp     ................a.t. .c.................a ...............t.... ..................t. .t........c.........
 414 gorilla   ..................tg ....t.....t........a ........a......t.... .................... .......a..c.....c...
 415 orang     ................ac.. cc.....g..t.....t..a ..c........cc....g.. .................... .......a..c.....c...
 416 gibbon    ......t..a..t...ac.g .c.................a ..a..c..t..cc.g..... ......t............. .......a........c...
 417
 418 [         8                    8                    8                    8            8              8     ]
 419 [         0                    2                    4                    6            8              9     ]
 420 [         1                    1                    1                    1            1              6     ]
 421 human     cttccccacaacaatattca tgtgcctagaccaagaagtt attatctcgaactgacactg agccacaacccaaacaaccc agctctccctaagctt
 422 chimp     t................... .a................c. ........a.....g..... ...a................ ................
 423 gorilla   ..................tc .a................c. ........a.g......... ...a.............tt. .a..............
 424 orang     ta....a...........t. .c.......ga......acc ..cg..a.a......tg... .a.a..c.....g...cta. .a.....a........
 425 gibbon    a..t.......t........ ....ac...........acc .....t..a........... .a.tg..........gctag .a..............
 426   ;
 427 end;
 428 ---------------------------------------------------
 429
 430
 431
 432
 433
 434
 435 |||||||||||  Sample SMTP mail header
 436 ---------------------------------------------------
 437
 438 - - - - - - - - -
 439 From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:56 1991
 440 Received: from genbank.bio.net by sunflower.bio.indiana.edu
 441         (4.1/9.5jsm) id AA19328; Sun, 10 Nov 91 17:28:55 EST
 442 Received: by genbank.bio.net (5.65/IG-2.0)
 443         id AA14458; Sun, 10 Nov 91 14:30:03 -0800
 444 Date: Sun, 10 Nov 91 14:30:03 -0800
 445 Message-Id: <9111102230.AA14458@genbank.bio.net>
 446 From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
 447 To: gilbertd@sunflower.bio.indiana.edu
 448 Subject: Results of Query for drorna
 449 Status: R
 450
 451 No matches on drorna.
 452 - - - - - -
 453 From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:49 1991
 454 Received: from genbank.bio.net by sunflower.bio.indiana.edu
 455         (4.1/9.5jsm) id AA19323; Sun, 10 Nov 91 17:28:47 EST
 456 Received: by genbank.bio.net (5.65/IG-2.0)
 457         id AA14461; Sun, 10 Nov 91 14:30:03 -0800
 458 Date: Sun, 10 Nov 91 14:30:03 -0800
 459 Message-Id: <9111102230.AA14461@genbank.bio.net>
 460 From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
 461 To: gilbertd@sunflower.bio.indiana.edu
 462 Subject: Results of Query for droest6
 463 Status: R
 464
 465 LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987
 466 DEFINITION  D.melanogaster esterase-6 mRNA, complete cds.
 467 ACCESSION   M15961
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480 |||||||||||  GCG manual discussion of sequence symbols:
 481 ---------------------------------------------------
 482
 483 III_SEQUENCE_SYMBOLS
 484
 485
 486      GCG programs allow all upper and lower  case  letters,  periods  (.),
 487 asterisks  (*),  pluses  (+),  ampersands  (&),  and ats (@) as symbols in
 488 biological sequences.  Nucleotide  symbols,  their  complements,  and  the
 489 standard  one-letter amino acid symbols are shown below in separate lists.
 490 The meanings of the symbols +, &, and @ have not  been  assigned  at  this
 491 writing (March, 1989).
 492
 493      GCG uses the  letter  codes  for  amino  acid  codes  and  nucleotide
 494 ambiguity    proposed    by    IUB    (Nomenclature    Committee,    1985,
 495 Eur. J. Biochem. 150; 1-5).  These codes are  compatible  with  the  codes
 496 used by the EMBL, GenBank, and NBRF data libraries.
 497
 498
 499                                NUCLEOTIDES
 500
 501      The meaning of each symbol, its complement,  and  the  Cambridge  and
 502 Stanford  equivalents  are  shown below.  Cambridge files can be converted
 503 into GCG files and vice versa with the programs FROMSTADEN  and  TOSTADEN.
 504 IntelliGenetics  sequence  files  can  be interconverted with the programs
 505 FROMIG and TOIG.
 506
 507 IUB/GCG      Meaning     Complement   Staden/Sanger  Stanford
 508
 509    A             A             T             A            A
 510    C             C             G             C            C
 511    G             G             C             G            G
 512   T/U            T             A             T           T/U
 513    M           A or C          K             5            J
 514    R           A or G          Y             R            R
 515    W           A or T          W             7            L
 516    S           C or G          S             8            M
 517    Y           C or T          R             Y            Y
 518    K           G or T          M             6            K
 519    V        A or C or G        B       not supported      N
 520    H        A or C or T        D       not supported      N
 521    D        A or G or T        H       not supported      N
 522    B        C or G or T        V       not supported      N
 523   X/N     G or A or T or C     X            -/X           N
 524    .    not G or A or T or C   .       not supported      ?
 525
 526
 527   The frame ambiguity codes used by Staden are not  supported  by  GCG
 528 and   are  translated  by  FROMSTADEN  as  the  lower  case  single  base
 529 equivalent.
 530
 531      Staden Code          Meaning              GCG
 532
 533          D                C or CC                c
 534          V                T or TT                t
 535          B                A or AA                a
 536          H                G or GG                g
 537          K                C or CX                c
 538          L                T or TX                t
 539          M                A or AX                a
 540          N                G or GX                g
 541
 542
 543                         AMINO ACIDS
 544
 545   Here is a list of the standard one-letter amino acid codes and their
 546 three-letter  equivalents.   The synonymous codons and their depiction in
 547 the IUB codes are shown.  You should recognize that the codons  following
 548 semicolons  (;)  are  not  sufficiently specific to define a single amino
 549 acid even though they represent the best possible back  translation  into
 550 the IUB codes!  All of the relationships in this list can be redefined by
 551 the user in a local data file described below.
 552
 553                                                       IUB
 554 Symbol 3-letter  Meaning      Codons                Depiction
 555  A    Ala       Alanine      GCT,GCC,GCA,GCG         !GCX
 556  B    Asp,Asn   Aspartic,
 557                 Asparagine   GAT,GAC,AAT,AAC         !RAY
 558  C    Cys       Cysteine     TGT,TGC                 !TGY
 559  D    Asp       Aspartic     GAT,GAC                 !GAY
 560  E    Glu       Glutamic     GAA,GAG                 !GAR
 561  F    Phe     Phenylalanine  TTT,TTC                 !TTY
 562  G    Gly       Glycine      GGT,GGC,GGA,GGG         !GGX
 563  H    His       Histidine    CAT,CAC                 !CAY
 564  I    Ile       Isoleucine   ATT,ATC,ATA             !ATH
 565  K    Lys       Lysine       AAA,AAG                 !AAR
 566  L    Leu       Leucine      TTG,TTA,CTT,CTC,CTA,CTG
 567 !TTR,CTX,YTR;YTX
 568  M    Met       Methionine   ATG                     !ATG
 569  N    Asn       Asparagine   AAT,AAC                 !AAY
 570  P    Pro       Proline      CCT,CCC,CCA,CCG         !CCX
 571  Q    Gln       Glutamine    CAA,CAG                 !CAR
 572  R    Arg       Arginine     CGT,CGC,CGA,CGG,AGA,AGG
 573 !CGX,AGR,MGR;MGX
 574  S    Ser       Serine       TCT,TCC,TCA,TCG,AGT,AGC !TCX,AGY;WSX
 575  T    Thr       Threonine    ACT,ACC,ACA,ACG         !ACX
 576  V    Val       Valine       GTT,GTC,GTA,GTG         !GTX
 577  W    Trp       Tryptophan   TGG                     !TGG
 578  X    Xxx       Unknown                              !XXX
 579  Y    Tyr       Tyrosine     TAT, TAC                !TAY
 580  Z    Glu,Gln   Glutamic,
 581                 Glutamine    GAA,GAG,CAA,CAG         !SAR
 582  *    End       Terminator   TAA, TAG, TGA           !TAR,TRA;TRR
 583
 584
 585
 586
 587
 588
 589
 590
 591 |||||||||||  docs from PSC on sequence formats:
 592 ---------------------------------------------------
 593
 594
 595           Nucleic Acid and Protein Sequence File Formats
 596
 597
 598 It will probably save you some time if you have your data in a usable
 599 format before you send it to us.  However, we do have the University of
 600 Wisconsin Genetics Computing Group programs running on our VAXen and
 601 this package includes several reformatting utilities.  Our programs
 602 usually recognize any of several standard formats, including GenBank,
 603 EMBL, NBRF, and MolGen/Stanford.  For the purposes of annotating an
 604 analysis we find the GenBank and EMBL formats most useful, particularly
 605 if you have already received an accession number from one of these
 606 organizations for your sequence.
 607
 608 Our programs do not require that all of the line types available in
 609 GenBank, EMBL, or NBRF file formats be present for the file format to
 610 be recognized and processed.  The following pages outline the essential
 611 details required for correct processing of files by our programs.
 612 Additional information may be present but will generally be ignored.
 613
 614
 615                       GenBank File Format
 616
 617 File Header
 618
 619 1.  The first line in the file must have "GENETIC SEQUENCE DATA BANK"
 620     in spaces 20 through 46 (see LINE  1, below).
 621 2.  The next 8 lines may contain arbitrary text.  They are ignored but
 622     are required to maintain the GenBank format (see LINE 2 - LINE 9).
 623
 624 Sequence Data Entries
 625
 626 3.  Each sequence entry in the file should have the following format.
 627     a) first line:   Must have LOCUS in the first 5 spaces.  The
 628                      genetic locus name or identifier must be in spaces
 629                      13 - 22.  The length of the sequences is right
 630                      justified in spaces 23 through 29 (see LINE  10).
 631     b) second line:  Must have DEFINITION in the first 10 spaces.
 632                      Spaces 13 - 80 are free form text to identify the
 633                      sequence (see LINE  11).
 634     c) third line:   Must have ACCESSION in the first 9 spaces.  Spaces
 635                      13 - 18 must hold the primary accession number
 636                      (see LINE  12).
 637     d) fourth line:  Must have ORIGIN in the first 6 spaces.  Nothing
 638                      else is required on this line, it indicates that
 639                      the nucleic acid sequence begins on the next line
 640                      (see LINE  13).
 641     e) fifth line:   Begins the nucleotide sequence.  The first 9
 642                      spaces of each sequence line may either be blank
 643                      or may contain the position in the sequence of the
 644                      first nucleotide on the line.  The next 66 spaces
 645                      hold the nucleotide sequence in six blocks of ten
 646                      nucleotides.  Each of the six blocks begins with a
 647                      blank space followed by ten nucleotides.  Thus the
 648                      first nucleotide is in space eleven of the line while
 649                      the last is in space 75 (see LINE  14, LINE  15).
 650     f) last line:    Must have // in the first 2 spaces to indicate
 651                      termination of the sequence (see LINE  16).
 652
 653 NOTE:  Multiple sequences may appear in each file.  To begin another
 654        sequence go back to a) and start again.
 655
 656
 657                          Example GenBank file
 658
 659
 660 LINE  1  :                   GENETIC SEQUENCE DATA BANK
 661 LINE  2  :
 662 LINE  3  :
 663 LINE  4  :
 664 LINE  5  :
 665 LINE  6  :
 666 LINE  7  :
 667 LINE  8  :
 668 LINE  9  :
 669 LINE 10  :LOCUS       L_Name     Length BP
 670 LINE 11  :DEFINITION  Describe the sequence any way you want
 671 LINE 12  :ACCESSION   Accession Number
 672 LINE 13  :ORIGIN
 673 LINE 14  :        1 acgtacgtac gtacgtacgt acgtacgtac gtacgtacgt a...
 674 LINE 15  :       61 acgt...
 675 LINE 16  ://
 676
 677
 678
 679                          EMBL File Format
 680
 681 Unlike the GenBank file format the EMBL file format does not require
 682 a series of header lines.  Thus the first line in the file begins
 683 the first sequence entry of the file.
 684
 685 1.  The first line of each sequence entry contains the two letters ID
 686     in the first two spaces.  This is followed by the EMBL identifier
 687     in spaces 6 through 14.  (See LINE  1).
 688
 689 2.  The second line of each sequence entry has the two letters AC in
 690     the first two spaces.  This is followed by the accession number in
 691     spaces 6 through 11.  (See LINE  2).
 692
 693 3.  The third line of each sequence entry has the two letters DE in the
 694     first two spaces.  This is followed by a free form text definition
 695     in spaces 6 through 72.  (See LINE  3).
 696
 697 4.  The fourth line in each sequence entry has the two letters SQ in
 698     the first two spaces.  This is followed by the length of the
 699     sequence beginning at or after space 13.  After the sequence length
 700     there is a blank space and the two letters BP.  (See LINE  4).
 701
 702 5.  The nucleotide sequence begins on the fifth line of the sequence
 703     entry.  Each line of sequence begins with four blank spaces. The
 704     next 66 spaces hold the nucleotide sequence in six blocks of ten
 705     nucleotides.  Each of the six blocks begins with a blank space
 706     followed by ten nucleotides.  Thus the first nucleotide is in space
 707     6 of the line while the last is in space 70.  (See LINE  5 -
 708     LINE  6).
 709
 710 6.  The last line of each sequence entry in the file is a terminator
 711     line which has the two characters // in the first two spaces.
 712     (See LINE  7).
 713
 714 7.  Multiple sequences may appear in each file.  To begin another
 715     sequence go back to item 1 and start again.
 716
 717
 718                           Example EMBL file
 719
 720 LINE  1  :ID   ID_name
 721 LINE  2  :AC   Accession number
 722 LINE  3  :DE   Describe the sequence any way you want
 723 LINE  4  :SQ          Length BP
 724 LINE  5  :     ACGTACGTAC GTACGTACGT ACGTACGTAC GTACGTA...
 725 LINE  6  :     ACGT...
 726 LINE  7  ://
 727
 728
 729
 730             NBRF (protein or nucleic acid) File Format
 731
 732 1.  The first line of each sequence entry begins with a greater than
 733   symbol, >.  This is immediately followed by the two character
 734   sequence type specifier.  Space four must contain a semi-colon.
 735   Beginning in space five is the sequence name or identification code
 736   for the NBRF database.  The code is from four to six letters and
 737   numbers.  (See LINE  1).
 738
 739 !!!! >> add these to readseq
 740           Specifier             Sequence type
 741
 742               P1                protein, complete
 743               F1                protein, fragment
 744               DL                DNA, linear
 745               DC                DNA, circular
 746               RL                RNA, linear
 747               RC                RNA, circular
 748               N1                functional RNA, other than tRNA
 749               N3                tRNA
 750
 751 2.  The second line of each sequence entry contains two kinds of
 752   information.  First is the sequence name which is separated from
 753   the organism or organelle name by the three character sequence
 754   blank space, dash, blank space, " - ".  There is no special
 755   character marking the beginning of this line.  (See LINE  2).
 756
 757 3.  Either the amino acid or nucleic acid sequence begins on line three
 758   and can begin in any space, including the first.  The sequence is
 759   free format and may be interrupted by blanks for ease of reading.
 760   Protein sequences may contain special punctuation to indicate
 761   various indeterminacies in the sequence.  In the NBRF data files
 762   all lines may be up to 500 characters long.  However some PSC
 763   programs currently have a limit of 130 characters per line
 764   (including blanks), and BitNet will not accept lines of over eighty
 765   characters.  (See LINE  3, LINE  4, and LINE  5).
 766
 767   The last character in the sequence must be an asterisk, *.
 768
 769                        Example NBRF file
 770
 771  LINE  1  :>P1;CBRT
 772  LINE  2  :Cytochrome b - Rat mitochondrion (SGC1)
 773  LINE  3  :M T N I R K S H P L F K I I N H S F I D L P A P S
 774  LINE  4  : VTHICRDVN Y GWL IRY
 775  LINE  5  :TWIGGQPVEHPFIIIGQLASISYFSIILILMPISGIVEDKMLKWN*
 776
 777
 778
 779                 MolGen/Stanford File Format
 780
 781 1.  The first line in a sequence file is a comment line.  This line
 782   begins with a semi-colon in the first space.  This line need
 783   not be present.  If it is present it holds descriptive text.
 784   There may be as many comment lines as desired at the start of
 785   the sequence file.  (See LINE  1).
 786
 787 2.  The second line must be present and contains an identifier or
 788   name for the sequence in the first ten spaces.  (See LINE  2).
 789
 790 3.  The sequence begins on the third line and occupies up to eighty
 791   spaces.  Spaces may be included in the sequence for ease of
 792   reading.  The sequence continues for as many lines as needed
 793   and is terminated with a 1 or 2.  1 indicates a linear sequence
 794   while 2 marks a circular sequence.  (See LINE  3 and LINE  4).
 795
 796                           Example MolGen/Stanford file
 797
 798 LINE  1  :;  Describe the sequence any way you want
 799 LINE  2  :ECTRNAGLY2
 800 LINE  3  :ACGCACGTAC ACGTACGTAC   A C G T C C G T ACG TAC GTA CGT
 801 LINE  4  :  GCTTA   GG G C T A1
 802
 803
 804
 805
 806 |||||||||||  Phylip file format
 807 ---------------------------------------------------
 808
 809         Phylip 3.3 File Format (DNA sequences)
 810
 811
 812      The input and output formats for PROTPARS and for RESTML are described  in
 813 their  document  files.   In  general  their input formats are similar to those
 814 described here, except that the one-letter codes for data are specific to those
 815 programs  and  are  described in those document files.  Since the input formats
 816 for the eight DNA sequence programs apply to  all  eight,  they  are  described
 817 here.   Their  input  formats are standard: the data have A's, G's, C's and T's
 818 (or U's).  The first line of the input file contains the number of species  and
 819 the  number  of  sites.   As  with  the other programs, options information may
 820 follow this.  In the case of DNAML, DNAMLK,  and  DNADIST  an  additional  line
 821 (described  in  the  document file for these programs) may follow the first one.
 822 Following this, each species starts on a new line.  The first 10 characters  of
 823 that  line  are the species name.  There then follows the base sequence of that
 824 species, each character being one of the letters A, B, C, D, G, H, K, M, N,  O,
 825 R, S, T, U, V, W, X, Y, ?, or - (a period was also previously allowed but it is
 826 no longer allowed, because it is sometimes used in aligned sequences  to  mean
 827 "the  same  as  the  sequence  above").   Blanks  will  be ignored, and so will
 828 numerical digits.  This allows GENBANK and EMBL sequence  entries  to  be  read
 829 with minimum editing.
 830
 831      These characters can be  either  upper  or  lower  case.   The  algorithms
 832 convert  all  input  characters  to upper case (which is how they are treated).
 833 The characters constitute the IUPAC (IUB) nucleic acid code  plus  some  slight
 834 extensions.  They enable input of nucleic acid sequences taking full account of
 835 any ambiguities in the sequence.
 836
 837 The sequences can continue over multiple lines; when this is done the sequences
 838 must  be  either  in  "interleaved"  format, similar to the output of alignment
 839 programs, or "sequential" format.  These are described  in  the  main  document
 840 file.   In sequential format all of one sequence is given, possibly on multiple
 841 lines, before the next starts.  In interleaved format the  first  part  of  the
 842 file  should  contain  the first part of each of the sequences, then possibly a
 843 line containing nothing but a carriage-return character, then the  second  part
 844 of  each  sequence, and so on.  Only the first parts of the sequences should be
 845 preceded by names.  Here is a hypothetical example of interleaved format:
 846
 847   5    42
 848 Turkey    AAGCTNGGGC ATTTCAGGGT
 849 Salmo gairAAGCCTTGGC AGTGCAGGGT
 850 H. SapiensACCGGTTGGC CGTTCAGGGT
 851 Chimp     AAACCCTTGC CGTTACGCTT
 852 Gorilla   AAACCCTTGC CGGTACGCTT
 853
 854 GAGCCCGGGC AATACAGGGT AT
 855 GAGCCGTGGC CGGGCACGGT AT
 856 ACAGGTTGGC CGTTCAGGGT AA
 857 AAACCGAGGC CGGGACACTC AT
 858 AAACCATTGC CGGTACGCTT AA
 859
 860 while in sequential format the same sequences would be:
 861
 862   5    42
 863 Turkey    AAGCTNGGGC ATTTCAGGGT
 864 GAGCCCGGGC AATACAGGGT AT
 865 Salmo gairAAGCCTTGGC AGTGCAGGGT
 866 GAGCCGTGGC CGGGCACGGT AT
 867 H. SapiensACCGGTTGGC CGTTCAGGGT
 868 ACAGGTTGGC CGTTCAGGGT AA
 869 Chimp     AAACCCTTGC CGTTACGCTT
 870 AAACCGAGGC CGGGACACTC AT
 871 Gorilla   AAACCCTTGC CGGTACGCTT
 872 AAACCATTGC CGGTACGCTT AA
 873
 874
 875 Note, of course, that a portion of a sequence like this:
 876
 877    300   AAGCGTGAAC GTTGTACTAA TRCAG
 878
 879 is perfectly legal, assuming that the species name  has  gone  before,  and  is
 880 filled  out  to  full  length  by  blanks.  The above digits and blanks will be
 881 ignored, the sequence being taken as starting at the first base symbol (in this
 882 case an A).
 883
 884      The present versions of the programs may sometimes have difficulties  with
 885 the  blank  lines  between  groups of lines, and if so you might want to retype
 886 those lines, making sure that they have only a  carriage-return  and  no  blank
 887 characters on them, or you may perhaps have to eliminate them.  The symptoms of
 888 this problem are that the programs complain that the sequences are not properly
 889 aligned, and you can find no other cause for this complaint.
 890
 891 ------------------------------------------------
 892
 893
 894 |||||||||||  ASN.1 file format
 895 ---------------------------------------------------
 896
 897
 898 ASN.1 -- see NCBI toolkit docs, source and examples (ncbi.nlm.nih.gov)
 899
 900 Example asn.1 sequence file----
 901
 902 Bioseq-set ::= {
 903 seq-set {
 904   seq {
 905     id { local id 1 } ,                 -- id essential
 906     descr {  title "Dummy sequence data from nowhere"  } ,  -- optional
 907     inst {                              -- inst essential
 908       repr raw ,
 909       mol dna ,
 910       length 156 ,
 911       topology linear ,
 912       seq-data
 913         iupacna "GAATTCATTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
 914 TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
 915 TGGATTCAAAGCAATAGAGTTGTTCTT"
 916       } } ,
 917
 918         seq {
 919           id { local id 2 } ,
 920           descr {  title "Dummy sequence 2 data from somewhere else"  } ,
 921           inst {
 922                 repr raw ,
 923                 mol dna ,
 924                 length 150 ,
 925                 topology linear ,
 926                 seq-data
 927                   iupacna "TTTTTTTTTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
 928 TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
 929 TGGATTCAAAGCAATAGAGTT"
 930             }
 931           }
 932         }
 933       }
 934
 935
 936 partial ASN.1 description from toolkit
 937
 938 Bioseq ::= SEQUENCE {
 939     id SET OF Seq-id ,            -- equivalent identifiers
 940     descr Seq-descr OPTIONAL , -- descriptors
 941     inst Seq-inst ,            -- the sequence data
 942     annot SET OF Seq-annot OPTIONAL }
 943
 944 Seq-inst ::= SEQUENCE {            -- the sequence data itself
 945     repr ENUMERATED {              -- representation class
 946         not-set (0) ,              -- empty
 947         virtual (1) ,              -- no seq data
 948         raw (2) ,                  -- continuous sequence
 949         seg (3) ,                  -- segmented sequence
 950         const (4) ,                -- constructed sequence
 951         ref (5) ,                  -- reference to another sequence
 952         consen (6) ,               -- consensus sequence or pattern
 953         map (7) ,                  -- ordered map (genetic, restriction)
 954         other (255) } ,
 955     mol ENUMERATED {               -- molecule class in living organism
 956         not-set (0) ,              --   > cdna = rna
 957         dna (1) ,
 958         rna (2) ,
 959         aa (3) ,
 960         na (4) ,                   -- just a nucleic acid
 961         other (255) } ,
 962     length INTEGER OPTIONAL ,      -- length of sequence in residues
 963     fuzz Int-fuzz OPTIONAL ,       -- length uncertainty
 964     topology ENUMERATED {          -- topology of molecule
 965         not-set (0) ,
 966         linear (1) ,
 967         circular (2) ,
 968         tandem (3) ,               -- some part of tandem repeat
 969         other (255) } DEFAULT linear ,
 970     strand ENUMERATED {            -- strandedness in living organism
 971         not-set (0) ,
 972         ss (1) ,                   -- single strand
 973         ds (2) ,                   -- double strand
 974         mixed (3) ,
 975         other (255) } OPTIONAL ,   -- default ds for DNA, ss for RNA, pept
 976     seq-data Seq-data OPTIONAL ,   -- the sequence
 977     ext Seq-ext OPTIONAL ,         -- extensions for special types
 978   hist Seq-hist OPTIONAL }       -- sequence history
 979
 980 ------------------------------------------------