1 # Copyright 2006-2008 by Peter Cock. All rights reserved.
2 # This code is part of the Biopython distribution and governed by its
3 # license. Please see the LICENSE file that should have been included
4 # as part of this package.
7 # http://www.ebi.ac.uk/help/formats_frame.html
9 """Sequence input/output as SeqRecord objects.
11 Bio.SeqIO is also documented at U{http://biopython.org/wiki/SeqIO} and by
12 a whole chapter in our tutorial:
13 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
14 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
18 The main function is Bio.SeqIO.parse(...) which takes an input file handle,
19 and format string. This returns an iterator giving SeqRecord objects:
21 >>> from Bio import SeqIO
22 >>> handle = open("Fasta/f002", "rU")
23 >>> for record in SeqIO.parse(handle, "fasta") :
24 ... print record.id, len(record)
25 gi|1348912|gb|G26680|G26680 633
26 gi|1348917|gb|G26685|G26685 413
27 gi|1592936|gb|G29385|G29385 471
30 Note that the parse() function will all invoke the relevant parser for the
31 format with its default settings. You may want more control, in which case
32 you need to create a format specific sequence iterator directly.
34 For non-interlaced files (e.g. Fasta, GenBank, EMBL) with multiple records
35 using a sequence iterator can save you a lot of memory (RAM). There is
36 less benefit for interlaced file formats (e.g. most multiple alignment file
37 formats). However, an iterator only lets you access the records one by one.
39 If you want random access to the records by number, turn this into a list:
41 >>> from Bio import SeqIO
42 >>> handle = open("Fasta/f002", "rU")
43 >>> records = list(SeqIO.parse(handle, "fasta"))
45 >>> print records[1].id
46 gi|1348917|gb|G26685|G26685
48 If you want random access to the records by a key such as the record id,
49 turn the iterator into a dictionary:
51 >>> from Bio import SeqIO
52 >>> handle = open("Fasta/f002", "rU")
53 >>> record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
55 >>> print len(record_dict["gi|1348917|gb|G26685|G26685"])
58 If you expect your file to contain one-and-only-one record, then we provide
59 the following 'helper' function which will return a single SeqRecord, or
60 raise an exception if there are no records or more than one record:
62 >>> from Bio import SeqIO
63 >>> handle = open("Fasta/f001", "rU")
64 >>> record = SeqIO.read(handle, "fasta")
66 >>> print record.id, len(record)
67 gi|3318709|pdb|1A91| 79
69 This style is useful when you expect a single record only (and would
70 consider multiple records an error). For example, when dealing with GenBank
71 files for bacterial genomes or chromosomes, there is normally only a single
72 record. Alternatively, use this with a handle when download a single record
75 However, if you just want the first record from a file containing multiple
76 record, use the iterator's next() method:
78 >>> from Bio import SeqIO
79 >>> handle = open("Fasta/f002", "rU")
80 >>> record = SeqIO.parse(handle, "fasta").next()
82 >>> print record.id, len(record)
83 gi|1348912|gb|G26680|G26680 633
85 The above code will work as long as the file contains at least one record.
86 Note that if there is more than one record, the remaining records will be
91 You can read in alignment files as Alignment objects using Bio.AlignIO.
92 Alternatively, reading in an alignment file format via Bio.SeqIO will give
93 you a SeqRecord for each row of each alignment:
95 >>> from Bio import SeqIO
96 >>> handle = open("Clustalw/hedgehog.aln", "rU")
97 >>> for record in SeqIO.parse(handle, "clustal") :
98 ... print record.id, len(record)
99 gi|167877390|gb|EDS40773.1| 447
100 gi|167234445|ref|NP_001107837. 447
101 gi|74100009|gb|AAZ99217.1| 447
102 gi|13990994|dbj|BAA33523.2| 447
103 gi|56122354|gb|AAV74328.1| 447
108 Use the function Bio.SeqIO.write(...), which takes a complete set of
109 SeqRecord objects (either as a list, or an iterator), an output file handle
110 and of course the file format::
112 from Bio import SeqIO
114 handle = open("example.faa", "w")
115 SeqIO.write(records, handle, "fasta")
118 In general, you are expected to call this function once (with all your
119 records) and then close the file handle.
123 The effect of calling write() multiple times on a single file will vary
124 depending on the file format, and is best avoided unless you have a strong
127 Trying this for certain alignment formats (e.g. phylip, clustal, stockholm)
128 would have the effect of concatenating several multiple sequence alignments
129 together. Such files are created by the PHYLIP suite of programs for
132 For sequential files formats (e.g. fasta, genbank) each "record block" holds
133 a single sequence. For these files it would probably be safe to call
134 write() multiple times.
138 When specifying the file format, use lowercase strings. The same format
139 names are also used in Bio.AlignIO and include the following:
141 - ace - Reads the contig sequences from an ACE assembly file.
142 - embl - The EMBL flat file format. Uses Bio.GenBank internally.
143 - fasta - The generic sequence file format where each record starts with
144 an identifer line starting with a ">" character, followed by
146 - fastq - A "FASTA like" format used by Sanger which also stores PHRED
147 sequence quality values.
148 - fastq-solexa - The Solexa/Illumnia variant of the Sanger FASTQ format which
149 encodes Solexa quality scores (not PHRED quality scores).
150 - genbank - The GenBank or GenPept flat file format.
151 - gb - An alias for "genbank", for consistency with NCBI Entrez Utilities
152 - ig - The IntelliGenetics file format, apparently the same as the
153 MASE alignment format.
154 - phd - Output from PHRED, used by PHRAP and CONSED for input.
155 - pir - A "FASTA like" format introduced by the National Biomedical
156 Research Foundation (NBRF) for the Protein Information Resource
157 (PIR) database, now part of UniProt.
158 - swiss - Plain text Swiss-Prot aka UniProt format.
159 - tab - Simple two column tab separated sequence files, where each
160 line holds a record's identifier and sequence. For example,
161 this is used as by Aligent's eArray software when saving
162 microarray probes in a minimal tab delimited text file.
163 - qual - A "FASTA like" format holding PHRED quality values from
164 sequencing DNA, but no actual sequences (usually provided
165 in separate FASTA files).
167 Note that while Bio.SeqIO can read all the above file formats, it cannot
168 write to all of them.
170 You can also use any file format supported by Bio.AlignIO, such as "nexus",
171 "phlip" and "stockholm", which gives you access to the individual sequences
172 making up each alignment as SeqRecords.
174 __docformat__ = "epytext en" #not just plaintext
177 # - define policy on reading aligned sequences with gaps in
178 # (e.g. - and . characters) including how the alphabet interacts
180 # - Can we build the to_alignment(...) functionality
181 # into the generic Alignment class instead?
183 # - How best to handle unique/non unique record.id when writing.
184 # For most file formats reading such files is fine; The stockholm
187 # - MSF multiple alignment format, aka GCG, aka PileUp format (*.msf)
188 # http://www.bioperl.org/wiki/MSF_multiple_alignment_format
191 FAO BioPython Developers
192 ========================
193 The way I envision this SeqIO system working as that for any sequence file
194 format we have an iterator that returns SeqRecord objects.
196 This also applies to interlaced fileformats (like clustal - although that
197 is now handled via Bio.AlignIO instead) where the file cannot be read record
198 by record. You should still return an iterator, even if the implementation
199 could just as easily return a list.
201 These file format specific sequence iterators may be implemented as:
202 * Classes which take a handle for __init__ and provide the __iter__ method
203 * Functions that take a handle, and return an iterator object
204 * Generator functions that take a handle, and yield SeqRecord objects
206 It is then trivial to turn this iterator into a list of SeqRecord objects,
207 an in memory dictionary, or a multiple sequence alignment object.
209 For building the dictionary by default the id propery of each SeqRecord is
210 used as the key. You should always populate the id property, and it should
211 be unique in most cases. For some file formats the accession number is a good
212 choice. If the file itself contains ambiguous identifiers, don't try and
213 dis-ambiguate them - return them as is.
215 When adding a new file format, please use the same lower case format name
216 as BioPerl, or if they have not defined one, try the names used by EMBOSS.
218 See also http://biopython.org/wiki/SeqIO_dev
224 from Bio.Seq import Seq
225 from Bio.SeqRecord import SeqRecord
226 from Bio.Align.Generic import Alignment
227 from Bio.Alphabet import Alphabet, AlphabetEncoder, _get_base_alphabet
231 import IgIO #IntelliGenetics or MASE format
232 import InsdcIO #EMBL and GenBank
237 import QualityIO #FastQ and qual files
240 #Convention for format names is "mainname-subtype" in lower case.
241 #Please use the same names as BioPerl where possible.
243 #Note that this simple system copes with defining
244 #multiple possible iterators for a given format/extension
245 #with the -subtype suffix
247 #Most alignment file formats will be handled via Bio.AlignIO
249 _FormatToIterator ={"fasta" : FastaIO.FastaIterator,
250 "gb" : InsdcIO.GenBankIterator,
251 "genbank" : InsdcIO.GenBankIterator,
252 "genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
253 "embl" : InsdcIO.EmblIterator,
254 "embl-cds" : InsdcIO.EmblCdsFeatureIterator,
255 "ig" : IgIO.IgIterator,
256 "swiss" : SwissIO.SwissIterator,
257 "phd" : PhdIO.PhdIterator,
258 "ace" : AceIO.AceIterator,
259 "tab" : TabIO.TabIterator,
260 "pir" : PirIO.PirIterator,
261 "fastq" : QualityIO.FastqPhredIterator,
262 "fastq-solexa" : QualityIO.FastqSolexaIterator,
263 "qual" : QualityIO.QualPhredIterator,
266 _FormatToWriter ={"fasta" : FastaIO.FastaWriter,
267 "gb" : InsdcIO.GenBankWriter,
268 "genbank" : InsdcIO.GenBankWriter,
269 "tab" : TabIO.TabWriter,
270 "fastq" : QualityIO.FastqPhredWriter,
271 "fastq-solexa" : QualityIO.FastqSolexaWriter,
272 "qual" : QualityIO.QualPhredWriter,
275 def write(sequences, handle, format) :
276 """Write complete set of sequences to a file.
278 - sequences - A list (or iterator) of SeqRecord objects.
279 - handle - File handle object to write to.
280 - format - lower case string describing the file format to write.
282 You should close the handle after calling this function.
284 Returns the number of records written (as an integer).
286 from Bio import AlignIO
288 #Try and give helpful error messages:
289 if isinstance(handle, basestring) :
290 raise TypeError("Need a file handle, not a string (i.e. not a filename)")
291 if not isinstance(format, basestring) :
292 raise TypeError("Need a string for the file format (lower case)")
294 raise ValueError("Format required (lower case string)")
295 if format != format.lower() :
296 raise ValueError("Format string '%s' should be lower case" % format)
297 if isinstance(sequences,SeqRecord):
298 raise ValueError("Use a SeqRecord list/iterator, not just a single SeqRecord")
300 #Map the file format to a writer class
301 if format in _FormatToWriter :
302 writer_class = _FormatToWriter[format]
303 count = writer_class(handle).write_file(sequences)
304 elif format in AlignIO._FormatToWriter :
305 #Try and turn all the records into a single alignment,
306 #and write that using Bio.AlignIO
307 alignment = to_alignment(sequences)
308 alignment_count = AlignIO.write([alignment], handle, format)
309 assert alignment_count == 1, "Internal error - the underlying writer " \
310 + " should have returned 1, not %s" % repr(alignment_count)
311 count = len(alignment.get_all_seqs())
312 del alignment_count, alignment
313 elif format in _FormatToIterator or format in AlignIO._FormatToIterator :
314 raise ValueError("Reading format '%s' is supported, but not writing" \
317 raise ValueError("Unknown format '%s'" % format)
319 assert isinstance(count, int), "Internal error - the underlying writer " \
320 + " should have returned the record count, not %s" % repr(count)
323 def parse(handle, format, alphabet=None) :
324 r"""Turns a sequence file into an iterator returning SeqRecords.
326 - handle - handle to the file.
327 - format - lower case string describing the file format.
328 - alphabet - optional Alphabet object, useful when the sequence type
329 cannot be automatically inferred from the file itself
330 (e.g. format="fasta" or "tab")
332 Typical usage, opening a file to read in, and looping over the record(s):
334 >>> from Bio import SeqIO
335 >>> filename = "Nucleic/sweetpea.nu"
336 >>> for record in SeqIO.parse(open(filename,"rU"), "fasta") :
337 ... print "ID", record.id
338 ... print "Sequence length", len(record)
339 ... print "Sequence alphabet", record.seq.alphabet
340 ID gi|3176602|gb|U78617.1|LOU78617
342 Sequence alphabet SingleLetterAlphabet()
344 For file formats like FASTA where the alphabet cannot be determined, it
345 may be useful to specify the alphabet explicitly:
347 >>> from Bio import SeqIO
348 >>> from Bio.Alphabet import generic_dna
349 >>> filename = "Nucleic/sweetpea.nu"
350 >>> for record in SeqIO.parse(open(filename,"rU"), "fasta", generic_dna) :
351 ... print "ID", record.id
352 ... print "Sequence length", len(record)
353 ... print "Sequence alphabet", record.seq.alphabet
354 ID gi|3176602|gb|U78617.1|LOU78617
356 Sequence alphabet DNAAlphabet()
358 If you have a string 'data' containing the file contents, you must
359 first turn this into a handle in order to parse it:
361 >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
362 >>> from Bio import SeqIO
363 >>> from StringIO import StringIO
364 >>> for record in SeqIO.parse(StringIO(data), "fasta") :
365 ... print record.id, record.seq
369 Use the Bio.SeqIO.read(handle, format) function when you expect a single
372 #NOTE - The above docstring has some raw \n characters needed
373 #for the StringIO example, hense the whole docstring is in raw
374 #string more (see the leading r before the opening quote).
375 from Bio import AlignIO
377 #Try and give helpful error messages:
378 if isinstance(handle, basestring) :
379 raise TypeError("Need a file handle, not a string (i.e. not a filename)")
380 if not isinstance(format, basestring) :
381 raise TypeError("Need a string for the file format (lower case)")
383 raise ValueError("Format required (lower case string)")
384 if format != format.lower() :
385 raise ValueError("Format string '%s' should be lower case" % format)
386 if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
387 isinstance(alphabet, AlphabetEncoder)) :
388 raise ValueError("Invalid alphabet, %s" % repr(alphabet))
390 #Map the file format to a sequence iterator:
391 if format in _FormatToIterator :
392 iterator_generator = _FormatToIterator[format]
393 if alphabet is None :
394 return iterator_generator(handle)
396 return iterator_generator(handle, alphabet=alphabet)
398 return _force_alphabet(iterator_generator(handle), alphabet)
399 elif format in AlignIO._FormatToIterator :
400 #Use Bio.AlignIO to read in the alignments
401 #TODO - Once we drop support for Python 2.3, this helper function can be
402 #replaced with a generator expression.
403 return _iterate_via_AlignIO(handle, format, alphabet)
405 raise ValueError("Unknown format '%s'" % format)
407 #This is a generator function
408 def _iterate_via_AlignIO(handle, format, alphabet) :
409 """Iterate over all records in several alignments (PRIVATE)."""
410 from Bio import AlignIO
411 for align in AlignIO.parse(handle, format, alphabet=alphabet) :
412 for record in align :
415 def _force_alphabet(record_iterator, alphabet) :
416 """Iterate over records, over-riding the alphabet (PRIVATE)."""
417 #Assume the alphabet argument has been pre-validated
418 given_base_class = _get_base_alphabet(alphabet).__class__
419 for record in record_iterator :
420 if isinstance(_get_base_alphabet(record.seq.alphabet),
422 record.seq.alphabet = alphabet
425 raise ValueError("Specified alphabet %s clashes with "\
426 "that determined from the file, %s" \
427 % (repr(alphabet), repr(record.seq.alphabet)))
429 def read(handle, format, alphabet=None) :
430 """Turns a sequence file into a single SeqRecord.
432 - handle - handle to the file.
433 - format - string describing the file format.
434 - alphabet - optional Alphabet object, useful when the sequence type
435 cannot be automatically inferred from the file itself
436 (e.g. format="fasta" or "tab")
438 This function is for use parsing sequence files containing
439 exactly one record. For example, reading a GenBank file:
441 >>> from Bio import SeqIO
442 >>> record = SeqIO.read(open("GenBank/arab1.gb", "rU"), "genbank")
443 >>> print "ID", record.id
445 >>> print "Sequence length", len(record)
446 Sequence length 86436
447 >>> print "Sequence alphabet", record.seq.alphabet
448 Sequence alphabet IUPACAmbiguousDNA()
450 If the handle contains no records, or more than one record,
451 an exception is raised. For example:
453 >>> from Bio import SeqIO
454 >>> record = SeqIO.read(open("GenBank/cor6_6.gb", "rU"), "genbank")
455 Traceback (most recent call last):
457 ValueError: More than one record found in handle
459 If however you want the first record from a file containing
460 multiple records this function would raise an exception (as
461 shown in the example above). Instead use:
463 >>> from Bio import SeqIO
464 >>> record = SeqIO.parse(open("GenBank/cor6_6.gb", "rU"), "genbank").next()
465 >>> print "First record's ID", record.id
466 First record's ID X55053.1
468 Use the Bio.SeqIO.parse(handle, format) function if you want
469 to read multiple records from the handle.
471 iterator = parse(handle, format, alphabet)
473 first = iterator.next()
474 except StopIteration :
477 raise ValueError("No records found in handle")
479 second = iterator.next()
480 except StopIteration :
482 if second is not None :
483 raise ValueError("More than one record found in handle")
486 def to_dict(sequences, key_function=None) :
487 """Turns a sequence iterator or list into a dictionary.
489 - sequences - An iterator that returns SeqRecord objects,
490 or simply a list of SeqRecord objects.
491 - key_function - Optional function which when given a SeqRecord
492 returns a unique string for the dictionary key.
494 e.g. key_function = lambda rec : rec.name
495 or, key_function = lambda rec : rec.description.split()[0]
497 If key_function is ommitted then record.id is used, on the
498 assumption that the records objects returned are SeqRecords
499 with a unique id field.
501 If there are duplicate keys, an error is raised.
503 Example usage, defaulting to using the record.id as key:
505 >>> from Bio import SeqIO
506 >>> handle = open("GenBank/cor6_6.gb", "rU")
507 >>> format = "genbank"
508 >>> id_dict = SeqIO.to_dict(SeqIO.parse(handle, format))
509 >>> print id_dict.keys()
510 ['L31939.1', 'AJ237582.1', 'X62281.1', 'AF297471.1', 'X55053.1', 'M81224.1']
511 >>> print id_dict["L31939.1"].description
512 Brassica rapa (clone bif72) kin mRNA, complete cds.
514 A more complex example, using the key_function argument in order to use
515 a sequence checksum as the dictionary key:
517 >>> from Bio import SeqIO
518 >>> from Bio.SeqUtils.CheckSum import seguid
519 >>> handle = open("GenBank/cor6_6.gb", "rU")
520 >>> format = "genbank"
521 >>> seguid_dict = SeqIO.to_dict(SeqIO.parse(handle, format),
522 ... key_function = lambda rec : seguid(rec.seq))
523 >>> for key, record in seguid_dict.iteritems() :
524 ... print key, record.id
525 SabZaA4V2eLE9/2Fm5FnyYy07J4 X55053.1
526 l7gjJFE6W/S1jJn5+1ASrUKW/FA X62281.1
527 /wQvmrl87QWcm9llO4/efg23Vgg AJ237582.1
528 TtWsXo45S3ZclIBy4X/WJc39+CY M81224.1
529 uVEYeAQSV5EDQOnFoeMmVea+Oow AF297471.1
530 BUg6YxXSKWEcFFH0L08JzaLGhQs L31939.1
532 if key_function is None :
533 key_function = lambda rec : rec.id
536 for record in sequences :
537 key = key_function(record)
539 raise ValueError("Duplicate key '%s'" % key)
544 def to_alignment(sequences, alphabet=None, strict=True) :
545 """Returns a multiple sequence alignment (OBSOLETE).
547 - sequences -An iterator that returns SeqRecord objects,
548 or simply a list of SeqRecord objects. All
549 the record sequences must be the same length.
550 - alphabet - Optional alphabet. Stongly recommended.
551 - strict - Optional, defaults to True. Should error checking
554 Using this function is now discouraged. Rather doing this:
556 >>> from Bio import SeqIO
557 >>> handle = open("Clustalw/protein.aln")
558 >>> alignment = SeqIO.to_alignment(SeqIO.parse(handle, "clustal"))
561 You are now encouraged to use Bio.AlignIO instead, e.g.
563 >>> from Bio import AlignIO
564 >>> handle = open("Clustalw/protein.aln")
565 >>> alignment = AlignIO.read(handle, "clustal")
568 #TODO - Move this functionality into the Alignment class instead?
569 from Bio.Alphabet import generic_alphabet
570 from Bio.Alphabet import _consensus_alphabet
571 if alphabet is None :
572 sequences = list(sequences)
573 alphabet = _consensus_alphabet([rec.seq.alphabet for rec in sequences \
574 if rec.seq is not None])
576 if not (isinstance(alphabet, Alphabet) or isinstance(alphabet, AlphabetEncoder)) :
577 raise ValueError("Invalid alphabet")
579 alignment_length = None
580 alignment = Alignment(alphabet)
581 for record in sequences :
583 if alignment_length is None :
584 alignment_length = len(record.seq)
585 elif alignment_length != len(record.seq) :
586 raise ValueError("Sequences must all be the same length")
588 assert isinstance(record.seq.alphabet, Alphabet) \
589 or isinstance(record.seq.alphabet, AlphabetEncoder), \
590 "Sequence does not have a valid alphabet"
592 #TODO - Move this alphabet comparison code into the Alphabet module/class?
593 #TODO - Is a normal alphabet "ungapped" by default, or does it just mean
595 if isinstance(record.seq.alphabet, Alphabet) \
596 and isinstance(alphabet, Alphabet) :
597 #Comparing two non-gapped alphabets
598 if not isinstance(record.seq.alphabet, alphabet.__class__) :
599 raise ValueError("Incompatible sequence alphabet " \
600 + "%s for %s alignment" \
601 % (record.seq.alphabet, alphabet))
602 elif isinstance(record.seq.alphabet, AlphabetEncoder) \
603 and isinstance(alphabet, Alphabet) :
604 raise ValueError("Sequence has a gapped alphabet, alignment does not")
605 elif isinstance(record.seq.alphabet, Alphabet) \
606 and isinstance(alphabet, Gapped) :
607 #Sequence isn't gapped, alignment is.
608 if not isinstance(record.seq.alphabet, alphabet.alphabet.__class__) :
609 raise ValueError("Incompatible sequence alphabet " \
610 + "%s for %s alignment" \
611 % (record.seq.alphabet, alphabet))
613 #Comparing two gapped alphabets
614 if not isinstance(record.seq.alphabet, alphabet.__class__) :
615 raise ValueError("Incompatible sequence alphabet " \
616 + "%s for %s alignment" \
617 % (record.seq.alphabet, alphabet))
618 if record.seq.alphabet.gap_char != alphabet.gap_char :
619 raise ValueError("Sequence gap characters != alignment gap char")
620 #ToDo, additional checks on the specified alignment...
621 #Should we look at the alphabet.contains() method?
622 if record.seq is None :
623 raise TypeError("SeqRecord (id=%s) has None for its sequence." % record.id)
625 #This is abusing the "private" records list,
626 #we should really have a method like add_sequence
627 #but which takes SeqRecord objects. See also Bug 1944
628 alignment._records.append(record)
632 """Run the Bio.SeqIO module's doctests.
634 This will try and locate the unit tests directory, and run the doctests
635 from there in order that the relative paths used in the examples work.
639 if os.path.isdir(os.path.join("..","..","Tests")) :
640 print "Runing doctests..."
641 cur_dir = os.path.abspath(os.curdir)
642 os.chdir(os.path.join("..","..","Tests"))
648 if __name__ == "__main__":