1 # Copyright 2006-2009 by Peter Cock. All rights reserved.
2 # This code is part of the Biopython distribution and governed by its
3 # license. Please see the LICENSE file that should have been included
4 # as part of this package.
6 # This module is for reading and writing FASTA format files as SeqRecord
7 # objects. The code is partly inspired by earlier Biopython modules,
8 # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA
10 """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format.
12 You are expected to use this module via the Bio.SeqIO functions."""
14 from Bio.Alphabet import single_letter_alphabet
15 from Bio.Seq import Seq
16 from Bio.SeqRecord import SeqRecord
17 from Interfaces import SequentialSequenceWriter
19 #This is a generator function!
20 def FastaIterator(handle, alphabet = single_letter_alphabet, title2ids = None) :
21 """Generator function to iterate over Fasta records (as SeqRecord objects).
24 alphabet - optional alphabet
25 title2ids - A function that, when given the title of the FASTA
26 file (without the beginning >), will return the id, name and
27 description (in that order) for the record as a tuple of strings.
29 If this is not given, then the entire title line will be used
30 as the description, and the first word as the id and name.
32 Note that use of title2ids matches that of Bio.Fasta.SequenceParser
33 but the defaults are slightly different.
35 #Skip any text before the first record (e.g. blank lines, comments)
37 line = handle.readline()
38 if line == "" : return #Premature end of file, or just empty?
44 raise ValueError("Records in Fasta files should start with '>' character")
46 id, name, descr = title2ids(line[1:].rstrip())
48 descr = line[1:].rstrip()
53 line = handle.readline()
56 if line[0] == ">": break
57 #Remove trailing whitespace, and any internal spaces
58 #(and any embedded \r which are possible in mangled files
59 #when not opened in universal read lines mode)
60 lines.append(line.rstrip().replace(" ","").replace("\r",""))
61 line = handle.readline()
63 #Return the record and then continue...
64 yield SeqRecord(Seq("".join(lines), alphabet),
65 id = id, name = name, description = descr)
67 if not line : return #StopIteration
68 assert False, "Should not reach this line"
70 class FastaWriter(SequentialSequenceWriter):
71 """Class to write Fasta format files."""
72 def __init__(self, handle, wrap=60, record2title=None):
73 """Create a Fasta writer.
75 handle - Handle to an output file, e.g. as returned
76 by open(filename, "w")
77 wrap - Optional line length used to wrap sequence lines.
78 Defaults to wrapping the sequence at 60 characters
79 Use zero (or None) for no wrapping, giving a single
80 long line for the sequence.
81 record2title - Optional function to return the text to be
82 used for the title line of each record. By default the
83 a combination of the record.id and record.description
84 is used. If the record.description starts with the
85 record.id, then just the record.description is used.
89 myWriter = FastaWriter(open(filename,"w"))
90 writer.write_file(myRecords)
92 Or, follow the sequential file writer system, for example:
94 myWriter = FastaWriter(open(filename,"w"))
95 writer.write_header() # does nothing for Fasta files
97 Multiple calls to writer.write_record() and/or writer.write_records()
99 writer.write_footer() # does nothing for Fasta files
102 SequentialSequenceWriter.__init__(self, handle)
103 #self.handle = handle
109 self.record2title = record2title
111 def write_record(self, record):
112 """Write a single Fasta record to the file."""
113 assert self._header_written
114 assert not self._footer_written
115 self._record_written = True
117 if self.record2title :
118 title=self.clean(self.record2title(record))
120 id = self.clean(record.id)
121 description = self.clean(record.description)
123 #if description[:len(id)]==id :
124 if description and description.split(None,1)[0]==id :
125 #The description includes the id at the start
128 title = "%s %s" % (id, description)
130 assert "\n" not in title
131 assert "\r" not in title
132 self.handle.write(">%s\n" % title)
134 data = self._get_seq_string(record) #Catches sequence being None
136 assert "\n" not in data
137 assert "\r" not in data
140 for i in range(0, len(data), self.wrap):
141 self.handle.write(data[i:i+self.wrap] + "\n")
143 self.handle.write(data + "\n")
145 if __name__ == "__main__" :
146 print "Running quick self test"
149 from Bio.Alphabet import generic_protein, generic_nucleotide
151 #Download the files from here:
152 #ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans
153 fna_filename = "NC_005213.fna"
154 faa_filename = "NC_005213.faa"
156 def genbank_name_function(text) :
157 text, descr = text.split(None,1)
158 id = text.split("|")[3]
159 name = id.split(".",1)[0]
160 return id, name, descr
162 def print_record(record) :
164 #http://bugzilla.open-bio.org/show_bug.cgi?id=2057
165 print "ID:" + record.id
166 print "Name:" + record.name
167 print "Descr:" + record.description
169 for feature in record.annotations :
170 print '/%s=%s' % (feature, record.annotations[feature])
172 print "Database cross references:"
173 for x in record.dbxrefs : print " - %s" % x
175 if os.path.isfile(fna_filename) :
177 print "FastaIterator (single sequence)"
178 iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function)
180 for record in iterator :
184 print str(record.__class__)
186 if os.path.isfile(faa_filename) :
188 print "FastaIterator (multiple sequences)"
189 iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function)
191 for record in iterator :
196 print str(record.__class__)
198 from cStringIO import StringIO
200 print "FastaIterator (empty input file)"
201 #Just to make sure no errors happen
202 iterator = FastaIterator(StringIO(""))
204 for record in iterator :