"""Utilities for working with FASTA-formatted sequences (OBSOLETE).

Classes:
Record             Holds FASTA sequence data.
Iterator           Iterates over sequence data in a FASTA file.
RecordParser       Parses FASTA sequence data into a Record object.
SequenceParser     Parses FASTA sequence data into a SeqRecord object.

For a long time this module was the most commonly used and best documented
FASTA parser in Biopython.  However, we now recommend using Bio.SeqIO instead.

In view of this, while you can continue to use Bio.Fasta for the moment, it is
considered to be a legacy module and should not be used if you are writing new
code.  At some point Bio.Fasta may be officially deprecated (with warning
messages when used) before finally being removed.

If you are already using Bio.Fasta with the SequenceParser to get SeqRecord
objects, then you should be able to switch to the more recent Bio.SeqIO module
very easily as that too uses SeqRecord objects.  For example,

    from Bio import Fasta
    handle = open("example.fas")
    for seq_record in Fasta.Iterator(handle, Fasta.SequenceParser()) :
        print seq_record.description

Using Bio.SeqIO instead this becomes:

    from Bio import SeqIO
    handle = open("example.fas")
    for seq_record in SeqIO.parse(handle, "fasta") :
        print seq_record.description

Converting existing code which uses the RecordParser is a little more
complicated as the Bio.Fasta.Record object differs from the SeqRecord.

    from Bio import Fasta
    handle = open("example.fas")
    for record in Fasta.Iterator(handle, Fasta.RecordParser()) :
        #record is a Bio.Fasta.Record object
        print record.title #The full title line as a string
        print record.sequence #The sequence as a string

Using Bio.SeqIO instead this becomes:

    from Bio import SeqIO
    handle = open("example.fas")
    for seq_record in SeqIO.parse(handle, "fasta") :
        print seq_record.description #The full title line as a string
        print seq_record.seq.tostring() #The sequence as a string
"""
61 from Bio import SeqRecord
62 from Bio import Alphabet
class Record:
    """Holds information from a FASTA record.

    Members:
    title       Title line ('>' character not included).
    sequence    The sequence.

    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new Record.  colwidth specifies the number of residues
        to put on each line when generating FASTA format.

        """
        # Start with empty members so that str() works on a freshly created
        # record; callers (e.g. a parser) assign the real values afterwards.
        self.title = ''
        self.sequence = ''
        self._colwidth = colwidth

    def __str__(self):
        """Return the record as FASTA formatted text.

        The result is a '>' title line followed by the sequence wrapped at
        colwidth residues per line.  Lines are joined with a single newline
        character and there is no trailing newline.
        """
        lines = ['>%s' % self.title]
        # Cut the sequence into colwidth-sized pieces; the last piece may be
        # shorter.  An empty sequence contributes no lines at all.
        lines.extend([self.sequence[start:start + self._colwidth]
                      for start in range(0, len(self.sequence),
                                         self._colwidth)])
        #Deliberately not os.linesep: using it caused problems getting the
        #tests to pass on Windows.
        return "\n".join(lines)
class Iterator:
    """Returns one record at a time from a FASTA file.
    """
    def __init__(self, handle, parser = None, debug = 0):
        """Initialize a new iterator.

        Arguments:
        o handle - Object with a readline() method giving the FASTA text.
        o parser - Optional object with a parse_string(text) method.  If
        given, each record is passed through it; if None, each record is
        returned as a plain string.
        o debug - If true, print diagnostic messages while reading.
        """
        self.handle = handle
        self._parser = parser
        self._debug = debug

        #Skip any text before the first record (e.g. blank lines)
        line = handle.readline()
        while line and line[0] != ">":
            if debug:
                print("Skipping: " + line)
            line = handle.readline()
        # Either the first title line, or an empty string at end of file.
        self._lookahead = line

    def __iter__(self):
        """Iterate over the records, stopping when next() returns None."""
        return iter(self.next, None)

    def next(self):
        """Return the next record in the file.

        Returns None when there are no more records.  Without a parser the
        record is returned as a string: the title line and the sequence
        lines, each with trailing whitespace removed, joined by newlines.
        With a parser, the result of parser.parse_string() on that string
        is returned instead.
        """
        line = self._lookahead
        if not line:
            return None
        assert line[0]==">", line
        lines = [line.rstrip()]
        line = self.handle.readline()
        # Collect lines up to (not including) the next title line or EOF.
        while line and line[0] != ">":
            # NOTE(review): the test selecting "comment" lines was not
            # visible in the listing this block was restored from; a
            # leading '#' is assumed -- confirm against the original.
            if line[0] == "#":
                if self._debug:
                    print("Ignoring comment line")
            else:
                lines.append(line.rstrip())
            line = self.handle.readline()
        # Remember the title line of the following record for the next call.
        self._lookahead = line
        if self._debug:
            # Report the title line and the sequence.  (This used to refer
            # to an undefined name, which raised NameError in debug mode.)
            print("Debug: '%s' and '%s'" % (lines[0], "".join(lines[1:])))
        text = "\n".join(lines)
        if self._parser is None:
            return text
        return self._parser.parse_string(text)
class RecordParser:
    """Parses FASTA sequence data into a Fasta.Record object.
    """
    def __init__(self, debug = 0):
        """Create a parser.

        The debug argument is accepted but not used by this class.
        """

    def parse_string(self, text) :
        """Parse the first FASTA record in text into a Record.

        text must start with '>'.  If it holds more than one record only
        the first is used.  The returned Record has the title line (without
        the leading '>') as its title and the sequence lines joined
        together as its sequence.
        """
        text = text.replace("\r\n","\n") #Crude way of dealing with \r\n
        assert text[0] == ">", text
        text = text.split("\n>",1)[0] # Only do the first record if more than one
        if "\n" in text:
            title, sequence = text.split("\n", 1)
        else:
            # Title line only (no sequence lines); treat the sequence as
            # empty rather than failing to unpack the split result.
            title, sequence = text, ""
        rec = Record()
        rec.title = title[1:] # Drop the leading '>'
        rec.sequence = sequence.replace("\n","")
        return rec

    def parse(self, handle):
        """Read all of handle and parse its first record (see parse_string)."""
        return self.parse_string(handle.read())
class SequenceParser:
    """Parses FASTA sequence data into a SeqRecord object.
    """
    # NOTE(review): the end of this signature was not visible in the listing
    # this block was restored from; a trailing "debug = 0" is assumed to
    # match the other classes in this module -- confirm.
    def __init__(self, alphabet = Alphabet.generic_alphabet, title2ids = None,
                 debug = 0):
        """Initialize a Scanner and Sequence Consumer.

        Arguments:
        o alphabet - The alphabet of the sequences to be parsed. If not
        passed, this will be set as generic_alphabet.
        o title2ids - A function that, when given the title of the FASTA
        file (without the beginning >), will return the id, name and
        description (in that order) for the record. If this is not given,
        then the entire title line will be used as the description.
        o debug - Accepted but not used by this class.
        """
        self.alphabet = alphabet
        self.title2ids = title2ids

    def parse_string(self, text) :
        """Parse the first FASTA record in text into a SeqRecord.

        text must start with '>'.  If it holds more than one record only
        the first is used.  The sequence lines are joined together and
        wrapped in a Seq using the parser's alphabet.  If a title2ids
        function was supplied it provides the record's id, name and
        description; otherwise the title line (without the leading '>')
        becomes the description.
        """
        text = text.replace("\r\n","\n") #Crude way of dealing with \r\n
        assert text[0] == ">", text
        text = text.split("\n>",1)[0] # Only do the first record if more than one
        if "\n" in text:
            title, sequence = text.split("\n", 1)
        else:
            # Title line only (no sequence lines); treat the sequence as
            # empty rather than failing to unpack the split result.
            title, sequence = text, ""
        title = title[1:] # Drop the leading '>'

        seq = Seq.Seq(sequence.replace("\n",""), self.alphabet)
        rec = SeqRecord.SeqRecord(seq)

        if self.title2ids:
            seq_id, name, descr = self.title2ids(title)
            rec.id = seq_id
            rec.name = name
            rec.description = descr
        else:
            rec.description = title

        return rec

    def parse(self, handle):
        """Read all of handle and parse its first record (see parse_string)."""
        return self.parse_string(handle.read())