"""Utilities for working with FASTA-formatted sequences (OBSOLETE). Classes: Record Holds FASTA sequence data. Iterator Iterates over sequence data in a FASTA file. RecordParser Parses FASTA sequence data into a Record object. SequenceParser Parses FASTA sequence data into a SeqRecord object. For a long time this module was the most commonly used and best documented FASTA parser in Biopython. However, we now recommend using Bio.SeqIO instead. In view of this, while you can continue to use Bio.Fasta for the moment, it is considered to be a legacy module and should not be used if you are writing new code. At some point Bio.Fasta may be officially deprecated (with warning messages when used) before finally being removed. If you are already using Bio.Fasta with the SequenceParser to get SeqRecord objects, then you should be able to switch to the more recent Bio.SeqIO module very easily as that too uses SeqRecord objects. For example, from Bio import Fasta handle = open("example.fas") for seq_record in Fasta.Iterator(handle, Fasta.SequenceParser()) : print seq_record.description print seq_record.seq handle.close() Using Bio.SeqIO instead this becomes: from Bio import SeqIO handle = open("example.fas") for seq_record in SeqIO.parse(handle, "fasta") : print seq_record.description print seq_record.seq handle.close() Converting an existing code which uses the RecordParser is a little more complicated as the Bio.Fasta.Record object differs from the SeqRecord. from Bio import Fasta handle = open("example.fas") for record in Fasta.Iterator(handle, Fasta.RecordParser()) : #record is a Bio.Fasta.Record object print record.title #The full title line as a string print record.sequence #The sequence as a string handle.close() Using Bio.SeqIO instead this becomes: from Bio import SeqIO handle = open("example.fas") for seq_record in SeqIO.parse(handle, "fasta") : print seq_record.description #The full title line as a string print seq_record.seq.tostring() #The sequence as a string handle.close() """ from Bio import Seq from Bio import SeqRecord from Bio import Alphabet class Record: """Holds information from a FASTA record. Members: title Title line ('>' character not included). sequence The sequence. """ def __init__(self, colwidth=60): """__init__(self, colwidth=60) Create a new Record. colwidth specifies the number of residues to put on each line when generating FASTA format. """ self.title = '' self.sequence = '' self._colwidth = colwidth def __str__(self): s = [] s.append('>%s' % self.title) i = 0 while i < len(self.sequence): s.append(self.sequence[i:i+self._colwidth]) i = i + self._colwidth #Was having a problem getting the tests to pass on windows... #return os.linesep.join(s) return "\n".join(s) class Iterator: """Returns one record at a time from a FASTA file. """ def __init__(self, handle, parser = None, debug = 0): """Initialize a new iterator. """ self.handle = handle self._parser = parser self._debug = debug #Skip any text before the first record (e.g. blank lines) while True : line = handle.readline() if not line or line[0] == ">" : break if debug : print "Skipping: " + line self._lookahead = line def __iter__(self): return iter(self.next, None) def next(self): """Return the next record in the file""" line = self._lookahead if not line: return None assert line[0]==">", line lines = [line.rstrip()] line = self.handle.readline() while line: if line[0] == ">": break if line[0] == "#" : if self._debug : print "Ignoring comment line" pass else : lines.append(line.rstrip()) line = self.handle.readline() self._lookahead = line if self._debug : print "Debug: '%s' and '%s'" % (title, "".join(lines)) if self._parser is None: return "\n".join(lines) else : return self._parser.parse_string("\n".join(lines)) class RecordParser: """Parses FASTA sequence data into a Fasta.Record object. """ def __init__(self, debug = 0): pass def parse_string(self, text) : text = text.replace("\r\n","\n") #Crude way of dealing with \r\n assert text[0] == ">", text text = text.split("\n>",1)[0] # Only do the first record if more than one title, sequence = text.split("\n", 1) title = title[1:] rec = Record() rec.title = title rec.sequence = sequence.replace("\n","") return rec def parse(self, handle): return self.parse_string(handle.read()) class SequenceParser: """Parses FASTA sequence data into a SeqRecord object. """ def __init__(self, alphabet = Alphabet.generic_alphabet, title2ids = None, debug = 0): """Initialize a Scanner and Sequence Consumer. Arguments: o alphabet - The alphabet of the sequences to be parsed. If not passed, this will be set as generic_alphabet. o title2ids - A function that, when given the title of the FASTA file (without the beginning >), will return the id, name and description (in that order) for the record. If this is not given, then the entire title line will be used as the description. """ self.alphabet = alphabet self.title2ids = title2ids def parse_string(self, text) : text = text.replace("\r\n","\n") #Crude way of dealing with \r\n assert text[0] == ">", text text = text.split("\n>",1)[0] # Only do the first record if more than one title, sequence = text.split("\n", 1) title = title[1:] seq = Seq.Seq(sequence.replace("\n",""), self.alphabet) rec = SeqRecord.SeqRecord(seq) if self.title2ids: seq_id, name, descr = self.title2ids(title) rec.id = seq_id rec.name = name rec.description = descr else: rec.description = title return rec def parse(self, handle): return self.parse_string(handle.read())