--- /dev/null
+"""Utilities for working with FASTA-formatted sequences (OBSOLETE).
+
+Classes:
+Record Holds FASTA sequence data.
+Iterator Iterates over sequence data in a FASTA file.
+RecordParser Parses FASTA sequence data into a Record object.
+SequenceParser Parses FASTA sequence data into a SeqRecord object.
+
+For a long time this module was the most commonly used and best documented
+FASTA parser in Biopython. However, we now recommend using Bio.SeqIO instead.
+
+In view of this, while you can continue to use Bio.Fasta for the moment, it is
+considered to be a legacy module and should not be used if you are writing new
+code. At some point Bio.Fasta may be officially deprecated (with warning
+messages when used) before finally being removed.
+
+If you are already using Bio.Fasta with the SequenceParser to get SeqRecord
+objects, then you should be able to switch to the more recent Bio.SeqIO module
+very easily as that too uses SeqRecord objects. For example,
+
+from Bio import Fasta
+handle = open("example.fas")
+for seq_record in Fasta.Iterator(handle, Fasta.SequenceParser()) :
+ print seq_record.description
+ print seq_record.seq
+handle.close()
+
+Using Bio.SeqIO instead this becomes:
+
+from Bio import SeqIO
+handle = open("example.fas")
+for seq_record in SeqIO.parse(handle, "fasta") :
+ print seq_record.description
+ print seq_record.seq
+handle.close()
+
+Converting existing code which uses the RecordParser is a little more
+complicated as the Bio.Fasta.Record object differs from the SeqRecord.
+
+from Bio import Fasta
+handle = open("example.fas")
+for record in Fasta.Iterator(handle, Fasta.RecordParser()) :
+ #record is a Bio.Fasta.Record object
+ print record.title #The full title line as a string
+ print record.sequence #The sequence as a string
+handle.close()
+
+Using Bio.SeqIO instead this becomes:
+
+from Bio import SeqIO
+handle = open("example.fas")
+for seq_record in SeqIO.parse(handle, "fasta") :
+ print seq_record.description #The full title line as a string
+ print seq_record.seq.tostring() #The sequence as a string
+handle.close()
+
+
+
+"""
+from Bio import Seq
+from Bio import SeqRecord
+from Bio import Alphabet
+
+
class Record:
    """Holds information from a FASTA record.

    Members:
    title       Title line ('>' character not included).
    sequence    The sequence.

    """

    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new Record.  colwidth specifies the number of residues
        to put on each line when generating FASTA format.

        """
        self.title = ''
        self.sequence = ''
        self._colwidth = colwidth

    def __str__(self):
        """Return the record as FASTA text.

        The first line is '>' followed by the title; the sequence follows,
        wrapped at colwidth residues per line.  Lines are joined with a
        plain newline and there is no trailing newline.

        Raises ValueError if there is a sequence to wrap and colwidth is
        not a positive number.
        """
        width = self._colwidth
        lines = ['>%s' % self.title]
        if self.sequence:
            # A zero or negative width would never advance through the
            # sequence, so refuse it instead of looping forever.
            if width <= 0:
                raise ValueError("colwidth must be positive, got %r"
                                 % (width,))
            lines.extend(self.sequence[start:start + width]
                         for start in range(0, len(self.sequence), width))
        # "\n" rather than os.linesep: the latter caused test failures on
        # Windows.
        return "\n".join(lines)
+
class Iterator:
    """Returns one record at a time from a FASTA file.

    Each record is handed back either as raw text (title line plus sequence
    lines, joined with newlines) or, when a parser is supplied, as whatever
    the parser's parse_string method returns for that text.
    """

    def __init__(self, handle, parser=None, debug=0):
        """Initialize a new iterator.

        Arguments:
        o handle - An object with a readline method yielding FASTA text.
        o parser - Optional object with a parse_string(text) method.  If
        omitted, records are returned as strings.
        o debug - If true, progress messages are printed.

        Any text before the first line starting with '>' is skipped.
        """
        self.handle = handle
        self._parser = parser
        self._debug = debug

        # Skip any text before the first record (e.g. blank lines)
        while True:
            line = handle.readline()
            if not line or line[0] == ">":
                break
            if debug:
                print("Skipping: " + line)
        # Either the first title line, or an empty string at end of file.
        self._lookahead = line

    def __iter__(self):
        # next() signals exhaustion by returning None, so use it as the
        # sentinel that ends iteration.
        return iter(self.next, None)

    def next(self):
        """Return the next record in the file, or None if there are no more.

        Trailing whitespace is stripped from every line, and lines starting
        with '#' inside a record are ignored.
        """
        line = self._lookahead
        if not line:
            return None
        assert line[0] == ">", line
        lines = [line.rstrip()]
        line = self.handle.readline()
        while line:
            if line[0] == ">":
                break
            if line[0] == "#":
                if self._debug:
                    print("Ignoring comment line")
            else:
                lines.append(line.rstrip())
            line = self.handle.readline()
        # Remember the next record's title line (or '' at end of file).
        self._lookahead = line
        if self._debug:
            # lines[0] is the title line including '>'; the rest is sequence.
            print("Debug: '%s' and '%s'"
                  % (lines[0][1:], "".join(lines[1:])))
        text = "\n".join(lines)
        if self._parser is None:
            return text
        return self._parser.parse_string(text)
+
class RecordParser:
    """Parses FASTA sequence data into a Fasta.Record object.
    """

    def __init__(self, debug=0):
        """Create the parser.  The debug argument is accepted but not used."""
        pass

    def parse_string(self, text):
        """Parse the first FASTA record in text and return it as a Record.

        The text must start with '>'.  If it holds several records, only
        the first is parsed.  The title is stored without the leading '>'
        and the sequence with its line breaks removed.  A record consisting
        of a title line only yields an empty sequence.
        """
        text = text.replace("\r\n", "\n")  # Crude way of dealing with \r\n
        assert text[0] == ">", text
        text = text.split("\n>", 1)[0]  # Only do the first record if more than one
        if "\n" in text:
            title, sequence = text.split("\n", 1)
        else:
            # Title line with no sequence lines after it.
            title, sequence = text, ""
        rec = Record()
        rec.title = title[1:]
        rec.sequence = sequence.replace("\n", "")
        return rec

    def parse(self, handle):
        """Read everything from handle and parse its first record."""
        return self.parse_string(handle.read())
+
class SequenceParser:
    """Parses FASTA sequence data into a SeqRecord object.
    """

    def __init__(self, alphabet=Alphabet.generic_alphabet, title2ids=None,
                 debug=0):
        """Initialize the parser.

        Arguments:
        o alphabet - The alphabet of the sequences to be parsed. If not
        passed, this will be set as generic_alphabet.
        o title2ids - A function that, when given the title line of a FASTA
        record (without the beginning >), will return the id, name and
        description (in that order) for the record. If this is not given,
        then the entire title line will be used as the description.
        o debug - Accepted but not used.
        """
        self.alphabet = alphabet
        self.title2ids = title2ids

    def parse_string(self, text):
        """Parse the first FASTA record in text and return it as a SeqRecord.

        The text must start with '>'.  If it holds several records, only
        the first is parsed.  The sequence has its line breaks removed and
        is wrapped in a Seq using this parser's alphabet.  A record
        consisting of a title line only yields an empty sequence.
        """
        text = text.replace("\r\n", "\n")  # Crude way of dealing with \r\n
        assert text[0] == ">", text
        text = text.split("\n>", 1)[0]  # Only do the first record if more than one
        if "\n" in text:
            title, sequence = text.split("\n", 1)
        else:
            # Title line with no sequence lines after it.
            title, sequence = text, ""
        title = title[1:]

        seq = Seq.Seq(sequence.replace("\n", ""), self.alphabet)
        rec = SeqRecord.SeqRecord(seq)

        if self.title2ids:
            seq_id, name, descr = self.title2ids(title)
            rec.id = seq_id
            rec.name = name
            rec.description = descr
        else:
            rec.description = title

        return rec

    def parse(self, handle):
        """Read everything from handle and parse its first record."""
        return self.parse_string(handle.read())