+++ /dev/null
-# Copyright 2006-2009 by Peter Cock. All rights reserved.
-# This code is part of the Biopython distribution and governed by its
-# license. Please see the LICENSE file that should have been included
-# as part of this package.
-#
-# This module is for reading and writing FASTA format files as SeqRecord
-# objects. The code is partly inspired by earlier Biopython modules,
-# Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA
-
-"""Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format.
-
-You are expected to use this module via the Bio.SeqIO functions."""
-
-from Bio.Alphabet import single_letter_alphabet
-from Bio.Seq import Seq
-from Bio.SeqRecord import SeqRecord
-from Interfaces import SequentialSequenceWriter
-
-#This is a generator function!
-def FastaIterator(handle, alphabet = single_letter_alphabet, title2ids = None) :
- """Generator function to iterate over Fasta records (as SeqRecord objects).
-
- handle - input file
- alphabet - optional alphabet
- title2ids - A function that, when given the title of the FASTA
- file (without the beginning >), will return the id, name and
- description (in that order) for the record as a tuple of strings.
-
- If this is not given, then the entire title line will be used
- as the description, and the first word as the id and name.
-
- Note that use of title2ids matches that of Bio.Fasta.SequenceParser
- but the defaults are slightly different.
- """
- #Skip any text before the first record (e.g. blank lines, comments)
- while True :
- line = handle.readline()
- if line == "" : return #Premature end of file, or just empty?
- if line[0] == ">" :
- break
-
- while True :
- if line[0]!=">" :
- raise ValueError("Records in Fasta files should start with '>' character")
- if title2ids :
- id, name, descr = title2ids(line[1:].rstrip())
- else :
- descr = line[1:].rstrip()
- id = descr.split()[0]
- name = id
-
- lines = []
- line = handle.readline()
- while True:
- if not line : break
- if line[0] == ">": break
- #Remove trailing whitespace, and any internal spaces
- #(and any embedded \r which are possible in mangled files
- #when not opened in universal read lines mode)
- lines.append(line.rstrip().replace(" ","").replace("\r",""))
- line = handle.readline()
-
- #Return the record and then continue...
- yield SeqRecord(Seq("".join(lines), alphabet),
- id = id, name = name, description = descr)
-
- if not line : return #StopIteration
- assert False, "Should not reach this line"
-
-class FastaWriter(SequentialSequenceWriter):
- """Class to write Fasta format files."""
- def __init__(self, handle, wrap=60, record2title=None):
- """Create a Fasta writer.
-
- handle - Handle to an output file, e.g. as returned
- by open(filename, "w")
- wrap - Optional line length used to wrap sequence lines.
- Defaults to wrapping the sequence at 60 characters
- Use zero (or None) for no wrapping, giving a single
- long line for the sequence.
- record2title - Optional function to return the text to be
- used for the title line of each record. By default the
- a combination of the record.id and record.description
- is used. If the record.description starts with the
- record.id, then just the record.description is used.
-
- You can either use:
-
- myWriter = FastaWriter(open(filename,"w"))
- writer.write_file(myRecords)
-
- Or, follow the sequential file writer system, for example:
-
- myWriter = FastaWriter(open(filename,"w"))
- writer.write_header() # does nothing for Fasta files
- ...
- Multiple calls to writer.write_record() and/or writer.write_records()
- ...
- writer.write_footer() # does nothing for Fasta files
- writer.close()
- """
- SequentialSequenceWriter.__init__(self, handle)
- #self.handle = handle
- self.wrap = None
- if wrap :
- if wrap < 1 :
- raise ValueError
- self.wrap = wrap
- self.record2title = record2title
-
- def write_record(self, record):
- """Write a single Fasta record to the file."""
- assert self._header_written
- assert not self._footer_written
- self._record_written = True
-
- if self.record2title :
- title=self.clean(self.record2title(record))
- else :
- id = self.clean(record.id)
- description = self.clean(record.description)
-
- #if description[:len(id)]==id :
- if description and description.split(None,1)[0]==id :
- #The description includes the id at the start
- title = description
- else :
- title = "%s %s" % (id, description)
-
- assert "\n" not in title
- assert "\r" not in title
- self.handle.write(">%s\n" % title)
-
- data = self._get_seq_string(record) #Catches sequence being None
-
- assert "\n" not in data
- assert "\r" not in data
-
- if self.wrap :
- for i in range(0, len(data), self.wrap):
- self.handle.write(data[i:i+self.wrap] + "\n")
- else :
- self.handle.write(data + "\n")
-
-if __name__ == "__main__" :
- print "Running quick self test"
-
- import os
- from Bio.Alphabet import generic_protein, generic_nucleotide
-
- #Download the files from here:
- #ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans
- fna_filename = "NC_005213.fna"
- faa_filename = "NC_005213.faa"
-
- def genbank_name_function(text) :
- text, descr = text.split(None,1)
- id = text.split("|")[3]
- name = id.split(".",1)[0]
- return id, name, descr
-
- def print_record(record) :
- #See also bug 2057
- #http://bugzilla.open-bio.org/show_bug.cgi?id=2057
- print "ID:" + record.id
- print "Name:" + record.name
- print "Descr:" + record.description
- print record.seq
- for feature in record.annotations :
- print '/%s=%s' % (feature, record.annotations[feature])
- if record.dbxrefs :
- print "Database cross references:"
- for x in record.dbxrefs : print " - %s" % x
-
- if os.path.isfile(fna_filename) :
- print "--------"
- print "FastaIterator (single sequence)"
- iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function)
- count=0
- for record in iterator :
- count=count+1
- print_record(record)
- assert count == 1
- print str(record.__class__)
-
- if os.path.isfile(faa_filename) :
- print "--------"
- print "FastaIterator (multiple sequences)"
- iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function)
- count=0
- for record in iterator :
- count=count+1
- print_record(record)
- break
- assert count>0
- print str(record.__class__)
-
- from cStringIO import StringIO
- print "--------"
- print "FastaIterator (empty input file)"
- #Just to make sure no errors happen
- iterator = FastaIterator(StringIO(""))
- count = 0
- for record in iterator :
- count = count+1
- assert count==0
-
- print "Done"