SeqIO/FastaIO.py

   1 # Copyright 2006-2009 by Peter Cock.  All rights reserved.
   2 # This code is part of the Biopython distribution and governed by its
   3 # license.  Please see the LICENSE file that should have been included
   4 # as part of this package.
   5 #
   6 # This module is for reading and writing FASTA format files as SeqRecord
   7 # objects.  The code is partly inspired  by earlier Biopython modules,
   8 # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA
   9
  10 """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format.
  11
  12 You are expected to use this module via the Bio.SeqIO functions."""
  13
  14 from Bio.Alphabet import single_letter_alphabet
  15 from Bio.Seq import Seq
  16 from Bio.SeqRecord import SeqRecord
  17 from Interfaces import SequentialSequenceWriter
  18
  19 #This is a generator function!
  20 def FastaIterator(handle, alphabet = single_letter_alphabet, title2ids = None) :
  21     """Generator function to iterate over Fasta records (as SeqRecord objects).
  22
  23     handle - input file
  24     alphabet - optional alphabet
  25     title2ids - A function that, when given the title of the FASTA
  26     file (without the beginning >), will return the id, name and
  27     description (in that order) for the record as a tuple of strings.
  28
  29     If this is not given, then the entire title line will be used
  30     as the description, and the first word as the id and name.
  31
  32     Note that use of title2ids matches that of Bio.Fasta.SequenceParser
  33     but the defaults are slightly different.
  34     """
  35     #Skip any text before the first record (e.g. blank lines, comments)
  36     while True :
  37         line = handle.readline()
  38         if line == "" : return #Premature end of file, or just empty?
  39         if line[0] == ">" :
  40             break
  41
  42     while True :
  43         if line[0]!=">" :
  44             raise ValueError("Records in Fasta files should start with '>' character")
  45         if title2ids :
  46             id, name, descr = title2ids(line[1:].rstrip())
  47         else :
  48             descr = line[1:].rstrip()
  49             id   = descr.split()[0]
  50             name = id
  51
  52         lines = []
  53         line = handle.readline()
  54         while True:
  55             if not line : break
  56             if line[0] == ">": break
  57             #Remove trailing whitespace, and any internal spaces
  58             #(and any embedded \r which are possible in mangled files
  59             #when not opened in universal read lines mode)
  60             lines.append(line.rstrip().replace(" ","").replace("\r",""))
  61             line = handle.readline()
  62
  63         #Return the record and then continue...
  64         yield SeqRecord(Seq("".join(lines), alphabet),
  65                          id = id, name = name, description = descr)
  66
  67         if not line : return #StopIteration
  68     assert False, "Should not reach this line"
  69
  70 class FastaWriter(SequentialSequenceWriter):
  71     """Class to write Fasta format files."""
  72     def __init__(self, handle, wrap=60, record2title=None):
  73         """Create a Fasta writer.
  74
  75         handle - Handle to an output file, e.g. as returned
  76                  by open(filename, "w")
  77         wrap -   Optional line length used to wrap sequence lines.
  78                  Defaults to wrapping the sequence at 60 characters
  79                  Use zero (or None) for no wrapping, giving a single
  80                  long line for the sequence.
  81         record2title - Optional function to return the text to be
  82                  used for the title line of each record.  By default the
  83                  a combination of the record.id and record.description
  84                  is used.  If the record.description starts with the
  85                  record.id, then just the record.description is used.
  86
  87         You can either use:
  88
  89         myWriter = FastaWriter(open(filename,"w"))
  90         writer.write_file(myRecords)
  91
  92         Or, follow the sequential file writer system, for example:
  93
  94         myWriter = FastaWriter(open(filename,"w"))
  95         writer.write_header() # does nothing for Fasta files
  96         ...
  97         Multiple calls to writer.write_record() and/or writer.write_records()
  98         ...
  99         writer.write_footer() # does nothing for Fasta files
 100         writer.close()
 101         """
 102         SequentialSequenceWriter.__init__(self, handle)
 103         #self.handle = handle
 104         self.wrap = None
 105         if wrap :
 106             if wrap < 1 :
 107                 raise ValueError
 108         self.wrap = wrap
 109         self.record2title = record2title
 110
 111     def write_record(self, record):
 112         """Write a single Fasta record to the file."""
 113         assert self._header_written
 114         assert not self._footer_written
 115         self._record_written = True
 116
 117         if self.record2title :
 118             title=self.clean(self.record2title(record))
 119         else :
 120             id = self.clean(record.id)
 121             description = self.clean(record.description)
 122
 123             #if description[:len(id)]==id :
 124             if description and description.split(None,1)[0]==id :
 125                 #The description includes the id at the start
 126                 title = description
 127             else :
 128                 title = "%s %s" % (id, description)
 129
 130         assert "\n" not in title
 131         assert "\r" not in title
 132         self.handle.write(">%s\n" % title)
 133
 134         data = self._get_seq_string(record) #Catches sequence being None
 135
 136         assert "\n" not in data
 137         assert "\r" not in data
 138
 139         if self.wrap :
 140             for i in range(0, len(data), self.wrap):
 141                 self.handle.write(data[i:i+self.wrap] + "\n")
 142         else :
 143             self.handle.write(data + "\n")
 144
 145 if __name__ == "__main__" :
 146     print "Running quick self test"
 147
 148     import os
 149     from Bio.Alphabet import generic_protein, generic_nucleotide
 150
 151     #Download the files from here:
 152     #ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans
 153     fna_filename = "NC_005213.fna"
 154     faa_filename = "NC_005213.faa"
 155
 156     def genbank_name_function(text) :
 157         text, descr = text.split(None,1)
 158         id = text.split("|")[3]
 159         name = id.split(".",1)[0]
 160         return id, name, descr
 161
 162     def print_record(record) :
 163         #See also bug 2057
 164         #http://bugzilla.open-bio.org/show_bug.cgi?id=2057
 165         print "ID:" + record.id
 166         print "Name:" + record.name
 167         print "Descr:" + record.description
 168         print record.seq
 169         for feature in record.annotations :
 170             print '/%s=%s' % (feature, record.annotations[feature])
 171         if record.dbxrefs :
 172             print "Database cross references:"
 173             for x in record.dbxrefs : print " - %s" % x
 174
 175     if os.path.isfile(fna_filename) :
 176         print "--------"
 177         print "FastaIterator (single sequence)"
 178         iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function)
 179         count=0
 180         for record in iterator :
 181             count=count+1
 182             print_record(record)
 183         assert count == 1
 184         print str(record.__class__)
 185
 186     if os.path.isfile(faa_filename) :
 187         print "--------"
 188         print "FastaIterator (multiple sequences)"
 189         iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function)
 190         count=0
 191         for record in iterator :
 192             count=count+1
 193             print_record(record)
 194             break
 195         assert count>0
 196         print str(record.__class__)
 197
 198     from cStringIO import StringIO
 199     print "--------"
 200     print "FastaIterator (empty input file)"
 201     #Just to make sure no errors happen
 202     iterator = FastaIterator(StringIO(""))
 203     count = 0
 204     for record in iterator :
 205         count = count+1
 206     assert count==0
 207
 208     print "Done"