binaries/src/globplot/biopython-1.50/Bio/SeqIO/AceIO.py

   1 # Copyright 2008 by Peter Cock.  All rights reserved.
   2 #
   3 # This code is part of the Biopython distribution and governed by its
   4 # license.  Please see the LICENSE file that should have been included
   5 # as part of this package.
   6
   7 """Bio.SeqIO support for the "ace" file format.
   8
   9 You are expected to use this module via the Bio.SeqIO functions.
  10 See also the Bio.Sequencing.Ace module which offers more than just accessing
  11 the contig consensus sequences in an ACE file as SeqRecord objects."""
  12
  13 from Bio.Seq import Seq
  14 from Bio.SeqRecord import SeqRecord
  15 from Bio.Alphabet import generic_nucleotide, generic_dna, generic_rna, Gapped
  16 from Bio.Sequencing import Ace
  17
  18 #This is a generator function!
  19 def AceIterator(handle) :
  20     """Returns SeqRecord objects from an ACE file.
  21
  22     This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
  23     by iterating over the file in a single pass, we are forced to ignore any
  24     WA, CT, RT or WR footer tags."""
  25
  26     for ace_contig in Ace.parse(handle) :
  27         #Convert the ACE contig record into a SeqRecord...
  28         consensus_seq_str = ace_contig.sequence
  29         #Assume its DNA unless there is a U in it,
  30         if "U" in consensus_seq_str :
  31             if "T" in consensus_seq_str :
  32                 #Very odd! Error?
  33                 alpha = generic_ncleotide
  34             else :
  35                 alpha = generic_rna
  36         else :
  37             alpha = generic_dna
  38
  39         if "*" in consensus_seq_str :
  40             #For consistency with most other file formats, map
  41             #any * gaps into 0 gaps.
  42             assert "-" not in consensus_seq_str
  43             consensus_seq = Seq(consensus_seq_str.replace("*","-"),
  44                                 Gapped(alpha, gap_char="-"))
  45         else :
  46             consensus_seq = Seq(consensus_seq_str, alpha)
  47
  48         #TODO - Consensus base quality (BQ lines).  Note that any gaps
  49         #(* character) in the consensus does not get a quality entry.
  50         #This really needs Biopython support for per-letter-annotation.
  51
  52         #TODO? - Base segments (BS lines) which indicates which read
  53         #phrap has chosen to be the consensus at a particular position.
  54         #Perhaps as SeqFeature objects?
  55
  56         #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
  57         #Perhaps as SeqFeature objects?
  58
  59         seq_record = SeqRecord(consensus_seq,
  60                                id = ace_contig.name,
  61                                name = ace_contig.name)
  62         yield seq_record
  63     #All done