+++ /dev/null
-# Copyright 2008 by Peter Cock. All rights reserved.
-# This code is part of the Biopython distribution and governed by its
-# license. Please see the LICENSE file that should have been included
-# as part of this package.
-#
-# This module is for reading and writing IntelliGenetics format files as
-# SeqRecord objects. This file format appears to be the same as the MASE
-# multiple sequence alignment format.
-
-"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.
-
-You are expected to use this module via the Bio.SeqIO functions."""
-
-from Bio.Alphabet import single_letter_alphabet
-from Bio.Seq import Seq
-from Bio.SeqRecord import SeqRecord
-
# NOTE: generator function - nothing is read from the handle until iteration
# starts.
def IgIterator(handle, alphabet=single_letter_alphabet):
    """Yield one SeqRecord per IntelliGenetics (MASE) record in a file.

    Arguments:
     - handle   - input file object; only its readline() method is used
     - alphabet - optional alphabet handed to each Seq object

    File layout as parsed here:
     - Leading lines starting with two semi-colons (the free format file
       header) are skipped.
     - Each record opens with one or more commentary lines starting with a
       semi-colon.  The leading semi-colon and surrounding whitespace are
       removed from each, and the lines are stored as one newline-joined
       string in the SeqRecord's annotations dictionary under 'comment'.
     - The next line is the title; with trailing whitespace removed it is
       used as both the record's id and its name.
     - All following lines, up to the next line starting with a semi-colon
       or the end of the file, are sequence.  Trailing whitespace and
       embedded spaces are dropped and the lines concatenated.  A single
       trailing digit one (the optional terminator) is removed.

    Raises ValueError if a record does not begin with a semi-colon, or if a
    digit one remains in the sequence after the terminator was removed.
    """
    # Step over the optional ";;" file header, stopping on the first line
    # that is not part of it (or on end of file).
    current = handle.readline()
    while current and current.startswith(";;"):
        current = handle.readline()

    # "current" always holds the first unprocessed line; an empty string
    # means the end of the file has been reached.
    while current:
        if not current.startswith(";"):
            raise ValueError(
                "Records should start with ';' and not:\n%s" % repr(current))

        # Commentary lines.  Both "; ..." and ";..." occur in practice, so
        # drop the semi-colon first and strip the whitespace afterwards.
        # This follows the GenBank/SwissProt parser convention of keeping
        # the comments as one long newline separated string.
        # TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
        remarks = []
        while current.startswith(";"):
            remarks.append(current[1:].strip())
            current = handle.readline()

        # First non-comment line is the title.  (If the file ended inside
        # the commentary this is an empty string.)
        title = current.rstrip()

        # Gather sequence lines until the next record or end of file.
        pieces = []
        current = handle.readline()
        while current and not current.startswith(";"):
            pieces.append(current.rstrip().replace(" ", ""))
            current = handle.readline()
        residues = "".join(pieces)

        if residues.endswith("1"):
            # Drop the optional terminator (digit one)
            residues = residues[:-1]
        if "1" in residues:
            raise ValueError("Potential terminator digit one found within sequence.")

        record = SeqRecord(Seq(residues, alphabet), id=title, name=title)
        record.annotations['comment'] = "\n".join(remarks)
        yield record

    # Sanity check - the loop above only finishes at the end of the file
    assert not current
-
if __name__ == "__main__":
    # Quick self test: parse every ".txt" file in the IntelliGenetics test
    # directory and print each record's id and length.  The path is relative,
    # so this only finds the files when run from this module's own directory.
    #
    # Each print call takes exactly one argument, so the output is the same
    # whether print is the Python 2 statement or the Python 3 function.
    print("Running quick self test")

    import os
    path = "../../Tests/IntelliGenetics/"
    if os.path.isdir(path):
        for filename in os.listdir(path):
            if os.path.splitext(filename)[-1] == ".txt":
                print("")
                print(filename)
                print("-" * len(filename))
                handle = open(os.path.join(path, filename))
                try:
                    for record in IgIterator(handle):
                        print("%s %s" % (record.id, len(record)))
                finally:
                    # Close the file even if the parser raises part way
                    # through, so a bad test file does not leak the handle.
                    handle.close()
        print("Done")
    else:
        print("Could not find input files")