1 # Copyright 2008 by Peter Cock. All rights reserved.
2 # This code is part of the Biopython distribution and governed by its
3 # license. Please see the LICENSE file that should have been included
4 # as part of this package.
6 # This module is for reading and writing IntelliGenetics format files as
7 # SeqRecord objects. This file format appears to be the same as the MASE
8 # multiple sequence alignment format.
10 """Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.
12 You are expected to use this module via the Bio.SeqIO functions."""
14 from Bio.Alphabet import single_letter_alphabet
15 from Bio.Seq import Seq
16 from Bio.SeqRecord import SeqRecord
18 #This is a generator function!
def IgIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over IntelliGenetics records (as SeqRecord objects).

    handle - input file (only its readline() method is used)
    alphabet - optional alphabet

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.

    The line following the commentary is taken as the record title and is
    used (with trailing whitespace removed) as both the id and the name of
    the SeqRecord.  All lines after the title, up to the next line starting
    with a semi-colon or the end of the file, are joined to form the
    sequence once trailing whitespace and internal spaces are removed.

    Raises ValueError if a record does not start with a semi-colon, or if
    the digit one (the optional terminator) appears anywhere other than at
    the very end of the sequence.
    """
    #Skip any file header text before the first record (;; lines)
    #Note readline() returns an empty string at the end of the file, which
    #does not start with ";;", so this also stops on an empty file.
    line = handle.readline()
    while line.startswith(";;"):
        line = handle.readline()

    #Now iterate over the records
    while line:
        if not line.startswith(";"):
            raise ValueError(
                "Records should start with ';' and not:\n%s" % repr(line))

        #Try and agree with SeqRecord convention from the GenBank parser,
        #(and followed in the SwissProt parser) which stores the comments
        #as a long string with newlines under annotations key 'comment'.

        #Note some examples use "; ..." and others ";..."
        comment_lines = []
        while line.startswith(";"):
            #TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
            comment_lines.append(line[1:].strip())
            line = handle.readline()

        #The first non-comment line is the record's title
        title = line.rstrip()

        seq_lines = []
        while True:
            line = handle.readline()
            #Stop at the end of the file, or at the next record's comments
            if not line or line.startswith(";"):
                break
            #Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ", ""))
        seq_str = "".join(seq_lines)
        if seq_str.endswith("1"):
            #Remove the optional terminator (digit one)
            seq_str = seq_str[:-1]
        if "1" in seq_str:
            raise ValueError("Potential terminator digit one found within sequence.")

        #Return the record and then continue...
        record = SeqRecord(Seq(seq_str, alphabet),
                           id=title, name=title)
        record.annotations['comment'] = "\n".join(comment_lines)
        yield record

    #We should be at the end of the file now (the loop above only exits
    #once readline() has returned an empty string).
if __name__ == "__main__":
    #Quick self test: parse every *.txt file in the IntelliGenetics test
    #directory and print the id and length of each record found.
    import os

    print("Running quick self test")

    path = "../../Tests/IntelliGenetics/"
    if os.path.isdir(path):
        for filename in os.listdir(path):
            if os.path.splitext(filename)[-1] == ".txt":
                #Show the filename, underlined, before its records
                print(filename)
                print("-" * len(filename))
                handle = open(os.path.join(path, filename))
                try:
                    for record in IgIterator(handle):
                        #Single string argument so the output is the same
                        #under both Python 2 and Python 3
                        print("%s %s" % (record.id, len(record)))
                finally:
                    #Always release the file, even if parsing fails
                    handle.close()
    else:
        print("Could not find input files")