1 # Copyright 2008 by Peter Cock. All rights reserved.
2 # This code is part of the Biopython distribution and governed by its
3 # license. Please see the LICENSE file that should have been included
4 # as part of this package.
6 # This module is for reading and writing PIR or NBRF format files as
7 # SeqRecord objects. The code is based on Bio.SeqIO.FastaIO
9 """Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.
11 You are expected to use this module via the Bio.SeqIO functions, or if
12 the file contains a sequence alignment, optionally via Bio.AlignIO instead.
14 This format was introduced for the Protein Information Resource (PIR), a
15 project of the National Biomedical Research Foundation (NBRF). The PIR
16 database itself is now part of UniProt.
18 The file format is described online at:
19 http://www.ebi.ac.uk/help/pir_frame.html
20 http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)
22 An example file in this format would be:
25 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
26 MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
27 SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
28 GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
29 SDVPERSIPI TREEKPAIAG AQRK*
32 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
33 MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
34 PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
35 HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
36 QASGPERTIP ITREEKPAVT AAPKK*
38 Or, an example of a multiple sequence alignment:
41 rhodopsin - northern leopard frog
42 MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
43 VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
44 GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
45 EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
46 ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
47 YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*
50 rhodopsin - African clawed frog
51 MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
52 VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
53 GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
54 EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
55 LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
56 YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*
59 rhodopsin - Japanese lamprey
60 MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
61 VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
62 GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
63 EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
64 ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
65 YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*
68 As with the FASTA format, each record starts with a line begining with ">"
69 character. There is then a two letter sequence type (P1, F1, DL, DC, RL,
70 RC, or XX), a semi colon, and the identification code. The second like is
71 free text description. The remaining lines contain the sequence itself,
72 terminating in an asterisk. Space separated blocks of ten letters as shown
75 Sequence codes and their meanings:
77 P1 - Protein (complete)
78 F1 - Protein (fragment)
79 D1 - DNA (e.g. EMBOSS seqret output)
85 N1 - Other functional RNA
89 from Bio.Alphabet import single_letter_alphabet, generic_protein, generic_dna, generic_rna
90 from Bio.Seq import Seq
91 from Bio.SeqRecord import SeqRecord
93 _pir_alphabets = {"P1" : generic_protein,
94 "F1" : generic_protein,
101 "XX" : single_letter_alphabet,
104 #This is a generator function!
105 def PirIterator(handle) :
106 """Generator function to iterate over Fasta records (as SeqRecord objects).
109 alphabet - optional alphabet
110 title2ids - A function that, when given the title of the FASTA
111 file (without the beginning >), will return the id, name and
112 description (in that order) for the record as a tuple of strings.
114 If this is not given, then the entire title line will be used
115 as the description, and the first word as the id and name.
117 Note that use of title2ids matches that of Bio.Fasta.SequenceParser
118 but the defaults are slightly different.
120 #Skip any text before the first record (e.g. blank lines, comments)
122 line = handle.readline()
123 if line == "" : return #Premature end of file, or just empty?
129 raise ValueError("Records in PIR files should start with '>' character")
131 if pir_type not in _pir_alphabets or line[3] != ";" :
132 raise ValueError("Records should start with '>XX;' where XX is a valid sequence type")
133 identifier = line[4:].strip()
134 description = handle.readline().strip()
138 line = handle.readline()
141 if line[0] == ">": break
142 #Remove trailing whitespace, and any internal spaces
143 lines.append(line.rstrip().replace(" ",""))
144 line = handle.readline()
147 #Note the * terminator is present on nucleotide sequences too,
148 #it is not a stop codon!
149 raise ValueError("Sequences in PIR files should include a * terminator!")
151 #Return the record and then continue...
152 record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]),
153 id = identifier, name = identifier,
154 description = description)
155 record.annotations["PIR-type"] = pir_type
158 if not line : return #StopIteration
159 assert False, "Should not reach this line"
161 if __name__ == "__main__" :
162 print "Running quick self test"
164 from StringIO import StringIO
167 for name in ["clustalw", "DMA_nuc", "DMB_prot", "B_nuc", "Cw_prot"] :
169 filename = "../../Tests/NBRF/%s.pir" % name
170 if not os.path.isfile(filename) :
171 print "Missing %s" % filename
174 records = list(PirIterator(open(filename)))
176 for record in records :
178 parts = record.description.split()
179 if "bases," in parts :
180 assert len(record) == int(parts[parts.index("bases,")-1])
181 print "Could read %s (%i records)" % (name, count)