--- /dev/null
+# Copyright 2008 by Peter Cock. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+#
+# This module is for reading and writing PIR or NBRF format files as
+# SeqRecord objects. The code is based on Bio.SeqIO.FastaIO
+
+"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.
+
+You are expected to use this module via the Bio.SeqIO functions, or if
+the file contains a sequence alignment, optionally via Bio.AlignIO instead.
+
+This format was introduced for the Protein Information Resource (PIR), a
+project of the National Biomedical Research Foundation (NBRF). The PIR
+database itself is now part of UniProt.
+
+The file format is described online at:
+http://www.ebi.ac.uk/help/pir_frame.html
+http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)
+
+An example file in this format would be:
+
+>P1;CRAB_ANAPL
+ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
+ MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
+ SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
+ GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
+ SDVPERSIPI TREEKPAIAG AQRK*
+
+>P1;CRAB_BOVIN
+ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
+ MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
+ PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
+ HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
+ QASGPERTIP ITREEKPAVT AAPKK*
+
+Or, an example of a multiple sequence alignment:
+
+>P1;S27231
+rhodopsin - northern leopard frog
+MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
+VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
+GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
+EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
+ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
+YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*
+
+>P1;I51200
+rhodopsin - African clawed frog
+MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
+VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
+GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
+EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
+LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
+YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*
+
+>P1;JN0120
+rhodopsin - Japanese lamprey
+MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
+VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
+GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
+EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
+ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
+YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*
+
+
+As with the FASTA format, each record starts with a line begining with ">"
+character. There is then a two letter sequence type (P1, F1, DL, DC, RL,
+RC, or XX), a semi colon, and the identification code. The second like is
+free text description. The remaining lines contain the sequence itself,
+terminating in an asterisk. Space separated blocks of ten letters as shown
+above are typical.
+
+Sequence codes and their meanings:
+
+P1 - Protein (complete)
+F1 - Protein (fragment)
+D1 - DNA (e.g. EMBOSS seqret output)
+DL - DNA (linear)
+DC - DNA (circular)
+RL - RNA (linear)
+RC - RNA (circular)
+N3 - tRNA
+N1 - Other functional RNA
+XX - Unknown
+"""
+
+from Bio.Alphabet import single_letter_alphabet, generic_protein, generic_dna, generic_rna
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+_pir_alphabets = {"P1" : generic_protein,
+ "F1" : generic_protein,
+ "D1" : generic_dna,
+ "DL" : generic_dna,
+ "DC" : generic_dna,
+ "RL" : generic_rna,
+ "RC" : generic_rna,
+ "N3" : generic_rna,
+ "XX" : single_letter_alphabet,
+ }
+
+#This is a generator function!
+def PirIterator(handle) :
+ """Generator function to iterate over Fasta records (as SeqRecord objects).
+
+ handle - input file
+ alphabet - optional alphabet
+ title2ids - A function that, when given the title of the FASTA
+ file (without the beginning >), will return the id, name and
+ description (in that order) for the record as a tuple of strings.
+
+ If this is not given, then the entire title line will be used
+ as the description, and the first word as the id and name.
+
+ Note that use of title2ids matches that of Bio.Fasta.SequenceParser
+ but the defaults are slightly different.
+ """
+ #Skip any text before the first record (e.g. blank lines, comments)
+ while True :
+ line = handle.readline()
+ if line == "" : return #Premature end of file, or just empty?
+ if line[0] == ">" :
+ break
+
+ while True :
+ if line[0]!=">" :
+ raise ValueError("Records in PIR files should start with '>' character")
+ pir_type = line[1:3]
+ if pir_type not in _pir_alphabets or line[3] != ";" :
+ raise ValueError("Records should start with '>XX;' where XX is a valid sequence type")
+ identifier = line[4:].strip()
+ description = handle.readline().strip()
+
+
+ lines = []
+ line = handle.readline()
+ while True:
+ if not line : break
+ if line[0] == ">": break
+ #Remove trailing whitespace, and any internal spaces
+ lines.append(line.rstrip().replace(" ",""))
+ line = handle.readline()
+ seq = "".join(lines)
+ if seq[-1] != "*" :
+ #Note the * terminator is present on nucleotide sequences too,
+ #it is not a stop codon!
+ raise ValueError("Sequences in PIR files should include a * terminator!")
+
+ #Return the record and then continue...
+ record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]),
+ id = identifier, name = identifier,
+ description = description)
+ record.annotations["PIR-type"] = pir_type
+ yield record
+
+ if not line : return #StopIteration
+ assert False, "Should not reach this line"
+
+if __name__ == "__main__" :
+ print "Running quick self test"
+
+ from StringIO import StringIO
+ import os
+
+ for name in ["clustalw", "DMA_nuc", "DMB_prot", "B_nuc", "Cw_prot"] :
+ print name
+ filename = "../../Tests/NBRF/%s.pir" % name
+ if not os.path.isfile(filename) :
+ print "Missing %s" % filename
+ continue
+
+ records = list(PirIterator(open(filename)))
+ count = 0
+ for record in records :
+ count += 1
+ parts = record.description.split()
+ if "bases," in parts :
+ assert len(record) == int(parts[parts.index("bases,")-1])
+ print "Could read %s (%i records)" % (name, count)
+