+++ /dev/null
-# Copyright 2008 by Peter Cock. All rights reserved.
-# This code is part of the Biopython distribution and governed by its
-# license. Please see the LICENSE file that should have been included
-# as part of this package.
-#
-# This module is for reading and writing PIR or NBRF format files as
-# SeqRecord objects. The code is based on Bio.SeqIO.FastaIO
-
-"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.
-
-You are expected to use this module via the Bio.SeqIO functions, or if
-the file contains a sequence alignment, optionally via Bio.AlignIO instead.
-
-This format was introduced for the Protein Information Resource (PIR), a
-project of the National Biomedical Research Foundation (NBRF). The PIR
-database itself is now part of UniProt.
-
-The file format is described online at:
-http://www.ebi.ac.uk/help/pir_frame.html
-http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)
-
-An example file in this format would be:
-
->P1;CRAB_ANAPL
-ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
- MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
- SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
- GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
- SDVPERSIPI TREEKPAIAG AQRK*
-
->P1;CRAB_BOVIN
-ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
- MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
- PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
- HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
- QASGPERTIP ITREEKPAVT AAPKK*
-
-Or, an example of a multiple sequence alignment:
-
->P1;S27231
-rhodopsin - northern leopard frog
-MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
-VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
-GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
-EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
-ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
-YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*
-
->P1;I51200
-rhodopsin - African clawed frog
-MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
-VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
-GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
-EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
-LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
-YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*
-
->P1;JN0120
-rhodopsin - Japanese lamprey
-MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
-VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
-GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
-EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
-ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
-YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*
-
-
-As with the FASTA format, each record starts with a line begining with ">"
-character. There is then a two letter sequence type (P1, F1, DL, DC, RL,
-RC, or XX), a semi colon, and the identification code. The second like is
-free text description. The remaining lines contain the sequence itself,
-terminating in an asterisk. Space separated blocks of ten letters as shown
-above are typical.
-
-Sequence codes and their meanings:
-
-P1 - Protein (complete)
-F1 - Protein (fragment)
-D1 - DNA (e.g. EMBOSS seqret output)
-DL - DNA (linear)
-DC - DNA (circular)
-RL - RNA (linear)
-RC - RNA (circular)
-N3 - tRNA
-N1 - Other functional RNA
-XX - Unknown
-"""
-
-from Bio.Alphabet import single_letter_alphabet, generic_protein, generic_dna, generic_rna
-from Bio.Seq import Seq
-from Bio.SeqRecord import SeqRecord
-
-_pir_alphabets = {"P1" : generic_protein,
- "F1" : generic_protein,
- "D1" : generic_dna,
- "DL" : generic_dna,
- "DC" : generic_dna,
- "RL" : generic_rna,
- "RC" : generic_rna,
- "N3" : generic_rna,
- "XX" : single_letter_alphabet,
- }
-
-#This is a generator function!
-def PirIterator(handle) :
- """Generator function to iterate over Fasta records (as SeqRecord objects).
-
- handle - input file
- alphabet - optional alphabet
- title2ids - A function that, when given the title of the FASTA
- file (without the beginning >), will return the id, name and
- description (in that order) for the record as a tuple of strings.
-
- If this is not given, then the entire title line will be used
- as the description, and the first word as the id and name.
-
- Note that use of title2ids matches that of Bio.Fasta.SequenceParser
- but the defaults are slightly different.
- """
- #Skip any text before the first record (e.g. blank lines, comments)
- while True :
- line = handle.readline()
- if line == "" : return #Premature end of file, or just empty?
- if line[0] == ">" :
- break
-
- while True :
- if line[0]!=">" :
- raise ValueError("Records in PIR files should start with '>' character")
- pir_type = line[1:3]
- if pir_type not in _pir_alphabets or line[3] != ";" :
- raise ValueError("Records should start with '>XX;' where XX is a valid sequence type")
- identifier = line[4:].strip()
- description = handle.readline().strip()
-
-
- lines = []
- line = handle.readline()
- while True:
- if not line : break
- if line[0] == ">": break
- #Remove trailing whitespace, and any internal spaces
- lines.append(line.rstrip().replace(" ",""))
- line = handle.readline()
- seq = "".join(lines)
- if seq[-1] != "*" :
- #Note the * terminator is present on nucleotide sequences too,
- #it is not a stop codon!
- raise ValueError("Sequences in PIR files should include a * terminator!")
-
- #Return the record and then continue...
- record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]),
- id = identifier, name = identifier,
- description = description)
- record.annotations["PIR-type"] = pir_type
- yield record
-
- if not line : return #StopIteration
- assert False, "Should not reach this line"
-
-if __name__ == "__main__" :
- print "Running quick self test"
-
- from StringIO import StringIO
- import os
-
- for name in ["clustalw", "DMA_nuc", "DMB_prot", "B_nuc", "Cw_prot"] :
- print name
- filename = "../../Tests/NBRF/%s.pir" % name
- if not os.path.isfile(filename) :
- print "Missing %s" % filename
- continue
-
- records = list(PirIterator(open(filename)))
- count = 0
- for record in records :
- count += 1
- parts = record.description.split()
- if "bases," in parts :
- assert len(record) == int(parts[parts.index("bases,")-1])
- print "Could read %s (%i records)" % (name, count)
-