binaries/src/disembl/biopython-1.50/Bio/SeqIO/PirIO.py

   1 # Copyright 2008 by Peter Cock.  All rights reserved.
   2 # This code is part of the Biopython distribution and governed by its
   3 # license.  Please see the LICENSE file that should have been included
   4 # as part of this package.
   5 #
   6 # This module is for reading and writing PIR or NBRF format files as
   7 # SeqRecord objects.  The code is based on Bio.SeqIO.FastaIO
   8
   9 """Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.
  10
  11 You are expected to use this module via the Bio.SeqIO functions, or if
  12 the file contains a sequence alignment, optionally via Bio.AlignIO instead.
  13
  14 This format was introduced for the Protein Information Resource (PIR), a
  15 project of the National Biomedical Research Foundation (NBRF).  The PIR
  16 database itself is now part of UniProt.
  17
  18 The file format is described online at:
  19 http://www.ebi.ac.uk/help/pir_frame.html
  20 http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)
  21
  22 An example file in this format would be:
  23
  24 >P1;CRAB_ANAPL
  25 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
  26   MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
  27   SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
  28   GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
  29   SDVPERSIPI TREEKPAIAG AQRK*
  30
  31 >P1;CRAB_BOVIN
  32 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
  33   MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
  34   PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
  35   HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
  36   QASGPERTIP ITREEKPAVT AAPKK*
  37
  38 Or, an example of a multiple sequence alignment:
  39
  40 >P1;S27231
  41 rhodopsin - northern leopard frog
  42 MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
  43 VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
  44 GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
  45 EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
  46 ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
  47 YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*
  48
  49 >P1;I51200
  50 rhodopsin - African clawed frog
  51 MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
  52 VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
  53 GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
  54 EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
  55 LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
  56 YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*
  57
  58 >P1;JN0120
  59 rhodopsin - Japanese lamprey
  60 MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
  61 VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
  62 GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
  63 EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
  64 ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
  65 YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*
  66
  67
  68 As with the FASTA format, each record starts with a line begining with ">"
  69 character.  There is then a two letter sequence type (P1, F1, DL, DC, RL,
  70 RC, or XX), a semi colon, and the identification code.  The second like is
  71 free text description.  The remaining lines contain the sequence itself,
  72 terminating in an asterisk.  Space separated blocks of ten letters as shown
  73 above are typical.
  74
  75 Sequence codes and their meanings:
  76
  77 P1 - Protein (complete)
  78 F1 - Protein (fragment)
  79 D1 - DNA (e.g. EMBOSS seqret output)
  80 DL - DNA (linear)
  81 DC - DNA (circular)
  82 RL - RNA (linear)
  83 RC - RNA (circular)
  84 N3 - tRNA
  85 N1 - Other functional RNA
  86 XX - Unknown
  87 """
  88
  89 from Bio.Alphabet import single_letter_alphabet, generic_protein, generic_dna, generic_rna
  90 from Bio.Seq import Seq
  91 from Bio.SeqRecord import SeqRecord
  92
  93 _pir_alphabets = {"P1" : generic_protein,
  94                   "F1" : generic_protein,
  95                   "D1" : generic_dna,
  96                   "DL" : generic_dna,
  97                   "DC" : generic_dna,
  98                   "RL" : generic_rna,
  99                   "RC" : generic_rna,
 100                   "N3" : generic_rna,
 101                   "XX" : single_letter_alphabet,
 102                   }
 103
 104 #This is a generator function!
 105 def PirIterator(handle) :
 106     """Generator function to iterate over Fasta records (as SeqRecord objects).
 107
 108     handle - input file
 109     alphabet - optional alphabet
 110     title2ids - A function that, when given the title of the FASTA
 111     file (without the beginning >), will return the id, name and
 112     description (in that order) for the record as a tuple of strings.
 113
 114     If this is not given, then the entire title line will be used
 115     as the description, and the first word as the id and name.
 116
 117     Note that use of title2ids matches that of Bio.Fasta.SequenceParser
 118     but the defaults are slightly different.
 119     """
 120     #Skip any text before the first record (e.g. blank lines, comments)
 121     while True :
 122         line = handle.readline()
 123         if line == "" : return #Premature end of file, or just empty?
 124         if line[0] == ">" :
 125             break
 126
 127     while True :
 128         if line[0]!=">" :
 129             raise ValueError("Records in PIR files should start with '>' character")
 130         pir_type = line[1:3]
 131         if pir_type not in _pir_alphabets or line[3] != ";" :
 132             raise ValueError("Records should start with '>XX;' where XX is a valid sequence type")
 133         identifier = line[4:].strip()
 134         description = handle.readline().strip()
 135
 136
 137         lines = []
 138         line = handle.readline()
 139         while True:
 140             if not line : break
 141             if line[0] == ">": break
 142             #Remove trailing whitespace, and any internal spaces
 143             lines.append(line.rstrip().replace(" ",""))
 144             line = handle.readline()
 145         seq = "".join(lines)
 146         if seq[-1] != "*" :
 147             #Note the * terminator is present on nucleotide sequences too,
 148             #it is not a stop codon!
 149             raise ValueError("Sequences in PIR files should include a * terminator!")
 150
 151         #Return the record and then continue...
 152         record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]),
 153                            id = identifier, name = identifier,
 154                            description = description)
 155         record.annotations["PIR-type"] = pir_type
 156         yield record
 157
 158         if not line : return #StopIteration
 159     assert False, "Should not reach this line"
 160
 161 if __name__ == "__main__" :
 162     print "Running quick self test"
 163
 164     from StringIO import StringIO
 165     import os
 166
 167     for name in ["clustalw",  "DMA_nuc", "DMB_prot", "B_nuc", "Cw_prot"] :
 168         print name
 169         filename = "../../Tests/NBRF/%s.pir" % name
 170         if not os.path.isfile(filename) :
 171             print "Missing %s" % filename
 172             continue
 173
 174         records = list(PirIterator(open(filename)))
 175         count = 0
 176         for record in records :
 177             count += 1
 178             parts = record.description.split()
 179             if "bases," in parts :
 180                 assert len(record) == int(parts[parts.index("bases,")-1])
 181         print "Could read %s (%i records)" % (name, count)
 182