X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=website%2Farchive%2Fbinaries%2Fmac%2Fsrc%2Fdisembl%2Fbiopython-1.50%2FBio%2FSeqIO%2FPirIO.py;fp=website%2Farchive%2Fbinaries%2Fmac%2Fsrc%2Fdisembl%2Fbiopython-1.50%2FBio%2FSeqIO%2FPirIO.py;h=0000000000000000000000000000000000000000;hb=fb21789ba11ae9b9f6abed1b01d4542e54abfe34;hp=953076cb448badbc3294375c3fb7f27801d38cbe;hpb=7d68cf3489c816b2ed7afb1925a6de2168eccf4a;p=jabaws.git diff --git a/website/archive/binaries/mac/src/disembl/biopython-1.50/Bio/SeqIO/PirIO.py b/website/archive/binaries/mac/src/disembl/biopython-1.50/Bio/SeqIO/PirIO.py deleted file mode 100644 index 953076c..0000000 --- a/website/archive/binaries/mac/src/disembl/biopython-1.50/Bio/SeqIO/PirIO.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright 2008 by Peter Cock. All rights reserved. -# This code is part of the Biopython distribution and governed by its -# license. Please see the LICENSE file that should have been included -# as part of this package. -# -# This module is for reading and writing PIR or NBRF format files as -# SeqRecord objects. The code is based on Bio.SeqIO.FastaIO - -"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format. - -You are expected to use this module via the Bio.SeqIO functions, or if -the file contains a sequence alignment, optionally via Bio.AlignIO instead. - -This format was introduced for the Protein Information Resource (PIR), a -project of the National Biomedical Research Foundation (NBRF). The PIR -database itself is now part of UniProt. - -The file format is described online at: -http://www.ebi.ac.uk/help/pir_frame.html -http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down) - -An example file in this format would be: - ->P1;CRAB_ANAPL -ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). - MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR - SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH - GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ - SDVPERSIPI TREEKPAIAG AQRK* - ->P1;CRAB_BOVIN -ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). - MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR - PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV - HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK - QASGPERTIP ITREEKPAVT AAPKK* - -Or, an example of a multiple sequence alignment: - ->P1;S27231 -rhodopsin - northern leopard frog -MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY -VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG -GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP -EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES -ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI -YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA* - ->P1;I51200 -rhodopsin - African clawed frog -MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF -VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG -GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP -EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES -LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI -YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA* - ->P1;JN0120 -rhodopsin - Japanese lamprey -MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF -VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG -GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP -EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES -ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL -YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA* - - -As with the FASTA format, each record starts with a line begining with ">" -character. There is then a two letter sequence type (P1, F1, DL, DC, RL, -RC, or XX), a semi colon, and the identification code. The second like is -free text description. The remaining lines contain the sequence itself, -terminating in an asterisk. Space separated blocks of ten letters as shown -above are typical. - -Sequence codes and their meanings: - -P1 - Protein (complete) -F1 - Protein (fragment) -D1 - DNA (e.g. EMBOSS seqret output) -DL - DNA (linear) -DC - DNA (circular) -RL - RNA (linear) -RC - RNA (circular) -N3 - tRNA -N1 - Other functional RNA -XX - Unknown -""" - -from Bio.Alphabet import single_letter_alphabet, generic_protein, generic_dna, generic_rna -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord - -_pir_alphabets = {"P1" : generic_protein, - "F1" : generic_protein, - "D1" : generic_dna, - "DL" : generic_dna, - "DC" : generic_dna, - "RL" : generic_rna, - "RC" : generic_rna, - "N3" : generic_rna, - "XX" : single_letter_alphabet, - } - -#This is a generator function! -def PirIterator(handle) : - """Generator function to iterate over Fasta records (as SeqRecord objects). - - handle - input file - alphabet - optional alphabet - title2ids - A function that, when given the title of the FASTA - file (without the beginning >), will return the id, name and - description (in that order) for the record as a tuple of strings. - - If this is not given, then the entire title line will be used - as the description, and the first word as the id and name. - - Note that use of title2ids matches that of Bio.Fasta.SequenceParser - but the defaults are slightly different. - """ - #Skip any text before the first record (e.g. blank lines, comments) - while True : - line = handle.readline() - if line == "" : return #Premature end of file, or just empty? - if line[0] == ">" : - break - - while True : - if line[0]!=">" : - raise ValueError("Records in PIR files should start with '>' character") - pir_type = line[1:3] - if pir_type not in _pir_alphabets or line[3] != ";" : - raise ValueError("Records should start with '>XX;' where XX is a valid sequence type") - identifier = line[4:].strip() - description = handle.readline().strip() - - - lines = [] - line = handle.readline() - while True: - if not line : break - if line[0] == ">": break - #Remove trailing whitespace, and any internal spaces - lines.append(line.rstrip().replace(" ","")) - line = handle.readline() - seq = "".join(lines) - if seq[-1] != "*" : - #Note the * terminator is present on nucleotide sequences too, - #it is not a stop codon! - raise ValueError("Sequences in PIR files should include a * terminator!") - - #Return the record and then continue... - record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]), - id = identifier, name = identifier, - description = description) - record.annotations["PIR-type"] = pir_type - yield record - - if not line : return #StopIteration - assert False, "Should not reach this line" - -if __name__ == "__main__" : - print "Running quick self test" - - from StringIO import StringIO - import os - - for name in ["clustalw", "DMA_nuc", "DMB_prot", "B_nuc", "Cw_prot"] : - print name - filename = "../../Tests/NBRF/%s.pir" % name - if not os.path.isfile(filename) : - print "Missing %s" % filename - continue - - records = list(PirIterator(open(filename))) - count = 0 - for record in records : - count += 1 - parts = record.description.split() - if "bases," in parts : - assert len(record) == int(parts[parts.index("bases,")-1]) - print "Could read %s (%i records)" % (name, count) -