1 # Copyright 2000-2002 Andrew Dalke.
2 # Copyright 2002-2004 Brad Chapman.
3 # Copyright 2006-2009 by Peter Cock.
5 # This code is part of the Biopython distribution and governed by its
6 # license. Please see the LICENSE file that should have been included
7 # as part of this package.
8 """Represent a Sequence Record, a sequence with annotation."""
9 __docformat__ = "epytext en" #Simple markup to show doctests nicely
11 # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL
12 # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes
13 # need to be in sync (this is the BioSQL "Database SeqRecord", see
14 # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class)
16 class _RestrictedDict(dict):
17 """Dict which only allows sequences of given length as values (PRIVATE).
19 This simple subclass of the python dictionary is used in the SeqRecord
20 object for holding per-letter-annotations. This class is intended to
21 prevent simple errors by only allowing python sequences (e.g. lists,
22 strings and tuples) to be stored, and only if their length matches that
23 expected (the length of the SeqRecord's seq object). It cannot however
24 prevent the entries being edited in situ (for example appending entries
27 def __init__(self, length) :
28 """Create an EMPTY restricted dictionary."""
30 self._length = int(length)
31 def __setitem__(self, key, value) :
32 if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \
33 or len(value) != self._length :
34 raise TypeError("We only allow python sequences (lists, tuples or "
35 "strings) of length %i." % self._length)
36 dict.__setitem__(self, key, value)
38 class SeqRecord(object):
39 """A SeqRecord object holds a sequence and information about it.
42 - id - Identifier such as a locus tag (string)
43 - seq - The sequence itself (Seq object)
45 Additional attributes:
46 - name - Sequence name, e.g. gene name (string)
47 - description - Additional text (string)
48 - dbxrefs - List of database cross references (list of strings)
49 - features - Any (sub)features defined (list of SeqFeature objects)
50 - annotations - Further information about the whole sequence (dictionary)
51 Most entries are lists of strings.
52 - letter_annotations - Per letter/symbol annotation (restricted
53 dictionary). This holds python sequences (lists, strings
54 or tuples) whose length matches that of the sequence.
55 A typical use would be to hold a list of integers
56 representing sequencing quality scores, or a string
57 representing the secondary structure.
59 You will typically use Bio.SeqIO to read in sequences from files as
60 SeqRecord objects. However, you may want to create your own SeqRecord
61 objects directly (see the __init__ method for further details):
63 >>> from Bio.Seq import Seq
64 >>> from Bio.SeqRecord import SeqRecord
65 >>> from Bio.Alphabet import IUPAC
66 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
68 ... id="YP_025292.1", name="HokC",
69 ... description="toxic membrane protein")
73 Description: toxic membrane protein
75 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
77 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
78 for this. For the special case where you want the SeqRecord turned into
79 a string in a particular file format there is a format method which uses
82 >>> print record.format("fasta")
83 >YP_025292.1 toxic membrane protein
84 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
87 def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
88 description = "<unknown description>", dbxrefs = None,
90 """Create a SeqRecord.
93 - seq - Sequence, required (Seq or Mutable object)
94 - id - Sequence identifier, recommended (string)
95 - name - Sequence name, optional (string)
96 - description - Sequence description, optional (string)
97 - dbxrefs - Database cross references, optional (list of strings)
98 - features - Any (sub)features, optional (list of SeqFeature objects)
100 You will typically use Bio.SeqIO to read in sequences from files as
101 SeqRecord objects. However, you may want to create your own SeqRecord
104 Note that while an id is optional, we strongly recommend you supply a
105 unique id string for each record. This is especially important
106 if you wish to write your sequences to a file.
108 If you don't have the actual sequence, but you do know its length,
109 then using the UnknownSeq object from Bio.Seq is appropriate.
111 You can create a 'blank' SeqRecord object, and then populate the
112 attributes later. Note that currently the annotations and the
113 letter_annotations dictionaries cannot be specified when creating
116 if id is not None and not isinstance(id, basestring) :
117 #Lots of existing code uses id=None... this may be a bad idea.
118 raise TypeError("id argument should be a string")
119 if not isinstance(name, basestring) :
120 raise TypeError("name argument should be a string")
121 if not isinstance(description, basestring) :
122 raise TypeError("description argument should be a string")
123 if dbxrefs is not None and not isinstance(dbxrefs, list) :
124 raise TypeError("dbxrefs argument should be a list (of strings)")
125 if features is not None and not isinstance(features, list) :
126 raise TypeError("features argument should be a list (of SeqFeature objects)")
130 self.description = description
133 self.dbxrefs = dbxrefs
134 # annotations about the whole sequence
135 self.annotations = {}
137 # annotations about each letter in the sequence
139 #Should we allow this and use a normal unrestricted dict?
140 self._per_letter_annotations = _RestrictedDict(length=0)
143 self._per_letter_annotations = _RestrictedDict(length=len(seq))
145 raise TypeError("seq argument should be a Seq or MutableSeq")
147 # annotations about parts of the sequence
150 self.features = features
152 #TODO - Just make this a read only property?
153 def _set_per_letter_annotations(self, value) :
154 if not isinstance(value, dict) :
155 raise TypeError("The per-letter-annotations should be a "
156 "(restricted) dictionary.")
157 #Turn this into a restricted-dictionary (and check the entries)
159 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
160 except AttributeError :
162 self._per_letter_annotations = _RestrictedDict(length=0)
163 self._per_letter_annotations.update(value)
164 letter_annotations = property( \
165 fget=lambda self : self._per_letter_annotations,
166 fset=_set_per_letter_annotations,
167 doc="""Dictionary of per-letter-annotation for the sequence.
169 For example, this can hold quality scores used in FASTQ or QUAL files.
170 Consider this example using Bio.SeqIO to read in an example Solexa
171 variant FASTQ file as a SeqRecord:
173 >>> from Bio import SeqIO
174 >>> handle = open("Quality/solexa.fastq", "rU")
175 >>> record = SeqIO.read(handle, "fastq-solexa")
177 >>> print record.id, record.seq
178 slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
179 >>> print record.letter_annotations.keys()
181 >>> print record.letter_annotations["solexa_quality"]
182 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
184 The per-letter-annotaions get sliced automatically if you slice the
185 parent SeqRecord, for example taking the last ten bases:
187 >>> sub_record = record[-10:]
188 >>> print sub_record.id, sub_record.seq
189 slxa_0013_1_0001_24 CTTATACACC
190 >>> print sub_record.letter_annotations["solexa_quality"]
191 [-6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
193 Any python sequence (i.e. list, tuple or string) can be recorded in
194 the SeqRecord's letter_annotations dictionary as long as the length
195 matches that of the SeqRecord's sequence. e.g.
197 >>> len(sub_record.letter_annotations)
199 >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
200 >>> len(sub_record.letter_annotations)
203 You can delete entries from the letter_annotations dictionary as usual:
205 >>> del sub_record.letter_annotations["solexa_quality"]
206 >>> sub_record.letter_annotations
207 {'dummy': 'abcdefghij'}
209 You can completely clear the dictionary easily as follows:
211 >>> sub_record.letter_annotations = {}
212 >>> sub_record.letter_annotations
216 def _set_seq(self, value) :
217 #TODO - Add a deprecation warning that the seq should be write only?
218 if self._per_letter_annotations :
219 #TODO - Make this a warning? Silently empty the dictionary?
220 raise ValueError("You must empty the letter annotations first!")
223 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
224 except AttributeError :
226 self._per_letter_annotations = _RestrictedDict(length=0)
228 seq = property(fget=lambda self : self._seq,
230 doc="The sequence itself, as a Seq or MutableSeq object.")
232 def __getitem__(self, index) :
233 """Returns a sub-sequence or an individual letter.
235 Splicing, e.g. my_record[5:10], returns a new SeqRecord for
236 that sub-sequence with approriate annotation preserved. The
237 name, id and description are kept.
239 Any per-letter-annotations are sliced to match the requested
240 sub-sequence. Unless a stride is used, all those features
241 which fall fully within the subsequence are included (with
242 their locations adjusted accordingly).
244 However, the annotations dictionary and the dbxrefs list are
245 not used for the new SeqRecord, as in general they may not
246 apply to the subsequence. If you want to preserve them, you
247 must explictly copy them to the new SeqRecord yourself.
249 Using an integer index, e.g. my_record[5] is shorthand for
250 extracting that letter from the sequence, my_record.seq[5].
252 For example, consider this short protein and its secondary
253 structure as encoded by the PDB (e.g. H for alpha helices),
254 plus a simple feature for its histidine self phosphorylation
257 >>> from Bio.Seq import Seq
258 >>> from Bio.SeqRecord import SeqRecord
259 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
260 >>> from Bio.Alphabet import IUPAC
261 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
262 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR",
264 ... id="1JOY", name="EnvZ",
265 ... description="Homodimeric domain of EnvZ from E. coli")
266 >>> rec.letter_annotations["secondary_structure"] = \
267 " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT "
268 >>> rec.features.append(SeqFeature(FeatureLocation(20,21),
271 Now let's have a quick look at the full record,
276 Description: Homodimeric domain of EnvZ from E. coli
277 Number of features: 1
278 Per letter annotation for: secondary_structure
279 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
280 >>> print rec.letter_annotations["secondary_structure"]
281 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT
282 >>> print rec.features[0].location
285 Now let's take a sub sequence, here chosen as the first (fractured)
286 alpha helix which includes the histidine phosphorylation site:
292 Description: Homodimeric domain of EnvZ from E. coli
293 Number of features: 1
294 Per letter annotation for: secondary_structure
295 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein())
296 >>> print sub.letter_annotations["secondary_structure"]
297 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH
298 >>> print sub.features[0].location
301 You can also of course omit the start or end values, for
302 example to get the first ten letters only:
307 Description: Homodimeric domain of EnvZ from E. coli
308 Number of features: 0
309 Per letter annotation for: secondary_structure
310 Seq('MAAGVKQLAD', IUPACProtein())
312 Or for the last ten letters:
317 Description: Homodimeric domain of EnvZ from E. coli
318 Number of features: 0
319 Per letter annotation for: secondary_structure
320 Seq('IIEQFIDYLR', IUPACProtein())
322 If you omit both, then you get a copy of the original record (although
323 lacking the annotations and dbxrefs):
328 Description: Homodimeric domain of EnvZ from E. coli
329 Number of features: 1
330 Per letter annotation for: secondary_structure
331 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
333 Finally, indexing with a simple integer is shorthand for pulling out
334 that letter from the sequence directly:
341 if isinstance(index, int) :
342 #NOTE - The sequence level annotation like the id, name, etc
343 #do not really apply to a single character. However, should
344 #we try and expose any per-letter-annotation here? If so how?
345 return self.seq[index]
346 elif isinstance(index, slice) :
347 if self.seq is None :
348 raise ValueError("If the sequence is None, we cannot slice it.")
349 parent_length = len(self)
350 answer = self.__class__(self.seq[index],
353 description=self.description)
354 #TODO - The desription may no longer apply.
355 #It would be safer to change it to something
356 #generic like "edited" or the default value.
358 #Don't copy the annotation dict and dbxefs list,
359 #they may not apply to a subsequence.
360 #answer.annotations = dict(self.annotations.iteritems())
361 #answer.dbxrefs = self.dbxrefs[:]
363 #TODO - Cope with strides by generating ambiguous locations?
364 if index.step is None or index.step == 1 :
365 #Select relevant features, add them with shifted locations
366 if index.start is None :
370 if index.stop is None :
374 if (start < 0 or stop < 0) and parent_length == 0 :
376 "Cannot support negative indices without the sequence length"
378 start = parent_length - start
380 stop = parent_length - stop + 1
381 #assert str(self.seq)[index] == str(self.seq)[start:stop]
382 for f in self.features :
383 if start <= f.location.start.position \
384 and f.location.end.position < stop :
385 answer.features.append(f._shift(-start))
387 #Slice all the values to match the sliced sequence
388 #(this should also work with strides, even negative strides):
389 for key, value in self.letter_annotations.iteritems() :
390 answer._per_letter_annotations[key] = value[index]
393 raise ValueError, "Invalid index"
396 """Iterate over the letters in the sequence.
398 For example, using Bio.SeqIO to read in a protein FASTA file:
400 >>> from Bio import SeqIO
401 >>> record = SeqIO.read(open("Amino/loveliesbleeding.pro"),"fasta")
402 >>> for amino in record :
404 ... if amino == "L" : break
409 >>> print record.seq[3]
412 This is just a shortcut for iterating over the sequence directly:
414 >>> for amino in record.seq :
416 ... if amino == "L" : break
421 >>> print record.seq[3]
424 Note that this does not facilitate iteration together with any
425 per-letter-annotation. However, you can achieve that using the
426 python zip function on the record (or its sequence) and the relevant
427 per-letter-annotation:
429 >>> from Bio import SeqIO
430 >>> rec = SeqIO.read(open("Quality/solexa.fastq", "rU"),
432 >>> print rec.id, rec.seq
433 slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
434 >>> print rec.letter_annotations.keys()
436 >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]) :
443 You may agree that using zip(rec.seq, ...) is more explicit than using
444 zip(rec, ...) as shown above.
446 return iter(self.seq)
449 """A human readable summary of the record and its annotation (string).
451 The python built in function str works by calling the object's ___str__
454 >>> from Bio.Seq import Seq
455 >>> from Bio.SeqRecord import SeqRecord
456 >>> from Bio.Alphabet import IUPAC
457 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
459 ... id="YP_025292.1", name="HokC",
460 ... description="toxic membrane protein, small")
461 >>> print str(record)
464 Description: toxic membrane protein, small
465 Number of features: 0
466 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
468 In this example you don't actually need to call str explicity, as the
469 print command does this automatically:
474 Description: toxic membrane protein, small
475 Number of features: 0
476 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
478 Note that long sequences are shown truncated.
481 if self.id : lines.append("ID: %s" % self.id)
482 if self.name : lines.append("Name: %s" % self.name)
483 if self.description : lines.append("Description: %s" % self.description)
484 if self.dbxrefs : lines.append("Database cross-references: " \
485 + ", ".join(self.dbxrefs))
486 lines.append("Number of features: %i" % len(self.features))
487 for a in self.annotations:
488 lines.append("/%s=%s" % (a, str(self.annotations[a])))
489 if self.letter_annotations :
490 lines.append("Per letter annotation for: " \
491 + ", ".join(self.letter_annotations.keys()))
492 #Don't want to include the entire sequence,
493 #and showing the alphabet is useful:
494 lines.append(repr(self.seq))
495 return "\n".join(lines)
498 """A concise summary of the record for debugging (string).
500 The python built in function repr works by calling the object's ___repr__
503 >>> from Bio.Seq import Seq
504 >>> from Bio.SeqRecord import SeqRecord
505 >>> from Bio.Alphabet import generic_protein
506 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
507 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
508 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
509 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF",
510 ... generic_protein),
511 ... id="NP_418483.1", name="b4059",
512 ... description="ssDNA-binding protein",
513 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
515 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
517 At the python prompt you can also use this shorthand:
520 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
522 Note that long sequences are shown truncated.
524 return self.__class__.__name__ \
525 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \
526 % tuple(map(repr, (self.seq, self.id, self.name,
527 self.description, self.dbxrefs)))
529 def format(self, format) :
530 r"""Returns the record as a string in the specified file format.
532 The format should be a lower case string supported as an output
533 format by Bio.SeqIO, which is used to turn the SeqRecord into a
536 >>> from Bio.Seq import Seq
537 >>> from Bio.SeqRecord import SeqRecord
538 >>> from Bio.Alphabet import IUPAC
539 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
541 ... id="YP_025292.1", name="HokC",
542 ... description="toxic membrane protein")
543 >>> record.format("fasta")
544 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n'
545 >>> print record.format("fasta")
546 >YP_025292.1 toxic membrane protein
547 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
550 The python print command automatically appends a new line, meaning
551 in this example a blank line is shown. If you look at the string
552 representation you can see there is a trailing new line (shown as
553 slash n) which is important when writing to a file or if
554 concatenating mutliple sequence strings together.
556 Note that this method will NOT work on every possible file format
557 supported by Bio.SeqIO (e.g. some are for multiple sequences only).
559 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101
560 #See also the Bio.Align.Generic.Alignment class and its format()
561 return self.__format__(format)
563 def __format__(self, format_spec) :
564 """Returns the record as a string in the specified file format.
566 This method supports the python format() function added in
567 Python 2.6/3.0. The format_spec should be a lower case
568 string supported by Bio.SeqIO as an output file format.
569 See also the SeqRecord's format() method.
572 from StringIO import StringIO
573 from Bio import SeqIO
575 SeqIO.write([self], handle, format_spec)
576 return handle.getvalue()
578 #Follow python convention and default to using __str__
582 """Returns the length of the sequence.
584 For example, using Bio.SeqIO to read in a FASTA nucleotide file:
586 >>> from Bio import SeqIO
587 >>> record = SeqIO.read(open("Nucleic/sweetpea.nu"),"fasta")
595 def __nonzero__(self) :
596 """Returns True regardless of the length of the sequence.
598 This behaviour is for backwards compatibility, since until the
599 __len__ method was added, a SeqRecord always evaluated as True.
601 Note that in comparison, a Seq object will evaluate to False if it
602 has a zero length sequence.
604 WARNING: The SeqRecord may in future evaluate to False when its
605 sequence is of zero length (in order to better match the Seq
611 """Run the Bio.SeqRecord module's doctests (PRIVATE).
613 This will try and locate the unit tests directory, and run the doctests
614 from there in order that the relative paths used in the examples work.
618 if os.path.isdir(os.path.join("..","Tests")) :
619 print "Runing doctests..."
620 cur_dir = os.path.abspath(os.curdir)
621 os.chdir(os.path.join("..","Tests"))
627 if __name__ == "__main__":