binaries/src/disembl/biopython-1.50/Bio/SeqRecord.py

   1 # Copyright 2000-2002 Andrew Dalke.
   2 # Copyright 2002-2004 Brad Chapman.
   3 # Copyright 2006-2009 by Peter Cock.
   4 # All rights reserved.
   5 # This code is part of the Biopython distribution and governed by its
   6 # license.  Please see the LICENSE file that should have been included
   7 # as part of this package.
   8 """Represent a Sequence Record, a sequence with annotation."""
   9 __docformat__ = "epytext en" #Simple markup to show doctests nicely
  10
# NEEDS TO BE KEPT IN SYNC WITH THE REST OF BIOPYTHON AND BIOPERL
  12 # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes
  13 # need to be in sync (this is the BioSQL "Database SeqRecord", see
  14 # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class)
  15
  16 class _RestrictedDict(dict):
  17     """Dict which only allows sequences of given length as values (PRIVATE).
  18
  19     This simple subclass of the python dictionary is used in the SeqRecord
  20     object for holding per-letter-annotations.  This class is intended to
  21     prevent simple errors by only allowing python sequences (e.g. lists,
  22     strings and tuples) to be stored, and only if their length matches that
  23     expected (the length of the SeqRecord's seq object).  It cannot however
  24     prevent the entries being edited in situ (for example appending entries
  25     to a list).
  26     """
  27     def __init__(self, length) :
  28         """Create an EMPTY restricted dictionary."""
  29         dict.__init__(self)
  30         self._length = int(length)
  31     def __setitem__(self, key, value) :
  32         if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \
  33         or len(value) != self._length :
  34             raise TypeError("We only allow python sequences (lists, tuples or "
  35                             "strings) of length %i." % self._length)
  36         dict.__setitem__(self, key, value)
  37
  38 class SeqRecord(object):
  39     """A SeqRecord object holds a sequence and information about it.
  40
  41     Main attributes:
  42      - id          - Identifier such as a locus tag (string)
  43      - seq         - The sequence itself (Seq object)
  44
  45     Additional attributes:
  46      - name        - Sequence name, e.g. gene name (string)
  47      - description - Additional text (string)
  48      - dbxrefs     - List of database cross references (list of strings)
  49      - features    - Any (sub)features defined (list of SeqFeature objects)
  50      - annotations - Further information about the whole sequence (dictionary)
  51                      Most entries are lists of strings.
  52      - letter_annotations - Per letter/symbol annotation (restricted
  53                      dictionary). This holds python sequences (lists, strings
  54                      or tuples) whose length matches that of the sequence.
  55                      A typical use would be to hold a list of integers
  56                      representing sequencing quality scores, or a string
  57                      representing the secondary structure.
  58
  59     You will typically use Bio.SeqIO to read in sequences from files as
  60     SeqRecord objects.  However, you may want to create your own SeqRecord
  61     objects directly (see the __init__ method for further details):
  62
  63     >>> from Bio.Seq import Seq
  64     >>> from Bio.SeqRecord import SeqRecord
  65     >>> from Bio.Alphabet import IUPAC
  66     >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
  67     ...                         IUPAC.protein),
  68     ...                    id="YP_025292.1", name="HokC",
  69     ...                    description="toxic membrane protein")
  70     >>> print record
  71     ID: YP_025292.1
  72     Name: HokC
  73     Description: toxic membrane protein
  74     Number of features: 0
  75     Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
  76
  77     If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
  78     for this.  For the special case where you want the SeqRecord turned into
  79     a string in a particular file format there is a format method which uses
  80     Bio.SeqIO internally:
  81
  82     >>> print record.format("fasta")
  83     >YP_025292.1 toxic membrane protein
  84     MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
  85     <BLANKLINE>
  86     """
  87     def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
  88                  description = "<unknown description>", dbxrefs = None,
  89                  features = None):
  90         """Create a SeqRecord.
  91
  92         Arguments:
  93          - seq         - Sequence, required (Seq or Mutable object)
  94          - id          - Sequence identifier, recommended (string)
  95          - name        - Sequence name, optional (string)
  96          - description - Sequence description, optional (string)
  97          - dbxrefs     - Database cross references, optional (list of strings)
  98          - features    - Any (sub)features, optional (list of SeqFeature objects)
  99
 100         You will typically use Bio.SeqIO to read in sequences from files as
 101         SeqRecord objects.  However, you may want to create your own SeqRecord
 102         objects directly.
 103
 104         Note that while an id is optional, we strongly recommend you supply a
 105         unique id string for each record.  This is especially important
 106         if you wish to write your sequences to a file.
 107
 108         If you don't have the actual sequence, but you do know its length,
 109         then using the UnknownSeq object from Bio.Seq is appropriate.
 110
 111         You can create a 'blank' SeqRecord object, and then populate the
 112         attributes later.  Note that currently the annotations and the
 113         letter_annotations dictionaries cannot be specified when creating
 114         the SeqRecord.
 115         """
 116         if id is not None and not isinstance(id, basestring) :
 117             #Lots of existing code uses id=None... this may be a bad idea.
 118             raise TypeError("id argument should be a string")
 119         if not isinstance(name, basestring) :
 120             raise TypeError("name argument should be a string")
 121         if not isinstance(description, basestring) :
 122             raise TypeError("description argument should be a string")
 123         if dbxrefs is not None and not isinstance(dbxrefs, list) :
 124             raise TypeError("dbxrefs argument should be a list (of strings)")
 125         if features is not None and not isinstance(features, list) :
 126             raise TypeError("features argument should be a list (of SeqFeature objects)")
 127         self._seq = seq
 128         self.id = id
 129         self.name = name
 130         self.description = description
 131         if dbxrefs is None:
 132             dbxrefs = []
 133         self.dbxrefs = dbxrefs
 134         # annotations about the whole sequence
 135         self.annotations = {}
 136
 137         # annotations about each letter in the sequence
 138         if seq is None :
 139             #Should we allow this and use a normal unrestricted dict?
 140             self._per_letter_annotations = _RestrictedDict(length=0)
 141         else :
 142             try :
 143                 self._per_letter_annotations = _RestrictedDict(length=len(seq))
 144             except :
 145                 raise TypeError("seq argument should be a Seq or MutableSeq")
 146
 147         # annotations about parts of the sequence
 148         if features is None:
 149             features = []
 150         self.features = features
 151
 152     #TODO - Just make this a read only property?
 153     def _set_per_letter_annotations(self, value) :
 154         if not isinstance(value, dict) :
 155             raise TypeError("The per-letter-annotations should be a "
 156                             "(restricted) dictionary.")
 157         #Turn this into a restricted-dictionary (and check the entries)
 158         try :
 159             self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
 160         except AttributeError :
 161             #e.g. seq is None
 162             self._per_letter_annotations = _RestrictedDict(length=0)
 163         self._per_letter_annotations.update(value)
 164     letter_annotations = property( \
 165         fget=lambda self : self._per_letter_annotations,
 166         fset=_set_per_letter_annotations,
 167         doc="""Dictionary of per-letter-annotation for the sequence.
 168
 169         For example, this can hold quality scores used in FASTQ or QUAL files.
 170         Consider this example using Bio.SeqIO to read in an example Solexa
 171         variant FASTQ file as a SeqRecord:
 172
 173         >>> from Bio import SeqIO
 174         >>> handle = open("Quality/solexa.fastq", "rU")
 175         >>> record = SeqIO.read(handle, "fastq-solexa")
 176         >>> handle.close()
 177         >>> print record.id, record.seq
 178         slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
 179         >>> print record.letter_annotations.keys()
 180         ['solexa_quality']
 181         >>> print record.letter_annotations["solexa_quality"]
 182         [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
 183
 184         The per-letter-annotaions get sliced automatically if you slice the
 185         parent SeqRecord, for example taking the last ten bases:
 186
 187         >>> sub_record = record[-10:]
 188         >>> print sub_record.id, sub_record.seq
 189         slxa_0013_1_0001_24 CTTATACACC
 190         >>> print sub_record.letter_annotations["solexa_quality"]
 191         [-6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
 192
 193         Any python sequence (i.e. list, tuple or string) can be recorded in
 194         the SeqRecord's letter_annotations dictionary as long as the length
 195         matches that of the SeqRecord's sequence.  e.g.
 196
 197         >>> len(sub_record.letter_annotations)
 198         1
 199         >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
 200         >>> len(sub_record.letter_annotations)
 201         2
 202
 203         You can delete entries from the letter_annotations dictionary as usual:
 204
 205         >>> del sub_record.letter_annotations["solexa_quality"]
 206         >>> sub_record.letter_annotations
 207         {'dummy': 'abcdefghij'}
 208
 209         You can completely clear the dictionary easily as follows:
 210
 211         >>> sub_record.letter_annotations = {}
 212         >>> sub_record.letter_annotations
 213         {}
 214         """)
 215
 216     def _set_seq(self, value) :
 217         #TODO - Add a deprecation warning that the seq should be write only?
 218         if self._per_letter_annotations :
 219             #TODO - Make this a warning? Silently empty the dictionary?
 220             raise ValueError("You must empty the letter annotations first!")
 221         self._seq = value
 222         try :
 223             self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
 224         except AttributeError :
 225             #e.g. seq is None
 226             self._per_letter_annotations = _RestrictedDict(length=0)
 227
 228     seq = property(fget=lambda self : self._seq,
 229                    fset=_set_seq,
 230                    doc="The sequence itself, as a Seq or MutableSeq object.")
 231
 232     def __getitem__(self, index) :
 233         """Returns a sub-sequence or an individual letter.
 234
 235         Splicing, e.g. my_record[5:10], returns a new SeqRecord for
 236         that sub-sequence with approriate annotation preserved.  The
 237         name, id and description are kept.
 238
 239         Any per-letter-annotations are sliced to match the requested
 240         sub-sequence.  Unless a stride is used, all those features
 241         which fall fully within the subsequence are included (with
 242         their locations adjusted accordingly).
 243
 244         However, the annotations dictionary and the dbxrefs list are
 245         not used for the new SeqRecord, as in general they may not
 246         apply to the subsequence.  If you want to preserve them, you
 247         must explictly copy them to the new SeqRecord yourself.
 248
 249         Using an integer index, e.g. my_record[5] is shorthand for
 250         extracting that letter from the sequence, my_record.seq[5].
 251
 252         For example, consider this short protein and its secondary
 253         structure as encoded by the PDB (e.g. H for alpha helices),
 254         plus a simple feature for its histidine self phosphorylation
 255         site:
 256
 257         >>> from Bio.Seq import Seq
 258         >>> from Bio.SeqRecord import SeqRecord
 259         >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
 260         >>> from Bio.Alphabet import IUPAC
 261         >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
 262         ...                     "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR",
 263         ...                     IUPAC.protein),
 264         ...                 id="1JOY", name="EnvZ",
 265         ...                 description="Homodimeric domain of EnvZ from E. coli")
 266         >>> rec.letter_annotations["secondary_structure"] = \
 267             "  S  SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT  "
 268         >>> rec.features.append(SeqFeature(FeatureLocation(20,21),
 269         ...                     type = "Site"))
 270
 271         Now let's have a quick look at the full record,
 272
 273         >>> print rec
 274         ID: 1JOY
 275         Name: EnvZ
 276         Description: Homodimeric domain of EnvZ from E. coli
 277         Number of features: 1
 278         Per letter annotation for: secondary_structure
 279         Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
 280         >>> print rec.letter_annotations["secondary_structure"]
 281           S  SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT
 282         >>> print rec.features[0].location
 283         [20:21]
 284
 285         Now let's take a sub sequence, here chosen as the first (fractured)
 286         alpha helix which includes the histidine phosphorylation site:
 287
 288         >>> sub = rec[11:41]
 289         >>> print sub
 290         ID: 1JOY
 291         Name: EnvZ
 292         Description: Homodimeric domain of EnvZ from E. coli
 293         Number of features: 1
 294         Per letter annotation for: secondary_structure
 295         Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein())
 296         >>> print sub.letter_annotations["secondary_structure"]
 297         HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH
 298         >>> print sub.features[0].location
 299         [9:10]
 300
 301         You can also of course omit the start or end values, for
 302         example to get the first ten letters only:
 303
 304         >>> print rec[:10]
 305         ID: 1JOY
 306         Name: EnvZ
 307         Description: Homodimeric domain of EnvZ from E. coli
 308         Number of features: 0
 309         Per letter annotation for: secondary_structure
 310         Seq('MAAGVKQLAD', IUPACProtein())
 311
 312         Or for the last ten letters:
 313
 314         >>> print rec[-10:]
 315         ID: 1JOY
 316         Name: EnvZ
 317         Description: Homodimeric domain of EnvZ from E. coli
 318         Number of features: 0
 319         Per letter annotation for: secondary_structure
 320         Seq('IIEQFIDYLR', IUPACProtein())
 321
 322         If you omit both, then you get a copy of the original record (although
 323         lacking the annotations and dbxrefs):
 324
 325         >>> print rec[:]
 326         ID: 1JOY
 327         Name: EnvZ
 328         Description: Homodimeric domain of EnvZ from E. coli
 329         Number of features: 1
 330         Per letter annotation for: secondary_structure
 331         Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
 332
 333         Finally, indexing with a simple integer is shorthand for pulling out
 334         that letter from the sequence directly:
 335
 336         >>> rec[5]
 337         'K'
 338         >>> rec.seq[5]
 339         'K'
 340         """
 341         if isinstance(index, int) :
 342             #NOTE - The sequence level annotation like the id, name, etc
 343             #do not really apply to a single character.  However, should
 344             #we try and expose any per-letter-annotation here?  If so how?
 345             return self.seq[index]
 346         elif isinstance(index, slice) :
 347             if self.seq is None :
 348                 raise ValueError("If the sequence is None, we cannot slice it.")
 349             parent_length = len(self)
 350             answer = self.__class__(self.seq[index],
 351                                     id=self.id,
 352                                     name=self.name,
 353                                     description=self.description)
 354             #TODO - The desription may no longer apply.
 355             #It would be safer to change it to something
 356             #generic like "edited" or the default value.
 357
 358             #Don't copy the annotation dict and dbxefs list,
 359             #they may not apply to a subsequence.
 360             #answer.annotations = dict(self.annotations.iteritems())
 361             #answer.dbxrefs = self.dbxrefs[:]
 362
 363             #TODO - Cope with strides by generating ambiguous locations?
 364             if index.step is None or index.step == 1 :
 365                 #Select relevant features, add them with shifted locations
 366                 if index.start is None :
 367                     start = 0
 368                 else :
 369                     start = index.start
 370                 if index.stop is None :
 371                     stop = -1
 372                 else :
 373                     stop = index.stop
 374                 if (start < 0 or stop < 0) and parent_length == 0 :
 375                     raise ValueError, \
 376                           "Cannot support negative indices without the sequence length"
 377                 if start < 0 :
 378                     start = parent_length - start
 379                 if stop < 0  :
 380                     stop  = parent_length - stop + 1
 381                 #assert str(self.seq)[index] == str(self.seq)[start:stop]
 382                 for f in self.features :
 383                     if start <= f.location.start.position \
 384                     and f.location.end.position < stop :
 385                         answer.features.append(f._shift(-start))
 386
 387             #Slice all the values to match the sliced sequence
 388             #(this should also work with strides, even negative strides):
 389             for key, value in self.letter_annotations.iteritems() :
 390                 answer._per_letter_annotations[key] = value[index]
 391
 392             return answer
 393         raise ValueError, "Invalid index"
 394
 395     def __iter__(self) :
 396         """Iterate over the letters in the sequence.
 397
 398         For example, using Bio.SeqIO to read in a protein FASTA file:
 399
 400         >>> from Bio import SeqIO
 401         >>> record = SeqIO.read(open("Amino/loveliesbleeding.pro"),"fasta")
 402         >>> for amino in record :
 403         ...     print amino
 404         ...     if amino == "L" : break
 405         X
 406         A
 407         G
 408         L
 409         >>> print record.seq[3]
 410         L
 411
 412         This is just a shortcut for iterating over the sequence directly:
 413
 414         >>> for amino in record.seq :
 415         ...     print amino
 416         ...     if amino == "L" : break
 417         X
 418         A
 419         G
 420         L
 421         >>> print record.seq[3]
 422         L
 423
 424         Note that this does not facilitate iteration together with any
 425         per-letter-annotation.  However, you can achieve that using the
 426         python zip function on the record (or its sequence) and the relevant
 427         per-letter-annotation:
 428
 429         >>> from Bio import SeqIO
 430         >>> rec = SeqIO.read(open("Quality/solexa.fastq", "rU"),
 431         ...                  "fastq-solexa")
 432         >>> print rec.id, rec.seq
 433         slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
 434         >>> print rec.letter_annotations.keys()
 435         ['solexa_quality']
 436         >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]) :
 437         ...     if qual < -10 :
 438         ...         print nuc, qual
 439         C -19
 440         C -27
 441         C -18
 442
 443         You may agree that using zip(rec.seq, ...) is more explicit than using
 444         zip(rec, ...) as shown above.
 445         """
 446         return iter(self.seq)
 447
 448     def __str__(self) :
 449         """A human readable summary of the record and its annotation (string).
 450
 451         The python built in function str works by calling the object's ___str__
 452         method.  e.g.
 453
 454         >>> from Bio.Seq import Seq
 455         >>> from Bio.SeqRecord import SeqRecord
 456         >>> from Bio.Alphabet import IUPAC
 457         >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
 458         ...                         IUPAC.protein),
 459         ...                    id="YP_025292.1", name="HokC",
 460         ...                    description="toxic membrane protein, small")
 461         >>> print str(record)
 462         ID: YP_025292.1
 463         Name: HokC
 464         Description: toxic membrane protein, small
 465         Number of features: 0
 466         Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
 467
 468         In this example you don't actually need to call str explicity, as the
 469         print command does this automatically:
 470
 471         >>> print record
 472         ID: YP_025292.1
 473         Name: HokC
 474         Description: toxic membrane protein, small
 475         Number of features: 0
 476         Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
 477
 478         Note that long sequences are shown truncated.
 479         """
 480         lines = []
 481         if self.id : lines.append("ID: %s" % self.id)
 482         if self.name : lines.append("Name: %s" % self.name)
 483         if self.description : lines.append("Description: %s" % self.description)
 484         if self.dbxrefs : lines.append("Database cross-references: " \
 485                                        + ", ".join(self.dbxrefs))
 486         lines.append("Number of features: %i" % len(self.features))
 487         for a in self.annotations:
 488             lines.append("/%s=%s" % (a, str(self.annotations[a])))
 489         if self.letter_annotations :
 490             lines.append("Per letter annotation for: " \
 491                          + ", ".join(self.letter_annotations.keys()))
 492         #Don't want to include the entire sequence,
 493         #and showing the alphabet is useful:
 494         lines.append(repr(self.seq))
 495         return "\n".join(lines)
 496
 497     def __repr__(self) :
 498         """A concise summary of the record for debugging (string).
 499
 500         The python built in function repr works by calling the object's ___repr__
 501         method.  e.g.
 502
 503         >>> from Bio.Seq import Seq
 504         >>> from Bio.SeqRecord import SeqRecord
 505         >>> from Bio.Alphabet import generic_protein
 506         >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
 507         ...                    +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
 508         ...                    +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
 509         ...                    +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF",
 510         ...                    generic_protein),
 511         ...                 id="NP_418483.1", name="b4059",
 512         ...                 description="ssDNA-binding protein",
 513         ...                 dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
 514         >>> print repr(rec)
 515         SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
 516
 517         At the python prompt you can also use this shorthand:
 518
 519         >>> rec
 520         SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
 521
 522         Note that long sequences are shown truncated.
 523         """
 524         return self.__class__.__name__ \
 525          + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \
 526          % tuple(map(repr, (self.seq, self.id, self.name,
 527                             self.description, self.dbxrefs)))
 528
 529     def format(self, format) :
 530         r"""Returns the record as a string in the specified file format.
 531
 532         The format should be a lower case string supported as an output
 533         format by Bio.SeqIO, which is used to turn the SeqRecord into a
 534         string.  e.g.
 535
 536         >>> from Bio.Seq import Seq
 537         >>> from Bio.SeqRecord import SeqRecord
 538         >>> from Bio.Alphabet import IUPAC
 539         >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
 540         ...                         IUPAC.protein),
 541         ...                    id="YP_025292.1", name="HokC",
 542         ...                    description="toxic membrane protein")
 543         >>> record.format("fasta")
 544         '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n'
 545         >>> print record.format("fasta")
 546         >YP_025292.1 toxic membrane protein
 547         MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
 548         <BLANKLINE>
 549
 550         The python print command automatically appends a new line, meaning
 551         in this example a blank line is shown.  If you look at the string
 552         representation you can see there is a trailing new line (shown as
 553         slash n) which is important when writing to a file or if
 554         concatenating mutliple sequence strings together.
 555
 556         Note that this method will NOT work on every possible file format
 557         supported by Bio.SeqIO (e.g. some are for multiple sequences only).
 558         """
 559         #See also the __format__ added for Python 2.6 / 3.0, PEP 3101
 560         #See also the Bio.Align.Generic.Alignment class and its format()
 561         return self.__format__(format)
 562
 563     def __format__(self, format_spec) :
 564         """Returns the record as a string in the specified file format.
 565
 566         This method supports the python format() function added in
 567         Python 2.6/3.0.  The format_spec should be a lower case
 568         string supported by Bio.SeqIO as an output file format.
 569         See also the SeqRecord's format() method.
 570         """
 571         if format_spec:
 572             from StringIO import StringIO
 573             from Bio import SeqIO
 574             handle = StringIO()
 575             SeqIO.write([self], handle, format_spec)
 576             return handle.getvalue()
 577         else :
 578             #Follow python convention and default to using __str__
 579             return str(self)
 580
 581     def __len__(self) :
 582         """Returns the length of the sequence.
 583
 584         For example, using Bio.SeqIO to read in a FASTA nucleotide file:
 585
 586         >>> from Bio import SeqIO
 587         >>> record = SeqIO.read(open("Nucleic/sweetpea.nu"),"fasta")
 588         >>> len(record)
 589         309
 590         >>> len(record.seq)
 591         309
 592         """
 593         return len(self.seq)
 594
 595     def __nonzero__(self) :
 596         """Returns True regardless of the length of the sequence.
 597
 598         This behaviour is for backwards compatibility, since until the
 599         __len__ method was added, a SeqRecord always evaluated as True.
 600
 601         Note that in comparison, a Seq object will evaluate to False if it
 602         has a zero length sequence.
 603
 604         WARNING: The SeqRecord may in future evaluate to False when its
 605         sequence is of zero length (in order to better match the Seq
 606         object behaviour)!
 607         """
 608         return True
 609
 610 def _test():
 611     """Run the Bio.SeqRecord module's doctests (PRIVATE).
 612
 613     This will try and locate the unit tests directory, and run the doctests
 614     from there in order that the relative paths used in the examples work.
 615     """
 616     import doctest
 617     import os
 618     if os.path.isdir(os.path.join("..","Tests")) :
 619         print "Runing doctests..."
 620         cur_dir = os.path.abspath(os.curdir)
 621         os.chdir(os.path.join("..","Tests"))
 622         doctest.testmod()
 623         os.chdir(cur_dir)
 624         del cur_dir
 625         print "Done"
 626
# Run the doctests only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    _test()