SeqIO/__init__.py

   1 # Copyright 2006-2008 by Peter Cock.  All rights reserved.
   2 # This code is part of the Biopython distribution and governed by its
   3 # license.  Please see the LICENSE file that should have been included
   4 # as part of this package.
   5 #
   6 #Nice link:
   7 # http://www.ebi.ac.uk/help/formats_frame.html
   8
   9 """Sequence input/output as SeqRecord objects.
  10
  11 Bio.SeqIO is also documented at U{http://biopython.org/wiki/SeqIO} and by
  12 a whole chapter in our tutorial:
  13  - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
  14  - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
  15
  16 Input
  17 =====
  18 The main function is Bio.SeqIO.parse(...) which takes an input file handle,
  19 and format string.  This returns an iterator giving SeqRecord objects:
  20
  21     >>> from Bio import SeqIO
  22     >>> handle = open("Fasta/f002", "rU")
  23     >>> for record in SeqIO.parse(handle, "fasta") :
  24     ...     print record.id, len(record)
  25     gi|1348912|gb|G26680|G26680 633
  26     gi|1348917|gb|G26685|G26685 413
  27     gi|1592936|gb|G29385|G29385 471
  28     >>> handle.close()
  29
  30 Note that the parse() function will all invoke the relevant parser for the
  31 format with its default settings.  You may want more control, in which case
  32 you need to create a format specific sequence iterator directly.
  33
  34 For non-interlaced files (e.g. Fasta, GenBank, EMBL) with multiple records
  35 using a sequence iterator can save you a lot of memory (RAM).  There is
  36 less benefit for interlaced file formats (e.g. most multiple alignment file
  37 formats).  However, an iterator only lets you access the records one by one.
  38
  39 If you want random access to the records by number, turn this into a list:
  40
  41     >>> from Bio import SeqIO
  42     >>> handle = open("Fasta/f002", "rU")
  43     >>> records = list(SeqIO.parse(handle, "fasta"))
  44     >>> handle.close()
  45     >>> print records[1].id
  46     gi|1348917|gb|G26685|G26685
  47
  48 If you want random access to the records by a key such as the record id,
  49 turn the iterator into a dictionary:
  50
  51     >>> from Bio import SeqIO
  52     >>> handle = open("Fasta/f002", "rU")
  53     >>> record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
  54     >>> handle.close()
  55     >>> print len(record_dict["gi|1348917|gb|G26685|G26685"])
  56     413
  57
  58 If you expect your file to contain one-and-only-one record, then we provide
  59 the following 'helper' function which will return a single SeqRecord, or
  60 raise an exception if there are no records or more than one record:
  61
  62     >>> from Bio import SeqIO
  63     >>> handle = open("Fasta/f001", "rU")
  64     >>> record = SeqIO.read(handle, "fasta")
  65     >>> handle.close()
  66     >>> print record.id, len(record)
  67     gi|3318709|pdb|1A91| 79
  68
  69 This style is useful when you expect a single record only (and would
  70 consider multiple records an error).  For example, when dealing with GenBank
  71 files for bacterial genomes or chromosomes, there is normally only a single
  72 record.  Alternatively, use this with a handle when download a single record
  73 from the internet.
  74
  75 However, if you just want the first record from a file containing multiple
  76 record, use the iterator's next() method:
  77
  78     >>> from Bio import SeqIO
  79     >>> handle = open("Fasta/f002", "rU")
  80     >>> record = SeqIO.parse(handle, "fasta").next()
  81     >>> handle.close()
  82     >>> print record.id, len(record)
  83     gi|1348912|gb|G26680|G26680 633
  84
  85 The above code will work as long as the file contains at least one record.
  86 Note that if there is more than one record, the remaining records will be
  87 silently ignored.
  88
  89 Input - Alignments
  90 ==================
  91 You can read in alignment files as Alignment objects using Bio.AlignIO.
  92 Alternatively, reading in an alignment file format via Bio.SeqIO will give
  93 you a SeqRecord for each row of each alignment:
  94
  95     >>> from Bio import SeqIO
  96     >>> handle = open("Clustalw/hedgehog.aln", "rU")
  97     >>> for record in SeqIO.parse(handle, "clustal") :
  98     ...     print record.id, len(record)
  99     gi|167877390|gb|EDS40773.1| 447
 100     gi|167234445|ref|NP_001107837. 447
 101     gi|74100009|gb|AAZ99217.1| 447
 102     gi|13990994|dbj|BAA33523.2| 447
 103     gi|56122354|gb|AAV74328.1| 447
 104     >>> handle.close()
 105
 106 Output
 107 ======
 108 Use the function Bio.SeqIO.write(...), which takes a complete set of
 109 SeqRecord objects (either as a list, or an iterator), an output file handle
 110 and of course the file format::
 111
 112     from Bio import SeqIO
 113     records = ...
 114     handle = open("example.faa", "w")
 115     SeqIO.write(records, handle, "fasta")
 116     handle.close()
 117
 118 In general, you are expected to call this function once (with all your
 119 records) and then close the file handle.
 120
 121 Output - Advanced
 122 =================
 123 The effect of calling write() multiple times on a single file will vary
 124 depending on the file format, and is best avoided unless you have a strong
 125 reason to do so.
 126
 127 Trying this for certain alignment formats (e.g. phylip, clustal, stockholm)
 128 would have the effect of concatenating several multiple sequence alignments
 129 together.  Such files are created by the PHYLIP suite of programs for
 130 bootstrap analysis.
 131
 132 For sequential files formats (e.g. fasta, genbank) each "record block" holds
 133 a single sequence.  For these files it would probably be safe to call
 134 write() multiple times.
 135
 136 File Formats
 137 ============
 138 When specifying the file format, use lowercase strings.  The same format
 139 names are also used in Bio.AlignIO and include the following:
 140
 141  - ace     - Reads the contig sequences from an ACE assembly file.
 142  - embl    - The EMBL flat file format. Uses Bio.GenBank internally.
 143  - fasta   - The generic sequence file format where each record starts with
 144              an identifer line starting with a ">" character, followed by
 145              lines of sequence.
 146  - fastq   - A "FASTA like" format used by Sanger which also stores PHRED
 147              sequence quality values.
 148  - fastq-solexa - The Solexa/Illumnia variant of the Sanger FASTQ format which
 149                   encodes Solexa quality scores (not PHRED quality scores).
 150  - genbank - The GenBank or GenPept flat file format.
 151  - gb      - An alias for "genbank", for consistency with NCBI Entrez Utilities
 152  - ig      - The IntelliGenetics file format, apparently the same as the
 153              MASE alignment format.
 154  - phd     - Output from PHRED, used by PHRAP and CONSED for input.
 155  - pir     - A "FASTA like" format introduced by the National Biomedical
 156              Research Foundation (NBRF) for the Protein Information Resource
 157              (PIR) database, now part of UniProt.
 158  - swiss   - Plain text Swiss-Prot aka UniProt format.
 159  - tab     - Simple two column tab separated sequence files, where each
 160              line holds a record's identifier and sequence. For example,
 161              this is used as by Aligent's eArray software when saving
 162              microarray probes in a minimal tab delimited text file.
 163  - qual    - A "FASTA like" format holding PHRED quality values from
 164              sequencing DNA, but no actual sequences (usually provided
 165              in separate FASTA files).
 166
 167 Note that while Bio.SeqIO can read all the above file formats, it cannot
 168 write to all of them.
 169
 170 You can also use any file format supported by Bio.AlignIO, such as "nexus",
 171 "phlip" and "stockholm", which gives you access to the individual sequences
 172 making up each alignment as SeqRecords.
 173 """
 174 __docformat__ = "epytext en" #not just plaintext
 175
 176 #TODO
 177 # - define policy on reading aligned sequences with gaps in
 178 #   (e.g. - and . characters) including how the alphabet interacts
 179 #
 180 # - Can we build the to_alignment(...) functionality
 181 #   into the generic Alignment class instead?
 182 #
 183 # - How best to handle unique/non unique record.id when writing.
 184 #   For most file formats reading such files is fine; The stockholm
 185 #   parser would fail.
 186 #
 187 # - MSF multiple alignment format, aka GCG, aka PileUp format (*.msf)
 188 #   http://www.bioperl.org/wiki/MSF_multiple_alignment_format
 189
 190 """
 191 FAO BioPython Developers
 192 ========================
 193 The way I envision this SeqIO system working as that for any sequence file
 194 format we have an iterator that returns SeqRecord objects.
 195
 196 This also applies to interlaced fileformats (like clustal - although that
 197 is now handled via Bio.AlignIO instead) where the file cannot be read record
 198 by record.  You should still return an iterator, even if the implementation
 199 could just as easily return a list.
 200
 201 These file format specific sequence iterators may be implemented as:
 202 * Classes which take a handle for __init__ and provide the __iter__ method
 203 * Functions that take a handle, and return an iterator object
 204 * Generator functions that take a handle, and yield SeqRecord objects
 205
 206 It is then trivial to turn this iterator into a list of SeqRecord objects,
 207 an in memory dictionary, or a multiple sequence alignment object.
 208
 209 For building the dictionary by default the id propery of each SeqRecord is
 210 used as the key.  You should always populate the id property, and it should
 211 be unique in most cases. For some file formats the accession number is a good
 212 choice.  If the file itself contains ambiguous identifiers, don't try and
 213 dis-ambiguate them - return them as is.
 214
 215 When adding a new file format, please use the same lower case format name
 216 as BioPerl, or if they have not defined one, try the names used by EMBOSS.
 217
 218 See also http://biopython.org/wiki/SeqIO_dev
 219
 220 --Peter
 221 """
 222
 223 import os
 224 from Bio.Seq import Seq
 225 from Bio.SeqRecord import SeqRecord
 226 from Bio.Align.Generic import Alignment
 227 from Bio.Alphabet import Alphabet, AlphabetEncoder, _get_base_alphabet
 228
 229 import AceIO
 230 import FastaIO
 231 import IgIO #IntelliGenetics or MASE format
 232 import InsdcIO #EMBL and GenBank
 233 import PhdIO
 234 import PirIO
 235 import SwissIO
 236 import TabIO
 237 import QualityIO #FastQ and qual files
 238
 239
 240 #Convention for format names is "mainname-subtype" in lower case.
 241 #Please use the same names as BioPerl where possible.
 242 #
 243 #Note that this simple system copes with defining
 244 #multiple possible iterators for a given format/extension
 245 #with the -subtype suffix
 246 #
 247 #Most alignment file formats will be handled via Bio.AlignIO
 248
 249 _FormatToIterator ={"fasta" : FastaIO.FastaIterator,
 250                     "gb" : InsdcIO.GenBankIterator,
 251                     "genbank" : InsdcIO.GenBankIterator,
 252                     "genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
 253                     "embl" : InsdcIO.EmblIterator,
 254                     "embl-cds" : InsdcIO.EmblCdsFeatureIterator,
 255                     "ig" : IgIO.IgIterator,
 256                     "swiss" : SwissIO.SwissIterator,
 257                     "phd" : PhdIO.PhdIterator,
 258                     "ace" : AceIO.AceIterator,
 259                     "tab" : TabIO.TabIterator,
 260                     "pir" : PirIO.PirIterator,
 261                     "fastq" : QualityIO.FastqPhredIterator,
 262                     "fastq-solexa" : QualityIO.FastqSolexaIterator,
 263                     "qual" : QualityIO.QualPhredIterator,
 264                     }
 265
 266 _FormatToWriter ={"fasta" : FastaIO.FastaWriter,
 267                   "gb" : InsdcIO.GenBankWriter,
 268                   "genbank" : InsdcIO.GenBankWriter,
 269                   "tab" : TabIO.TabWriter,
 270                   "fastq" : QualityIO.FastqPhredWriter,
 271                   "fastq-solexa" : QualityIO.FastqSolexaWriter,
 272                   "qual" : QualityIO.QualPhredWriter,
 273                   }
 274
 275 def write(sequences, handle, format) :
 276     """Write complete set of sequences to a file.
 277
 278      - sequences - A list (or iterator) of SeqRecord objects.
 279      - handle    - File handle object to write to.
 280      - format    - lower case string describing the file format to write.
 281
 282     You should close the handle after calling this function.
 283
 284     Returns the number of records written (as an integer).
 285     """
 286     from Bio import AlignIO
 287
 288     #Try and give helpful error messages:
 289     if isinstance(handle, basestring) :
 290         raise TypeError("Need a file handle, not a string (i.e. not a filename)")
 291     if not isinstance(format, basestring) :
 292         raise TypeError("Need a string for the file format (lower case)")
 293     if not format :
 294         raise ValueError("Format required (lower case string)")
 295     if format != format.lower() :
 296         raise ValueError("Format string '%s' should be lower case" % format)
 297     if isinstance(sequences,SeqRecord):
 298         raise ValueError("Use a SeqRecord list/iterator, not just a single SeqRecord")
 299
 300     #Map the file format to a writer class
 301     if format in _FormatToWriter :
 302         writer_class = _FormatToWriter[format]
 303         count = writer_class(handle).write_file(sequences)
 304     elif format in AlignIO._FormatToWriter :
 305         #Try and turn all the records into a single alignment,
 306         #and write that using Bio.AlignIO
 307         alignment = to_alignment(sequences)
 308         alignment_count = AlignIO.write([alignment], handle, format)
 309         assert alignment_count == 1, "Internal error - the underlying writer " \
 310            + " should have returned 1, not %s" % repr(alignment_count)
 311         count = len(alignment.get_all_seqs())
 312         del alignment_count, alignment
 313     elif format in _FormatToIterator or format in AlignIO._FormatToIterator :
 314         raise ValueError("Reading format '%s' is supported, but not writing" \
 315                          % format)
 316     else :
 317         raise ValueError("Unknown format '%s'" % format)
 318
 319     assert isinstance(count, int), "Internal error - the underlying writer " \
 320            + " should have returned the record count, not %s" % repr(count)
 321     return count
 322
 323 def parse(handle, format, alphabet=None) :
 324     r"""Turns a sequence file into an iterator returning SeqRecords.
 325
 326      - handle   - handle to the file.
 327      - format   - lower case string describing the file format.
 328      - alphabet - optional Alphabet object, useful when the sequence type
 329                   cannot be automatically inferred from the file itself
 330                   (e.g. format="fasta" or "tab")
 331
 332     Typical usage, opening a file to read in, and looping over the record(s):
 333
 334     >>> from Bio import SeqIO
 335     >>> filename = "Nucleic/sweetpea.nu"
 336     >>> for record in SeqIO.parse(open(filename,"rU"), "fasta") :
 337     ...    print "ID", record.id
 338     ...    print "Sequence length", len(record)
 339     ...    print "Sequence alphabet", record.seq.alphabet
 340     ID gi|3176602|gb|U78617.1|LOU78617
 341     Sequence length 309
 342     Sequence alphabet SingleLetterAlphabet()
 343
 344     For file formats like FASTA where the alphabet cannot be determined, it
 345     may be useful to specify the alphabet explicitly:
 346
 347     >>> from Bio import SeqIO
 348     >>> from Bio.Alphabet import generic_dna
 349     >>> filename = "Nucleic/sweetpea.nu"
 350     >>> for record in SeqIO.parse(open(filename,"rU"), "fasta", generic_dna) :
 351     ...    print "ID", record.id
 352     ...    print "Sequence length", len(record)
 353     ...    print "Sequence alphabet", record.seq.alphabet
 354     ID gi|3176602|gb|U78617.1|LOU78617
 355     Sequence length 309
 356     Sequence alphabet DNAAlphabet()
 357
 358     If you have a string 'data' containing the file contents, you must
 359     first turn this into a handle in order to parse it:
 360
 361     >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
 362     >>> from Bio import SeqIO
 363     >>> from StringIO import StringIO
 364     >>> for record in SeqIO.parse(StringIO(data), "fasta") :
 365     ...     print record.id, record.seq
 366     Alpha ACCGGATGTA
 367     Beta AGGCTCGGTTA
 368
 369     Use the Bio.SeqIO.read(handle, format) function when you expect a single
 370     record only.
 371     """
 372     #NOTE - The above docstring has some raw \n characters needed
 373     #for the StringIO example, hense the whole docstring is in raw
 374     #string more (see the leading r before the opening quote).
 375     from Bio import AlignIO
 376
 377     #Try and give helpful error messages:
 378     if isinstance(handle, basestring) :
 379         raise TypeError("Need a file handle, not a string (i.e. not a filename)")
 380     if not isinstance(format, basestring) :
 381         raise TypeError("Need a string for the file format (lower case)")
 382     if not format :
 383         raise ValueError("Format required (lower case string)")
 384     if format != format.lower() :
 385         raise ValueError("Format string '%s' should be lower case" % format)
 386     if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
 387                                      isinstance(alphabet, AlphabetEncoder)) :
 388         raise ValueError("Invalid alphabet, %s" % repr(alphabet))
 389
 390     #Map the file format to a sequence iterator:
 391     if format in _FormatToIterator :
 392         iterator_generator = _FormatToIterator[format]
 393         if alphabet is None :
 394             return iterator_generator(handle)
 395         try :
 396             return iterator_generator(handle, alphabet=alphabet)
 397         except :
 398             return _force_alphabet(iterator_generator(handle), alphabet)
 399     elif format in AlignIO._FormatToIterator :
 400         #Use Bio.AlignIO to read in the alignments
 401         #TODO - Once we drop support for Python 2.3, this helper function can be
 402         #replaced with a generator expression.
 403         return _iterate_via_AlignIO(handle, format, alphabet)
 404     else :
 405         raise ValueError("Unknown format '%s'" % format)
 406
 407 #This is a generator function
 408 def _iterate_via_AlignIO(handle, format, alphabet) :
 409     """Iterate over all records in several alignments (PRIVATE)."""
 410     from Bio import AlignIO
 411     for align in AlignIO.parse(handle, format, alphabet=alphabet) :
 412         for record in align :
 413             yield record
 414
 415 def _force_alphabet(record_iterator, alphabet) :
 416      """Iterate over records, over-riding the alphabet (PRIVATE)."""
 417      #Assume the alphabet argument has been pre-validated
 418      given_base_class = _get_base_alphabet(alphabet).__class__
 419      for record in record_iterator :
 420          if isinstance(_get_base_alphabet(record.seq.alphabet),
 421                        given_base_class) :
 422              record.seq.alphabet = alphabet
 423              yield record
 424          else :
 425              raise ValueError("Specified alphabet %s clashes with "\
 426                               "that determined from the file, %s" \
 427                               % (repr(alphabet), repr(record.seq.alphabet)))
 428
 429 def read(handle, format, alphabet=None) :
 430     """Turns a sequence file into a single SeqRecord.
 431
 432      - handle   - handle to the file.
 433      - format   - string describing the file format.
 434      - alphabet - optional Alphabet object, useful when the sequence type
 435                   cannot be automatically inferred from the file itself
 436                   (e.g. format="fasta" or "tab")
 437
 438     This function is for use parsing sequence files containing
 439     exactly one record.  For example, reading a GenBank file:
 440
 441     >>> from Bio import SeqIO
 442     >>> record = SeqIO.read(open("GenBank/arab1.gb", "rU"), "genbank")
 443     >>> print "ID", record.id
 444     ID AC007323.5
 445     >>> print "Sequence length", len(record)
 446     Sequence length 86436
 447     >>> print "Sequence alphabet", record.seq.alphabet
 448     Sequence alphabet IUPACAmbiguousDNA()
 449
 450     If the handle contains no records, or more than one record,
 451     an exception is raised.  For example:
 452
 453     >>> from Bio import SeqIO
 454     >>> record = SeqIO.read(open("GenBank/cor6_6.gb", "rU"), "genbank")
 455     Traceback (most recent call last):
 456         ...
 457     ValueError: More than one record found in handle
 458
 459     If however you want the first record from a file containing
 460     multiple records this function would raise an exception (as
 461     shown in the example above).  Instead use:
 462
 463     >>> from Bio import SeqIO
 464     >>> record = SeqIO.parse(open("GenBank/cor6_6.gb", "rU"), "genbank").next()
 465     >>> print "First record's ID", record.id
 466     First record's ID X55053.1
 467
 468     Use the Bio.SeqIO.parse(handle, format) function if you want
 469     to read multiple records from the handle.
 470     """
 471     iterator = parse(handle, format, alphabet)
 472     try :
 473         first = iterator.next()
 474     except StopIteration :
 475         first = None
 476     if first is None :
 477         raise ValueError("No records found in handle")
 478     try :
 479         second = iterator.next()
 480     except StopIteration :
 481         second = None
 482     if second is not None :
 483         raise ValueError("More than one record found in handle")
 484     return first
 485
 486 def to_dict(sequences, key_function=None) :
 487     """Turns a sequence iterator or list into a dictionary.
 488
 489      - sequences  - An iterator that returns SeqRecord objects,
 490                     or simply a list of SeqRecord objects.
 491      - key_function - Optional function which when given a SeqRecord
 492                       returns a unique string for the dictionary key.
 493
 494     e.g. key_function = lambda rec : rec.name
 495     or,  key_function = lambda rec : rec.description.split()[0]
 496
 497     If key_function is ommitted then record.id is used, on the
 498     assumption that the records objects returned are SeqRecords
 499     with a unique id field.
 500
 501     If there are duplicate keys, an error is raised.
 502
 503     Example usage, defaulting to using the record.id as key:
 504
 505     >>> from Bio import SeqIO
 506     >>> handle = open("GenBank/cor6_6.gb", "rU")
 507     >>> format = "genbank"
 508     >>> id_dict = SeqIO.to_dict(SeqIO.parse(handle, format))
 509     >>> print id_dict.keys()
 510     ['L31939.1', 'AJ237582.1', 'X62281.1', 'AF297471.1', 'X55053.1', 'M81224.1']
 511     >>> print id_dict["L31939.1"].description
 512     Brassica rapa (clone bif72) kin mRNA, complete cds.
 513
 514     A more complex example, using the key_function argument in order to use
 515     a sequence checksum as the dictionary key:
 516
 517     >>> from Bio import SeqIO
 518     >>> from Bio.SeqUtils.CheckSum import seguid
 519     >>> handle = open("GenBank/cor6_6.gb", "rU")
 520     >>> format = "genbank"
 521     >>> seguid_dict = SeqIO.to_dict(SeqIO.parse(handle, format),
 522     ...               key_function = lambda rec : seguid(rec.seq))
 523     >>> for key, record in seguid_dict.iteritems() :
 524     ...     print key, record.id
 525     SabZaA4V2eLE9/2Fm5FnyYy07J4 X55053.1
 526     l7gjJFE6W/S1jJn5+1ASrUKW/FA X62281.1
 527     /wQvmrl87QWcm9llO4/efg23Vgg AJ237582.1
 528     TtWsXo45S3ZclIBy4X/WJc39+CY M81224.1
 529     uVEYeAQSV5EDQOnFoeMmVea+Oow AF297471.1
 530     BUg6YxXSKWEcFFH0L08JzaLGhQs L31939.1
 531     """
 532     if key_function is None :
 533         key_function = lambda rec : rec.id
 534
 535     d = dict()
 536     for record in sequences :
 537         key = key_function(record)
 538         if key in d :
 539             raise ValueError("Duplicate key '%s'" % key)
 540         d[key] = record
 541     return d
 542
 543
 544 def to_alignment(sequences, alphabet=None, strict=True) :
 545     """Returns a multiple sequence alignment (OBSOLETE).
 546
 547      - sequences -An iterator that returns SeqRecord objects,
 548                   or simply a list of SeqRecord objects.  All
 549                   the record sequences must be the same length.
 550      - alphabet - Optional alphabet.  Stongly recommended.
 551      - strict   - Optional, defaults to True.  Should error checking
 552                   be done?
 553
 554     Using this function is now discouraged.  Rather doing this:
 555
 556     >>> from Bio import SeqIO
 557     >>> handle = open("Clustalw/protein.aln")
 558     >>> alignment = SeqIO.to_alignment(SeqIO.parse(handle, "clustal"))
 559     >>> handle.close()
 560
 561     You are now encouraged to use Bio.AlignIO instead, e.g.
 562
 563     >>> from Bio import AlignIO
 564     >>> handle = open("Clustalw/protein.aln")
 565     >>> alignment = AlignIO.read(handle, "clustal")
 566     >>> handle.close()
 567     """
 568     #TODO - Move this functionality into the Alignment class instead?
 569     from Bio.Alphabet import generic_alphabet
 570     from Bio.Alphabet import _consensus_alphabet
 571     if alphabet is None :
 572         sequences = list(sequences)
 573         alphabet = _consensus_alphabet([rec.seq.alphabet for rec in sequences \
 574                                         if rec.seq is not None])
 575
 576     if not (isinstance(alphabet, Alphabet) or isinstance(alphabet, AlphabetEncoder)) :
 577         raise ValueError("Invalid alphabet")
 578
 579     alignment_length = None
 580     alignment = Alignment(alphabet)
 581     for record in sequences :
 582         if strict :
 583             if alignment_length is None :
 584                 alignment_length = len(record.seq)
 585             elif alignment_length != len(record.seq) :
 586                 raise ValueError("Sequences must all be the same length")
 587
 588             assert isinstance(record.seq.alphabet, Alphabet) \
 589             or isinstance(record.seq.alphabet, AlphabetEncoder), \
 590                 "Sequence does not have a valid alphabet"
 591
 592             #TODO - Move this alphabet comparison code into the Alphabet module/class?
 593             #TODO - Is a normal alphabet "ungapped" by default, or does it just mean
 594             #undecided?
 595             if isinstance(record.seq.alphabet, Alphabet) \
 596             and isinstance(alphabet, Alphabet) :
 597                 #Comparing two non-gapped alphabets
 598                 if not isinstance(record.seq.alphabet, alphabet.__class__) :
 599                     raise ValueError("Incompatible sequence alphabet " \
 600                                      + "%s for %s alignment" \
 601                                      % (record.seq.alphabet, alphabet))
 602             elif isinstance(record.seq.alphabet, AlphabetEncoder) \
 603             and isinstance(alphabet, Alphabet) :
 604                 raise ValueError("Sequence has a gapped alphabet, alignment does not")
 605             elif isinstance(record.seq.alphabet, Alphabet) \
 606             and isinstance(alphabet, Gapped) :
 607                 #Sequence isn't gapped, alignment is.
 608                 if not isinstance(record.seq.alphabet, alphabet.alphabet.__class__) :
 609                     raise ValueError("Incompatible sequence alphabet " \
 610                                      + "%s for %s alignment" \
 611                                      % (record.seq.alphabet, alphabet))
 612             else :
 613                 #Comparing two gapped alphabets
 614                 if not isinstance(record.seq.alphabet, alphabet.__class__) :
 615                     raise ValueError("Incompatible sequence alphabet " \
 616                                      + "%s for %s alignment" \
 617                                      % (record.seq.alphabet, alphabet))
 618                 if record.seq.alphabet.gap_char != alphabet.gap_char :
 619                     raise ValueError("Sequence gap characters != alignment gap char")
 620             #ToDo, additional checks on the specified alignment...
 621             #Should we look at the alphabet.contains() method?
 622         if record.seq is None :
 623             raise TypeError("SeqRecord (id=%s) has None for its sequence." % record.id)
 624
 625         #This is abusing the "private" records list,
 626         #we should really have a method like add_sequence
 627         #but which takes SeqRecord objects.  See also Bug 1944
 628         alignment._records.append(record)
 629     return alignment
 630
 631 def _test():
 632     """Run the Bio.SeqIO module's doctests.
 633
 634     This will try and locate the unit tests directory, and run the doctests
 635     from there in order that the relative paths used in the examples work.
 636     """
 637     import doctest
 638     import os
 639     if os.path.isdir(os.path.join("..","..","Tests")) :
 640         print "Runing doctests..."
 641         cur_dir = os.path.abspath(os.curdir)
 642         os.chdir(os.path.join("..","..","Tests"))
 643         doctest.testmod()
 644         os.chdir(cur_dir)
 645         del cur_dir
 646         print "Done"
 647
 648 if __name__ == "__main__":
 649     #Run the doctests
 650     _test()