+++ /dev/null
-"""Represent a Sequence Feature holding info about a part of a sequence.
-
-This is heavily modeled after the Biocorba SeqFeature objects, and
-may be pretty biased towards GenBank stuff since I'm writing it
-for the GenBank parser output...
-
-What's here:
-
-Base class to hold a Feature.
-----------------------------
-classes:
-o SeqFeature
-
-Hold information about a Reference.
-----------------------------------
-
-This is an attempt to create a General class to hold Reference type
-information.
-
-classes:
-o Reference
-
-Specify locations of a feature on a Sequence.
----------------------------------------------
-
-This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in
-much the same way as Biocorba. This has the advantages of allowing us
-to handle fuzzy stuff in case anyone needs it, and also be compatible
-with Biocorba.
-
-classes:
-o FeatureLocation - Specify the start and end location of a feature.
-
-o ExactPosition - Specify the position as being exact.
-o WithinPosition - Specify a position occurring within some range.
-o BetweenPosition - Specify a position occurring between a range.
-o BeforePosition - Specify the position as being found before some base.
-o AfterPosition - Specify the position as being found after some base.
-"""
-
class SeqFeature:
    """Represent a Sequence Feature on an object.

    Attributes:
    o location - the location of the feature on the sequence
    o type - the specified type of the feature (ie. CDS, exon, repeat...)
    o location_operator - a string specifying how this SeqFeature may
    be related to others. For example, in the example GenBank feature
    shown below, the location_operator would be "join"
    o strand - A value specifying on which strand (of a DNA sequence, for
    instance) the feature deals with. 1 indicates the plus strand, -1
    indicates the minus strand, 0 indicates both strands, and None indicates
    that strand doesn't apply (ie. for proteins) or is not known.
    o id - A string identifier for the feature.
    o ref - A reference to another sequence. This could be an accession
    number for some different sequence.
    o ref_db - A different database for the reference accession number.
    o qualifiers - A dictionary of qualifiers on the feature. These are
    analogous to the qualifiers from a GenBank feature table. The keys of
    the dictionary are qualifier names, the values are the qualifier
    values.
    o sub_features - Additional SeqFeatures which fall under this 'parent'
    feature. For instance, if we have something like:

    CDS    join(1..10,30..40,50..60)

    Then the top level feature would be a CDS from 1 to 60, and the sub
    features would be of 'CDS_join' type and would be from 1 to 10, 30 to
    40 and 50 to 60, respectively.
    """
    def __init__(self, location=None, type='', location_operator='',
                 strand=None, id="<unknown id>",
                 qualifiers=None, sub_features=None,
                 ref=None, ref_db=None):
        """Initialize a SeqFeature on a Sequence.

        qualifiers and sub_features default to None rather than to a
        literal {} / [] so that every feature gets its own fresh
        container; a mutable default would be shared by all instances.
        When the caller supplies them, the given objects are stored as-is.
        """
        self.location = location

        self.type = type
        self.location_operator = location_operator
        self.strand = strand
        self.id = id
        if qualifiers is None:
            qualifiers = {}
        if sub_features is None:
            sub_features = []
        self.qualifiers = qualifiers
        self.sub_features = sub_features
        self.ref = ref
        self.ref_db = ref_db

    def __repr__(self):
        """A string representation of the record for debugging.

        Only attributes carrying information are shown. The strand is
        tested against None (not for truthiness) because 0 is a
        meaningful value: it indicates both strands.
        """
        parts = [repr(self.location)]
        if self.type:
            parts.append("type=%s" % repr(self.type))
        if self.location_operator:
            parts.append("location_operator=%s"
                         % repr(self.location_operator))
        if self.strand is not None:
            parts.append("strand=%s" % repr(self.strand))
        if self.id and self.id != "<unknown id>":
            parts.append("id=%s" % repr(self.id))
        if self.ref:
            parts.append("ref=%s" % repr(self.ref))
        if self.ref_db:
            parts.append("ref_db=%s" % repr(self.ref_db))
        return "%s(%s)" % (self.__class__, ", ".join(parts))

    def __str__(self):
        """A readable summary of the feature intended to be printed to screen.

        Qualifiers are listed in sorted key order, followed by the string
        form of each sub-feature (if there are any).
        """
        lines = ["type: %s\n" % self.type,
                 "location: %s\n" % self.location,
                 "ref: %s:%s\n" % (self.ref, self.ref_db),
                 "strand: %s\n" % self.strand,
                 "qualifiers: \n"]
        # sorted() accepts any iterable of keys, so this does not depend
        # on keys() returning a list with a sort() method.
        for qual_key in sorted(self.qualifiers):
            lines.append("\tKey: %s, Value: %s\n"
                         % (qual_key, self.qualifiers[qual_key]))
        if self.sub_features:
            lines.append("Sub-Features\n")
            for sub_feature in self.sub_features:
                lines.append("%s\n" % sub_feature)
        return "".join(lines)

    def _shift(self, offset):
        """Returns a copy of the feature with its location shifted (PRIVATE).

        The location and every sub-feature are shifted by offset via their
        own _shift methods. The annotation qualifiers are copied into a
        new dictionary (a shallow copy: the values themselves are shared).
        """
        return SeqFeature(
            location=self.location._shift(offset),
            type=self.type,
            location_operator=self.location_operator,
            strand=self.strand,
            id=self.id,
            qualifiers=dict(self.qualifiers),
            sub_features=[f._shift(offset) for f in self.sub_features],
            ref=self.ref,
            ref_db=self.ref_db)
-
-# --- References
-
-# TODO -- Will this hold PubMed and Medline information decently?
class Reference:
    """Represent a Generic Reference object.

    Attributes:
    o location - A list of Location objects specifying regions of
    the sequence that the references correspond to. If no locations are
    specified, the entire sequence is assumed.
    o authors - A big old string, or a list split by author, of authors
    for the reference.
    o consrtm - A string; only included in the printed summary when
    it is non-empty.
    o title - The title of the reference.
    o journal - Journal the reference was published in.
    o medline_id - A medline reference for the article.
    o pubmed_id - A pubmed reference for the article.
    o comment - A place to stick any comments about the reference.
    """
    def __init__(self):
        """Start with no locations and every text field empty."""
        self.location = []
        for field_name in ('authors', 'consrtm', 'title', 'journal',
                           'medline_id', 'pubmed_id', 'comment'):
            setattr(self, field_name, '')

    def __str__(self):
        """Output an informative string for debugging.

        One line per location comes first, then one line per text field.
        """
        pieces = ["location: %s\n" % single_location
                  for single_location in self.location]
        pieces.append("authors: %s\n" % self.authors)
        # The consortium line is the only optional one.
        if self.consrtm:
            pieces.append("consrtm: %s\n" % self.consrtm)
        pieces.append("title: %s\n" % self.title)
        pieces.append("journal: %s\n" % self.journal)
        pieces.append("medline id: %s\n" % self.medline_id)
        pieces.append("pubmed id: %s\n" % self.pubmed_id)
        pieces.append("comment: %s\n" % self.comment)
        return "".join(pieces)
-
-# --- Handling feature locations
-
class FeatureLocation:
    """Specify the location of a feature along a sequence.

    Both ends of the location may be fuzzy, yet the common non-fuzzy
    case stays easy to use.

    Read the ends through your_location.start and your_location.end;
    each is a position object (an ExactPosition for plain ends, or one
    of the fuzzy position classes otherwise).

    Numbering follows Python's scheme, so the GenBank entry 123..150
    (one based counting) is stored as [122:150] (zero based counting).
    """
    def __init__(self, start, end):
        """Specify the start and end of a sequence feature.

        start and end may each be an AbstractPosition (or subclass)
        instance, which is stored unchanged, or any other value (normally
        an integer), which is taken as exact and wrapped in an
        ExactPosition.
        """
        if not isinstance(start, AbstractPosition):
            start = ExactPosition(start)
        self._start = start

        if not isinstance(end, AbstractPosition):
            end = ExactPosition(end)
        self._end = end

    def __str__(self):
        """Returns a representation of the location (with python counting).

        In the simple case this looks like Python slice notation, e.g.
        [122:150] for what GenBank would write as 123..150.
        """
        return "[%s:%s]" % (self._start, self._end)

    def __repr__(self):
        """A string representation of the location for debugging."""
        start_text = repr(self.start)
        end_text = repr(self.end)
        return "%s(%s,%s)" % (self.__class__, start_text, end_text)

    def _shift(self, offset):
        """Returns a copy of the location shifted by the offset (PRIVATE)."""
        shifted_start = self._start._shift(offset)
        shifted_end = self._end._shift(offset)
        return FeatureLocation(start=shifted_start, end=shifted_end)

    def __getattr__(self, attr):
        """Serve the start, end, nofuzzy_start and nofuzzy_end attributes.

        'start' and 'end' return the stored position objects.
        'nofuzzy_start' and 'nofuzzy_end' return plain coordinates that
        span the widest range of the fuzzy positions, so something like
        (10.20)..(30.40) gives 10 for the start and 40 for the end.

        Special case: when start and end compare equal and the start is
        a BetweenPosition (a single 2^3 style location), both nofuzzy
        values are just the position, since such a location describes an
        empty stretch of sequence.

        Any other attribute name raises AttributeError.
        """
        #TODO - these are not currently implemented as properties, this means
        #they do not show up via dir(...)
        if attr == 'start':
            return self._start
        if attr == 'end':
            return self._end
        if attr not in ('nofuzzy_start', 'nofuzzy_end'):
            raise AttributeError("Cannot evaluate attribute %s." % attr)

        single_between = ((self._start == self._end)
                          and isinstance(self._start, BetweenPosition))
        if attr == 'nofuzzy_start':
            if single_between:
                return self._start.position
            base = self._start.position
            return min(base, base + self._start.extension)

        # Only 'nofuzzy_end' can reach this point.
        if single_between:
            return self._end.position
        base = self._end.position
        return max(base, base + self._end.extension)
-
class AbstractPosition:
    """Abstract base class representing a position.

    Attributes:
    o position - The coordinate of the position.
    o extension - A value stored alongside the position; subclasses in
    this module add it to position to describe a range.

    Positions compare (==, !=, <, <=, >, >=) and hash on the position
    attribute only; the extension is never considered.
    """
    def __init__(self, position, extension):
        self.position = position
        self.extension = extension

    def __repr__(self):
        """String representation of the location for debugging."""
        return "%s(%s,%s)" \
               % (self.__class__, repr(self.position), repr(self.extension))

    def __cmp__(self, other):
        """A simple comparison function for positions.

        This is very simple-minded and just compares the position attribute
        of the features; extensions are not considered at all. This could
        potentially be expanded to try to take advantage of extensions.

        Returns -1, 0 or 1. Kept for callers that invoke it directly; it
        is written without the cmp() builtin so it works on interpreters
        that do not provide one.
        """
        assert isinstance(other, AbstractPosition), \
               "We can only do comparisons between Biopython Position objects."

        return ((self.position > other.position)
                - (self.position < other.position))

    # Rich comparisons, so operators and sorting work on interpreters that
    # ignore __cmp__. For operands that are not positions they return
    # NotImplemented and leave the outcome to Python's normal fallback.
    def __eq__(self, other):
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position == other.position

    def __ne__(self, other):
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position != other.position

    def __lt__(self, other):
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position < other.position

    def __le__(self, other):
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position <= other.position

    def __gt__(self, other):
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position > other.position

    def __ge__(self, other):
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position >= other.position

    def __hash__(self):
        """Hash on the position, consistent with __eq__."""
        return hash(self.position)

    def _shift(self, offset):
        """Returns a copy with the position moved by offset (PRIVATE)."""
        #We want this to maintain the subclass when called from a subclass
        return self.__class__(self.position + offset, self.extension)
-
class ExactPosition(AbstractPosition):
    """Specify the specific position of a boundary.

    Arguments:
    o position - The position of the boundary.
    o extension - Optional, and must be zero because an exact position
    has no extension. It is accepted only so that every position type
    can be built with the same number of arguments.

    No fuzziness is associated with this kind of position.
    """
    def __init__(self, position, extension=0):
        """Store the position; a non-zero extension raises AttributeError."""
        if extension != 0:
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        AbstractPosition.__init__(self, position, 0)

    def __str__(self):
        return str(self.position)

    def __repr__(self):
        """String representation of the ExactPosition location for debugging."""
        assert self.extension == 0
        position_text = repr(self.position)
        return "%s(%s)" % (self.__class__, position_text)
-
class WithinPosition(AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
    o position - The start position of the boundary
    o extension - The range to which the boundary can extend.

    This handles a position like ((1.4)..100), where the start of the
    sequence lies somewhere between 1 and 4. That is represented here
    with position set to 1 and extension set to 3.
    """
    def __init__(self, position, extension=0):
        """Store position and extension unchanged."""
        AbstractPosition.__init__(self, position, extension)

    def __str__(self):
        """Show both ends of the range, e.g. (1.4)."""
        far_end = self.position + self.extension
        return "(%s.%s)" % (self.position, far_end)
-
class BetweenPosition(AbstractPosition):
    """Specify the position of a boundary between two coordinates.

    Arguments:
    o position - The start position of the boundary.
    o extension - The range to the other position of a boundary.

    The coordinate described lies between the two positions, which
    handles a location like ((1^2)..100). That is represented here with
    position set to 1 and extension set to 1.
    """
    def __init__(self, position, extension=0):
        """Store position and extension unchanged."""
        AbstractPosition.__init__(self, position, extension)

    def __str__(self):
        """Show both coordinates, e.g. (1^2)."""
        other_side = self.position + self.extension
        return "(%s^%s)" % (self.position, other_side)
-
class BeforePosition(AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
    o position - The upper boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.
    """
    def __init__(self, position, extension=0):
        """Store the position; a non-zero extension raises AttributeError."""
        if extension != 0:
            # The message names this class's kind of position; it used to
            # say "exact position", copied from ExactPosition.
            raise AttributeError("Non-zero extension %s for before position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """A string representation of the location for debugging."""
        assert self.extension == 0
        return "%s(%s)" % (self.__class__, repr(self.position))

    def __str__(self):
        """Show the position with a leading '<', e.g. <10."""
        return "<%s" % self.position
-
class AfterPosition(AbstractPosition):
    """Specify a position where the actual location is found after it.

    Arguments:
    o position - The lower boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.
    """
    def __init__(self, position, extension=0):
        """Store the position; a non-zero extension raises AttributeError."""
        if extension != 0:
            # The message names this class's kind of position; it used to
            # say "exact position", copied from ExactPosition.
            raise AttributeError("Non-zero extension %s for after position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """A string representation of the location for debugging."""
        assert self.extension == 0
        return "%s(%s)" % (self.__class__, repr(self.position))

    def __str__(self):
        """Show the position with a leading '>', e.g. >10."""
        return ">%s" % self.position
-
class OneOfPosition(AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. In our case
    the position of the "one-of" is set as the lowest choice, and the
    extension is the range to the highest choice.

    Attributes:
    o position_choices - The list of possible positions, as given.
    """
    def __init__(self, position_list):
        """Initialize with a set of possible positions.

        position_list is a non-empty list of AbstractPosition derived
        objects, specifying possible locations.

        Raises ValueError if position_list is empty, since the lowest
        and highest choices are then undefined.
        """
        # unique attribute for this type of positions
        self.position_choices = position_list
        coordinates = []
        for position_choice in self.position_choices:
            # The value is wrapped in a one-item tuple so that a tuple
            # argument cannot break the message formatting.
            assert isinstance(position_choice, AbstractPosition), \
                   "Expected position objects, got %r" % (position_choice,)
            coordinates.append(position_choice.position)
        if not coordinates:
            raise ValueError("OneOfPosition needs at least one position "
                             "choice.")
        smallest = min(coordinates)
        largest = max(coordinates)
        # initialize with our definition of position and extension
        AbstractPosition.__init__(self, smallest, largest - smallest)

    def _shift(self, offset):
        """Returns a copy with every choice shifted by offset (PRIVATE).

        The inherited implementation cannot be used here: it calls the
        constructor with (position, extension), but this class is built
        from a list of choices.
        """
        return self.__class__([position_choice._shift(offset)
                               for position_choice in self.position_choices])

    def __repr__(self):
        """String representation of the OneOfPosition location for debugging."""
        return "%s(%s)" % (self.__class__, repr(self.position_choices))

    def __str__(self):
        """Show the choices comma separated, e.g. one-of(1888,1901)."""
        choices = ",".join([str(position)
                            for position in self.position_choices])
        return "one-of(%s)" % choices
-
class PositionGap:
    """Simple class to hold information about a gap between positions.

    Attributes:
    o gap_size - The gap information, stored exactly as supplied.
    """
    def __init__(self, gap_size):
        """Initialize with the gap information to hold."""
        self.gap_size = gap_size

    def __str__(self):
        """Show the gap in the form gap(<gap_size>)."""
        return "gap(%s)" % self.gap_size

    def __repr__(self):
        """A string representation of the position gap for debugging."""
        size_text = repr(self.gap_size)
        return "%s(%s)" % (self.__class__, size_text)