1 """Represent a Sequence Feature holding info about a part of a sequence.
3 This is heavily modeled after the Biocorba SeqFeature objects, and
4 may be pretty biased towards GenBank stuff since I'm writing it
5 for the GenBank parser output...
9 Base class to hold a Feature.
10 ----------------------------
14 Hold information about a Reference.
15 ----------------------------------
17 This is an attempt to create a General class to hold Reference type
23 Specify locations of a feature on a Sequence.
24 ---------------------------------------------
26 This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in
27 much the same way as Biocorba. This has the advantages of allowing us
28 to handle fuzzy stuff in case anyone needs it, and also be compatible
32 o FeatureLocation - Specify the start and end location of a feature.
34 o ExactPosition - Specify the position as being exact.
35 o WithinPosition - Specify a position occurring within some range.
36 o BetweenPosition - Specify a position occurring between a range.
37 o BeforePosition - Specify the position as being found before some base.
38 o AfterPosition - Specify the position as being found after some base.
42 """Represent a Sequence Feature on an object.
45 o location - the location of the feature on the sequence
46 o type - the specified type of the feature (ie. CDS, exon, repeat...)
47 o location_operator - a string specifying how this SeqFeature may
48 be related to others. For example, in the example GenBank feature
49 shown below, the location_operator would be "join"
50 o strand - A value specifying on which strand (of a DNA sequence, for
51 instance) the feature deals with. 1 indicates the plus strand, -1
52 indicates the minus strand, 0 indicates both strands, and None indicates
53 that strand doesn't apply (ie. for proteins) or is not known.
54 o id - A string identifier for the feature.
55 o ref - A reference to another sequence. This could be an accession
56 number for some different sequence.
57 o ref_db - A different database for the reference accession number.
58 o qualifier - A dictionary of qualifiers on the feature. These are
59 analogous to the qualifiers from a GenBank feature table. The keys of
60 the dictionary are qualifier names, the values are the qualifier
62 o sub_features - Additional SeqFeatures which fall under this 'parent'
63 feature. For instance, if we have something like:
65 CDS join(1..10,30..40,50..60)
67 Then the top level feature would be a CDS from 1 to 60, and the sub
68 features would be of 'CDS_join' type and would be from 1 to 10, 30 to
69 40 and 50 to 60, respectively.
71 def __init__(self, location = None, type = '', location_operator = '',
72 strand = None, id = "<unknown id>",
73 qualifiers = {}, sub_features = [],
74 ref = None, ref_db = None):
75 """Initialize a SeqFeature on a Sequence.
77 self.location = location
80 self.location_operator = location_operator
83 # XXX right now sub_features and qualifiers cannot be set
84 # from the initializer because this causes all kinds
85 # of recursive import problems. I can't understand why this is
88 self.sub_features = []
93 """A string representation of the record for debugging."""
94 answer = "%s(%s" % (self.__class__, repr(self.location))
96 answer += ", type=%s" % repr(self.type)
97 if self.location_operator :
98 answer += ", location_operator=%s" % repr(self.location_operator)
100 answer += ", strand=%s" % repr(self.strand)
101 if self.id and self.id != "<unknown id>" :
102 answer += ", id=%s" % repr(self.id)
104 answer += ", ref=%s" % repr(self.ref)
106 answer += ", ref_db=%s" % repr(self.ref_db)
111 """A readable summary of the feature intended to be printed to screen.
113 out = "type: %s\n" % self.type
114 out += "location: %s\n" % self.location
115 out += "ref: %s:%s\n" % (self.ref, self.ref_db)
116 out += "strand: %s\n" % self.strand
117 out += "qualifiers: \n"
118 qualifier_keys = self.qualifiers.keys()
119 qualifier_keys.sort()
120 for qual_key in qualifier_keys:
121 out += "\tKey: %s, Value: %s\n" % (qual_key,
122 self.qualifiers[qual_key])
123 if len(self.sub_features) != 0:
124 out += "Sub-Features\n"
125 for sub_feature in self.sub_features:
126 out +="%s\n" % sub_feature
130 def _shift(self, offset) :
131 """Returns a copy of the feature with its location shifted (PRIVATE).
133 The annotation qualifiers are copied."""
134 answer = SeqFeature(location = self.location._shift(offset),
136 location_operator = self.location_operator,
137 strand = self.strand,
139 #qualifiers = dict(self.qualifiers.iteritems()),
140 #sub_features = [f._shift(offset) for f in self.sub_features],
142 ref_db = self.ref_db)
143 #TODO - Sort out the use of sub_feature and qualifiers in __init___
144 answer.sub_features = [f._shift(offset) for f in self.sub_features]
145 answer.qualifiers = dict(self.qualifiers.iteritems())
150 # TODO -- Will this hold PubMed and Medline information decently?
152 """Represent a Generic Reference object.
155 o location - A list of Location objects specifying regions of
156 the sequence that the references correspond to. If no locations are
157 specified, the entire sequence is assumed.
158 o authors - A big old string, or a list split by author, of authors
160 o title - The title of the reference.
161 o journal - Journal the reference was published in.
162 o medline_id - A medline reference for the article.
163 o pubmed_id - A pubmed reference for the article.
164 o comment - A place to stick any comments about the reference.
177 """Output an informative string for debugging.
180 for single_location in self.location:
181 out += "location: %s\n" % single_location
182 out += "authors: %s\n" % self.authors
184 out += "consrtm: %s\n" % self.consrtm
185 out += "title: %s\n" % self.title
186 out += "journal: %s\n" % self.journal
187 out += "medline id: %s\n" % self.medline_id
188 out += "pubmed id: %s\n" % self.pubmed_id
189 out += "comment: %s\n" % self.comment
193 # --- Handling feature locations
195 class FeatureLocation:
196 """Specify the location of a feature along a sequence.
198 This attempts to deal with fuzziness of position ends, but also
199 make it easy to get the start and end in the 'normal' case (no
202 You should access the start and end attributes with
203 your_location.start and your_location.end. If the start and
204 end are exact, this will return the positions, if not, we'll return
205 the appropriate Fuzzy class with info about the position and fuzziness.
207 Note that the start and end location numbering follow Python's scheme,
208 thus a GenBank entry of 123..150 (one based counting) becomes a location
209 of [122:150] (zero based counting).
211 def __init__(self, start, end):
212 """Specify the start and end of a sequence feature.
214 start and end arguments specify the values where the feature begins
215 and ends. These can either be any of the *Position objects that
216 inherit from AbstractPosition, or can just be integers specifying the
217 position. In the case of integers, the values are assumed to be
218 exact and are converted into ExactPosition arguments. This is meant
219 to make it easy to deal with non-fuzzy ends.
221 if isinstance(start, AbstractPosition):
224 self._start = ExactPosition(start)
226 if isinstance(end, AbstractPosition):
229 self._end = ExactPosition(end)
232 """Returns a representation of the location (with python counting).
234 For the simple case this uses the python slicing syntax, [122:150]
235 (zero based counting) which GenBank would call 123..150 (one based
238 return "[%s:%s]" % (self._start, self._end)
241 """A string representation of the location for debugging."""
243 % (self.__class__, repr(self.start), repr(self.end))
def _shift(self, offset):
    """Build a new location with both ends moved by offset (PRIVATE).

    Each stored end position (_start and _end) is asked to shift itself
    via its own _shift method, and the two results are used to construct
    a fresh FeatureLocation, which is returned.
    """
    shifted_start = self._start._shift(offset)
    shifted_end = self._end._shift(offset)
    return FeatureLocation(start=shifted_start, end=shifted_end)
250 def __getattr__(self, attr):
251 """Make it easy to get non-fuzzy starts and ends.
253 We override get_attribute here so that in non-fuzzy cases we
254 can just return the start and end position without any hassle.
256 To get fuzzy start and ends, just ask for item.start and
257 item.end. To get non-fuzzy attributes (ie. the position only)
258 ask for 'item.nofuzzy_start', 'item.nofuzzy_end'. These should return
259 the largest range of the fuzzy position. So something like:
260 (10.20)..(30.40) should return 10 for start, and 40 for end.
262 The special tricky case is when we have a single between position
263 argument like 2^3 for the range. We want nofuzzy_start and nofuzzy_end
264 to give a reasonable approximation of what this really means, which
265 is an empty string -- so the same position for both. Doing a special
266 case here sucks, but there is really not a general rule you can apply
269 #TODO - these are not currently implemented as properties, this means
270 #they do not show up via dir(...)
275 elif attr == 'nofuzzy_start':
276 if ((self._start == self._end) and isinstance(self._start,
278 return self._start.position
280 return min(self._start.position,
281 self._start.position + self._start.extension)
282 elif attr == 'nofuzzy_end':
283 if ((self._start == self._end) and isinstance(self._start,
285 return self._end.position
287 return max(self._end.position,
288 self._end.position + self._end.extension)
290 raise AttributeError("Cannot evaluate attribute %s." % attr)
292 class AbstractPosition:
293 """Abstract base class representing a position.
295 def __init__(self, position, extension):
296 self.position = position
297 self.extension = extension
300 """String representation of the location for debugging."""
302 % (self.__class__, repr(self.position), repr(self.extension))
304 def __cmp__(self, other):
305 """A simple comparison function for positions.
307 This is very simple-minded and just compares the position attribute
308 of the features; extensions are not considered at all. This could
309 potentially be expanded to try to take advantage of extensions.
311 assert isinstance(other, AbstractPosition), \
312 "We can only do comparisons between Biopython Position objects."
314 return cmp(self.position, other.position)
316 def _shift(self, offset) :
317 #We want this to maintain the subclass when called from a subclass
318 return self.__class__(self.position + offset, self.extension)
320 class ExactPosition(AbstractPosition):
321 """Specify the specific position of a boundary.
323 o position - The position of the boundary.
324 o extension - An optional argument which must be zero since we don't
325 have an extension. The argument is provided so that the same number of
326 arguments can be passed to all position types.
328 In this case, there is no fuzziness associated with the position.
330 def __init__(self, position, extension = 0):
332 raise AttributeError("Non-zero extension %s for exact position."
334 AbstractPosition.__init__(self, position, 0)
337 """String representation of the ExactPosition location for debugging."""
338 assert self.extension == 0
339 return "%s(%s)" % (self.__class__, repr(self.position))
342 return str(self.position)
344 class WithinPosition(AbstractPosition):
345 """Specify the position of a boundary within some coordinates.
348 o position - The start position of the boundary
349 o extension - The range to which the boundary can extend.
351 This allows dealing with a position like ((1.4)..100). This
352 indicates that the start of the sequence is somewhere between 1
353 and 4. To represent that with this class we would set position as
354 1 and extension as 3.
def __init__(self, position, extension=0):
    """Set up the boundary from its start position and extension.

    Both values are handed straight to AbstractPosition.__init__;
    extension falls back to 0 when the caller omits it.
    """
    AbstractPosition.__init__(self, position=position, extension=extension)
360 return "(%s.%s)" % (self.position, self.position + self.extension)
362 class BetweenPosition(AbstractPosition):
363 """Specify the position of a boundary between two coordinates.
366 o position - The start position of the boundary.
367 o extension - The range to the other position of a boundary.
369 This specifies a coordinate which is found between the two positions.
370 So this allows us to deal with a position like ((1^2)..100). To
371 represent that with this class we set position as 1 and the
def __init__(self, position, extension=0):
    """Store the start position and the extension to the other boundary.

    The work is delegated to AbstractPosition.__init__; when no
    extension is supplied it defaults to 0.
    """
    AbstractPosition.__init__(self, position=position, extension=extension)
378 return "(%s^%s)" % (self.position, self.position + self.extension)
380 class BeforePosition(AbstractPosition):
381 """Specify a position where the actual location occurs before it.
384 o position - The upper boundary of where the location can occur.
385 o extension - An optional argument which must be zero since we don't
386 have an extension. The argument is provided so that the same number of
387 arguments can be passed to all position types.
389 This is used to specify positions like (<10..100) where the location
390 occurs somewhere before position 10.
392 def __init__(self, position, extension = 0):
394 raise AttributeError("Non-zero extension %s for exact position."
396 AbstractPosition.__init__(self, position, 0)
399 """A string representation of the location for debugging."""
400 assert self.extension == 0
401 return "%s(%s)" % (self.__class__, repr(self.position))
404 return "<%s" % self.position
406 class AfterPosition(AbstractPosition):
407 """Specify a position where the actual location is found after it.
410 o position - The lower boundary of where the location can occur.
411 o extension - An optional argument which must be zero since we don't
412 have an extension. The argument is provided so that the same number of
413 arguments can be passed to all position types.
415 This is used to specify positions like (>10..100) where the location
416 occurs somewhere after position 10.
418 def __init__(self, position, extension = 0):
420 raise AttributeError("Non-zero extension %s for exact position."
422 AbstractPosition.__init__(self, position, 0)
425 """A string representation of the location for debugging."""
426 assert self.extension == 0
427 return "%s(%s)" % (self.__class__, repr(self.position))
430 return ">%s" % self.position
432 class OneOfPosition(AbstractPosition):
433 """Specify a position where the location can be multiple positions.
435 This models the GenBank 'one-of(1888,1901)' function, and tries
436 to make this fit within the Biopython Position models. In our case
437 the position of the "one-of" is set as the lowest choice, and the
438 extension is the range to the highest choice.
440 def __init__(self, position_list):
441 """Initialize with a set of possible positions.
443 position_list is a list of AbstractPosition derived objects,
444 specifying possible locations.
446 # unique attribute for this type of positions
447 self.position_choices = position_list
448 # find the smallest and largest position in the choices
451 for position_choice in self.position_choices:
452 assert isinstance(position_choice, AbstractPosition), \
453 "Expected position objects, got %r" % position_choice
454 if smallest is None and largest is None:
455 smallest = position_choice.position
456 largest = position_choice.position
457 elif position_choice.position > largest:
458 largest = position_choice.position
459 elif position_choice.position < smallest:
460 smallest = position_choice.position
461 # initialize with our definition of position and extension
462 AbstractPosition.__init__(self, smallest, largest - smallest)
465 """String representation of the OneOfPosition location for debugging."""
466 return "%s(%s)" % (self.__class__, repr(self.position_choices))
470 for position in self.position_choices:
471 out += "%s," % position
472 # replace the last comma with the closing parenthesis
477 """Simple class to hold information about a gap between positions.
479 def __init__(self, gap_size):
480 """Initialize with a position object containing the gap information.
482 self.gap_size = gap_size
485 """A string representation of the position gap for debugging."""
486 return "%s(%s)" % (self.__class__, repr(self.gap_size))
489 out = "gap(%s)" % self.gap_size