1 package jalview.datamodel.xdb.embl;
3 import jalview.datamodel.DBRefEntry;
4 import jalview.datamodel.Sequence;
5 import jalview.datamodel.SequenceFeature;
6 import jalview.datamodel.SequenceI;
8 import java.util.Enumeration;
9 import java.util.Hashtable;
10 import java.util.Iterator;
11 import java.util.Vector;
13 public class EmblEntry {
25 EmblSequence sequence;
27 * @return the accession
29 public String getAccession() {
33 * @param accession the accession to set
35 public void setAccession(String accession) {
36 this.accession = accession;
41 public Vector getDbRefs() {
45 * @param dbRefs the dbRefs to set
47 public void setDbRefs(Vector dbRefs) {
53 public String getDesc() {
57 * @param desc the desc to set
59 public void setDesc(String desc) {
63 * @return the features
65 public Vector getFeatures() {
69 * @param features the features to set
71 public void setFeatures(Vector features) {
72 this.features = features;
75 * @return the keywords
77 public Vector getKeywords() {
81 * @param keywords the keywords to set
83 public void setKeywords(Vector keywords) {
84 this.keywords = keywords;
87 * @return the lastUpdated
89 public String getLastUpdated() {
93 * @param lastUpdated the lastUpdated to set
95 public void setLastUpdated(String lastUpdated) {
96 this.lastUpdated = lastUpdated;
101 public Vector getRefs() {
105 * @param refs the refs to set
107 public void setRefs(Vector refs) {
111 * @return the releaseCreated
113 public String getRCreated() {
117 * @param releaseCreated the releaseCreated to set
119 public void setRcreated(String releaseCreated) {
120 this.rCreated = releaseCreated;
123 * @return the releaseLastUpdated
125 public String getRLastUpdated() {
129 * @param releaseLastUpdated the releaseLastUpdated to set
131 public void setRLastUpdated(String releaseLastUpdated) {
132 this.rLastUpdated = releaseLastUpdated;
135 * @return the sequence
137 public EmblSequence getSequence() {
141 * @param sequence the sequence to set
143 public void setSequence(EmblSequence sequence) {
144 this.sequence = sequence;
147 * @return the taxDivision
149 public String getTaxDivision() {
153 * @param taxDivision the taxDivision to set
155 public void setTaxDivision(String taxDivision) {
156 this.taxDivision = taxDivision;
159 * @return the version
161 public String getVersion() {
165 * @param version the version to set
167 public void setVersion(String version) {
168 this.version = version;
171 * EMBL Feature support is limited. The text below is included for the benefit of
172 * any developer working on improving EMBL feature import in Jalview.
173 * Extract from the EMBL feature specification;
174 * see http://www.embl-ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
178 The location indicates the region of the presented sequence which corresponds
181 3.5.2 Format and conventions
182 The location contains at least one sequence location descriptor and may
183 contain one or more operators with one or more sequence location descriptors.
184 Base numbers refer to the numbering in the entry. This numbering designates
185 the first base (5' end) of the presented sequence as base 1.
186 Base locations beyond the range of the presented sequence may not be used in
187 location descriptors, the only exception being location in a remote entry (see
190 Location operators and descriptors are discussed in more detail below.
192 3.5.2.1 Location descriptors
194 The location descriptor can be one of the following:
195 (a) a single base number
196 (b) a site between two indicated adjoining bases
197 (c) a single base chosen from within a specified range of bases (not allowed for new
199 (d) the base numbers delimiting a sequence span
200 (e) a remote entry identifier followed by a local location descriptor
203 A site between two adjoining nucleotides, such as an endonucleolytic cleavage
204 site, is indicated by listing the two points separated by a caret (^). The
205 permitted formats for this descriptor are n^n+1 (for example 55^56), or, for
206 circular molecules, n^1, where "n" is the full length of the molecule, i.e.
207 1000^1 for a circular molecule with length 1000.
209 A single base chosen from a range of bases is indicated by the first base
210 number and the last base number of the range separated by a single period
211 (e.g., '12.21' indicates a single base taken from between the indicated
212 points). From October 2006 the usage of this descriptor is restricted :
213 it is illegal to use "a single base from a range" (c) either on its own or
214 in combination with the "sequence span" (d) descriptor for newly created entries.
215 The existing entries where such descriptors exist are going to be retrofitted.
217 Sequence spans are indicated by the starting base number and the ending base
218 number separated by two periods (e.g., '34..456'). The '<' and '>' symbols may
219 be used with the starting and ending base numbers to indicate that an end
220 point is beyond the specified base number. The starting and ending base
221 positions can be represented as distinct base numbers ('34..456') or a site
222 between two indicated adjoining bases.
224 A location in a remote entry (not the entry to which the feature table
225 belongs) can be specified by giving the accession-number and sequence version
226 of the remote entry, followed by a colon ":", followed by a location
227 descriptor which applies to that entry's sequence (i.e. J12345.1:1..15, see
232 The location operator is a prefix that specifies what must be done to the
233 indicated sequence to find or construct the location corresponding to the
234 feature. A list of operators is given below with their definitions and most
238 Find the complement of the presented sequence in the span specified by
239 "location" (i.e., read the complement of the presented strand in its 5'-to-3'
242 join(location,location, ... location)
243 The indicated elements should be joined (placed end-to-end) to form one
246 order(location,location, ... location)
247 The elements can be found in the
248 specified order (5' to 3' direction), but nothing is implied about the
249 reasonableness of joining them.
251 Note: the location operator "complement" can be used in combination with either
252 "join" or "order" within the same location; combinations of "join" and "order"
253 within the same location (nested operators) are illegal.
257 3.5.3 Location examples
259 The following is a list of common location descriptors with their meanings:
263 467 Points to a single base in the presented sequence
265 340..565 Points to a continuous range of bases bounded by and
266 including the starting and ending bases
268 <345..500 Indicates that the exact lower boundary point of a feature
269 is unknown. The location begins at some base previous to
270 the first base specified (which need not be contained in
271 the presented sequence) and continues to and includes the
274 <1..888 The feature starts before the first sequenced base and
275 continues to and includes base 888
277 1..>888 The feature starts at the first sequenced base and
278 continues beyond base 888
280 102.110 Indicates that the exact location is unknown but that it is
281 one of the bases between bases 102 and 110, inclusive
283 123^124 Points to a site between bases 123 and 124
285 join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to form
286 one contiguous sequence
289 complement(34..126) Start at the base complementary to 126 and finish at the
290 base complementary to base 34 (the feature is on the strand
291 complementary to the presented strand)
294 complement(join(2691..4571,4918..5163))
295 Joins regions 2691 to 4571 and 4918 to 5163, then
296 complements the joined segments (the feature is on the
297 strand complementary to the presented strand)
299 join(complement(4918..5163),complement(2691..4571))
300 Complements regions 4918 to 5163 and 2691 to 4571, then
301 joins the complemented segments (the feature is on the
302 strand complementary to the presented strand)
304 J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in
305 this database) with primary accession number 'J00194'
307 join(1..100,J00194.1:100..202)
308 Joins region 1..100 of the existing entry with the region
309 100..202 of remote entry J00194
313 * Recover annotated sequences from EMBL file
314 * @param noNa don't return nucleic acid sequences
315 * @param sourceDb name of the source database; used as the prefix of generated sequence names and as the source of the DBRefEntry added to the DNA sequence
316 * @param noPeptide don't return any translated protein sequences marked in features
317 * @return dataset sequences with DBRefs and features - DNA always comes first
319 public jalview.datamodel.SequenceI[] getSequences(boolean noNa, boolean noPeptide, String sourceDb) {
320 Vector seqs=new Vector();
323 dna = new Sequence(sourceDb+"|"+accession, sequence.getSequence());
324 dna.setDescription(desc);
325 dna.addDBRef(new DBRefEntry(sourceDb, version, accession));
326 // TODO: add mapping for parentAccession attribute
327 // TODO: transform EMBL Database refs to canonical form
329 for (Iterator i=dbRefs.iterator(); i.hasNext(); dna.addDBRef((DBRefEntry)i.next()));
331 for (Iterator i=features.iterator(); i.hasNext(); ) {
332 EmblFeature feature = (EmblFeature) i.next();
334 if (feature.dbRefs!=null && feature.dbRefs.size()>0) {
335 for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); dna.addDBRef((DBRefEntry)dbr.next()) )
339 if (feature.getName().equalsIgnoreCase("CDS")) {
340 // extract coding region(s)
341 jalview.datamodel.Mapping map = null;
343 if (feature.locations!=null && feature.locations.size()>0) {
344 for (Iterator locs=feature.locations.iterator();
346 EmblFeatureLocations loc = (EmblFeatureLocations) locs.next();
347 int[] se = loc.getElementRanges();
351 int[] t=new int[exon.length+se.length];
352 System.arraycopy(exon, 0, t, 0, exon.length);
353 System.arraycopy(se, 0, t, exon.length,se.length);
359 String prname=new String();
361 Hashtable vals=new Hashtable();
364 if (feature.getQualifiers()!=null && feature.getQualifiers().size()>0) {
365 for (Iterator quals=feature.getQualifiers().iterator(); quals.hasNext(); ) {
366 Qualifier q = (Qualifier) quals.next();
367 if (q.getName().equals("translation"))
372 if (q.getName().equals("protein_id"))
377 if (q.getName().equals("codon_start"))
379 prstart = Integer.parseInt(q.getValue());
382 if (q.getName().equals("product")){
383 prname = q.getValue();
385 // throw anything else into the additional properties hash
386 vals.put(q.getName(), q.getValue());
390 Sequence product=null;
391 if (prseq!=null && prname!=null && prid!=null) {
394 product = new Sequence(sourceDb+"|"+"EMBLCDS|"+prid+"|"+prname, prseq, prstart, prstart+prseq.length()-1);
395 product.setDescription("Protein Product from "+sourceDb);
398 // we have everything - create the mapping and perhaps the protein sequence
399 map = new jalview.datamodel.Mapping(product, exon, new int[] { prstart, prstart+prseq.length()-1}, 3, 1);
400 // add cds feature to dna seq - this may include the stop codon
401 for (int xint=0;xint<exon.length; xint+=2) {
402 SequenceFeature sf = new SequenceFeature();
403 sf.setBegin(exon[xint]);
404 sf.setEnd(exon[xint+1]);
405 sf.setType(feature.getName());
406 sf.setFeatureGroup(jalview.datamodel.DBRefSource.EMBL);
407 sf.setDescription("Exon "+(1+xint)+" for protein '"+prname+"' EMBLCDS:"+prid);
408 if (vals!=null && vals.size()>0) {
409 Enumeration kv = vals.elements();
410 while (kv.hasMoreElements()) {
411 Object key=kv.nextElement();
413 sf.setValue(key.toString(), vals.get(key));
416 dna.addSequenceFeature(sf);
419 // add dbRefs to sequence
420 if (feature.dbRefs!=null && feature.dbRefs.size()>0)
422 for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); )
424 DBRefEntry ref = (DBRefEntry)dbr.next();
425 ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref.getSource()));
426 if (ref.getSource().equals(jalview.datamodel.DBRefSource.UNIPROT))
431 DBRefEntry pref = new DBRefEntry(ref.getSource(), ref.getVersion(), ref.getAccessionId());
432 pref.setMap(null); // reference is direct
439 // General feature type.
441 if (feature.dbRefs!=null && feature.dbRefs.size()>0) {
442 for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); dna.addDBRef((DBRefEntry)dbr.next()) )
452 SequenceI[] sqs = new SequenceI[seqs.size()];
453 for (int i=0,j=seqs.size();i<j; i++) {
454 sqs[i] = (SequenceI) seqs.elementAt(i);