1 package jalview.datamodel.xdb.embl;
\r
3 import jalview.datamodel.DBRefEntry;
\r
4 import jalview.datamodel.Sequence;
\r
5 import jalview.datamodel.SequenceFeature;
\r
6 import jalview.datamodel.SequenceI;
\r
8 import java.util.Enumeration;
\r
9 import java.util.Hashtable;
\r
10 import java.util.Iterator;
\r
11 import java.util.Vector;
\r
13 public class EmblEntry {
\r
19 String rLastUpdated;
\r
25 EmblSequence sequence;
\r
27 * @return the accession
\r
29 public String getAccession() {
\r
33 * @param accession the accession to set
\r
35 public void setAccession(String accession) {
\r
36 this.accession = accession;
\r
39 * @return the dbRefs
\r
41 public Vector getDbRefs() {
\r
45 * @param dbRefs the dbRefs to set
\r
47 public void setDbRefs(Vector dbRefs) {
\r
48 this.dbRefs = dbRefs;
\r
53 public String getDesc() {
\r
57 * @param desc the desc to set
\r
59 public void setDesc(String desc) {
\r
63 * @return the features
\r
65 public Vector getFeatures() {
\r
69 * @param features the features to set
\r
71 public void setFeatures(Vector features) {
\r
72 this.features = features;
\r
75 * @return the keywords
\r
77 public Vector getKeywords() {
\r
81 * @param keywords the keywords to set
\r
83 public void setKeywords(Vector keywords) {
\r
84 this.keywords = keywords;
\r
87 * @return the lastUpdated
\r
89 public String getLastUpdated() {
\r
93 * @param lastUpdated the lastUpdated to set
\r
95 public void setLastUpdated(String lastUpdated) {
\r
96 this.lastUpdated = lastUpdated;
\r
101 public Vector getRefs() {
\r
105 * @param refs the refs to set
\r
107 public void setRefs(Vector refs) {
\r
111 * @return the releaseCreated
\r
113 public String getRCreated() {
\r
117 * @param releaseCreated the releaseCreated to set
\r
119 public void setRcreated(String releaseCreated) {
\r
120 this.rCreated = releaseCreated;
\r
123 * @return the releaseLastUpdated
\r
125 public String getRLastUpdated() {
\r
126 return rLastUpdated;
\r
129 * @param releaseLastUpdated the releaseLastUpdated to set
\r
131 public void setRLastUpdated(String releaseLastUpdated) {
\r
132 this.rLastUpdated = releaseLastUpdated;
\r
135 * @return the sequence
\r
137 public EmblSequence getSequence() {
\r
141 * @param sequence the sequence to set
\r
143 public void setSequence(EmblSequence sequence) {
\r
144 this.sequence = sequence;
\r
147 * @return the taxDivision
\r
149 public String getTaxDivision() {
\r
150 return taxDivision;
\r
153 * @param taxDivision the taxDivision to set
\r
155 public void setTaxDivision(String taxDivision) {
\r
156 this.taxDivision = taxDivision;
\r
159 * @return the version
\r
161 public String getVersion() {
\r
165 * @param version the version to set
\r
167 public void setVersion(String version) {
\r
168 this.version = version;
\r
171 * EMBL Feature support is limited. The text below is included for the benefit of
\r
172 * any developer working on improving EMBL feature import in Jalview.
\r
173 * Extract from EMBL feature specification
\r
174 * see http://www.embl-ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
\r
178 The location indicates the region of the presented sequence which corresponds
\r
181 3.5.2 Format and conventions
\r
182 The location contains at least one sequence location descriptor and may
\r
183 contain one or more operators with one or more sequence location descriptors.
\r
184 Base numbers refer to the numbering in the entry. This numbering designates
\r
185 the first base (5' end) of the presented sequence as base 1.
\r
186 Base locations beyond the range of the presented sequence may not be used in
\r
187 location descriptors, the only exception being location in a remote entry (see
\r
190 Location operators and descriptors are discussed in more detail below.
\r
192 3.5.2.1 Location descriptors
\r
194 The location descriptor can be one of the following:
\r
195 (a) a single base number
\r
196 (b) a site between two indicated adjoining bases
\r
197 (c) a single base chosen from within a specified range of bases (not allowed for new
\r
199 (d) the base numbers delimiting a sequence span
\r
200 (e) a remote entry identifier followed by a local location descriptor
\r
203 A site between two adjoining nucleotides, such as endonucleolytic cleavage
\r
204 site, is indicated by listing the two points separated by a caret (^). The
\r
205 permitted formats for this descriptor are n^n+1 (for example 55^56), or, for
\r
206 circular molecules, n^1, where "n" is the full length of the molecule, i.e.
\r
207 1000^1 for circular molecule with length 1000.
\r
209 A single base chosen from a range of bases is indicated by the first base
\r
210 number and the last base number of the range separated by a single period
\r
211 (e.g., '12.21' indicates a single base taken from between the indicated
\r
212 points). From October 2006 the usage of this descriptor is restricted :
\r
213 it is illegal to use "a single base from a range" (c) either on its own or
\r
214 in combination with the "sequence span" (d) descriptor for newly created entries.
\r
215 The existing entries where such descriptors exist are going to be retrofitted.
\r
217 Sequence spans are indicated by the starting base number and the ending base
\r
218 number separated by two periods (e.g., '34..456'). The '<' and '>' symbols may
\r
219 be used with the starting and ending base numbers to indicate that an end
\r
220 point is beyond the specified base number. The starting and ending base
\r
221 positions can be represented as distinct base numbers ('34..456') or a site
\r
222 between two indicated adjoining bases.
\r
224 A location in a remote entry (not the entry to which the feature table
\r
225 belongs) can be specified by giving the accession-number and sequence version
\r
226 of the remote entry, followed by a colon ":", followed by a location
\r
227 descriptor which applies to that entry's sequence (i.e. J12345.1:1..15, see
\r
228 also examples below)
\r
232 The location operator is a prefix that specifies what must be done to the
\r
233 indicated sequence to find or construct the location corresponding to the
\r
234 feature. A list of operators is given below with their definitions and most
\r
237 complement(location)
\r
238 Find the complement of the presented sequence in the span specified by "
\r
239 location" (i.e., read the complement of the presented strand in its 5'-to-3'
\r
242 join(location,location, ... location)
\r
243 The indicated elements should be joined (placed end-to-end) to form one
\r
244 contiguous sequence
\r
246 order(location,location, ... location)
\r
247 The elements can be found in the
\r
248 specified order (5' to 3' direction), but nothing is implied about the
\r
249 reasonableness of joining them
\r
251 Note : location operator "complement" can be used in combination with either "
\r
252 join" or "order" within the same location; combinations of "join" and "order"
\r
253 within the same location (nested operators) are illegal.
\r
257 3.5.3 Location examples
\r
259 The following is a list of common location descriptors with their meanings:
\r
261 Location Description
\r
263 467 Points to a single base in the presented sequence
\r
265 340..565 Points to a continuous range of bases bounded by and
\r
266 including the starting and ending bases
\r
268 <345..500 Indicates that the exact lower boundary point of a feature
\r
269 is unknown. The location begins at some base previous to
\r
270 the first base specified (which need not be contained in
\r
271 the presented sequence) and continues to and includes the
\r
274 <1..888 The feature starts before the first sequenced base and
\r
275 continues to and includes base 888
\r
277 1..>888 The feature starts at the first sequenced base and
\r
278 continues beyond base 888
\r
280 102.110 Indicates that the exact location is unknown but that it is
\r
281 one of the bases between bases 102 and 110, inclusive
\r
283 123^124 Points to a site between bases 123 and 124
\r
285 join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to form
\r
286 one contiguous sequence
\r
289 complement(34..126) Start at the base complementary to 126 and finish at the
\r
290 base complementary to base 34 (the feature is on the strand
\r
291 complementary to the presented strand)
\r
294 complement(join(2691..4571,4918..5163))
\r
295 Joins regions 2691 to 4571 and 4918 to 5163, then
\r
296 complements the joined segments (the feature is on the
\r
297 strand complementary to the presented strand)
\r
299 join(complement(4918..5163),complement(2691..4571))
\r
300 Complements regions 4918 to 5163 and 2691 to 4571, then
\r
301 joins the complemented segments (the feature is on the
\r
302 strand complementary to the presented strand)
\r
304 J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in
\r
305 this database) with primary accession number 'J00194'
\r
307 join(1..100,J00194.1:100..202)
\r
308 Joins region 1..100 of the existing entry with the region
\r
309 100..202 of remote entry J00194
\r
313 * Recover annotated sequences from EMBL file
\r
314 * @param noNa don't return nucleic acid sequences
\r
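315 * @param sourceDb name of the source database; used as the prefix of generated sequence names and as the source of the DBRef created for this entry's accession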
\r
316 * @param noPeptide don't return any translated protein sequences marked in features
\r
317 * @return dataset sequences with DBRefs and features - DNA always comes first
\r
319 public jalview.datamodel.SequenceI[] getSequences(boolean noNa, boolean noPeptide, String sourceDb) {
\r
320 Vector seqs=new Vector();
\r
323 dna = new Sequence(sourceDb+"|"+accession, sequence.getSequence());
\r
324 dna.setDescription(desc);
\r
325 dna.addDBRef(new DBRefEntry(sourceDb, version, accession));
\r
326 // TODO: add mapping for parentAccession attribute
\r
327 // TODO: transform EMBL Database refs to canonical form
\r
329 for (Iterator i=dbRefs.iterator(); i.hasNext(); dna.addDBRef((DBRefEntry)i.next()));
\r
331 for (Iterator i=features.iterator(); i.hasNext(); ) {
\r
332 EmblFeature feature = (EmblFeature) i.next();
\r
334 if (feature.dbRefs!=null && feature.dbRefs.size()>0) {
\r
335 for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); dna.addDBRef((DBRefEntry)dbr.next()) )
\r
339 if (feature.getName().equalsIgnoreCase("CDS")) {
\r
340 // extract coding region(s)
\r
341 jalview.datamodel.Mapping map = null;
\r
343 if (feature.locations!=null && feature.locations.size()>0) {
\r
344 for (Iterator locs=feature.locations.iterator();
\r
345 locs.hasNext(); ) {
\r
346 EmblFeatureLocations loc = (EmblFeatureLocations) locs.next();
\r
347 int[] se = loc.getElementRanges();
\r
351 int[] t=new int[exon.length+se.length];
\r
352 System.arraycopy(exon, 0, t, 0, exon.length);
\r
353 System.arraycopy(se, 0, t, exon.length,se.length);
\r
359 String prname=new String();
\r
361 Hashtable vals=new Hashtable();
\r
364 if (feature.getQualifiers()!=null && feature.getQualifiers().size()>0) {
\r
365 for (Iterator quals=feature.getQualifiers().iterator(); quals.hasNext(); ) {
\r
366 Qualifier q = (Qualifier) quals.next();
\r
367 if (q.getName().equals("translation"))
\r
369 prseq=q.getValue();
\r
372 if (q.getName().equals("protein_id"))
\r
377 if (q.getName().equals("codon_start"))
\r
379 prstart = Integer.parseInt(q.getValue());
\r
382 if (q.getName().equals("product")){
\r
383 prname = q.getValue();
\r
385 // throw anything else into the additional properties hash
\r
386 vals.put(q.getName(), q.getValue());
\r
390 Sequence product=null;
\r
391 if (prseq!=null && prname!=null && prid!=null) {
\r
392 // extract proteins.
\r
394 product = new Sequence(sourceDb+"|"+"EMBLCDS|"+prid+"|"+prname, prseq, prstart, prstart+prseq.length()-1);
\r
395 product.setDescription("Protein Product from "+sourceDb);
\r
398 // we have everything - create the mapping and perhaps the protein sequence
\r
399 map = new jalview.datamodel.Mapping(product, exon, new int[] { prstart, prstart+prseq.length()-1}, 3, 1);
\r
400 // add cds feature to dna seq - this may include the stop codon
\r
401 for (int xint=0;xint<exon.length; xint+=2) {
\r
402 SequenceFeature sf = new SequenceFeature();
\r
403 sf.setBegin(exon[xint]);
\r
404 sf.setEnd(exon[xint+1]);
\r
405 sf.setType(feature.getName());
\r
406 sf.setFeatureGroup(jalview.datamodel.DBRefSource.EMBL);
\r
407 sf.setDescription("Exon "+(1+xint)+" for protein '"+prname+"' EMBLCDS:"+prid);
\r
408 if (vals!=null && vals.size()>0) {
\r
409 Enumeration kv = vals.elements();
\r
410 while (kv.hasMoreElements()) {
\r
411 Object key=kv.nextElement();
\r
413 sf.setValue(key.toString(), vals.get(key));
\r
416 dna.addSequenceFeature(sf);
\r
419 // add dbRefs to sequence
\r
420 if (feature.dbRefs!=null && feature.dbRefs.size()>0)
\r
422 for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); )
\r
424 DBRefEntry ref = (DBRefEntry)dbr.next();
\r
425 ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref.getSource()));
\r
426 if (ref.getSource().equals(jalview.datamodel.DBRefSource.UNIPROT))
\r
430 if (product!=null) {
\r
431 DBRefEntry pref = new DBRefEntry(ref.getSource(), ref.getVersion(), ref.getAccessionId());
\r
432 pref.setMap(null); // reference is direct
\r
439 // General feature type.
\r
441 if (feature.dbRefs!=null && feature.dbRefs.size()>0) {
\r
442 for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); dna.addDBRef((DBRefEntry)dbr.next()) )
\r
452 SequenceI[] sqs = new SequenceI[seqs.size()];
\r
453 for (int i=0,j=seqs.size();i<j; i++) {
\r
454 sqs[i] = (SequenceI) seqs.elementAt(i);
\r