package jalview.datamodel.xdb.embl;

import jalview.datamodel.DBRefEntry;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;

import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Vector;

/**
 * Holds the data of a single EMBL entry: accession, version, taxonomic
 * division, description, release/update stamps, keywords, references,
 * database cross-references, features and the sequence itself.
 * <p>
 * All fields are package-private and are also exposed through plain
 * getter/setter pairs. None of the setters validate or copy their argument;
 * the getters return the stored reference directly.
 */
public class EmblEntry
{
  /** Accession of this entry; used by getSequences to build the sequence name and DBRef. */
  String accession;

  /** Version string of this entry. */
  String version;

  /** Taxonomic division of this entry. */
  String taxDivision;

  /** Free-text description; copied onto the nucleotide sequence by getSequences. */
  String desc;

  /** Release in which the entry was created (see getRCreated / setRcreated). */
  String rCreated;

  /** Release in which the entry was last updated. */
  String rLastUpdated;

  /** Last-updated stamp, stored as an uninterpreted string. */
  String lastUpdated;

  /** Keywords of the entry; element type is not visible in this class. */
  Vector keywords;

  /** References of the entry; element type is not visible in this class. */
  Vector refs;

  /** Database cross-references; getSequences casts each element to DBRefEntry. */
  Vector dbRefs;

  /** Features of the entry; getSequences casts each element to EmblFeature. */
  Vector features;

  /** The sequence record of this entry. */
  EmblSequence sequence;

  /**
   * Returns the stored accession.
   *
   * @return the accession
   */
  public String getAccession()
  {
    return accession;
  }

  /**
   * Stores the accession without validation.
   *
   * @param accession the accession to set
   */
  public void setAccession(String accession)
  {
    this.accession = accession;
  }

  /**
   * Returns the stored vector of database cross-references (not a copy).
   * May be null; getSequences null-checks it before iterating.
   *
   * @return the dbRefs
   */
  public Vector getDbRefs()
  {
    return dbRefs;
  }

  /**
   * Stores the given vector of database cross-references by reference.
   *
   * @param dbRefs the dbRefs to set
   */
  public void setDbRefs(Vector dbRefs)
  {
    this.dbRefs = dbRefs;
  }

  /**
   * Returns the stored description.
   *
   * @return the desc
   */
  public String getDesc()
  {
    return desc;
  }

  /**
   * Stores the description without validation.
   *
   * @param desc the desc to set
   */
  public void setDesc(String desc)
  {
    this.desc = desc;
  }

  /**
   * Returns the stored vector of features (not a copy).
   * NOTE(review): getSequences iterates this vector without a null check,
   * so it appears to expect a non-null value - confirm against callers.
   *
   * @return the features
   */
  public Vector getFeatures()
  {
    return features;
  }

  /**
   * Stores the given vector of features by reference.
   *
   * @param features the features to set
   */
  public void setFeatures(Vector features)
  {
    this.features = features;
  }

  /**
   * Returns the stored vector of keywords (not a copy).
   *
   * @return the keywords
   */
  public Vector getKeywords()
  {
    return keywords;
  }

  /**
   * Stores the given vector of keywords by reference.
   *
   * @param keywords the keywords to set
   */
  public void setKeywords(Vector keywords)
  {
    this.keywords = keywords;
  }

  /**
   * Returns the stored last-updated string.
   *
   * @return the lastUpdated
   */
  public String getLastUpdated()
  {
    return lastUpdated;
  }

  /**
   * Stores the last-updated string without parsing it.
   *
   * @param lastUpdated the lastUpdated to set
   */
  public void setLastUpdated(String lastUpdated)
  {
    this.lastUpdated = lastUpdated;
  }

  /**
   * Returns the stored vector of references (not a copy).
   *
   * @return the refs
   */
  public Vector getRefs()
  {
    return refs;
  }

  /**
   * Stores the given vector of references by reference.
   *
   * @param refs the refs to set
   */
  public void setRefs(Vector refs)
  {
    this.refs = refs;
  }

  /**
   * Returns the value of the rCreated field.
   *
   * @return the releaseCreated
   */
  public String getRCreated()
  {
    return rCreated;
  }

  /**
   * Stores the value of the rCreated field.
   * NOTE(review): this setter is spelled setRcreated (lower-case 'c') while
   * the getter is getRCreated; presumably something outside this file binds
   * to the method by name - confirm before renaming.
   *
   * @param releaseCreated the releaseCreated to set
   */
  public void setRcreated(String releaseCreated)
  {
    this.rCreated = releaseCreated;
  }

  /**
   * Returns the value of the rLastUpdated field.
   *
   * @return the releaseLastUpdated
   */
  public String getRLastUpdated()
  {
    return rLastUpdated;
  }

  /**
   * Stores the value of the rLastUpdated field.
   *
   * @param releaseLastUpdated the releaseLastUpdated to set
   */
  public void setRLastUpdated(String releaseLastUpdated)
  {
    this.rLastUpdated = releaseLastUpdated;
  }

  /**
   * Returns the stored sequence record.
   *
   * @return the sequence
   */
  public EmblSequence getSequence()
  {
    return sequence;
  }

  /**
   * Stores the given sequence record by reference.
   *
   * @param sequence the sequence to set
   */
  public void setSequence(EmblSequence sequence)
  {
    this.sequence = sequence;
  }

  /**
   * Returns the stored taxonomic division.
   *
   * @return the taxDivision
   */
  public String getTaxDivision()
  {
    return taxDivision;
  }

  /**
   * Stores the taxonomic division without validation.
   *
   * @param taxDivision the taxDivision to set
   */
  public void setTaxDivision(String taxDivision)
  {
    this.taxDivision = taxDivision;
  }

  /**
   * Returns the stored version string.
   *
   * @return the version
   */
  public String getVersion()
  {
    return version;
  }

  /**
   * Stores the version string without validation.
   *
   * @param version the version to set
   */
  public void setVersion(String version)
  {
    this.version = version;
  }

  /*
   * EMBL Feature support is limited. The text below is included for the benefit of
   * any developer working on improving EMBL feature import in Jalview.
   * Extract from EMBL feature specification
   * see http://www.embl-ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html

   3.5 Location

   3.5.1 Purpose

   The location indicates the region of the presented sequence which corresponds
   to a feature.

   3.5.2 Format and conventions

   The location contains at least one sequence location descriptor and may contain
   one or more operators with one or more sequence location descriptors. Base
   numbers refer to the numbering in the entry. This numbering designates the first
   base (5' end) of the presented sequence as base 1. Base locations beyond the
   range of the presented sequence may not be used in location descriptors, the
   only exception being location in a remote entry (see 3.5.2.1, e).

   Location operators and descriptors are discussed in more detail below.

   3.5.2.1 Location descriptors

   The location descriptor can be one of the following:
   (a) a single base number
   (b) a site between two indicated adjoining bases
   (c) a single base chosen from within a specified range of bases (not allowed
       for new entries)
   (d) the base numbers delimiting a sequence span
   (e) a remote entry identifier followed by a local location descriptor
       (i.e., a-d)

   A site between two adjoining nucleotides, such as an endonucleolytic cleavage
   site, is indicated by listing the two points separated by a caret (^). The
   permitted formats for this descriptor are n^n+1 (for example 55^56), or, for
   circular molecules, n^1, where "n" is the full length of the molecule, i.e.
   1000^1 for a circular molecule with length 1000.

   A single base chosen from a range of bases is indicated by the first base
   number and the last base number of the range separated by a single period
   (e.g., '12.21' indicates a single base taken from between the indicated
   points). From October 2006 the usage of this descriptor is restricted: it is
   illegal to use "a single base from a range" (c) either on its own or in
   combination with the "sequence span" (d) descriptor for newly created entries.
   The existing entries where such descriptors exist are going to be retrofitted.

   Sequence spans are indicated by the starting base number and the ending base
   number separated by two periods (e.g., '34..456'). The '<' and '>' symbols may
   be used with the starting and ending base numbers to indicate that an end point
   is beyond the specified base number. The starting and ending base positions can
   be represented as distinct base numbers ('34..456') or a site between two
   indicated adjoining bases.

   A location in a remote entry (not the entry to which the feature table belongs)
   can be specified by giving the accession-number and sequence version of the
   remote entry, followed by a colon ":", followed by a location descriptor which
   applies to that entry's sequence (i.e.
   J12345.1:1..15, see also examples below)

   3.5.2.2 Operators

   The location operator is a prefix that specifies what must be done to the
   indicated sequence to find or construct the location corresponding to the
   feature. A list of operators is given below with their definitions and most
   common format.

   complement(location)
       Find the complement of the presented sequence in the span specified by
       "location" (i.e., read the complement of the presented strand in its
       5'-to-3' direction)

   join(location,location, ... location)
       The indicated elements should be joined (placed end-to-end) to form one
       contiguous sequence

   order(location,location, ... location)
       The elements can be found in the specified order (5' to 3' direction), but
       nothing is implied about the reasonableness of joining them

   Note: location operator "complement" can be used in combination with either
   "join" or "order" within the same location; combinations of "join" and "order"
   within the same location (nested operators) are illegal.

   3.5.3 Location examples

   The following is a list of common location descriptors with their meanings:

   Location        Description

   467             Points to a single base in the presented sequence

   340..565        Points to a continuous range of bases bounded by and including
                   the starting and ending bases

   <345..500       Indicates that the exact lower boundary point of a feature is
                   unknown.
The location begins at some base previous to the first base specified (which need not be contained in the presented sequence) and continues to and includes the ending base <1..888 The feature starts before the first sequenced base and continues to and includes base 888 1..>888 The feature starts at the first sequenced base and continues beyond base 888 102.110 Indicates that the exact location is unknown but that it is one of the bases between bases 102 and 110, inclusive 123^124 Points to a site between bases 123 and 124 join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to form one contiguous sequence complement(34..126) Start at the base complementary to 126 and finish at the base complementary to base 34 (the feature is on the strand complementary to the presented strand) complement(join(2691..4571,4918..5163)) Joins regions 2691 to 4571 and 4918 to 5163, then complements the joined segments (the feature is on the strand complementary to the presented strand) join(complement(4918..5163),complement(2691..4571)) Complements regions 4918 to 5163 and 2691 to 4571, then joins the complemented segments (the feature is on the strand complementary to the presented strand) J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in this database) with primary accession number 'J00194' join(1..100,J00194.1:100..202) Joins region 1..100 of the existing entry with the region 100..202 of remote entry J00194 */ /** * Recover annotated sequences from EMBL file * @param noNa don't return nucleic acid sequences * @param sourceDb TODO * @param noProtein don't return any translated protein sequences marked in features * @return dataset sequences with DBRefs and features - DNA always comes first */ public jalview.datamodel.SequenceI[] getSequences(boolean noNa, boolean noPeptide, String sourceDb) { Vector seqs=new Vector(); Sequence dna=null; if (!noNa) { dna = new Sequence(sourceDb+"|"+accession, sequence.getSequence()); dna.setDescription(desc); 
dna.addDBRef(new DBRefEntry(sourceDb, version, accession)); // TODO: add mapping for parentAccession attribute // TODO: transform EMBL Database refs to canonical form if (dbRefs!=null) for (Iterator i=dbRefs.iterator(); i.hasNext(); dna.addDBRef((DBRefEntry)i.next())); } for (Iterator i=features.iterator(); i.hasNext(); ) { EmblFeature feature = (EmblFeature) i.next(); if (!noNa) { if (feature.dbRefs!=null && feature.dbRefs.size()>0) { for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); dna.addDBRef((DBRefEntry)dbr.next()) ) ; } } if (feature.getName().equalsIgnoreCase("CDS")) { // extract coding region(s) jalview.datamodel.Mapping map = null; int[] exon=null; if (feature.locations!=null && feature.locations.size()>0) { for (Iterator locs=feature.locations.iterator(); locs.hasNext(); ) { EmblFeatureLocations loc = (EmblFeatureLocations) locs.next(); int[] se = loc.getElementRanges(); if (exon==null) { exon=se; } else { int[] t=new int[exon.length+se.length]; System.arraycopy(exon, 0, t, 0, exon.length); System.arraycopy(se, 0, t, exon.length,se.length); exon=t; } } } String prseq=null; String prname=new String(); String prid=null; Hashtable vals=new Hashtable(); int prstart=1; // get qualifiers if (feature.getQualifiers()!=null && feature.getQualifiers().size()>0) { for (Iterator quals=feature.getQualifiers().iterator(); quals.hasNext(); ) { Qualifier q = (Qualifier) quals.next(); if (q.getName().equals("translation")) { prseq=q.getValues()[0]; } else if (q.getName().equals("protein_id")) { prid=q.getValues()[0]; } else if (q.getName().equals("codon_start")) { prstart = Integer.parseInt(q.getValues()[0]); } else if (q.getName().equals("product")){ prname = q.getValues()[0]; } else { // throw anything else into the additional properties hash vals.put(q.getName(), q.getValues().toString()); } } } Sequence product=null; if (prseq!=null && prname!=null && prid!=null) { // extract proteins. 
if (!noPeptide) { product = new Sequence(sourceDb+"|"+"EMBLCDS|"+prid+"|"+prname, prseq, prstart, prstart+prseq.length()-1); product.setDescription("Protein Product from "+sourceDb); seqs.add(product); } // we have everything - create the mapping and perhaps the protein sequence map = new jalview.datamodel.Mapping(product, exon, new int[] { prstart, prstart+prseq.length()-1}, 3, 1); // add cds feature to dna seq - this may include the stop codon for (int xint=0;xint0) { Enumeration kv = vals.elements(); while (kv.hasMoreElements()) { Object key=kv.nextElement(); if (key!=null) sf.setValue(key.toString(), vals.get(key)); } } dna.addSequenceFeature(sf); } } // add dbRefs to sequence if (feature.dbRefs!=null && feature.dbRefs.size()>0) { for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); ) { DBRefEntry ref = (DBRefEntry)dbr.next(); ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref.getSource())); if (ref.getSource().equals(jalview.datamodel.DBRefSource.UNIPROT)) { ref.setMap(map); } if (product!=null) { DBRefEntry pref = new DBRefEntry(ref.getSource(), ref.getVersion(), ref.getAccessionId()); pref.setMap(null); // reference is direct } dna.addDBRef(ref); } } } else { // General feature type. if (!noNa) { if (feature.dbRefs!=null && feature.dbRefs.size()>0) { for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); dna.addDBRef((DBRefEntry)dbr.next()) ) ; } } } } if (!noNa) { seqs.add(dna); } SequenceI[] sqs = new SequenceI[seqs.size()]; for (int i=0,j=seqs.size();i