From 3f89c3afc646bf1d83607110e538f20f9c01c3b2 Mon Sep 17 00:00:00 2001 From: jprocter Date: Wed, 2 May 2007 15:20:18 +0000 Subject: [PATCH] feature locations are retrieved by associated accession string and public debug report request in case embl parsing fails --- src/jalview/datamodel/xdb/embl/EmblEntry.java | 689 +++++++++++++++---------- 1 file changed, 403 insertions(+), 286 deletions(-) diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index 49d51ad..e6c7197 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -10,447 +10,564 @@ import java.util.Hashtable; import java.util.Iterator; import java.util.Vector; -public class EmblEntry { +public class EmblEntry +{ String accession; + String version; + String taxDivision; + String desc; + String rCreated; + String rLastUpdated; + String lastUpdated; + Vector keywords; + Vector refs; + Vector dbRefs; + Vector features; + EmblSequence sequence; + /** * @return the accession */ - public String getAccession() { + public String getAccession() + { return accession; } + /** - * @param accession the accession to set + * @param accession + * the accession to set */ - public void setAccession(String accession) { + public void setAccession(String accession) + { this.accession = accession; } + /** * @return the dbRefs */ - public Vector getDbRefs() { + public Vector getDbRefs() + { return dbRefs; } + /** - * @param dbRefs the dbRefs to set + * @param dbRefs + * the dbRefs to set */ - public void setDbRefs(Vector dbRefs) { + public void setDbRefs(Vector dbRefs) + { this.dbRefs = dbRefs; } + /** * @return the desc */ - public String getDesc() { + public String getDesc() + { return desc; } + /** - * @param desc the desc to set + * @param desc + * the desc to set */ - public void setDesc(String desc) { + public void setDesc(String desc) + { this.desc = desc; } + /** * @return the features */ - public Vector getFeatures() { + public Vector getFeatures() + { return features; } + /** - * @param features the features to set + * @param features + * the features to set */ - public void setFeatures(Vector features) { + public void setFeatures(Vector features) + { this.features = features; } + /** * @return the keywords */ - public Vector getKeywords() { + public Vector getKeywords() + { return keywords; } + /** - * @param keywords the keywords to set + * @param keywords + * the keywords to set */ - public void setKeywords(Vector keywords) { + public void setKeywords(Vector keywords) + { this.keywords = keywords; } + /** * @return the lastUpdated */ - public String getLastUpdated() { + public String getLastUpdated() + { return lastUpdated; } + /** - * @param lastUpdated the lastUpdated to set + * @param lastUpdated + * the lastUpdated to set */ - public void setLastUpdated(String lastUpdated) { + public void setLastUpdated(String lastUpdated) + { this.lastUpdated = lastUpdated; } + /** * @return the refs */ - public Vector getRefs() { + public Vector getRefs() + { return refs; } + /** - * @param refs the refs to set + * @param refs + * the refs to set */ - public void setRefs(Vector refs) { + public void setRefs(Vector refs) + { this.refs = refs; } + /** * @return the releaseCreated */ - public String getRCreated() { + public String getRCreated() + { return rCreated; } + /** - * @param releaseCreated the releaseCreated to set + * @param releaseCreated + * the releaseCreated to set */ - public void setRcreated(String releaseCreated) { + public void setRcreated(String releaseCreated) + { this.rCreated = releaseCreated; } + /** * @return the releaseLastUpdated */ - public String getRLastUpdated() { + public String getRLastUpdated() + { return rLastUpdated; } + /** - * @param releaseLastUpdated the releaseLastUpdated to set + * @param releaseLastUpdated + * the releaseLastUpdated to set */ - public void setRLastUpdated(String releaseLastUpdated) { + public void setRLastUpdated(String releaseLastUpdated) + { this.rLastUpdated = releaseLastUpdated; } + /** * @return the sequence */ - public EmblSequence getSequence() { + public EmblSequence getSequence() + { return sequence; } + /** - * @param sequence the sequence to set + * @param sequence + * the sequence to set */ - public void setSequence(EmblSequence sequence) { + public void setSequence(EmblSequence sequence) + { this.sequence = sequence; } + /** * @return the taxDivision */ - public String getTaxDivision() { + public String getTaxDivision() + { return taxDivision; } + /** - * @param taxDivision the taxDivision to set + * @param taxDivision + * the taxDivision to set */ - public void setTaxDivision(String taxDivision) { + public void setTaxDivision(String taxDivision) + { this.taxDivision = taxDivision; } + /** * @return the version */ - public String getVersion() { + public String getVersion() + { return version; } + /** - * @param version the version to set + * @param version + * the version to set */ - public void setVersion(String version) { + public void setVersion(String version) + { this.version = version; } -/* - * EMBL Feature support is limited. The text below is included for the benefit of - * any developer working on improving EMBL feature import in Jalview. - * Extract from EMBL feature specification - * see http://www.embl-ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html -3.5 Location -3.5.1 Purpose - -The location indicates the region of the presented sequence which corresponds -to a feature. - -3.5.2 Format and conventions -The location contains at least one sequence location descriptor and may -contain one or more operators with one or more sequence location descriptors. -Base numbers refer to the numbering in the entry. This numbering designates -the first base (5' end) of the presented sequence as base 1. -Base locations beyond the range of the presented sequence may not be used in -location descriptors, the only exception being location in a remote entry (see -3.5.2.1, e). - -Location operators and descriptors are discussed in more detail below. - -3.5.2.1 Location descriptors - -The location descriptor can be one of the following: -(a) a single base number -(b) a site between two indicated adjoining bases -(c) a single base chosen from within a specified range of bases (not allowed for new - entries) -(d) the base numbers delimiting a sequence span -(e) a remote entry identifier followed by a local location descriptor - (i.e., a-d) - -A site between two adjoining nucleotides, such as endonucleolytic cleavage -site, is indicated by listing the two points separated by a carat (^). The -permitted formats for this descriptor are n^n+1 (for example 55^56), or, for -circular molecules, n^1, where "n" is the full length of the molecule, ie -1000^1 for circular molecule with length 1000. - -A single base chosen from a range of bases is indicated by the first base -number and the last base number of the range separated by a single period -(e.g., '12.21' indicates a single base taken from between the indicated -points). From October 2006 the usage of this descriptor is restricted : -it is illegal to use "a single base from a range" (c) either on its own or -in combination with the "sequence span" (d) descriptor for newly created entries. -The existing entries where such descriptors exist are going to be retrofitted. - -Sequence spans are indicated by the starting base number and the ending base -number separated by two periods (e.g., '34..456'). The '<' and '>' symbols may -be used with the starting and ending base numbers to indicate that an end -point is beyond the specified base number. The starting and ending base -positions can be represented as distinct base numbers ('34..456') or a site -between two indicated adjoining bases. - -A location in a remote entry (not the entry to which the feature table -belongs) can be specified by giving the accession-number and sequence version -of the remote entry, followed by a colon ":", followed by a location -descriptor which applies to that entry's sequence (i.e. J12345.1:1..15, see -also examples below) - -3.5.2.2 Operators - -The location operator is a prefix that specifies what must be done to the -indicated sequence to find or construct the location corresponding to the -feature. A list of operators is given below with their definitions and most -common format. - -complement(location) -Find the complement of the presented sequence in the span specified by " -location" (i.e., read the complement of the presented strand in its 5'-to-3' -direction) - -join(location,location, ... location) -The indicated elements should be joined (placed end-to-end) to form one -contiguous sequence - -order(location,location, ... location) -The elements can be found in the -specified order (5' to 3' direction), but nothing is implied about the -reasonableness about joining them - -Note : location operator "complement" can be used in combination with either " -join" or "order" within the same location; combinations of "join" and "order" -within the same location (nested operators) are illegal. - - - -3.5.3 Location examples - -The following is a list of common location descriptors with their meanings: - -Location Description - -467 Points to a single base in the presented sequence - -340..565 Points to a continuous range of bases bounded by and - including the starting and ending bases - -<345..500 Indicates that the exact lower boundary point of a feature - is unknown. The location begins at some base previous to - the first base specified (which need not be contained in - the presented sequence) and continues to and includes the - ending base - -<1..888 The feature starts before the first sequenced base and - continues to and includes base 888 - -1..>888 The feature starts at the first sequenced base and - continues beyond base 888 - -102.110 Indicates that the exact location is unknown but that it is - one of the bases between bases 102 and 110, inclusive - -123^124 Points to a site between bases 123 and 124 - -join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to form - one contiguous sequence - - -complement(34..126) Start at the base complementary to 126 and finish at the - base complementary to base 34 (the feature is on the strand - complementary to the presented strand) - - -complement(join(2691..4571,4918..5163)) - Joins regions 2691 to 4571 and 4918 to 5163, then - complements the joined segments (the feature is on the - strand complementary to the presented strand) -join(complement(4918..5163),complement(2691..4571)) - Complements regions 4918 to 5163 and 2691 to 4571, then - joins the complemented segments (the feature is on the - strand complementary to the presented strand) - -J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in - this database) with primary accession number 'J00194' - -join(1..100,J00194.1:100..202) - Joins region 1..100 of the existing entry with the region - 100..202 of remote entry J00194 - - */ + /* + * EMBL Feature support is limited. The text below is included for the benefit + * of any developer working on improving EMBL feature import in Jalview. + * Extract from EMBL feature specification see + * http://www.embl-ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html + * 3.5 Location 3.5.1 Purpose + * + * The location indicates the region of the presented sequence which + * corresponds to a feature. + * + * 3.5.2 Format and conventions The location contains at least one sequence + * location descriptor and may contain one or more operators with one or more + * sequence location descriptors. Base numbers refer to the numbering in the + * entry. This numbering designates the first base (5' end) of the presented + * sequence as base 1. Base locations beyond the range of the presented + * sequence may not be used in location descriptors, the only exception being + * location in a remote entry (see 3.5.2.1, e). + * + * Location operators and descriptors are discussed in more detail below. + * + * 3.5.2.1 Location descriptors + * + * The location descriptor can be one of the following: (a) a single base + * number (b) a site between two indicated adjoining bases (c) a single base + * chosen from within a specified range of bases (not allowed for new entries) + * (d) the base numbers delimiting a sequence span (e) a remote entry + * identifier followed by a local location descriptor (i.e., a-d) + * + * A site between two adjoining nucleotides, such as endonucleolytic cleavage + * site, is indicated by listing the two points separated by a carat (^). The + * permitted formats for this descriptor are n^n+1 (for example 55^56), or, + * for circular molecules, n^1, where "n" is the full length of the molecule, + * ie 1000^1 for circular molecule with length 1000. + * + * A single base chosen from a range of bases is indicated by the first base + * number and the last base number of the range separated by a single period + * (e.g., '12.21' indicates a single base taken from between the indicated + * points). From October 2006 the usage of this descriptor is restricted : it + * is illegal to use "a single base from a range" (c) either on its own or in + * combination with the "sequence span" (d) descriptor for newly created + * entries. The existing entries where such descriptors exist are going to be + * retrofitted. + * + * Sequence spans are indicated by the starting base number and the ending + * base number separated by two periods (e.g., '34..456'). The '<' and '>' + * symbols may be used with the starting and ending base numbers to indicate + * that an end point is beyond the specified base number. The starting and + * ending base positions can be represented as distinct base numbers + * ('34..456') or a site between two indicated adjoining bases. + * + * A location in a remote entry (not the entry to which the feature table + * belongs) can be specified by giving the accession-number and sequence + * version of the remote entry, followed by a colon ":", followed by a + * location descriptor which applies to that entry's sequence (i.e. + * J12345.1:1..15, see also examples below) + * + * 3.5.2.2 Operators + * + * The location operator is a prefix that specifies what must be done to the + * indicated sequence to find or construct the location corresponding to the + * feature. A list of operators is given below with their definitions and most + * common format. + * + * complement(location) Find the complement of the presented sequence in the + * span specified by " location" (i.e., read the complement of the presented + * strand in its 5'-to-3' direction) + * + * join(location,location, ... location) The indicated elements should be + * joined (placed end-to-end) to form one contiguous sequence + * + * order(location,location, ... location) The elements can be found in the + * specified order (5' to 3' direction), but nothing is implied about the + * reasonableness about joining them + * + * Note : location operator "complement" can be used in combination with + * either " join" or "order" within the same location; combinations of "join" + * and "order" within the same location (nested operators) are illegal. + * + * + * + * 3.5.3 Location examples + * + * The following is a list of common location descriptors with their meanings: + * + * Location Description + * + * 467 Points to a single base in the presented sequence + * + * 340..565 Points to a continuous range of bases bounded by and including the + * starting and ending bases + * + * <345..500 Indicates that the exact lower boundary point of a feature is + * unknown. The location begins at some base previous to the first base + * specified (which need not be contained in the presented sequence) and + * continues to and includes the ending base + * + * <1..888 The feature starts before the first sequenced base and continues to + * and includes base 888 + * + * 1..>888 The feature starts at the first sequenced base and continues beyond + * base 888 + * + * 102.110 Indicates that the exact location is unknown but that it is one of + * the bases between bases 102 and 110, inclusive + * + * 123^124 Points to a site between bases 123 and 124 + * + * join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to + * form one contiguous sequence + * + * + * complement(34..126) Start at the base complementary to 126 and finish at + * the base complementary to base 34 (the feature is on the strand + * complementary to the presented strand) + * + * + * complement(join(2691..4571,4918..5163)) Joins regions 2691 to 4571 and 4918 + * to 5163, then complements the joined segments (the feature is on the strand + * complementary to the presented strand) + * + * join(complement(4918..5163),complement(2691..4571)) Complements regions + * 4918 to 5163 and 2691 to 4571, then joins the complemented segments (the + * feature is on the strand complementary to the presented strand) + * + * J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in + * this database) with primary accession number 'J00194' + * + * join(1..100,J00194.1:100..202) Joins region 1..100 of the existing entry + * with the region 100..202 of remote entry J00194 + * + */ /** * Recover annotated sequences from EMBL file - * @param noNa don't return nucleic acid sequences - * @param sourceDb TODO - * @param noProtein don't return any translated protein sequences marked in features + * + * @param noNa + * don't return nucleic acid sequences + * @param sourceDb + * TODO + * @param noProtein + * don't return any translated protein sequences marked in features * @return dataset sequences with DBRefs and features - DNA always comes first */ - public jalview.datamodel.SequenceI[] getSequences(boolean noNa, boolean noPeptide, String sourceDb) { - Vector seqs=new Vector(); - Sequence dna=null; - if (!noNa) { - dna = new Sequence(sourceDb+"|"+accession, sequence.getSequence()); + public jalview.datamodel.SequenceI[] getSequences(boolean noNa, + boolean noPeptide, String sourceDb) + { + Vector seqs = new Vector(); + Sequence dna = null; + if (!noNa) + { + dna = new Sequence(sourceDb + "|" + accession, sequence.getSequence()); dna.setDescription(desc); dna.addDBRef(new DBRefEntry(sourceDb, version, accession)); // TODO: add mapping for parentAccession attribute // TODO: transform EMBL Database refs to canonical form - if (dbRefs!=null) - for (Iterator i=dbRefs.iterator(); i.hasNext(); dna.addDBRef((DBRefEntry)i.next())); + if (dbRefs != null) + for (Iterator i = dbRefs.iterator(); i.hasNext(); dna + .addDBRef((DBRefEntry) i.next())) + ; } - for (Iterator i=features.iterator(); i.hasNext(); ) { - EmblFeature feature = (EmblFeature) i.next(); - if (!noNa) { - if (feature.dbRefs!=null && feature.dbRefs.size()>0) { - for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); dna.addDBRef((DBRefEntry)dbr.next()) ) - ; + try + { + for (Iterator i = features.iterator(); i.hasNext();) + { + boolean nextFeature=false; + EmblFeature feature = (EmblFeature) i.next(); + if (!noNa) + { + if (feature.dbRefs != null && feature.dbRefs.size() > 0) + { + for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext(); dna + .addDBRef((DBRefEntry) dbr.next())) + ; + } } - } - if (feature.getName().equalsIgnoreCase("CDS")) { - // extract coding region(s) - jalview.datamodel.Mapping map = null; - int[] exon=null; - if (feature.locations!=null && feature.locations.size()>0) { - for (Iterator locs=feature.locations.iterator(); - locs.hasNext(); ) { - EmblFeatureLocations loc = (EmblFeatureLocations) locs.next(); - int[] se = loc.getElementRanges(); - if (exon==null) { - exon=se; - } else { - int[] t=new int[exon.length+se.length]; - System.arraycopy(exon, 0, t, 0, exon.length); - System.arraycopy(se, 0, t, exon.length,se.length); - exon=t; + if (feature.getName().equalsIgnoreCase("CDS")) + { + // extract coding region(s) + jalview.datamodel.Mapping map = null; + int[] exon = null; + if (feature.locations != null && feature.locations.size() > 0) + { + for (Enumeration locs = feature.locations.elements(); locs + .hasMoreElements();) + { + EmblFeatureLocations loc = (EmblFeatureLocations) locs + .nextElement(); + int[] se = loc.getElementRanges(accession); + if (exon == null) + { + exon = se; + } + else + { + int[] t = new int[exon.length + se.length]; + System.arraycopy(exon, 0, t, 0, exon.length); + System.arraycopy(se, 0, t, exon.length, se.length); + exon = t; + } } } - } - String prseq=null; - String prname=new String(); - String prid=null; - Hashtable vals=new Hashtable(); - int prstart=1; - // get qualifiers - if (feature.getQualifiers()!=null && feature.getQualifiers().size()>0) { - for (Iterator quals=feature.getQualifiers().iterator(); quals.hasNext(); ) { - Qualifier q = (Qualifier) quals.next(); - if (q.getName().equals("translation")) + String prseq = null; + String prname = new String(); + String prid = null; + Hashtable vals = new Hashtable(); + int prstart = 1; + // get qualifiers + if (feature.getQualifiers() != null + && feature.getQualifiers().size() > 0) + { + for (Iterator quals = feature.getQualifiers().iterator(); quals + .hasNext();) { - prseq=q.getValues()[0]; - } - else - if (q.getName().equals("protein_id")) + Qualifier q = (Qualifier) quals.next(); + if (q.getName().equals("translation")) + { + prseq = q.getValues()[0]; + } + else if (q.getName().equals("protein_id")) + { + prid = q.getValues()[0]; + } + else if (q.getName().equals("codon_start")) { - prid=q.getValues()[0]; + prstart = Integer.parseInt(q.getValues()[0]); + } + else if (q.getName().equals("product")) + { + prname = q.getValues()[0]; } else - if (q.getName().equals("codon_start")) + { + // throw anything else into the additional properties hash + vals.put(q.getName(), q.getValues().toString()); + } + } + } + Sequence product = null; + if (prseq != null && prname != null && prid != null) + { + // extract proteins. + if (!noPeptide) + { + product = new Sequence(sourceDb + "|" + "EMBLCDS|" + prid + + "|" + prname, prseq, prstart, prstart + + prseq.length() - 1); + product.setDescription("Protein Product from " + sourceDb); + seqs.add(product); + } + // we have everything - create the mapping and perhaps the protein + // sequence + map = new jalview.datamodel.Mapping(product, exon, new int[] + { prstart, prstart + prseq.length() - 1 }, 3, 1); + // add cds feature to dna seq - this may include the stop codon + for (int xint = 0; xint < exon.length; xint += 2) + { + SequenceFeature sf = new SequenceFeature(); + sf.setBegin(exon[xint]); + sf.setEnd(exon[xint + 1]); + sf.setType(feature.getName()); + sf.setFeatureGroup(jalview.datamodel.DBRefSource.EMBL); + sf.setDescription("Exon " + (1 + xint) + " for protein '" + + prname + "' EMBLCDS:" + prid); + if (vals != null && vals.size() > 0) + { + Enumeration kv = vals.elements(); + while (kv.hasMoreElements()) { - prstart = Integer.parseInt(q.getValues()[0]); - } - else - if (q.getName().equals("product")){ - prname = q.getValues()[0]; - } else { - // throw anything else into the additional properties hash - vals.put(q.getName(), q.getValues().toString()); + Object key = kv.nextElement(); + if (key != null) + sf.setValue(key.toString(), vals.get(key)); } + } + dna.addSequenceFeature(sf); + } } - } - Sequence product=null; - if (prseq!=null && prname!=null && prid!=null) { - // extract proteins. - if (!noPeptide) { - product = new Sequence(sourceDb+"|"+"EMBLCDS|"+prid+"|"+prname, prseq, prstart, prstart+prseq.length()-1); - product.setDescription("Protein Product from "+sourceDb); - seqs.add(product); - } - // we have everything - create the mapping and perhaps the protein sequence - map = new jalview.datamodel.Mapping(product, exon, new int[] { prstart, prstart+prseq.length()-1}, 3, 1); - // add cds feature to dna seq - this may include the stop codon - for (int xint=0;xint0) { - Enumeration kv = vals.elements(); - while (kv.hasMoreElements()) { - Object key=kv.nextElement(); - if (key!=null) - sf.setValue(key.toString(), vals.get(key)); + // add dbRefs to sequence + if (feature.dbRefs != null && feature.dbRefs.size() > 0) + { + for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext();) + { + DBRefEntry ref = (DBRefEntry) dbr.next(); + ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref + .getSource())); + if (ref.getSource().equals( + jalview.datamodel.DBRefSource.UNIPROT)) + { + ref.setMap(map); + } + if (product != null) + { + DBRefEntry pref = new DBRefEntry(ref.getSource(), ref + .getVersion(), ref.getAccessionId()); + pref.setMap(null); // reference is direct } + dna.addDBRef(ref); } - dna.addSequenceFeature(sf); } + } - // add dbRefs to sequence - if (feature.dbRefs!=null && feature.dbRefs.size()>0) + else { - for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); ) + // General feature type. + if (!noNa) { - DBRefEntry ref = (DBRefEntry)dbr.next(); - ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref.getSource())); - if (ref.getSource().equals(jalview.datamodel.DBRefSource.UNIPROT)) + if (feature.dbRefs != null && feature.dbRefs.size() > 0) { - ref.setMap(map); - } - if (product!=null) { - DBRefEntry pref = new DBRefEntry(ref.getSource(), ref.getVersion(), ref.getAccessionId()); - pref.setMap(null); // reference is direct + for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext(); dna + .addDBRef((DBRefEntry) dbr.next())) + ; } - dna.addDBRef(ref); - } - } - - } else { - // General feature type. - if (!noNa) { - if (feature.dbRefs!=null && feature.dbRefs.size()>0) { - for (Iterator dbr=feature.dbRefs.iterator(); dbr.hasNext(); dna.addDBRef((DBRefEntry)dbr.next()) ) - ; } } } - + } catch (Exception e) + { + System.err.println("EMBL Record Features parsing error!"); + System.err.println("Please report the following to help@jalview.org :"); + System.err.println("EMBL Record "+accession); + System.err.println("Resulted in exception: "+e.getMessage()); + e.printStackTrace(System.err); } - if (!noNa) { + if (!noNa && dna!=null) + { seqs.add(dna); } SequenceI[] sqs = new SequenceI[seqs.size()]; - for (int i=0,j=seqs.size();i