X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2Fxdb%2Fembl%2FEmblEntry.java;h=27aee2412eb9f6560621d01ef44b0f2f855c210b;hb=838e4f91d4a53dd315640dbc9ff6ef7a815ee576;hp=a1c4c3c4dc4a332fe21a954b65b4928c634419af;hpb=202c28a9c7cdcb1ffe878627bf8d2d3f98fafbb6;p=jalview.git diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index a1c4c3c..27aee24 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -1,3 +1,23 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer (Version 2.9.0b1) + * Copyright (C) 2015 The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.datamodel.xdb.embl; import jalview.datamodel.DBRefEntry; @@ -8,11 +28,18 @@ import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import java.util.Enumeration; import java.util.Hashtable; -import java.util.Iterator; +import java.util.Map.Entry; import java.util.Vector; +/** + * Data model for one entry returned from an EMBL query, as marshalled by a + * Castor binding file + * + * For example: http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/embl/x53828/emblxml + * + * @see embl_mapping.xml + */ public class EmblEntry { String accession; @@ -29,13 +56,11 @@ public class EmblEntry String lastUpdated; - Vector keywords; + Vector keywords; - Vector refs; + Vector dbRefs; - Vector dbRefs; - - Vector features; + Vector features; EmblSequence sequence; @@ -59,7 +84,7 @@ public class EmblEntry /** * @return the dbRefs */ - public Vector getDbRefs() + public Vector getDbRefs() { return dbRefs; } @@ -68,7 +93,7 @@ public class EmblEntry * @param dbRefs * the dbRefs to set */ - public void setDbRefs(Vector dbRefs) + public void setDbRefs(Vector dbRefs) { this.dbRefs = dbRefs; } @@ -93,7 +118,7 @@ public class EmblEntry /** * @return the features */ - public Vector getFeatures() + public Vector getFeatures() { return features; } @@ -102,7 +127,7 @@ public class EmblEntry * @param features * the features to set */ - public void setFeatures(Vector features) + public void setFeatures(Vector features) { this.features = features; } @@ -110,7 +135,7 @@ public class EmblEntry /** * @return the keywords */ - public Vector getKeywords() + public Vector getKeywords() { return keywords; } @@ -119,7 +144,7 @@ public class EmblEntry * @param keywords * the keywords to set */ - public void setKeywords(Vector keywords) + public void setKeywords(Vector keywords) { this.keywords = keywords; } @@ -142,23 +167,6 @@ public class EmblEntry } /** - * @return the refs - */ - public Vector getRefs() - { - return refs; - } - - /** - * @param refs - * the refs to set - */ - public void setRefs(Vector refs) - { - this.refs = refs; - } - - /** * @return the releaseCreated */ public String getRCreated() @@ -170,7 +178,7 @@ public class EmblEntry * @param releaseCreated * the releaseCreated to set */ - public void setRcreated(String releaseCreated) + public void setRCreated(String releaseCreated) { this.rCreated = releaseCreated; } @@ -247,8 +255,8 @@ public class EmblEntry * EMBL Feature support is limited. The text below is included for the benefit * of any developer working on improving EMBL feature import in Jalview. * Extract from EMBL feature specification see - * http://www.embl-ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html - * 3.5 Location 3.5.1 Purpose + * http://www.embl-ebi.ac.uk/embl/Documentation + * /FT_definitions/feature_table.html 3.5 Location 3.5.1 Purpose * * The location indicates the region of the presented sequence which * corresponds to a feature. @@ -372,7 +380,6 @@ public class EmblEntry * * join(1..100,J00194.1:100..202) Joins region 1..100 of the existing entry * with the region 100..202 of remote entry J00194 - * */ /** * Recover annotated sequences from EMBL file @@ -387,33 +394,44 @@ public class EmblEntry */ public jalview.datamodel.SequenceI[] getSequences(boolean noNa, boolean noPeptide, String sourceDb) - { - Vector seqs = new Vector(); + { // TODO: ensure emblEntry.getSequences behaves correctly for returning all + // cases of noNa and noPeptide + Vector seqs = new Vector(); Sequence dna = null; if (!noNa) { + // In theory we still need to create this if noNa is set to avoid a null + // pointer exception dna = new Sequence(sourceDb + "|" + accession, sequence.getSequence()); dna.setDescription(desc); - dna.addDBRef(new DBRefEntry(sourceDb, version, accession)); - // TODO: add mapping for parentAccession attribute + DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession); + dna.addDBRef(retrievedref); + // add map to indicate the sequence is a valid coordinate frame for the + // dbref + retrievedref.setMap(new Mapping(null, + new int[] { 1, dna.getLength() }, new int[] { 1, + dna.getLength() }, 1, 1)); // TODO: transform EMBL Database refs to canonical form if (dbRefs != null) - for (Iterator i = dbRefs.iterator(); i.hasNext(); dna - .addDBRef((DBRefEntry) i.next())) - ; + { + for (DBRefEntry dbref : dbRefs) + { + dna.addDBRef(dbref); + } + } } try { - for (Iterator i = features.iterator(); i.hasNext();) + for (EmblFeature feature : features) { - EmblFeature feature = (EmblFeature) i.next(); if (!noNa) { - if (feature.dbRefs != null && feature.dbRefs.size() > 0) + if (feature.dbRefs != null) { - for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext(); dna - .addDBRef((DBRefEntry) dbr.next())) - ; + for (DBRefEntry dbref : feature.dbRefs) + { + dna.addDBRef(dbref); + } } } if (FeatureProperties.isCodingFeature(sourceDb, feature.getName())) @@ -423,19 +441,20 @@ public class EmblEntry else { // General feature type. + // TODO this is just duplicated code ?? if (!noNa) { - if (feature.dbRefs != null && feature.dbRefs.size() > 0) + if (feature.dbRefs != null) { - for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext(); dna - .addDBRef((DBRefEntry) dbr.next())) - ; + for (DBRefEntry dbref : feature.dbRefs) + { + dna.addDBRef(dbref); + } } } } } - } - catch (Exception e) + } catch (Exception e) { System.err.println("EMBL Record Features parsing error!"); System.err @@ -451,33 +470,38 @@ public class EmblEntry SequenceI[] sqs = new SequenceI[seqs.size()]; for (int i = 0, j = seqs.size(); i < j; i++) { - sqs[i] = (SequenceI) seqs.elementAt(i); + sqs[i] = seqs.elementAt(i); seqs.set(i, null); } return sqs; } /** - * attempt to extract coding region and product from a feature and properly decorate it with annotations. - * @param feature coding feature - * @param sourceDb source database for the EMBLXML - * @param seqs place where sequences go - * @param dna parent dna sequence for this record - * @param noPeptide flag for generation of Peptide sequence objects + * attempt to extract coding region and product from a feature and properly + * decorate it with annotations. + * + * @param feature + * coding feature + * @param sourceDb + * source database for the EMBLXML + * @param seqs + * place where sequences go + * @param dna + * parent dna sequence for this record + * @param noPeptide + * flag for generation of Peptide sequence objects */ - private void parseCodingFeature(EmblFeature feature, String sourceDb, Vector seqs, Sequence dna, boolean noPeptide) + private void parseCodingFeature(EmblFeature feature, String sourceDb, + Vector seqs, Sequence dna, boolean noPeptide) { boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); // extract coding region(s) jalview.datamodel.Mapping map = null; int[] exon = null; - if (feature.locations != null && feature.locations.size() > 0) + if (feature.locations != null) { - for (Enumeration locs = feature.locations.elements(); locs - .hasMoreElements();) + for (EmblFeatureLocations loc : feature.locations) { - EmblFeatureLocations loc = (EmblFeatureLocations) locs - .nextElement(); int[] se = loc.getElementRanges(accession); if (exon == null) { @@ -495,19 +519,17 @@ public class EmblEntry String prseq = null; String prname = new String(); String prid = null; - Hashtable vals = new Hashtable(); + Hashtable vals = new Hashtable(); int prstart = 1; // get qualifiers - if (feature.getQualifiers() != null - && feature.getQualifiers().size() > 0) + if (feature.getQualifiers() != null) { - for (Iterator quals = feature.getQualifiers().iterator(); quals - .hasNext();) + for (Qualifier q : feature.getQualifiers()) { - Qualifier q = (Qualifier) quals.next(); - if (q.getName().equals("translation")) + String qname = q.getName(); + if (qname.equals("translation")) { - StringBuffer prsq = new StringBuffer(q.getValues()[0]); + StringBuilder prsq = new StringBuilder(q.getValues()[0]); int p = prsq.indexOf(" "); while (p > -1) { @@ -518,34 +540,47 @@ public class EmblEntry prsq = null; } - else if (q.getName().equals("protein_id")) + else if (qname.equals("protein_id")) { prid = q.getValues()[0]; } - else if (q.getName().equals("codon_start")) + else if (qname.equals("codon_start")) { prstart = Integer.parseInt(q.getValues()[0]); } - else if (q.getName().equals("product")) + else if (qname.equals("product")) { prname = q.getValues()[0]; } else { // throw anything else into the additional properties hash - vals.put(q.getName(), q.getValues().toString()); + String[] s = q.getValues(); + StringBuilder sb = new StringBuilder(); + if (s != null) + { + for (int i = 0; i < s.length; i++) + { + sb.append(s[i]); + sb.append("\n"); + } + } + vals.put(qname, sb.toString()); } } } Sequence product = null; + DBRefEntry protEMBLCDS = null; + exon = adjustForPrStart(prstart, exon); + boolean noProteinDbref = true; + if (prseq != null && prname != null && prid != null) { // extract proteins. - product = new Sequence(sourceDb + "|" + "EMBLCDS|" + prid - +((prname.length()==0) ? "" : " " + prname), prseq, prstart, prstart - + prseq.length() - 1); - product.setDescription("Protein Product from " + sourceDb); - + product = new Sequence(prid, prseq, 1, prseq.length()); + product.setDescription(((prname.length() == 0) ? "Protein Product from " + + sourceDb + : prname)); if (!noPeptide) { // Protein is also added to vector of sequences returned @@ -558,50 +593,69 @@ public class EmblEntry System.err .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + sourceDb + ":" + getAccession() + ")"); - if (prseq.length() * 3 == dna.getSequence().length) + if (prseq.length() * 3 == (1 - prstart + dna.getSequence().length)) { + System.err + .println("Not allowing for additional stop codon at end of cDNA fragment... !"); // this might occur for CDS sequences where no features are // marked. - exon = new int[] - { dna.getStart(), dna.getEnd() }; - map = new jalview.datamodel.Mapping(product, exon, - new int[] - { prstart, prstart + prseq.length() - 1 }, 3, 1); + exon = new int[] { dna.getStart() + (prstart - 1), dna.getEnd() }; + map = new jalview.datamodel.Mapping(product, exon, new int[] { 1, + prseq.length() }, 3, 1); } - if ((prseq.length() + 1) * 3 == dna.getSequence().length) + if ((prseq.length() + 1) * 3 == (1 - prstart + dna.getSequence().length)) { - exon = new int[] - { dna.getStart(), dna.getEnd() - 3 }; - map = new jalview.datamodel.Mapping(product, exon, - new int[] - { prstart, prstart + prseq.length() - 1 }, 3, 1); + System.err + .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); + exon = new int[] { dna.getStart() + (prstart - 1), + dna.getEnd() - 3 }; + map = new jalview.datamodel.Mapping(product, exon, new int[] { 1, + prseq.length() }, 3, 1); } } else { + // Trim the exon mapping if necessary - the given product may only be a + // fragment of a larger protein. (EMBL:AY043181 is an example) + if (isEmblCdna) { // TODO: Add a DbRef back to the parent EMBL sequence with the exon // map - + // if given a dataset reference, search dataset for parent EMBL + // sequence if it exists and set its map // make a new feature annotating the coding contig } else { - map = new jalview.datamodel.Mapping(product, exon, - new int[] - { prstart, prstart + prseq.length() - 1 }, 3, 1); + // final product length trunctation check + + map = new jalview.datamodel.Mapping(product, + adjustForProteinLength(prseq.length(), exon), new int[] { + 1, prseq.length() }, 3, 1); // reconstruct the EMBLCDS entry + // TODO: this is only necessary when there codon annotation is + // complete (I think JBPNote) DBRefEntry pcdnaref = new DBRefEntry(); pcdnaref.setAccessionId(prid); pcdnaref.setSource(DBRefSource.EMBLCDS); pcdnaref.setVersion(getVersion()); // same as parent EMBL version. - jalview.util.MapList mp = new jalview.util.MapList(new int[] - { 1+(prstart-1)*3, 1+(prstart-1)*3 + (prseq.length()-1)*3 }, new int[] { prstart, prstart+prseq.length() - 1 }, 3, 1); + jalview.util.MapList mp = new jalview.util.MapList(new int[] { 1, + prseq.length() }, new int[] { 1 + (prstart - 1), + (prstart - 1) + 3 * prseq.length() }, 1, 3); + // { 1 + (prstart - 1) * 3, + // 1 + (prstart - 1) * 3 + prseq.length() * 3 - 1 }, new int[] + // { 1prstart, prstart + prseq.length() - 1 }, 3, 1); pcdnaref.setMap(new Mapping(mp)); - if (product!=null) + if (product != null) + { product.addDBRef(pcdnaref); - + protEMBLCDS = new DBRefEntry(pcdnaref); + protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct); + product.addDBRef(protEMBLCDS); + + } + } } // add cds feature to dna seq - this may include the stop codon @@ -612,55 +666,59 @@ public class EmblEntry sf.setEnd(exon[xint + 1]); sf.setType(feature.getName()); sf.setFeatureGroup(sourceDb); - sf.setDescription("Exon " + (1 + xint) + " for protein '" + sf.setDescription("Exon " + (1 + xint / 2) + " for protein '" + prname + "' EMBLCDS:" + prid); sf.setValue(FeatureProperties.EXONPOS, new Integer(1 + xint)); sf.setValue(FeatureProperties.EXONPRODUCT, prname); - if (vals != null && vals.size() > 0) + if (vals != null) { - Enumeration kv = vals.elements(); - while (kv.hasMoreElements()) + for (Entry val : vals.entrySet()) { - Object key = kv.nextElement(); - if (key != null) - sf.setValue(key.toString(), vals.get(key)); + sf.setValue(val.getKey(), val.getValue()); } } dna.addSequenceFeature(sf); } } // add dbRefs to sequence - if (feature.dbRefs != null && feature.dbRefs.size() > 0) + if (feature.dbRefs != null) { - for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext();) + for (DBRefEntry ref : feature.dbRefs) { - DBRefEntry ref = (DBRefEntry) dbr.next(); ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref .getSource())); // Hard code the kind of protein product accessions that EMBL cite - if (ref.getSource().equals( - jalview.datamodel.DBRefSource.UNIPROT)) + if (ref.getSource().equals(jalview.datamodel.DBRefSource.UNIPROT)) { ref.setMap(map); - if (map!=null && map.getTo()!=null) + if (map != null && map.getTo() != null) { - map.getTo().addDBRef(new DBRefEntry(ref.getSource(), ref.getVersion(), ref.getAccessionId())); // don't copy map over. + map.getTo().addDBRef( + new DBRefEntry(ref.getSource(), ref.getVersion(), ref + .getAccessionId())); // don't copy map over. + if (map.getTo().getName().indexOf(prid) == 0) + { + map.getTo().setName( + jalview.datamodel.DBRefSource.UNIPROT + "|" + + ref.getAccessionId()); + } } + noProteinDbref = false; } if (product != null) { - DBRefEntry pref = new DBRefEntry(ref.getSource(), ref - .getVersion(), ref.getAccessionId()); + DBRefEntry pref = new DBRefEntry(ref.getSource(), + ref.getVersion(), ref.getAccessionId()); pref.setMap(null); // reference is direct product.addDBRef(pref); // Add converse mapping reference if (map != null) { Mapping pmap = new Mapping(dna, map.getMap().getInverse()); - pref = new DBRefEntry(sourceDb, getVersion(), this - .getAccession()); + pref = new DBRefEntry(sourceDb, getVersion(), + this.getAccession()); pref.setMap(pmap); - if (map.getTo()!=null) + if (map.getTo() != null) { map.getTo().addDBRef(pref); } @@ -668,6 +726,120 @@ public class EmblEntry } dna.addDBRef(ref); } + if (noProteinDbref && product != null) + { + // add protein coding reference to dna sequence so xref matches + if (protEMBLCDS == null) + { + protEMBLCDS = new DBRefEntry(); + protEMBLCDS.setAccessionId(prid); + protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct); + protEMBLCDS.setVersion(getVersion()); + protEMBLCDS + .setMap(new Mapping(product, map.getMap().getInverse())); + } + product.addDBRef(protEMBLCDS); + + // Add converse mapping reference + if (map != null) + { + Mapping pmap = new Mapping(product, protEMBLCDS.getMap().getMap() + .getInverse()); + DBRefEntry ncMap = new DBRefEntry(protEMBLCDS); + ncMap.setMap(pmap); + if (map.getTo() != null) + { + dna.addDBRef(ncMap); + } + } + } + } + } + + private int[] adjustForPrStart(int prstart, int[] exon) + { + + int origxon[], sxpos = -1; + int sxstart, sxstop; // unnecessary variables used for debugging + // first adjust range for codon start attribute + if (prstart > 1) + { + origxon = new int[exon.length]; + System.arraycopy(exon, 0, origxon, 0, exon.length); + int cdspos = 0; + for (int x = 0; x < exon.length && sxpos == -1; x += 2) + { + cdspos += exon[x + 1] - exon[x] + 1; + if (prstart <= cdspos) + { + sxpos = x; + sxstart = exon[x]; + sxstop = exon[x + 1]; + // and adjust start boundary of first exon. + exon[x] = exon[x + 1] - cdspos + prstart; + break; + } + } + + if (sxpos > 0) + { + int[] nxon = new int[exon.length - sxpos]; + System.arraycopy(exon, sxpos, nxon, 0, exon.length - sxpos); + exon = nxon; + } + } + return exon; + } + + /** + * truncate the last exon interval to the prlength'th codon + * + * @param prlength + * @param exon + * @return new exon + */ + private int[] adjustForProteinLength(int prlength, int[] exon) + { + + int origxon[], sxpos = -1, endxon = 0, cdslength = prlength * 3; + int sxstart, sxstop; // unnecessary variables used for debugging + // first adjust range for codon start attribute + if (prlength >= 1 && exon != null) + { + origxon = new int[exon.length]; + System.arraycopy(exon, 0, origxon, 0, exon.length); + int cdspos = 0; + for (int x = 0; x < exon.length && sxpos == -1; x += 2) + { + cdspos += exon[x + 1] - exon[x] + 1; + if (cdslength <= cdspos) + { + // advanced beyond last codon. + sxpos = x; + sxstart = exon[x]; + sxstop = exon[x + 1]; + if (cdslength != cdspos) + { + System.err + .println("Truncating final exon interval on region by " + + (cdspos - cdslength)); + } + // locate the new end boundary of final exon as endxon + endxon = exon[x + 1] - cdspos + cdslength; + break; + } + } + + if (sxpos != -1) + { + // and trim the exon interval set if necessary + int[] nxon = new int[sxpos + 2]; + System.arraycopy(exon, 0, nxon, 0, sxpos + 2); + nxon[sxpos + 1] = endxon; // update the end boundary for the new exon + // set + exon = nxon; + } } + return exon; } }