X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2Fxdb%2Fembl%2FEmblEntry.java;h=0cee0b410a2bfc81cd4373ee406d05afc1085fee;hb=506d60f0e188723ddc91c26824b41ac7034df3fe;hp=e6c7197e22c76962063a405e32287194ef00118d;hpb=3f89c3afc646bf1d83607110e538f20f9c01c3b2;p=jalview.git diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index e6c7197..0cee0b4 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -1,6 +1,27 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer (Version 2.4) + * Copyright (C) 2008 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ package jalview.datamodel.xdb.embl; import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.FeatureProperties; +import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -46,7 +67,7 @@ public class EmblEntry /** * @param accession - * the accession to set + * the accession to set */ public void setAccession(String accession) { @@ -63,7 +84,7 @@ public class EmblEntry /** * @param dbRefs - * the dbRefs to set + * the dbRefs to set */ public void setDbRefs(Vector dbRefs) { @@ -80,7 +101,7 @@ public class EmblEntry /** * @param desc - * the desc to set + * the desc to set */ public void setDesc(String desc) { @@ -97,7 +118,7 @@ public class EmblEntry /** * @param features - * the features to set + * the features to set */ public void setFeatures(Vector features) { @@ -114,7 +135,7 @@ public class EmblEntry /** * @param keywords - * the keywords to set + * the keywords to set */ public void setKeywords(Vector keywords) { @@ -131,7 +152,7 @@ public class EmblEntry /** * @param lastUpdated - * the lastUpdated to set + * the lastUpdated to set */ public void setLastUpdated(String lastUpdated) { @@ -148,7 +169,7 @@ public class EmblEntry /** * @param refs - * the refs to set + * the refs to set */ public void setRefs(Vector refs) { @@ -165,7 +186,7 @@ public class EmblEntry /** * @param releaseCreated - * the releaseCreated to set + * the releaseCreated to set */ public void setRcreated(String releaseCreated) { @@ -182,7 +203,7 @@ public class EmblEntry /** * @param releaseLastUpdated - * the releaseLastUpdated to set + * the releaseLastUpdated to set */ public void setRLastUpdated(String releaseLastUpdated) { @@ -199,7 +220,7 @@ public class EmblEntry /** * @param sequence - * the sequence to set + * the sequence to set */ public void setSequence(EmblSequence sequence) { @@ -216,7 +237,7 @@ public class EmblEntry /** * @param taxDivision - * the taxDivision to set + * the taxDivision to set */ public void setTaxDivision(String taxDivision) { @@ -233,7 +254,7 @@ public class EmblEntry /** * @param version - * the version to set + * the version to set */ public void setVersion(String version) { @@ -375,20 +396,24 @@ public class EmblEntry * Recover annotated sequences from EMBL file * * @param noNa - * don't return nucleic acid sequences + * don't return nucleic acid sequences * @param sourceDb - * TODO + * TODO * @param noProtein - * don't return any translated protein sequences marked in features + * don't return any translated protein sequences marked in + * features * @return dataset sequences with DBRefs and features - DNA always comes first */ public jalview.datamodel.SequenceI[] getSequences(boolean noNa, boolean noPeptide, String sourceDb) - { + { // TODO: ensure emblEntry.getSequences behaves correctly for returning all + // cases of noNa and noPeptide Vector seqs = new Vector(); Sequence dna = null; if (!noNa) { + // In theory we still need to create this if noNa is set to avoid a null + // pointer exception dna = new Sequence(sourceDb + "|" + accession, sequence.getSequence()); dna.setDescription(desc); dna.addDBRef(new DBRefEntry(sourceDb, version, accession)); @@ -403,7 +428,6 @@ public class EmblEntry { for (Iterator i = features.iterator(); i.hasNext();) { - boolean nextFeature=false; EmblFeature feature = (EmblFeature) i.next(); if (!noNa) { @@ -414,130 +438,9 @@ public class EmblEntry ; } } - if (feature.getName().equalsIgnoreCase("CDS")) + if (FeatureProperties.isCodingFeature(sourceDb, feature.getName())) { - // extract coding region(s) - jalview.datamodel.Mapping map = null; - int[] exon = null; - if (feature.locations != null && feature.locations.size() > 0) - { - for (Enumeration locs = feature.locations.elements(); locs - .hasMoreElements();) - { - EmblFeatureLocations loc = (EmblFeatureLocations) locs - .nextElement(); - int[] se = loc.getElementRanges(accession); - if (exon == null) - { - exon = se; - } - else - { - int[] t = new int[exon.length + se.length]; - System.arraycopy(exon, 0, t, 0, exon.length); - System.arraycopy(se, 0, t, exon.length, se.length); - exon = t; - } - } - } - String prseq = null; - String prname = new String(); - String prid = null; - Hashtable vals = new Hashtable(); - int prstart = 1; - // get qualifiers - if (feature.getQualifiers() != null - && feature.getQualifiers().size() > 0) - { - for (Iterator quals = feature.getQualifiers().iterator(); quals - .hasNext();) - { - Qualifier q = (Qualifier) quals.next(); - if (q.getName().equals("translation")) - { - prseq = q.getValues()[0]; - } - else if (q.getName().equals("protein_id")) - { - prid = q.getValues()[0]; - } - else if (q.getName().equals("codon_start")) - { - prstart = Integer.parseInt(q.getValues()[0]); - } - else if (q.getName().equals("product")) - { - prname = q.getValues()[0]; - } - else - { - // throw anything else into the additional properties hash - vals.put(q.getName(), q.getValues().toString()); - } - } - } - Sequence product = null; - if (prseq != null && prname != null && prid != null) - { - // extract proteins. - if (!noPeptide) - { - product = new Sequence(sourceDb + "|" + "EMBLCDS|" + prid - + "|" + prname, prseq, prstart, prstart - + prseq.length() - 1); - product.setDescription("Protein Product from " + sourceDb); - seqs.add(product); - } - // we have everything - create the mapping and perhaps the protein - // sequence - map = new jalview.datamodel.Mapping(product, exon, new int[] - { prstart, prstart + prseq.length() - 1 }, 3, 1); - // add cds feature to dna seq - this may include the stop codon - for (int xint = 0; xint < exon.length; xint += 2) - { - SequenceFeature sf = new SequenceFeature(); - sf.setBegin(exon[xint]); - sf.setEnd(exon[xint + 1]); - sf.setType(feature.getName()); - sf.setFeatureGroup(jalview.datamodel.DBRefSource.EMBL); - sf.setDescription("Exon " + (1 + xint) + " for protein '" - + prname + "' EMBLCDS:" + prid); - if (vals != null && vals.size() > 0) - { - Enumeration kv = vals.elements(); - while (kv.hasMoreElements()) - { - Object key = kv.nextElement(); - if (key != null) - sf.setValue(key.toString(), vals.get(key)); - } - } - dna.addSequenceFeature(sf); - } - } - // add dbRefs to sequence - if (feature.dbRefs != null && feature.dbRefs.size() > 0) - { - for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext();) - { - DBRefEntry ref = (DBRefEntry) dbr.next(); - ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref - .getSource())); - if (ref.getSource().equals( - jalview.datamodel.DBRefSource.UNIPROT)) - { - ref.setMap(map); - } - if (product != null) - { - DBRefEntry pref = new DBRefEntry(ref.getSource(), ref - .getVersion(), ref.getAccessionId()); - pref.setMap(null); // reference is direct - } - dna.addDBRef(ref); - } - } - + parseCodingFeature(feature, sourceDb, seqs, dna, noPeptide); } else { @@ -556,12 +459,13 @@ public class EmblEntry } catch (Exception e) { System.err.println("EMBL Record Features parsing error!"); - System.err.println("Please report the following to help@jalview.org :"); - System.err.println("EMBL Record "+accession); - System.err.println("Resulted in exception: "+e.getMessage()); + System.err + .println("Please report the following to help@jalview.org :"); + System.err.println("EMBL Record " + accession); + System.err.println("Resulted in exception: " + e.getMessage()); e.printStackTrace(System.err); } - if (!noNa && dna!=null) + if (!noNa && dna != null) { seqs.add(dna); } @@ -573,4 +477,245 @@ public class EmblEntry } return sqs; } + + /** + * attempt to extract coding region and product from a feature and properly + * decorate it with annotations. + * + * @param feature + * coding feature + * @param sourceDb + * source database for the EMBLXML + * @param seqs + * place where sequences go + * @param dna + * parent dna sequence for this record + * @param noPeptide + * flag for generation of Peptide sequence objects + */ + private void parseCodingFeature(EmblFeature feature, String sourceDb, + Vector seqs, Sequence dna, boolean noPeptide) + { + boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); + // extract coding region(s) + jalview.datamodel.Mapping map = null; + int[] exon = null; + if (feature.locations != null && feature.locations.size() > 0) + { + for (Enumeration locs = feature.locations.elements(); locs + .hasMoreElements();) + { + EmblFeatureLocations loc = (EmblFeatureLocations) locs + .nextElement(); + int[] se = loc.getElementRanges(accession); + if (exon == null) + { + exon = se; + } + else + { + int[] t = new int[exon.length + se.length]; + System.arraycopy(exon, 0, t, 0, exon.length); + System.arraycopy(se, 0, t, exon.length, se.length); + exon = t; + } + } + } + String prseq = null; + String prname = new String(); + String prid = null; + Hashtable vals = new Hashtable(); + int prstart = 1; + // get qualifiers + if (feature.getQualifiers() != null + && feature.getQualifiers().size() > 0) + { + for (Iterator quals = feature.getQualifiers().iterator(); quals + .hasNext();) + { + Qualifier q = (Qualifier) quals.next(); + if (q.getName().equals("translation")) + { + StringBuffer prsq = new StringBuffer(q.getValues()[0]); + int p = prsq.indexOf(" "); + while (p > -1) + { + prsq.deleteCharAt(p); + p = prsq.indexOf(" ", p); + } + prseq = prsq.toString(); + prsq = null; + + } + else if (q.getName().equals("protein_id")) + { + prid = q.getValues()[0]; + } + else if (q.getName().equals("codon_start")) + { + prstart = Integer.parseInt(q.getValues()[0]); + } + else if (q.getName().equals("product")) + { + prname = q.getValues()[0]; + } + else + { + // throw anything else into the additional properties hash + String[] s = q.getValues(); + StringBuffer sb = new StringBuffer(); + if (s != null) + { + for (int i = 0; i < s.length; i++) + { + sb.append(s[i]); + sb.append("\n"); + } + } + vals.put(q.getName(), sb.toString()); + } + } + } + Sequence product = null; + if (prseq != null && prname != null && prid != null) + { + // extract proteins. + product = new Sequence(prid, prseq, prstart, prstart + prseq.length() + - 1); + product + .setDescription(((prname.length() == 0) ? "Protein Product from " + + sourceDb + : prname)); + + if (!noPeptide) + { + // Protein is also added to vector of sequences returned + seqs.add(product); + } + // we have everything - create the mapping and perhaps the protein + // sequence + if (exon == null || exon.length == 0) + { + System.err + .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + + sourceDb + ":" + getAccession() + ")"); + if (prseq.length() * 3 == dna.getSequence().length) + { + // this might occur for CDS sequences where no features are + // marked. + exon = new int[] + { dna.getStart(), dna.getEnd() }; + map = new jalview.datamodel.Mapping(product, exon, new int[] + { prstart, prstart + prseq.length() - 1 }, 3, 1); + } + if ((prseq.length() + 1) * 3 == dna.getSequence().length) + { + exon = new int[] + { dna.getStart(), dna.getEnd() - 3 }; + map = new jalview.datamodel.Mapping(product, exon, new int[] + { prstart, prstart + prseq.length() - 1 }, 3, 1); + } + } + else + { + if (isEmblCdna) + { + // TODO: Add a DbRef back to the parent EMBL sequence with the exon + // map + // if given a dataset reference, search dataset for parent EMBL + // sequence if it exists and set its map + // make a new feature annotating the coding contig + } + else + { + map = new jalview.datamodel.Mapping(product, exon, new int[] + { prstart, prstart + prseq.length() - 1 }, 3, 1); + // reconstruct the EMBLCDS entry + DBRefEntry pcdnaref = new DBRefEntry(); + pcdnaref.setAccessionId(prid); + pcdnaref.setSource(DBRefSource.EMBLCDS); + pcdnaref.setVersion(getVersion()); // same as parent EMBL version. + jalview.util.MapList mp = new jalview.util.MapList(new int[] + { 1 + (prstart - 1) * 3, + 1 + (prstart - 1) * 3 + (prseq.length() - 1) * 3 }, new int[] + { prstart, prstart + prseq.length() - 1 }, 3, 1); + pcdnaref.setMap(new Mapping(mp)); + if (product != null) + product.addDBRef(pcdnaref); + + } + } + // add cds feature to dna seq - this may include the stop codon + for (int xint = 0; exon != null && xint < exon.length; xint += 2) + { + SequenceFeature sf = new SequenceFeature(); + sf.setBegin(exon[xint]); + sf.setEnd(exon[xint + 1]); + sf.setType(feature.getName()); + sf.setFeatureGroup(sourceDb); + sf.setDescription("Exon " + (1 + (int) (xint / 2)) + + " for protein '" + prname + "' EMBLCDS:" + prid); + sf.setValue(FeatureProperties.EXONPOS, new Integer(1 + xint)); + sf.setValue(FeatureProperties.EXONPRODUCT, prname); + if (vals != null && vals.size() > 0) + { + Enumeration kv = vals.elements(); + while (kv.hasMoreElements()) + { + Object key = kv.nextElement(); + if (key != null) + sf.setValue(key.toString(), vals.get(key)); + } + } + dna.addSequenceFeature(sf); + } + } + // add dbRefs to sequence + if (feature.dbRefs != null && feature.dbRefs.size() > 0) + { + for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext();) + { + DBRefEntry ref = (DBRefEntry) dbr.next(); + ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref + .getSource())); + // Hard code the kind of protein product accessions that EMBL cite + if (ref.getSource().equals(jalview.datamodel.DBRefSource.UNIPROT)) + { + ref.setMap(map); + if (map != null && map.getTo() != null) + { + map.getTo().addDBRef( + new DBRefEntry(ref.getSource(), ref.getVersion(), ref + .getAccessionId())); // don't copy map over. + if (map.getTo().getName().indexOf(prid) == 0) + { + map.getTo().setName( + jalview.datamodel.DBRefSource.UNIPROT + "|" + + ref.getAccessionId()); + } + } + } + if (product != null) + { + DBRefEntry pref = new DBRefEntry(ref.getSource(), ref + .getVersion(), ref.getAccessionId()); + pref.setMap(null); // reference is direct + product.addDBRef(pref); + // Add converse mapping reference + if (map != null) + { + Mapping pmap = new Mapping(dna, map.getMap().getInverse()); + pref = new DBRefEntry(sourceDb, getVersion(), this + .getAccession()); + pref.setMap(pmap); + if (map.getTo() != null) + { + map.getTo().addDBRef(pref); + } + } + } + dna.addDBRef(ref); + } + } + } }