2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.datamodel.xdb.embl;
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.datamodel.DBRefEntry;
25 import jalview.datamodel.DBRefSource;
26 import jalview.datamodel.FeatureProperties;
27 import jalview.datamodel.Mapping;
28 import jalview.datamodel.Sequence;
29 import jalview.datamodel.SequenceFeature;
30 import jalview.datamodel.SequenceI;
31 import jalview.util.DBRefUtils;
32 import jalview.util.MapList;
33 import jalview.util.MappingUtils;
34 import jalview.util.StringUtils;
36 import java.util.Arrays;
37 import java.util.Hashtable;
38 import java.util.List;
40 import java.util.Map.Entry;
41 import java.util.Vector;
42 import java.util.regex.Pattern;
45 * Data model for one entry returned from an EMBL query, as marshalled by a
49 * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321
52 * @see embl_mapping.xml
54 public class EmblEntry
56 private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
72 Vector<String> keywords;
74 Vector<DBRefEntry> dbRefs;
76 Vector<EmblFeature> features;
78 EmblSequence sequence;
81 * @return the accession
83 public String getAccession()
90 * the accession to set
92 public void setAccession(String accession)
94 this.accession = accession;
100 public Vector<DBRefEntry> getDbRefs()
109 public void setDbRefs(Vector<DBRefEntry> dbRefs)
111 this.dbRefs = dbRefs;
117 public String getDesc()
126 public void setDesc(String desc)
132 * @return the features
134 public Vector<EmblFeature> getFeatures()
141 * the features to set
143 public void setFeatures(Vector<EmblFeature> features)
145 this.features = features;
149 * @return the keywords
151 public Vector<String> getKeywords()
158 * the keywords to set
160 public void setKeywords(Vector<String> keywords)
162 this.keywords = keywords;
166 * @return the lastUpdated
168 public String getLastUpdated()
175 * the lastUpdated to set
177 public void setLastUpdated(String lastUpdated)
179 this.lastUpdated = lastUpdated;
183 * @return the releaseCreated
185 public String getRCreated()
191 * @param releaseCreated
192 * the releaseCreated to set
194 public void setRCreated(String releaseCreated)
196 this.rCreated = releaseCreated;
200 * @return the releaseLastUpdated
202 public String getRLastUpdated()
208 * @param releaseLastUpdated
209 * the releaseLastUpdated to set
211 public void setRLastUpdated(String releaseLastUpdated)
213 this.rLastUpdated = releaseLastUpdated;
217 * @return the sequence
219 public EmblSequence getSequence()
226 * the sequence to set
228 public void setSequence(EmblSequence sequence)
230 this.sequence = sequence;
234 * @return the taxDivision
236 public String getTaxDivision()
243 * the taxDivision to set
245 public void setTaxDivision(String taxDivision)
247 this.taxDivision = taxDivision;
251 * @return the version
253 public String getVersion()
262 public void setVersion(String version)
264 this.version = version;
268 * Recover annotated sequences from EMBL file
272 * a list of protein products found so far (to add to)
273 * @return dna dataset sequence with DBRefs and features
275 public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
277 SequenceI dna = new Sequence(sourceDb + "|" + accession,
278 sequence.getSequence());
279 dna.setDescription(desc);
280 DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession);
281 dna.addDBRef(retrievedref);
282 // add map to indicate the sequence is a valid coordinate frame for the
284 retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
285 new int[] { 1, dna.getLength() }, 1, 1));
286 // TODO: transform EMBL Database refs to canonical form
289 for (DBRefEntry dbref : dbRefs)
297 for (EmblFeature feature : features)
299 if (feature.dbRefs != null)
301 for (DBRefEntry dbref : feature.dbRefs)
306 if (FeatureProperties.isCodingFeature(sourceDb, feature.getName()))
308 parseCodingFeature(feature, sourceDb, dna, peptides);
311 } catch (Exception e)
313 System.err.println("EMBL Record Features parsing error!");
315 .println("Please report the following to help@jalview.org :");
316 System.err.println("EMBL Record " + accession);
317 System.err.println("Resulted in exception: " + e.getMessage());
318 e.printStackTrace(System.err);
325 * Extracts coding region and product from a CDS feature and properly decorate
326 * it with annotations.
331 * source database for the EMBLXML
333 * parent dna sequence for this record
335 * list of protein product sequences for Embl entry
337 void parseCodingFeature(EmblFeature feature, String sourceDb,
338 SequenceI dna, List<SequenceI> peptides)
340 boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
342 int[] exon = getCdsRanges(feature);
347 Map<String, String> vals = new Hashtable<String, String>();
348 SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
351 * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
352 * (phase is required for CDS features in GFF3 format)
357 * parse qualifiers, saving protein translation, protein id,
358 * codon start position, product (name), and 'other values'
360 if (feature.getQualifiers() != null)
362 for (Qualifier q : feature.getQualifiers())
364 String qname = q.getName();
365 if (qname.equals("translation"))
367 // remove all spaces (precompiled String.replaceAll(" ", ""))
368 prseq = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll("");
370 else if (qname.equals("protein_id"))
372 prid = q.getValues()[0];
374 else if (qname.equals("codon_start"))
378 codonStart = Integer.parseInt(q.getValues()[0]);
379 } catch (NumberFormatException e)
381 System.err.println("Invalid codon_start in XML for "
382 + accession + ": " + e.getMessage());
385 else if (qname.equals("product"))
387 // sometimes name is returned e.g. for V00488
388 prname = q.getValues()[0];
392 // throw anything else into the additional properties hash
393 String[] qvals = q.getValues();
396 String commaSeparated = StringUtils.arrayToSeparatorList(qvals,
398 vals.put(qname, commaSeparated);
404 DBRefEntry protEMBLCDS = null;
405 exon = MappingUtils.removeStartPositions(codonStart - 1, exon);
406 boolean noProteinDbref = true;
408 SequenceI product = null;
410 if (prseq != null && prname != null && prid != null)
413 * look for product in peptides list, if not found, add it
415 product = matcher.findIdMatch(prid);
418 product = new Sequence(prid, prseq, 1, prseq.length());
419 product.setDescription(((prname.length() == 0) ? "Protein Product from "
422 peptides.add(product);
423 matcher.add(product);
426 // we have everything - create the mapping and perhaps the protein
428 if (exon == null || exon.length == 0)
431 .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
432 + sourceDb + ":" + getAccession() + ")");
433 if (prseq.length() * 3 == (1 - codonStart + dna.getSequence().length))
436 .println("Not allowing for additional stop codon at end of cDNA fragment... !");
437 // this might occur for CDS sequences where no features are
439 exon = new int[] { dna.getStart() + (codonStart - 1),
441 map = new Mapping(product, exon, new int[] { 1, prseq.length() },
444 if ((prseq.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length))
447 .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
448 exon = new int[] { dna.getStart() + (codonStart - 1),
450 map = new Mapping(product, exon, new int[] { 1, prseq.length() },
456 // Trim the exon mapping if necessary - the given product may only be a
457 // fragment of a larger protein. (EMBL:AY043181 is an example)
461 // TODO: Add a DbRef back to the parent EMBL sequence with the exon
463 // if given a dataset reference, search dataset for parent EMBL
464 // sequence if it exists and set its map
465 // make a new feature annotating the coding contig
469 // final product length truncation check
470 // TODO should from range include stop codon even if not in protein
471 // in order to include stop codon in CDS sequence (as done for
473 int[] cdsRanges = adjustForProteinLength(prseq.length(), exon);
474 map = new Mapping(product, cdsRanges, new int[] { 1,
475 prseq.length() }, 3, 1);
476 // reconstruct the EMBLCDS entry
477 // TODO: this is only necessary when there codon annotation is
478 // complete (I think JBPNote)
479 DBRefEntry pcdnaref = new DBRefEntry();
480 pcdnaref.setAccessionId(prid);
481 pcdnaref.setSource(DBRefSource.EMBLCDS);
482 pcdnaref.setVersion(getVersion()); // same as parent EMBL version.
483 MapList mp = new MapList(new int[] { 1, prseq.length() },
484 new int[] { 1 + (codonStart - 1),
485 (codonStart - 1) + 3 * prseq.length() }, 1, 3);
486 pcdnaref.setMap(new Mapping(mp));
489 product.addDBRef(pcdnaref);
490 protEMBLCDS = new DBRefEntry(pcdnaref);
491 protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
492 product.addDBRef(protEMBLCDS);
496 // add cds feature to dna seq - this may include the stop codon
497 for (int xint = 0; exon != null && xint < exon.length; xint += 2)
499 SequenceFeature sf = makeCdsFeature(exon, xint, prname, prid, vals,
501 sf.setType(feature.getName()); // "CDS"
502 sf.setFeatureGroup(sourceDb);
503 dna.addSequenceFeature(sf);
508 * add dbRefs to sequence, and mappings for Uniprot xrefs
510 if (feature.dbRefs != null)
512 boolean mappingUsed = false;
513 for (DBRefEntry ref : feature.dbRefs)
515 ref.setSource(DBRefUtils.getCanonicalName(ref.getSource()));
516 if (ref.getSource().equals(DBRefSource.UNIPROT))
518 String proteinSeqName = DBRefSource.UNIPROT + "|"
519 + ref.getAccessionId();
520 if (map != null && map.getTo() != null)
525 * two or more Uniprot xrefs for the same CDS -
526 * each needs a distinct Mapping (as to a different sequence)
528 map = new Mapping(map);
533 * try to locate the protein mapped to (possibly by a
534 * previous CDS feature)
536 SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
537 if (proteinSeq == null)
539 proteinSeq = new Sequence(proteinSeqName,
540 product.getSequenceAsString());
541 matcher.add(proteinSeq);
542 peptides.add(proteinSeq);
544 map.setTo(proteinSeq);
545 map.getTo().addDBRef(
546 new DBRefEntry(ref.getSource(), ref.getVersion(), ref
550 noProteinDbref = false;
554 DBRefEntry pref = new DBRefEntry(ref.getSource(),
555 ref.getVersion(), ref.getAccessionId());
556 pref.setMap(null); // reference is direct
557 product.addDBRef(pref);
558 // Add converse mapping reference
561 Mapping pmap = new Mapping(dna, map.getMap().getInverse());
562 pref = new DBRefEntry(sourceDb, getVersion(),
563 this.getAccession());
565 if (map.getTo() != null)
567 map.getTo().addDBRef(pref);
573 if (noProteinDbref && product != null)
575 // add protein coding reference to dna sequence so xref matches
576 if (protEMBLCDS == null)
578 protEMBLCDS = new DBRefEntry();
579 protEMBLCDS.setAccessionId(prid);
580 protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
581 protEMBLCDS.setVersion(getVersion());
583 .setMap(new Mapping(product, map.getMap().getInverse()));
585 product.addDBRef(protEMBLCDS);
587 // Add converse mapping reference
590 Mapping pmap = new Mapping(product, protEMBLCDS.getMap().getMap()
592 DBRefEntry ncMap = new DBRefEntry(protEMBLCDS);
594 if (map.getTo() != null)
604 * Helper method to construct a SequenceFeature for one cds range
607 * array of cds [start, end, ...] positions
608 * @param exonStartIndex
609 * offset into the exons array
611 * @param proteinAccessionId
613 * map of 'miscellaneous values' for feature
615 * codon start position for CDS (1/2/3, normally 1)
618 protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex,
619 String proteinName, String proteinAccessionId,
620 Map<String, String> vals, int codonStart)
622 int exonNumber = exonStartIndex / 2 + 1;
623 SequenceFeature sf = new SequenceFeature();
624 sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1]));
625 sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1]));
626 sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s",
627 exonNumber, proteinName, proteinAccessionId));
628 sf.setPhase(String.valueOf(codonStart - 1));
629 sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+"
631 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
632 sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
635 StringBuilder sb = new StringBuilder();
636 boolean first = true;
637 for (Entry<String, String> val : vals.entrySet())
643 sb.append(val.getKey()).append("=").append(val.getValue());
645 sf.setValue(val.getKey(), val.getValue());
647 sf.setAttributes(sb.toString());
653 * Returns the CDS positions as a list of [start, end, start, end...]
654 * positions. If on the reverse strand, these will be in descending order.
659 protected int[] getCdsRanges(EmblFeature feature)
661 if (feature.locations == null)
665 int cdsBoundaryCount = 0; // count of all start/stop locations
666 int[][] cdsLocations = new int[feature.locations.size()][];
667 int locationNumber = 0;
668 for (EmblFeatureLocations loc : feature.locations)
670 int[] locationRanges = loc.getElementRanges(accession);
671 cdsLocations[locationNumber++] = locationRanges;
672 cdsBoundaryCount += locationRanges.length;
674 int[] cdsRanges = new int[cdsBoundaryCount];
676 for (int[] ranges : cdsLocations)
678 System.arraycopy(ranges, 0, cdsRanges, copyTo, ranges.length);
679 copyTo += ranges.length;
686 * truncate the last exon interval to the prlength'th codon
692 static int[] adjustForProteinLength(int prlength, int[] exon)
694 if (prlength <= 0 || exon == null)
698 int desiredCdsLength = prlength * 3;
699 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
702 * assuming here exon might include stop codon in addition to protein codons
704 if (desiredCdsLength == exonLength
705 || desiredCdsLength == exonLength - 3)
713 origxon = new int[exon.length];
714 System.arraycopy(exon, 0, origxon, 0, exon.length);
716 for (int x = 0; x < exon.length; x += 2)
718 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
719 if (desiredCdsLength <= cdspos)
721 // advanced beyond last codon.
723 if (desiredCdsLength != cdspos)
726 // .println("Truncating final exon interval on region by "
727 // + (cdspos - cdslength));
731 * shrink the final exon - reduce end position if forward
732 * strand, increase it if reverse
734 if (exon[x + 1] >= exon[x])
736 endxon = exon[x + 1] - cdspos + desiredCdsLength;
740 endxon = exon[x + 1] + cdspos - desiredCdsLength;
748 // and trim the exon interval set if necessary
749 int[] nxon = new int[sxpos + 2];
750 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
751 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon