2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.datamodel.xdb.embl;
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.datamodel.DBRefEntry;
25 import jalview.datamodel.DBRefSource;
26 import jalview.datamodel.FeatureProperties;
27 import jalview.datamodel.Mapping;
28 import jalview.datamodel.Sequence;
29 import jalview.datamodel.SequenceFeature;
30 import jalview.datamodel.SequenceI;
31 import jalview.util.DBRefUtils;
32 import jalview.util.MapList;
33 import jalview.util.MappingUtils;
34 import jalview.util.StringUtils;
36 import java.util.Hashtable;
37 import java.util.List;
39 import java.util.Map.Entry;
40 import java.util.Vector;
41 import java.util.regex.Pattern;
44 * Data model for one entry returned from an EMBL query, as marshalled by a
48 * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321
51 * @see embl_mapping.xml
53 public class EmblEntry
55 private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
71 Vector<String> keywords;
73 Vector<DBRefEntry> dbRefs;
75 Vector<EmblFeature> features;
77 EmblSequence sequence;
80 * @return the accession
82 public String getAccession()
89 * the accession to set
91 public void setAccession(String accession)
93 this.accession = accession;
99 public Vector<DBRefEntry> getDbRefs()
108 public void setDbRefs(Vector<DBRefEntry> dbRefs)
110 this.dbRefs = dbRefs;
116 public String getDesc()
125 public void setDesc(String desc)
131 * @return the features
133 public Vector<EmblFeature> getFeatures()
140 * the features to set
142 public void setFeatures(Vector<EmblFeature> features)
144 this.features = features;
148 * @return the keywords
150 public Vector<String> getKeywords()
157 * the keywords to set
159 public void setKeywords(Vector<String> keywords)
161 this.keywords = keywords;
165 * @return the lastUpdated
167 public String getLastUpdated()
174 * the lastUpdated to set
176 public void setLastUpdated(String lastUpdated)
178 this.lastUpdated = lastUpdated;
182 * @return the releaseCreated
184 public String getRCreated()
190 * @param releaseCreated
191 * the releaseCreated to set
193 public void setRCreated(String releaseCreated)
195 this.rCreated = releaseCreated;
199 * @return the releaseLastUpdated
201 public String getRLastUpdated()
207 * @param releaseLastUpdated
208 * the releaseLastUpdated to set
210 public void setRLastUpdated(String releaseLastUpdated)
212 this.rLastUpdated = releaseLastUpdated;
216 * @return the sequence
218 public EmblSequence getSequence()
225 * the sequence to set
227 public void setSequence(EmblSequence sequence)
229 this.sequence = sequence;
233 * @return the taxDivision
235 public String getTaxDivision()
242 * the taxDivision to set
244 public void setTaxDivision(String taxDivision)
246 this.taxDivision = taxDivision;
250 * @return the version
252 public String getVersion()
261 public void setVersion(String version)
263 this.version = version;
267 * Recover annotated sequences from EMBL file
271 * a list of protein products found so far (to add to)
272 * @return dna dataset sequence with DBRefs and features
274 public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
276 SequenceI dna = new Sequence(sourceDb + "|" + accession,
277 sequence.getSequence());
278 dna.setDescription(desc);
279 DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession);
280 dna.addDBRef(retrievedref);
281 // add map to indicate the sequence is a valid coordinate frame for the
283 retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
284 new int[] { 1, dna.getLength() }, 1, 1));
285 // TODO: transform EMBL Database refs to canonical form
288 for (DBRefEntry dbref : dbRefs)
296 for (EmblFeature feature : features)
298 if (feature.dbRefs != null)
300 for (DBRefEntry dbref : feature.dbRefs)
305 if (FeatureProperties.isCodingFeature(sourceDb, feature.getName()))
307 parseCodingFeature(feature, sourceDb, dna, peptides);
310 } catch (Exception e)
312 System.err.println("EMBL Record Features parsing error!");
314 .println("Please report the following to help@jalview.org :");
315 System.err.println("EMBL Record " + accession);
316 System.err.println("Resulted in exception: " + e.getMessage());
317 e.printStackTrace(System.err);
324 * Extracts coding region and product from a CDS feature and properly decorate
325 * it with annotations.
330 * source database for the EMBLXML
332 * parent dna sequence for this record
334 * list of protein product sequences for Embl entry
336 void parseCodingFeature(EmblFeature feature, String sourceDb,
337 SequenceI dna, List<SequenceI> peptides)
339 boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
341 int[] exon = getCdsRanges(feature);
346 Map<String, String> vals = new Hashtable<String, String>();
347 SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
350 * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
351 * (phase is required for CDS features in GFF3 format)
356 * parse qualifiers, saving protein translation, protein id,
357 * codon start position, product (name), and 'other values'
359 if (feature.getQualifiers() != null)
361 for (Qualifier q : feature.getQualifiers())
363 String qname = q.getName();
364 if (qname.equals("translation"))
366 // remove all spaces (precompiled String.replaceAll(" ", ""))
367 prseq = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll("");
369 else if (qname.equals("protein_id"))
371 prid = q.getValues()[0];
373 else if (qname.equals("codon_start"))
377 codonStart = Integer.parseInt(q.getValues()[0]);
378 } catch (NumberFormatException e)
380 System.err.println("Invalid codon_start in XML for "
381 + accession + ": " + e.getMessage());
384 else if (qname.equals("product"))
386 // sometimes name is returned e.g. for V00488
387 prname = q.getValues()[0];
391 // throw anything else into the additional properties hash
392 String[] qvals = q.getValues();
395 String commaSeparated = StringUtils.arrayToSeparatorList(qvals,
397 vals.put(qname, commaSeparated);
403 // SequenceI product = null;
404 DBRefEntry protEMBLCDS = null;
405 exon = MappingUtils.removeStartPositions(codonStart - 1, exon);
406 boolean noProteinDbref = true;
408 SequenceI product = null;
410 if (prseq != null && prname != null && prid != null)
413 * look for product in peptides list, if not found, add it
415 product = matcher.findIdMatch(prid);
418 product = new Sequence(prid, prseq, 1, prseq.length());
419 product.setDescription(((prname.length() == 0) ? "Protein Product from "
422 peptides.add(product);
423 matcher.add(product);
426 // we have everything - create the mapping and perhaps the protein
428 if (exon == null || exon.length == 0)
431 .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
432 + sourceDb + ":" + getAccession() + ")");
433 if (prseq.length() * 3 == (1 - codonStart + dna.getSequence().length))
436 .println("Not allowing for additional stop codon at end of cDNA fragment... !");
437 // this might occur for CDS sequences where no features are
439 exon = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() };
440 map = new Mapping(product, exon, new int[] { 1, prseq.length() },
443 if ((prseq.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length))
446 .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
447 exon = new int[] { dna.getStart() + (codonStart - 1),
449 map = new Mapping(product, exon, new int[] { 1, prseq.length() },
455 // Trim the exon mapping if necessary - the given product may only be a
456 // fragment of a larger protein. (EMBL:AY043181 is an example)
460 // TODO: Add a DbRef back to the parent EMBL sequence with the exon
462 // if given a dataset reference, search dataset for parent EMBL
463 // sequence if it exists and set its map
464 // make a new feature annotating the coding contig
468 // final product length truncation check
469 // TODO should from range include stop codon even if not in protein
470 // in order to include stop codon in CDS sequence (as done for
472 int[] cdsRanges = adjustForProteinLength(prseq.length(),
474 map = new Mapping(product, cdsRanges, new int[] { 1, prseq.length() }, 3, 1);
475 // reconstruct the EMBLCDS entry
476 // TODO: this is only necessary when there codon annotation is
477 // complete (I think JBPNote)
478 DBRefEntry pcdnaref = new DBRefEntry();
479 pcdnaref.setAccessionId(prid);
480 pcdnaref.setSource(DBRefSource.EMBLCDS);
481 pcdnaref.setVersion(getVersion()); // same as parent EMBL version.
482 MapList mp = new MapList(new int[] { 1, prseq.length() },
483 new int[] { 1 + (codonStart - 1),
484 (codonStart - 1) + 3 * prseq.length() }, 1, 3);
485 pcdnaref.setMap(new Mapping(mp));
488 product.addDBRef(pcdnaref);
489 protEMBLCDS = new DBRefEntry(pcdnaref);
490 protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
491 product.addDBRef(protEMBLCDS);
495 // add cds feature to dna seq - this may include the stop codon
496 for (int xint = 0; exon != null && xint < exon.length; xint += 2)
498 SequenceFeature sf = makeCdsFeature(exon, xint, prname, prid, vals,
500 sf.setType(feature.getName()); // "CDS"
501 sf.setFeatureGroup(sourceDb);
502 dna.addSequenceFeature(sf);
505 // add dbRefs to sequence
506 if (feature.dbRefs != null)
508 boolean productMapped = false;
509 for (DBRefEntry ref : feature.dbRefs)
511 ref.setSource(DBRefUtils.getCanonicalName(ref.getSource()));
512 // Hard code the kind of protein product accessions that EMBL cite
513 if (ref.getSource().equals(DBRefSource.UNIPROT))
515 String refSeqName = DBRefSource.UNIPROT + "|"
516 + ref.getAccessionId();
518 if (map != null && map.getTo() != null)
520 // if (!productMapped)
522 // map.getTo().setName(refSeqName);
523 // map.getTo().addDBRef(
524 // new DBRefEntry(ref.getSource(), ref.getVersion(), ref
525 // .getAccessionId())); // don't copy map over.
526 // // if (map.getTo().getName().startsWith(prid))
527 // productMapped = true;
532 * an alternate UNIPROT product for CDS - same mapping
533 * but to a sequence with a different name
535 SequenceI newSeq = matcher.findIdMatch(refSeqName);
538 newSeq = new Sequence(refSeqName, map.getTo()
539 .getSequenceAsString());
541 peptides.add(newSeq);
543 Mapping newMap = new Mapping(newSeq, map.getMap());
547 noProteinDbref = false;
551 DBRefEntry pref = new DBRefEntry(ref.getSource(),
552 ref.getVersion(), ref.getAccessionId());
553 pref.setMap(null); // reference is direct
554 product.addDBRef(pref);
555 // Add converse mapping reference
558 Mapping pmap = new Mapping(dna, map.getMap().getInverse());
559 pref = new DBRefEntry(sourceDb, getVersion(),
560 this.getAccession());
562 if (map.getTo() != null)
564 map.getTo().addDBRef(pref);
570 if (noProteinDbref && product != null)
572 // add protein coding reference to dna sequence so xref matches
573 if (protEMBLCDS == null)
575 protEMBLCDS = new DBRefEntry();
576 protEMBLCDS.setAccessionId(prid);
577 protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
578 protEMBLCDS.setVersion(getVersion());
580 .setMap(new Mapping(product, map.getMap().getInverse()));
582 product.addDBRef(protEMBLCDS);
584 // Add converse mapping reference
587 Mapping pmap = new Mapping(product, protEMBLCDS.getMap().getMap()
589 DBRefEntry ncMap = new DBRefEntry(protEMBLCDS);
591 if (map.getTo() != null)
601 * Helper method to construct a SequenceFeature for one cds range
604 * array of cds [start, end, ...] positions
605 * @param exonStartIndex
606 * offset into the exons array
608 * @param proteinAccessionId
610 * map of 'miscellaneous values' for feature
612 * codon start position for CDS (1/2/3, normally 1)
615 protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex,
616 String proteinName, String proteinAccessionId,
617 Map<String, String> vals, int codonStart)
619 int exonNumber = exonStartIndex / 2 + 1;
620 SequenceFeature sf = new SequenceFeature();
621 sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1]));
622 sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1]));
623 sf.setDescription(String.format(
624 "Exon %d for protein '%s' EMBLCDS:%s", exonNumber, proteinName,
625 proteinAccessionId));
626 sf.setPhase(String.valueOf(codonStart - 1));
627 sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" : "-");
628 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
629 sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
632 StringBuilder sb = new StringBuilder();
633 boolean first = true;
634 for (Entry<String, String> val : vals.entrySet())
640 sb.append(val.getKey()).append("=").append(val.getValue());
642 sf.setValue(val.getKey(), val.getValue());
644 sf.setAttributes(sb.toString());
650 * Returns the CDS positions as a list of [start, end, start, end...]
651 * positions. If on the reverse strand, these will be in descending order.
656 protected int[] getCdsRanges(EmblFeature feature)
658 if (feature.locations == null)
662 int cdsBoundaryCount = 0; // count of all start/stop locations
663 int[][] cdsLocations = new int[feature.locations.size()][];
664 int locationNumber = 0;
665 for (EmblFeatureLocations loc : feature.locations)
667 int[] locationRanges = loc.getElementRanges(accession);
668 cdsLocations[locationNumber++] = locationRanges;
669 cdsBoundaryCount += locationRanges.length;
671 int[] cdsRanges = new int[cdsBoundaryCount];
673 for (int[] ranges : cdsLocations)
675 System.arraycopy(ranges, 0, cdsRanges, copyTo, ranges.length);
676 copyTo += ranges.length;
683 * truncate the last exon interval to the prlength'th codon
689 private int[] adjustForProteinLength(int prlength, int[] exon)
692 int origxon[], sxpos = -1, endxon = 0, cdslength = prlength * 3;
693 // first adjust range for codon start attribute
694 if (prlength >= 1 && exon != null)
696 origxon = new int[exon.length];
697 System.arraycopy(exon, 0, origxon, 0, exon.length);
699 for (int x = 0; x < exon.length && sxpos == -1; x += 2)
701 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
702 if (cdslength <= cdspos)
704 // advanced beyond last codon.
706 if (cdslength != cdspos)
709 .println("Truncating final exon interval on region by "
710 + (cdspos - cdslength));
712 // locate the new end boundary of final exon as endxon
713 endxon = exon[x + 1] - cdspos + cdslength;
720 // and trim the exon interval set if necessary
721 int[] nxon = new int[sxpos + 2];
722 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
723 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon