2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.datamodel.xdb.embl;
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.bin.Cache;
25 import jalview.datamodel.DBRefEntry;
26 import jalview.datamodel.DBRefSource;
27 import jalview.datamodel.FeatureProperties;
28 import jalview.datamodel.Mapping;
29 import jalview.datamodel.Sequence;
30 import jalview.datamodel.SequenceFeature;
31 import jalview.datamodel.SequenceI;
32 import jalview.util.DBRefUtils;
33 import jalview.util.DnaUtils;
34 import jalview.util.MapList;
35 import jalview.util.MappingUtils;
36 import jalview.util.StringUtils;
38 import java.text.ParseException;
39 import java.util.Arrays;
40 import java.util.Hashtable;
41 import java.util.List;
43 import java.util.Map.Entry;
44 import java.util.Vector;
45 import java.util.regex.Pattern;
/**
 * Data model for one entry returned from an EMBL query, as marshalled by a
 * Castor binding file.
 *
 * For example:
 * http://www.ebi.ac.uk/ena/data/view/J03321&display=xml
 *
 * @see embl_mapping.xml
 */
56 public class EmblEntry
// Precompiled single-space pattern; used to strip spaces from the
// 'translation' qualifier value without recompiling a regex on every call.
private static final Pattern SPACE_PATTERN = Pattern.compile(" ");

// All fields below are package-private and are exposed through the public
// getters/setters of this class.

// Accession of the entry; used in sequence names, dbrefs and log messages.
String accession;

String entryVersion;

// Version string passed to the DBRefEntry objects built from this entry.
String sequenceVersion;

String dataClass;

String moleculeType;

String topology;

String sequenceLength;

String taxonomicDivision;

// Copied onto the dna sequence as its description.
String description;

String firstPublicDate;

String firstPublicRelease;

String lastUpdatedDate;

String lastUpdatedRelease;

Vector<String> keywords;

// Database cross-references of the entry; may be null.
Vector<DBRefEntry> dbRefs;

// Features of the entry; coding features are converted to CDS features and
// protein products.
Vector<EmblFeature> features;

// Supplies the residues for the dna sequence; may be null if none returned.
EmblSequence sequence;
/**
 * Returns the accession of this entry.
 *
 * @return the accession
 */
public String getAccession()
{
  return accession;
}
/**
 * Sets the accession of this entry.
 *
 * @param accession
 *          the accession to set
 */
public void setAccession(String accession)
{
  this.accession = accession;
}
/**
 * Returns the database cross-references held for this entry.
 *
 * @return the dbRefs
 */
public Vector<DBRefEntry> getDbRefs()
{
  return dbRefs;
}
/**
 * Sets the database cross-references held for this entry.
 *
 * @param dbRefs
 *          the dbRefs to set
 */
public void setDbRefs(Vector<DBRefEntry> dbRefs)
{
  this.dbRefs = dbRefs;
}
/**
 * Returns the features held for this entry.
 *
 * @return the features
 */
public Vector<EmblFeature> getFeatures()
{
  return features;
}
/**
 * Sets the features held for this entry.
 *
 * @param features
 *          the features to set
 */
public void setFeatures(Vector<EmblFeature> features)
{
  this.features = features;
}
/**
 * Returns the keywords held for this entry.
 *
 * @return the keywords
 */
public Vector<String> getKeywords()
{
  return keywords;
}
/**
 * Sets the keywords held for this entry.
 *
 * @param keywords
 *          the keywords to set
 */
public void setKeywords(Vector<String> keywords)
{
  this.keywords = keywords;
}
/**
 * Returns the raw EMBL sequence object held for this entry (not the derived
 * Jalview sequence; see {@link #getSequence(String, List)} for that).
 *
 * @return the sequence
 */
public EmblSequence getSequence()
{
  return sequence;
}
/**
 * Sets the raw EMBL sequence object held for this entry.
 *
 * @param sequence
 *          the sequence to set
 */
public void setSequence(EmblSequence sequence)
{
  this.sequence = sequence;
}
180 * Recover annotated sequences from EMBL file
184 * a list of protein products found so far (to add to)
185 * @return dna dataset sequence with DBRefs and features
187 public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
189 SequenceI dna = makeSequence(sourceDb);
194 dna.setDescription(description);
195 DBRefEntry retrievedref = new DBRefEntry(sourceDb,
196 getSequenceVersion(), accession);
197 dna.addDBRef(retrievedref);
198 dna.setSourceDBRef(retrievedref);
199 // add map to indicate the sequence is a valid coordinate frame for the
201 retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
202 new int[] { 1, dna.getLength() }, 1, 1));
206 * transform EMBL Database refs to canonical form
210 for (DBRefEntry dbref : dbRefs)
212 dbref.setSource(DBRefUtils.getCanonicalName(dbref.getSource()));
217 SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
220 for (EmblFeature feature : features)
222 if (FeatureProperties.isCodingFeature(sourceDb, feature.getName()))
224 parseCodingFeature(feature, sourceDb, dna, peptides, matcher);
227 } catch (Exception e)
229 System.err.println("EMBL Record Features parsing error!");
231 .println("Please report the following to help@jalview.org :");
232 System.err.println("EMBL Record " + accession);
233 System.err.println("Resulted in exception: " + e.getMessage());
234 e.printStackTrace(System.err);
244 SequenceI makeSequence(String sourceDb)
246 if (sequence == null)
248 System.err.println("No sequence was returned for ENA accession "
252 SequenceI dna = new Sequence(sourceDb + "|" + accession,
253 sequence.getSequence());
/**
 * Extracts coding region and product from a CDS feature and properly decorate
 * it with annotations.
 *
 * In outline: reads the feature's qualifiers (translation, protein_id,
 * codon_start, product, plus any others), finds or creates the protein
 * product sequence, builds the dna-to-protein mapping and the protein's
 * cross-references, adds one CDS sequence feature to the dna per exon range,
 * and finally copies the feature's own dbrefs onto the dna (with mappings
 * for Uniprot xrefs).
 *
 * @param feature
 *          coding feature
 * @param sourceDb
 *          source database for the EMBLXML
 * @param dna
 *          parent dna sequence for this record
 * @param peptides
 *          list of protein product sequences for Embl entry
 * @param matcher
 *          helper to match xrefs in already retrieved sequences
 */
void parseCodingFeature(EmblFeature feature, String sourceDb,
        SequenceI dna, List<SequenceI> peptides, SequenceIdMatcher matcher)
{
  boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);

  int[] exons = getCdsRanges(feature);

  String translation = null;
  String proteinName = "";
  String proteinId = null;
  // qualifiers not handled explicitly below, as name -> comma-separated values
  Map<String, String> vals = new Hashtable<String, String>();

  /*
   * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
   * (phase is required for CDS features in GFF3 format)
   */
  int codonStart = 1;

  /*
   * parse qualifiers, saving protein translation, protein id,
   * codon start position, product (name), and 'other values'
   */
  if (feature.getQualifiers() != null)
  {
    for (Qualifier q : feature.getQualifiers())
    {
      String qname = q.getName();
      if (qname.equals("translation"))
      {
        // remove all spaces (precompiled String.replaceAll(" ", ""))
        translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll("");
      }
      else if (qname.equals("protein_id"))
      {
        proteinId = q.getValues()[0].trim();
      }
      else if (qname.equals("codon_start"))
      {
        try
        {
          codonStart = Integer.parseInt(q.getValues()[0].trim());
        } catch (NumberFormatException e)
        {
          // keep the default codonStart of 1 if the value is not a number
          System.err.println("Invalid codon_start in XML for "
                  + accession + ": " + e.getMessage());
        }
      }
      else if (qname.equals("product"))
      {
        // sometimes name is returned e.g. for V00488
        proteinName = q.getValues()[0].trim();
      }
      else
      {
        // throw anything else into the additional properties hash
        String[] qvals = q.getValues();
        if (qvals != null)
        {
          String commaSeparated = StringUtils.arrayToSeparatorList(qvals,
                  ",");
          vals.put(qname, commaSeparated);
        }
      }
    }
  }

  DBRefEntry proteinToEmblProteinRef = null;
  // NOTE(review): presumably drops the first (codonStart - 1) positions of
  // the CDS ranges so they start on a codon boundary - confirm in
  // MappingUtils
  exons = MappingUtils.removeStartPositions(codonStart - 1, exons);

  SequenceI product = null;
  Mapping dnaToProteinMapping = null;
  // proteinName starts as "" and is only reassigned from trim(), so the
  // proteinName test cannot fail; translation and proteinId decide this
  if (translation != null && proteinName != null && proteinId != null)
  {
    int translationLength = translation.length();

    /*
     * look for product in peptides list, if not found, add it
     */
    product = matcher.findIdMatch(proteinId);
    if (product == null)
    {
      product = new Sequence(proteinId, translation, 1, translationLength);
      product.setDescription(((proteinName.length() == 0) ? "Protein Product from "
              + sourceDb
              : proteinName));
      peptides.add(product);
      matcher.add(product);
    }

    // we have everything - create the mapping and perhaps the protein
    // sequence
    if (exons == null || exons.length == 0)
    {
      /*
       * workaround until we handle dna location for CDS sequence
       * e.g. location="X53828.1:60..1058" correctly
       */
      System.err
              .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
                      + sourceDb + ":" + getAccession() + ")");
      // the two tests below are mutually exclusive; if neither length
      // matches, no mapping is created and dnaToProteinMapping stays null
      if (translationLength * 3 == (1 - codonStart + dna.getSequence().length))
      {
        System.err
                .println("Not allowing for additional stop codon at end of cDNA fragment... !");
        // this might occur for CDS sequences where no features are marked
        exons = new int[] { dna.getStart() + (codonStart - 1),
            dna.getEnd() };
        dnaToProteinMapping = new Mapping(product, exons, new int[] { 1,
            translationLength }, 3, 1);
      }
      if ((translationLength + 1) * 3 == (1 - codonStart + dna
              .getSequence().length))
      {
        System.err
                .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
        // exclude the trailing stop codon from the mapped range
        exons = new int[] { dna.getStart() + (codonStart - 1),
            dna.getEnd() - 3 };
        dnaToProteinMapping = new Mapping(product, exons, new int[] { 1,
            translationLength }, 3, 1);
      }
    }
    else
    {
      // Trim the exon mapping if necessary - the given product may only be a
      // fragment of a larger protein. (EMBL:AY043181 is an example)

      if (isEmblCdna)
      {
        // TODO: Add a DbRef back to the parent EMBL sequence with the exon
        // map
        // if given a dataset reference, search dataset for parent EMBL
        // sequence if it exists and set its map
        // make a new feature annotating the coding contig
      }
      else
      {
        // final product length truncation check
        int[] cdsRanges = adjustForProteinLength(translationLength, exons);
        dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] {
            1, translationLength }, 3, 1);
        if (product != null)
        {
          /*
           * make xref with mapping from protein to EMBL dna
           */
          DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
                  getSequenceVersion(), proteinId, new Mapping(
                          dnaToProteinMapping.getMap().getInverse()));
          product.addDBRef(proteinToEmblRef);

          /*
           * make xref from protein to EMBLCDS; we assume here that the
           * CDS sequence version is same as dna sequence (?!)
           */
          MapList proteinToCdsMapList = new MapList(new int[] { 1,
              translationLength }, new int[] { 1 + (codonStart - 1),
              (codonStart - 1) + 3 * translationLength }, 1, 3);
          DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
                  DBRefSource.EMBLCDS, getSequenceVersion(), proteinId,
                  new Mapping(proteinToCdsMapList));
          product.addDBRef(proteinToEmblCdsRef);

          /*
           * make 'direct' xref from protein to EMBLCDSPROTEIN
           */
          proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
          proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
          proteinToEmblProteinRef.setMap(null);
          product.addDBRef(proteinToEmblProteinRef);
        }
      }
    }
  }

  /*
   * add cds features to dna sequence
   */
  // exons holds [start, end] pairs, hence the step of 2
  for (int xint = 0; exons != null && xint < exons.length; xint += 2)
  {
    SequenceFeature sf = makeCdsFeature(exons, xint, proteinName,
            proteinId, vals, codonStart);
    sf.setType(feature.getName()); // "CDS"
    sf.setEnaLocation(feature.getLocation());
    sf.setFeatureGroup(sourceDb);
    dna.addSequenceFeature(sf);
  }

  /*
   * add feature dbRefs to sequence, and mappings for Uniprot xrefs
   */
  boolean hasUniprotDbref = false;
  if (feature.dbRefs != null)
  {
    // set once dnaToProteinMapping has been attached to a Uniprot xref
    boolean mappingUsed = false;
    for (DBRefEntry ref : feature.dbRefs)
    {
      /*
       * ensure UniProtKB/Swiss-Prot converted to UNIPROT
       */
      String source = DBRefUtils.getCanonicalName(ref.getSource());
      ref.setSource(source);
      DBRefEntry proteinDbRef = new DBRefEntry(ref.getSource(), ref.getVersion(), ref
              .getAccessionId());
      if (source.equals(DBRefSource.UNIPROT))
      {
        String proteinSeqName = DBRefSource.UNIPROT + "|"
                + ref.getAccessionId();
        if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null)
        {
          if (mappingUsed)
          {
            /*
             * two or more Uniprot xrefs for the same CDS -
             * each needs a distinct Mapping (as to a different sequence)
             */
            dnaToProteinMapping = new Mapping(dnaToProteinMapping);
          }
          mappingUsed = true;

          /*
           * try to locate the protein mapped to (possibly by a
           * previous CDS feature); if not found, construct it from
           * the EMBL translation
           */
          SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
          if (proteinSeq == null)
          {
            proteinSeq = new Sequence(proteinSeqName,
                    product.getSequenceAsString());
            matcher.add(proteinSeq);
            peptides.add(proteinSeq);
          }
          dnaToProteinMapping.setTo(proteinSeq);
          dnaToProteinMapping.setMappedFromId(proteinId);
          proteinSeq.addDBRef(proteinDbRef);
          proteinSeq.setSourceDBRef(proteinDbRef);
          ref.setMap(dnaToProteinMapping);
        }
        hasUniprotDbref = true;
      }
      if (product != null)
      {
        /*
         * copy feature dbref to our protein product
         */
        DBRefEntry pref = proteinDbRef;
        pref.setMap(null); // reference is direct
        product.addDBRef(pref);
        // Add converse mapping reference
        if (dnaToProteinMapping != null)
        {
          Mapping pmap = new Mapping(dna, dnaToProteinMapping.getMap()
                  .getInverse());
          pref = new DBRefEntry(sourceDb, getSequenceVersion(),
                  this.getAccession());
          pref.setMap(pmap);
          if (dnaToProteinMapping.getTo() != null)
          {
            dnaToProteinMapping.getTo().addDBRef(pref);
          }
        }
      }
      dna.addDBRef(ref);
    }
  }

  /*
   * if we have a product (translation) but no explicit Uniprot dbref
   * (example: EMBL AAFI02000057 protein_id EAL65544.1)
   * then construct mappings to an assumed EMBLCDSPROTEIN accession
   */
  if (!hasUniprotDbref && product != null)
  {
    if (proteinToEmblProteinRef == null)
    {
      // assuming CDSPROTEIN sequence version = dna version (?!)
      proteinToEmblProteinRef = new DBRefEntry(
              DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId);
    }
    product.addDBRef(proteinToEmblProteinRef);
    product.setSourceDBRef(proteinToEmblProteinRef);

    if (dnaToProteinMapping != null
            && dnaToProteinMapping.getTo() != null)
    {
      DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
              DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId);
      dnaToEmblProteinRef.setMap(dnaToProteinMapping);
      dnaToProteinMapping.setMappedFromId(proteinId);
      dna.addDBRef(dnaToEmblProteinRef);
    }
  }
}
567 * Helper method to construct a SequenceFeature for one cds range
570 * array of cds [start, end, ...] positions
571 * @param exonStartIndex
572 * offset into the exons array
574 * @param proteinAccessionId
576 * map of 'miscellaneous values' for feature
578 * codon start position for CDS (1/2/3, normally 1)
581 protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex,
582 String proteinName, String proteinAccessionId,
583 Map<String, String> vals, int codonStart)
585 int exonNumber = exonStartIndex / 2 + 1;
586 SequenceFeature sf = new SequenceFeature();
587 sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1]));
588 sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1]));
589 sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s",
590 exonNumber, proteinName, proteinAccessionId));
591 sf.setPhase(String.valueOf(codonStart - 1));
592 sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+"
594 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
595 sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
598 StringBuilder sb = new StringBuilder();
599 boolean first = true;
600 for (Entry<String, String> val : vals.entrySet())
606 sb.append(val.getKey()).append("=").append(val.getValue());
608 sf.setValue(val.getKey(), val.getValue());
610 sf.setAttributes(sb.toString());
616 * Returns the CDS positions as a single array of [start, end, start, end...]
617 * positions. If on the reverse strand, these will be in descending order.
622 protected int[] getCdsRanges(EmblFeature feature)
624 if (feature.location == null)
631 List<int[]> ranges = DnaUtils.parseLocation(feature.location);
632 return listToArray(ranges);
633 } catch (ParseException e)
635 Cache.log.warn(String.format(
636 "Not parsing inexact CDS location %s in ENA %s",
637 feature.location, this.accession));
643 * Converts a list of [start, end] ranges to a single array of [start, end,
649 int[] listToArray(List<int[]> ranges)
651 int[] result = new int[ranges.size() * 2];
653 for (int[] range : ranges)
655 result[i++] = range[0];
656 result[i++] = range[1];
662 * Truncates (if necessary) the exon intervals to match 3 times the length of
663 * the protein; also accepts 3 bases longer (for stop codon not included in
666 * @param proteinLength
668 * an array of [start, end, start, end...] intervals
669 * @return the same array (if unchanged) or a truncated copy
671 static int[] adjustForProteinLength(int proteinLength, int[] exon)
673 if (proteinLength <= 0 || exon == null)
677 int expectedCdsLength = proteinLength * 3;
678 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
681 * if exon length matches protein, or is shorter, or longer by the
682 * length of a stop codon (3 bases), then leave it unchanged
684 if (expectedCdsLength >= exonLength
685 || expectedCdsLength == exonLength - 3)
693 origxon = new int[exon.length];
694 System.arraycopy(exon, 0, origxon, 0, exon.length);
696 for (int x = 0; x < exon.length; x += 2)
698 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
699 if (expectedCdsLength <= cdspos)
701 // advanced beyond last codon.
703 if (expectedCdsLength != cdspos)
706 // .println("Truncating final exon interval on region by "
707 // + (cdspos - cdslength));
711 * shrink the final exon - reduce end position if forward
712 * strand, increase it if reverse
714 if (exon[x + 1] >= exon[x])
716 endxon = exon[x + 1] - cdspos + expectedCdsLength;
720 endxon = exon[x + 1] + cdspos - expectedCdsLength;
728 // and trim the exon interval set if necessary
729 int[] nxon = new int[sxpos + 2];
730 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
731 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
/**
 * Returns the sequence version; this value is used as the version of the
 * dbrefs created from this entry.
 *
 * @return the sequenceVersion
 */
public String getSequenceVersion()
{
  return sequenceVersion;
}
/**
 * Sets the sequence version.
 *
 * @param sequenceVersion
 *          the sequenceVersion to set
 */
public void setSequenceVersion(String sequenceVersion)
{
  this.sequenceVersion = sequenceVersion;
}
/**
 * Returns the sequence length as held for this entry (a string, not parsed).
 *
 * @return the sequenceLength
 */
public String getSequenceLength()
{
  return sequenceLength;
}
/**
 * Sets the sequence length (stored as given, without validation).
 *
 * @param sequenceLength
 *          the sequenceLength to set
 */
public void setSequenceLength(String sequenceLength)
{
  this.sequenceLength = sequenceLength;
}
/**
 * Returns the entry version.
 *
 * @return the entryVersion
 */
public String getEntryVersion()
{
  return entryVersion;
}
/**
 * Sets the entry version.
 *
 * @param entryVersion
 *          the entryVersion to set
 */
public void setEntryVersion(String entryVersion)
{
  this.entryVersion = entryVersion;
}
/**
 * Returns the molecule type.
 *
 * @return the moleculeType
 */
public String getMoleculeType()
{
  return moleculeType;
}
/**
 * Sets the molecule type.
 *
 * @param moleculeType
 *          the moleculeType to set
 */
public void setMoleculeType(String moleculeType)
{
  this.moleculeType = moleculeType;
}
/**
 * Returns the topology.
 *
 * @return the topology
 */
public String getTopology()
{
  return topology;
}
/**
 * Sets the topology.
 *
 * @param topology
 *          the topology to set
 */
public void setTopology(String topology)
{
  this.topology = topology;
}
/**
 * Returns the taxonomic division.
 *
 * @return the taxonomicDivision
 */
public String getTaxonomicDivision()
{
  return taxonomicDivision;
}
/**
 * Sets the taxonomic division.
 *
 * @param taxonomicDivision
 *          the taxonomicDivision to set
 */
public void setTaxonomicDivision(String taxonomicDivision)
{
  this.taxonomicDivision = taxonomicDivision;
}
/**
 * Returns the description; this value is also copied onto the dna sequence
 * built from this entry.
 *
 * @return the description
 */
public String getDescription()
{
  return description;
}
/**
 * Sets the description.
 *
 * @param description
 *          the description to set
 */
public void setDescription(String description)
{
  this.description = description;
}
/**
 * Returns the first public date (held as a string, not parsed).
 *
 * @return the firstPublicDate
 */
public String getFirstPublicDate()
{
  return firstPublicDate;
}
/**
 * Sets the first public date (stored as given, without validation).
 *
 * @param firstPublicDate
 *          the firstPublicDate to set
 */
public void setFirstPublicDate(String firstPublicDate)
{
  this.firstPublicDate = firstPublicDate;
}
/**
 * Returns the first public release.
 *
 * @return the firstPublicRelease
 */
public String getFirstPublicRelease()
{
  return firstPublicRelease;
}
/**
 * Sets the first public release.
 *
 * @param firstPublicRelease
 *          the firstPublicRelease to set
 */
public void setFirstPublicRelease(String firstPublicRelease)
{
  this.firstPublicRelease = firstPublicRelease;
}
/**
 * Returns the last updated date (held as a string, not parsed).
 *
 * @return the lastUpdatedDate
 */
public String getLastUpdatedDate()
{
  return lastUpdatedDate;
}
/**
 * Sets the last updated date (stored as given, without validation).
 *
 * @param lastUpdatedDate
 *          the lastUpdatedDate to set
 */
public void setLastUpdatedDate(String lastUpdatedDate)
{
  this.lastUpdatedDate = lastUpdatedDate;
}
/**
 * Returns the last updated release.
 *
 * @return the lastUpdatedRelease
 */
public String getLastUpdatedRelease()
{
  return lastUpdatedRelease;
}
/**
 * Sets the last updated release.
 *
 * @param lastUpdatedRelease
 *          the lastUpdatedRelease to set
 */
public void setLastUpdatedRelease(String lastUpdatedRelease)
{
  this.lastUpdatedRelease = lastUpdatedRelease;
}
/**
 * Returns the data class.
 *
 * @return the dataClass
 */
public String getDataClass()
{
  return dataClass;
}
/**
 * Sets the data class.
 *
 * @param dataClass
 *          the dataClass to set
 */
public void setDataClass(String dataClass)
{
  this.dataClass = dataClass;
}