2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.datamodel.xdb.embl;
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.datamodel.DBRefEntry;
25 import jalview.datamodel.DBRefSource;
26 import jalview.datamodel.FeatureProperties;
27 import jalview.datamodel.Mapping;
28 import jalview.datamodel.Sequence;
29 import jalview.datamodel.SequenceFeature;
30 import jalview.datamodel.SequenceI;
31 import jalview.util.DBRefUtils;
32 import jalview.util.DnaUtils;
33 import jalview.util.MapList;
34 import jalview.util.MappingUtils;
35 import jalview.util.StringUtils;
37 import java.util.Arrays;
38 import java.util.Hashtable;
39 import java.util.List;
41 import java.util.Map.Entry;
42 import java.util.Vector;
43 import java.util.regex.Pattern;
46 * Data model for one entry returned from an EMBL query, as marshalled by a
50 * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321
53 * @see embl_mapping.xml
55 public class EmblEntry
57 private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
63 String sequenceVersion;
69 String sequenceLength;
81 Vector<String> keywords;
83 Vector<DBRefEntry> dbRefs;
85 Vector<EmblFeature> features;
87 EmblSequence sequence;
90 * @return the accession
92 public String getAccession()
99 * the accession to set
101 public void setAccession(String accession)
103 this.accession = accession;
109 public Vector<DBRefEntry> getDbRefs()
118 public void setDbRefs(Vector<DBRefEntry> dbRefs)
120 this.dbRefs = dbRefs;
126 public String getDesc()
135 public void setDesc(String desc)
141 * @return the features
143 public Vector<EmblFeature> getFeatures()
150 * the features to set
152 public void setFeatures(Vector<EmblFeature> features)
154 this.features = features;
158 * @return the keywords
160 public Vector<String> getKeywords()
167 * the keywords to set
169 public void setKeywords(Vector<String> keywords)
171 this.keywords = keywords;
175 * @return the lastUpdated
177 public String getLastUpdated()
184 * the lastUpdated to set
186 public void setLastUpdated(String lastUpdated)
188 this.lastUpdated = lastUpdated;
192 * @return the releaseCreated
194 public String getRCreated()
200 * @param releaseCreated
201 * the releaseCreated to set
203 public void setRCreated(String releaseCreated)
205 this.rCreated = releaseCreated;
209 * @return the releaseLastUpdated
211 public String getRLastUpdated()
217 * @param releaseLastUpdated
218 * the releaseLastUpdated to set
220 public void setRLastUpdated(String releaseLastUpdated)
222 this.rLastUpdated = releaseLastUpdated;
226 * @return the sequence
228 public EmblSequence getSequence()
235 * the sequence to set
237 public void setSequence(EmblSequence sequence)
239 this.sequence = sequence;
243 * @return the taxDivision
245 public String getTaxDivision()
252 * the taxDivision to set
254 public void setTaxDivision(String taxDivision)
256 this.taxDivision = taxDivision;
260 * @return the entry version
262 public String getEntryVersion()
271 public void setEntryVersion(String version)
273 this.entryVersion = version;
277 * Recover annotated sequences from EMBL file
281 * a list of protein products found so far (to add to)
282 * @return dna dataset sequence with DBRefs and features
284 public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
286 SequenceI dna = new Sequence(sourceDb + "|" + accession,
287 sequence.getSequence());
288 dna.setDescription(desc);
289 DBRefEntry retrievedref = new DBRefEntry(sourceDb,
290 getSequenceVersion(), accession);
291 dna.addDBRef(retrievedref);
292 // add map to indicate the sequence is a valid coordinate frame for the
294 retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
295 new int[] { 1, dna.getLength() }, 1, 1));
296 // TODO: transform EMBL Database refs to canonical form
299 for (DBRefEntry dbref : dbRefs)
307 for (EmblFeature feature : features)
309 if (feature.dbRefs != null)
311 for (DBRefEntry dbref : feature.dbRefs)
316 if (FeatureProperties.isCodingFeature(sourceDb, feature.getName()))
318 parseCodingFeature(feature, sourceDb, dna, peptides);
321 } catch (Exception e)
323 System.err.println("EMBL Record Features parsing error!");
325 .println("Please report the following to help@jalview.org :");
326 System.err.println("EMBL Record " + accession);
327 System.err.println("Resulted in exception: " + e.getMessage());
328 e.printStackTrace(System.err);
335 * Extracts coding region and product from a CDS feature and properly decorate
336 * it with annotations.
341 * source database for the EMBLXML
343 * parent dna sequence for this record
345 * list of protein product sequences for Embl entry
347 void parseCodingFeature(EmblFeature feature, String sourceDb,
348 SequenceI dna, List<SequenceI> peptides)
350 boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
352 int[] exon = getCdsRanges(feature);
357 Map<String, String> vals = new Hashtable<String, String>();
358 SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
361 * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
362 * (phase is required for CDS features in GFF3 format)
367 * parse qualifiers, saving protein translation, protein id,
368 * codon start position, product (name), and 'other values'
370 if (feature.getQualifiers() != null)
372 for (Qualifier q : feature.getQualifiers())
374 String qname = q.getName();
375 if (qname.equals("translation"))
377 // remove all spaces (precompiled String.replaceAll(" ", ""))
378 prseq = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll("");
380 else if (qname.equals("protein_id"))
382 prid = q.getValues()[0];
384 else if (qname.equals("codon_start"))
388 codonStart = Integer.parseInt(q.getValues()[0]);
389 } catch (NumberFormatException e)
391 System.err.println("Invalid codon_start in XML for "
392 + accession + ": " + e.getMessage());
395 else if (qname.equals("product"))
397 // sometimes name is returned e.g. for V00488
398 prname = q.getValues()[0];
402 // throw anything else into the additional properties hash
403 String[] qvals = q.getValues();
406 String commaSeparated = StringUtils.arrayToSeparatorList(qvals,
408 vals.put(qname, commaSeparated);
414 DBRefEntry protEMBLCDS = null;
415 exon = MappingUtils.removeStartPositions(codonStart - 1, exon);
416 boolean noProteinDbref = true;
418 SequenceI product = null;
420 if (prseq != null && prname != null && prid != null)
423 * look for product in peptides list, if not found, add it
425 product = matcher.findIdMatch(prid);
428 product = new Sequence(prid, prseq, 1, prseq.length());
429 product.setDescription(((prname.length() == 0) ? "Protein Product from "
432 peptides.add(product);
433 matcher.add(product);
436 // we have everything - create the mapping and perhaps the protein
438 if (exon == null || exon.length == 0)
441 .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
442 + sourceDb + ":" + getAccession() + ")");
443 if (prseq.length() * 3 == (1 - codonStart + dna.getSequence().length))
446 .println("Not allowing for additional stop codon at end of cDNA fragment... !");
447 // this might occur for CDS sequences where no features are
449 exon = new int[] { dna.getStart() + (codonStart - 1),
451 map = new Mapping(product, exon, new int[] { 1, prseq.length() },
454 if ((prseq.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length))
457 .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
458 exon = new int[] { dna.getStart() + (codonStart - 1),
460 map = new Mapping(product, exon, new int[] { 1, prseq.length() },
466 // Trim the exon mapping if necessary - the given product may only be a
467 // fragment of a larger protein. (EMBL:AY043181 is an example)
471 // TODO: Add a DbRef back to the parent EMBL sequence with the exon
473 // if given a dataset reference, search dataset for parent EMBL
474 // sequence if it exists and set its map
475 // make a new feature annotating the coding contig
479 // final product length truncation check
480 // TODO should from range include stop codon even if not in protein
481 // in order to include stop codon in CDS sequence (as done for
483 int[] cdsRanges = adjustForProteinLength(prseq.length(), exon);
484 map = new Mapping(product, cdsRanges, new int[] { 1,
485 prseq.length() }, 3, 1);
486 // reconstruct the EMBLCDS entry
487 // TODO: this is only necessary when there codon annotation is
488 // complete (I think JBPNote)
489 DBRefEntry pcdnaref = new DBRefEntry();
490 pcdnaref.setAccessionId(prid);
491 pcdnaref.setSource(DBRefSource.EMBLCDS);
492 pcdnaref.setVersion(getSequenceVersion()); // same as parent EMBL
494 MapList mp = new MapList(new int[] { 1, prseq.length() },
495 new int[] { 1 + (codonStart - 1),
496 (codonStart - 1) + 3 * prseq.length() }, 1, 3);
497 pcdnaref.setMap(new Mapping(mp));
500 product.addDBRef(pcdnaref);
501 protEMBLCDS = new DBRefEntry(pcdnaref);
502 protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
503 product.addDBRef(protEMBLCDS);
507 // add cds feature to dna seq - this may include the stop codon
508 for (int xint = 0; exon != null && xint < exon.length; xint += 2)
510 SequenceFeature sf = makeCdsFeature(exon, xint, prname, prid, vals,
512 sf.setType(feature.getName()); // "CDS"
513 sf.setEnaLocation(feature.getLocation());
514 sf.setFeatureGroup(sourceDb);
515 dna.addSequenceFeature(sf);
520 * add dbRefs to sequence, and mappings for Uniprot xrefs
522 if (feature.dbRefs != null)
524 boolean mappingUsed = false;
525 for (DBRefEntry ref : feature.dbRefs)
527 ref.setSource(DBRefUtils.getCanonicalName(ref.getSource()));
528 if (ref.getSource().equals(DBRefSource.UNIPROT))
530 String proteinSeqName = DBRefSource.UNIPROT + "|"
531 + ref.getAccessionId();
532 if (map != null && map.getTo() != null)
537 * two or more Uniprot xrefs for the same CDS -
538 * each needs a distinct Mapping (as to a different sequence)
540 map = new Mapping(map);
545 * try to locate the protein mapped to (possibly by a
546 * previous CDS feature)
548 SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
549 if (proteinSeq == null)
551 proteinSeq = new Sequence(proteinSeqName,
552 product.getSequenceAsString());
553 matcher.add(proteinSeq);
554 peptides.add(proteinSeq);
556 map.setTo(proteinSeq);
557 map.getTo().addDBRef(
558 new DBRefEntry(ref.getSource(), ref.getVersion(), ref
562 noProteinDbref = false;
566 DBRefEntry pref = new DBRefEntry(ref.getSource(),
567 ref.getVersion(), ref.getAccessionId());
568 pref.setMap(null); // reference is direct
569 product.addDBRef(pref);
570 // Add converse mapping reference
573 Mapping pmap = new Mapping(dna, map.getMap().getInverse());
574 pref = new DBRefEntry(sourceDb, getSequenceVersion(),
575 this.getAccession());
577 if (map.getTo() != null)
579 map.getTo().addDBRef(pref);
585 if (noProteinDbref && product != null)
587 // add protein coding reference to dna sequence so xref matches
588 if (protEMBLCDS == null)
590 protEMBLCDS = new DBRefEntry();
591 protEMBLCDS.setAccessionId(prid);
592 protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
593 protEMBLCDS.setVersion(getSequenceVersion());
595 .setMap(new Mapping(product, map.getMap().getInverse()));
597 product.addDBRef(protEMBLCDS);
599 // Add converse mapping reference
602 Mapping pmap = new Mapping(product, protEMBLCDS.getMap().getMap()
604 DBRefEntry ncMap = new DBRefEntry(protEMBLCDS);
606 if (map.getTo() != null)
616 * Helper method to construct a SequenceFeature for one cds range
619 * array of cds [start, end, ...] positions
620 * @param exonStartIndex
621 * offset into the exons array
623 * @param proteinAccessionId
625 * map of 'miscellaneous values' for feature
627 * codon start position for CDS (1/2/3, normally 1)
630 protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex,
631 String proteinName, String proteinAccessionId,
632 Map<String, String> vals, int codonStart)
634 int exonNumber = exonStartIndex / 2 + 1;
635 SequenceFeature sf = new SequenceFeature();
636 sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1]));
637 sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1]));
638 sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s",
639 exonNumber, proteinName, proteinAccessionId));
640 sf.setPhase(String.valueOf(codonStart - 1));
641 sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+"
643 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
644 sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
647 StringBuilder sb = new StringBuilder();
648 boolean first = true;
649 for (Entry<String, String> val : vals.entrySet())
655 sb.append(val.getKey()).append("=").append(val.getValue());
657 sf.setValue(val.getKey(), val.getValue());
659 sf.setAttributes(sb.toString());
665 * Returns the CDS positions as a list of [start, end, start, end...]
666 * positions. If on the reverse strand, these will be in descending order.
671 protected int[] getCdsRanges(EmblFeature feature)
673 if (feature.location == null)
677 List<int[]> ranges = DnaUtils.parseLocation(feature.location);
678 return ranges == null ? new int[] {} : listToArray(ranges);
682 * Converts a list of [start, end] ranges to a single array of [start, end,
688 int[] listToArray(List<int[]> ranges)
690 int[] result = new int[ranges.size() * 2];
692 for (int[] range : ranges)
694 result[i++] = range[0];
695 result[i++] = range[1];
701 * truncate the last exon interval to the prlength'th codon
707 static int[] adjustForProteinLength(int prlength, int[] exon)
709 if (prlength <= 0 || exon == null)
713 int desiredCdsLength = prlength * 3;
714 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
717 * assuming here exon might include stop codon in addition to protein codons
719 if (desiredCdsLength == exonLength
720 || desiredCdsLength == exonLength - 3)
728 origxon = new int[exon.length];
729 System.arraycopy(exon, 0, origxon, 0, exon.length);
731 for (int x = 0; x < exon.length; x += 2)
733 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
734 if (desiredCdsLength <= cdspos)
736 // advanced beyond last codon.
738 if (desiredCdsLength != cdspos)
741 // .println("Truncating final exon interval on region by "
742 // + (cdspos - cdslength));
746 * shrink the final exon - reduce end position if forward
747 * strand, increase it if reverse
749 if (exon[x + 1] >= exon[x])
751 endxon = exon[x + 1] - cdspos + desiredCdsLength;
755 endxon = exon[x + 1] + cdspos - desiredCdsLength;
763 // and trim the exon interval set if necessary
764 int[] nxon = new int[sxpos + 2];
765 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
766 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
773 public String getSequenceVersion()
775 return sequenceVersion;
778 public void setSequenceVersion(String sequenceVersion)
780 this.sequenceVersion = sequenceVersion;
783 public String getMoleculeType()
788 public void setMoleculeType(String moleculeType)
790 this.moleculeType = moleculeType;
793 public String getTopology()
798 public void setTopology(String topology)
800 this.topology = topology;
803 public String getSequenceLength()
805 return sequenceLength;
808 public void setSequenceLength(String sequenceLength)
810 this.sequenceLength = sequenceLength;
813 public String getrCreated()
818 public void setrCreated(String rCreated)
820 this.rCreated = rCreated;
823 public String getrLastUpdated()
828 public void setrLastUpdated(String rLastUpdated)
830 this.rLastUpdated = rLastUpdated;