X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2Fxdb%2Fembl%2FEmblEntry.java;h=bbe6a209bc0dd44aad43aa30473f7daae6438278;hb=88515cdb74e4603a40c8c1dca14107b5ca503e04;hp=e6c7197e22c76962063a405e32287194ef00118d;hpb=3f89c3afc646bf1d83607110e538f20f9c01c3b2;p=jalview.git diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index e6c7197..bbe6a20 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -1,38 +1,92 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.datamodel.xdb.embl; +import jalview.analysis.SequenceIdMatcher; +import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.FeatureProperties; +import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; - -import java.util.Enumeration; +import jalview.util.DBRefUtils; +import jalview.util.DnaUtils; +import jalview.util.MapList; +import jalview.util.MappingUtils; +import jalview.util.StringUtils; + +import java.text.ParseException; +import java.util.Arrays; import java.util.Hashtable; -import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; import java.util.Vector; - +import java.util.regex.Pattern; + +/** + * Data model for one entry returned from an EMBL query, as marshalled by a + * Castor binding file + * + * For example: http://www.ebi.ac.uk/ena/data/view/J03321&display=xml + * + * @see embl_mapping.xml + */ public class EmblEntry { + private static final Pattern SPACE_PATTERN = Pattern.compile(" "); + String accession; - String version; + String entryVersion; + + String sequenceVersion; + + String dataClass; - String taxDivision; + String moleculeType; - String desc; + String topology; - String rCreated; + String sequenceLength; - String rLastUpdated; + String taxonomicDivision; - String lastUpdated; + String description; - Vector keywords; + String firstPublicDate; - Vector refs; + String firstPublicRelease; - Vector dbRefs; + String lastUpdatedDate; - Vector features; + String lastUpdatedRelease; + + Vector keywords; + + Vector dbRefs; + + Vector features; EmblSequence sequence; @@ -56,7 +110,7 @@ public class EmblEntry /** * @return the dbRefs */ - public Vector getDbRefs() + public Vector getDbRefs() { return dbRefs; } @@ -65,32 +119,15 @@ public class EmblEntry * @param dbRefs * the dbRefs to set */ - public void setDbRefs(Vector dbRefs) + public void setDbRefs(Vector dbRefs) { this.dbRefs = dbRefs; } /** - * @return the desc - */ - public String getDesc() - { - return desc; - } - - /** - * @param desc - * the desc to set - */ - public void setDesc(String desc) - { - this.desc = desc; - } - - /** * @return the features */ - public Vector getFeatures() + public Vector getFeatures() { return features; } @@ -99,7 +136,7 @@ public class EmblEntry * @param features * the features to set */ - public void setFeatures(Vector features) + public void setFeatures(Vector features) { this.features = features; } @@ -107,7 +144,7 @@ public class EmblEntry /** * @return the keywords */ - public Vector getKeywords() + public Vector getKeywords() { return keywords; } @@ -116,80 +153,12 @@ public class EmblEntry * @param keywords * the keywords to set */ - public void setKeywords(Vector keywords) + public void setKeywords(Vector keywords) { this.keywords = keywords; } /** - * @return the lastUpdated - */ - public String getLastUpdated() - { - return lastUpdated; - } - - /** - * @param lastUpdated - * the lastUpdated to set - */ - public void setLastUpdated(String lastUpdated) - { - this.lastUpdated = lastUpdated; - } - - /** - * @return the refs - */ - public Vector getRefs() - { - return refs; - } - - /** - * @param refs - * the refs to set - */ - public void setRefs(Vector refs) - { - this.refs = refs; - } - - /** - * @return the releaseCreated - */ - public String getRCreated() - { - return rCreated; - } - - /** - * @param releaseCreated - * the releaseCreated to set - */ - public void setRcreated(String releaseCreated) - { - this.rCreated = releaseCreated; - } - - /** - * @return the releaseLastUpdated - */ - public String getRLastUpdated() - { - return rLastUpdated; - } - - /** - * @param releaseLastUpdated - * the releaseLastUpdated to set - */ - public void setRLastUpdated(String releaseLastUpdated) - { - this.rLastUpdated = releaseLastUpdated; - } - - /** * @return the sequence */ public EmblSequence getSequence() @@ -207,370 +176,697 @@ public class EmblEntry } /** - * @return the taxDivision + * Recover annotated sequences from EMBL file + * + * @param sourceDb + * @param peptides + * a list of protein products found so far (to add to) + * @return dna dataset sequence with DBRefs and features */ - public String getTaxDivision() + public SequenceI getSequence(String sourceDb, List peptides) { - return taxDivision; - } + SequenceI dna = makeSequence(sourceDb); + if (dna == null) + { + return null; + } + dna.setDescription(description); + DBRefEntry retrievedref = new DBRefEntry(sourceDb, getSequenceVersion(), + accession); + dna.addDBRef(retrievedref); + // add map to indicate the sequence is a valid coordinate frame for the + // dbref + retrievedref + .setMap(new Mapping(null, new int[] + { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1, + 1)); + + /* + * transform EMBL Database refs to canonical form + */ + if (dbRefs != null) + { + for (DBRefEntry dbref : dbRefs) + { + dbref.setSource(DBRefUtils.getCanonicalName(dbref.getSource())); + dna.addDBRef(dbref); + } + } - /** - * @param taxDivision - * the taxDivision to set - */ - public void setTaxDivision(String taxDivision) - { - this.taxDivision = taxDivision; - } + SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); + try + { + for (EmblFeature feature : features) + { + if (FeatureProperties.isCodingFeature(sourceDb, feature.getName())) + { + parseCodingFeature(feature, sourceDb, dna, peptides, matcher); + } + } + } catch (Exception e) + { + System.err.println("EMBL Record Features parsing error!"); + System.err + .println("Please report the following to help@jalview.org :"); + System.err.println("EMBL Record " + accession); + System.err.println("Resulted in exception: " + e.getMessage()); + e.printStackTrace(System.err); + } - /** - * @return the version - */ - public String getVersion() - { - return version; + return dna; } /** - * @param version - * the version to set + * @param sourceDb + * @return */ - public void setVersion(String version) + SequenceI makeSequence(String sourceDb) { - this.version = version; + if (sequence == null) + { + System.err.println( + "No sequence was returned for ENA accession " + accession); + return null; + } + SequenceI dna = new Sequence(sourceDb + "|" + accession, + sequence.getSequence()); + return dna; } - /* - * EMBL Feature support is limited. The text below is included for the benefit - * of any developer working on improving EMBL feature import in Jalview. - * Extract from EMBL feature specification see - * http://www.embl-ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html - * 3.5 Location 3.5.1 Purpose - * - * The location indicates the region of the presented sequence which - * corresponds to a feature. - * - * 3.5.2 Format and conventions The location contains at least one sequence - * location descriptor and may contain one or more operators with one or more - * sequence location descriptors. Base numbers refer to the numbering in the - * entry. This numbering designates the first base (5' end) of the presented - * sequence as base 1. Base locations beyond the range of the presented - * sequence may not be used in location descriptors, the only exception being - * location in a remote entry (see 3.5.2.1, e). - * - * Location operators and descriptors are discussed in more detail below. - * - * 3.5.2.1 Location descriptors - * - * The location descriptor can be one of the following: (a) a single base - * number (b) a site between two indicated adjoining bases (c) a single base - * chosen from within a specified range of bases (not allowed for new entries) - * (d) the base numbers delimiting a sequence span (e) a remote entry - * identifier followed by a local location descriptor (i.e., a-d) - * - * A site between two adjoining nucleotides, such as endonucleolytic cleavage - * site, is indicated by listing the two points separated by a carat (^). The - * permitted formats for this descriptor are n^n+1 (for example 55^56), or, - * for circular molecules, n^1, where "n" is the full length of the molecule, - * ie 1000^1 for circular molecule with length 1000. - * - * A single base chosen from a range of bases is indicated by the first base - * number and the last base number of the range separated by a single period - * (e.g., '12.21' indicates a single base taken from between the indicated - * points). From October 2006 the usage of this descriptor is restricted : it - * is illegal to use "a single base from a range" (c) either on its own or in - * combination with the "sequence span" (d) descriptor for newly created - * entries. The existing entries where such descriptors exist are going to be - * retrofitted. - * - * Sequence spans are indicated by the starting base number and the ending - * base number separated by two periods (e.g., '34..456'). The '<' and '>' - * symbols may be used with the starting and ending base numbers to indicate - * that an end point is beyond the specified base number. The starting and - * ending base positions can be represented as distinct base numbers - * ('34..456') or a site between two indicated adjoining bases. - * - * A location in a remote entry (not the entry to which the feature table - * belongs) can be specified by giving the accession-number and sequence - * version of the remote entry, followed by a colon ":", followed by a - * location descriptor which applies to that entry's sequence (i.e. - * J12345.1:1..15, see also examples below) - * - * 3.5.2.2 Operators - * - * The location operator is a prefix that specifies what must be done to the - * indicated sequence to find or construct the location corresponding to the - * feature. A list of operators is given below with their definitions and most - * common format. - * - * complement(location) Find the complement of the presented sequence in the - * span specified by " location" (i.e., read the complement of the presented - * strand in its 5'-to-3' direction) - * - * join(location,location, ... location) The indicated elements should be - * joined (placed end-to-end) to form one contiguous sequence - * - * order(location,location, ... location) The elements can be found in the - * specified order (5' to 3' direction), but nothing is implied about the - * reasonableness about joining them - * - * Note : location operator "complement" can be used in combination with - * either " join" or "order" within the same location; combinations of "join" - * and "order" within the same location (nested operators) are illegal. - * - * - * - * 3.5.3 Location examples - * - * The following is a list of common location descriptors with their meanings: - * - * Location Description - * - * 467 Points to a single base in the presented sequence - * - * 340..565 Points to a continuous range of bases bounded by and including the - * starting and ending bases - * - * <345..500 Indicates that the exact lower boundary point of a feature is - * unknown. The location begins at some base previous to the first base - * specified (which need not be contained in the presented sequence) and - * continues to and includes the ending base - * - * <1..888 The feature starts before the first sequenced base and continues to - * and includes base 888 - * - * 1..>888 The feature starts at the first sequenced base and continues beyond - * base 888 - * - * 102.110 Indicates that the exact location is unknown but that it is one of - * the bases between bases 102 and 110, inclusive - * - * 123^124 Points to a site between bases 123 and 124 - * - * join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to - * form one contiguous sequence - * - * - * complement(34..126) Start at the base complementary to 126 and finish at - * the base complementary to base 34 (the feature is on the strand - * complementary to the presented strand) - * - * - * complement(join(2691..4571,4918..5163)) Joins regions 2691 to 4571 and 4918 - * to 5163, then complements the joined segments (the feature is on the strand - * complementary to the presented strand) - * - * join(complement(4918..5163),complement(2691..4571)) Complements regions - * 4918 to 5163 and 2691 to 4571, then joins the complemented segments (the - * feature is on the strand complementary to the presented strand) - * - * J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in - * this database) with primary accession number 'J00194' - * - * join(1..100,J00194.1:100..202) Joins region 1..100 of the existing entry - * with the region 100..202 of remote entry J00194 - * - */ /** - * Recover annotated sequences from EMBL file + * Extracts coding region and product from a CDS feature and properly decorate + * it with annotations. * - * @param noNa - * don't return nucleic acid sequences + * @param feature + * coding feature * @param sourceDb - * TODO - * @param noProtein - * don't return any translated protein sequences marked in features - * @return dataset sequences with DBRefs and features - DNA always comes first + * source database for the EMBLXML + * @param dna + * parent dna sequence for this record + * @param peptides + * list of protein product sequences for Embl entry + * @param matcher + * helper to match xrefs in already retrieved sequences */ - public jalview.datamodel.SequenceI[] getSequences(boolean noNa, - boolean noPeptide, String sourceDb) + void parseCodingFeature(EmblFeature feature, String sourceDb, + SequenceI dna, List peptides, + SequenceIdMatcher matcher) { - Vector seqs = new Vector(); - Sequence dna = null; - if (!noNa) - { - dna = new Sequence(sourceDb + "|" + accession, sequence.getSequence()); - dna.setDescription(desc); - dna.addDBRef(new DBRefEntry(sourceDb, version, accession)); - // TODO: add mapping for parentAccession attribute - // TODO: transform EMBL Database refs to canonical form - if (dbRefs != null) - for (Iterator i = dbRefs.iterator(); i.hasNext(); dna - .addDBRef((DBRefEntry) i.next())) - ; - } - try + boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); + + int[] exons = getCdsRanges(feature); + + String translation = null; + String proteinName = ""; + String proteinId = null; + Map vals = new Hashtable<>(); + + /* + * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS + * (phase is required for CDS features in GFF3 format) + */ + int codonStart = 1; + + /* + * parse qualifiers, saving protein translation, protein id, + * codon start position, product (name), and 'other values' + */ + if (feature.getQualifiers() != null) { - for (Iterator i = features.iterator(); i.hasNext();) + for (Qualifier q : feature.getQualifiers()) { - boolean nextFeature=false; - EmblFeature feature = (EmblFeature) i.next(); - if (!noNa) + String qname = q.getName(); + if (qname.equals("translation")) + { + // remove all spaces (precompiled String.replaceAll(" ", "")) + translation = SPACE_PATTERN.matcher(q.getValues()[0]) + .replaceAll(""); + } + else if (qname.equals("protein_id")) + { + proteinId = q.getValues()[0].trim(); + } + else if (qname.equals("codon_start")) { - if (feature.dbRefs != null && feature.dbRefs.size() > 0) + try { - for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext(); dna - .addDBRef((DBRefEntry) dbr.next())) - ; + codonStart = Integer.parseInt(q.getValues()[0].trim()); + } catch (NumberFormatException e) + { + System.err.println("Invalid codon_start in XML for " + accession + + ": " + e.getMessage()); } } - if (feature.getName().equalsIgnoreCase("CDS")) + else if (qname.equals("product")) { - // extract coding region(s) - jalview.datamodel.Mapping map = null; - int[] exon = null; - if (feature.locations != null && feature.locations.size() > 0) + // sometimes name is returned e.g. for V00488 + proteinName = q.getValues()[0].trim(); + } + else + { + // throw anything else into the additional properties hash + String[] qvals = q.getValues(); + if (qvals != null) { - for (Enumeration locs = feature.locations.elements(); locs - .hasMoreElements();) - { - EmblFeatureLocations loc = (EmblFeatureLocations) locs - .nextElement(); - int[] se = loc.getElementRanges(accession); - if (exon == null) - { - exon = se; - } - else - { - int[] t = new int[exon.length + se.length]; - System.arraycopy(exon, 0, t, 0, exon.length); - System.arraycopy(se, 0, t, exon.length, se.length); - exon = t; - } - } + String commaSeparated = StringUtils.arrayToSeparatorList(qvals, + ","); + vals.put(qname, commaSeparated); } - String prseq = null; - String prname = new String(); - String prid = null; - Hashtable vals = new Hashtable(); - int prstart = 1; - // get qualifiers - if (feature.getQualifiers() != null - && feature.getQualifiers().size() > 0) + } + } + } + + DBRefEntry proteinToEmblProteinRef = null; + exons = MappingUtils.removeStartPositions(codonStart - 1, exons); + + SequenceI product = null; + Mapping dnaToProteinMapping = null; + if (translation != null && proteinName != null && proteinId != null) + { + int translationLength = translation.length(); + + /* + * look for product in peptides list, if not found, add it + */ + product = matcher.findIdMatch(proteinId); + if (product == null) + { + product = new Sequence(proteinId, translation, 1, + translationLength); + product.setDescription(((proteinName.length() == 0) + ? "Protein Product from " + sourceDb + : proteinName)); + peptides.add(product); + matcher.add(product); + } + + // we have everything - create the mapping and perhaps the protein + // sequence + if (exons == null || exons.length == 0) + { + /* + * workaround until we handle dna location for CDS sequence + * e.g. location="X53828.1:60..1058" correctly + */ + System.err.println( + "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + + sourceDb + ":" + getAccession() + ")"); + int dnaLength = dna.getLength(); + if (translationLength * 3 == (1 - codonStart + dnaLength)) + { + System.err.println( + "Not allowing for additional stop codon at end of cDNA fragment... !"); + // this might occur for CDS sequences where no features are marked + exons = new int[] { dna.getStart() + (codonStart - 1), + dna.getEnd() }; + dnaToProteinMapping = new Mapping(product, exons, + new int[] + { 1, translationLength }, 3, 1); + } + if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength)) + { + System.err.println( + "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); + exons = new int[] { dna.getStart() + (codonStart - 1), + dna.getEnd() - 3 }; + dnaToProteinMapping = new Mapping(product, exons, + new int[] + { 1, translationLength }, 3, 1); + } + } + else + { + // Trim the exon mapping if necessary - the given product may only be a + // fragment of a larger protein. (EMBL:AY043181 is an example) + + if (isEmblCdna) + { + // TODO: Add a DbRef back to the parent EMBL sequence with the exon + // map + // if given a dataset reference, search dataset for parent EMBL + // sequence if it exists and set its map + // make a new feature annotating the coding contig + } + else + { + // final product length truncation check + int[] cdsRanges = adjustForProteinLength(translationLength, + exons); + dnaToProteinMapping = new Mapping(product, cdsRanges, + new int[] + { 1, translationLength }, 3, 1); + if (product != null) { - for (Iterator quals = feature.getQualifiers().iterator(); quals - .hasNext();) - { - Qualifier q = (Qualifier) quals.next(); - if (q.getName().equals("translation")) - { - prseq = q.getValues()[0]; - } - else if (q.getName().equals("protein_id")) - { - prid = q.getValues()[0]; - } - else if (q.getName().equals("codon_start")) - { - prstart = Integer.parseInt(q.getValues()[0]); - } - else if (q.getName().equals("product")) - { - prname = q.getValues()[0]; - } - else - { - // throw anything else into the additional properties hash - vals.put(q.getName(), q.getValues().toString()); - } - } + /* + * make xref with mapping from protein to EMBL dna + */ + DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL, + getSequenceVersion(), proteinId, + new Mapping(dnaToProteinMapping.getMap().getInverse())); + product.addDBRef(proteinToEmblRef); + + /* + * make xref from protein to EMBLCDS; we assume here that the + * CDS sequence version is same as dna sequence (?!) + */ + MapList proteinToCdsMapList = new MapList( + new int[] + { 1, translationLength }, + new int[] + { 1 + (codonStart - 1), + (codonStart - 1) + 3 * translationLength }, + 1, 3); + DBRefEntry proteinToEmblCdsRef = new DBRefEntry( + DBRefSource.EMBLCDS, getSequenceVersion(), proteinId, + new Mapping(proteinToCdsMapList)); + product.addDBRef(proteinToEmblCdsRef); + + /* + * make 'direct' xref from protein to EMBLCDSPROTEIN + */ + proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef); + proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct); + proteinToEmblProteinRef.setMap(null); + product.addDBRef(proteinToEmblProteinRef); } - Sequence product = null; - if (prseq != null && prname != null && prid != null) + } + } + + /* + * add cds features to dna sequence + */ + String cds = feature.getName(); // "CDS" + for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) + { + int exonStart = exons[xint]; + int exonEnd = exons[xint + 1]; + int begin = Math.min(exonStart, exonEnd); + int end = Math.max(exonStart, exonEnd); + int exonNumber = xint / 2 + 1; + String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s", + exonNumber, proteinName, proteinId); + + SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, + sourceDb, vals); + + sf.setEnaLocation(feature.getLocation()); + boolean forwardStrand = exonStart <= exonEnd; + sf.setStrand(forwardStrand ? "+" : "-"); + sf.setPhase(String.valueOf(codonStart - 1)); + sf.setValue(FeatureProperties.EXONPOS, exonNumber); + sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + + dna.addSequenceFeature(sf); + } + } + + /* + * add feature dbRefs to sequence, and mappings for Uniprot xrefs + */ + boolean hasUniprotDbref = false; + if (feature.dbRefs != null) + { + boolean mappingUsed = false; + for (DBRefEntry ref : feature.dbRefs) + { + /* + * ensure UniProtKB/Swiss-Prot converted to UNIPROT + */ + String source = DBRefUtils.getCanonicalName(ref.getSource()); + ref.setSource(source); + DBRefEntry proteinDbRef = new DBRefEntry(ref.getSource(), + ref.getVersion(), ref.getAccessionId()); + if (source.equals(DBRefSource.UNIPROT)) + { + String proteinSeqName = DBRefSource.UNIPROT + "|" + + ref.getAccessionId(); + if (dnaToProteinMapping != null + && dnaToProteinMapping.getTo() != null) { - // extract proteins. - if (!noPeptide) - { - product = new Sequence(sourceDb + "|" + "EMBLCDS|" + prid - + "|" + prname, prseq, prstart, prstart - + prseq.length() - 1); - product.setDescription("Protein Product from " + sourceDb); - seqs.add(product); - } - // we have everything - create the mapping and perhaps the protein - // sequence - map = new jalview.datamodel.Mapping(product, exon, new int[] - { prstart, prstart + prseq.length() - 1 }, 3, 1); - // add cds feature to dna seq - this may include the stop codon - for (int xint = 0; xint < exon.length; xint += 2) + if (mappingUsed) { - SequenceFeature sf = new SequenceFeature(); - sf.setBegin(exon[xint]); - sf.setEnd(exon[xint + 1]); - sf.setType(feature.getName()); - sf.setFeatureGroup(jalview.datamodel.DBRefSource.EMBL); - sf.setDescription("Exon " + (1 + xint) + " for protein '" - + prname + "' EMBLCDS:" + prid); - if (vals != null && vals.size() > 0) - { - Enumeration kv = vals.elements(); - while (kv.hasMoreElements()) - { - Object key = kv.nextElement(); - if (key != null) - sf.setValue(key.toString(), vals.get(key)); - } - } - dna.addSequenceFeature(sf); + /* + * two or more Uniprot xrefs for the same CDS - + * each needs a distinct Mapping (as to a different sequence) + */ + dnaToProteinMapping = new Mapping(dnaToProteinMapping); } - } - // add dbRefs to sequence - if (feature.dbRefs != null && feature.dbRefs.size() > 0) - { - for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext();) + mappingUsed = true; + + /* + * try to locate the protein mapped to (possibly by a + * previous CDS feature); if not found, construct it from + * the EMBL translation + */ + SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName); + if (proteinSeq == null) { - DBRefEntry ref = (DBRefEntry) dbr.next(); - ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref - .getSource())); - if (ref.getSource().equals( - jalview.datamodel.DBRefSource.UNIPROT)) - { - ref.setMap(map); - } - if (product != null) - { - DBRefEntry pref = new DBRefEntry(ref.getSource(), ref - .getVersion(), ref.getAccessionId()); - pref.setMap(null); // reference is direct - } - dna.addDBRef(ref); + proteinSeq = new Sequence(proteinSeqName, + product.getSequenceAsString()); + matcher.add(proteinSeq); + peptides.add(proteinSeq); } + dnaToProteinMapping.setTo(proteinSeq); + dnaToProteinMapping.setMappedFromId(proteinId); + proteinSeq.addDBRef(proteinDbRef); + ref.setMap(dnaToProteinMapping); } - + hasUniprotDbref = true; } - else + if (product != null) { - // General feature type. - if (!noNa) + /* + * copy feature dbref to our protein product + */ + DBRefEntry pref = proteinDbRef; + pref.setMap(null); // reference is direct + product.addDBRef(pref); + // Add converse mapping reference + if (dnaToProteinMapping != null) { - if (feature.dbRefs != null && feature.dbRefs.size() > 0) + Mapping pmap = new Mapping(dna, + dnaToProteinMapping.getMap().getInverse()); + pref = new DBRefEntry(sourceDb, getSequenceVersion(), + this.getAccession()); + pref.setMap(pmap); + if (dnaToProteinMapping.getTo() != null) { - for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext(); dna - .addDBRef((DBRefEntry) dbr.next())) - ; + dnaToProteinMapping.getTo().addDBRef(pref); } } } + dna.addDBRef(ref); } - } catch (Exception e) + } + + /* + * if we have a product (translation) but no explicit Uniprot dbref + * (example: EMBL AAFI02000057 protein_id EAL65544.1) + * then construct mappings to an assumed EMBLCDSPROTEIN accession + */ + if (!hasUniprotDbref && product != null) { - System.err.println("EMBL Record Features parsing error!"); - System.err.println("Please report the following to help@jalview.org :"); - System.err.println("EMBL Record "+accession); - System.err.println("Resulted in exception: "+e.getMessage()); - e.printStackTrace(System.err); + if (proteinToEmblProteinRef == null) + { + // assuming CDSPROTEIN sequence version = dna version (?!) + proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct, + getSequenceVersion(), proteinId); + } + product.addDBRef(proteinToEmblProteinRef); + + if (dnaToProteinMapping != null + && dnaToProteinMapping.getTo() != null) + { + DBRefEntry dnaToEmblProteinRef = new DBRefEntry( + DBRefSource.EMBLCDSProduct, getSequenceVersion(), + proteinId); + dnaToEmblProteinRef.setMap(dnaToProteinMapping); + dnaToProteinMapping.setMappedFromId(proteinId); + dna.addDBRef(dnaToEmblProteinRef); + } + } + } + + /** + * Helper method to construct a SequenceFeature for one cds range + * + * @param type + * feature type ("CDS") + * @param desc + * description + * @param begin + * start position + * @param end + * end position + * @param group + * feature group + * @param vals + * map of 'miscellaneous values' for feature + * @return + */ + protected SequenceFeature makeCdsFeature(String type, String desc, + int begin, int end, String group, Map vals) + { + SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group); + if (!vals.isEmpty()) + { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (Entry val : vals.entrySet()) + { + if (!first) + { + sb.append(";"); + } + sb.append(val.getKey()).append("=").append(val.getValue()); + first = false; + sf.setValue(val.getKey(), val.getValue()); + } + sf.setAttributes(sb.toString()); + } + return sf; + } + + /** + * Returns the CDS positions as a single array of [start, end, start, end...] + * positions. If on the reverse strand, these will be in descending order. + * + * @param feature + * @return + */ + protected int[] getCdsRanges(EmblFeature feature) + { + if (feature.location == null) + { + return new int[] {}; + } + + try + { + List ranges = DnaUtils.parseLocation(feature.location); + return listToArray(ranges); + } catch (ParseException e) + { + Cache.log.warn( + String.format("Not parsing inexact CDS location %s in ENA %s", + feature.location, this.accession)); + return new int[] {}; + } + } + + /** + * Converts a list of [start, end] ranges to a single array of [start, end, + * start, end ...] + * + * @param ranges + * @return + */ + int[] listToArray(List ranges) + { + int[] result = new int[ranges.size() * 2]; + int i = 0; + for (int[] range : ranges) + { + result[i++] = range[0]; + result[i++] = range[1]; } - if (!noNa && dna!=null) + return result; + } + + /** + * Truncates (if necessary) the exon intervals to match 3 times the length of + * the protein; also accepts 3 bases longer (for stop codon not included in + * protein) + * + * @param proteinLength + * @param exon + * an array of [start, end, start, end...] intervals + * @return the same array (if unchanged) or a truncated copy + */ + static int[] adjustForProteinLength(int proteinLength, int[] exon) + { + if (proteinLength <= 0 || exon == null) { - seqs.add(dna); + return exon; } - SequenceI[] sqs = new SequenceI[seqs.size()]; - for (int i = 0, j = seqs.size(); i < j; i++) + int expectedCdsLength = proteinLength * 3; + int exonLength = MappingUtils.getLength(Arrays.asList(exon)); + + /* + * if exon length matches protein, or is shorter, or longer by the + * length of a stop codon (3 bases), then leave it unchanged + */ + if (expectedCdsLength >= exonLength + || expectedCdsLength == exonLength - 3) { - sqs[i] = (SequenceI) seqs.elementAt(i); - seqs.set(i, null); + return exon; } - return sqs; + + int origxon[]; + int sxpos = -1; + int endxon = 0; + origxon = new int[exon.length]; + System.arraycopy(exon, 0, origxon, 0, exon.length); + int cdspos = 0; + for (int x = 0; x < exon.length; x += 2) + { + cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; + if (expectedCdsLength <= cdspos) + { + // advanced beyond last codon. + sxpos = x; + if (expectedCdsLength != cdspos) + { + // System.err + // .println("Truncating final exon interval on region by " + // + (cdspos - cdslength)); + } + + /* + * shrink the final exon - reduce end position if forward + * strand, increase it if reverse + */ + if (exon[x + 1] >= exon[x]) + { + endxon = exon[x + 1] - cdspos + expectedCdsLength; + } + else + { + endxon = exon[x + 1] + cdspos - expectedCdsLength; + } + break; + } + } + + if (sxpos != -1) + { + // and trim the exon interval set if necessary + int[] nxon = new int[sxpos + 2]; + System.arraycopy(exon, 0, nxon, 0, sxpos + 2); + nxon[sxpos + 1] = endxon; // update the end boundary for the new exon + // set + exon = nxon; + } + return exon; + } + + public String getSequenceVersion() + { + return sequenceVersion; + } + + public void setSequenceVersion(String sequenceVersion) + { + this.sequenceVersion = sequenceVersion; + } + + public String getSequenceLength() + { + return sequenceLength; + } + + public void setSequenceLength(String sequenceLength) + { + this.sequenceLength = sequenceLength; + } + + public String getEntryVersion() + { + return entryVersion; + } + + public void setEntryVersion(String entryVersion) + { + this.entryVersion = entryVersion; + } + + public String getMoleculeType() + { + return moleculeType; + } + + public void setMoleculeType(String moleculeType) + { + this.moleculeType = moleculeType; + } + + public String getTopology() + { + return topology; + } + + public void setTopology(String topology) + { + this.topology = topology; + } + + public String getTaxonomicDivision() + { + return taxonomicDivision; + } + + public void setTaxonomicDivision(String taxonomicDivision) + { + this.taxonomicDivision = taxonomicDivision; + } + + public String getDescription() + { + return description; + } + + public void setDescription(String description) + { + this.description = description; + } + + public String getFirstPublicDate() + { + return firstPublicDate; + } + + public void setFirstPublicDate(String firstPublicDate) + { + this.firstPublicDate = firstPublicDate; + } + + public String getFirstPublicRelease() + { + return firstPublicRelease; + } + + public void setFirstPublicRelease(String firstPublicRelease) + { + this.firstPublicRelease = firstPublicRelease; + } + + public String getLastUpdatedDate() + { + return lastUpdatedDate; + } + + public void setLastUpdatedDate(String lastUpdatedDate) + { + this.lastUpdatedDate = lastUpdatedDate; + } + + public String getLastUpdatedRelease() + { + return lastUpdatedRelease; + } + + public void setLastUpdatedRelease(String lastUpdatedRelease) + { + this.lastUpdatedRelease = lastUpdatedRelease; + } + + public String getDataClass() + { + return dataClass; + } + + public void setDataClass(String dataClass) + { + this.dataClass = dataClass; } }