X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Fdatamodel%2Fxdb%2Fembl%2FEmblEntry.java;h=0ae49b998d1d27a1f8dac69e6eb10a4a476d4944;hb=HEAD;hp=a2354edb653b80f81a14b1746d02f872a0e39ad5;hpb=f28d892d6d2584e7eb44ff7333d49d60d787f706;p=jalview.git diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java deleted file mode 100644 index a2354ed..0000000 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ /dev/null @@ -1,825 +0,0 @@ -/* - * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) - * Copyright (C) $$Year-Rel$$ The Jalview Authors - * - * This file is part of Jalview. - * - * Jalview is free software: you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, either version 3 - * of the License, or (at your option) any later version. - * - * Jalview is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Jalview. If not, see . - * The Jalview Authors are detailed in the 'AUTHORS' file. - */ -package jalview.datamodel.xdb.embl; - -import jalview.analysis.SequenceIdMatcher; -import jalview.bin.Cache; -import jalview.datamodel.DBRefEntry; -import jalview.datamodel.DBRefSource; -import jalview.datamodel.FeatureProperties; -import jalview.datamodel.Mapping; -import jalview.datamodel.Sequence; -import jalview.datamodel.SequenceFeature; -import jalview.datamodel.SequenceI; -import jalview.util.DBRefUtils; -import jalview.util.DnaUtils; -import jalview.util.MapList; -import jalview.util.MappingUtils; -import jalview.util.StringUtils; - -import java.text.ParseException; -import java.util.Arrays; -import java.util.Hashtable; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Vector; -import java.util.regex.Pattern; - -/** - * Data model for one entry returned from an EMBL query, as marshalled by a - * Castor binding file - * - * For example: - * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321 - * &format=emblxml - * - * @see embl_mapping.xml - */ -public class EmblEntry -{ - private static final Pattern SPACE_PATTERN = Pattern.compile(" "); - - String accession; - - String entryVersion; - - String sequenceVersion; - - String dataClass; - - String moleculeType; - - String topology; - - String sequenceLength; - - String taxonomicDivision; - - String description; - - String firstPublicDate; - - String firstPublicRelease; - - String lastUpdatedDate; - - String lastUpdatedRelease; - - Vector keywords; - - Vector dbRefs; - - Vector features; - - EmblSequence sequence; - - /** - * @return the accession - */ - public String getAccession() - { - return accession; - } - - /** - * @param accession - * the accession to set - */ - public void setAccession(String accession) - { - this.accession = accession; - } - - /** - * @return the dbRefs - */ - public Vector getDbRefs() - { - return dbRefs; - } - - /** - * @param dbRefs - * the dbRefs to set - */ - public void setDbRefs(Vector dbRefs) - { - this.dbRefs = dbRefs; - } - - /** - * @return the features - */ - public Vector getFeatures() - { - return features; - } - - /** - * @param features - * the features to set - */ - public void setFeatures(Vector features) - { - this.features = features; - } - - /** - * @return the keywords - */ - public Vector getKeywords() - { - return keywords; - } - - /** - * @param keywords - * the keywords to set - */ - public void setKeywords(Vector keywords) - { - this.keywords = keywords; - } - - /** - * @return the sequence - */ - public EmblSequence getSequence() - { - return sequence; - } - - /** - * @param sequence - * the sequence to set - */ - public void setSequence(EmblSequence sequence) - { - this.sequence = sequence; - } - - /** - * Recover annotated sequences from EMBL file - * - * @param sourceDb - * @param peptides - * a list of protein products found so far (to add to) - * @return dna dataset sequence with DBRefs and features - */ - public SequenceI getSequence(String sourceDb, List peptides) - { - SequenceI dna = new Sequence(sourceDb + "|" + accession, - sequence.getSequence()); - dna.setDescription(description); - DBRefEntry retrievedref = new DBRefEntry(sourceDb, - getSequenceVersion(), accession); - dna.addDBRef(retrievedref); - // add map to indicate the sequence is a valid coordinate frame for the - // dbref - retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() }, - new int[] { 1, dna.getLength() }, 1, 1)); - - - /* - * transform EMBL Database refs to canonical form - */ - if (dbRefs != null) - { - for (DBRefEntry dbref : dbRefs) - { - dbref.setSource(DBRefUtils.getCanonicalName(dbref.getSource())); - dna.addDBRef(dbref); - } - } - - SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); - try - { - for (EmblFeature feature : features) - { - if (FeatureProperties.isCodingFeature(sourceDb, feature.getName())) - { - parseCodingFeature(feature, sourceDb, dna, peptides, matcher); - } - } - } catch (Exception e) - { - System.err.println("EMBL Record Features parsing error!"); - System.err - .println("Please report the following to help@jalview.org :"); - System.err.println("EMBL Record " + accession); - System.err.println("Resulted in exception: " + e.getMessage()); - e.printStackTrace(System.err); - } - - return dna; - } - - /** - * Extracts coding region and product from a CDS feature and properly decorate - * it with annotations. - * - * @param feature - * coding feature - * @param sourceDb - * source database for the EMBLXML - * @param dna - * parent dna sequence for this record - * @param peptides - * list of protein product sequences for Embl entry - * @param matcher - * helper to match xrefs in already retrieved sequences - */ - void parseCodingFeature(EmblFeature feature, String sourceDb, - SequenceI dna, List peptides, SequenceIdMatcher matcher) - { - boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); - - int[] exons = getCdsRanges(feature); - - String translation = null; - String proteinName = ""; - String proteinId = null; - Map vals = new Hashtable(); - - /* - * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS - * (phase is required for CDS features in GFF3 format) - */ - int codonStart = 1; - - /* - * parse qualifiers, saving protein translation, protein id, - * codon start position, product (name), and 'other values' - */ - if (feature.getQualifiers() != null) - { - for (Qualifier q : feature.getQualifiers()) - { - String qname = q.getName(); - if (qname.equals("translation")) - { - // remove all spaces (precompiled String.replaceAll(" ", "")) - translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll(""); - } - else if (qname.equals("protein_id")) - { - proteinId = q.getValues()[0].trim(); - } - else if (qname.equals("codon_start")) - { - try - { - codonStart = Integer.parseInt(q.getValues()[0].trim()); - } catch (NumberFormatException e) - { - System.err.println("Invalid codon_start in XML for " - + accession + ": " + e.getMessage()); - } - } - else if (qname.equals("product")) - { - // sometimes name is returned e.g. for V00488 - proteinName = q.getValues()[0].trim(); - } - else - { - // throw anything else into the additional properties hash - String[] qvals = q.getValues(); - if (qvals != null) - { - String commaSeparated = StringUtils.arrayToSeparatorList(qvals, - ","); - vals.put(qname, commaSeparated); - } - } - } - } - - DBRefEntry proteinToEmblProteinRef = null; - exons = MappingUtils.removeStartPositions(codonStart - 1, exons); - - SequenceI product = null; - Mapping dnaToProteinMapping = null; - if (translation != null && proteinName != null && proteinId != null) - { - /* - * look for product in peptides list, if not found, add it - */ - product = matcher.findIdMatch(proteinId); - if (product == null) - { - product = new Sequence(proteinId, translation, 1, translation.length()); - product.setDescription(((proteinName.length() == 0) ? "Protein Product from " - + sourceDb - : proteinName)); - peptides.add(product); - matcher.add(product); - } - - // we have everything - create the mapping and perhaps the protein - // sequence - if (exons == null || exons.length == 0) - { - System.err - .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" - + sourceDb + ":" + getAccession() + ")"); - if (translation.length() * 3 == (1 - codonStart + dna.getSequence().length)) - { - System.err - .println("Not allowing for additional stop codon at end of cDNA fragment... !"); - // this might occur for CDS sequences where no features are - // marked. - exons = new int[] { dna.getStart() + (codonStart - 1), - dna.getEnd() }; - dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, - translation.length() }, - 3, 1); - } - if ((translation.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length)) - { - System.err - .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); - exons = new int[] { dna.getStart() + (codonStart - 1), - dna.getEnd() - 3 }; - dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, - translation.length() }, - 3, 1); - } - } - else - { - // Trim the exon mapping if necessary - the given product may only be a - // fragment of a larger protein. (EMBL:AY043181 is an example) - - if (isEmblCdna) - { - // TODO: Add a DbRef back to the parent EMBL sequence with the exon - // map - // if given a dataset reference, search dataset for parent EMBL - // sequence if it exists and set its map - // make a new feature annotating the coding contig - } - else - { - // final product length truncation check - int[] cdsRanges = adjustForProteinLength(translation.length(), exons); - dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] { 1, - translation.length() }, 3, 1); - if (product != null) - { - /* - * make xrefs from protein to EMBLCDS and EMBLCDSPROTEIN - */ - DBRefEntry proteinToEmblCdsRef = new DBRefEntry(); - proteinToEmblCdsRef.setAccessionId(proteinId); - proteinToEmblCdsRef.setSource(DBRefSource.EMBLCDS); - proteinToEmblCdsRef.setVersion(getSequenceVersion()); // same as - // parent EMBL - // version. - MapList mp = new MapList(new int[] { 1, translation.length() }, - new int[] { 1 + (codonStart - 1), - (codonStart - 1) + 3 * translation.length() }, 1, 3); - proteinToEmblCdsRef.setMap(new Mapping(mp)); - product.addDBRef(proteinToEmblCdsRef); - proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef); - MapList mp2 = new MapList( - new int[] { 1, translation.length() }, new int[] { 1, - translation.length() }, 1, 1); - proteinToEmblProteinRef.setMap(new Mapping(mp2)); - proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct); - product.addDBRef(proteinToEmblProteinRef); - } - } - } - - /* - * add cds features to dna sequence - */ - for (int xint = 0; exons != null && xint < exons.length; xint += 2) - { - SequenceFeature sf = makeCdsFeature(exons, xint, proteinName, proteinId, vals, - codonStart); - sf.setType(feature.getName()); // "CDS" - sf.setEnaLocation(feature.getLocation()); - sf.setFeatureGroup(sourceDb); - dna.addSequenceFeature(sf); - } - } - - /* - * add feature dbRefs to sequence, and mappings for Uniprot xrefs - */ - boolean hasUniprotDbref = false; - if (feature.dbRefs != null) - { - boolean mappingUsed = false; - for (DBRefEntry ref : feature.dbRefs) - { - /* - * ensure UniProtKB/Swiss-Prot converted to UNIPROT - */ - String source = DBRefUtils.getCanonicalName(ref.getSource()); - ref.setSource(source); - DBRefEntry proteinToDnaRef = new DBRefEntry(ref.getSource(), ref.getVersion(), ref - .getAccessionId()); - if (source.equals(DBRefSource.UNIPROT)) - { - String proteinSeqName = DBRefSource.UNIPROT + "|" - + ref.getAccessionId(); - if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null) - { - if (mappingUsed) - { - /* - * two or more Uniprot xrefs for the same CDS - - * each needs a distinct Mapping (as to a different sequence) - */ - dnaToProteinMapping = new Mapping(dnaToProteinMapping); - } - mappingUsed = true; - - /* - * try to locate the protein mapped to (possibly by a - * previous CDS feature); if not found, construct it from - * the EMBL translation - */ - SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName); - if (proteinSeq == null) - { - proteinSeq = new Sequence(proteinSeqName, - product.getSequenceAsString()); - matcher.add(proteinSeq); - peptides.add(proteinSeq); - } - dnaToProteinMapping.setTo(proteinSeq); - proteinSeq.addDBRef(proteinToDnaRef); - ref.setMap(dnaToProteinMapping); - } - hasUniprotDbref = true; - } - if (product != null) - { - /* - * copy feature dbref to our protein product - */ - DBRefEntry pref = proteinToDnaRef; - pref.setMap(null); // reference is direct - product.addDBRef(pref); - // Add converse mapping reference - if (dnaToProteinMapping != null) - { - Mapping pmap = new Mapping(dna, dnaToProteinMapping.getMap() - .getInverse()); - pref = new DBRefEntry(sourceDb, getSequenceVersion(), - this.getAccession()); - pref.setMap(pmap); - if (dnaToProteinMapping.getTo() != null) - { - dnaToProteinMapping.getTo().addDBRef(pref); - } - } - } - dna.addDBRef(ref); - } - } - /* - * if we have a product (translation) but no explicit Uniprot dbref - * (example: EMBL AAFI02000057 protein_id EAL65544.1 - * construct mappings to an assumed EMBLCDSPROTEIN accession - */ - if (!hasUniprotDbref && product != null) - { - if (proteinToEmblProteinRef == null) - { - proteinToEmblProteinRef = new DBRefEntry(); - proteinToEmblProteinRef.setAccessionId(proteinId); - proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct); - proteinToEmblProteinRef.setVersion(getSequenceVersion()); - proteinToEmblProteinRef.setMap(new Mapping(product, - dnaToProteinMapping.getMap().getInverse())); - } - product.addDBRef(proteinToEmblProteinRef); - - if (dnaToProteinMapping != null - && dnaToProteinMapping.getTo() != null) - { - DBRefEntry dnaToEmblProteinRef = new DBRefEntry( - proteinToEmblProteinRef); - dnaToEmblProteinRef.setMap(dnaToProteinMapping); - dna.addDBRef(dnaToEmblProteinRef); - } - } - } - - /** - * Helper method to construct a SequenceFeature for one cds range - * - * @param exons - * array of cds [start, end, ...] positions - * @param exonStartIndex - * offset into the exons array - * @param proteinName - * @param proteinAccessionId - * @param vals - * map of 'miscellaneous values' for feature - * @param codonStart - * codon start position for CDS (1/2/3, normally 1) - * @return - */ - protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex, - String proteinName, String proteinAccessionId, - Map vals, int codonStart) - { - int exonNumber = exonStartIndex / 2 + 1; - SequenceFeature sf = new SequenceFeature(); - sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s", - exonNumber, proteinName, proteinAccessionId)); - sf.setPhase(String.valueOf(codonStart - 1)); - sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" - : "-"); - sf.setValue(FeatureProperties.EXONPOS, exonNumber); - sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); - if (!vals.isEmpty()) - { - StringBuilder sb = new StringBuilder(); - boolean first = true; - for (Entry val : vals.entrySet()) - { - if (!first) - { - sb.append(";"); - } - sb.append(val.getKey()).append("=").append(val.getValue()); - first = false; - sf.setValue(val.getKey(), val.getValue()); - } - sf.setAttributes(sb.toString()); - } - return sf; - } - - /** - * Returns the CDS positions as a single array of [start, end, start, end...] - * positions. If on the reverse strand, these will be in descending order. - * - * @param feature - * @return - */ - protected int[] getCdsRanges(EmblFeature feature) - { - if (feature.location == null) - { - return new int[] {}; - } - - try - { - List ranges = DnaUtils.parseLocation(feature.location); - return listToArray(ranges); - } catch (ParseException e) - { - Cache.log.warn(String.format( - "Not parsing inexact CDS location %s in ENA %s", - feature.location, this.accession)); - return new int[] {}; - } - } - - /** - * Converts a list of [start, end] ranges to a single array of [start, end, - * start, end ...] - * - * @param ranges - * @return - */ - int[] listToArray(List ranges) - { - int[] result = new int[ranges.size() * 2]; - int i = 0; - for (int[] range : ranges) - { - result[i++] = range[0]; - result[i++] = range[1]; - } - return result; - } - - /** - * Truncates (if necessary) the exon intervals to match 3 times the length of - * the protein; also accepts 3 bases longer (for stop codon not included in - * protein) - * - * @param proteinLength - * @param exon - * an array of [start, end, start, end...] intervals - * @return the same array (if unchanged) or a truncated copy - */ - static int[] adjustForProteinLength(int proteinLength, int[] exon) - { - if (proteinLength <= 0 || exon == null) - { - return exon; - } - int expectedCdsLength = proteinLength * 3; - int exonLength = MappingUtils.getLength(Arrays.asList(exon)); - - /* - * if exon length matches protein, or is shorter, or longer by the - * length of a stop codon (3 bases), then leave it unchanged - */ - if (expectedCdsLength >= exonLength - || expectedCdsLength == exonLength - 3) - { - return exon; - } - - int origxon[]; - int sxpos = -1; - int endxon = 0; - origxon = new int[exon.length]; - System.arraycopy(exon, 0, origxon, 0, exon.length); - int cdspos = 0; - for (int x = 0; x < exon.length; x += 2) - { - cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; - if (expectedCdsLength <= cdspos) - { - // advanced beyond last codon. - sxpos = x; - if (expectedCdsLength != cdspos) - { - // System.err - // .println("Truncating final exon interval on region by " - // + (cdspos - cdslength)); - } - - /* - * shrink the final exon - reduce end position if forward - * strand, increase it if reverse - */ - if (exon[x + 1] >= exon[x]) - { - endxon = exon[x + 1] - cdspos + expectedCdsLength; - } - else - { - endxon = exon[x + 1] + cdspos - expectedCdsLength; - } - break; - } - } - - if (sxpos != -1) - { - // and trim the exon interval set if necessary - int[] nxon = new int[sxpos + 2]; - System.arraycopy(exon, 0, nxon, 0, sxpos + 2); - nxon[sxpos + 1] = endxon; // update the end boundary for the new exon - // set - exon = nxon; - } - return exon; - } - - public String getSequenceVersion() - { - return sequenceVersion; - } - - public void setSequenceVersion(String sequenceVersion) - { - this.sequenceVersion = sequenceVersion; - } - - public String getSequenceLength() - { - return sequenceLength; - } - - public void setSequenceLength(String sequenceLength) - { - this.sequenceLength = sequenceLength; - } - - public String getEntryVersion() - { - return entryVersion; - } - - public void setEntryVersion(String entryVersion) - { - this.entryVersion = entryVersion; - } - - public String getMoleculeType() - { - return moleculeType; - } - - public void setMoleculeType(String moleculeType) - { - this.moleculeType = moleculeType; - } - - public String getTopology() - { - return topology; - } - - public void setTopology(String topology) - { - this.topology = topology; - } - - public String getTaxonomicDivision() - { - return taxonomicDivision; - } - - public void setTaxonomicDivision(String taxonomicDivision) - { - this.taxonomicDivision = taxonomicDivision; - } - - public String getDescription() - { - return description; - } - - public void setDescription(String description) - { - this.description = description; - } - - public String getFirstPublicDate() - { - return firstPublicDate; - } - - public void setFirstPublicDate(String firstPublicDate) - { - this.firstPublicDate = firstPublicDate; - } - - public String getFirstPublicRelease() - { - return firstPublicRelease; - } - - public void setFirstPublicRelease(String firstPublicRelease) - { - this.firstPublicRelease = firstPublicRelease; - } - - public String getLastUpdatedDate() - { - return lastUpdatedDate; - } - - public void setLastUpdatedDate(String lastUpdatedDate) - { - this.lastUpdatedDate = lastUpdatedDate; - } - - public String getLastUpdatedRelease() - { - return lastUpdatedRelease; - } - - public void setLastUpdatedRelease(String lastUpdatedRelease) - { - this.lastUpdatedRelease = lastUpdatedRelease; - } - - public String getDataClass() - { - return dataClass; - } - - public void setDataClass(String dataClass) - { - this.dataClass = dataClass; - } -}