X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2Fxdb%2Fembl%2FEmblEntry.java;h=c3d4e668e2157025041851a32497407fac32bc03;hb=136c0793b90b72b928c4d77dc109dd5c644e00d3;hp=a2354edb653b80f81a14b1746d02f872a0e39ad5;hpb=f28d892d6d2584e7eb44ff7333d49d60d787f706;p=jalview.git diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index a2354ed..c3d4e66 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -48,9 +48,7 @@ import java.util.regex.Pattern; * Data model for one entry returned from an EMBL query, as marshalled by a * Castor binding file * - * For example: - * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321 - * &format=emblxml + * For example: http://www.ebi.ac.uk/ena/data/view/J03321&display=xml * * @see embl_mapping.xml */ @@ -187,8 +185,11 @@ public class EmblEntry */ public SequenceI getSequence(String sourceDb, List peptides) { - SequenceI dna = new Sequence(sourceDb + "|" + accession, - sequence.getSequence()); + SequenceI dna = makeSequence(sourceDb); + if (dna == null) + { + return null; + } dna.setDescription(description); DBRefEntry retrievedref = new DBRefEntry(sourceDb, getSequenceVersion(), accession); @@ -198,7 +199,6 @@ public class EmblEntry retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1, 1)); - /* * transform EMBL Database refs to canonical form */ @@ -235,6 +235,23 @@ public class EmblEntry } /** + * @param sourceDb + * @return + */ + SequenceI makeSequence(String sourceDb) + { + if (sequence == null) + { + System.err.println("No sequence was returned for ENA accession " + + accession); + return null; + } + SequenceI dna = new Sequence(sourceDb + "|" + accession, + sequence.getSequence()); + return dna; + } + + /** * Extracts coding region and product from a CDS feature and properly decorate * it with annotations. * @@ -279,7 +296,8 @@ public class EmblEntry if (qname.equals("translation")) { // remove all spaces (precompiled String.replaceAll(" ", "")) - translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll(""); + translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll( + ""); } else if (qname.equals("protein_id")) { @@ -322,13 +340,15 @@ public class EmblEntry Mapping dnaToProteinMapping = null; if (translation != null && proteinName != null && proteinId != null) { + int translationLength = translation.length(); + /* * look for product in peptides list, if not found, add it */ product = matcher.findIdMatch(proteinId); if (product == null) { - product = new Sequence(proteinId, translation, 1, translation.length()); + product = new Sequence(proteinId, translation, 1, translationLength); product.setDescription(((proteinName.length() == 0) ? "Protein Product from " + sourceDb : proteinName)); @@ -340,30 +360,32 @@ public class EmblEntry // sequence if (exons == null || exons.length == 0) { + /* + * workaround until we handle dna location for CDS sequence + * e.g. location="X53828.1:60..1058" correctly + */ System.err .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + sourceDb + ":" + getAccession() + ")"); - if (translation.length() * 3 == (1 - codonStart + dna.getSequence().length)) + if (translationLength * 3 == (1 - codonStart + dna.getSequence().length)) { System.err .println("Not allowing for additional stop codon at end of cDNA fragment... !"); - // this might occur for CDS sequences where no features are - // marked. + // this might occur for CDS sequences where no features are marked exons = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() }; dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, - translation.length() }, - 3, 1); + translationLength }, 3, 1); } - if ((translation.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length)) + if ((translationLength + 1) * 3 == (1 - codonStart + dna + .getSequence().length)) { System.err .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); exons = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() - 3 }; dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, - translation.length() }, - 3, 1); + translationLength }, 3, 1); } } else @@ -382,31 +404,37 @@ public class EmblEntry else { // final product length truncation check - int[] cdsRanges = adjustForProteinLength(translation.length(), exons); - dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] { 1, - translation.length() }, 3, 1); + int[] cdsRanges = adjustForProteinLength(translationLength, exons); + dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] { + 1, translationLength }, 3, 1); if (product != null) { /* - * make xrefs from protein to EMBLCDS and EMBLCDSPROTEIN + * make xref with mapping from protein to EMBL dna */ - DBRefEntry proteinToEmblCdsRef = new DBRefEntry(); - proteinToEmblCdsRef.setAccessionId(proteinId); - proteinToEmblCdsRef.setSource(DBRefSource.EMBLCDS); - proteinToEmblCdsRef.setVersion(getSequenceVersion()); // same as - // parent EMBL - // version. - MapList mp = new MapList(new int[] { 1, translation.length() }, - new int[] { 1 + (codonStart - 1), - (codonStart - 1) + 3 * translation.length() }, 1, 3); - proteinToEmblCdsRef.setMap(new Mapping(mp)); + DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL, + getSequenceVersion(), proteinId, new Mapping( + dnaToProteinMapping.getMap().getInverse())); + product.addDBRef(proteinToEmblRef); + + /* + * make xref from protein to EMBLCDS; we assume here that the + * CDS sequence version is same as dna sequence (?!) + */ + MapList proteinToCdsMapList = new MapList(new int[] { 1, + translationLength }, new int[] { 1 + (codonStart - 1), + (codonStart - 1) + 3 * translationLength }, 1, 3); + DBRefEntry proteinToEmblCdsRef = new DBRefEntry( + DBRefSource.EMBLCDS, getSequenceVersion(), proteinId, + new Mapping(proteinToCdsMapList)); product.addDBRef(proteinToEmblCdsRef); + + /* + * make 'direct' xref from protein to EMBLCDSPROTEIN + */ proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef); - MapList mp2 = new MapList( - new int[] { 1, translation.length() }, new int[] { 1, - translation.length() }, 1, 1); - proteinToEmblProteinRef.setMap(new Mapping(mp2)); proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct); + proteinToEmblProteinRef.setMap(null); product.addDBRef(proteinToEmblProteinRef); } } @@ -415,13 +443,27 @@ public class EmblEntry /* * add cds features to dna sequence */ - for (int xint = 0; exons != null && xint < exons.length; xint += 2) + String cds = feature.getName(); // "CDS" + for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) { - SequenceFeature sf = makeCdsFeature(exons, xint, proteinName, proteinId, vals, - codonStart); - sf.setType(feature.getName()); // "CDS" + int exonStart = exons[xint]; + int exonEnd = exons[xint + 1]; + int begin = Math.min(exonStart, exonEnd); + int end = Math.max(exonStart, exonEnd); + int exonNumber = xint / 2 + 1; + String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s", + exonNumber, proteinName, proteinId); + + SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, + sourceDb, vals); + sf.setEnaLocation(feature.getLocation()); - sf.setFeatureGroup(sourceDb); + boolean forwardStrand = exonStart <= exonEnd; + sf.setStrand(forwardStrand ? "+" : "-"); + sf.setPhase(String.valueOf(codonStart - 1)); + sf.setValue(FeatureProperties.EXONPOS, exonNumber); + sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + dna.addSequenceFeature(sf); } } @@ -440,13 +482,14 @@ public class EmblEntry */ String source = DBRefUtils.getCanonicalName(ref.getSource()); ref.setSource(source); - DBRefEntry proteinToDnaRef = new DBRefEntry(ref.getSource(), ref.getVersion(), ref - .getAccessionId()); + DBRefEntry proteinDbRef = new DBRefEntry(ref.getSource(), + ref.getVersion(), ref.getAccessionId()); if (source.equals(DBRefSource.UNIPROT)) { String proteinSeqName = DBRefSource.UNIPROT + "|" + ref.getAccessionId(); - if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null) + if (dnaToProteinMapping != null + && dnaToProteinMapping.getTo() != null) { if (mappingUsed) { @@ -472,7 +515,8 @@ public class EmblEntry peptides.add(proteinSeq); } dnaToProteinMapping.setTo(proteinSeq); - proteinSeq.addDBRef(proteinToDnaRef); + dnaToProteinMapping.setMappedFromId(proteinId); + proteinSeq.addDBRef(proteinDbRef); ref.setMap(dnaToProteinMapping); } hasUniprotDbref = true; @@ -482,7 +526,7 @@ public class EmblEntry /* * copy feature dbref to our protein product */ - DBRefEntry pref = proteinToDnaRef; + DBRefEntry pref = proteinDbRef; pref.setMap(null); // reference is direct product.addDBRef(pref); // Add converse mapping reference @@ -502,21 +546,19 @@ public class EmblEntry dna.addDBRef(ref); } } + /* * if we have a product (translation) but no explicit Uniprot dbref - * (example: EMBL AAFI02000057 protein_id EAL65544.1 - * construct mappings to an assumed EMBLCDSPROTEIN accession + * (example: EMBL AAFI02000057 protein_id EAL65544.1) + * then construct mappings to an assumed EMBLCDSPROTEIN accession */ if (!hasUniprotDbref && product != null) { if (proteinToEmblProteinRef == null) { - proteinToEmblProteinRef = new DBRefEntry(); - proteinToEmblProteinRef.setAccessionId(proteinId); - proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct); - proteinToEmblProteinRef.setVersion(getSequenceVersion()); - proteinToEmblProteinRef.setMap(new Mapping(product, - dnaToProteinMapping.getMap().getInverse())); + // assuming CDSPROTEIN sequence version = dna version (?!) + proteinToEmblProteinRef = new DBRefEntry( + DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId); } product.addDBRef(proteinToEmblProteinRef); @@ -524,8 +566,9 @@ public class EmblEntry && dnaToProteinMapping.getTo() != null) { DBRefEntry dnaToEmblProteinRef = new DBRefEntry( - proteinToEmblProteinRef); + DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId); dnaToEmblProteinRef.setMap(dnaToProteinMapping); + dnaToProteinMapping.setMappedFromId(proteinId); dna.addDBRef(dnaToEmblProteinRef); } } @@ -534,33 +577,25 @@ public class EmblEntry /** * Helper method to construct a SequenceFeature for one cds range * - * @param exons - * array of cds [start, end, ...] positions - * @param exonStartIndex - * offset into the exons array - * @param proteinName - * @param proteinAccessionId + * @param type + * feature type ("CDS") + * @param desc + * description + * @param begin + * start position + * @param end + * end position + * @param group + * feature group * @param vals * map of 'miscellaneous values' for feature - * @param codonStart - * codon start position for CDS (1/2/3, normally 1) * @return */ - protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex, - String proteinName, String proteinAccessionId, - Map vals, int codonStart) - { - int exonNumber = exonStartIndex / 2 + 1; - SequenceFeature sf = new SequenceFeature(); - sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s", - exonNumber, proteinName, proteinAccessionId)); - sf.setPhase(String.valueOf(codonStart - 1)); - sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" - : "-"); - sf.setValue(FeatureProperties.EXONPOS, exonNumber); - sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + protected SequenceFeature makeCdsFeature(String type, String desc, + int begin, int end, String group, Map vals) + { + SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group); + if (!vals.isEmpty()) { StringBuilder sb = new StringBuilder();