X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2Fxdb%2Fembl%2FEmblEntry.java;h=4d09bdc2a408d74ba04ec1e61ae48482f00bd0fb;hb=37de9310bec3501cbc6381e0c3dcb282fcaad812;hp=a2354edb653b80f81a14b1746d02f872a0e39ad5;hpb=f28d892d6d2584e7eb44ff7333d49d60d787f706;p=jalview.git diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index a2354ed..4d09bdc 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -48,9 +48,7 @@ import java.util.regex.Pattern; * Data model for one entry returned from an EMBL query, as marshalled by a * Castor binding file * - * For example: - * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321 - * &format=emblxml + * For example: http://www.ebi.ac.uk/ena/data/view/J03321&display=xml * * @see embl_mapping.xml */ @@ -187,8 +185,11 @@ public class EmblEntry */ public SequenceI getSequence(String sourceDb, List peptides) { - SequenceI dna = new Sequence(sourceDb + "|" + accession, - sequence.getSequence()); + SequenceI dna = makeSequence(sourceDb); + if (dna == null) + { + return null; + } dna.setDescription(description); DBRefEntry retrievedref = new DBRefEntry(sourceDb, getSequenceVersion(), accession); @@ -198,7 +199,6 @@ public class EmblEntry retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1, 1)); - /* * transform EMBL Database refs to canonical form */ @@ -235,6 +235,23 @@ public class EmblEntry } /** + * @param sourceDb + * @return + */ + SequenceI makeSequence(String sourceDb) + { + if (sequence == null) + { + System.err.println("No sequence was returned for ENA accession " + + accession); + return null; + } + SequenceI dna = new Sequence(sourceDb + "|" + accession, + sequence.getSequence()); + return dna; + } + + /** * Extracts coding region and product from a CDS feature and properly decorate * it with annotations. * @@ -279,7 +296,8 @@ public class EmblEntry if (qname.equals("translation")) { // remove all spaces (precompiled String.replaceAll(" ", "")) - translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll(""); + translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll( + ""); } else if (qname.equals("protein_id")) { @@ -322,13 +340,15 @@ public class EmblEntry Mapping dnaToProteinMapping = null; if (translation != null && proteinName != null && proteinId != null) { + int translationLength = translation.length(); + /* * look for product in peptides list, if not found, add it */ product = matcher.findIdMatch(proteinId); if (product == null) { - product = new Sequence(proteinId, translation, 1, translation.length()); + product = new Sequence(proteinId, translation, 1, translationLength); product.setDescription(((proteinName.length() == 0) ? "Protein Product from " + sourceDb : proteinName)); @@ -340,30 +360,32 @@ public class EmblEntry // sequence if (exons == null || exons.length == 0) { + /* + * workaround until we handle dna location for CDS sequence + * e.g. location="X53828.1:60..1058" correctly + */ System.err .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + sourceDb + ":" + getAccession() + ")"); - if (translation.length() * 3 == (1 - codonStart + dna.getSequence().length)) + if (translationLength * 3 == (1 - codonStart + dna.getSequence().length)) { System.err .println("Not allowing for additional stop codon at end of cDNA fragment... !"); - // this might occur for CDS sequences where no features are - // marked. + // this might occur for CDS sequences where no features are marked exons = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() }; dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, - translation.length() }, - 3, 1); + translationLength }, 3, 1); } - if ((translation.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length)) + if ((translationLength + 1) * 3 == (1 - codonStart + dna + .getSequence().length)) { System.err .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); exons = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() - 3 }; dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, - translation.length() }, - 3, 1); + translationLength }, 3, 1); } } else @@ -382,31 +404,37 @@ public class EmblEntry else { // final product length truncation check - int[] cdsRanges = adjustForProteinLength(translation.length(), exons); - dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] { 1, - translation.length() }, 3, 1); + int[] cdsRanges = adjustForProteinLength(translationLength, exons); + dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] { + 1, translationLength }, 3, 1); if (product != null) { /* - * make xrefs from protein to EMBLCDS and EMBLCDSPROTEIN + * make xref with mapping from protein to EMBL dna */ - DBRefEntry proteinToEmblCdsRef = new DBRefEntry(); - proteinToEmblCdsRef.setAccessionId(proteinId); - proteinToEmblCdsRef.setSource(DBRefSource.EMBLCDS); - proteinToEmblCdsRef.setVersion(getSequenceVersion()); // same as - // parent EMBL - // version. - MapList mp = new MapList(new int[] { 1, translation.length() }, - new int[] { 1 + (codonStart - 1), - (codonStart - 1) + 3 * translation.length() }, 1, 3); - proteinToEmblCdsRef.setMap(new Mapping(mp)); + DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL, + getSequenceVersion(), proteinId, new Mapping( + dnaToProteinMapping.getMap().getInverse())); + product.addDBRef(proteinToEmblRef); + + /* + * make xref from protein to EMBLCDS; we assume here that the + * CDS sequence version is same as dna sequence (?!) + */ + MapList proteinToCdsMapList = new MapList(new int[] { 1, + translationLength }, new int[] { 1 + (codonStart - 1), + (codonStart - 1) + 3 * translationLength }, 1, 3); + DBRefEntry proteinToEmblCdsRef = new DBRefEntry( + DBRefSource.EMBLCDS, getSequenceVersion(), proteinId, + new Mapping(proteinToCdsMapList)); product.addDBRef(proteinToEmblCdsRef); + + /* + * make 'direct' xref from protein to EMBLCDSPROTEIN + */ proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef); - MapList mp2 = new MapList( - new int[] { 1, translation.length() }, new int[] { 1, - translation.length() }, 1, 1); - proteinToEmblProteinRef.setMap(new Mapping(mp2)); proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct); + proteinToEmblProteinRef.setMap(null); product.addDBRef(proteinToEmblProteinRef); } } @@ -417,8 +445,8 @@ public class EmblEntry */ for (int xint = 0; exons != null && xint < exons.length; xint += 2) { - SequenceFeature sf = makeCdsFeature(exons, xint, proteinName, proteinId, vals, - codonStart); + SequenceFeature sf = makeCdsFeature(exons, xint, proteinName, + proteinId, vals, codonStart); sf.setType(feature.getName()); // "CDS" sf.setEnaLocation(feature.getLocation()); sf.setFeatureGroup(sourceDb); @@ -440,13 +468,14 @@ public class EmblEntry */ String source = DBRefUtils.getCanonicalName(ref.getSource()); ref.setSource(source); - DBRefEntry proteinToDnaRef = new DBRefEntry(ref.getSource(), ref.getVersion(), ref - .getAccessionId()); + DBRefEntry proteinDbRef = new DBRefEntry(ref.getSource(), + ref.getVersion(), ref.getAccessionId()); if (source.equals(DBRefSource.UNIPROT)) { String proteinSeqName = DBRefSource.UNIPROT + "|" + ref.getAccessionId(); - if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null) + if (dnaToProteinMapping != null + && dnaToProteinMapping.getTo() != null) { if (mappingUsed) { @@ -472,7 +501,8 @@ public class EmblEntry peptides.add(proteinSeq); } dnaToProteinMapping.setTo(proteinSeq); - proteinSeq.addDBRef(proteinToDnaRef); + dnaToProteinMapping.setMappedFromId(proteinId); + proteinSeq.addDBRef(proteinDbRef); ref.setMap(dnaToProteinMapping); } hasUniprotDbref = true; @@ -482,7 +512,7 @@ public class EmblEntry /* * copy feature dbref to our protein product */ - DBRefEntry pref = proteinToDnaRef; + DBRefEntry pref = proteinDbRef; pref.setMap(null); // reference is direct product.addDBRef(pref); // Add converse mapping reference @@ -502,21 +532,19 @@ public class EmblEntry dna.addDBRef(ref); } } + /* * if we have a product (translation) but no explicit Uniprot dbref - * (example: EMBL AAFI02000057 protein_id EAL65544.1 - * construct mappings to an assumed EMBLCDSPROTEIN accession + * (example: EMBL AAFI02000057 protein_id EAL65544.1) + * then construct mappings to an assumed EMBLCDSPROTEIN accession */ if (!hasUniprotDbref && product != null) { if (proteinToEmblProteinRef == null) { - proteinToEmblProteinRef = new DBRefEntry(); - proteinToEmblProteinRef.setAccessionId(proteinId); - proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct); - proteinToEmblProteinRef.setVersion(getSequenceVersion()); - proteinToEmblProteinRef.setMap(new Mapping(product, - dnaToProteinMapping.getMap().getInverse())); + // assuming CDSPROTEIN sequence version = dna version (?!) + proteinToEmblProteinRef = new DBRefEntry( + DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId); } product.addDBRef(proteinToEmblProteinRef); @@ -524,8 +552,9 @@ public class EmblEntry && dnaToProteinMapping.getTo() != null) { DBRefEntry dnaToEmblProteinRef = new DBRefEntry( - proteinToEmblProteinRef); + DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId); dnaToEmblProteinRef.setMap(dnaToProteinMapping); + dnaToProteinMapping.setMappedFromId(proteinId); dna.addDBRef(dnaToEmblProteinRef); } }