X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2Fxdb%2Fembl%2FEmblEntry.java;h=bbe6a209bc0dd44aad43aa30473f7daae6438278;hb=0ca13d2d06bf5ee99eb7dc123ee7ffcea016f733;hp=56b132508ce5ad76c91fef93ce9885864fb02443;hpb=c14d9e8c9ff59de1857d4834ee70e80abf623415;p=jalview.git diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index 56b1325..bbe6a20 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -48,9 +48,7 @@ import java.util.regex.Pattern; * Data model for one entry returned from an EMBL query, as marshalled by a * Castor binding file * - * For example: - * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321 - * &format=emblxml + * For example: http://www.ebi.ac.uk/ena/data/view/J03321&display=xml * * @see embl_mapping.xml */ @@ -188,16 +186,20 @@ public class EmblEntry public SequenceI getSequence(String sourceDb, List peptides) { SequenceI dna = makeSequence(sourceDb); + if (dna == null) + { + return null; + } dna.setDescription(description); - DBRefEntry retrievedref = new DBRefEntry(sourceDb, - getSequenceVersion(), accession); + DBRefEntry retrievedref = new DBRefEntry(sourceDb, getSequenceVersion(), + accession); dna.addDBRef(retrievedref); - dna.setSourceDBRef(retrievedref); // add map to indicate the sequence is a valid coordinate frame for the // dbref - retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() }, - new int[] { 1, dna.getLength() }, 1, 1)); - + retrievedref + .setMap(new Mapping(null, new int[] + { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1, + 1)); /* * transform EMBL Database refs to canonical form @@ -240,6 +242,12 @@ public class EmblEntry */ SequenceI makeSequence(String sourceDb) { + if (sequence == null) + { + System.err.println( + "No sequence was returned for ENA accession " + accession); + return null; + } SequenceI dna = new Sequence(sourceDb + "|" + accession, sequence.getSequence()); return dna; @@ -261,7 +269,8 @@ public class EmblEntry * helper to match xrefs in already retrieved sequences */ void parseCodingFeature(EmblFeature feature, String sourceDb, - SequenceI dna, List peptides, SequenceIdMatcher matcher) + SequenceI dna, List peptides, + SequenceIdMatcher matcher) { boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); @@ -270,7 +279,7 @@ public class EmblEntry String translation = null; String proteinName = ""; String proteinId = null; - Map vals = new Hashtable(); + Map vals = new Hashtable<>(); /* * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS @@ -290,7 +299,8 @@ public class EmblEntry if (qname.equals("translation")) { // remove all spaces (precompiled String.replaceAll(" ", "")) - translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll(""); + translation = SPACE_PATTERN.matcher(q.getValues()[0]) + .replaceAll(""); } else if (qname.equals("protein_id")) { @@ -303,8 +313,8 @@ public class EmblEntry codonStart = Integer.parseInt(q.getValues()[0].trim()); } catch (NumberFormatException e) { - System.err.println("Invalid codon_start in XML for " - + accession + ": " + e.getMessage()); + System.err.println("Invalid codon_start in XML for " + accession + + ": " + e.getMessage()); } } else if (qname.equals("product")) @@ -341,9 +351,10 @@ public class EmblEntry product = matcher.findIdMatch(proteinId); if (product == null) { - product = new Sequence(proteinId, translation, 1, translationLength); - product.setDescription(((proteinName.length() == 0) ? "Protein Product from " - + sourceDb + product = new Sequence(proteinId, translation, 1, + translationLength); + product.setDescription(((proteinName.length() == 0) + ? "Protein Product from " + sourceDb : proteinName)); peptides.add(product); matcher.add(product); @@ -357,28 +368,30 @@ public class EmblEntry * workaround until we handle dna location for CDS sequence * e.g. location="X53828.1:60..1058" correctly */ - System.err - .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + System.err.println( + "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + sourceDb + ":" + getAccession() + ")"); - if (translationLength * 3 == (1 - codonStart + dna.getSequence().length)) + int dnaLength = dna.getLength(); + if (translationLength * 3 == (1 - codonStart + dnaLength)) { - System.err - .println("Not allowing for additional stop codon at end of cDNA fragment... !"); + System.err.println( + "Not allowing for additional stop codon at end of cDNA fragment... !"); // this might occur for CDS sequences where no features are marked exons = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() }; - dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, - translationLength }, 3, 1); + dnaToProteinMapping = new Mapping(product, exons, + new int[] + { 1, translationLength }, 3, 1); } - if ((translationLength + 1) * 3 == (1 - codonStart + dna - .getSequence().length)) + if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength)) { - System.err - .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); + System.err.println( + "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); exons = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() - 3 }; - dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, - translationLength }, 3, 1); + dnaToProteinMapping = new Mapping(product, exons, + new int[] + { 1, translationLength }, 3, 1); } } else @@ -397,26 +410,32 @@ public class EmblEntry else { // final product length truncation check - int[] cdsRanges = adjustForProteinLength(translationLength, exons); - dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] { - 1, translationLength }, 3, 1); + int[] cdsRanges = adjustForProteinLength(translationLength, + exons); + dnaToProteinMapping = new Mapping(product, cdsRanges, + new int[] + { 1, translationLength }, 3, 1); if (product != null) { /* * make xref with mapping from protein to EMBL dna */ DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL, - getSequenceVersion(), proteinId, new Mapping( - dnaToProteinMapping.getMap().getInverse())); + getSequenceVersion(), proteinId, + new Mapping(dnaToProteinMapping.getMap().getInverse())); product.addDBRef(proteinToEmblRef); /* * make xref from protein to EMBLCDS; we assume here that the * CDS sequence version is same as dna sequence (?!) */ - MapList proteinToCdsMapList = new MapList(new int[] { 1, - translationLength }, new int[] { 1 + (codonStart - 1), - (codonStart - 1) + 3 * translationLength }, 1, 3); + MapList proteinToCdsMapList = new MapList( + new int[] + { 1, translationLength }, + new int[] + { 1 + (codonStart - 1), + (codonStart - 1) + 3 * translationLength }, + 1, 3); DBRefEntry proteinToEmblCdsRef = new DBRefEntry( DBRefSource.EMBLCDS, getSequenceVersion(), proteinId, new Mapping(proteinToCdsMapList)); @@ -436,13 +455,27 @@ public class EmblEntry /* * add cds features to dna sequence */ - for (int xint = 0; exons != null && xint < exons.length; xint += 2) + String cds = feature.getName(); // "CDS" + for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) { - SequenceFeature sf = makeCdsFeature(exons, xint, proteinName, - proteinId, vals, codonStart); - sf.setType(feature.getName()); // "CDS" + int exonStart = exons[xint]; + int exonEnd = exons[xint + 1]; + int begin = Math.min(exonStart, exonEnd); + int end = Math.max(exonStart, exonEnd); + int exonNumber = xint / 2 + 1; + String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s", + exonNumber, proteinName, proteinId); + + SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, + sourceDb, vals); + sf.setEnaLocation(feature.getLocation()); - sf.setFeatureGroup(sourceDb); + boolean forwardStrand = exonStart <= exonEnd; + sf.setStrand(forwardStrand ? "+" : "-"); + sf.setPhase(String.valueOf(codonStart - 1)); + sf.setValue(FeatureProperties.EXONPOS, exonNumber); + sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + dna.addSequenceFeature(sf); } } @@ -461,13 +494,14 @@ public class EmblEntry */ String source = DBRefUtils.getCanonicalName(ref.getSource()); ref.setSource(source); - DBRefEntry proteinDbRef = new DBRefEntry(ref.getSource(), ref.getVersion(), ref - .getAccessionId()); + DBRefEntry proteinDbRef = new DBRefEntry(ref.getSource(), + ref.getVersion(), ref.getAccessionId()); if (source.equals(DBRefSource.UNIPROT)) { String proteinSeqName = DBRefSource.UNIPROT + "|" + ref.getAccessionId(); - if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null) + if (dnaToProteinMapping != null + && dnaToProteinMapping.getTo() != null) { if (mappingUsed) { @@ -495,7 +529,6 @@ public class EmblEntry dnaToProteinMapping.setTo(proteinSeq); dnaToProteinMapping.setMappedFromId(proteinId); proteinSeq.addDBRef(proteinDbRef); - proteinSeq.setSourceDBRef(proteinDbRef); ref.setMap(dnaToProteinMapping); } hasUniprotDbref = true; @@ -511,8 +544,8 @@ public class EmblEntry // Add converse mapping reference if (dnaToProteinMapping != null) { - Mapping pmap = new Mapping(dna, dnaToProteinMapping.getMap() - .getInverse()); + Mapping pmap = new Mapping(dna, + dnaToProteinMapping.getMap().getInverse()); pref = new DBRefEntry(sourceDb, getSequenceVersion(), this.getAccession()); pref.setMap(pmap); @@ -536,17 +569,17 @@ public class EmblEntry if (proteinToEmblProteinRef == null) { // assuming CDSPROTEIN sequence version = dna version (?!) - proteinToEmblProteinRef = new DBRefEntry( - DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId); + proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct, + getSequenceVersion(), proteinId); } product.addDBRef(proteinToEmblProteinRef); - product.setSourceDBRef(proteinToEmblProteinRef); if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null) { DBRefEntry dnaToEmblProteinRef = new DBRefEntry( - DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId); + DBRefSource.EMBLCDSProduct, getSequenceVersion(), + proteinId); dnaToEmblProteinRef.setMap(dnaToProteinMapping); dnaToProteinMapping.setMappedFromId(proteinId); dna.addDBRef(dnaToEmblProteinRef); @@ -557,33 +590,24 @@ public class EmblEntry /** * Helper method to construct a SequenceFeature for one cds range * - * @param exons - * array of cds [start, end, ...] positions - * @param exonStartIndex - * offset into the exons array - * @param proteinName - * @param proteinAccessionId + * @param type + * feature type ("CDS") + * @param desc + * description + * @param begin + * start position + * @param end + * end position + * @param group + * feature group * @param vals * map of 'miscellaneous values' for feature - * @param codonStart - * codon start position for CDS (1/2/3, normally 1) * @return */ - protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex, - String proteinName, String proteinAccessionId, - Map vals, int codonStart) - { - int exonNumber = exonStartIndex / 2 + 1; - SequenceFeature sf = new SequenceFeature(); - sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s", - exonNumber, proteinName, proteinAccessionId)); - sf.setPhase(String.valueOf(codonStart - 1)); - sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" - : "-"); - sf.setValue(FeatureProperties.EXONPOS, exonNumber); - sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + protected SequenceFeature makeCdsFeature(String type, String desc, + int begin, int end, String group, Map vals) + { + SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group); if (!vals.isEmpty()) { StringBuilder sb = new StringBuilder(); @@ -623,9 +647,9 @@ public class EmblEntry return listToArray(ranges); } catch (ParseException e) { - Cache.log.warn(String.format( - "Not parsing inexact CDS location %s in ENA %s", - feature.location, this.accession)); + Cache.log.warn( + String.format("Not parsing inexact CDS location %s in ENA %s", + feature.location, this.accession)); return new int[] {}; } }