X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Fdatamodel%2Fxdb%2Fembl%2FEmblEntry.java;h=bbe6a209bc0dd44aad43aa30473f7daae6438278;hb=88515cdb74e4603a40c8c1dca14107b5ca503e04;hp=4d09bdc2a408d74ba04ec1e61ae48482f00bd0fb;hpb=37de9310bec3501cbc6381e0c3dcb282fcaad812;p=jalview.git diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index 4d09bdc..bbe6a20 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -191,13 +191,15 @@ public class EmblEntry return null; } dna.setDescription(description); - DBRefEntry retrievedref = new DBRefEntry(sourceDb, - getSequenceVersion(), accession); + DBRefEntry retrievedref = new DBRefEntry(sourceDb, getSequenceVersion(), + accession); dna.addDBRef(retrievedref); // add map to indicate the sequence is a valid coordinate frame for the // dbref - retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() }, - new int[] { 1, dna.getLength() }, 1, 1)); + retrievedref + .setMap(new Mapping(null, new int[] + { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1, + 1)); /* * transform EMBL Database refs to canonical form @@ -242,8 +244,8 @@ public class EmblEntry { if (sequence == null) { - System.err.println("No sequence was returned for ENA accession " - + accession); + System.err.println( + "No sequence was returned for ENA accession " + accession); return null; } SequenceI dna = new Sequence(sourceDb + "|" + accession, @@ -267,7 +269,8 @@ public class EmblEntry * helper to match xrefs in already retrieved sequences */ void parseCodingFeature(EmblFeature feature, String sourceDb, - SequenceI dna, List peptides, SequenceIdMatcher matcher) + SequenceI dna, List peptides, + SequenceIdMatcher matcher) { boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); @@ -276,7 +279,7 @@ public class EmblEntry String translation = null; String proteinName = ""; String proteinId = null; - Map vals = new Hashtable(); + Map vals = new Hashtable<>(); /* * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS @@ -296,8 +299,8 @@ public class EmblEntry if (qname.equals("translation")) { // remove all spaces (precompiled String.replaceAll(" ", "")) - translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll( - ""); + translation = SPACE_PATTERN.matcher(q.getValues()[0]) + .replaceAll(""); } else if (qname.equals("protein_id")) { @@ -310,8 +313,8 @@ public class EmblEntry codonStart = Integer.parseInt(q.getValues()[0].trim()); } catch (NumberFormatException e) { - System.err.println("Invalid codon_start in XML for " - + accession + ": " + e.getMessage()); + System.err.println("Invalid codon_start in XML for " + accession + + ": " + e.getMessage()); } } else if (qname.equals("product")) @@ -348,9 +351,10 @@ public class EmblEntry product = matcher.findIdMatch(proteinId); if (product == null) { - product = new Sequence(proteinId, translation, 1, translationLength); - product.setDescription(((proteinName.length() == 0) ? "Protein Product from " - + sourceDb + product = new Sequence(proteinId, translation, 1, + translationLength); + product.setDescription(((proteinName.length() == 0) + ? "Protein Product from " + sourceDb : proteinName)); peptides.add(product); matcher.add(product); @@ -364,28 +368,30 @@ public class EmblEntry * workaround until we handle dna location for CDS sequence * e.g. location="X53828.1:60..1058" correctly */ - System.err - .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + System.err.println( + "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + sourceDb + ":" + getAccession() + ")"); - if (translationLength * 3 == (1 - codonStart + dna.getSequence().length)) + int dnaLength = dna.getLength(); + if (translationLength * 3 == (1 - codonStart + dnaLength)) { - System.err - .println("Not allowing for additional stop codon at end of cDNA fragment... !"); + System.err.println( + "Not allowing for additional stop codon at end of cDNA fragment... !"); // this might occur for CDS sequences where no features are marked exons = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() }; - dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, - translationLength }, 3, 1); + dnaToProteinMapping = new Mapping(product, exons, + new int[] + { 1, translationLength }, 3, 1); } - if ((translationLength + 1) * 3 == (1 - codonStart + dna - .getSequence().length)) + if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength)) { - System.err - .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); + System.err.println( + "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); exons = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() - 3 }; - dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, - translationLength }, 3, 1); + dnaToProteinMapping = new Mapping(product, exons, + new int[] + { 1, translationLength }, 3, 1); } } else @@ -404,26 +410,32 @@ public class EmblEntry else { // final product length truncation check - int[] cdsRanges = adjustForProteinLength(translationLength, exons); - dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] { - 1, translationLength }, 3, 1); + int[] cdsRanges = adjustForProteinLength(translationLength, + exons); + dnaToProteinMapping = new Mapping(product, cdsRanges, + new int[] + { 1, translationLength }, 3, 1); if (product != null) { /* * make xref with mapping from protein to EMBL dna */ DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL, - getSequenceVersion(), proteinId, new Mapping( - dnaToProteinMapping.getMap().getInverse())); + getSequenceVersion(), proteinId, + new Mapping(dnaToProteinMapping.getMap().getInverse())); product.addDBRef(proteinToEmblRef); /* * make xref from protein to EMBLCDS; we assume here that the * CDS sequence version is same as dna sequence (?!) */ - MapList proteinToCdsMapList = new MapList(new int[] { 1, - translationLength }, new int[] { 1 + (codonStart - 1), - (codonStart - 1) + 3 * translationLength }, 1, 3); + MapList proteinToCdsMapList = new MapList( + new int[] + { 1, translationLength }, + new int[] + { 1 + (codonStart - 1), + (codonStart - 1) + 3 * translationLength }, + 1, 3); DBRefEntry proteinToEmblCdsRef = new DBRefEntry( DBRefSource.EMBLCDS, getSequenceVersion(), proteinId, new Mapping(proteinToCdsMapList)); @@ -443,13 +455,27 @@ public class EmblEntry /* * add cds features to dna sequence */ - for (int xint = 0; exons != null && xint < exons.length; xint += 2) + String cds = feature.getName(); // "CDS" + for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) { - SequenceFeature sf = makeCdsFeature(exons, xint, proteinName, - proteinId, vals, codonStart); - sf.setType(feature.getName()); // "CDS" + int exonStart = exons[xint]; + int exonEnd = exons[xint + 1]; + int begin = Math.min(exonStart, exonEnd); + int end = Math.max(exonStart, exonEnd); + int exonNumber = xint / 2 + 1; + String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s", + exonNumber, proteinName, proteinId); + + SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, + sourceDb, vals); + sf.setEnaLocation(feature.getLocation()); - sf.setFeatureGroup(sourceDb); + boolean forwardStrand = exonStart <= exonEnd; + sf.setStrand(forwardStrand ? "+" : "-"); + sf.setPhase(String.valueOf(codonStart - 1)); + sf.setValue(FeatureProperties.EXONPOS, exonNumber); + sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + dna.addSequenceFeature(sf); } } @@ -518,8 +544,8 @@ public class EmblEntry // Add converse mapping reference if (dnaToProteinMapping != null) { - Mapping pmap = new Mapping(dna, dnaToProteinMapping.getMap() - .getInverse()); + Mapping pmap = new Mapping(dna, + dnaToProteinMapping.getMap().getInverse()); pref = new DBRefEntry(sourceDb, getSequenceVersion(), this.getAccession()); pref.setMap(pmap); @@ -543,8 +569,8 @@ public class EmblEntry if (proteinToEmblProteinRef == null) { // assuming CDSPROTEIN sequence version = dna version (?!) - proteinToEmblProteinRef = new DBRefEntry( - DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId); + proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct, + getSequenceVersion(), proteinId); } product.addDBRef(proteinToEmblProteinRef); @@ -552,7 +578,8 @@ public class EmblEntry && dnaToProteinMapping.getTo() != null) { DBRefEntry dnaToEmblProteinRef = new DBRefEntry( - DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId); + DBRefSource.EMBLCDSProduct, getSequenceVersion(), + proteinId); dnaToEmblProteinRef.setMap(dnaToProteinMapping); dnaToProteinMapping.setMappedFromId(proteinId); dna.addDBRef(dnaToEmblProteinRef); @@ -563,33 +590,24 @@ public class EmblEntry /** * Helper method to construct a SequenceFeature for one cds range * - * @param exons - * array of cds [start, end, ...] positions - * @param exonStartIndex - * offset into the exons array - * @param proteinName - * @param proteinAccessionId + * @param type + * feature type ("CDS") + * @param desc + * description + * @param begin + * start position + * @param end + * end position + * @param group + * feature group * @param vals * map of 'miscellaneous values' for feature - * @param codonStart - * codon start position for CDS (1/2/3, normally 1) * @return */ - protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex, - String proteinName, String proteinAccessionId, - Map vals, int codonStart) - { - int exonNumber = exonStartIndex / 2 + 1; - SequenceFeature sf = new SequenceFeature(); - sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s", - exonNumber, proteinName, proteinAccessionId)); - sf.setPhase(String.valueOf(codonStart - 1)); - sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" - : "-"); - sf.setValue(FeatureProperties.EXONPOS, exonNumber); - sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + protected SequenceFeature makeCdsFeature(String type, String desc, + int begin, int end, String group, Map vals) + { + SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group); if (!vals.isEmpty()) { StringBuilder sb = new StringBuilder(); @@ -629,9 +647,9 @@ public class EmblEntry return listToArray(ranges); } catch (ParseException e) { - Cache.log.warn(String.format( - "Not parsing inexact CDS location %s in ENA %s", - feature.location, this.accession)); + Cache.log.warn( + String.format("Not parsing inexact CDS location %s in ENA %s", + feature.location, this.accession)); return new int[] {}; } }