X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2Fxdb%2Fembl%2FEmblEntry.java;h=ac0d3394e5463f38da47a4dcba9c360f6c515f80;hb=663695f530e1cdef9cabdcd75d947219a1dd5c83;hp=d83013047a2214235cc2b5e64e9c268034a2b8a1;hpb=81acbfca392c2868b5f41fc40906dba33891a15d;p=jalview.git diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index d830130..ac0d339 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -21,6 +21,7 @@ package jalview.datamodel.xdb.embl; import jalview.analysis.SequenceIdMatcher; +import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.FeatureProperties; @@ -29,10 +30,13 @@ import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.util.DBRefUtils; +import jalview.util.DnaUtils; import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.StringUtils; +import java.text.ParseException; +import java.util.Arrays; import java.util.Hashtable; import java.util.List; import java.util.Map; @@ -56,17 +60,29 @@ public class EmblEntry String accession; - String version; + String entryVersion; - String taxDivision; + String sequenceVersion; - String desc; + String dataClass; - String rCreated; + String moleculeType; - String rLastUpdated; + String topology; - String lastUpdated; + String sequenceLength; + + String taxonomicDivision; + + String description; + + String firstPublicDate; + + String firstPublicRelease; + + String lastUpdatedDate; + + String lastUpdatedRelease; Vector keywords; @@ -111,23 +127,6 @@ public class EmblEntry } /** - * @return the desc - */ - public String getDesc() - { - return desc; - } - - /** - * @param desc - * the desc to set - */ - public void setDesc(String desc) - { - this.desc = desc; - } - - /** * @return the features */ public Vector getFeatures() @@ -162,57 +161,6 @@ public class EmblEntry } /** - * @return the lastUpdated - */ - public String getLastUpdated() - { - return lastUpdated; - } - - /** - * @param lastUpdated - * the lastUpdated to set - */ - public void setLastUpdated(String lastUpdated) - { - this.lastUpdated = lastUpdated; - } - - /** - * @return the releaseCreated - */ - public String getRCreated() - { - return rCreated; - } - - /** - * @param releaseCreated - * the releaseCreated to set - */ - public void setRCreated(String releaseCreated) - { - this.rCreated = releaseCreated; - } - - /** - * @return the releaseLastUpdated - */ - public String getRLastUpdated() - { - return rLastUpdated; - } - - /** - * @param releaseLastUpdated - * the releaseLastUpdated to set - */ - public void setRLastUpdated(String releaseLastUpdated) - { - this.rLastUpdated = releaseLastUpdated; - } - - /** * @return the sequence */ public EmblSequence getSequence() @@ -230,40 +178,6 @@ public class EmblEntry } /** - * @return the taxDivision - */ - public String getTaxDivision() - { - return taxDivision; - } - - /** - * @param taxDivision - * the taxDivision to set - */ - public void setTaxDivision(String taxDivision) - { - this.taxDivision = taxDivision; - } - - /** - * @return the version - */ - public String getVersion() - { - return version; - } - - /** - * @param version - * the version to set - */ - public void setVersion(String version) - { - this.version = version; - } - - /** * Recover annotated sequences from EMBL file * * @param sourceDb @@ -275,18 +189,23 @@ public class EmblEntry { SequenceI dna = new Sequence(sourceDb + "|" + accession, sequence.getSequence()); - dna.setDescription(desc); - DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession); + dna.setDescription(description); + DBRefEntry retrievedref = new DBRefEntry(sourceDb, + getSequenceVersion(), accession); dna.addDBRef(retrievedref); // add map to indicate the sequence is a valid coordinate frame for the // dbref retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1, 1)); - // TODO: transform EMBL Database refs to canonical form + + /* + * transform EMBL Database refs to canonical form + */ if (dbRefs != null) { for (DBRefEntry dbref : dbRefs) { + dbref.setSource(DBRefUtils.getCanonicalName(dbref.getSource())); dna.addDBRef(dbref); } } @@ -295,13 +214,6 @@ public class EmblEntry { for (EmblFeature feature : features) { - if (feature.dbRefs != null) - { - for (DBRefEntry dbref : feature.dbRefs) - { - dna.addDBRef(dbref); - } - } if (FeatureProperties.isCodingFeature(sourceDb, feature.getName())) { parseCodingFeature(feature, sourceDb, dna, peptides); @@ -400,7 +312,6 @@ public class EmblEntry } } - // SequenceI product = null; DBRefEntry protEMBLCDS = null; exon = MappingUtils.removeStartPositions(codonStart - 1, exon); boolean noProteinDbref = true; @@ -436,7 +347,8 @@ public class EmblEntry .println("Not allowing for additional stop codon at end of cDNA fragment... !"); // this might occur for CDS sequences where no features are // marked. - exon = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() }; + exon = new int[] { dna.getStart() + (codonStart - 1), + dna.getEnd() }; map = new Mapping(product, exon, new int[] { 1, prseq.length() }, 3, 1); } @@ -469,16 +381,17 @@ public class EmblEntry // TODO should from range include stop codon even if not in protein // in order to include stop codon in CDS sequence (as done for // Ensembl)? - int[] cdsRanges = adjustForProteinLength(prseq.length(), - exon); - map = new Mapping(product, cdsRanges, new int[] { 1, prseq.length() }, 3, 1); + int[] cdsRanges = adjustForProteinLength(prseq.length(), exon); + map = new Mapping(product, cdsRanges, new int[] { 1, + prseq.length() }, 3, 1); // reconstruct the EMBLCDS entry // TODO: this is only necessary when there codon annotation is // complete (I think JBPNote) DBRefEntry pcdnaref = new DBRefEntry(); pcdnaref.setAccessionId(prid); pcdnaref.setSource(DBRefSource.EMBLCDS); - pcdnaref.setVersion(getVersion()); // same as parent EMBL version. + pcdnaref.setVersion(getSequenceVersion()); // same as parent EMBL + // version. MapList mp = new MapList(new int[] { 1, prseq.length() }, new int[] { 1 + (codonStart - 1), (codonStart - 1) + 3 * prseq.length() }, 1, 3); @@ -498,6 +411,7 @@ public class EmblEntry SequenceFeature sf = makeCdsFeature(exon, xint, prname, prid, vals, codonStart); sf.setType(feature.getName()); // "CDS" + sf.setEnaLocation(feature.getLocation()); sf.setFeatureGroup(sourceDb); dna.addSequenceFeature(sf); } @@ -511,6 +425,9 @@ public class EmblEntry boolean mappingUsed = false; for (DBRefEntry ref : feature.dbRefs) { + /* + * ensure UniProtKB/Swiss-Prot converted to UNIPROT + */ ref.setSource(DBRefUtils.getCanonicalName(ref.getSource())); if (ref.getSource().equals(DBRefSource.UNIPROT)) { @@ -536,8 +453,7 @@ public class EmblEntry if (proteinSeq == null) { proteinSeq = new Sequence(proteinSeqName, - product - .getSequenceAsString()); + product.getSequenceAsString()); matcher.add(proteinSeq); peptides.add(proteinSeq); } @@ -559,7 +475,7 @@ public class EmblEntry if (map != null) { Mapping pmap = new Mapping(dna, map.getMap().getInverse()); - pref = new DBRefEntry(sourceDb, getVersion(), + pref = new DBRefEntry(sourceDb, getSequenceVersion(), this.getAccession()); pref.setMap(pmap); if (map.getTo() != null) @@ -578,7 +494,7 @@ public class EmblEntry protEMBLCDS = new DBRefEntry(); protEMBLCDS.setAccessionId(prid); protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct); - protEMBLCDS.setVersion(getVersion()); + protEMBLCDS.setVersion(getSequenceVersion()); protEMBLCDS .setMap(new Mapping(product, map.getMap().getInverse())); } @@ -623,11 +539,11 @@ public class EmblEntry SequenceFeature sf = new SequenceFeature(); sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1])); sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setDescription(String.format( - "Exon %d for protein '%s' EMBLCDS:%s", exonNumber, proteinName, - proteinAccessionId)); + sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s", + exonNumber, proteinName, proteinAccessionId)); sf.setPhase(String.valueOf(codonStart - 1)); - sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" : "-"); + sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" + : "-"); sf.setValue(FeatureProperties.EXONPOS, exonNumber); sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); if (!vals.isEmpty()) @@ -650,7 +566,7 @@ public class EmblEntry } /** - * Returns the CDS positions as a list of [start, end, start, end...] + * Returns the CDS positions as a single array of [start, end, start, end...] * positions. If on the reverse strand, these will be in descending order. * * @param feature @@ -658,28 +574,41 @@ public class EmblEntry */ protected int[] getCdsRanges(EmblFeature feature) { - if (feature.locations == null) + if (feature.location == null) { return new int[] {}; } - int cdsBoundaryCount = 0; // count of all start/stop locations - int[][] cdsLocations = new int[feature.locations.size()][]; - int locationNumber = 0; - for (EmblFeatureLocations loc : feature.locations) + + try { - int[] locationRanges = loc.getElementRanges(accession); - cdsLocations[locationNumber++] = locationRanges; - cdsBoundaryCount += locationRanges.length; - } - int[] cdsRanges = new int[cdsBoundaryCount]; - int copyTo = 0; - for (int[] ranges : cdsLocations) + List ranges = DnaUtils.parseLocation(feature.location); + return listToArray(ranges); + } catch (ParseException e) { - System.arraycopy(ranges, 0, cdsRanges, copyTo, ranges.length); - copyTo += ranges.length; + Cache.log.warn(String.format( + "Not parsing inexact CDS location %s in ENA %s", + feature.location, this.accession)); + return new int[] {}; } - return cdsRanges; + } + /** + * Converts a list of [start, end] ranges to a single array of [start, end, + * start, end ...] + * + * @param ranges + * @return + */ + int[] listToArray(List ranges) + { + int[] result = new int[ranges.size() * 2]; + int i = 0; + for (int[] range : ranges) + { + result[i++] = range[0]; + result[i++] = range[1]; + } + return result; } /** @@ -689,45 +618,189 @@ public class EmblEntry * @param exon * @return new exon */ - private int[] adjustForProteinLength(int prlength, int[] exon) + static int[] adjustForProteinLength(int prlength, int[] exon) { + if (prlength <= 0 || exon == null) + { + return exon; + } + int desiredCdsLength = prlength * 3; + int exonLength = MappingUtils.getLength(Arrays.asList(exon)); - int origxon[], sxpos = -1, endxon = 0, cdslength = prlength * 3; - // first adjust range for codon start attribute - if (prlength >= 1 && exon != null) + /* + * assuming here exon might include stop codon in addition to protein codons + */ + if (desiredCdsLength == exonLength + || desiredCdsLength == exonLength - 3) + { + return exon; + } + + int origxon[]; + int sxpos = -1; + int endxon = 0; + origxon = new int[exon.length]; + System.arraycopy(exon, 0, origxon, 0, exon.length); + int cdspos = 0; + for (int x = 0; x < exon.length; x += 2) { - origxon = new int[exon.length]; - System.arraycopy(exon, 0, origxon, 0, exon.length); - int cdspos = 0; - for (int x = 0; x < exon.length && sxpos == -1; x += 2) + cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; + if (desiredCdsLength <= cdspos) { - cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; - if (cdslength <= cdspos) + // advanced beyond last codon. + sxpos = x; + if (desiredCdsLength != cdspos) { - // advanced beyond last codon. - sxpos = x; - if (cdslength != cdspos) - { - System.err - .println("Truncating final exon interval on region by " - + (cdspos - cdslength)); - } - // locate the new end boundary of final exon as endxon - endxon = exon[x + 1] - cdspos + cdslength; - break; + // System.err + // .println("Truncating final exon interval on region by " + // + (cdspos - cdslength)); } - } - if (sxpos != -1) - { - // and trim the exon interval set if necessary - int[] nxon = new int[sxpos + 2]; - System.arraycopy(exon, 0, nxon, 0, sxpos + 2); - nxon[sxpos + 1] = endxon; // update the end boundary for the new exon - // set - exon = nxon; + /* + * shrink the final exon - reduce end position if forward + * strand, increase it if reverse + */ + if (exon[x + 1] >= exon[x]) + { + endxon = exon[x + 1] - cdspos + desiredCdsLength; + } + else + { + endxon = exon[x + 1] + cdspos - desiredCdsLength; + } + break; } } + + if (sxpos != -1) + { + // and trim the exon interval set if necessary + int[] nxon = new int[sxpos + 2]; + System.arraycopy(exon, 0, nxon, 0, sxpos + 2); + nxon[sxpos + 1] = endxon; // update the end boundary for the new exon + // set + exon = nxon; + } return exon; } + + public String getSequenceVersion() + { + return sequenceVersion; + } + + public void setSequenceVersion(String sequenceVersion) + { + this.sequenceVersion = sequenceVersion; + } + + public String getSequenceLength() + { + return sequenceLength; + } + + public void setSequenceLength(String sequenceLength) + { + this.sequenceLength = sequenceLength; + } + + public String getEntryVersion() + { + return entryVersion; + } + + public void setEntryVersion(String entryVersion) + { + this.entryVersion = entryVersion; + } + + public String getMoleculeType() + { + return moleculeType; + } + + public void setMoleculeType(String moleculeType) + { + this.moleculeType = moleculeType; + } + + public String getTopology() + { + return topology; + } + + public void setTopology(String topology) + { + this.topology = topology; + } + + public String getTaxonomicDivision() + { + return taxonomicDivision; + } + + public void setTaxonomicDivision(String taxonomicDivision) + { + this.taxonomicDivision = taxonomicDivision; + } + + public String getDescription() + { + return description; + } + + public void setDescription(String description) + { + this.description = description; + } + + public String getFirstPublicDate() + { + return firstPublicDate; + } + + public void setFirstPublicDate(String firstPublicDate) + { + this.firstPublicDate = firstPublicDate; + } + + public String getFirstPublicRelease() + { + return firstPublicRelease; + } + + public void setFirstPublicRelease(String firstPublicRelease) + { + this.firstPublicRelease = firstPublicRelease; + } + + public String getLastUpdatedDate() + { + return lastUpdatedDate; + } + + public void setLastUpdatedDate(String lastUpdatedDate) + { + this.lastUpdatedDate = lastUpdatedDate; + } + + public String getLastUpdatedRelease() + { + return lastUpdatedRelease; + } + + public void setLastUpdatedRelease(String lastUpdatedRelease) + { + this.lastUpdatedRelease = lastUpdatedRelease; + } + + public String getDataClass() + { + return dataClass; + } + + public void setDataClass(String dataClass) + { + this.dataClass = dataClass; + } }