From af2dbf10bbcc54d9b9e4d398e9283b3c4a758e06 Mon Sep 17 00:00:00 2001 From: Jim Procter Date: Wed, 24 Aug 2016 17:28:30 +0100 Subject: [PATCH] JAL-2106 removed setSourceDBRef, refactor getSourceDBRef to getPrimaryDBRefs * minimal test for PDB cornercase * minimal revision of AlignmentUtils/Tests, CrossRef * basic patch to SIFTSClient - need to revisit MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit TODO * jalview.io.StructureFile re ‘PDB’ primary dbrefs generated from filenames. * verify primary refs added correctly for Uniprot, EMBL, Ensembl and EnsemblGenomes in their unit tests. --- src/jalview/analysis/AlignmentUtils.java | 37 +++++++++----- src/jalview/analysis/CrossRef.java | 16 ++++--- src/jalview/datamodel/Sequence.java | 61 ++++++++++++++++++++---- src/jalview/datamodel/SequenceI.java | 21 +++----- src/jalview/datamodel/xdb/embl/EmblEntry.java | 3 -- src/jalview/ext/ensembl/EnsemblSeqProxy.java | 6 +-- src/jalview/gui/StructureChooser.java | 2 +- src/jalview/io/StructureFile.java | 4 +- src/jalview/ws/dbsources/Uniprot.java | 4 +- src/jalview/ws/sifts/SiftsClient.java | 43 ++++++----------- test/jalview/analysis/AlignmentUtilsTests.java | 6 ++- test/jalview/datamodel/SequenceTest.java | 15 +++++- 12 files changed, 133 insertions(+), 85 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 9aaaed2..ec5b6a1 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -22,7 +22,6 @@ package jalview.analysis; import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE; -import jalview.api.DBRefEntryI; import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping; @@ -1682,6 +1681,10 @@ public class AlignmentUtils * its dataset sequence to the dataset */ cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping); + // cdsSeq has a name constructed as CDS| + // will be either the accession for the coding sequence, + // marked in the /via/ dbref to the protein product accession + // or it will be the original nucleotide accession. SequenceI cdsSeqDss = cdsSeq.createDatasetSequence(); cdsSeqs.add(cdsSeq); if (!dataset.getSequences().contains(cdsSeqDss)) @@ -1745,16 +1748,28 @@ public class AlignmentUtils * same source and accession, so need a different accession for * the CDS from the dna sequence */ - DBRefEntryI dnaRef = dnaDss.getSourceDBRef(); - if (dnaRef != null) - { - // assuming cds version same as dna ?!? - DBRefEntry proteinToCdsRef = new DBRefEntry(dnaRef.getSource(), - dnaRef.getVersion(), cdsSeq.getName()); - proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap - .getInverse())); - proteinProduct.addDBRef(proteinToCdsRef); - } + // specific use case: + // Genomic contig ENSCHR:1, contains coding regions for ENSG01, + // ENSG02, ENSG03, with transcripts and products similarly named. + // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01 + // JBPNote: ?? can't actually create an example that demonstrates we + // need to + // synthesize an xref. + // TODO: merge conflicts from JAL-2154 branch and use PrimaryDBRefs() + // for (DBRefEntry primRef:dnaDss.getPrimaryDBRefs()) + // { + // creates a complementary cross-reference to the source sequence's + // primary reference. + + // // problem here is that the cross-reference is synthesized - + // cdsSeq.getName() may be like 'CDS|dnaaccession' or 'CDS|emblcdsacc' + // // assuming cds version same as dna ?!? + // DBRefEntry proteinToCdsRef = new DBRefEntry(dnaRef.getSource(), + // dnaRef.getVersion(), cdsSeq.getName()); + // proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap + // .getInverse())); + // proteinProduct.addDBRef(proteinToCdsRef); + // } /* * transfer any features on dna that overlap the CDS diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index 288d60e..a169ba6 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -706,14 +706,16 @@ public class CrossRef /* * and add a reverse DbRef with the inverse mapping */ - if (mapFrom.getDatasetSequence() != null - && mapFrom.getDatasetSequence().getSourceDBRef() != null) + if (mapFrom.getDatasetSequence() != null && false) + // && mapFrom.getDatasetSequence().getSourceDBRef() != null) { - DBRefEntry dbref = new DBRefEntry(mapFrom.getDatasetSequence() - .getSourceDBRef()); - dbref.setMap(new Mapping(mapFrom.getDatasetSequence(), mapping - .getInverse())); - mapTo.addDBRef(dbref); + // possible need to search primary references... except, why doesn't xref + // == getSourceDBRef ?? + // DBRefEntry dbref = new DBRefEntry(mapFrom.getDatasetSequence() + // .getSourceDBRef()); + // dbref.setMap(new Mapping(mapFrom.getDatasetSequence(), mapping + // .getInverse())); + // mapTo.addDBRef(dbref); } if (fromDna) diff --git a/src/jalview/datamodel/Sequence.java b/src/jalview/datamodel/Sequence.java index 98cd4b2..ec8aa81 100755 --- a/src/jalview/datamodel/Sequence.java +++ b/src/jalview/datamodel/Sequence.java @@ -22,6 +22,8 @@ package jalview.datamodel; import jalview.analysis.AlignSeq; import jalview.api.DBRefEntryI; +import jalview.util.DBRefUtils; +import jalview.util.MapList; import jalview.util.StringUtils; import java.util.ArrayList; @@ -235,8 +237,6 @@ public class Sequence extends ASequence implements SequenceI seq.getEnd()); } description = seq.getDescription(); - sourceDBRef = seq.getSourceDBRef() == null ? null : new DBRefEntry( - seq.getSourceDBRef()); if (seq != datasetSequence) { setDatasetSequence(seq.getDatasetSequence()); @@ -1413,16 +1413,59 @@ public class Sequence extends ASequence implements SequenceI return null; } - @Override - public void setSourceDBRef(DBRefEntryI dbRef) - { - this.sourceDBRef = dbRef; - } @Override - public DBRefEntryI getSourceDBRef() + public List getPrimaryDBRefs() { - return this.sourceDBRef; + if (datasetSequence!=null) + { + return datasetSequence.getPrimaryDBRefs(); + } + if (dbrefs==null || dbrefs.length==0) + { + return Arrays.asList(new DBRefEntry[0]); + } + synchronized (dbrefs) + { + List primaries = new ArrayList(); + DBRefEntry tmp[] = new DBRefEntry[1], res[] = null; + for (DBRefEntry ref : dbrefs) + { + if (!ref.isPrimary()) + { + continue; + } + if (ref.hasMap()) + { + MapList mp = ref.getMap().getMap(); + if (mp.getFromLowest() > start || mp.getFromHighest() < end) + { + // map only involves a subsequence, so cannot be primary + continue; + } + } + // whilst it looks like it is a primary ref, we also sanity check type + if (DBRefUtils.getCanonicalName(DBRefSource.PDB).equals( + DBRefUtils.getCanonicalName(ref.getSource()))) + { + // PDB dbrefs imply there should be a PDBEntry associated + if (getPDBEntry(ref.getAccessionId()) != null) + { + primaries.add(ref); + } + continue; + } + // check standard protein or dna sources + tmp[0] = ref; + res = DBRefUtils.selectDbRefs(!isProtein(), tmp); + if (res != null && res[0] == tmp[0]) + { + primaries.add(ref); + continue; + } + } + return primaries; + } } } diff --git a/src/jalview/datamodel/SequenceI.java b/src/jalview/datamodel/SequenceI.java index 45a767c..ec7520b 100755 --- a/src/jalview/datamodel/SequenceI.java +++ b/src/jalview/datamodel/SequenceI.java @@ -20,8 +20,6 @@ */ package jalview.datamodel; -import jalview.api.DBRefEntryI; - import java.util.List; import java.util.Vector; @@ -443,21 +441,14 @@ public interface SequenceI extends ASequenceI */ public PDBEntry getPDBEntry(String pdbId); - /** - * Set the distinct source database, and accession number from which a - * sequence and its start-end data were derived from. This is very important - * for SIFTS mappings and must be set prior to performing SIFTS mapping. - * - * @param dbRef - * the source dbRef for the sequence - */ - public void setSourceDBRef(DBRefEntryI dbRef); /** - * Get the distinct source database, and accession number from which a - * sequence and its start-end data were derived from. + * Get all primary database/accessions for this sequence's data. These + * DBRefEntry are expected to resolve to a valid record in the associated + * external database, either directly or via a provided 1:1 Mapping. * - * @return + * @return just the primary references (if any) for this sequence, or an empty + * list */ - public DBRefEntryI getSourceDBRef(); + public List getPrimaryDBRefs(); } diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index 56b1325..8688720 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -192,7 +192,6 @@ public class EmblEntry DBRefEntry retrievedref = new DBRefEntry(sourceDb, getSequenceVersion(), accession); dna.addDBRef(retrievedref); - dna.setSourceDBRef(retrievedref); // add map to indicate the sequence is a valid coordinate frame for the // dbref retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() }, @@ -495,7 +494,6 @@ public class EmblEntry dnaToProteinMapping.setTo(proteinSeq); dnaToProteinMapping.setMappedFromId(proteinId); proteinSeq.addDBRef(proteinDbRef); - proteinSeq.setSourceDBRef(proteinDbRef); ref.setMap(dnaToProteinMapping); } hasUniprotDbref = true; @@ -540,7 +538,6 @@ public class EmblEntry DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId); } product.addDBRef(proteinToEmblProteinRef); - product.setSourceDBRef(proteinToEmblProteinRef); if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null) diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 31552af..e44b610 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -276,8 +276,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient { // clunky: ensure Uniprot xref if we have one is on mapped sequence SequenceI ds = proteinSeq.getDatasetSequence(); - ds.setSourceDBRef(proteinSeq.getSourceDBRef()); - + // TODO: Verify ensp primary ref is on proteinSeq.getDatasetSequence() Mapping map = new Mapping(ds, mapList); DBRefEntry dbr = new DBRefEntry(getDbSource(), getEnsemblDataVersion(), proteinSeq.getName(), map); @@ -322,7 +321,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient DBRefEntry self = new DBRefEntry(getDbSource(), getEnsemblDataVersion(), seq.getName()); seq.addDBRef(self); - seq.setSourceDBRef(self); } /** @@ -382,7 +380,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient { DBRefEntry dbref = DBRefUtils.parseToDbRef(sq, getDbSource(), getEnsemblDataVersion(), name); - sq.setSourceDBRef(dbref); + sq.addDBRef(dbref); } } if (alignment == null) diff --git a/src/jalview/gui/StructureChooser.java b/src/jalview/gui/StructureChooser.java index 13fa460..b2cc70f 100644 --- a/src/jalview/gui/StructureChooser.java +++ b/src/jalview/gui/StructureChooser.java @@ -867,7 +867,7 @@ public class StructureChooser extends GStructureChooser implements ArrayList seqsWithoutSourceDBRef = new ArrayList(); for (SequenceI seq : sequences) { - if (seq.getSourceDBRef() == null && seq.getDBRefs() == null) + if (seq.getPrimaryDBRefs().size() == 0) { seqsWithoutSourceDBRef.add(seq); continue; diff --git a/src/jalview/io/StructureFile.java b/src/jalview/io/StructureFile.java index fc0e207..f095383 100644 --- a/src/jalview/io/StructureFile.java +++ b/src/jalview/io/StructureFile.java @@ -117,7 +117,9 @@ public abstract class StructureFile extends AlignFile DBRefEntry sourceDBRef = new DBRefEntry(); sourceDBRef.setAccessionId(getId()); sourceDBRef.setSource(DBRefSource.PDB); - pdbSequence.setSourceDBRef(sourceDBRef); + // TODO: specify version for 'PDB' database ref if it is read from a file. + // TODO: decide if jalview.io should be creating primary refs! + sourceDBRef.setVersion(""); pdbSequence.addPDBId(entry); pdbSequence.addDBRef(sourceDBRef); SequenceI chainseq = pdbSequence; diff --git a/src/jalview/ws/dbsources/Uniprot.java b/src/jalview/ws/dbsources/Uniprot.java index 8cc0ce4..81b4caf 100644 --- a/src/jalview/ws/dbsources/Uniprot.java +++ b/src/jalview/ws/dbsources/Uniprot.java @@ -205,10 +205,10 @@ public class Uniprot extends DbSourceProxyImpl { DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion, accessionId); + + // mark dbRef as a primary reference for this sequence dbRefs.add(dbRef); } - sequence.setSourceDBRef((dbRefs != null && dbRefs.size() > 0) ? dbRefs - .get(0) : null); Vector onlyPdbEntries = new Vector(); for (PDBEntry pdb : entry.getDbReference()) diff --git a/src/jalview/ws/sifts/SiftsClient.java b/src/jalview/ws/sifts/SiftsClient.java index 6c94723..0ab6e7d 100644 --- a/src/jalview/ws/sifts/SiftsClient.java +++ b/src/jalview/ws/sifts/SiftsClient.java @@ -323,41 +323,28 @@ public class SiftsClient implements SiftsClientI public DBRefEntryI getValidSourceDBRef(SequenceI seq) throws SiftsException { - DBRefEntryI sourceDBRef = null; - sourceDBRef = seq.getSourceDBRef(); - if (sourceDBRef != null && isValidDBRefEntry(sourceDBRef)) + DBRefEntry[] dbRefs = seq.getDBRefs(); + if (dbRefs == null || dbRefs.length < 1) { - return sourceDBRef; + throw new SiftsException( + "Source DBRef could not be determined. DBRefs might not have been retrieved."); } - else + + for (DBRefEntryI dbRef : dbRefs) { - DBRefEntry[] dbRefs = seq.getDBRefs(); - if (dbRefs == null || dbRefs.length < 1) + if (dbRef == null || dbRef.getAccessionId() == null + || dbRef.getSource() == null) { - throw new SiftsException( - "Source DBRef could not be determined. DBRefs might not have been retrieved."); + continue; } - - for (DBRefEntryI dbRef : dbRefs) + if (isValidDBRefEntry(dbRef) + && dbRef.isPrimary() + && (dbRef.getSource().equalsIgnoreCase(DBRefSource.UNIPROT) || dbRef + .getSource().equalsIgnoreCase(DBRefSource.PDB))) { - if (dbRef == null || dbRef.getAccessionId() == null - || dbRef.getSource() == null) - { - continue; - } - if (isFoundInSiftsEntry(dbRef.getAccessionId()) - && (dbRef.getSource().equalsIgnoreCase(DBRefSource.UNIPROT) || dbRef - .getSource().equalsIgnoreCase(DBRefSource.PDB))) - { - seq.setSourceDBRef(dbRef); - return dbRef; - } + return dbRef; } } - if (sourceDBRef != null && isValidDBRefEntry(sourceDBRef)) - { - return sourceDBRef; - } throw new SiftsException("Could not get source DB Ref"); } @@ -440,7 +427,7 @@ public class SiftsClient implements SiftsClientI String originalSeq = AlignSeq.extractGaps( jalview.util.Comparison.GapChars, seq.getSequenceAsString()); HashMap mapping = new HashMap(); - DBRefEntryI sourceDBRef = seq.getSourceDBRef(); + DBRefEntryI sourceDBRef; sourceDBRef = getValidSourceDBRef(seq); // TODO ensure sequence start/end is in the same coordinate system and // consistent with the choosen sourceDBRef diff --git a/test/jalview/analysis/AlignmentUtilsTests.java b/test/jalview/analysis/AlignmentUtilsTests.java index 7661c5d..1df7fd9 100644 --- a/test/jalview/analysis/AlignmentUtilsTests.java +++ b/test/jalview/analysis/AlignmentUtilsTests.java @@ -997,9 +997,11 @@ public class AlignmentUtilsTests * sequence */ DBRefEntry dbref = new DBRefEntry("ENSEMBL", "0", "dna1"); - dna1.getDatasetSequence().setSourceDBRef(dbref); + dna1.getDatasetSequence().addDBRef(dbref); + org.testng.Assert.assertEquals(dbref, dna1.getPrimaryDBRefs().get(0)); dbref = new DBRefEntry("ENSEMBL", "0", "dna2"); - dna2.getDatasetSequence().setSourceDBRef(dbref); + dna2.getDatasetSequence().addDBRef(dbref); + org.testng.Assert.assertEquals(dbref, dna2.getPrimaryDBRefs().get(0)); /* * CDS sequences are 'discovered' from dna-to-protein mappings on the alignment diff --git a/test/jalview/datamodel/SequenceTest.java b/test/jalview/datamodel/SequenceTest.java index 1fed98b..f586776 100644 --- a/test/jalview/datamodel/SequenceTest.java +++ b/test/jalview/datamodel/SequenceTest.java @@ -499,7 +499,7 @@ public class SequenceTest new AlignmentAnnotation("Test annot", "Test annot description", annots)); Assert.assertEquals(sq.getDescription(), "Test sequence description.."); - Assert.assertEquals(sq.getDBRefs().length, 4); + Assert.assertEquals(sq.getDBRefs().length, 5); Assert.assertEquals(sq.getAllPDBEntries().size(), 4); Assert.assertNotNull(sq.getAnnotation()); Assert.assertEquals(sq.getAnnotation()[0].annotations.length, 2); @@ -512,7 +512,7 @@ public class SequenceTest Assert.assertEquals(derived.getDescription(), "Test sequence description.."); - Assert.assertEquals(derived.getDBRefs().length, 4); + Assert.assertEquals(derived.getDBRefs().length, 4); // come from dataset Assert.assertEquals(derived.getAllPDBEntries().size(), 4); Assert.assertNotNull(derived.getAnnotation()); Assert.assertEquals(derived.getAnnotation()[0].annotations.length, 2); @@ -530,6 +530,17 @@ public class SequenceTest assertNotNull(sq.getSequenceFeatures()); assertArrayEquals(sq.getSequenceFeatures(), derived.getSequenceFeatures()); + + /* + * verify we have primary db refs *just* for PDB IDs with associated + * PDBEntry objects + */ + + assertEquals(primRefs, sq.getPrimaryDBRefs()); + assertEquals(primRefs, sq.getDatasetSequence().getPrimaryDBRefs()); + + assertEquals(sq.getPrimaryDBRefs(), derived.getPrimaryDBRefs()); + } /** -- 1.7.10.2