JAL-2106 removed setSourceDBRef, refactor getSourceDBRef to getPrimaryDBRefs
authorJim Procter <jprocter@issues.jalview.org>
Wed, 24 Aug 2016 16:28:30 +0000 (17:28 +0100)
committerJim Procter <jprocter@issues.jalview.org>
Wed, 24 Aug 2016 16:28:30 +0000 (17:28 +0100)
 * minimal test for PDB cornercase
 * minimal revision of AlignmentUtils/Tests, CrossRef
 * basic patch to SIFTSClient - need to revisit

TODO
 * jalview.io.StructureFile re ‘PDB’ primary dbrefs generated from filenames.
 * verify primary refs added correctly for Uniprot, EMBL,  Ensembl and EnsemblGenomes in their unit tests.

12 files changed:
src/jalview/analysis/AlignmentUtils.java
src/jalview/analysis/CrossRef.java
src/jalview/datamodel/Sequence.java
src/jalview/datamodel/SequenceI.java
src/jalview/datamodel/xdb/embl/EmblEntry.java
src/jalview/ext/ensembl/EnsemblSeqProxy.java
src/jalview/gui/StructureChooser.java
src/jalview/io/StructureFile.java
src/jalview/ws/dbsources/Uniprot.java
src/jalview/ws/sifts/SiftsClient.java
test/jalview/analysis/AlignmentUtilsTests.java
test/jalview/datamodel/SequenceTest.java

index 9aaaed2..ec5b6a1 100644 (file)
@@ -22,7 +22,6 @@ package jalview.analysis;
 
 import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE;
 
-import jalview.api.DBRefEntryI;
 import jalview.datamodel.AlignedCodon;
 import jalview.datamodel.AlignedCodonFrame;
 import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
@@ -1682,6 +1681,10 @@ public class AlignmentUtils
            * its dataset sequence to the dataset
            */
           cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping);
+          // cdsSeq has a name constructed as CDS|<dbref>
+          // <dbref> will be either the accession for the coding sequence,
+          // marked in the /via/ dbref to the protein product accession
+          // or it will be the original nucleotide accession.
           SequenceI cdsSeqDss = cdsSeq.createDatasetSequence();
           cdsSeqs.add(cdsSeq);
           if (!dataset.getSequences().contains(cdsSeqDss))
@@ -1745,16 +1748,28 @@ public class AlignmentUtils
            * same source and accession, so need a different accession for
            * the CDS from the dna sequence
            */
-          DBRefEntryI dnaRef = dnaDss.getSourceDBRef();
-          if (dnaRef != null)
-          {
-            // assuming cds version same as dna ?!?
-            DBRefEntry proteinToCdsRef = new DBRefEntry(dnaRef.getSource(),
-                    dnaRef.getVersion(), cdsSeq.getName());
-            proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap
-                    .getInverse()));
-            proteinProduct.addDBRef(proteinToCdsRef);
-          }
+          // specific use case:
+          // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
+          // ENSG02, ENSG03, with transcripts and products similarly named.
+          // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
+          // JBPNote: ?? can't actually create an example that demonstrates we
+          // need to
+          // synthesize an xref.
+          // TODO: merge conflicts from JAL-2154 branch and use PrimaryDBRefs()
+          // for (DBRefEntry primRef:dnaDss.getPrimaryDBRefs())
+          // {
+          // creates a complementary cross-reference to the source sequence's
+          // primary reference.
+
+          // // problem here is that the cross-reference is synthesized -
+          // cdsSeq.getName() may be like 'CDS|dnaaccession' or 'CDS|emblcdsacc'
+          // // assuming cds version same as dna ?!?
+          // DBRefEntry proteinToCdsRef = new DBRefEntry(dnaRef.getSource(),
+          // dnaRef.getVersion(), cdsSeq.getName());
+          // proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap
+          // .getInverse()));
+          // proteinProduct.addDBRef(proteinToCdsRef);
+          // }
 
           /*
            * transfer any features on dna that overlap the CDS
index 288d60e..a169ba6 100644 (file)
@@ -706,14 +706,16 @@ public class CrossRef
     /*
      * and add a reverse DbRef with the inverse mapping
      */
-    if (mapFrom.getDatasetSequence() != null
-            && mapFrom.getDatasetSequence().getSourceDBRef() != null)
+    if (mapFrom.getDatasetSequence() != null && false)
+    // && mapFrom.getDatasetSequence().getSourceDBRef() != null)
     {
-      DBRefEntry dbref = new DBRefEntry(mapFrom.getDatasetSequence()
-              .getSourceDBRef());
-      dbref.setMap(new Mapping(mapFrom.getDatasetSequence(), mapping
-              .getInverse()));
-      mapTo.addDBRef(dbref);
+      // possible need to search primary references... except, why doesn't xref
+      // == getSourceDBRef ??
+      // DBRefEntry dbref = new DBRefEntry(mapFrom.getDatasetSequence()
+      // .getSourceDBRef());
+      // dbref.setMap(new Mapping(mapFrom.getDatasetSequence(), mapping
+      // .getInverse()));
+      // mapTo.addDBRef(dbref);
     }
 
     if (fromDna)
index 98cd4b2..ec8aa81 100755 (executable)
@@ -22,6 +22,8 @@ package jalview.datamodel;
 
 import jalview.analysis.AlignSeq;
 import jalview.api.DBRefEntryI;
+import jalview.util.DBRefUtils;
+import jalview.util.MapList;
 import jalview.util.StringUtils;
 
 import java.util.ArrayList;
@@ -235,8 +237,6 @@ public class Sequence extends ASequence implements SequenceI
             seq.getEnd());
     }
     description = seq.getDescription();
-    sourceDBRef = seq.getSourceDBRef() == null ? null : new DBRefEntry(
-            seq.getSourceDBRef());
     if (seq != datasetSequence)
     {
       setDatasetSequence(seq.getDatasetSequence());
@@ -1413,16 +1413,59 @@ public class Sequence extends ASequence implements SequenceI
     return null;
   }
 
-  @Override
-  public void setSourceDBRef(DBRefEntryI dbRef)
-  {
-    this.sourceDBRef = dbRef;
-  }
 
   @Override
-  public DBRefEntryI getSourceDBRef()
+  public List<DBRefEntry> getPrimaryDBRefs()
   {
-    return this.sourceDBRef;
+    if (datasetSequence!=null)
+    {
+      return datasetSequence.getPrimaryDBRefs();
+    }
+    if (dbrefs==null || dbrefs.length==0)
+    {
+      return Arrays.asList(new DBRefEntry[0]);
+    }
+    synchronized (dbrefs)
+    {
+      List<DBRefEntry> primaries = new ArrayList<DBRefEntry>();
+      DBRefEntry tmp[] = new DBRefEntry[1], res[] = null;
+      for (DBRefEntry ref : dbrefs)
+      {
+        if (!ref.isPrimary())
+        {
+          continue;
+        }
+        if (ref.hasMap())
+        {
+          MapList mp = ref.getMap().getMap();
+          if (mp.getFromLowest() > start || mp.getFromHighest() < end)
+          {
+            // map only involves a subsequence, so cannot be primary
+            continue;
+          }
+        }
+        // whilst it looks like it is a primary ref, we also sanity check type
+        if (DBRefUtils.getCanonicalName(DBRefSource.PDB).equals(
+                DBRefUtils.getCanonicalName(ref.getSource())))
+        {
+          // PDB dbrefs imply there should be a PDBEntry associated
+          if (getPDBEntry(ref.getAccessionId()) != null)
+          {
+            primaries.add(ref);
+          }
+          continue;
+        }
+        // check standard protein or dna sources
+        tmp[0] = ref;
+        res = DBRefUtils.selectDbRefs(!isProtein(), tmp);
+        if (res != null && res[0] == tmp[0])
+        {
+          primaries.add(ref);
+          continue;
+        }
+      }
+      return primaries;
+    }
   }
 
 }
index 45a767c..ec7520b 100755 (executable)
@@ -20,8 +20,6 @@
  */
 package jalview.datamodel;
 
-import jalview.api.DBRefEntryI;
-
 import java.util.List;
 import java.util.Vector;
 
@@ -443,21 +441,14 @@ public interface SequenceI extends ASequenceI
    */
   public PDBEntry getPDBEntry(String pdbId);
 
-  /**
-   * Set the distinct source database, and accession number from which a
-   * sequence and its start-end data were derived from. This is very important
-   * for SIFTS mappings and must be set prior to performing SIFTS mapping.
-   * 
-   * @param dbRef
-   *          the source dbRef for the sequence
-   */
-  public void setSourceDBRef(DBRefEntryI dbRef);
 
   /**
-   * Get the distinct source database, and accession number from which a
-   * sequence and its start-end data were derived from.
+   * Get all primary database/accessions for this sequence's data. These
+   * DBRefEntry are expected to resolve to a valid record in the associated
+   * external database, either directly or via a provided 1:1 Mapping.
    * 
-   * @return
+   * @return just the primary references (if any) for this sequence, or an empty
+   *         list
    */
-  public DBRefEntryI getSourceDBRef();
+  public List<DBRefEntry> getPrimaryDBRefs();
 }
index 56b1325..8688720 100644 (file)
@@ -192,7 +192,6 @@ public class EmblEntry
     DBRefEntry retrievedref = new DBRefEntry(sourceDb,
             getSequenceVersion(), accession);
     dna.addDBRef(retrievedref);
-    dna.setSourceDBRef(retrievedref);
     // add map to indicate the sequence is a valid coordinate frame for the
     // dbref
     retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
@@ -495,7 +494,6 @@ public class EmblEntry
             dnaToProteinMapping.setTo(proteinSeq);
             dnaToProteinMapping.setMappedFromId(proteinId);
             proteinSeq.addDBRef(proteinDbRef);
-            proteinSeq.setSourceDBRef(proteinDbRef);
             ref.setMap(dnaToProteinMapping);
           }
           hasUniprotDbref = true;
@@ -540,7 +538,6 @@ public class EmblEntry
                 DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId);
       }
       product.addDBRef(proteinToEmblProteinRef);
-      product.setSourceDBRef(proteinToEmblProteinRef);
 
       if (dnaToProteinMapping != null
               && dnaToProteinMapping.getTo() != null)
index 31552af..e44b610 100644 (file)
@@ -276,8 +276,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       {
         // clunky: ensure Uniprot xref if we have one is on mapped sequence
         SequenceI ds = proteinSeq.getDatasetSequence();
-        ds.setSourceDBRef(proteinSeq.getSourceDBRef());
-
+        // TODO: Verify ensp primary ref is on proteinSeq.getDatasetSequence()
         Mapping map = new Mapping(ds, mapList);
         DBRefEntry dbr = new DBRefEntry(getDbSource(),
                 getEnsemblDataVersion(), proteinSeq.getName(), map);
@@ -322,7 +321,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     DBRefEntry self = new DBRefEntry(getDbSource(),
             getEnsemblDataVersion(), seq.getName());
     seq.addDBRef(self);
-    seq.setSourceDBRef(self);
   }
 
   /**
@@ -382,7 +380,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
         {
           DBRefEntry dbref = DBRefUtils.parseToDbRef(sq, getDbSource(),
                   getEnsemblDataVersion(), name);
-          sq.setSourceDBRef(dbref);
+          sq.addDBRef(dbref);
         }
       }
       if (alignment == null)
index 13fa460..b2cc70f 100644 (file)
@@ -867,7 +867,7 @@ public class StructureChooser extends GStructureChooser implements
       ArrayList<SequenceI> seqsWithoutSourceDBRef = new ArrayList<SequenceI>();
       for (SequenceI seq : sequences)
       {
-        if (seq.getSourceDBRef() == null && seq.getDBRefs() == null)
+        if (seq.getPrimaryDBRefs().size() == 0)
         {
             seqsWithoutSourceDBRef.add(seq);
             continue;
index fc0e207..f095383 100644 (file)
@@ -117,7 +117,9 @@ public abstract class StructureFile extends AlignFile
     DBRefEntry sourceDBRef = new DBRefEntry();
     sourceDBRef.setAccessionId(getId());
     sourceDBRef.setSource(DBRefSource.PDB);
-    pdbSequence.setSourceDBRef(sourceDBRef);
+    // TODO: specify version for 'PDB' database ref if it is read from a file.
+    // TODO: decide if jalview.io should be creating primary refs!
+    sourceDBRef.setVersion("");
     pdbSequence.addPDBId(entry);
     pdbSequence.addDBRef(sourceDBRef);
     SequenceI chainseq = pdbSequence;
index 8cc0ce4..81b4caf 100644 (file)
@@ -205,10 +205,10 @@ public class Uniprot extends DbSourceProxyImpl
     {
       DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion,
               accessionId);
+
+      // mark dbRef as a primary reference for this sequence
       dbRefs.add(dbRef);
     }
-    sequence.setSourceDBRef((dbRefs != null && dbRefs.size() > 0) ? dbRefs
-            .get(0) : null);
 
     Vector<PDBEntry> onlyPdbEntries = new Vector<PDBEntry>();
     for (PDBEntry pdb : entry.getDbReference())
index 6c94723..0ab6e7d 100644 (file)
@@ -323,41 +323,28 @@ public class SiftsClient implements SiftsClientI
   public DBRefEntryI getValidSourceDBRef(SequenceI seq)
           throws SiftsException
   {
-    DBRefEntryI sourceDBRef = null;
-    sourceDBRef = seq.getSourceDBRef();
-    if (sourceDBRef != null && isValidDBRefEntry(sourceDBRef))
+    DBRefEntry[] dbRefs = seq.getDBRefs();
+    if (dbRefs == null || dbRefs.length < 1)
     {
-      return sourceDBRef;
+      throw new SiftsException(
+              "Source DBRef could not be determined. DBRefs might not have been retrieved.");
     }
-    else
+
+    for (DBRefEntryI dbRef : dbRefs)
     {
-      DBRefEntry[] dbRefs = seq.getDBRefs();
-      if (dbRefs == null || dbRefs.length < 1)
+      if (dbRef == null || dbRef.getAccessionId() == null
+              || dbRef.getSource() == null)
       {
-        throw new SiftsException(
-                "Source DBRef could not be determined. DBRefs might not have been retrieved.");
+        continue;
       }
-
-      for (DBRefEntryI dbRef : dbRefs)
+      if (isValidDBRefEntry(dbRef)
+              && dbRef.isPrimary()
+              && (dbRef.getSource().equalsIgnoreCase(DBRefSource.UNIPROT) || dbRef
+                      .getSource().equalsIgnoreCase(DBRefSource.PDB)))
       {
-        if (dbRef == null || dbRef.getAccessionId() == null
-                || dbRef.getSource() == null)
-        {
-          continue;
-        }
-        if (isFoundInSiftsEntry(dbRef.getAccessionId())
-                && (dbRef.getSource().equalsIgnoreCase(DBRefSource.UNIPROT) || dbRef
-                        .getSource().equalsIgnoreCase(DBRefSource.PDB)))
-        {
-          seq.setSourceDBRef(dbRef);
-          return dbRef;
-        }
+        return dbRef;
       }
     }
-    if (sourceDBRef != null && isValidDBRefEntry(sourceDBRef))
-    {
-      return sourceDBRef;
-    }
     throw new SiftsException("Could not get source DB Ref");
   }
 
@@ -440,7 +427,7 @@ public class SiftsClient implements SiftsClientI
     String originalSeq = AlignSeq.extractGaps(
             jalview.util.Comparison.GapChars, seq.getSequenceAsString());
     HashMap<Integer, int[]> mapping = new HashMap<Integer, int[]>();
-    DBRefEntryI sourceDBRef = seq.getSourceDBRef();
+    DBRefEntryI sourceDBRef;
     sourceDBRef = getValidSourceDBRef(seq);
     // TODO ensure sequence start/end is in the same coordinate system and
     // consistent with the choosen sourceDBRef
index 7661c5d..1df7fd9 100644 (file)
@@ -997,9 +997,11 @@ public class AlignmentUtilsTests
      * sequence
      */
     DBRefEntry dbref = new DBRefEntry("ENSEMBL", "0", "dna1");
-    dna1.getDatasetSequence().setSourceDBRef(dbref);
+    dna1.getDatasetSequence().addDBRef(dbref);
+    org.testng.Assert.assertEquals(dbref, dna1.getPrimaryDBRefs().get(0));
     dbref = new DBRefEntry("ENSEMBL", "0", "dna2");
-    dna2.getDatasetSequence().setSourceDBRef(dbref);
+    dna2.getDatasetSequence().addDBRef(dbref);
+    org.testng.Assert.assertEquals(dbref, dna2.getPrimaryDBRefs().get(0));
 
     /*
      * CDS sequences are 'discovered' from dna-to-protein mappings on the alignment
index 1fed98b..f586776 100644 (file)
@@ -499,7 +499,7 @@ public class SequenceTest
             new AlignmentAnnotation("Test annot", "Test annot description",
                     annots));
     Assert.assertEquals(sq.getDescription(), "Test sequence description..");
-    Assert.assertEquals(sq.getDBRefs().length, 4);
+    Assert.assertEquals(sq.getDBRefs().length, 5);
     Assert.assertEquals(sq.getAllPDBEntries().size(), 4);
     Assert.assertNotNull(sq.getAnnotation());
     Assert.assertEquals(sq.getAnnotation()[0].annotations.length, 2);
@@ -512,7 +512,7 @@ public class SequenceTest
 
     Assert.assertEquals(derived.getDescription(),
             "Test sequence description..");
-    Assert.assertEquals(derived.getDBRefs().length, 4);
+    Assert.assertEquals(derived.getDBRefs().length, 4); // come from dataset
     Assert.assertEquals(derived.getAllPDBEntries().size(), 4);
     Assert.assertNotNull(derived.getAnnotation());
     Assert.assertEquals(derived.getAnnotation()[0].annotations.length, 2);
@@ -530,6 +530,17 @@ public class SequenceTest
     assertNotNull(sq.getSequenceFeatures());
     assertArrayEquals(sq.getSequenceFeatures(),
             derived.getSequenceFeatures());
+    
+    /*
+     *  verify we have primary db refs *just* for PDB IDs with associated
+     *  PDBEntry objects
+     */
+
+    assertEquals(primRefs, sq.getPrimaryDBRefs());
+    assertEquals(primRefs, sq.getDatasetSequence().getPrimaryDBRefs());
+
+    assertEquals(sq.getPrimaryDBRefs(), derived.getPrimaryDBRefs());
+
   }
 
   /**