Merge branch 'develop' into refactor/JAL-2106_sourceDbRef_revision

author Jim Procter <jprocter@issues.jalview.org>

Thu, 25 Aug 2016 09:54:04 +0000 (10:54 +0100)

committer Jim Procter <jprocter@issues.jalview.org>

Thu, 25 Aug 2016 09:54:04 +0000 (10:54 +0100)
author Jim Procter <jprocter@issues.jalview.org>
Thu, 25 Aug 2016 09:54:04 +0000 (10:54 +0100)
committer Jim Procter <jprocter@issues.jalview.org>
Thu, 25 Aug 2016 09:54:04 +0000 (10:54 +0100)
diff --combined src/jalview/analysis/AlignmentUtils.java

index ec5b6a1,d93f42f..dc57f6e
--- 1/src/jalview/analysis/AlignmentUtils.java
--- 2/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@@ -22,6 -22,7 +22,6 @@@ package jalview.analysis
   
   import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE;
   
- -import jalview.api.DBRefEntryI;
   import jalview.datamodel.AlignedCodon;
   import jalview.datamodel.AlignedCodonFrame;
   import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
@@@ -1681,10 -1682,6 +1681,10 @@@ public class AlignmentUtil
              * its dataset sequence to the dataset
              */
             cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping);
+ +          // cdsSeq has a name constructed as CDS|<dbref>
+ +          // <dbref> will be either the accession for the coding sequence,
+ +          // marked in the /via/ dbref to the protein product accession
+ +          // or it will be the original nucleotide accession.
             SequenceI cdsSeqDss = cdsSeq.createDatasetSequence();
             cdsSeqs.add(cdsSeq);
             if (!dataset.getSequences().contains(cdsSeqDss))
@@@ -1748,28 -1745,16 +1748,28 @@@
              * same source and accession, so need a different accession for
              * the CDS from the dna sequence
              */
- -          DBRefEntryI dnaRef = dnaDss.getSourceDBRef();
- -          if (dnaRef != null)
- -          {
- -            // assuming cds version same as dna ?!?
- -            DBRefEntry proteinToCdsRef = new DBRefEntry(dnaRef.getSource(),
- -                    dnaRef.getVersion(), cdsSeq.getName());
- -            proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap
- -                    .getInverse()));
- -            proteinProduct.addDBRef(proteinToCdsRef);
- -          }
+ +          // specific use case:
+ +          // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
+ +          // ENSG02, ENSG03, with transcripts and products similarly named.
+ +          // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
+ +          // JBPNote: ?? can't actually create an example that demonstrates we
+ +          // need to
+ +          // synthesize an xref.
+ +          // TODO: merge conflicts from JAL-2154 branch and use PrimaryDBRefs()
+ +          // for (DBRefEntry primRef:dnaDss.getPrimaryDBRefs())
+ +          // {
+ +          // creates a complementary cross-reference to the source sequence's
+ +          // primary reference.
+ +
+ +          // // problem here is that the cross-reference is synthesized -
+ +          // cdsSeq.getName() may be like 'CDS|dnaaccession' or 'CDS|emblcdsacc'
+ +          // // assuming cds version same as dna ?!?
+ +          // DBRefEntry proteinToCdsRef = new DBRefEntry(dnaRef.getSource(),
+ +          // dnaRef.getVersion(), cdsSeq.getName());
+ +          // proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap
+ +          // .getInverse()));
+ +          // proteinProduct.addDBRef(proteinToCdsRef);
+ +          // }
   
             /*
              * transfer any features on dna that overlap the CDS
@@@ -2649,13 -2634,16 +2649,16 @@@
         return false; // should only pass alignments with datasets here
       }
   
-     // map from dataset sequence to alignment sequence
-     Map<SequenceI, SequenceI> alignedDatasets = new HashMap<SequenceI, SequenceI>();
+     // map from dataset sequence to alignment sequence(s)
+     Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<SequenceI, List<SequenceI>>();
       for (SequenceI seq : aligned.getSequences())
       {
-       // JAL-2110: fail if two or more alignment sequences have a common dataset
-       // sequence.
-       alignedDatasets.put(seq.getDatasetSequence(), seq);
+       SequenceI ds = seq.getDatasetSequence();
+       if (alignedDatasets.get(ds) == null)
+       {
+         alignedDatasets.put(ds, new ArrayList<SequenceI>());
+       }
+       alignedDatasets.get(ds).add(seq);
       }
   
       /*
@@@ -2671,17 -2659,22 +2674,22 @@@
       }
   
       /*
-      * second pass - copy aligned sequences
+      * second pass - copy aligned sequences;
+      * heuristic rule: pair off sequences in order for the case where 
+      * more than one shares the same dataset sequence 
        */
       for (SequenceI seq : unaligned.getSequences())
       {
-       SequenceI alignedSequence = alignedDatasets.get(seq
+       List<SequenceI> alignedSequences = alignedDatasets.get(seq
                 .getDatasetSequence());
-       // JAL-2110: fail if two or more alignment sequences have common dataset
-       // sequence.
         // TODO: getSequenceAsString() will be deprecated in the future
         // TODO: need to leave to SequenceI implementor to update gaps
-       seq.setSequence(alignedSequence.getSequenceAsString());
+       seq.setSequence(alignedSequences.get(0).getSequenceAsString());
+       if (alignedSequences.size() > 1)
+       {
+         // pop off the used aligned sequence, keeping the last one for extras
+         alignedSequences.remove(0);
+       }
       }
   
       return true;
diff --combined src/jalview/datamodel/xdb/embl/EmblEntry.java

index 8688720,06e929d..3ba36ca
--- 1/src/jalview/datamodel/xdb/embl/EmblEntry.java
--- 2/src/jalview/datamodel/xdb/embl/EmblEntry.java
+++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java
@@@ -49,8 -49,7 +49,7 @@@ import java.util.regex.Pattern
    * Castor binding file
    * 
    * For example:
-  * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321
-  * &format=emblxml
+  * http://www.ebi.ac.uk/ena/data/view/J03321&display=xml
    * 
    * @see embl_mapping.xml
    */
@@@ -188,10 -187,15 +187,14 @@@ public class EmblEntr
     public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
     {
       SequenceI dna = makeSequence(sourceDb);
+     if (dna == null)
+     {
+       return null;
+     }
       dna.setDescription(description);
       DBRefEntry retrievedref = new DBRefEntry(sourceDb,
               getSequenceVersion(), accession);
       dna.addDBRef(retrievedref);
- -    dna.setSourceDBRef(retrievedref);
       // add map to indicate the sequence is a valid coordinate frame for the
       // dbref
       retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
@@@ -239,6 -243,12 +242,12 @@@
      */
     SequenceI makeSequence(String sourceDb)
     {
+     if (sequence == null)
+     {
+       System.err.println("No sequence was returned for ENA accession "
+               + accession);
+       return null;
+     }
       SequenceI dna = new Sequence(sourceDb + "|" + accession,
               sequence.getSequence());
       return dna;
@@@ -494,6 -504,7 +503,6 @@@
               dnaToProteinMapping.setTo(proteinSeq);
               dnaToProteinMapping.setMappedFromId(proteinId);
               proteinSeq.addDBRef(proteinDbRef);
- -            proteinSeq.setSourceDBRef(proteinDbRef);
               ref.setMap(dnaToProteinMapping);
             }
             hasUniprotDbref = true;
@@@ -538,6 -549,7 +547,6 @@@
                   DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId);
         }
         product.addDBRef(proteinToEmblProteinRef);
- -      product.setSourceDBRef(proteinToEmblProteinRef);
   
         if (dnaToProteinMapping != null
                 && dnaToProteinMapping.getTo() != null)
diff --combined test/jalview/analysis/AlignmentUtilsTests.java

index 34ec73b,22bb680..0426091
--- 1/test/jalview/analysis/AlignmentUtilsTests.java
--- 2/test/jalview/analysis/AlignmentUtilsTests.java
+++ b/test/jalview/analysis/AlignmentUtilsTests.java
@@@ -997,11 -997,9 +997,11 @@@ public class AlignmentUtilsTest
        * sequence
        */
       DBRefEntry dbref = new DBRefEntry("ENSEMBL", "0", "dna1");
- -    dna1.getDatasetSequence().setSourceDBRef(dbref);
+ +    dna1.getDatasetSequence().addDBRef(dbref);
+ +    org.testng.Assert.assertEquals(dbref, dna1.getPrimaryDBRefs().get(0));
       dbref = new DBRefEntry("ENSEMBL", "0", "dna2");
- -    dna2.getDatasetSequence().setSourceDBRef(dbref);
+ +    dna2.getDatasetSequence().addDBRef(dbref);
+ +    org.testng.Assert.assertEquals(dbref, dna2.getPrimaryDBRefs().get(0));
   
       /*
        * CDS sequences are 'discovered' from dna-to-protein mappings on the alignment
@@@ -1059,7 -1057,6 +1059,7 @@@
        * verify peptide has added a dbref with reverse mapping to CDS
        */
       assertNotNull(pep1.getDBRefs());
+ +    // FIXME pep1.getDBRefs() is 1 - is that the correct behaviour ?
       assertEquals(2, pep1.getDBRefs().length);
       dbref = pep1.getDBRefs()[1];
       assertEquals("ENSEMBL", dbref.getSource());
@@@ -2452,8 -2449,9 +2452,9 @@@
     {
       SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
       SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
-     SequenceI as1 = dna1.deriveSequence(), as2 = dna1.deriveSequence()
-             .getSubSequence(3, 7), as3 = dna2.deriveSequence();
+     SequenceI as1 = dna1.deriveSequence();
+     SequenceI as2 = dna1.deriveSequence().getSubSequence(3, 7);
+     SequenceI as3 = dna2.deriveSequence();
       as1.insertCharAt(6, 5, '-');
       String s_as1 = as1.getSequenceAsString();
       as2.insertCharAt(6, 5, '-');
@@@ -2464,8 -2462,9 +2465,9 @@@
   
       // why do we need to cast this still ?
       ((Alignment) aligned).createDatasetAlignment();
-     SequenceI uas1 = dna1.deriveSequence(), uas2 = dna1.deriveSequence()
-             .getSubSequence(3, 7), uas3 = dna2.deriveSequence();
+     SequenceI uas1 = dna1.deriveSequence();
+     SequenceI uas2 = dna1.deriveSequence().getSubSequence(3, 7);
+     SequenceI uas3 = dna2.deriveSequence();
       AlignmentI tobealigned = new Alignment(new SequenceI[] { uas1, uas2,
           uas3 });
       ((Alignment) tobealigned).createDatasetAlignment();
diff --combined test/jalview/datamodel/SequenceTest.java

index f586776,cfc4cbb..fcd24dd
--- 1/test/jalview/datamodel/SequenceTest.java
--- 2/test/jalview/datamodel/SequenceTest.java
+++ b/test/jalview/datamodel/SequenceTest.java
@@@ -110,8 -110,7 +110,7 @@@ public class SequenceTes
     {
       AlignmentAnnotation ann1 = addAnnotation("label1", "desc1", "calcId1",
               1f);
-     AlignmentAnnotation ann2 = addAnnotation("label2", "desc2", "calcId2",
-             1f);
+     addAnnotation("label2", "desc2", "calcId2", 1f);
       AlignmentAnnotation ann3 = addAnnotation("label1", "desc3", "calcId3",
               1f);
       AlignmentAnnotation[] anns = seq.getAnnotation("label1");
@@@ -133,16 -132,15 +132,15 @@@
     @Test(groups = { "Functional" })
     public void testGetAlignmentAnnotations_forCalcIdAndLabel()
     {
-     AlignmentAnnotation ann1 = addAnnotation("label1", "desc1", "calcId1",
-             1f);
+     addAnnotation("label1", "desc1", "calcId1", 1f);
       AlignmentAnnotation ann2 = addAnnotation("label2", "desc2", "calcId2",
               1f);
-     AlignmentAnnotation ann3 = addAnnotation("label2", "desc3", "calcId3",
-             1f);
+     addAnnotation("label2", "desc3", "calcId3", 1f);
       AlignmentAnnotation ann4 = addAnnotation("label2", "desc3", "calcId2",
               1f);
-     AlignmentAnnotation ann5 = addAnnotation("label5", "desc3", null, 1f);
-     AlignmentAnnotation ann6 = addAnnotation(null, "desc3", "calcId3", 1f);
+     addAnnotation("label5", "desc3", null, 1f);
+     addAnnotation(null, "desc3", "calcId3", 1f);
+ 
       List<AlignmentAnnotation> anns = seq.getAlignmentAnnotations("calcId2",
               "label2");
       assertEquals(2, anns.size());
@@@ -440,54 -438,36 +438,54 @@@
   
       sq.setDescription("Test sequence description..");
       sq.setVamsasId("TestVamsasId");
- -    sq.setSourceDBRef(new DBRefEntry("PDB", "version0", "1TST"));
+ +    sq.addDBRef(new DBRefEntry("PDB", "version0", "1TST"));
   
- -    sq.addDBRef(new DBRefEntry("PDB", "version1", "1Tst"));
- -    sq.addDBRef(new DBRefEntry("PDB", "version2", "2Tst"));
- -    sq.addDBRef(new DBRefEntry("PDB", "version3", "3Tst"));
- -    sq.addDBRef(new DBRefEntry("PDB", "version4", "4Tst"));
+ +    sq.addDBRef(new DBRefEntry("PDB", "version1", "1PDB"));
+ +    sq.addDBRef(new DBRefEntry("PDB", "version2", "2PDB"));
+ +    sq.addDBRef(new DBRefEntry("PDB", "version3", "3PDB"));
+ +    sq.addDBRef(new DBRefEntry("PDB", "version4", "4PDB"));
   
       sq.addPDBId(new PDBEntry("1PDB", "A", Type.PDB, "filePath/test1"));
       sq.addPDBId(new PDBEntry("1PDB", "B", Type.PDB, "filePath/test1"));
       sq.addPDBId(new PDBEntry("2PDB", "A", Type.MMCIF, "filePath/test2"));
       sq.addPDBId(new PDBEntry("2PDB", "B", Type.MMCIF, "filePath/test2"));
- -
- -    sq.getDatasetSequence().addDBRef(
- -            new DBRefEntry("PDB", "version1", "1Tst"));
+ +    
+ +    DBRefEntry pdb1pdb = new DBRefEntry("PDB", "version1", "1PDB");
+ +    DBRefEntry pdb2pdb = new DBRefEntry("PDB", "version1", "2PDB");
+ +    List<DBRefEntry> primRefs = Arrays.asList(new DBRefEntry[] { pdb1pdb,
+ +        pdb2pdb });
+ +
+ +    sq.getDatasetSequence().addDBRef(pdb1pdb);
+ +    sq.getDatasetSequence().addDBRef(pdb2pdb);
       sq.getDatasetSequence().addDBRef(
- -            new DBRefEntry("PDB", "version2", "2Tst"));
+ +            new DBRefEntry("PDB", "version3", "3PDB"));
       sq.getDatasetSequence().addDBRef(
- -            new DBRefEntry("PDB", "version3", "3Tst"));
- -    sq.getDatasetSequence().addDBRef(
- -            new DBRefEntry("PDB", "version4", "4Tst"));
- -
+ +            new DBRefEntry("PDB", "version4", "4PDB"));
+ +    
+ +    PDBEntry pdbe1a=new PDBEntry("1PDB", "A", Type.PDB, "filePath/test1");
+ +    PDBEntry pdbe1b = new PDBEntry("1PDB", "B", Type.PDB, "filePath/test1");
+ +    PDBEntry pdbe2a=new PDBEntry("2PDB", "A", Type.MMCIF, "filePath/test2");
+ +    PDBEntry pdbe2b = new PDBEntry("2PDB", "B", Type.MMCIF, "filePath/test2");
       sq.getDatasetSequence().addPDBId(
- -            new PDBEntry("1PDB", "A", Type.PDB, "filePath/test1"));
+ +            pdbe1a);
       sq.getDatasetSequence().addPDBId(
- -            new PDBEntry("1PDB", "B", Type.PDB, "filePath/test1"));
- -    sq.getDatasetSequence().addPDBId(
- -            new PDBEntry("2PDB", "A", Type.MMCIF, "filePath/test2"));
- -    sq.getDatasetSequence().addPDBId(
- -            new PDBEntry("2PDB", "B", Type.MMCIF, "filePath/test2"));
+ +            pdbe1b);
+ +    sq.getDatasetSequence().addPDBId(pdbe2a);
+ +    sq.getDatasetSequence().addPDBId(pdbe2b);
+ +
+ +    /*
+ +     * test we added pdb entries to the dataset sequence
+ +     */
+ +    Assert.assertEquals(sq.getDatasetSequence().getAllPDBEntries(), Arrays
+ +            .asList(new PDBEntry[] { pdbe1a, pdbe1b, pdbe2a, pdbe2b }),
+ +            "PDB Entries were not found on dataset sequence.");
   
+ +    /*
+ +     * we should recover a pdb entry that is on the dataset sequence via PDBEntry
+ +     */
+ +    Assert.assertEquals(pdbe1a,
+ +            sq.getDatasetSequence().getPDBEntry("1PDB"),
+ +            "PDB Entry '1PDB' not found on dataset sequence via getPDBEntry.");
       ArrayList<Annotation> annotsList = new ArrayList<Annotation>();
       System.out.println(">>>>>> " + sq.getSequenceAsString().length());
       annotsList.add(new Annotation("A", "A", 'X', 0.1f));
@@@ -499,7 -479,7 +497,7 @@@
               new AlignmentAnnotation("Test annot", "Test annot description",
                       annots));
       Assert.assertEquals(sq.getDescription(), "Test sequence description..");
- -    Assert.assertEquals(sq.getDBRefs().length, 4);
+ +    Assert.assertEquals(sq.getDBRefs().length, 5);
       Assert.assertEquals(sq.getAllPDBEntries().size(), 4);
       Assert.assertNotNull(sq.getAnnotation());
       Assert.assertEquals(sq.getAnnotation()[0].annotations.length, 2);
@@@ -512,7 -492,7 +510,7 @@@
   
       Assert.assertEquals(derived.getDescription(),
               "Test sequence description..");
- -    Assert.assertEquals(derived.getDBRefs().length, 4);
+ +    Assert.assertEquals(derived.getDBRefs().length, 4); // come from dataset
       Assert.assertEquals(derived.getAllPDBEntries().size(), 4);
       Assert.assertNotNull(derived.getAnnotation());
       Assert.assertEquals(derived.getAnnotation()[0].annotations.length, 2);
@@@ -530,17 -510,6 +528,17 @@@
       assertNotNull(sq.getSequenceFeatures());
       assertArrayEquals(sq.getSequenceFeatures(),
               derived.getSequenceFeatures());
+ +    
+ +    /*
+ +     *  verify we have primary db refs *just* for PDB IDs with associated
+ +     *  PDBEntry objects
+ +     */
+ +
+ +    assertEquals(primRefs, sq.getPrimaryDBRefs());
+ +    assertEquals(primRefs, sq.getDatasetSequence().getPrimaryDBRefs());
+ +
+ +    assertEquals(sq.getPrimaryDBRefs(), derived.getPrimaryDBRefs());
+ +
     }
   
     /**
author	Jim Procter <jprocter@issues.jalview.org>
	Thu, 25 Aug 2016 09:54:04 +0000 (10:54 +0100)
committer	Jim Procter <jprocter@issues.jalview.org>
	Thu, 25 Aug 2016 09:54:04 +0000 (10:54 +0100)
		1	2
src/jalview/analysis/AlignmentUtils.java	patch \|	diff1 \|	diff2 \|	blob \| history
src/jalview/datamodel/xdb/embl/EmblEntry.java	patch \|	diff1 \|	diff2 \|	blob \| history
test/jalview/analysis/AlignmentUtilsTests.java	patch \|	diff1 \|	diff2 \|	blob \| history
test/jalview/datamodel/SequenceTest.java	patch \|	diff1 \|	diff2 \|	blob \| history