Merge branch 'develop' into refactor/JAL-2106_sourceDbRef_revision
author Jim Procter <jprocter@issues.jalview.org>
Thu, 25 Aug 2016 09:54:04 +0000 (10:54 +0100)
committer Jim Procter <jprocter@issues.jalview.org>
Thu, 25 Aug 2016 09:54:04 +0000 (10:54 +0100)
1  2 
src/jalview/analysis/AlignmentUtils.java
src/jalview/datamodel/xdb/embl/EmblEntry.java
test/jalview/analysis/AlignmentUtilsTests.java
test/jalview/datamodel/SequenceTest.java

@@@ -22,6 -22,7 +22,6 @@@ package jalview.analysis
  
  import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE;
  
 -import jalview.api.DBRefEntryI;
  import jalview.datamodel.AlignedCodon;
  import jalview.datamodel.AlignedCodonFrame;
  import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
@@@ -1681,10 -1682,6 +1681,10 @@@ public class AlignmentUtil
             * its dataset sequence to the dataset
             */
            cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping);
 +          // cdsSeq has a name constructed as CDS|<dbref>
 +          // <dbref> will be either the accession for the coding sequence,
 +          // marked in the /via/ dbref to the protein product accession
 +          // or it will be the original nucleotide accession.
            SequenceI cdsSeqDss = cdsSeq.createDatasetSequence();
            cdsSeqs.add(cdsSeq);
            if (!dataset.getSequences().contains(cdsSeqDss))
             * same source and accession, so need a different accession for
             * the CDS from the dna sequence
             */
 -          DBRefEntryI dnaRef = dnaDss.getSourceDBRef();
 -          if (dnaRef != null)
 -          {
 -            // assuming cds version same as dna ?!?
 -            DBRefEntry proteinToCdsRef = new DBRefEntry(dnaRef.getSource(),
 -                    dnaRef.getVersion(), cdsSeq.getName());
 -            proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap
 -                    .getInverse()));
 -            proteinProduct.addDBRef(proteinToCdsRef);
 -          }
 +          // specific use case:
 +          // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
 +          // ENSG02, ENSG03, with transcripts and products similarly named.
 +          // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
 +          // JBPNote: ?? can't actually create an example that demonstrates we
 +          // need to
 +          // synthesize an xref.
 +          // TODO: merge conflicts from JAL-2154 branch and use PrimaryDBRefs()
 +          // for (DBRefEntry primRef:dnaDss.getPrimaryDBRefs())
 +          // {
 +          // creates a complementary cross-reference to the source sequence's
 +          // primary reference.
 +
 +          // // problem here is that the cross-reference is synthesized -
 +          // cdsSeq.getName() may be like 'CDS|dnaaccession' or 'CDS|emblcdsacc'
 +          // // assuming cds version same as dna ?!?
 +          // DBRefEntry proteinToCdsRef = new DBRefEntry(dnaRef.getSource(),
 +          // dnaRef.getVersion(), cdsSeq.getName());
 +          // proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap
 +          // .getInverse()));
 +          // proteinProduct.addDBRef(proteinToCdsRef);
 +          // }
  
            /*
             * transfer any features on dna that overlap the CDS
        return false; // should only pass alignments with datasets here
      }
  
-     // map from dataset sequence to alignment sequence
-     Map<SequenceI, SequenceI> alignedDatasets = new HashMap<SequenceI, SequenceI>();
+     // map from dataset sequence to alignment sequence(s)
+     Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<SequenceI, List<SequenceI>>();
      for (SequenceI seq : aligned.getSequences())
      {
-       // JAL-2110: fail if two or more alignment sequences have a common dataset
-       // sequence.
-       alignedDatasets.put(seq.getDatasetSequence(), seq);
+       SequenceI ds = seq.getDatasetSequence();
+       if (alignedDatasets.get(ds) == null)
+       {
+         alignedDatasets.put(ds, new ArrayList<SequenceI>());
+       }
+       alignedDatasets.get(ds).add(seq);
      }
  
      /*
      }
  
      /*
-      * second pass - copy aligned sequences
+      * second pass - copy aligned sequences;
+      * heuristic rule: pair off sequences in order for the case where 
+      * more than one shares the same dataset sequence 
       */
      for (SequenceI seq : unaligned.getSequences())
      {
-       SequenceI alignedSequence = alignedDatasets.get(seq
+       List<SequenceI> alignedSequences = alignedDatasets.get(seq
                .getDatasetSequence());
-       // JAL-2110: fail if two or more alignment sequences have common dataset
-       // sequence.
        // TODO: getSequenceAsString() will be deprecated in the future
        // TODO: need to leave to SequenceI implementor to update gaps
-       seq.setSequence(alignedSequence.getSequenceAsString());
+       seq.setSequence(alignedSequences.get(0).getSequenceAsString());
+       if (alignedSequences.size() > 0)
+       {
+         // pop off aligned sequences (except the last one)
+         alignedSequences.remove(0);
+       }
      }
  
      return true;
@@@ -49,8 -49,7 +49,7 @@@ import java.util.regex.Pattern
   * Castor binding file
   * 
   * For example:
-  * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321
-  * &format=emblxml
+  * http://www.ebi.ac.uk/ena/data/view/J03321&display=xml
   * 
   * @see embl_mapping.xml
   */
@@@ -188,10 -187,15 +187,14 @@@ public class EmblEntr
    public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
    {
      SequenceI dna = makeSequence(sourceDb);
+     if (dna == null)
+     {
+       return null;
+     }
      dna.setDescription(description);
      DBRefEntry retrievedref = new DBRefEntry(sourceDb,
              getSequenceVersion(), accession);
      dna.addDBRef(retrievedref);
 -    dna.setSourceDBRef(retrievedref);
      // add map to indicate the sequence is a valid coordinate frame for the
      // dbref
      retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
     */
    SequenceI makeSequence(String sourceDb)
    {
+     if (sequence == null)
+     {
+       System.err.println("No sequence was returned for ENA accession "
+               + accession);
+       return null;
+     }
      SequenceI dna = new Sequence(sourceDb + "|" + accession,
              sequence.getSequence());
      return dna;
              dnaToProteinMapping.setTo(proteinSeq);
              dnaToProteinMapping.setMappedFromId(proteinId);
              proteinSeq.addDBRef(proteinDbRef);
 -            proteinSeq.setSourceDBRef(proteinDbRef);
              ref.setMap(dnaToProteinMapping);
            }
            hasUniprotDbref = true;
                  DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId);
        }
        product.addDBRef(proteinToEmblProteinRef);
 -      product.setSourceDBRef(proteinToEmblProteinRef);
  
        if (dnaToProteinMapping != null
                && dnaToProteinMapping.getTo() != null)
@@@ -997,11 -997,9 +997,11 @@@ public class AlignmentUtilsTest
       * sequence
       */
      DBRefEntry dbref = new DBRefEntry("ENSEMBL", "0", "dna1");
 -    dna1.getDatasetSequence().setSourceDBRef(dbref);
 +    dna1.getDatasetSequence().addDBRef(dbref);
 +    org.testng.Assert.assertEquals(dbref, dna1.getPrimaryDBRefs().get(0));
      dbref = new DBRefEntry("ENSEMBL", "0", "dna2");
 -    dna2.getDatasetSequence().setSourceDBRef(dbref);
 +    dna2.getDatasetSequence().addDBRef(dbref);
 +    org.testng.Assert.assertEquals(dbref, dna2.getPrimaryDBRefs().get(0));
  
      /*
       * CDS sequences are 'discovered' from dna-to-protein mappings on the alignment
       * verify peptide has added a dbref with reverse mapping to CDS
       */
      assertNotNull(pep1.getDBRefs());
 +    // FIXME pep1.getDBRefs() is 1 - is that the correct behaviour ?
      assertEquals(2, pep1.getDBRefs().length);
      dbref = pep1.getDBRefs()[1];
      assertEquals("ENSEMBL", dbref.getSource());
    {
      SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
      SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
-     SequenceI as1 = dna1.deriveSequence(), as2 = dna1.deriveSequence()
-             .getSubSequence(3, 7), as3 = dna2.deriveSequence();
+     SequenceI as1 = dna1.deriveSequence();
+     SequenceI as2 = dna1.deriveSequence().getSubSequence(3, 7);
+     SequenceI as3 = dna2.deriveSequence();
      as1.insertCharAt(6, 5, '-');
      String s_as1 = as1.getSequenceAsString();
      as2.insertCharAt(6, 5, '-');
  
      // why do we need to cast this still ?
      ((Alignment) aligned).createDatasetAlignment();
-     SequenceI uas1 = dna1.deriveSequence(), uas2 = dna1.deriveSequence()
-             .getSubSequence(3, 7), uas3 = dna2.deriveSequence();
+     SequenceI uas1 = dna1.deriveSequence();
+     SequenceI uas2 = dna1.deriveSequence().getSubSequence(3, 7);
+     SequenceI uas3 = dna2.deriveSequence();
      AlignmentI tobealigned = new Alignment(new SequenceI[] { uas1, uas2,
          uas3 });
      ((Alignment) tobealigned).createDatasetAlignment();
@@@ -110,8 -110,7 +110,7 @@@ public class SequenceTes
    {
      AlignmentAnnotation ann1 = addAnnotation("label1", "desc1", "calcId1",
              1f);
-     AlignmentAnnotation ann2 = addAnnotation("label2", "desc2", "calcId2",
-             1f);
+     addAnnotation("label2", "desc2", "calcId2", 1f);
      AlignmentAnnotation ann3 = addAnnotation("label1", "desc3", "calcId3",
              1f);
      AlignmentAnnotation[] anns = seq.getAnnotation("label1");
    @Test(groups = { "Functional" })
    public void testGetAlignmentAnnotations_forCalcIdAndLabel()
    {
-     AlignmentAnnotation ann1 = addAnnotation("label1", "desc1", "calcId1",
-             1f);
+     addAnnotation("label1", "desc1", "calcId1", 1f);
      AlignmentAnnotation ann2 = addAnnotation("label2", "desc2", "calcId2",
              1f);
-     AlignmentAnnotation ann3 = addAnnotation("label2", "desc3", "calcId3",
-             1f);
+     addAnnotation("label2", "desc3", "calcId3", 1f);
      AlignmentAnnotation ann4 = addAnnotation("label2", "desc3", "calcId2",
              1f);
-     AlignmentAnnotation ann5 = addAnnotation("label5", "desc3", null, 1f);
-     AlignmentAnnotation ann6 = addAnnotation(null, "desc3", "calcId3", 1f);
+     addAnnotation("label5", "desc3", null, 1f);
+     addAnnotation(null, "desc3", "calcId3", 1f);
      List<AlignmentAnnotation> anns = seq.getAlignmentAnnotations("calcId2",
              "label2");
      assertEquals(2, anns.size());
  
      sq.setDescription("Test sequence description..");
      sq.setVamsasId("TestVamsasId");
 -    sq.setSourceDBRef(new DBRefEntry("PDB", "version0", "1TST"));
 +    sq.addDBRef(new DBRefEntry("PDB", "version0", "1TST"));
  
 -    sq.addDBRef(new DBRefEntry("PDB", "version1", "1Tst"));
 -    sq.addDBRef(new DBRefEntry("PDB", "version2", "2Tst"));
 -    sq.addDBRef(new DBRefEntry("PDB", "version3", "3Tst"));
 -    sq.addDBRef(new DBRefEntry("PDB", "version4", "4Tst"));
 +    sq.addDBRef(new DBRefEntry("PDB", "version1", "1PDB"));
 +    sq.addDBRef(new DBRefEntry("PDB", "version2", "2PDB"));
 +    sq.addDBRef(new DBRefEntry("PDB", "version3", "3PDB"));
 +    sq.addDBRef(new DBRefEntry("PDB", "version4", "4PDB"));
  
      sq.addPDBId(new PDBEntry("1PDB", "A", Type.PDB, "filePath/test1"));
      sq.addPDBId(new PDBEntry("1PDB", "B", Type.PDB, "filePath/test1"));
      sq.addPDBId(new PDBEntry("2PDB", "A", Type.MMCIF, "filePath/test2"));
      sq.addPDBId(new PDBEntry("2PDB", "B", Type.MMCIF, "filePath/test2"));
 -
 -    sq.getDatasetSequence().addDBRef(
 -            new DBRefEntry("PDB", "version1", "1Tst"));
 +    
 +    DBRefEntry pdb1pdb = new DBRefEntry("PDB", "version1", "1PDB");
 +    DBRefEntry pdb2pdb = new DBRefEntry("PDB", "version1", "2PDB");
 +    List<DBRefEntry> primRefs = Arrays.asList(new DBRefEntry[] { pdb1pdb,
 +        pdb2pdb });
 +
 +    sq.getDatasetSequence().addDBRef(pdb1pdb);
 +    sq.getDatasetSequence().addDBRef(pdb2pdb);
      sq.getDatasetSequence().addDBRef(
 -            new DBRefEntry("PDB", "version2", "2Tst"));
 +            new DBRefEntry("PDB", "version3", "3PDB"));
      sq.getDatasetSequence().addDBRef(
 -            new DBRefEntry("PDB", "version3", "3Tst"));
 -    sq.getDatasetSequence().addDBRef(
 -            new DBRefEntry("PDB", "version4", "4Tst"));
 -
 +            new DBRefEntry("PDB", "version4", "4PDB"));
 +    
 +    PDBEntry pdbe1a=new PDBEntry("1PDB", "A", Type.PDB, "filePath/test1");
 +    PDBEntry pdbe1b = new PDBEntry("1PDB", "B", Type.PDB, "filePath/test1");
 +    PDBEntry pdbe2a=new PDBEntry("2PDB", "A", Type.MMCIF, "filePath/test2");
 +    PDBEntry pdbe2b = new PDBEntry("2PDB", "B", Type.MMCIF, "filePath/test2");
      sq.getDatasetSequence().addPDBId(
 -            new PDBEntry("1PDB", "A", Type.PDB, "filePath/test1"));
 +            pdbe1a);
      sq.getDatasetSequence().addPDBId(
 -            new PDBEntry("1PDB", "B", Type.PDB, "filePath/test1"));
 -    sq.getDatasetSequence().addPDBId(
 -            new PDBEntry("2PDB", "A", Type.MMCIF, "filePath/test2"));
 -    sq.getDatasetSequence().addPDBId(
 -            new PDBEntry("2PDB", "B", Type.MMCIF, "filePath/test2"));
 +            pdbe1b);
 +    sq.getDatasetSequence().addPDBId(pdbe2a);
 +    sq.getDatasetSequence().addPDBId(pdbe2b);
 +
 +    /*
 +     * test we added pdb entries to the dataset sequence
 +     */
 +    Assert.assertEquals(sq.getDatasetSequence().getAllPDBEntries(), Arrays
 +            .asList(new PDBEntry[] { pdbe1a, pdbe1b, pdbe2a, pdbe2b }),
 +            "PDB Entries were not found on dataset sequence.");
  
 +    /*
 +     * we should recover a pdb entry that is on the dataset sequence via PDBEntry
 +     */
 +    Assert.assertEquals(pdbe1a,
 +            sq.getDatasetSequence().getPDBEntry("1PDB"),
 +            "PDB Entry '1PDB' not found on dataset sequence via getPDBEntry.");
      ArrayList<Annotation> annotsList = new ArrayList<Annotation>();
      System.out.println(">>>>>> " + sq.getSequenceAsString().length());
      annotsList.add(new Annotation("A", "A", 'X', 0.1f));
              new AlignmentAnnotation("Test annot", "Test annot description",
                      annots));
      Assert.assertEquals(sq.getDescription(), "Test sequence description..");
 -    Assert.assertEquals(sq.getDBRefs().length, 4);
 +    Assert.assertEquals(sq.getDBRefs().length, 5);
      Assert.assertEquals(sq.getAllPDBEntries().size(), 4);
      Assert.assertNotNull(sq.getAnnotation());
      Assert.assertEquals(sq.getAnnotation()[0].annotations.length, 2);
  
      Assert.assertEquals(derived.getDescription(),
              "Test sequence description..");
 -    Assert.assertEquals(derived.getDBRefs().length, 4);
 +    Assert.assertEquals(derived.getDBRefs().length, 4); // come from dataset
      Assert.assertEquals(derived.getAllPDBEntries().size(), 4);
      Assert.assertNotNull(derived.getAnnotation());
      Assert.assertEquals(derived.getAnnotation()[0].annotations.length, 2);
      assertNotNull(sq.getSequenceFeatures());
      assertArrayEquals(sq.getSequenceFeatures(),
              derived.getSequenceFeatures());
 +    
 +    /*
 +     *  verify we have primary db refs *just* for PDB IDs with associated
 +     *  PDBEntry objects
 +     */
 +
 +    assertEquals(primRefs, sq.getPrimaryDBRefs());
 +    assertEquals(primRefs, sq.getDatasetSequence().getPrimaryDBRefs());
 +
 +    assertEquals(sq.getPrimaryDBRefs(), derived.getPrimaryDBRefs());
 +
    }
  
    /**