import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertFalse;
+import static org.testng.AssertJUnit.assertNotNull;
import static org.testng.AssertJUnit.assertNull;
import static org.testng.AssertJUnit.assertSame;
import static org.testng.AssertJUnit.assertTrue;
@Test(groups = { "Functional" })
public void testMakeCdsAlignment()
{
+ /*
+ * scenario:
+ * dna1 --> [4, 6] [10,12] --> pep1
+ * dna2 --> [1, 3] [7, 9] [13,15] --> pep1
+ */
SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
SequenceI pep1 = new Sequence("pep1", "GF");
SequenceI pep2 = new Sequence("pep2", "GFP");
+ pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "pep1"));
+ pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "pep2"));
dna1.createDatasetSequence();
dna2.createDatasetSequence();
pep1.createDatasetSequence();
AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
dna.setDataset(null);
- MapList map = new MapList(new int[] { 4, 6, 10, 12 },
+ /*
+ * need a sourceDbRef if we are to construct dbrefs to the CDS
+ * sequence from the dna contig sequences
+ */
+ DBRefEntry dbref = new DBRefEntry("ENSEMBL", "0", "dna1");
+ dna1.getDatasetSequence().addDBRef(dbref);
+ org.testng.Assert.assertEquals(dbref, dna1.getPrimaryDBRefs().get(0));
+ dbref = new DBRefEntry("ENSEMBL", "0", "dna2");
+ dna2.getDatasetSequence().addDBRef(dbref);
+ org.testng.Assert.assertEquals(dbref, dna2.getPrimaryDBRefs().get(0));
+
+ /*
+ * CDS sequences are 'discovered' from dna-to-protein mappings on the alignment
+ * dataset (e.g. added from dbrefs by CrossRef.findXrefSequences)
+ */
+ MapList mapfordna1 = new MapList(new int[] { 4, 6, 10, 12 },
new int[] { 1, 2 }, 3, 1);
AlignedCodonFrame acf = new AlignedCodonFrame();
- acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
+ acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(),
+ mapfordna1);
dna.addCodonFrame(acf);
- map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
+ MapList mapfordna2 = new MapList(new int[] { 1, 3, 7, 9, 13, 15 },
+ new int[] { 1, 3 },
3, 1);
acf = new AlignedCodonFrame();
- acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
+ acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(),
+ mapfordna2);
dna.addCodonFrame(acf);
/*
+ * In this case, mappings originally came from matching Uniprot accessions - so need an xref on dna involving those regions. These are normally constructed from CDS annotation
+ */
+ DBRefEntry dna1xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep1",
+ new Mapping(mapfordna1));
+ dna1.getDatasetSequence().addDBRef(dna1xref);
+ DBRefEntry dna2xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep2",
+ new Mapping(mapfordna2));
+ dna2.getDatasetSequence().addDBRef(dna2xref);
+
+ /*
* execute method under test:
*/
AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
dna1, dna2 }, dna.getDataset(), null);
+ /*
+ * verify cds sequences
+ */
assertEquals(2, cds.getSequences().size());
assertEquals("GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
assertEquals("GGGTTTCCC", cds.getSequenceAt(1).getSequenceAsString());
* verify shared, extended alignment dataset
*/
assertSame(dna.getDataset(), cds.getDataset());
- assertTrue(dna.getDataset().getSequences()
- .contains(cds.getSequenceAt(0).getDatasetSequence()));
- assertTrue(dna.getDataset().getSequences()
- .contains(cds.getSequenceAt(1).getDatasetSequence()));
+ SequenceI cds1Dss = cds.getSequenceAt(0).getDatasetSequence();
+ SequenceI cds2Dss = cds.getSequenceAt(1).getDatasetSequence();
+ assertTrue(dna.getDataset().getSequences().contains(cds1Dss));
+ assertTrue(dna.getDataset().getSequences().contains(cds2Dss));
+
+ /*
+ * verify CDS has a dbref with mapping to peptide
+ */
+ assertNotNull(cds1Dss.getDBRefs());
+ assertEquals(2, cds1Dss.getDBRefs().length);
+ dbref = cds1Dss.getDBRefs()[0];
+ assertEquals(dna1xref.getSource(), dbref.getSource());
+ // version is via ensembl's primary ref
+ assertEquals(dna1xref.getVersion(), dbref.getVersion());
+ assertEquals(dna1xref.getAccessionId(), dbref.getAccessionId());
+ assertNotNull(dbref.getMap());
+ assertSame(pep1.getDatasetSequence(), dbref.getMap().getTo());
+ MapList cdsMapping = new MapList(new int[] { 1, 6 },
+ new int[] { 1, 2 }, 3, 1);
+ assertEquals(cdsMapping, dbref.getMap().getMap());
/*
- * verify cds has dbref with mapping to protein and vice versa
+ * verify peptide has added a dbref with reverse mapping to CDS
*/
- DBRefEntry[] cdsDbrefs = cds.getSequenceAt(0).getDBRefs();
- // assertNotNull(cdsDbrefs);
- // assertEquals(1, cdsDbrefs.length);
- // assertNotNull(cdsDbrefs[0].getMap());
+ assertNotNull(pep1.getDBRefs());
+ // FIXME pep1.getDBRefs() is 1 - is that the correct behaviour ?
+ assertEquals(2, pep1.getDBRefs().length);
+ dbref = pep1.getDBRefs()[1];
+ assertEquals("ENSEMBL", dbref.getSource());
+ assertEquals("0", dbref.getVersion());
+ assertEquals("CDS|dna1", dbref.getAccessionId());
+ assertNotNull(dbref.getMap());
+ assertSame(cds1Dss, dbref.getMap().getTo());
+ assertEquals(cdsMapping.getInverse(), dbref.getMap().getMap());
/*
* Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide
SearchResults sr = MappingUtils.buildSearchResults(pep1, 1, mappings);
assertEquals(1, sr.getResults().size());
Match m = sr.getResults().get(0);
- assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence());
+ assertSame(cds1Dss, m.getSequence());
assertEquals(1, m.getStart());
assertEquals(3, m.getEnd());
// map F to TTT
sr = MappingUtils.buildSearchResults(pep1, 2, mappings);
m = sr.getResults().get(0);
- assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence());
+ assertSame(cds1Dss, m.getSequence());
assertEquals(4, m.getStart());
assertEquals(6, m.getEnd());
sr = MappingUtils.buildSearchResults(pep2, 1, mappings);
assertEquals(1, sr.getResults().size());
m = sr.getResults().get(0);
- assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
+ assertSame(cds2Dss, m.getSequence());
assertEquals(1, m.getStart());
assertEquals(3, m.getEnd());
// map F to TTT
sr = MappingUtils.buildSearchResults(pep2, 2, mappings);
m = sr.getResults().get(0);
- assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
+ assertSame(cds2Dss, m.getSequence());
assertEquals(4, m.getStart());
assertEquals(6, m.getEnd());
// map P to CCC
sr = MappingUtils.buildSearchResults(pep2, 3, mappings);
m = sr.getResults().get(0);
- assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
+ assertSame(cds2Dss, m.getSequence());
assertEquals(7, m.getStart());
assertEquals(9, m.getEnd());
}
SequenceI cdsSeq = cds.get(0);
assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
// assertEquals("dna1|A12345", cdsSeq.getName());
- assertEquals("dna1|pep1", cdsSeq.getName());
+ assertEquals("CDS|dna1", cdsSeq.getName());
// assertEquals(1, cdsSeq.getDBRefs().length);
// DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
// assertEquals("EMBLCDS", cdsRef.getSource());
cdsSeq = cds.get(1);
assertEquals("aaaccc", cdsSeq.getSequenceAsString());
// assertEquals("dna1|A12346", cdsSeq.getName());
- assertEquals("dna1|pep2", cdsSeq.getName());
+ assertEquals("CDS|dna1", cdsSeq.getName());
// assertEquals(1, cdsSeq.getDBRefs().length);
// cdsRef = cdsSeq.getDBRefs()[0];
// assertEquals("EMBLCDS", cdsRef.getSource());
cdsSeq = cds.get(2);
assertEquals("aaaTTT", cdsSeq.getSequenceAsString());
// assertEquals("dna1|A12347", cdsSeq.getName());
- assertEquals("dna1|pep3", cdsSeq.getName());
+ assertEquals("CDS|dna1", cdsSeq.getName());
// assertEquals(1, cdsSeq.getDBRefs().length);
// cdsRef = cdsSeq.getDBRefs()[0];
// assertEquals("EMBLCDS", cdsRef.getSource());
public void testComputePeptideVariants()
{
/*
- * scenario: AAATTTCCC codes for KFP, with variants
- * GAA -> E
- * CAA -> Q
- * AAG synonymous
- * AAT -> N
- * TTC synonymous
- * CAC,CGC -> H,R (as one variant)
+ * scenario: AAATTTCCC codes for KFP
+ * variants:
+ * GAA -> E source: Ensembl
+ * CAA -> Q source: dbSNP
+ * AAG synonymous source: COSMIC
+ * AAT -> N source: Ensembl
+ * ...TTC synonymous source: dbSNP
+ * ......CAC,CGC -> H,R source: COSMIC
+ * (one variant with two alleles)
*/
SequenceI peptide = new Sequence("pep/10-12", "KFP");
* two distinct variants for codon 1 position 1
* second one has clinical significance
*/
+ String ensembl = "Ensembl";
+ String dbSnp = "dbSNP";
+ String cosmic = "COSMIC";
SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
- 0f, null);
+ 0f, ensembl);
sf1.setValue("alleles", "A,G"); // GAA -> E
sf1.setValue("ID", "var1.125A>G");
SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1,
- 0f, null);
+ 0f, dbSnp);
sf2.setValue("alleles", "A,C"); // CAA -> Q
sf2.setValue("ID", "var2");
sf2.setValue("clinical_significance", "Dodgy");
SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 3, 3,
- 0f, null);
+ 0f, cosmic);
sf3.setValue("alleles", "A,G"); // synonymous
sf3.setValue("ID", "var3");
sf3.setValue("clinical_significance", "None");
SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3,
- 0f, null);
+ 0f, ensembl);
sf4.setValue("alleles", "A,T"); // AAT -> N
sf4.setValue("ID", "sequence_variant:var4"); // prefix gets stripped off
sf4.setValue("clinical_significance", "Benign");
SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 6, 6,
- 0f, null);
+ 0f, dbSnp);
sf5.setValue("alleles", "T,C"); // synonymous
sf5.setValue("ID", "var5");
sf5.setValue("clinical_significance", "Bad");
SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 8, 8,
- 0f, null);
+ 0f, cosmic);
sf6.setValue("alleles", "C,A,G"); // CAC,CGC -> H,R
sf6.setValue("ID", "var6");
sf6.setValue("clinical_significance", "Good");
/*
* verify added sequence features for
- * var1 K -> E
- * var2 K -> Q
- * var4 K -> N
- * var6 P -> H
- * var6 P -> R
+ * var1 K -> E Ensembl
+ * var2 K -> Q dbSNP
+ * var4 K -> N Ensembl
+ * var6 P -> H COSMIC
+ * var6 P -> R COSMIC
*/
SequenceFeature[] sfs = peptide.getSequenceFeatures();
assertEquals(5, sfs.length);
+
SequenceFeature sf = sfs[0];
assertEquals(1, sf.getBegin());
assertEquals(1, sf.getEnd());
assertEquals(
"p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG",
sf.links.get(0));
- assertEquals("Jalview", sf.getFeatureGroup());
+ assertEquals(ensembl, sf.getFeatureGroup());
+
sf = sfs[1];
assertEquals(1, sf.getBegin());
assertEquals(1, sf.getEnd());
assertEquals(
"p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2",
sf.links.get(0));
- assertEquals("Jalview", sf.getFeatureGroup());
+ assertEquals(dbSnp, sf.getFeatureGroup());
+
sf = sfs[2];
assertEquals(1, sf.getBegin());
assertEquals(1, sf.getEnd());
assertEquals(
"p.Lys1Asn var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4",
sf.links.get(0));
- assertEquals("Jalview", sf.getFeatureGroup());
+ assertEquals(ensembl, sf.getFeatureGroup());
+
+ // var6 generates two distinct protein variant features
sf = sfs[3];
assertEquals(3, sf.getBegin());
assertEquals(3, sf.getEnd());
assertEquals(
"p.Pro3His var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
sf.links.get(0));
- // var5 generates two distinct protein variant features
- assertEquals("Jalview", sf.getFeatureGroup());
+ assertEquals(cosmic, sf.getFeatureGroup());
+
sf = sfs[4];
assertEquals(3, sf.getBegin());
assertEquals(3, sf.getEnd());
assertEquals(
"p.Pro3Arg var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
sf.links.get(0));
- assertEquals("Jalview", sf.getFeatureGroup());
+ assertEquals(cosmic, sf.getFeatureGroup());
}
/**
* execute method under test to find CDS for EMBL peptides only
*/
AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
- dna1, dna2 }, dna.getDataset(), emblPeptides);
+ dna1, dna2 }, dna.getDataset(), emblPeptides.getSequencesArray());
assertEquals(2, cds.getSequences().size());
assertEquals("GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
assertEquals(7, m.getStart());
assertEquals(9, m.getEnd());
}
+
+ /**
+ * Tests AlignmentUtils.alignAsSameSequences(unaligned, aligned): each unaligned
+ * sequence should take the gapped form of the aligned sequence sharing its dataset
+ */
+ @Test(groups = "Functional")
+ public void testAlignAsSameSequences()
+ {
+ SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
+ SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
+ AlignmentI al1 = new Alignment(new SequenceI[] { dna1, dna2 });
+ ((Alignment) al1).createDatasetAlignment();
+ // dna3, dna4: copies of dna1, dna2 on the same dataset sequences, given gapped forms below
+ SequenceI dna3 = new Sequence(dna1);
+ SequenceI dna4 = new Sequence(dna2);
+ assertSame(dna3.getDatasetSequence(), dna1.getDatasetSequence());
+ assertSame(dna4.getDatasetSequence(), dna2.getDatasetSequence());
+ String seq1 = "-cc-GG-GT-TT--aaa";
+ dna3.setSequence(seq1);
+ String seq2 = "C--C-Cgg--gtt-tAA-A-";
+ dna4.setSequence(seq2);
+ AlignmentI al2 = new Alignment(new SequenceI[] { dna3, dna4 });
+ ((Alignment) al2).createDatasetAlignment();
+ // al1 plays the 'unaligned' role and al2 the 'aligned' role; al1 should pick up al2's gaps
+ assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2));
+ assertEquals(seq1, al1.getSequenceAt(0).getSequenceAsString());
+ assertEquals(seq2, al1.getSequenceAt(1).getSequenceAsString());
+
+ /*
+ * add another sequence (dna5, with its own dataset sequence) to 'aligned'
+ * (al2) - should still succeed, since every unaligned sequence still
+ * shares a dataset sequence with an aligned sequence
+ */
+ SequenceI dna5 = new Sequence("dna5", "CCCgggtttAAA");
+ dna5.createDatasetSequence();
+ al2.addSequence(dna5);
+ assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2));
+ assertEquals(seq1, al1.getSequenceAt(0).getSequenceAsString());
+ assertEquals(seq2, al1.getSequenceAt(1).getSequenceAsString());
+
+ /*
+ * add another sequence (dna6, with its own dataset sequence) to 'unaligned'
+ * (al1) - should fail, since now not all unaligned sequences share a
+ * dataset sequence with an aligned sequence
+ */
+ SequenceI dna6 = new Sequence("dna6", "CCCgggtttAAA");
+ dna6.createDatasetSequence();
+ al1.addSequence(dna6);
+ // JAL-2110 JBP Comment: what's the use case for this behaviour ?
+ assertFalse(AlignmentUtils.alignAsSameSequences(al1, al2));
+ }
+ /** Tests alignAsSameSequences when two sequences are derived from dna1 (one full-length, one sub-sequence 3-7) plus one from dna2 */
+ @Test(groups = "Functional")
+ public void testAlignAsSameSequencesMultipleSubSeq()
+ {
+ SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
+ SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
+ SequenceI as1 = dna1.deriveSequence();
+ SequenceI as2 = dna1.deriveSequence().getSubSequence(3, 7);
+ SequenceI as3 = dna2.deriveSequence();
+ as1.insertCharAt(6, 5, '-');
+ String s_as1 = as1.getSequenceAsString();
+ as2.insertCharAt(6, 5, '-');
+ String s_as2 = as2.getSequenceAsString();
+ as3.insertCharAt(6, 5, '-');
+ String s_as3 = as3.getSequenceAsString();
+ AlignmentI aligned = new Alignment(new SequenceI[] { as1, as2, as3 });
+ // s_as1..s_as3 hold the gapped strings the ungapped copies below are expected to end up with
+ // why do we need to cast this still ?
+ ((Alignment) aligned).createDatasetAlignment();
+ SequenceI uas1 = dna1.deriveSequence();
+ SequenceI uas2 = dna1.deriveSequence().getSubSequence(3, 7);
+ SequenceI uas3 = dna2.deriveSequence();
+ AlignmentI tobealigned = new Alignment(new SequenceI[] { uas1, uas2,
+ uas3 });
+ ((Alignment) tobealigned).createDatasetAlignment();
+ // align the ungapped copies to 'aligned' and check each now matches its gapped counterpart
+ assertTrue(AlignmentUtils.alignAsSameSequences(tobealigned, aligned));
+ assertEquals(s_as1, uas1.getSequenceAsString());
+ assertEquals(s_as2, uas2.getSequenceAsString());
+ assertEquals(s_as3, uas3.getSequenceAsString());
+ }
+
}