X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=test%2Fjalview%2Fanalysis%2FAlignmentUtilsTests.java;h=70ae6a03bfecf2c7a3cfabf7bec98314ee11d534;hb=3c8a25936a2d805e7e3d7ab82f83b13135406d18;hp=8a667b39c2842c29e29bdc8b594bc9d784fd8d16;hpb=5f4e1e4c330b045e9c8bce28ee132a0fca3834d8;p=jalview.git diff --git a/test/jalview/analysis/AlignmentUtilsTests.java b/test/jalview/analysis/AlignmentUtilsTests.java index 8a667b3..70ae6a0 100644 --- a/test/jalview/analysis/AlignmentUtilsTests.java +++ b/test/jalview/analysis/AlignmentUtilsTests.java @@ -34,15 +34,21 @@ import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.Annotation; import jalview.datamodel.DBRefEntry; +import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResultMatchI; import jalview.datamodel.SearchResultsI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.datamodel.features.SequenceFeatures; import jalview.gui.JvOptionPane; import jalview.io.AppletFormatAdapter; +import jalview.io.DataSourceType; +import jalview.io.FileFormat; +import jalview.io.FileFormatI; import jalview.io.FormatAdapter; +import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import jalview.util.MappingUtils; @@ -59,6 +65,8 @@ import org.testng.annotations.Test; public class AlignmentUtilsTests { + private static Sequence ts = new Sequence("short", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm"); @BeforeClass(alwaysRun = true) public void setUpJvOptionPane() @@ -67,9 +75,6 @@ public class AlignmentUtilsTests JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION); } - public static Sequence ts = new Sequence("short", - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm"); - @Test(groups = { "Functional" }) public void testExpandContext() { @@ -79,14 +84,15 @@ public class AlignmentUtilsTests SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 
7); al.addSequence(s1); } - System.out.println(new AppletFormatAdapter().formatSequences("Clustal", + System.out.println(new AppletFormatAdapter().formatSequences( + FileFormat.Clustal, al, true)); for (int flnk = -1; flnk < 25; flnk++) { AlignmentI exp = AlignmentUtils.expandContext(al, flnk); System.out.println("\nFlank size: " + flnk); System.out.println(new AppletFormatAdapter().formatSequences( - "Clustal", exp, true)); + FileFormat.Clustal, exp, true)); if (flnk == -1) { /* @@ -219,7 +225,7 @@ public class AlignmentUtilsTests { final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n" + ">Seq1Name\nABCD\n"; - AlignmentI al = loadAlignment(data, "FASTA"); + AlignmentI al = loadAlignment(data, FileFormat.Fasta); Map> map = AlignmentUtils .getSequencesByName(al); assertEquals(2, map.keySet().size()); @@ -239,11 +245,11 @@ public class AlignmentUtilsTests * @return * @throws IOException */ - protected AlignmentI loadAlignment(final String data, String format) + protected AlignmentI loadAlignment(final String data, FileFormatI format) throws IOException { AlignmentI a = new FormatAdapter().readFile(data, - AppletFormatAdapter.PASTE, format); + DataSourceType.PASTE, format); a.setDataset(null); return a; } @@ -258,14 +264,14 @@ public class AlignmentUtilsTests @Test(groups = { "Functional" }) public void testMapProteinAlignmentToCdna_noXrefs() throws IOException { - List protseqs = new ArrayList(); + List protseqs = new ArrayList<>(); protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); protein.setDataset(null); - List dnaseqs = new ArrayList(); + List dnaseqs = new ArrayList<>(); dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ @@ -502,7 +508,7 
@@ public class AlignmentUtilsTests acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map); acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map); acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map); - ArrayList acfs = new ArrayList(); + ArrayList acfs = new ArrayList<>(); acfs.add(acf); protein.setCodonFrames(acfs); @@ -600,14 +606,14 @@ public class AlignmentUtilsTests public void testMapProteinAlignmentToCdna_withStartAndStopCodons() throws IOException { - List protseqs = new ArrayList(); + List protseqs = new ArrayList<>(); protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); protein.setDataset(null); - List dnaseqs = new ArrayList(); + List dnaseqs = new ArrayList<>(); // start + SAR: dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC")); // = EIQ + stop @@ -692,14 +698,14 @@ public class AlignmentUtilsTests @Test(groups = { "Functional" }) public void testMapProteinAlignmentToCdna_withXrefs() throws IOException { - List protseqs = new ArrayList(); + List protseqs = new ArrayList<>(); protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); protein.setDataset(null); - List dnaseqs = new ArrayList(); + List dnaseqs = new ArrayList<>(); dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ @@ -769,14 +775,14 @@ public class AlignmentUtilsTests public void testMapProteinAlignmentToCdna_prioritiseXrefs() throws IOException { - List protseqs = new ArrayList(); + List protseqs = new ArrayList<>(); protseqs.add(new 
Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); AlignmentI protein = new Alignment( protseqs.toArray(new SequenceI[protseqs.size()])); protein.setDataset(null); - List dnaseqs = new ArrayList(); + List dnaseqs = new ArrayList<>(); dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs @@ -843,8 +849,8 @@ public class AlignmentUtilsTests al.addAnnotation(ann4); // Temp for seq1 al.addAnnotation(ann5); // Temp for seq2 al.addAnnotation(ann6); // Temp for no sequence - List types = new ArrayList(); - List scope = new ArrayList(); + List types = new ArrayList<>(); + List scope = new ArrayList<>(); /* * Set all sequence related Structure to hidden (ann1, ann2) @@ -1039,14 +1045,18 @@ public class AlignmentUtilsTests dna.addCodonFrame(acf); /* - * In this case, mappings originally came from matching Uniprot accessions - so need an xref on dna involving those regions. These are normally constructed from CDS annotation + * In this case, mappings originally came from matching Uniprot accessions + * - so need an xref on dna involving those regions. 
+ * These are normally constructed from CDS annotation */ DBRefEntry dna1xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep1", new Mapping(mapfordna1)); - dna1.getDatasetSequence().addDBRef(dna1xref); + dna1.addDBRef(dna1xref); + assertEquals(2, dna1.getDBRefs().length); // to self and to pep1 DBRefEntry dna2xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep2", new Mapping(mapfordna2)); - dna2.getDatasetSequence().addDBRef(dna2xref); + dna2.addDBRef(dna2xref); + assertEquals(2, dna2.getDBRefs().length); // to self and to pep2 /* * execute method under test: @@ -1101,6 +1111,38 @@ public class AlignmentUtilsTests assertEquals(cdsMapping.getInverse(), dbref.getMap().getMap()); /* + * verify cDNA has added a dbref with mapping to CDS + */ + assertEquals(3, dna1.getDBRefs().length); + DBRefEntry dbRefEntry = dna1.getDBRefs()[2]; + assertSame(cds1Dss, dbRefEntry.getMap().getTo()); + MapList dnaToCdsMapping = new MapList(new int[] { 4, 6, 10, 12 }, + new int[] { 1, 6 }, 1, 1); + assertEquals(dnaToCdsMapping, dbRefEntry.getMap().getMap()); + assertEquals(3, dna2.getDBRefs().length); + dbRefEntry = dna2.getDBRefs()[2]; + assertSame(cds2Dss, dbRefEntry.getMap().getTo()); + dnaToCdsMapping = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, + new int[] { 1, 9 }, 1, 1); + assertEquals(dnaToCdsMapping, dbRefEntry.getMap().getMap()); + + /* + * verify CDS has added a dbref with mapping to cDNA + */ + assertEquals(2, cds1Dss.getDBRefs().length); + dbRefEntry = cds1Dss.getDBRefs()[1]; + assertSame(dna1.getDatasetSequence(), dbRefEntry.getMap().getTo()); + MapList cdsToDnaMapping = new MapList(new int[] { 1, 6 }, new int[] { + 4, 6, 10, 12 }, 1, 1); + assertEquals(cdsToDnaMapping, dbRefEntry.getMap().getMap()); + assertEquals(2, cds2Dss.getDBRefs().length); + dbRefEntry = cds2Dss.getDBRefs()[1]; + assertSame(dna2.getDatasetSequence(), dbRefEntry.getMap().getTo()); + cdsToDnaMapping = new MapList(new int[] { 1, 9 }, new int[] { 1, 3, 7, + 9, 13, 15 }, 1, 1); + assertEquals(cdsToDnaMapping, 
dbRefEntry.getMap().getMap()); + + /* * Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide * the mappings are on the shared alignment dataset * 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep) @@ -1175,12 +1217,12 @@ public class AlignmentUtilsTests /* * check cds2 acquired a variant feature in position 5 */ - SequenceFeature[] sfs = cds2Dss.getSequenceFeatures(); + List sfs = cds2Dss.getSequenceFeatures(); assertNotNull(sfs); - assertEquals(1, sfs.length); - assertEquals("variant", sfs[0].type); - assertEquals(5, sfs[0].begin); - assertEquals(5, sfs[0].end); + assertEquals(1, sfs.size()); + assertEquals("variant", sfs.get(0).type); + assertEquals(5, sfs.get(0).begin); + assertEquals(5, sfs.get(0).end); } /** @@ -1485,39 +1527,39 @@ public class AlignmentUtilsTests * that partially overlap 5' or 3' (start or end) of target sequence */ AlignmentUtils.transferFeatures(dna, cds, map, null); - SequenceFeature[] sfs = cds.getSequenceFeatures(); - assertEquals(6, sfs.length); + List sfs = cds.getSequenceFeatures(); + assertEquals(6, sfs.size()); - SequenceFeature sf = sfs[0]; + SequenceFeature sf = sfs.get(0); assertEquals("type2", sf.getType()); assertEquals("desc2", sf.getDescription()); assertEquals(2f, sf.getScore()); assertEquals(1, sf.getBegin()); assertEquals(1, sf.getEnd()); - sf = sfs[1]; + sf = sfs.get(1); assertEquals("type3", sf.getType()); assertEquals("desc3", sf.getDescription()); assertEquals(3f, sf.getScore()); assertEquals(1, sf.getBegin()); assertEquals(3, sf.getEnd()); - sf = sfs[2]; + sf = sfs.get(2); assertEquals("type4", sf.getType()); assertEquals(2, sf.getBegin()); assertEquals(5, sf.getEnd()); - sf = sfs[3]; + sf = sfs.get(3); assertEquals("type5", sf.getType()); assertEquals(1, sf.getBegin()); assertEquals(6, sf.getEnd()); - sf = sfs[4]; + sf = sfs.get(4); assertEquals("type8", sf.getType()); assertEquals(6, sf.getBegin()); assertEquals(6, sf.getEnd()); - sf = sfs[5]; + sf = sfs.get(5); assertEquals("type9", 
sf.getType()); assertEquals(6, sf.getBegin()); assertEquals(6, sf.getEnd()); @@ -1547,10 +1589,10 @@ public class AlignmentUtilsTests // desc4 and desc8 are the 'omit these' varargs AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8"); - SequenceFeature[] sfs = cds.getSequenceFeatures(); - assertEquals(1, sfs.length); + List sfs = cds.getSequenceFeatures(); + assertEquals(1, sfs.size()); - SequenceFeature sf = sfs[0]; + SequenceFeature sf = sfs.get(0); assertEquals("type5", sf.getType()); assertEquals(1, sf.getBegin()); assertEquals(6, sf.getEnd()); @@ -1580,10 +1622,10 @@ public class AlignmentUtilsTests // "type5" is the 'select this type' argument AlignmentUtils.transferFeatures(dna, cds, map, "type5"); - SequenceFeature[] sfs = cds.getSequenceFeatures(); - assertEquals(1, sfs.length); + List sfs = cds.getSequenceFeatures(); + assertEquals(1, sfs.size()); - SequenceFeature sf = sfs[0]; + SequenceFeature sf = sfs.get(0); assertEquals("type5", sf.getType()); assertEquals(1, sf.getBegin()); assertEquals(6, sf.getEnd()); @@ -1742,7 +1784,7 @@ public class AlignmentUtilsTests map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1); acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map); - ArrayList acfs = new ArrayList(); + ArrayList acfs = new ArrayList<>(); acfs.add(acf); protein.setCodonFrames(acfs); @@ -1891,6 +1933,7 @@ public class AlignmentUtilsTests sf6.setValue("alleles", "g, a"); // should force to upper-case sf6.setValue("ID", "sequence_variant:rs758803216"); dna.addSequenceFeature(sf6); + SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 15, 15, 0f, null); sf7.setValue("alleles", "A, T"); @@ -1980,6 +2023,7 @@ public class AlignmentUtilsTests * variants: * GAA -> E source: Ensembl * CAA -> Q source: dbSNP + * TAA -> STOP source: dbSNP * AAG synonymous source: COSMIC * AAT -> N source: Ensembl * ...TTC synonymous source: dbSNP @@ -1995,39 +2039,55 @@ public class AlignmentUtilsTests String ensembl 
= "Ensembl"; String dbSnp = "dbSNP"; String cosmic = "COSMIC"; + + /* + * NB setting "id" (as returned by Ensembl for features in JSON format); + * previously "ID" (as returned for GFF3 format) + */ SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1, 0f, ensembl); - sf1.setValue("alleles", "A,G"); // GAA -> E - sf1.setValue("ID", "var1.125A>G"); + sf1.setValue("alleles", "A,G"); // AAA -> GAA -> K/E + sf1.setValue("id", "var1.125A>G"); + SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1, 0f, dbSnp); - sf2.setValue("alleles", "A,C"); // CAA -> Q - sf2.setValue("ID", "var2"); + sf2.setValue("alleles", "A,C"); // AAA -> CAA -> K/Q + sf2.setValue("id", "var2"); sf2.setValue("clinical_significance", "Dodgy"); - SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 3, 3, - 0f, cosmic); - sf3.setValue("alleles", "A,G"); // synonymous - sf3.setValue("ID", "var3"); - sf3.setValue("clinical_significance", "None"); + + SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 1, 1, + 0f, dbSnp); + sf3.setValue("alleles", "A,T"); // AAA -> TAA -> stop codon + sf3.setValue("id", "var3"); + sf3.setValue("clinical_significance", "Bad"); + SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3, + 0f, cosmic); + sf4.setValue("alleles", "A,G"); // AAA -> AAG synonymous + sf4.setValue("id", "var4"); + sf4.setValue("clinical_significance", "None"); + + SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 3, 3, 0f, ensembl); - sf4.setValue("alleles", "A,T"); // AAT -> N - sf4.setValue("ID", "sequence_variant:var4"); // prefix gets stripped off - sf4.setValue("clinical_significance", "Benign"); - SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 6, 6, + sf5.setValue("alleles", "A,T"); // AAA -> AAT -> K/N + sf5.setValue("id", "sequence_variant:var5"); // prefix gets stripped off + sf5.setValue("clinical_significance", "Benign"); + + SequenceFeature sf6 = new 
SequenceFeature("sequence_variant", "", 6, 6, 0f, dbSnp); - sf5.setValue("alleles", "T,C"); // synonymous - sf5.setValue("ID", "var5"); - sf5.setValue("clinical_significance", "Bad"); - SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 8, 8, + sf6.setValue("alleles", "T,C"); // TTT -> TTC synonymous + sf6.setValue("id", "var6"); + + SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 8, 8, 0f, cosmic); - sf6.setValue("alleles", "C,A,G"); // CAC,CGC -> H,R - sf6.setValue("ID", "var6"); - sf6.setValue("clinical_significance", "Good"); + sf7.setValue("alleles", "C,A,G"); // CCC -> CAC,CGC -> P/H/R + sf7.setValue("id", "var7"); + sf7.setValue("clinical_significance", "Good"); + + List codon1Variants = new ArrayList<>(); + List codon2Variants = new ArrayList<>(); + List codon3Variants = new ArrayList<>(); - List codon1Variants = new ArrayList(); - List codon2Variants = new ArrayList(); - List codon3Variants = new ArrayList(); List codonVariants[] = new ArrayList[3]; codonVariants[0] = codon1Variants; codonVariants[1] = codon2Variants; @@ -2038,10 +2098,11 @@ public class AlignmentUtilsTests */ codon1Variants.add(new DnaVariant("A", sf1)); codon1Variants.add(new DnaVariant("A", sf2)); + codon1Variants.add(new DnaVariant("A", sf3)); codon2Variants.add(new DnaVariant("A")); - codon2Variants.add(new DnaVariant("A")); - codon3Variants.add(new DnaVariant("A", sf3)); + // codon2Variants.add(new DnaVariant("A")); codon3Variants.add(new DnaVariant("A", sf4)); + codon3Variants.add(new DnaVariant("A", sf5)); AlignmentUtils.computePeptideVariants(peptide, 1, codonVariants); /* @@ -2052,7 +2113,7 @@ public class AlignmentUtilsTests codon3Variants.clear(); codon1Variants.add(new DnaVariant("T")); codon2Variants.add(new DnaVariant("T")); - codon3Variants.add(new DnaVariant("T", sf5)); + codon3Variants.add(new DnaVariant("T", sf6)); AlignmentUtils.computePeptideVariants(peptide, 2, codonVariants); /* @@ -2062,7 +2123,7 @@ public class AlignmentUtilsTests 
codon2Variants.clear(); codon3Variants.clear(); codon1Variants.add(new DnaVariant("C")); - codon2Variants.add(new DnaVariant("C", sf6)); + codon2Variants.add(new DnaVariant("C", sf7)); codon3Variants.add(new DnaVariant("C")); AlignmentUtils.computePeptideVariants(peptide, 3, codonVariants); @@ -2070,77 +2131,142 @@ public class AlignmentUtilsTests * verify added sequence features for * var1 K -> E Ensembl * var2 K -> Q dbSNP - * var4 K -> N Ensembl - * var6 P -> H COSMIC - * var6 P -> R COSMIC + * var3 K -> stop + * var4 synonymous + * var5 K -> N Ensembl + * var6 synonymous + * var7 P -> H COSMIC + * var7 P -> R COSMIC */ - SequenceFeature[] sfs = peptide.getSequenceFeatures(); - assertEquals(5, sfs.length); + List sfs = peptide.getSequenceFeatures(); + SequenceFeatures.sortFeatures(sfs, true); + assertEquals(8, sfs.size()); - SequenceFeature sf = sfs[0]; + /* + * features are sorted by start position ascending, but in no + * particular order where start positions match; asserts here + * simply match the data returned (the order is not important) + */ + // AAA -> AAT -> K/N + SequenceFeature sf = sfs.get(0); assertEquals(1, sf.getBegin()); assertEquals(1, sf.getEnd()); - assertEquals("p.Lys1Glu", sf.getDescription()); - assertEquals("var1.125A>G", sf.getValue("ID")); - assertNull(sf.getValue("clinical_significance")); - assertEquals("ID=var1.125A>G", sf.getAttributes()); + assertEquals("nonsynonymous_variant", sf.getType()); + assertEquals("p.Lys1Asn", sf.getDescription()); + assertEquals("var5", sf.getValue("id")); + assertEquals("Benign", sf.getValue("clinical_significance")); + assertEquals("id=var5;clinical_significance=Benign", + sf.getAttributes()); assertEquals(1, sf.links.size()); - // link to variation is urlencoded assertEquals( - "p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG", + "p.Lys1Asn var5|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var5", sf.links.get(0)); assertEquals(ensembl, 
sf.getFeatureGroup()); - sf = sfs[1]; + // AAA -> CAA -> K/Q + sf = sfs.get(1); assertEquals(1, sf.getBegin()); assertEquals(1, sf.getEnd()); + assertEquals("nonsynonymous_variant", sf.getType()); assertEquals("p.Lys1Gln", sf.getDescription()); - assertEquals("var2", sf.getValue("ID")); + assertEquals("var2", sf.getValue("id")); assertEquals("Dodgy", sf.getValue("clinical_significance")); - assertEquals("ID=var2;clinical_significance=Dodgy", sf.getAttributes()); + assertEquals("id=var2;clinical_significance=Dodgy", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( "p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2", sf.links.get(0)); assertEquals(dbSnp, sf.getFeatureGroup()); - sf = sfs[2]; + // AAA -> GAA -> K/E + sf = sfs.get(2); assertEquals(1, sf.getBegin()); assertEquals(1, sf.getEnd()); - assertEquals("p.Lys1Asn", sf.getDescription()); - assertEquals("var4", sf.getValue("ID")); - assertEquals("Benign", sf.getValue("clinical_significance")); - assertEquals("ID=var4;clinical_significance=Benign", sf.getAttributes()); + assertEquals("nonsynonymous_variant", sf.getType()); + assertEquals("p.Lys1Glu", sf.getDescription()); + assertEquals("var1.125A>G", sf.getValue("id")); + assertNull(sf.getValue("clinical_significance")); + assertEquals("id=var1.125A>G", sf.getAttributes()); assertEquals(1, sf.links.size()); + // link to variation is urlencoded assertEquals( - "p.Lys1Asn var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4", + "p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG", sf.links.get(0)); assertEquals(ensembl, sf.getFeatureGroup()); - // var5 generates two distinct protein variant features - sf = sfs[3]; + // AAA -> TAA -> stop codon + sf = sfs.get(3); + assertEquals(1, sf.getBegin()); + assertEquals(1, sf.getEnd()); + assertEquals("stop_gained", sf.getType()); + assertEquals("Aaa/Taa", sf.getDescription()); + assertEquals("var3", sf.getValue("id")); + 
assertEquals("Bad", sf.getValue("clinical_significance")); + assertEquals("id=var3;clinical_significance=Bad", sf.getAttributes()); + assertEquals(1, sf.links.size()); + assertEquals( + "Aaa/Taa var3|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var3", + sf.links.get(0)); + assertEquals(dbSnp, sf.getFeatureGroup()); + + // AAA -> AAG synonymous + sf = sfs.get(4); + assertEquals(1, sf.getBegin()); + assertEquals(1, sf.getEnd()); + assertEquals("synonymous_variant", sf.getType()); + assertEquals("aaA/aaG", sf.getDescription()); + assertEquals("var4", sf.getValue("id")); + assertEquals("None", sf.getValue("clinical_significance")); + assertEquals("id=var4;clinical_significance=None", sf.getAttributes()); + assertEquals(1, sf.links.size()); + assertEquals( + "aaA/aaG var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4", + sf.links.get(0)); + assertEquals(cosmic, sf.getFeatureGroup()); + + // TTT -> TTC synonymous + sf = sfs.get(5); + assertEquals(2, sf.getBegin()); + assertEquals(2, sf.getEnd()); + assertEquals("synonymous_variant", sf.getType()); + assertEquals("ttT/ttC", sf.getDescription()); + assertEquals("var6", sf.getValue("id")); + assertNull(sf.getValue("clinical_significance")); + assertEquals("id=var6", sf.getAttributes()); + assertEquals(1, sf.links.size()); + assertEquals( + "ttT/ttC var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6", + sf.links.get(0)); + assertEquals(dbSnp, sf.getFeatureGroup()); + + // var7 generates two distinct protein variant features (two alleles) + // CCC -> CGC -> P/R + sf = sfs.get(6); assertEquals(3, sf.getBegin()); assertEquals(3, sf.getEnd()); - assertEquals("p.Pro3His", sf.getDescription()); - assertEquals("var6", sf.getValue("ID")); + assertEquals("nonsynonymous_variant", sf.getType()); + assertEquals("p.Pro3Arg", sf.getDescription()); + assertEquals("var7", sf.getValue("id")); assertEquals("Good", sf.getValue("clinical_significance")); - 
assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes()); + assertEquals("id=var7;clinical_significance=Good", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( - "p.Pro3His var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6", + "p.Pro3Arg var7|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var7", sf.links.get(0)); assertEquals(cosmic, sf.getFeatureGroup()); - sf = sfs[4]; + // CCC -> CAC -> P/H + sf = sfs.get(7); assertEquals(3, sf.getBegin()); assertEquals(3, sf.getEnd()); - assertEquals("p.Pro3Arg", sf.getDescription()); - assertEquals("var6", sf.getValue("ID")); + assertEquals("nonsynonymous_variant", sf.getType()); + assertEquals("p.Pro3His", sf.getDescription()); + assertEquals("var7", sf.getValue("id")); assertEquals("Good", sf.getValue("clinical_significance")); - assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes()); + assertEquals("id=var7;clinical_significance=Good", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( - "p.Pro3Arg var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6", + "p.Pro3His var7|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var7", sf.links.get(0)); assertEquals(cosmic, sf.getFeatureGroup()); } @@ -2261,7 +2387,7 @@ public class AlignmentUtilsTests seq1.createDatasetSequence(); Mapping mapping = new Mapping(seq1, new MapList( new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1)); - Map> map = new TreeMap>(); + Map> map = new TreeMap<>(); AlignmentUtils.addMappedPositions(seq1, from, mapping, map); /* @@ -2293,7 +2419,7 @@ public class AlignmentUtilsTests seq1.createDatasetSequence(); Mapping mapping = new Mapping(seq1, new MapList( new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1)); - Map> map = new TreeMap>(); + Map> map = new TreeMap<>(); AlignmentUtils.addMappedPositions(seq1, from, mapping, map); /* @@ -2522,4 +2648,307 @@ public class AlignmentUtilsTests assertEquals(s_as3, uas3.getSequenceAsString()); } + 
@Test(groups = { "Functional" }) + public void testTransferGeneLoci() + { + SequenceI from = new Sequence("transcript", + "aaacccgggTTTAAACCCGGGtttaaacccgggttt"); + SequenceI to = new Sequence("CDS", "TTTAAACCCGGG"); + MapList map = new MapList(new int[] { 1, 12 }, new int[] { 10, 21 }, 1, + 1); + + /* + * first with nothing to transfer + */ + AlignmentUtils.transferGeneLoci(from, map, to); + assertNull(to.getGeneLoci()); + + /* + * next with gene loci set on 'from' sequence + */ + int[] exons = new int[] { 100, 105, 155, 164, 210, 229 }; + MapList geneMap = new MapList(new int[] { 1, 36 }, exons, 1, 1); + from.setGeneLoci("human", "GRCh38", "7", geneMap); + AlignmentUtils.transferGeneLoci(from, map, to); + + GeneLociI toLoci = to.getGeneLoci(); + assertNotNull(toLoci); + // DBRefEntry constructor upper-cases 'source' + assertEquals("HUMAN", toLoci.getSpeciesId()); + assertEquals("GRCh38", toLoci.getAssemblyId()); + assertEquals("7", toLoci.getChromosomeId()); + + /* + * transcript 'exons' are 1-6, 7-16, 17-36 + * CDS 1:12 is transcript 10-21 + * transcript 'CDS' is 10-16, 17-21 + * which is 'gene' 158-164, 210-214 + */ + MapList toMap = toLoci.getMap(); + assertEquals(1, toMap.getFromRanges().size()); + assertEquals(2, toMap.getFromRanges().get(0).length); + assertEquals(1, toMap.getFromRanges().get(0)[0]); + assertEquals(12, toMap.getFromRanges().get(0)[1]); + assertEquals(2, toMap.getToRanges().size()); + assertEquals(2, toMap.getToRanges().get(0).length); + assertEquals(158, toMap.getToRanges().get(0)[0]); + assertEquals(164, toMap.getToRanges().get(0)[1]); + assertEquals(210, toMap.getToRanges().get(1)[0]); + assertEquals(214, toMap.getToRanges().get(1)[1]); + // or summarised as (but toString might change in future): + assertEquals("[ [1, 12] ] 1:1 to [ [158, 164] [210, 214] ]", + toMap.toString()); + + /* + * an existing value is not overridden + */ + geneMap = new MapList(new int[] { 1, 36 }, new int[] { 36, 1 }, 1, 1); + from.setGeneLoci("inhuman", 
"GRCh37", "6", geneMap); + AlignmentUtils.transferGeneLoci(from, map, to); + assertEquals("GRCh38", toLoci.getAssemblyId()); + assertEquals("7", toLoci.getChromosomeId()); + toMap = toLoci.getMap(); + assertEquals("[ [1, 12] ] 1:1 to [ [158, 164] [210, 214] ]", + toMap.toString()); + } + + /** + * Tests for the method that maps nucleotide to protein based on CDS features + */ + @Test(groups = "Functional") + public void testMapCdsToProtein() + { + SequenceI peptide = new Sequence("pep", "KLQ"); + + /* + * Case 1: CDS 3 times length of peptide + * NB method only checks lengths match, not translation + */ + SequenceI dna = new Sequence("dna", "AACGacgtCTCCT"); + dna.createDatasetSequence(); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 13, null)); + MapList ml = AlignmentUtils.mapCdsToProtein(dna, peptide); + assertEquals(3, ml.getFromRatio()); + assertEquals(1, ml.getToRatio()); + assertEquals("[[1, 3]]", + Arrays.deepToString(ml.getToRanges().toArray())); + assertEquals("[[1, 4], [9, 13]]", + Arrays.deepToString(ml.getFromRanges().toArray())); + + /* + * Case 2: CDS 3 times length of peptide + stop codon + * (note code does not currently check trailing codon is a stop codon) + */ + dna = new Sequence("dna", "AACGacgtCTCCTCCC"); + dna.createDatasetSequence(); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 16, null)); + ml = AlignmentUtils.mapCdsToProtein(dna, peptide); + assertEquals(3, ml.getFromRatio()); + assertEquals(1, ml.getToRatio()); + assertEquals("[[1, 3]]", + Arrays.deepToString(ml.getToRanges().toArray())); + assertEquals("[[1, 4], [9, 13]]", + Arrays.deepToString(ml.getFromRanges().toArray())); + + /* + * Case 3: CDS longer than 3 * peptide + stop codon - no mapping is made + */ + dna = new Sequence("dna", "AACGacgtCTCCTTGATCA"); + dna.createDatasetSequence(); + dna.addSequenceFeature(new 
SequenceFeature("CDS", "", 1, 4, null)); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 19, null)); + ml = AlignmentUtils.mapCdsToProtein(dna, peptide); + assertNull(ml); + + /* + * Case 4: CDS shorter than 3 * peptide - no mapping is made + */ + dna = new Sequence("dna", "AACGacgtCTCC"); + dna.createDatasetSequence(); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 12, null)); + ml = AlignmentUtils.mapCdsToProtein(dna, peptide); + assertNull(ml); + + /* + * Case 5: CDS 3 times length of peptide + part codon - mapping is truncated + */ + dna = new Sequence("dna", "AACGacgtCTCCTTG"); + dna.createDatasetSequence(); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, null)); + ml = AlignmentUtils.mapCdsToProtein(dna, peptide); + assertEquals(3, ml.getFromRatio()); + assertEquals(1, ml.getToRatio()); + assertEquals("[[1, 3]]", + Arrays.deepToString(ml.getToRanges().toArray())); + assertEquals("[[1, 4], [9, 13]]", + Arrays.deepToString(ml.getFromRanges().toArray())); + + /* + * Case 6: incomplete start codon corresponding to X in peptide + */ + dna = new Sequence("dna", "ACGacgtCTCCTTGG"); + dna.createDatasetSequence(); + SequenceFeature sf = new SequenceFeature("CDS", "", 1, 3, null); + sf.setPhase("2"); // skip 2 positions (AC) to start of next codon (GCT) + dna.addSequenceFeature(sf); + dna.addSequenceFeature(new SequenceFeature("CDS", "", 8, 15, null)); + peptide = new Sequence("pep", "XLQ"); + ml = AlignmentUtils.mapCdsToProtein(dna, peptide); + assertEquals("[[2, 3]]", + Arrays.deepToString(ml.getToRanges().toArray())); + assertEquals("[[3, 3], [8, 12]]", + Arrays.deepToString(ml.getFromRanges().toArray())); + } + + /** + * Tests for the method that locates the CDS sequence that has a mapping to + * the given protein. 
That is, given a transcript-to-peptide mapping, find the + * cds-to-peptide mapping that relates to both, and return the CDS sequence. + */ + @Test + public void testFindCdsForProtein() + { + List mappings = new ArrayList<>(); + AlignedCodonFrame acf1 = new AlignedCodonFrame(); + mappings.add(acf1); + + SequenceI dna1 = new Sequence("dna1", "cgatATcgGCTATCTATGacg"); + dna1.createDatasetSequence(); + + // NB we currently exclude STOP codon from CDS sequences + // the test would need to change if this changes in future + SequenceI cds1 = new Sequence("cds1", "ATGCTATCT"); + cds1.createDatasetSequence(); + + SequenceI pep1 = new Sequence("pep1", "MLS"); + pep1.createDatasetSequence(); + List seqMappings = new ArrayList<>(); + MapList mapList = new MapList( + new int[] + { 5, 6, 9, 15 }, new int[] { 1, 3 }, 3, 1); + Mapping dnaToPeptide = new Mapping(pep1.getDatasetSequence(), mapList); + + // add dna to peptide mapping + seqMappings.add(acf1); + acf1.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), + mapList); + + /* + * first case - no dna-to-CDS mapping exists - search fails + */ + SequenceI seq = AlignmentUtils.findCdsForProtein(mappings, dna1, + seqMappings, dnaToPeptide); + assertNull(seq); + + /* + * second case - CDS-to-peptide mapping exists but no dna-to-CDS + * - search fails + */ + // todo this test fails if the mapping is added to acf1, not acf2 + // need to tidy up use of lists of mappings in AlignedCodonFrame + AlignedCodonFrame acf2 = new AlignedCodonFrame(); + mappings.add(acf2); + MapList cdsToPeptideMapping = new MapList(new int[] + { 1, 9 }, new int[] { 1, 3 }, 3, 1); + acf2.addMap(cds1.getDatasetSequence(), pep1.getDatasetSequence(), + cdsToPeptideMapping); + assertNull(AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, + dnaToPeptide)); + + /* + * third case - add dna-to-CDS mapping - CDS is now found! 
+ */ + MapList dnaToCdsMapping = new MapList(new int[] { 5, 6, 9, 15 }, + new int[] + { 1, 9 }, 1, 1); + acf1.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), + dnaToCdsMapping); + seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, + dnaToPeptide); + assertSame(seq, cds1.getDatasetSequence()); + } + + /** + * Tests for the method that locates the CDS sequence that has a mapping to + * the given protein. That is, given a transcript-to-peptide mapping, find the + * cds-to-peptide mapping that relates to both, and return the CDS sequence. + * This test is for the case where transcript and CDS are the same length. + */ + @Test + public void testFindCdsForProtein_noUTR() + { + List mappings = new ArrayList<>(); + AlignedCodonFrame acf1 = new AlignedCodonFrame(); + mappings.add(acf1); + + SequenceI dna1 = new Sequence("dna1", "ATGCTATCTTAA"); + dna1.createDatasetSequence(); + + // NB we currently exclude STOP codon from CDS sequences + // the test would need to change if this changes in future + SequenceI cds1 = new Sequence("cds1", "ATGCTATCT"); + cds1.createDatasetSequence(); + + SequenceI pep1 = new Sequence("pep1", "MLS"); + pep1.createDatasetSequence(); + List seqMappings = new ArrayList<>(); + MapList mapList = new MapList( + new int[] + { 1, 9 }, new int[] { 1, 3 }, 3, 1); + Mapping dnaToPeptide = new Mapping(pep1.getDatasetSequence(), mapList); + + // add dna to peptide mapping + seqMappings.add(acf1); + acf1.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), + mapList); + + /* + * first case - transcript lacks CDS features - it appears to be + * the CDS sequence and is returned + */ + SequenceI seq = AlignmentUtils.findCdsForProtein(mappings, dna1, + seqMappings, dnaToPeptide); + assertSame(seq, dna1.getDatasetSequence()); + + /* + * second case - transcript has CDS feature - this means it is + * not returned as a match for CDS (CDS sequences don't have CDS features) + */ + dna1.addSequenceFeature( + new 
SequenceFeature(SequenceOntologyI.CDS, "cds", 1, 12, null)); + seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, + dnaToPeptide); + assertNull(seq); + + /* + * third case - CDS-to-peptide mapping exists but no dna-to-CDS + * - search fails + */ + // todo this test fails if the mapping is added to acf1, not acf2 + // need to tidy up use of lists of mappings in AlignedCodonFrame + AlignedCodonFrame acf2 = new AlignedCodonFrame(); + mappings.add(acf2); + MapList cdsToPeptideMapping = new MapList(new int[] + { 1, 9 }, new int[] { 1, 3 }, 3, 1); + acf2.addMap(cds1.getDatasetSequence(), pep1.getDatasetSequence(), + cdsToPeptideMapping); + assertNull(AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, + dnaToPeptide)); + + /* + * fourth case - add dna-to-CDS mapping - CDS is now found! + */ + MapList dnaToCdsMapping = new MapList(new int[] { 1, 9 }, + new int[] + { 1, 9 }, 1, 1); + acf1.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), + dnaToCdsMapping); + seq = AlignmentUtils.findCdsForProtein(mappings, dna1, seqMappings, + dnaToPeptide); + assertSame(seq, cds1.getDatasetSequence()); + } }