/* * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.analysis; import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertFalse; import static org.testng.AssertJUnit.assertNotNull; import static org.testng.AssertJUnit.assertNull; import static org.testng.AssertJUnit.assertSame; import static org.testng.AssertJUnit.assertTrue; import jalview.analysis.AlignmentUtils.DnaVariant; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.Annotation; import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResultMatchI; import jalview.datamodel.SearchResultsI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.datamodel.features.SequenceFeatures; import jalview.gui.JvOptionPane; import jalview.io.AppletFormatAdapter; import jalview.io.DataSourceType; import jalview.io.FileFormat; import jalview.io.FileFormatI; import jalview.io.FormatAdapter; import jalview.util.MapList; import jalview.util.MappingUtils; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.TreeMap; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; public class AlignmentUtilsTests { private static Sequence ts = new Sequence("short", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm"); @BeforeClass(alwaysRun = true) public void setUpJvOptionPane() { JvOptionPane.setInteractiveMode(false); JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION); } @Test(groups = { "Functional" }) public void testExpandContext() { AlignmentI al = new Alignment(new Sequence[] {}); for (int i = 4; i < 14; i += 2) { SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7); al.addSequence(s1); } System.out.println(new AppletFormatAdapter().formatSequences( FileFormat.Clustal, al, true)); for (int flnk = -1; flnk < 25; flnk++) { AlignmentI exp = AlignmentUtils.expandContext(al, flnk); System.out.println("\nFlank size: " + flnk); System.out.println(new AppletFormatAdapter().formatSequences( FileFormat.Clustal, exp, true)); if (flnk == -1) { /* * Full expansion to complete sequences */ for (SequenceI sq : exp.getSequences()) { String ung = sq.getSequenceAsString().replaceAll("-+", ""); final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n" + ung + "\n" + sq.getDatasetSequence().getSequenceAsString(); assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence() .getSequenceAsString())); } } else if (flnk == 24) { /* * Last sequence is fully expanded, others have leading gaps to match */ assertTrue(exp.getSequenceAt(4).getSequenceAsString() .startsWith("abc")); assertTrue(exp.getSequenceAt(3).getSequenceAsString() .startsWith("--abc")); assertTrue(exp.getSequenceAt(2).getSequenceAsString() .startsWith("----abc")); assertTrue(exp.getSequenceAt(1).getSequenceAsString() .startsWith("------abc")); assertTrue(exp.getSequenceAt(0).getSequenceAsString() .startsWith("--------abc")); } } } /** * Test that annotations are correctly adjusted by expandContext */ @Test(groups = { "Functional" }) public void testExpandContext_annotation() { AlignmentI al = new Alignment(new Sequence[] {}); SequenceI ds = new Sequence("Seq1", "ABCDEFGHI"); // subsequence DEF: SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6); al.addSequence(seq1); /* * Annotate DEF with 4/5/6 respectively */ Annotation[] anns = new Annotation[] { new Annotation(4), new Annotation(5), new Annotation(6) }; AlignmentAnnotation ann = new AlignmentAnnotation("SS", "secondary structure", anns); seq1.addAlignmentAnnotation(ann); /* * The annotations array should match aligned positions */ assertEquals(3, ann.annotations.length); assertEquals(4, ann.annotations[0].value, 0.001); assertEquals(5, ann.annotations[1].value, 0.001); assertEquals(6, ann.annotations[2].value, 0.001); /* * Check annotation to sequence position mappings before expanding the * sequence; these are set up in Sequence.addAlignmentAnnotation -> * Annotation.setSequenceRef -> createSequenceMappings */ assertNull(ann.getAnnotationForPosition(1)); assertNull(ann.getAnnotationForPosition(2)); assertNull(ann.getAnnotationForPosition(3)); assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001); assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001); assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001); assertNull(ann.getAnnotationForPosition(7)); assertNull(ann.getAnnotationForPosition(8)); assertNull(ann.getAnnotationForPosition(9)); /* * Expand the subsequence to the full sequence abcDEFghi */ AlignmentI expanded = AlignmentUtils.expandContext(al, -1); assertEquals("abcDEFghi", expanded.getSequenceAt(0) .getSequenceAsString()); /* * Confirm the alignment and sequence have the same SS annotation, * referencing the expanded sequence */ ann = expanded.getSequenceAt(0).getAnnotation()[0]; assertSame(ann, expanded.getAlignmentAnnotation()[0]); assertSame(expanded.getSequenceAt(0), ann.sequenceRef); /* * The annotations array should have null values except for annotated * positions */ assertNull(ann.annotations[0]); assertNull(ann.annotations[1]); assertNull(ann.annotations[2]); assertEquals(4, ann.annotations[3].value, 0.001); assertEquals(5, ann.annotations[4].value, 0.001); assertEquals(6, ann.annotations[5].value, 0.001); assertNull(ann.annotations[6]); assertNull(ann.annotations[7]); assertNull(ann.annotations[8]); /* * sequence position mappings should be unchanged */ assertNull(ann.getAnnotationForPosition(1)); assertNull(ann.getAnnotationForPosition(2)); assertNull(ann.getAnnotationForPosition(3)); assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001); assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001); assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001); assertNull(ann.getAnnotationForPosition(7)); assertNull(ann.getAnnotationForPosition(8)); assertNull(ann.getAnnotationForPosition(9)); } /** * Test method that returns a map of lists of sequences by sequence name. * * @throws IOException */ @Test(groups = { "Functional" }) public void testGetSequencesByName() throws IOException { final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n" + ">Seq1Name\nABCD\n"; AlignmentI al = loadAlignment(data, FileFormat.Fasta); Map> map = AlignmentUtils .getSequencesByName(al); assertEquals(2, map.keySet().size()); assertEquals(2, map.get("Seq1Name").size()); assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString()); assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString()); assertEquals(1, map.get("Seq2Name").size()); assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString()); } /** * Helper method to load an alignment and ensure dataset sequences are set up. * * @param data * @param format * TODO * @return * @throws IOException */ protected AlignmentI loadAlignment(final String data, FileFormatI format) throws IOException { AlignmentI a = new FormatAdapter().readFile(data, DataSourceType.PASTE, format); a.setDataset(null); return a; } /** * Test mapping of protein to cDNA, for the case where we have no sequence * cross-references, so mappings are made first-served 1-1 where sequences * translate. * * @throws IOException */ @Test(groups = { "Functional" }) public void testMapProteinAlignmentToCdna_noXrefs() throws IOException { List protseqs = new ArrayList<>(); protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); protein.setDataset(null); List dnaseqs = new ArrayList<>(); dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4])); cdna.setDataset(null); assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna)); // 3 mappings made, each from 1 to 1 sequence assertEquals(3, protein.getCodonFrames().size()); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size()); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size()); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size()); // V12345 mapped to A22222 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0)) .get(0); assertEquals(1, acf.getdnaSeqs().length); assertEquals(cdna.getSequenceAt(1).getDatasetSequence(), acf.getdnaSeqs()[0]); Mapping[] protMappings = acf.getProtMappings(); assertEquals(1, protMappings.length); MapList mapList = protMappings[0].getMap(); assertEquals(3, mapList.getFromRatio()); assertEquals(1, mapList.getToRatio()); assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges() .get(0))); assertEquals(1, mapList.getFromRanges().size()); assertTrue(Arrays.equals(new int[] { 1, 3 }, mapList.getToRanges().get(0))); assertEquals(1, mapList.getToRanges().size()); // V12346 mapped to A33333 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0); assertEquals(1, acf.getdnaSeqs().length); assertEquals(cdna.getSequenceAt(2).getDatasetSequence(), acf.getdnaSeqs()[0]); // V12347 mapped to A11111 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0); assertEquals(1, acf.getdnaSeqs().length); assertEquals(cdna.getSequenceAt(0).getDatasetSequence(), acf.getdnaSeqs()[0]); // no mapping involving the 'extra' A44444 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty()); } /** * Test for the alignSequenceAs method that takes two sequences and a mapping. */ @Test(groups = { "Functional" }) public void testAlignSequenceAs_withMapping_noIntrons() { MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1); /* * No existing gaps in dna: */ checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map, "---GGG---AAA"); /* * Now introduce gaps in dna but ignore them when realigning. */ checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map, "---GGG---AAA"); /* * Now include gaps in dna when realigning. First retaining 'mapped' gaps * only, i.e. those within the exon region. */ checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map, "---G-G--G---A--A-A"); /* * Include all gaps in dna when realigning (within and without the exon * region). The leading gap, and the gaps between codons, are subsumed by * the protein alignment gap. */ checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map, "---G-GG---AA-A---"); /* * Include only unmapped gaps in dna when realigning (outside the exon * region). The leading gap, and the gaps between codons, are subsumed by * the protein alignment gap. */ checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map, "---GGG---AAA---"); } /** * Test for the alignSequenceAs method that takes two sequences and a mapping. */ @Test(groups = { "Functional" }) public void testAlignSequenceAs_withMapping_withIntrons() { /* * Exons at codon 2 (AAA) and 4 (TTT) */ MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1, 2 }, 3, 1); /* * Simple case: no gaps in dna */ checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map, "GGG---AAACCCTTTGGG"); /* * Add gaps to dna - but ignore when realigning. */ checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-", false, false, map, "GGG---AAACCCTTTGGG"); /* * Add gaps to dna - include within exons only when realigning. */ checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", true, false, map, "GGG---A--A---ACCCT-TTGGG"); /* * Include gaps outside exons only when realigning. */ checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", false, true, map, "-G-G-GAAAC-CCTTT-GG-G-"); /* * Include gaps following first intron if we are 'preserving mapped gaps' */ checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-"); /* * Include all gaps in dna when realigning. */ checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-", true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-"); } /** * Test for the case where not all of the protein sequence is mapped to cDNA. */ @Test(groups = { "Functional" }) public void testAlignSequenceAs_withMapping_withUnmappedProtein() { /* * Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P */ final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1, 1, 3, 3 }, 3, 1); /* * -L- 'aligns' ccc------ */ checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map, "gggAAAccc------TTTggg"); } /** * Helper method that performs and verifies the method under test. * * @param alignee * the sequence to be realigned * @param alignModel * the sequence whose alignment is to be copied * @param preserveMappedGaps * @param preserveUnmappedGaps * @param map * @param expected */ protected void checkAlignSequenceAs(final String alignee, final String alignModel, final boolean preserveMappedGaps, final boolean preserveUnmappedGaps, MapList map, final String expected) { SequenceI alignMe = new Sequence("Seq1", alignee); alignMe.createDatasetSequence(); SequenceI alignFrom = new Sequence("Seq2", alignModel); alignFrom.createDatasetSequence(); AlignedCodonFrame acf = new AlignedCodonFrame(); acf.addMap(alignMe.getDatasetSequence(), alignFrom.getDatasetSequence(), map); AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-', preserveMappedGaps, preserveUnmappedGaps); assertEquals(expected, alignMe.getSequenceAsString()); } /** * Test for the alignSequenceAs method where we preserve gaps in introns only. */ @Test(groups = { "Functional" }) public void testAlignSequenceAs_keepIntronGapsOnly() { /* * Intron GGGAAA followed by exon CCCTTT */ MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1); checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map, "GG-G-AA-ACCCTTT"); } /** * Test the method that realigns protein to match mapped codon alignment. */ @Test(groups = { "Functional" }) public void testAlignProteinAsDna() { // seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12] SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-"); // seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13] SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG"); // seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13] SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG"); AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 }); dna.setDataset(null); // protein alignment will be realigned like dna SequenceI prot1 = new Sequence("Seq1", "CHYQ"); SequenceI prot2 = new Sequence("Seq2", "CHYQ"); SequenceI prot3 = new Sequence("Seq3", "CHYQ"); SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2, prot3, prot4 }); protein.setDataset(null); MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1); AlignedCodonFrame acf = new AlignedCodonFrame(); acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map); acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map); acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map); ArrayList acfs = new ArrayList<>(); acfs.add(acf); protein.setCodonFrames(acfs); /* * Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9] * [8,9,10] [10,11,12] [11,12,13] */ AlignmentUtils.alignProteinAsDna(protein, dna); assertEquals("C-H--Y-Q-", prot1.getSequenceAsString()); assertEquals("-C--H-Y-Q", prot2.getSequenceAsString()); assertEquals("C--H--Y-Q", prot3.getSequenceAsString()); assertEquals("R-QSV", prot4.getSequenceAsString()); } /** * Test the method that tests whether a CDNA sequence translates to a protein * sequence */ @Test(groups = { "Functional" }) public void testTranslatesAs() { // null arguments check assertFalse(AlignmentUtils.translatesAs(null, 0, null)); assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null)); assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' })); // straight translation assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0, "FPKG".toCharArray())); // with extra start codon (not in protein) assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(), 3, "FPKG".toCharArray())); // with stop codon1 (not in protein) assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(), 0, "FPKG".toCharArray())); // with stop codon1 (in protein as *) assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(), 0, "FPKG*".toCharArray())); // with stop codon2 (not in protein) assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(), 0, "FPKG".toCharArray())); // with stop codon3 (not in protein) assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(), 0, "FPKG".toCharArray())); // with start and stop codon1 assertTrue(AlignmentUtils.translatesAs( "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray())); // with start and stop codon1 (in protein as *) assertTrue(AlignmentUtils.translatesAs( "atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray())); // with start and stop codon2 assertTrue(AlignmentUtils.translatesAs( "atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray())); // with start and stop codon3 assertTrue(AlignmentUtils.translatesAs( "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray())); // with embedded stop codons assertTrue(AlignmentUtils.translatesAs( "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3, "F*PK*G".toCharArray())); // wrong protein assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0, "FPMG".toCharArray())); // truncated dna assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0, "FPKG".toCharArray())); // truncated protein assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0, "FPK".toCharArray())); // overlong dna (doesn't end in stop codon) assertFalse(AlignmentUtils.translatesAs( "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray())); // dna + stop codon + more assertFalse(AlignmentUtils.translatesAs( "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray())); // overlong protein assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0, "FPKGQ".toCharArray())); } /** * Test mapping of protein to cDNA, for cases where the cDNA has start and/or * stop codons in addition to the protein coding sequence. * * @throws IOException */ @Test(groups = { "Functional" }) public void testMapProteinAlignmentToCdna_withStartAndStopCodons() throws IOException { List protseqs = new ArrayList<>(); protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); protein.setDataset(null); List dnaseqs = new ArrayList<>(); // start + SAR: dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC")); // = EIQ + stop dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA")); // = start +EIQ + stop dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG")); dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4])); cdna.setDataset(null); assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna)); // 3 mappings made, each from 1 to 1 sequence assertEquals(3, protein.getCodonFrames().size()); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size()); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size()); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size()); // V12345 mapped from A22222 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0)) .get(0); assertEquals(1, acf.getdnaSeqs().length); assertEquals(cdna.getSequenceAt(1).getDatasetSequence(), acf.getdnaSeqs()[0]); Mapping[] protMappings = acf.getProtMappings(); assertEquals(1, protMappings.length); MapList mapList = protMappings[0].getMap(); assertEquals(3, mapList.getFromRatio()); assertEquals(1, mapList.getToRatio()); assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges() .get(0))); assertEquals(1, mapList.getFromRanges().size()); assertTrue(Arrays.equals(new int[] { 1, 3 }, mapList.getToRanges().get(0))); assertEquals(1, mapList.getToRanges().size()); // V12346 mapped from A33333 starting position 4 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0); assertEquals(1, acf.getdnaSeqs().length); assertEquals(cdna.getSequenceAt(2).getDatasetSequence(), acf.getdnaSeqs()[0]); protMappings = acf.getProtMappings(); assertEquals(1, protMappings.length); mapList = protMappings[0].getMap(); assertEquals(3, mapList.getFromRatio()); assertEquals(1, mapList.getToRatio()); assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges() .get(0))); assertEquals(1, mapList.getFromRanges().size()); assertTrue(Arrays.equals(new int[] { 1, 3 }, mapList.getToRanges().get(0))); assertEquals(1, mapList.getToRanges().size()); // V12347 mapped to A11111 starting position 4 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0); assertEquals(1, acf.getdnaSeqs().length); assertEquals(cdna.getSequenceAt(0).getDatasetSequence(), acf.getdnaSeqs()[0]); protMappings = acf.getProtMappings(); assertEquals(1, protMappings.length); mapList = protMappings[0].getMap(); assertEquals(3, mapList.getFromRatio()); assertEquals(1, mapList.getToRatio()); assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges() .get(0))); assertEquals(1, mapList.getFromRanges().size()); assertTrue(Arrays.equals(new int[] { 1, 3 }, mapList.getToRanges().get(0))); assertEquals(1, mapList.getToRanges().size()); // no mapping involving the 'extra' A44444 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty()); } /** * Test mapping of protein to cDNA, for the case where we have some sequence * cross-references. Verify that 1-to-many mappings are made where * cross-references exist and sequences are mappable. * * @throws IOException */ @Test(groups = { "Functional" }) public void testMapProteinAlignmentToCdna_withXrefs() throws IOException { List protseqs = new ArrayList<>(); protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12347", "SAR")); AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3])); protein.setDataset(null); List dnaseqs = new ArrayList<>(); dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5])); cdna.setDataset(null); // Xref A22222 to V12345 (should get mapped) dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345")); // Xref V12345 to A44444 (should get mapped) protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444")); // Xref A33333 to V12347 (sequence mismatch - should not get mapped) dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347")); // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped. // it should get paired up with the unmapped A33333 // A11111 should be mapped to V12347 // A55555 is spare and has no xref so is not mapped assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna)); // 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7 assertEquals(3, protein.getCodonFrames().size()); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size()); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size()); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size()); // one mapping for each of the first 4 cDNA sequences assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size()); assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size()); assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size()); assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size()); // V12345 mapped to A22222 and A44444 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0)) .get(0); assertEquals(2, acf.getdnaSeqs().length); assertEquals(cdna.getSequenceAt(1).getDatasetSequence(), acf.getdnaSeqs()[0]); assertEquals(cdna.getSequenceAt(3).getDatasetSequence(), acf.getdnaSeqs()[1]); // V12346 mapped to A33333 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0); assertEquals(1, acf.getdnaSeqs().length); assertEquals(cdna.getSequenceAt(2).getDatasetSequence(), acf.getdnaSeqs()[0]); // V12347 mapped to A11111 acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0); assertEquals(1, acf.getdnaSeqs().length); assertEquals(cdna.getSequenceAt(0).getDatasetSequence(), acf.getdnaSeqs()[0]); // no mapping involving the 'extra' A55555 assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty()); } /** * Test mapping of protein to cDNA, for the case where we have some sequence * cross-references. Verify that once we have made an xref mapping we don't * also map un-xrefd sequeces. * * @throws IOException */ @Test(groups = { "Functional" }) public void testMapProteinAlignmentToCdna_prioritiseXrefs() throws IOException { List protseqs = new ArrayList<>(); protseqs.add(new Sequence("UNIPROT|V12345", "EIQ")); protseqs.add(new Sequence("UNIPROT|V12346", "EIQ")); AlignmentI protein = new Alignment( protseqs.toArray(new SequenceI[protseqs.size()])); protein.setDataset(null); List dnaseqs = new ArrayList<>(); dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs .size()])); cdna.setDataset(null); // Xref A22222 to V12345 (should get mapped) // A11111 should then be mapped to the unmapped V12346 dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345")); assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna)); // 2 protein mappings made assertEquals(2, protein.getCodonFrames().size()); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size()); assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size()); // one mapping for each of the cDNA sequences assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size()); assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size()); // V12345 mapped to A22222 AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0)) .get(0); assertEquals(1, acf.getdnaSeqs().length); assertEquals(cdna.getSequenceAt(1).getDatasetSequence(), acf.getdnaSeqs()[0]); // V12346 mapped to A11111 acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0); assertEquals(1, acf.getdnaSeqs().length); assertEquals(cdna.getSequenceAt(0).getDatasetSequence(), acf.getdnaSeqs()[0]); } /** * Test the method that shows or hides sequence annotations by type(s) and * selection group. */ @Test(groups = { "Functional" }) public void testShowOrHideSequenceAnnotations() { SequenceI seq1 = new Sequence("Seq1", "AAA"); SequenceI seq2 = new Sequence("Seq2", "BBB"); SequenceI seq3 = new Sequence("Seq3", "CCC"); Annotation[] anns = new Annotation[] { new Annotation(2f) }; AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1", anns); ann1.setSequenceRef(seq1); AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2", anns); ann2.setSequenceRef(seq2); AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3", anns); AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns); ann4.setSequenceRef(seq1); AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns); ann5.setSequenceRef(seq2); AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns); AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 }); al.addAnnotation(ann1); // Structure for Seq1 al.addAnnotation(ann2); // Structure for Seq2 al.addAnnotation(ann3); // Structure for no sequence al.addAnnotation(ann4); // Temp for seq1 al.addAnnotation(ann5); // Temp for seq2 al.addAnnotation(ann6); // Temp for no sequence List types = new ArrayList<>(); List scope = new ArrayList<>(); /* * Set all sequence related Structure to hidden (ann1, ann2) */ types.add("Structure"); AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false, false); assertFalse(ann1.visible); assertFalse(ann2.visible); assertTrue(ann3.visible); // not sequence-related, not affected assertTrue(ann4.visible); // not Structure, not affected assertTrue(ann5.visible); // " assertTrue(ann6.visible); // not sequence-related, not affected /* * Set Temp in {seq1, seq3} to hidden */ types.clear(); types.add("Temp"); scope.add(seq1); scope.add(seq3); AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false, false); assertFalse(ann1.visible); // unchanged assertFalse(ann2.visible); // unchanged assertTrue(ann3.visible); // not sequence-related, not affected assertFalse(ann4.visible); // Temp for seq1 hidden assertTrue(ann5.visible); // not in scope, not affected assertTrue(ann6.visible); // not sequence-related, not affected /* * Set Temp in all sequences to hidden */ types.clear(); types.add("Temp"); scope.add(seq1); scope.add(seq3); AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false, false); assertFalse(ann1.visible); // unchanged assertFalse(ann2.visible); // unchanged assertTrue(ann3.visible); // not sequence-related, not affected assertFalse(ann4.visible); // Temp for seq1 hidden assertFalse(ann5.visible); // Temp for seq2 hidden assertTrue(ann6.visible); // not sequence-related, not affected /* * Set all types in {seq1, seq3} to visible */ types.clear(); scope.clear(); scope.add(seq1); scope.add(seq3); AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true, true); assertTrue(ann1.visible); // Structure for seq1 set visible assertFalse(ann2.visible); // not in scope, unchanged assertTrue(ann3.visible); // not sequence-related, not affected assertTrue(ann4.visible); // Temp for seq1 set visible assertFalse(ann5.visible); // not in scope, unchanged assertTrue(ann6.visible); // not sequence-related, not affected /* * Set all types in all scope to hidden */ AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true, false); assertFalse(ann1.visible); assertFalse(ann2.visible); assertTrue(ann3.visible); // not sequence-related, not affected assertFalse(ann4.visible); assertFalse(ann5.visible); assertTrue(ann6.visible); // not sequence-related, not affected } /** * Tests for the method that checks if one sequence cross-references another */ @Test(groups = { "Functional" }) public void testHasCrossRef() { assertFalse(AlignmentUtils.hasCrossRef(null, null)); SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF"); assertFalse(AlignmentUtils.hasCrossRef(seq1, null)); assertFalse(AlignmentUtils.hasCrossRef(null, seq1)); SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF"); assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2)); // different ref seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193")); assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2)); // case-insensitive; version number is ignored seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192")); assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2)); // right case! seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192")); assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2)); // test is one-way only assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1)); } /** * Tests for the method that checks if either sequence cross-references the * other */ @Test(groups = { "Functional" }) public void testHaveCrossRef() { assertFalse(AlignmentUtils.hasCrossRef(null, null)); SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF"); assertFalse(AlignmentUtils.haveCrossRef(seq1, null)); assertFalse(AlignmentUtils.haveCrossRef(null, seq1)); SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF"); assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2)); seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192")); assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2)); // next is true for haveCrossRef, false for hasCrossRef assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1)); // now the other way round seq1.setDBRefs(null); seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345")); assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2)); assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1)); // now both ways seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192")); assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2)); assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1)); } /** * Test the method that extracts the cds-only part of a dna alignment. */ @Test(groups = { "Functional" }) public void testMakeCdsAlignment() { /* * scenario: * dna1 --> [4, 6] [10,12] --> pep1 * dna2 --> [1, 3] [7, 9] [13,15] --> pep2 */ SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa"); SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC"); SequenceI pep1 = new Sequence("pep1", "GF"); SequenceI pep2 = new Sequence("pep2", "GFP"); pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "pep1")); pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "pep2")); dna1.createDatasetSequence(); dna2.createDatasetSequence(); pep1.createDatasetSequence(); pep2.createDatasetSequence(); AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 }); dna.setDataset(null); /* * put a variant feature on dna2 base 8 * - should transfer to cds2 base 5 */ dna2.addSequenceFeature(new SequenceFeature("variant", "hgmd", 8, 8, 0f, null)); /* * need a sourceDbRef if we are to construct dbrefs to the CDS * sequence from the dna contig sequences */ DBRefEntry dbref = new DBRefEntry("ENSEMBL", "0", "dna1"); dna1.getDatasetSequence().addDBRef(dbref); org.testng.Assert.assertEquals(dbref, dna1.getPrimaryDBRefs().get(0)); dbref = new DBRefEntry("ENSEMBL", "0", "dna2"); dna2.getDatasetSequence().addDBRef(dbref); org.testng.Assert.assertEquals(dbref, dna2.getPrimaryDBRefs().get(0)); /* * CDS sequences are 'discovered' from dna-to-protein mappings on the alignment * dataset (e.g. added from dbrefs by CrossRef.findXrefSequences) */ MapList mapfordna1 = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1, 2 }, 3, 1); AlignedCodonFrame acf = new AlignedCodonFrame(); acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), mapfordna1); dna.addCodonFrame(acf); MapList mapfordna2 = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 }, 3, 1); acf = new AlignedCodonFrame(); acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), mapfordna2); dna.addCodonFrame(acf); /* * In this case, mappings originally came from matching Uniprot accessions * - so need an xref on dna involving those regions. * These are normally constructed from CDS annotation */ DBRefEntry dna1xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep1", new Mapping(mapfordna1)); dna1.addDBRef(dna1xref); assertEquals(2, dna1.getDBRefs().length); // to self and to pep1 DBRefEntry dna2xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep2", new Mapping(mapfordna2)); dna2.addDBRef(dna2xref); assertEquals(2, dna2.getDBRefs().length); // to self and to pep2 /* * execute method under test: */ AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] { dna1, dna2 }, dna.getDataset(), null); /* * verify cds sequences */ assertEquals(2, cds.getSequences().size()); assertEquals("GGGTTT", cds.getSequenceAt(0).getSequenceAsString()); assertEquals("GGGTTTCCC", cds.getSequenceAt(1).getSequenceAsString()); /* * verify shared, extended alignment dataset */ assertSame(dna.getDataset(), cds.getDataset()); SequenceI cds1Dss = cds.getSequenceAt(0).getDatasetSequence(); SequenceI cds2Dss = cds.getSequenceAt(1).getDatasetSequence(); assertTrue(dna.getDataset().getSequences().contains(cds1Dss)); assertTrue(dna.getDataset().getSequences().contains(cds2Dss)); /* * verify CDS has a dbref with mapping to peptide */ assertNotNull(cds1Dss.getDBRefs()); assertEquals(2, cds1Dss.getDBRefs().length); dbref = cds1Dss.getDBRefs()[0]; assertEquals(dna1xref.getSource(), dbref.getSource()); // version is via ensembl's primary ref assertEquals(dna1xref.getVersion(), dbref.getVersion()); assertEquals(dna1xref.getAccessionId(), dbref.getAccessionId()); assertNotNull(dbref.getMap()); assertSame(pep1.getDatasetSequence(), dbref.getMap().getTo()); MapList cdsMapping = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1); assertEquals(cdsMapping, dbref.getMap().getMap()); /* * verify peptide has added a dbref with reverse mapping to CDS */ assertNotNull(pep1.getDBRefs()); // FIXME pep1.getDBRefs() is 1 - is that the correct behaviour ? assertEquals(2, pep1.getDBRefs().length); dbref = pep1.getDBRefs()[1]; assertEquals("ENSEMBL", dbref.getSource()); assertEquals("0", dbref.getVersion()); assertEquals("CDS|dna1", dbref.getAccessionId()); assertNotNull(dbref.getMap()); assertSame(cds1Dss, dbref.getMap().getTo()); assertEquals(cdsMapping.getInverse(), dbref.getMap().getMap()); /* * verify cDNA has added a dbref with mapping to CDS */ assertEquals(3, dna1.getDBRefs().length); DBRefEntry dbRefEntry = dna1.getDBRefs()[2]; assertSame(cds1Dss, dbRefEntry.getMap().getTo()); MapList dnaToCdsMapping = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1, 6 }, 1, 1); assertEquals(dnaToCdsMapping, dbRefEntry.getMap().getMap()); assertEquals(3, dna2.getDBRefs().length); dbRefEntry = dna2.getDBRefs()[2]; assertSame(cds2Dss, dbRefEntry.getMap().getTo()); dnaToCdsMapping = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 9 }, 1, 1); assertEquals(dnaToCdsMapping, dbRefEntry.getMap().getMap()); /* * verify CDS has added a dbref with mapping to cDNA */ assertEquals(2, cds1Dss.getDBRefs().length); dbRefEntry = cds1Dss.getDBRefs()[1]; assertSame(dna1.getDatasetSequence(), dbRefEntry.getMap().getTo()); MapList cdsToDnaMapping = new MapList(new int[] { 1, 6 }, new int[] { 4, 6, 10, 12 }, 1, 1); assertEquals(cdsToDnaMapping, dbRefEntry.getMap().getMap()); assertEquals(2, cds2Dss.getDBRefs().length); dbRefEntry = cds2Dss.getDBRefs()[1]; assertSame(dna2.getDatasetSequence(), dbRefEntry.getMap().getTo()); cdsToDnaMapping = new MapList(new int[] { 1, 9 }, new int[] { 1, 3, 7, 9, 13, 15 }, 1, 1); assertEquals(cdsToDnaMapping, dbRefEntry.getMap().getMap()); /* * Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide * the mappings are on the shared alignment dataset * 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep) */ List cdsMappings = cds.getDataset().getCodonFrames(); assertEquals(6, cdsMappings.size()); /* * verify that mapping sets for dna and cds alignments are different * [not current behaviour - all mappings are on the alignment dataset] */ // select -> subselect type to test. // Assert.assertNotSame(dna.getCodonFrames(), cds.getCodonFrames()); // assertEquals(4, dna.getCodonFrames().size()); // assertEquals(4, cds.getCodonFrames().size()); /* * Two mappings involve pep1 (dna to pep1, cds to pep1) * Mapping from pep1 to GGGTTT in first new exon sequence */ List pep1Mappings = MappingUtils .findMappingsForSequence(pep1, cdsMappings); assertEquals(2, pep1Mappings.size()); List mappings = MappingUtils .findMappingsForSequence(cds.getSequenceAt(0), pep1Mappings); assertEquals(1, mappings.size()); // map G to GGG SearchResultsI sr = MappingUtils.buildSearchResults(pep1, 1, mappings); assertEquals(1, sr.getResults().size()); SearchResultMatchI m = sr.getResults().get(0); assertSame(cds1Dss, m.getSequence()); assertEquals(1, m.getStart()); assertEquals(3, m.getEnd()); // map F to TTT sr = MappingUtils.buildSearchResults(pep1, 2, mappings); m = sr.getResults().get(0); assertSame(cds1Dss, m.getSequence()); assertEquals(4, m.getStart()); assertEquals(6, m.getEnd()); /* * Two mappings involve pep2 (dna to pep2, cds to pep2) * Verify mapping from pep2 to GGGTTTCCC in second new exon sequence */ List pep2Mappings = MappingUtils .findMappingsForSequence(pep2, cdsMappings); assertEquals(2, pep2Mappings.size()); mappings = MappingUtils.findMappingsForSequence(cds.getSequenceAt(1), pep2Mappings); assertEquals(1, mappings.size()); // map G to GGG sr = MappingUtils.buildSearchResults(pep2, 1, mappings); assertEquals(1, sr.getResults().size()); m = sr.getResults().get(0); assertSame(cds2Dss, m.getSequence()); assertEquals(1, m.getStart()); assertEquals(3, m.getEnd()); // map F to TTT sr = MappingUtils.buildSearchResults(pep2, 2, mappings); m = sr.getResults().get(0); assertSame(cds2Dss, m.getSequence()); assertEquals(4, m.getStart()); assertEquals(6, m.getEnd()); // map P to CCC sr = MappingUtils.buildSearchResults(pep2, 3, mappings); m = sr.getResults().get(0); assertSame(cds2Dss, m.getSequence()); assertEquals(7, m.getStart()); assertEquals(9, m.getEnd()); /* * check cds2 acquired a variant feature in position 5 */ List sfs = cds2Dss.getSequenceFeatures(); assertNotNull(sfs); assertEquals(1, sfs.size()); assertEquals("variant", sfs.get(0).type); assertEquals(5, sfs.get(0).begin); assertEquals(5, sfs.get(0).end); } /** * Test the method that makes a cds-only alignment from a DNA sequence and its * product mappings, for the case where there are multiple exon mappings to * different protein products. */ @Test(groups = { "Functional" }) public void testMakeCdsAlignment_multipleProteins() { SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa"); SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT dna1.createDatasetSequence(); pep1.createDatasetSequence(); pep2.createDatasetSequence(); pep3.createDatasetSequence(); pep1.getDatasetSequence().addDBRef( new DBRefEntry("EMBLCDS", "2", "A12345")); pep2.getDatasetSequence().addDBRef( new DBRefEntry("EMBLCDS", "3", "A12346")); pep3.getDatasetSequence().addDBRef( new DBRefEntry("EMBLCDS", "4", "A12347")); /* * Create the CDS alignment */ AlignmentI dna = new Alignment(new SequenceI[] { dna1 }); dna.setDataset(null); /* * Make the mappings from dna to protein */ // map ...GGG...TTT to GF MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1, 2 }, 3, 1); AlignedCodonFrame acf = new AlignedCodonFrame(); acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map); dna.addCodonFrame(acf); // map aaa...ccc to KP map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1); acf = new AlignedCodonFrame(); acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map); dna.addCodonFrame(acf); // map aaa......TTT to KF map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1); acf = new AlignedCodonFrame(); acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map); dna.addCodonFrame(acf); /* * execute method under test */ AlignmentI cdsal = AlignmentUtils.makeCdsAlignment( new SequenceI[] { dna1 }, dna.getDataset(), null); /* * Verify we have 3 cds sequences, mapped to pep1/2/3 respectively */ List cds = cdsal.getSequences(); assertEquals(3, cds.size()); /* * verify shared, extended alignment dataset */ assertSame(cdsal.getDataset(), dna.getDataset()); assertTrue(dna.getDataset().getSequences() .contains(cds.get(0).getDatasetSequence())); assertTrue(dna.getDataset().getSequences() .contains(cds.get(1).getDatasetSequence())); assertTrue(dna.getDataset().getSequences() .contains(cds.get(2).getDatasetSequence())); /* * verify aligned cds sequences and their xrefs */ SequenceI cdsSeq = cds.get(0); assertEquals("GGGTTT", cdsSeq.getSequenceAsString()); // assertEquals("dna1|A12345", cdsSeq.getName()); assertEquals("CDS|dna1", cdsSeq.getName()); // assertEquals(1, cdsSeq.getDBRefs().length); // DBRefEntry cdsRef = cdsSeq.getDBRefs()[0]; // assertEquals("EMBLCDS", cdsRef.getSource()); // assertEquals("2", cdsRef.getVersion()); // assertEquals("A12345", cdsRef.getAccessionId()); cdsSeq = cds.get(1); assertEquals("aaaccc", cdsSeq.getSequenceAsString()); // assertEquals("dna1|A12346", cdsSeq.getName()); assertEquals("CDS|dna1", cdsSeq.getName()); // assertEquals(1, cdsSeq.getDBRefs().length); // cdsRef = cdsSeq.getDBRefs()[0]; // assertEquals("EMBLCDS", cdsRef.getSource()); // assertEquals("3", cdsRef.getVersion()); // assertEquals("A12346", cdsRef.getAccessionId()); cdsSeq = cds.get(2); assertEquals("aaaTTT", cdsSeq.getSequenceAsString()); // assertEquals("dna1|A12347", cdsSeq.getName()); assertEquals("CDS|dna1", cdsSeq.getName()); // assertEquals(1, cdsSeq.getDBRefs().length); // cdsRef = cdsSeq.getDBRefs()[0]; // assertEquals("EMBLCDS", cdsRef.getSource()); // assertEquals("4", cdsRef.getVersion()); // assertEquals("A12347", cdsRef.getAccessionId()); /* * Verify there are mappings from each cds sequence to its protein product * and also to its dna source */ List newMappings = cdsal.getCodonFrames(); /* * 6 mappings involve dna1 (to pep1/2/3, cds1/2/3) */ List dnaMappings = MappingUtils .findMappingsForSequence(dna1, newMappings); assertEquals(6, dnaMappings.size()); /* * dna1 to pep1 */ List mappings = MappingUtils .findMappingsForSequence(pep1, dnaMappings); assertEquals(1, mappings.size()); assertEquals(1, mappings.get(0).getMappings().size()); assertSame(pep1.getDatasetSequence(), mappings.get(0).getMappings() .get(0).getMapping().getTo()); /* * dna1 to cds1 */ List dnaToCds1Mappings = MappingUtils .findMappingsForSequence(cds.get(0), dnaMappings); Mapping mapping = dnaToCds1Mappings.get(0).getMappings().get(0) .getMapping(); assertSame(cds.get(0).getDatasetSequence(), mapping.getTo()); assertEquals("G(1) in CDS should map to G(4) in DNA", 4, mapping .getMap().getToPosition(1)); /* * dna1 to pep2 */ mappings = MappingUtils.findMappingsForSequence(pep2, dnaMappings); assertEquals(1, mappings.size()); assertEquals(1, mappings.get(0).getMappings().size()); assertSame(pep2.getDatasetSequence(), mappings.get(0).getMappings() .get(0).getMapping().getTo()); /* * dna1 to cds2 */ List dnaToCds2Mappings = MappingUtils .findMappingsForSequence(cds.get(1), dnaMappings); mapping = dnaToCds2Mappings.get(0).getMappings().get(0).getMapping(); assertSame(cds.get(1).getDatasetSequence(), mapping.getTo()); assertEquals("c(4) in CDS should map to c(7) in DNA", 7, mapping .getMap().getToPosition(4)); /* * dna1 to pep3 */ mappings = MappingUtils.findMappingsForSequence(pep3, dnaMappings); assertEquals(1, mappings.size()); assertEquals(1, mappings.get(0).getMappings().size()); assertSame(pep3.getDatasetSequence(), mappings.get(0).getMappings() .get(0).getMapping().getTo()); /* * dna1 to cds3 */ List dnaToCds3Mappings = MappingUtils .findMappingsForSequence(cds.get(2), dnaMappings); mapping = dnaToCds3Mappings.get(0).getMappings().get(0).getMapping(); assertSame(cds.get(2).getDatasetSequence(), mapping.getTo()); assertEquals("T(4) in CDS should map to T(10) in DNA", 10, mapping .getMap().getToPosition(4)); } @Test(groups = { "Functional" }) public void testIsMappable() { SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT"); SequenceI aa1 = new Sequence("aa1", "RSG"); AlignmentI al1 = new Alignment(new SequenceI[] { dna1 }); AlignmentI al2 = new Alignment(new SequenceI[] { aa1 }); assertFalse(AlignmentUtils.isMappable(null, null)); assertFalse(AlignmentUtils.isMappable(al1, null)); assertFalse(AlignmentUtils.isMappable(null, al1)); assertFalse(AlignmentUtils.isMappable(al1, al1)); assertFalse(AlignmentUtils.isMappable(al2, al2)); assertTrue(AlignmentUtils.isMappable(al1, al2)); assertTrue(AlignmentUtils.isMappable(al2, al1)); } /** * Test creating a mapping when the sequences involved do not start at residue * 1 * * @throws IOException */ @Test(groups = { "Functional" }) public void testMapCdnaToProtein_forSubsequence() throws IOException { SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12); prot.createDatasetSequence(); SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48); dna.createDatasetSequence(); MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna); assertEquals(10, map.getToLowest()); assertEquals(12, map.getToHighest()); assertEquals(40, map.getFromLowest()); assertEquals(48, map.getFromHighest()); } /** * Test for the alignSequenceAs method where we have protein mapped to protein */ @Test(groups = { "Functional" }) public void testAlignSequenceAs_mappedProteinProtein() { SequenceI alignMe = new Sequence("Match", "MGAASEV"); alignMe.createDatasetSequence(); SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR"); alignFrom.createDatasetSequence(); AlignedCodonFrame acf = new AlignedCodonFrame(); // this is like a domain or motif match of part of a peptide sequence MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1); acf.addMap(alignFrom.getDatasetSequence(), alignMe.getDatasetSequence(), map); AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true, true); assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString()); } /** * Test for the alignSequenceAs method where there are trailing unmapped * residues in the model sequence */ @Test(groups = { "Functional" }) public void testAlignSequenceAs_withTrailingPeptide() { // map first 3 codons to KPF; G is a trailing unmapped residue MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1); checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map, "AAA---CCCTTT---"); } /** * Tests for transferring features between mapped sequences */ @Test(groups = { "Functional" }) public void testTransferFeatures() { SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt"); SequenceI cds = new Sequence("cds/10-15", "TAGGCC"); // no overlap dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f, null)); // partial overlap - to [1, 1] dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f, null)); // exact overlap - to [1, 3] dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f, null)); // spanning overlap - to [2, 5] dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f, null)); // exactly overlaps whole mapped range [1, 6] dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f, null)); // no overlap (internal) dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f, null)); // no overlap (3' end) dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15, 7f, null)); // overlap (3' end) - to [6, 6] dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12, 8f, null)); // extended overlap - to [6, +] dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13, 9f, null)); MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1, 6 }, 1, 1); /* * transferFeatures() will build 'partial overlap' for regions * that partially overlap 5' or 3' (start or end) of target sequence */ AlignmentUtils.transferFeatures(dna, cds, map, null); List sfs = cds.getSequenceFeatures(); assertEquals(6, sfs.size()); SequenceFeature sf = sfs.get(0); assertEquals("type2", sf.getType()); assertEquals("desc2", sf.getDescription()); assertEquals(2f, sf.getScore()); assertEquals(1, sf.getBegin()); assertEquals(1, sf.getEnd()); sf = sfs.get(1); assertEquals("type3", sf.getType()); assertEquals("desc3", sf.getDescription()); assertEquals(3f, sf.getScore()); assertEquals(1, sf.getBegin()); assertEquals(3, sf.getEnd()); sf = sfs.get(2); assertEquals("type4", sf.getType()); assertEquals(2, sf.getBegin()); assertEquals(5, sf.getEnd()); sf = sfs.get(3); assertEquals("type5", sf.getType()); assertEquals(1, sf.getBegin()); assertEquals(6, sf.getEnd()); sf = sfs.get(4); assertEquals("type8", sf.getType()); assertEquals(6, sf.getBegin()); assertEquals(6, sf.getEnd()); sf = sfs.get(5); assertEquals("type9", sf.getType()); assertEquals(6, sf.getBegin()); assertEquals(6, sf.getEnd()); } /** * Tests for transferring features between mapped sequences */ @Test(groups = { "Functional" }) public void testTransferFeatures_withOmit() { SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt"); SequenceI cds = new Sequence("cds/10-15", "TAGGCC"); MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1, 6 }, 1, 1); // [5, 11] maps to [2, 5] dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f, null)); // [4, 12] maps to [1, 6] dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f, null)); // [12, 12] maps to [6, 6] dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12, 8f, null)); // desc4 and desc8 are the 'omit these' varargs AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8"); List sfs = cds.getSequenceFeatures(); assertEquals(1, sfs.size()); SequenceFeature sf = sfs.get(0); assertEquals("type5", sf.getType()); assertEquals(1, sf.getBegin()); assertEquals(6, sf.getEnd()); } /** * Tests for transferring features between mapped sequences */ @Test(groups = { "Functional" }) public void testTransferFeatures_withSelect() { SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt"); SequenceI cds = new Sequence("cds/10-15", "TAGGCC"); MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1, 6 }, 1, 1); // [5, 11] maps to [2, 5] dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f, null)); // [4, 12] maps to [1, 6] dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f, null)); // [12, 12] maps to [6, 6] dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12, 8f, null)); // "type5" is the 'select this type' argument AlignmentUtils.transferFeatures(dna, cds, map, "type5"); List sfs = cds.getSequenceFeatures(); assertEquals(1, sfs.size()); SequenceFeature sf = sfs.get(0); assertEquals("type5", sf.getType()); assertEquals(1, sf.getBegin()); assertEquals(6, sf.getEnd()); } /** * Test the method that extracts the cds-only part of a dna alignment, for the * case where the cds should be aligned to match its nucleotide sequence. */ @Test(groups = { "Functional" }) public void testMakeCdsAlignment_alternativeTranscripts() { SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG"); // alternative transcript of same dna skips CCC codon SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG"); // dna3 has no mapping (protein product) so should be ignored here SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG"); SequenceI pep1 = new Sequence("pep1", "GPFG"); SequenceI pep2 = new Sequence("pep2", "GPG"); dna1.createDatasetSequence(); dna2.createDatasetSequence(); dna3.createDatasetSequence(); pep1.createDatasetSequence(); pep2.createDatasetSequence(); AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 }); dna.setDataset(null); MapList map = new MapList(new int[] { 4, 12, 16, 18 }, new int[] { 1, 4 }, 3, 1); AlignedCodonFrame acf = new AlignedCodonFrame(); acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map); dna.addCodonFrame(acf); map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 }, new int[] { 1, 3 }, 3, 1); acf = new AlignedCodonFrame(); acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map); dna.addCodonFrame(acf); AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] { dna1, dna2, dna3 }, dna.getDataset(), null); List cdsSeqs = cds.getSequences(); assertEquals(2, cdsSeqs.size()); assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString()); assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString()); /* * verify shared, extended alignment dataset */ assertSame(dna.getDataset(), cds.getDataset()); assertTrue(dna.getDataset().getSequences() .contains(cdsSeqs.get(0).getDatasetSequence())); assertTrue(dna.getDataset().getSequences() .contains(cdsSeqs.get(1).getDatasetSequence())); /* * Verify 6 mappings: dna1 to cds1, cds1 to pep1, dna1 to pep1 * and the same for dna2/cds2/pep2 */ List mappings = cds.getCodonFrames(); assertEquals(6, mappings.size()); /* * 2 mappings involve pep1 */ List pep1Mappings = MappingUtils .findMappingsForSequence(pep1, mappings); assertEquals(2, pep1Mappings.size()); /* * Get mapping of pep1 to cds1 and verify it * maps GPFG to 1-3,4-6,7-9,10-12 */ List pep1CdsMappings = MappingUtils .findMappingsForSequence(cds.getSequenceAt(0), pep1Mappings); assertEquals(1, pep1CdsMappings.size()); SearchResultsI sr = MappingUtils.buildSearchResults(pep1, 1, pep1CdsMappings); assertEquals(1, sr.getResults().size()); SearchResultMatchI m = sr.getResults().get(0); assertEquals(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence()); assertEquals(1, m.getStart()); assertEquals(3, m.getEnd()); sr = MappingUtils.buildSearchResults(pep1, 2, pep1CdsMappings); m = sr.getResults().get(0); assertEquals(4, m.getStart()); assertEquals(6, m.getEnd()); sr = MappingUtils.buildSearchResults(pep1, 3, pep1CdsMappings); m = sr.getResults().get(0); assertEquals(7, m.getStart()); assertEquals(9, m.getEnd()); sr = MappingUtils.buildSearchResults(pep1, 4, pep1CdsMappings); m = sr.getResults().get(0); assertEquals(10, m.getStart()); assertEquals(12, m.getEnd()); /* * Get mapping of pep2 to cds2 and verify it * maps GPG in pep2 to 1-3,4-6,7-9 in second CDS sequence */ List pep2Mappings = MappingUtils .findMappingsForSequence(pep2, mappings); assertEquals(2, pep2Mappings.size()); List pep2CdsMappings = MappingUtils .findMappingsForSequence(cds.getSequenceAt(1), pep2Mappings); assertEquals(1, pep2CdsMappings.size()); sr = MappingUtils.buildSearchResults(pep2, 1, pep2CdsMappings); assertEquals(1, sr.getResults().size()); m = sr.getResults().get(0); assertEquals(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence()); assertEquals(1, m.getStart()); assertEquals(3, m.getEnd()); sr = MappingUtils.buildSearchResults(pep2, 2, pep2CdsMappings); m = sr.getResults().get(0); assertEquals(4, m.getStart()); assertEquals(6, m.getEnd()); sr = MappingUtils.buildSearchResults(pep2, 3, pep2CdsMappings); m = sr.getResults().get(0); assertEquals(7, m.getStart()); assertEquals(9, m.getEnd()); } /** * Test the method that realigns protein to match mapped codon alignment. */ @Test(groups = { "Functional" }) public void testAlignProteinAsDna_incompleteStartCodon() { // seq1: incomplete start codon (not mapped), then [3, 11] SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-"); // seq2 codons are [4, 5], [8, 11] SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-"); // seq3 incomplete start codon at 'tt' SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-"); AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 }); dna.setDataset(null); // prot1 has 'X' for incomplete start codon (not mapped) SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start SequenceI prot2 = new Sequence("Seq2", "NG"); SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2, prot3 }); protein.setDataset(null); // map dna1 [3, 11] to prot1 [2, 4] KFG MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1); AlignedCodonFrame acf = new AlignedCodonFrame(); acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map); // map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1); acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map); // map dna3 [9, 11] to prot3 [2, 2] G map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1); acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map); ArrayList acfs = new ArrayList<>(); acfs.add(acf); protein.setCodonFrames(acfs); /* * verify X is included in the aligned proteins, and placed just * before the first mapped residue * CCT is between CCC and TTT */ AlignmentUtils.alignProteinAsDna(protein, dna); assertEquals("XK-FG", prot1.getSequenceAsString()); assertEquals("--N-G", prot2.getSequenceAsString()); assertEquals("---XG", prot3.getSequenceAsString()); } /** * Tests for the method that maps the subset of a dna sequence that has CDS * (or subtype) feature - case where the start codon is incomplete. */ @Test(groups = "Functional") public void testFindCdsPositions_fivePrimeIncomplete() { SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt"); dnaSeq.createDatasetSequence(); SequenceI ds = dnaSeq.getDatasetSequence(); // CDS for dna 5-6 (incomplete codon), 7-9 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null); sf.setPhase("2"); // skip 2 bases to start of next codon ds.addSequenceFeature(sf); // CDS for dna 13-15 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null); ds.addSequenceFeature(sf); List ranges = AlignmentUtils.findCdsPositions(dnaSeq); /* * check the mapping starts with the first complete codon */ assertEquals(6, MappingUtils.getLength(ranges)); assertEquals(2, ranges.size()); assertEquals(7, ranges.get(0)[0]); assertEquals(9, ranges.get(0)[1]); assertEquals(13, ranges.get(1)[0]); assertEquals(15, ranges.get(1)[1]); } /** * Tests for the method that maps the subset of a dna sequence that has CDS * (or subtype) feature. */ @Test(groups = "Functional") public void testFindCdsPositions() { SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt"); dnaSeq.createDatasetSequence(); SequenceI ds = dnaSeq.getDatasetSequence(); // CDS for dna 10-12 SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null); sf.setStrand("+"); ds.addSequenceFeature(sf); // CDS for dna 4-6 sf = new SequenceFeature("CDS", "", 4, 6, 0f, null); sf.setStrand("+"); ds.addSequenceFeature(sf); // exon feature should be ignored here sf = new SequenceFeature("exon", "", 7, 9, 0f, null); ds.addSequenceFeature(sf); List ranges = AlignmentUtils.findCdsPositions(dnaSeq); /* * verify ranges { [4-6], [12-10] } * note CDS ranges are ordered ascending even if the CDS * features are not */ assertEquals(6, MappingUtils.getLength(ranges)); assertEquals(2, ranges.size()); assertEquals(4, ranges.get(0)[0]); assertEquals(6, ranges.get(0)[1]); assertEquals(10, ranges.get(1)[0]); assertEquals(12, ranges.get(1)[1]); } /** * Test the method that computes a map of codon variants for each protein * position from "sequence_variant" features on dna */ @Test(groups = "Functional") public void testBuildDnaVariantsMap() { SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag"); MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1); /* * first with no variants on dna */ LinkedHashMap[]> variantsMap = AlignmentUtils .buildDnaVariantsMap(dna, map); assertTrue(variantsMap.isEmpty()); /* * single allele codon 1, on base 1 */ SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1, 0f, null); sf1.setValue("alleles", "T"); sf1.setValue("ID", "sequence_variant:rs758803211"); dna.addSequenceFeature(sf1); /* * two alleles codon 2, on bases 2 and 3 (distinct variants) */ SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 5, 5, 0f, null); sf2.setValue("alleles", "T"); sf2.setValue("ID", "sequence_variant:rs758803212"); dna.addSequenceFeature(sf2); SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 6, 6, 0f, null); sf3.setValue("alleles", "G"); sf3.setValue("ID", "sequence_variant:rs758803213"); dna.addSequenceFeature(sf3); /* * two alleles codon 3, both on base 2 (one variant) */ SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 8, 8, 0f, null); sf4.setValue("alleles", "C, G"); sf4.setValue("ID", "sequence_variant:rs758803214"); dna.addSequenceFeature(sf4); // no alleles on codon 4 /* * alleles on codon 5 on all 3 bases (distinct variants) */ SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 13, 13, 0f, null); sf5.setValue("alleles", "C, G"); // (C duplicates given base value) sf5.setValue("ID", "sequence_variant:rs758803215"); dna.addSequenceFeature(sf5); SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 14, 14, 0f, null); sf6.setValue("alleles", "g, a"); // should force to upper-case sf6.setValue("ID", "sequence_variant:rs758803216"); dna.addSequenceFeature(sf6); SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 15, 15, 0f, null); sf7.setValue("alleles", "A, T"); sf7.setValue("ID", "sequence_variant:rs758803217"); dna.addSequenceFeature(sf7); /* * build map - expect variants on positions 1, 2, 3, 5 */ variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map); assertEquals(4, variantsMap.size()); /* * protein residue 1: variant on codon (ATG) base 1, not on 2 or 3 */ List[] pep1Variants = variantsMap.get(1); assertEquals(3, pep1Variants.length); assertEquals(1, pep1Variants[0].size()); assertEquals("A", pep1Variants[0].get(0).base); // codon[1] base assertSame(sf1, pep1Variants[0].get(0).variant); // codon[1] variant assertEquals(1, pep1Variants[1].size()); assertEquals("T", pep1Variants[1].get(0).base); // codon[2] base assertNull(pep1Variants[1].get(0).variant); // no variant here assertEquals(1, pep1Variants[2].size()); assertEquals("G", pep1Variants[2].get(0).base); // codon[3] base assertNull(pep1Variants[2].get(0).variant); // no variant here /* * protein residue 2: variants on codon (AAA) bases 2 and 3 */ List[] pep2Variants = variantsMap.get(2); assertEquals(3, pep2Variants.length); assertEquals(1, pep2Variants[0].size()); // codon[1] base recorded while processing variant on codon[2] assertEquals("A", pep2Variants[0].get(0).base); assertNull(pep2Variants[0].get(0).variant); // no variant here // codon[2] base and variant: assertEquals(1, pep2Variants[1].size()); assertEquals("A", pep2Variants[1].get(0).base); assertSame(sf2, pep2Variants[1].get(0).variant); // codon[3] base was recorded when processing codon[2] variant // and then the variant for codon[3] added to it assertEquals(1, pep2Variants[2].size()); assertEquals("A", pep2Variants[2].get(0).base); assertSame(sf3, pep2Variants[2].get(0).variant); /* * protein residue 3: variants on codon (TTT) base 2 only */ List[] pep3Variants = variantsMap.get(3); assertEquals(3, pep3Variants.length); assertEquals(1, pep3Variants[0].size()); assertEquals("T", pep3Variants[0].get(0).base); // codon[1] base assertNull(pep3Variants[0].get(0).variant); // no variant here assertEquals(1, pep3Variants[1].size()); assertEquals("T", pep3Variants[1].get(0).base); // codon[2] base assertSame(sf4, pep3Variants[1].get(0).variant); // codon[2] variant assertEquals(1, pep3Variants[2].size()); assertEquals("T", pep3Variants[2].get(0).base); // codon[3] base assertNull(pep3Variants[2].get(0).variant); // no variant here /* * three variants on protein position 5 */ List[] pep5Variants = variantsMap.get(5); assertEquals(3, pep5Variants.length); assertEquals(1, pep5Variants[0].size()); assertEquals("C", pep5Variants[0].get(0).base); // codon[1] base assertSame(sf5, pep5Variants[0].get(0).variant); // codon[1] variant assertEquals(1, pep5Variants[1].size()); assertEquals("C", pep5Variants[1].get(0).base); // codon[2] base assertSame(sf6, pep5Variants[1].get(0).variant); // codon[2] variant assertEquals(1, pep5Variants[2].size()); assertEquals("C", pep5Variants[2].get(0).base); // codon[3] base assertSame(sf7, pep5Variants[2].get(0).variant); // codon[3] variant } /** * Tests for the method that computes all peptide variants given codon * variants */ @Test(groups = "Functional") public void testComputePeptideVariants() { /* * scenario: AAATTTCCC codes for KFP * variants: * GAA -> E source: Ensembl * CAA -> Q source: dbSNP * TAA -> STOP source: dnSNP * AAG synonymous source: COSMIC * AAT -> N source: Ensembl * ...TTC synonymous source: dbSNP * ......CAC,CGC -> H,R source: COSMIC * (one variant with two alleles) */ SequenceI peptide = new Sequence("pep/10-12", "KFP"); /* * two distinct variants for codon 1 position 1 * second one has clinical significance */ String ensembl = "Ensembl"; String dbSnp = "dbSNP"; String cosmic = "COSMIC"; SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1, 0f, ensembl); sf1.setValue("alleles", "A,G"); // AAA -> GAA -> K/E sf1.setValue("ID", "var1.125A>G"); SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1, 0f, dbSnp); sf2.setValue("alleles", "A,C"); // AAA -> CAA -> K/Q sf2.setValue("ID", "var2"); sf2.setValue("clinical_significance", "Dodgy"); SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 1, 1, 0f, dbSnp); sf3.setValue("alleles", "A,T"); // AAA -> TAA -> stop codon sf3.setValue("ID", "var3"); sf3.setValue("clinical_significance", "Bad"); SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3, 0f, cosmic); sf4.setValue("alleles", "A,G"); // AAA -> AAG synonymous sf4.setValue("ID", "var4"); sf4.setValue("clinical_significance", "None"); SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 3, 3, 0f, ensembl); sf5.setValue("alleles", "A,T"); // AAA -> AAT -> K/N sf5.setValue("ID", "sequence_variant:var5"); // prefix gets stripped off sf5.setValue("clinical_significance", "Benign"); SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 6, 6, 0f, dbSnp); sf6.setValue("alleles", "T,C"); // TTT -> TTC synonymous sf6.setValue("ID", "var6"); SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 8, 8, 0f, cosmic); sf7.setValue("alleles", "C,A,G"); // CCC -> CAC,CGC -> P/H/R sf7.setValue("ID", "var7"); sf7.setValue("clinical_significance", "Good"); List codon1Variants = new ArrayList<>(); List codon2Variants = new ArrayList<>(); List codon3Variants = new ArrayList<>(); List codonVariants[] = new ArrayList[3]; codonVariants[0] = codon1Variants; codonVariants[1] = codon2Variants; codonVariants[2] = codon3Variants; /* * compute variants for protein position 1 */ codon1Variants.add(new DnaVariant("A", sf1)); codon1Variants.add(new DnaVariant("A", sf2)); codon1Variants.add(new DnaVariant("A", sf3)); codon2Variants.add(new DnaVariant("A")); // codon2Variants.add(new DnaVariant("A")); codon3Variants.add(new DnaVariant("A", sf4)); codon3Variants.add(new DnaVariant("A", sf5)); AlignmentUtils.computePeptideVariants(peptide, 1, codonVariants); /* * compute variants for protein position 2 */ codon1Variants.clear(); codon2Variants.clear(); codon3Variants.clear(); codon1Variants.add(new DnaVariant("T")); codon2Variants.add(new DnaVariant("T")); codon3Variants.add(new DnaVariant("T", sf6)); AlignmentUtils.computePeptideVariants(peptide, 2, codonVariants); /* * compute variants for protein position 3 */ codon1Variants.clear(); codon2Variants.clear(); codon3Variants.clear(); codon1Variants.add(new DnaVariant("C")); codon2Variants.add(new DnaVariant("C", sf7)); codon3Variants.add(new DnaVariant("C")); AlignmentUtils.computePeptideVariants(peptide, 3, codonVariants); /* * verify added sequence features for * var1 K -> E Ensembl * var2 K -> Q dbSNP * var3 K -> stop * var4 synonymous * var5 K -> N Ensembl * var6 synonymous * var7 P -> H COSMIC * var8 P -> R COSMIC */ List sfs = peptide.getSequenceFeatures(); SequenceFeatures.sortFeatures(sfs, true); assertEquals(8, sfs.size()); /* * features are sorted by start position ascending, but in no * particular order where start positions match; asserts here * simply match the data returned (the order is not important) */ // AAA -> AAT -> K/N SequenceFeature sf = sfs.get(0); assertEquals(1, sf.getBegin()); assertEquals(1, sf.getEnd()); assertEquals("nonsynonymous_variant", sf.getType()); assertEquals("p.Lys1Asn", sf.getDescription()); assertEquals("var5", sf.getValue("ID")); assertEquals("Benign", sf.getValue("clinical_significance")); assertEquals("ID=var5;clinical_significance=Benign", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( "p.Lys1Asn var5|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var5", sf.links.get(0)); assertEquals(ensembl, sf.getFeatureGroup()); // AAA -> CAA -> K/Q sf = sfs.get(1); assertEquals(1, sf.getBegin()); assertEquals(1, sf.getEnd()); assertEquals("nonsynonymous_variant", sf.getType()); assertEquals("p.Lys1Gln", sf.getDescription()); assertEquals("var2", sf.getValue("ID")); assertEquals("Dodgy", sf.getValue("clinical_significance")); assertEquals("ID=var2;clinical_significance=Dodgy", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( "p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2", sf.links.get(0)); assertEquals(dbSnp, sf.getFeatureGroup()); // AAA -> GAA -> K/E sf = sfs.get(2); assertEquals(1, sf.getBegin()); assertEquals(1, sf.getEnd()); assertEquals("nonsynonymous_variant", sf.getType()); assertEquals("p.Lys1Glu", sf.getDescription()); assertEquals("var1.125A>G", sf.getValue("ID")); assertNull(sf.getValue("clinical_significance")); assertEquals("ID=var1.125A>G", sf.getAttributes()); assertEquals(1, sf.links.size()); // link to variation is urlencoded assertEquals( "p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG", sf.links.get(0)); assertEquals(ensembl, sf.getFeatureGroup()); // AAA -> TAA -> stop codon sf = sfs.get(3); assertEquals(1, sf.getBegin()); assertEquals(1, sf.getEnd()); assertEquals("stop_gained", sf.getType()); assertEquals("Aaa/Taa", sf.getDescription()); assertEquals("var3", sf.getValue("ID")); assertEquals("Bad", sf.getValue("clinical_significance")); assertEquals("ID=var3;clinical_significance=Bad", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( "Aaa/Taa var3|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var3", sf.links.get(0)); assertEquals(dbSnp, sf.getFeatureGroup()); // AAA -> AAG synonymous sf = sfs.get(4); assertEquals(1, sf.getBegin()); assertEquals(1, sf.getEnd()); assertEquals("synonymous_variant", sf.getType()); assertEquals("aaA/aaG", sf.getDescription()); assertEquals("var4", sf.getValue("ID")); assertEquals("None", sf.getValue("clinical_significance")); assertEquals("ID=var4;clinical_significance=None", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( "aaA/aaG var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4", sf.links.get(0)); assertEquals(cosmic, sf.getFeatureGroup()); // TTT -> TTC synonymous sf = sfs.get(5); assertEquals(2, sf.getBegin()); assertEquals(2, sf.getEnd()); assertEquals("synonymous_variant", sf.getType()); assertEquals("ttT/ttC", sf.getDescription()); assertEquals("var6", sf.getValue("ID")); assertNull(sf.getValue("clinical_significance")); assertEquals("ID=var6", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( "ttT/ttC var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6", sf.links.get(0)); assertEquals(dbSnp, sf.getFeatureGroup()); // var7 generates two distinct protein variant features (two alleles) // CCC -> CGC -> P/R sf = sfs.get(6); assertEquals(3, sf.getBegin()); assertEquals(3, sf.getEnd()); assertEquals("nonsynonymous_variant", sf.getType()); assertEquals("p.Pro3Arg", sf.getDescription()); assertEquals("var7", sf.getValue("ID")); assertEquals("Good", sf.getValue("clinical_significance")); assertEquals("ID=var7;clinical_significance=Good", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( "p.Pro3Arg var7|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var7", sf.links.get(0)); assertEquals(cosmic, sf.getFeatureGroup()); // CCC -> CAC -> P/H sf = sfs.get(7); assertEquals(3, sf.getBegin()); assertEquals(3, sf.getEnd()); assertEquals("nonsynonymous_variant", sf.getType()); assertEquals("p.Pro3His", sf.getDescription()); assertEquals("var7", sf.getValue("ID")); assertEquals("Good", sf.getValue("clinical_significance")); assertEquals("ID=var7;clinical_significance=Good", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( "p.Pro3His var7|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var7", sf.links.get(0)); assertEquals(cosmic, sf.getFeatureGroup()); } /** * Tests for the method that maps the subset of a dna sequence that has CDS * (or subtype) feature, with CDS strand = '-' (reverse) */ // test turned off as currently findCdsPositions is not strand-dependent // left in case it comes around again... @Test(groups = "Functional", enabled = false) public void testFindCdsPositions_reverseStrand() { SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt"); dnaSeq.createDatasetSequence(); SequenceI ds = dnaSeq.getDatasetSequence(); // CDS for dna 4-6 SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null); sf.setStrand("-"); ds.addSequenceFeature(sf); // exon feature should be ignored here sf = new SequenceFeature("exon", "", 7, 9, 0f, null); ds.addSequenceFeature(sf); // CDS for dna 10-12 sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null); sf.setStrand("-"); ds.addSequenceFeature(sf); List ranges = AlignmentUtils.findCdsPositions(dnaSeq); /* * verify ranges { [12-10], [6-4] } */ assertEquals(6, MappingUtils.getLength(ranges)); assertEquals(2, ranges.size()); assertEquals(12, ranges.get(0)[0]); assertEquals(10, ranges.get(0)[1]); assertEquals(6, ranges.get(1)[0]); assertEquals(4, ranges.get(1)[1]); } /** * Tests for the method that maps the subset of a dna sequence that has CDS * (or subtype) feature - reverse strand case where the start codon is * incomplete. */ @Test(groups = "Functional", enabled = false) // test turned off as currently findCdsPositions is not strand-dependent // left in case it comes around again... public void testFindCdsPositions_reverseStrandThreePrimeIncomplete() { SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt"); dnaSeq.createDatasetSequence(); SequenceI ds = dnaSeq.getDatasetSequence(); // CDS for dna 5-9 SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null); sf.setStrand("-"); ds.addSequenceFeature(sf); // CDS for dna 13-15 sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null); sf.setStrand("-"); sf.setPhase("2"); // skip 2 bases to start of next codon ds.addSequenceFeature(sf); List ranges = AlignmentUtils.findCdsPositions(dnaSeq); /* * check the mapping starts with the first complete codon * expect ranges [13, 13], [9, 5] */ assertEquals(6, MappingUtils.getLength(ranges)); assertEquals(2, ranges.size()); assertEquals(13, ranges.get(0)[0]); assertEquals(13, ranges.get(0)[1]); assertEquals(9, ranges.get(1)[0]); assertEquals(5, ranges.get(1)[1]); } @Test(groups = "Functional") public void testAlignAs_alternateTranscriptsUngapped() { SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa"); SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA"); AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 }); ((Alignment) dna).createDatasetAlignment(); SequenceI cds1 = new Sequence("cds1", "GGGTTT"); SequenceI cds2 = new Sequence("cds2", "CCCAAA"); AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 }); ((Alignment) cds).createDatasetAlignment(); AlignedCodonFrame acf = new AlignedCodonFrame(); MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1); acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map); map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1); acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map); /* * verify CDS alignment is as: * cccGGGTTTaaa (cdna) * CCCgggtttAAA (cdna) * * ---GGGTTT--- (cds) * CCC------AAA (cds) */ dna.addCodonFrame(acf); AlignmentUtils.alignAs(cds, dna); assertEquals("---GGGTTT", cds.getSequenceAt(0).getSequenceAsString()); assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString()); } @Test(groups = { "Functional" }) public void testAddMappedPositions() { SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g"); SequenceI seq1 = new Sequence("cds", "AAATTT"); from.createDatasetSequence(); seq1.createDatasetSequence(); Mapping mapping = new Mapping(seq1, new MapList( new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1)); Map> map = new TreeMap<>(); AlignmentUtils.addMappedPositions(seq1, from, mapping, map); /* * verify map has seq1 residues in columns 3,4,6,7,11,12 */ assertEquals(6, map.size()); assertEquals('A', map.get(3).get(seq1).charValue()); assertEquals('A', map.get(4).get(seq1).charValue()); assertEquals('A', map.get(6).get(seq1).charValue()); assertEquals('T', map.get(7).get(seq1).charValue()); assertEquals('T', map.get(11).get(seq1).charValue()); assertEquals('T', map.get(12).get(seq1).charValue()); /* * */ } /** * Test case where the mapping 'from' range includes a stop codon which is * absent in the 'to' range */ @Test(groups = { "Functional" }) public void testAddMappedPositions_withStopCodon() { SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g"); SequenceI seq1 = new Sequence("cds", "AAATTT"); from.createDatasetSequence(); seq1.createDatasetSequence(); Mapping mapping = new Mapping(seq1, new MapList( new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1)); Map> map = new TreeMap<>(); AlignmentUtils.addMappedPositions(seq1, from, mapping, map); /* * verify map has seq1 residues in columns 3,4,6,7,11,12 */ assertEquals(6, map.size()); assertEquals('A', map.get(3).get(seq1).charValue()); assertEquals('A', map.get(4).get(seq1).charValue()); assertEquals('A', map.get(6).get(seq1).charValue()); assertEquals('T', map.get(7).get(seq1).charValue()); assertEquals('T', map.get(11).get(seq1).charValue()); assertEquals('T', map.get(12).get(seq1).charValue()); } /** * Test for the case where the products for which we want CDS are specified. * This is to represent the case where EMBL has CDS mappings to both Uniprot * and EMBLCDSPROTEIN. makeCdsAlignment() should only return the mappings for * the protein sequences specified. */ @Test(groups = { "Functional" }) public void testMakeCdsAlignment_filterProducts() { SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa"); SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC"); SequenceI pep1 = new Sequence("Uniprot|pep1", "GF"); SequenceI pep2 = new Sequence("Uniprot|pep2", "GFP"); SequenceI pep3 = new Sequence("EMBL|pep3", "GF"); SequenceI pep4 = new Sequence("EMBL|pep4", "GFP"); dna1.createDatasetSequence(); dna2.createDatasetSequence(); pep1.createDatasetSequence(); pep2.createDatasetSequence(); pep3.createDatasetSequence(); pep4.createDatasetSequence(); AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 }); dna.setDataset(null); AlignmentI emblPeptides = new Alignment(new SequenceI[] { pep3, pep4 }); emblPeptides.setDataset(null); AlignedCodonFrame acf = new AlignedCodonFrame(); MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] { 1, 2 }, 3, 1); acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map); acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map); dna.addCodonFrame(acf); acf = new AlignedCodonFrame(); map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 }, 3, 1); acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map); acf.addMap(dna2.getDatasetSequence(), pep4.getDatasetSequence(), map); dna.addCodonFrame(acf); /* * execute method under test to find CDS for EMBL peptides only */ AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] { dna1, dna2 }, dna.getDataset(), emblPeptides.getSequencesArray()); assertEquals(2, cds.getSequences().size()); assertEquals("GGGTTT", cds.getSequenceAt(0).getSequenceAsString()); assertEquals("GGGTTTCCC", cds.getSequenceAt(1).getSequenceAsString()); /* * verify shared, extended alignment dataset */ assertSame(dna.getDataset(), cds.getDataset()); assertTrue(dna.getDataset().getSequences() .contains(cds.getSequenceAt(0).getDatasetSequence())); assertTrue(dna.getDataset().getSequences() .contains(cds.getSequenceAt(1).getDatasetSequence())); /* * Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide * the mappings are on the shared alignment dataset */ List cdsMappings = cds.getDataset().getCodonFrames(); /* * 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep) */ assertEquals(6, cdsMappings.size()); /* * verify that mapping sets for dna and cds alignments are different * [not current behaviour - all mappings are on the alignment dataset] */ // select -> subselect type to test. // Assert.assertNotSame(dna.getCodonFrames(), cds.getCodonFrames()); // assertEquals(4, dna.getCodonFrames().size()); // assertEquals(4, cds.getCodonFrames().size()); /* * Two mappings involve pep3 (dna to pep3, cds to pep3) * Mapping from pep3 to GGGTTT in first new exon sequence */ List pep3Mappings = MappingUtils .findMappingsForSequence(pep3, cdsMappings); assertEquals(2, pep3Mappings.size()); List mappings = MappingUtils .findMappingsForSequence(cds.getSequenceAt(0), pep3Mappings); assertEquals(1, mappings.size()); // map G to GGG SearchResultsI sr = MappingUtils.buildSearchResults(pep3, 1, mappings); assertEquals(1, sr.getResults().size()); SearchResultMatchI m = sr.getResults().get(0); assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence()); assertEquals(1, m.getStart()); assertEquals(3, m.getEnd()); // map F to TTT sr = MappingUtils.buildSearchResults(pep3, 2, mappings); m = sr.getResults().get(0); assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence()); assertEquals(4, m.getStart()); assertEquals(6, m.getEnd()); /* * Two mappings involve pep4 (dna to pep4, cds to pep4) * Verify mapping from pep4 to GGGTTTCCC in second new exon sequence */ List pep4Mappings = MappingUtils .findMappingsForSequence(pep4, cdsMappings); assertEquals(2, pep4Mappings.size()); mappings = MappingUtils.findMappingsForSequence(cds.getSequenceAt(1), pep4Mappings); assertEquals(1, mappings.size()); // map G to GGG sr = MappingUtils.buildSearchResults(pep4, 1, mappings); assertEquals(1, sr.getResults().size()); m = sr.getResults().get(0); assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence()); assertEquals(1, m.getStart()); assertEquals(3, m.getEnd()); // map F to TTT sr = MappingUtils.buildSearchResults(pep4, 2, mappings); m = sr.getResults().get(0); assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence()); assertEquals(4, m.getStart()); assertEquals(6, m.getEnd()); // map P to CCC sr = MappingUtils.buildSearchResults(pep4, 3, mappings); m = sr.getResults().get(0); assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence()); assertEquals(7, m.getStart()); assertEquals(9, m.getEnd()); } /** * Test the method that just copies aligned sequences, provided all sequences * to be aligned share the aligned sequence's dataset */ @Test(groups = "Functional") public void testAlignAsSameSequences() { SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa"); SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA"); AlignmentI al1 = new Alignment(new SequenceI[] { dna1, dna2 }); ((Alignment) al1).createDatasetAlignment(); SequenceI dna3 = new Sequence(dna1); SequenceI dna4 = new Sequence(dna2); assertSame(dna3.getDatasetSequence(), dna1.getDatasetSequence()); assertSame(dna4.getDatasetSequence(), dna2.getDatasetSequence()); String seq1 = "-cc-GG-GT-TT--aaa"; dna3.setSequence(seq1); String seq2 = "C--C-Cgg--gtt-tAA-A-"; dna4.setSequence(seq2); AlignmentI al2 = new Alignment(new SequenceI[] { dna3, dna4 }); ((Alignment) al2).createDatasetAlignment(); assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2)); assertEquals(seq1, al1.getSequenceAt(0).getSequenceAsString()); assertEquals(seq2, al1.getSequenceAt(1).getSequenceAsString()); /* * add another sequence to 'aligned' - should still succeed, since * unaligned sequences still share a dataset with aligned sequences */ SequenceI dna5 = new Sequence("dna5", "CCCgggtttAAA"); dna5.createDatasetSequence(); al2.addSequence(dna5); assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2)); assertEquals(seq1, al1.getSequenceAt(0).getSequenceAsString()); assertEquals(seq2, al1.getSequenceAt(1).getSequenceAsString()); /* * add another sequence to 'unaligned' - should fail, since now not * all unaligned sequences share a dataset with aligned sequences */ SequenceI dna6 = new Sequence("dna6", "CCCgggtttAAA"); dna6.createDatasetSequence(); al1.addSequence(dna6); // JAL-2110 JBP Comment: what's the use case for this behaviour ? assertFalse(AlignmentUtils.alignAsSameSequences(al1, al2)); } @Test(groups = "Functional") public void testAlignAsSameSequencesMultipleSubSeq() { SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa"); SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA"); SequenceI as1 = dna1.deriveSequence(); SequenceI as2 = dna1.deriveSequence().getSubSequence(3, 7); SequenceI as3 = dna2.deriveSequence(); as1.insertCharAt(6, 5, '-'); String s_as1 = as1.getSequenceAsString(); as2.insertCharAt(6, 5, '-'); String s_as2 = as2.getSequenceAsString(); as3.insertCharAt(6, 5, '-'); String s_as3 = as3.getSequenceAsString(); AlignmentI aligned = new Alignment(new SequenceI[] { as1, as2, as3 }); // why do we need to cast this still ? ((Alignment) aligned).createDatasetAlignment(); SequenceI uas1 = dna1.deriveSequence(); SequenceI uas2 = dna1.deriveSequence().getSubSequence(3, 7); SequenceI uas3 = dna2.deriveSequence(); AlignmentI tobealigned = new Alignment(new SequenceI[] { uas1, uas2, uas3 }); ((Alignment) tobealigned).createDatasetAlignment(); assertTrue(AlignmentUtils.alignAsSameSequences(tobealigned, aligned)); assertEquals(s_as1, uas1.getSequenceAsString()); assertEquals(s_as2, uas2.getSequenceAsString()); assertEquals(s_as3, uas3.getSequenceAsString()); } @Test(groups = { "Functional" }) public void testTransferGeneLoci() { SequenceI from = new Sequence("transcript", "aaacccgggTTTAAACCCGGGtttaaacccgggttt"); SequenceI to = new Sequence("CDS", "TTTAAACCCGGG"); MapList map = new MapList(new int[] { 1, 12 }, new int[] { 10, 21 }, 1, 1); /* * first with nothing to transfer */ AlignmentUtils.transferGeneLoci(from, map, to); assertNull(to.getGeneLoci()); /* * next with gene loci set on 'from' sequence */ int[] exons = new int[] { 100, 105, 155, 164, 210, 229 }; MapList geneMap = new MapList(new int[] { 1, 36 }, exons, 1, 1); from.setGeneLoci("human", "GRCh38", "7", geneMap); AlignmentUtils.transferGeneLoci(from, map, to); GeneLociI toLoci = to.getGeneLoci(); assertNotNull(toLoci); // DBRefEntry constructor upper-cases 'source' assertEquals("HUMAN", toLoci.getSpeciesId()); assertEquals("GRCh38", toLoci.getAssemblyId()); assertEquals("7", toLoci.getChromosomeId()); /* * transcript 'exons' are 1-6, 7-16, 17-36 * CDS 1:12 is transcript 10-21 * transcript 'CDS' is 10-16, 17-21 * which is 'gene' 158-164, 210-214 */ MapList toMap = toLoci.getMap(); assertEquals(1, toMap.getFromRanges().size()); assertEquals(2, toMap.getFromRanges().get(0).length); assertEquals(1, toMap.getFromRanges().get(0)[0]); assertEquals(12, toMap.getFromRanges().get(0)[1]); assertEquals(1, toMap.getToRanges().size()); assertEquals(4, toMap.getToRanges().get(0).length); assertEquals(158, toMap.getToRanges().get(0)[0]); assertEquals(164, toMap.getToRanges().get(0)[1]); assertEquals(210, toMap.getToRanges().get(0)[2]); assertEquals(214, toMap.getToRanges().get(0)[3]); // or summarised as (but toString might change in future): assertEquals("[ [1, 12] ] 1:1 to [ [158, 164, 210, 214] ]", toMap.toString()); /* * an existing value is not overridden */ geneMap = new MapList(new int[] { 1, 36 }, new int[] { 36, 1 }, 1, 1); from.setGeneLoci("inhuman", "GRCh37", "6", geneMap); AlignmentUtils.transferGeneLoci(from, map, to); assertEquals("GRCh38", toLoci.getAssemblyId()); assertEquals("7", toLoci.getChromosomeId()); toMap = toLoci.getMap(); assertEquals("[ [1, 12] ] 1:1 to [ [158, 164, 210, 214] ]", toMap.toString()); } /** * Tests for the method that maps nucleotide to protein based on CDS features */ @Test(groups = "Functional") public void testMapCdsToProtein() { SequenceI peptide = new Sequence("pep", "KLQ"); /* * Case 1: CDS 3 times length of peptide * NB method only checks lengths match, not translation */ SequenceI dna = new Sequence("dna", "AACGacgtCTCCT"); dna.createDatasetSequence(); dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 13, null)); MapList ml = AlignmentUtils.mapCdsToProtein(dna, peptide); assertEquals(3, ml.getFromRatio()); assertEquals(1, ml.getToRatio()); assertEquals("[[1, 3]]", Arrays.deepToString(ml.getToRanges().toArray())); assertEquals("[[1, 4], [9, 13]]", Arrays.deepToString(ml.getFromRanges().toArray())); /* * Case 2: CDS 3 times length of peptide + stop codon * (note code does not currently check trailing codon is a stop codon) */ dna = new Sequence("dna", "AACGacgtCTCCTTGA"); dna.createDatasetSequence(); dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 16, null)); ml = AlignmentUtils.mapCdsToProtein(dna, peptide); assertEquals(3, ml.getFromRatio()); assertEquals(1, ml.getToRatio()); assertEquals("[[1, 3]]", Arrays.deepToString(ml.getToRanges().toArray())); assertEquals("[[1, 4], [9, 13]]", Arrays.deepToString(ml.getFromRanges().toArray())); /* * Case 3: CDS not 3 times length of peptide - no mapping is made */ dna = new Sequence("dna", "AACGacgtCTCCTTG"); dna.createDatasetSequence(); dna.addSequenceFeature(new SequenceFeature("CDS", "", 1, 4, null)); dna.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, null)); ml = AlignmentUtils.mapCdsToProtein(dna, peptide); assertNull(ml); /* * Case 4: incomplete start codon corresponding to X in peptide */ dna = new Sequence("dna", "ACGacgtCTCCTTGG"); dna.createDatasetSequence(); SequenceFeature sf = new SequenceFeature("CDS", "", 1, 3, null); sf.setPhase("2"); // skip 2 positions (AC) to start of next codon (GCT) dna.addSequenceFeature(sf); dna.addSequenceFeature(new SequenceFeature("CDS", "", 8, 15, null)); peptide = new Sequence("pep", "XLQ"); ml = AlignmentUtils.mapCdsToProtein(dna, peptide); assertEquals("[[2, 3]]", Arrays.deepToString(ml.getToRanges().toArray())); assertEquals("[[3, 3], [8, 12]]", Arrays.deepToString(ml.getFromRanges().toArray())); } }