/*
* Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
* Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jalview. If not, see .
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.analysis;
import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertFalse;
import static org.testng.AssertJUnit.assertNotNull;
import static org.testng.AssertJUnit.assertNull;
import static org.testng.AssertJUnit.assertSame;
import static org.testng.AssertJUnit.assertTrue;
import jalview.analysis.AlignmentUtils.DnaVariant;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.Annotation;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.GeneLociI;
import jalview.datamodel.Mapping;
import jalview.datamodel.SearchResultMatchI;
import jalview.datamodel.SearchResultsI;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.datamodel.features.SequenceFeatures;
import jalview.gui.JvOptionPane;
import jalview.io.AppletFormatAdapter;
import jalview.io.DataSourceType;
import jalview.io.FileFormat;
import jalview.io.FileFormatI;
import jalview.io.FormatAdapter;
import jalview.util.MapList;
import jalview.util.MappingUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
public class AlignmentUtilsTests
{
@BeforeClass(alwaysRun = true)
public void setUpJvOptionPane()
{
JvOptionPane.setInteractiveMode(false);
JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION);
}
private static Sequence ts = new Sequence("short",
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm");
@Test(groups = { "Functional" })
public void testExpandContext()
{
AlignmentI al = new Alignment(new Sequence[] {});
for (int i = 4; i < 14; i += 2)
{
SequenceI s1 = ts.deriveSequence().getSubSequence(i, i + 7);
al.addSequence(s1);
}
System.out.println(new AppletFormatAdapter().formatSequences(
FileFormat.Clustal,
al, true));
for (int flnk = -1; flnk < 25; flnk++)
{
AlignmentI exp = AlignmentUtils.expandContext(al, flnk);
System.out.println("\nFlank size: " + flnk);
System.out.println(new AppletFormatAdapter().formatSequences(
FileFormat.Clustal, exp, true));
if (flnk == -1)
{
/*
* Full expansion to complete sequences
*/
for (SequenceI sq : exp.getSequences())
{
String ung = sq.getSequenceAsString().replaceAll("-+", "");
final String errorMsg = "Flanking sequence not the same as original dataset sequence.\n"
+ ung
+ "\n"
+ sq.getDatasetSequence().getSequenceAsString();
assertTrue(errorMsg, ung.equalsIgnoreCase(sq.getDatasetSequence()
.getSequenceAsString()));
}
}
else if (flnk == 24)
{
/*
* Last sequence is fully expanded, others have leading gaps to match
*/
assertTrue(exp.getSequenceAt(4).getSequenceAsString()
.startsWith("abc"));
assertTrue(exp.getSequenceAt(3).getSequenceAsString()
.startsWith("--abc"));
assertTrue(exp.getSequenceAt(2).getSequenceAsString()
.startsWith("----abc"));
assertTrue(exp.getSequenceAt(1).getSequenceAsString()
.startsWith("------abc"));
assertTrue(exp.getSequenceAt(0).getSequenceAsString()
.startsWith("--------abc"));
}
}
}
/**
* Test that annotations are correctly adjusted by expandContext
*/
@Test(groups = { "Functional" })
public void testExpandContext_annotation()
{
AlignmentI al = new Alignment(new Sequence[] {});
SequenceI ds = new Sequence("Seq1", "ABCDEFGHI");
// subsequence DEF:
SequenceI seq1 = ds.deriveSequence().getSubSequence(3, 6);
al.addSequence(seq1);
/*
* Annotate DEF with 4/5/6 respectively
*/
Annotation[] anns = new Annotation[] { new Annotation(4),
new Annotation(5), new Annotation(6) };
AlignmentAnnotation ann = new AlignmentAnnotation("SS",
"secondary structure", anns);
seq1.addAlignmentAnnotation(ann);
/*
* The annotations array should match aligned positions
*/
assertEquals(3, ann.annotations.length);
assertEquals(4, ann.annotations[0].value, 0.001);
assertEquals(5, ann.annotations[1].value, 0.001);
assertEquals(6, ann.annotations[2].value, 0.001);
/*
* Check annotation to sequence position mappings before expanding the
* sequence; these are set up in Sequence.addAlignmentAnnotation ->
* Annotation.setSequenceRef -> createSequenceMappings
*/
assertNull(ann.getAnnotationForPosition(1));
assertNull(ann.getAnnotationForPosition(2));
assertNull(ann.getAnnotationForPosition(3));
assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
assertNull(ann.getAnnotationForPosition(7));
assertNull(ann.getAnnotationForPosition(8));
assertNull(ann.getAnnotationForPosition(9));
/*
* Expand the subsequence to the full sequence abcDEFghi
*/
AlignmentI expanded = AlignmentUtils.expandContext(al, -1);
assertEquals("abcDEFghi", expanded.getSequenceAt(0)
.getSequenceAsString());
/*
* Confirm the alignment and sequence have the same SS annotation,
* referencing the expanded sequence
*/
ann = expanded.getSequenceAt(0).getAnnotation()[0];
assertSame(ann, expanded.getAlignmentAnnotation()[0]);
assertSame(expanded.getSequenceAt(0), ann.sequenceRef);
/*
* The annotations array should have null values except for annotated
* positions
*/
assertNull(ann.annotations[0]);
assertNull(ann.annotations[1]);
assertNull(ann.annotations[2]);
assertEquals(4, ann.annotations[3].value, 0.001);
assertEquals(5, ann.annotations[4].value, 0.001);
assertEquals(6, ann.annotations[5].value, 0.001);
assertNull(ann.annotations[6]);
assertNull(ann.annotations[7]);
assertNull(ann.annotations[8]);
/*
* sequence position mappings should be unchanged
*/
assertNull(ann.getAnnotationForPosition(1));
assertNull(ann.getAnnotationForPosition(2));
assertNull(ann.getAnnotationForPosition(3));
assertEquals(4, ann.getAnnotationForPosition(4).value, 0.001);
assertEquals(5, ann.getAnnotationForPosition(5).value, 0.001);
assertEquals(6, ann.getAnnotationForPosition(6).value, 0.001);
assertNull(ann.getAnnotationForPosition(7));
assertNull(ann.getAnnotationForPosition(8));
assertNull(ann.getAnnotationForPosition(9));
}
/**
* Test method that returns a map of lists of sequences by sequence name.
*
* @throws IOException
*/
@Test(groups = { "Functional" })
public void testGetSequencesByName() throws IOException
{
final String data = ">Seq1Name\nKQYL\n" + ">Seq2Name\nRFPW\n"
+ ">Seq1Name\nABCD\n";
AlignmentI al = loadAlignment(data, FileFormat.Fasta);
Map> map = AlignmentUtils
.getSequencesByName(al);
assertEquals(2, map.keySet().size());
assertEquals(2, map.get("Seq1Name").size());
assertEquals("KQYL", map.get("Seq1Name").get(0).getSequenceAsString());
assertEquals("ABCD", map.get("Seq1Name").get(1).getSequenceAsString());
assertEquals(1, map.get("Seq2Name").size());
assertEquals("RFPW", map.get("Seq2Name").get(0).getSequenceAsString());
}
/**
* Helper method to load an alignment and ensure dataset sequences are set up.
*
* @param data
* @param format
* TODO
* @return
* @throws IOException
*/
protected AlignmentI loadAlignment(final String data, FileFormatI format)
throws IOException
{
AlignmentI a = new FormatAdapter().readFile(data,
DataSourceType.PASTE, format);
a.setDataset(null);
return a;
}
/**
* Test mapping of protein to cDNA, for the case where we have no sequence
* cross-references, so mappings are made first-served 1-1 where sequences
* translate.
*
* @throws IOException
*/
@Test(groups = { "Functional" })
public void testMapProteinAlignmentToCdna_noXrefs() throws IOException
{
List protseqs = new ArrayList();
protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
protein.setDataset(null);
List dnaseqs = new ArrayList();
dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
cdna.setDataset(null);
assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
// 3 mappings made, each from 1 to 1 sequence
assertEquals(3, protein.getCodonFrames().size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
// V12345 mapped to A22222
AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
.get(0);
assertEquals(1, acf.getdnaSeqs().length);
assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
acf.getdnaSeqs()[0]);
Mapping[] protMappings = acf.getProtMappings();
assertEquals(1, protMappings.length);
MapList mapList = protMappings[0].getMap();
assertEquals(3, mapList.getFromRatio());
assertEquals(1, mapList.getToRatio());
assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
.get(0)));
assertEquals(1, mapList.getFromRanges().size());
assertTrue(Arrays.equals(new int[] { 1, 3 },
mapList.getToRanges().get(0)));
assertEquals(1, mapList.getToRanges().size());
// V12346 mapped to A33333
acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
assertEquals(1, acf.getdnaSeqs().length);
assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
acf.getdnaSeqs()[0]);
// V12347 mapped to A11111
acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
assertEquals(1, acf.getdnaSeqs().length);
assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
acf.getdnaSeqs()[0]);
// no mapping involving the 'extra' A44444
assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
}
/**
* Test for the alignSequenceAs method that takes two sequences and a mapping.
*/
@Test(groups = { "Functional" })
public void testAlignSequenceAs_withMapping_noIntrons()
{
MapList map = new MapList(new int[] { 1, 6 }, new int[] { 1, 2 }, 3, 1);
/*
* No existing gaps in dna:
*/
checkAlignSequenceAs("GGGAAA", "-A-L-", false, false, map,
"---GGG---AAA");
/*
* Now introduce gaps in dna but ignore them when realigning.
*/
checkAlignSequenceAs("-G-G-G-A-A-A-", "-A-L-", false, false, map,
"---GGG---AAA");
/*
* Now include gaps in dna when realigning. First retaining 'mapped' gaps
* only, i.e. those within the exon region.
*/
checkAlignSequenceAs("-G-G--G-A--A-A-", "-A-L-", true, false, map,
"---G-G--G---A--A-A");
/*
* Include all gaps in dna when realigning (within and without the exon
* region). The leading gap, and the gaps between codons, are subsumed by
* the protein alignment gap.
*/
checkAlignSequenceAs("-G-GG--AA-A---", "-A-L-", true, true, map,
"---G-GG---AA-A---");
/*
* Include only unmapped gaps in dna when realigning (outside the exon
* region). The leading gap, and the gaps between codons, are subsumed by
* the protein alignment gap.
*/
checkAlignSequenceAs("-G-GG--AA-A-", "-A-L-", false, true, map,
"---GGG---AAA---");
}
/**
* Test for the alignSequenceAs method that takes two sequences and a mapping.
*/
@Test(groups = { "Functional" })
public void testAlignSequenceAs_withMapping_withIntrons()
{
/*
* Exons at codon 2 (AAA) and 4 (TTT)
*/
MapList map = new MapList(new int[] { 4, 6, 10, 12 },
new int[] { 1, 2 }, 3, 1);
/*
* Simple case: no gaps in dna
*/
checkAlignSequenceAs("GGGAAACCCTTTGGG", "--A-L-", false, false, map,
"GGG---AAACCCTTTGGG");
/*
* Add gaps to dna - but ignore when realigning.
*/
checkAlignSequenceAs("-G-G-G--A--A---AC-CC-T-TT-GG-G-", "--A-L-",
false, false, map, "GGG---AAACCCTTTGGG");
/*
* Add gaps to dna - include within exons only when realigning.
*/
checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
true, false, map, "GGG---A--A---ACCCT-TTGGG");
/*
* Include gaps outside exons only when realigning.
*/
checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
false, true, map, "-G-G-GAAAC-CCTTT-GG-G-");
/*
* Include gaps following first intron if we are 'preserving mapped gaps'
*/
checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
/*
* Include all gaps in dna when realigning.
*/
checkAlignSequenceAs("-G-G-G--A--A---A-C-CC-T-TT-GG-G-", "--A-L-",
true, true, map, "-G-G-G--A--A---A-C-CC-T-TT-GG-G-");
}
/**
* Test for the case where not all of the protein sequence is mapped to cDNA.
*/
@Test(groups = { "Functional" })
public void testAlignSequenceAs_withMapping_withUnmappedProtein()
{
/*
* Exons at codon 2 (AAA) and 4 (TTT) mapped to A and P
*/
final MapList map = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
1, 1, 3, 3 }, 3, 1);
/*
* -L- 'aligns' ccc------
*/
checkAlignSequenceAs("gggAAAcccTTTggg", "-A-L-P-", false, false, map,
"gggAAAccc------TTTggg");
}
/**
* Helper method that performs and verifies the method under test.
*
* @param alignee
* the sequence to be realigned
* @param alignModel
* the sequence whose alignment is to be copied
* @param preserveMappedGaps
* @param preserveUnmappedGaps
* @param map
* @param expected
*/
protected void checkAlignSequenceAs(final String alignee,
final String alignModel, final boolean preserveMappedGaps,
final boolean preserveUnmappedGaps, MapList map,
final String expected)
{
SequenceI alignMe = new Sequence("Seq1", alignee);
alignMe.createDatasetSequence();
SequenceI alignFrom = new Sequence("Seq2", alignModel);
alignFrom.createDatasetSequence();
AlignedCodonFrame acf = new AlignedCodonFrame();
acf.addMap(alignMe.getDatasetSequence(),
alignFrom.getDatasetSequence(), map);
AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "---", '-',
preserveMappedGaps, preserveUnmappedGaps);
assertEquals(expected, alignMe.getSequenceAsString());
}
/**
* Test for the alignSequenceAs method where we preserve gaps in introns only.
*/
@Test(groups = { "Functional" })
public void testAlignSequenceAs_keepIntronGapsOnly()
{
/*
* Intron GGGAAA followed by exon CCCTTT
*/
MapList map = new MapList(new int[] { 7, 12 }, new int[] { 1, 2 }, 3, 1);
checkAlignSequenceAs("GG-G-AA-A-C-CC-T-TT", "AL", false, true, map,
"GG-G-AA-ACCCTTT");
}
/**
* Test the method that realigns protein to match mapped codon alignment.
*/
@Test(groups = { "Functional" })
public void testAlignProteinAsDna()
{
// seq1 codons are [1,2,3] [4,5,6] [7,8,9] [10,11,12]
SequenceI dna1 = new Sequence("Seq1", "TGCCATTACCAG-");
// seq2 codons are [1,3,4] [5,6,7] [8,9,10] [11,12,13]
SequenceI dna2 = new Sequence("Seq2", "T-GCCATTACCAG");
// seq3 codons are [1,2,3] [4,5,7] [8,9,10] [11,12,13]
SequenceI dna3 = new Sequence("Seq3", "TGCCA-TTACCAG");
AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
dna.setDataset(null);
// protein alignment will be realigned like dna
SequenceI prot1 = new Sequence("Seq1", "CHYQ");
SequenceI prot2 = new Sequence("Seq2", "CHYQ");
SequenceI prot3 = new Sequence("Seq3", "CHYQ");
SequenceI prot4 = new Sequence("Seq4", "R-QSV"); // unmapped, unchanged
AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
prot3, prot4 });
protein.setDataset(null);
MapList map = new MapList(new int[] { 1, 12 }, new int[] { 1, 4 }, 3, 1);
AlignedCodonFrame acf = new AlignedCodonFrame();
acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
ArrayList acfs = new ArrayList();
acfs.add(acf);
protein.setCodonFrames(acfs);
/*
* Translated codon order is [1,2,3] [1,3,4] [4,5,6] [4,5,7] [5,6,7] [7,8,9]
* [8,9,10] [10,11,12] [11,12,13]
*/
AlignmentUtils.alignProteinAsDna(protein, dna);
assertEquals("C-H--Y-Q-", prot1.getSequenceAsString());
assertEquals("-C--H-Y-Q", prot2.getSequenceAsString());
assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
assertEquals("R-QSV", prot4.getSequenceAsString());
}
/**
* Test the method that tests whether a CDNA sequence translates to a protein
* sequence
*/
@Test(groups = { "Functional" })
public void testTranslatesAs()
{
// null arguments check
assertFalse(AlignmentUtils.translatesAs(null, 0, null));
assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
// straight translation
assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
"FPKG".toCharArray()));
// with extra start codon (not in protein)
assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
3, "FPKG".toCharArray()));
// with stop codon1 (not in protein)
assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
0, "FPKG".toCharArray()));
// with stop codon1 (in protein as *)
assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
0, "FPKG*".toCharArray()));
// with stop codon2 (not in protein)
assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
0, "FPKG".toCharArray()));
// with stop codon3 (not in protein)
assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
0, "FPKG".toCharArray()));
// with start and stop codon1
assertTrue(AlignmentUtils.translatesAs(
"atgtttcccaaagggtaa".toCharArray(), 3, "FPKG".toCharArray()));
// with start and stop codon1 (in protein as *)
assertTrue(AlignmentUtils.translatesAs(
"atgtttcccaaagggtaa".toCharArray(), 3, "FPKG*".toCharArray()));
// with start and stop codon2
assertTrue(AlignmentUtils.translatesAs(
"atgtttcccaaagggtag".toCharArray(), 3, "FPKG".toCharArray()));
// with start and stop codon3
assertTrue(AlignmentUtils.translatesAs(
"atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
// with embedded stop codons
assertTrue(AlignmentUtils.translatesAs(
"atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
"F*PK*G".toCharArray()));
// wrong protein
assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
0, "FPMG".toCharArray()));
// truncated dna
assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
"FPKG".toCharArray()));
// truncated protein
assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
0, "FPK".toCharArray()));
// overlong dna (doesn't end in stop codon)
assertFalse(AlignmentUtils.translatesAs(
"tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
// dna + stop codon + more
assertFalse(AlignmentUtils.translatesAs(
"tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
// overlong protein
assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
0, "FPKGQ".toCharArray()));
}
/**
* Test mapping of protein to cDNA, for cases where the cDNA has start and/or
* stop codons in addition to the protein coding sequence.
*
* @throws IOException
*/
@Test(groups = { "Functional" })
public void testMapProteinAlignmentToCdna_withStartAndStopCodons()
throws IOException
{
List protseqs = new ArrayList();
protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
protein.setDataset(null);
List dnaseqs = new ArrayList();
// start + SAR:
dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
// = EIQ + stop
dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
// = start +EIQ + stop
dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG"));
AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
cdna.setDataset(null);
assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
// 3 mappings made, each from 1 to 1 sequence
assertEquals(3, protein.getCodonFrames().size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
// V12345 mapped from A22222
AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
.get(0);
assertEquals(1, acf.getdnaSeqs().length);
assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
acf.getdnaSeqs()[0]);
Mapping[] protMappings = acf.getProtMappings();
assertEquals(1, protMappings.length);
MapList mapList = protMappings[0].getMap();
assertEquals(3, mapList.getFromRatio());
assertEquals(1, mapList.getToRatio());
assertTrue(Arrays.equals(new int[] { 1, 9 }, mapList.getFromRanges()
.get(0)));
assertEquals(1, mapList.getFromRanges().size());
assertTrue(Arrays.equals(new int[] { 1, 3 },
mapList.getToRanges().get(0)));
assertEquals(1, mapList.getToRanges().size());
// V12346 mapped from A33333 starting position 4
acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
assertEquals(1, acf.getdnaSeqs().length);
assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
acf.getdnaSeqs()[0]);
protMappings = acf.getProtMappings();
assertEquals(1, protMappings.length);
mapList = protMappings[0].getMap();
assertEquals(3, mapList.getFromRatio());
assertEquals(1, mapList.getToRatio());
assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
.get(0)));
assertEquals(1, mapList.getFromRanges().size());
assertTrue(Arrays.equals(new int[] { 1, 3 },
mapList.getToRanges().get(0)));
assertEquals(1, mapList.getToRanges().size());
// V12347 mapped to A11111 starting position 4
acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
assertEquals(1, acf.getdnaSeqs().length);
assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
acf.getdnaSeqs()[0]);
protMappings = acf.getProtMappings();
assertEquals(1, protMappings.length);
mapList = protMappings[0].getMap();
assertEquals(3, mapList.getFromRatio());
assertEquals(1, mapList.getToRatio());
assertTrue(Arrays.equals(new int[] { 4, 12 }, mapList.getFromRanges()
.get(0)));
assertEquals(1, mapList.getFromRanges().size());
assertTrue(Arrays.equals(new int[] { 1, 3 },
mapList.getToRanges().get(0)));
assertEquals(1, mapList.getToRanges().size());
// no mapping involving the 'extra' A44444
assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
}
/**
* Test mapping of protein to cDNA, for the case where we have some sequence
* cross-references. Verify that 1-to-many mappings are made where
* cross-references exist and sequences are mappable.
*
* @throws IOException
*/
@Test(groups = { "Functional" })
public void testMapProteinAlignmentToCdna_withXrefs() throws IOException
{
List protseqs = new ArrayList();
protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
protein.setDataset(null);
List dnaseqs = new ArrayList();
dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
cdna.setDataset(null);
// Xref A22222 to V12345 (should get mapped)
dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
// Xref V12345 to A44444 (should get mapped)
protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
// Xref A33333 to V12347 (sequence mismatch - should not get mapped)
dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
// as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
// it should get paired up with the unmapped A33333
// A11111 should be mapped to V12347
// A55555 is spare and has no xref so is not mapped
assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
// 4 protein mappings made for 3 proteins, 2 to V12345, 1 each to V12346/7
assertEquals(3, protein.getCodonFrames().size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
// one mapping for each of the first 4 cDNA sequences
assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
// V12345 mapped to A22222 and A44444
AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
.get(0);
assertEquals(2, acf.getdnaSeqs().length);
assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
acf.getdnaSeqs()[0]);
assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
acf.getdnaSeqs()[1]);
// V12346 mapped to A33333
acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
assertEquals(1, acf.getdnaSeqs().length);
assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
acf.getdnaSeqs()[0]);
// V12347 mapped to A11111
acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
assertEquals(1, acf.getdnaSeqs().length);
assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
acf.getdnaSeqs()[0]);
// no mapping involving the 'extra' A55555
assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
}
/**
* Test mapping of protein to cDNA, for the case where we have some sequence
* cross-references. Verify that once we have made an xref mapping we don't
* also map un-xrefd sequeces.
*
* @throws IOException
*/
@Test(groups = { "Functional" })
public void testMapProteinAlignmentToCdna_prioritiseXrefs()
throws IOException
{
List protseqs = new ArrayList();
protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
AlignmentI protein = new Alignment(
protseqs.toArray(new SequenceI[protseqs.size()]));
protein.setDataset(null);
List dnaseqs = new ArrayList();
dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
.size()]));
cdna.setDataset(null);
// Xref A22222 to V12345 (should get mapped)
// A11111 should then be mapped to the unmapped V12346
dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
assertTrue(AlignmentUtils.mapProteinAlignmentToCdna(protein, cdna));
// 2 protein mappings made
assertEquals(2, protein.getCodonFrames().size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
// one mapping for each of the cDNA sequences
assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
// V12345 mapped to A22222
AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
.get(0);
assertEquals(1, acf.getdnaSeqs().length);
assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
acf.getdnaSeqs()[0]);
// V12346 mapped to A11111
acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
assertEquals(1, acf.getdnaSeqs().length);
assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
acf.getdnaSeqs()[0]);
}
/**
* Test the method that shows or hides sequence annotations by type(s) and
* selection group.
*/
@Test(groups = { "Functional" })
public void testShowOrHideSequenceAnnotations()
{
SequenceI seq1 = new Sequence("Seq1", "AAA");
SequenceI seq2 = new Sequence("Seq2", "BBB");
SequenceI seq3 = new Sequence("Seq3", "CCC");
Annotation[] anns = new Annotation[] { new Annotation(2f) };
AlignmentAnnotation ann1 = new AlignmentAnnotation("Structure", "ann1",
anns);
ann1.setSequenceRef(seq1);
AlignmentAnnotation ann2 = new AlignmentAnnotation("Structure", "ann2",
anns);
ann2.setSequenceRef(seq2);
AlignmentAnnotation ann3 = new AlignmentAnnotation("Structure", "ann3",
anns);
AlignmentAnnotation ann4 = new AlignmentAnnotation("Temp", "ann4", anns);
ann4.setSequenceRef(seq1);
AlignmentAnnotation ann5 = new AlignmentAnnotation("Temp", "ann5", anns);
ann5.setSequenceRef(seq2);
AlignmentAnnotation ann6 = new AlignmentAnnotation("Temp", "ann6", anns);
AlignmentI al = new Alignment(new SequenceI[] { seq1, seq2, seq3 });
al.addAnnotation(ann1); // Structure for Seq1
al.addAnnotation(ann2); // Structure for Seq2
al.addAnnotation(ann3); // Structure for no sequence
al.addAnnotation(ann4); // Temp for seq1
al.addAnnotation(ann5); // Temp for seq2
al.addAnnotation(ann6); // Temp for no sequence
List types = new ArrayList();
List scope = new ArrayList();
/*
* Set all sequence related Structure to hidden (ann1, ann2)
*/
types.add("Structure");
AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
false);
assertFalse(ann1.visible);
assertFalse(ann2.visible);
assertTrue(ann3.visible); // not sequence-related, not affected
assertTrue(ann4.visible); // not Structure, not affected
assertTrue(ann5.visible); // "
assertTrue(ann6.visible); // not sequence-related, not affected
/*
* Set Temp in {seq1, seq3} to hidden
*/
types.clear();
types.add("Temp");
scope.add(seq1);
scope.add(seq3);
AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, false,
false);
assertFalse(ann1.visible); // unchanged
assertFalse(ann2.visible); // unchanged
assertTrue(ann3.visible); // not sequence-related, not affected
assertFalse(ann4.visible); // Temp for seq1 hidden
assertTrue(ann5.visible); // not in scope, not affected
assertTrue(ann6.visible); // not sequence-related, not affected
/*
* Set Temp in all sequences to hidden
*/
types.clear();
types.add("Temp");
scope.add(seq1);
scope.add(seq3);
AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, false,
false);
assertFalse(ann1.visible); // unchanged
assertFalse(ann2.visible); // unchanged
assertTrue(ann3.visible); // not sequence-related, not affected
assertFalse(ann4.visible); // Temp for seq1 hidden
assertFalse(ann5.visible); // Temp for seq2 hidden
assertTrue(ann6.visible); // not sequence-related, not affected
/*
* Set all types in {seq1, seq3} to visible
*/
types.clear();
scope.clear();
scope.add(seq1);
scope.add(seq3);
AlignmentUtils.showOrHideSequenceAnnotations(al, types, scope, true,
true);
assertTrue(ann1.visible); // Structure for seq1 set visible
assertFalse(ann2.visible); // not in scope, unchanged
assertTrue(ann3.visible); // not sequence-related, not affected
assertTrue(ann4.visible); // Temp for seq1 set visible
assertFalse(ann5.visible); // not in scope, unchanged
assertTrue(ann6.visible); // not sequence-related, not affected
/*
* Set all types in all scope to hidden
*/
AlignmentUtils.showOrHideSequenceAnnotations(al, types, null, true,
false);
assertFalse(ann1.visible);
assertFalse(ann2.visible);
assertTrue(ann3.visible); // not sequence-related, not affected
assertFalse(ann4.visible);
assertFalse(ann5.visible);
assertTrue(ann6.visible); // not sequence-related, not affected
}
/**
* Tests for the method that checks if one sequence cross-references another
*/
@Test(groups = { "Functional" })
public void testHasCrossRef()
{
assertFalse(AlignmentUtils.hasCrossRef(null, null));
SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
assertFalse(AlignmentUtils.hasCrossRef(seq1, null));
assertFalse(AlignmentUtils.hasCrossRef(null, seq1));
SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
// different ref
seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20193"));
assertFalse(AlignmentUtils.hasCrossRef(seq1, seq2));
// case-insensitive; version number is ignored
seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "v20192"));
assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
// right case!
seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
assertTrue(AlignmentUtils.hasCrossRef(seq1, seq2));
// test is one-way only
assertFalse(AlignmentUtils.hasCrossRef(seq2, seq1));
}
/**
* Tests for the method that checks if either sequence cross-references the
* other
*/
@Test(groups = { "Functional" })
public void testHaveCrossRef()
{
assertFalse(AlignmentUtils.hasCrossRef(null, null));
SequenceI seq1 = new Sequence("EMBL|A12345", "ABCDEF");
assertFalse(AlignmentUtils.haveCrossRef(seq1, null));
assertFalse(AlignmentUtils.haveCrossRef(null, seq1));
SequenceI seq2 = new Sequence("UNIPROT|V20192", "ABCDEF");
assertFalse(AlignmentUtils.haveCrossRef(seq1, seq2));
seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
// next is true for haveCrossRef, false for hasCrossRef
assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
// now the other way round
seq1.setDBRefs(null);
seq2.addDBRef(new DBRefEntry("EMBL", "1", "A12345"));
assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
// now both ways
seq1.addDBRef(new DBRefEntry("UNIPROT", "1", "V20192"));
assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
}
/**
* Test the method that extracts the cds-only part of a dna alignment.
*/
@Test(groups = { "Functional" })
public void testMakeCdsAlignment()
{
/*
* scenario:
* dna1 --> [4, 6] [10,12] --> pep1
* dna2 --> [1, 3] [7, 9] [13,15] --> pep2
*/
SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
SequenceI pep1 = new Sequence("pep1", "GF");
SequenceI pep2 = new Sequence("pep2", "GFP");
pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "pep1"));
pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "pep2"));
dna1.createDatasetSequence();
dna2.createDatasetSequence();
pep1.createDatasetSequence();
pep2.createDatasetSequence();
AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
dna.setDataset(null);
/*
* put a variant feature on dna2 base 8
* - should transfer to cds2 base 5
*/
dna2.addSequenceFeature(new SequenceFeature("variant", "hgmd", 8, 8,
0f, null));
/*
* need a sourceDbRef if we are to construct dbrefs to the CDS
* sequence from the dna contig sequences
*/
DBRefEntry dbref = new DBRefEntry("ENSEMBL", "0", "dna1");
dna1.getDatasetSequence().addDBRef(dbref);
org.testng.Assert.assertEquals(dbref, dna1.getPrimaryDBRefs().get(0));
dbref = new DBRefEntry("ENSEMBL", "0", "dna2");
dna2.getDatasetSequence().addDBRef(dbref);
org.testng.Assert.assertEquals(dbref, dna2.getPrimaryDBRefs().get(0));
/*
* CDS sequences are 'discovered' from dna-to-protein mappings on the alignment
* dataset (e.g. added from dbrefs by CrossRef.findXrefSequences)
*/
MapList mapfordna1 = new MapList(new int[] { 4, 6, 10, 12 }, new int[] {
1, 2 }, 3, 1);
AlignedCodonFrame acf = new AlignedCodonFrame();
acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(),
mapfordna1);
dna.addCodonFrame(acf);
MapList mapfordna2 = new MapList(new int[] { 1, 3, 7, 9, 13, 15 },
new int[] { 1, 3 }, 3, 1);
acf = new AlignedCodonFrame();
acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(),
mapfordna2);
dna.addCodonFrame(acf);
/*
* In this case, mappings originally came from matching Uniprot accessions
* - so need an xref on dna involving those regions.
* These are normally constructed from CDS annotation
*/
DBRefEntry dna1xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep1",
new Mapping(mapfordna1));
dna1.addDBRef(dna1xref);
assertEquals(2, dna1.getDBRefs().length); // to self and to pep1
DBRefEntry dna2xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep2",
new Mapping(mapfordna2));
dna2.addDBRef(dna2xref);
assertEquals(2, dna2.getDBRefs().length); // to self and to pep2
/*
* execute method under test:
*/
AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
dna1, dna2 }, dna.getDataset(), null);
/*
* verify cds sequences
*/
assertEquals(2, cds.getSequences().size());
assertEquals("GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
assertEquals("GGGTTTCCC", cds.getSequenceAt(1).getSequenceAsString());
/*
* verify shared, extended alignment dataset
*/
assertSame(dna.getDataset(), cds.getDataset());
SequenceI cds1Dss = cds.getSequenceAt(0).getDatasetSequence();
SequenceI cds2Dss = cds.getSequenceAt(1).getDatasetSequence();
assertTrue(dna.getDataset().getSequences().contains(cds1Dss));
assertTrue(dna.getDataset().getSequences().contains(cds2Dss));
/*
* verify CDS has a dbref with mapping to peptide
*/
assertNotNull(cds1Dss.getDBRefs());
assertEquals(2, cds1Dss.getDBRefs().length);
dbref = cds1Dss.getDBRefs()[0];
assertEquals(dna1xref.getSource(), dbref.getSource());
// version is via ensembl's primary ref
assertEquals(dna1xref.getVersion(), dbref.getVersion());
assertEquals(dna1xref.getAccessionId(), dbref.getAccessionId());
assertNotNull(dbref.getMap());
assertSame(pep1.getDatasetSequence(), dbref.getMap().getTo());
MapList cdsMapping = new MapList(new int[] { 1, 6 },
new int[] { 1, 2 }, 3, 1);
assertEquals(cdsMapping, dbref.getMap().getMap());
/*
* verify peptide has added a dbref with reverse mapping to CDS
*/
assertNotNull(pep1.getDBRefs());
// FIXME pep1.getDBRefs() is 1 - is that the correct behaviour ?
assertEquals(2, pep1.getDBRefs().length);
dbref = pep1.getDBRefs()[1];
assertEquals("ENSEMBL", dbref.getSource());
assertEquals("0", dbref.getVersion());
assertEquals("CDS|dna1", dbref.getAccessionId());
assertNotNull(dbref.getMap());
assertSame(cds1Dss, dbref.getMap().getTo());
assertEquals(cdsMapping.getInverse(), dbref.getMap().getMap());
/*
* verify cDNA has added a dbref with mapping to CDS
*/
assertEquals(3, dna1.getDBRefs().length);
DBRefEntry dbRefEntry = dna1.getDBRefs()[2];
assertSame(cds1Dss, dbRefEntry.getMap().getTo());
MapList dnaToCdsMapping = new MapList(new int[] { 4, 6, 10, 12 },
new int[] { 1, 6 }, 1, 1);
assertEquals(dnaToCdsMapping, dbRefEntry.getMap().getMap());
assertEquals(3, dna2.getDBRefs().length);
dbRefEntry = dna2.getDBRefs()[2];
assertSame(cds2Dss, dbRefEntry.getMap().getTo());
dnaToCdsMapping = new MapList(new int[] { 1, 3, 7, 9, 13, 15 },
new int[] { 1, 9 }, 1, 1);
assertEquals(dnaToCdsMapping, dbRefEntry.getMap().getMap());
/*
* verify CDS has added a dbref with mapping to cDNA
*/
assertEquals(2, cds1Dss.getDBRefs().length);
dbRefEntry = cds1Dss.getDBRefs()[1];
assertSame(dna1.getDatasetSequence(), dbRefEntry.getMap().getTo());
MapList cdsToDnaMapping = new MapList(new int[] { 1, 6 }, new int[] {
4, 6, 10, 12 }, 1, 1);
assertEquals(cdsToDnaMapping, dbRefEntry.getMap().getMap());
assertEquals(2, cds2Dss.getDBRefs().length);
dbRefEntry = cds2Dss.getDBRefs()[1];
assertSame(dna2.getDatasetSequence(), dbRefEntry.getMap().getTo());
cdsToDnaMapping = new MapList(new int[] { 1, 9 }, new int[] { 1, 3, 7,
9, 13, 15 }, 1, 1);
assertEquals(cdsToDnaMapping, dbRefEntry.getMap().getMap());
/*
* Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide
* the mappings are on the shared alignment dataset
* 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep)
*/
List cdsMappings = cds.getDataset().getCodonFrames();
assertEquals(6, cdsMappings.size());
/*
* verify that mapping sets for dna and cds alignments are different
* [not current behaviour - all mappings are on the alignment dataset]
*/
// select -> subselect type to test.
// Assert.assertNotSame(dna.getCodonFrames(), cds.getCodonFrames());
// assertEquals(4, dna.getCodonFrames().size());
// assertEquals(4, cds.getCodonFrames().size());
/*
* Two mappings involve pep1 (dna to pep1, cds to pep1)
* Mapping from pep1 to GGGTTT in first new exon sequence
*/
List pep1Mappings = MappingUtils
.findMappingsForSequence(pep1, cdsMappings);
assertEquals(2, pep1Mappings.size());
List mappings = MappingUtils
.findMappingsForSequence(cds.getSequenceAt(0), pep1Mappings);
assertEquals(1, mappings.size());
// map G to GGG
SearchResultsI sr = MappingUtils.buildSearchResults(pep1, 1, mappings);
assertEquals(1, sr.getResults().size());
SearchResultMatchI m = sr.getResults().get(0);
assertSame(cds1Dss, m.getSequence());
assertEquals(1, m.getStart());
assertEquals(3, m.getEnd());
// map F to TTT
sr = MappingUtils.buildSearchResults(pep1, 2, mappings);
m = sr.getResults().get(0);
assertSame(cds1Dss, m.getSequence());
assertEquals(4, m.getStart());
assertEquals(6, m.getEnd());
/*
* Two mappings involve pep2 (dna to pep2, cds to pep2)
* Verify mapping from pep2 to GGGTTTCCC in second new exon sequence
*/
List pep2Mappings = MappingUtils
.findMappingsForSequence(pep2, cdsMappings);
assertEquals(2, pep2Mappings.size());
mappings = MappingUtils.findMappingsForSequence(cds.getSequenceAt(1),
pep2Mappings);
assertEquals(1, mappings.size());
// map G to GGG
sr = MappingUtils.buildSearchResults(pep2, 1, mappings);
assertEquals(1, sr.getResults().size());
m = sr.getResults().get(0);
assertSame(cds2Dss, m.getSequence());
assertEquals(1, m.getStart());
assertEquals(3, m.getEnd());
// map F to TTT
sr = MappingUtils.buildSearchResults(pep2, 2, mappings);
m = sr.getResults().get(0);
assertSame(cds2Dss, m.getSequence());
assertEquals(4, m.getStart());
assertEquals(6, m.getEnd());
// map P to CCC
sr = MappingUtils.buildSearchResults(pep2, 3, mappings);
m = sr.getResults().get(0);
assertSame(cds2Dss, m.getSequence());
assertEquals(7, m.getStart());
assertEquals(9, m.getEnd());
/*
* check cds2 acquired a variant feature in position 5
*/
List sfs = cds2Dss.getSequenceFeatures();
assertNotNull(sfs);
assertEquals(1, sfs.size());
assertEquals("variant", sfs.get(0).type);
assertEquals(5, sfs.get(0).begin);
assertEquals(5, sfs.get(0).end);
}
/**
* Test the method that makes a cds-only alignment from a DNA sequence and its
* product mappings, for the case where there are multiple exon mappings to
* different protein products.
*/
@Test(groups = { "Functional" })
public void testMakeCdsAlignment_multipleProteins()
{
SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
SequenceI pep1 = new Sequence("pep1", "GF"); // GGGTTT
SequenceI pep2 = new Sequence("pep2", "KP"); // aaaccc
SequenceI pep3 = new Sequence("pep3", "KF"); // aaaTTT
dna1.createDatasetSequence();
pep1.createDatasetSequence();
pep2.createDatasetSequence();
pep3.createDatasetSequence();
pep1.getDatasetSequence().addDBRef(
new DBRefEntry("EMBLCDS", "2", "A12345"));
pep2.getDatasetSequence().addDBRef(
new DBRefEntry("EMBLCDS", "3", "A12346"));
pep3.getDatasetSequence().addDBRef(
new DBRefEntry("EMBLCDS", "4", "A12347"));
/*
* Create the CDS alignment
*/
AlignmentI dna = new Alignment(new SequenceI[] { dna1 });
dna.setDataset(null);
/*
* Make the mappings from dna to protein
*/
// map ...GGG...TTT to GF
MapList map = new MapList(new int[] { 4, 6, 10, 12 },
new int[] { 1, 2 }, 3, 1);
AlignedCodonFrame acf = new AlignedCodonFrame();
acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
dna.addCodonFrame(acf);
// map aaa...ccc to KP
map = new MapList(new int[] { 1, 3, 7, 9 }, new int[] { 1, 2 }, 3, 1);
acf = new AlignedCodonFrame();
acf.addMap(dna1.getDatasetSequence(), pep2.getDatasetSequence(), map);
dna.addCodonFrame(acf);
// map aaa......TTT to KF
map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 2 }, 3, 1);
acf = new AlignedCodonFrame();
acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
dna.addCodonFrame(acf);
/*
* execute method under test
*/
AlignmentI cdsal = AlignmentUtils.makeCdsAlignment(
new SequenceI[] { dna1 }, dna.getDataset(), null);
/*
* Verify we have 3 cds sequences, mapped to pep1/2/3 respectively
*/
List cds = cdsal.getSequences();
assertEquals(3, cds.size());
/*
* verify shared, extended alignment dataset
*/
assertSame(cdsal.getDataset(), dna.getDataset());
assertTrue(dna.getDataset().getSequences()
.contains(cds.get(0).getDatasetSequence()));
assertTrue(dna.getDataset().getSequences()
.contains(cds.get(1).getDatasetSequence()));
assertTrue(dna.getDataset().getSequences()
.contains(cds.get(2).getDatasetSequence()));
/*
* verify aligned cds sequences and their xrefs
*/
SequenceI cdsSeq = cds.get(0);
assertEquals("GGGTTT", cdsSeq.getSequenceAsString());
// assertEquals("dna1|A12345", cdsSeq.getName());
assertEquals("CDS|dna1", cdsSeq.getName());
// assertEquals(1, cdsSeq.getDBRefs().length);
// DBRefEntry cdsRef = cdsSeq.getDBRefs()[0];
// assertEquals("EMBLCDS", cdsRef.getSource());
// assertEquals("2", cdsRef.getVersion());
// assertEquals("A12345", cdsRef.getAccessionId());
cdsSeq = cds.get(1);
assertEquals("aaaccc", cdsSeq.getSequenceAsString());
// assertEquals("dna1|A12346", cdsSeq.getName());
assertEquals("CDS|dna1", cdsSeq.getName());
// assertEquals(1, cdsSeq.getDBRefs().length);
// cdsRef = cdsSeq.getDBRefs()[0];
// assertEquals("EMBLCDS", cdsRef.getSource());
// assertEquals("3", cdsRef.getVersion());
// assertEquals("A12346", cdsRef.getAccessionId());
cdsSeq = cds.get(2);
assertEquals("aaaTTT", cdsSeq.getSequenceAsString());
// assertEquals("dna1|A12347", cdsSeq.getName());
assertEquals("CDS|dna1", cdsSeq.getName());
// assertEquals(1, cdsSeq.getDBRefs().length);
// cdsRef = cdsSeq.getDBRefs()[0];
// assertEquals("EMBLCDS", cdsRef.getSource());
// assertEquals("4", cdsRef.getVersion());
// assertEquals("A12347", cdsRef.getAccessionId());
/*
* Verify there are mappings from each cds sequence to its protein product
* and also to its dna source
*/
List newMappings = cdsal.getCodonFrames();
/*
* 6 mappings involve dna1 (to pep1/2/3, cds1/2/3)
*/
List dnaMappings = MappingUtils
.findMappingsForSequence(dna1, newMappings);
assertEquals(6, dnaMappings.size());
/*
* dna1 to pep1
*/
List mappings = MappingUtils
.findMappingsForSequence(pep1, dnaMappings);
assertEquals(1, mappings.size());
assertEquals(1, mappings.get(0).getMappings().size());
assertSame(pep1.getDatasetSequence(), mappings.get(0).getMappings()
.get(0).getMapping().getTo());
/*
* dna1 to cds1
*/
List dnaToCds1Mappings = MappingUtils
.findMappingsForSequence(cds.get(0), dnaMappings);
Mapping mapping = dnaToCds1Mappings.get(0).getMappings().get(0)
.getMapping();
assertSame(cds.get(0).getDatasetSequence(), mapping.getTo());
assertEquals("G(1) in CDS should map to G(4) in DNA", 4, mapping
.getMap().getToPosition(1));
/*
* dna1 to pep2
*/
mappings = MappingUtils.findMappingsForSequence(pep2, dnaMappings);
assertEquals(1, mappings.size());
assertEquals(1, mappings.get(0).getMappings().size());
assertSame(pep2.getDatasetSequence(), mappings.get(0).getMappings()
.get(0).getMapping().getTo());
/*
* dna1 to cds2
*/
List dnaToCds2Mappings = MappingUtils
.findMappingsForSequence(cds.get(1), dnaMappings);
mapping = dnaToCds2Mappings.get(0).getMappings().get(0).getMapping();
assertSame(cds.get(1).getDatasetSequence(), mapping.getTo());
assertEquals("c(4) in CDS should map to c(7) in DNA", 7, mapping
.getMap().getToPosition(4));
/*
* dna1 to pep3
*/
mappings = MappingUtils.findMappingsForSequence(pep3, dnaMappings);
assertEquals(1, mappings.size());
assertEquals(1, mappings.get(0).getMappings().size());
assertSame(pep3.getDatasetSequence(), mappings.get(0).getMappings()
.get(0).getMapping().getTo());
/*
* dna1 to cds3
*/
List dnaToCds3Mappings = MappingUtils
.findMappingsForSequence(cds.get(2), dnaMappings);
mapping = dnaToCds3Mappings.get(0).getMappings().get(0).getMapping();
assertSame(cds.get(2).getDatasetSequence(), mapping.getTo());
assertEquals("T(4) in CDS should map to T(10) in DNA", 10, mapping
.getMap().getToPosition(4));
}
@Test(groups = { "Functional" })
public void testIsMappable()
{
SequenceI dna1 = new Sequence("dna1", "cgCAGtgGT");
SequenceI aa1 = new Sequence("aa1", "RSG");
AlignmentI al1 = new Alignment(new SequenceI[] { dna1 });
AlignmentI al2 = new Alignment(new SequenceI[] { aa1 });
assertFalse(AlignmentUtils.isMappable(null, null));
assertFalse(AlignmentUtils.isMappable(al1, null));
assertFalse(AlignmentUtils.isMappable(null, al1));
assertFalse(AlignmentUtils.isMappable(al1, al1));
assertFalse(AlignmentUtils.isMappable(al2, al2));
assertTrue(AlignmentUtils.isMappable(al1, al2));
assertTrue(AlignmentUtils.isMappable(al2, al1));
}
/**
* Test creating a mapping when the sequences involved do not start at residue
* 1
*
* @throws IOException
*/
@Test(groups = { "Functional" })
public void testMapCdnaToProtein_forSubsequence() throws IOException
{
SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
prot.createDatasetSequence();
SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
dna.createDatasetSequence();
MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
assertEquals(10, map.getToLowest());
assertEquals(12, map.getToHighest());
assertEquals(40, map.getFromLowest());
assertEquals(48, map.getFromHighest());
}
/**
* Test for the alignSequenceAs method where we have protein mapped to protein
*/
@Test(groups = { "Functional" })
public void testAlignSequenceAs_mappedProteinProtein()
{
SequenceI alignMe = new Sequence("Match", "MGAASEV");
alignMe.createDatasetSequence();
SequenceI alignFrom = new Sequence("Query", "LQTGYMGAASEVMFSPTRR");
alignFrom.createDatasetSequence();
AlignedCodonFrame acf = new AlignedCodonFrame();
// this is like a domain or motif match of part of a peptide sequence
MapList map = new MapList(new int[] { 6, 12 }, new int[] { 1, 7 }, 1, 1);
acf.addMap(alignFrom.getDatasetSequence(),
alignMe.getDatasetSequence(), map);
AlignmentUtils.alignSequenceAs(alignMe, alignFrom, acf, "-", '-', true,
true);
assertEquals("-----MGAASEV-------", alignMe.getSequenceAsString());
}
/**
* Test for the alignSequenceAs method where there are trailing unmapped
* residues in the model sequence
*/
@Test(groups = { "Functional" })
public void testAlignSequenceAs_withTrailingPeptide()
{
// map first 3 codons to KPF; G is a trailing unmapped residue
MapList map = new MapList(new int[] { 1, 9 }, new int[] { 1, 3 }, 3, 1);
checkAlignSequenceAs("AAACCCTTT", "K-PFG", true, true, map,
"AAA---CCCTTT---");
}
/**
* Tests for transferring features between mapped sequences
*/
@Test(groups = { "Functional" })
public void testTransferFeatures()
{
SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
// no overlap
dna.addSequenceFeature(new SequenceFeature("type1", "desc1", 1, 2, 1f,
null));
// partial overlap - to [1, 1]
dna.addSequenceFeature(new SequenceFeature("type2", "desc2", 3, 4, 2f,
null));
// exact overlap - to [1, 3]
dna.addSequenceFeature(new SequenceFeature("type3", "desc3", 4, 6, 3f,
null));
// spanning overlap - to [2, 5]
dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
null));
// exactly overlaps whole mapped range [1, 6]
dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
null));
// no overlap (internal)
dna.addSequenceFeature(new SequenceFeature("type6", "desc6", 7, 9, 6f,
null));
// no overlap (3' end)
dna.addSequenceFeature(new SequenceFeature("type7", "desc7", 13, 15,
7f, null));
// overlap (3' end) - to [6, 6]
dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
8f, null));
// extended overlap - to [6, +]
dna.addSequenceFeature(new SequenceFeature("type9", "desc9", 12, 13,
9f, null));
MapList map = new MapList(new int[] { 4, 6, 10, 12 },
new int[] { 1, 6 }, 1, 1);
/*
* transferFeatures() will build 'partial overlap' for regions
* that partially overlap 5' or 3' (start or end) of target sequence
*/
AlignmentUtils.transferFeatures(dna, cds, map, null);
List sfs = cds.getSequenceFeatures();
assertEquals(6, sfs.size());
SequenceFeature sf = sfs.get(0);
assertEquals("type2", sf.getType());
assertEquals("desc2", sf.getDescription());
assertEquals(2f, sf.getScore());
assertEquals(1, sf.getBegin());
assertEquals(1, sf.getEnd());
sf = sfs.get(1);
assertEquals("type3", sf.getType());
assertEquals("desc3", sf.getDescription());
assertEquals(3f, sf.getScore());
assertEquals(1, sf.getBegin());
assertEquals(3, sf.getEnd());
sf = sfs.get(2);
assertEquals("type4", sf.getType());
assertEquals(2, sf.getBegin());
assertEquals(5, sf.getEnd());
sf = sfs.get(3);
assertEquals("type5", sf.getType());
assertEquals(1, sf.getBegin());
assertEquals(6, sf.getEnd());
sf = sfs.get(4);
assertEquals("type8", sf.getType());
assertEquals(6, sf.getBegin());
assertEquals(6, sf.getEnd());
sf = sfs.get(5);
assertEquals("type9", sf.getType());
assertEquals(6, sf.getBegin());
assertEquals(6, sf.getEnd());
}
/**
* Tests for transferring features between mapped sequences
*/
@Test(groups = { "Functional" })
public void testTransferFeatures_withOmit()
{
SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
MapList map = new MapList(new int[] { 4, 6, 10, 12 },
new int[] { 1, 6 }, 1, 1);
// [5, 11] maps to [2, 5]
dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
null));
// [4, 12] maps to [1, 6]
dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
null));
// [12, 12] maps to [6, 6]
dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
8f, null));
// desc4 and desc8 are the 'omit these' varargs
AlignmentUtils.transferFeatures(dna, cds, map, null, "type4", "type8");
List sfs = cds.getSequenceFeatures();
assertEquals(1, sfs.size());
SequenceFeature sf = sfs.get(0);
assertEquals("type5", sf.getType());
assertEquals(1, sf.getBegin());
assertEquals(6, sf.getEnd());
}
/**
* Tests for transferring features between mapped sequences
*/
@Test(groups = { "Functional" })
public void testTransferFeatures_withSelect()
{
SequenceI dna = new Sequence("dna/20-34", "acgTAGcaaGCCcgt");
SequenceI cds = new Sequence("cds/10-15", "TAGGCC");
MapList map = new MapList(new int[] { 4, 6, 10, 12 },
new int[] { 1, 6 }, 1, 1);
// [5, 11] maps to [2, 5]
dna.addSequenceFeature(new SequenceFeature("type4", "desc4", 5, 11, 4f,
null));
// [4, 12] maps to [1, 6]
dna.addSequenceFeature(new SequenceFeature("type5", "desc5", 4, 12, 5f,
null));
// [12, 12] maps to [6, 6]
dna.addSequenceFeature(new SequenceFeature("type8", "desc8", 12, 12,
8f, null));
// "type5" is the 'select this type' argument
AlignmentUtils.transferFeatures(dna, cds, map, "type5");
List sfs = cds.getSequenceFeatures();
assertEquals(1, sfs.size());
SequenceFeature sf = sfs.get(0);
assertEquals("type5", sf.getType());
assertEquals(1, sf.getBegin());
assertEquals(6, sf.getEnd());
}
/**
* Test the method that extracts the cds-only part of a dna alignment, for the
* case where the cds should be aligned to match its nucleotide sequence.
*/
@Test(groups = { "Functional" })
public void testMakeCdsAlignment_alternativeTranscripts()
{
SequenceI dna1 = new Sequence("dna1", "aaaGGGCC-----CTTTaaaGGG");
// alternative transcript of same dna skips CCC codon
SequenceI dna2 = new Sequence("dna2", "aaaGGGCC-----cttTaaaGGG");
// dna3 has no mapping (protein product) so should be ignored here
SequenceI dna3 = new Sequence("dna3", "aaaGGGCCCCCGGGcttTaaaGGG");
SequenceI pep1 = new Sequence("pep1", "GPFG");
SequenceI pep2 = new Sequence("pep2", "GPG");
dna1.createDatasetSequence();
dna2.createDatasetSequence();
dna3.createDatasetSequence();
pep1.createDatasetSequence();
pep2.createDatasetSequence();
AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
dna.setDataset(null);
MapList map = new MapList(new int[] { 4, 12, 16, 18 },
new int[] { 1, 4 }, 3, 1);
AlignedCodonFrame acf = new AlignedCodonFrame();
acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
dna.addCodonFrame(acf);
map = new MapList(new int[] { 4, 8, 12, 12, 16, 18 },
new int[] { 1, 3 }, 3, 1);
acf = new AlignedCodonFrame();
acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
dna.addCodonFrame(acf);
AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
dna1, dna2, dna3 }, dna.getDataset(), null);
List cdsSeqs = cds.getSequences();
assertEquals(2, cdsSeqs.size());
assertEquals("GGGCCCTTTGGG", cdsSeqs.get(0).getSequenceAsString());
assertEquals("GGGCCTGGG", cdsSeqs.get(1).getSequenceAsString());
/*
* verify shared, extended alignment dataset
*/
assertSame(dna.getDataset(), cds.getDataset());
assertTrue(dna.getDataset().getSequences()
.contains(cdsSeqs.get(0).getDatasetSequence()));
assertTrue(dna.getDataset().getSequences()
.contains(cdsSeqs.get(1).getDatasetSequence()));
/*
* Verify 6 mappings: dna1 to cds1, cds1 to pep1, dna1 to pep1
* and the same for dna2/cds2/pep2
*/
List mappings = cds.getCodonFrames();
assertEquals(6, mappings.size());
/*
* 2 mappings involve pep1
*/
List pep1Mappings = MappingUtils
.findMappingsForSequence(pep1, mappings);
assertEquals(2, pep1Mappings.size());
/*
* Get mapping of pep1 to cds1 and verify it
* maps GPFG to 1-3,4-6,7-9,10-12
*/
List pep1CdsMappings = MappingUtils
.findMappingsForSequence(cds.getSequenceAt(0), pep1Mappings);
assertEquals(1, pep1CdsMappings.size());
SearchResultsI sr = MappingUtils.buildSearchResults(pep1, 1,
pep1CdsMappings);
assertEquals(1, sr.getResults().size());
SearchResultMatchI m = sr.getResults().get(0);
assertEquals(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence());
assertEquals(1, m.getStart());
assertEquals(3, m.getEnd());
sr = MappingUtils.buildSearchResults(pep1, 2, pep1CdsMappings);
m = sr.getResults().get(0);
assertEquals(4, m.getStart());
assertEquals(6, m.getEnd());
sr = MappingUtils.buildSearchResults(pep1, 3, pep1CdsMappings);
m = sr.getResults().get(0);
assertEquals(7, m.getStart());
assertEquals(9, m.getEnd());
sr = MappingUtils.buildSearchResults(pep1, 4, pep1CdsMappings);
m = sr.getResults().get(0);
assertEquals(10, m.getStart());
assertEquals(12, m.getEnd());
/*
* Get mapping of pep2 to cds2 and verify it
* maps GPG in pep2 to 1-3,4-6,7-9 in second CDS sequence
*/
List pep2Mappings = MappingUtils
.findMappingsForSequence(pep2, mappings);
assertEquals(2, pep2Mappings.size());
List pep2CdsMappings = MappingUtils
.findMappingsForSequence(cds.getSequenceAt(1), pep2Mappings);
assertEquals(1, pep2CdsMappings.size());
sr = MappingUtils.buildSearchResults(pep2, 1, pep2CdsMappings);
assertEquals(1, sr.getResults().size());
m = sr.getResults().get(0);
assertEquals(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
assertEquals(1, m.getStart());
assertEquals(3, m.getEnd());
sr = MappingUtils.buildSearchResults(pep2, 2, pep2CdsMappings);
m = sr.getResults().get(0);
assertEquals(4, m.getStart());
assertEquals(6, m.getEnd());
sr = MappingUtils.buildSearchResults(pep2, 3, pep2CdsMappings);
m = sr.getResults().get(0);
assertEquals(7, m.getStart());
assertEquals(9, m.getEnd());
}
/**
* Test the method that realigns protein to match mapped codon alignment.
*/
@Test(groups = { "Functional" })
public void testAlignProteinAsDna_incompleteStartCodon()
{
// seq1: incomplete start codon (not mapped), then [3, 11]
SequenceI dna1 = new Sequence("Seq1", "ccAAA-TTT-GGG-");
// seq2 codons are [4, 5], [8, 11]
SequenceI dna2 = new Sequence("Seq2", "ccaAA-ttT-GGG-");
// seq3 incomplete start codon at 'tt'
SequenceI dna3 = new Sequence("Seq3", "ccaaa-ttt-GGG-");
AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2, dna3 });
dna.setDataset(null);
// prot1 has 'X' for incomplete start codon (not mapped)
SequenceI prot1 = new Sequence("Seq1", "XKFG"); // X for incomplete start
SequenceI prot2 = new Sequence("Seq2", "NG");
SequenceI prot3 = new Sequence("Seq3", "XG"); // X for incomplete start
AlignmentI protein = new Alignment(new SequenceI[] { prot1, prot2,
prot3 });
protein.setDataset(null);
// map dna1 [3, 11] to prot1 [2, 4] KFG
MapList map = new MapList(new int[] { 3, 11 }, new int[] { 2, 4 }, 3, 1);
AlignedCodonFrame acf = new AlignedCodonFrame();
acf.addMap(dna1.getDatasetSequence(), prot1.getDatasetSequence(), map);
// map dna2 [4, 5] [8, 11] to prot2 [1, 2] NG
map = new MapList(new int[] { 4, 5, 8, 11 }, new int[] { 1, 2 }, 3, 1);
acf.addMap(dna2.getDatasetSequence(), prot2.getDatasetSequence(), map);
// map dna3 [9, 11] to prot3 [2, 2] G
map = new MapList(new int[] { 9, 11 }, new int[] { 2, 2 }, 3, 1);
acf.addMap(dna3.getDatasetSequence(), prot3.getDatasetSequence(), map);
ArrayList acfs = new ArrayList();
acfs.add(acf);
protein.setCodonFrames(acfs);
/*
* verify X is included in the aligned proteins, and placed just
* before the first mapped residue
* CCT is between CCC and TTT
*/
AlignmentUtils.alignProteinAsDna(protein, dna);
assertEquals("XK-FG", prot1.getSequenceAsString());
assertEquals("--N-G", prot2.getSequenceAsString());
assertEquals("---XG", prot3.getSequenceAsString());
}
/**
* Tests for the method that maps the subset of a dna sequence that has CDS
* (or subtype) feature - case where the start codon is incomplete.
*/
@Test(groups = "Functional")
public void testFindCdsPositions_fivePrimeIncomplete()
{
SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
dnaSeq.createDatasetSequence();
SequenceI ds = dnaSeq.getDatasetSequence();
// CDS for dna 5-6 (incomplete codon), 7-9
SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
sf.setPhase("2"); // skip 2 bases to start of next codon
ds.addSequenceFeature(sf);
// CDS for dna 13-15
sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
ds.addSequenceFeature(sf);
List ranges = AlignmentUtils.findCdsPositions(dnaSeq);
/*
* check the mapping starts with the first complete codon
*/
assertEquals(6, MappingUtils.getLength(ranges));
assertEquals(2, ranges.size());
assertEquals(7, ranges.get(0)[0]);
assertEquals(9, ranges.get(0)[1]);
assertEquals(13, ranges.get(1)[0]);
assertEquals(15, ranges.get(1)[1]);
}
/**
* Tests for the method that maps the subset of a dna sequence that has CDS
* (or subtype) feature.
*/
@Test(groups = "Functional")
public void testFindCdsPositions()
{
SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
dnaSeq.createDatasetSequence();
SequenceI ds = dnaSeq.getDatasetSequence();
// CDS for dna 10-12
SequenceFeature sf = new SequenceFeature("CDS_predicted", "", 10, 12,
0f, null);
sf.setStrand("+");
ds.addSequenceFeature(sf);
// CDS for dna 4-6
sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
sf.setStrand("+");
ds.addSequenceFeature(sf);
// exon feature should be ignored here
sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
ds.addSequenceFeature(sf);
List ranges = AlignmentUtils.findCdsPositions(dnaSeq);
/*
* verify ranges { [4-6], [12-10] }
* note CDS ranges are ordered ascending even if the CDS
* features are not
*/
assertEquals(6, MappingUtils.getLength(ranges));
assertEquals(2, ranges.size());
assertEquals(4, ranges.get(0)[0]);
assertEquals(6, ranges.get(0)[1]);
assertEquals(10, ranges.get(1)[0]);
assertEquals(12, ranges.get(1)[1]);
}
/**
* Test the method that computes a map of codon variants for each protein
* position from "sequence_variant" features on dna
*/
@Test(groups = "Functional")
public void testBuildDnaVariantsMap()
{
SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag");
MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1);
/*
* first with no variants on dna
*/
LinkedHashMap[]> variantsMap = AlignmentUtils
.buildDnaVariantsMap(dna, map);
assertTrue(variantsMap.isEmpty());
/*
* single allele codon 1, on base 1
*/
SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
0f, null);
sf1.setValue("alleles", "T");
sf1.setValue("ID", "sequence_variant:rs758803211");
dna.addSequenceFeature(sf1);
/*
* two alleles codon 2, on bases 2 and 3 (distinct variants)
*/
SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 5, 5,
0f, null);
sf2.setValue("alleles", "T");
sf2.setValue("ID", "sequence_variant:rs758803212");
dna.addSequenceFeature(sf2);
SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 6, 6,
0f, null);
sf3.setValue("alleles", "G");
sf3.setValue("ID", "sequence_variant:rs758803213");
dna.addSequenceFeature(sf3);
/*
* two alleles codon 3, both on base 2 (one variant)
*/
SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 8, 8,
0f, null);
sf4.setValue("alleles", "C, G");
sf4.setValue("ID", "sequence_variant:rs758803214");
dna.addSequenceFeature(sf4);
// no alleles on codon 4
/*
* alleles on codon 5 on all 3 bases (distinct variants)
*/
SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 13,
13, 0f, null);
sf5.setValue("alleles", "C, G"); // (C duplicates given base value)
sf5.setValue("ID", "sequence_variant:rs758803215");
dna.addSequenceFeature(sf5);
SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 14,
14, 0f, null);
sf6.setValue("alleles", "g, a"); // should force to upper-case
sf6.setValue("ID", "sequence_variant:rs758803216");
dna.addSequenceFeature(sf6);
SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 15,
15, 0f, null);
sf7.setValue("alleles", "A, T");
sf7.setValue("ID", "sequence_variant:rs758803217");
dna.addSequenceFeature(sf7);
/*
* build map - expect variants on positions 1, 2, 3, 5
*/
variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
assertEquals(4, variantsMap.size());
/*
* protein residue 1: variant on codon (ATG) base 1, not on 2 or 3
*/
List[] pep1Variants = variantsMap.get(1);
assertEquals(3, pep1Variants.length);
assertEquals(1, pep1Variants[0].size());
assertEquals("A", pep1Variants[0].get(0).base); // codon[1] base
assertSame(sf1, pep1Variants[0].get(0).variant); // codon[1] variant
assertEquals(1, pep1Variants[1].size());
assertEquals("T", pep1Variants[1].get(0).base); // codon[2] base
assertNull(pep1Variants[1].get(0).variant); // no variant here
assertEquals(1, pep1Variants[2].size());
assertEquals("G", pep1Variants[2].get(0).base); // codon[3] base
assertNull(pep1Variants[2].get(0).variant); // no variant here
/*
* protein residue 2: variants on codon (AAA) bases 2 and 3
*/
List[] pep2Variants = variantsMap.get(2);
assertEquals(3, pep2Variants.length);
assertEquals(1, pep2Variants[0].size());
// codon[1] base recorded while processing variant on codon[2]
assertEquals("A", pep2Variants[0].get(0).base);
assertNull(pep2Variants[0].get(0).variant); // no variant here
// codon[2] base and variant:
assertEquals(1, pep2Variants[1].size());
assertEquals("A", pep2Variants[1].get(0).base);
assertSame(sf2, pep2Variants[1].get(0).variant);
// codon[3] base was recorded when processing codon[2] variant
// and then the variant for codon[3] added to it
assertEquals(1, pep2Variants[2].size());
assertEquals("A", pep2Variants[2].get(0).base);
assertSame(sf3, pep2Variants[2].get(0).variant);
/*
* protein residue 3: variants on codon (TTT) base 2 only
*/
List[] pep3Variants = variantsMap.get(3);
assertEquals(3, pep3Variants.length);
assertEquals(1, pep3Variants[0].size());
assertEquals("T", pep3Variants[0].get(0).base); // codon[1] base
assertNull(pep3Variants[0].get(0).variant); // no variant here
assertEquals(1, pep3Variants[1].size());
assertEquals("T", pep3Variants[1].get(0).base); // codon[2] base
assertSame(sf4, pep3Variants[1].get(0).variant); // codon[2] variant
assertEquals(1, pep3Variants[2].size());
assertEquals("T", pep3Variants[2].get(0).base); // codon[3] base
assertNull(pep3Variants[2].get(0).variant); // no variant here
/*
* three variants on protein position 5
*/
List[] pep5Variants = variantsMap.get(5);
assertEquals(3, pep5Variants.length);
assertEquals(1, pep5Variants[0].size());
assertEquals("C", pep5Variants[0].get(0).base); // codon[1] base
assertSame(sf5, pep5Variants[0].get(0).variant); // codon[1] variant
assertEquals(1, pep5Variants[1].size());
assertEquals("C", pep5Variants[1].get(0).base); // codon[2] base
assertSame(sf6, pep5Variants[1].get(0).variant); // codon[2] variant
assertEquals(1, pep5Variants[2].size());
assertEquals("C", pep5Variants[2].get(0).base); // codon[3] base
assertSame(sf7, pep5Variants[2].get(0).variant); // codon[3] variant
}
/**
* Tests for the method that computes all peptide variants given codon
* variants
*/
@Test(groups = "Functional")
public void testComputePeptideVariants()
{
/*
* scenario: AAATTTCCC codes for KFP
* variants:
* GAA -> E source: Ensembl
* CAA -> Q source: dbSNP
* AAG synonymous source: COSMIC
* AAT -> N source: Ensembl
* ...TTC synonymous source: dbSNP
* ......CAC,CGC -> H,R source: COSMIC
* (one variant with two alleles)
*/
SequenceI peptide = new Sequence("pep/10-12", "KFP");
/*
* two distinct variants for codon 1 position 1
* second one has clinical significance
*/
String ensembl = "Ensembl";
String dbSnp = "dbSNP";
String cosmic = "COSMIC";
SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
0f, ensembl);
sf1.setValue("alleles", "A,G"); // GAA -> E
sf1.setValue("ID", "var1.125A>G");
SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1,
0f, dbSnp);
sf2.setValue("alleles", "A,C"); // CAA -> Q
sf2.setValue("ID", "var2");
sf2.setValue("clinical_significance", "Dodgy");
SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 3, 3,
0f, cosmic);
sf3.setValue("alleles", "A,G"); // synonymous
sf3.setValue("ID", "var3");
sf3.setValue("clinical_significance", "None");
SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3,
0f, ensembl);
sf4.setValue("alleles", "A,T"); // AAT -> N
sf4.setValue("ID", "sequence_variant:var4"); // prefix gets stripped off
sf4.setValue("clinical_significance", "Benign");
SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 6, 6,
0f, dbSnp);
sf5.setValue("alleles", "T,C"); // synonymous
sf5.setValue("ID", "var5");
sf5.setValue("clinical_significance", "Bad");
SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 8, 8,
0f, cosmic);
sf6.setValue("alleles", "C,A,G"); // CAC,CGC -> H,R
sf6.setValue("ID", "var6");
sf6.setValue("clinical_significance", "Good");
List codon1Variants = new ArrayList();
List codon2Variants = new ArrayList();
List codon3Variants = new ArrayList();
List codonVariants[] = new ArrayList[3];
codonVariants[0] = codon1Variants;
codonVariants[1] = codon2Variants;
codonVariants[2] = codon3Variants;
/*
* compute variants for protein position 1
*/
codon1Variants.add(new DnaVariant("A", sf1));
codon1Variants.add(new DnaVariant("A", sf2));
codon2Variants.add(new DnaVariant("A"));
codon2Variants.add(new DnaVariant("A"));
codon3Variants.add(new DnaVariant("A", sf3));
codon3Variants.add(new DnaVariant("A", sf4));
AlignmentUtils.computePeptideVariants(peptide, 1, codonVariants);
/*
* compute variants for protein position 2
*/
codon1Variants.clear();
codon2Variants.clear();
codon3Variants.clear();
codon1Variants.add(new DnaVariant("T"));
codon2Variants.add(new DnaVariant("T"));
codon3Variants.add(new DnaVariant("T", sf5));
AlignmentUtils.computePeptideVariants(peptide, 2, codonVariants);
/*
* compute variants for protein position 3
*/
codon1Variants.clear();
codon2Variants.clear();
codon3Variants.clear();
codon1Variants.add(new DnaVariant("C"));
codon2Variants.add(new DnaVariant("C", sf6));
codon3Variants.add(new DnaVariant("C"));
AlignmentUtils.computePeptideVariants(peptide, 3, codonVariants);
/*
* verify added sequence features for
* var1 K -> E Ensembl
* var2 K -> Q dbSNP
* var4 K -> N Ensembl
* var6 P -> H COSMIC
* var6 P -> R COSMIC
*/
List sfs = peptide.getSequenceFeatures();
SequenceFeatures.sortFeatures(sfs, true);
assertEquals(5, sfs.size());
/*
* features are sorted by start position ascending, but in no
* particular order where start positions match; asserts here
* simply match the data returned (the order is not important)
*/
SequenceFeature sf = sfs.get(0);
assertEquals(1, sf.getBegin());
assertEquals(1, sf.getEnd());
assertEquals("p.Lys1Asn", sf.getDescription());
assertEquals("var4", sf.getValue("ID"));
assertEquals("Benign", sf.getValue("clinical_significance"));
assertEquals("ID=var4;clinical_significance=Benign", sf.getAttributes());
assertEquals(1, sf.links.size());
assertEquals(
"p.Lys1Asn var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4",
sf.links.get(0));
assertEquals(ensembl, sf.getFeatureGroup());
sf = sfs.get(1);
assertEquals(1, sf.getBegin());
assertEquals(1, sf.getEnd());
assertEquals("p.Lys1Gln", sf.getDescription());
assertEquals("var2", sf.getValue("ID"));
assertEquals("Dodgy", sf.getValue("clinical_significance"));
assertEquals("ID=var2;clinical_significance=Dodgy", sf.getAttributes());
assertEquals(1, sf.links.size());
assertEquals(
"p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2",
sf.links.get(0));
assertEquals(dbSnp, sf.getFeatureGroup());
sf = sfs.get(2);
assertEquals(1, sf.getBegin());
assertEquals(1, sf.getEnd());
assertEquals("p.Lys1Glu", sf.getDescription());
assertEquals("var1.125A>G", sf.getValue("ID"));
assertNull(sf.getValue("clinical_significance"));
assertEquals("ID=var1.125A>G", sf.getAttributes());
assertEquals(1, sf.links.size());
// link to variation is urlencoded
assertEquals(
"p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG",
sf.links.get(0));
assertEquals(ensembl, sf.getFeatureGroup());
sf = sfs.get(3);
assertEquals(3, sf.getBegin());
assertEquals(3, sf.getEnd());
assertEquals("p.Pro3Arg", sf.getDescription());
assertEquals("var6", sf.getValue("ID"));
assertEquals("Good", sf.getValue("clinical_significance"));
assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
assertEquals(1, sf.links.size());
assertEquals(
"p.Pro3Arg var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
sf.links.get(0));
assertEquals(cosmic, sf.getFeatureGroup());
// var5 generates two distinct protein variant features
sf = sfs.get(4);
assertEquals(3, sf.getBegin());
assertEquals(3, sf.getEnd());
assertEquals("p.Pro3His", sf.getDescription());
assertEquals("var6", sf.getValue("ID"));
assertEquals("Good", sf.getValue("clinical_significance"));
assertEquals("ID=var6;clinical_significance=Good", sf.getAttributes());
assertEquals(1, sf.links.size());
assertEquals(
"p.Pro3His var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
sf.links.get(0));
assertEquals(cosmic, sf.getFeatureGroup());
}
/**
* Tests for the method that maps the subset of a dna sequence that has CDS
* (or subtype) feature, with CDS strand = '-' (reverse)
*/
// test turned off as currently findCdsPositions is not strand-dependent
// left in case it comes around again...
@Test(groups = "Functional", enabled = false)
public void testFindCdsPositions_reverseStrand()
{
SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
dnaSeq.createDatasetSequence();
SequenceI ds = dnaSeq.getDatasetSequence();
// CDS for dna 4-6
SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
sf.setStrand("-");
ds.addSequenceFeature(sf);
// exon feature should be ignored here
sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
ds.addSequenceFeature(sf);
// CDS for dna 10-12
sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
sf.setStrand("-");
ds.addSequenceFeature(sf);
List ranges = AlignmentUtils.findCdsPositions(dnaSeq);
/*
* verify ranges { [12-10], [6-4] }
*/
assertEquals(6, MappingUtils.getLength(ranges));
assertEquals(2, ranges.size());
assertEquals(12, ranges.get(0)[0]);
assertEquals(10, ranges.get(0)[1]);
assertEquals(6, ranges.get(1)[0]);
assertEquals(4, ranges.get(1)[1]);
}
/**
* Tests for the method that maps the subset of a dna sequence that has CDS
* (or subtype) feature - reverse strand case where the start codon is
* incomplete.
*/
@Test(groups = "Functional", enabled = false)
// test turned off as currently findCdsPositions is not strand-dependent
// left in case it comes around again...
public void testFindCdsPositions_reverseStrandThreePrimeIncomplete()
{
SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
dnaSeq.createDatasetSequence();
SequenceI ds = dnaSeq.getDatasetSequence();
// CDS for dna 5-9
SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
sf.setStrand("-");
ds.addSequenceFeature(sf);
// CDS for dna 13-15
sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
sf.setStrand("-");
sf.setPhase("2"); // skip 2 bases to start of next codon
ds.addSequenceFeature(sf);
List ranges = AlignmentUtils.findCdsPositions(dnaSeq);
/*
* check the mapping starts with the first complete codon
* expect ranges [13, 13], [9, 5]
*/
assertEquals(6, MappingUtils.getLength(ranges));
assertEquals(2, ranges.size());
assertEquals(13, ranges.get(0)[0]);
assertEquals(13, ranges.get(0)[1]);
assertEquals(9, ranges.get(1)[0]);
assertEquals(5, ranges.get(1)[1]);
}
@Test(groups = "Functional")
public void testAlignAs_alternateTranscriptsUngapped()
{
SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
((Alignment) dna).createDatasetAlignment();
SequenceI cds1 = new Sequence("cds1", "GGGTTT");
SequenceI cds2 = new Sequence("cds2", "CCCAAA");
AlignmentI cds = new Alignment(new SequenceI[] { cds1, cds2 });
((Alignment) cds).createDatasetAlignment();
AlignedCodonFrame acf = new AlignedCodonFrame();
MapList map = new MapList(new int[] { 4, 9 }, new int[] { 1, 6 }, 1, 1);
acf.addMap(dna1.getDatasetSequence(), cds1.getDatasetSequence(), map);
map = new MapList(new int[] { 1, 3, 10, 12 }, new int[] { 1, 6 }, 1, 1);
acf.addMap(dna2.getDatasetSequence(), cds2.getDatasetSequence(), map);
/*
* verify CDS alignment is as:
* cccGGGTTTaaa (cdna)
* CCCgggtttAAA (cdna)
*
* ---GGGTTT--- (cds)
* CCC------AAA (cds)
*/
dna.addCodonFrame(acf);
AlignmentUtils.alignAs(cds, dna);
assertEquals("---GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
assertEquals("CCC------AAA", cds.getSequenceAt(1).getSequenceAsString());
}
@Test(groups = { "Functional" })
public void testAddMappedPositions()
{
SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
SequenceI seq1 = new Sequence("cds", "AAATTT");
from.createDatasetSequence();
seq1.createDatasetSequence();
Mapping mapping = new Mapping(seq1, new MapList(
new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1));
Map> map = new TreeMap>();
AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
/*
* verify map has seq1 residues in columns 3,4,6,7,11,12
*/
assertEquals(6, map.size());
assertEquals('A', map.get(3).get(seq1).charValue());
assertEquals('A', map.get(4).get(seq1).charValue());
assertEquals('A', map.get(6).get(seq1).charValue());
assertEquals('T', map.get(7).get(seq1).charValue());
assertEquals('T', map.get(11).get(seq1).charValue());
assertEquals('T', map.get(12).get(seq1).charValue());
/*
*
*/
}
/**
* Test case where the mapping 'from' range includes a stop codon which is
* absent in the 'to' range
*/
@Test(groups = { "Functional" })
public void testAddMappedPositions_withStopCodon()
{
SequenceI from = new Sequence("dna", "ggAA-ATcc-TT-g");
SequenceI seq1 = new Sequence("cds", "AAATTT");
from.createDatasetSequence();
seq1.createDatasetSequence();
Mapping mapping = new Mapping(seq1, new MapList(
new int[] { 3, 6, 9, 10 }, new int[] { 1, 6 }, 1, 1));
Map> map = new TreeMap>();
AlignmentUtils.addMappedPositions(seq1, from, mapping, map);
/*
* verify map has seq1 residues in columns 3,4,6,7,11,12
*/
assertEquals(6, map.size());
assertEquals('A', map.get(3).get(seq1).charValue());
assertEquals('A', map.get(4).get(seq1).charValue());
assertEquals('A', map.get(6).get(seq1).charValue());
assertEquals('T', map.get(7).get(seq1).charValue());
assertEquals('T', map.get(11).get(seq1).charValue());
assertEquals('T', map.get(12).get(seq1).charValue());
}
/**
* Test for the case where the products for which we want CDS are specified.
* This is to represent the case where EMBL has CDS mappings to both Uniprot
* and EMBLCDSPROTEIN. makeCdsAlignment() should only return the mappings for
* the protein sequences specified.
*/
@Test(groups = { "Functional" })
public void testMakeCdsAlignment_filterProducts()
{
SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
SequenceI pep1 = new Sequence("Uniprot|pep1", "GF");
SequenceI pep2 = new Sequence("Uniprot|pep2", "GFP");
SequenceI pep3 = new Sequence("EMBL|pep3", "GF");
SequenceI pep4 = new Sequence("EMBL|pep4", "GFP");
dna1.createDatasetSequence();
dna2.createDatasetSequence();
pep1.createDatasetSequence();
pep2.createDatasetSequence();
pep3.createDatasetSequence();
pep4.createDatasetSequence();
AlignmentI dna = new Alignment(new SequenceI[] { dna1, dna2 });
dna.setDataset(null);
AlignmentI emblPeptides = new Alignment(new SequenceI[] { pep3, pep4 });
emblPeptides.setDataset(null);
AlignedCodonFrame acf = new AlignedCodonFrame();
MapList map = new MapList(new int[] { 4, 6, 10, 12 },
new int[] { 1, 2 }, 3, 1);
acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
acf.addMap(dna1.getDatasetSequence(), pep3.getDatasetSequence(), map);
dna.addCodonFrame(acf);
acf = new AlignedCodonFrame();
map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
3, 1);
acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
acf.addMap(dna2.getDatasetSequence(), pep4.getDatasetSequence(), map);
dna.addCodonFrame(acf);
/*
* execute method under test to find CDS for EMBL peptides only
*/
AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
dna1, dna2 }, dna.getDataset(), emblPeptides.getSequencesArray());
assertEquals(2, cds.getSequences().size());
assertEquals("GGGTTT", cds.getSequenceAt(0).getSequenceAsString());
assertEquals("GGGTTTCCC", cds.getSequenceAt(1).getSequenceAsString());
/*
* verify shared, extended alignment dataset
*/
assertSame(dna.getDataset(), cds.getDataset());
assertTrue(dna.getDataset().getSequences()
.contains(cds.getSequenceAt(0).getDatasetSequence()));
assertTrue(dna.getDataset().getSequences()
.contains(cds.getSequenceAt(1).getDatasetSequence()));
/*
* Verify mappings from CDS to peptide, cDNA to CDS, and cDNA to peptide
* the mappings are on the shared alignment dataset
*/
List cdsMappings = cds.getDataset().getCodonFrames();
/*
* 6 mappings, 2*(DNA->CDS), 2*(DNA->Pep), 2*(CDS->Pep)
*/
assertEquals(6, cdsMappings.size());
/*
* verify that mapping sets for dna and cds alignments are different
* [not current behaviour - all mappings are on the alignment dataset]
*/
// select -> subselect type to test.
// Assert.assertNotSame(dna.getCodonFrames(), cds.getCodonFrames());
// assertEquals(4, dna.getCodonFrames().size());
// assertEquals(4, cds.getCodonFrames().size());
/*
* Two mappings involve pep3 (dna to pep3, cds to pep3)
* Mapping from pep3 to GGGTTT in first new exon sequence
*/
List pep3Mappings = MappingUtils
.findMappingsForSequence(pep3, cdsMappings);
assertEquals(2, pep3Mappings.size());
List mappings = MappingUtils
.findMappingsForSequence(cds.getSequenceAt(0), pep3Mappings);
assertEquals(1, mappings.size());
// map G to GGG
SearchResultsI sr = MappingUtils.buildSearchResults(pep3, 1, mappings);
assertEquals(1, sr.getResults().size());
SearchResultMatchI m = sr.getResults().get(0);
assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence());
assertEquals(1, m.getStart());
assertEquals(3, m.getEnd());
// map F to TTT
sr = MappingUtils.buildSearchResults(pep3, 2, mappings);
m = sr.getResults().get(0);
assertSame(cds.getSequenceAt(0).getDatasetSequence(), m.getSequence());
assertEquals(4, m.getStart());
assertEquals(6, m.getEnd());
/*
* Two mappings involve pep4 (dna to pep4, cds to pep4)
* Verify mapping from pep4 to GGGTTTCCC in second new exon sequence
*/
List pep4Mappings = MappingUtils
.findMappingsForSequence(pep4, cdsMappings);
assertEquals(2, pep4Mappings.size());
mappings = MappingUtils.findMappingsForSequence(cds.getSequenceAt(1),
pep4Mappings);
assertEquals(1, mappings.size());
// map G to GGG
sr = MappingUtils.buildSearchResults(pep4, 1, mappings);
assertEquals(1, sr.getResults().size());
m = sr.getResults().get(0);
assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
assertEquals(1, m.getStart());
assertEquals(3, m.getEnd());
// map F to TTT
sr = MappingUtils.buildSearchResults(pep4, 2, mappings);
m = sr.getResults().get(0);
assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
assertEquals(4, m.getStart());
assertEquals(6, m.getEnd());
// map P to CCC
sr = MappingUtils.buildSearchResults(pep4, 3, mappings);
m = sr.getResults().get(0);
assertSame(cds.getSequenceAt(1).getDatasetSequence(), m.getSequence());
assertEquals(7, m.getStart());
assertEquals(9, m.getEnd());
}
/**
* Test the method that just copies aligned sequences, provided all sequences
* to be aligned share the aligned sequence's dataset
*/
@Test(groups = "Functional")
public void testAlignAsSameSequences()
{
SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
AlignmentI al1 = new Alignment(new SequenceI[] { dna1, dna2 });
((Alignment) al1).createDatasetAlignment();
SequenceI dna3 = new Sequence(dna1);
SequenceI dna4 = new Sequence(dna2);
assertSame(dna3.getDatasetSequence(), dna1.getDatasetSequence());
assertSame(dna4.getDatasetSequence(), dna2.getDatasetSequence());
String seq1 = "-cc-GG-GT-TT--aaa";
dna3.setSequence(seq1);
String seq2 = "C--C-Cgg--gtt-tAA-A-";
dna4.setSequence(seq2);
AlignmentI al2 = new Alignment(new SequenceI[] { dna3, dna4 });
((Alignment) al2).createDatasetAlignment();
assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2));
assertEquals(seq1, al1.getSequenceAt(0).getSequenceAsString());
assertEquals(seq2, al1.getSequenceAt(1).getSequenceAsString());
/*
* add another sequence to 'aligned' - should still succeed, since
* unaligned sequences still share a dataset with aligned sequences
*/
SequenceI dna5 = new Sequence("dna5", "CCCgggtttAAA");
dna5.createDatasetSequence();
al2.addSequence(dna5);
assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2));
assertEquals(seq1, al1.getSequenceAt(0).getSequenceAsString());
assertEquals(seq2, al1.getSequenceAt(1).getSequenceAsString());
/*
* add another sequence to 'unaligned' - should fail, since now not
* all unaligned sequences share a dataset with aligned sequences
*/
SequenceI dna6 = new Sequence("dna6", "CCCgggtttAAA");
dna6.createDatasetSequence();
al1.addSequence(dna6);
// JAL-2110 JBP Comment: what's the use case for this behaviour ?
assertFalse(AlignmentUtils.alignAsSameSequences(al1, al2));
}
@Test(groups = "Functional")
public void testAlignAsSameSequencesMultipleSubSeq()
{
SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa");
SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA");
SequenceI as1 = dna1.deriveSequence();
SequenceI as2 = dna1.deriveSequence().getSubSequence(3, 7);
SequenceI as3 = dna2.deriveSequence();
as1.insertCharAt(6, 5, '-');
String s_as1 = as1.getSequenceAsString();
as2.insertCharAt(6, 5, '-');
String s_as2 = as2.getSequenceAsString();
as3.insertCharAt(6, 5, '-');
String s_as3 = as3.getSequenceAsString();
AlignmentI aligned = new Alignment(new SequenceI[] { as1, as2, as3 });
// why do we need to cast this still ?
((Alignment) aligned).createDatasetAlignment();
SequenceI uas1 = dna1.deriveSequence();
SequenceI uas2 = dna1.deriveSequence().getSubSequence(3, 7);
SequenceI uas3 = dna2.deriveSequence();
AlignmentI tobealigned = new Alignment(new SequenceI[] { uas1, uas2,
uas3 });
((Alignment) tobealigned).createDatasetAlignment();
assertTrue(AlignmentUtils.alignAsSameSequences(tobealigned, aligned));
assertEquals(s_as1, uas1.getSequenceAsString());
assertEquals(s_as2, uas2.getSequenceAsString());
assertEquals(s_as3, uas3.getSequenceAsString());
}
@Test(groups = { "Functional" })
public void testTransferGeneLoci()
{
SequenceI from = new Sequence("transcript",
"aaacccgggTTTAAACCCGGGtttaaacccgggttt");
SequenceI to = new Sequence("CDS", "TTTAAACCCGGG");
MapList map = new MapList(new int[] { 1, 12 }, new int[] { 10, 21 }, 1,
1);
/*
* first with nothing to transfer
*/
AlignmentUtils.transferGeneLoci(from, map, to);
assertNull(to.getGeneLoci());
/*
* next with gene loci set on 'from' sequence
*/
int[] exons = new int[] { 100, 105, 155, 164, 210, 229 };
MapList geneMap = new MapList(new int[] { 1, 36 }, exons, 1, 1);
from.setGeneLoci("human", "GRCh38", "7", geneMap);
AlignmentUtils.transferGeneLoci(from, map, to);
GeneLociI toLoci = to.getGeneLoci();
assertNotNull(toLoci);
// DBRefEntry constructor upper-cases 'source'
assertEquals("HUMAN", toLoci.getSpeciesId());
assertEquals("GRCh38", toLoci.getAssemblyId());
assertEquals("7", toLoci.getChromosomeId());
/*
* transcript 'exons' are 1-6, 7-16, 17-36
* CDS 1:12 is transcript 10-21
* transcript 'CDS' is 10-16, 17-21
* which is 'gene' 158-164, 210-214
*/
MapList toMap = toLoci.getMap();
assertEquals(1, toMap.getFromRanges().size());
assertEquals(2, toMap.getFromRanges().get(0).length);
assertEquals(1, toMap.getFromRanges().get(0)[0]);
assertEquals(12, toMap.getFromRanges().get(0)[1]);
assertEquals(1, toMap.getToRanges().size());
assertEquals(4, toMap.getToRanges().get(0).length);
assertEquals(158, toMap.getToRanges().get(0)[0]);
assertEquals(164, toMap.getToRanges().get(0)[1]);
assertEquals(210, toMap.getToRanges().get(0)[2]);
assertEquals(214, toMap.getToRanges().get(0)[3]);
// or summarised as (but toString might change in future):
assertEquals("[ [1, 12] ] 1:1 to [ [158, 164, 210, 214] ]",
toMap.toString());
/*
* an existing value is not overridden
*/
geneMap = new MapList(new int[] { 1, 36 }, new int[] { 36, 1 }, 1, 1);
from.setGeneLoci("inhuman", "GRCh37", "6", geneMap);
AlignmentUtils.transferGeneLoci(from, map, to);
assertEquals("GRCh38", toLoci.getAssemblyId());
assertEquals("7", toLoci.getChromosomeId());
toMap = toLoci.getMap();
assertEquals("[ [1, 12] ] 1:1 to [ [158, 164, 210, 214] ]",
toMap.toString());
}
}