import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
{
/**
- * Represents the 3 possible results of trying to map one alignment to
- * another.
- */
- public enum MappingResult
- {
- Mapped, NotMapped, AlreadyMapped
- }
-
- /**
* given an existing alignment, create a new alignment including all, or up to
* flankSize additional symbols from each sequence's dataset sequence
*
/**
* Build mapping of protein to cDNA alignment. Mappings are made between
* sequences where the cDNA translates to the protein sequence. Any new
- * mappings are added to the protein alignment. Has a 3-valued result: either
- * Mapped (at least one sequence mapping was created), AlreadyMapped (all
- * possible sequence mappings already exist), or NotMapped (no possible
- * sequence mappings exist).
+ * mappings are added to the protein alignment. Returns true if any mappings
+ * either already exist or were added, else false.
*
* @param proteinAlignment
* @param cdnaAlignment
* @return
*/
- public static MappingResult mapProteinToCdna(
+ public static boolean mapProteinToCdna(
final AlignmentI proteinAlignment,
final AlignmentI cdnaAlignment)
{
if (proteinAlignment == null || cdnaAlignment == null)
{
- return MappingResult.NotMapped;
+ return false;
}
- boolean mappingPossible = false;
- boolean mappingPerformed = false;
+ Set<SequenceI> mappedDna = new HashSet<SequenceI>();
+ Set<SequenceI> mappedProtein = new HashSet<SequenceI>();
- List<SequenceI> mapped = new ArrayList<SequenceI>();
+ /*
+ * First pass - map sequences where cross-references exist. This includes
+ * 1-to-many mappings to support, for example, variant cDNA.
+ */
+ boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
+ cdnaAlignment, mappedDna, mappedProtein, true);
+ /*
+ * Second pass - map sequences where no cross-references exist. This only
+ * does 1-to-1 mappings and assumes corresponding sequences are in the same
+ * order in the alignments.
+ */
+ mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
+ mappedDna, mappedProtein, false);
+ return mappingPerformed;
+ }
+
+ /**
+ * Make mappings between compatible sequences (where the cDNA translation
+ * matches the protein).
+ *
+ * @param proteinAlignment
+ * @param cdnaAlignment
+ * @param mappedDna
+ * a set of mapped DNA sequences (to add to)
+ * @param mappedProtein
+ * a set of mapped Protein sequences (to add to)
+ * @param xrefsOnly
+ * if true, only map sequences where xrefs exist
+ * @return
+ */
+ protected static boolean mapProteinToCdna(
+ final AlignmentI proteinAlignment,
+ final AlignmentI cdnaAlignment, Set<SequenceI> mappedDna,
+ Set<SequenceI> mappedProtein, boolean xrefsOnly)
+ {
+ boolean mappingPerformed = false;
List<SequenceI> thisSeqs = proteinAlignment.getSequences();
-
for (SequenceI aaSeq : thisSeqs)
{
+ boolean proteinMapped = false;
AlignedCodonFrame acf = new AlignedCodonFrame();
for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
{
/*
- * Heuristic rule: don't map more than one AA sequence to the same cDNA;
- * map progressively assuming that alignments have mappable sequences in
- * the same respective order
+ * Always try to map if sequences have xref to each other; this supports
+ * variant cDNA or alternative splicing for a protein sequence.
+ *
+ * If no xrefs, try to map progressively, assuming that alignments have
+ * mappable sequences in corresponding order. These are not
+ * many-to-many, as that would risk mixing species with similar cDNA
+ * sequences.
*/
- if (mapped.contains(cdnaSeq))
+ if (xrefsOnly && !CrossRef.haveCrossRef(aaSeq, cdnaSeq))
+ {
+ continue;
+ }
+
+ /*
+ * Don't map non-xrefd sequences more than once each. This heuristic
+ * allows us to pair up similar sequences in ordered alignments.
+ */
+ if (!xrefsOnly
+ && (mappedProtein.contains(aaSeq) || mappedDna
+ .contains(cdnaSeq)))
{
continue;
}
{
acf.addMap(cdnaSeq, aaSeq, map);
mappingPerformed = true;
- mapped.add(cdnaSeq);
-
- /*
- * Heuristic rule #2: don't map AA sequence to more than one cDNA
- */
- break;
+ proteinMapped = true;
+ mappedDna.add(cdnaSeq);
+ mappedProtein.add(aaSeq);
}
}
}
- proteinAlignment.addCodonFrame(acf);
- }
-
- /*
- * If at least one mapping was possible but none was done, then the
- * alignments are already as mapped as they can be.
- */
- if (mappingPossible && !mappingPerformed)
- {
- return MappingResult.AlreadyMapped;
- }
- else
- {
- return mappingPerformed ? MappingResult.Mapped
- : MappingResult.NotMapped;
+ if (proteinMapped)
+ {
+ proteinAlignment.addCodonFrame(acf);
+ }
}
+ return mappingPerformed;
}
/**
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
-import jalview.analysis.AlignmentUtils.MappingResult;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
+import jalview.datamodel.DBRefEntry;
import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
a.setDataset(null);
return a;
}
+
/**
- * Test mapping of protein to cDNA.
+ * Test mapping of protein to cDNA, for the case where we have no sequence
+ * cross-references, so mappings are made first-served 1-1 where sequences
+ * translate.
*
* @throws IOException
*/
@Test
- public void testMapProteinToCdna() throws IOException
+ public void testMapProteinToCdna_noXrefs() throws IOException
{
- // protein: Human + Mouse, 3 residues
- AlignmentI protein = loadAlignment(
- ">Human\nKQY\n>Mouse\nAFP\n>Worm\nRST\n",
- "FASTA");
- // cDNA: Mouse, Human, Mouse, 9 bases
- // @formatter:off
- String dnaData =
- ">Mouse\nGAAATCCAG\n" +
- ">Human\nTTCGATTAC\n" +
- ">Mouse\nGTCGTTTGC\n" +
- ">Mouse\nGTCGTTTGCgac\n" + // not mapped - wrong length
- ">Fly\nGTCGTTTGC\n"; // not mapped - no name match
- // @formatter:on
- AlignmentI cdna1 = loadAlignment(
- dnaData,
- "FASTA");
- MappingResult mapped = AlignmentUtils.mapProteinToCdna(protein, cdna1);
- assertEquals(mapped, MappingResult.Mapped);
-
- /*
- * Check two mappings (one for Mouse, one for Human)
- */
- assertEquals(2, protein.getCodonFrames().size());
- assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
- assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
-
- /*
- * Inspect mapping for Human protein
- */
- AlignedCodonFrame humanMapping = protein.getCodonFrame(
- protein.getSequenceAt(0)).get(0);
- assertEquals(1, humanMapping.getdnaSeqs().length);
- assertEquals(cdna1.getSequenceAt(1).getDatasetSequence(),
- humanMapping.getdnaSeqs()[0]);
- Mapping[] protMappings = humanMapping.getProtMappings();
- assertEquals(1, protMappings.length);
- MapList mapList = protMappings[0].getMap();
- assertEquals(3, mapList.getFromRatio());
- assertEquals(1, mapList.getToRatio());
- assertTrue(Arrays.equals(new int[]
- { 1, 9 }, mapList.getFromRanges().get(0)));
- assertEquals(1, mapList.getFromRanges().size());
- assertTrue(Arrays.equals(new int[]
- { 1, 3 }, mapList.getToRanges().get(0)));
- assertEquals(1, mapList.getToRanges().size());
+ List<SequenceI> protseqs = new ArrayList<SequenceI>();
+ protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
+ protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
+ protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
+ AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
+ protein.setDataset(null);
- /*
- * Inspect mappings for Mouse protein
- */
- AlignedCodonFrame mouseMapping1 = protein.getCodonFrame(
- protein.getSequenceAt(1)).get(0);
- assertEquals(2, mouseMapping1.getdnaSeqs().length);
- assertEquals(cdna1.getSequenceAt(0).getDatasetSequence(),
- mouseMapping1.getdnaSeqs()[0]);
- assertEquals(cdna1.getSequenceAt(2).getDatasetSequence(),
- mouseMapping1.getdnaSeqs()[1]);
- protMappings = mouseMapping1.getProtMappings();
- assertEquals(2, protMappings.length);
- for (int i = 0; i < 2; i++)
- {
- mapList = protMappings[i].getMap();
- assertEquals(3, mapList.getFromRatio());
- assertEquals(1, mapList.getToRatio());
- assertTrue(Arrays.equals(new int[]
- { 1, 9 }, mapList.getFromRanges().get(0)));
- assertEquals(1, mapList.getFromRanges().size());
- assertTrue(Arrays.equals(new int[]
- { 1, 3 }, mapList.getToRanges().get(0)));
- assertEquals(1, mapList.getToRanges().size());
- }
- }
+ List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
+ dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
+ dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAA")); // = EIQ
+ dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
+ dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
+ AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
+ cdna.setDataset(null);
- /**
- * Test mapping of protein to cDNA which may include start and/or stop codons.
- *
- * @throws IOException
- */
- @Test
- public void testMapProteinToCdna_stopStartCodons() throws IOException
- {
- // protein: Human + Mouse, 3 residues
- AlignmentI protein = loadAlignment(
- ">Human\nKQY\n>Mouse\nAFP\n>Worm\nRST\n", "FASTA");
- // @formatter:off
- String dnaData =
- ">Mouse\natgGAAATCCAG\n" + // Mouse with start codon
- ">Human\nTTCGATtactaa\n" + // Human with stop codon TAA
- ">Mouse\nGTCGTTTGctaG\n" + // Mouse with stop codon TAG
- ">Human\nGTCGTTTgctGa\n" + // Human with stop codon TGA
- ">Mouse\nATGGTCGTTTGCtag\n"; // Mouse with start and stop codons
- // @formatter:on
- AlignmentI cdna1 = loadAlignment(
- dnaData,
- "FASTA");
- MappingResult mapped = AlignmentUtils.mapProteinToCdna(protein, cdna1);
- assertEquals(mapped, MappingResult.Mapped);
+ assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
- /*
- * Check two mappings (one for Mouse, one for Human)
- */
- assertEquals(2, protein.getCodonFrames().size());
+ // 3 mappings made, each from 1 to 1 sequence
+ assertEquals(3, protein.getCodonFrames().size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
+ assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
- /*
- * Inspect mapping for Human protein - should map to 2nd and 4th cDNA seqs
- */
- AlignedCodonFrame humanMapping = protein.getCodonFrame(
+ // V12345 mapped to A22222
+ AlignedCodonFrame acf = protein.getCodonFrame(
protein.getSequenceAt(0)).get(0);
- assertEquals(2, humanMapping.getdnaSeqs().length);
- assertEquals(cdna1.getSequenceAt(1).getDatasetSequence(),
- humanMapping.getdnaSeqs()[0]);
- assertEquals(cdna1.getSequenceAt(3).getDatasetSequence(),
- humanMapping.getdnaSeqs()[1]);
- Mapping[] protMappings = humanMapping.getProtMappings();
- // two mappings, both to cDNA with stop codon
- assertEquals(2, protMappings.length);
-
+ assertEquals(1, acf.getdnaSeqs().length);
+ assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
+ acf.getdnaSeqs()[0]);
+ Mapping[] protMappings = acf.getProtMappings();
+ assertEquals(1, protMappings.length);
MapList mapList = protMappings[0].getMap();
assertEquals(3, mapList.getFromRatio());
assertEquals(1, mapList.getToRatio());
{ 1, 3 }, mapList.getToRanges().get(0)));
assertEquals(1, mapList.getToRanges().size());
- mapList = protMappings[1].getMap();
- assertEquals(3, mapList.getFromRatio());
- assertEquals(1, mapList.getToRatio());
- assertTrue(Arrays.equals(new int[]
- { 1, 9 }, mapList.getFromRanges().get(0)));
- assertEquals(1, mapList.getFromRanges().size());
- assertTrue(Arrays.equals(new int[]
- { 1, 3 }, mapList.getToRanges().get(0)));
- assertEquals(1, mapList.getToRanges().size());
+ // V12346 mapped to A33333
+ acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
+ assertEquals(1, acf.getdnaSeqs().length);
+ assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
+ acf.getdnaSeqs()[0]);
- /*
- * Inspect mapping for Mouse protein - should map to 1st/3rd/5th cDNA seqs
- */
- AlignedCodonFrame mouseMapping = protein.getCodonFrame(
- protein.getSequenceAt(1)).get(0);
- assertEquals(3, mouseMapping.getdnaSeqs().length);
- assertEquals(cdna1.getSequenceAt(0).getDatasetSequence(),
- mouseMapping.getdnaSeqs()[0]);
- assertEquals(cdna1.getSequenceAt(2).getDatasetSequence(),
- mouseMapping.getdnaSeqs()[1]);
- assertEquals(cdna1.getSequenceAt(4).getDatasetSequence(),
- mouseMapping.getdnaSeqs()[2]);
-
- // three mappings
- protMappings = mouseMapping.getProtMappings();
- assertEquals(3, protMappings.length);
-
- // first mapping to cDNA with start codon
- mapList = protMappings[0].getMap();
- assertEquals(3, mapList.getFromRatio());
- assertEquals(1, mapList.getToRatio());
- assertTrue(Arrays.equals(new int[]
- { 4, 12 }, mapList.getFromRanges().get(0)));
- assertEquals(1, mapList.getFromRanges().size());
- assertTrue(Arrays.equals(new int[]
- { 1, 3 }, mapList.getToRanges().get(0)));
- assertEquals(1, mapList.getToRanges().size());
+ // V12347 mapped to A11111
+ acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
+ assertEquals(1, acf.getdnaSeqs().length);
+ assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
+ acf.getdnaSeqs()[0]);
- // second mapping to cDNA with stop codon
- mapList = protMappings[1].getMap();
- assertEquals(3, mapList.getFromRatio());
- assertEquals(1, mapList.getToRatio());
- assertTrue(Arrays.equals(new int[]
- { 1, 9 }, mapList.getFromRanges().get(0)));
- assertEquals(1, mapList.getFromRanges().size());
- assertTrue(Arrays.equals(new int[]
- { 1, 3 }, mapList.getToRanges().get(0)));
- assertEquals(1, mapList.getToRanges().size());
-
- // third mapping to cDNA with start and stop codon
- mapList = protMappings[2].getMap();
- assertEquals(3, mapList.getFromRatio());
- assertEquals(1, mapList.getToRatio());
- assertTrue(Arrays.equals(new int[]
- { 4, 12 }, mapList.getFromRanges().get(0)));
- assertEquals(1, mapList.getFromRanges().size());
- assertTrue(Arrays.equals(new int[]
- { 1, 3 }, mapList.getToRanges().get(0)));
- assertEquals(1, mapList.getToRanges().size());
+ // no mapping involving the 'extra' A44444
+ assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
}
/**
0,
"FPMG".toCharArray()));
}
+
+ /**
+ * Test mapping of protein to cDNA, for cases where the cDNA has start and/or
+ * stop codons in addition to the protein coding sequence. No sequence
+ * cross-references are set up here. The assertions expect each protein to be
+ * paired with exactly one cDNA, and the mapped cDNA range to cover only the
+ * protein-coding bases: 1-9 where the cDNA has just a trailing stop codon,
+ * and 4-12 where it has a leading start codon.
+ *
+ * @throws IOException
+ */
+ @Test
+ public void testMapProteinToCdna_withStartAndStopCodons()
+ throws IOException
+ {
+ List<SequenceI> protseqs = new ArrayList<SequenceI>();
+ protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
+ protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
+ protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
+ AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
+ protein.setDataset(null);
+
+ List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
+ // = start + SAR
+ dnaseqs.add(new Sequence("EMBL|A11111", "ATGTCAGCACGC"));
+ // = EIQ + stop
+ dnaseqs.add(new Sequence("EMBL|A22222", "GAGATACAATAA"));
+ // = start + EIQ + stop
+ dnaseqs.add(new Sequence("EMBL|A33333", "ATGGAAATCCAGTAG"));
+ dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ (no start or stop codon)
+ AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[4]));
+ cdna.setDataset(null);
+
+ assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
+
+ // 3 codon frames made, one per protein, each holding a 1-to-1 mapping
+ assertEquals(3, protein.getCodonFrames().size());
+ assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
+ assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
+ assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
+
+ // V12345 mapped from A22222 bases 1-9 (trailing stop codon excluded)
+ AlignedCodonFrame acf = protein.getCodonFrame(
+ protein.getSequenceAt(0)).get(0);
+ assertEquals(1, acf.getdnaSeqs().length);
+ assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
+ acf.getdnaSeqs()[0]);
+ Mapping[] protMappings = acf.getProtMappings();
+ assertEquals(1, protMappings.length);
+ MapList mapList = protMappings[0].getMap();
+ assertEquals(3, mapList.getFromRatio());
+ assertEquals(1, mapList.getToRatio());
+ assertTrue(Arrays.equals(new int[]
+ { 1, 9 }, mapList.getFromRanges().get(0)));
+ assertEquals(1, mapList.getFromRanges().size());
+ assertTrue(Arrays.equals(new int[]
+ { 1, 3 }, mapList.getToRanges().get(0)));
+ assertEquals(1, mapList.getToRanges().size());
+
+ // V12346 mapped from A33333 bases 4-12 (start and stop codons excluded)
+ acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
+ assertEquals(1, acf.getdnaSeqs().length);
+ assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
+ acf.getdnaSeqs()[0]);
+ protMappings = acf.getProtMappings();
+ assertEquals(1, protMappings.length);
+ mapList = protMappings[0].getMap();
+ assertEquals(3, mapList.getFromRatio());
+ assertEquals(1, mapList.getToRatio());
+ assertTrue(Arrays.equals(new int[]
+ { 4, 12 }, mapList.getFromRanges().get(0)));
+ assertEquals(1, mapList.getFromRanges().size());
+ assertTrue(Arrays.equals(new int[]
+ { 1, 3 }, mapList.getToRanges().get(0)));
+ assertEquals(1, mapList.getToRanges().size());
+
+ // V12347 mapped from A11111 bases 4-12 (leading start codon excluded)
+ acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
+ assertEquals(1, acf.getdnaSeqs().length);
+ assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
+ acf.getdnaSeqs()[0]);
+ protMappings = acf.getProtMappings();
+ assertEquals(1, protMappings.length);
+ mapList = protMappings[0].getMap();
+ assertEquals(3, mapList.getFromRatio());
+ assertEquals(1, mapList.getToRatio());
+ assertTrue(Arrays.equals(new int[]
+ { 4, 12 }, mapList.getFromRanges().get(0)));
+ assertEquals(1, mapList.getFromRanges().size());
+ assertTrue(Arrays.equals(new int[]
+ { 1, 3 }, mapList.getToRanges().get(0)));
+ assertEquals(1, mapList.getToRanges().size());
+
+ // no mapping involving the 'extra' A44444 (all proteins already paired)
+ assertTrue(protein.getCodonFrame(cdna.getSequenceAt(3)).isEmpty());
+ }
+
+ /**
+ * Test mapping of protein to cDNA, for the case where we have some sequence
+ * cross-references. Verify that 1-to-many mappings are made where
+ * cross-references exist and sequences are mappable, that a cross-reference
+ * between sequences that do not translate to each other produces no mapping,
+ * and that the sequences left over are then paired 1-to-1.
+ *
+ * @throws IOException
+ */
+ @Test
+ public void testMapProteinToCdna_withXrefs() throws IOException
+ {
+ List<SequenceI> protseqs = new ArrayList<SequenceI>();
+ protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
+ protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
+ protseqs.add(new Sequence("UNIPROT|V12347", "SAR"));
+ AlignmentI protein = new Alignment(protseqs.toArray(new SequenceI[3]));
+ protein.setDataset(null);
+
+ List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
+ dnaseqs.add(new Sequence("EMBL|A11111", "TCAGCACGC")); // = SAR
+ dnaseqs.add(new Sequence("EMBL|A22222", "ATGGAGATACAA")); // = start + EIQ
+ dnaseqs.add(new Sequence("EMBL|A33333", "GAAATCCAG")); // = EIQ
+ dnaseqs.add(new Sequence("EMBL|A44444", "GAAATTCAG")); // = EIQ
+ dnaseqs.add(new Sequence("EMBL|A55555", "GAGATTCAG")); // = EIQ
+ AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[5]));
+ cdna.setDataset(null);
+
+ // Xref A22222 to V12345 (should get mapped)
+ dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
+ // Xref V12345 to A44444 (should get mapped)
+ protseqs.get(0).addDBRef(new DBRefEntry("EMBL", "1", "A44444"));
+ // Xref A33333 to V12347 (sequence mismatch: EIQ vs SAR - should not get mapped)
+ dnaseqs.get(2).addDBRef(new DBRefEntry("UNIPROT", "1", "V12347"));
+ // as V12345 is mapped to A22222 and A44444, this leaves V12346 unmapped.
+ // it should get paired up with the unmapped A33333
+ // A11111 should be mapped to V12347
+ // A55555 is spare and has no xref so is not mapped
+
+ assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
+
+ // 4 sequence mappings held in 3 codon frames (one frame per protein): 2 for V12345, 1 each for V12346/7
+ assertEquals(3, protein.getCodonFrames().size());
+ assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
+ assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
+ assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(2)).size());
+
+ // each of the first 4 cDNA sequences appears in exactly one codon frame
+ assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
+ assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
+ assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(2)).size());
+ assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(3)).size());
+
+ // V12345 mapped to A22222 and A44444 (both via cross-references)
+ AlignedCodonFrame acf = protein.getCodonFrame(
+ protein.getSequenceAt(0)).get(0);
+ assertEquals(2, acf.getdnaSeqs().length);
+ assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
+ acf.getdnaSeqs()[0]);
+ assertEquals(cdna.getSequenceAt(3).getDatasetSequence(),
+ acf.getdnaSeqs()[1]);
+
+ // V12346 mapped to A33333 (no xref between them; paired as left-overs)
+ acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
+ assertEquals(1, acf.getdnaSeqs().length);
+ assertEquals(cdna.getSequenceAt(2).getDatasetSequence(),
+ acf.getdnaSeqs()[0]);
+
+ // V12347 mapped to A11111 (no xref between them; paired as left-overs)
+ acf = protein.getCodonFrame(protein.getSequenceAt(2)).get(0);
+ assertEquals(1, acf.getdnaSeqs().length);
+ assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
+ acf.getdnaSeqs()[0]);
+
+ // no mapping involving the 'extra' A55555
+ assertTrue(protein.getCodonFrame(cdna.getSequenceAt(4)).isEmpty());
+ }
+
+ /**
+ * Test mapping of protein to cDNA, for the case where we have some sequence
+ * cross-references. Verify that once a protein has been mapped via an xref
+ * it is not also mapped to an un-xrefd sequence, even one that comes earlier
+ * in the cDNA alignment; the un-xrefd sequences are paired with each other.
+ *
+ * @throws IOException
+ */
+ @Test
+ public void testMapProteinToCdna_prioritiseXrefs() throws IOException
+ {
+ List<SequenceI> protseqs = new ArrayList<SequenceI>();
+ protseqs.add(new Sequence("UNIPROT|V12345", "EIQ"));
+ protseqs.add(new Sequence("UNIPROT|V12346", "EIQ"));
+ AlignmentI protein = new Alignment(
+ protseqs.toArray(new SequenceI[protseqs.size()]));
+ protein.setDataset(null);
+
+ List<SequenceI> dnaseqs = new ArrayList<SequenceI>();
+ dnaseqs.add(new Sequence("EMBL|A11111", "GAAATCCAG")); // = EIQ
+ dnaseqs.add(new Sequence("EMBL|A22222", "GAAATTCAG")); // = EIQ
+ AlignmentI cdna = new Alignment(dnaseqs.toArray(new SequenceI[dnaseqs
+ .size()]));
+ cdna.setDataset(null);
+
+ // Xref A22222 to V12345 (should get mapped, although A11111 comes first)
+ // A11111 should then be mapped to the unmapped V12346
+ dnaseqs.get(1).addDBRef(new DBRefEntry("UNIPROT", "1", "V12345"));
+
+ assertTrue(AlignmentUtils.mapProteinToCdna(protein, cdna));
+
+ // 2 codon frames made, one per protein
+ assertEquals(2, protein.getCodonFrames().size());
+ assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(0)).size());
+ assertEquals(1, protein.getCodonFrame(protein.getSequenceAt(1)).size());
+
+ // one mapping for each of the cDNA sequences
+ assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(0)).size());
+ assertEquals(1, protein.getCodonFrame(cdna.getSequenceAt(1)).size());
+
+ // V12345 mapped to A22222 only (the xref'd cDNA)
+ AlignedCodonFrame acf = protein.getCodonFrame(protein.getSequenceAt(0))
+ .get(0);
+ assertEquals(1, acf.getdnaSeqs().length);
+ assertEquals(cdna.getSequenceAt(1).getDatasetSequence(),
+ acf.getdnaSeqs()[0]);
+
+ // V12346 mapped to A11111 (the remaining un-xrefd pair)
+ acf = protein.getCodonFrame(protein.getSequenceAt(1)).get(0);
+ assertEquals(1, acf.getdnaSeqs().length);
+ assertEquals(cdna.getSequenceAt(0).getDatasetSequence(),
+ acf.getdnaSeqs()[0]);
+ }
}