/**
* Build mapping of protein to cDNA alignment. Mappings are made between
- * sequences which have the same name and compatible lengths. Any new mappings
- * are added to the protein alignment. Has a 3-valued result: either Mapped
- * (at least one sequence mapping was created), AlreadyMapped (all possible
- * sequence mappings already exist), or NotMapped (no possible sequence
- * mappings exist).
+ * sequences where the cDNA translates to the protein sequence. Any new
+ * mappings are added to the protein alignment. Has a 3-valued result: either
+ * Mapped (at least one sequence mapping was created), AlreadyMapped (all
+ * possible sequence mappings already exist), or NotMapped (no possible
+ * sequence mappings exist).
*
* @param proteinAlignment
* @param cdnaAlignment
boolean mappingPossible = false;
boolean mappingPerformed = false;
+ List<SequenceI> mapped = new ArrayList<SequenceI>();
+
List<SequenceI> thisSeqs = proteinAlignment.getSequences();
- /*
- * Build a look-up of cDNA sequences by name, for matching purposes.
- */
- Map<String, List<SequenceI>> cdnaSeqs = cdnaAlignment
- .getSequencesByName();
-
for (SequenceI aaSeq : thisSeqs)
{
AlignedCodonFrame acf = new AlignedCodonFrame();
- List<SequenceI> candidates = cdnaSeqs.get(aaSeq.getName());
- if (candidates == null)
+
+ for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
{
/*
- * No cDNA sequence with matching name, so no mapping possible for this
- * protein sequence
+ * Heuristic rule: don't map more than one AA sequence to the same cDNA;
+ * map progressively assuming that alignments have mappable sequences in
+ * the same respective order
*/
- continue;
- }
- mappingPossible = true;
- for (SequenceI cdnaSeq : candidates)
- {
+ if (mapped.contains(cdnaSeq))
+ {
+ continue;
+ }
if (!mappingExists(proteinAlignment.getCodonFrames(),
aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
{
{
acf.addMap(cdnaSeq, aaSeq, map);
mappingPerformed = true;
+ mapped.add(cdnaSeq);
+
+ /*
+ * Heuristic rule #2: don't map AA sequence to more than one cDNA
+ */
+ break;
}
}
}
/**
* Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA
* must be three times the length of the protein, possibly after ignoring
- * start and/or stop codons. Returns null if no mapping is determined.
+ * start and/or stop codons, and must translate to the protein. Returns null
+ * if no mapping is determined.
*
* @param proteinSeqs
* @param cdnaSeq
SequenceI cdnaSeq)
{
/*
- * Here we handle either dataset sequence set (desktop) or absent (applet)
+ * Here we handle either dataset sequence set (desktop) or absent (applet).
+ * Use only the char[] form of the sequence to avoid creating possibly large
+ * String objects.
*/
final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
- String aaSeqString = proteinDataset != null ? proteinDataset
- .getSequenceAsString() : proteinSeq.getSequenceAsString();
+ char[] aaSeqChars = proteinDataset != null ? proteinDataset
+ .getSequence() : proteinSeq.getSequence();
final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
- String cdnaSeqString = cdnaDataset != null ? cdnaDataset
- .getSequenceAsString() : cdnaSeq.getSequenceAsString();
- if (aaSeqString == null || cdnaSeqString == null)
+ char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
+ : cdnaSeq.getSequence();
+ if (aaSeqChars == null || cdnaSeqChars == null)
{
return null;
}
- final int mappedLength = 3 * aaSeqString.length();
- int cdnaLength = cdnaSeqString.length();
+ /*
+ * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
+ */
+ final int mappedLength = 3 * aaSeqChars.length;
+ int cdnaLength = cdnaSeqChars.length;
int cdnaStart = 1;
int cdnaEnd = cdnaLength;
final int proteinStart = 1;
- final int proteinEnd = aaSeqString.length();
+ final int proteinEnd = aaSeqChars.length;
/*
* If lengths don't match, try ignoring stop codon.
*/
- if (cdnaLength != mappedLength)
+ if (cdnaLength != mappedLength && cdnaLength > 2)
{
- for (Object stop : ResidueProperties.STOP)
+ String lastCodon = String.valueOf(cdnaSeqChars, cdnaLength - 3, 3)
+ .toUpperCase();
+ for (String stop : ResidueProperties.STOP)
{
- if (cdnaSeqString.toUpperCase().endsWith((String) stop))
+ if (lastCodon.equals(stop))
{
cdnaEnd -= 3;
cdnaLength -= 3;
* If lengths still don't match, try ignoring start codon.
*/
if (cdnaLength != mappedLength
- && cdnaSeqString.toUpperCase().startsWith(
+ && cdnaLength > 2
+ && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase()
+ .equals(
ResidueProperties.START))
{
cdnaStart += 3;
cdnaLength -= 3;
}
- if (cdnaLength == mappedLength)
+ if (cdnaLength != mappedLength)
{
- MapList map = new MapList(new int[]
- { cdnaStart, cdnaEnd }, new int[]
- { proteinStart, proteinEnd }, 3, 1);
- return map;
+ return null;
}
- else
+ if (!translatesAs(cdnaSeqChars, cdnaStart - 1, aaSeqChars))
{
return null;
}
+ MapList map = new MapList(new int[]
+ { cdnaStart, cdnaEnd }, new int[]
+ { proteinStart, proteinEnd }, 3, 1);
+ return map;
+ }
+
+ /**
+ * Test whether the given cdna sequence, starting at the given offset,
+ * translates to the given amino acid sequence, using the standard translation
+ * table. Designed to fail fast i.e. as soon as a mismatch position is found.
+ *
+ * @param cdnaSeqChars
+ * @param cdnaStart
+ * @param aaSeqChars
+ * @return
+ */
+ protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
+ char[] aaSeqChars)
+ {
+ int aaResidue = 0;
+ for (int i = cdnaStart; i < cdnaSeqChars.length - 2
+ && aaResidue < aaSeqChars.length; i += 3)
+ {
+ String codon = String.valueOf(cdnaSeqChars, i, 3);
+ final String translated = ResidueProperties.codonTranslate(
+ codon);
+ if (translated == null
+ || !(aaSeqChars[aaResidue] == translated.charAt(0)))
+ {
+ return false;
+ }
+ aaResidue++;
+ }
+ // fail if we didn't match all of the aa sequence
+ return (aaResidue == aaSeqChars.length);
}
/**
package jalview.analysis;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
import jalview.analysis.AlignmentUtils.MappingResult;
assertEquals("C--H--Y-Q", prot3.getSequenceAsString());
}
+ /**
+ * Test the method that tests whether a CDNA sequence translates to a protein
+ * sequence
+ */
+ @Test
+ public void testTranslatesAs()
+ {
+ assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
+ "FPKG".toCharArray()));
+ // with start codon
+ assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
+ 3, "FPKG".toCharArray()));
+ // with stop codon1
+ assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtaa".toCharArray(),
+ 0, "FPKG".toCharArray()));
+ // with stop codon2
+ assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtag".toCharArray(),
+ 0, "FPKG".toCharArray()));
+ // with stop codon3
+ assertTrue(AlignmentUtils.translatesAs("tttcccaaagggtga".toCharArray(),
+ 0, "FPKG".toCharArray()));
+ // with start and stop codon1
+ assertTrue(AlignmentUtils.translatesAs(
+ "atgtttcccaaaggtaa".toCharArray(), 3, "FPKG".toCharArray()));
+ // with start and stop codon2
+ assertTrue(AlignmentUtils.translatesAs(
+ "atgtttcccaaaggtag".toCharArray(), 3, "FPKG".toCharArray()));
+ // with start and stop codon3
+ assertTrue(AlignmentUtils.translatesAs(
+ "atgtttcccaaaggtga".toCharArray(), 3, "FPKG".toCharArray()));
+
+ // wrong protein
+ assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
+ 0,
+ "FPMG".toCharArray()));
+ }
+
+ @Test
+ public void testTranslatesAs_withAmbiguityCodes()
+ {
+ // R means A or G
+ assertTrue(AlignmentUtils.translatesAs("car".toCharArray(), 0,
+ "Q".toCharArray()));
+ }
}