*/
package jalview.analysis;
+import jalview.datamodel.AlignedCodon;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Mapping;
+import jalview.datamodel.SearchResults;
+import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
import jalview.schemes.ResidueProperties;
import jalview.util.MapList;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.TreeMap;
/**
* grab bag of useful alignment manipulation operations Expect these to be
/**
* Build mapping of protein to cDNA alignment. Mappings are made between
- * sequences which have the same name and compatible lengths. Has a 3-valued
- * result: either Mapped (at least one sequence mapping was created),
- * AlreadyMapped (all possible sequence mappings already exist), or NotMapped
- * (no possible sequence mappings exist).
+ * sequences where the cDNA translates to the protein sequence. Any new
+ * mappings are added to the protein alignment. Has a 3-valued result: either
+ * Mapped (at least one sequence mapping was created), AlreadyMapped (all
+ * possible sequence mappings already exist), or NotMapped (no possible
+ * sequence mappings exist).
*
* @param proteinAlignment
* @param cdnaAlignment
final AlignmentI proteinAlignment,
final AlignmentI cdnaAlignment)
{
+ if (proteinAlignment == null || cdnaAlignment == null)
+ {
+ return MappingResult.NotMapped;
+ }
+
boolean mappingPossible = false;
boolean mappingPerformed = false;
+ List<SequenceI> mapped = new ArrayList<SequenceI>();
+
List<SequenceI> thisSeqs = proteinAlignment.getSequences();
- /*
- * Build a look-up of cDNA sequences by name, for matching purposes.
- */
- Map<String, List<SequenceI>> cdnaSeqs = cdnaAlignment
- .getSequencesByName();
-
for (SequenceI aaSeq : thisSeqs)
{
AlignedCodonFrame acf = new AlignedCodonFrame();
- List<SequenceI> candidates = cdnaSeqs.get(aaSeq.getName());
- if (candidates == null)
+
+ for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
{
/*
- * No cDNA sequence with matching name, so no mapping possible for this
- * protein sequence
+ * Heuristic rule: don't map more than one AA sequence to the same cDNA;
+ * map progressively assuming that alignments have mappable sequences in
+ * the same respective order
*/
- continue;
- }
- mappingPossible = true;
- for (SequenceI cdnaSeq : candidates)
- {
+ if (mapped.contains(cdnaSeq))
+ {
+ continue;
+ }
if (!mappingExists(proteinAlignment.getCodonFrames(),
aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
{
{
acf.addMap(cdnaSeq, aaSeq, map);
mappingPerformed = true;
+ mapped.add(cdnaSeq);
+
+ /*
+ * Heuristic rule #2: don't map AA sequence to more than one cDNA
+ */
+ break;
}
}
}
* Answers true if the mappings include one between the given (dataset)
* sequences.
*/
- public static boolean mappingExists(AlignedCodonFrame[] codonFrames,
+ public static boolean mappingExists(Set<AlignedCodonFrame> set,
SequenceI aaSeq, SequenceI cdnaSeq)
{
- if (codonFrames != null)
+ if (set != null)
{
- for (AlignedCodonFrame acf : codonFrames)
+ for (AlignedCodonFrame acf : set)
{
if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
{
/**
* Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA
* must be three times the length of the protein, possibly after ignoring
- * start and/or stop codons. Returns null if no mapping is determined.
+ * start and/or stop codons, and must translate to the protein. Returns null
+ * if no mapping is determined.
*
* @param proteinSeqs
* @param cdnaSeq
public static MapList mapProteinToCdna(SequenceI proteinSeq,
SequenceI cdnaSeq)
{
- String aaSeqString = proteinSeq.getDatasetSequence()
- .getSequenceAsString();
- String cdnaSeqString = cdnaSeq.getDatasetSequence()
- .getSequenceAsString();
- if (aaSeqString == null || cdnaSeqString == null)
+ /*
+ * Here we handle either dataset sequence set (desktop) or absent (applet).
+ * Use only the char[] form of the sequence to avoid creating possibly large
+ * String objects.
+ */
+ final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
+ char[] aaSeqChars = proteinDataset != null ? proteinDataset
+ .getSequence() : proteinSeq.getSequence();
+ final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
+ char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
+ : cdnaSeq.getSequence();
+ if (aaSeqChars == null || cdnaSeqChars == null)
{
return null;
}
- final int mappedLength = 3 * aaSeqString.length();
- int cdnaLength = cdnaSeqString.length();
+ /*
+ * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
+ */
+ final int mappedLength = 3 * aaSeqChars.length;
+ int cdnaLength = cdnaSeqChars.length;
int cdnaStart = 1;
int cdnaEnd = cdnaLength;
final int proteinStart = 1;
- final int proteinEnd = aaSeqString.length();
+ final int proteinEnd = aaSeqChars.length;
/*
* If lengths don't match, try ignoring stop codon.
*/
- if (cdnaLength != mappedLength)
+ if (cdnaLength != mappedLength && cdnaLength > 2)
{
- for (Object stop : ResidueProperties.STOP)
+ String lastCodon = String.valueOf(cdnaSeqChars, cdnaLength - 3, 3)
+ .toUpperCase();
+ for (String stop : ResidueProperties.STOP)
{
- if (cdnaSeqString.toUpperCase().endsWith((String) stop))
+ if (lastCodon.equals(stop))
{
cdnaEnd -= 3;
cdnaLength -= 3;
* If lengths still don't match, try ignoring start codon.
*/
if (cdnaLength != mappedLength
- && cdnaSeqString.toUpperCase().startsWith(
+ && cdnaLength > 2
+ && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase()
+ .equals(
ResidueProperties.START))
{
cdnaStart += 3;
cdnaLength -= 3;
}
- if (cdnaLength == mappedLength)
+ if (cdnaLength != mappedLength)
{
- MapList map = new MapList(new int[]
- { cdnaStart, cdnaEnd }, new int[]
- { proteinStart, proteinEnd }, 3, 1);
- return map;
+ return null;
}
- else
+ if (!translatesAs(cdnaSeqChars, cdnaStart - 1, aaSeqChars))
{
return null;
}
+ MapList map = new MapList(new int[]
+ { cdnaStart, cdnaEnd }, new int[]
+ { proteinStart, proteinEnd }, 3, 1);
+ return map;
+ }
+
+ /**
+ * Test whether the given cdna sequence, starting at the given offset,
+ * translates to the given amino acid sequence, using the standard translation
+ * table. Designed to fail fast i.e. as soon as a mismatch position is found.
+ *
+ * @param cdnaSeqChars
+ * @param cdnaStart
+ * @param aaSeqChars
+ * @return
+ */
+ protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
+ char[] aaSeqChars)
+ {
+ int aaResidue = 0;
+ for (int i = cdnaStart; i < cdnaSeqChars.length - 2
+ && aaResidue < aaSeqChars.length; i += 3)
+ {
+ String codon = String.valueOf(cdnaSeqChars, i, 3);
+ final String translated = ResidueProperties.codonTranslate(
+ codon);
+ if (translated == null
+ || !(aaSeqChars[aaResidue] == translated.charAt(0)))
+ {
+ return false;
+ }
+ aaResidue++;
+ }
+ // fail if we didn't match all of the aa sequence
+ return (aaResidue == aaSeqChars.length);
}
/**
*/
// TODO there may be one AlignedCodonFrame per dataset sequence, or one with
// all mappings. Would it help to constrain this?
- AlignedCodonFrame[] mappings = al.getCodonFrame(seq);
- if (mappings == null)
+ List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
+ if (mappings == null || mappings.isEmpty())
{
return false;
}
}
return gapsToAdd;
}
+
+ /**
+ * Returns a list of sequences mapped from the given sequences and aligned
+ * (gapped) in the same way. For example, the cDNA for aligned protein, where
+ * a single gap in protein generates three gaps in cDNA.
+ *
+ * @param sequences
+ * @param gapCharacter
+ * @param mappings
+ * @return
+ */
+ public static List<SequenceI> getAlignedTranslation(
+ List<SequenceI> sequences, char gapCharacter,
+ Set<AlignedCodonFrame> mappings)
+ {
+ List<SequenceI> alignedSeqs = new ArrayList<SequenceI>();
+
+ for (SequenceI seq : sequences)
+ {
+ List<SequenceI> mapped = getAlignedTranslation(seq, gapCharacter,
+ mappings);
+ alignedSeqs.addAll(mapped);
+ }
+ return alignedSeqs;
+ }
+
+ /**
+ * Returns sequences aligned 'like' the source sequence, as mapped by the
+ * given mappings. Normally we expect zero or one 'mapped' sequences, but this
+ * will support 1-to-many as well.
+ *
+ * @param seq
+ * @param gapCharacter
+ * @param mappings
+ * @return
+ */
+ protected static List<SequenceI> getAlignedTranslation(SequenceI seq,
+ char gapCharacter, Set<AlignedCodonFrame> mappings)
+ {
+ List<SequenceI> result = new ArrayList<SequenceI>();
+ for (AlignedCodonFrame mapping : mappings)
+ {
+ if (mapping.involvesSequence(seq))
+ {
+ SequenceI mapped = getAlignedTranslation(seq, gapCharacter, mapping);
+ if (mapped != null)
+ {
+ result.add(mapped);
+ }
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Returns the translation of 'seq' (as held in the mapping) with
+ * corresponding alignment (gaps).
+ *
+ * @param seq
+ * @param gapCharacter
+ * @param mapping
+ * @return
+ */
+ protected static SequenceI getAlignedTranslation(SequenceI seq,
+ char gapCharacter, AlignedCodonFrame mapping)
+ {
+ String gap = String.valueOf(gapCharacter);
+ boolean toDna = false;
+ int fromRatio = 1;
+ SequenceI mapTo = mapping.getDnaForAaSeq(seq);
+ if (mapTo != null)
+ {
+ // mapping is from protein to nucleotide
+ toDna = true;
+ // should ideally get gap count ratio from mapping
+ gap = String.valueOf(new char[]
+ { gapCharacter, gapCharacter, gapCharacter });
+ }
+ else
+ {
+ // mapping is from nucleotide to protein
+ mapTo = mapping.getAaForDnaSeq(seq);
+ fromRatio = 3;
+ }
+ StringBuilder newseq = new StringBuilder(seq.getLength()
+ * (toDna ? 3 : 1));
+
+ int residueNo = 0; // in seq, base 1
+ int[] phrase = new int[fromRatio];
+ int phraseOffset = 0;
+ int gapWidth = 0;
+ boolean first = true;
+ final Sequence alignedSeq = new Sequence("", "");
+
+ for (char c : seq.getSequence())
+ {
+ if (c == gapCharacter)
+ {
+ gapWidth++;
+ if (gapWidth >= fromRatio)
+ {
+ newseq.append(gap);
+ gapWidth = 0;
+ }
+ }
+ else
+ {
+ phrase[phraseOffset++] = residueNo + 1;
+ if (phraseOffset == fromRatio)
+ {
+ /*
+ * Have read a whole codon (or protein residue), now translate: map
+ * source phrase to positions in target sequence add characters at
+ * these positions to newseq Note mapping positions are base 1, our
+ * sequence positions base 0.
+ */
+ SearchResults sr = new SearchResults();
+ for (int pos : phrase)
+ {
+ mapping.markMappedRegion(seq, pos, sr);
+ }
+ newseq.append(sr.toString());
+ if (first)
+ {
+ first = false;
+ // Hack: Copy sequence dataset, name and description from
+ // SearchResults.match[0].sequence
+ // TODO? carry over sequence names from original 'complement'
+ // alignment
+ SequenceI mappedTo = sr.getResultSequence(0);
+ alignedSeq.setName(mappedTo.getName());
+ alignedSeq.setDescription(mappedTo.getDescription());
+ alignedSeq.setDatasetSequence(mappedTo);
+ }
+ phraseOffset = 0;
+ }
+ residueNo++;
+ }
+ }
+ alignedSeq.setSequence(newseq.toString());
+ return alignedSeq;
+ }
+
+ /**
+ * Realigns the given protein to match the alignment of the dna, using codon
+ * mappings to translate aligned codon positions to protein residues.
+ *
+ * @param protein
+ * the alignment whose sequences are realigned by this method
+ * @param dna
+ * the dna alignment whose alignment we are 'copying'
+ * @return the number of sequences that were realigned
+ */
+ public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
+ {
+ Set<AlignedCodonFrame> mappings = protein.getCodonFrames();
+
+ /*
+ * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
+ * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
+ * comparator keeps the codon positions ordered.
+ */
+ Map<AlignedCodon, Map<SequenceI, String>> alignedCodons = new TreeMap<AlignedCodon, Map<SequenceI, String>>(
+ new CodonComparator());
+ for (SequenceI dnaSeq : dna.getSequences())
+ {
+ for (AlignedCodonFrame mapping : mappings)
+ {
+ Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
+ SequenceI prot = mapping.findAlignedSequence(
+ dnaSeq.getDatasetSequence(), protein);
+ if (prot != null)
+ {
+ addCodonPositions(dnaSeq, prot, protein.getGapCharacter(),
+ seqMap, alignedCodons);
+ }
+ }
+ }
+ return alignProteinAs(protein, alignedCodons);
+ }
+
+ /**
+ * Update the aligned protein sequences to match the codon alignments given in
+ * the map.
+ *
+ * @param protein
+ * @param alignedCodons
+ * an ordered map of codon positions (columns), with sequence/peptide
+ * values present in each column
+ * @return
+ */
+ protected static int alignProteinAs(AlignmentI protein,
+ Map<AlignedCodon, Map<SequenceI, String>> alignedCodons)
+ {
+ /*
+ * Prefill aligned sequences with gaps before inserting aligned protein
+ * residues.
+ */
+ int alignedWidth = alignedCodons.size();
+ char[] gaps = new char[alignedWidth];
+ Arrays.fill(gaps, protein.getGapCharacter());
+ String allGaps = String.valueOf(gaps);
+ for (SequenceI seq : protein.getSequences())
+ {
+ seq.setSequence(allGaps);
+ }
+
+ int column = 0;
+ for (AlignedCodon codon : alignedCodons.keySet())
+ {
+ final Map<SequenceI, String> columnResidues = alignedCodons.get(codon);
+ for (Entry<SequenceI, String> entry : columnResidues
+ .entrySet())
+ {
+ // place translated codon at its column position in sequence
+ entry.getKey().getSequence()[column] = entry.getValue().charAt(0);
+ }
+ column++;
+ }
+ return 0;
+ }
+
+ /**
+ * Populate the map of aligned codons by traversing the given sequence
+ * mapping, locating the aligned positions of mapped codons, and adding those
+ * positions and their translation products to the map.
+ *
+ * @param dna
+ * the aligned sequence we are mapping from
+ * @param protein
+ * the sequence to be aligned to the codons
+ * @param gapChar
+ * the gap character in the dna sequence
+ * @param seqMap
+ * a mapping to a sequence translation
+ * @param alignedCodons
+ * the map we are building up
+ */
+ static void addCodonPositions(SequenceI dna, SequenceI protein,
+ char gapChar,
+ Mapping seqMap,
+ Map<AlignedCodon, Map<SequenceI, String>> alignedCodons)
+ {
+ Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
+ while (codons.hasNext())
+ {
+ AlignedCodon codon = codons.next();
+ Map<SequenceI, String> seqProduct = alignedCodons.get(codon);
+ if (seqProduct == null)
+ {
+ seqProduct = new HashMap<SequenceI, String>();
+ alignedCodons.put(codon, seqProduct);
+ }
+ seqProduct.put(protein, codon.product);
+ }
+ }
}