/*
- * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
- * Copyright (C) 2014 The Jalview Authors
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
*/
package jalview.analysis;
+import jalview.datamodel.AlignedCodon;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Mapping;
+import jalview.datamodel.SearchResults;
+import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
import jalview.schemes.ResidueProperties;
import jalview.util.MapList;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Set;
+import java.util.TreeMap;
/**
* grab bag of useful alignment manipulation operations Expect these to be
/**
* Build mapping of protein to cDNA alignment. Mappings are made between
- * sequences which have the same name and compatible lengths. Has a 3-valued
- * result: either Mapped (at least one sequence mapping was created),
- * AlreadyMapped (all possible sequence mappings already exist), or NotMapped
- * (no possible sequence mappings exist).
+ * sequences which have the same name and compatible lengths. Any new mappings
+ * are added to the protein alignment. Has a 3-valued result: either Mapped
+ * (at least one sequence mapping was created), AlreadyMapped (all possible
+ * sequence mappings already exist), or NotMapped (no possible sequence
+ * mappings exist).
*
* @param proteinAlignment
* @param cdnaAlignment
final AlignmentI proteinAlignment,
final AlignmentI cdnaAlignment)
{
+ if (proteinAlignment == null || cdnaAlignment == null)
+ {
+ return MappingResult.NotMapped;
+ }
+
boolean mappingPossible = false;
boolean mappingPerformed = false;
public static MapList mapProteinToCdna(SequenceI proteinSeq,
SequenceI cdnaSeq)
{
- String aaSeqString = proteinSeq.getDatasetSequence()
- .getSequenceAsString();
- String cdnaSeqString = cdnaSeq.getDatasetSequence()
- .getSequenceAsString();
+ /*
+ * Here we handle either dataset sequence set (desktop) or absent (applet)
+ */
+ final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
+ String aaSeqString = proteinDataset != null ? proteinDataset
+ .getSequenceAsString() : proteinSeq.getSequenceAsString();
+ final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
+ String cdnaSeqString = cdnaDataset != null ? cdnaDataset
+ .getSequenceAsString() : cdnaSeq.getSequenceAsString();
if (aaSeqString == null || cdnaSeqString == null)
{
return null;
// TODO there may be one AlignedCodonFrame per dataset sequence, or one with
// all mappings. Would it help to constrain this?
List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
- if (mappings == null)
+ if (mappings == null || mappings.isEmpty())
{
return false;
}
}
return gapsToAdd;
}
+
+ /**
+ * Returns a list of sequences mapped from the given sequences and aligned
+ * (gapped) in the same way. For example, the cDNA for aligned protein, where
+ * a single gap in protein generates three gaps in cDNA.
+ *
+ * @param sequences
+ * @param gapCharacter
+ * @param mappings
+ * @return
+ */
+ public static List<SequenceI> getAlignedTranslation(
+ List<SequenceI> sequences, char gapCharacter,
+ Set<AlignedCodonFrame> mappings)
+ {
+ List<SequenceI> alignedSeqs = new ArrayList<SequenceI>();
+
+ for (SequenceI seq : sequences)
+ {
+ List<SequenceI> mapped = getAlignedTranslation(seq, gapCharacter,
+ mappings);
+ alignedSeqs.addAll(mapped);
+ }
+ return alignedSeqs;
+ }
+
+ /**
+ * Returns sequences aligned 'like' the source sequence, as mapped by the
+ * given mappings. Normally we expect zero or one 'mapped' sequences, but this
+ * will support 1-to-many as well.
+ *
+ * @param seq
+ * @param gapCharacter
+ * @param mappings
+ * @return
+ */
+ protected static List<SequenceI> getAlignedTranslation(SequenceI seq,
+ char gapCharacter, Set<AlignedCodonFrame> mappings)
+ {
+ List<SequenceI> result = new ArrayList<SequenceI>();
+ for (AlignedCodonFrame mapping : mappings)
+ {
+ if (mapping.involvesSequence(seq))
+ {
+ SequenceI mapped = getAlignedTranslation(seq, gapCharacter, mapping);
+ if (mapped != null)
+ {
+ result.add(mapped);
+ }
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Returns the translation of 'seq' (as held in the mapping) with
+ * corresponding alignment (gaps).
+ *
+ * @param seq
+ * @param gapCharacter
+ * @param mapping
+ * @return
+ */
+ protected static SequenceI getAlignedTranslation(SequenceI seq,
+ char gapCharacter, AlignedCodonFrame mapping)
+ {
+ String gap = String.valueOf(gapCharacter);
+ boolean toDna = false;
+ int fromRatio = 1;
+ SequenceI mapTo = mapping.getDnaForAaSeq(seq);
+ if (mapTo != null)
+ {
+ // mapping is from protein to nucleotide
+ toDna = true;
+ // should ideally get gap count ratio from mapping
+ gap = String.valueOf(new char[]
+ { gapCharacter, gapCharacter, gapCharacter });
+ }
+ else
+ {
+ // mapping is from nucleotide to protein
+ mapTo = mapping.getAaForDnaSeq(seq);
+ fromRatio = 3;
+ }
+ StringBuilder newseq = new StringBuilder(seq.getLength()
+ * (toDna ? 3 : 1));
+
+ int residueNo = 0; // in seq, base 1
+ int[] phrase = new int[fromRatio];
+ int phraseOffset = 0;
+ int gapWidth = 0;
+ boolean first = true;
+ final Sequence alignedSeq = new Sequence("", "");
+
+ for (char c : seq.getSequence())
+ {
+ if (c == gapCharacter)
+ {
+ gapWidth++;
+ if (gapWidth >= fromRatio)
+ {
+ newseq.append(gap);
+ gapWidth = 0;
+ }
+ }
+ else
+ {
+ phrase[phraseOffset++] = residueNo + 1;
+ if (phraseOffset == fromRatio)
+ {
+ /*
+ * Have read a whole codon (or protein residue), now translate: map
+ * source phrase to positions in target sequence add characters at
+ * these positions to newseq Note mapping positions are base 1, our
+ * sequence positions base 0.
+ */
+ SearchResults sr = new SearchResults();
+ for (int pos : phrase)
+ {
+ mapping.markMappedRegion(seq, pos, sr);
+ }
+ newseq.append(sr.toString());
+ if (first)
+ {
+ first = false;
+ // Hack: Copy sequence dataset, name and description from
+ // SearchResults.match[0].sequence
+ // TODO? carry over sequence names from original 'complement'
+ // alignment
+ SequenceI mappedTo = sr.getResultSequence(0);
+ alignedSeq.setName(mappedTo.getName());
+ alignedSeq.setDescription(mappedTo.getDescription());
+ alignedSeq.setDatasetSequence(mappedTo);
+ }
+ phraseOffset = 0;
+ }
+ residueNo++;
+ }
+ }
+ alignedSeq.setSequence(newseq.toString());
+ return alignedSeq;
+ }
+
+ /**
+ * Realigns the given protein to match the alignment of the dna, using codon
+ * mappings to translate aligned codon positions to protein residues.
+ *
+ * @param protein
+ * the alignment whose sequences are realigned by this method
+ * @param dna
+ * the dna alignment whose alignment we are 'copying'
+ * @return the number of sequences that were realigned
+ */
+ public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
+ {
+ Set<AlignedCodonFrame> mappings = protein.getCodonFrames();
+
+ /*
+ * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
+ * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
+ * comparator keeps the codon positions ordered.
+ */
+ Map<AlignedCodon, Map<SequenceI, String>> alignedCodons = new TreeMap<AlignedCodon, Map<SequenceI, String>>(
+ new CodonComparator());
+ for (SequenceI dnaSeq : dna.getSequences())
+ {
+ for (AlignedCodonFrame mapping : mappings)
+ {
+ Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
+ SequenceI prot = mapping.findAlignedSequence(
+ dnaSeq.getDatasetSequence(), protein);
+ if (prot != null)
+ {
+ addCodonPositions(dnaSeq, prot, protein.getGapCharacter(),
+ seqMap, alignedCodons);
+ }
+ }
+ }
+ return alignProteinAs(protein, alignedCodons);
+ }
+
+ /**
+ * Update the aligned protein sequences to match the codon alignments given in
+ * the map.
+ *
+ * @param protein
+ * @param alignedCodons
+ * an ordered map of codon positions (columns), with sequence/peptide
+ * values present in each column
+ * @return
+ */
+ protected static int alignProteinAs(AlignmentI protein,
+ Map<AlignedCodon, Map<SequenceI, String>> alignedCodons)
+ {
+ /*
+ * Prefill aligned sequences with gaps before inserting aligned protein
+ * residues.
+ */
+ int alignedWidth = alignedCodons.size();
+ char[] gaps = new char[alignedWidth];
+ Arrays.fill(gaps, protein.getGapCharacter());
+ String allGaps = String.valueOf(gaps);
+ for (SequenceI seq : protein.getSequences())
+ {
+ seq.setSequence(allGaps);
+ }
+
+ int column = 0;
+ for (AlignedCodon codon : alignedCodons.keySet())
+ {
+ final Map<SequenceI, String> columnResidues = alignedCodons.get(codon);
+ for (Entry<SequenceI, String> entry : columnResidues
+ .entrySet())
+ {
+ // place translated codon at its column position in sequence
+ entry.getKey().getSequence()[column] = entry.getValue().charAt(0);
+ }
+ column++;
+ }
+ return 0;
+ }
+
+ /**
+ * Populate the map of aligned codons by traversing the given sequence
+ * mapping, locating the aligned positions of mapped codons, and adding those
+ * positions and their translation products to the map.
+ *
+ * @param dna
+ * the aligned sequence we are mapping from
+ * @param protein
+ * the sequence to be aligned to the codons
+ * @param gapChar
+ * the gap character in the dna sequence
+ * @param seqMap
+ * a mapping to a sequence translation
+ * @param alignedCodons
+ * the map we are building up
+ */
+ static void addCodonPositions(SequenceI dna, SequenceI protein,
+ char gapChar,
+ Mapping seqMap,
+ Map<AlignedCodon, Map<SequenceI, String>> alignedCodons)
+ {
+ Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
+ while (codons.hasNext())
+ {
+ AlignedCodon codon = codons.next();
+ Map<SequenceI, String> seqProduct = alignedCodons.get(codon);
+ if (seqProduct == null)
+ {
+ seqProduct = new HashMap<SequenceI, String>();
+ alignedCodons.put(codon, seqProduct);
+ }
+ seqProduct.put(protein, codon.product);
+ }
+ }
}