From 3aa60eb1704441d7db522c13d6b45ab05cb43e2b Mon Sep 17 00:00:00 2001 From: gmungoc Date: Wed, 2 Mar 2016 16:35:00 +0000 Subject: [PATCH] JAL-1705 include any unmapped protein start 'X' when aligning to dna --- src/jalview/analysis/AlignmentUtils.java | 182 ++++++++++++++++++++++++++---- src/jalview/datamodel/AlignedCodon.java | 14 ++- src/jalview/datamodel/Mapping.java | 14 ++- 3 files changed, 182 insertions(+), 28 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index bdcfc2a..e624ce7 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -930,6 +930,34 @@ public class AlignmentUtils public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna) { List unmappedProtein = new ArrayList(); + Map> alignedCodons = buildCodonColumnsMap( + protein, dna, unmappedProtein); + return alignProteinAs(protein, alignedCodons, unmappedProtein); + } + + /** + * Builds a map whose key is an aligned codon position (3 alignment column + * numbers base 0), and whose value is a map from protein sequence to each + * protein's peptide residue for that codon. The map generates an ordering of + * the codons, and allows us to read off the peptides at each position in + * order to assemble 'aligned' protein sequences. + * + * @param protein + * the protein alignment + * @param dna + * the coding dna alignment + * @param unmappedProtein + * any unmapped proteins are added to this list + * @return + */ + protected static Map> buildCodonColumnsMap( + AlignmentI protein, AlignmentI dna, + List unmappedProtein) + { + /* + * maintain a list of any proteins with no mappings - these will be + * rendered 'as is' in the protein alignment as we can't align them + */ unmappedProtein.addAll(protein.getSequences()); List mappings = protein.getCodonFrames(); @@ -939,24 +967,114 @@ public class AlignmentUtils * {dnaSequence, {proteinSequence, codonProduct}} at that position. The * comparator keeps the codon positions ordered. */ - Map> alignedCodons = new TreeMap>( + Map> alignedCodons = new TreeMap>( new CodonComparator()); + for (SequenceI dnaSeq : dna.getSequences()) { for (AlignedCodonFrame mapping : mappings) { - Mapping seqMap = mapping.getMappingForSequence(dnaSeq); SequenceI prot = mapping.findAlignedSequence( dnaSeq.getDatasetSequence(), protein); if (prot != null) { + Mapping seqMap = mapping.getMappingForSequence(dnaSeq); addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap, alignedCodons); unmappedProtein.remove(prot); } } } - return alignProteinAs(protein, alignedCodons, unmappedProtein); + + /* + * Finally add any unmapped peptide start residues (e.g. for incomplete + * codons) as if at the codon position before the second residue + */ + int mappedSequenceCount = protein.getHeight() - unmappedProtein.size(); + addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount); + + return alignedCodons; + } + + /** + * Scans for any protein mapped from position 2 (meaning unmapped start + * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the + * preceding position in the alignment + * + * @param alignedCodons + * the codon-to-peptide map + * @param mappedSequenceCount + * the number of distinct sequences in the map + */ + protected static void addUnmappedPeptideStarts( + Map> alignedCodons, + int mappedSequenceCount) + { + // TODO there must be an easier way! root problem is that our mapping data + // model does not include phase so can't map part of a codon to a peptide + List sequencesChecked = new ArrayList(); + AlignedCodon lastCodon = null; + Map toAdd = new HashMap(); + + for (Entry> entry : alignedCodons + .entrySet()) + { + for (Entry sequenceCodon : entry.getValue() + .entrySet()) + { + SequenceI seq = sequenceCodon.getKey(); + if (sequencesChecked.contains(seq)) + { + continue; + } + sequencesChecked.add(seq); + AlignedCodon codon = sequenceCodon.getValue(); + if (codon.peptideCol > 1) + { + System.err + .println("Problem mapping protein with >1 unmapped start positions: " + + seq.getName()); + } + else if (codon.peptideCol == 1) + { + /* + * first position (peptideCol == 0) was unmapped - add it + */ + if (lastCodon != null) + { + AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1, + lastCodon.pos2, lastCodon.pos3, String.valueOf(seq + .getCharAt(0)), 0); + toAdd.put(seq, firstPeptide); + } + else + { + /* + * unmapped residue at start of alignment (no prior column) - + * 'insert' at nominal codon [0, 0, 0] + */ + AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0, + String.valueOf(seq.getCharAt(0)), 0); + toAdd.put(seq, firstPeptide); + } + } + if (sequencesChecked.size() == mappedSequenceCount) + { + // no need to check past first mapped position in all sequences + break; + } + } + lastCodon = entry.getKey(); + } + + /* + * add any new codons safely after iterating over the map + */ + for (Entry startCodon : toAdd.entrySet()) + { + addCodonToMap(alignedCodons, startCodon.getValue(), + startCodon.getKey()); + } } /** @@ -971,7 +1089,7 @@ public class AlignmentUtils * @return */ protected static int alignProteinAs(AlignmentI protein, - Map> alignedCodons, + Map> alignedCodons, List unmappedProtein) { /* @@ -993,12 +1111,13 @@ public class AlignmentUtils int column = 0; for (AlignedCodon codon : alignedCodons.keySet()) { - final Map columnResidues = alignedCodons + final Map columnResidues = alignedCodons .get(codon); - for (Entry entry : columnResidues.entrySet()) + for (Entry entry : columnResidues.entrySet()) { // place translated codon at its column position in sequence - entry.getKey().getSequence()[column] = entry.getValue().charAt(0); + entry.getKey().getSequence()[column] = entry.getValue().product + .charAt(0); } column++; } @@ -1023,21 +1142,20 @@ public class AlignmentUtils */ static void addCodonPositions(SequenceI dna, SequenceI protein, char gapChar, Mapping seqMap, - Map> alignedCodons) + Map> alignedCodons) { Iterator codons = seqMap.getCodonIterator(dna, gapChar); + + /* + * add codon positions, and their peptide translations, to the alignment + * map, while remembering the first codon mapped + */ while (codons.hasNext()) { try { AlignedCodon codon = codons.next(); - Map seqProduct = alignedCodons.get(codon); - if (seqProduct == null) - { - seqProduct = new HashMap(); - alignedCodons.put(codon, seqProduct); - } - seqProduct.put(protein, codon.product); + addCodonToMap(alignedCodons, codon, protein); } catch (IncompleteCodonException e) { // possible incomplete trailing codon - ignore @@ -1046,6 +1164,26 @@ public class AlignmentUtils } /** + * Helper method to add a codon-to-peptide entry to the aligned codons map + * + * @param alignedCodons + * @param codon + * @param protein + */ + protected static void addCodonToMap( + Map> alignedCodons, + AlignedCodon codon, SequenceI protein) + { + Map seqProduct = alignedCodons.get(codon); + if (seqProduct == null) + { + seqProduct = new HashMap(); + alignedCodons.put(codon, seqProduct); + } + seqProduct.put(protein, codon); + } + + /** * Returns true if a cDNA/Protein mapping either exists, or could be made, * between at least one pair of sequences in the two alignments. Currently, * the logic is: @@ -1474,8 +1612,7 @@ public class AlignmentUtils * @return */ protected static List> getMappedColumns( - List mappedSequences, - List mappings) + List mappedSequences, List mappings) { List> result = new ArrayList>(); for (SequenceI seq : mappedSequences) @@ -1668,8 +1805,8 @@ public class AlignmentUtils * @return */ protected static MapList addCdsMappings(SequenceI dnaSeq, - SequenceI cdsSeq, - Mapping dnaMapping, AlignedCodonFrame newMappings) + SequenceI cdsSeq, Mapping dnaMapping, + AlignedCodonFrame newMappings) { cdsSeq.createDatasetSequence(); @@ -1682,14 +1819,13 @@ public class AlignmentUtils cdsRanges.add(new int[] { 1, cdsDataset.getLength() }); MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap() .getToRanges(), 3, 1); - newMappings.addMap(cdsDataset, dnaMapping.getTo(), - cdsToPeptide); + newMappings.addMap(cdsDataset, dnaMapping.getTo(), cdsToPeptide); /* * dna 'from' ranges map 1:1 to the contiguous extracted CDS */ - MapList dnaToCds = new MapList( - dnaMapping.getMap().getFromRanges(), cdsRanges, 1, 1); + MapList dnaToCds = new MapList(dnaMapping.getMap().getFromRanges(), + cdsRanges, 1, 1); newMappings.addMap(dnaSeq, cdsDataset, dnaToCds); return dnaToCds; } diff --git a/src/jalview/datamodel/AlignedCodon.java b/src/jalview/datamodel/AlignedCodon.java index 6179831..39a1853 100644 --- a/src/jalview/datamodel/AlignedCodon.java +++ b/src/jalview/datamodel/AlignedCodon.java @@ -27,32 +27,38 @@ package jalview.datamodel; * * Example: in "G-AT-C-GA" the aligned codons are (0, 2, 3) and (5, 7, 8). * - * JBPComment: Is this useful anywhere other than jalview.analysis.Dna ? - * * @author gmcarstairs * */ public final class AlignedCodon { + // base 1 aligned sequence position (base 0) public final int pos1; + // base 2 aligned sequence position (base 0) public final int pos2; + // base 3 aligned sequence position (base 0) public final int pos3; + // peptide aligned sequence position (base 0) + public final int peptideCol; + + // peptide coded for by this codon public final String product; public AlignedCodon(int i, int j, int k) { - this(i, j, k, null); + this(i, j, k, null, 0); } - public AlignedCodon(int i, int j, int k, String prod) + public AlignedCodon(int i, int j, int k, String prod, int prodCol) { pos1 = i; pos2 = j; pos3 = k; product = prod; + peptideCol = prodCol; } /** diff --git a/src/jalview/datamodel/Mapping.java b/src/jalview/datamodel/Mapping.java index c4c4a2a..bd83fe9 100644 --- a/src/jalview/datamodel/Mapping.java +++ b/src/jalview/datamodel/Mapping.java @@ -155,8 +155,9 @@ public class Mapping int[] alignedCodon = getAlignedCodon(codon); String peptide = getPeptide(); + int peptideCol = toPosition - 1 - Mapping.this.to.getStart(); return new AlignedCodon(alignedCodon[0], alignedCodon[1], - alignedCodon[2], peptide); + alignedCodon[2], peptide, peptideCol); } /** @@ -164,6 +165,8 @@ public class Mapping * sequence. * * @return + * @throws NoSuchElementException + * if the 'toRange' is exhausted (nothing to map to) */ private String getPeptide() { @@ -701,6 +704,15 @@ public class Mapping super.finalize(); } + /** + * Returns an iterator which can serve up the aligned codon column positions + * and their corresponding peptide products + * + * @param seq + * an aligned (i.e. possibly gapped) sequence + * @param gapChar + * @return + */ public Iterator getCodonIterator(SequenceI seq, char gapChar) { return new AlignedCodonIterator(seq, gapChar); -- 1.7.10.2