import jalview.util.DBRefUtils;
import jalview.util.MapList;
import jalview.util.MappingUtils;
+import jalview.util.StringUtils;
import java.util.ArrayList;
import java.util.Arrays;
}
else
{
- MapList map = mapProteinSequenceToCdna(aaSeq, cdnaSeq);
+ MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
if (map != null)
{
acf.addMap(cdnaSeq, aaSeq, map);
}
/**
- * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA
- * must be three times the length of the protein, possibly after ignoring
- * start and/or stop codons, and must translate to the protein. Returns null
- * if no mapping is determined.
+ * Builds a mapping (if possible) of a cDNA to a protein sequence.
+ * <ul>
+ * <li>first checks if the cdna translates exactly to the protein sequence</li>
+ * <li>else checks for translation after removing a STOP codon</li>
+ * <li>else checks for translation after removing a START codon</li>
+ * <li>if that fails, inspect CDS features on the cDNA sequence</li>
+ * </ul>
+ * Returns null if no mapping is determined.
*
- * @param proteinSeqs
+ * @param proteinSeq
+ * the aligned protein sequence
* @param cdnaSeq
+ * the aligned cdna sequence
* @return
*/
- public static MapList mapProteinSequenceToCdna(SequenceI proteinSeq,
+ public static MapList mapCdnaToProtein(SequenceI proteinSeq,
SequenceI cdnaSeq)
{
/*
final int proteinEnd = proteinSeq.getEnd();
/*
- * If lengths don't match, try ignoring stop codon.
+ * If lengths don't match, try ignoring stop codon (if present)
*/
if (cdnaLength != mappedLength && cdnaLength > 2)
{
cdnaLength -= 3;
}
- if (cdnaLength != mappedLength)
- {
- return null;
- }
- if (!translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
+ if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
{
- return null;
+ /*
+ * protein is translation of dna (+/- start/stop codons)
+ */
+ MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[]
+ { proteinStart, proteinEnd }, 3, 1);
+ return map;
}
- MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[] {
- proteinStart, proteinEnd }, 3, 1);
- return map;
+
+ /*
+ * translation failed - try mapping CDS annotated regions of dna
+ */
+ return mapCdsToProtein(cdnaSeq, proteinSeq);
}
/**
return false;
}
- int aaResidue = 0;
- for (int i = cdnaStart; i < cdnaSeqChars.length - 2
- && aaResidue < aaSeqChars.length; i += 3, aaResidue++)
+ int aaPos = 0;
+ int dnaPos = cdnaStart;
+ for (; dnaPos < cdnaSeqChars.length - 2
+ && aaPos < aaSeqChars.length; dnaPos += 3, aaPos++)
{
- String codon = String.valueOf(cdnaSeqChars, i, 3);
+ String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
final String translated = ResidueProperties.codonTranslate(codon);
+
/*
* allow * in protein to match untranslatable in dna
*/
- final char aaRes = aaSeqChars[aaResidue];
+ final char aaRes = aaSeqChars[aaPos];
if ((translated == null || "STOP".equals(translated)) && aaRes == '*')
{
continue;
return false;
}
}
- // fail if we didn't match all of the aa sequence
- return (aaResidue == aaSeqChars.length);
+
+ /*
+ * check we matched all of the protein sequence
+ */
+ if (aaPos != aaSeqChars.length)
+ {
+ return false;
+ }
+
+ /*
+ * check we matched all of the dna except
+ * for optional trailing STOP codon
+ */
+ if (dnaPos == cdnaSeqChars.length)
+ {
+ return true;
+ }
+ if (dnaPos == cdnaSeqChars.length - 3)
+ {
+ String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
+ if ("STOP".equals(ResidueProperties.codonTranslate(codon)))
+ {
+ return true;
+ }
+ }
+ return false;
}
/**
Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
int mappedSequenceCount)
{
- // TODO there must be an easier way! root problem is that our mapping data
- // model does not include phase so can't map part of a codon to a peptide
+ // TODO delete this ugly hack once JAL-2022 is resolved
+ // i.e. we can model startPhase > 0 (incomplete start codon)
+
List<SequenceI> sequencesChecked = new ArrayList<SequenceI>();
AlignedCodon lastCodon = null;
Map<SequenceI, AlignedCodon> toAdd = new HashMap<SequenceI, AlignedCodon>();
* Just try to make a mapping (it is not yet stored), test whether
* successful.
*/
- return mapProteinSequenceToCdna(proteinDs, dnaDs) != null;
+ return mapCdnaToProtein(proteinDs, dnaDs) != null;
}
/**
/**
* Constructs an alignment consisting of the mapped (CDS) regions in the given
* nucleotide sequences, and updates mappings to match. The new sequences are
- * aligned as per the original sequences (with gapped columns omitted).
+ * aligned as per the original sequence, with entirely gapped columns (codon
+ * interrupted by intron) omitted.
*
* @param dna
* aligned dna sequences
to.setName(to.getName() + "|" + cdsAccId);
}
}
+
+ /**
+ * Returns a mapping from dna to protein by inspecting sequence features of
+ * type "CDS" on the dna.
+ *
+ * @param dnaSeq
+ * @param proteinSeq
+ * @return
+ */
+ public static MapList mapCdsToProtein(SequenceI dnaSeq,
+ SequenceI proteinSeq)
+ {
+ List<int[]> ranges = findCdsPositions(dnaSeq);
+ int mappedDnaLength = MappingUtils.getLength(ranges);
+
+ int proteinLength = proteinSeq.getLength();
+ int proteinStart = proteinSeq.getStart();
+ int proteinEnd = proteinSeq.getEnd();
+
+ /*
+ * incomplete start codon may mean X at start of peptide
+ * we ignore both for mapping purposes
+ */
+ if (proteinSeq.getCharAt(0) == 'X')
+ {
+ // todo JAL-2022 support startPhase > 0
+ proteinStart++;
+ proteinLength--;
+ }
+ List<int[]> proteinRange = new ArrayList<int[]>();
+
+ /*
+ * dna length should map to protein (or protein plus stop codon)
+ */
+ int codesForResidues = mappedDnaLength / 3;
+ if (codesForResidues == (proteinLength + 1))
+ {
+ // assuming extra codon is for STOP and not in peptide
+ codesForResidues--;
+ }
+ if (codesForResidues == proteinLength)
+ {
+ proteinRange.add(new int[] { proteinStart, proteinEnd });
+ return new MapList(ranges, proteinRange, 3, 1);
+ }
+ return null;
+ }
+
+ /**
+ * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
+ * start/end positions of sequence features of type "CDS" (or a sub-type of
+ * CDS in the Sequence Ontology)
+ *
+ * @param dnaSeq
+ * @return
+ */
+ public static List<int[]> findCdsPositions(SequenceI dnaSeq)
+ {
+ List<int[]> result = new ArrayList<int[]>();
+ SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
+ if (sfs == null)
+ {
+ return result;
+ }
+ SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+ for (SequenceFeature sf : sfs)
+ {
+ /*
+ * process a CDS feature (or a sub-type of CDS)
+ */
+ if (so.isA(sf.getType(), SequenceOntologyI.CDS))
+ {
+ int phase = 0;
+ try {
+ phase = Integer.parseInt(sf.getPhase());
+ } catch (NumberFormatException e)
+ {
+ // ignore
+ }
+ /*
+ * phase > 0 on first codon means 5' incomplete - skip to the start
+ * of the next codon; example ENST00000496384
+ */
+ int begin = sf.getBegin();
+ int end = sf.getEnd();
+ if (result.isEmpty())
+ {
+ // TODO JAL-2022 support start phase > 0
+ begin += phase;
+ if (begin > end)
+ {
+ continue; // shouldn't happen?
+ }
+ }
+ result.add(new int[] { begin, end });
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Maps exon features from dna to protein, and computes variants in peptide
+ * product generated by variants in dna, and adds them as sequence_variant
+ * features on the protein sequence. Returns the number of variant features
+ * added.
+ *
+ * @param dnaSeq
+ * @param peptide
+ * @param dnaToProtein
+ */
+ public static int computeProteinFeatures(SequenceI dnaSeq,
+ SequenceI peptide, MapList dnaToProtein)
+ {
+ while (dnaSeq.getDatasetSequence() != null)
+ {
+ dnaSeq = dnaSeq.getDatasetSequence();
+ }
+ while (peptide.getDatasetSequence() != null)
+ {
+ peptide = peptide.getDatasetSequence();
+ }
+
+ transferFeatures(dnaSeq, peptide, dnaToProtein,
+ SequenceOntologyI.EXON);
+
+ LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
+ dnaSeq, dnaToProtein);
+
+ /*
+ * scan codon variations, compute peptide variants and add to peptide sequence
+ */
+ int count = 0;
+ for (Entry<Integer, String[][]> variant : variants.entrySet())
+ {
+ int peptidePos = variant.getKey();
+ String[][] codonVariants = variant.getValue();
+ String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
+ List<String> peptideVariants = computePeptideVariants(codonVariants,
+ residue);
+ if (!peptideVariants.isEmpty())
+ {
+ String desc = StringUtils.listToDelimitedString(peptideVariants,
+ ", ");
+ SequenceFeature sf = new SequenceFeature(
+ SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
+ peptidePos, 0f, null);
+ peptide.addSequenceFeature(sf);
+ count++;
+ }
+ }
+
+ /*
+ * ugly sort to get sequence features in start position order
+ * - would be better to store in Sequence as a TreeSet instead?
+ */
+ Arrays.sort(peptide.getSequenceFeatures(),
+ new Comparator<SequenceFeature>()
+ {
+ @Override
+ public int compare(SequenceFeature o1, SequenceFeature o2)
+ {
+ int c = Integer.compare(o1.getBegin(), o2.getBegin());
+ return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
+ : c;
+ }
+ });
+ return count;
+ }
+
+ /**
+ * Builds a map whose key is position in the protein sequence, and value is an
+ * array of all variants for the coding codon positions
+ *
+ * @param dnaSeq
+ * @param dnaToProtein
+ * @return
+ */
+ static LinkedHashMap<Integer, String[][]> buildDnaVariantsMap(
+ SequenceI dnaSeq, MapList dnaToProtein)
+ {
+ /*
+ * map from peptide position to all variant features of the codon for it
+ * LinkedHashMap ensures we add the peptide features in sequence order
+ */
+ LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
+ SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+
+ SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
+ if (dnaFeatures == null)
+ {
+ return variants;
+ }
+
+ int dnaStart = dnaSeq.getStart();
+ int[] lastCodon = null;
+ int lastPeptidePostion = 0;
+
+ /*
+ * build a map of codon variations for peptides
+ */
+ for (SequenceFeature sf : dnaFeatures)
+ {
+ int dnaCol = sf.getBegin();
+ if (dnaCol != sf.getEnd())
+ {
+ // not handling multi-locus variant features
+ continue;
+ }
+ if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
+ {
+ int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
+ if (mapsTo == null)
+ {
+ // feature doesn't lie within coding region
+ continue;
+ }
+ int peptidePosition = mapsTo[0];
+ String[][] codonVariants = variants.get(peptidePosition);
+ if (codonVariants == null)
+ {
+ codonVariants = new String[3][];
+ variants.put(peptidePosition, codonVariants);
+ }
+
+ /*
+ * extract dna variants to a string array
+ */
+ String alls = (String) sf.getValue("alleles");
+ if (alls == null)
+ {
+ continue;
+ }
+ String[] alleles = alls.toUpperCase().split(",");
+ int i = 0;
+ for (String allele : alleles)
+ {
+ alleles[i++] = allele.trim(); // lose any space characters "A, G"
+ }
+
+ /*
+ * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
+ */
+ int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
+ : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
+ peptidePosition, peptidePosition));
+ lastPeptidePostion = peptidePosition;
+ lastCodon = codon;
+
+ /*
+ * save nucleotide (and this variant) for each codon position
+ */
+ for (int codonPos = 0; codonPos < 3; codonPos++)
+ {
+ String nucleotide = String.valueOf(
+ dnaSeq.getCharAt(codon[codonPos] - dnaStart))
+ .toUpperCase();
+ if (codonVariants[codonPos] == null)
+ {
+ /*
+ * record current dna base
+ */
+ codonVariants[codonPos] = new String[] { nucleotide };
+ }
+ if (codon[codonPos] == dnaCol)
+ {
+ /*
+ * add alleles to dna base (and any previously found alleles)
+ */
+ String[] known = codonVariants[codonPos];
+ String[] dnaVariants = new String[alleles.length + known.length];
+ System.arraycopy(known, 0, dnaVariants, 0, known.length);
+ System.arraycopy(alleles, 0, dnaVariants, known.length,
+ alleles.length);
+ codonVariants[codonPos] = dnaVariants;
+ }
+ }
+ }
+ }
+ return variants;
+ }
+
+ /**
+ * Returns a sorted, non-redundant list of all peptide translations generated
+ * by the given dna variants, excluding the current residue value
+ *
+ * @param codonVariants
+ * an array of base values (acgtACGT) for codon positions 1, 2, 3
+ * @param residue
+ * the current residue translation
+ * @return
+ */
+ static List<String> computePeptideVariants(
+ String[][] codonVariants, String residue)
+ {
+ List<String> result = new ArrayList<String>();
+ for (String base1 : codonVariants[0])
+ {
+ for (String base2 : codonVariants[1])
+ {
+ for (String base3 : codonVariants[2])
+ {
+ String codon = base1 + base2 + base3;
+ /*
+ * get peptide translation of codon e.g. GAT -> D
+ * note that variants which are not single alleles,
+ * e.g. multibase variants or HGMD_MUTATION etc
+ * are ignored here
+ */
+ String peptide = codon.contains("-") ? "-"
+ : (codon.length() > 3 ? null : ResidueProperties
+ .codonTranslate(codon));
+ if (peptide != null && !result.contains(peptide)
+ && !peptide.equalsIgnoreCase(residue))
+ {
+ result.add(peptide);
+ }
+ }
+ }
+ }
+
+ /*
+ * sort alphabetically with STOP at the end
+ */
+ Collections.sort(result, new Comparator<String>()
+ {
+
+ @Override
+ public int compare(String o1, String o2)
+ {
+ if ("STOP".equals(o1))
+ {
+ return 1;
+ }
+ else if ("STOP".equals(o2))
+ {
+ return -1;
+ }
+ else
+ {
+ return o1.compareTo(o2);
+ }
+ }
+ });
+ return result;
+ }
}