*/
package jalview.analysis;
+import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE;
+
import jalview.datamodel.AlignedCodon;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.util.MappingUtils;
import jalview.util.StringUtils;
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
public class AlignmentUtils
{
+ private static final String SEQUENCE_VARIANT = "sequence_variant:";
+ private static final String ID = "ID";
+
+ /**
+ * A data model to hold the 'normal' base value at a position, and an optional
+ * sequence variant feature
+ */
+ static class DnaVariant
+ {
+ String base;
+
+ SequenceFeature variant;
+
+ DnaVariant(String nuc)
+ {
+ base = nuc;
+ }
+
+ DnaVariant(String nuc, SequenceFeature var)
+ {
+ base = nuc;
+ variant = var;
+ }
+ }
+
/**
* given an existing alignment, create a new alignment including all, or up to
* flankSize additional symbols from each sequence's dataset sequence
* /ENSP00000288602?feature=transcript_variation;content-type=text/xml
* which would be a bit slower but possibly more reliable
*/
- LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
+
+ /*
+ * build a map with codon variations for each potentially varying peptide
+ */
+ LinkedHashMap<Integer, List<DnaVariant>[]> variants = buildDnaVariantsMap(
dnaSeq, dnaToProtein);
/*
* scan codon variations, compute peptide variants and add to peptide sequence
*/
int count = 0;
- for (Entry<Integer, String[][]> variant : variants.entrySet())
+ for (Entry<Integer, List<DnaVariant>[]> variant : variants.entrySet())
{
int peptidePos = variant.getKey();
- String[][] codonVariants = variant.getValue();
- String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
- List<String> peptideVariants = computePeptideVariants(codonVariants,
- residue);
- if (!peptideVariants.isEmpty())
- {
- String desc = residue + "," // include canonical residue in description
- + StringUtils.listToDelimitedString(peptideVariants, ", ");
- SequenceFeature sf = new SequenceFeature(
- SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
- peptidePos, 0f, null);
- peptide.addSequenceFeature(sf);
- count++;
- }
+ List<DnaVariant>[] codonVariants = variant.getValue();
+ count += computePeptideVariants(peptide, peptidePos, codonVariants);
}
/*
- * ugly sort to get sequence features in start position order
- * - would be better to store in Sequence as a TreeSet instead?
+ * sort to get sequence features in start position order
+ * - would be better to store in Sequence as a TreeSet or NCList?
*/
Arrays.sort(peptide.getSequenceFeatures(),
new Comparator<SequenceFeature>()
}
/**
- * Builds a map whose key is position in the protein sequence, and value is an
- * array of all variants for the coding codon positions
+ * Computes non-synonymous peptide variants from codon variants and adds them
+ * as sequence_variant features on the protein sequence (one feature per
+ * allele variant). Selected attributes (variant id, clinical significance)
+ * are copied over to the new features.
+ *
+ * @param peptide
+ * the protein sequence
+ * @param peptidePos
+ * the position to compute peptide variants for
+ * @param codonVariants
+ * a list of dna variants per codon position
+ * @return the number of features added
+ */
+ static int computePeptideVariants(SequenceI peptide, int peptidePos,
+ List<DnaVariant>[] codonVariants)
+ {
+ String residue = String.valueOf(peptide.getCharAt(peptidePos - 1));
+ int count = 0;
+ String base1 = codonVariants[0].get(0).base;
+ String base2 = codonVariants[1].get(0).base;
+ String base3 = codonVariants[2].get(0).base;
+
+ /*
+ * variants in first codon base
+ */
+ for (DnaVariant var : codonVariants[0])
+ {
+ if (var.variant != null)
+ {
+ String alleles = (String) var.variant.getValue("alleles");
+ if (alleles != null)
+ {
+ for (String base : alleles.split(","))
+ {
+ String codon = base + base2 + base3;
+ if (addPeptideVariant(peptide, peptidePos, residue, var, codon))
+ {
+ count++;
+ }
+ }
+ }
+ }
+ }
+
+ /*
+ * variants in second codon base
+ */
+ for (DnaVariant var : codonVariants[1])
+ {
+ if (var.variant != null)
+ {
+ String alleles = (String) var.variant.getValue("alleles");
+ if (alleles != null)
+ {
+ for (String base : alleles.split(","))
+ {
+ String codon = base1 + base + base3;
+ if (addPeptideVariant(peptide, peptidePos, residue, var, codon))
+ {
+ count++;
+ }
+ }
+ }
+ }
+ }
+
+ /*
+ * variants in third codon base
+ */
+ for (DnaVariant var : codonVariants[2])
+ {
+ if (var.variant != null)
+ {
+ String alleles = (String) var.variant.getValue("alleles");
+ if (alleles != null)
+ {
+ for (String base : alleles.split(","))
+ {
+ String codon = base1 + base2 + base;
+ if (addPeptideVariant(peptide, peptidePos, residue, var, codon))
+ {
+ count++;
+ }
+ }
+ }
+ }
+ }
+
+ return count;
+ }
+
+ /**
+ * Helper method that adds a peptide variant feature, provided the given codon
+ * translates to a value different to the current residue (is a non-synonymous
+ * variant). ID and clinical_significance attributes of the dna variant (if
+ * present) are copied to the new feature.
+ *
+ * @param peptide
+ * @param peptidePos
+ * @param residue
+ * @param var
+ * @param codon
+ * @return true if a feature was added, else false
+ */
+ static boolean addPeptideVariant(SequenceI peptide, int peptidePos,
+ String residue, DnaVariant var, String codon)
+ {
+ /*
+ * get peptide translation of codon e.g. GAT -> D
+ * note that variants which are not single alleles,
+ * e.g. multibase variants or HGMD_MUTATION etc
+ * are currently ignored here
+ */
+ String trans = codon.contains("-") ? "-"
+ : (codon.length() > 3 ? null : ResidueProperties
+ .codonTranslate(codon));
+ if (trans != null && !trans.equals(residue))
+ {
+ String residue3Char = StringUtils
+ .toSentenceCase(ResidueProperties.aa2Triplet.get(residue));
+ String trans3Char = StringUtils
+ .toSentenceCase(ResidueProperties.aa2Triplet.get(trans));
+ String desc = "p." + residue3Char + peptidePos + trans3Char;
+ // set score to 0f so 'graduated colour' option is offered! JAL-2060
+ SequenceFeature sf = new SequenceFeature(
+ SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
+ peptidePos, 0f, "Jalview");
+ StringBuilder attributes = new StringBuilder(32);
+ String id = (String) var.variant.getValue(ID);
+ if (id != null)
+ {
+ if (id.startsWith(SEQUENCE_VARIANT))
+ {
+ id = id.substring(SEQUENCE_VARIANT.length());
+ }
+ sf.setValue(ID, id);
+ attributes.append(ID).append("=").append(id);
+ // TODO handle other species variants
+ StringBuilder link = new StringBuilder(32);
+ try
+ {
+ link.append(desc).append(" ").append(id)
+ .append("|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=")
+ .append(URLEncoder.encode(id, "UTF-8"));
+ sf.addLink(link.toString());
+ } catch (UnsupportedEncodingException e)
+ {
+ // as if
+ }
+ }
+ String clinSig = (String) var.variant
+ .getValue(CLINICAL_SIGNIFICANCE);
+ if (clinSig != null)
+ {
+ sf.setValue(CLINICAL_SIGNIFICANCE, clinSig);
+ attributes.append(";").append(CLINICAL_SIGNIFICANCE).append("=")
+ .append(clinSig);
+ }
+ peptide.addSequenceFeature(sf);
+ if (attributes.length() > 0)
+ {
+ sf.setAttributes(attributes.toString());
+ }
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Builds a map whose key is position in the protein sequence, and value is a
+ * list of the base and all variants for each corresponding codon position
*
* @param dnaSeq
* @param dnaToProtein
* @return
*/
- static LinkedHashMap<Integer, String[][]> buildDnaVariantsMap(
+ static LinkedHashMap<Integer, List<DnaVariant>[]> buildDnaVariantsMap(
SequenceI dnaSeq, MapList dnaToProtein)
{
/*
- * map from peptide position to all variant features of the codon for it
- * LinkedHashMap ensures we add the peptide features in sequence order
+ * map from peptide position to all variants of the codon which codes for it
+ * LinkedHashMap ensures we keep the peptide features in sequence order
*/
- LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
+ LinkedHashMap<Integer, List<DnaVariant>[]> variants = new LinkedHashMap<Integer, List<DnaVariant>[]>();
SequenceOntologyI so = SequenceOntologyFactory.getInstance();
SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
continue;
}
int peptidePosition = mapsTo[0];
- String[][] codonVariants = variants.get(peptidePosition);
+ List<DnaVariant>[] codonVariants = variants.get(peptidePosition);
if (codonVariants == null)
{
- codonVariants = new String[3][];
+ codonVariants = new ArrayList[3];
+ codonVariants[0] = new ArrayList<DnaVariant>();
+ codonVariants[1] = new ArrayList<DnaVariant>();
+ codonVariants[2] = new ArrayList<DnaVariant>();
variants.put(peptidePosition, codonVariants);
}
lastCodon = codon;
/*
- * save nucleotide (and this variant) for each codon position
+ * save nucleotide (and any variant) for each codon position
*/
for (int codonPos = 0; codonPos < 3; codonPos++)
{
String nucleotide = String.valueOf(
dnaSeq.getCharAt(codon[codonPos] - dnaStart))
.toUpperCase();
- if (codonVariants[codonPos] == null)
+ List<DnaVariant> codonVariant = codonVariants[codonPos];
+ if (codon[codonPos] == dnaCol)
{
- /*
- * record current dna base
- */
- codonVariants[codonPos] = new String[] { nucleotide };
+ if (!codonVariant.isEmpty()
+ && codonVariant.get(0).variant == null)
+ {
+ /*
+ * already recorded base value, add this variant
+ */
+ codonVariant.get(0).variant = sf;
+ }
+ else
+ {
+ /*
+ * add variant with base value
+ */
+ codonVariant.add(new DnaVariant(nucleotide, sf));
+ }
}
- if (codon[codonPos] == dnaCol)
+ else if (codonVariant.isEmpty())
{
/*
- * add alleles to dna base (and any previously found alleles)
+ * record (possibly non-varying) base value
*/
- String[] known = codonVariants[codonPos];
- String[] dnaVariants = new String[alleles.length + known.length];
- System.arraycopy(known, 0, dnaVariants, 0, known.length);
- System.arraycopy(alleles, 0, dnaVariants, known.length,
- alleles.length);
- codonVariants[codonPos] = dnaVariants;
+ codonVariant.add(new DnaVariant(nucleotide));
}
}
}
}
/**
- * Returns a sorted, non-redundant list of all peptide translations generated
- * by the given dna variants, excluding the current residue value
- *
- * @param codonVariants
- * an array of base values (acgtACGT) for codon positions 1, 2, 3
- * @param residue
- * the current residue translation
- * @return
- */
- static List<String> computePeptideVariants(String[][] codonVariants,
- String residue)
- {
- List<String> result = new ArrayList<String>();
- for (String base1 : codonVariants[0])
- {
- for (String base2 : codonVariants[1])
- {
- for (String base3 : codonVariants[2])
- {
- String codon = base1 + base2 + base3;
- /*
- * get peptide translation of codon e.g. GAT -> D
- * note that variants which are not single alleles,
- * e.g. multibase variants or HGMD_MUTATION etc
- * are ignored here
- */
- String peptide = codon.contains("-") ? "-"
- : (codon.length() > 3 ? null : ResidueProperties
- .codonTranslate(codon));
- if (peptide != null && !result.contains(peptide)
- && !peptide.equalsIgnoreCase(residue))
- {
- result.add(peptide);
- }
- }
- }
- }
-
- /*
- * sort alphabetically with STOP at the end
- */
- Collections.sort(result, new Comparator<String>()
- {
-
- @Override
- public int compare(String o1, String o2)
- {
- if ("STOP".equals(o1))
- {
- return 1;
- }
- else if ("STOP".equals(o2))
- {
- return -1;
- }
- else
- {
- return o1.compareTo(o2);
- }
- }
- });
- return result;
- }
-
- /**
* Makes an alignment with a copy of the given sequences, adding in any
* non-redundant sequences which are mapped to by the cross-referenced
* sequences.
{
AlignmentI copy = new Alignment(new Alignment(seqs));
+ /*
+ * add mappings between sequences to the new alignment
+ */
+ AlignedCodonFrame mappings = new AlignedCodonFrame();
+ copy.addCodonFrame(mappings);
+ for (int i = 0; i < copy.getHeight(); i++)
+ {
+ SequenceI from = seqs[i];
+ SequenceI to = copy.getSequenceAt(i);
+ if (to.getDatasetSequence() != null)
+ {
+ to = to.getDatasetSequence();
+ }
+ int start = from.getStart();
+ int end = from.getEnd();
+ MapList map = new MapList(new int[] { start, end }, new int[] {
+ start, end }, 1, 1);
+ mappings.addMap(to, from, map);
+ }
+
SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
if (xrefs != null)
{