*/
package jalview.analysis;
+import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE;
+
import jalview.datamodel.AlignedCodon;
import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
import jalview.util.MappingUtils;
import jalview.util.StringUtils;
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
public class AlignmentUtils
{
+ private static final String SEQUENCE_VARIANT = "sequence_variant:";
+ private static final String ID = "ID";
+
+ /**
+ * A data model to hold the 'normal' base value at a position, and an optional
+ * sequence variant feature
+ */
+ static class DnaVariant
+ {
+ String base;
+
+ SequenceFeature variant;
+
+ DnaVariant(String nuc)
+ {
+ base = nuc;
+ }
+
+ DnaVariant(String nuc, SequenceFeature var)
+ {
+ base = nuc;
+ variant = var;
+ }
+ }
+
/**
* given an existing alignment, create a new alignment including all, or up to
* flankSize additional symbols from each sequence's dataset sequence
Collection<String> types, List<SequenceI> forSequences,
boolean anyType, boolean doShow)
{
- for (AlignmentAnnotation aa : al.getAlignmentAnnotation())
+ AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
+ if (anns != null)
{
- if (anyType || types.contains(aa.label))
+ for (AlignmentAnnotation aa : anns)
{
- if ((aa.sequenceRef != null)
- && (forSequences == null || forSequences
- .contains(aa.sequenceRef)))
+ if (anyType || types.contains(aa.label))
{
- aa.visible = doShow;
+ if ((aa.sequenceRef != null)
+ && (forSequences == null || forSequences
+ .contains(aa.sequenceRef)))
+ {
+ aa.visible = doShow;
+ }
}
}
}
* Constructs an alignment consisting of the mapped (CDS) regions in the given
* nucleotide sequences, and updates mappings to match. The CDS sequences are
* added to the original alignment's dataset, which is shared by the new
- * alignment.
+ * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
+ * added to the alignment dataset.
*
* @param dna
* aligned dna sequences
- * @param mappings
- * from dna to protein; these are replaced with new mappings
- * @param al
+ * @param dataset
+ * - throws error if not given a dataset
+ * @param products
+ * (optional) to restrict results to CDS that map to specified
+ * protein products
* @return an alignment whose sequences are the cds-only parts of the dna
* sequences (or null if no mappings are found)
*/
public static AlignmentI makeCdsAlignment(SequenceI[] dna,
- List<AlignedCodonFrame> mappings, AlignmentI al)
+ AlignmentI dataset, AlignmentI products)
{
+ if (dataset.getDataset() != null)
+ {
+ throw new Error(
+ "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
+ }
List<SequenceI> cdsSeqs = new ArrayList<SequenceI>();
-
+ List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
+ HashSet<SequenceI> productSeqs = null;
+ if (products != null)
+ {
+ productSeqs = new HashSet<SequenceI>();
+ for (SequenceI seq : products.getSequences())
+ {
+ productSeqs.add(seq.getDatasetSequence() == null ? seq : seq
+ .getDatasetSequence());
+ }
+ }
+
+ /*
+ * construct CDS sequences from the (cds-to-protein) mappings made earlier;
+ * this makes it possible to model multiple products from dna (e.g. EMBL);
+ * however it does mean we don't have the EMBL protein_id (a property on
+ * the CDS features) in order to make the CDS sequence name :-(
+ */
for (SequenceI seq : dna)
{
- AlignedCodonFrame cdsMappings = new AlignedCodonFrame();
+ SequenceI seqDss = seq.getDatasetSequence() == null ? seq : seq
+ .getDatasetSequence();
List<AlignedCodonFrame> seqMappings = MappingUtils
.findMappingsForSequence(seq, mappings);
- List<AlignedCodonFrame> alignmentMappings = al.getCodonFrames();
for (AlignedCodonFrame mapping : seqMappings)
{
- for (Mapping aMapping : mapping.getMappingsFromSequence(seq))
+ List<Mapping> mappingsFromSequence = mapping.getMappingsFromSequence(seq);
+
+ for (Mapping aMapping : mappingsFromSequence)
{
- SequenceI cdsSeq = makeCdsSequence(seq.getDatasetSequence(),
- aMapping);
+ if (aMapping.getMap().getFromRatio() == 1)
+ {
+ /*
+ * not a dna-to-protein mapping (likely dna-to-cds)
+ */
+ continue;
+ }
+
+ /*
+ * check for an existing CDS sequence i.e. a 3:1 mapping to
+ * the dna mapping's product
+ */
+ SequenceI cdsSeq = null;
+
+ // TODO better mappings collection data model so we can do
+ // a direct lookup instead of double loops to find mappings
+
+ SequenceI proteinProduct = aMapping.getTo();
+
+ /*
+ * skip if not mapped to one of a specified set of proteins
+ */
+ if (productSeqs != null && !productSeqs.contains(proteinProduct))
+ {
+ continue;
+ }
+
+ for (AlignedCodonFrame acf : MappingUtils
+ .findMappingsForSequence(proteinProduct, mappings))
+ {
+ for (SequenceToSequenceMapping map : acf.getMappings())
+ {
+ if (map.getMapping().getMap().getFromRatio() == 3
+ && proteinProduct == map.getMapping().getTo()
+ && seqDss != map.getFromSeq())
+ {
+ /*
+ * found a 3:1 mapping to the protein product which is not
+ * from the dna sequence...assume it is from the CDS sequence
+ * TODO mappings data model that brings together related
+ * dna-cds-protein mappings in one object
+ */
+ cdsSeq = map.getFromSeq();
+ }
+ }
+ }
+ if (cdsSeq != null)
+ {
+ /*
+ * mappings are always to dataset sequences so create an aligned
+ * sequence to own it; add the dataset sequence to the dataset
+ */
+ SequenceI derivedSequence = cdsSeq.deriveSequence();
+ cdsSeqs.add(derivedSequence);
+ if (!dataset.getSequences().contains(cdsSeq))
+ {
+ dataset.addSequence(cdsSeq);
+ }
+ continue;
+ }
+
+ /*
+ * didn't find mapped CDS sequence - construct it and add
+ * its dataset sequence to the dataset
+ */
+ cdsSeq = makeCdsSequence(seq.getDatasetSequence(), aMapping);
+ SequenceI cdsSeqDss = cdsSeq.createDatasetSequence();
cdsSeqs.add(cdsSeq);
-
+ if (!dataset.getSequences().contains(cdsSeqDss))
+ {
+ dataset.addSequence(cdsSeqDss);
+ }
+
/*
* add a mapping from CDS to the (unchanged) mapped to range
*/
MapList map = new MapList(cdsRange, aMapping.getMap()
.getToRanges(), aMapping.getMap().getFromRatio(),
aMapping.getMap().getToRatio());
- cdsMappings.addMap(cdsSeq, aMapping.getTo(), map);
+ AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
+ cdsToProteinMapping.addMap(cdsSeq, proteinProduct, map);
+
+ /*
+ * guard against duplicating the mapping if repeating this action
+ */
+ if (!mappings.contains(cdsToProteinMapping))
+ {
+ mappings.add(cdsToProteinMapping);
+ }
/*
* add another mapping from original 'from' range to CDS
*/
+ AlignedCodonFrame dnaToProteinMapping = new AlignedCodonFrame();
map = new MapList(aMapping.getMap().getFromRanges(), cdsRange, 1,
1);
- cdsMappings.addMap(seq.getDatasetSequence(), cdsSeq, map);
+ dnaToProteinMapping.addMap(seq.getDatasetSequence(), cdsSeq, map);
+ if (!mappings.contains(dnaToProteinMapping))
+ {
+ mappings.add(dnaToProteinMapping);
+ }
- alignmentMappings.add(cdsMappings);
/*
* transfer any features on dna that overlap the CDS
}
}
- /*
- * add CDS seqs to shared dataset
- */
- Alignment dataset = al.getDataset();
- for (SequenceI seq : cdsSeqs)
- {
- if (!dataset.getSequences().contains(seq.getDatasetSequence()))
- {
- dataset.addSequence(seq.getDatasetSequence());
- }
- }
AlignmentI cds = new Alignment(cdsSeqs.toArray(new SequenceI[cdsSeqs
.size()]));
cds.setDataset(dataset);
*
* @param seq
* @param mapping
- * @return
+ * @return CDS sequence (as a dataset sequence)
*/
static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping)
{
SequenceI newSeq = new Sequence(seq.getName() + "|"
+ mapping.getTo().getName(), newSeqChars, 1, newPos);
- newSeq.createDatasetSequence();
return newSeq;
}
* /ENSP00000288602?feature=transcript_variation;content-type=text/xml
* which would be a bit slower but possibly more reliable
*/
- LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
+
+ /*
+ * build a map with codon variations for each potentially varying peptide
+ */
+ LinkedHashMap<Integer, List<DnaVariant>[]> variants = buildDnaVariantsMap(
dnaSeq, dnaToProtein);
/*
* scan codon variations, compute peptide variants and add to peptide sequence
*/
int count = 0;
- for (Entry<Integer, String[][]> variant : variants.entrySet())
+ for (Entry<Integer, List<DnaVariant>[]> variant : variants.entrySet())
{
int peptidePos = variant.getKey();
- String[][] codonVariants = variant.getValue();
- String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
- List<String> peptideVariants = computePeptideVariants(codonVariants,
- residue);
- if (!peptideVariants.isEmpty())
+ List<DnaVariant>[] codonVariants = variant.getValue();
+ count += computePeptideVariants(peptide, peptidePos, codonVariants);
+ }
+
+ /*
+ * sort to get sequence features in start position order
+ * - would be better to store in Sequence as a TreeSet or NCList?
+ */
+ if (peptide.getSequenceFeatures() != null)
+ {
+ Arrays.sort(peptide.getSequenceFeatures(),
+ new Comparator<SequenceFeature>()
+ {
+ @Override
+ public int compare(SequenceFeature o1, SequenceFeature o2)
+ {
+ int c = Integer.compare(o1.getBegin(), o2.getBegin());
+ return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
+ : c;
+ }
+ });
+ }
+ return count;
+ }
+
+ /**
+ * Computes non-synonymous peptide variants from codon variants and adds them
+ * as sequence_variant features on the protein sequence (one feature per
+ * allele variant). Selected attributes (variant id, clinical significance)
+ * are copied over to the new features.
+ *
+ * @param peptide
+ * the protein sequence
+ * @param peptidePos
+ * the position to compute peptide variants for
+ * @param codonVariants
+ * a list of dna variants per codon position
+ * @return the number of features added
+ */
+ static int computePeptideVariants(SequenceI peptide, int peptidePos,
+ List<DnaVariant>[] codonVariants)
+ {
+ String residue = String.valueOf(peptide.getCharAt(peptidePos - 1));
+ int count = 0;
+ String base1 = codonVariants[0].get(0).base;
+ String base2 = codonVariants[1].get(0).base;
+ String base3 = codonVariants[2].get(0).base;
+
+ /*
+ * variants in first codon base
+ */
+ for (DnaVariant var : codonVariants[0])
+ {
+ if (var.variant != null)
{
- String desc = residue + "," // include canonical residue in description
- + StringUtils.listToDelimitedString(peptideVariants, ", ");
- SequenceFeature sf = new SequenceFeature(
- SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
- peptidePos, 0f, null);
- peptide.addSequenceFeature(sf);
- count++;
+ String alleles = (String) var.variant.getValue("alleles");
+ if (alleles != null)
+ {
+ for (String base : alleles.split(","))
+ {
+ String codon = base + base2 + base3;
+ if (addPeptideVariant(peptide, peptidePos, residue, var, codon))
+ {
+ count++;
+ }
+ }
+ }
}
}
/*
- * ugly sort to get sequence features in start position order
- * - would be better to store in Sequence as a TreeSet instead?
+ * variants in second codon base
*/
- Arrays.sort(peptide.getSequenceFeatures(),
- new Comparator<SequenceFeature>()
+ for (DnaVariant var : codonVariants[1])
+ {
+ if (var.variant != null)
+ {
+ String alleles = (String) var.variant.getValue("alleles");
+ if (alleles != null)
+ {
+ for (String base : alleles.split(","))
+ {
+ String codon = base1 + base + base3;
+ if (addPeptideVariant(peptide, peptidePos, residue, var, codon))
{
- @Override
- public int compare(SequenceFeature o1, SequenceFeature o2)
- {
- int c = Integer.compare(o1.getBegin(), o2.getBegin());
- return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
- : c;
- }
- });
+ count++;
+ }
+ }
+ }
+ }
+ }
+
+ /*
+ * variants in third codon base
+ */
+ for (DnaVariant var : codonVariants[2])
+ {
+ if (var.variant != null)
+ {
+ String alleles = (String) var.variant.getValue("alleles");
+ if (alleles != null)
+ {
+ for (String base : alleles.split(","))
+ {
+ String codon = base1 + base2 + base;
+ if (addPeptideVariant(peptide, peptidePos, residue, var, codon))
+ {
+ count++;
+ }
+ }
+ }
+ }
+ }
+
return count;
}
/**
- * Builds a map whose key is position in the protein sequence, and value is an
- * array of all variants for the coding codon positions
+ * Helper method that adds a peptide variant feature, provided the given codon
+ * translates to a value different to the current residue (is a non-synonymous
+ * variant). ID and clinical_significance attributes of the dna variant (if
+ * present) are copied to the new feature.
+ *
+ * @param peptide
+ * @param peptidePos
+ * @param residue
+ * @param var
+ * @param codon
+ * @return true if a feature was added, else false
+ */
+ static boolean addPeptideVariant(SequenceI peptide, int peptidePos,
+ String residue, DnaVariant var, String codon)
+ {
+ /*
+ * get peptide translation of codon e.g. GAT -> D
+ * note that variants which are not single alleles,
+ * e.g. multibase variants or HGMD_MUTATION etc
+ * are currently ignored here
+ */
+ String trans = codon.contains("-") ? "-"
+ : (codon.length() > 3 ? null : ResidueProperties
+ .codonTranslate(codon));
+ if (trans != null && !trans.equals(residue))
+ {
+ String residue3Char = StringUtils
+ .toSentenceCase(ResidueProperties.aa2Triplet.get(residue));
+ String trans3Char = StringUtils
+ .toSentenceCase(ResidueProperties.aa2Triplet.get(trans));
+ String desc = "p." + residue3Char + peptidePos + trans3Char;
+ // set score to 0f so 'graduated colour' option is offered! JAL-2060
+ SequenceFeature sf = new SequenceFeature(
+ SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
+ peptidePos, 0f, "Jalview");
+ StringBuilder attributes = new StringBuilder(32);
+ String id = (String) var.variant.getValue(ID);
+ if (id != null)
+ {
+ if (id.startsWith(SEQUENCE_VARIANT))
+ {
+ id = id.substring(SEQUENCE_VARIANT.length());
+ }
+ sf.setValue(ID, id);
+ attributes.append(ID).append("=").append(id);
+ // TODO handle other species variants
+ StringBuilder link = new StringBuilder(32);
+ try
+ {
+ link.append(desc).append(" ").append(id)
+ .append("|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=")
+ .append(URLEncoder.encode(id, "UTF-8"));
+ sf.addLink(link.toString());
+ } catch (UnsupportedEncodingException e)
+ {
+ // as if
+ }
+ }
+ String clinSig = (String) var.variant
+ .getValue(CLINICAL_SIGNIFICANCE);
+ if (clinSig != null)
+ {
+ sf.setValue(CLINICAL_SIGNIFICANCE, clinSig);
+ attributes.append(";").append(CLINICAL_SIGNIFICANCE).append("=")
+ .append(clinSig);
+ }
+ peptide.addSequenceFeature(sf);
+ if (attributes.length() > 0)
+ {
+ sf.setAttributes(attributes.toString());
+ }
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Builds a map whose key is position in the protein sequence, and value is a
+ * list of the base and all variants for each corresponding codon position
*
* @param dnaSeq
* @param dnaToProtein
* @return
*/
- static LinkedHashMap<Integer, String[][]> buildDnaVariantsMap(
+ static LinkedHashMap<Integer, List<DnaVariant>[]> buildDnaVariantsMap(
SequenceI dnaSeq, MapList dnaToProtein)
{
/*
- * map from peptide position to all variant features of the codon for it
- * LinkedHashMap ensures we add the peptide features in sequence order
+ * map from peptide position to all variants of the codon which codes for it
+ * LinkedHashMap ensures we keep the peptide features in sequence order
*/
- LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
+ LinkedHashMap<Integer, List<DnaVariant>[]> variants = new LinkedHashMap<Integer, List<DnaVariant>[]>();
SequenceOntologyI so = SequenceOntologyFactory.getInstance();
SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
continue;
}
int peptidePosition = mapsTo[0];
- String[][] codonVariants = variants.get(peptidePosition);
+ List<DnaVariant>[] codonVariants = variants.get(peptidePosition);
if (codonVariants == null)
{
- codonVariants = new String[3][];
+ codonVariants = new ArrayList[3];
+ codonVariants[0] = new ArrayList<DnaVariant>();
+ codonVariants[1] = new ArrayList<DnaVariant>();
+ codonVariants[2] = new ArrayList<DnaVariant>();
variants.put(peptidePosition, codonVariants);
}
lastCodon = codon;
/*
- * save nucleotide (and this variant) for each codon position
+ * save nucleotide (and any variant) for each codon position
*/
for (int codonPos = 0; codonPos < 3; codonPos++)
{
String nucleotide = String.valueOf(
dnaSeq.getCharAt(codon[codonPos] - dnaStart))
.toUpperCase();
- if (codonVariants[codonPos] == null)
+ List<DnaVariant> codonVariant = codonVariants[codonPos];
+ if (codon[codonPos] == dnaCol)
{
- /*
- * record current dna base
- */
- codonVariants[codonPos] = new String[] { nucleotide };
+ if (!codonVariant.isEmpty()
+ && codonVariant.get(0).variant == null)
+ {
+ /*
+ * already recorded base value, add this variant
+ */
+ codonVariant.get(0).variant = sf;
+ }
+ else
+ {
+ /*
+ * add variant with base value
+ */
+ codonVariant.add(new DnaVariant(nucleotide, sf));
+ }
}
- if (codon[codonPos] == dnaCol)
+ else if (codonVariant.isEmpty())
{
/*
- * add alleles to dna base (and any previously found alleles)
+ * record (possibly non-varying) base value
*/
- String[] known = codonVariants[codonPos];
- String[] dnaVariants = new String[alleles.length + known.length];
- System.arraycopy(known, 0, dnaVariants, 0, known.length);
- System.arraycopy(alleles, 0, dnaVariants, known.length,
- alleles.length);
- codonVariants[codonPos] = dnaVariants;
+ codonVariant.add(new DnaVariant(nucleotide));
}
}
}
}
/**
- * Returns a sorted, non-redundant list of all peptide translations generated
- * by the given dna variants, excluding the current residue value
- *
- * @param codonVariants
- * an array of base values (acgtACGT) for codon positions 1, 2, 3
- * @param residue
- * the current residue translation
- * @return
- */
- static List<String> computePeptideVariants(String[][] codonVariants,
- String residue)
- {
- List<String> result = new ArrayList<String>();
- for (String base1 : codonVariants[0])
- {
- for (String base2 : codonVariants[1])
- {
- for (String base3 : codonVariants[2])
- {
- String codon = base1 + base2 + base3;
- /*
- * get peptide translation of codon e.g. GAT -> D
- * note that variants which are not single alleles,
- * e.g. multibase variants or HGMD_MUTATION etc
- * are ignored here
- */
- String peptide = codon.contains("-") ? "-"
- : (codon.length() > 3 ? null : ResidueProperties
- .codonTranslate(codon));
- if (peptide != null && !result.contains(peptide)
- && !peptide.equalsIgnoreCase(residue))
- {
- result.add(peptide);
- }
- }
- }
- }
-
- /*
- * sort alphabetically with STOP at the end
- */
- Collections.sort(result, new Comparator<String>()
- {
-
- @Override
- public int compare(String o1, String o2)
- {
- if ("STOP".equals(o1))
- {
- return 1;
- }
- else if ("STOP".equals(o2))
- {
- return -1;
- }
- else
- {
- return o1.compareTo(o2);
- }
- }
- });
- return result;
- }
-
- /**
* Makes an alignment with a copy of the given sequences, adding in any
* non-redundant sequences which are mapped to by the cross-referenced
* sequences.
*
* @param seqs
* @param xrefs
+ * @param dataset
+ * alignment dataset shared by the new copy
* @return
*/
public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
- SequenceI[] xrefs)
+ SequenceI[] xrefs, AlignmentI dataset)
{
AlignmentI copy = new Alignment(new Alignment(seqs));
+ copy.setDataset(dataset);
SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
if (xrefs != null)
{
/*
* Map will hold, for each aligned column position, a map of
- * {unalignedSequence, sequenceCharacter} at that position.
+ * {unalignedSequence, characterPerSequence} at that position.
* TreeMap keeps the entries in ascending column order.
*/
Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
/*
- * report any sequences that have no mapping so can't be realigned
+ * record any sequences that have no mapping so can't be realigned
*/
unmapped.addAll(unaligned.getSequences());
SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
if (fromSeq != null)
{
- Mapping seqMap = mapping.getMappingForSequence(seq);
+ Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
if (addMappedPositions(seq, fromSeq, seqMap, map))
{
unmapped.remove(seq);
static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
{
+ if (seqMap == null)
+ {
+ return false;
+ }
+
+ /*
+ * invert mapping if it is from unaligned to aligned sequence
+ */
+ if (seqMap.getTo() == fromSeq.getDatasetSequence())
+ {
+ seqMap = new Mapping(seq.getDatasetSequence(), seqMap.getMap()
+ .getInverse());
+ }
+
char[] fromChars = fromSeq.getSequence();
int toStart = seq.getStart();
char[] toChars = seq.getSequence();
* of the next character of the mapped-to sequence; stop when all
* the characters of the range have been counted
*/
- while (mappedCharPos <= range[1])
+ while (mappedCharPos <= range[1] && fromCol <= fromChars.length
+ && fromCol >= 0)
{
if (!Comparison.isGap(fromChars[fromCol - 1]))
{
}
return true;
}
+
+ // strictly temporary hack until proper criteria for aligning protein to cds
+ // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
+ public static boolean looksLikeEnsembl(AlignmentI alignment)
+ {
+ for (SequenceI seq : alignment.getSequences())
+ {
+ String name = seq.getName();
+ if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
}