X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=081f446f0dcbd13ca7676ec949be1a16d27ff918;hb=6b8f46dd2ffc7f73948f0523c9b45336d2bd8128;hp=eb1ee4bb04bbd0c9eef128d0cc67c57b98146ab2;hpb=6ed535f7ef953468f8827255ec6ebcd5a6e54d8d;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index eb1ee4b..081f446 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -20,6 +20,8 @@ */ package jalview.analysis; +import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE; + import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; @@ -40,6 +42,8 @@ import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.StringUtils; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -66,6 +70,31 @@ import java.util.TreeMap; public class AlignmentUtils { + private static final String SEQUENCE_VARIANT = "sequence_variant:"; + private static final String ID = "ID"; + + /** + * A data model to hold the 'normal' base value at a position, and an optional + * sequence variant feature + */ + static class DnaVariant + { + String base; + + SequenceFeature variant; + + DnaVariant(String nuc) + { + base = nuc; + } + + DnaVariant(String nuc, SequenceFeature var) + { + base = nuc; + variant = var; + } + } + /** * given an existing alignment, create a new alignment including all, or up to * flankSize additional symbols from each sequence's dataset sequence @@ -1366,12 +1395,13 @@ public class AlignmentUtils * Constructs an alignment consisting of the mapped (CDS) regions in the given * nucleotide sequences, and updates mappings to match. The CDS sequences are * added to the original alignment's dataset, which is shared by the new - * alignment. + * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are + * added to the alignment dataset. * * @param dna * aligned dna sequences * @param mappings - * from dna to protein; these are replaced with new mappings + * from dna to protein * @param al * @return an alignment whose sequences are the cds-only parts of the dna * sequences (or null if no mappings are found) @@ -1743,35 +1773,27 @@ public class AlignmentUtils * /ENSP00000288602?feature=transcript_variation;content-type=text/xml * which would be a bit slower but possibly more reliable */ - LinkedHashMap variants = buildDnaVariantsMap( + + /* + * build a map with codon variations for each potentially varying peptide + */ + LinkedHashMap[]> variants = buildDnaVariantsMap( dnaSeq, dnaToProtein); /* * scan codon variations, compute peptide variants and add to peptide sequence */ int count = 0; - for (Entry variant : variants.entrySet()) + for (Entry[]> variant : variants.entrySet()) { int peptidePos = variant.getKey(); - String[][] codonVariants = variant.getValue(); - String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based - List peptideVariants = computePeptideVariants(codonVariants, - residue); - if (!peptideVariants.isEmpty()) - { - String desc = residue + "," // include canonical residue in description - + StringUtils.listToDelimitedString(peptideVariants, ", "); - SequenceFeature sf = new SequenceFeature( - SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos, - peptidePos, 0f, null); - peptide.addSequenceFeature(sf); - count++; - } + List[] codonVariants = variant.getValue(); + count += computePeptideVariants(peptide, peptidePos, codonVariants); } /* - * ugly sort to get sequence features in start position order - * - would be better to store in Sequence as a TreeSet instead? + * sort to get sequence features in start position order + * - would be better to store in Sequence as a TreeSet or NCList? */ Arrays.sort(peptide.getSequenceFeatures(), new Comparator() @@ -1788,21 +1810,190 @@ public class AlignmentUtils } /** - * Builds a map whose key is position in the protein sequence, and value is an - * array of all variants for the coding codon positions + * Computes non-synonymous peptide variants from codon variants and adds them + * as sequence_variant features on the protein sequence (one feature per + * allele variant). Selected attributes (variant id, clinical significance) + * are copied over to the new features. + * + * @param peptide + * the protein sequence + * @param peptidePos + * the position to compute peptide variants for + * @param codonVariants + * a list of dna variants per codon position + * @return the number of features added + */ + static int computePeptideVariants(SequenceI peptide, int peptidePos, + List[] codonVariants) + { + String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); + int count = 0; + String base1 = codonVariants[0].get(0).base; + String base2 = codonVariants[1].get(0).base; + String base3 = codonVariants[2].get(0).base; + + /* + * variants in first codon base + */ + for (DnaVariant var : codonVariants[0]) + { + if (var.variant != null) + { + String alleles = (String) var.variant.getValue("alleles"); + if (alleles != null) + { + for (String base : alleles.split(",")) + { + String codon = base + base2 + base3; + if (addPeptideVariant(peptide, peptidePos, residue, var, codon)) + { + count++; + } + } + } + } + } + + /* + * variants in second codon base + */ + for (DnaVariant var : codonVariants[1]) + { + if (var.variant != null) + { + String alleles = (String) var.variant.getValue("alleles"); + if (alleles != null) + { + for (String base : alleles.split(",")) + { + String codon = base1 + base + base3; + if (addPeptideVariant(peptide, peptidePos, residue, var, codon)) + { + count++; + } + } + } + } + } + + /* + * variants in third codon base + */ + for (DnaVariant var : codonVariants[2]) + { + if (var.variant != null) + { + String alleles = (String) var.variant.getValue("alleles"); + if (alleles != null) + { + for (String base : alleles.split(",")) + { + String codon = base1 + base2 + base; + if (addPeptideVariant(peptide, peptidePos, residue, var, codon)) + { + count++; + } + } + } + } + } + + return count; + } + + /** + * Helper method that adds a peptide variant feature, provided the given codon + * translates to a value different to the current residue (is a non-synonymous + * variant). ID and clinical_significance attributes of the dna variant (if + * present) are copied to the new feature. + * + * @param peptide + * @param peptidePos + * @param residue + * @param var + * @param codon + * @return true if a feature was added, else false + */ + static boolean addPeptideVariant(SequenceI peptide, int peptidePos, + String residue, DnaVariant var, String codon) + { + /* + * get peptide translation of codon e.g. GAT -> D + * note that variants which are not single alleles, + * e.g. multibase variants or HGMD_MUTATION etc + * are currently ignored here + */ + String trans = codon.contains("-") ? "-" + : (codon.length() > 3 ? null : ResidueProperties + .codonTranslate(codon)); + if (trans != null && !trans.equals(residue)) + { + String residue3Char = StringUtils + .toSentenceCase(ResidueProperties.aa2Triplet.get(residue)); + String trans3Char = StringUtils + .toSentenceCase(ResidueProperties.aa2Triplet.get(trans)); + String desc = "p." + residue3Char + peptidePos + trans3Char; + // set score to 0f so 'graduated colour' option is offered! JAL-2060 + SequenceFeature sf = new SequenceFeature( + SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos, + peptidePos, 0f, "Jalview"); + StringBuilder attributes = new StringBuilder(32); + String id = (String) var.variant.getValue(ID); + if (id != null) + { + if (id.startsWith(SEQUENCE_VARIANT)) + { + id = id.substring(SEQUENCE_VARIANT.length()); + } + sf.setValue(ID, id); + attributes.append(ID).append("=").append(id); + // TODO handle other species variants + StringBuilder link = new StringBuilder(32); + try + { + link.append(desc).append(" ").append(id) + .append("|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=") + .append(URLEncoder.encode(id, "UTF-8")); + sf.addLink(link.toString()); + } catch (UnsupportedEncodingException e) + { + // as if + } + } + String clinSig = (String) var.variant + .getValue(CLINICAL_SIGNIFICANCE); + if (clinSig != null) + { + sf.setValue(CLINICAL_SIGNIFICANCE, clinSig); + attributes.append(";").append(CLINICAL_SIGNIFICANCE).append("=") + .append(clinSig); + } + peptide.addSequenceFeature(sf); + if (attributes.length() > 0) + { + sf.setAttributes(attributes.toString()); + } + return true; + } + return false; + } + + /** + * Builds a map whose key is position in the protein sequence, and value is a + * list of the base and all variants for each corresponding codon position * * @param dnaSeq * @param dnaToProtein * @return */ - static LinkedHashMap buildDnaVariantsMap( + static LinkedHashMap[]> buildDnaVariantsMap( SequenceI dnaSeq, MapList dnaToProtein) { /* - * map from peptide position to all variant features of the codon for it - * LinkedHashMap ensures we add the peptide features in sequence order + * map from peptide position to all variants of the codon which codes for it + * LinkedHashMap ensures we keep the peptide features in sequence order */ - LinkedHashMap variants = new LinkedHashMap(); + LinkedHashMap[]> variants = new LinkedHashMap[]>(); SequenceOntologyI so = SequenceOntologyFactory.getInstance(); SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); @@ -1835,10 +2026,13 @@ public class AlignmentUtils continue; } int peptidePosition = mapsTo[0]; - String[][] codonVariants = variants.get(peptidePosition); + List[] codonVariants = variants.get(peptidePosition); if (codonVariants == null) { - codonVariants = new String[3][]; + codonVariants = new ArrayList[3]; + codonVariants[0] = new ArrayList(); + codonVariants[1] = new ArrayList(); + codonVariants[2] = new ArrayList(); variants.put(peptidePosition, codonVariants); } @@ -1867,31 +2061,38 @@ public class AlignmentUtils lastCodon = codon; /* - * save nucleotide (and this variant) for each codon position + * save nucleotide (and any variant) for each codon position */ for (int codonPos = 0; codonPos < 3; codonPos++) { String nucleotide = String.valueOf( dnaSeq.getCharAt(codon[codonPos] - dnaStart)) .toUpperCase(); - if (codonVariants[codonPos] == null) + List codonVariant = codonVariants[codonPos]; + if (codon[codonPos] == dnaCol) { - /* - * record current dna base - */ - codonVariants[codonPos] = new String[] { nucleotide }; + if (!codonVariant.isEmpty() + && codonVariant.get(0).variant == null) + { + /* + * already recorded base value, add this variant + */ + codonVariant.get(0).variant = sf; + } + else + { + /* + * add variant with base value + */ + codonVariant.add(new DnaVariant(nucleotide, sf)); + } } - if (codon[codonPos] == dnaCol) + else if (codonVariant.isEmpty()) { /* - * add alleles to dna base (and any previously found alleles) + * record (possibly non-varying) base value */ - String[] known = codonVariants[codonPos]; - String[] dnaVariants = new String[alleles.length + known.length]; - System.arraycopy(known, 0, dnaVariants, 0, known.length); - System.arraycopy(alleles, 0, dnaVariants, known.length, - alleles.length); - codonVariants[codonPos] = dnaVariants; + codonVariant.add(new DnaVariant(nucleotide)); } } } @@ -1900,71 +2101,6 @@ public class AlignmentUtils } /** - * Returns a sorted, non-redundant list of all peptide translations generated - * by the given dna variants, excluding the current residue value - * - * @param codonVariants - * an array of base values (acgtACGT) for codon positions 1, 2, 3 - * @param residue - * the current residue translation - * @return - */ - static List computePeptideVariants(String[][] codonVariants, - String residue) - { - List result = new ArrayList(); - for (String base1 : codonVariants[0]) - { - for (String base2 : codonVariants[1]) - { - for (String base3 : codonVariants[2]) - { - String codon = base1 + base2 + base3; - /* - * get peptide translation of codon e.g. GAT -> D - * note that variants which are not single alleles, - * e.g. multibase variants or HGMD_MUTATION etc - * are ignored here - */ - String peptide = codon.contains("-") ? "-" - : (codon.length() > 3 ? null : ResidueProperties - .codonTranslate(codon)); - if (peptide != null && !result.contains(peptide) - && !peptide.equalsIgnoreCase(residue)) - { - result.add(peptide); - } - } - } - } - - /* - * sort alphabetically with STOP at the end - */ - Collections.sort(result, new Comparator() - { - - @Override - public int compare(String o1, String o2) - { - if ("STOP".equals(o1)) - { - return 1; - } - else if ("STOP".equals(o2)) - { - return -1; - } - else - { - return o1.compareTo(o2); - } - } - }); - return result; - } - - /** * Makes an alignment with a copy of the given sequences, adding in any * non-redundant sequences which are mapped to by the cross-referenced * sequences. @@ -1978,6 +2114,26 @@ public class AlignmentUtils { AlignmentI copy = new Alignment(new Alignment(seqs)); + /* + * add mappings between sequences to the new alignment + */ + AlignedCodonFrame mappings = new AlignedCodonFrame(); + copy.addCodonFrame(mappings); + for (int i = 0; i < copy.getHeight(); i++) + { + SequenceI from = seqs[i]; + SequenceI to = copy.getSequenceAt(i); + if (to.getDatasetSequence() != null) + { + to = to.getDatasetSequence(); + } + int start = from.getStart(); + int end = from.getEnd(); + MapList map = new MapList(new int[] { start, end }, new int[] { + start, end }, 1, 1); + mappings.addMap(to, from, map); + } + SequenceIdMatcher matcher = new SequenceIdMatcher(seqs); if (xrefs != null) { @@ -2093,7 +2249,7 @@ public class AlignmentUtils Map> map = new TreeMap>(); /* - * report any sequences that have no mapping so can't be realigned + * r any sequences that have no mapping so can't be realigned */ unmapped.addAll(unaligned.getSequences()); @@ -2106,7 +2262,7 @@ public class AlignmentUtils SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned); if (fromSeq != null) { - Mapping seqMap = mapping.getMappingForSequence(seq); + Mapping seqMap = mapping.getMappingBetween(fromSeq, seq); if (addMappedPositions(seq, fromSeq, seqMap, map)) { unmapped.remove(seq); @@ -2137,6 +2293,11 @@ public class AlignmentUtils static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq, Mapping seqMap, Map> map) { + if (seqMap == null) + { + return false; + } + char[] fromChars = fromSeq.getSequence(); int toStart = seq.getStart(); char[] toChars = seq.getSequence(); @@ -2193,4 +2354,19 @@ public class AlignmentUtils } return true; } + + // strictly temporary hack until proper criteria for aligning protein to cds + // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot + public static boolean looksLikeEnsembl(AlignmentI alignment) + { + for (SequenceI seq : alignment.getSequences()) + { + String name = seq.getName(); + if (!name.startsWith("ENSG") && !name.startsWith("ENST")) + { + return false; + } + } + return true; + } }