X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=949c47a69d9f8c559feb9f344df8a9033f36f654;hb=9c39e96af6b84257604da448101505361dced686;hp=14e39073c7e3ec25a52f59e4b3177779ee6ab3bb;hpb=10bdbbde9cd0307e3d4c2b39c47062b59b305155;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 14e3907..949c47a 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -20,6 +20,8 @@ */ package jalview.analysis; +import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE; + import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; @@ -40,6 +42,8 @@ import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.StringUtils; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -66,6 +70,31 @@ import java.util.TreeMap; public class AlignmentUtils { + private static final String SEQUENCE_VARIANT = "sequence_variant:"; + private static final String ID = "ID"; + + /** + * A data model to hold the 'normal' base value at a position, and an optional + * sequence variant feature + */ + static class DnaVariant + { + String base; + + SequenceFeature variant; + + DnaVariant(String nuc) + { + base = nuc; + } + + DnaVariant(String nuc, SequenceFeature var) + { + base = nuc; + variant = var; + } + } + /** * given an existing alignment, create a new alignment including all, or up to * flankSize additional symbols from each sequence's dataset sequence @@ -1382,6 +1411,12 @@ public class AlignmentUtils { List cdsSeqs = new ArrayList(); + /* + * construct CDS sequences from the (cds-to-protein) mappings made earlier; + * this makes it possible to model multiple products from dna (e.g. EMBL); + * however it does mean we don't have the EMBL protein_id (a property on + * the CDS features) in order to make the CDS sequence name :-( + */ for (SequenceI seq : dna) { AlignedCodonFrame cdsMappings = new AlignedCodonFrame(); @@ -1744,66 +1779,230 @@ public class AlignmentUtils * /ENSP00000288602?feature=transcript_variation;content-type=text/xml * which would be a bit slower but possibly more reliable */ - LinkedHashMap variants = buildDnaVariantsMap( + + /* + * build a map with codon variations for each potentially varying peptide + */ + LinkedHashMap[]> variants = buildDnaVariantsMap( dnaSeq, dnaToProtein); /* * scan codon variations, compute peptide variants and add to peptide sequence */ int count = 0; - for (Entry variant : variants.entrySet()) + for (Entry[]> variant : variants.entrySet()) { int peptidePos = variant.getKey(); - String[][] codonVariants = variant.getValue(); - String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based - List peptideVariants = computePeptideVariants(codonVariants, - residue); - if (!peptideVariants.isEmpty()) + List[] codonVariants = variant.getValue(); + count += computePeptideVariants(peptide, peptidePos, codonVariants); + } + + /* + * sort to get sequence features in start position order + * - would be better to store in Sequence as a TreeSet or NCList? + */ + if (peptide.getSequenceFeatures() != null) + { + Arrays.sort(peptide.getSequenceFeatures(), + new Comparator() + { + @Override + public int compare(SequenceFeature o1, SequenceFeature o2) + { + int c = Integer.compare(o1.getBegin(), o2.getBegin()); + return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) + : c; + } + }); + } + return count; + } + + /** + * Computes non-synonymous peptide variants from codon variants and adds them + * as sequence_variant features on the protein sequence (one feature per + * allele variant). Selected attributes (variant id, clinical significance) + * are copied over to the new features. + * + * @param peptide + * the protein sequence + * @param peptidePos + * the position to compute peptide variants for + * @param codonVariants + * a list of dna variants per codon position + * @return the number of features added + */ + static int computePeptideVariants(SequenceI peptide, int peptidePos, + List[] codonVariants) + { + String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); + int count = 0; + String base1 = codonVariants[0].get(0).base; + String base2 = codonVariants[1].get(0).base; + String base3 = codonVariants[2].get(0).base; + + /* + * variants in first codon base + */ + for (DnaVariant var : codonVariants[0]) + { + if (var.variant != null) { - String desc = residue + "," // include canonical residue in description - + StringUtils.listToDelimitedString(peptideVariants, ", "); - SequenceFeature sf = new SequenceFeature( - SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos, - peptidePos, 0f, null); - peptide.addSequenceFeature(sf); - count++; + String alleles = (String) var.variant.getValue("alleles"); + if (alleles != null) + { + for (String base : alleles.split(",")) + { + String codon = base + base2 + base3; + if (addPeptideVariant(peptide, peptidePos, residue, var, codon)) + { + count++; + } + } + } } } /* - * ugly sort to get sequence features in start position order - * - would be better to store in Sequence as a TreeSet instead? + * variants in second codon base */ - Arrays.sort(peptide.getSequenceFeatures(), - new Comparator() + for (DnaVariant var : codonVariants[1]) + { + if (var.variant != null) + { + String alleles = (String) var.variant.getValue("alleles"); + if (alleles != null) + { + for (String base : alleles.split(",")) + { + String codon = base1 + base + base3; + if (addPeptideVariant(peptide, peptidePos, residue, var, codon)) { - @Override - public int compare(SequenceFeature o1, SequenceFeature o2) - { - int c = Integer.compare(o1.getBegin(), o2.getBegin()); - return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) - : c; - } - }); + count++; + } + } + } + } + } + + /* + * variants in third codon base + */ + for (DnaVariant var : codonVariants[2]) + { + if (var.variant != null) + { + String alleles = (String) var.variant.getValue("alleles"); + if (alleles != null) + { + for (String base : alleles.split(",")) + { + String codon = base1 + base2 + base; + if (addPeptideVariant(peptide, peptidePos, residue, var, codon)) + { + count++; + } + } + } + } + } + return count; } /** - * Builds a map whose key is position in the protein sequence, and value is an - * array of all variants for the coding codon positions + * Helper method that adds a peptide variant feature, provided the given codon + * translates to a value different to the current residue (is a non-synonymous + * variant). ID and clinical_significance attributes of the dna variant (if + * present) are copied to the new feature. + * + * @param peptide + * @param peptidePos + * @param residue + * @param var + * @param codon + * @return true if a feature was added, else false + */ + static boolean addPeptideVariant(SequenceI peptide, int peptidePos, + String residue, DnaVariant var, String codon) + { + /* + * get peptide translation of codon e.g. GAT -> D + * note that variants which are not single alleles, + * e.g. multibase variants or HGMD_MUTATION etc + * are currently ignored here + */ + String trans = codon.contains("-") ? "-" + : (codon.length() > 3 ? null : ResidueProperties + .codonTranslate(codon)); + if (trans != null && !trans.equals(residue)) + { + String residue3Char = StringUtils + .toSentenceCase(ResidueProperties.aa2Triplet.get(residue)); + String trans3Char = StringUtils + .toSentenceCase(ResidueProperties.aa2Triplet.get(trans)); + String desc = "p." + residue3Char + peptidePos + trans3Char; + // set score to 0f so 'graduated colour' option is offered! JAL-2060 + SequenceFeature sf = new SequenceFeature( + SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos, + peptidePos, 0f, "Jalview"); + StringBuilder attributes = new StringBuilder(32); + String id = (String) var.variant.getValue(ID); + if (id != null) + { + if (id.startsWith(SEQUENCE_VARIANT)) + { + id = id.substring(SEQUENCE_VARIANT.length()); + } + sf.setValue(ID, id); + attributes.append(ID).append("=").append(id); + // TODO handle other species variants + StringBuilder link = new StringBuilder(32); + try + { + link.append(desc).append(" ").append(id) + .append("|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=") + .append(URLEncoder.encode(id, "UTF-8")); + sf.addLink(link.toString()); + } catch (UnsupportedEncodingException e) + { + // as if + } + } + String clinSig = (String) var.variant + .getValue(CLINICAL_SIGNIFICANCE); + if (clinSig != null) + { + sf.setValue(CLINICAL_SIGNIFICANCE, clinSig); + attributes.append(";").append(CLINICAL_SIGNIFICANCE).append("=") + .append(clinSig); + } + peptide.addSequenceFeature(sf); + if (attributes.length() > 0) + { + sf.setAttributes(attributes.toString()); + } + return true; + } + return false; + } + + /** + * Builds a map whose key is position in the protein sequence, and value is a + * list of the base and all variants for each corresponding codon position * * @param dnaSeq * @param dnaToProtein * @return */ - static LinkedHashMap buildDnaVariantsMap( + static LinkedHashMap[]> buildDnaVariantsMap( SequenceI dnaSeq, MapList dnaToProtein) { /* - * map from peptide position to all variant features of the codon for it - * LinkedHashMap ensures we add the peptide features in sequence order + * map from peptide position to all variants of the codon which codes for it + * LinkedHashMap ensures we keep the peptide features in sequence order */ - LinkedHashMap variants = new LinkedHashMap(); + LinkedHashMap[]> variants = new LinkedHashMap[]>(); SequenceOntologyI so = SequenceOntologyFactory.getInstance(); SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); @@ -1836,10 +2035,13 @@ public class AlignmentUtils continue; } int peptidePosition = mapsTo[0]; - String[][] codonVariants = variants.get(peptidePosition); + List[] codonVariants = variants.get(peptidePosition); if (codonVariants == null) { - codonVariants = new String[3][]; + codonVariants = new ArrayList[3]; + codonVariants[0] = new ArrayList(); + codonVariants[1] = new ArrayList(); + codonVariants[2] = new ArrayList(); variants.put(peptidePosition, codonVariants); } @@ -1868,31 +2070,38 @@ public class AlignmentUtils lastCodon = codon; /* - * save nucleotide (and this variant) for each codon position + * save nucleotide (and any variant) for each codon position */ for (int codonPos = 0; codonPos < 3; codonPos++) { String nucleotide = String.valueOf( dnaSeq.getCharAt(codon[codonPos] - dnaStart)) .toUpperCase(); - if (codonVariants[codonPos] == null) + List codonVariant = codonVariants[codonPos]; + if (codon[codonPos] == dnaCol) { - /* - * record current dna base - */ - codonVariants[codonPos] = new String[] { nucleotide }; + if (!codonVariant.isEmpty() + && codonVariant.get(0).variant == null) + { + /* + * already recorded base value, add this variant + */ + codonVariant.get(0).variant = sf; + } + else + { + /* + * add variant with base value + */ + codonVariant.add(new DnaVariant(nucleotide, sf)); + } } - if (codon[codonPos] == dnaCol) + else if (codonVariant.isEmpty()) { /* - * add alleles to dna base (and any previously found alleles) + * record (possibly non-varying) base value */ - String[] known = codonVariants[codonPos]; - String[] dnaVariants = new String[alleles.length + known.length]; - System.arraycopy(known, 0, dnaVariants, 0, known.length); - System.arraycopy(alleles, 0, dnaVariants, known.length, - alleles.length); - codonVariants[codonPos] = dnaVariants; + codonVariant.add(new DnaVariant(nucleotide)); } } } @@ -1901,71 +2110,6 @@ public class AlignmentUtils } /** - * Returns a sorted, non-redundant list of all peptide translations generated - * by the given dna variants, excluding the current residue value - * - * @param codonVariants - * an array of base values (acgtACGT) for codon positions 1, 2, 3 - * @param residue - * the current residue translation - * @return - */ - static List computePeptideVariants(String[][] codonVariants, - String residue) - { - List result = new ArrayList(); - for (String base1 : codonVariants[0]) - { - for (String base2 : codonVariants[1]) - { - for (String base3 : codonVariants[2]) - { - String codon = base1 + base2 + base3; - /* - * get peptide translation of codon e.g. GAT -> D - * note that variants which are not single alleles, - * e.g. multibase variants or HGMD_MUTATION etc - * are ignored here - */ - String peptide = codon.contains("-") ? "-" - : (codon.length() > 3 ? null : ResidueProperties - .codonTranslate(codon)); - if (peptide != null && !result.contains(peptide) - && !peptide.equalsIgnoreCase(residue)) - { - result.add(peptide); - } - } - } - } - - /* - * sort alphabetically with STOP at the end - */ - Collections.sort(result, new Comparator() - { - - @Override - public int compare(String o1, String o2) - { - if ("STOP".equals(o1)) - { - return 1; - } - else if ("STOP".equals(o2)) - { - return -1; - } - else - { - return o1.compareTo(o2); - } - } - }); - return result; - } - - /** * Makes an alignment with a copy of the given sequences, adding in any * non-redundant sequences which are mapped to by the cross-referenced * sequences. @@ -1979,6 +2123,26 @@ public class AlignmentUtils { AlignmentI copy = new Alignment(new Alignment(seqs)); + /* + * add mappings between sequences to the new alignment + */ + AlignedCodonFrame mappings = new AlignedCodonFrame(); + copy.addCodonFrame(mappings); + for (int i = 0; i < copy.getHeight(); i++) + { + SequenceI from = seqs[i]; + SequenceI to = copy.getSequenceAt(i); + if (to.getDatasetSequence() != null) + { + to = to.getDatasetSequence(); + } + int start = from.getStart(); + int end = from.getEnd(); + MapList map = new MapList(new int[] { start, end }, new int[] { + start, end }, 1, 1); + mappings.addMap(to, from, map); + } + SequenceIdMatcher matcher = new SequenceIdMatcher(seqs); if (xrefs != null) {