X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2FMappedFeatures.java;h=57c8c375b4d65c651d37379bd03c84935c67bc9d;hp=25f5ba4e29242ea87adb3989dfc02579a96dd166;hb=c6018dc0dc12720e13b75850a5303279ac7094b7;hpb=825ef108d5bfcf9b3e3eb9422b27658c80ab0854 diff --git a/src/jalview/datamodel/MappedFeatures.java b/src/jalview/datamodel/MappedFeatures.java index 25f5ba4..57c8c37 100644 --- a/src/jalview/datamodel/MappedFeatures.java +++ b/src/jalview/datamodel/MappedFeatures.java @@ -1,179 +1,315 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.datamodel; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + import jalview.io.gff.Gff3Helper; import jalview.schemes.ResidueProperties; +import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.StringUtils; -import java.util.ArrayList; -import java.util.List; - /** * A data bean to hold a list of mapped sequence features (e.g. CDS features - * mapped from protein), and the mapping between the sequences + * mapped from protein), and the mapping between the sequences. It also provides + * a method to derive peptide variants from codon variants. * * @author gmcarstairs */ public class MappedFeatures { /* - * the mapping from CDS to peptide + * VEP CSQ:HGVSp (if present) is a short-cut to the protein variant consequence */ - public final Mapping mapping; + private static final String HGV_SP = "HGVSp"; - /** - * the CDS sequence mapped to - */ - public final SequenceI fromSeq; + private static final String CSQ = "CSQ"; /* - * the residue position in the peptide sequence + * the sequence the mapped features are on */ - public final int fromPosition; + private final SequenceI featureSequence; /* - * the peptide residue at the position + * the mapping between sequences; + * NB this could be in either sense (from or to featureSequence) */ - public final char fromResidue; + private final Mapping mapping; /* - * features on CDS that overlap the codon positions + * features on featureSequence that overlap the mapped positions */ public final List features; + /* + * the residue position in the sequence mapped to + */ + private final int toPosition; + + /* + * the residue at toPosition + */ + private final char toResidue; + + /* + * if the mapping is 3:1 or 1:3 (peptide to CDS), this holds the + * mapped positions i.e. codon base positions in CDS; to + * support calculation of peptide variants from alleles + */ + private final int[] codonPos; + + private final char[] baseCodon; + /** * Constructor * * @param theMapping + * sequence mapping (which may be either to, or from, the sequence + * holding the linked features) + * @param featureSeq + * the sequence hosting the virtual features * @param pos + * the residue position in the sequence mapped to * @param res + * the residue character at position pos * @param theFeatures + * list of mapped features found in the 'featureSeq' sequence at the + * mapped position(s) */ - public MappedFeatures(Mapping theMapping, SequenceI from, int pos, - char res, - List theFeatures) + public MappedFeatures(Mapping theMapping, SequenceI featureSeq, int pos, + char res, List theFeatures) { mapping = theMapping; - fromSeq = from; - fromPosition = pos; - fromResidue = res; + featureSequence = featureSeq; + toPosition = pos; + toResidue = res; features = theFeatures; + + /* + * determine codon positions and canonical codon + * for a peptide-to-CDS mapping + */ + int[] codonIntervals = mapping.getMap().locateInFrom(toPosition, toPosition); + int[] codonPositions = codonIntervals == null ? null + : MappingUtils.flattenRanges(codonIntervals); + if (codonPositions != null && codonPositions.length == 3) + { + codonPos = codonPositions; + baseCodon = new char[3]; + int cdsStart = featureSequence.getStart(); + baseCodon[0] = Character + .toUpperCase(featureSequence.getCharAt(codonPos[0] - cdsStart)); + baseCodon[1] = Character + .toUpperCase(featureSequence.getCharAt(codonPos[1] - cdsStart)); + baseCodon[2] = Character + .toUpperCase(featureSequence.getCharAt(codonPos[2] - cdsStart)); + } + else + { + codonPos = null; + baseCodon = null; + } } /** - * Computes and returns a (possibly empty) list of HGVS notation peptide - * variants derived from codon allele variants + * Computes and returns comma-delimited HGVS notation peptide variants derived + * from codon allele variants. If no variants are found, answers an empty + * string. The peptide variant is either simply read from the "CSQ:HGVSp" + * attribute if present, else computed based on the "alleles" attribute if + * present. If neither attribute is found, no variant (empty string) is + * returned. * + * @param sf + * a sequence feature (which must be one of those held in this + * object) * @return */ - public List findProteinVariants() + public String findProteinVariants(SequenceFeature sf) { - List vars = new ArrayList<>(); - if (features.isEmpty()) + if (!features.contains(sf) || baseCodon == null) { - return vars; + return ""; + } + + /* + * VCF data may already contain the protein consequence + */ + String hgvsp = sf.getValueAsString(CSQ, HGV_SP); + if (hgvsp != null) + { + int colonPos = hgvsp.lastIndexOf(':'); + if (colonPos >= 0) + { + String var = hgvsp.substring(colonPos + 1); + if (var.contains("p.")) // sanity check + { + return var; + } + } } /* - * determine canonical codon + * otherwise, compute codon and peptide variant */ - int[] codonPos = MappingUtils.flattenRanges( - mapping.getMap().locateInFrom(fromPosition, fromPosition)); - if (codonPos.length != 3) + int cdsPos = sf.getBegin(); + if (cdsPos != sf.getEnd()) { - // error - return vars; + // not handling multi-locus variant features + return ""; + } + if (cdsPos != codonPos[0] && cdsPos != codonPos[1] + && cdsPos != codonPos[2]) + { + // e.g. feature on intron within spliced codon! + return ""; } - final char[] baseCodon = new char[3]; - int cdsStart = fromSeq.getStart(); - baseCodon[0] = fromSeq.getCharAt(codonPos[0] - cdsStart); - baseCodon[1] = fromSeq.getCharAt(codonPos[1] - cdsStart); - baseCodon[2] = fromSeq.getCharAt(codonPos[2] - cdsStart); - for (SequenceFeature sf : features) + String alls = (String) sf.getValue(Gff3Helper.ALLELES); + if (alls == null) { - /* - * VCF data may already contain the protein consequence - */ - String hgvsp = sf.getValueAsString("CSQ", "HGVSp"); - if (hgvsp != null) + return ""; + } + + String from3 = StringUtils.toSentenceCase( + ResidueProperties.aa2Triplet.get(String.valueOf(toResidue))); + + /* + * make a peptide variant for each SNP allele + * e.g. C,G,T gives variants G and T for base C + */ + Set variantPeptides = new HashSet<>(); + String[] alleles = alls.toUpperCase().split(","); + StringBuilder vars = new StringBuilder(); + + for (String allele : alleles) + { + allele = allele.trim().toUpperCase(); + if (allele.length() > 1 || "-".equals(allele)) { - int colonPos = hgvsp.indexOf(':'); - if (colonPos >= 0) - { - String var = hgvsp.substring(colonPos + 1); - if (!vars.contains(var)) - { - vars.add(var); - } - continue; - } + continue; // multi-locus variant } + char[] variantCodon = new char[3]; + variantCodon[0] = baseCodon[0]; + variantCodon[1] = baseCodon[1]; + variantCodon[2] = baseCodon[2]; /* - * otherwise, compute codon and peptide variant + * poke variant base into canonical codon; + * ignore first 'allele' (canonical base) */ - // todo avoid duplication of code in AlignmentUtils.buildDnaVariantsMap - int cdsPos = sf.getBegin(); - if (cdsPos != sf.getEnd()) + final int i = cdsPos == codonPos[0] ? 0 + : (cdsPos == codonPos[1] ? 1 : 2); + variantCodon[i] = allele.toUpperCase().charAt(0); + if (variantCodon[i] == baseCodon[i]) { - // not handling multi-locus variant features continue; } - if (cdsPos != codonPos[0] && cdsPos != codonPos[1] - && cdsPos != codonPos[2]) + String codon = new String(variantCodon); + String peptide = ResidueProperties.codonTranslate(codon); + boolean synonymous = toResidue == peptide.charAt(0); + StringBuilder var = new StringBuilder(); + if (synonymous) { - // e.g. feature on intron within spliced codon! - continue; - } - - String alls = (String) sf.getValue(Gff3Helper.ALLELES); - if (alls == null) - { - continue; + /* + * synonymous variant notation e.g. c.1062C>A(p.=) + */ + var.append("c.").append(String.valueOf(cdsPos)) + .append(String.valueOf(baseCodon[i])).append(">") + .append(String.valueOf(variantCodon[i])) + .append("(p.=)"); } - String from3 = StringUtils.toSentenceCase( - ResidueProperties.aa2Triplet - .get(String.valueOf(fromResidue))); - - /* - * make a peptide variant for each SNP allele - * e.g. C,G,T gives variants G and T for base C - */ - String[] alleles = alls.toUpperCase().split(","); - for (String allele : alleles) + else { - allele = allele.trim().toUpperCase(); - if (allele.length() > 1) - { - continue; // multi-locus variant - } - char[] variantCodon = new char[3]; - variantCodon[0] = baseCodon[0]; - variantCodon[1] = baseCodon[1]; - variantCodon[2] = baseCodon[2]; - /* - * poke variant base into canonical codon + * missense variant notation e.g. p.Arg355Met */ - int i = cdsPos == codonPos[0] ? 0 : (cdsPos == codonPos[1] ? 1 : 2); - variantCodon[i] = allele.toUpperCase().charAt(0); - String codon = new String(variantCodon); - String peptide = ResidueProperties.codonTranslate(codon); - if (fromResidue != peptide.charAt(0)) + String to3 = ResidueProperties.STOP.equals(peptide) ? "Ter" + : StringUtils.toSentenceCase( + ResidueProperties.aa2Triplet.get(peptide)); + var.append("p.").append(from3).append(String.valueOf(toPosition)) + .append(to3); + } + if (!variantPeptides.contains(peptide)) // duplicate consequence + { + variantPeptides.add(peptide); + if (vars.length() > 0) { - String to3 = ResidueProperties.STOP.equals(peptide) ? "STOP" - : StringUtils.toSentenceCase( - ResidueProperties.aa2Triplet.get(peptide)); - String var = "p." + from3 + fromPosition + to3; - if (!vars.contains(var)) - { - vars.add(var); - } + vars.append(","); } + vars.append(var); } } - return vars; + return vars.toString(); + } + + /** + * Answers the name of the linked sequence holding any mapped features + * + * @return + */ + public String getLinkedSequenceName() + { + return featureSequence == null ? null : featureSequence.getName(); + } + + /** + * Answers the mapped ranges (as one or more [start, end] positions) which + * correspond to the given [begin, end] range of the linked sequence. + * + *
+   * Example: MappedFeatures with CDS features mapped to peptide 
+   * CDS/200-220 gtc aac TGa acGt att AAC tta
+   * mapped to PEP/6-7 WN by mapping [206, 207, 210, 210, 215, 217] to [6, 7]
+   * getMappedPositions(206, 206) should return [6, 6]
+   * getMappedPositions(200, 214) should return [6, 6]
+   * getMappedPositions(210, 215) should return [6, 7]
+   * 
+ * + * @param begin + * @param end + * @return + */ + public int[] getMappedPositions(int begin, int end) + { + MapList map = mapping.getMap(); + return mapping.to == featureSequence ? map.getOverlapsInFrom(begin, end) + : map.getOverlapsInTo(begin, end); + } + + /** + * Answers true if the linked features are on coding sequence, false if on + * peptide + * + * @return + */ + public boolean isFromCds() + { + if (mapping.getMap().getFromRatio() == 3) + { + return mapping.to != featureSequence; + } + return mapping.to == featureSequence; } }