2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.datamodel;
23 import java.util.Locale;
25 import java.util.HashSet;
26 import java.util.List;
29 import jalview.io.gff.Gff3Helper;
30 import jalview.schemes.ResidueProperties;
31 import jalview.util.MapList;
32 import jalview.util.MappingUtils;
33 import jalview.util.StringUtils;
36 * A data bean to hold a list of mapped sequence features (e.g. CDS features
37 * mapped from protein), and the mapping between the sequences. It also provides
38 * a method to derive peptide variants from codon variants.
42 public class MappedFeatures
45 * VEP CSQ:HGVSp (if present) is a short-cut to the protein variant consequence
47 private static final String HGV_SP = "HGVSp";
49 private static final String CSQ = "CSQ";
52 * the sequence the mapped features are on
54 private final SequenceI featureSequence;
57 * the mapping between sequences;
58 * NB this could be in either sense (from or to featureSequence)
60 private final Mapping mapping;
63 * features on featureSequence that overlap the mapped positions
65 public final List<SequenceFeature> features;
68 * the residue position in the sequence mapped to
70 private final int toPosition;
73 * the residue at toPosition
75 private final char toResidue;
78 * if the mapping is 3:1 or 1:3 (peptide to CDS), this holds the
79 * mapped positions i.e. codon base positions in CDS; to
80 * support calculation of peptide variants from alleles
82 private final int[] codonPos;
84 private final char[] baseCodon;
90 * sequence mapping (which may be either to, or from, the sequence
91 * holding the linked features)
93 * the sequence hosting the virtual features
95 * the residue position in the sequence mapped to
97 * the residue character at position pos
99 * list of mapped features found in the 'featureSeq' sequence at the
102 public MappedFeatures(Mapping theMapping, SequenceI featureSeq, int pos,
103 char res, List<SequenceFeature> theFeatures)
105 mapping = theMapping;
106 featureSequence = featureSeq;
109 features = theFeatures;
112 * determine codon positions and canonical codon
113 * for a peptide-to-CDS mapping
115 int[] codonIntervals = mapping.getMap().locateInFrom(toPosition,
117 int[] codonPositions = codonIntervals == null ? null
118 : MappingUtils.flattenRanges(codonIntervals);
119 if (codonPositions != null && codonPositions.length == 3)
121 codonPos = codonPositions;
122 baseCodon = new char[3];
123 int cdsStart = featureSequence.getStart();
124 baseCodon[0] = Character.toUpperCase(
125 featureSequence.getCharAt(codonPos[0] - cdsStart));
126 baseCodon[1] = Character.toUpperCase(
127 featureSequence.getCharAt(codonPos[1] - cdsStart));
128 baseCodon[2] = Character.toUpperCase(
129 featureSequence.getCharAt(codonPos[2] - cdsStart));
139 * Computes and returns comma-delimited HGVS notation peptide variants derived
140 * from codon allele variants. If no variants are found, answers an empty
141 * string. The peptide variant is either simply read from the "CSQ:HGVSp"
142 * attribute if present, else computed based on the "alleles" attribute if
143 * present. If neither attribute is found, no variant (empty string) is
147 * a sequence feature (which must be one of those held in this
151 public String findProteinVariants(SequenceFeature sf)
153 if (!features.contains(sf) || baseCodon == null)
159 * VCF data may already contain the protein consequence
161 String hgvsp = sf.getValueAsString(CSQ, HGV_SP);
164 int colonPos = hgvsp.lastIndexOf(':');
167 String var = hgvsp.substring(colonPos + 1);
168 if (var.contains("p.")) // sanity check
176 * otherwise, compute codon and peptide variant
178 int cdsPos = sf.getBegin();
179 if (cdsPos != sf.getEnd())
181 // not handling multi-locus variant features
184 if (cdsPos != codonPos[0] && cdsPos != codonPos[1]
185 && cdsPos != codonPos[2])
187 // e.g. feature on intron within spliced codon!
191 String alls = (String) sf.getValue(Gff3Helper.ALLELES);
197 String from3 = StringUtils.toSentenceCase(
198 ResidueProperties.aa2Triplet.get(String.valueOf(toResidue)));
201 * make a peptide variant for each SNP allele
202 * e.g. C,G,T gives variants G and T for base C
204 Set<String> variantPeptides = new HashSet<>();
205 String[] alleles = alls.toUpperCase(Locale.ROOT).split(",");
206 StringBuilder vars = new StringBuilder();
208 for (String allele : alleles)
210 allele = allele.trim().toUpperCase(Locale.ROOT);
211 if (allele.length() > 1 || "-".equals(allele))
213 continue; // multi-locus variant
215 char[] variantCodon = new char[3];
216 variantCodon[0] = baseCodon[0];
217 variantCodon[1] = baseCodon[1];
218 variantCodon[2] = baseCodon[2];
221 * poke variant base into canonical codon;
222 * ignore first 'allele' (canonical base)
224 final int i = cdsPos == codonPos[0] ? 0
225 : (cdsPos == codonPos[1] ? 1 : 2);
226 variantCodon[i] = allele.toUpperCase(Locale.ROOT).charAt(0);
227 if (variantCodon[i] == baseCodon[i])
231 String codon = new String(variantCodon);
232 String peptide = ResidueProperties.codonTranslate(codon);
233 boolean synonymous = toResidue == peptide.charAt(0);
234 StringBuilder var = new StringBuilder();
238 * synonymous variant notation e.g. c.1062C>A(p.=)
240 var.append("c.").append(String.valueOf(cdsPos))
241 .append(String.valueOf(baseCodon[i])).append(">")
242 .append(String.valueOf(variantCodon[i])).append("(p.=)");
247 * missense variant notation e.g. p.Arg355Met
249 String to3 = ResidueProperties.STOP.equals(peptide) ? "Ter"
250 : StringUtils.toSentenceCase(
251 ResidueProperties.aa2Triplet.get(peptide));
252 var.append("p.").append(from3).append(String.valueOf(toPosition))
255 if (!variantPeptides.contains(peptide)) // duplicate consequence
257 variantPeptides.add(peptide);
258 if (vars.length() > 0)
266 return vars.toString();
270 * Answers the name of the linked sequence holding any mapped features
274 public String getLinkedSequenceName()
276 return featureSequence == null ? null : featureSequence.getName();
280 * Answers the mapped ranges (as one or more [start, end] positions) which
281 * correspond to the given [begin, end] range of the linked sequence.
284 * Example: MappedFeatures with CDS features mapped to peptide
285 * CDS/200-220 gtc aac TGa acGt att AAC tta
286 * mapped to PEP/6-7 WN by mapping [206, 207, 210, 210, 215, 217] to [6, 7]
287 * getMappedPositions(206, 206) should return [6, 6]
288 * getMappedPositions(200, 214) should return [6, 6]
289 * getMappedPositions(210, 215) should return [6, 7]
296 public int[] getMappedPositions(int begin, int end)
298 MapList map = mapping.getMap();
299 return mapping.to == featureSequence ? map.getOverlapsInFrom(begin, end)
300 : map.getOverlapsInTo(begin, end);
304 * Answers true if the linked features are on coding sequence, false if on
309 public boolean isFromCds()
311 if (mapping.getMap().getFromRatio() == 3)
313 return mapping.to != featureSequence;
315 return mapping.to == featureSequence;