1 package jalview.datamodel;
3 import java.util.HashSet;
7 import jalview.io.gff.Gff3Helper;
8 import jalview.schemes.ResidueProperties;
9 import jalview.util.MapList;
10 import jalview.util.MappingUtils;
11 import jalview.util.StringUtils;
14 * A data bean to hold a list of mapped sequence features (e.g. CDS features
15 * mapped from protein), and the mapping between the sequences. It also provides
16 * a method to derive peptide variants from codon variants.
20 public class MappedFeatures
23 * VEP CSQ:HGVSp (if present) is a short-cut to the protein variant consequence
25 private static final String HGV_SP = "HGVSp";
27 private static final String CSQ = "CSQ";
30 * the sequence the mapped features are on
32 private final SequenceI linkedSeq;
35 * the mapping between sequences;
36 * NB this could be in either sense
38 private final Mapping mapping;
41 * if true, mapping is from the linked sequence, else to the linked sequence
43 private boolean mappingIsFromLinkedSequence;
46 * features on linkedSeq that overlap the mapped positions
48 public final List<SequenceFeature> features;
51 * the residue position in the sequence mapped to
53 private final int toPosition;
56 * the residue at toPosition
58 private final char toResidue;
61 * if the mapping is 3:1 or 1:3 (peptide to CDS), this holds the
62 * mapped positions i.e. codon base positions in CDS; to
63 * support calculation of peptide variants from alleles
65 private final int[] codonPos;
67 private final char[] baseCodon;
74 * the sequence mapped from (e.g. CDS)
76 * the residue position in the sequence mapped to
78 * the residue character at position pos
80 * list of mapped features found in the 'from' sequence at
81 * the mapped position(s)
83 public MappedFeatures(Mapping theMapping, SequenceI from, int pos,
84 char res, List<SequenceFeature> theFeatures)
88 mappingIsFromLinkedSequence = mapping.to != linkedSeq;
91 features = theFeatures;
94 * determine codon positions and canonical codon
95 * for a peptide-to-CDS mapping
97 int[] codonIntervals = mapping.getMap().locateInFrom(toPosition, toPosition);
98 int[] codonPositions = codonIntervals == null ? null
99 : MappingUtils.flattenRanges(codonIntervals);
100 if (codonPositions != null && codonPositions.length == 3)
102 codonPos = codonPositions;
103 baseCodon = new char[3];
104 int cdsStart = linkedSeq.getStart();
105 baseCodon[0] = Character
106 .toUpperCase(linkedSeq.getCharAt(codonPos[0] - cdsStart));
107 baseCodon[1] = Character
108 .toUpperCase(linkedSeq.getCharAt(codonPos[1] - cdsStart));
109 baseCodon[2] = Character
110 .toUpperCase(linkedSeq.getCharAt(codonPos[2] - cdsStart));
120 * Computes and returns comma-delimited HGVS notation peptide variants derived
121 * from codon allele variants. If no variants are found, answers an empty
122 * string. The peptide variant is either simply read from the "CSQ:HGVSp"
123 * attribute if present, else computed based on the "alleles" attribute if
124 * present. If neither attribute is found, no variant (empty string) is
128 * a sequence feature (which must be one of those held in this
132 public String findProteinVariants(SequenceFeature sf)
134 if (!features.contains(sf) || baseCodon == null)
140 * VCF data may already contain the protein consequence
142 String hgvsp = sf.getValueAsString(CSQ, HGV_SP);
145 int colonPos = hgvsp.lastIndexOf(':');
148 String var = hgvsp.substring(colonPos + 1);
149 if (var.contains("p.")) // sanity check
157 * otherwise, compute codon and peptide variant
159 int cdsPos = sf.getBegin();
160 if (cdsPos != sf.getEnd())
162 // not handling multi-locus variant features
165 if (cdsPos != codonPos[0] && cdsPos != codonPos[1]
166 && cdsPos != codonPos[2])
168 // e.g. feature on intron within spliced codon!
172 String alls = (String) sf.getValue(Gff3Helper.ALLELES);
178 String from3 = StringUtils.toSentenceCase(
179 ResidueProperties.aa2Triplet.get(String.valueOf(toResidue)));
182 * make a peptide variant for each SNP allele
183 * e.g. C,G,T gives variants G and T for base C
185 Set<String> variantPeptides = new HashSet<>();
186 String[] alleles = alls.toUpperCase().split(",");
187 StringBuilder vars = new StringBuilder();
189 for (String allele : alleles)
191 allele = allele.trim().toUpperCase();
192 if (allele.length() > 1 || "-".equals(allele))
194 continue; // multi-locus variant
196 char[] variantCodon = new char[3];
197 variantCodon[0] = baseCodon[0];
198 variantCodon[1] = baseCodon[1];
199 variantCodon[2] = baseCodon[2];
202 * poke variant base into canonical codon;
203 * ignore first 'allele' (canonical base)
205 final int i = cdsPos == codonPos[0] ? 0
206 : (cdsPos == codonPos[1] ? 1 : 2);
207 variantCodon[i] = allele.toUpperCase().charAt(0);
208 if (variantCodon[i] == baseCodon[i])
212 String codon = new String(variantCodon);
213 String peptide = ResidueProperties.codonTranslate(codon);
214 boolean synonymous = toResidue == peptide.charAt(0);
215 StringBuilder var = new StringBuilder();
219 * synonymous variant notation e.g. c.1062C>A(p.=)
221 var.append("c.").append(String.valueOf(cdsPos))
222 .append(String.valueOf(baseCodon[i])).append(">")
223 .append(String.valueOf(variantCodon[i]))
229 * missense variant notation e.g. p.Arg355Met
231 String to3 = ResidueProperties.STOP.equals(peptide) ? "Ter"
232 : StringUtils.toSentenceCase(
233 ResidueProperties.aa2Triplet.get(peptide));
234 var.append("p.").append(from3).append(String.valueOf(toPosition))
237 if (!variantPeptides.contains(peptide)) // duplicate consequence
239 variantPeptides.add(peptide);
240 if (vars.length() > 0)
248 return vars.toString();
252 * Answers the name of the linked sequence holding any mapped features
256 public String getLinkedSequenceName()
258 return linkedSeq == null ? null : linkedSeq.getName();
262 * Answers the mapped ranges (as one or more [start, end] positions) which
263 * correspond to the given [begin, end] range of the linked sequence.
266 * Example: MappedFeatures with CDS features mapped to peptide
267 * CDS/200-220 gtc aac TGa acGt att AAC tta
268 * mapped to PEP/6-7 WN by mapping [206, 207, 210, 210, 215, 217] to [6, 7]
269 * getMappedPositions(206, 206) should return [6, 6]
270 * getMappedPositions(200, 214) should return [6, 6]
271 * getMappedPositions(210, 215) should return [6, 7]
278 public int[] getMappedPositions(int begin, int end)
280 MapList map = mapping.getMap();
281 return mappingIsFromLinkedSequence ? map.locateInTo(begin, end)
282 : map.locateInFrom(begin, end);
285 public boolean isFromCds()
287 if (mapping.getMap().getFromRatio() == 3)
289 return mappingIsFromLinkedSequence;
291 return !mappingIsFromLinkedSequence;