From 4f048da0d010c1f63ce2b52224fd7aa8831b2247 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 29 Sep 2017 16:11:11 +0100 Subject: [PATCH] JAL-2738 one sequence feature per (SNP) VCF allele --- src/jalview/io/vcf/VCFLoader.java | 261 ++++++++++++++++++++++++++++++++----- 1 file changed, 228 insertions(+), 33 deletions(-) diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index ddcecfe..0acf49e 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -5,6 +5,7 @@ import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineCount; import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; @@ -24,6 +25,7 @@ import jalview.util.MappingUtils; import jalview.util.MessageManager; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -37,6 +39,12 @@ import java.util.Map.Entry; */ public class VCFLoader { + private static final String ALLELE_FREQUENCY_KEY = "AF"; + + private static final String COMMA = ","; + + private static final boolean FEATURE_PER_ALLELE = true; + private static final String FEATURE_GROUP_VCF = "VCF"; private static final String EXCL = "!"; @@ -53,6 +61,11 @@ public class VCFLoader */ private Map> assemblyMappings; + /* + * holds details of the VCF header lines (metadata) + */ + private VCFHeader header; + /** * Constructor given an alignment context * @@ -97,10 +110,11 @@ public class VCFLoader /** * Loads VCF on to an alignment - provided it can be related to one or more - * sequence's chromosomal coordinates. + * sequence's chromosomal coordinates * * @param filePath * @param gui + * optional callback handler for messages */ protected void doLoad(String filePath, AlignViewControllerGuiI gui) { @@ -110,7 +124,7 @@ public class VCFLoader // long start = System.currentTimeMillis(); reader = new VCFReader(filePath); - VCFHeader header = reader.getFileHeader(); + header = reader.getFileHeader(); VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); // check if reference is wrt assembly19 (GRCh37) @@ -140,7 +154,10 @@ public class VCFLoader String msg = MessageManager.formatMessage("label.added_vcf", varCount, seqCount); gui.setStatus(msg); - gui.getFeatureSettingsUI().discoverAllFeatureData(); + if (gui.getFeatureSettingsUI() != null) + { + gui.getFeatureSettingsUI().discoverAllFeatureData(); + } } } catch (Throwable e) { @@ -216,10 +233,9 @@ public class VCFLoader /** * Tries to add overlapping variants read from a VCF file to the given - * sequence, and returns the number of overlapping variants found. Note that - * this requires the sequence to hold information as to its chromosomal - * positions and reference, in order to be able to map the VCF variants to the - * sequence. + * sequence, and returns the number of variant features added. Note that this + * requires the sequence to hold information as to its chromosomal positions + * and reference, in order to be able to map the VCF variants to the sequence. * * @param seq * @param reader @@ -268,12 +284,6 @@ public class VCFLoader String seqRef = seqCoords.getAssemblyId(); String species = seqCoords.getSpeciesId(); - // TODO handle species properly - if ("".equals(species)) - { - species = "human"; - } - /* * map chromosomal coordinates from GRCh38 (sequence) to * GRCh37 (VCF) if necessary @@ -284,7 +294,7 @@ public class VCFLoader if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37) { String toRef = "GRCh37"; - int[] newRange = mapReferenceRange(range, chromosome, species, + int[] newRange = mapReferenceRange(range, chromosome, "human", fromRef, toRef); if (newRange == null) { @@ -326,7 +336,6 @@ public class VCFLoader continue; } - count++; int start = variant.getStart() - offset; int end = variant.getEnd() - offset; @@ -337,8 +346,8 @@ public class VCFLoader int[] seqLocation = mapping.locateInFrom(start, end); if (seqLocation != null) { - addVariantFeatures(seq, variant, seqLocation[0], seqLocation[1], - forwardStrand); + count += addVariantFeature(seq, variant, seqLocation[0], + seqLocation[1], forwardStrand); } } @@ -349,7 +358,8 @@ public class VCFLoader /** * Inspects the VCF variant record, and adds variant features to the sequence. - * Only SNP variants are added, not INDELs. + * Only SNP variants are added, not INDELs. Returns the number of features + * added. *

* If the sequence maps to the reverse strand of the chromosome, reference and * variant bases are recorded as their complements (C/G, A/T). @@ -360,7 +370,7 @@ public class VCFLoader * @param featureEnd * @param forwardStrand */ - protected void addVariantFeatures(SequenceI seq, VariantContext variant, + protected int addVariantFeature(SequenceI seq, VariantContext variant, int featureStart, int featureEnd, boolean forwardStrand) { byte[] reference = variant.getReference().getBases(); @@ -369,7 +379,13 @@ public class VCFLoader /* * sorry, we don't handle INDEL variants */ - return; + return 0; + } + + if (FEATURE_PER_ALLELE) + { + return addAlleleFeatures(seq, variant, featureStart, featureEnd, + forwardStrand); } /* @@ -377,18 +393,7 @@ public class VCFLoader * this attribute is String for a simple SNP, but List if * multiple alleles at the locus; we extract for the simple case only */ - Object af = variant.getAttribute("AF"); - float score = 0f; - if (af instanceof String) - { - try - { - score = Float.parseFloat((String) af); - } catch (NumberFormatException e) - { - // leave as 0 - } - } + float score = getAlleleFrequency(variant, 0); StringBuilder sb = new StringBuilder(); sb.append(forwardStrand ? (char) reference[0] : complement(reference)); @@ -406,7 +411,7 @@ public class VCFLoader byte[] alleleBase = allele.getBases(); if (alleleBase.length == 1) { - sb.append(",").append( + sb.append(COMMA).append( forwardStrand ? (char) alleleBase[0] : complement(alleleBase)); } @@ -427,6 +432,196 @@ public class VCFLoader sf.setValue(att.getKey(), att.getValue()); } seq.addSequenceFeature(sf); + + return 1; + } + + /** + * A convenience method to get the AF value for the given alternate allele + * index + * + * @param variant + * @param alleleIndex + * @return + */ + protected float getAlleleFrequency(VariantContext variant, int alleleIndex) + { + float score = 0f; + String attributeValue = getAttributeValue(variant, + ALLELE_FREQUENCY_KEY, alleleIndex); + if (attributeValue != null) + { + try + { + score = Float.parseFloat(attributeValue); + } catch (NumberFormatException e) + { + // leave as 0 + } + } + + return score; + } + + /** + * A convenience method to get an attribute value for an alternate allele + * + * @param variant + * @param attributeName + * @param alleleIndex + * @return + */ + protected String getAttributeValue(VariantContext variant, + String attributeName, int alleleIndex) + { + Object att = variant.getAttribute(attributeName); + + if (att instanceof String) + { + return (String) att; + } + else if (att instanceof ArrayList) + { + return ((List) att).get(alleleIndex); + } + + return null; + } + + /** + * Adds one variant feature for each SNP allele in the VCF variant record, and + * returns the number of features added. + * + * @param seq + * @param variant + * @param featureStart + * @param featureEnd + * @param forwardStrand + * @return + */ + protected int addAlleleFeatures(SequenceI seq, VariantContext variant, + int featureStart, int featureEnd, boolean forwardStrand) + { + int added = 0; + + /* + * Javadoc says getAlternateAlleles() imposes no order on the list returned + * so we proceed defensively to get them in strict order + */ + int altAlleleCount = variant.getAlternateAlleles().size(); + for (int i = 0; i < altAlleleCount; i++) + { + added += addAlleleFeature(seq, variant, i, featureStart, featureEnd, + forwardStrand); + } + return added; + } + + /** + * Inspects one allele and attempts to add a variant feature for it to the + * sequence. Only SNP variants are added as features. We extract as much as + * possible of the additional data associated with this allele to store in the + * feature's key-value map. Answers the number of features added (0 or 1). + * + * @param seq + * @param variant + * @param altAlleleIndex + * @param featureStart + * @param featureEnd + * @param forwardStrand + * @return + */ + protected int addAlleleFeature(SequenceI seq, VariantContext variant, + int altAlleleIndex, int featureStart, int featureEnd, + boolean forwardStrand) + { + byte[] reference = variant.getReference().getBases(); + Allele alt = variant.getAlternateAllele(altAlleleIndex); + byte[] allele = alt.getBases(); + if (allele.length != 1) + { + /* + * not a SNP variant + */ + return 0; + } + + /* + * build the ref,alt allele description e.g. "G,A" + */ + StringBuilder sb = new StringBuilder(); + sb.append(forwardStrand ? (char) reference[0] : complement(reference)); + sb.append(COMMA); + sb.append(forwardStrand ? (char) allele[0] : complement(allele)); + String alleles = sb.toString(); // e.g. G,A + + String type = SequenceOntologyI.SEQUENCE_VARIANT; + float score = getAlleleFrequency(variant, altAlleleIndex); + + SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, + featureEnd, score, FEATURE_GROUP_VCF); + + sf.setValue(Gff3Helper.ALLELES, alleles); + + addAlleleProperties(variant, sf, altAlleleIndex); + + seq.addSequenceFeature(sf); + + return 1; + } + + /** + * Add any allele-specific VCF key-value data to the sequence feature + * + * @param variant + * @param sf + * @param altAlelleIndex + */ + protected void addAlleleProperties(VariantContext variant, + SequenceFeature sf, final int altAlelleIndex) + { + Map atts = variant.getAttributes(); + + /* + * process variant data, extracting values which are allele-specific + * these may be per alternate allele (INFO[key].Number = 'A') + * or per allele including reference (INFO[key].Number = 'R') + */ + for (Entry att : atts.entrySet()) + { + String key = att.getKey(); + VCFHeaderLineCount number = header.getInfoHeaderLine(key) + .getCountType(); + int index = altAlelleIndex; + if (number == VCFHeaderLineCount.R) + { + /* + * one value per allele including reference, so bump index + * e.g. the 3rd value is for the 2nd alternate allele + */ + index++; + } + /* + * CSQ behaves as if Number=A but declares as Number=. + * so give it special treatment + */ + else if (!"CSQ".equals(key) && number != VCFHeaderLineCount.A) + { + /* + * don't save other values as not allele-related + */ + continue; + } + + /* + * take the index'th value + */ + String value = getAttributeValue(variant, key, index); + if (value != null) + { + sf.setValue(key, value); + } + } } /** -- 1.7.10.2