X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=5adc55c0764ef1eacac1d74391bb4ad7380f1a74;hb=14193747f3831242bc7dfac12394eb20eb0ba480;hp=e725a22aa44aea55a0d1f9722967440499dfb8ea;hpb=1ef254a2c95c4226d9dc3a77af6ffcdfed675b7c;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index e725a22..5adc55c 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -5,23 +5,32 @@ import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import jalview.analysis.AlignmentUtils; +import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; -import jalview.datamodel.GeneLoci; +import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; -import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.datamodel.features.FeatureAttributeType; +import jalview.datamodel.features.FeatureSource; +import jalview.datamodel.features.FeatureSources; import jalview.ext.ensembl.EnsemblMap; import jalview.ext.htsjdk.VCFReader; +import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import jalview.util.MappingUtils; +import jalview.util.MessageManager; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -35,10 +44,58 @@ import java.util.Map.Entry; */ public class VCFLoader { + /* + * keys to fields of VEP CSQ consequence data + * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html + */ + private static final String ALLELE_KEY = "Allele"; + + private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... + private static final String FEATURE_KEY = "Feature"; // Ensembl stable id + + /* + * what comes before column headings in CSQ Description field + */ + private static final String FORMAT = "Format: "; + + /* + * default VCF INFO key for VEP consequence data + * NB this can be overridden running VEP with --vcf_info_field + * - we don't handle this case (require CSQ identifier) + */ + private static final String CSQ = "CSQ"; + + /* + * separator for fields in consequence data + */ + private static final String PIPE = "|"; + + private static final String PIPE_REGEX = "\\" + PIPE; + + /* + * key for Allele Frequency output by VEP + * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html + */ + private static final String ALLELE_FREQUENCY_KEY = "AF"; + + /* + * delimiter that separates multiple consequence data blocks + */ + private static final String COMMA = ","; + + /* + * the feature group assigned to a VCF variant in Jalview + */ + private static final String FEATURE_GROUP_VCF = "VCF"; + + /* + * internal delimiter used to build keys for assemblyMappings + * + */ private static final String EXCL = "!"; /* - * the alignment we are associated VCF data with + * the alignment we are associating VCF data with */ private AlignmentI al; @@ -49,6 +106,26 @@ public class VCFLoader */ private Map> assemblyMappings; + /* + * holds details of the VCF header lines (metadata) + */ + private VCFHeader header; + + /* + * the position (0...) of field in each block of + * CSQ (consequence) data (if declared in the VCF INFO header for CSQ) + * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html + */ + private int csqAlleleFieldIndex = -1; + private int csqAlleleNumberFieldIndex = -1; + private int csqFeatureFieldIndex = -1; + + /* + * a unique identifier under which to save metadata about feature + * attributes (selected INFO field data) + */ + private String sourceId; + /** * Constructor given an alignment context * @@ -63,31 +140,64 @@ public class VCFLoader } /** - * Loads VCF on to an alignment - provided it can be related to one or more - * sequence's chromosomal coordinates. + * Starts a new thread to query and load VCF variant data on to the alignment *

* This method is not thread safe - concurrent threads should use separate * instances of this class. * * @param filePath - * @param status + * @param gui */ - public void loadVCF(String filePath, AlignViewControllerGuiI status) + public void loadVCF(final String filePath, + final AlignViewControllerGuiI gui) { - VCFReader reader = null; + if (gui != null) + { + gui.setStatus(MessageManager.getString("label.searching_vcf")); + } + new Thread() + { + + @Override + public void run() + { + VCFLoader.this.doLoad(filePath, gui); + } + + }.start(); + } + + /** + * Loads VCF on to an alignment - provided it can be related to one or more + * sequence's chromosomal coordinates + * + * @param filePath + * @param gui + * optional callback handler for messages + */ + protected void doLoad(String filePath, AlignViewControllerGuiI gui) + { + VCFReader reader = null; try { // long start = System.currentTimeMillis(); reader = new VCFReader(filePath); - VCFHeader header = reader.getFileHeader(); + header = reader.getFileHeader(); + + sourceId = filePath; + + saveMetadata(sourceId); + + /* + * get offset of CSQ ALLELE_NUM and Feature if declared + */ + locateCsqFields(); + VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); - // check if reference is wrt assembly19 (GRCh37) - // todo may need to allow user to specify reference assembly? - boolean isRefGrch37 = (ref != null && ref.getValue().contains( - "assembly19")); + String vcfAssembly = ref.getValue(); int varCount = 0; int seqCount = 0; @@ -97,25 +207,33 @@ public class VCFLoader */ for (SequenceI seq : al.getSequences()) { - int added = loadVCF(seq, reader, isRefGrch37); + int added = loadSequenceVCF(seq, reader, vcfAssembly); if (added > 0) { seqCount++; varCount += added; - computePeptideVariants(seq); + transferAddedFeatures(seq); } } - // long elapsed = System.currentTimeMillis() - start; - String msg = String.format("Added %d VCF variants to %d sequence(s)", - varCount, seqCount); - if (status != null) + if (gui != null) { - status.setStatus(msg); + // long elapsed = System.currentTimeMillis() - start; + String msg = MessageManager.formatMessage("label.added_vcf", + varCount, seqCount); + gui.setStatus(msg); + if (gui.getFeatureSettingsUI() != null) + { + gui.getFeatureSettingsUI().discoverAllFeatureData(); + } } } catch (Throwable e) { System.err.println("Error processing VCF: " + e.getMessage()); e.printStackTrace(); + if (gui != null) + { + gui.setStatus("Error occurred - see console for details"); + } } finally { if (reader != null) @@ -132,16 +250,102 @@ public class VCFLoader } /** - * (Re-)computes peptide variants from dna variants, for any protein sequence - * to which the dna sequence has a mapping. Note that although duplicate - * features may get computed, they will not be added, since duplicate sequence - * features are ignored in Sequence.addSequenceFeature. + * Reads metadata (such as INFO field descriptions and datatypes) and saves + * them for future reference + * + * @param sourceId + */ + void saveMetadata(String sourceId) + { + FeatureSource metadata = new FeatureSource(sourceId); + + for (VCFInfoHeaderLine info : header.getInfoHeaderLines()) + { + String attributeId = info.getID(); + String desc = info.getDescription(); + VCFHeaderLineType type = info.getType(); + FeatureAttributeType attType = null; + switch (type) + { + case Character: + attType = FeatureAttributeType.Character; + break; + case Flag: + attType = FeatureAttributeType.Flag; + break; + case Float: + attType = FeatureAttributeType.Float; + break; + case Integer: + attType = FeatureAttributeType.Integer; + break; + case String: + attType = FeatureAttributeType.String; + break; + } + metadata.setAttributeName(attributeId, desc); + metadata.setAttributeType(attributeId, attType); + } + + FeatureSources.getInstance().addSource(sourceId, metadata); + } + + /** + * Records the position of selected fields defined in the CSQ INFO header (if + * there is one). CSQ fields are declared in the CSQ INFO Description e.g. + *

+ * Description="Consequence ...from ... VEP. Format: Allele|Consequence|... + */ + protected void locateCsqFields() + { + VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ); + if (csqInfo == null) + { + return; + } + + String desc = csqInfo.getDescription(); + int formatPos = desc.indexOf(FORMAT); + if (formatPos == -1) + { + System.err.println("Parse error, failed to find " + FORMAT + + " in " + desc); + return; + } + desc = desc.substring(formatPos + FORMAT.length()); + + if (desc != null) + { + String[] format = desc.split(PIPE_REGEX); + int index = 0; + for (String field : format) + { + if (ALLELE_NUM_KEY.equals(field)) + { + csqAlleleNumberFieldIndex = index; + } + if (ALLELE_KEY.equals(field)) + { + csqAlleleFieldIndex = index; + } + if (FEATURE_KEY.equals(field)) + { + csqFeatureFieldIndex = index; + } + index++; + } + } + } + + /** + * Transfers VCF features to sequences to which this sequence has a mapping. + * If the mapping is 3:1, computes peptide variants from nucleotide variants. * - * @param dnaSeq + * @param seq */ - protected void computePeptideVariants(SequenceI dnaSeq) + protected void transferAddedFeatures(SequenceI seq) { - DBRefEntry[] dbrefs = dnaSeq.getDBRefs(); + DBRefEntry[] dbrefs = seq.getDBRefs(); if (dbrefs == null) { return; @@ -149,48 +353,112 @@ public class VCFLoader for (DBRefEntry dbref : dbrefs) { Mapping mapping = dbref.getMap(); - if (mapping == null || mapping.getTo() == null - || mapping.getMap().getFromRatio() != 3) + if (mapping == null || mapping.getTo() == null) { continue; } - AlignmentUtils.computeProteinFeatures(dnaSeq, mapping.getTo(), - mapping.getMap()); + + SequenceI mapTo = mapping.getTo(); + MapList map = mapping.getMap(); + if (map.getFromRatio() == 3) + { + /* + * dna-to-peptide product mapping + */ + AlignmentUtils.computeProteinFeatures(seq, mapTo, map); + } + else + { + /* + * nucleotide-to-nucleotide mapping e.g. transcript to CDS + */ + List features = seq.getFeatures() + .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT); + for (SequenceFeature sf : features) + { + if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup())) + { + transferFeature(sf, mapTo, map); + } + } + } } } /** * Tries to add overlapping variants read from a VCF file to the given - * sequence, and returns the number of overlapping variants found. Note that - * this requires the sequence to hold information as to its chromosomal - * positions and reference, in order to be able to map the VCF variants to the - * sequence. + * sequence, and returns the number of variant features added. Note that this + * requires the sequence to hold information as to its species, chromosomal + * positions and reference assembly, in order to be able to map the VCF + * variants to the sequence (or not) * * @param seq * @param reader - * @param isVcfRefGrch37 + * @param vcfAssembly * @return */ - protected int loadVCF(SequenceI seq, VCFReader reader, - boolean isVcfRefGrch37) + protected int loadSequenceVCF(SequenceI seq, VCFReader reader, + String vcfAssembly) { int count = 0; - GeneLoci seqCoords = ((Sequence) seq).getGeneLoci(); + GeneLociI seqCoords = seq.getGeneLoci(); if (seqCoords == null) { + System.out.println(String.format( + "Can't query VCF for %s as chromosome coordinates not known", + seq.getName())); + return 0; + } + + if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId())) + { return 0; } - List seqChromosomalContigs = seqCoords.mapping.getToRanges(); + List seqChromosomalContigs = seqCoords.getMap().getToRanges(); for (int[] range : seqChromosomalContigs) { - count += addVcfVariants(seq, reader, range, isVcfRefGrch37); + count += addVcfVariants(seq, reader, range, vcfAssembly); } return count; } /** + * Answers true if the species inferred from the VCF reference identifier + * matches that for the sequence + * + * @param vcfAssembly + * @param speciesId + * @return + */ + boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId) + { + // PROBLEM 1 + // there are many aliases for species - how to equate one with another? + // PROBLEM 2 + // VCF ##reference header is an unstructured URI - how to extract species? + // perhaps check if ref includes any (Ensembl) alias of speciesId?? + // TODO ask the user to confirm this?? + + if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example + && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id + { + return true; + } + + if (vcfAssembly.contains("c_elegans") // VEP VCF response example + && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl + { + return true; + } + + // this is not a sustainable solution... + + return false; + } + + /** * Queries the VCF reader for any variants that overlap the given chromosome * region of the sequence, and adds as variant features. Returns the number of * overlapping variants found. @@ -200,54 +468,52 @@ public class VCFLoader * @param range * start-end range of a sequence region in its chromosomal * coordinates - * @param isVcfRefGrch37 - * true if the VCF is with reference to GRCh37 + * @param vcfAssembly + * the '##reference' identifier for the VCF reference assembly * @return */ protected int addVcfVariants(SequenceI seq, VCFReader reader, - int[] range, boolean isVcfRefGrch37) + int[] range, String vcfAssembly) { - GeneLoci seqCoords = ((Sequence) seq).getGeneLoci(); + GeneLociI seqCoords = seq.getGeneLoci(); - String chromosome = seqCoords.chromosome; - String seqRef = seqCoords.assembly; - String species = seqCoords.species; - - // TODO handle species properly - if ("".equals(species)) - { - species = "human"; - } + String chromosome = seqCoords.getChromosomeId(); + String seqRef = seqCoords.getAssemblyId(); + String species = seqCoords.getSpeciesId(); /* - * map chromosomal coordinates from GRCh38 (sequence) to - * GRCh37 (VCF) if necessary + * map chromosomal coordinates from sequence to VCF if the VCF + * data has a different reference assembly to the sequence */ - // TODO generalise for other assemblies and species + // TODO generalise for non-human species + // - or get the user to choose in a dialog + int offset = 0; - String fromRef = "GRCh38"; - if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37) + if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl + && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD { String toRef = "GRCh37"; - int[] newRange = mapReferenceRange(range, chromosome, species, - fromRef, toRef); + int[] newRange = mapReferenceRange(range, chromosome, "human", + seqRef, toRef); if (newRange == null) { System.err.println(String.format( "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome, - fromRef, range[0], range[1], toRef)); + seqRef, range[0], range[1], toRef)); return 0; } offset = newRange[0] - range[0]; range = newRange; } + boolean forwardStrand = range[0] <= range[1]; + /* * query the VCF for overlaps * (convert a reverse strand range to forwards) */ int count = 0; - MapList mapping = seqCoords.mapping; + MapList mapping = seqCoords.getMap(); int fromLocus = Math.min(range[0], range[1]); int toLocus = Math.max(range[0], range[1]); @@ -260,27 +526,21 @@ public class VCFLoader */ VariantContext variant = variants.next(); - /* - * we can only process SNP variants (which can be reported - * as part of a MIXED variant record - */ - if (!variant.isSNP() && !variant.isMixed()) - { - continue; - } - - count++; int start = variant.getStart() - offset; int end = variant.getEnd() - offset; /* * convert chromosomal location to sequence coordinates + * - may be reverse strand (convert to forward for sequence feature) * - null if a partially overlapping feature */ int[] seqLocation = mapping.locateInFrom(start, end); if (seqLocation != null) { - addVariantFeatures(seq, variant, seqLocation[0], seqLocation[1]); + int featureStart = Math.min(seqLocation[0], seqLocation[1]); + int featureEnd = Math.max(seqLocation[0], seqLocation[1]); + count += addAlleleFeatures(seq, variant, featureStart, featureEnd, + forwardStrand); } } @@ -290,85 +550,365 @@ public class VCFLoader } /** - * Inspects the VCF variant record, and adds variant features to the sequence. - * Only SNP variants are added, not INDELs. + * A convenience method to get the AF value for the given alternate allele + * index + * + * @param variant + * @param alleleIndex + * @return + */ + protected float getAlleleFrequency(VariantContext variant, int alleleIndex) + { + float score = 0f; + String attributeValue = getAttributeValue(variant, + ALLELE_FREQUENCY_KEY, alleleIndex); + if (attributeValue != null) + { + try + { + score = Float.parseFloat(attributeValue); + } catch (NumberFormatException e) + { + // leave as 0 + } + } + + return score; + } + + /** + * A convenience method to get an attribute value for an alternate allele + * + * @param variant + * @param attributeName + * @param alleleIndex + * @return + */ + protected String getAttributeValue(VariantContext variant, + String attributeName, int alleleIndex) + { + Object att = variant.getAttribute(attributeName); + + if (att instanceof String) + { + return (String) att; + } + else if (att instanceof ArrayList) + { + return ((List) att).get(alleleIndex); + } + + return null; + } + + /** + * Adds one variant feature for each allele in the VCF variant record, and + * returns the number of features added. * * @param seq * @param variant * @param featureStart * @param featureEnd + * @param forwardStrand + * @return */ - protected void addVariantFeatures(SequenceI seq, VariantContext variant, - int featureStart, int featureEnd) + protected int addAlleleFeatures(SequenceI seq, VariantContext variant, + int featureStart, int featureEnd, boolean forwardStrand) + { + int added = 0; + + /* + * Javadoc says getAlternateAlleles() imposes no order on the list returned + * so we proceed defensively to get them in strict order + */ + int altAlleleCount = variant.getAlternateAlleles().size(); + for (int i = 0; i < altAlleleCount; i++) + { + added += addAlleleFeature(seq, variant, i, featureStart, featureEnd, + forwardStrand); + } + return added; + } + + /** + * Inspects one allele and attempts to add a variant feature for it to the + * sequence. We extract as much as possible of the additional data associated + * with this allele to store in the feature's key-value map. Answers the + * number of features added (0 or 1). + * + * @param seq + * @param variant + * @param altAlleleIndex + * (0, 1..) + * @param featureStart + * @param featureEnd + * @param forwardStrand + * @return + */ + protected int addAlleleFeature(SequenceI seq, VariantContext variant, + int altAlleleIndex, int featureStart, int featureEnd, + boolean forwardStrand) { String reference = variant.getReference().getBaseString(); - if (reference.length() != 1) + Allele alt = variant.getAlternateAllele(altAlleleIndex); + String allele = alt.getBaseString(); + + /* + * build the ref,alt allele description e.g. "G,A", using the base + * complement if the sequence is on the reverse strand + */ + // TODO check how structural variants are shown on reverse strand + StringBuilder sb = new StringBuilder(); + sb.append(forwardStrand ? reference : Dna.reverseComplement(reference)); + sb.append(COMMA); + sb.append(forwardStrand ? allele : Dna.reverseComplement(allele)); + String alleles = sb.toString(); // e.g. G,A + + String type = SequenceOntologyI.SEQUENCE_VARIANT; + float score = getAlleleFrequency(variant, altAlleleIndex); + + SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, + featureEnd, score, FEATURE_GROUP_VCF); + sf.setSource(sourceId); + + sf.setValue(Gff3Helper.ALLELES, alleles); + + addAlleleProperties(variant, seq, sf, altAlleleIndex); + + seq.addSequenceFeature(sf); + + return 1; + } + + /** + * Add any allele-specific VCF key-value data to the sequence feature + * + * @param variant + * @param seq + * @param sf + * @param altAlelleIndex + * (0, 1..) + */ + protected void addAlleleProperties(VariantContext variant, SequenceI seq, + SequenceFeature sf, final int altAlelleIndex) + { + Map atts = variant.getAttributes(); + + for (Entry att : atts.entrySet()) { + String key = att.getKey(); + /* - * sorry, we don't handle INDEL variants + * extract Consequence data (if present) that we are able to + * associated with the allele for this variant feature */ + if (CSQ.equals(key)) + { + addConsequences(variant, seq, sf, altAlelleIndex); + continue; + } + + /* + * we extract values for other data which are allele-specific; + * these may be per alternate allele (INFO[key].Number = 'A') + * or per allele including reference (INFO[key].Number = 'R') + */ + VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key); + if (infoHeader == null) + { + /* + * can't be sure what data belongs to this allele, so + * play safe and don't take any + */ + continue; + } + + VCFHeaderLineCount number = infoHeader.getCountType(); + int index = altAlelleIndex; + if (number == VCFHeaderLineCount.R) + { + /* + * one value per allele including reference, so bump index + * e.g. the 3rd value is for the 2nd alternate allele + */ + index++; + } + else if (number != VCFHeaderLineCount.A) + { + /* + * don't save other values as not allele-related + */ + continue; + } + + /* + * take the index'th value + */ + String value = getAttributeValue(variant, key, index); + if (value != null) + { + sf.setValue(key, value); + } + } + } + + /** + * Inspects CSQ data blocks (consequences) and adds attributes on the sequence + * feature for the current allele (and transcript if applicable) + *

+ * Allele matching: if field ALLELE_NUM is present, it must match + * altAlleleIndex. If not present, then field Allele value must match the VCF + * Allele. + *

+ * Transcript matching: if sequence name can be identified to at least one of + * the consequences' Feature values, then select only consequences that match + * the value (i.e. consequences for the current transcript sequence). If not, + * take all consequences (this is the case when adding features to the gene + * sequence). + * + * @param variant + * @param seq + * @param sf + * @param altAlelleIndex + * (0, 1..) + */ + protected void addConsequences(VariantContext variant, SequenceI seq, + SequenceFeature sf, int altAlelleIndex) + { + Object value = variant.getAttribute(CSQ); + + if (value == null || !(value instanceof ArrayList)) + { return; } + List consequences = (List) value; + /* - * for now we extract allele frequency as feature score; note - * this attribute is String for a simple SNP, but List if - * multiple alleles at the locus; we extract for the simple case only, - * since not sure how to match allele order with AF values + * if CSQ data includes 'Feature', and any value matches the sequence name, + * then restrict consequence data to only the matching value (transcript) + * i.e. just pick out consequences for the transcript the variant feature is on */ - Object af = variant.getAttribute("AF"); - float score = 0f; - if (af instanceof String) + String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase(); + String matchFeature = null; + if (csqFeatureFieldIndex > -1) { - try + for (String consequence : consequences) { - score = Float.parseFloat((String) af); - } catch (NumberFormatException e) + String[] csqFields = consequence.split(PIPE_REGEX); + if (csqFields.length > csqFeatureFieldIndex) + { + String featureIdentifier = csqFields[csqFeatureFieldIndex]; + if (featureIdentifier.length() > 4 + && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) + { + matchFeature = featureIdentifier; + } + } + } + } + + StringBuilder sb = new StringBuilder(128); + boolean found = false; + + for (String consequence : consequences) + { + String[] csqFields = consequence.split(PIPE_REGEX); + + if (includeConsequence(csqFields, matchFeature, variant, + altAlelleIndex)) { - // leave as 0 + if (found) + { + sb.append(COMMA); + } + found = true; + sb.append(consequence); } } - StringBuilder sb = new StringBuilder(); - sb.append(reference); + if (found) + { + sf.setValue(CSQ, sb.toString()); + } + } + /** + * Answers true if we want to associate this block of consequence data with + * the specified alternate allele of the VCF variant. + *

+ * If consequence data includes the ALLELE_NUM field, then this has to match + * altAlleleIndex. Otherwise the Allele field of the consequence data has to + * match the allele value. + *

+ * Optionally (if matchFeature is not null), restrict to only include + * consequences whose Feature value matches. This allows us to attach + * consequences to their respective transcripts. + * + * @param csqFields + * @param matchFeature + * @param variant + * @param altAlelleIndex + * (0, 1..) + * @return + */ + protected boolean includeConsequence(String[] csqFields, + String matchFeature, VariantContext variant, int altAlelleIndex) + { /* - * inspect alleles and record SNP variants (as the variant - * record could be MIXED and include INDEL and SNP alleles) + * check consequence is for the current transcript */ - int alleleCount = 0; + if (matchFeature != null) + { + if (csqFields.length <= csqFeatureFieldIndex) + { + return false; + } + String featureIdentifier = csqFields[csqFeatureFieldIndex]; + if (!featureIdentifier.equals(matchFeature)) + { + return false; // consequence is for a different transcript + } + } /* - * inspect alleles; warning: getAlleles gives no guarantee - * as to the order in which they are returned + * if ALLELE_NUM is present, it must match altAlleleIndex + * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex */ - for (Allele allele : variant.getAlleles()) + if (csqAlleleNumberFieldIndex > -1) { - if (!allele.isReference()) + if (csqFields.length <= csqAlleleNumberFieldIndex) { - String alleleBase = allele.getBaseString(); - if (alleleBase.length() == 1) - { - sb.append(",").append(alleleBase); - alleleCount++; - } + return false; } + String alleleNum = csqFields[csqAlleleNumberFieldIndex]; + return String.valueOf(altAlelleIndex + 1).equals(alleleNum); } - String alleles = sb.toString(); // e.g. G,A,C - - String type = SequenceOntologyI.SEQUENCE_VARIANT; - - SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, - featureEnd, score, "VCF"); - sf.setValue("alleles", alleles); - - Map atts = variant.getAttributes(); - for (Entry att : atts.entrySet()) + /* + * else consequence allele must match variant allele + */ + if (csqAlleleFieldIndex > -1 && csqFields.length > csqAlleleFieldIndex) { - sf.setValue(att.getKey(), att.getValue()); + String csqAllele = csqFields[csqAlleleFieldIndex]; + String vcfAllele = variant.getAlternateAllele(altAlelleIndex) + .getBaseString(); + return csqAllele.equals(vcfAllele); } - seq.addSequenceFeature(sf); + + return false; + } + + /** + * A convenience method to complement a dna base and return the string value + * of its complement + * + * @param reference + * @return + */ + protected String complement(byte[] reference) + { + return String.valueOf(Dna.getComplement((char) reference[0])); } /** @@ -410,8 +950,8 @@ public class VCFLoader * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37 */ EnsemblMap mapper = new EnsemblMap(); - int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef, - queryRange); + int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef, + toRef, queryRange); if (mapping == null) { @@ -487,6 +1027,32 @@ public class VCFLoader } /** + * Transfers the sequence feature to the target sequence, locating its start + * and end range based on the mapping. Features which do not overlap the + * target sequence are ignored. + * + * @param sf + * @param targetSequence + * @param mapping + * mapping from the feature's coordinates to the target sequence + */ + protected void transferFeature(SequenceFeature sf, + SequenceI targetSequence, MapList mapping) + { + int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd()); + + if (mappedRange != null) + { + String group = sf.getFeatureGroup(); + int newBegin = Math.min(mappedRange[0], mappedRange[1]); + int newEnd = Math.max(mappedRange[0], mappedRange[1]); + SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd, + group, sf.getScore()); + targetSequence.addSequenceFeature(copy); + } + } + + /** * Formats a ranges map lookup key * * @param chromosome