X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=622da73dcf2552da32d3eb91fa9af8691852bea0;hb=a303b81242b1e3e7c1fdd530806d9a6e69f743a5;hp=155e7738b67ce6527b9818434e041582b41e2148;hpb=07108e505ff923f3b5135ffbdbb79259fe53432e;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 155e773..622da73 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -4,7 +4,6 @@ import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; import jalview.bin.Cache; -import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; @@ -14,6 +13,7 @@ import jalview.datamodel.features.FeatureAttributeType; import jalview.datamodel.features.FeatureSource; import jalview.datamodel.features.FeatureSources; import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.HtsContigDb; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyI; @@ -21,17 +21,19 @@ import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.MessageManager; +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.SortedMap; -import java.util.TreeMap; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import htsjdk.samtools.SAMException; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; @@ -49,9 +51,40 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + private static final String DEFAULT_SPECIES = "homo_sapiens"; + + /** + * A class to model the mapping from sequence to VCF coordinates. Cases include + * + */ + class VCFMap + { + final String chromosome; + + final MapList map; + + VCFMap(String chr, MapList m) + { + chromosome = chr; + map = m; + } + + @Override + public String toString() + { + return chromosome + ":" + map.toString(); + } + } + /* * Lookup keys, and default values, for Preference entries that describe - * patterns for VCF and VEP fields to capture + * patterns for VCF and VEP fields to capture */ private static final String VEP_FIELDS_PREF = "VEP_FIELDS"; @@ -62,13 +95,25 @@ public class VCFLoader private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG"; /* + * Lookup keys, and default values, for Preference entries that give + * mappings from tokens in the 'reference' header to species or assembly + */ + private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY"; + + private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh37,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38"; + + private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human + + private static final String DEFAULT_REFERENCE = "grch37"; // fallback default is human GRCh37 + + /* * keys to fields of VEP CSQ consequence data * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ - private static final String ALLELE_KEY = "Allele"; - - private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... - private static final String FEATURE_KEY = "Feature"; // Ensembl stable id + private static final String CSQ_CONSEQUENCE_KEY = "Consequence"; + private static final String CSQ_ALLELE_KEY = "Allele"; + private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... + private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id /* * default VCF INFO key for VEP consequence data @@ -83,12 +128,6 @@ public class VCFLoader private static final String PIPE_REGEX = "\\|"; /* - * key for Allele Frequency output by VEP - * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html - */ - private static final String ALLELE_FREQUENCY_KEY = "AF"; - - /* * delimiter that separates multiple consequence data blocks */ private static final String COMMA = ","; @@ -105,9 +144,9 @@ public class VCFLoader private static final String EXCL = "!"; /* - * the alignment we are associating VCF data with + * the VCF file we are processing */ - private AlignmentI al; + protected String vcfFilePath; /* * mappings between VCF and sequence reference assembly regions, as @@ -116,20 +155,41 @@ public class VCFLoader */ private Map> assemblyMappings; + private VCFReader reader; + /* * holds details of the VCF header lines (metadata) */ private VCFHeader header; /* + * species (as a valid Ensembl term) the VCF is for + */ + private String vcfSpecies; + + /* + * genome assembly version (as a valid Ensembl identifier) the VCF is for + */ + private String vcfAssembly; + + /* + * a Dictionary of contigs (if present) referenced in the VCF file + */ + private SAMSequenceDictionary dictionary; + + /* * the position (0...) of field in each block of * CSQ (consequence) data (if declared in the VCF INFO header for CSQ) * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ + private int csqConsequenceFieldIndex = -1; private int csqAlleleFieldIndex = -1; private int csqAlleleNumberFieldIndex = -1; private int csqFeatureFieldIndex = -1; + // todo the same fields for SnpEff ANN data if wanted + // see http://snpeff.sourceforge.net/SnpEff_manual.html#input + /* * a unique identifier under which to save metadata about feature * attributes (selected INFO field data) @@ -150,29 +210,35 @@ public class VCFLoader Map vepFieldsOfInterest; /** - * Constructor given an alignment context + * Constructor given a VCF file * * @param alignment */ - public VCFLoader(AlignmentI alignment) + public VCFLoader(String vcfFile) { - al = alignment; + try + { + initialise(vcfFile); + } catch (IOException e) + { + System.err.println("Error opening VCF file: " + e.getMessage()); + } // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange} assemblyMappings = new HashMap<>(); } /** - * Starts a new thread to query and load VCF variant data on to the alignment + * Starts a new thread to query and load VCF variant data on to the given + * sequences *

* This method is not thread safe - concurrent threads should use separate * instances of this class. * - * @param filePath + * @param seqs * @param gui */ - public void loadVCF(final String filePath, - final AlignViewControllerGuiI gui) + public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui) { if (gui != null) { @@ -181,46 +247,70 @@ public class VCFLoader new Thread() { - @Override public void run() { - VCFLoader.this.doLoad(filePath, gui); + VCFLoader.this.doLoad(seqs, gui); } - }.start(); } /** - * Loads VCF on to an alignment - provided it can be related to one or more - * sequence's chromosomal coordinates + * Reads the specified contig sequence and adds its VCF variants to it * - * @param filePath - * @param gui - * optional callback handler for messages + * @param contig + * the id of a single sequence (contig) to load + * @return */ - protected void doLoad(String filePath, AlignViewControllerGuiI gui) + public SequenceI loadVCFContig(String contig) { - VCFReader reader = null; - try + VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + if (headerLine == null) { - // long start = System.currentTimeMillis(); - reader = new VCFReader(filePath); - - header = reader.getFileHeader(); + Cache.log.error("VCF reference header not found"); + return null; + } + String ref = headerLine.getValue(); + if (ref.startsWith("file://")) + { + ref = ref.substring(7); + } + setSpeciesAndAssembly(ref); - sourceId = filePath; + SequenceI seq = null; + File dbFile = new File(ref); - saveMetadata(sourceId); + if (dbFile.exists()) + { + HtsContigDb db = new HtsContigDb("", dbFile); + seq = db.getSequenceProxy(contig); + loadSequenceVCF(seq); + db.close(); + } + else + { + Cache.log.error("VCF reference not found: " + ref); + } - /* - * get offset of CSQ ALLELE_NUM and Feature if declared - */ - parseCsqHeader(); + return seq; + } + /** + * Loads VCF on to one or more sequences + * + * @param seqs + * @param gui + * optional callback handler for messages + */ + protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui) + { + try + { VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); - String vcfAssembly = ref.getValue(); + String reference = ref == null ? null : ref.getValue(); + + setSpeciesAndAssembly(reference); int varCount = 0; int seqCount = 0; @@ -228,9 +318,9 @@ public class VCFLoader /* * query for VCF overlapping each sequence in turn */ - for (SequenceI seq : al.getSequences()) + for (SequenceI seq : seqs) { - int added = loadSequenceVCF(seq, reader, vcfAssembly); + int added = loadSequenceVCF(seq); if (added > 0) { seqCount++; @@ -240,7 +330,6 @@ public class VCFLoader } if (gui != null) { - // long elapsed = System.currentTimeMillis() - start; String msg = MessageManager.formatMessage("label.added_vcf", varCount, seqCount); gui.setStatus(msg); @@ -269,10 +358,109 @@ public class VCFLoader // ignore } } + header = null; + dictionary = null; } } /** + * Attempts to determine and save the species and genome assembly version to + * which the VCF data applies. This may be done by parsing the {@code reference} + * header line, configured in a property file, or (potentially) confirmed + * interactively by the user. + *

+ * The saved values should be identifiers valid for Ensembl's REST service + * {@code map} endpoint, so they can be used (if necessary) to retrieve the + * mapping between VCF coordinates and sequence coordinates. + * + * @param reference + * @see https://rest.ensembl.org/documentation/info/assembly_map + * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml + * @see https://rest.ensembl.org/info/species?content-type=text/xml + */ + protected void setSpeciesAndAssembly(String reference) + { + if (reference == null) + { + Cache.log.error("No VCF ##reference found, defaulting to " + + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES); + reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified + } + reference = reference.toLowerCase(); + + /* + * for a non-human species, or other assembly identifier, + * specify as a Jalview property file entry e.g. + * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37 + * VCF_SPECIES = c_elegans=celegans + * to map a token in the reference header to a value + */ + String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY); + for (String token : prop.split(",")) + { + String[] tokens = token.split("="); + if (tokens.length == 2) + { + if (reference.contains(tokens[0].trim().toLowerCase())) + { + vcfAssembly = tokens[1].trim(); + break; + } + } + } + + vcfSpecies = DEFAULT_SPECIES; + prop = Cache.getProperty(VCF_SPECIES); + if (prop != null) + { + for (String token : prop.split(",")) + { + String[] tokens = token.split("="); + if (tokens.length == 2) + { + if (reference.contains(tokens[0].trim().toLowerCase())) + { + vcfSpecies = tokens[1].trim(); + break; + } + } + } + } + } + + /** + * Opens the VCF file and parses header data + * + * @param filePath + * @throws IOException + */ + private void initialise(String filePath) throws IOException + { + vcfFilePath = filePath; + + reader = new VCFReader(filePath); + + header = reader.getFileHeader(); + + try + { + dictionary = header.getSequenceDictionary(); + } catch (SAMException e) + { + // ignore - thrown if any contig line lacks length info + } + + sourceId = filePath; + + saveMetadata(sourceId); + + /* + * get offset of CSQ ALLELE_NUM and Feature if declared + */ + parseCsqHeader(); + } + + /** * Reads metadata (such as INFO field descriptions and datatypes) and saves * them for future reference * @@ -378,15 +566,19 @@ public class VCFLoader int index = 0; for (String field : format) { - if (ALLELE_NUM_KEY.equals(field)) + if (CSQ_CONSEQUENCE_KEY.equals(field)) + { + csqConsequenceFieldIndex = index; + } + if (CSQ_ALLELE_NUM_KEY.equals(field)) { csqAlleleNumberFieldIndex = index; } - if (ALLELE_KEY.equals(field)) + if (CSQ_ALLELE_KEY.equals(field)) { csqAlleleFieldIndex = index; } - if (FEATURE_KEY.equals(field)) + if (CSQ_FEATURE_KEY.equals(field)) { csqFeatureFieldIndex = index; } @@ -483,194 +675,191 @@ public class VCFLoader } /** - * Tries to add overlapping variants read from a VCF file to the given - * sequence, and returns the number of variant features added. Note that this - * requires the sequence to hold information as to its species, chromosomal - * positions and reference assembly, in order to be able to map the VCF - * variants to the sequence (or not) + * Tries to add overlapping variants read from a VCF file to the given sequence, + * and returns the number of variant features added * * @param seq - * @param reader - * @param vcfAssembly * @return */ - protected int loadSequenceVCF(SequenceI seq, VCFReader reader, - String vcfAssembly) + protected int loadSequenceVCF(SequenceI seq) { - int count = 0; - GeneLociI seqCoords = seq.getGeneLoci(); - if (seqCoords == null) - { - System.out.println(String.format( - "Can't query VCF for %s as chromosome coordinates not known", - seq.getName())); - return 0; - } - - if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId())) + VCFMap vcfMap = getVcfMap(seq); + if (vcfMap == null) { return 0; } - List seqChromosomalContigs = seqCoords.getMap().getToRanges(); - for (int[] range : seqChromosomalContigs) + /* + * work with the dataset sequence here + */ + SequenceI dss = seq.getDatasetSequence(); + if (dss == null) { - count += addVcfVariants(seq, reader, range, vcfAssembly); + dss = seq; } - - return count; + return addVcfVariants(dss, vcfMap); } /** - * Answers true if the species inferred from the VCF reference identifier - * matches that for the sequence + * Answers a map from sequence coordinates to VCF chromosome ranges * - * @param vcfAssembly - * @param speciesId + * @param seq * @return */ - boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId) + private VCFMap getVcfMap(SequenceI seq) { - // PROBLEM 1 - // there are many aliases for species - how to equate one with another? - // PROBLEM 2 - // VCF ##reference header is an unstructured URI - how to extract species? - // perhaps check if ref includes any (Ensembl) alias of speciesId?? - // TODO ask the user to confirm this?? - - if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example - && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id + /* + * simplest case: sequence has id and length matching a VCF contig + */ + VCFMap vcfMap = null; + if (dictionary != null) { - return true; + vcfMap = getContigMap(seq); } - - if (vcfAssembly.contains("c_elegans") // VEP VCF response example - && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl + if (vcfMap != null) { - return true; + return vcfMap; } - // this is not a sustainable solution... - - return false; - } - - /** - * Queries the VCF reader for any variants that overlap the given chromosome - * region of the sequence, and adds as variant features. Returns the number of - * overlapping variants found. - * - * @param seq - * @param reader - * @param range - * start-end range of a sequence region in its chromosomal - * coordinates - * @param vcfAssembly - * the '##reference' identifier for the VCF reference assembly - * @return - */ - protected int addVcfVariants(SequenceI seq, VCFReader reader, - int[] range, String vcfAssembly) - { + /* + * otherwise, map to VCF from chromosomal coordinates + * of the sequence (if known) + */ GeneLociI seqCoords = seq.getGeneLoci(); + if (seqCoords == null) + { + Cache.log.warn(String.format( + "Can't query VCF for %s as chromosome coordinates not known", + seq.getName())); + return null; + } + String species = seqCoords.getSpeciesId(); String chromosome = seqCoords.getChromosomeId(); String seqRef = seqCoords.getAssemblyId(); - String species = seqCoords.getSpeciesId(); - - /* - * map chromosomal coordinates from sequence to VCF if the VCF - * data has a different reference assembly to the sequence - */ - // TODO generalise for non-human species - // - or get the user to choose in a dialog + MapList map = seqCoords.getMap(); - int offset = 0; - if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl - && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD + // note this requires the configured species to match that + // returned with the Ensembl sequence; todo: support aliases? + if (!vcfSpecies.equalsIgnoreCase(species)) { - String toRef = "GRCh37"; - int[] newRange = mapReferenceRange(range, chromosome, "human", - seqRef, toRef); - if (newRange == null) - { - System.err.println(String.format( - "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome, - seqRef, range[0], range[1], toRef)); - return 0; - } - offset = newRange[0] - range[0]; - range = newRange; + Cache.log.warn("No VCF loaded to " + seq.getName() + + " as species not matched"); + return null; } - boolean forwardStrand = range[0] <= range[1]; + if (seqRef.equalsIgnoreCase(vcfAssembly)) + { + return new VCFMap(chromosome, map); + } /* - * query the VCF for overlaps - * (convert a reverse strand range to forwards) + * VCF data has a different reference assembly to the sequence: + * query Ensembl to map chromosomal coordinates from sequence to VCF */ - int count = 0; - MapList mapping = seqCoords.getMap(); + List toVcfRanges = new ArrayList<>(); + List fromSequenceRanges = new ArrayList<>(); - int fromLocus = Math.min(range[0], range[1]); - int toLocus = Math.max(range[0], range[1]); - CloseableIterator variants = reader.query(chromosome, - fromLocus, toLocus); - while (variants.hasNext()) + for (int[] range : map.getToRanges()) { - /* - * get variant location in sequence chromosomal coordinates - */ - VariantContext variant = variants.next(); - - int start = variant.getStart() - offset; - int end = variant.getEnd() - offset; + int[] fromRange = map.locateInFrom(range[0], range[1]); + if (fromRange == null) + { + // corrupted map?!? + continue; + } - /* - * convert chromosomal location to sequence coordinates - * - may be reverse strand (convert to forward for sequence feature) - * - null if a partially overlapping feature - */ - int[] seqLocation = mapping.locateInFrom(start, end); - if (seqLocation != null) + int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef, + vcfAssembly); + if (newRange == null) { - int featureStart = Math.min(seqLocation[0], seqLocation[1]); - int featureEnd = Math.max(seqLocation[0], seqLocation[1]); - count += addAlleleFeatures(seq, variant, featureStart, featureEnd, - forwardStrand); + Cache.log.error( + String.format("Failed to map %s:%s:%s:%d:%d to %s", species, + chromosome, seqRef, range[0], range[1], + vcfAssembly)); + continue; + } + else + { + toVcfRanges.add(newRange); + fromSequenceRanges.add(fromRange); } } - variants.close(); - - return count; + return new VCFMap(chromosome, + new MapList(fromSequenceRanges, toVcfRanges, 1, 1)); } /** - * A convenience method to get the AF value for the given alternate allele - * index + * If the sequence id matches a contig declared in the VCF file, and the + * sequence length matches the contig length, then returns a 1:1 map of the + * sequence to the contig, else returns null * - * @param variant - * @param alleleIndex + * @param seq * @return */ - protected float getAlleleFrequency(VariantContext variant, int alleleIndex) + private VCFMap getContigMap(SequenceI seq) { - float score = 0f; - String attributeValue = getAttributeValue(variant, - ALLELE_FREQUENCY_KEY, alleleIndex); - if (attributeValue != null) + String id = seq.getName(); + SAMSequenceRecord contig = dictionary.getSequence(id); + if (contig != null) { - try + int len = seq.getLength(); + if (len == contig.getSequenceLength()) { - score = Float.parseFloat(attributeValue); - } catch (NumberFormatException e) + MapList map = new MapList(new int[] { 1, len }, + new int[] + { 1, len }, 1, 1); + return new VCFMap(id, map); + } + } + return null; + } + + /** + * Queries the VCF reader for any variants that overlap the mapped chromosome + * ranges of the sequence, and adds as variant features. Returns the number of + * overlapping variants found. + * + * @param seq + * @param map + * mapping from sequence to VCF coordinates + * @return + */ + protected int addVcfVariants(SequenceI seq, VCFMap map) + { + boolean forwardStrand = map.map.isToForwardStrand(); + + /* + * query the VCF for overlaps of each contiguous chromosomal region + */ + int count = 0; + + for (int[] range : map.map.getToRanges()) + { + int vcfStart = Math.min(range[0], range[1]); + int vcfEnd = Math.max(range[0], range[1]); + CloseableIterator variants = reader + .query(map.chromosome, vcfStart, vcfEnd); + while (variants.hasNext()) { - // leave as 0 + VariantContext variant = variants.next(); + + int[] featureRange = map.map.locateInFrom(variant.getStart(), + variant.getEnd()); + + if (featureRange != null) + { + int featureStart = Math.min(featureRange[0], featureRange[1]); + int featureEnd = Math.max(featureRange[0], featureRange[1]); + count += addAlleleFeatures(seq, variant, featureStart, featureEnd, + forwardStrand); + } } + variants.close(); } - return score; + return count; } /** @@ -729,9 +918,9 @@ public class VCFLoader /** * Inspects one allele and attempts to add a variant feature for it to the - * sequence. We extract as much as possible of the additional data associated - * with this allele to store in the feature's key-value map. Answers the - * number of features added (0 or 1). + * sequence. The additional data associated with this allele is extracted to + * store in the feature's key-value map. Answers the number of features added (0 + * or 1). * * @param seq * @param variant @@ -751,26 +940,55 @@ public class VCFLoader String allele = alt.getBaseString(); /* + * insertion after a genomic base, if on reverse strand, has to be + * converted to insertion of complement after the preceding position + */ + int referenceLength = reference.length(); + if (!forwardStrand && allele.length() > referenceLength + && allele.startsWith(reference)) + { + featureStart -= referenceLength; + featureEnd = featureStart; + char insertAfter = seq.getCharAt(featureStart - seq.getStart()); + reference = Dna.reverseComplement(String.valueOf(insertAfter)); + allele = allele.substring(referenceLength) + reference; + } + + /* * build the ref,alt allele description e.g. "G,A", using the base * complement if the sequence is on the reverse strand */ - // TODO check how structural variants are shown on reverse strand StringBuilder sb = new StringBuilder(); sb.append(forwardStrand ? reference : Dna.reverseComplement(reference)); sb.append(COMMA); sb.append(forwardStrand ? allele : Dna.reverseComplement(allele)); String alleles = sb.toString(); // e.g. G,A + /* + * pick out the consequence data (if any) that is for the current allele + * and feature (transcript) that matches the current sequence + */ + String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD, + altAlleleIndex, csqAlleleFieldIndex, + csqAlleleNumberFieldIndex, seq.getName().toLowerCase(), + csqFeatureFieldIndex); + + /* + * pick out the ontology term for the consequence type + */ String type = SequenceOntologyI.SEQUENCE_VARIANT; - float score = getAlleleFrequency(variant, altAlleleIndex); + if (consequence != null) + { + type = getOntologyTerm(consequence); + } SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, - featureEnd, score, FEATURE_GROUP_VCF); + featureEnd, FEATURE_GROUP_VCF); sf.setSource(sourceId); sf.setValue(Gff3Helper.ALLELES, alleles); - addAlleleProperties(variant, seq, sf, altAlleleIndex); + addAlleleProperties(variant, sf, altAlleleIndex, consequence); seq.addSequenceFeature(sf); @@ -778,16 +996,173 @@ public class VCFLoader } /** + * Determines the Sequence Ontology term to use for the variant feature type in + * Jalview. The default is 'sequence_variant', but a more specific term is used + * if: + *

+ * + * @param consequence + * @return + * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060 + */ + String getOntologyTerm(String consequence) + { + String type = SequenceOntologyI.SEQUENCE_VARIANT; + + /* + * could we associate Consequence data with this allele and feature (transcript)? + * if so, prefer the consequence term from that data + */ + if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1 + { + /* + * no Consequence data so we can't refine the ontology term + */ + return type; + } + + if (consequence != null) + { + String[] csqFields = consequence.split(PIPE_REGEX); + if (csqFields.length > csqConsequenceFieldIndex) + { + type = csqFields[csqConsequenceFieldIndex]; + } + } + else + { + // todo the same for SnpEff consequence data matching if wanted + } + + /* + * if of the form (e.g.) missense_variant&splice_region_variant, + * just take the first ('most severe') consequence + */ + if (type != null) + { + int pos = type.indexOf('&'); + if (pos > 0) + { + type = type.substring(0, pos); + } + } + return type; + } + + /** + * Returns matched consequence data if it can be found, else null. + * + * If matched, the consequence is returned (as pipe-delimited fields). + * + * @param variant + * @param vcfInfoId + * @param altAlleleIndex + * @param alleleFieldIndex + * @param alleleNumberFieldIndex + * @param seqName + * @param featureFieldIndex + * @return + */ + private String getConsequenceForAlleleAndFeature(VariantContext variant, + String vcfInfoId, int altAlleleIndex, int alleleFieldIndex, + int alleleNumberFieldIndex, + String seqName, int featureFieldIndex) + { + if (alleleFieldIndex == -1 || featureFieldIndex == -1) + { + return null; + } + Object value = variant.getAttribute(vcfInfoId); + + if (value == null || !(value instanceof List)) + { + return null; + } + + /* + * inspect each consequence in turn (comma-separated blocks + * extracted by htsjdk) + */ + List consequences = (List) value; + + for (String consequence : consequences) + { + String[] csqFields = consequence.split(PIPE_REGEX); + if (csqFields.length > featureFieldIndex) + { + String featureIdentifier = csqFields[featureFieldIndex]; + if (featureIdentifier.length() > 4 + && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) + { + /* + * feature (transcript) matched - now check for allele match + */ + if (matchAllele(variant, altAlleleIndex, csqFields, + alleleFieldIndex, alleleNumberFieldIndex)) + { + return consequence; + } + } + } + } + return null; + } + + private boolean matchAllele(VariantContext variant, int altAlleleIndex, + String[] csqFields, int alleleFieldIndex, + int alleleNumberFieldIndex) + { + /* + * if ALLELE_NUM is present, it must match altAlleleIndex + * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex + */ + if (alleleNumberFieldIndex > -1) + { + if (csqFields.length <= alleleNumberFieldIndex) + { + return false; + } + String alleleNum = csqFields[alleleNumberFieldIndex]; + return String.valueOf(altAlleleIndex + 1).equals(alleleNum); + } + + /* + * else consequence allele must match variant allele + */ + if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex) + { + String csqAllele = csqFields[alleleFieldIndex]; + String vcfAllele = variant.getAlternateAllele(altAlleleIndex) + .getBaseString(); + return csqAllele.equals(vcfAllele); + } + return false; + } + + /** * Add any allele-specific VCF key-value data to the sequence feature * * @param variant - * @param seq * @param sf * @param altAlelleIndex * (0, 1..) + * @param consequence + * if not null, the consequence specific to this sequence (transcript + * feature) and allele */ - protected void addAlleleProperties(VariantContext variant, SequenceI seq, - SequenceFeature sf, final int altAlelleIndex) + protected void addAlleleProperties(VariantContext variant, + SequenceFeature sf, final int altAlelleIndex, String consequence) { Map atts = variant.getAttributes(); @@ -801,7 +1176,15 @@ public class VCFLoader */ if (CSQ_FIELD.equals(key)) { - addConsequences(variant, seq, sf, altAlelleIndex); + addConsequences(variant, sf, consequence); + continue; + } + + /* + * filter out fields we don't want to capture + */ + if (!vcfFieldsOfInterest.contains(key)) + { continue; } @@ -859,30 +1242,22 @@ public class VCFLoader /** * Inspects CSQ data blocks (consequences) and adds attributes on the sequence - * feature for the current allele (and transcript if applicable) - *

- * Allele matching: if field ALLELE_NUM is present, it must match - * altAlleleIndex. If not present, then field Allele value must match the VCF - * Allele. + * feature. *

- * Transcript matching: if sequence name can be identified to at least one of - * the consequences' Feature values, then select only consequences that match - * the value (i.e. consequences for the current transcript sequence). If not, - * take all consequences (this is the case when adding features to the gene - * sequence). + * If myConsequence is not null, then this is the specific + * consequence data (pipe-delimited fields) that is for the current allele and + * transcript (sequence) being processed) * * @param variant - * @param seq * @param sf - * @param altAlelleIndex - * (0, 1..) + * @param myConsequence */ - protected void addConsequences(VariantContext variant, SequenceI seq, - SequenceFeature sf, int altAlelleIndex) + protected void addConsequences(VariantContext variant, SequenceFeature sf, + String myConsequence) { Object value = variant.getAttribute(CSQ_FIELD); - if (value == null || !(value instanceof ArrayList)) + if (value == null || !(value instanceof List)) { return; } @@ -890,43 +1265,17 @@ public class VCFLoader List consequences = (List) value; /* - * if CSQ data includes 'Feature', and any value matches the sequence name, - * then restrict consequence data to only the matching value (transcript) - * i.e. just pick out consequences for the transcript the variant feature is on - */ - String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase(); - String matchFeature = null; - if (csqFeatureFieldIndex > -1) - { - for (String consequence : consequences) - { - String[] csqFields = consequence.split(PIPE_REGEX); - if (csqFields.length > csqFeatureFieldIndex) - { - String featureIdentifier = csqFields[csqFeatureFieldIndex]; - if (featureIdentifier.length() > 4 - && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) - { - matchFeature = featureIdentifier; - } - } - } - } - - /* - * inspect CSQ consequences; where possible restrict to the consequence + * inspect CSQ consequences; restrict to the consequence * associated with the current transcript (Feature) */ - SortedMap csqValues = new TreeMap<>( - String.CASE_INSENSITIVE_ORDER); + Map csqValues = new HashMap<>(); for (String consequence : consequences) { - String[] csqFields = consequence.split(PIPE_REGEX); - - if (includeConsequence(csqFields, matchFeature, variant, - altAlelleIndex)) + if (myConsequence == null || myConsequence.equals(consequence)) { + String[] csqFields = consequence.split(PIPE_REGEX); + /* * inspect individual fields of this consequence, copying non-null * values which are 'fields of interest' @@ -954,72 +1303,6 @@ public class VCFLoader } /** - * Answers true if we want to associate this block of consequence data with - * the specified alternate allele of the VCF variant. - *

- * If consequence data includes the ALLELE_NUM field, then this has to match - * altAlleleIndex. Otherwise the Allele field of the consequence data has to - * match the allele value. - *

- * Optionally (if matchFeature is not null), restrict to only include - * consequences whose Feature value matches. This allows us to attach - * consequences to their respective transcripts. - * - * @param csqFields - * @param matchFeature - * @param variant - * @param altAlelleIndex - * (0, 1..) - * @return - */ - protected boolean includeConsequence(String[] csqFields, - String matchFeature, VariantContext variant, int altAlelleIndex) - { - /* - * check consequence is for the current transcript - */ - if (matchFeature != null) - { - if (csqFields.length <= csqFeatureFieldIndex) - { - return false; - } - String featureIdentifier = csqFields[csqFeatureFieldIndex]; - if (!featureIdentifier.equals(matchFeature)) - { - return false; // consequence is for a different transcript - } - } - - /* - * if ALLELE_NUM is present, it must match altAlleleIndex - * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex - */ - if (csqAlleleNumberFieldIndex > -1) - { - if (csqFields.length <= csqAlleleNumberFieldIndex) - { - return false; - } - String alleleNum = csqFields[csqAlleleNumberFieldIndex]; - return String.valueOf(altAlelleIndex + 1).equals(alleleNum); - } - - /* - * else consequence allele must match variant allele - */ - if (csqAlleleFieldIndex > -1 && csqFields.length > csqAlleleFieldIndex) - { - String csqAllele = csqFields[csqAlleleFieldIndex]; - String vcfAllele = variant.getAlternateAllele(altAlelleIndex) - .getBaseString(); - return csqAllele.equals(vcfAllele); - } - - return false; - } - - /** * A convenience method to complement a dna base and return the string value * of its complement *