X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=5adc55c0764ef1eacac1d74391bb4ad7380f1a74;hb=14193747f3831242bc7dfac12394eb20eb0ba480;hp=e381b264ffde9cecb38055a211d572160e601fdd;hpb=a57976ba40e1abe6d7c1940386e1a25419ef9c9d;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index e381b26..5adc55c 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -6,6 +6,7 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import jalview.analysis.AlignmentUtils; @@ -17,6 +18,9 @@ import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.datamodel.features.FeatureAttributeType; +import jalview.datamodel.features.FeatureSource; +import jalview.datamodel.features.FeatureSources; import jalview.ext.ensembl.EnsemblMap; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.Gff3Helper; @@ -116,6 +120,12 @@ public class VCFLoader private int csqAlleleNumberFieldIndex = -1; private int csqFeatureFieldIndex = -1; + /* + * a unique identifier under which to save metadata about feature + * attributes (selected INFO field data) + */ + private String sourceId; + /** * Constructor given an alignment context * @@ -175,18 +185,19 @@ public class VCFLoader reader = new VCFReader(filePath); header = reader.getFileHeader(); - VCFHeaderLine ref = header - .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + + sourceId = filePath; + + saveMetadata(sourceId); /* * get offset of CSQ ALLELE_NUM and Feature if declared */ locateCsqFields(); - // check if reference is wrt assembly19 (GRCh37) - // todo may need to allow user to specify reference assembly? - boolean isRefGrch37 = (ref != null && ref.getValue().contains( - "assembly19")); + VCFHeaderLine ref = header + .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + String vcfAssembly = ref.getValue(); int varCount = 0; int seqCount = 0; @@ -196,7 +207,7 @@ public class VCFLoader */ for (SequenceI seq : al.getSequences()) { - int added = loadSequenceVCF(seq, reader, isRefGrch37); + int added = loadSequenceVCF(seq, reader, vcfAssembly); if (added > 0) { seqCount++; @@ -239,6 +250,47 @@ public class VCFLoader } /** + * Reads metadata (such as INFO field descriptions and datatypes) and saves + * them for future reference + * + * @param sourceId + */ + void saveMetadata(String sourceId) + { + FeatureSource metadata = new FeatureSource(sourceId); + + for (VCFInfoHeaderLine info : header.getInfoHeaderLines()) + { + String attributeId = info.getID(); + String desc = info.getDescription(); + VCFHeaderLineType type = info.getType(); + FeatureAttributeType attType = null; + switch (type) + { + case Character: + attType = FeatureAttributeType.Character; + break; + case Flag: + attType = FeatureAttributeType.Flag; + break; + case Float: + attType = FeatureAttributeType.Float; + break; + case Integer: + attType = FeatureAttributeType.Integer; + break; + case String: + attType = FeatureAttributeType.String; + break; + } + metadata.setAttributeName(attributeId, desc); + metadata.setAttributeType(attributeId, attType); + } + + FeatureSources.getInstance().addSource(sourceId, metadata); + } + + /** * Records the position of selected fields defined in the CSQ INFO header (if * there is one). CSQ fields are declared in the CSQ INFO Description e.g. *

@@ -336,16 +388,17 @@ public class VCFLoader /** * Tries to add overlapping variants read from a VCF file to the given * sequence, and returns the number of variant features added. Note that this - * requires the sequence to hold information as to its chromosomal positions - * and reference, in order to be able to map the VCF variants to the sequence. + * requires the sequence to hold information as to its species, chromosomal + * positions and reference assembly, in order to be able to map the VCF + * variants to the sequence (or not) * * @param seq * @param reader - * @param isVcfRefGrch37 + * @param vcfAssembly * @return */ protected int loadSequenceVCF(SequenceI seq, VCFReader reader, - boolean isVcfRefGrch37) + String vcfAssembly) { int count = 0; GeneLociI seqCoords = seq.getGeneLoci(); @@ -357,16 +410,55 @@ public class VCFLoader return 0; } + if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId())) + { + return 0; + } + List seqChromosomalContigs = seqCoords.getMap().getToRanges(); for (int[] range : seqChromosomalContigs) { - count += addVcfVariants(seq, reader, range, isVcfRefGrch37); + count += addVcfVariants(seq, reader, range, vcfAssembly); } return count; } /** + * Answers true if the species inferred from the VCF reference identifier + * matches that for the sequence + * + * @param vcfAssembly + * @param speciesId + * @return + */ + boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId) + { + // PROBLEM 1 + // there are many aliases for species - how to equate one with another? + // PROBLEM 2 + // VCF ##reference header is an unstructured URI - how to extract species? + // perhaps check if ref includes any (Ensembl) alias of speciesId?? + // TODO ask the user to confirm this?? + + if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example + && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id + { + return true; + } + + if (vcfAssembly.contains("c_elegans") // VEP VCF response example + && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl + { + return true; + } + + // this is not a sustainable solution... + + return false; + } + + /** * Queries the VCF reader for any variants that overlap the given chromosome * region of the sequence, and adds as variant features. Returns the number of * overlapping variants found. @@ -376,12 +468,12 @@ public class VCFLoader * @param range * start-end range of a sequence region in its chromosomal * coordinates - * @param isVcfRefGrch37 - * true if the VCF is with reference to GRCh37 + * @param vcfAssembly + * the '##reference' identifier for the VCF reference assembly * @return */ protected int addVcfVariants(SequenceI seq, VCFReader reader, - int[] range, boolean isVcfRefGrch37) + int[] range, String vcfAssembly) { GeneLociI seqCoords = seq.getGeneLoci(); @@ -390,22 +482,24 @@ public class VCFLoader String species = seqCoords.getSpeciesId(); /* - * map chromosomal coordinates from GRCh38 (sequence) to - * GRCh37 (VCF) if necessary + * map chromosomal coordinates from sequence to VCF if the VCF + * data has a different reference assembly to the sequence */ - // TODO generalise for other assemblies and species + // TODO generalise for non-human species + // - or get the user to choose in a dialog + int offset = 0; - String fromRef = "GRCh38"; - if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37) + if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl + && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD { String toRef = "GRCh37"; int[] newRange = mapReferenceRange(range, chromosome, "human", - fromRef, toRef); + seqRef, toRef); if (newRange == null) { System.err.println(String.format( "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome, - fromRef, range[0], range[1], toRef)); + seqRef, range[0], range[1], toRef)); return 0; } offset = newRange[0] - range[0]; @@ -575,6 +669,7 @@ public class VCFLoader SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, featureEnd, score, FEATURE_GROUP_VCF); + sf.setSource(sourceId); sf.setValue(Gff3Helper.ALLELES, alleles); @@ -856,8 +951,7 @@ public class VCFLoader */ EnsemblMap mapper = new EnsemblMap(); int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef, - toRef, - queryRange); + toRef, queryRange); if (mapping == null) {