From 0e1130c3857c4cb86acec5fed9b4a7aa779e621b Mon Sep 17 00:00:00 2001 From: gmungoc Date: Tue, 24 Oct 2017 17:03:40 +0100 Subject: [PATCH] JAL-2743 slightly fancier species/assembly matching for VCF/sequence --- src/jalview/io/vcf/VCFLoader.java | 79 ++++++++++++++++++++++++-------- test/jalview/io/vcf/VCFLoaderTest.java | 23 ++++++---- test/jalview/io/vcf/testVcf.dat | 2 +- 3 files changed, 75 insertions(+), 29 deletions(-) diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index e381b26..af7bb0b 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -183,10 +183,7 @@ public class VCFLoader */ locateCsqFields(); - // check if reference is wrt assembly19 (GRCh37) - // todo may need to allow user to specify reference assembly? - boolean isRefGrch37 = (ref != null && ref.getValue().contains( - "assembly19")); + String vcfAssembly = ref.getValue(); int varCount = 0; int seqCount = 0; @@ -196,7 +193,7 @@ public class VCFLoader */ for (SequenceI seq : al.getSequences()) { - int added = loadSequenceVCF(seq, reader, isRefGrch37); + int added = loadSequenceVCF(seq, reader, vcfAssembly); if (added > 0) { seqCount++; @@ -336,16 +333,17 @@ public class VCFLoader /** * Tries to add overlapping variants read from a VCF file to the given * sequence, and returns the number of variant features added. Note that this - * requires the sequence to hold information as to its chromosomal positions - * and reference, in order to be able to map the VCF variants to the sequence. + * requires the sequence to hold information as to its species, chromosomal + * positions and reference assembly, in order to be able to map the VCF + * variants to the sequence (or not) * * @param seq * @param reader - * @param isVcfRefGrch37 + * @param vcfAssembly * @return */ protected int loadSequenceVCF(SequenceI seq, VCFReader reader, - boolean isVcfRefGrch37) + String vcfAssembly) { int count = 0; GeneLociI seqCoords = seq.getGeneLoci(); @@ -357,16 +355,55 @@ public class VCFLoader return 0; } + if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId())) + { + return 0; + } + List seqChromosomalContigs = seqCoords.getMap().getToRanges(); for (int[] range : seqChromosomalContigs) { - count += addVcfVariants(seq, reader, range, isVcfRefGrch37); + count += addVcfVariants(seq, reader, range, vcfAssembly); } return count; } /** + * Answers true if the species inferred from the VCF reference identifier + * matches that for the sequence + * + * @param vcfAssembly + * @param speciesId + * @return + */ + boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId) + { + // PROBLEM 1 + // there are many aliases for species - how to equate one with another? + // PROBLEM 2 + // VCF ##reference header is an unstructured URI - how to extract species? + // perhaps check if ref includes any (Ensembl) alias of speciesId?? + // TODO ask the user to confirm this?? + + if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example + && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id + { + return true; + } + + if (vcfAssembly.contains("c_elegans") // VEP VCF response example + && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl + { + return true; + } + + // this is not a sustainable solution... + + return false; + } + + /** * Queries the VCF reader for any variants that overlap the given chromosome * region of the sequence, and adds as variant features. Returns the number of * overlapping variants found. @@ -376,12 +413,12 @@ public class VCFLoader * @param range * start-end range of a sequence region in its chromosomal * coordinates - * @param isVcfRefGrch37 - * true if the VCF is with reference to GRCh37 + * @param vcfAssembly + * the '##reference' identifier for the VCF reference assembly * @return */ protected int addVcfVariants(SequenceI seq, VCFReader reader, - int[] range, boolean isVcfRefGrch37) + int[] range, String vcfAssembly) { GeneLociI seqCoords = seq.getGeneLoci(); @@ -390,22 +427,24 @@ public class VCFLoader String species = seqCoords.getSpeciesId(); /* - * map chromosomal coordinates from GRCh38 (sequence) to - * GRCh37 (VCF) if necessary + * map chromosomal coordinates from sequence to VCF if the VCF + * data has a different reference assembly to the sequence */ - // TODO generalise for other assemblies and species + // TODO generalise for non-human species + // - or get the user to choose in a dialog + int offset = 0; - String fromRef = "GRCh38"; - if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37) + if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl + && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD { String toRef = "GRCh37"; int[] newRange = mapReferenceRange(range, chromosome, "human", - fromRef, toRef); + seqRef, toRef); if (newRange == null) { System.err.println(String.format( "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome, - fromRef, range[0], range[1], toRef)); + seqRef, range[0], range[1], toRef)); return 0; } offset = newRange[0] - range[0]; diff --git a/test/jalview/io/vcf/VCFLoaderTest.java b/test/jalview/io/vcf/VCFLoaderTest.java index 4a254d2..5607b4b 100644 --- a/test/jalview/io/vcf/VCFLoaderTest.java +++ b/test/jalview/io/vcf/VCFLoaderTest.java @@ -55,7 +55,7 @@ public class VCFLoaderTest private static final String[] VCF = { "##fileformat=VCFv4.2", "##INFO=", - "##reference=GRCh38", + "##reference=Homo_sapiens/GRCh38", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", // A/T,C variants in position 2 of gene sequence (precedes transcript) // should create 2 variant features with respective scores @@ -191,7 +191,8 @@ public class VCFLoaderTest SequenceI gene1 = alignment.findName("gene1"); int[] to = new int[] { 45051610, 45051634 }; int[] from = new int[] { gene1.getStart(), gene1.getEnd() }; - gene1.setGeneLoci("human", "GRCh38", "17", new MapList(from, to, 1, 1)); + gene1.setGeneLoci("homo_sapiens", "GRCh38", "17", new MapList(from, to, + 1, 1)); /* * map 'transcript1' to chromosome via 'gene1' @@ -201,7 +202,8 @@ public class VCFLoaderTest to = new int[] { 45051612, 45051619, 45051624, 45051633 }; SequenceI transcript1 = alignment.findName("transcript1"); from = new int[] { transcript1.getStart(), transcript1.getEnd() }; - transcript1.setGeneLoci("human", "GRCh38", "17", new MapList(from, to, + transcript1.setGeneLoci("homo_sapiens", "GRCh38", "17", new MapList( + from, to, 1, 1)); /* @@ -210,7 +212,8 @@ public class VCFLoaderTest SequenceI gene2 = alignment.findName("gene2"); to = new int[] { 45051634, 45051610 }; from = new int[] { gene2.getStart(), gene2.getEnd() }; - gene2.setGeneLoci("human", "GRCh38", "17", new MapList(from, to, 1, 1)); + gene2.setGeneLoci("homo_sapiens", "GRCh38", "17", new MapList(from, to, + 1, 1)); /* * map 'transcript2' to chromosome via 'gene2' @@ -220,7 +223,8 @@ public class VCFLoaderTest to = new int[] { 45051633, 45051624, 45051619, 45051612 }; SequenceI transcript2 = alignment.findName("transcript2"); from = new int[] { transcript2.getStart(), transcript2.getEnd() }; - transcript2.setGeneLoci("human", "GRCh38", "17", new MapList(from, to, + transcript2.setGeneLoci("homo_sapiens", "GRCh38", "17", new MapList( + from, to, 1, 1)); /* @@ -248,7 +252,8 @@ public class VCFLoaderTest SequenceI gene3 = alignment.findName("gene3"); to = new int[] { 45051610, 45051634 }; from = new int[] { gene3.getStart(), gene3.getEnd() }; - gene3.setGeneLoci("human", "GRCh38", "5", new MapList(from, to, 1, 1)); + gene3.setGeneLoci("homo_sapiens", "GRCh38", "5", new MapList(from, to, + 1, 1)); /* * map 'transcript3' to chromosome @@ -256,7 +261,8 @@ public class VCFLoaderTest SequenceI transcript3 = alignment.findName("transcript3"); to = new int[] { 45051612, 45051619, 45051624, 45051633 }; from = new int[] { transcript3.getStart(), transcript3.getEnd() }; - transcript3.setGeneLoci("human", "GRCh38", "5", new MapList(from, to, + transcript3.setGeneLoci("homo_sapiens", "GRCh38", "5", new MapList( + from, to, 1, 1)); /* @@ -266,7 +272,8 @@ public class VCFLoaderTest to = new int[] { 45051615, 45051617, 45051619, 45051632, 45051634, 45051634 }; from = new int[] { transcript4.getStart(), transcript4.getEnd() }; - transcript4.setGeneLoci("human", "GRCh38", "5", new MapList(from, to, + transcript4.setGeneLoci("homo_sapiens", "GRCh38", "5", new MapList( + from, to, 1, 1)); /* diff --git a/test/jalview/io/vcf/testVcf.dat b/test/jalview/io/vcf/testVcf.dat index e9e6c22..77e070c 100644 --- a/test/jalview/io/vcf/testVcf.dat +++ b/test/jalview/io/vcf/testVcf.dat @@ -4,7 +4,7 @@ ##INFO= ##INFO= ##INFO= -##reference=GRCh38 +##reference=/Homo_sapiens/GRCh38 #CHROM POS ID REF ALT QUAL FILTER INFO 5 45051610 . C A 81.96 RF;AC0 AC=1;AF=0.1;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=A|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,A|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad 5 45051614 . C T 1666.64 RF AC=1;AF=0.2;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=T|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,T|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad -- 1.7.10.2