*/
locateCsqFields();
- // check if reference is wrt assembly19 (GRCh37)
- // todo may need to allow user to specify reference assembly?
- boolean isRefGrch37 = (ref != null && ref.getValue().contains(
- "assembly19"));
+ String vcfAssembly = ref.getValue();
int varCount = 0;
int seqCount = 0;
*/
for (SequenceI seq : al.getSequences())
{
- int added = loadSequenceVCF(seq, reader, isRefGrch37);
+ int added = loadSequenceVCF(seq, reader, vcfAssembly);
if (added > 0)
{
seqCount++;
/**
* Tries to add overlapping variants read from a VCF file to the given
* sequence, and returns the number of variant features added. Note that this
- * requires the sequence to hold information as to its chromosomal positions
- * and reference, in order to be able to map the VCF variants to the sequence.
+ * requires the sequence to hold information as to its species, chromosomal
+ * positions and reference assembly, in order to be able to map the VCF
+ * variants to the sequence (or not)
*
* @param seq
* @param reader
- * @param isVcfRefGrch37
+ * @param vcfAssembly
* @return
*/
protected int loadSequenceVCF(SequenceI seq, VCFReader reader,
- boolean isVcfRefGrch37)
+ String vcfAssembly)
{
int count = 0;
GeneLociI seqCoords = seq.getGeneLoci();
return 0;
}
+ if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId()))
+ {
+ return 0;
+ }
+
List<int[]> seqChromosomalContigs = seqCoords.getMap().getToRanges();
for (int[] range : seqChromosomalContigs)
{
- count += addVcfVariants(seq, reader, range, isVcfRefGrch37);
+ count += addVcfVariants(seq, reader, range, vcfAssembly);
}
return count;
}
/**
+ * Answers true if the species inferred from the VCF reference identifier
+ * matches that for the sequence
+ *
+ * @param vcfAssembly
+ * @param speciesId
+ * @return
+ */
+ boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId)
+ {
+ // PROBLEM 1
+ // there are many aliases for species - how to equate one with another?
+ // PROBLEM 2
+ // VCF ##reference header is an unstructured URI - how to extract species?
+ // perhaps check if ref includes any (Ensembl) alias of speciesId??
+ // TODO ask the user to confirm this??
+
+ if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example
+ && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id
+ {
+ return true;
+ }
+
+ if (vcfAssembly.contains("c_elegans") // VEP VCF response example
+ && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl
+ {
+ return true;
+ }
+
+ // this is not a sustainable solution...
+
+ return false;
+ }
+
+ /**
* Queries the VCF reader for any variants that overlap the given chromosome
* region of the sequence, and adds as variant features. Returns the number of
* overlapping variants found.
* @param range
* start-end range of a sequence region in its chromosomal
* coordinates
- * @param isVcfRefGrch37
- * true if the VCF is with reference to GRCh37
+ * @param vcfAssembly
+ * the '##reference' identifier for the VCF reference assembly
* @return
*/
protected int addVcfVariants(SequenceI seq, VCFReader reader,
- int[] range, boolean isVcfRefGrch37)
+ int[] range, String vcfAssembly)
{
GeneLociI seqCoords = seq.getGeneLoci();
String species = seqCoords.getSpeciesId();
/*
- * map chromosomal coordinates from GRCh38 (sequence) to
- * GRCh37 (VCF) if necessary
+ * map chromosomal coordinates from sequence to VCF if the VCF
+ * data has a different reference assembly to the sequence
*/
- // TODO generalise for other assemblies and species
+ // TODO generalise for non-human species
+ // - or get the user to choose in a dialog
+
int offset = 0;
- String fromRef = "GRCh38";
- if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37)
+ if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
+ && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
{
String toRef = "GRCh37";
int[] newRange = mapReferenceRange(range, chromosome, "human",
- fromRef, toRef);
+ seqRef, toRef);
if (newRange == null)
{
System.err.println(String.format(
"Failed to map %s:%s:%s:%d:%d to %s", species, chromosome,
- fromRef, range[0], range[1], toRef));
+ seqRef, range[0], range[1], toRef));
return 0;
}
offset = newRange[0] - range[0];
private static final String[] VCF = { "##fileformat=VCFv4.2",
"##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency, for each ALT allele, in the same order as listed\">",
- "##reference=GRCh38",
+ "##reference=Homo_sapiens/GRCh38",
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
// A/T,C variants in position 2 of gene sequence (precedes transcript)
// should create 2 variant features with respective scores
SequenceI gene1 = alignment.findName("gene1");
int[] to = new int[] { 45051610, 45051634 };
int[] from = new int[] { gene1.getStart(), gene1.getEnd() };
- gene1.setGeneLoci("human", "GRCh38", "17", new MapList(from, to, 1, 1));
+ gene1.setGeneLoci("homo_sapiens", "GRCh38", "17", new MapList(from, to,
+ 1, 1));
/*
* map 'transcript1' to chromosome via 'gene1'
to = new int[] { 45051612, 45051619, 45051624, 45051633 };
SequenceI transcript1 = alignment.findName("transcript1");
from = new int[] { transcript1.getStart(), transcript1.getEnd() };
- transcript1.setGeneLoci("human", "GRCh38", "17", new MapList(from, to,
+ transcript1.setGeneLoci("homo_sapiens", "GRCh38", "17", new MapList(
+ from, to,
1, 1));
/*
SequenceI gene2 = alignment.findName("gene2");
to = new int[] { 45051634, 45051610 };
from = new int[] { gene2.getStart(), gene2.getEnd() };
- gene2.setGeneLoci("human", "GRCh38", "17", new MapList(from, to, 1, 1));
+ gene2.setGeneLoci("homo_sapiens", "GRCh38", "17", new MapList(from, to,
+ 1, 1));
/*
* map 'transcript2' to chromosome via 'gene2'
to = new int[] { 45051633, 45051624, 45051619, 45051612 };
SequenceI transcript2 = alignment.findName("transcript2");
from = new int[] { transcript2.getStart(), transcript2.getEnd() };
- transcript2.setGeneLoci("human", "GRCh38", "17", new MapList(from, to,
+ transcript2.setGeneLoci("homo_sapiens", "GRCh38", "17", new MapList(
+ from, to,
1, 1));
/*
SequenceI gene3 = alignment.findName("gene3");
to = new int[] { 45051610, 45051634 };
from = new int[] { gene3.getStart(), gene3.getEnd() };
- gene3.setGeneLoci("human", "GRCh38", "5", new MapList(from, to, 1, 1));
+ gene3.setGeneLoci("homo_sapiens", "GRCh38", "5", new MapList(from, to,
+ 1, 1));
/*
* map 'transcript3' to chromosome
SequenceI transcript3 = alignment.findName("transcript3");
to = new int[] { 45051612, 45051619, 45051624, 45051633 };
from = new int[] { transcript3.getStart(), transcript3.getEnd() };
- transcript3.setGeneLoci("human", "GRCh38", "5", new MapList(from, to,
+ transcript3.setGeneLoci("homo_sapiens", "GRCh38", "5", new MapList(
+ from, to,
1, 1));
/*
to = new int[] { 45051615, 45051617, 45051619, 45051632, 45051634,
45051634 };
from = new int[] { transcript4.getStart(), transcript4.getEnd() };
- transcript4.setGeneLoci("human", "GRCh38", "5", new MapList(from, to,
+ transcript4.setGeneLoci("homo_sapiens", "GRCh38", "5", new MapList(
+ from, to,
1, 1));
/*
##INFO=<ID=AF_Female,Number=R,Type=Float,Description="Allele Frequency among Female genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|PolyPhen">
-##reference=GRCh38
+##reference=/Homo_sapiens/GRCh38
#CHROM POS ID REF ALT QUAL FILTER INFO
5 45051610 . C A 81.96 RF;AC0 AC=1;AF=0.1;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=A|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,A|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad
5 45051614 . C T 1666.64 RF AC=1;AF=0.2;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=T|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,T|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad