*/
public class VCFLoader
{
- private static final String DEFAULT_SPECIES = "homo_sapiens";
-
/**
* A class to model the mapping from sequence to VCF coordinates. Cases include
* <ul>
/*
* Lookup keys, and default values, for Preference entries that describe
- * patterns for VCF and VEP fields to capture
+ * patterns for VCF and VEP fields to capture
*/
private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
/*
- * Lookup keys, and default values, for Preference entries that give
- * mappings from tokens in the 'reference' header to species or assembly
- */
- private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY";
-
- private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh38,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38";
-
- private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human
-
- /*
* keys to fields of VEP CSQ consequence data
* see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
*/
private static final String PIPE_REGEX = "\\|";
/*
+ * key for Allele Frequency output by VEP
+ * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
+ */
+ private static final String ALLELE_FREQUENCY_KEY = "AF";
+
+ /*
* delimiter that separates multiple consequence data blocks
*/
private static final String COMMA = ",";
private VCFHeader header;
/*
- * species (as a valid Ensembl term) the VCF is for
- */
- private String vcfSpecies;
-
- /*
- * genome assembly version (as a valid Ensembl identifier) the VCF is for
- */
- private String vcfAssembly;
-
- /*
* a Dictionary of contigs (if present) referenced in the VCF file
*/
private SAMSequenceDictionary dictionary;
*/
public SequenceI loadVCFContig(String contig)
{
- VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
- String ref = headerLine == null ? null : headerLine.getValue();
+ String ref = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY)
+ .getValue();
if (ref.startsWith("file://"))
{
ref = ref.substring(7);
}
- setSpeciesAndAssembly(ref);
SequenceI seq = null;
File dbFile = new File(ref);
{
HtsContigDb db = new HtsContigDb("", dbFile);
seq = db.getSequenceProxy(contig);
- loadSequenceVCF(seq);
+ loadSequenceVCF(seq, ref);
db.close();
}
else
{
VCFHeaderLine ref = header
.getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
- String reference = ref.getValue();
-
- setSpeciesAndAssembly(reference);
+ String vcfAssembly = ref.getValue();
int varCount = 0;
int seqCount = 0;
*/
for (SequenceI seq : seqs)
{
- int added = loadSequenceVCF(seq);
+ int added = loadSequenceVCF(seq, vcfAssembly);
if (added > 0)
{
seqCount++;
}
/**
- * Attempts to determine and save the species and genome assembly version to
- * which the VCF data applies. This may be done by parsing the {@code reference}
- * header line, configured in a property file, or (potentially) confirmed
- * interactively by the user.
- * <p>
- * The saved values should be identifiers valid for Ensembl's REST service
- * {@code map} endpoint, so they can be used (if necessary) to retrieve the
- * mapping between VCF coordinates and sequence coordinates.
- *
- * @param reference
- * @see https://rest.ensembl.org/documentation/info/assembly_map
- * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
- * @see https://rest.ensembl.org/info/species?content-type=text/xml
- */
- protected void setSpeciesAndAssembly(String reference)
- {
- reference = reference.toLowerCase();
- vcfSpecies = DEFAULT_SPECIES;
-
- /*
- * for a non-human species, or other assembly identifier,
- * specify as a Jalview property file entry e.g.
- * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
- * VCF_SPECIES = c_elegans=celegans
- * to map a token in the reference header to a value
- */
- String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
- for (String token : prop.split(","))
- {
- String[] tokens = token.split("=");
- if (tokens.length == 2)
- {
- if (reference.contains(tokens[0].trim().toLowerCase()))
- {
- vcfAssembly = tokens[1].trim();
- break;
- }
- }
- }
-
- prop = Cache.getProperty(VCF_SPECIES);
- if (prop != null)
- {
- for (String token : prop.split(","))
- {
- String[] tokens = token.split("=");
- if (tokens.length == 2)
- {
- if (reference.contains(tokens[0].trim().toLowerCase()))
- {
- vcfSpecies = tokens[1].trim();
- break;
- }
- }
- }
- }
- }
-
- /**
* Opens the VCF file and parses header data
*
* @param filePath
* and returns the number of variant features added
*
* @param seq
+ * @param vcfAssembly
* @return
*/
- protected int loadSequenceVCF(SequenceI seq)
+ protected int loadSequenceVCF(SequenceI seq, String vcfAssembly)
{
- VCFMap vcfMap = getVcfMap(seq);
+ VCFMap vcfMap = getVcfMap(seq, vcfAssembly);
if (vcfMap == null)
{
return 0;
* Answers a map from sequence coordinates to VCF chromosome ranges
*
* @param seq
+ * @param vcfAssembly
* @return
*/
- private VCFMap getVcfMap(SequenceI seq)
+ private VCFMap getVcfMap(SequenceI seq, String vcfAssembly)
{
/*
* simplest case: sequence has id and length matching a VCF contig
String seqRef = seqCoords.getAssemblyId();
MapList map = seqCoords.getMap();
- // note this requires the configured species to match that
- // returned with the Ensembl sequence; todo: support aliases?
- if (!vcfSpecies.equalsIgnoreCase(species))
+ if (!vcfSpeciesMatchesSequence(vcfAssembly, species))
{
- Cache.log.warn("No VCF loaded to " + seq.getName()
- + " as species not matched");
return null;
}
- if (seqRef.equalsIgnoreCase(vcfAssembly))
+ if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef))
{
return new VCFMap(chromosome, map);
}
+ if (!"GRCh38".equalsIgnoreCase(seqRef) // Ensembl
+ || !vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
+ {
+ return null;
+ }
+
/*
- * VCF data has a different reference assembly to the sequence:
- * query Ensembl to map chromosomal coordinates from sequence to VCF
+ * map chromosomal coordinates from sequence to VCF if the VCF
+ * data has a different reference assembly to the sequence
*/
+ // TODO generalise for cases other than GRCh38 -> GRCh37 !
+ // - or get the user to choose in a dialog
+
List<int[]> toVcfRanges = new ArrayList<>();
List<int[]> fromSequenceRanges = new ArrayList<>();
+ String toRef = "GRCh37";
for (int[] range : map.getToRanges())
{
}
int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
- vcfAssembly);
+ toRef);
if (newRange == null)
{
Cache.log.error(
String.format("Failed to map %s:%s:%s:%d:%d to %s", species,
- chromosome, seqRef, range[0], range[1],
- vcfAssembly));
+ chromosome, seqRef, range[0], range[1], toRef));
continue;
}
else
}
/**
+ * Answers true if we determine that the VCF data uses the same reference
+ * assembly as the sequence, else false
+ *
+ * @param vcfAssembly
+ * @param seqRef
+ * @return
+ */
+ private boolean vcfAssemblyMatchesSequence(String vcfAssembly,
+ String seqRef)
+ {
+ // TODO improve on this stub, which handles gnomAD and
+ // hopes for the best for other cases
+
+ if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
+ && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
+ {
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Answers true if the species inferred from the VCF reference identifier
+ * matches that for the sequence
+ *
+ * @param vcfAssembly
+ * @param speciesId
+ * @return
+ */
+ boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId)
+ {
+ // PROBLEM 1
+ // there are many aliases for species - how to equate one with another?
+ // PROBLEM 2
+ // VCF ##reference header is an unstructured URI - how to extract species?
+ // perhaps check if ref includes any (Ensembl) alias of speciesId??
+ // TODO ask the user to confirm this??
+
+ if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example
+ && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id
+ {
+ return true;
+ }
+
+ if (vcfAssembly.contains("c_elegans") // VEP VCF response example
+ && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl
+ {
+ return true;
+ }
+
+ // this is not a sustainable solution...
+
+ return false;
+ }
+
+ /**
* Queries the VCF reader for any variants that overlap the mapped chromosome
* ranges of the sequence, and adds as variant features. Returns the number of
* overlapping variants found.