*/
public class VCFLoader
{
+ private static final String DEFAULT_SPECIES = "homo_sapiens";
+
/**
* A class to model the mapping from sequence to VCF coordinates. Cases include
* <ul>
/*
* Lookup keys, and default values, for Preference entries that describe
- * patterns for VCF and VEP fields to capture
+ * patterns for VCF and VEP fields to capture
*/
private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
/*
+ * Lookup keys, and default values, for Preference entries that give
+ * mappings from tokens in the 'reference' header to species or assembly
+ */
+ private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY";
+
+ private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh38,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38";
+
+ private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human
+
+ /*
* keys to fields of VEP CSQ consequence data
* see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
*/
private static final String PIPE_REGEX = "\\|";
/*
- * key for Allele Frequency output by VEP
- * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
- */
- private static final String ALLELE_FREQUENCY_KEY = "AF";
-
- /*
* delimiter that separates multiple consequence data blocks
*/
private static final String COMMA = ",";
private VCFHeader header;
/*
+ * species (as a valid Ensembl term) the VCF is for
+ */
+ private String vcfSpecies;
+
+ /*
+ * genome assembly version (as a valid Ensembl identifier) the VCF is for
+ */
+ private String vcfAssembly;
+
+ /*
* a Dictionary of contigs (if present) referenced in the VCF file
*/
private SAMSequenceDictionary dictionary;
{
ref = ref.substring(7);
}
+ setSpeciesAndAssembly(ref);
SequenceI seq = null;
File dbFile = new File(ref);
{
HtsContigDb db = new HtsContigDb("", dbFile);
seq = db.getSequenceProxy(contig);
- loadSequenceVCF(seq, ref);
+ loadSequenceVCF(seq);
db.close();
}
else
{
VCFHeaderLine ref = header
.getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
- String vcfAssembly = ref.getValue();
+ String reference = ref.getValue();
+
+ setSpeciesAndAssembly(reference);
int varCount = 0;
int seqCount = 0;
*/
for (SequenceI seq : seqs)
{
- int added = loadSequenceVCF(seq, vcfAssembly);
+ int added = loadSequenceVCF(seq);
if (added > 0)
{
seqCount++;
}
/**
+ * Attempts to determine and save the species and genome assembly version to
+ * which the VCF data applies. This may be done by parsing the {@code reference}
+ * header line, configured in a property file, or (potentially) confirmed
+ * interactively by the user.
+ * <p>
+ * The saved values should be identifiers valid for Ensembl's REST service
+ * {@code map} endpoint, so they can be used (if necessary) to retrieve the
+ * mapping between VCF coordinates and sequence coordinates.
+ *
+ * @param reference
+ * @see https://rest.ensembl.org/documentation/info/assembly_map
+ * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
+ * @see https://rest.ensembl.org/info/species?content-type=text/xml
+ */
+ protected void setSpeciesAndAssembly(String reference)
+ {
+ vcfSpecies = DEFAULT_SPECIES;
+
+ /*
+ * for a non-human species, or other assembly identifier,
+ * specify as a Jalview property file entry e.g.
+ * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
+ * VCF_SPECIES = c_elegans=celegans
+ * to map a token in the reference header to a value
+ */
+ String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase()))
+ {
+ vcfAssembly = tokens[1].trim();
+ break;
+ }
+ }
+ }
+
+ prop = Cache.getProperty(VCF_SPECIES);
+ if (prop != null)
+ {
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase()))
+ {
+ vcfSpecies = tokens[1].trim();
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ /**
* Opens the VCF file and parses header data
*
* @param filePath
* and returns the number of variant features added
*
* @param seq
- * @param vcfAssembly
* @return
*/
- protected int loadSequenceVCF(SequenceI seq, String vcfAssembly)
+ protected int loadSequenceVCF(SequenceI seq)
{
- VCFMap vcfMap = getVcfMap(seq, vcfAssembly);
+ VCFMap vcfMap = getVcfMap(seq);
if (vcfMap == null)
{
return 0;
* Answers a map from sequence coordinates to VCF chromosome ranges
*
* @param seq
- * @param vcfAssembly
* @return
*/
- private VCFMap getVcfMap(SequenceI seq, String vcfAssembly)
+ private VCFMap getVcfMap(SequenceI seq)
{
/*
* simplest case: sequence has id and length matching a VCF contig
String seqRef = seqCoords.getAssemblyId();
MapList map = seqCoords.getMap();
- if (!vcfSpeciesMatchesSequence(vcfAssembly, species))
+ // note this requires the configured species to match that
+ // returned with the Ensembl sequence; todo: support aliases?
+ if (!vcfSpecies.equalsIgnoreCase(species))
{
+ Cache.log.warn("No VCF loaded to " + seq.getName()
+ + " as species not matched");
return null;
}
- if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef))
+ if (seqRef.equalsIgnoreCase(vcfAssembly))
{
return new VCFMap(chromosome, map);
}
- if (!"GRCh38".equalsIgnoreCase(seqRef) // Ensembl
- || !vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
- {
- return null;
- }
-
/*
- * map chromosomal coordinates from sequence to VCF if the VCF
- * data has a different reference assembly to the sequence
+ * VCF data has a different reference assembly to the sequence:
+ * query Ensembl to map chromosomal coordinates from sequence to VCF
*/
- // TODO generalise for cases other than GRCh38 -> GRCh37 !
- // - or get the user to choose in a dialog
-
List<int[]> toVcfRanges = new ArrayList<>();
List<int[]> fromSequenceRanges = new ArrayList<>();
- String toRef = "GRCh37";
for (int[] range : map.getToRanges())
{
}
int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
- toRef);
+ vcfAssembly);
if (newRange == null)
{
Cache.log.error(
String.format("Failed to map %s:%s:%s:%d:%d to %s", species,
- chromosome, seqRef, range[0], range[1], toRef));
+ chromosome, seqRef, range[0], range[1],
+ vcfAssembly));
continue;
}
else
}
/**
- * Answers true if we determine that the VCF data uses the same reference
- * assembly as the sequence, else false
- *
- * @param vcfAssembly
- * @param seqRef
- * @return
- */
- private boolean vcfAssemblyMatchesSequence(String vcfAssembly,
- String seqRef)
- {
- // TODO improve on this stub, which handles gnomAD and
- // hopes for the best for other cases
-
- if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
- && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
- {
- return false;
- }
- return true;
- }
-
- /**
* Answers true if the species inferred from the VCF reference identifier
* matches that for the sequence
*