X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=ac707d8f01cea046948f2809c07256b0739c0ae2;hb=ba2c0286e9a47af8ffd46057f91a22df0d4850ea;hp=20e3ccd48192504ad0c39bfcdda69be153a16749;hpb=27a06af565d224505f2484a9c74743fb3cf69be8;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 20e3ccd..ac707d8 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -1,10 +1,8 @@ package jalview.io.vcf; -import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; import jalview.bin.Cache; -import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; @@ -14,6 +12,7 @@ import jalview.datamodel.features.FeatureAttributeType; import jalview.datamodel.features.FeatureSource; import jalview.datamodel.features.FeatureSources; import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.HtsContigDb; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyI; @@ -21,12 +20,18 @@ import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.MessageManager; +import java.io.File; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -34,8 +39,10 @@ import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; +import htsjdk.tribble.TribbleException; import htsjdk.variant.variantcontext.Allele; 
import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; @@ -50,6 +57,33 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + private static final String ENCODED_COMMA = "%2C"; + + private static final String ENCODED_PERCENT = "%25"; + + private static final String ENCODED_EQUALS = "%3D"; + + private static final String ENCODED_SEMICOLON = "%3B"; + + private static final String ENCODED_COLON = "%3A"; + + private static final String UTF_8 = "UTF-8"; + + /* + * Jalview feature attributes for VCF fixed column data + */ + private static final String VCF_POS = "POS"; + + private static final String VCF_ID = "ID"; + + private static final String VCF_QUAL = "QUAL"; + + private static final String VCF_FILTER = "FILTER"; + + private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' + + private static final String DEFAULT_SPECIES = "homo_sapiens"; + /** * A class to model the mapping from sequence to VCF coordinates. Cases include *
* This method is not thread safe - concurrent threads should use separate * instances of this class. * - * @param filePath + * @param seqs * @param gui */ - public void loadVCF(final String filePath, - final AlignViewControllerGuiI gui) + public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui) { if (gui != null) { @@ -220,54 +284,70 @@ public class VCFLoader new Thread() { - @Override public void run() { - VCFLoader.this.doLoad(filePath, gui); + VCFLoader.this.doLoad(seqs, gui); } - }.start(); } /** - * Loads VCF on to an alignment - provided it can be related to one or more - * sequence's chromosomal coordinates + * Reads the specified contig sequence and adds its VCF variants to it * - * @param filePath - * @param gui - * optional callback handler for messages + * @param contig + * the id of a single sequence (contig) to load + * @return */ - protected void doLoad(String filePath, AlignViewControllerGuiI gui) + public SequenceI loadVCFContig(String contig) { - VCFReader reader = null; - try + VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + if (headerLine == null) { - // long start = System.currentTimeMillis(); - reader = new VCFReader(filePath); - - header = reader.getFileHeader(); - - try - { - dictionary = header.getSequenceDictionary(); - } catch (SAMException e) - { - // ignore - thrown if any contig line lacks length info - } + Cache.log.error("VCF reference header not found"); + return null; + } + String ref = headerLine.getValue(); + if (ref.startsWith("file://")) + { + ref = ref.substring(7); + } + setSpeciesAndAssembly(ref); - sourceId = filePath; + SequenceI seq = null; + File dbFile = new File(ref); - saveMetadata(sourceId); + if (dbFile.exists()) + { + HtsContigDb db = new HtsContigDb("", dbFile); + seq = db.getSequenceProxy(contig); + loadSequenceVCF(seq); + db.close(); + } + else + { + Cache.log.error("VCF reference not found: " + ref); + } - /* - * get offset of CSQ ALLELE_NUM and Feature if 
declared - */ - parseCsqHeader(); + return seq; + } + /** + * Loads VCF on to one or more sequences + * + * @param seqs + * @param gui + * optional callback handler for messages + */ + protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui) + { + try + { VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); - String vcfAssembly = ref.getValue(); + String reference = ref == null ? null : ref.getValue(); + + setSpeciesAndAssembly(reference); int varCount = 0; int seqCount = 0; @@ -275,9 +355,9 @@ public class VCFLoader /* * query for VCF overlapping each sequence in turn */ - for (SequenceI seq : al.getSequences()) + for (SequenceI seq : seqs) { - int added = loadSequenceVCF(seq, reader, vcfAssembly); + int added = loadSequenceVCF(seq); if (added > 0) { seqCount++; @@ -287,7 +367,6 @@ public class VCFLoader } if (gui != null) { - // long elapsed = System.currentTimeMillis() - start; String msg = MessageManager.formatMessage("label.added_vcf", varCount, seqCount); gui.setStatus(msg); @@ -322,6 +401,103 @@ public class VCFLoader } /** + * Attempts to determine and save the species and genome assembly version to + * which the VCF data applies. This may be done by parsing the {@code reference} + * header line, configured in a property file, or (potentially) confirmed + * interactively by the user. + *
+ * The saved values should be identifiers valid for Ensembl's REST service
+ * {@code map} endpoint, so they can be used (if necessary) to retrieve the
+ * mapping between VCF coordinates and sequence coordinates.
+ *
+ * @param reference
+ * @see https://rest.ensembl.org/documentation/info/assembly_map
+ * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
+ * @see https://rest.ensembl.org/info/species?content-type=text/xml
+ */
+ protected void setSpeciesAndAssembly(String reference)
+ {
+ if (reference == null)
+ {
+ Cache.log.error("No VCF ##reference found, defaulting to "
+ + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES);
+ reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified
+ }
+ reference = reference.toLowerCase(); // lookup keys are lower-cased below as well, so matching ignores case
+
+ /*
+ * for a non-human species, or other assembly identifier,
+ * specify as a Jalview property file entry e.g.
+ * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
+ * VCF_SPECIES = c_elegans=celegans
+ * to map a token in the reference header to a value
+ *
+ * Each property value is read as a comma-separated list of key=value
+ * entries. An entry that does not split into exactly two parts on '='
+ * is skipped. Keys and values are trimmed; the first entry whose key
+ * occurs anywhere in the (lower-cased) reference supplies the value
+ * and ends the search.
+ *
+ * Assembly lookup: uses property VCF_ASSEMBLY, falling back to
+ * DEFAULT_VCF_ASSEMBLY if the property is not set. If no entry matches,
+ * vcfAssembly is not assigned by this method.
+ */
+ String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase()))
+ {
+ vcfAssembly = tokens[1].trim();
+ break;
+ }
+ }
+ }
+
+ vcfSpecies = DEFAULT_SPECIES; // always reset; replaced below only if a VCF_SPECIES entry matches
+ prop = Cache.getProperty(VCF_SPECIES); // no default value is supplied for this property
+ if (prop != null)
+ {
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase()))
+ {
+ vcfSpecies = tokens[1].trim();
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Opens the VCF file and parses header data
+ * <p>
+ * In order, this method: stores the path in {@code vcfFilePath}, creates the
+ * {@code VCFReader}, reads the file header, tries to read the sequence
+ * dictionary from the header, stores the path as {@code sourceId}, calls
+ * {@code saveMetadata(sourceId)} and finally {@code parseCsqHeader()}.
+ * <p>
+ * If reading the sequence dictionary throws {@code SAMException}, the
+ * exception is ignored and {@code dictionary} is not assigned here.
+ *
+ * @param filePath
+ * passed to the {@code VCFReader} constructor, and also saved as
+ * {@code vcfFilePath} and {@code sourceId}
+ * @throws IOException
+ * NOTE(review): presumably raised when the reader cannot open the
+ * file - confirm against VCFReader
+ */
+ private void initialise(String filePath) throws IOException
+ {
+ vcfFilePath = filePath;
+
+ reader = new VCFReader(filePath);
+
+ header = reader.getFileHeader();
+
+ try
+ {
+ dictionary = header.getSequenceDictionary();
+ } catch (SAMException e)
+ {
+ // ignore - thrown if any contig line lacks length info
+ // (dictionary is then left as it was before this call)
+ }
+
+ sourceId = filePath;
+
+ saveMetadata(sourceId);
+
+ /*
+ * get offset of CSQ ALLELE_NUM and Feature if declared
+ */
+ parseCsqHeader();
+ }
+
+ /**
* Reads metadata (such as INFO field descriptions and datatypes) and saves
* them for future reference
*
@@ -515,7 +691,8 @@ public class VCFLoader
/*
* dna-to-peptide product mapping
*/
- AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
+ // JAL-3187 render on the fly instead
+ // AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
}
else
{
@@ -536,37 +713,38 @@ public class VCFLoader
}
/**
- * Tries to add overlapping variants read from a VCF file to the given
- * sequence, and returns the number of variant features added. Note that this
- * requires the sequence to hold information as to its species, chromosomal
- * positions and reference assembly, in order to be able to map the VCF
- * variants to the sequence (or not)
+ * Tries to add overlapping variants read from a VCF file to the given sequence,
+ * and returns the number of variant features added
+ * <p>
+ * First obtains the sequence-to-VCF coordinate map from {@code getVcfMap}; if
+ * that is null, nothing is added. Otherwise {@code addVcfVariants} is called
+ * with the dataset sequence of {@code seq} if it has one, else with
+ * {@code seq} itself.
*
* @param seq
+ * the sequence to process; must not be null, since its dataset
+ * sequence is requested
- * @param reader
- * @param vcfAssembly
* @return
+ * 0 if no VCF map was found for the sequence, else the value returned
+ * by {@code addVcfVariants}
*/
- protected int loadSequenceVCF(SequenceI seq, VCFReader reader,
- String vcfAssembly)
+ protected int loadSequenceVCF(SequenceI seq)
{
- VCFMap vcfMap = getVcfMap(seq, vcfAssembly);
+ VCFMap vcfMap = getVcfMap(seq);
if (vcfMap == null)
{
return 0;
}
- return addVcfVariants(seq, reader, vcfMap, vcfAssembly);
+ /*
+ * work with the dataset sequence here
+ * (falling back to seq itself when it has no dataset sequence)
+ */
+ SequenceI dss = seq.getDatasetSequence();
+ if (dss == null)
+ {
+ dss = seq;
+ }
+ return addVcfVariants(dss, vcfMap);
}
/**
* Answers a map from sequence coordinates to VCF chromosome ranges
*
* @param seq
- * @param vcfAssembly
* @return
*/
- private VCFMap getVcfMap(SequenceI seq, String vcfAssembly)
+ private VCFMap getVcfMap(SequenceI seq)
{
/*
* simplest case: sequence has id and length matching a VCF contig
@@ -597,34 +775,28 @@ public class VCFLoader
String species = seqCoords.getSpeciesId();
String chromosome = seqCoords.getChromosomeId();
String seqRef = seqCoords.getAssemblyId();
- MapList map = seqCoords.getMap();
+ MapList map = seqCoords.getMapping();
- if (!vcfSpeciesMatchesSequence(vcfAssembly, species))
+ // note this requires the configured species to match that
+ // returned with the Ensembl sequence; todo: support aliases?
+ if (!vcfSpecies.equalsIgnoreCase(species))
{
+ Cache.log.warn("No VCF loaded to " + seq.getName()
+ + " as species not matched");
return null;
}
- if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef))
+ if (seqRef.equalsIgnoreCase(vcfAssembly))
{
return new VCFMap(chromosome, map);
}
- if (!"GRCh38".equalsIgnoreCase(seqRef) // Ensembl
- || !vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
- {
- return null;
- }
-
/*
- * map chromosomal coordinates from sequence to VCF if the VCF
- * data has a different reference assembly to the sequence
+ * VCF data has a different reference assembly to the sequence:
+ * query Ensembl to map chromosomal coordinates from sequence to VCF
*/
- // TODO generalise for cases other than GRCh38 -> GRCh37 !
- // - or get the user to choose in a dialog
-
List
- * Allele matching: if field ALLELE_NUM is present, it must match - * altAlleleIndex. If not present, then field Allele value must match the VCF - * Allele. - *
- * Transcript matching: if sequence name can be identified to at least one of
- * the consequences' Feature values, then select only consequences that match
- * the value (i.e. consequences for the current transcript sequence). If not,
- * take all consequences (this is the case when adding features to the gene
- * sequence).
+ * If {@code myConsequence} is not null, then this is the specific
+ * consequence data (pipe-delimited fields) that is for the current allele and
+ * transcript (sequence) being processed
*
* @param variant
- * @param seq
* @param sf
- * @param altAlleleIndex
- * (0, 1..)
+ * @param myConsequence
*/
- protected void addConsequences(VariantContext variant, SequenceI seq,
- SequenceFeature sf, int altAlleleIndex)
+ protected void addConsequences(VariantContext variant, SequenceFeature sf,
+ String myConsequence)
{
- /*
- * first try to identify the matching consequence
- */
- String myConsequence = getConsequenceForAlleleAndFeature(variant,
- CSQ_FIELD, altAlleleIndex, csqAlleleFieldIndex,
- csqAlleleNumberFieldIndex, seq.getName().toLowerCase(),
- csqFeatureFieldIndex);
-
Object value = variant.getAttribute(CSQ_FIELD);
if (value == null || !(value instanceof List>))
@@ -1225,6 +1485,16 @@ public class VCFLoader
String id = vepFieldsOfInterest.get(i);
if (id != null)
{
+ /*
+ * VCF spec requires encoding of special characters e.g. '='
+ * so decode them here before storing
+ */
+ try
+ {
+ field = URLDecoder.decode(field, UTF_8);
+ } catch (UnsupportedEncodingException e)
+ {
+ }
csqValues.put(id, field);
}
}