X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=168f1c6499bfcaf4a1d9bd437bb5afe4dc9eeda5;hb=006890b02106eb31841e6e84d75f1027434823e0;hp=44e8e16531dacaaefa36e2f1cca4fc4c85b9815f;hpb=f5ddcb62310a9f32545231d0c5e1d7c364bfbb53;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 44e8e16..168f1c6 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -1,32 +1,52 @@ package jalview.io.vcf; -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; - -import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; -import jalview.datamodel.AlignmentI; +import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; -import jalview.datamodel.GeneLoci; +import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.datamodel.features.FeatureAttributeType; +import jalview.datamodel.features.FeatureSource; +import jalview.datamodel.features.FeatureSources; import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.HtsContigDb; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import jalview.util.MappingUtils; +import jalview.util.MessageManager; +import jalview.util.StringUtils; +import java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import htsjdk.samtools.SAMException; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.tribble.TribbleException; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; /** * A class to read VCF data (using the htsjdk) and add variants as sequence @@ -36,344 +56,1428 @@ import java.util.Map.Entry; */ public class VCFLoader { + private static final String VCF_ENCODABLE = ":;=%,"; + + /* + * Jalview feature attributes for VCF fixed column data + */ + private static final String VCF_POS = "POS"; + + private static final String VCF_ID = "ID"; + + private static final String VCF_QUAL = "QUAL"; + + private static final String VCF_FILTER = "FILTER"; + + private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' + + private static final String DEFAULT_SPECIES = "homo_sapiens"; + + /** + * A class to model the mapping from sequence to VCF coordinates. Cases include + *
+ * This method is not thread safe - concurrent threads should use separate + * instances of this class. + * + * @param seqs + * @param gui + */ + public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui) + { + if (gui != null) + { + gui.setStatus(MessageManager.getString("label.searching_vcf")); + } + + new Thread() + { + @Override + public void run() + { + VCFLoader.this.doLoad(seqs, gui); + } + }.start(); + } + + /** + * Reads the specified contig sequence and adds its VCF variants to it + * + * @param contig + * the id of a single sequence (contig) to load + * @return + */ + public SequenceI loadVCFContig(String contig) + { + VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + if (headerLine == null) + { + Cache.log.error("VCF reference header not found"); + return null; + } + String ref = headerLine.getValue(); + if (ref.startsWith("file://")) + { + ref = ref.substring(7); + } + setSpeciesAndAssembly(ref); + + SequenceI seq = null; + File dbFile = new File(ref); + + if (dbFile.exists()) + { + HtsContigDb db = new HtsContigDb("", dbFile); + seq = db.getSequenceProxy(contig); + loadSequenceVCF(seq); + db.close(); + } + else + { + Cache.log.error("VCF reference not found: " + ref); + } + + return seq; + } + + /** + * Loads VCF on to one or more sequences + * + * @param seqs + * @param gui + * optional callback handler for messages + */ + protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui) + { + try + { + VCFHeaderLine ref = header + .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + String reference = ref == null ? null : ref.getValue(); + + setSpeciesAndAssembly(reference); + + int varCount = 0; + int seqCount = 0; + + /* + * query for VCF overlapping each sequence in turn + */ + for (SequenceI seq : seqs) + { + int added = loadSequenceVCF(seq); + if (added > 0) + { + seqCount++; + varCount += added; + transferAddedFeatures(seq); + } + } + if (gui != null) + { + String msg = MessageManager.formatMessage("label.added_vcf", + varCount, seqCount); + gui.setStatus(msg); + if (gui.getFeatureSettingsUI() != null) + { + gui.getFeatureSettingsUI().discoverAllFeatureData(); + } + } + } catch (Throwable e) + { + System.err.println("Error processing VCF: " + e.getMessage()); + e.printStackTrace(); + if (gui != null) + { + gui.setStatus("Error occurred - see console for details"); + } + } finally + { + if (reader != null) + { + try + { + reader.close(); + } catch (IOException e) + { + // ignore + } + } + header = null; + dictionary = null; + } + } + + /** + * Attempts to determine and save the species and genome assembly version to + * which the VCF data applies. This may be done by parsing the {@code reference} + * header line, configured in a property file, or (potentially) confirmed + * interactively by the user. + *
+ * The saved values should be identifiers valid for Ensembl's REST service
+ * {@code map} endpoint, so they can be used (if necessary) to retrieve the
+ * mapping between VCF coordinates and sequence coordinates.
+ *
+ * @param reference
+ * @see https://rest.ensembl.org/documentation/info/assembly_map
+ * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
+ * @see https://rest.ensembl.org/info/species?content-type=text/xml
+ */
+ protected void setSpeciesAndAssembly(String reference)
+ {
+ if (reference == null)
+ {
+ Cache.log.error("No VCF ##reference found, defaulting to "
+ + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES);
+ reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified
+ }
+ reference = reference.toLowerCase();
+
+ /*
+ * for a non-human species, or other assembly identifier,
+ * specify as a Jalview property file entry e.g.
+ * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
+ * VCF_SPECIES = c_elegans=celegans
+ * to map a token in the reference header to a value
+ */
+ String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase()))
+ {
+ vcfAssembly = tokens[1].trim();
+ break;
+ }
+ }
+ }
+
+ vcfSpecies = DEFAULT_SPECIES;
+ prop = Cache.getProperty(VCF_SPECIES);
+ if (prop != null)
+ {
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase()))
+ {
+ vcfSpecies = tokens[1].trim();
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Opens the VCF file and parses header data
+ *
+ * @param filePath
+ * @throws IOException
+ */
+ private void initialise(String filePath) throws IOException
+ {
+ vcfFilePath = filePath;
+
+ reader = new VCFReader(filePath);
+
+ header = reader.getFileHeader();
+
+ try
+ {
+ dictionary = header.getSequenceDictionary();
+ } catch (SAMException e)
+ {
+ // ignore - thrown if any contig line lacks length info
+ }
+
+ sourceId = filePath;
+
+ saveMetadata(sourceId);
+
+ /*
+ * get offset of CSQ ALLELE_NUM and Feature if declared
+ */
+ parseCsqHeader();
+ }
+
+ /**
+ * Reads metadata (such as INFO field descriptions and datatypes) and saves
+ * them for future reference
+ *
+ * @param theSourceId
+ */
+ void saveMetadata(String theSourceId)
+ {
+ List
+ * CSQ fields are declared in the CSQ INFO Description e.g.
+ *
+ * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
+ */
+ protected void parseCsqHeader()
+ {
+ List
+ * This supports user-defined filters for fields of interest to capture while
+ * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
+ * fields with an ID of AF, or starting with AC, would be matched.
+ *
+ * @param key
+ * @param def
+ * @return
+ */
+ private List
- * This method is not thread safe - concurrent threads should use separate
- * instances of this class.
+ * Inspects one allele and attempts to add a variant feature for it to the
+ * sequence. The additional data associated with this allele is extracted to
+ * store in the feature's key-value map. Answers the number of features added (0
+ * or 1).
*
- * @param filePath
- * @param status
+ * @param seq
+ * @param variant
+ * @param altAlleleIndex
+ * (0, 1..)
+ * @param featureStart
+ * @param featureEnd
+ * @param forwardStrand
+ * @return
*/
- public void loadVCF(String filePath, AlignViewControllerGuiI status)
+ protected int addAlleleFeature(SequenceI seq, VariantContext variant,
+ int altAlleleIndex, int featureStart, int featureEnd,
+ boolean forwardStrand)
{
- VCFReader reader = null;
+ String reference = variant.getReference().getBaseString();
+ Allele alt = variant.getAlternateAllele(altAlleleIndex);
+ String allele = alt.getBaseString();
- try
+ /*
+ * insertion after a genomic base, if on reverse strand, has to be
+ * converted to insertion of complement after the preceding position
+ */
+ int referenceLength = reference.length();
+ if (!forwardStrand && allele.length() > referenceLength
+ && allele.startsWith(reference))
{
- // long start = System.currentTimeMillis();
- reader = new VCFReader(filePath);
+ featureStart -= referenceLength;
+ featureEnd = featureStart;
+ char insertAfter = seq.getCharAt(featureStart - seq.getStart());
+ reference = Dna.reverseComplement(String.valueOf(insertAfter));
+ allele = allele.substring(referenceLength) + reference;
+ }
- VCFHeader header = reader.getFileHeader();
- VCFHeaderLine ref = header
- .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
- // check if reference is wrt assembly19 (GRCh37)
- // todo may need to allow user to specify reference assembly?
- boolean isRefGrch37 = (ref != null && ref.getValue().contains(
- "assembly19"));
+ /*
+ * build the ref,alt allele description e.g. "G,A", using the base
+ * complement if the sequence is on the reverse strand
+ */
+ StringBuilder sb = new StringBuilder();
+ sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
+ sb.append(COMMA);
+ sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
+ String alleles = sb.toString(); // e.g. G,A
- int varCount = 0;
- int seqCount = 0;
+ /*
+ * pick out the consequence data (if any) that is for the current allele
+ * and feature (transcript) that matches the current sequence
+ */
+ String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD,
+ altAlleleIndex, csqAlleleFieldIndex,
+ csqAlleleNumberFieldIndex, seq.getName().toLowerCase(),
+ csqFeatureFieldIndex);
- /*
- * query for VCF overlapping each sequence in turn
- */
- for (SequenceI seq : al.getSequences())
- {
- int added = loadVCF(seq, reader, isRefGrch37);
- if (added > 0)
- {
- seqCount++;
- varCount += added;
- computePeptideVariants(seq);
- }
- }
- // long elapsed = System.currentTimeMillis() - start;
- String msg = String.format("Added %d VCF variants to %d sequence(s)",
- varCount, seqCount);
- if (status != null)
- {
- status.setStatus(msg);
- }
- } catch (Throwable e)
- {
- System.err.println("Error processing VCF: " + e.getMessage());
- e.printStackTrace();
- } finally
+ /*
+ * pick out the ontology term for the consequence type
+ */
+ String type = SequenceOntologyI.SEQUENCE_VARIANT;
+ if (consequence != null)
{
- if (reader != null)
- {
- try
- {
- reader.close();
- } catch (IOException e)
- {
- // ignore
- }
- }
+ type = getOntologyTerm(consequence);
}
+
+ SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
+ featureEnd, FEATURE_GROUP_VCF);
+ sf.setSource(sourceId);
+
+ /*
+ * save the derived alleles as a named attribute; this will be
+ * needed when Jalview computes derived peptide variants
+ */
+ addFeatureAttribute(sf, Gff3Helper.ALLELES, alleles);
+
+ /*
+ * add selected VCF fixed column data as feature attributes
+ */
+ addFeatureAttribute(sf, VCF_POS, String.valueOf(variant.getStart()));
+ addFeatureAttribute(sf, VCF_ID, variant.getID());
+ addFeatureAttribute(sf, VCF_QUAL,
+ String.valueOf(variant.getPhredScaledQual()));
+ addFeatureAttribute(sf, VCF_FILTER, getFilter(variant));
+
+ addAlleleProperties(variant, sf, altAlleleIndex, consequence);
+
+ seq.addSequenceFeature(sf);
+
+ return 1;
}
/**
- * (Re-)computes peptide variants from dna variants, for any protein sequence
- * to which the dna sequence has a mapping. Note that although duplicate
- * features may get computed, they will not be added, since duplicate sequence
- * features are ignored in Sequence.addSequenceFeature.
+ * Answers the VCF FILTER value for the variant - or an approximation to it.
+ * This field is either PASS, or a semi-colon separated list of filters not
+ * passed. htsjdk saves filters as a HashSet, so the order when reassembled into
+ * a list may be different.
*
- * @param dnaSeq
+ * @param variant
+ * @return
*/
- protected void computePeptideVariants(SequenceI dnaSeq)
+ String getFilter(VariantContext variant)
{
- DBRefEntry[] dbrefs = dnaSeq.getDBRefs();
- if (dbrefs == null)
+ Set
- * If the sequence maps to the reverse strand of the chromosome, reference and
- * variant bases are recorded as their complements (C/G, A/T).
+ * Answers true for '.', null, or an empty value, or if the INFO type is String.
+ * If the INFO type is Integer or Float, answers false if the value is not in
+ * valid format.
*
- * @param seq
* @param variant
- * @param featureStart
- * @param featureEnd
- * @param forwardStrand
+ * @param infoId
+ * @param value
+ * @return
*/
- protected void addVariantFeatures(SequenceI seq, VariantContext variant,
- int featureStart, int featureEnd, boolean forwardStrand)
+ protected boolean isValid(VariantContext variant, String infoId,
+ String value)
{
- byte[] reference = variant.getReference().getBases();
- if (reference.length != 1)
+ if (value == null || value.isEmpty() || NO_VALUE.equals(value))
{
- /*
- * sorry, we don't handle INDEL variants
- */
- return;
+ return true;
}
-
- /*
- * for now we extract allele frequency as feature score; note
- * this attribute is String for a simple SNP, but List
+ * If
+ *
+ *
+ * @param consequence
* @return
+ * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
*/
- protected int loadVCF(SequenceI seq, VCFReader reader,
- boolean isVcfRefGrch37)
+ String getOntologyTerm(String consequence)
{
- int count = 0;
- GeneLoci seqCoords = seq.getGeneLoci();
- if (seqCoords == null)
+ String type = SequenceOntologyI.SEQUENCE_VARIANT;
+
+ /*
+ * could we associate Consequence data with this allele and feature (transcript)?
+ * if so, prefer the consequence term from that data
+ */
+ if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
{
- return 0;
+ /*
+ * no Consequence data so we can't refine the ontology term
+ */
+ return type;
}
- List
+ *
+ * If matched, the consequence is returned (as pipe-delimited fields).
*
- * @param seq
- * @param reader
- * @param range
- * start-end range of a sequence region in its chromosomal
- * coordinates
- * @param isVcfRefGrch37
- * true if the VCF is with reference to GRCh37
+ * @param variant
+ * @param vcfInfoId
+ * @param altAlleleIndex
+ * @param alleleFieldIndex
+ * @param alleleNumberFieldIndex
+ * @param seqName
+ * @param featureFieldIndex
* @return
*/
- protected int addVcfVariants(SequenceI seq, VCFReader reader,
- int[] range, boolean isVcfRefGrch37)
+ private String getConsequenceForAlleleAndFeature(VariantContext variant,
+ String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
+ int alleleNumberFieldIndex,
+ String seqName, int featureFieldIndex)
{
- GeneLoci seqCoords = seq.getGeneLoci();
-
- String chromosome = seqCoords.chromosome;
- String seqRef = seqCoords.assembly;
- String species = seqCoords.species;
+ if (alleleFieldIndex == -1 || featureFieldIndex == -1)
+ {
+ return null;
+ }
+ Object value = variant.getAttribute(vcfInfoId);
- // TODO handle species properly
- if ("".equals(species))
+ if (value == null || !(value instanceof List>))
{
- species = "human";
+ return null;
}
/*
- * map chromosomal coordinates from GRCh38 (sequence) to
- * GRCh37 (VCF) if necessary
+ * inspect each consequence in turn (comma-separated blocks
+ * extracted by htsjdk)
*/
- // TODO generalise for other assemblies and species
- int offset = 0;
- String fromRef = "GRCh38";
- if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37)
- {
- String toRef = "GRCh37";
- int[] newRange = mapReferenceRange(range, chromosome, species,
- fromRef, toRef);
- if (newRange == null)
+ List
+ *
+ * myConsequence
is not null, then this is the specific
+ * consequence data (pipe-delimited fields) that is for the current allele and
+ * transcript (sequence) being processed)
+ *
+ * @param variant
+ * @param sf
+ * @param myConsequence
+ */
+ protected void addConsequences(VariantContext variant, SequenceFeature sf,
+ String myConsequence)
+ {
+ Object value = variant.getAttribute(CSQ_FIELD);
+
+ if (value == null || !(value instanceof List>))
+ {
+ return;
+ }
+
+ List