X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=168f1c6499bfcaf4a1d9bd437bb5afe4dc9eeda5;hb=006890b02106eb31841e6e84d75f1027434823e0;hp=a85802ae2117001c7a1377c1284334dffbdc43dc;hpb=946a0619168de3fbe9858401d13bcd1981d0a0df;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index a85802a..168f1c6 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -1,10 +1,8 @@ package jalview.io.vcf; -import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; import jalview.bin.Cache; -import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; @@ -14,21 +12,25 @@ import jalview.datamodel.features.FeatureAttributeType; import jalview.datamodel.features.FeatureSource; import jalview.datamodel.features.FeatureSources; import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.HtsContigDb; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.MessageManager; +import jalview.util.StringUtils; +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.SortedMap; -import java.util.TreeMap; +import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -36,8 +38,10 @@ import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; +import htsjdk.tribble.TribbleException; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; @@ -52,6 +56,23 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + private static final String VCF_ENCODABLE = ":;=%,"; + + /* + * Jalview feature attributes for VCF fixed column data + */ + private static final String VCF_POS = "POS"; + + private static final String VCF_ID = "ID"; + + private static final String VCF_QUAL = "QUAL"; + + private static final String VCF_FILTER = "FILTER"; + + private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' + + private static final String DEFAULT_SPECIES = "homo_sapiens"; + /** * A class to model the mapping from sequence to VCF coordinates. Cases include *
* This method is not thread safe - concurrent threads should use separate * instances of this class. * - * @param filePath + * @param seqs * @param gui */ - public void loadVCF(final String filePath, - final AlignViewControllerGuiI gui) + public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui) { if (gui != null) { @@ -212,54 +273,70 @@ public class VCFLoader new Thread() { - @Override public void run() { - VCFLoader.this.doLoad(filePath, gui); + VCFLoader.this.doLoad(seqs, gui); } - }.start(); } /** - * Loads VCF on to an alignment - provided it can be related to one or more - * sequence's chromosomal coordinates + * Reads the specified contig sequence and adds its VCF variants to it * - * @param filePath - * @param gui - * optional callback handler for messages + * @param contig + * the id of a single sequence (contig) to load + * @return */ - protected void doLoad(String filePath, AlignViewControllerGuiI gui) + public SequenceI loadVCFContig(String contig) { - VCFReader reader = null; - try + VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + if (headerLine == null) { - // long start = System.currentTimeMillis(); - reader = new VCFReader(filePath); - - header = reader.getFileHeader(); - - try - { - dictionary = header.getSequenceDictionary(); - } catch (SAMException e) - { - // ignore - thrown if any contig line lacks length info - } + Cache.log.error("VCF reference header not found"); + return null; + } + String ref = headerLine.getValue(); + if (ref.startsWith("file://")) + { + ref = ref.substring(7); + } + setSpeciesAndAssembly(ref); - sourceId = filePath; + SequenceI seq = null; + File dbFile = new File(ref); - saveMetadata(sourceId); + if (dbFile.exists()) + { + HtsContigDb db = new HtsContigDb("", dbFile); + seq = db.getSequenceProxy(contig); + loadSequenceVCF(seq); + db.close(); + } + else + { + Cache.log.error("VCF reference not found: " + ref); + } - /* - * get offset of CSQ ALLELE_NUM and Feature if declared - */ - parseCsqHeader(); + return seq; + } + /** + * Loads VCF on to one or more sequences + * + * @param seqs + * @param gui + * optional callback handler for messages + */ + protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui) + { + try + { VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); - String vcfAssembly = ref.getValue(); + String reference = ref == null ? null : ref.getValue(); + + setSpeciesAndAssembly(reference); int varCount = 0; int seqCount = 0; @@ -267,9 +344,9 @@ public class VCFLoader /* * query for VCF overlapping each sequence in turn */ - for (SequenceI seq : al.getSequences()) + for (SequenceI seq : seqs) { - int added = loadSequenceVCF(seq, reader, vcfAssembly); + int added = loadSequenceVCF(seq); if (added > 0) { seqCount++; @@ -279,7 +356,6 @@ public class VCFLoader } if (gui != null) { - // long elapsed = System.currentTimeMillis() - start; String msg = MessageManager.formatMessage("label.added_vcf", varCount, seqCount); gui.setStatus(msg); @@ -314,6 +390,103 @@ public class VCFLoader } /** + * Attempts to determine and save the species and genome assembly version to + * which the VCF data applies. This may be done by parsing the {@code reference} + * header line, configured in a property file, or (potentially) confirmed + * interactively by the user. + *
+ * The saved values should be identifiers valid for Ensembl's REST service
+ * {@code map} endpoint, so they can be used (if necessary) to retrieve the
+ * mapping between VCF coordinates and sequence coordinates.
+ *
+ * @param reference
+ * @see https://rest.ensembl.org/documentation/info/assembly_map
+ * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
+ * @see https://rest.ensembl.org/info/species?content-type=text/xml
+ */
+ protected void setSpeciesAndAssembly(String reference)
+ {
+ if (reference == null)
+ {
+ Cache.log.error("No VCF ##reference found, defaulting to "
+ + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES);
+ reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified
+ }
+ reference = reference.toLowerCase();
+
+ /*
+ * for a non-human species, or other assembly identifier,
+ * specify as a Jalview property file entry e.g.
+ * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
+ * VCF_SPECIES = c_elegans=celegans
+ * to map a token in the reference header to a value
+ */
+ String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase()))
+ {
+ vcfAssembly = tokens[1].trim();
+ break;
+ }
+ }
+ }
+
+ vcfSpecies = DEFAULT_SPECIES;
+ prop = Cache.getProperty(VCF_SPECIES);
+ if (prop != null)
+ {
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase()))
+ {
+ vcfSpecies = tokens[1].trim();
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Opens the VCF file and parses header data
+ *
+ * @param filePath
+ * @throws IOException
+ */
+ private void initialise(String filePath) throws IOException
+ {
+ vcfFilePath = filePath;
+
+ reader = new VCFReader(filePath);
+
+ header = reader.getFileHeader();
+
+ try
+ {
+ dictionary = header.getSequenceDictionary();
+ } catch (SAMException e)
+ {
+ // ignore - thrown if any contig line lacks length info
+ }
+
+ sourceId = filePath;
+
+ saveMetadata(sourceId);
+
+ /*
+ * get offset of CSQ ALLELE_NUM and Feature if declared
+ */
+ parseCsqHeader();
+ }
+
+ /**
* Reads metadata (such as INFO field descriptions and datatypes) and saves
* them for future reference
*
@@ -419,15 +592,19 @@ public class VCFLoader
int index = 0;
for (String field : format)
{
- if (ALLELE_NUM_KEY.equals(field))
+ if (CSQ_CONSEQUENCE_KEY.equals(field))
+ {
+ csqConsequenceFieldIndex = index;
+ }
+ if (CSQ_ALLELE_NUM_KEY.equals(field))
{
csqAlleleNumberFieldIndex = index;
}
- if (ALLELE_KEY.equals(field))
+ if (CSQ_ALLELE_KEY.equals(field))
{
csqAlleleFieldIndex = index;
}
- if (FEATURE_KEY.equals(field))
+ if (CSQ_FEATURE_KEY.equals(field))
{
csqFeatureFieldIndex = index;
}
@@ -483,7 +660,7 @@ public class VCFLoader
*/
protected void transferAddedFeatures(SequenceI seq)
{
- DBRefEntry[] dbrefs = seq.getDBRefs();
+ List
- * Allele matching: if field ALLELE_NUM is present, it must match
- * altAlleleIndex. If not present, then field Allele value must match the VCF
- * Allele.
- *
- * Transcript matching: if sequence name can be identified to at least one of
- * the consequences' Feature values, then select only consequences that match
- * the value (i.e. consequences for the current transcript sequence). If not,
- * take all consequences (this is the case when adding features to the gene
- * sequence).
+ * If
- * If consequence data includes the ALLELE_NUM field, then this has to match
- * altAlleleIndex. Otherwise the Allele field of the consequence data has to
- * match the allele value.
- *
- * Optionally (if matchFeature is not null), restrict to only include
- * consequences whose Feature value matches. This allows us to attach
- * consequences to their respective transcripts.
- *
- * @param csqFields
- * @param matchFeature
- * @param variant
- * @param altAlelleIndex
- * (0, 1..)
- * @return
- */
- protected boolean includeConsequence(String[] csqFields,
- String matchFeature, VariantContext variant, int altAlelleIndex)
- {
- /*
- * check consequence is for the current transcript
- */
- if (matchFeature != null)
- {
- if (csqFields.length <= csqFeatureFieldIndex)
- {
- return false;
- }
- String featureIdentifier = csqFields[csqFeatureFieldIndex];
- if (!featureIdentifier.equals(matchFeature))
- {
- return false; // consequence is for a different transcript
- }
- }
-
- /*
- * if ALLELE_NUM is present, it must match altAlleleIndex
- * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
- */
- if (csqAlleleNumberFieldIndex > -1)
- {
- if (csqFields.length <= csqAlleleNumberFieldIndex)
- {
- return false;
- }
- String alleleNum = csqFields[csqAlleleNumberFieldIndex];
- return String.valueOf(altAlelleIndex + 1).equals(alleleNum);
- }
-
- /*
- * else consequence allele must match variant allele
- */
- if (csqAlleleFieldIndex > -1 && csqFields.length > csqAlleleFieldIndex)
- {
- String csqAllele = csqFields[csqAlleleFieldIndex];
- String vcfAllele = variant.getAlternateAllele(altAlelleIndex)
- .getBaseString();
- return csqAllele.equals(vcfAllele);
- }
-
- return false;
- }
-
- /**
* A convenience method to complement a dna base and return the string value
* of its complement
*
+ *
+ *
+ * @param consequence
+ * @return
+ * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
+ */
+ String getOntologyTerm(String consequence)
+ {
+ String type = SequenceOntologyI.SEQUENCE_VARIANT;
+
+ /*
+ * could we associate Consequence data with this allele and feature (transcript)?
+ * if so, prefer the consequence term from that data
+ */
+ if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
+ {
+ /*
+ * no Consequence data so we can't refine the ontology term
+ */
+ return type;
+ }
+
+ if (consequence != null)
+ {
+ String[] csqFields = consequence.split(PIPE_REGEX);
+ if (csqFields.length > csqConsequenceFieldIndex)
+ {
+ type = csqFields[csqConsequenceFieldIndex];
+ }
+ }
+ else
+ {
+ // todo the same for SnpEff consequence data matching if wanted
+ }
+
+ /*
+ * if of the form (e.g.) missense_variant&splice_region_variant,
+ * just take the first ('most severe') consequence
+ */
+ if (type != null)
+ {
+ int pos = type.indexOf('&');
+ if (pos > 0)
+ {
+ type = type.substring(0, pos);
+ }
+ }
+ return type;
+ }
+
+ /**
+ * Returns matched consequence data if it can be found, else null.
+ *
+ *
+ * If matched, the consequence is returned (as pipe-delimited fields).
+ *
+ * @param variant
+ * @param vcfInfoId
+ * @param altAlleleIndex
+ * @param alleleFieldIndex
+ * @param alleleNumberFieldIndex
+ * @param seqName
+ * @param featureFieldIndex
+ * @return
+ */
+ private String getConsequenceForAlleleAndFeature(VariantContext variant,
+ String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
+ int alleleNumberFieldIndex,
+ String seqName, int featureFieldIndex)
+ {
+ if (alleleFieldIndex == -1 || featureFieldIndex == -1)
+ {
+ return null;
+ }
+ Object value = variant.getAttribute(vcfInfoId);
+
+ if (value == null || !(value instanceof List>))
+ {
+ return null;
+ }
+
+ /*
+ * inspect each consequence in turn (comma-separated blocks
+ * extracted by htsjdk)
+ */
+ List
+ *
+ * myConsequence
is not null, then this is the specific
+ * consequence data (pipe-delimited fields) that is for the current allele and
+ * transcript (sequence) being processed)
*
* @param variant
- * @param seq
* @param sf
- * @param altAlelleIndex
- * (0, 1..)
+ * @param myConsequence
*/
- protected void addConsequences(VariantContext variant, SequenceI seq,
- SequenceFeature sf, int altAlelleIndex)
+ protected void addConsequences(VariantContext variant, SequenceFeature sf,
+ String myConsequence)
{
Object value = variant.getAttribute(CSQ_FIELD);
- if (value == null || !(value instanceof ArrayList>))
+ if (value == null || !(value instanceof List>))
{
return;
}
@@ -1019,43 +1426,17 @@ public class VCFLoader
List