X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=9addfaaa68a6c7861a49e6d919daa95ffa578549;hb=296593216c47a835f462d1d74a40b41e4818f737;hp=2847bd796157aa3ff9d207ced225c6a2f3c2ea43;hpb=f8b17a9e7363b8a9e7cd12d61bc6d611c7c97d7d;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 2847bd7..9addfaa 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -4,7 +4,6 @@ import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; import jalview.bin.Cache; -import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; @@ -14,6 +13,7 @@ import jalview.datamodel.features.FeatureAttributeType; import jalview.datamodel.features.FeatureSource; import jalview.datamodel.features.FeatureSources; import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.HtsContigDb; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyI; @@ -21,6 +21,7 @@ import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.MessageManager; +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; @@ -30,6 +31,9 @@ import java.util.Map.Entry; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import htsjdk.samtools.SAMException; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; @@ -47,6 +51,35 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + /** + * A class to model the mapping from sequence to VCF coordinates. Cases include + *
* This method is not thread safe - concurrent threads should use separate
* instances of this class.
*
- * @param filePath
+ * @param seqs
* @param gui
*/
- public void loadVCF(final String filePath,
- final AlignViewControllerGuiI gui)
+ public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui)
{
if (gui != null)
{
@@ -179,43 +229,59 @@ public class VCFLoader
new Thread()
{
-
@Override
public void run()
{
- VCFLoader.this.doLoad(filePath, gui);
+ VCFLoader.this.doLoad(seqs, gui);
}
-
}.start();
}
/**
- * Loads VCF on to an alignment - provided it can be related to one or more
- * sequence's chromosomal coordinates
+ * Reads the specified contig sequence and adds its VCF variants to it
*
- * @param filePath
- * @param gui
- * optional callback handler for messages
+ * @param contig
+ * the id of a single sequence (contig) to load
+ * @return the contig sequence read from the VCF reference file, or null if that file is not found
*/
- protected void doLoad(String filePath, AlignViewControllerGuiI gui)
+ public SequenceI loadVCFContig(String contig)
{
- VCFReader reader = null;
- try
+ String ref = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY)
+ .getValue();
+ if (ref.startsWith("file://"))
{
- // long start = System.currentTimeMillis();
- reader = new VCFReader(filePath);
-
- header = reader.getFileHeader();
+ ref = ref.substring(7);
+ }
- sourceId = filePath;
+ SequenceI seq = null;
+ File dbFile = new File(ref);
- saveMetadata(sourceId);
+ if (dbFile.exists())
+ {
+ HtsContigDb db = new HtsContigDb("", dbFile);
+ seq = db.getSequenceProxy(contig);
+ loadSequenceVCF(seq, ref);
+ db.close();
+ }
+ else
+ {
+ System.err.println("VCF reference not found: " + ref);
+ }
- /*
- * get offset of CSQ ALLELE_NUM and Feature if declared
- */
- parseCsqHeader();
+ return seq;
+ }
+ /**
+ * Loads VCF on to one or more sequences
+ *
+ * @param seqs
+ * @param gui
+ * optional callback handler for messages
+ */
+ protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui)
+ {
+ try
+ {
VCFHeaderLine ref = header
.getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
String vcfAssembly = ref.getValue();
@@ -226,9 +292,9 @@ public class VCFLoader
/*
* query for VCF overlapping each sequence in turn
*/
- for (SequenceI seq : al.getSequences())
+ for (SequenceI seq : seqs)
{
- int added = loadSequenceVCF(seq, reader, vcfAssembly);
+ int added = loadSequenceVCF(seq, vcfAssembly);
if (added > 0)
{
seqCount++;
@@ -238,7 +304,6 @@ public class VCFLoader
}
if (gui != null)
{
- // long elapsed = System.currentTimeMillis() - start;
String msg = MessageManager.formatMessage("label.added_vcf",
varCount, seqCount);
gui.setStatus(msg);
@@ -267,7 +332,41 @@ public class VCFLoader
// ignore
}
}
+ header = null;
+ dictionary = null;
+ }
+ }
+
+ /**
+ * Opens the VCF file and parses header data
+ *
+ * @param filePath
+ * @throws IOException
+ */
+ private void initialise(String filePath) throws IOException
+ {
+ vcfFilePath = filePath;
+
+ reader = new VCFReader(filePath);
+
+ header = reader.getFileHeader();
+
+ try
+ {
+ dictionary = header.getSequenceDictionary();
+ } catch (SAMException e)
+ {
+ // ignore - thrown if any contig line lacks length info
}
+
+ sourceId = filePath;
+
+ saveMetadata(sourceId);
+
+ /*
+ * get offset of CSQ ALLELE_NUM and Feature if declared
+ */
+ parseCsqHeader();
}
/**
@@ -376,15 +475,19 @@ public class VCFLoader
int index = 0;
for (String field : format)
{
- if (ALLELE_NUM_KEY.equals(field))
+ if (CSQ_CONSEQUENCE_KEY.equals(field))
+ {
+ csqConsequenceFieldIndex = index;
+ }
+ if (CSQ_ALLELE_NUM_KEY.equals(field))
{
csqAlleleNumberFieldIndex = index;
}
- if (ALLELE_KEY.equals(field))
+ if (CSQ_ALLELE_KEY.equals(field))
{
csqAlleleFieldIndex = index;
}
- if (FEATURE_KEY.equals(field))
+ if (CSQ_FEATURE_KEY.equals(field))
{
csqFeatureFieldIndex = index;
}
@@ -481,42 +584,174 @@ public class VCFLoader
}
/**
- * Tries to add overlapping variants read from a VCF file to the given
- * sequence, and returns the number of variant features added. Note that this
- * requires the sequence to hold information as to its species, chromosomal
- * positions and reference assembly, in order to be able to map the VCF
- * variants to the sequence (or not)
+ * Tries to add overlapping variants read from a VCF file to the given sequence,
+ * and returns the number of variant features added
*
* @param seq
- * @param reader
* @param vcfAssembly
* @return
*/
- protected int loadSequenceVCF(SequenceI seq, VCFReader reader,
- String vcfAssembly)
+ protected int loadSequenceVCF(SequenceI seq, String vcfAssembly)
{
- int count = 0;
+ VCFMap vcfMap = getVcfMap(seq, vcfAssembly);
+ if (vcfMap == null)
+ {
+ return 0;
+ }
+
+ /*
+ * work with the dataset sequence here
+ */
+ SequenceI dss = seq.getDatasetSequence();
+ if (dss == null)
+ {
+ dss = seq;
+ }
+ return addVcfVariants(dss, vcfMap);
+ }
+
+ /**
+ * Answers a map from sequence coordinates to VCF chromosome ranges
+ *
+ * @param seq
+ * @param vcfAssembly
+ * @return the map, or null if the sequence cannot be related to VCF coordinates
+ */
+ private VCFMap getVcfMap(SequenceI seq, String vcfAssembly)
+ {
+ /*
+ * simplest case: sequence has id and length matching a VCF contig
+ */
+ VCFMap vcfMap = null;
+ if (dictionary != null)
+ {
+ vcfMap = getContigMap(seq);
+ }
+ if (vcfMap != null)
+ {
+ return vcfMap;
+ }
+
+ /*
+ * otherwise, map to VCF from chromosomal coordinates
+ * of the sequence (if known)
+ */
GeneLociI seqCoords = seq.getGeneLoci();
if (seqCoords == null)
{
- System.out.println(String.format(
+ Cache.log.warn(String.format(
"Can't query VCF for %s as chromosome coordinates not known",
seq.getName()));
- return 0;
+ return null;
+ }
+
+ String species = seqCoords.getSpeciesId();
+ String chromosome = seqCoords.getChromosomeId();
+ String seqRef = seqCoords.getAssemblyId();
+ MapList map = seqCoords.getMap();
+
+ if (!vcfSpeciesMatchesSequence(vcfAssembly, species))
+ {
+ return null;
}
- if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId()))
+ if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef))
{
- return 0;
+ return new VCFMap(chromosome, map);
}
- List
- * Allele matching: if field ALLELE_NUM is present, it must match
- * altAlleleIndex. If not present, then field Allele value must match the VCF
- * Allele.
- *
- * Transcript matching: if sequence name can be identified to at least one of
- * the consequences' Feature values, then select only consequences that match
- * the value (i.e. consequences for the current transcript sequence). If not,
- * take all consequences (this is the case when adding features to the gene
- * sequence).
+ * If
- * If consequence data includes the ALLELE_NUM field, then this has to match
- * altAlleleIndex. Otherwise the Allele field of the consequence data has to
- * match the allele value.
- *
- * Optionally (if matchFeature is not null), restrict to only include
- * consequences whose Feature value matches. This allows us to attach
- * consequences to their respective transcripts.
- *
- * @param csqFields
- * @param matchFeature
- * @param variant
- * @param altAlelleIndex
- * (0, 1..)
- * @return
- */
- protected boolean includeConsequence(String[] csqFields,
- String matchFeature, VariantContext variant, int altAlelleIndex)
- {
- /*
- * check consequence is for the current transcript
- */
- if (matchFeature != null)
- {
- if (csqFields.length <= csqFeatureFieldIndex)
- {
- return false;
- }
- String featureIdentifier = csqFields[csqFeatureFieldIndex];
- if (!featureIdentifier.equals(matchFeature))
- {
- return false; // consequence is for a different transcript
- }
- }
-
- /*
- * if ALLELE_NUM is present, it must match altAlleleIndex
- * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
- */
- if (csqAlleleNumberFieldIndex > -1)
- {
- if (csqFields.length <= csqAlleleNumberFieldIndex)
- {
- return false;
- }
- String alleleNum = csqFields[csqAlleleNumberFieldIndex];
- return String.valueOf(altAlelleIndex + 1).equals(alleleNum);
- }
-
- /*
- * else consequence allele must match variant allele
- */
- if (csqAlleleFieldIndex > -1 && csqFields.length > csqAlleleFieldIndex)
- {
- String csqAllele = csqFields[csqAlleleFieldIndex];
- String vcfAllele = variant.getAlternateAllele(altAlelleIndex)
- .getBaseString();
- return csqAllele.equals(vcfAllele);
- }
-
- return false;
- }
-
- /**
* A convenience method to complement a dna base and return the string value
* of its complement
*
+ *
+ *
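+ * @param consequence pipe-delimited consequence data for the allele and transcript, or null
+ * @return the first term of the Consequence field if available, else SequenceOntologyI.SEQUENCE_VARIANT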
+ * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
+ */
+ String getOntologyTerm(String consequence)
+ {
+ String type = SequenceOntologyI.SEQUENCE_VARIANT;
+
+ if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
+ {
+ /*
+ * no Consequence data so we can't refine the ontology term
+ */
+ return type;
+ }
+
+ /*
+ * can we associate Consequence data with this allele and feature (transcript)?
+ * if so, prefer the consequence term from that data
+ */
+ if (consequence != null)
+ {
+ String[] csqFields = consequence.split(PIPE_REGEX);
+ if (csqFields.length > csqConsequenceFieldIndex)
+ {
+ type = csqFields[csqConsequenceFieldIndex];
+ }
+ }
+ else
+ {
+ // TODO: do the same for SnpEff consequence data matching, if wanted
+ }
+
+ /*
+ * if of the form (e.g.) missense_variant&splice_region_variant,
+ * just take the first ('most severe') consequence
+ */
+ if (type != null)
+ {
+ int pos = type.indexOf('&');
+ if (pos > 0)
+ {
+ type = type.substring(0, pos);
+ }
+ }
+ return type;
+ }
+
+ /**
+ * Returns matched consequence data if it can be found, else null.
+ *
+ *
+ * If matched, the consequence is returned (as pipe-delimited fields).
+ *
+ * @param variant
+ * @param vcfInfoId
+ * @param altAlleleIndex
+ * @param alleleFieldIndex
+ * @param alleleNumberFieldIndex
+ * @param seqName
+ * @param featureFieldIndex
+ * @return the matched consequence data (pipe-delimited fields), or null if none is found
+ */
+ private String getConsequenceForAlleleAndFeature(VariantContext variant,
+ String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
+ int alleleNumberFieldIndex,
+ String seqName, int featureFieldIndex)
+ {
+ if (alleleFieldIndex == -1 || featureFieldIndex == -1)
+ {
+ return null;
+ }
+ Object value = variant.getAttribute(vcfInfoId);
+
+ if (value == null || !(value instanceof List<?>))
+ {
+ return null;
+ }
+
+ /*
+ * inspect each consequence in turn (comma-separated blocks
+ * extracted by htsjdk)
+ */
+ List
+ *
+ * If {@code myConsequence} is not null, then this is the specific
+ * consequence data (pipe-delimited fields) that is for the current allele and
+ * transcript (sequence) being processed.
*
* @param variant
- * @param seq
* @param sf
- * @param altAlelleIndex
- * (0, 1..)
+ * @param myConsequence
*/
- protected void addConsequences(VariantContext variant, SequenceI seq,
- SequenceFeature sf, int altAlelleIndex)
+ protected void addConsequences(VariantContext variant, SequenceFeature sf,
+ String myConsequence)
{
Object value = variant.getAttribute(CSQ_FIELD);
+ // TODO if CSQ not present, try ANN (for SnpEff consequence data)?
- if (value == null || !(value instanceof ArrayList<?>))
+ if (value == null || !(value instanceof List<?>))
{
return;
}
@@ -888,42 +1259,17 @@ public class VCFLoader
List