X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=ea5b8e06f723f3ac512a903c780508e2158184f5;hb=57738a1f3c19b1c3a00bd3ac5108f8cd0af32f99;hp=20e3ccd48192504ad0c39bfcdda69be153a16749;hpb=0a680b4ff1aaad7580d3b10941233307e2190be4;p=jalview.git
diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java
index 20e3ccd..ea5b8e0 100644
--- a/src/jalview/io/vcf/VCFLoader.java
+++ b/src/jalview/io/vcf/VCFLoader.java
@@ -1,32 +1,37 @@
+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ *
+ * This file is part of Jalview.
+ *
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview. If not, see
* This method is not thread safe - concurrent threads should use separate * instances of this class. * - * @param filePath + * @param seqs * @param gui */ - public void loadVCF(final String filePath, - final AlignViewControllerGuiI gui) + public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui) { if (gui != null) { @@ -220,54 +304,71 @@ public class VCFLoader new Thread() { - @Override public void run() { - VCFLoader.this.doLoad(filePath, gui); + VCFLoader.this.doLoad(seqs, gui); } - }.start(); } /** - * Loads VCF on to an alignment - provided it can be related to one or more - * sequence's chromosomal coordinates + * Reads the specified contig sequence and adds its VCF variants to it * - * @param filePath - * @param gui - * optional callback handler for messages + * @param contig + * the id of a single sequence (contig) to load + * @return */ - protected void doLoad(String filePath, AlignViewControllerGuiI gui) + public SequenceI loadVCFContig(String contig) { - VCFReader reader = null; - try + VCFHeaderLine headerLine = header + .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + if (headerLine == null) { - // long start = System.currentTimeMillis(); - reader = new VCFReader(filePath); - - header = reader.getFileHeader(); - - try - { - dictionary = header.getSequenceDictionary(); - } catch (SAMException e) - { - // ignore - thrown if any contig line lacks length info - } + Console.error("VCF reference header not found"); + return null; + } + String ref = headerLine.getValue(); + if (ref.startsWith("file://")) + { + ref = ref.substring(7); + } + setSpeciesAndAssembly(ref); - sourceId = filePath; + SequenceI seq = null; + File dbFile = new File(ref); - saveMetadata(sourceId); + if (dbFile.exists()) + { + HtsContigDb db = new HtsContigDb("", dbFile); + seq = db.getSequenceProxy(contig); + loadSequenceVCF(seq); + db.close(); + } + else + { + Console.error("VCF reference not found: " + ref); + } - /* - * get offset of CSQ ALLELE_NUM and Feature if declared - */ - parseCsqHeader(); + return seq; + } + /** + * Loads VCF on to one or more sequences + * + * @param seqs + * @param gui + * optional callback handler for messages + */ + protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui) + { + try + { VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); - String vcfAssembly = ref.getValue(); + String reference = ref == null ? null : ref.getValue(); + + setSpeciesAndAssembly(reference); int varCount = 0; int seqCount = 0; @@ -275,9 +376,9 @@ public class VCFLoader /* * query for VCF overlapping each sequence in turn */ - for (SequenceI seq : al.getSequences()) + for (SequenceI seq : seqs) { - int added = loadSequenceVCF(seq, reader, vcfAssembly); + int added = loadSequenceVCF(seq); if (added > 0) { seqCount++; @@ -287,7 +388,6 @@ public class VCFLoader } if (gui != null) { - // long elapsed = System.currentTimeMillis() - start; String msg = MessageManager.formatMessage("label.added_vcf", varCount, seqCount); gui.setStatus(msg); @@ -322,6 +422,103 @@ public class VCFLoader } /** + * Attempts to determine and save the species and genome assembly version to + * which the VCF data applies. This may be done by parsing the + * {@code reference} header line, configured in a property file, or + * (potentially) confirmed interactively by the user. + *
+ * The saved values should be identifiers valid for Ensembl's REST service
+ * {@code map} endpoint, so they can be used (if necessary) to retrieve the
+ * mapping between VCF coordinates and sequence coordinates.
+ *
+ * @param reference
+ * @see https://rest.ensembl.org/documentation/info/assembly_map
+ * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
+ * @see https://rest.ensembl.org/info/species?content-type=text/xml
+ */
+ protected void setSpeciesAndAssembly(String reference)
+ {
+ if (reference == null)
+ {
+ Console.error("No VCF ##reference found, defaulting to "
+ + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES);
+ reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified
+ }
+ reference = reference.toLowerCase(Locale.ROOT);
+
+ /*
+ * for a non-human species, or other assembly identifier,
+ * specify as a Jalview property file entry e.g.
+ * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
+ * VCF_SPECIES = c_elegans=celegans
+ * to map a token in the reference header to a value
+ */
+ String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT)))
+ {
+ vcfAssembly = tokens[1].trim();
+ break;
+ }
+ }
+ }
+
+ vcfSpecies = DEFAULT_SPECIES;
+ prop = Cache.getProperty(VCF_SPECIES);
+ if (prop != null)
+ {
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT)))
+ {
+ vcfSpecies = tokens[1].trim();
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Opens the VCF file and parses header data
+ *
+ * @param filePath
+ * @throws IOException
+ */
+ private void initialise(String filePath) throws IOException
+ {
+ vcfFilePath = filePath;
+
+ reader = new VCFReader(filePath);
+
+ header = reader.getFileHeader();
+
+ try
+ {
+ dictionary = header.getSequenceDictionary();
+ } catch (SAMException e)
+ {
+ // ignore - thrown if any contig line lacks length info
+ }
+
+ sourceId = filePath;
+
+ saveMetadata(sourceId);
+
+ /*
+ * get offset of CSQ ALLELE_NUM and Feature if declared
+ */
+ parseCsqHeader();
+ }
+
+ /**
* Reads metadata (such as INFO field descriptions and datatypes) and saves
* them for future reference
*
@@ -384,7 +581,7 @@ public class VCFLoader
{
for (Pattern p : filters)
{
- if (p.matcher(id.toUpperCase()).matches())
+ if (p.matcher(id.toUpperCase(Locale.ROOT)).matches())
{
return true;
}
@@ -478,7 +675,7 @@ public class VCFLoader
{
try
{
- patterns.add(Pattern.compile(token.toUpperCase()));
+ patterns.add(Pattern.compile(token.toUpperCase(Locale.ROOT)));
} catch (PatternSyntaxException e)
{
System.err.println("Invalid pattern ignored: " + token);
@@ -489,13 +686,12 @@ public class VCFLoader
/**
* Transfers VCF features to sequences to which this sequence has a mapping.
- * If the mapping is 3:1, computes peptide variants from nucleotide variants.
*
* @param seq
*/
protected void transferAddedFeatures(SequenceI seq)
{
- DBRefEntry[] dbrefs = seq.getDBRefs();
+ List
- * Allele matching: if field ALLELE_NUM is present, it must match
- * altAlleleIndex. If not present, then field Allele value must match the VCF
- * Allele.
- *
- * Transcript matching: if sequence name can be identified to at least one of
- * the consequences' Feature values, then select only consequences that match
- * the value (i.e. consequences for the current transcript sequence). If not,
- * take all consequences (this is the case when adding features to the gene
- * sequence).
+ * If
*
*
- * @param seq
- * @param variant
- * @param altAlleleIndex
+ * @param consequence
* @return
* @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
*/
- String getOntologyTerm(SequenceI seq, VariantContext variant,
- int altAlleleIndex)
+ String getOntologyTerm(String consequence)
{
String type = SequenceOntologyI.SEQUENCE_VARIANT;
+ /*
+ * could we associate Consequence data with this allele and feature (transcript)?
+ * if so, prefer the consequence term from that data
+ */
if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
{
/*
@@ -947,14 +1159,6 @@ public class VCFLoader
return type;
}
- /*
- * can we associate Consequence data with this allele and feature (transcript)?
- * if so, prefer the consequence term from that data
- */
- String consequence = getConsequenceForAlleleAndFeature(variant,
- CSQ_FIELD,
- altAlleleIndex, csqAlleleFieldIndex, csqAlleleNumberFieldIndex,
- seq.getName().toLowerCase(), csqFeatureFieldIndex);
if (consequence != null)
{
String[] csqFields = consequence.split(PIPE_REGEX);
@@ -1007,8 +1211,7 @@ public class VCFLoader
*/
private String getConsequenceForAlleleAndFeature(VariantContext variant,
String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
- int alleleNumberFieldIndex,
- String seqName, int featureFieldIndex)
+ int alleleNumberFieldIndex, String seqName, int featureFieldIndex)
{
if (alleleFieldIndex == -1 || featureFieldIndex == -1)
{
@@ -1033,8 +1236,8 @@ public class VCFLoader
if (csqFields.length > featureFieldIndex)
{
String featureIdentifier = csqFields[featureFieldIndex];
- if (featureIdentifier.length() > 4
- && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
+ if (featureIdentifier.length() > 4 && seqName
+ .indexOf(featureIdentifier.toLowerCase(Locale.ROOT)) > -1)
{
/*
* feature (transcript) matched - now check for allele match
@@ -1085,13 +1288,15 @@ public class VCFLoader
* Add any allele-specific VCF key-value data to the sequence feature
*
* @param variant
- * @param seq
* @param sf
* @param altAlelleIndex
* (0, 1..)
+ * @param consequence
+ * if not null, the consequence specific to this sequence (transcript
+ * feature) and allele
*/
- protected void addAlleleProperties(VariantContext variant, SequenceI seq,
- SequenceFeature sf, final int altAlelleIndex)
+ protected void addAlleleProperties(VariantContext variant,
+ SequenceFeature sf, final int altAlelleIndex, String consequence)
{
MapmyConsequence
is not null, then this is the specific
+ * consequence data (pipe-delimited fields) that is for the current allele and
+ * transcript (sequence) being processed)
*
* @param variant
- * @param seq
* @param sf
- * @param altAlleleIndex
- * (0, 1..)
+ * @param myConsequence
*/
- protected void addConsequences(VariantContext variant, SequenceI seq,
- SequenceFeature sf, int altAlleleIndex)
+ protected void addConsequences(VariantContext variant, SequenceFeature sf,
+ String myConsequence)
{
- /*
- * first try to identify the matching consequence
- */
- String myConsequence = getConsequenceForAlleleAndFeature(variant,
- CSQ_FIELD, altAlleleIndex, csqAlleleFieldIndex,
- csqAlleleNumberFieldIndex, seq.getName().toLowerCase(),
- csqFeatureFieldIndex);
-
Object value = variant.getAttribute(CSQ_FIELD);
if (value == null || !(value instanceof List>))
@@ -1225,6 +1485,11 @@ public class VCFLoader
String id = vepFieldsOfInterest.get(i);
if (id != null)
{
+ /*
+ * VCF spec requires encoding of special characters e.g. '='
+ * so decode them here before storing
+ */
+ field = StringUtils.urlDecode(field, VCF_ENCODABLE);
csqValues.put(id, field);
}
}
@@ -1334,8 +1599,8 @@ public class VCFLoader
* @param toRef
* @return
*/
- protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
- String species, String fromRef, String toRef)
+ protected int[] findSubsumedRangeMapping(int[] queryRange,
+ String chromosome, String species, String fromRef, String toRef)
{
String key = makeRangesKey(chromosome, species, fromRef, toRef);
if (assemblyMappings.containsKey(key))
@@ -1357,7 +1622,8 @@ public class VCFLoader
*/
int offset = queryRange[0] - fromRange[0];
int mappedRangeFrom = toRange[0] + offset;
- int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
+ int mappedRangeTo = mappedRangeFrom
+ + (queryRange[1] - queryRange[0]);
return new int[] { mappedRangeFrom, mappedRangeTo };
}
}
@@ -1380,7 +1646,7 @@ public class VCFLoader
SequenceI targetSequence, MapList mapping)
{
int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
-
+
if (mappedRange != null)
{
String group = sf.getFeatureGroup();
@@ -1404,7 +1670,6 @@ public class VCFLoader
protected static String makeRangesKey(String chromosome, String species,
String fromRef, String toRef)
{
- return species + EXCL + chromosome + EXCL + fromRef + EXCL
- + toRef;
+ return species + EXCL + chromosome + EXCL + fromRef + EXCL + toRef;
}
}