From: gmungoc Date: Mon, 4 Dec 2017 15:21:25 +0000 (+0000) Subject: JAL-2850 JAL-1766 option to load contig sequence referenced by VCF X-Git-Tag: Release_2_11_0~62^2~5 X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=commitdiff_plain;h=296593216c47a835f462d1d74a40b41e4818f737 JAL-2850 JAL-1766 option to load contig sequence referenced by VCF --- diff --git a/src/jalview/ext/htsjdk/HtsContigDb.java b/src/jalview/ext/htsjdk/HtsContigDb.java index 729f658..73d1674 100644 --- a/src/jalview/ext/htsjdk/HtsContigDb.java +++ b/src/jalview/ext/htsjdk/HtsContigDb.java @@ -250,10 +250,19 @@ public class HtsContigDb // ///// end of hts bits. + /** + * Reads the contig with the given id and returns as a Jalview SequenceI object. + * Note the database must be indexed for this operation to succeed. + * + * @param id + * @return + */ public SequenceI getSequenceProxy(String id) { - if (!isValid()) + if (!isValid() || !refFile.isIndexed()) { + System.err.println( + "Cannot read contig as file is invalid or not indexed"); return null; } diff --git a/src/jalview/gui/AlignFrame.java b/src/jalview/gui/AlignFrame.java index 5b812c2..8a44f27 100644 --- a/src/jalview/gui/AlignFrame.java +++ b/src/jalview/gui/AlignFrame.java @@ -5603,7 +5603,8 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener, { String choice = chooser.getSelectedFile().getPath(); Cache.setProperty("LAST_DIRECTORY", choice); - new VCFLoader(viewport.getAlignment()).loadVCF(choice, this); + SequenceI[] seqs = viewport.getAlignment().getSequencesArray(); + new VCFLoader(choice).loadVCF(seqs, this); } } diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 9d98b7e..9addfaa 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -4,7 +4,6 @@ import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; import jalview.bin.Cache; -import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; @@ -14,6 +13,7 @@ import jalview.datamodel.features.FeatureAttributeType; import jalview.datamodel.features.FeatureSource; import jalview.datamodel.features.FeatureSources; import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.HtsContigDb; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyI; @@ -21,6 +21,7 @@ import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.MessageManager; +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; @@ -135,9 +136,9 @@ public class VCFLoader private static final String EXCL = "!"; /* - * the alignment we are associating VCF data with + * the VCF file we are processing */ - private AlignmentI al; + protected String vcfFilePath; /* * mappings between VCF and sequence reference assembly regions, as @@ -146,6 +147,8 @@ public class VCFLoader */ private Map> assemblyMappings; + private VCFReader reader; + /* * holds details of the VCF header lines (metadata) */ @@ -189,29 +192,35 @@ public class VCFLoader Map vepFieldsOfInterest; /** - * Constructor given an alignment context + * Constructor given a VCF file * * @param alignment */ - public VCFLoader(AlignmentI alignment) + public VCFLoader(String vcfFile) { - al = alignment; + try + { + initialise(vcfFile); + } catch (IOException e) + { + System.err.println("Error opening VCF file: " + e.getMessage()); + } // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange} assemblyMappings = new HashMap<>(); } /** - * Starts a new thread to query and load VCF variant data on to the alignment + * Starts a new thread to query and load VCF variant data on to the given + * sequences *

* This method is not thread safe - concurrent threads should use separate * instances of this class. * - * @param filePath + * @param seqs * @param gui */ - public void loadVCF(final String filePath, - final AlignViewControllerGuiI gui) + public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui) { if (gui != null) { @@ -220,51 +229,59 @@ public class VCFLoader new Thread() { - @Override public void run() { - VCFLoader.this.doLoad(filePath, gui); + VCFLoader.this.doLoad(seqs, gui); } - }.start(); } /** - * Loads VCF on to an alignment - provided it can be related to one or more - * sequence's chromosomal coordinates + * Reads the specified contig sequence and adds its VCF variants to it * - * @param filePath - * @param gui - * optional callback handler for messages + * @param contig + * the id of a single sequence (contig) to load + * @return */ - protected void doLoad(String filePath, AlignViewControllerGuiI gui) + public SequenceI loadVCFContig(String contig) { - VCFReader reader = null; - try + String ref = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY) + .getValue(); + if (ref.startsWith("file://")) { - // long start = System.currentTimeMillis(); - reader = new VCFReader(filePath); - - header = reader.getFileHeader(); - - try - { - dictionary = header.getSequenceDictionary(); - } catch (SAMException e) - { - // ignore - thrown if any contig line lacks length info - } + ref = ref.substring(7); + } - sourceId = filePath; + SequenceI seq = null; + File dbFile = new File(ref); - saveMetadata(sourceId); + if (dbFile.exists()) + { + HtsContigDb db = new HtsContigDb("", dbFile); + seq = db.getSequenceProxy(contig); + loadSequenceVCF(seq, ref); + db.close(); + } + else + { + System.err.println("VCF reference not found: " + ref); + } - /* - * get offset of CSQ ALLELE_NUM and Feature if declared - */ - parseCsqHeader(); + return seq; + } + /** + * Loads VCF on to one or more sequences + * + * @param seqs + * @param gui + * optional callback handler for messages + */ + protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui) + { + try + { VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); String vcfAssembly = ref.getValue(); @@ -275,9 +292,9 @@ public class VCFLoader /* * query for VCF overlapping each sequence in turn */ - for (SequenceI seq : al.getSequences()) + for (SequenceI seq : seqs) { - int added = loadSequenceVCF(seq, reader, vcfAssembly); + int added = loadSequenceVCF(seq, vcfAssembly); if (added > 0) { seqCount++; @@ -287,7 +304,6 @@ public class VCFLoader } if (gui != null) { - // long elapsed = System.currentTimeMillis() - start; String msg = MessageManager.formatMessage("label.added_vcf", varCount, seqCount); gui.setStatus(msg); @@ -322,6 +338,38 @@ public class VCFLoader } /** + * Opens the VCF file and parses header data + * + * @param filePath + * @throws IOException + */ + private void initialise(String filePath) throws IOException + { + vcfFilePath = filePath; + + reader = new VCFReader(filePath); + + header = reader.getFileHeader(); + + try + { + dictionary = header.getSequenceDictionary(); + } catch (SAMException e) + { + // ignore - thrown if any contig line lacks length info + } + + sourceId = filePath; + + saveMetadata(sourceId); + + /* + * get offset of CSQ ALLELE_NUM and Feature if declared + */ + parseCsqHeader(); + } + + /** * Reads metadata (such as INFO field descriptions and datatypes) and saves * them for future reference * @@ -536,19 +584,14 @@ public class VCFLoader } /** - * Tries to add overlapping variants read from a VCF file to the given - * sequence, and returns the number of variant features added. Note that this - * requires the sequence to hold information as to its species, chromosomal - * positions and reference assembly, in order to be able to map the VCF - * variants to the sequence (or not) + * Tries to add overlapping variants read from a VCF file to the given sequence, + * and returns the number of variant features added * * @param seq - * @param reader * @param vcfAssembly * @return */ - protected int loadSequenceVCF(SequenceI seq, VCFReader reader, - String vcfAssembly) + protected int loadSequenceVCF(SequenceI seq, String vcfAssembly) { VCFMap vcfMap = getVcfMap(seq, vcfAssembly); if (vcfMap == null) @@ -564,7 +607,7 @@ public class VCFLoader { dss = seq; } - return addVcfVariants(dss, reader, vcfMap, vcfAssembly); + return addVcfVariants(dss, vcfMap); } /** @@ -751,15 +794,11 @@ public class VCFLoader * overlapping variants found. * * @param seq - * @param reader * @param map * mapping from sequence to VCF coordinates - * @param vcfAssembly - * the '##reference' identifier for the VCF reference assembly * @return */ - protected int addVcfVariants(SequenceI seq, VCFReader reader, - VCFMap map, String vcfAssembly) + protected int addVcfVariants(SequenceI seq, VCFMap map) { boolean forwardStrand = map.map.isToForwardStrand(); @@ -939,8 +978,7 @@ public class VCFLoader String type = SequenceOntologyI.SEQUENCE_VARIANT; if (consequence != null) { - type = getOntologyTerm(seq, variant, altAlleleIndex, - consequence); + type = getOntologyTerm(consequence); } float score = getAlleleFrequency(variant, altAlleleIndex); @@ -951,7 +989,7 @@ public class VCFLoader sf.setValue(Gff3Helper.ALLELES, alleles); - addAlleleProperties(variant, seq, sf, altAlleleIndex, consequence); + addAlleleProperties(variant, sf, altAlleleIndex, consequence); seq.addSequenceFeature(sf); @@ -967,15 +1005,11 @@ public class VCFLoader *

  • sequence id can be matched to VEP Feature (or SnpEff Feature_ID)
  • * * - * @param seq - * @param variant - * @param altAlleleIndex * @param consequence * @return * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060 */ - String getOntologyTerm(SequenceI seq, VariantContext variant, - int altAlleleIndex, String consequence) + String getOntologyTerm(String consequence) { String type = SequenceOntologyI.SEQUENCE_VARIANT; @@ -1121,7 +1155,6 @@ public class VCFLoader * Add any allele-specific VCF key-value data to the sequence feature * * @param variant - * @param seq * @param sf * @param altAlelleIndex * (0, 1..) @@ -1129,7 +1162,7 @@ public class VCFLoader * if not null, the consequence specific to this sequence (transcript * feature) and allele */ - protected void addAlleleProperties(VariantContext variant, SequenceI seq, + protected void addAlleleProperties(VariantContext variant, SequenceFeature sf, final int altAlelleIndex, String consequence) { Map atts = variant.getAttributes(); @@ -1144,7 +1177,7 @@ public class VCFLoader */ if (CSQ_FIELD.equals(key)) { - addConsequences(variant, seq, sf, consequence); + addConsequences(variant, sf, consequence); continue; } @@ -1209,12 +1242,11 @@ public class VCFLoader * transcript (sequence) being processed) * * @param variant - * @param seq * @param sf * @param myConsequence */ - protected void addConsequences(VariantContext variant, SequenceI seq, - SequenceFeature sf, String myConsequence) + protected void addConsequences(VariantContext variant, SequenceFeature sf, + String myConsequence) { Object value = variant.getAttribute(CSQ_FIELD); // TODO if CSQ not present, try ANN (for SnpEff consequence data)? diff --git a/test/jalview/ext/htsjdk/TestHtsContigDb.java b/test/jalview/ext/htsjdk/TestHtsContigDb.java index 29303d0..28c5cf0 100644 --- a/test/jalview/ext/htsjdk/TestHtsContigDb.java +++ b/test/jalview/ext/htsjdk/TestHtsContigDb.java @@ -78,6 +78,17 @@ public class TestHtsContigDb fail("Expected exception opening .fai file"); } + /** + * Tests that exercise + *
      + *
    • opening an unindexed fasta file
    • + *
    • creating a .fai index
    • + *
    • opening the fasta file, now using the index
    • + *
    • error on creating index if overwrite not allowed
    • + *
    + * + * @throws IOException + */ @Test(groups = "Functional") public void testCreateFastaSequenceIndex() throws IOException { @@ -120,4 +131,18 @@ public class TestHtsContigDb assertTrue(db.isIndexed()); db.close(); } + + /** + * A convenience 'test' that may be run to create a .fai file for any given + * fasta file + * + * @throws IOException + */ + @Test(enabled = false) + public void testCreateIndex() throws IOException + { + + File fasta = new File("test/jalview/io/vcf/contigs.fasta"); + HtsContigDb.createFastaSequenceIndex(fasta.toPath(), true); + } } diff --git a/test/jalview/io/vcf/VCFLoaderTest.java b/test/jalview/io/vcf/VCFLoaderTest.java index 29cf9c9..a02cc5a 100644 --- a/test/jalview/io/vcf/VCFLoaderTest.java +++ b/test/jalview/io/vcf/VCFLoaderTest.java @@ -75,17 +75,18 @@ public class VCFLoaderTest Cache.loadProperties("test/jalview/io/testProps.jvprops"); Cache.setProperty("VCF_FIELDS", ".*"); Cache.setProperty("VEP_FIELDS", ".*"); + Cache.initLogger(); } @Test(groups = "Functional") public void testDoLoad() throws IOException { AlignmentI al = buildAlignment(); - VCFLoader loader = new VCFLoader(al); File f = makeVcf(); + VCFLoader loader = new VCFLoader(f.getPath()); - loader.doLoad(f.getPath(), null); + loader.doLoad(al.getSequencesArray(), null); /* * verify variant feature(s) added to gene @@ -313,11 +314,11 @@ public class VCFLoaderTest { AlignmentI al = buildAlignment(); - VCFLoader loader = new VCFLoader(al); - File f = makeVcf(); - loader.doLoad(f.getPath(), null); + VCFLoader loader = new VCFLoader(f.getPath()); + + loader.doLoad(al.getSequencesArray(), null); /* * verify variant feature(s) added to gene2 @@ -440,7 +441,7 @@ public class VCFLoaderTest { AlignmentI al = buildAlignment(); - VCFLoader loader = new VCFLoader(al); + VCFLoader loader = new VCFLoader("test/jalview/io/vcf/testVcf.vcf"); /* * VCF data file with variants at gene3 positions @@ -450,7 +451,7 @@ public class VCFLoaderTest * 13 C/G, C/T * 17 A/AC (insertion), A/G */ - loader.doLoad("test/jalview/io/vcf/testVcf.dat", null); + loader.doLoad(al.getSequencesArray(), null); /* * verify variant feature(s) added to gene3 @@ -616,4 +617,58 @@ public class VCFLoaderTest assertEquals(map.size(), 9); assertEquals(sf.getValueAsString("CSQ", "Feature"), "transcript4"); } -} + + /** + * A test that demonstrates loading a contig sequence from an indexed sequence + * database which is the reference for a VCF file + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testLoadVCFContig() throws IOException + { + VCFLoader loader = new VCFLoader( + "test/jalview/io/vcf/testVcf2.vcf"); + + SequenceI seq = loader.loadVCFContig("contig123"); + assertEquals(seq.getLength(), 15); + assertEquals(seq.getSequenceAsString(), "AAAAACCCCCGGGGG"); + List features = seq.getSequenceFeatures(); + SequenceFeatures.sortFeatures(features, true); + assertEquals(features.size(), 2); + SequenceFeature sf = features.get(0); + assertEquals(sf.getBegin(), 8); + assertEquals(sf.getEnd(), 8); + assertEquals(sf.getDescription(), "C,A"); + sf = features.get(1); + assertEquals(sf.getBegin(), 12); + assertEquals(sf.getEnd(), 12); + assertEquals(sf.getDescription(), "G,T"); + + seq = loader.loadVCFContig("contig789"); + assertEquals(seq.getLength(), 25); + assertEquals(seq.getSequenceAsString(), "GGGGGTTTTTAAAAACCCCCGGGGG"); + features = seq.getSequenceFeatures(); + SequenceFeatures.sortFeatures(features, true); + assertEquals(features.size(), 2); + sf = features.get(0); + assertEquals(sf.getBegin(), 2); + assertEquals(sf.getEnd(), 2); + assertEquals(sf.getDescription(), "G,T"); + sf = features.get(1); + assertEquals(sf.getBegin(), 21); + assertEquals(sf.getEnd(), 21); + assertEquals(sf.getDescription(), "G,A"); + + seq = loader.loadVCFContig("contig456"); + assertEquals(seq.getLength(), 20); + assertEquals(seq.getSequenceAsString(), "CCCCCGGGGGTTTTTAAAAA"); + features = seq.getSequenceFeatures(); + SequenceFeatures.sortFeatures(features, true); + assertEquals(features.size(), 1); + sf = features.get(0); + assertEquals(sf.getBegin(), 15); + assertEquals(sf.getEnd(), 15); + assertEquals(sf.getDescription(), "T,C"); + } +} \ No newline at end of file diff --git a/test/jalview/io/vcf/contigs.fasta b/test/jalview/io/vcf/contigs.fasta new file mode 100644 index 0000000..ec839b6 --- /dev/null +++ b/test/jalview/io/vcf/contigs.fasta @@ -0,0 +1,6 @@ +>contig123 +AAAAACCCCCGGGGG +>contig456 +CCCCCGGGGGTTTTTAAAAA +>contig789 +GGGGGTTTTTAAAAACCCCCGGGGG diff --git a/test/jalview/io/vcf/contigs.fasta.fai b/test/jalview/io/vcf/contigs.fasta.fai new file mode 100644 index 0000000..e9f5067 --- /dev/null +++ b/test/jalview/io/vcf/contigs.fasta.fai @@ -0,0 +1,3 @@ +contig123 15 11 15 16 +contig456 20 38 20 21 +contig789 25 70 25 26 diff --git a/test/jalview/io/vcf/testVcf.dat b/test/jalview/io/vcf/testVcf.vcf similarity index 100% rename from test/jalview/io/vcf/testVcf.dat rename to test/jalview/io/vcf/testVcf.vcf diff --git a/test/jalview/io/vcf/testVcf2.vcf b/test/jalview/io/vcf/testVcf2.vcf new file mode 100644 index 0000000..aa3792a --- /dev/null +++ b/test/jalview/io/vcf/testVcf2.vcf @@ -0,0 +1,13 @@ +##fileformat=VCFv4.2 +##INFO= +##contig= +##contig= +##contig= +##INFO= +##reference=test/jalview/io/vcf/contigs.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO +contig123 8 . C A 81.96 . AC=1;AF=0.1 +contig123 12 . G T 1666.64 . AC=1;AF=0.2 +contig456 15 . T C 41.94 . AC=1;AF=0.3 +contig789 2 . G T 224.23 . AC=1,2;AF=0 +contig789 21 . G A 433.35 . AC=3;AF=0.6