From 9317cd655af4803461acc71581ecbdc0a6677069 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 21 Sep 2017 14:14:41 +0100 Subject: [PATCH] JAL-2738 PoC of Load VCF file on to gene sequence --- resources/lang/Messages.properties | 2 + src/jalview/ext/htsjdk/VCFReader.java | 14 +- src/jalview/gui/AlignFrame.java | 22 +++ src/jalview/io/vcf/VCFLoader.java | 256 ++++++++++++++++++++++++++++ src/jalview/jbgui/GAlignFrame.java | 17 ++ test/jalview/ext/htsjdk/VCFReaderTest.java | 30 +++- 6 files changed, 332 insertions(+), 9 deletions(-) create mode 100644 src/jalview/io/vcf/VCFLoader.java diff --git a/resources/lang/Messages.properties b/resources/lang/Messages.properties index fe74476..42baf30 100644 --- a/resources/lang/Messages.properties +++ b/resources/lang/Messages.properties @@ -490,6 +490,8 @@ label.settings_for_type = Settings for {0} label.view_full_application = View in Full Application label.load_associated_tree = Load Associated Tree... label.load_features_annotations = Load Features/Annotations... +label.load_vcf = Load plain text or indexed VCF data +label.load_vcf_file = Load VCF File label.export_features = Export Features... label.export_annotations = Export Annotations... label.to_upper_case = To Upper Case diff --git a/src/jalview/ext/htsjdk/VCFReader.java b/src/jalview/ext/htsjdk/VCFReader.java index 8dfd7e2..c5e09e0 100644 --- a/src/jalview/ext/htsjdk/VCFReader.java +++ b/src/jalview/ext/htsjdk/VCFReader.java @@ -3,6 +3,7 @@ package jalview.ext.htsjdk; import htsjdk.samtools.util.CloseableIterator; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFFileReader; +import htsjdk.variant.vcf.VCFHeader; import java.io.Closeable; import java.io.File; @@ -72,7 +73,8 @@ public class VCFReader implements Closeable, Iterable /** * Queries for records overlapping the region specified. Note that this method * requires a VCF file with an associated index. If no index exists a - * TribbleException will be thrown. + * TribbleException will be thrown. Client code should call close() on the + * iterator when finished with it. * * @param chrom * the chromosome to query @@ -87,4 +89,14 @@ public class VCFReader implements Closeable, Iterable { return reader == null ? null : reader.query(chrom, start, end); } + + /** + * Returns an object that models the VCF file headers + * + * @return + */ + public VCFHeader getFileHeader() + { + return reader == null ? null : reader.getFileHeader(); + } } diff --git a/src/jalview/gui/AlignFrame.java b/src/jalview/gui/AlignFrame.java index 13b715e..3ea5481 100644 --- a/src/jalview/gui/AlignFrame.java +++ b/src/jalview/gui/AlignFrame.java @@ -81,6 +81,7 @@ import jalview.io.JnetAnnotationMaker; import jalview.io.NewickFile; import jalview.io.ScoreMatrixFile; import jalview.io.TCoffeeScoreFile; +import jalview.io.vcf.VCFLoader; import jalview.jbgui.GAlignFrame; import jalview.schemes.ColourSchemeI; import jalview.schemes.ColourSchemes; @@ -841,6 +842,7 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener, AlignmentI al = getViewport().getAlignment(); boolean nucleotide = al.isNucleotide(); + loadVcf.setVisible(nucleotide); showTranslation.setVisible(nucleotide); showReverse.setVisible(nucleotide); showReverseComplement.setVisible(nucleotide); @@ -5584,6 +5586,26 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener, new CalculationChooser(AlignFrame.this); } } + + @Override + protected void loadVcf_actionPerformed() + { + JalviewFileChooser chooser = new JalviewFileChooser( + Cache.getProperty("LAST_DIRECTORY")); + chooser.setFileView(new JalviewFileView()); + chooser.setDialogTitle(MessageManager.getString("label.load_vcf_file")); + chooser.setToolTipText(MessageManager.getString("label.load_vcf_file")); + + int value = chooser.showOpenDialog(null); + + if (value == JalviewFileChooser.APPROVE_OPTION) + { + String choice = chooser.getSelectedFile().getPath(); + Cache.setProperty("LAST_DIRECTORY", choice); + new VCFLoader(viewport.getAlignment()).loadVCF(choice); + } + + } } class PrintThread extends Thread diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java new file mode 100644 index 0000000..d70ffdf --- /dev/null +++ b/src/jalview/io/vcf/VCFLoader.java @@ -0,0 +1,256 @@ +package jalview.io.vcf; + +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; + +import jalview.datamodel.AlignmentI; +import jalview.datamodel.GeneLoci; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.ext.htsjdk.VCFReader; +import jalview.io.gff.SequenceOntologyI; +import jalview.util.MapList; + +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +public class VCFLoader +{ + AlignmentI al; + + /** + * Constructor given an alignment context + * + * @param alignment + */ + public VCFLoader(AlignmentI alignment) + { + al = alignment; + } + + /** + * Loads VCF on to an alignment - provided it can be related to one or more + * sequence's chromosomal coordinates + * + * @param filePath + */ + public void loadVCF(String filePath) + { + VCFReader reader = null; + + try + { + reader = new VCFReader(filePath); + + VCFHeader header = reader.getFileHeader(); + VCFHeaderLine ref = header + .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + // check if reference is wrt assembly19 (GRCh37) + // todo may need to allow user to specify reference assembly? + boolean isRefGrch37 = (ref != null && ref.getValue().contains( + "assembly19")); + + int varCount = 0; + int seqCount = 0; + + for (SequenceI seq : al.getSequences()) + { + int added = loadVCF(seq, reader, isRefGrch37); + if (added > 0) + { + seqCount++; + varCount += added; + } + } + System.out.println(String.format( +"Added %d VCF variants to %d sequence(s)", varCount, + seqCount)); + + reader.close(); + } catch (Throwable e) + { + System.err.println("Error processing VCF: " + e.getMessage()); + e.printStackTrace(); + } + } + + /** + * Tries to add overlapping variants read from a VCF file to the given + * sequence, and returns the number of overlapping variants found. Note that + * this requires the sequence to hold information as to its chromosomal + * positions and reference, in order to be able to map the VCF variants to the + * sequence. + * + * @param seq + * @param reader + * @param isVcfRefGrch37 + * @return + */ + protected int loadVCF(SequenceI seq, VCFReader reader, boolean isVcfRefGrch37) + { + int count = 0; + GeneLoci seqCoords = ((Sequence) seq).getGeneLoci(); + if (seqCoords == null) + { + return 0; + } + + /* + * fudge - ensure chromosomal mapping from range is sequence start/end + * (in case end == 0 when the mapping is first created) + */ + MapList mapping = seqCoords.getMapping(); + if (mapping.getFromRanges().get(0)[1] == 0) + { + mapping.getFromRanges().get(0)[1] = seq.getEnd(); + } + + List seqChromosomalContigs = mapping.getToRanges(); + for (int[] range : seqChromosomalContigs) + { + count += addVcfVariants(seq, reader, range, isVcfRefGrch37); + } + + return count; + } + + /** + * Queries the VCF reader for any variants that overlap the given chromosome + * region of the sequence, and adds as variant features. Returns the number of + * overlapping variants found. + * + * @param seq + * @param reader + * @param range + * start-end range of a sequence region in its chromosomal + * coordinates + * @param isVcfRefGrch37 + * true if the VCF is with reference to GRCh37 + * @return + */ + protected int addVcfVariants(SequenceI seq, VCFReader reader, + int[] range, boolean isVcfRefGrch37) + { + GeneLoci seqCoords = ((Sequence) seq).getGeneLoci(); + + String chromosome = seqCoords.getChromosome(); + String seqRef = seqCoords.getReference(); + String species = seqCoords.getSpecies(); + + /* + * map chromosomal coordinates from GRCh38 (sequence) to + * GRCh37 (VCF) if necessary + */ + int offset = 0; + if ("GRCh38".equalsIgnoreCase(seqRef) && isVcfRefGrch37) + { + int[] newRange = mapReferenceRange(range, chromosome, species, + "GRCh38", + "GRCh37"); + offset = newRange[0] - range[0]; + range = newRange; + } + + /* + * query the VCF for overlaps + */ + int count = 0; + MapList mapping = seqCoords.getMapping(); + + CloseableIterator variants = reader.query(chromosome, + range[0], range[1]); + while (variants.hasNext()) + { + /* + * get variant location in sequence chromosomal coordinates + */ + VariantContext variant = variants.next(); + count++; + int start = variant.getStart() - offset; + int end = variant.getEnd() - offset; + + /* + * get location in sequence coordinates + */ + int[] seqLocation = mapping.locateInFrom(start, end); + int featureStart = seqLocation[0]; + int featureEnd = seqLocation[1]; + addVariantFeatures(seq, variant, featureStart, featureEnd); + } + + variants.close(); + + return count; + } + + /** + * Inspects the VCF variant record, and adds variant features to the sequence + * + * @param seq + * @param variant + * @param featureStart + * @param featureEnd + */ + protected void addVariantFeatures(SequenceI seq, VariantContext variant, + int featureStart, int featureEnd) + { + StringBuilder sb = new StringBuilder(); + sb.append(variant.getReference().getBaseString()); + + for (Allele allele : variant.getAlleles()) + { + if (!allele.isReference()) + { + sb.append(",").append(allele.getBaseString()); + } + } + String alleles = sb.toString(); // e.g. G,A,C + + String type = SequenceOntologyI.SEQUENCE_VARIANT; + SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, + featureEnd, "VCF"); + + /* + * only add 'alleles' property if a SNP, as we can + * only handle SNPs when computing peptide variants + */ + if (variant.isSNP()) + { + sf.setValue("alleles", alleles); + } + + Map atts = variant.getAttributes(); + for (Entry att : atts.entrySet()) + { + sf.setValue(att.getKey(), att.getValue()); + } + seq.addSequenceFeature(sf); + } + + /** + * Call the Ensembl REST service that maps from one assembly reference's + * coordinates to another's + * + * @param range + * @param chromosome + * @param species + * @param fromRef + * @param toRef + * @return + */ + protected int[] mapReferenceRange(int[] range, String chromosome, + String species, String fromRef, String toRef) + { + // TODO call + // http://rest.ensembl.org/map/species/fromRef/chromosome:range[0]..range[1]:1/toRef?content-type=application/json + // and parse the JSON response + + // 1922632 is the difference between 37 and 38 for chromosome 17 + return new int[] { range[0] - 1922632, range[1] - 1922632 }; + } +} diff --git a/src/jalview/jbgui/GAlignFrame.java b/src/jalview/jbgui/GAlignFrame.java index 86d0c85..1cf482d 100755 --- a/src/jalview/jbgui/GAlignFrame.java +++ b/src/jalview/jbgui/GAlignFrame.java @@ -147,6 +147,8 @@ public class GAlignFrame extends JInternalFrame protected JMenuItem runGroovy = new JMenuItem(); + protected JMenuItem loadVcf; + protected JCheckBoxMenuItem autoCalculate = new JCheckBoxMenuItem(); protected JCheckBoxMenuItem sortByTree = new JCheckBoxMenuItem(); @@ -1308,6 +1310,16 @@ public class GAlignFrame extends JInternalFrame associatedData_actionPerformed(e); } }); + loadVcf = new JMenuItem(MessageManager.getString("label.load_vcf_file")); + loadVcf.setToolTipText(MessageManager.getString("label.load_vcf")); + loadVcf.addActionListener(new ActionListener() + { + @Override + public void actionPerformed(ActionEvent e) + { + loadVcf_actionPerformed(); + } + }); autoCalculate.setText( MessageManager.getString("label.autocalculate_consensus")); autoCalculate.setState( @@ -1710,6 +1722,7 @@ public class GAlignFrame extends JInternalFrame fileMenu.add(exportAnnotations); fileMenu.add(loadTreeMenuItem); fileMenu.add(associatedData); + fileMenu.add(loadVcf); fileMenu.addSeparator(); fileMenu.add(closeMenuItem); @@ -1855,6 +1868,10 @@ public class GAlignFrame extends JInternalFrame // selectMenu.add(listenToViewSelections); } + protected void loadVcf_actionPerformed() + { + } + /** * Constructs the entries on the Colour menu (but does not add them to the * menu). diff --git a/test/jalview/ext/htsjdk/VCFReaderTest.java b/test/jalview/ext/htsjdk/VCFReaderTest.java index 42c655d..29d752c 100644 --- a/test/jalview/ext/htsjdk/VCFReaderTest.java +++ b/test/jalview/ext/htsjdk/VCFReaderTest.java @@ -26,6 +26,8 @@ public class VCFReaderTest // gnomAD exome variant dataset private static final String VCF_PATH = "/Volumes/gjb/smacgowan/NOBACK/resources/gnomad/gnomad.exomes.r2.0.1.sites.vcf.gz"; + // "https://storage.cloud.google.com/gnomad-public/release/2.0.1/vcf/exomes/gnomad.exomes.r2.0.1.sites.vcf.gz"; + /** * A test to exercise some basic functionality of the htsjdk VCF reader * @@ -105,8 +107,6 @@ public class VCFReaderTest pw.close(); return f; } - - // "https://storage.cloud.google.com/gnomad-public/release/2.0.1/vcf/exomes/gnomad.exomes.r2.0.1.sites.vcf.gz"; /** * A 'test' that demonstrates querying an indexed VCF file for features in a @@ -131,15 +131,29 @@ public class VCFReaderTest CloseableIterator features = reader.query("17", 43128978 + 9724, 43128978 + 9734); // first 11 CDS positions - assertEquals(features.next().getStart(), 43138702); - assertEquals(features.next().getStart(), 43138704); - assertEquals(features.next().getStart(), 43138707); - assertEquals(features.next().getStart(), 43138708); - assertEquals(features.next().getStart(), 43138710); - assertEquals(features.next().getStart(), 43138711); + assertEquals(printNext(features), 43138702); + assertEquals(printNext(features), 43138704); + assertEquals(printNext(features), 43138707); + assertEquals(printNext(features), 43138708); + assertEquals(printNext(features), 43138710); + assertEquals(printNext(features), 43138711); assertFalse(features.hasNext()); features.close(); reader.close(); } + + /** + * Prints the toString value of the next variant, and returns its start + * location + * + * @param features + * @return + */ + protected int printNext(CloseableIterator features) + { + VariantContext next = features.next(); + System.out.println(next.toString()); + return next.getStart(); + } } -- 1.7.10.2