X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=de2f18ad7fa812fefb35350307fff2ed6b1ac322;hb=14550ad1e5da4b81082a3a06222cbe26eb9435ce;hp=20e3ccd48192504ad0c39bfcdda69be153a16749;hpb=27a06af565d224505f2484a9c74743fb3cf69be8;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 20e3ccd..de2f18a 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -4,7 +4,6 @@ import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; import jalview.bin.Cache; -import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; @@ -14,6 +13,7 @@ import jalview.datamodel.features.FeatureAttributeType; import jalview.datamodel.features.FeatureSource; import jalview.datamodel.features.FeatureSources; import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.HtsContigDb; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyI; @@ -21,6 +21,7 @@ import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.MessageManager; +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; @@ -135,9 +136,9 @@ public class VCFLoader private static final String EXCL = "!"; /* - * the alignment we are associating VCF data with + * the VCF file we are processing */ - private AlignmentI al; + protected String vcfFilePath; /* * mappings between VCF and sequence reference assembly regions, as @@ -146,6 +147,8 @@ public class VCFLoader */ private Map> assemblyMappings; + private VCFReader reader; + /* * holds details of the VCF header lines (metadata) */ @@ -189,29 +192,35 @@ public class VCFLoader Map vepFieldsOfInterest; /** - * Constructor given an alignment context + * Constructor given a VCF file * * @param alignment */ - public VCFLoader(AlignmentI alignment) + public VCFLoader(String vcfFile) { - al = alignment; + try + { + initialise(vcfFile); + } catch (IOException e) + { + System.err.println("Error opening VCF file: " + e.getMessage()); + } // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange} assemblyMappings = new HashMap<>(); } /** - * Starts a new thread to query and load VCF variant data on to the alignment + * Starts a new thread to query and load VCF variant data on to the given + * sequences *

* This method is not thread safe - concurrent threads should use separate * instances of this class. * - * @param filePath + * @param seqs * @param gui */ - public void loadVCF(final String filePath, - final AlignViewControllerGuiI gui) + public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui) { if (gui != null) { @@ -220,51 +229,59 @@ public class VCFLoader new Thread() { - @Override public void run() { - VCFLoader.this.doLoad(filePath, gui); + VCFLoader.this.doLoad(seqs, gui); } - }.start(); } /** - * Loads VCF on to an alignment - provided it can be related to one or more - * sequence's chromosomal coordinates + * Reads the specified contig sequence and adds its VCF variants to it * - * @param filePath - * @param gui - * optional callback handler for messages + * @param contig + * the id of a single sequence (contig) to load + * @return */ - protected void doLoad(String filePath, AlignViewControllerGuiI gui) + public SequenceI loadVCFContig(String contig) { - VCFReader reader = null; - try + String ref = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY) + .getValue(); + if (ref.startsWith("file://")) { - // long start = System.currentTimeMillis(); - reader = new VCFReader(filePath); - - header = reader.getFileHeader(); - - try - { - dictionary = header.getSequenceDictionary(); - } catch (SAMException e) - { - // ignore - thrown if any contig line lacks length info - } + ref = ref.substring(7); + } - sourceId = filePath; + SequenceI seq = null; + File dbFile = new File(ref); - saveMetadata(sourceId); + if (dbFile.exists()) + { + HtsContigDb db = new HtsContigDb("", dbFile); + seq = db.getSequenceProxy(contig); + loadSequenceVCF(seq, ref); + db.close(); + } + else + { + System.err.println("VCF reference not found: " + ref); + } - /* - * get offset of CSQ ALLELE_NUM and Feature if declared - */ - parseCsqHeader(); + return seq; + } + /** + * Loads VCF on to one or more sequences + * + * @param seqs + * @param gui + * optional callback handler for messages + */ + protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui) + { + try + { VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); String vcfAssembly = ref.getValue(); @@ -275,9 +292,9 @@ public class VCFLoader /* * query for VCF overlapping each sequence in turn */ - for (SequenceI seq : al.getSequences()) + for (SequenceI seq : seqs) { - int added = loadSequenceVCF(seq, reader, vcfAssembly); + int added = loadSequenceVCF(seq, vcfAssembly); if (added > 0) { seqCount++; @@ -287,7 +304,6 @@ public class VCFLoader } if (gui != null) { - // long elapsed = System.currentTimeMillis() - start; String msg = MessageManager.formatMessage("label.added_vcf", varCount, seqCount); gui.setStatus(msg); @@ -322,6 +338,38 @@ public class VCFLoader } /** + * Opens the VCF file and parses header data + * + * @param filePath + * @throws IOException + */ + private void initialise(String filePath) throws IOException + { + vcfFilePath = filePath; + + reader = new VCFReader(filePath); + + header = reader.getFileHeader(); + + try + { + dictionary = header.getSequenceDictionary(); + } catch (SAMException e) + { + // ignore - thrown if any contig line lacks length info + } + + sourceId = filePath; + + saveMetadata(sourceId); + + /* + * get offset of CSQ ALLELE_NUM and Feature if declared + */ + parseCsqHeader(); + } + + /** * Reads metadata (such as INFO field descriptions and datatypes) and saves * them for future reference * @@ -536,19 +584,14 @@ public class VCFLoader } /** - * Tries to add overlapping variants read from a VCF file to the given - * sequence, and returns the number of variant features added. Note that this - * requires the sequence to hold information as to its species, chromosomal - * positions and reference assembly, in order to be able to map the VCF - * variants to the sequence (or not) + * Tries to add overlapping variants read from a VCF file to the given sequence, + * and returns the number of variant features added * * @param seq - * @param reader * @param vcfAssembly * @return */ - protected int loadSequenceVCF(SequenceI seq, VCFReader reader, - String vcfAssembly) + protected int loadSequenceVCF(SequenceI seq, String vcfAssembly) { VCFMap vcfMap = getVcfMap(seq, vcfAssembly); if (vcfMap == null) @@ -556,7 +599,15 @@ public class VCFLoader return 0; } - return addVcfVariants(seq, reader, vcfMap, vcfAssembly); + /* + * work with the dataset sequence here + */ + SequenceI dss = seq.getDatasetSequence(); + if (dss == null) + { + dss = seq; + } + return addVcfVariants(dss, vcfMap); } /** @@ -743,15 +794,11 @@ public class VCFLoader * overlapping variants found. * * @param seq - * @param reader * @param map * mapping from sequence to VCF coordinates - * @param vcfAssembly - * the '##reference' identifier for the VCF reference assembly * @return */ - protected int addVcfVariants(SequenceI seq, VCFReader reader, - VCFMap map, String vcfAssembly) + protected int addVcfVariants(SequenceI seq, VCFMap map) { boolean forwardStrand = map.map.isToForwardStrand(); @@ -892,17 +939,47 @@ public class VCFLoader String allele = alt.getBaseString(); /* + * insertion after a genomic base, if on reverse strand, has to be + * converted to insertion of complement after the preceding position + */ + int referenceLength = reference.length(); + if (!forwardStrand && allele.length() > referenceLength + && allele.startsWith(reference)) + { + featureStart -= referenceLength; + featureEnd = featureStart; + char insertAfter = seq.getCharAt(featureStart - seq.getStart()); + reference = Dna.reverseComplement(String.valueOf(insertAfter)); + allele = allele.substring(referenceLength) + reference; + } + + /* * build the ref,alt allele description e.g. "G,A", using the base * complement if the sequence is on the reverse strand */ - // FIXME correctly handle insertions on reverse strand JAL-2845 StringBuilder sb = new StringBuilder(); sb.append(forwardStrand ? reference : Dna.reverseComplement(reference)); sb.append(COMMA); sb.append(forwardStrand ? allele : Dna.reverseComplement(allele)); String alleles = sb.toString(); // e.g. G,A - String type = getOntologyTerm(seq, variant, altAlleleIndex); + /* + * pick out the consequence data (if any) that is for the current allele + * and feature (transcript) that matches the current sequence + */ + String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD, + altAlleleIndex, csqAlleleFieldIndex, + csqAlleleNumberFieldIndex, seq.getName().toLowerCase(), + csqFeatureFieldIndex); + + /* + * pick out the ontology term for the consequence type + */ + String type = SequenceOntologyI.SEQUENCE_VARIANT; + if (consequence != null) + { + type = getOntologyTerm(consequence); + } float score = getAlleleFrequency(variant, altAlleleIndex); @@ -912,7 +989,7 @@ public class VCFLoader sf.setValue(Gff3Helper.ALLELES, alleles); - addAlleleProperties(variant, seq, sf, altAlleleIndex); + addAlleleProperties(variant, sf, altAlleleIndex, consequence); seq.addSequenceFeature(sf); @@ -928,17 +1005,18 @@ public class VCFLoader *

  • sequence id can be matched to VEP Feature (or SnpEff Feature_ID)
  • * * - * @param seq - * @param variant - * @param altAlleleIndex + * @param consequence * @return * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060 */ - String getOntologyTerm(SequenceI seq, VariantContext variant, - int altAlleleIndex) + String getOntologyTerm(String consequence) { String type = SequenceOntologyI.SEQUENCE_VARIANT; + /* + * could we associate Consequence data with this allele and feature (transcript)? + * if so, prefer the consequence term from that data + */ if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1 { /* @@ -947,14 +1025,6 @@ public class VCFLoader return type; } - /* - * can we associate Consequence data with this allele and feature (transcript)? - * if so, prefer the consequence term from that data - */ - String consequence = getConsequenceForAlleleAndFeature(variant, - CSQ_FIELD, - altAlleleIndex, csqAlleleFieldIndex, csqAlleleNumberFieldIndex, - seq.getName().toLowerCase(), csqFeatureFieldIndex); if (consequence != null) { String[] csqFields = consequence.split(PIPE_REGEX); @@ -1085,13 +1155,15 @@ public class VCFLoader * Add any allele-specific VCF key-value data to the sequence feature * * @param variant - * @param seq * @param sf * @param altAlelleIndex * (0, 1..) + * @param consequence + * if not null, the consequence specific to this sequence (transcript + * feature) and allele */ - protected void addAlleleProperties(VariantContext variant, SequenceI seq, - SequenceFeature sf, final int altAlelleIndex) + protected void addAlleleProperties(VariantContext variant, + SequenceFeature sf, final int altAlelleIndex, String consequence) { Map atts = variant.getAttributes(); @@ -1105,7 +1177,15 @@ public class VCFLoader */ if (CSQ_FIELD.equals(key)) { - addConsequences(variant, seq, sf, altAlelleIndex); + addConsequences(variant, sf, consequence); + continue; + } + + /* + * filter out fields we don't want to capture + */ + if (!vcfFieldsOfInterest.contains(key)) + { continue; } @@ -1163,35 +1243,19 @@ public class VCFLoader /** * Inspects CSQ data blocks (consequences) and adds attributes on the sequence - * feature for the current allele (and transcript if applicable) + * feature. *

    - * Allele matching: if field ALLELE_NUM is present, it must match - * altAlleleIndex. If not present, then field Allele value must match the VCF - * Allele. - *

    - * Transcript matching: if sequence name can be identified to at least one of - * the consequences' Feature values, then select only consequences that match - * the value (i.e. consequences for the current transcript sequence). If not, - * take all consequences (this is the case when adding features to the gene - * sequence). + * If myConsequence is not null, then this is the specific + * consequence data (pipe-delimited fields) that is for the current allele and + * transcript (sequence) being processed) * * @param variant - * @param seq * @param sf - * @param altAlleleIndex - * (0, 1..) + * @param myConsequence */ - protected void addConsequences(VariantContext variant, SequenceI seq, - SequenceFeature sf, int altAlleleIndex) + protected void addConsequences(VariantContext variant, SequenceFeature sf, + String myConsequence) { - /* - * first try to identify the matching consequence - */ - String myConsequence = getConsequenceForAlleleAndFeature(variant, - CSQ_FIELD, altAlleleIndex, csqAlleleFieldIndex, - csqAlleleNumberFieldIndex, seq.getName().toLowerCase(), - csqFeatureFieldIndex); - Object value = variant.getAttribute(CSQ_FIELD); if (value == null || !(value instanceof List))