+ private VCFReader reader;
+
+ /*
+ * holds details of the VCF header lines (metadata)
+ */
+ private VCFHeader header;
+
+ /*
+ * species (as a valid Ensembl term) the VCF is for
+ */
+ private String vcfSpecies;
+
+ /*
+ * genome assembly version (as a valid Ensembl identifier) the VCF is for
+ */
+ private String vcfAssembly;
+
+ /*
+ * a Dictionary of contigs (if present) referenced in the VCF file
+ */
+ private SAMSequenceDictionary dictionary;
+
+ /*
+ * the position (0...) of field in each block of
+ * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
+ * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
+ */
+ private int csqConsequenceFieldIndex = -1;
+
+ private int csqAlleleFieldIndex = -1;
+
+ private int csqAlleleNumberFieldIndex = -1;
+
+ private int csqFeatureFieldIndex = -1;
+
+ // todo the same fields for SnpEff ANN data if wanted
+ // see http://snpeff.sourceforge.net/SnpEff_manual.html#input
+
+ /*
+ * a unique identifier under which to save metadata about feature
+ * attributes (selected INFO field data)
+ */
+ private String sourceId;
+
+ /*
+ * The INFO IDs of data that is both present in the VCF file, and
+ * also matched by any filters for data of interest
+ */
+ List<String> vcfFieldsOfInterest;
+
+ /*
+ * The field offsets and identifiers for VEP (CSQ) data that is both present
+ * in the VCF file, and also matched by any filters for data of interest
+ * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
+ */
+ Map<Integer, String> vepFieldsOfInterest;
+
+ /*
+ * key:value for which rejected data has been seen
+ * (the error is logged only once for each combination)
+ */
+ private Set<String> badData;
+
+ /**
+ * Constructor given a VCF file
+ *
+ * @param alignment
+ */
+ public VCFLoader(String vcfFile)
+ {
+ try
+ {
+ initialise(vcfFile);
+ } catch (IOException e)
+ {
+ System.err.println("Error opening VCF file: " + e.getMessage());
+ }
+
+ // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
+ assemblyMappings = new HashMap<>();
+ }
+
+ /**
+ * Starts a new thread to query and load VCF variant data on to the given
+ * sequences
+ * <p>
+ * This method is not thread safe - concurrent threads should use separate
+ * instances of this class.
+ *
+ * @param seqs
+ * @param gui
+ */
+ public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui)
+ {
+ if (gui != null)
+ {
+ gui.setStatus(MessageManager.getString("label.searching_vcf"));
+ }
+
+ new Thread()
+ {
+ @Override
+ public void run()
+ {
+ VCFLoader.this.doLoad(seqs, gui);
+ }
+ }.start();
+ }
+
+ /**
+ * Reads the specified contig sequence and adds its VCF variants to it
+ *
+ * @param contig
+ * the id of a single sequence (contig) to load
+ * @return
+ */
+ public SequenceI loadVCFContig(String contig)
+ {
+ VCFHeaderLine headerLine = header
+ .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
+ if (headerLine == null)
+ {
+ Console.error("VCF reference header not found");
+ return null;
+ }
+ String ref = headerLine.getValue();
+ if (ref.startsWith("file://"))
+ {
+ ref = ref.substring(7);
+ }
+ setSpeciesAndAssembly(ref);
+
+ SequenceI seq = null;
+ File dbFile = new File(ref);
+
+ if (dbFile.exists())
+ {
+ HtsContigDb db = new HtsContigDb("", dbFile);
+ seq = db.getSequenceProxy(contig);
+ loadSequenceVCF(seq);
+ db.close();
+ }
+ else
+ {
+ Console.error("VCF reference not found: " + ref);
+ }
+
+ return seq;
+ }
+
+ /**
+ * Loads VCF on to one or more sequences
+ *
+ * @param seqs
+ * @param gui
+ * optional callback handler for messages
+ */
+ protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui)
+ {
+ try
+ {
+ VCFHeaderLine ref = header
+ .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
+ String reference = ref == null ? null : ref.getValue();
+
+ setSpeciesAndAssembly(reference);
+
+ int varCount = 0;
+ int seqCount = 0;
+
+ /*
+ * query for VCF overlapping each sequence in turn
+ */
+ for (SequenceI seq : seqs)
+ {
+ int added = loadSequenceVCF(seq);
+ if (added > 0)
+ {
+ seqCount++;
+ varCount += added;
+ transferAddedFeatures(seq);
+ }
+ }
+ if (gui != null)
+ {
+ String msg = MessageManager.formatMessage("label.added_vcf",
+ varCount, seqCount);
+ gui.setStatus(msg);
+ if (gui.getFeatureSettingsUI() != null)
+ {
+ gui.getFeatureSettingsUI().discoverAllFeatureData();
+ }
+ }
+ } catch (Throwable e)
+ {
+ System.err.println("Error processing VCF: " + e.getMessage());
+ e.printStackTrace();
+ if (gui != null)
+ {
+ gui.setStatus("Error occurred - see console for details");
+ }
+ } finally
+ {
+ if (reader != null)
+ {
+ try
+ {
+ reader.close();
+ } catch (IOException e)
+ {
+ // ignore
+ }
+ }
+ header = null;
+ dictionary = null;
+ }
+ }
+
+ /**
+ * Attempts to determine and save the species and genome assembly version to
+ * which the VCF data applies. This may be done by parsing the
+ * {@code reference} header line, configured in a property file, or
+ * (potentially) confirmed interactively by the user.
+ * <p>
+ * The saved values should be identifiers valid for Ensembl's REST service
+ * {@code map} endpoint, so they can be used (if necessary) to retrieve the
+ * mapping between VCF coordinates and sequence coordinates.
+ *
+ * @param reference
+ * @see https://rest.ensembl.org/documentation/info/assembly_map
+ * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
+ * @see https://rest.ensembl.org/info/species?content-type=text/xml
+ */
+ protected void setSpeciesAndAssembly(String reference)
+ {
+ if (reference == null)
+ {
+ Console.error("No VCF ##reference found, defaulting to "
+ + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES);
+ reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified
+ }
+ reference = reference.toLowerCase(Locale.ROOT);
+
+ /*
+ * for a non-human species, or other assembly identifier,
+ * specify as a Jalview property file entry e.g.
+ * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
+ * VCF_SPECIES = c_elegans=celegans
+ * to map a token in the reference header to a value
+ */
+ String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT)))
+ {
+ vcfAssembly = tokens[1].trim();
+ break;
+ }
+ }
+ }
+
+ vcfSpecies = DEFAULT_SPECIES;
+ prop = Cache.getProperty(VCF_SPECIES);
+ if (prop != null)
+ {
+ for (String token : prop.split(","))
+ {
+ String[] tokens = token.split("=");
+ if (tokens.length == 2)
+ {
+ if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT)))
+ {
+ vcfSpecies = tokens[1].trim();
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Opens the VCF file and parses header data
+ *
+ * @param filePath
+ * @throws IOException
+ */
+ private void initialise(String filePath) throws IOException
+ {
+ vcfFilePath = filePath;
+
+ reader = new VCFReader(filePath);
+
+ header = reader.getFileHeader();
+
+ try
+ {
+ dictionary = header.getSequenceDictionary();
+ } catch (SAMException e)
+ {
+ // ignore - thrown if any contig line lacks length info
+ }
+
+ sourceId = filePath;
+
+ saveMetadata(sourceId);
+
+ /*
+ * get offset of CSQ ALLELE_NUM and Feature if declared
+ */
+ parseCsqHeader();
+ }
+
+ /**
+ * Reads metadata (such as INFO field descriptions and datatypes) and saves
+ * them for future reference
+ *
+ * @param theSourceId
+ */
+ void saveMetadata(String theSourceId)
+ {
+ List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
+ DEFAULT_VCF_FIELDS);
+ vcfFieldsOfInterest = new ArrayList<>();
+
+ FeatureSource metadata = new FeatureSource(theSourceId);
+
+ for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
+ {
+ String attributeId = info.getID();
+ String desc = info.getDescription();
+ VCFHeaderLineType type = info.getType();
+ FeatureAttributeType attType = null;
+ switch (type)
+ {
+ case Character:
+ attType = FeatureAttributeType.Character;
+ break;
+ case Flag:
+ attType = FeatureAttributeType.Flag;
+ break;
+ case Float:
+ attType = FeatureAttributeType.Float;
+ break;
+ case Integer:
+ attType = FeatureAttributeType.Integer;
+ break;
+ case String:
+ attType = FeatureAttributeType.String;
+ break;
+ }
+ metadata.setAttributeName(attributeId, desc);
+ metadata.setAttributeType(attributeId, attType);
+
+ if (isFieldWanted(attributeId, vcfFieldPatterns))
+ {
+ vcfFieldsOfInterest.add(attributeId);
+ }
+ }
+
+ FeatureSources.getInstance().addSource(theSourceId, metadata);
+ }
+
+ /**
+ * Answers true if the field id is matched by any of the filter patterns, else
+ * false. Matching is against regular expression patterns, and is not
+ * case-sensitive.
+ *
+ * @param id
+ * @param filters
+ * @return
+ */
+ private boolean isFieldWanted(String id, List<Pattern> filters)
+ {
+ for (Pattern p : filters)
+ {
+ if (p.matcher(id.toUpperCase(Locale.ROOT)).matches())
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
+ * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
+ * required for processing.
+ * <p>
+ * CSQ fields are declared in the CSQ INFO Description e.g.
+ * <p>
+ * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
+ */
+ protected void parseCsqHeader()
+ {
+ List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
+ DEFAULT_VEP_FIELDS);
+ vepFieldsOfInterest = new HashMap<>();
+
+ VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
+ if (csqInfo == null)
+ {
+ return;
+ }
+
+ /*
+ * parse out the pipe-separated list of CSQ fields; we assume here that
+ * these form the last part of the description, and contain no spaces
+ */
+ String desc = csqInfo.getDescription();
+ int spacePos = desc.lastIndexOf(" ");
+ desc = desc.substring(spacePos + 1);
+
+ if (desc != null)
+ {
+ String[] format = desc.split(PIPE_REGEX);
+ int index = 0;
+ for (String field : format)
+ {
+ if (CSQ_CONSEQUENCE_KEY.equals(field))
+ {
+ csqConsequenceFieldIndex = index;
+ }
+ if (CSQ_ALLELE_NUM_KEY.equals(field))
+ {
+ csqAlleleNumberFieldIndex = index;
+ }
+ if (CSQ_ALLELE_KEY.equals(field))
+ {
+ csqAlleleFieldIndex = index;
+ }
+ if (CSQ_FEATURE_KEY.equals(field))
+ {
+ csqFeatureFieldIndex = index;
+ }
+
+ if (isFieldWanted(field, vepFieldFilters))
+ {
+ vepFieldsOfInterest.put(index, field);
+ }
+
+ index++;
+ }
+ }
+ }
+
+ /**
+ * Reads the Preference value for the given key, with default specified if no
+ * preference set. The value is interpreted as a comma-separated list of
+ * regular expressions, and converted into a list of compiled patterns ready
+ * for matching. Patterns are forced to upper-case for non-case-sensitive
+ * matching.
+ * <p>
+ * This supports user-defined filters for fields of interest to capture while
+ * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
+ * fields with an ID of AF, or starting with AC, would be matched.
+ *
+ * @param key
+ * @param def
+ * @return
+ */
+ private List<Pattern> getFieldMatchers(String key, String def)
+ {
+ String pref = Cache.getDefault(key, def);
+ List<Pattern> patterns = new ArrayList<>();
+ String[] tokens = pref.split(",");
+ for (String token : tokens)
+ {
+ try
+ {
+ patterns.add(Pattern.compile(token.toUpperCase(Locale.ROOT)));
+ } catch (PatternSyntaxException e)
+ {
+ System.err.println("Invalid pattern ignored: " + token);
+ }
+ }
+ return patterns;
+ }
+
+ /**
+ * Transfers VCF features to sequences to which this sequence has a mapping.
+ *
+ * @param seq
+ */
+ protected void transferAddedFeatures(SequenceI seq)
+ {
+ List<DBRefEntry> dbrefs = seq.getDBRefs();
+ if (dbrefs == null)
+ {
+ return;
+ }
+ for (DBRefEntry dbref : dbrefs)
+ {
+ Mapping mapping = dbref.getMap();
+ if (mapping == null || mapping.getTo() == null)
+ {
+ continue;
+ }
+
+ SequenceI mapTo = mapping.getTo();
+ MapList map = mapping.getMap();
+ if (map.getFromRatio() == 3)
+ {
+ /*
+ * dna-to-peptide product mapping
+ */
+ // JAL-3187 render on the fly instead
+ // AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
+ }
+ else
+ {
+ /*
+ * nucleotide-to-nucleotide mapping e.g. transcript to CDS
+ */
+ List<SequenceFeature> features = seq.getFeatures()
+ .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
+ for (SequenceFeature sf : features)
+ {
+ if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
+ {
+ transferFeature(sf, mapTo, map);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Tries to add overlapping variants read from a VCF file to the given
+ * sequence, and returns the number of variant features added
+ *
+ * @param seq
+ * @return
+ */
+ protected int loadSequenceVCF(SequenceI seq)
+ {
+ VCFMap vcfMap = getVcfMap(seq);
+ if (vcfMap == null)
+ {
+ return 0;
+ }
+
+ /*
+ * work with the dataset sequence here
+ */
+ SequenceI dss = seq.getDatasetSequence();
+ if (dss == null)
+ {
+ dss = seq;
+ }
+ return addVcfVariants(dss, vcfMap);
+ }
+
+ /**
+ * Answers a map from sequence coordinates to VCF chromosome ranges
+ *
+ * @param seq
+ * @return
+ */
+ private VCFMap getVcfMap(SequenceI seq)
+ {
+ /*
+ * simplest case: sequence has id and length matching a VCF contig
+ */
+ VCFMap vcfMap = null;
+ if (dictionary != null)
+ {
+ vcfMap = getContigMap(seq);
+ }
+ if (vcfMap != null)
+ {
+ return vcfMap;
+ }
+
+ /*
+ * otherwise, map to VCF from chromosomal coordinates
+ * of the sequence (if known)
+ */
+ GeneLociI seqCoords = seq.getGeneLoci();
+ if (seqCoords == null)
+ {
+ Console.warn(String.format(
+ "Can't query VCF for %s as chromosome coordinates not known",
+ seq.getName()));
+ return null;
+ }
+
+ String species = seqCoords.getSpeciesId();
+ String chromosome = seqCoords.getChromosomeId();
+ String seqRef = seqCoords.getAssemblyId();
+ MapList map = seqCoords.getMapping();
+
+ // note this requires the configured species to match that
+ // returned with the Ensembl sequence; todo: support aliases?
+ if (!vcfSpecies.equalsIgnoreCase(species))
+ {
+ Console.warn("No VCF loaded to " + seq.getName()
+ + " as species not matched");
+ return null;
+ }
+
+ if (seqRef.equalsIgnoreCase(vcfAssembly))
+ {
+ return new VCFMap(chromosome, map);
+ }
+
+ /*
+ * VCF data has a different reference assembly to the sequence:
+ * query Ensembl to map chromosomal coordinates from sequence to VCF
+ */
+ List<int[]> toVcfRanges = new ArrayList<>();
+ List<int[]> fromSequenceRanges = new ArrayList<>();
+
+ for (int[] range : map.getToRanges())
+ {
+ int[] fromRange = map.locateInFrom(range[0], range[1]);
+ if (fromRange == null)
+ {
+ // corrupted map?!?
+ continue;
+ }
+
+ int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
+ vcfAssembly);
+ if (newRange == null)
+ {
+ Console.error(String.format("Failed to map %s:%s:%s:%d:%d to %s",
+ species, chromosome, seqRef, range[0], range[1],
+ vcfAssembly));
+ continue;
+ }
+ else
+ {
+ toVcfRanges.add(newRange);
+ fromSequenceRanges.add(fromRange);
+ }
+ }
+
+ return new VCFMap(chromosome,
+ new MapList(fromSequenceRanges, toVcfRanges, 1, 1));
+ }
+
+ /**
+ * If the sequence id matches a contig declared in the VCF file, and the
+ * sequence length matches the contig length, then returns a 1:1 map of the
+ * sequence to the contig, else returns null
+ *
+ * @param seq
+ * @return
+ */
+ private VCFMap getContigMap(SequenceI seq)
+ {
+ String id = seq.getName();
+ SAMSequenceRecord contig = dictionary.getSequence(id);
+ if (contig != null)
+ {
+ int len = seq.getLength();
+ if (len == contig.getSequenceLength())
+ {
+ MapList map = new MapList(new int[] { 1, len },
+ new int[]
+ { 1, len }, 1, 1);
+ return new VCFMap(id, map);
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Queries the VCF reader for any variants that overlap the mapped chromosome
+ * ranges of the sequence, and adds as variant features. Returns the number of
+ * overlapping variants found.
+ *
+ * @param seq
+ * @param map
+ * mapping from sequence to VCF coordinates
+ * @return
+ */
+ protected int addVcfVariants(SequenceI seq, VCFMap map)
+ {
+ boolean forwardStrand = map.map.isToForwardStrand();
+
+ /*
+ * query the VCF for overlaps of each contiguous chromosomal region
+ */
+ int count = 0;
+
+ for (int[] range : map.map.getToRanges())
+ {
+ int vcfStart = Math.min(range[0], range[1]);
+ int vcfEnd = Math.max(range[0], range[1]);
+ try
+ {
+ CloseableIterator<VariantContext> variants = reader
+ .query(map.chromosome, vcfStart, vcfEnd);
+ while (variants.hasNext())
+ {
+ VariantContext variant = variants.next();
+
+ int[] featureRange = map.map.locateInFrom(variant.getStart(),
+ variant.getEnd());
+
+ /*
+ * only take features whose range is fully mappable to sequence positions
+ */
+ if (featureRange != null)
+ {
+ int featureStart = Math.min(featureRange[0], featureRange[1]);
+ int featureEnd = Math.max(featureRange[0], featureRange[1]);
+ if (featureEnd - featureStart == variant.getEnd()
+ - variant.getStart())
+ {
+ count += addAlleleFeatures(seq, variant, featureStart,
+ featureEnd, forwardStrand);
+ }
+ }
+ }
+ variants.close();
+ } catch (TribbleException e)
+ {
+ /*
+ * RuntimeException throwable by htsjdk
+ */
+ String msg = String.format("Error reading VCF for %s:%d-%d: %s ",
+ map.chromosome, vcfStart, vcfEnd, e.getLocalizedMessage());
+ Console.error(msg);
+ }
+ }
+
+ return count;
+ }
+
+ /**
+ * A convenience method to get an attribute value for an alternate allele
+ *
+ * @param variant
+ * @param attributeName
+ * @param alleleIndex
+ * @return
+ */
+ protected String getAttributeValue(VariantContext variant,
+ String attributeName, int alleleIndex)
+ {
+ Object att = variant.getAttribute(attributeName);
+
+ if (att instanceof String)
+ {
+ return (String) att;
+ }
+ else if (att instanceof ArrayList)
+ {
+ return ((List<String>) att).get(alleleIndex);
+ }
+
+ return null;
+ }
+