+ * the VCF file we are processing
+ */
+ protected String vcfFilePath;
+
+ /*
+ * mappings between VCF and sequence reference assembly regions, as
+ * key = "species!chromosome!fromAssembly!toAssembly"
+ * value = Map{fromRange, toRange}
+ * NOTE(review): int[] keys are compared by identity, not by content, so
+ * presumably entries are located by iterating the map - confirm
+ */
+ private Map<String, Map<int[], int[]>> assemblyMappings;
+
+ private VCFReader reader; // opened in initialise(), closed at the end of doLoad()
+
+ /*
+ * holds details of the VCF header lines (metadata);
+ * set to null at the end of doLoad()
+ */
+ private VCFHeader header;
+
+ /*
+ * species (as a valid Ensembl term) the VCF is for
+ */
+ private String vcfSpecies;
+
+ /*
+ * genome assembly version (as a valid Ensembl identifier) the VCF is for
+ */
+ private String vcfAssembly;
+
+ /*
+ * a Dictionary of contigs (if present) referenced in the VCF file;
+ * set to null at the end of doLoad()
+ */
+ private SAMSequenceDictionary dictionary;
+
+ /*
+ * the positions (0...) of selected fields in each block of
+ * CSQ (consequence) data (if declared in the VCF INFO header for CSQ);
+ * initialised to -1 (presumably meaning 'not declared' - confirm)
+ * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
+ */
+ private int csqConsequenceFieldIndex = -1;
+ private int csqAlleleFieldIndex = -1;
+ private int csqAlleleNumberFieldIndex = -1;
+ private int csqFeatureFieldIndex = -1;
+
+ // TODO: the same fields for SnpEff ANN data, if wanted
+ // see http://snpeff.sourceforge.net/SnpEff_manual.html#input
+
+ /*
+ * a unique identifier under which to save metadata about feature
+ * attributes (selected INFO field data); set to the VCF file path
+ * in initialise()
+ */
+ private String sourceId;
+
+ /*
+ * The INFO IDs of data that is both present in the VCF file, and
+ * also matched by any filters for data of interest
+ */
+ List<String> vcfFieldsOfInterest;
+
+ /*
+ * The field offsets and identifiers for VEP (CSQ) data that is both present
+ * in the VCF file, and also matched by any filters for data of interest,
+ * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
+ */
+ Map<Integer, String> vepFieldsOfInterest;
+
+ /**
+  * Constructs a loader for the given VCF file. The file is opened and its
+  * header data parsed straight away; if that raises an I/O error, the error
+  * message is written to stderr and construction still completes.
+  *
+  * @param vcfFile
+  *          path of the VCF file to process
+  */
+ public VCFLoader(String vcfFile)
+ {
+   try
+   {
+     this.initialise(vcfFile);
+   } catch (IOException ioe)
+   {
+     String problem = "Error opening VCF file: " + ioe.getMessage();
+     System.err.println(problem);
+   }
+
+   /*
+    * keyed by species!chromosome!fromAssembly!toAssembly,
+    * each value being a map of {fromRange, toRange}
+    */
+   this.assemblyMappings = new HashMap<String, Map<int[], int[]>>();
+ }
+
+ /**
+  * Queries and loads VCF variant data on to the given sequences, doing the
+  * work in a newly started thread. If a gui is supplied, its status is first
+  * set to the 'searching VCF' message.
+  * <p>
+  * This method is not thread safe - concurrent threads should use separate
+  * instances of this class.
+  *
+  * @param seqs
+  *          the sequences to load variants on to
+  * @param gui
+  *          optional handler for status messages (may be null)
+  */
+ public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui)
+ {
+   if (gui != null)
+   {
+     String searching = MessageManager.getString("label.searching_vcf");
+     gui.setStatus(searching);
+   }
+
+   Runnable loadTask = () -> VCFLoader.this.doLoad(seqs, gui);
+   new Thread(loadTask).start();
+ }
+
+ /**
+ * Reads the specified contig sequence and adds its VCF variants to it
+ *
+ * @param contig
+ * the id of a single sequence (contig) to load
+ * @return
+ */
+ public SequenceI loadVCFContig(String contig)
+ {
+ VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
+ if (headerLine == null)
+ {
+ Cache.log.error("VCF reference header not found");
+ return null;
+ }
+ String ref = headerLine.getValue();
+ if (ref.startsWith("file://"))
+ {
+ ref = ref.substring(7);
+ }
+ setSpeciesAndAssembly(ref);
+
+ SequenceI seq = null;
+ File dbFile = new File(ref);
+
+ if (dbFile.exists())
+ {
+ HtsContigDb db = new HtsContigDb("", dbFile);
+ seq = db.getSequenceProxy(contig);
+ loadSequenceVCF(seq);
+ db.close();
+ }
+ else
+ {
+ Cache.log.error("VCF reference not found: " + ref);
+ }
+
+ return seq;
+ }
+
+ /**
+  * Loads VCF on to one or more sequences, then reports the number of variants
+  * added and the number of sequences that gained variants. Any error is
+  * reported on stderr (and via the gui status, if a gui is supplied). On
+  * completion, whether or not successful, the reader is closed and the held
+  * header and dictionary are discarded.
+  *
+  * @param seqs
+  *          the sequences to load variants on to
+  * @param gui
+  *          optional callback handler for messages
+  */
+ protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui)
+ {
+   try
+   {
+     VCFHeaderLine refLine = header
+             .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
+     String reference = null;
+     if (refLine != null)
+     {
+       reference = refLine.getValue();
+     }
+     setSpeciesAndAssembly(reference);
+
+     int totalVariants = 0;
+     int sequencesUpdated = 0;
+
+     /*
+      * query for VCF overlapping each sequence in turn
+      */
+     for (int i = 0; i < seqs.length; i++)
+     {
+       SequenceI seq = seqs[i];
+       int added = loadSequenceVCF(seq);
+       if (added <= 0)
+       {
+         continue;
+       }
+       sequencesUpdated++;
+       totalVariants += added;
+       transferAddedFeatures(seq);
+     }
+
+     if (gui != null)
+     {
+       gui.setStatus(MessageManager.formatMessage("label.added_vcf",
+               totalVariants, sequencesUpdated));
+       if (gui.getFeatureSettingsUI() != null)
+       {
+         gui.getFeatureSettingsUI().discoverAllFeatureData();
+       }
+     }
+   } catch (Throwable problem)
+   {
+     System.err.println("Error processing VCF: " + problem.getMessage());
+     problem.printStackTrace();
+     if (gui != null)
+     {
+       gui.setStatus("Error occurred - see console for details");
+     }
+   } finally
+   {
+     try
+     {
+       if (reader != null)
+       {
+         reader.close();
+       }
+     } catch (IOException ignored)
+     {
+       // ignore
+     }
+     header = null;
+     dictionary = null;
+   }
+ }
+
+ /**
+  * Attempts to determine and save the species and genome assembly version to
+  * which the VCF data applies. This may be done by parsing the
+  * {@code reference} header line, configured in a property file, or
+  * (potentially) confirmed interactively by the user.
+  * <p>
+  * The saved values should be identifiers valid for Ensembl's REST service
+  * {@code map} endpoint, so they can be used (if necessary) to retrieve the
+  * mapping between VCF coordinates and sequence coordinates.
+  * <p>
+  * Matching of configured tokens against the reference is case-insensitive
+  * and independent of the default locale. If no assembly token matches, the
+  * saved assembly is left unchanged; if no species token matches, the species
+  * is set to the default.
+  *
+  * @param reference
+  *          the value of the VCF reference header line, or null if absent
+  * @see <a href="https://rest.ensembl.org/documentation/info/assembly_map">
+  *      Ensembl REST assembly map</a>
+  * @see <a href=
+  *      "https://rest.ensembl.org/info/assembly/human?content-type=text/xml">
+  *      Ensembl REST assembly info</a>
+  * @see <a href="https://rest.ensembl.org/info/species?content-type=text/xml">
+  *      Ensembl REST species info</a>
+  */
+ protected void setSpeciesAndAssembly(String reference)
+ {
+   if (reference == null)
+   {
+     Cache.log.error("No VCF ##reference found, defaulting to "
+             + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES);
+     reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified
+   }
+   /*
+    * fold case with a fixed locale, so that matching does not depend on
+    * the platform default (e.g. dotless i in a Turkish locale)
+    */
+   reference = reference.toLowerCase(java.util.Locale.ROOT);
+
+   /*
+    * for a non-human species, or other assembly identifier,
+    * specify as a Jalview property file entry e.g.
+    * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
+    * VCF_SPECIES = c_elegans=celegans
+    * to map a token in the reference header to a value
+    */
+   String assembly = matchReferenceToken(reference,
+           Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY));
+   if (assembly != null)
+   {
+     vcfAssembly = assembly;
+   }
+
+   String species = matchReferenceToken(reference,
+           Cache.getProperty(VCF_SPECIES));
+   vcfSpecies = species == null ? DEFAULT_SPECIES : species;
+ }
+
+ /**
+  * Searches a comma-separated list of {@code token=value} entries for the
+  * first one whose token (trimmed, lower-cased) is contained in the given
+  * reference, and answers its trimmed value. Entries that do not split into
+  * exactly two parts on '=' are skipped.
+  *
+  * @param reference
+  *          the (already lower-cased) reference to search within
+  * @param mappings
+  *          the comma-separated entries, may be null
+  * @return the value of the first matching entry, or null if none matches
+  */
+ private static String matchReferenceToken(String reference,
+         String mappings)
+ {
+   if (mappings == null)
+   {
+     return null;
+   }
+   for (String entry : mappings.split(","))
+   {
+     String[] keyValue = entry.split("=");
+     if (keyValue.length == 2 && reference.contains(
+             keyValue[0].trim().toLowerCase(java.util.Locale.ROOT)))
+     {
+       return keyValue[1].trim();
+     }
+   }
+   return null;
+ }
+
+ /**
+  * Opens a reader on the VCF file and holds its header, plus the header's
+  * sequence dictionary if one can be obtained. Then saves the header's INFO
+  * metadata under the file path as source id, and parses the CSQ header.
+  *
+  * @param filePath
+  *          path of the VCF file
+  * @throws IOException
+  *           presumably if the file cannot be opened or read
+  */
+ private void initialise(String filePath) throws IOException
+ {
+   this.vcfFilePath = filePath;
+
+   this.reader = new VCFReader(filePath);
+   this.header = this.reader.getFileHeader();
+
+   try
+   {
+     this.dictionary = this.header.getSequenceDictionary();
+   } catch (SAMException ignored)
+   {
+     // ignore - thrown if any contig line lacks length info
+   }
+
+   /*
+    * the file path doubles as the id under which metadata is saved
+    */
+   this.sourceId = filePath;
+   saveMetadata(filePath);
+
+   /*
+    * get offset of CSQ ALLELE_NUM and Feature if declared
+    */
+   parseCsqHeader();
+ }
+
+ /**
+  * Reads metadata (the description and datatype of each INFO field declared
+  * in the VCF header) and registers it under the given source id for future
+  * reference. Also records the ids of those INFO fields which are matched by
+  * the configured field filters.
+  *
+  * @param theSourceId
+  *          the identifier under which to save the metadata
+  */
+ void saveMetadata(String theSourceId)
+ {
+   List<Pattern> wantedPatterns = getFieldMatchers(VCF_FIELDS_PREF,
+           DEFAULT_VCF_FIELDS);
+   vcfFieldsOfInterest = new ArrayList<>();
+
+   FeatureSource metadata = new FeatureSource(theSourceId);
+
+   for (VCFInfoHeaderLine infoLine : header.getInfoHeaderLines())
+   {
+     String id = infoLine.getID();
+     String description = infoLine.getDescription();
+     FeatureAttributeType attributeType = toAttributeType(
+             infoLine.getType());
+
+     metadata.setAttributeName(id, description);
+     metadata.setAttributeType(id, attributeType);
+
+     if (isFieldWanted(id, wantedPatterns))
+     {
+       vcfFieldsOfInterest.add(id);
+     }
+   }
+
+   FeatureSources.getInstance().addSource(theSourceId, metadata);
+ }
+
+ /**
+  * Answers the feature attribute type corresponding to a VCF header line
+  * type, or null for a type which has no mapping here
+  *
+  * @param type
+  *          the datatype declared for a VCF INFO field
+  * @return the equivalent attribute type, or null
+  */
+ private static FeatureAttributeType toAttributeType(
+         VCFHeaderLineType type)
+ {
+   switch (type)
+   {
+   case Character:
+     return FeatureAttributeType.Character;
+   case Flag:
+     return FeatureAttributeType.Flag;
+   case Float:
+     return FeatureAttributeType.Float;
+   case Integer:
+     return FeatureAttributeType.Integer;
+   case String:
+     return FeatureAttributeType.String;
+   default:
+     return null;
+   }
+ }
+
+ /**
+  * Answers true if the field id is matched in full by any of the filter
+  * patterns, else false. The id is converted to upper case before matching
+  * against each regular expression, so matching is not case-sensitive
+  * provided the patterns are expressed in upper case (presumably ensured
+  * where the patterns are compiled).
+  *
+  * @param id
+  *          the field identifier to test
+  * @param filters
+  *          the patterns for fields of interest
+  * @return true if any pattern matches the whole of the upper-cased id
+  */
+ private boolean isFieldWanted(String id, List<Pattern> filters)
+ {
+   return filters.stream().anyMatch(
+           filter -> filter.matcher(id.toUpperCase()).matches());
+ }
+
+ /**
+ * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
+ * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
+ * required for processing.
+ * <p>
+ * CSQ fields are declared in the CSQ INFO Description e.g.
+ * <p>
+ * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...