+ * Attempts to determine and save the species and genome assembly version to
+ * which the VCF data applies. This may be done by parsing the {@code reference}
+ * header line, configured in a property file, or (potentially) confirmed
+ * interactively by the user.
+ * <p>
+ * The saved values should be identifiers valid for Ensembl's REST service
+ * {@code map} endpoint, so they can be used (if necessary) to retrieve the
+ * mapping between VCF coordinates and sequence coordinates.
+ *
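+ * @param reference
+ *          the value of the VCF {@code ##reference} header line, or null if
+ *          none was found (in which case defaults are applied)
+ * @see <a href="https://rest.ensembl.org/documentation/info/assembly_map">Ensembl REST: assembly map</a>
+ * @see <a href="https://rest.ensembl.org/info/assembly/human?content-type=text/xml">Ensembl REST: human assembly info</a>
+ * @see <a href="https://rest.ensembl.org/info/species?content-type=text/xml">Ensembl REST: species info</a>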
+ */
+ protected void setSpeciesAndAssembly(String reference)
+ {
+   if (reference == null)
+   {
+     Cache.log.error("No VCF ##reference found, defaulting to "
+             + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES);
+     reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified
+   }
+
+   /*
+    * fold case with a fixed locale, so that matching does not depend on the
+    * user's default locale (e.g. Turkish, where 'I' lower-cases to a
+    * dotless 'i' and would fail to match 'i' in a configured token)
+    */
+   reference = reference.toLowerCase(java.util.Locale.ROOT);
+
+   /*
+    * for a non-human species, or other assembly identifier,
+    * specify as a Jalview property file entry e.g.
+    * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
+    * VCF_SPECIES = c_elegans=celegans
+    * to map a token in the reference header to a value
+    */
+   String assembly = lookupReferenceToken(reference,
+           Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY));
+   if (assembly != null)
+   {
+     // if no entry matches, vcfAssembly is left unchanged
+     vcfAssembly = assembly;
+   }
+
+   String species = lookupReferenceToken(reference,
+           Cache.getProperty(VCF_SPECIES));
+   vcfSpecies = species == null ? DEFAULT_SPECIES : species;
+ }
+
+ /**
+  * Searches a comma-separated list of {@code key=value} entries for the first
+  * entry whose key occurs (ignoring case) within the given reference text.
+  * Entries that are not of the form {@code key=value}, or whose key is blank,
+  * are skipped.
+  *
+  * @param lowerCaseReference
+  *          the reference text, already converted to lower case
+  * @param mappings
+  *          the comma-separated entries to search; may be null
+  * @return the trimmed value of the first matching entry, or null if there
+  *         is no match
+  */
+ private String lookupReferenceToken(String lowerCaseReference,
+         String mappings)
+ {
+   if (mappings == null)
+   {
+     return null;
+   }
+   for (String entry : mappings.split(","))
+   {
+     String[] keyValue = entry.split("=");
+     if (keyValue.length != 2)
+     {
+       continue;
+     }
+     String key = keyValue[0].trim().toLowerCase(java.util.Locale.ROOT);
+
+     /*
+      * a blank key (as in "=GRCh37") must be rejected, since the empty
+      * string is 'contained' in every reference
+      */
+     if (!key.isEmpty() && lowerCaseReference.contains(key))
+     {
+       return keyValue[1].trim();
+     }
+   }
+   return null;
+ }
+
+ /**
+  * Records the file path, opens a reader on the VCF file and captures its
+  * header, then saves the header's metadata and parses its CSQ declaration.
+  *
+  * @param filePath
+  *          location of the VCF file; also used as the feature source id
+  * @throws IOException
+  *           if an I/O error is reported while the file is opened or its
+  *           header processed
+  */
+ private void initialise(String filePath) throws IOException
+ {
+   vcfFilePath = filePath;
+   reader = new VCFReader(filePath);
+   header = reader.getFileHeader();
+
+   try
+   {
+     dictionary = header.getSequenceDictionary();
+   } catch (SAMException ignored)
+   {
+     // deliberately ignored: raised if any contig line lacks length info
+   }
+
+   /*
+    * the file path doubles as the id under which metadata is registered
+    */
+   sourceId = filePath;
+   saveMetadata(filePath);
+
+   /*
+    * find the offsets of CSQ ALLELE_NUM and Feature, if declared
+    */
+   parseCsqHeader();
+ }
+
+ /**
+  * Captures the description and datatype of each INFO field declared in the
+  * VCF header, registers them as a feature source under the given id, and
+  * notes which field ids match the configured 'fields of interest' patterns.
+  *
+  * @param theSourceId
+  *          the id under which the metadata is registered
+  */
+ void saveMetadata(String theSourceId)
+ {
+   List<Pattern> wantedFields = getFieldMatchers(VCF_FIELDS_PREF,
+           DEFAULT_VCF_FIELDS);
+   vcfFieldsOfInterest = new ArrayList<>();
+
+   FeatureSource source = new FeatureSource(theSourceId);
+
+   for (VCFInfoHeaderLine infoLine : header.getInfoHeaderLines())
+   {
+     String id = infoLine.getID();
+     String description = infoLine.getDescription();
+     FeatureAttributeType attributeType = toAttributeType(
+             infoLine.getType());
+
+     source.setAttributeName(id, description);
+     source.setAttributeType(id, attributeType);
+
+     if (isFieldWanted(id, wantedFields))
+     {
+       vcfFieldsOfInterest.add(id);
+     }
+   }
+
+   FeatureSources.getInstance().addSource(theSourceId, source);
+ }
+
+ /**
+  * Converts a VCF header datatype to the like-named feature attribute type.
+  *
+  * @param type
+  *          the datatype declared on an INFO header line
+  * @return the corresponding attribute type, or null if the datatype is not
+  *         one of those handled here
+  */
+ private FeatureAttributeType toAttributeType(VCFHeaderLineType type)
+ {
+   switch (type)
+   {
+   case Character:
+     return FeatureAttributeType.Character;
+   case Flag:
+     return FeatureAttributeType.Flag;
+   case Float:
+     return FeatureAttributeType.Float;
+   case Integer:
+     return FeatureAttributeType.Integer;
+   case String:
+     return FeatureAttributeType.String;
+   default:
+     return null;
+   }
+ }
+
+ /**
+  * Answers true if the field id is matched by any of the filter patterns, else
+  * false. The id is converted to upper case before matching, and must match a
+  * pattern in full (not just contain a match). A null id is never wanted.
+  *
+  * @param id
+  *          the field id to test; may be null
+  * @param filters
+  *          the compiled patterns to test the id against
+  * @return true if any pattern matches the whole of the upper-cased id
+  */
+ private boolean isFieldWanted(String id, List<Pattern> filters)
+ {
+   if (id == null)
+   {
+     return false;
+   }
+
+   /*
+    * convert case once only, rather than once per pattern; the default
+    * locale is used (as it was before) so that the id is folded in the same
+    * way as patterns that were upper-cased when they were compiled
+    */
+   String upperCaseId = id.toUpperCase();
+   for (Pattern filter : filters)
+   {
+     if (filter.matcher(upperCaseId).matches())
+     {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+  * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
+  * Also records the position of selected fields (those named by the
+  * Consequence, ALLELE_NUM, Allele and Feature keys) required for processing.
+  * Does nothing more than reset the wanted fields if the header has no CSQ
+  * INFO line, or the line has no description.
+  * <p>
+  * CSQ fields are declared in the CSQ INFO Description e.g.
+  * <p>
+  * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
+  */
+ protected void parseCsqHeader()
+ {
+   List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
+           DEFAULT_VEP_FIELDS);
+   vepFieldsOfInterest = new HashMap<>();
+
+   VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
+   if (csqInfo == null)
+   {
+     return;
+   }
+
+   /*
+    * check for a missing description before it is dereferenced
+    */
+   String desc = csqInfo.getDescription();
+   if (desc == null)
+   {
+     return;
+   }
+
+   /*
+    * parse out the pipe-separated list of CSQ fields; we assume here that
+    * these form the last part of the description, and contain no spaces
+    */
+   String fieldList = desc.substring(desc.lastIndexOf(" ") + 1);
+   String[] format = fieldList.split(PIPE_REGEX);
+
+   for (int index = 0; index < format.length; index++)
+   {
+     String field = format[index];
+     if (CSQ_CONSEQUENCE_KEY.equals(field))
+     {
+       csqConsequenceFieldIndex = index;
+     }
+     if (CSQ_ALLELE_NUM_KEY.equals(field))
+     {
+       csqAlleleNumberFieldIndex = index;
+     }
+     if (CSQ_ALLELE_KEY.equals(field))
+     {
+       csqAlleleFieldIndex = index;
+     }
+     if (CSQ_FEATURE_KEY.equals(field))
+     {
+       csqFeatureFieldIndex = index;
+     }
+
+     if (isFieldWanted(field, vepFieldFilters))
+     {
+       vepFieldsOfInterest.put(index, field);
+     }
+   }
+ }
+
+ /**
+  * Reads the Preference value for the given key, with default specified if no
+  * preference set. The value is interpreted as a comma-separated list of
+  * regular expressions, and converted into a list of compiled patterns ready
+  * for matching. Patterns are compiled as case-insensitive (rather than by
+  * changing the case of the expression, which would corrupt escapes such as
+  * {@code \d}). Surrounding whitespace is removed from each expression, and
+  * empty or invalid expressions are ignored.
+  * <p>
+  * This supports user-defined filters for fields of interest to capture while
+  * processing data. For example, VCF_FIELDS = AF,AC.* would mean that VCF INFO
+  * fields with an ID of AF, or starting with AC, would be matched.
+  *
+  * @param key
+  *          the name of the preference to read
+  * @param def
+  *          the value to use if the preference is not set
+  * @return the compiled patterns (an empty list if there are none)
+  */
+ private List<Pattern> getFieldMatchers(String key, String def)
+ {
+   String pref = Cache.getDefault(key, def);
+   List<Pattern> patterns = new ArrayList<>();
+   if (pref == null)
+   {
+     return patterns;
+   }
+
+   for (String token : pref.split(","))
+   {
+     String regex = token.trim();
+     if (regex.isEmpty())
+     {
+       continue;
+     }
+     try
+     {
+       /*
+        * UNICODE_CASE so that matching also succeeds against text that was
+        * upper-cased in a locale with non-ASCII case mappings
+        */
+       patterns.add(Pattern.compile(regex,
+               Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE));
+     } catch (PatternSyntaxException e)
+     {
+       System.err.println("Invalid pattern ignored: " + token);
+     }
+   }
+   return patterns;
+ }
+
+ /**