import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
+import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import jalview.analysis.AlignmentUtils;
import jalview.datamodel.Mapping;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
+import jalview.datamodel.features.FeatureAttributeType;
+import jalview.datamodel.features.FeatureSource;
+import jalview.datamodel.features.FeatureSources;
import jalview.ext.ensembl.EnsemblMap;
import jalview.ext.htsjdk.VCFReader;
import jalview.io.gff.Gff3Helper;
private int csqAlleleNumberFieldIndex = -1;
private int csqFeatureFieldIndex = -1;
+ /*
+ * a unique identifier under which to save metadata about feature
+ * attributes (selected INFO field data)
+ */
+ private String sourceId;
+
/**
* Constructor given an alignment context
*
reader = new VCFReader(filePath);
header = reader.getFileHeader();
- VCFHeaderLine ref = header
- .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
+
+ sourceId = filePath;
+
+ saveMetadata(sourceId);
/*
* get offset of CSQ ALLELE_NUM and Feature if declared
*/
locateCsqFields();
- // check if reference is wrt assembly19 (GRCh37)
- // todo may need to allow user to specify reference assembly?
- boolean isRefGrch37 = (ref != null && ref.getValue().contains(
- "assembly19"));
+ VCFHeaderLine ref = header
+ .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
+ String vcfAssembly = ref.getValue();
int varCount = 0;
int seqCount = 0;
*/
for (SequenceI seq : al.getSequences())
{
- int added = loadSequenceVCF(seq, reader, isRefGrch37);
+ int added = loadSequenceVCF(seq, reader, vcfAssembly);
if (added > 0)
{
seqCount++;
}
/**
+ * Reads metadata (such as INFO field descriptions and datatypes) and saves
+ * them for future reference
+ *
+ * @param sourceId
+ */
+ void saveMetadata(String sourceId)
+ {
+ FeatureSource metadata = new FeatureSource(sourceId);
+
+ for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
+ {
+ String attributeId = info.getID();
+ String desc = info.getDescription();
+ VCFHeaderLineType type = info.getType();
+ FeatureAttributeType attType = null;
+ switch (type)
+ {
+ case Character:
+ attType = FeatureAttributeType.Character;
+ break;
+ case Flag:
+ attType = FeatureAttributeType.Flag;
+ break;
+ case Float:
+ attType = FeatureAttributeType.Float;
+ break;
+ case Integer:
+ attType = FeatureAttributeType.Integer;
+ break;
+ case String:
+ attType = FeatureAttributeType.String;
+ break;
+ }
+ metadata.setAttributeName(attributeId, desc);
+ metadata.setAttributeType(attributeId, attType);
+ }
+
+ FeatureSources.getInstance().addSource(sourceId, metadata);
+ }
+
+ /**
* Records the position of selected fields defined in the CSQ INFO header (if
* there is one). CSQ fields are declared in the CSQ INFO Description e.g.
* <p>
/**
* Tries to add overlapping variants read from a VCF file to the given
* sequence, and returns the number of variant features added. Note that this
- * requires the sequence to hold information as to its chromosomal positions
- * and reference, in order to be able to map the VCF variants to the sequence.
+ * requires the sequence to hold information as to its species, chromosomal
+ * positions and reference assembly, in order to be able to map the VCF
+ * variants to the sequence (or not)
*
* @param seq
* @param reader
- * @param isVcfRefGrch37
+ * @param vcfAssembly
* @return
*/
protected int loadSequenceVCF(SequenceI seq, VCFReader reader,
- boolean isVcfRefGrch37)
+ String vcfAssembly)
{
int count = 0;
GeneLociI seqCoords = seq.getGeneLoci();
return 0;
}
+ if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId()))
+ {
+ return 0;
+ }
+
List<int[]> seqChromosomalContigs = seqCoords.getMap().getToRanges();
for (int[] range : seqChromosomalContigs)
{
- count += addVcfVariants(seq, reader, range, isVcfRefGrch37);
+ count += addVcfVariants(seq, reader, range, vcfAssembly);
}
return count;
}
/**
+ * Answers true if the species inferred from the VCF reference identifier
+ * matches that for the sequence
+ *
+ * @param vcfAssembly
+ * @param speciesId
+ * @return
+ */
+ boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId)
+ {
+ // PROBLEM 1
+ // there are many aliases for species - how to equate one with another?
+ // PROBLEM 2
+ // VCF ##reference header is an unstructured URI - how to extract species?
+ // perhaps check if ref includes any (Ensembl) alias of speciesId??
+ // TODO ask the user to confirm this??
+
+ if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example
+ && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id
+ {
+ return true;
+ }
+
+ if (vcfAssembly.contains("c_elegans") // VEP VCF response example
+ && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl
+ {
+ return true;
+ }
+
+ // this is not a sustainable solution...
+
+ return false;
+ }
+
+ /**
* Queries the VCF reader for any variants that overlap the given chromosome
* region of the sequence, and adds as variant features. Returns the number of
* overlapping variants found.
* @param range
* start-end range of a sequence region in its chromosomal
* coordinates
- * @param isVcfRefGrch37
- * true if the VCF is with reference to GRCh37
+ * @param vcfAssembly
+ * the '##reference' identifier for the VCF reference assembly
* @return
*/
protected int addVcfVariants(SequenceI seq, VCFReader reader,
- int[] range, boolean isVcfRefGrch37)
+ int[] range, String vcfAssembly)
{
GeneLociI seqCoords = seq.getGeneLoci();
String species = seqCoords.getSpeciesId();
/*
- * map chromosomal coordinates from GRCh38 (sequence) to
- * GRCh37 (VCF) if necessary
+ * map chromosomal coordinates from sequence to VCF if the VCF
+ * data has a different reference assembly to the sequence
*/
- // TODO generalise for other assemblies and species
+ // TODO generalise for non-human species
+ // - or get the user to choose in a dialog
+
int offset = 0;
- String fromRef = "GRCh38";
- if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37)
+ if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
+ && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
{
String toRef = "GRCh37";
int[] newRange = mapReferenceRange(range, chromosome, "human",
- fromRef, toRef);
+ seqRef, toRef);
if (newRange == null)
{
System.err.println(String.format(
"Failed to map %s:%s:%s:%d:%d to %s", species, chromosome,
- fromRef, range[0], range[1], toRef));
+ seqRef, range[0], range[1], toRef));
return 0;
}
offset = newRange[0] - range[0];
SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
featureEnd, score, FEATURE_GROUP_VCF);
+ sf.setSource(sourceId);
sf.setValue(Gff3Helper.ALLELES, alleles);
*/
EnsemblMap mapper = new EnsemblMap();
int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
- toRef,
- queryRange);
+ toRef, queryRange);
if (mapping == null)
{