import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
+import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFContigHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
*/
public class VCFLoader
{
+ /**
+ * A class to model the mapping from sequence to VCF coordinates. Cases include
+ * <ul>
+ * <li>a direct 1:1 mapping where the sequence is one of the VCF contigs</li>
+ * <li>a mapping of sequence to chromosomal coordinates, where sequence and VCF
+ * use the same reference assembly</li>
+ * <li>a modified mapping of sequence to chromosomal coordinates, where sequence
+ * and VCF use different reference assembles</li>
+ * </ul>
+ */
+ class VCFMap
+ {
+ final String chromosome;
+
+ final MapList map;
+
+ VCFMap(String chr, MapList m)
+ {
+ chromosome = chr;
+ map = m;
+ }
+ }
+
/*
* Lookup keys, and default values, for Preference entries that describe
* patterns for VCF and VEP fields to capture
protected int loadSequenceVCF(SequenceI seq, VCFReader reader,
String vcfAssembly)
{
- int count = 0;
+ VCFMap vcfMap = getVcfMap(seq, vcfAssembly);
+ if (vcfMap == null)
+ {
+ return 0;
+ }
+
+ return addVcfVariants(seq, reader, vcfMap, vcfAssembly);
+ }
+
+ /**
+ * Answers a map from sequence coordinates to VCF chromosome ranges
+ *
+ * @param seq
+ * @param vcfAssembly
+ * @return
+ */
+ private VCFMap getVcfMap(SequenceI seq, String vcfAssembly)
+ {
+ /*
+ * simplest case: sequence has id and length matching a VCF contig
+ */
GeneLociI seqCoords = seq.getGeneLoci();
if (seqCoords == null)
{
- System.out.println(String.format(
- "Can't query VCF for %s as chromosome coordinates not known",
- seq.getName()));
- return 0;
+ VCFMap map = getContigMap(seq);
+ if (map == null)
+ {
+ Cache.log.warn(String.format(
+ "Can't query VCF for %s as chromosome coordinates not known",
+ seq.getName()));
+ }
+ return map;
+ }
+
+ String species = seqCoords.getSpeciesId();
+ String chromosome = seqCoords.getChromosomeId();
+ String seqRef = seqCoords.getAssemblyId();
+ MapList map = seqCoords.getMap();
+
+ if (!vcfSpeciesMatchesSequence(vcfAssembly, species))
+ {
+ return null;
}
- if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId()))
+ if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef))
{
- return 0;
+ return new VCFMap(chromosome, map);
}
- List<int[]> seqChromosomalContigs = seqCoords.getMap().getToRanges();
- for (int[] range : seqChromosomalContigs)
+ if (!"GRCh38".equalsIgnoreCase(seqRef) // Ensembl
+ || !vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
{
- count += addVcfVariants(seq, reader, range, vcfAssembly);
+ return null;
}
- return count;
+ /*
+ * map chromosomal coordinates from sequence to VCF if the VCF
+ * data has a different reference assembly to the sequence
+ */
+ // TODO generalise for cases other than GRCh38 -> GRCh37 !
+ // - or get the user to choose in a dialog
+
+ List<int[]> toVcfRanges = new ArrayList<>();
+ List<int[]> fromSequenceRanges = new ArrayList<>();
+ String toRef = "GRCh37";
+
+ for (int[] range : map.getToRanges())
+ {
+ int[] fromRange = map.locateInFrom(range[0], range[1]);
+ if (fromRange == null)
+ {
+ // corrupted map?!?
+ continue;
+ }
+
+ int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
+ toRef);
+ if (newRange == null)
+ {
+ Cache.log.error(
+ String.format("Failed to map %s:%s:%s:%d:%d to %s", species,
+ chromosome, seqRef, range[0], range[1], toRef));
+ continue;
+ }
+ else
+ {
+ toVcfRanges.add(newRange);
+ fromSequenceRanges.add(fromRange);
+ }
+ }
+
+ return new VCFMap(chromosome,
+ new MapList(fromSequenceRanges, toVcfRanges, 1, 1));
+ }
+
+ /**
+ * If the sequence id matches a contig declared in the VCF file, and the
+ * sequence matches the contig length, then returns a 1:1 map of the sequence to
+ * the contig, else returns null
+ *
+ * @param seq
+ * @return
+ */
+ private VCFMap getContigMap(SequenceI seq)
+ {
+ String id = seq.getName();
+ for (VCFContigHeaderLine contig : header.getContigLines())
+ {
+ if (contig.getID().equals(id))
+ {
+ /*
+ * have to construct a SAMSequenceRecord to
+ * read the contig 'length' field!
+ */
+ int len = seq.getLength();
+ SAMSequenceRecord ssr = contig.getSAMSequenceRecord();
+ if (len == ssr.getSequenceLength())
+ {
+ MapList map = new MapList(new int[] { 1, len },
+ new int[]
+ { 1, len }, 1, 1);
+ return new VCFMap(id, map);
+ }
+ }
+
+ }
+ return null;
+ }
+
+ /**
+ * Answers true if we determine that the VCF data uses the same reference
+ * assembly as the sequence, else false
+ *
+ * @param vcfAssembly
+ * @param seqRef
+ * @return
+ */
+ private boolean vcfAssemblyMatchesSequence(String vcfAssembly,
+ String seqRef)
+ {
+ // TODO improve on this stub, which handles gnomAD and
+ // hopes for the best for other cases
+
+ if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
+ && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
+ {
+ return false;
+ }
+ return true;
}
/**
}
/**
- * Queries the VCF reader for any variants that overlap the given chromosome
- * region of the sequence, and adds as variant features. Returns the number of
+ * Queries the VCF reader for any variants that overlap the mapped chromosome
+ * ranges of the sequence, and adds as variant features. Returns the number of
* overlapping variants found.
*
* @param seq
* @param reader
- * @param range
- * start-end range of a sequence region in its chromosomal
- * coordinates
+ * @param map
+ * mapping from sequence to VCF coordinates
* @param vcfAssembly
* the '##reference' identifier for the VCF reference assembly
* @return
*/
protected int addVcfVariants(SequenceI seq, VCFReader reader,
- int[] range, String vcfAssembly)
+ VCFMap map, String vcfAssembly)
{
- GeneLociI seqCoords = seq.getGeneLoci();
-
- String chromosome = seqCoords.getChromosomeId();
- String seqRef = seqCoords.getAssemblyId();
- String species = seqCoords.getSpeciesId();
-
- /*
- * map chromosomal coordinates from sequence to VCF if the VCF
- * data has a different reference assembly to the sequence
- */
- // TODO generalise for non-human species
- // - or get the user to choose in a dialog
-
- int offset = 0;
- if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
- && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
- {
- String toRef = "GRCh37";
- int[] newRange = mapReferenceRange(range, chromosome, "human",
- seqRef, toRef);
- if (newRange == null)
- {
- System.err.println(String.format(
- "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome,
- seqRef, range[0], range[1], toRef));
- return 0;
- }
- offset = newRange[0] - range[0];
- range = newRange;
- }
-
- boolean forwardStrand = range[0] <= range[1];
+ boolean forwardStrand = map.map.isToForwardStrand();
/*
- * query the VCF for overlaps
- * (convert a reverse strand range to forwards)
+ * query the VCF for overlaps of each contiguous chromosomal region
*/
int count = 0;
- MapList mapping = seqCoords.getMap();
- int fromLocus = Math.min(range[0], range[1]);
- int toLocus = Math.max(range[0], range[1]);
- CloseableIterator<VariantContext> variants = reader.query(chromosome,
- fromLocus, toLocus);
- while (variants.hasNext())
+ for (int[] range : map.map.getToRanges())
{
- /*
- * get variant location in sequence chromosomal coordinates
- */
- VariantContext variant = variants.next();
+ int vcfStart = Math.min(range[0], range[1]);
+ int vcfEnd = Math.max(range[0], range[1]);
+ CloseableIterator<VariantContext> variants = reader
+ .query(map.chromosome, vcfStart, vcfEnd);
+ while (variants.hasNext())
+ {
+ VariantContext variant = variants.next();
- int start = variant.getStart() - offset;
- int end = variant.getEnd() - offset;
+ int[] featureRange = map.map.locateInFrom(variant.getStart(),
+ variant.getEnd());
- /*
- * convert chromosomal location to sequence coordinates
- * - may be reverse strand (convert to forward for sequence feature)
- * - null if a partially overlapping feature
- */
- int[] seqLocation = mapping.locateInFrom(start, end);
- if (seqLocation != null)
- {
- int featureStart = Math.min(seqLocation[0], seqLocation[1]);
- int featureEnd = Math.max(seqLocation[0], seqLocation[1]);
- count += addAlleleFeatures(seq, variant, featureStart, featureEnd,
- forwardStrand);
+ if (featureRange != null)
+ {
+ int featureStart = Math.min(featureRange[0], featureRange[1]);
+ int featureEnd = Math.max(featureRange[0], featureRange[1]);
+ count += addAlleleFeatures(seq, variant, featureStart, featureEnd,
+ forwardStrand);
+ }
}
+ variants.close();
}
- variants.close();
-
return count;
}