import jalview.bin.Console;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.GeneLociI;
+import jalview.datamodel.GenomicAssemblies;
import jalview.datamodel.Mapping;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
private static final String FEATURE_GROUP_VCF = "VCF";
/*
- * internal delimiter used to build keys for assemblyMappings
- *
- */
- private static final String EXCL = "!";
-
- /*
* the VCF file we are processing
*/
protected String vcfFilePath;
- /*
- * mappings between VCF and sequence reference assembly regions, as
- * key = "species!chromosome!fromAssembly!toAssembly
- * value = Map{fromRange, toRange}
- */
- private Map<String, Map<int[], int[]>> assemblyMappings;
-
private VCFReader reader;
/*
{
System.err.println("Error opening VCF file: " + e.getMessage());
}
-
- // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
- assemblyMappings = new HashMap<>();
}
/**
continue;
}
- int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
+ int[] newRange = GenomicAssemblies.mapReferenceRange(range, chromosome, "human", seqRef,
vcfAssembly);
if (newRange == null)
{
}
/**
- * Determines the location of the query range (chromosome positions) in a
- * different reference assembly.
- * <p>
- * If the range is just a subregion of one for which we already have a mapping
- * (for example, an exon sub-region of a gene), then the mapping is just
- * computed arithmetically.
- * <p>
- * Otherwise, calls the Ensembl REST service that maps from one assembly
- * reference's coordinates to another's
- *
- * @param queryRange
- * start-end chromosomal range in 'fromRef' coordinates
- * @param chromosome
- * @param species
- * @param fromRef
- * assembly reference for the query coordinates
- * @param toRef
- * assembly reference we wish to translate to
- * @return the start-end range in 'toRef' coordinates
- */
- protected int[] mapReferenceRange(int[] queryRange, String chromosome,
- String species, String fromRef, String toRef)
- {
- /*
- * first try shorcut of computing the mapping as a subregion of one
- * we already have (e.g. for an exon, if we have the gene mapping)
- */
- int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
- species, fromRef, toRef);
- if (mappedRange != null)
- {
- return mappedRange;
- }
-
- /*
- * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
- */
- EnsemblMap mapper = new EnsemblMap();
- int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
- toRef, queryRange);
-
- if (mapping == null)
- {
- // mapping service failure
- return null;
- }
-
- /*
- * save mapping for possible future re-use
- */
- String key = makeRangesKey(chromosome, species, fromRef, toRef);
- if (!assemblyMappings.containsKey(key))
- {
- assemblyMappings.put(key, new HashMap<int[], int[]>());
- }
-
- assemblyMappings.get(key).put(queryRange, mapping);
-
- return mapping;
- }
-
- /**
- * If we already have a 1:1 contiguous mapping which subsumes the given query
- * range, this method just calculates and returns the subset of that mapping,
- * else it returns null. In practical terms, if a gene has a contiguous
- * mapping between (for example) GRCh37 and GRCh38, then we assume that its
- * subsidiary exons occupy unchanged relative positions, and just compute
- * these as offsets, rather than do another lookup of the mapping.
- * <p>
- * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
- * simply remove this method or let it always return null.
- * <p>
- * Warning: many rapid calls to the /map service map result in a 429 overload
- * error response
- *
- * @param queryRange
- * @param chromosome
- * @param species
- * @param fromRef
- * @param toRef
- * @return
- */
- protected int[] findSubsumedRangeMapping(int[] queryRange,
- String chromosome, String species, String fromRef, String toRef)
- {
- String key = makeRangesKey(chromosome, species, fromRef, toRef);
- if (assemblyMappings.containsKey(key))
- {
- Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
- for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
- {
- int[] fromRange = mappedRange.getKey();
- int[] toRange = mappedRange.getValue();
- if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
- {
- /*
- * mapping is 1:1 in length, so we trust it to have no discontinuities
- */
- if (MappingUtils.rangeContains(fromRange, queryRange))
- {
- /*
- * fromRange subsumes our query range
- */
- int offset = queryRange[0] - fromRange[0];
- int mappedRangeFrom = toRange[0] + offset;
- int mappedRangeTo = mappedRangeFrom
- + (queryRange[1] - queryRange[0]);
- return new int[] { mappedRangeFrom, mappedRangeTo };
- }
- }
- }
- }
- return null;
- }
-
- /**
* Transfers the sequence feature to the target sequence, locating its start
* and end range based on the mapping. Features which do not overlap the
* target sequence are ignored.
targetSequence.addSequenceFeature(copy);
}
}
-
- /**
- * Formats a ranges map lookup key
- *
- * @param chromosome
- * @param species
- * @param fromRef
- * @param toRef
- * @return
- */
- protected static String makeRangesKey(String chromosome, String species,
- String fromRef, String toRef)
- {
- return species + EXCL + chromosome + EXCL + fromRef + EXCL + toRef;
- }
}