From afdaac63a77fde754f15a400e7cc86407bb5adfd Mon Sep 17 00:00:00 2001 From: Jim Procter Date: Tue, 29 Mar 2022 20:06:40 +0100 Subject: [PATCH] JAL-3860 JAL-3892 extract genomic assembly mapping logic to static store --- src/jalview/datamodel/GenomicAssemblies.java | 158 ++++++++++++++++++++++++++ src/jalview/io/vcf/VCFLoader.java | 150 +----------------------- 2 files changed, 160 insertions(+), 148 deletions(-) create mode 100644 src/jalview/datamodel/GenomicAssemblies.java diff --git a/src/jalview/datamodel/GenomicAssemblies.java b/src/jalview/datamodel/GenomicAssemblies.java new file mode 100644 index 0000000..9754b90 --- /dev/null +++ b/src/jalview/datamodel/GenomicAssemblies.java @@ -0,0 +1,158 @@ +package jalview.datamodel; + +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; + +import jalview.ext.ensembl.EnsemblMap; +import jalview.io.vcf.VCFLoader; +import jalview.util.MappingUtils; + +public class GenomicAssemblies +{ + + /* + * mappings between VCF and sequence reference assembly regions, as + * key = "species!chromosome!fromAssembly!toAssembly + * value = Map{fromRange, toRange} + */ + private static Map> assemblyMappings = new HashMap<>(); + + /* + * internal delimiter used to build keys for assemblyMappings + * + */ + private static final String EXCL = "!"; + + /** + * Formats a ranges map lookup key + * + * @param chromosome + * @param species + * @param fromRef + * @param toRef + * @return + */ + public static String makeRangesKey(String chromosome, String species, + String fromRef, String toRef) + { + return species + EXCL + chromosome + EXCL + fromRef + EXCL + toRef; + } + + /** + * Determines the location of the query range (chromosome positions) in a + * different reference assembly. + *

+ * If the range is just a subregion of one for which we already have a mapping + * (for example, an exon sub-region of a gene), then the mapping is just + * computed arithmetically. + *

+ * Otherwise, calls the Ensembl REST service that maps from one assembly + * reference's coordinates to another's + * + * @param queryRange + * start-end chromosomal range in 'fromRef' coordinates + * @param chromosome + * @param species + * @param fromRef + * assembly reference for the query coordinates + * @param toRef + * assembly reference we wish to translate to + * @return the start-end range in 'toRef' coordinates + */ + public static int[] mapReferenceRange(int[] queryRange, String chromosome, + String species, String fromRef, String toRef) + { + /* + * first try shorcut of computing the mapping as a subregion of one + * we already have (e.g. for an exon, if we have the gene mapping) + */ + int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome, + species, fromRef, toRef); + if (mappedRange != null) + { + return mappedRange; + } + + /* + * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37 + */ + EnsemblMap mapper = new EnsemblMap(); + int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef, + toRef, queryRange); + + if (mapping == null) + { + // mapping service failure + return null; + } + + /* + * save mapping for possible future re-use + */ + String key = GenomicAssemblies.makeRangesKey(chromosome, species, fromRef, toRef); + if (!assemblyMappings.containsKey(key)) + { + assemblyMappings.put(key, new HashMap()); + } + + assemblyMappings.get(key).put(queryRange, mapping); + + return mapping; + } + + /** + * If we already have a 1:1 contiguous mapping which subsumes the given query + * range, this method just calculates and returns the subset of that mapping, + * else it returns null. In practical terms, if a gene has a contiguous + * mapping between (for example) GRCh37 and GRCh38, then we assume that its + * subsidiary exons occupy unchanged relative positions, and just compute + * these as offsets, rather than do another lookup of the mapping. + *

+ * If in future these assumptions prove invalid (e.g. for bacterial dna?!), + * simply remove this method or let it always return null. + *

+ * Warning: many rapid calls to the /map service map result in a 429 overload + * error response + * + * @param queryRange + * @param chromosome + * @param species + * @param fromRef + * @param toRef + * @return + */ + protected static int[] findSubsumedRangeMapping(int[] queryRange, + String chromosome, String species, String fromRef, String toRef) + { + String key = GenomicAssemblies.makeRangesKey(chromosome, species, fromRef, toRef); + if (assemblyMappings.containsKey(key)) + { + Map mappedRanges = assemblyMappings.get(key); + for (Entry mappedRange : mappedRanges.entrySet()) + { + int[] fromRange = mappedRange.getKey(); + int[] toRange = mappedRange.getValue(); + if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0]) + { + /* + * mapping is 1:1 in length, so we trust it to have no discontinuities + */ + if (MappingUtils.rangeContains(fromRange, queryRange)) + { + /* + * fromRange subsumes our query range + */ + int offset = queryRange[0] - fromRange[0]; + int mappedRangeFrom = toRange[0] + offset; + int mappedRangeTo = mappedRangeFrom + + (queryRange[1] - queryRange[0]); + return new int[] { mappedRangeFrom, mappedRangeTo }; + } + } + } + } + return null; + } + +} diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 9179b79..bbbade8 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -54,6 +54,7 @@ import jalview.bin.Cache; import jalview.bin.Console; import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLociI; +import jalview.datamodel.GenomicAssemblies; import jalview.datamodel.Mapping; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -186,23 +187,10 @@ public class VCFLoader private static final String FEATURE_GROUP_VCF = "VCF"; /* - * internal delimiter used to build keys for assemblyMappings - * - */ - private static final String EXCL = "!"; - - /* * the VCF file we are processing */ protected String vcfFilePath; - /* - * mappings between VCF and sequence reference assembly regions, as - * key = "species!chromosome!fromAssembly!toAssembly - * value = Map{fromRange, toRange} - */ - private Map> assemblyMappings; - private VCFReader reader; /* @@ -281,9 +269,6 @@ public class VCFLoader jalview.bin.Console .errPrintln("Error opening VCF file: " + e.getMessage()); } - - // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange} - assemblyMappings = new HashMap<>(); } /** @@ -829,7 +814,7 @@ public class VCFLoader continue; } - int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef, + int[] newRange = GenomicAssemblies.mapReferenceRange(range, chromosome, "human", seqRef, vcfAssembly); if (newRange == null) { @@ -1519,122 +1504,6 @@ public class VCFLoader } /** - * Determines the location of the query range (chromosome positions) in a - * different reference assembly. - *

- * If the range is just a subregion of one for which we already have a mapping - * (for example, an exon sub-region of a gene), then the mapping is just - * computed arithmetically. - *

- * Otherwise, calls the Ensembl REST service that maps from one assembly - * reference's coordinates to another's - * - * @param queryRange - * start-end chromosomal range in 'fromRef' coordinates - * @param chromosome - * @param species - * @param fromRef - * assembly reference for the query coordinates - * @param toRef - * assembly reference we wish to translate to - * @return the start-end range in 'toRef' coordinates - */ - protected int[] mapReferenceRange(int[] queryRange, String chromosome, - String species, String fromRef, String toRef) - { - /* - * first try shorcut of computing the mapping as a subregion of one - * we already have (e.g. for an exon, if we have the gene mapping) - */ - int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome, - species, fromRef, toRef); - if (mappedRange != null) - { - return mappedRange; - } - - /* - * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37 - */ - EnsemblMap mapper = new EnsemblMap(); - int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef, - toRef, queryRange); - - if (mapping == null) - { - // mapping service failure - return null; - } - - /* - * save mapping for possible future re-use - */ - String key = makeRangesKey(chromosome, species, fromRef, toRef); - if (!assemblyMappings.containsKey(key)) - { - assemblyMappings.put(key, new HashMap()); - } - - assemblyMappings.get(key).put(queryRange, mapping); - - return mapping; - } - - /** - * If we already have a 1:1 contiguous mapping which subsumes the given query - * range, this method just calculates and returns the subset of that mapping, - * else it returns null. In practical terms, if a gene has a contiguous - * mapping between (for example) GRCh37 and GRCh38, then we assume that its - * subsidiary exons occupy unchanged relative positions, and just compute - * these as offsets, rather than do another lookup of the mapping. - *

- * If in future these assumptions prove invalid (e.g. for bacterial dna?!), - * simply remove this method or let it always return null. - *

- * Warning: many rapid calls to the /map service map result in a 429 overload - * error response - * - * @param queryRange - * @param chromosome - * @param species - * @param fromRef - * @param toRef - * @return - */ - protected int[] findSubsumedRangeMapping(int[] queryRange, - String chromosome, String species, String fromRef, String toRef) - { - String key = makeRangesKey(chromosome, species, fromRef, toRef); - if (assemblyMappings.containsKey(key)) - { - Map mappedRanges = assemblyMappings.get(key); - for (Entry mappedRange : mappedRanges.entrySet()) - { - int[] fromRange = mappedRange.getKey(); - int[] toRange = mappedRange.getValue(); - if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0]) - { - /* - * mapping is 1:1 in length, so we trust it to have no discontinuities - */ - if (MappingUtils.rangeContains(fromRange, queryRange)) - { - /* - * fromRange subsumes our query range - */ - int offset = queryRange[0] - fromRange[0]; - int mappedRangeFrom = toRange[0] + offset; - int mappedRangeTo = mappedRangeFrom - + (queryRange[1] - queryRange[0]); - return new int[] { mappedRangeFrom, mappedRangeTo }; - } - } - } - } - return null; - } - - /** * Transfers the sequence feature to the target sequence, locating its start * and end range based on the mapping. Features which do not overlap the * target sequence are ignored. @@ -1659,19 +1528,4 @@ public class VCFLoader targetSequence.addSequenceFeature(copy); } } - - /** - * Formats a ranges map lookup key - * - * @param chromosome - * @param species - * @param fromRef - * @param toRef - * @return - */ - protected static String makeRangesKey(String chromosome, String species, - String fromRef, String toRef) - { - return species + EXCL + chromosome + EXCL + fromRef + EXCL + toRef; - } } -- 1.7.10.2