From: gmungoc Date: Mon, 25 Sep 2017 13:46:27 +0000 (+0100) Subject: Merge branch 'develop' into features/JAL-1793VCF X-Git-Tag: Release_2_11_0~231 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=7725003d7903d17d2bed685e77e5efb5bf2014b3;hp=12ba8429c72ec9428f88adb6ae5338a5df63552e;p=jalview.git Merge branch 'develop' into features/JAL-1793VCF --- diff --git a/resources/lang/Messages.properties b/resources/lang/Messages.properties index 5d9bdff..7295104 100644 --- a/resources/lang/Messages.properties +++ b/resources/lang/Messages.properties @@ -490,6 +490,8 @@ label.settings_for_type = Settings for {0} label.view_full_application = View in Full Application label.load_associated_tree = Load Associated Tree... label.load_features_annotations = Load Features/Annotations... +label.load_vcf = Load SNP variants from plain text or indexed VCF data +label.load_vcf_file = Load VCF File label.export_features = Export Features... label.export_annotations = Export Annotations... label.to_upper_case = To Upper Case diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 2b9b9f9..6acac01 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -36,6 +36,7 @@ import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import jalview.datamodel.features.SequenceFeatures; +import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyI; import jalview.schemes.ResidueProperties; import jalview.util.Comparison; @@ -105,6 +106,15 @@ public class AlignmentUtils { return variant == null ? null : variant.getFeatureGroup(); } + + /** + * toString for aid in the debugger only + */ + @Override + public String toString() + { + return base + ":" + (variant == null ? "" : variant.getDescription()); + } } /** @@ -2365,7 +2375,7 @@ public class AlignmentUtils { if (var.variant != null) { - String alleles = (String) var.variant.getValue("alleles"); + String alleles = (String) var.variant.getValue(Gff3Helper.ALLELES); if (alleles != null) { for (String base : alleles.split(",")) @@ -2387,7 +2397,7 @@ public class AlignmentUtils { if (var.variant != null) { - String alleles = (String) var.variant.getValue("alleles"); + String alleles = (String) var.variant.getValue(Gff3Helper.ALLELES); if (alleles != null) { for (String base : alleles.split(",")) @@ -2409,7 +2419,7 @@ public class AlignmentUtils { if (var.variant != null) { - String alleles = (String) var.variant.getValue("alleles"); + String alleles = (String) var.variant.getValue(Gff3Helper.ALLELES); if (alleles != null) { for (String base : alleles.split(",")) @@ -2542,6 +2552,22 @@ public class AlignmentUtils // not handling multi-locus variant features continue; } + + /* + * extract dna variants to a string array + */ + String alls = (String) sf.getValue(Gff3Helper.ALLELES); + if (alls == null) + { + continue; // non-SNP VCF variant perhaps - can't process this + } + String[] alleles = alls.toUpperCase().split(","); + int i = 0; + for (String allele : alleles) + { + alleles[i++] = allele.trim(); // lose any space characters "A, G" + } + int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); if (mapsTo == null) { @@ -2560,21 +2586,6 @@ public class AlignmentUtils } /* - * extract dna variants to a string array - */ - String alls = (String) sf.getValue("alleles"); - if (alls == null) - { - continue; - } - String[] alleles = alls.toUpperCase().split(","); - int i = 0; - for (String allele : alleles) - { - alleles[i++] = allele.trim(); // lose any space characters "A, G" - } - - /* * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10] */ int[] codon = peptidePosition == lastPeptidePostion ? lastCodon diff --git a/src/jalview/datamodel/GeneLoci.java b/src/jalview/datamodel/GeneLoci.java new file mode 100644 index 0000000..b4bd642 --- /dev/null +++ b/src/jalview/datamodel/GeneLoci.java @@ -0,0 +1,46 @@ +package jalview.datamodel; + +import jalview.util.MapList; + +/** + * A data bean to model one or more contiguous regions on one chromosome + */ +public class GeneLoci +{ + /* + * an identifier for the species + */ + public final String species; + + /* + * an identifier for a genome assembly, e.g. GRCh38 + */ + public final String assembly; + + /* + * a chromosome identifier, e.g. "5" or "X" + */ + public final String chromosome; + + /* + * mapping from sequence positions to chromosome locations; + * any regions with start > end are on the reverse strand + */ + public final MapList mapping; + + /** + * Constructor + * + * @param taxon + * @param ref + * @param chrId + * @param map + */ + public GeneLoci(String taxon, String ref, String chrId, MapList map) + { + species = taxon; + assembly = ref; + chromosome = chrId; + mapping = map; + } +} diff --git a/src/jalview/datamodel/Sequence.java b/src/jalview/datamodel/Sequence.java index 2f1da7f..a619d84 100755 --- a/src/jalview/datamodel/Sequence.java +++ b/src/jalview/datamodel/Sequence.java @@ -106,6 +106,8 @@ public class Sequence extends ASequence implements SequenceI */ private int changeCount; + private GeneLoci geneLoci; + /** * Creates a new Sequence object. * @@ -645,21 +647,69 @@ public class Sequence extends ASequence implements SequenceI } /** - * DOCUMENT ME! + * Sets the sequence description, and also parses out any special formats of + * interest * * @param desc - * DOCUMENT ME! */ @Override public void setDescription(String desc) { this.description = desc; + parseDescription(); } /** - * DOCUMENT ME! + * Parses and saves fields of an Ensembl-style description e.g. + * chromosome:GRCh38:17:45051610:45109016:1 + */ + protected void parseDescription() + { + if (description == null) + { + return; + } + String[] tokens = description.split(":"); + if (tokens.length == 6 && "chromosome".equals(tokens[0])) { + String ref = tokens[1]; + String chrom = tokens[2]; + try { + int chStart = Integer.parseInt(tokens[3]); + int chEnd = Integer.parseInt(tokens[4]); + boolean forwardStrand = "1".equals(tokens[5]); + String species = ""; // dunno yet! + int[] from = new int[] { start, end }; + int[] to = new int[] { forwardStrand ? chStart : chEnd, + forwardStrand ? chEnd : chStart }; + MapList map = new MapList(from, to, 1, 1); + GeneLoci gl = new GeneLoci(species, ref, chrom, map); + setGeneLoci(gl); + } catch (NumberFormatException e) + { + System.err.println("Bad integers in description " + description); + } + } + } + + public void setGeneLoci(GeneLoci gl) + { + geneLoci = gl; + } + + /** + * Returns the gene loci mapping for the sequence (may be null) * - * @return DOCUMENT ME! + * @return + */ + public GeneLoci getGeneLoci() + { + return geneLoci; + } + + /** + * Answers the description + * + * @return */ @Override public String getDescription() diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index 365c1c2..f975ac8 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -23,6 +23,7 @@ package jalview.ext.ensembl; import jalview.api.FeatureColourI; import jalview.api.FeatureSettingsModelI; import jalview.datamodel.AlignmentI; +import jalview.datamodel.GeneLoci; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -401,6 +402,8 @@ public class EnsemblGene extends EnsemblSeqProxy cdna.transferFeatures(gene.getFeatures().getPositionalFeatures(), transcript.getDatasetSequence(), mapping, parentId); + mapTranscriptToChromosome(transcript, gene, mapping); + /* * fetch and save cross-references */ @@ -415,6 +418,51 @@ public class EnsemblGene extends EnsemblSeqProxy } /** + * If the gene has a mapping to chromosome coordinates, derive the transcript + * chromosome regions and save on the transcript sequence + * + * @param transcript + * @param gene + * @param mapping + * the mapping from gene to transcript positions + */ + protected void mapTranscriptToChromosome(Sequence transcript, + SequenceI gene, MapList mapping) + { + GeneLoci loci = ((Sequence) gene).getGeneLoci(); + if (loci == null) + { + return; + } + + /* + * patch to ensure gene to chromosome mapping is complete + * (in case created before gene length was known) + */ + MapList geneMapping = loci.mapping; + if (geneMapping.getFromRanges().get(0)[1] == 0) + { + geneMapping.getFromRanges().get(0)[0] = gene.getStart(); + geneMapping.getFromRanges().get(0)[1] = gene.getEnd(); + } + + List exons = mapping.getFromRanges(); + List transcriptLoci = new ArrayList<>(); + + for (int[] exon : exons) { + transcriptLoci.add(geneMapping.locateInTo(exon[0], exon[1])); + } + + List transcriptRange = Arrays.asList(new int[] { + transcript.getStart(), transcript.getEnd() }); + MapList mapList = new MapList(transcriptRange, transcriptLoci, 1, 1); + GeneLoci gl = new GeneLoci(loci.species, loci.assembly, + loci.chromosome, mapList); + + transcript.setGeneLoci(gl); + } + + /** * Returns the 'transcript_id' property of the sequence feature (or null) * * @param feature diff --git a/src/jalview/ext/ensembl/EnsemblMap.java b/src/jalview/ext/ensembl/EnsemblMap.java new file mode 100644 index 0000000..05cc897 --- /dev/null +++ b/src/jalview/ext/ensembl/EnsemblMap.java @@ -0,0 +1,183 @@ +package jalview.ext.ensembl; + +import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefSource; + +import java.io.BufferedReader; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Iterator; +import java.util.List; + +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; + +public class EnsemblMap extends EnsemblRestClient +{ + + /** + * Default constructor (to use rest.ensembl.org) + */ + public EnsemblMap() + { + super(); + } + + /** + * Constructor given the target domain to fetch data from + * + * @param + */ + public EnsemblMap(String domain) + { + super(domain); + } + + @Override + public String getDbName() + { + return DBRefSource.ENSEMBL; + } + + @Override + public AlignmentI getSequenceRecords(String queries) throws Exception + { + return null; // not used + } + + /** + * Constructs a URL of the format + * http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37?content-type=application/json + * + * + * @param species + * @param chromosome + * @param fromRef + * @param toRef + * @param startPos + * @param endPos + * @return + * @throws MalformedURLException + */ + protected URL getUrl(String species, String chromosome, String fromRef, + String toRef, int startPos, int endPos) + throws MalformedURLException + { + /* + * start-end might be reverse strand - present forwards to the service + */ + boolean forward = startPos <= endPos; + int start = forward ? startPos : endPos; + int end = forward ? endPos : startPos; + String strand = forward ? "1" : "-1"; + String url = String.format( + "%s/map/%s/%s/%s:%d..%d:%s/%s?content-type=application/json", + getDomain(), species, fromRef, chromosome, start, end, strand, + toRef); + try + { + return new URL(url); + } catch (MalformedURLException e) + { + return null; + } + } + + @Override + protected boolean useGetRequest() + { + return true; + } + + @Override + protected String getRequestMimeType(boolean multipleIds) + { + return "application/json"; + } + + @Override + protected String getResponseMimeType() + { + return "application/json"; + } + + @Override + protected URL getUrl(List ids) throws MalformedURLException + { + return null; // not used + } + + public int[] getMapping(String species, String chromosome, + String fromRef, String toRef, int[] queryRange) + { + URL url = null; + BufferedReader br = null; + + try + { + url = getUrl(species, chromosome, fromRef, toRef, queryRange[0], + queryRange[1]); + // System.out.println("Calling " + url); + br = getHttpResponse(url, null); + return (parseResponse(br)); + } catch (Throwable t) + { + System.out.println("Error calling " + url + ": " + t.getMessage()); + return null; + } + } + + /** + * Parses the JSON response from the /map REST service. The format is (with + * some fields omitted) + * + *
+   *  {"mappings": 
+   *    [{
+   *       "original": {"end":45109016,"start":45051610},
+   *       "mapped"  : {"end":43186384,"start":43128978} 
+   *  }] }
+   * 
+ * + * @param br + * @return + */ + protected int[] parseResponse(BufferedReader br) + { + int[] result = null; + JSONParser jp = new JSONParser(); + + try + { + JSONObject parsed = (JSONObject) jp.parse(br); + JSONArray mappings = (JSONArray) parsed.get("mappings"); + + Iterator rvals = mappings.iterator(); + while (rvals.hasNext()) + { + // todo check for "mapped" + JSONObject val = (JSONObject) rvals.next(); + JSONObject mapped = (JSONObject) val.get("mapped"); + int start = Integer.parseInt(mapped.get("start").toString()); + int end = Integer.parseInt(mapped.get("end").toString()); + String strand = mapped.get("strand").toString(); + if ("1".equals(strand)) + { + result = new int[] { start, end }; + } + else + { + result = new int[] { end, start }; + } + } + } catch (IOException | ParseException | NumberFormatException e) + { + // ignore + } + return result; + } + +} diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 577111e..35ceea3 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -34,6 +34,7 @@ import jalview.datamodel.features.SequenceFeatures; import jalview.exceptions.JalviewException; import jalview.io.FastaFile; import jalview.io.FileParse; +import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; import jalview.util.Comparison; @@ -59,8 +60,6 @@ import java.util.Map.Entry; */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { - private static final String ALLELES = "alleles"; - protected static final String PARENT = "Parent"; protected static final String ID = "ID"; @@ -717,7 +716,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ static void reverseComplementAlleles(SequenceFeature sf) { - final String alleles = (String) sf.getValue(ALLELES); + final String alleles = (String) sf.getValue(Gff3Helper.ALLELES); if (alleles == null) { return; @@ -728,7 +727,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient reverseComplementAllele(complement, allele); } String comp = complement.toString(); - sf.setValue(ALLELES, comp); + sf.setValue(Gff3Helper.ALLELES, comp); sf.setDescription(comp); /* @@ -738,7 +737,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String atts = sf.getAttributes(); if (atts != null) { - atts = atts.replace(ALLELES + "=" + alleles, ALLELES + "=" + comp); + atts = atts.replace(Gff3Helper.ALLELES + "=" + alleles, + Gff3Helper.ALLELES + "=" + comp); sf.setAttributes(atts); } } diff --git a/src/jalview/ext/htsjdk/VCFReader.java b/src/jalview/ext/htsjdk/VCFReader.java new file mode 100644 index 0000000..14c057f --- /dev/null +++ b/src/jalview/ext/htsjdk/VCFReader.java @@ -0,0 +1,214 @@ +package jalview.ext.htsjdk; + +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFFileReader; +import htsjdk.variant.vcf.VCFHeader; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; + +/** + * A thin wrapper for htsjdk classes to read either plain, or compressed, or + * compressed and indexed VCF files + */ +public class VCFReader implements Closeable, Iterable +{ + private static final String GZ = "gz"; + + private static final String TBI_EXTENSION = ".tbi"; + + private boolean indexed; + + private VCFFileReader reader; + + /** + * Constructor given a raw or compressed VCF file or a (tabix) index file + *

+ * For now, file type is inferred from its suffix: .gz or .bgz for compressed + * data, .tbi for an index file, anything else is assumed to be plain text + * VCF. + * + * @param f + * @throws IOException + */ + public VCFReader(String filePath) throws IOException + { + if (filePath.endsWith(GZ)) + { + if (new File(filePath + TBI_EXTENSION).exists()) + { + indexed = true; + } + } + else if (filePath.endsWith(TBI_EXTENSION)) + { + indexed = true; + filePath = filePath.substring(0, filePath.length() - 4); + } + + reader = new VCFFileReader(new File(filePath), indexed); + } + + @Override + public void close() throws IOException + { + if (reader != null) + { + reader.close(); + } + } + + /** + * Returns an iterator over VCF variants in the file. The client should call + * close() on the iterator when finished with it. + */ + @Override + public CloseableIterator iterator() + { + return reader == null ? null : reader.iterator(); + } + + /** + * Queries for records overlapping the region specified. Note that this method + * is performant if the VCF file is indexed, and may be very slow if it is + * not. + *

+ * Client code should call close() on the iterator when finished with it. + * + * @param chrom + * the chromosome to query + * @param start + * query interval start + * @param end + * query interval end + * @return + */ + public CloseableIterator query(final String chrom, + final int start, final int end) + { + if (reader == null) { + return null; + } + if (indexed) + { + return reader.query(chrom, start, end); + } + else + { + return queryUnindexed(chrom, start, end); + } + } + + /** + * Returns an iterator over variant records read from a flat file which + * overlap the specified chromosomal positions. Call close() on the iterator + * when finished with it! + * + * @param chrom + * @param start + * @param end + * @return + */ + protected CloseableIterator queryUnindexed( + final String chrom, final int start, final int end) + { + final CloseableIterator it = reader.iterator(); + + return new CloseableIterator() + { + boolean atEnd = false; + + // prime look-ahead buffer with next matching record + private VariantContext next = findNext(); + + private VariantContext findNext() + { + if (atEnd) + { + return null; + } + VariantContext variant = null; + while (it.hasNext()) + { + variant = it.next(); + int vstart = variant.getStart(); + + if (vstart > end) + { + atEnd = true; + close(); + return null; + } + + int vend = variant.getEnd(); + // todo what is the undeprecated way to get + // the chromosome for the variant? + if (chrom.equals(variant.getChr()) && (vstart <= end) + && (vend >= start)) + { + return variant; + } + } + return null; + } + + @Override + public boolean hasNext() + { + boolean hasNext = !atEnd && (next != null); + if (!hasNext) + { + close(); + } + return hasNext; + } + + @Override + public VariantContext next() + { + /* + * return the next match, and then re-prime + * it with the following one (if any) + */ + VariantContext temp = next; + next = findNext(); + return temp; + } + + @Override + public void remove() + { + // not implemented + } + + @Override + public void close() + { + it.close(); + } + }; + } + + /** + * Returns an object that models the VCF file headers + * + * @return + */ + public VCFHeader getFileHeader() + { + return reader == null ? null : reader.getFileHeader(); + } + + /** + * Answers true if we are processing a tab-indexed VCF file, false if it is a + * plain text (uncompressed) file. + * + * @return + */ + public boolean isIndex() + { + return indexed; + } +} diff --git a/src/jalview/gui/AlignFrame.java b/src/jalview/gui/AlignFrame.java index 13b715e..95cabcd 100644 --- a/src/jalview/gui/AlignFrame.java +++ b/src/jalview/gui/AlignFrame.java @@ -81,6 +81,7 @@ import jalview.io.JnetAnnotationMaker; import jalview.io.NewickFile; import jalview.io.ScoreMatrixFile; import jalview.io.TCoffeeScoreFile; +import jalview.io.vcf.VCFLoader; import jalview.jbgui.GAlignFrame; import jalview.schemes.ColourSchemeI; import jalview.schemes.ColourSchemes; @@ -841,6 +842,7 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener, AlignmentI al = getViewport().getAlignment(); boolean nucleotide = al.isNucleotide(); + loadVcf.setVisible(nucleotide); showTranslation.setVisible(nucleotide); showReverse.setVisible(nucleotide); showReverseComplement.setVisible(nucleotide); @@ -5584,6 +5586,26 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener, new CalculationChooser(AlignFrame.this); } } + + @Override + protected void loadVcf_actionPerformed() + { + JalviewFileChooser chooser = new JalviewFileChooser( + Cache.getProperty("LAST_DIRECTORY")); + chooser.setFileView(new JalviewFileView()); + chooser.setDialogTitle(MessageManager.getString("label.load_vcf_file")); + chooser.setToolTipText(MessageManager.getString("label.load_vcf_file")); + + int value = chooser.showOpenDialog(null); + + if (value == JalviewFileChooser.APPROVE_OPTION) + { + String choice = chooser.getSelectedFile().getPath(); + Cache.setProperty("LAST_DIRECTORY", choice); + new VCFLoader(viewport.getAlignment()).loadVCF(choice, this); + } + + } } class PrintThread extends Thread diff --git a/src/jalview/io/gff/Gff3Helper.java b/src/jalview/io/gff/Gff3Helper.java index c7e1d7a..a25a014 100644 --- a/src/jalview/io/gff/Gff3Helper.java +++ b/src/jalview/io/gff/Gff3Helper.java @@ -39,6 +39,8 @@ import java.util.Map; */ public class Gff3Helper extends GffHelperBase { + public static final String ALLELES = "alleles"; + protected static final String TARGET = "Target"; protected static final String ID = "ID"; @@ -399,7 +401,7 @@ public class Gff3Helper extends GffHelperBase /* * Ensembl returns dna variants as 'alleles' */ - desc = StringUtils.listToDelimitedString(attributes.get("alleles"), + desc = StringUtils.listToDelimitedString(attributes.get(ALLELES), ","); } diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java new file mode 100644 index 0000000..13966f4 --- /dev/null +++ b/src/jalview/io/vcf/VCFLoader.java @@ -0,0 +1,505 @@ +package jalview.io.vcf; + +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; + +import jalview.analysis.AlignmentUtils; +import jalview.api.AlignViewControllerGuiI; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.GeneLoci; +import jalview.datamodel.Mapping; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.VCFReader; +import jalview.io.gff.Gff3Helper; +import jalview.io.gff.SequenceOntologyI; +import jalview.util.MapList; +import jalview.util.MappingUtils; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +/** + * A class to read VCF data (using the htsjdk) and add variants as sequence + * features on dna and any related protein product sequences + * + * @author gmcarstairs + */ +public class VCFLoader +{ + private static final String EXCL = "!"; + + /* + * the alignment we are associated VCF data with + */ + private AlignmentI al; + + /* + * mappings between VCF and sequence reference assembly regions, as + * key = "species!chromosome!fromAssembly!toAssembly + * value = Map{fromRange, toRange} + */ + private Map> assemblyMappings; + + /** + * Constructor given an alignment context + * + * @param alignment + */ + public VCFLoader(AlignmentI alignment) + { + al = alignment; + + // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange} + assemblyMappings = new HashMap>(); + } + + /** + * Loads VCF on to an alignment - provided it can be related to one or more + * sequence's chromosomal coordinates. + *

+ * This method is not thread safe - concurrent threads should use separate + * instances of this class. + * + * @param filePath + * @param status + */ + public void loadVCF(String filePath, AlignViewControllerGuiI status) + { + VCFReader reader = null; + + try + { + // long start = System.currentTimeMillis(); + reader = new VCFReader(filePath); + + VCFHeader header = reader.getFileHeader(); + VCFHeaderLine ref = header + .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + // check if reference is wrt assembly19 (GRCh37) + // todo may need to allow user to specify reference assembly? + boolean isRefGrch37 = (ref != null && ref.getValue().contains( + "assembly19")); + + int varCount = 0; + int seqCount = 0; + + /* + * query for VCF overlapping each sequence in turn + */ + for (SequenceI seq : al.getSequences()) + { + int added = loadVCF(seq, reader, isRefGrch37); + if (added > 0) + { + seqCount++; + varCount += added; + computePeptideVariants(seq); + } + } + // long elapsed = System.currentTimeMillis() - start; + String msg = String.format("Added %d VCF variants to %d sequence(s)", + varCount, seqCount); + if (status != null) + { + status.setStatus(msg); + } + } catch (Throwable e) + { + System.err.println("Error processing VCF: " + e.getMessage()); + e.printStackTrace(); + } finally + { + if (reader != null) + { + try + { + reader.close(); + } catch (IOException e) + { + // ignore + } + } + } + } + + /** + * (Re-)computes peptide variants from dna variants, for any protein sequence + * to which the dna sequence has a mapping. Note that although duplicate + * features may get computed, they will not be added, since duplicate sequence + * features are ignored in Sequence.addSequenceFeature. + * + * @param dnaSeq + */ + protected void computePeptideVariants(SequenceI dnaSeq) + { + DBRefEntry[] dbrefs = dnaSeq.getDBRefs(); + if (dbrefs == null) + { + return; + } + for (DBRefEntry dbref : dbrefs) + { + Mapping mapping = dbref.getMap(); + if (mapping == null || mapping.getTo() == null + || mapping.getMap().getFromRatio() != 3) + { + continue; + } + AlignmentUtils.computeProteinFeatures(dnaSeq, mapping.getTo(), + mapping.getMap()); + } + } + + /** + * Tries to add overlapping variants read from a VCF file to the given + * sequence, and returns the number of overlapping variants found. Note that + * this requires the sequence to hold information as to its chromosomal + * positions and reference, in order to be able to map the VCF variants to the + * sequence. + * + * @param seq + * @param reader + * @param isVcfRefGrch37 + * @return + */ + protected int loadVCF(SequenceI seq, VCFReader reader, + boolean isVcfRefGrch37) + { + int count = 0; + GeneLoci seqCoords = ((Sequence) seq).getGeneLoci(); + if (seqCoords == null) + { + return 0; + } + + List seqChromosomalContigs = seqCoords.mapping.getToRanges(); + for (int[] range : seqChromosomalContigs) + { + count += addVcfVariants(seq, reader, range, isVcfRefGrch37); + } + + return count; + } + + /** + * Queries the VCF reader for any variants that overlap the given chromosome + * region of the sequence, and adds as variant features. Returns the number of + * overlapping variants found. + * + * @param seq + * @param reader + * @param range + * start-end range of a sequence region in its chromosomal + * coordinates + * @param isVcfRefGrch37 + * true if the VCF is with reference to GRCh37 + * @return + */ + protected int addVcfVariants(SequenceI seq, VCFReader reader, + int[] range, boolean isVcfRefGrch37) + { + GeneLoci seqCoords = ((Sequence) seq).getGeneLoci(); + + String chromosome = seqCoords.chromosome; + String seqRef = seqCoords.assembly; + String species = seqCoords.species; + + // TODO handle species properly + if ("".equals(species)) + { + species = "human"; + } + + /* + * map chromosomal coordinates from GRCh38 (sequence) to + * GRCh37 (VCF) if necessary + */ + // TODO generalise for other assemblies and species + int offset = 0; + String fromRef = "GRCh38"; + if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37) + { + String toRef = "GRCh37"; + int[] newRange = mapReferenceRange(range, chromosome, species, + fromRef, toRef); + if (newRange == null) + { + System.err.println(String.format( + "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome, + fromRef, range[0], range[1], toRef)); + return 0; + } + offset = newRange[0] - range[0]; + range = newRange; + } + + /* + * query the VCF for overlaps + * (convert a reverse strand range to forwards) + */ + int count = 0; + MapList mapping = seqCoords.mapping; + + int fromLocus = Math.min(range[0], range[1]); + int toLocus = Math.max(range[0], range[1]); + CloseableIterator variants = reader.query(chromosome, + fromLocus, toLocus); + while (variants.hasNext()) + { + /* + * get variant location in sequence chromosomal coordinates + */ + VariantContext variant = variants.next(); + + /* + * we can only process SNP variants (which can be reported + * as part of a MIXED variant record + */ + if (!variant.isSNP() && !variant.isMixed()) + { + continue; + } + + count++; + int start = variant.getStart() - offset; + int end = variant.getEnd() - offset; + + /* + * convert chromosomal location to sequence coordinates + * - null if a partially overlapping feature + */ + int[] seqLocation = mapping.locateInFrom(start, end); + if (seqLocation != null) + { + addVariantFeatures(seq, variant, seqLocation[0], seqLocation[1]); + } + } + + variants.close(); + + return count; + } + + /** + * Inspects the VCF variant record, and adds variant features to the sequence. + * Only SNP variants are added, not INDELs. + * + * @param seq + * @param variant + * @param featureStart + * @param featureEnd + */ + protected void addVariantFeatures(SequenceI seq, VariantContext variant, + int featureStart, int featureEnd) + { + String reference = variant.getReference().getBaseString(); + if (reference.length() != 1) + { + /* + * sorry, we don't handle INDEL variants + */ + return; + } + + /* + * for now we extract allele frequency as feature score; note + * this attribute is String for a simple SNP, but List if + * multiple alleles at the locus; we extract for the simple case only, + * since not sure how to match allele order with AF values + */ + Object af = variant.getAttribute("AF"); + float score = 0f; + if (af instanceof String) + { + try + { + score = Float.parseFloat((String) af); + } catch (NumberFormatException e) + { + // leave as 0 + } + } + + StringBuilder sb = new StringBuilder(); + sb.append(reference); + + /* + * inspect alleles and record SNP variants (as the variant + * record could be MIXED and include INDEL and SNP alleles) + */ + int alleleCount = 0; + + /* + * inspect alleles; warning: getAlleles gives no guarantee + * as to the order in which they are returned + */ + for (Allele allele : variant.getAlleles()) + { + if (!allele.isReference()) + { + String alleleBase = allele.getBaseString(); + if (alleleBase.length() == 1) + { + sb.append(",").append(alleleBase); + alleleCount++; + } + } + } + String alleles = sb.toString(); // e.g. G,A,C + + String type = SequenceOntologyI.SEQUENCE_VARIANT; + + SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, + featureEnd, score, "VCF"); + + sf.setValue(Gff3Helper.ALLELES, alleles); + + Map atts = variant.getAttributes(); + for (Entry att : atts.entrySet()) + { + sf.setValue(att.getKey(), att.getValue()); + } + seq.addSequenceFeature(sf); + } + + /** + * Determines the location of the query range (chromosome positions) in a + * different reference assembly. + *

+ * If the range is just a subregion of one for which we already have a mapping + * (for example, an exon sub-region of a gene), then the mapping is just + * computed arithmetically. + *

+ * Otherwise, calls the Ensembl REST service that maps from one assembly + * reference's coordinates to another's + * + * @param queryRange + * start-end chromosomal range in 'fromRef' coordinates + * @param chromosome + * @param species + * @param fromRef + * assembly reference for the query coordinates + * @param toRef + * assembly reference we wish to translate to + * @return the start-end range in 'toRef' coordinates + */ + protected int[] mapReferenceRange(int[] queryRange, String chromosome, + String species, String fromRef, String toRef) + { + /* + * first try shorcut of computing the mapping as a subregion of one + * we already have (e.g. for an exon, if we have the gene mapping) + */ + int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome, + species, fromRef, toRef); + if (mappedRange != null) + { + return mappedRange; + } + + /* + * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37 + */ + EnsemblMap mapper = new EnsemblMap(); + int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef, + queryRange); + + if (mapping == null) + { + // mapping service failure + return null; + } + + /* + * save mapping for possible future re-use + */ + String key = makeRangesKey(chromosome, species, fromRef, toRef); + if (!assemblyMappings.containsKey(key)) + { + assemblyMappings.put(key, new HashMap()); + } + + assemblyMappings.get(key).put(queryRange, mapping); + + return mapping; + } + + /** + * If we already have a 1:1 contiguous mapping which subsumes the given query + * range, this method just calculates and returns the subset of that mapping, + * else it returns null. In practical terms, if a gene has a contiguous + * mapping between (for example) GRCh37 and GRCh38, then we assume that its + * subsidiary exons occupy unchanged relative positions, and just compute + * these as offsets, rather than do another lookup of the mapping. + *

+ * If in future these assumptions prove invalid (e.g. for bacterial dna?!), + * simply remove this method or let it always return null. + *

+ * Warning: many rapid calls to the /map service map result in a 429 overload + * error response + * + * @param queryRange + * @param chromosome + * @param species + * @param fromRef + * @param toRef + * @return + */ + protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome, + String species, String fromRef, String toRef) + { + String key = makeRangesKey(chromosome, species, fromRef, toRef); + if (assemblyMappings.containsKey(key)) + { + Map mappedRanges = assemblyMappings.get(key); + for (Entry mappedRange : mappedRanges.entrySet()) + { + int[] fromRange = mappedRange.getKey(); + int[] toRange = mappedRange.getValue(); + if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0]) + { + /* + * mapping is 1:1 in length, so we trust it to have no discontinuities + */ + if (MappingUtils.rangeContains(fromRange, queryRange)) + { + /* + * fromRange subsumes our query range + */ + int offset = queryRange[0] - fromRange[0]; + int mappedRangeFrom = toRange[0] + offset; + int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]); + return new int[] { mappedRangeFrom, mappedRangeTo }; + } + } + } + } + return null; + } + + /** + * Formats a ranges map lookup key + * + * @param chromosome + * @param species + * @param fromRef + * @param toRef + * @return + */ + protected static String makeRangesKey(String chromosome, String species, + String fromRef, String toRef) + { + return species + EXCL + chromosome + EXCL + fromRef + EXCL + + toRef; + } +} diff --git a/src/jalview/jbgui/GAlignFrame.java b/src/jalview/jbgui/GAlignFrame.java index 86d0c85..1cf482d 100755 --- a/src/jalview/jbgui/GAlignFrame.java +++ b/src/jalview/jbgui/GAlignFrame.java @@ -147,6 +147,8 @@ public class GAlignFrame extends JInternalFrame protected JMenuItem runGroovy = new JMenuItem(); + protected JMenuItem loadVcf; + protected JCheckBoxMenuItem autoCalculate = new JCheckBoxMenuItem(); protected JCheckBoxMenuItem sortByTree = new JCheckBoxMenuItem(); @@ -1308,6 +1310,16 @@ public class GAlignFrame extends JInternalFrame associatedData_actionPerformed(e); } }); + loadVcf = new JMenuItem(MessageManager.getString("label.load_vcf_file")); + loadVcf.setToolTipText(MessageManager.getString("label.load_vcf")); + loadVcf.addActionListener(new ActionListener() + { + @Override + public void actionPerformed(ActionEvent e) + { + loadVcf_actionPerformed(); + } + }); autoCalculate.setText( MessageManager.getString("label.autocalculate_consensus")); autoCalculate.setState( @@ -1710,6 +1722,7 @@ public class GAlignFrame extends JInternalFrame fileMenu.add(exportAnnotations); fileMenu.add(loadTreeMenuItem); fileMenu.add(associatedData); + fileMenu.add(loadVcf); fileMenu.addSeparator(); fileMenu.add(closeMenuItem); @@ -1855,6 +1868,10 @@ public class GAlignFrame extends JInternalFrame // selectMenu.add(listenToViewSelections); } + protected void loadVcf_actionPerformed() + { + } + /** * Constructs the entries on the Colour menu (but does not add them to the * menu). diff --git a/src/jalview/util/MappingUtils.java b/src/jalview/util/MappingUtils.java index 3682239..d21eac3 100644 --- a/src/jalview/util/MappingUtils.java +++ b/src/jalview/util/MappingUtils.java @@ -939,4 +939,32 @@ public final class MappingUtils } return copy; } + + /** + * Answers true if range's start-end positions include those of queryRange, + * where either range might be in reverse direction, else false + * + * @param range + * a start-end range + * @param queryRange + * a candidate subrange of range (start2-end2) + * @return + */ + public static boolean rangeContains(int[] range, int[] queryRange) + { + if (range == null || queryRange == null || range.length != 2 + || queryRange.length != 2) + { + /* + * invalid arguments + */ + return false; + } + + int min = Math.min(range[0], range[1]); + int max = Math.max(range[0], range[1]); + + return (min <= queryRange[0] && max >= queryRange[0] + && min <= queryRange[1] && max >= queryRange[1]); + } } diff --git a/test/jalview/ext/htsjdk/VCFReaderTest.java b/test/jalview/ext/htsjdk/VCFReaderTest.java new file mode 100644 index 0000000..bf617ae --- /dev/null +++ b/test/jalview/ext/htsjdk/VCFReaderTest.java @@ -0,0 +1,200 @@ +package jalview.ext.htsjdk; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.List; + +import org.testng.annotations.Test; + +public class VCFReaderTest +{ + private static final String[] VCF = new String[] { + "##fileformat=VCFv4.2", + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", + "20\t3\t.\tC\tG\t.\tPASS\tDP=100", // SNP C/G + "20\t7\t.\tG\tGA\t.\tPASS\tDP=100", // insertion G/GA + "18\t2\t.\tACG\tA\t.\tPASS\tDP=100" }; // deletion ACG/A + + // gnomAD exome variant dataset + private static final String VCF_PATH = "/Volumes/gjb/smacgowan/NOBACK/resources/gnomad/gnomad.exomes.r2.0.1.sites.vcf.gz"; + + // "https://storage.cloud.google.com/gnomad-public/release/2.0.1/vcf/exomes/gnomad.exomes.r2.0.1.sites.vcf.gz"; + + /** + * A test to exercise some basic functionality of the htsjdk VCF reader, + * reading from a non-index VCF file + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testReadVcf_plain() throws IOException + { + File f = writeVcfFile(); + VCFReader reader = new VCFReader(f.getAbsolutePath()); + CloseableIterator variants = reader.iterator(); + + /* + * SNP C/G variant + */ + VariantContext vc = variants.next(); + assertTrue(vc.isSNP()); + Allele ref = vc.getReference(); + assertEquals(ref.getBaseString(), "C"); + List alleles = vc.getAlleles(); + assertEquals(alleles.size(), 2); + assertTrue(alleles.get(0).isReference()); + assertEquals(alleles.get(0).getBaseString(), "C"); + assertFalse(alleles.get(1).isReference()); + assertEquals(alleles.get(1).getBaseString(), "G"); + + /* + * Insertion G -> GA + */ + vc = variants.next(); + assertFalse(vc.isSNP()); + assertTrue(vc.isSimpleInsertion()); + ref = vc.getReference(); + assertEquals(ref.getBaseString(), "G"); + alleles = vc.getAlleles(); + assertEquals(alleles.size(), 2); + assertTrue(alleles.get(0).isReference()); + assertEquals(alleles.get(0).getBaseString(), "G"); + assertFalse(alleles.get(1).isReference()); + assertEquals(alleles.get(1).getBaseString(), "GA"); + + /* + * Deletion ACG -> A + */ + vc = variants.next(); + assertFalse(vc.isSNP()); + assertTrue(vc.isSimpleDeletion()); + ref = vc.getReference(); + assertEquals(ref.getBaseString(), "ACG"); + alleles = vc.getAlleles(); + assertEquals(alleles.size(), 2); + assertTrue(alleles.get(0).isReference()); + assertEquals(alleles.get(0).getBaseString(), "ACG"); + assertFalse(alleles.get(1).isReference()); + assertEquals(alleles.get(1).getBaseString(), "A"); + + assertFalse(variants.hasNext()); + + variants.close(); + reader.close(); + } + + /** + * Creates a temporary file to be read by the htsjdk VCF reader + * + * @return + * @throws IOException + */ + protected File writeVcfFile() throws IOException + { + File f = File.createTempFile("Test", "vcf"); + f.deleteOnExit(); + PrintWriter pw = new PrintWriter(f); + for (String vcfLine : VCF) { + pw.println(vcfLine); + } + pw.close(); + return f; + } + + /** + * A 'test' that demonstrates querying an indexed VCF file for features in a + * specified interval + * + * @throws IOException + */ + @Test + public void testQuery_indexed() throws IOException + { + /* + * if not specified, assumes index file is filename.tbi + */ + VCFReader reader = new VCFReader(VCF_PATH); + + /* + * gene NMT1 (human) is on chromosome 17 + * GCHR38 (Ensembl): 45051610-45109016 + * GCHR37 (gnoMAD): 43128978-43186384 + * CDS begins at offset 9720, first CDS variant at offset 9724 + */ + CloseableIterator features = reader.query("17", + 43128978 + 9724, 43128978 + 9734); // first 11 CDS positions + + assertEquals(printNext(features), 43138702); + assertEquals(printNext(features), 43138704); + assertEquals(printNext(features), 43138707); + assertEquals(printNext(features), 43138708); + assertEquals(printNext(features), 43138710); + assertEquals(printNext(features), 43138711); + assertFalse(features.hasNext()); + + features.close(); + reader.close(); + } + + /** + * Prints the toString value of the next variant, and returns its start + * location + * + * @param features + * @return + */ + protected int printNext(CloseableIterator features) + { + VariantContext next = features.next(); + System.out.println(next.toString()); + return next.getStart(); + } + + // "https://storage.cloud.google.com/gnomad-public/release/2.0.1/vcf/exomes/gnomad.exomes.r2.0.1.sites.vcf.gz"; + + /** + * Test the query method that wraps a non-indexed VCF file + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testQuery_plain() throws IOException + { + File f = writeVcfFile(); + VCFReader reader = new VCFReader(f.getAbsolutePath()); + + /* + * query for overlap of 5-8 - should find variant at 7 + */ + CloseableIterator variants = reader.query("20", 5, 8); + + /* + * INDEL G/GA variant + */ + VariantContext vc = variants.next(); + assertTrue(vc.isIndel()); + assertEquals(vc.getStart(), 7); + assertEquals(vc.getEnd(), 7); + Allele ref = vc.getReference(); + assertEquals(ref.getBaseString(), "G"); + List alleles = vc.getAlleles(); + assertEquals(alleles.size(), 2); + assertTrue(alleles.get(0).isReference()); + assertEquals(alleles.get(0).getBaseString(), "G"); + assertFalse(alleles.get(1).isReference()); + assertEquals(alleles.get(1).getBaseString(), "GA"); + + assertFalse(variants.hasNext()); + + variants.close(); + reader.close(); + } +} diff --git a/test/jalview/io/vcf/VCFLoaderTest.java b/test/jalview/io/vcf/VCFLoaderTest.java new file mode 100644 index 0000000..7319123 --- /dev/null +++ b/test/jalview/io/vcf/VCFLoaderTest.java @@ -0,0 +1,164 @@ +package jalview.io.vcf; + +import static org.testng.Assert.assertEquals; + +import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.GeneLoci; +import jalview.datamodel.Mapping; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.gui.AlignFrame; +import jalview.io.DataSourceType; +import jalview.io.FileLoader; +import jalview.io.gff.Gff3Helper; +import jalview.io.gff.SequenceOntologyI; +import jalview.util.MapList; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Arrays; +import java.util.List; + +import org.testng.annotations.Test; + +public class VCFLoaderTest +{ + // columns 9717- of gene P30419 from Ensembl (modified) + private static final String FASTA = ">ENSG00000136448/1-25 chromosome:GRCh38:17:45051610:45051634:1\n" + + "CAAGCTGGCGGACGAGAGTGTGACA\n" + // and a 'made up' mini-transcript with two exons + + ">ENST00000592782/1-18\n--AGCTGGCG----AGAGTGTGAC-\n"; + + private static final String[] VCF = { "##fileformat=VCFv4.2", + "##INFO=", + "##reference=GRCh38", + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", + // SNP A/T in position 2 of gene sequence (precedes transcript) + "17\t45051611\t.\tA\tT\t1666.64\tRF\tAC=15;AF=5.08130e-03", + // SNP G/C in position 4 of gene sequence, position 2 of transcript + // this is a mixed variant, the insertion G/GA is not transferred + "17\t45051613\t.\tG\tGA,C\t1666.64\tRF\tAC=15;AF=3.08130e-03" }; + + @Test(groups = "Functional") + public void testLoadVCF() throws IOException + { + AlignmentI al = buildAlignment(); + VCFLoader loader = new VCFLoader(al); + + File f = makeVcf(); + + loader.loadVCF(f.getPath(), null); + + /* + * verify variant feature(s) added to gene + */ + List geneFeatures = al.getSequenceAt(0).findFeatures( + 2, 2); + assertEquals(geneFeatures.size(), 1); + SequenceFeature sf = geneFeatures.get(0); + assertEquals(sf.getFeatureGroup(), "VCF"); + assertEquals(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT); + assertEquals(sf.getScore(), 5.08130e-03, 0.000001f); + assertEquals("A,T", sf.getValue(Gff3Helper.ALLELES)); + + /* + * verify variant feature(s) added to transcript + */ + List transcriptFeatures = al.getSequenceAt(1) + .findFeatures(4, 4); + assertEquals(transcriptFeatures.size(), 1); + sf = transcriptFeatures.get(0); + assertEquals(sf.getFeatureGroup(), "VCF"); + assertEquals(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT); + assertEquals(sf.getScore(), 3.08130e-03, 0.000001f); + assertEquals("G,C", sf.getValue(Gff3Helper.ALLELES)); + + /* + * verify variant feature(s) computed and added to protein + * first codon AGC varies to ACC giving S/T + */ + SequenceI peptide = al.getSequenceAt(1) + .getDBRefs()[0].getMap().getTo(); + List proteinFeatures = peptide.findFeatures(1, 6); + assertEquals(proteinFeatures.size(), 1); + sf = proteinFeatures.get(0); + assertEquals(sf.getFeatureGroup(), "VCF"); + assertEquals(sf.getBegin(), 2); + assertEquals(sf.getEnd(), 2); + assertEquals(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT); + assertEquals(sf.getDescription(), "p.Ser1Thr"); + } + + private File makeVcf() throws IOException + { + File f = File.createTempFile("Test", ".vcf"); + f.deleteOnExit(); + PrintWriter pw = new PrintWriter(f); + for (String vcfLine : VCF) + { + pw.println(vcfLine); + } + pw.close(); + return f; + } + + /** + * Make a simple alignment with one 'gene' and one 'transcript' + * + * @return + */ + private AlignmentI buildAlignment() + { + AlignFrame af = new FileLoader().LoadFileWaitTillLoaded(FASTA, + DataSourceType.PASTE); + + /* + * map gene sequence to chromosome (normally done when the sequence is fetched + * from Ensembl and transcripts computed) + */ + AlignmentI alignment = af.getViewport().getAlignment(); + int[][] to = new int[][] { new int[] { 45051610, 45051634 } }; + List toRanges = Arrays.asList(to); + SequenceI gene = alignment.getSequenceAt(0); + List fromRanges = Arrays.asList(new int[][] { new int[] { + gene.getStart(), gene.getEnd() } }); + ((Sequence) gene).setGeneLoci(new GeneLoci("human", "GRCh38", "17", + new MapList(fromRanges, toRanges, 1, 1))); + + /* + * map 'transcript' to chromosome via 'gene' + * transcript/1-18 is gene/3-10,15-24 + * which is chromosome 45051612-45051619,45051624-45051633 + */ + to = new int[][] { new int[] { 45051612, 45051619 }, + new int[] { 45051624, 45051633 } }; + toRanges = Arrays.asList(to); + SequenceI transcript = alignment.getSequenceAt(1); + fromRanges = Arrays.asList(new int[][] { new int[] { + transcript.getStart(), transcript.getEnd() } }); + ((Sequence) transcript).setGeneLoci(new GeneLoci("human", "GRCh38", + "17", new MapList(fromRanges, toRanges, 1, 1))); + + /* + * add a protein product as a DBRef on the transcript + */ + SequenceI peptide = new Sequence("ENSP001", "SWRECD"); + MapList mapList = new MapList(new int[] { 1, 18 }, new int[] { 1, 6 }, + 3, 1); + Mapping map = new Mapping(peptide, mapList); + DBRefEntry product = new DBRefEntry("", "", "ENSP001", map); + transcript.addDBRef(product); + + return alignment; + } + + @Test(groups = "Functional") + public void testLoadVCF_reverseStrand() throws IOException + { + // TODO a test with reverse strand mapping of + // gene and transcript to chromosome + } +} diff --git a/test/jalview/util/MappingUtilsTest.java b/test/jalview/util/MappingUtilsTest.java index d0ec3e8..87070d7 100644 --- a/test/jalview/util/MappingUtilsTest.java +++ b/test/jalview/util/MappingUtilsTest.java @@ -1149,4 +1149,93 @@ public class MappingUtilsTest assertEquals("[12, 11, 8, 4]", Arrays.toString(ranges)); } + @Test(groups = { "Functional" }) + public void testRangeContains() + { + /* + * both forward ranges + */ + assertTrue(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 1, 10 })); + assertTrue(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 2, 10 })); + assertTrue(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 1, 9 })); + assertTrue(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 4, 5 })); + assertFalse(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 0, 9 })); + assertFalse(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + -10, -9 })); + assertFalse(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 1, 11 })); + assertFalse(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 11, 12 })); + + /* + * forward range, reverse query + */ + assertTrue(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 10, 1 })); + assertTrue(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 9, 1 })); + assertTrue(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 10, 2 })); + assertTrue(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 5, 5 })); + assertFalse(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 11, 1 })); + assertFalse(MappingUtils.rangeContains(new int[] { 1, 10 }, new int[] { + 10, 0 })); + + /* + * reverse range, forward query + */ + assertTrue(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 1, 10 })); + assertTrue(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 1, 9 })); + assertTrue(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 2, 10 })); + assertTrue(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 6, 6 })); + assertFalse(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 6, 11 })); + assertFalse(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 11, 20 })); + assertFalse(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + -3, -2 })); + + /* + * both reverse + */ + assertTrue(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 10, 1 })); + assertTrue(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 9, 1 })); + assertTrue(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 10, 2 })); + assertTrue(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 3, 3 })); + assertFalse(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 11, 1 })); + assertFalse(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 10, 0 })); + assertFalse(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + 12, 11 })); + assertFalse(MappingUtils.rangeContains(new int[] { 10, 1 }, new int[] { + -5, -8 })); + + /* + * bad arguments + */ + assertFalse(MappingUtils.rangeContains(new int[] { 1, 10, 12 }, + new int[] { + 1, 10 })); + assertFalse(MappingUtils.rangeContains(new int[] { 1, 10 }, + new int[] { 1 })); + assertFalse(MappingUtils.rangeContains(new int[] { 1, 10 }, null)); + assertFalse(MappingUtils.rangeContains(null, new int[] { 1, 10 })); + } + }