From 0a680b4ff1aaad7580d3b10941233307e2190be4 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 1 Dec 2017 17:03:17 +0000 Subject: [PATCH] JAL-2835 spike updated to latest (use specific SO term for feature) --- help/html/releases.html | 13 + src/jalview/datamodel/SequenceFeature.java | 9 +- src/jalview/ext/htsjdk/HtsContigDb.java | 60 ++- src/jalview/gui/SeqCanvas.java | 57 +- src/jalview/io/gff/SequenceOntologyLite.java | 26 +- src/jalview/io/vcf/VCFLoader.java | 598 ++++++++++++++------- src/jalview/util/MapList.java | 35 +- test/jalview/ext/htsjdk/TestHtsContigDb.java | 94 +++- test/jalview/ext/so/SequenceOntologyTest.java | 25 + test/jalview/io/gff/SequenceOntologyLiteTest.java | 37 ++ test/jalview/util/MapListTest.java | 33 +- 11 files changed, 723 insertions(+), 264 deletions(-) create mode 100644 test/jalview/io/gff/SequenceOntologyLiteTest.java diff --git a/help/html/releases.html b/help/html/releases.html index 1a48340..e1402e9 100755 --- a/help/html/releases.html +++ b/help/html/releases.html @@ -68,6 +68,19 @@ li:before { + +
+ 2.10.3b1
5/12/2017
+
+ +
+ + +
+
  • Alignment doesn't appear to scroll vertically via trackpad and scrollwheel
    • + + +
      2.10.3
      17/11/2017
      diff --git a/src/jalview/datamodel/SequenceFeature.java b/src/jalview/datamodel/SequenceFeature.java index 8a6cb61..34565c6 100755 --- a/src/jalview/datamodel/SequenceFeature.java +++ b/src/jalview/datamodel/SequenceFeature.java @@ -30,6 +30,7 @@ import jalview.util.StringUtils; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; +import java.util.SortedMap; import java.util.TreeMap; import java.util.Vector; @@ -643,9 +644,13 @@ public class SequenceFeature implements FeatureLocationI { /* * expand values in a Map attribute across separate lines + * copy to a TreeMap for alphabetical ordering */ - Map values = (Map) value; - for (Entry e : values.entrySet()) + Map values = (Map) value; + SortedMap sm = new TreeMap<>( + String.CASE_INSENSITIVE_ORDER); + sm.putAll(values); + for (Entry e : sm.entrySet()) { sb.append(String.format(ROW_DATA, key, e.getKey().toString(), e .getValue().toString())); diff --git a/src/jalview/ext/htsjdk/HtsContigDb.java b/src/jalview/ext/htsjdk/HtsContigDb.java index 37ce625..729f658 100644 --- a/src/jalview/ext/htsjdk/HtsContigDb.java +++ b/src/jalview/ext/htsjdk/HtsContigDb.java @@ -20,17 +20,13 @@ */ package jalview.ext.htsjdk; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import htsjdk.samtools.reference.ReferenceSequence; -import htsjdk.samtools.reference.ReferenceSequenceFile; -import htsjdk.samtools.reference.ReferenceSequenceFileFactory; -import htsjdk.samtools.util.StringUtil; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; import java.io.File; +import java.io.IOException; import java.math.BigInteger; +import java.nio.file.Path; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; @@ -38,6 +34,15 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import htsjdk.samtools.SAMException; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.FastaSequenceIndexCreator; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileFactory; +import htsjdk.samtools.util.StringUtil; + /** * a source of sequence data accessed via the HTSJDK * @@ -46,14 +51,25 @@ import java.util.Set; */ public class HtsContigDb { - private String name; private File dbLocation; private htsjdk.samtools.reference.ReferenceSequenceFile refFile = null; - public HtsContigDb(String name, File descriptor) throws Exception + public static void createFastaSequenceIndex(Path path, boolean overwrite) + throws IOException + { + try + { + FastaSequenceIndexCreator.create(path, overwrite); + } catch (SAMException e) + { + throw new IOException(e.getMessage()); + } + } + + public HtsContigDb(String name, File descriptor) { if (descriptor.isFile()) { @@ -63,7 +79,21 @@ public class HtsContigDb initSource(); } - private void initSource() throws Exception + public void close() + { + if (refFile != null) + { + try + { + refFile.close(); + } catch (IOException e) + { + // ignore + } + } + } + + private void initSource() { if (refFile != null) { @@ -142,8 +172,8 @@ public class HtsContigDb final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory .getReferenceSequenceFile(f, truncate); ReferenceSequence refSeq; - List ret = new ArrayList(); - Set sequenceNames = new HashSet(); + List ret = new ArrayList<>(); + Set sequenceNames = new HashSet<>(); for (int numSequences = 0; (refSeq = refSeqFile .nextSequence()) != null; ++numSequences) { @@ -220,7 +250,7 @@ public class HtsContigDb // ///// end of hts bits. - SequenceI getSequenceProxy(String id) + public SequenceI getSequenceProxy(String id) { if (!isValid()) { @@ -230,4 +260,10 @@ public class HtsContigDb ReferenceSequence sseq = refFile.getSequence(id); return new Sequence(sseq.getName(), new String(sseq.getBases())); } + + public boolean isIndexed() + { + return refFile != null && refFile.isIndexed(); + } + } diff --git a/src/jalview/gui/SeqCanvas.java b/src/jalview/gui/SeqCanvas.java index 433d2ec..1e1105f 100755 --- a/src/jalview/gui/SeqCanvas.java +++ b/src/jalview/gui/SeqCanvas.java @@ -1783,31 +1783,15 @@ public class SeqCanvas extends JComponent implements ViewportListenerI { scrollX = -range; } - - // Both scrolling and resizing change viewport ranges: scrolling changes - // both start and end points, but resize only changes end values. - // Here we only want to fastpaint on a scroll, with resize using a normal - // paint, so scroll events are identified as changes to the horizontal or - // vertical start value. - if (eventName.equals(ViewportRanges.STARTRES)) - { - if (av.getWrapAlignment()) - { - fastPaintWrapped(scrollX); - } - else - { - fastPaint(scrollX, 0); - } - } - else if (eventName.equals(ViewportRanges.STARTSEQ)) - { - // scroll - fastPaint(0, (int) evt.getNewValue() - (int) evt.getOldValue()); - } - else if (eventName.equals(ViewportRanges.STARTRESANDSEQ)) - { - if (av.getWrapAlignment()) + } + // Both scrolling and resizing change viewport ranges: scrolling changes + // both start and end points, but resize only changes end values. + // Here we only want to fastpaint on a scroll, with resize using a normal + // paint, so scroll events are identified as changes to the horizontal or + // vertical start value. + if (eventName.equals(ViewportRanges.STARTRES)) + { + if (av.getWrapAlignment()) { fastPaintWrapped(scrollX); } @@ -1815,11 +1799,26 @@ public class SeqCanvas extends JComponent implements ViewportListenerI { fastPaint(scrollX, 0); } - // bizarrely, we only need to scroll on the x value here as fastpaint - // copies the full height of the image anyway. Passing in the y value - // causes nasty repaint artefacts, which only disappear on a full - // repaint. + } + else if (eventName.equals(ViewportRanges.STARTSEQ)) + { + // scroll + fastPaint(0, (int) evt.getNewValue() - (int) evt.getOldValue()); + } + else if (eventName.equals(ViewportRanges.STARTRESANDSEQ)) + { + if (av.getWrapAlignment()) + { + fastPaintWrapped(scrollX); + } + else + { + fastPaint(scrollX, 0); } + // bizarrely, we only need to scroll on the x value here as fastpaint + // copies the full height of the image anyway. Passing in the y value + // causes nasty repaint artefacts, which only disappear on a full + // repaint. } } diff --git a/src/jalview/io/gff/SequenceOntologyLite.java b/src/jalview/io/gff/SequenceOntologyLite.java index f989f7b..72e906c 100644 --- a/src/jalview/io/gff/SequenceOntologyLite.java +++ b/src/jalview/io/gff/SequenceOntologyLite.java @@ -44,7 +44,7 @@ public class SequenceOntologyLite implements SequenceOntologyI * initial selection of types of interest when processing Ensembl features * NB unlike the full SequenceOntology we don't traverse indirect * child-parent relationships here so e.g. need to list every sub-type - * of gene (direct or indirect) that is of interest + * (direct or indirect) that is of interest */ // @formatter:off private final String[][] TERMS = new String[][] { @@ -75,16 +75,26 @@ public class SequenceOntologyLite implements SequenceOntologyI // there are many more sub-types of ncRNA... /* - * sequence_variant sub-types: + * sequence_variant sub-types */ { "sequence_variant", "sequence_variant" }, + { "structural_variant", "sequence_variant" }, { "feature_variant", "sequence_variant" }, { "gene_variant", "sequence_variant" }, + { "transcript_variant", "sequence_variant" }, // NB Ensembl uses NMD_transcript_variant as if a 'transcript' // but we model it here correctly as per the SO { "NMD_transcript_variant", "sequence_variant" }, - { "transcript_variant", "sequence_variant" }, - { "structural_variant", "sequence_variant" }, + { "missense_variant", "sequence_variant" }, + { "synonymous_variant", "sequence_variant" }, + { "frameshift_variant", "sequence_variant" }, + { "5_prime_UTR_variant", "sequence_variant" }, + { "3_prime_UTR_variant", "sequence_variant" }, + { "stop_gained", "sequence_variant" }, + { "stop_lost", "sequence_variant" }, + { "inframe_deletion", "sequence_variant" }, + { "inframe_insertion", "sequence_variant" }, + { "splice_region_variant", "sequence_variant" }, /* * no sub-types of exon or CDS yet seen in Ensembl @@ -121,8 +131,8 @@ public class SequenceOntologyLite implements SequenceOntologyI public SequenceOntologyLite() { - termsFound = new ArrayList(); - termsNotFound = new ArrayList(); + termsFound = new ArrayList<>(); + termsNotFound = new ArrayList<>(); loadStaticData(); } @@ -131,13 +141,13 @@ public class SequenceOntologyLite implements SequenceOntologyI */ private void loadStaticData() { - parents = new HashMap>(); + parents = new HashMap<>(); for (String[] pair : TERMS) { List p = parents.get(pair[0]); if (p == null) { - p = new ArrayList(); + p = new ArrayList<>(); parents.put(pair[0], p); } p.add(pair[1]); diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 155e773..20e3ccd 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -27,11 +27,12 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.SortedMap; -import java.util.TreeMap; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import htsjdk.samtools.SAMException; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; @@ -49,6 +50,35 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + /** + * A class to model the mapping from sequence to VCF coordinates. Cases include + *
        + *
      • a direct 1:1 mapping where the sequence is one of the VCF contigs
      • + *
      • a mapping of sequence to chromosomal coordinates, where sequence and VCF + * use the same reference assembly
      • + *
      • a modified mapping of sequence to chromosomal coordinates, where sequence + * and VCF use different reference assembles
      • + *
      + */ + class VCFMap + { + final String chromosome; + + final MapList map; + + VCFMap(String chr, MapList m) + { + chromosome = chr; + map = m; + } + + @Override + public String toString() + { + return chromosome + ":" + map.toString(); + } + } + /* * Lookup keys, and default values, for Preference entries that describe * patterns for VCF and VEP fields to capture @@ -65,10 +95,10 @@ public class VCFLoader * keys to fields of VEP CSQ consequence data * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ - private static final String ALLELE_KEY = "Allele"; - - private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... - private static final String FEATURE_KEY = "Feature"; // Ensembl stable id + private static final String CSQ_CONSEQUENCE_KEY = "Consequence"; + private static final String CSQ_ALLELE_KEY = "Allele"; + private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... + private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id /* * default VCF INFO key for VEP consequence data @@ -122,14 +152,23 @@ public class VCFLoader private VCFHeader header; /* + * a Dictionary of contigs (if present) referenced in the VCF file + */ + private SAMSequenceDictionary dictionary; + + /* * the position (0...) of field in each block of * CSQ (consequence) data (if declared in the VCF INFO header for CSQ) * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ + private int csqConsequenceFieldIndex = -1; private int csqAlleleFieldIndex = -1; private int csqAlleleNumberFieldIndex = -1; private int csqFeatureFieldIndex = -1; + // todo the same fields for SnpEff ANN data if wanted + // see http://snpeff.sourceforge.net/SnpEff_manual.html#input + /* * a unique identifier under which to save metadata about feature * attributes (selected INFO field data) @@ -209,6 +248,14 @@ public class VCFLoader header = reader.getFileHeader(); + try + { + dictionary = header.getSequenceDictionary(); + } catch (SAMException e) + { + // ignore - thrown if any contig line lacks length info + } + sourceId = filePath; saveMetadata(sourceId); @@ -269,6 +316,8 @@ public class VCFLoader // ignore } } + header = null; + dictionary = null; } } @@ -378,15 +427,19 @@ public class VCFLoader int index = 0; for (String field : format) { - if (ALLELE_NUM_KEY.equals(field)) + if (CSQ_CONSEQUENCE_KEY.equals(field)) + { + csqConsequenceFieldIndex = index; + } + if (CSQ_ALLELE_NUM_KEY.equals(field)) { csqAlleleNumberFieldIndex = index; } - if (ALLELE_KEY.equals(field)) + if (CSQ_ALLELE_KEY.equals(field)) { csqAlleleFieldIndex = index; } - if (FEATURE_KEY.equals(field)) + if (CSQ_FEATURE_KEY.equals(field)) { csqFeatureFieldIndex = index; } @@ -497,28 +550,157 @@ public class VCFLoader protected int loadSequenceVCF(SequenceI seq, VCFReader reader, String vcfAssembly) { - int count = 0; + VCFMap vcfMap = getVcfMap(seq, vcfAssembly); + if (vcfMap == null) + { + return 0; + } + + return addVcfVariants(seq, reader, vcfMap, vcfAssembly); + } + + /** + * Answers a map from sequence coordinates to VCF chromosome ranges + * + * @param seq + * @param vcfAssembly + * @return + */ + private VCFMap getVcfMap(SequenceI seq, String vcfAssembly) + { + /* + * simplest case: sequence has id and length matching a VCF contig + */ + VCFMap vcfMap = null; + if (dictionary != null) + { + vcfMap = getContigMap(seq); + } + if (vcfMap != null) + { + return vcfMap; + } + + /* + * otherwise, map to VCF from chromosomal coordinates + * of the sequence (if known) + */ GeneLociI seqCoords = seq.getGeneLoci(); if (seqCoords == null) { - System.out.println(String.format( + Cache.log.warn(String.format( "Can't query VCF for %s as chromosome coordinates not known", seq.getName())); - return 0; + return null; } - if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId())) + String species = seqCoords.getSpeciesId(); + String chromosome = seqCoords.getChromosomeId(); + String seqRef = seqCoords.getAssemblyId(); + MapList map = seqCoords.getMap(); + + if (!vcfSpeciesMatchesSequence(vcfAssembly, species)) { - return 0; + return null; } - List seqChromosomalContigs = seqCoords.getMap().getToRanges(); - for (int[] range : seqChromosomalContigs) + if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef)) { - count += addVcfVariants(seq, reader, range, vcfAssembly); + return new VCFMap(chromosome, map); } - return count; + if (!"GRCh38".equalsIgnoreCase(seqRef) // Ensembl + || !vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD + { + return null; + } + + /* + * map chromosomal coordinates from sequence to VCF if the VCF + * data has a different reference assembly to the sequence + */ + // TODO generalise for cases other than GRCh38 -> GRCh37 ! + // - or get the user to choose in a dialog + + List toVcfRanges = new ArrayList<>(); + List fromSequenceRanges = new ArrayList<>(); + String toRef = "GRCh37"; + + for (int[] range : map.getToRanges()) + { + int[] fromRange = map.locateInFrom(range[0], range[1]); + if (fromRange == null) + { + // corrupted map?!? + continue; + } + + int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef, + toRef); + if (newRange == null) + { + Cache.log.error( + String.format("Failed to map %s:%s:%s:%d:%d to %s", species, + chromosome, seqRef, range[0], range[1], toRef)); + continue; + } + else + { + toVcfRanges.add(newRange); + fromSequenceRanges.add(fromRange); + } + } + + return new VCFMap(chromosome, + new MapList(fromSequenceRanges, toVcfRanges, 1, 1)); + } + + /** + * If the sequence id matches a contig declared in the VCF file, and the + * sequence length matches the contig length, then returns a 1:1 map of the + * sequence to the contig, else returns null + * + * @param seq + * @return + */ + private VCFMap getContigMap(SequenceI seq) + { + String id = seq.getName(); + SAMSequenceRecord contig = dictionary.getSequence(id); + if (contig != null) + { + int len = seq.getLength(); + if (len == contig.getSequenceLength()) + { + MapList map = new MapList(new int[] { 1, len }, + new int[] + { 1, len }, 1, 1); + return new VCFMap(id, map); + } + } + return null; + } + + /** + * Answers true if we determine that the VCF data uses the same reference + * assembly as the sequence, else false + * + * @param vcfAssembly + * @param seqRef + * @return + */ + private boolean vcfAssemblyMatchesSequence(String vcfAssembly, + String seqRef) + { + // TODO improve on this stub, which handles gnomAD and + // hopes for the best for other cases + + if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl + && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD + { + return false; + } + return true; } /** @@ -556,93 +738,52 @@ public class VCFLoader } /** - * Queries the VCF reader for any variants that overlap the given chromosome - * region of the sequence, and adds as variant features. Returns the number of + * Queries the VCF reader for any variants that overlap the mapped chromosome + * ranges of the sequence, and adds as variant features. Returns the number of * overlapping variants found. * * @param seq * @param reader - * @param range - * start-end range of a sequence region in its chromosomal - * coordinates + * @param map + * mapping from sequence to VCF coordinates * @param vcfAssembly * the '##reference' identifier for the VCF reference assembly * @return */ protected int addVcfVariants(SequenceI seq, VCFReader reader, - int[] range, String vcfAssembly) + VCFMap map, String vcfAssembly) { - GeneLociI seqCoords = seq.getGeneLoci(); - - String chromosome = seqCoords.getChromosomeId(); - String seqRef = seqCoords.getAssemblyId(); - String species = seqCoords.getSpeciesId(); - - /* - * map chromosomal coordinates from sequence to VCF if the VCF - * data has a different reference assembly to the sequence - */ - // TODO generalise for non-human species - // - or get the user to choose in a dialog - - int offset = 0; - if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl - && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD - { - String toRef = "GRCh37"; - int[] newRange = mapReferenceRange(range, chromosome, "human", - seqRef, toRef); - if (newRange == null) - { - System.err.println(String.format( - "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome, - seqRef, range[0], range[1], toRef)); - return 0; - } - offset = newRange[0] - range[0]; - range = newRange; - } - - boolean forwardStrand = range[0] <= range[1]; + boolean forwardStrand = map.map.isToForwardStrand(); /* - * query the VCF for overlaps - * (convert a reverse strand range to forwards) + * query the VCF for overlaps of each contiguous chromosomal region */ int count = 0; - MapList mapping = seqCoords.getMap(); - int fromLocus = Math.min(range[0], range[1]); - int toLocus = Math.max(range[0], range[1]); - CloseableIterator variants = reader.query(chromosome, - fromLocus, toLocus); - while (variants.hasNext()) + for (int[] range : map.map.getToRanges()) { - /* - * get variant location in sequence chromosomal coordinates - */ - VariantContext variant = variants.next(); + int vcfStart = Math.min(range[0], range[1]); + int vcfEnd = Math.max(range[0], range[1]); + CloseableIterator variants = reader + .query(map.chromosome, vcfStart, vcfEnd); + while (variants.hasNext()) + { + VariantContext variant = variants.next(); - int start = variant.getStart() - offset; - int end = variant.getEnd() - offset; + int[] featureRange = map.map.locateInFrom(variant.getStart(), + variant.getEnd()); - /* - * convert chromosomal location to sequence coordinates - * - may be reverse strand (convert to forward for sequence feature) - * - null if a partially overlapping feature - */ - int[] seqLocation = mapping.locateInFrom(start, end); - if (seqLocation != null) - { - int featureStart = Math.min(seqLocation[0], seqLocation[1]); - int featureEnd = Math.max(seqLocation[0], seqLocation[1]); - count += addAlleleFeatures(seq, variant, featureStart, featureEnd, - forwardStrand); + if (featureRange != null) + { + int featureStart = Math.min(featureRange[0], featureRange[1]); + int featureEnd = Math.max(featureRange[0], featureRange[1]); + count += addAlleleFeatures(seq, variant, featureStart, featureEnd, + forwardStrand); + } } + variants.close(); } - variants.close(); - return count; } @@ -729,9 +870,9 @@ public class VCFLoader /** * Inspects one allele and attempts to add a variant feature for it to the - * sequence. We extract as much as possible of the additional data associated - * with this allele to store in the feature's key-value map. Answers the - * number of features added (0 or 1). + * sequence. The additional data associated with this allele is extracted to + * store in the feature's key-value map. Answers the number of features added (0 + * or 1). * * @param seq * @param variant @@ -754,14 +895,15 @@ public class VCFLoader * build the ref,alt allele description e.g. "G,A", using the base * complement if the sequence is on the reverse strand */ - // TODO check how structural variants are shown on reverse strand + // FIXME correctly handle insertions on reverse strand JAL-2845 StringBuilder sb = new StringBuilder(); sb.append(forwardStrand ? reference : Dna.reverseComplement(reference)); sb.append(COMMA); sb.append(forwardStrand ? allele : Dna.reverseComplement(allele)); String alleles = sb.toString(); // e.g. G,A - String type = SequenceOntologyI.SEQUENCE_VARIANT; + String type = getOntologyTerm(seq, variant, altAlleleIndex); + float score = getAlleleFrequency(variant, altAlleleIndex); SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, @@ -778,6 +920,168 @@ public class VCFLoader } /** + * Determines the Sequence Ontology term to use for the variant feature type in + * Jalview. The default is 'sequence_variant', but a more specific term is used + * if: + *
        + *
      • VEP (or SnpEff) Consequence annotation is included in the VCF
      • + *
      • sequence id can be matched to VEP Feature (or SnpEff Feature_ID)
      • + *
      + * + * @param seq + * @param variant + * @param altAlleleIndex + * @return + * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060 + */ + String getOntologyTerm(SequenceI seq, VariantContext variant, + int altAlleleIndex) + { + String type = SequenceOntologyI.SEQUENCE_VARIANT; + + if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1 + { + /* + * no Consequence data so we can't refine the ontology term + */ + return type; + } + + /* + * can we associate Consequence data with this allele and feature (transcript)? + * if so, prefer the consequence term from that data + */ + String consequence = getConsequenceForAlleleAndFeature(variant, + CSQ_FIELD, + altAlleleIndex, csqAlleleFieldIndex, csqAlleleNumberFieldIndex, + seq.getName().toLowerCase(), csqFeatureFieldIndex); + if (consequence != null) + { + String[] csqFields = consequence.split(PIPE_REGEX); + if (csqFields.length > csqConsequenceFieldIndex) + { + type = csqFields[csqConsequenceFieldIndex]; + } + } + else + { + // todo the same for SnpEff consequence data matching if wanted + } + + /* + * if of the form (e.g.) missense_variant&splice_region_variant, + * just take the first ('most severe') consequence + */ + if (type != null) + { + int pos = type.indexOf('&'); + if (pos > 0) + { + type = type.substring(0, pos); + } + } + return type; + } + + /** + * Returns matched consequence data if it can be found, else null. + *
        + *
      • inspects the VCF data for key 'vcfInfoId'
      • + *
      • splits this on comma (to distinct consequences)
      • + *
      • returns the first consequence (if any) where
      • + *
          + *
        • the allele matches the altAlleleIndex'th allele of variant
        • + *
        • the feature matches the sequence name (e.g. transcript id)
        • + *
        + *
      + * If matched, the consequence is returned (as pipe-delimited fields). + * + * @param variant + * @param vcfInfoId + * @param altAlleleIndex + * @param alleleFieldIndex + * @param alleleNumberFieldIndex + * @param seqName + * @param featureFieldIndex + * @return + */ + private String getConsequenceForAlleleAndFeature(VariantContext variant, + String vcfInfoId, int altAlleleIndex, int alleleFieldIndex, + int alleleNumberFieldIndex, + String seqName, int featureFieldIndex) + { + if (alleleFieldIndex == -1 || featureFieldIndex == -1) + { + return null; + } + Object value = variant.getAttribute(vcfInfoId); + + if (value == null || !(value instanceof List)) + { + return null; + } + + /* + * inspect each consequence in turn (comma-separated blocks + * extracted by htsjdk) + */ + List consequences = (List) value; + + for (String consequence : consequences) + { + String[] csqFields = consequence.split(PIPE_REGEX); + if (csqFields.length > featureFieldIndex) + { + String featureIdentifier = csqFields[featureFieldIndex]; + if (featureIdentifier.length() > 4 + && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) + { + /* + * feature (transcript) matched - now check for allele match + */ + if (matchAllele(variant, altAlleleIndex, csqFields, + alleleFieldIndex, alleleNumberFieldIndex)) + { + return consequence; + } + } + } + } + return null; + } + + private boolean matchAllele(VariantContext variant, int altAlleleIndex, + String[] csqFields, int alleleFieldIndex, + int alleleNumberFieldIndex) + { + /* + * if ALLELE_NUM is present, it must match altAlleleIndex + * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex + */ + if (alleleNumberFieldIndex > -1) + { + if (csqFields.length <= alleleNumberFieldIndex) + { + return false; + } + String alleleNum = csqFields[alleleNumberFieldIndex]; + return String.valueOf(altAlleleIndex + 1).equals(alleleNum); + } + + /* + * else consequence allele must match variant allele + */ + if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex) + { + String csqAllele = csqFields[alleleFieldIndex]; + String vcfAllele = variant.getAlternateAllele(altAlleleIndex) + .getBaseString(); + return csqAllele.equals(vcfAllele); + } + return false; + } + + /** * Add any allele-specific VCF key-value data to the sequence feature * * @param variant @@ -874,15 +1178,23 @@ public class VCFLoader * @param variant * @param seq * @param sf - * @param altAlelleIndex + * @param altAlleleIndex * (0, 1..) */ protected void addConsequences(VariantContext variant, SequenceI seq, - SequenceFeature sf, int altAlelleIndex) + SequenceFeature sf, int altAlleleIndex) { + /* + * first try to identify the matching consequence + */ + String myConsequence = getConsequenceForAlleleAndFeature(variant, + CSQ_FIELD, altAlleleIndex, csqAlleleFieldIndex, + csqAlleleNumberFieldIndex, seq.getName().toLowerCase(), + csqFeatureFieldIndex); + Object value = variant.getAttribute(CSQ_FIELD); - if (value == null || !(value instanceof ArrayList)) + if (value == null || !(value instanceof List)) { return; } @@ -890,43 +1202,17 @@ public class VCFLoader List consequences = (List) value; /* - * if CSQ data includes 'Feature', and any value matches the sequence name, - * then restrict consequence data to only the matching value (transcript) - * i.e. just pick out consequences for the transcript the variant feature is on - */ - String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase(); - String matchFeature = null; - if (csqFeatureFieldIndex > -1) - { - for (String consequence : consequences) - { - String[] csqFields = consequence.split(PIPE_REGEX); - if (csqFields.length > csqFeatureFieldIndex) - { - String featureIdentifier = csqFields[csqFeatureFieldIndex]; - if (featureIdentifier.length() > 4 - && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) - { - matchFeature = featureIdentifier; - } - } - } - } - - /* - * inspect CSQ consequences; where possible restrict to the consequence + * inspect CSQ consequences; restrict to the consequence * associated with the current transcript (Feature) */ - SortedMap csqValues = new TreeMap<>( - String.CASE_INSENSITIVE_ORDER); + Map csqValues = new HashMap<>(); for (String consequence : consequences) { - String[] csqFields = consequence.split(PIPE_REGEX); - - if (includeConsequence(csqFields, matchFeature, variant, - altAlelleIndex)) + if (myConsequence == null || myConsequence.equals(consequence)) { + String[] csqFields = consequence.split(PIPE_REGEX); + /* * inspect individual fields of this consequence, copying non-null * values which are 'fields of interest' @@ -954,72 +1240,6 @@ public class VCFLoader } /** - * Answers true if we want to associate this block of consequence data with - * the specified alternate allele of the VCF variant. - *

      - * If consequence data includes the ALLELE_NUM field, then this has to match - * altAlleleIndex. Otherwise the Allele field of the consequence data has to - * match the allele value. - *

      - * Optionally (if matchFeature is not null), restrict to only include - * consequences whose Feature value matches. This allows us to attach - * consequences to their respective transcripts. - * - * @param csqFields - * @param matchFeature - * @param variant - * @param altAlelleIndex - * (0, 1..) - * @return - */ - protected boolean includeConsequence(String[] csqFields, - String matchFeature, VariantContext variant, int altAlelleIndex) - { - /* - * check consequence is for the current transcript - */ - if (matchFeature != null) - { - if (csqFields.length <= csqFeatureFieldIndex) - { - return false; - } - String featureIdentifier = csqFields[csqFeatureFieldIndex]; - if (!featureIdentifier.equals(matchFeature)) - { - return false; // consequence is for a different transcript - } - } - - /* - * if ALLELE_NUM is present, it must match altAlleleIndex - * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex - */ - if (csqAlleleNumberFieldIndex > -1) - { - if (csqFields.length <= csqAlleleNumberFieldIndex) - { - return false; - } - String alleleNum = csqFields[csqAlleleNumberFieldIndex]; - return String.valueOf(altAlelleIndex + 1).equals(alleleNum); - } - - /* - * else consequence allele must match variant allele - */ - if (csqAlleleFieldIndex > -1 && csqFields.length > csqAlleleFieldIndex) - { - String csqAllele = csqFields[csqAlleleFieldIndex]; - String vcfAllele = variant.getAlternateAllele(altAlelleIndex) - .getBaseString(); - return csqAllele.equals(vcfAllele); - } - - return false; - } - - /** * A convenience method to complement a dna base and return the string value * of its complement * diff --git a/src/jalview/util/MapList.java b/src/jalview/util/MapList.java index 3ce0bb3..c944345 100644 --- a/src/jalview/util/MapList.java +++ b/src/jalview/util/MapList.java @@ -77,8 +77,8 @@ public class MapList */ public MapList() { - fromShifts = new ArrayList(); - toShifts = new ArrayList(); + fromShifts = new ArrayList<>(); + toShifts = new ArrayList<>(); } /** @@ -347,7 +347,7 @@ public class MapList } boolean changed = false; - List merged = new ArrayList(); + List merged = new ArrayList<>(); int[] lastRange = ranges.get(0); int lastDirection = lastRange[1] >= lastRange[0] ? 1 : -1; lastRange = new int[] { lastRange[0], lastRange[1] }; @@ -803,7 +803,7 @@ public class MapList { return null; } - List ranges = new ArrayList(); + List ranges = new ArrayList<>(); if (fs <= fe) { intv = fs; @@ -1094,8 +1094,33 @@ public class MapList */ public boolean isFromForwardStrand() { + return isForwardStrand(getFromRanges()); + } + + /** + * Returns true if mapping is to forward strand, false if to reverse strand. + * Result is just based on the first 'to' range that is not a single position. + * Default is true unless proven to be false. Behaviour is not well defined if + * the mapping has a mixture of forward and reverse ranges. + * + * @return + */ + public boolean isToForwardStrand() + { + return isForwardStrand(getToRanges()); + } + + /** + * A helper method that returns true unless at least one range has start > end. + * Behaviour is undefined for a mixture of forward and reverse ranges. + * + * @param ranges + * @return + */ + private boolean isForwardStrand(List ranges) + { boolean forwardStrand = true; - for (int[] range : getFromRanges()) + for (int[] range : ranges) { if (range[1] > range[0]) { diff --git a/test/jalview/ext/htsjdk/TestHtsContigDb.java b/test/jalview/ext/htsjdk/TestHtsContigDb.java index 350b599..29303d0 100644 --- a/test/jalview/ext/htsjdk/TestHtsContigDb.java +++ b/test/jalview/ext/htsjdk/TestHtsContigDb.java @@ -20,13 +20,19 @@ */ package jalview.ext.htsjdk; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + import jalview.datamodel.SequenceI; -import jalview.gui.JvOptionPane; import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; /** @@ -35,25 +41,83 @@ import org.testng.annotations.Test; */ public class TestHtsContigDb { + @Test(groups = "Functional") + public final void testGetSequenceProxy() throws Exception + { + String pathname = "test/jalview/ext/htsjdk/pgmb.fasta"; + HtsContigDb db = new HtsContigDb("ADB", new File(pathname)); + + assertTrue(db.isValid()); + assertTrue(db.isIndexed()); // htsjdk opens the .fai file + + SequenceI sq = db.getSequenceProxy("Deminut"); + assertNotNull(sq); + assertEquals(sq.getLength(), 606); - @BeforeClass(alwaysRun = true) - public void setUpJvOptionPane() + /* + * read a sequence earlier in the file + */ + sq = db.getSequenceProxy("PPL_06716"); + assertNotNull(sq); + assertEquals(sq.getLength(), 602); + + // dict = db.getDictionary(f, truncate)) + } + + /** + * Trying to open a .fai file directly results in IllegalArgumentException - + * have to provide the unindexed file name instead + */ + @Test( + groups = "Functional", + expectedExceptions = java.lang.IllegalArgumentException.class) + public final void testGetSequenceProxy_indexed() { - JvOptionPane.setInteractiveMode(false); - JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION); + String pathname = "test/jalview/ext/htsjdk/pgmb.fasta.fai"; + new HtsContigDb("ADB", new File(pathname)); + fail("Expected exception opening .fai file"); } @Test(groups = "Functional") - public final void testHTSReferenceSequence() throws Exception + public void testCreateFastaSequenceIndex() throws IOException { - HtsContigDb remmadb = new HtsContigDb("REEMADB", new File( - "test/jalview/ext/htsjdk/pgmb.fasta")); + File fasta = new File("test/jalview/ext/htsjdk/pgmb.fasta"); + + /* + * create .fai with no overwrite fails if it exists + */ + try { + HtsContigDb.createFastaSequenceIndex(fasta.toPath(), false); + fail("Expected exception"); + } catch (IOException e) + { + // expected + } - Assert.assertTrue(remmadb.isValid()); + /* + * create a copy of the .fasta (as a temp file) + */ + File copyFasta = File.createTempFile("copyFasta", ".fasta"); + copyFasta.deleteOnExit(); + assertTrue(copyFasta.exists()); + Files.copy(fasta.toPath(), copyFasta.toPath(), + StandardCopyOption.REPLACE_EXISTING); - SequenceI sq = remmadb.getSequenceProxy("Deminut"); - Assert.assertNotNull(sq); - Assert.assertNotEquals(0, sq.getLength()); - } + /* + * open the Fasta file - not indexed, as no .fai file yet exists + */ + HtsContigDb db = new HtsContigDb("ADB", copyFasta); + assertTrue(db.isValid()); + assertFalse(db.isIndexed()); + db.close(); + /* + * create the .fai index, re-open the .fasta file - now indexed + */ + HtsContigDb.createFastaSequenceIndex(copyFasta.toPath(), true); + db = new HtsContigDb("ADB", copyFasta); + assertTrue(db.isValid()); + assertTrue(db.isIndexed()); + db.close(); + } } diff --git a/test/jalview/ext/so/SequenceOntologyTest.java b/test/jalview/ext/so/SequenceOntologyTest.java index b76a295..31e1887 100644 --- a/test/jalview/ext/so/SequenceOntologyTest.java +++ b/test/jalview/ext/so/SequenceOntologyTest.java @@ -107,4 +107,29 @@ public class SequenceOntologyTest assertFalse(so.isA("CDS_region", "CDS"));// part_of assertFalse(so.isA("polypeptide", "CDS")); // derives_from } + + @Test(groups = "Functional") + public void testIsSequenceVariant() + { + assertFalse(so.isA("CDS", "sequence_variant")); + assertTrue(so.isA("sequence_variant", "sequence_variant")); + + /* + * these should all be sub-types of sequence_variant + */ + assertTrue(so.isA("structural_variant", "sequence_variant")); + assertTrue(so.isA("feature_variant", "sequence_variant")); + assertTrue(so.isA("gene_variant", "sequence_variant")); + assertTrue(so.isA("transcript_variant", "sequence_variant")); + assertTrue(so.isA("NMD_transcript_variant", "sequence_variant")); + assertTrue(so.isA("missense_variant", "sequence_variant")); + assertTrue(so.isA("synonymous_variant", "sequence_variant")); + assertTrue(so.isA("frameshift_variant", "sequence_variant")); + assertTrue(so.isA("5_prime_UTR_variant", "sequence_variant")); + assertTrue(so.isA("3_prime_UTR_variant", "sequence_variant")); + assertTrue(so.isA("stop_gained", "sequence_variant")); + assertTrue(so.isA("stop_lost", "sequence_variant")); + assertTrue(so.isA("inframe_deletion", "sequence_variant")); + assertTrue(so.isA("inframe_insertion", "sequence_variant")); + } } diff --git a/test/jalview/io/gff/SequenceOntologyLiteTest.java b/test/jalview/io/gff/SequenceOntologyLiteTest.java new file mode 100644 index 0000000..0766666 --- /dev/null +++ b/test/jalview/io/gff/SequenceOntologyLiteTest.java @@ -0,0 +1,37 @@ +package jalview.io.gff; + +import static org.testng.AssertJUnit.assertFalse; +import static org.testng.AssertJUnit.assertTrue; + +import org.testng.annotations.Test; + +public class SequenceOntologyLiteTest +{ + @Test(groups = "Functional") + public void testIsA_sequenceVariant() + { + SequenceOntologyI so = new SequenceOntologyLite(); + + assertFalse(so.isA("CDS", "sequence_variant")); + assertTrue(so.isA("sequence_variant", "sequence_variant")); + + /* + * these should all be sub-types of sequence_variant + */ + assertTrue(so.isA("structural_variant", "sequence_variant")); + assertTrue(so.isA("feature_variant", "sequence_variant")); + assertTrue(so.isA("gene_variant", "sequence_variant")); + assertTrue(so.isA("transcript_variant", "sequence_variant")); + assertTrue(so.isA("NMD_transcript_variant", "sequence_variant")); + assertTrue(so.isA("missense_variant", "sequence_variant")); + assertTrue(so.isA("synonymous_variant", "sequence_variant")); + assertTrue(so.isA("frameshift_variant", "sequence_variant")); + assertTrue(so.isA("5_prime_UTR_variant", "sequence_variant")); + assertTrue(so.isA("3_prime_UTR_variant", "sequence_variant")); + assertTrue(so.isA("stop_gained", "sequence_variant")); + assertTrue(so.isA("stop_lost", "sequence_variant")); + assertTrue(so.isA("inframe_deletion", "sequence_variant")); + assertTrue(so.isA("inframe_insertion", "sequence_variant")); + assertTrue(so.isA("splice_region_variant", "sequence_variant")); + } +} diff --git a/test/jalview/util/MapListTest.java b/test/jalview/util/MapListTest.java index 3fc6fe0..029b681 100644 --- a/test/jalview/util/MapListTest.java +++ b/test/jalview/util/MapListTest.java @@ -426,7 +426,7 @@ public class MapListTest @Test(groups = { "Functional" }) public void testGetRanges() { - List ranges = new ArrayList(); + List ranges = new ArrayList<>(); ranges.add(new int[] { 2, 3 }); ranges.add(new int[] { 5, 6 }); assertEquals("[2, 3, 5, 6]", Arrays.toString(MapList.getRanges(ranges))); @@ -603,7 +603,7 @@ public class MapListTest public void testAddRange() { int[] range = { 1, 5 }; - List ranges = new ArrayList(); + List ranges = new ArrayList<>(); // add to empty list: MapList.addRange(range, ranges); @@ -702,7 +702,7 @@ public class MapListTest public void testCoalesceRanges() { assertNull(MapList.coalesceRanges(null)); - List ranges = new ArrayList(); + List ranges = new ArrayList<>(); assertSame(ranges, MapList.coalesceRanges(ranges)); ranges.add(new int[] { 1, 3 }); assertSame(ranges, MapList.coalesceRanges(ranges)); @@ -763,7 +763,7 @@ public class MapListTest @Test(groups = { "Functional" }) public void testCoalesceRanges_withOverlap() { - List ranges = new ArrayList(); + List ranges = new ArrayList<>(); ranges.add(new int[] { 1, 3 }); ranges.add(new int[] { 2, 5 }); @@ -940,4 +940,29 @@ public class MapListTest compound = ml1.traverse(ml2); assertNull(compound); } + + /** + * Test that method that inspects for the (first) forward or reverse 'to' range. + * Single position ranges are ignored. + */ + @Test(groups = { "Functional" }) + public void testIsToForwardsStrand() + { + // [3-9] declares forward strand + MapList ml = new MapList(new int[] { 20, 11 }, + new int[] + { 2, 2, 3, 9, 12, 11 }, 1, 1); + assertTrue(ml.isToForwardStrand()); + + // [11-5] declares reverse strand ([13-14] is ignored) + ml = new MapList(new int[] { 20, 11 }, + new int[] + { 2, 2, 11, 5, 13, 14 }, 1, 1); + assertFalse(ml.isToForwardStrand()); + + // all single position ranges - defaults to forward strand + ml = new MapList(new int[] { 3, 1 }, new int[] { 2, 2, 4, 4, 6, 6 }, 1, + 1); + assertTrue(ml.isToForwardStrand()); + } } -- 1.7.10.2