From 27a06af565d224505f2484a9c74743fb3cf69be8 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 1 Dec 2017 17:01:36 +0000 Subject: [PATCH 1/1] JAL-2835 refactored to extract specific consequence term per transcript and allele --- src/jalview/io/vcf/VCFLoader.java | 319 ++++++++++++++++++++++++------------- 1 file changed, 205 insertions(+), 114 deletions(-) diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index a85802a..20e3ccd 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -27,8 +27,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.SortedMap; -import java.util.TreeMap; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -73,6 +71,12 @@ public class VCFLoader chromosome = chr; map = m; } + + @Override + public String toString() + { + return chromosome + ":" + map.toString(); + } } /* @@ -91,10 +95,10 @@ public class VCFLoader * keys to fields of VEP CSQ consequence data * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ - private static final String ALLELE_KEY = "Allele"; - - private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... - private static final String FEATURE_KEY = "Feature"; // Ensembl stable id + private static final String CSQ_CONSEQUENCE_KEY = "Consequence"; + private static final String CSQ_ALLELE_KEY = "Allele"; + private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... + private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id /* * default VCF INFO key for VEP consequence data @@ -157,10 +161,14 @@ public class VCFLoader * CSQ (consequence) data (if declared in the VCF INFO header for CSQ) * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ + private int csqConsequenceFieldIndex = -1; private int csqAlleleFieldIndex = -1; private int csqAlleleNumberFieldIndex = -1; private int csqFeatureFieldIndex = -1; + // todo the same fields for SnpEff ANN data if wanted + // see http://snpeff.sourceforge.net/SnpEff_manual.html#input + /* * a unique identifier under which to save metadata about feature * attributes (selected INFO field data) @@ -419,15 +427,19 @@ public class VCFLoader int index = 0; for (String field : format) { - if (ALLELE_NUM_KEY.equals(field)) + if (CSQ_CONSEQUENCE_KEY.equals(field)) + { + csqConsequenceFieldIndex = index; + } + if (CSQ_ALLELE_NUM_KEY.equals(field)) { csqAlleleNumberFieldIndex = index; } - if (ALLELE_KEY.equals(field)) + if (CSQ_ALLELE_KEY.equals(field)) { csqAlleleFieldIndex = index; } - if (FEATURE_KEY.equals(field)) + if (CSQ_FEATURE_KEY.equals(field)) { csqFeatureFieldIndex = index; } @@ -858,9 +870,9 @@ public class VCFLoader /** * Inspects one allele and attempts to add a variant feature for it to the - * sequence. We extract as much as possible of the additional data associated - * with this allele to store in the feature's key-value map. Answers the - * number of features added (0 or 1). + * sequence. The additional data associated with this allele is extracted to + * store in the feature's key-value map. Answers the number of features added (0 + * or 1). * * @param seq * @param variant @@ -883,14 +895,15 @@ public class VCFLoader * build the ref,alt allele description e.g. "G,A", using the base * complement if the sequence is on the reverse strand */ - // TODO check how structural variants are shown on reverse strand + // FIXME correctly handle insertions on reverse strand JAL-2845 StringBuilder sb = new StringBuilder(); sb.append(forwardStrand ? reference : Dna.reverseComplement(reference)); sb.append(COMMA); sb.append(forwardStrand ? allele : Dna.reverseComplement(allele)); String alleles = sb.toString(); // e.g. G,A - String type = SequenceOntologyI.SEQUENCE_VARIANT; + String type = getOntologyTerm(seq, variant, altAlleleIndex); + float score = getAlleleFrequency(variant, altAlleleIndex); SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, @@ -907,6 +920,168 @@ public class VCFLoader } /** + * Determines the Sequence Ontology term to use for the variant feature type in + * Jalview. The default is 'sequence_variant', but a more specific term is used + * if: + * + * + * @param seq + * @param variant + * @param altAlleleIndex + * @return + * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060 + */ + String getOntologyTerm(SequenceI seq, VariantContext variant, + int altAlleleIndex) + { + String type = SequenceOntologyI.SEQUENCE_VARIANT; + + if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1 + { + /* + * no Consequence data so we can't refine the ontology term + */ + return type; + } + + /* + * can we associate Consequence data with this allele and feature (transcript)? + * if so, prefer the consequence term from that data + */ + String consequence = getConsequenceForAlleleAndFeature(variant, + CSQ_FIELD, + altAlleleIndex, csqAlleleFieldIndex, csqAlleleNumberFieldIndex, + seq.getName().toLowerCase(), csqFeatureFieldIndex); + if (consequence != null) + { + String[] csqFields = consequence.split(PIPE_REGEX); + if (csqFields.length > csqConsequenceFieldIndex) + { + type = csqFields[csqConsequenceFieldIndex]; + } + } + else + { + // todo the same for SnpEff consequence data matching if wanted + } + + /* + * if of the form (e.g.) missense_variant&splice_region_variant, + * just take the first ('most severe') consequence + */ + if (type != null) + { + int pos = type.indexOf('&'); + if (pos > 0) + { + type = type.substring(0, pos); + } + } + return type; + } + + /** + * Returns matched consequence data if it can be found, else null. + * + * If matched, the consequence is returned (as pipe-delimited fields). + * + * @param variant + * @param vcfInfoId + * @param altAlleleIndex + * @param alleleFieldIndex + * @param alleleNumberFieldIndex + * @param seqName + * @param featureFieldIndex + * @return + */ + private String getConsequenceForAlleleAndFeature(VariantContext variant, + String vcfInfoId, int altAlleleIndex, int alleleFieldIndex, + int alleleNumberFieldIndex, + String seqName, int featureFieldIndex) + { + if (alleleFieldIndex == -1 || featureFieldIndex == -1) + { + return null; + } + Object value = variant.getAttribute(vcfInfoId); + + if (value == null || !(value instanceof List)) + { + return null; + } + + /* + * inspect each consequence in turn (comma-separated blocks + * extracted by htsjdk) + */ + List consequences = (List) value; + + for (String consequence : consequences) + { + String[] csqFields = consequence.split(PIPE_REGEX); + if (csqFields.length > featureFieldIndex) + { + String featureIdentifier = csqFields[featureFieldIndex]; + if (featureIdentifier.length() > 4 + && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) + { + /* + * feature (transcript) matched - now check for allele match + */ + if (matchAllele(variant, altAlleleIndex, csqFields, + alleleFieldIndex, alleleNumberFieldIndex)) + { + return consequence; + } + } + } + } + return null; + } + + private boolean matchAllele(VariantContext variant, int altAlleleIndex, + String[] csqFields, int alleleFieldIndex, + int alleleNumberFieldIndex) + { + /* + * if ALLELE_NUM is present, it must match altAlleleIndex + * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex + */ + if (alleleNumberFieldIndex > -1) + { + if (csqFields.length <= alleleNumberFieldIndex) + { + return false; + } + String alleleNum = csqFields[alleleNumberFieldIndex]; + return String.valueOf(altAlleleIndex + 1).equals(alleleNum); + } + + /* + * else consequence allele must match variant allele + */ + if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex) + { + String csqAllele = csqFields[alleleFieldIndex]; + String vcfAllele = variant.getAlternateAllele(altAlleleIndex) + .getBaseString(); + return csqAllele.equals(vcfAllele); + } + return false; + } + + /** * Add any allele-specific VCF key-value data to the sequence feature * * @param variant @@ -1003,15 +1178,23 @@ public class VCFLoader * @param variant * @param seq * @param sf - * @param altAlelleIndex + * @param altAlleleIndex * (0, 1..) */ protected void addConsequences(VariantContext variant, SequenceI seq, - SequenceFeature sf, int altAlelleIndex) + SequenceFeature sf, int altAlleleIndex) { + /* + * first try to identify the matching consequence + */ + String myConsequence = getConsequenceForAlleleAndFeature(variant, + CSQ_FIELD, altAlleleIndex, csqAlleleFieldIndex, + csqAlleleNumberFieldIndex, seq.getName().toLowerCase(), + csqFeatureFieldIndex); + Object value = variant.getAttribute(CSQ_FIELD); - if (value == null || !(value instanceof ArrayList)) + if (value == null || !(value instanceof List)) { return; } @@ -1019,43 +1202,17 @@ public class VCFLoader List consequences = (List) value; /* - * if CSQ data includes 'Feature', and any value matches the sequence name, - * then restrict consequence data to only the matching value (transcript) - * i.e. just pick out consequences for the transcript the variant feature is on - */ - String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase(); - String matchFeature = null; - if (csqFeatureFieldIndex > -1) - { - for (String consequence : consequences) - { - String[] csqFields = consequence.split(PIPE_REGEX); - if (csqFields.length > csqFeatureFieldIndex) - { - String featureIdentifier = csqFields[csqFeatureFieldIndex]; - if (featureIdentifier.length() > 4 - && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) - { - matchFeature = featureIdentifier; - } - } - } - } - - /* - * inspect CSQ consequences; where possible restrict to the consequence + * inspect CSQ consequences; restrict to the consequence * associated with the current transcript (Feature) */ - SortedMap csqValues = new TreeMap<>( - String.CASE_INSENSITIVE_ORDER); + Map csqValues = new HashMap<>(); for (String consequence : consequences) { - String[] csqFields = consequence.split(PIPE_REGEX); - - if (includeConsequence(csqFields, matchFeature, variant, - altAlelleIndex)) + if (myConsequence == null || myConsequence.equals(consequence)) { + String[] csqFields = consequence.split(PIPE_REGEX); + /* * inspect individual fields of this consequence, copying non-null * values which are 'fields of interest' @@ -1083,72 +1240,6 @@ public class VCFLoader } /** - * Answers true if we want to associate this block of consequence data with - * the specified alternate allele of the VCF variant. - *

- * If consequence data includes the ALLELE_NUM field, then this has to match - * altAlleleIndex. Otherwise the Allele field of the consequence data has to - * match the allele value. - *

- * Optionally (if matchFeature is not null), restrict to only include - * consequences whose Feature value matches. This allows us to attach - * consequences to their respective transcripts. - * - * @param csqFields - * @param matchFeature - * @param variant - * @param altAlelleIndex - * (0, 1..) - * @return - */ - protected boolean includeConsequence(String[] csqFields, - String matchFeature, VariantContext variant, int altAlelleIndex) - { - /* - * check consequence is for the current transcript - */ - if (matchFeature != null) - { - if (csqFields.length <= csqFeatureFieldIndex) - { - return false; - } - String featureIdentifier = csqFields[csqFeatureFieldIndex]; - if (!featureIdentifier.equals(matchFeature)) - { - return false; // consequence is for a different transcript - } - } - - /* - * if ALLELE_NUM is present, it must match altAlleleIndex - * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex - */ - if (csqAlleleNumberFieldIndex > -1) - { - if (csqFields.length <= csqAlleleNumberFieldIndex) - { - return false; - } - String alleleNum = csqFields[csqAlleleNumberFieldIndex]; - return String.valueOf(altAlelleIndex + 1).equals(alleleNum); - } - - /* - * else consequence allele must match variant allele - */ - if (csqAlleleFieldIndex > -1 && csqFields.length > csqAlleleFieldIndex) - { - String csqAllele = csqFields[csqAlleleFieldIndex]; - String vcfAllele = variant.getAlternateAllele(altAlelleIndex) - .getBaseString(); - return csqAllele.equals(vcfAllele); - } - - return false; - } - - /** * A convenience method to complement a dna base and return the string value * of its complement * -- 1.7.10.2