From c78eeee1a8a612b081a78597cdea69959eb8aa3a Mon Sep 17 00:00:00 2001 From: gmungoc Date: Wed, 4 Oct 2017 11:31:12 +0100 Subject: [PATCH] JAL-2738 match consequence "Allele" to VCF allele; add non-SNP variants on nucleotide --- src/jalview/analysis/AlignmentUtils.java | 19 ++- src/jalview/analysis/Dna.java | 17 ++ src/jalview/io/vcf/VCFLoader.java | 275 ++++++++++++++---------------- 3 files changed, 161 insertions(+), 150 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index c88a462..9a46a91 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -2519,7 +2519,10 @@ public class AlignmentUtils /** * Builds a map whose key is position in the protein sequence, and value is a - * list of the base and all variants for each corresponding codon position + * list of the base and all variants for each corresponding codon position. + *

+ * This depends on dna variants being held as a comma-separated list as + * property "alleles" on variant features. * * @param dnaSeq * @param dnaToProtein @@ -2559,18 +2562,26 @@ public class AlignmentUtils } /* - * extract dna variants to a string array + * ignore variant if not a SNP */ String alls = (String) sf.getValue(Gff3Helper.ALLELES); if (alls == null) { continue; // non-SNP VCF variant perhaps - can't process this } + String[] alleles = alls.toUpperCase().split(","); - int i = 0; + boolean isSnp = true; for (String allele : alleles) { - alleles[i++] = allele.trim(); // lose any space characters "A, G" + if (allele.trim().length() > 1) + { + isSnp = false; + } + } + if (!isSnp) + { + continue; } int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); diff --git a/src/jalview/analysis/Dna.java b/src/jalview/analysis/Dna.java index a10b037..f3088ea 100644 --- a/src/jalview/analysis/Dna.java +++ b/src/jalview/analysis/Dna.java @@ -851,6 +851,23 @@ public class Dna } /** + * Answers the reverse complement of the input string + * + * @see #getComplement(char) + * @param s + * @return + */ + public static String reverseComplement(String s) + { + StringBuilder sb = new StringBuilder(s.length()); + for (int i = s.length() - 1; i >= 0; i--) + { + sb.append(Dna.getComplement(s.charAt(i))); + } + return sb.toString(); + } + + /** * Returns dna complement (preserving case) for aAcCgGtTuU. Ambiguity codes * are treated as on http://reverse-complement.com/. Anything else is left * unchanged. diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 9099fcd..fd0e0d6 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -44,11 +44,17 @@ public class VCFLoader * keys to fields of VEP CSQ consequence data * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ - private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... + private static final String ALLELE_KEY = "Allele"; + private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... private static final String FEATURE_KEY = "Feature"; // Ensembl stable id /* + * what comes before column headings in CSQ Description field + */ + private static final String FORMAT = "Format: "; + + /* * default VCF INFO key for VEP consequence data * NB this can be overridden running VEP with --vcf_info_field * - we don't handle this case (require CSQ identifier) @@ -74,12 +80,6 @@ public class VCFLoader private static final String COMMA = ","; /* - * (temporary) flag that determines whether Jalview adds one feature - * per VCF record, or one per allele (preferred) - */ - private static final boolean FEATURE_PER_ALLELE = true; - - /* * the feature group assigned to a VCF variant in Jalview */ private static final String FEATURE_GROUP_VCF = "VCF"; @@ -108,12 +108,12 @@ public class VCFLoader private VCFHeader header; /* - * the position (0...) of the ALLELE_NUM field in each block of + * the position (0...) of field in each block of * CSQ (consequence) data (if declared in the VCF INFO header for CSQ) * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ + private int csqAlleleFieldIndex = -1; private int csqAlleleNumberFieldIndex = -1; - private int csqFeatureFieldIndex = -1; /** @@ -179,7 +179,7 @@ public class VCFLoader .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); /* - * note offset of CSQ ALLELE_NUM field if it is declared + * get offset of CSQ ALLELE_NUM and Feature if declared */ locateCsqFields(); @@ -196,7 +196,7 @@ public class VCFLoader */ for (SequenceI seq : al.getSequences()) { - int added = loadVCF(seq, reader, isRefGrch37); + int added = loadSequenceVCF(seq, reader, isRefGrch37); if (added > 0) { seqCount++; @@ -239,9 +239,8 @@ public class VCFLoader } /** - * Records the position of fields for ALLELE_NUM and Feature defined in the - * CSQ INFO header (if there is one). CSQ fields are declared in the CSQ INFO - * Description e.g. + * Records the position of selected fields defined in the CSQ INFO header (if + * there is one). CSQ fields are declared in the CSQ INFO Description e.g. *

* Description="Consequence ...from ... VEP. Format: Allele|Consequence|... */ @@ -254,6 +253,15 @@ public class VCFLoader } String desc = csqInfo.getDescription(); + int formatPos = desc.indexOf(FORMAT); + if (formatPos == -1) + { + System.err.println("Parse error, failed to find " + FORMAT + + " in " + desc); + return; + } + desc = desc.substring(formatPos + FORMAT.length()); + if (desc != null) { String[] format = desc.split(PIPE_REGEX); @@ -264,6 +272,10 @@ public class VCFLoader { csqAlleleNumberFieldIndex = index; } + if (ALLELE_KEY.equals(field)) + { + csqAlleleFieldIndex = index; + } if (FEATURE_KEY.equals(field)) { csqFeatureFieldIndex = index; @@ -332,7 +344,7 @@ public class VCFLoader * @param isVcfRefGrch37 * @return */ - protected int loadVCF(SequenceI seq, VCFReader reader, + protected int loadSequenceVCF(SequenceI seq, VCFReader reader, boolean isVcfRefGrch37) { int count = 0; @@ -423,7 +435,7 @@ public class VCFLoader */ if (!variant.isSNP() && !variant.isMixed()) { - continue; + // continue; } int start = variant.getStart() - offset; @@ -436,7 +448,7 @@ public class VCFLoader int[] seqLocation = mapping.locateInFrom(start, end); if (seqLocation != null) { - count += addVariantFeature(seq, variant, seqLocation[0], + count += addAlleleFeatures(seq, variant, seqLocation[0], seqLocation[1], forwardStrand); } } @@ -447,86 +459,6 @@ public class VCFLoader } /** - * Inspects the VCF variant record, and adds variant features to the sequence. - * Only SNP variants are added, not INDELs. Returns the number of features - * added. - *

- * If the sequence maps to the reverse strand of the chromosome, reference and - * variant bases are recorded as their complements (C/G, A/T). - * - * @param seq - * @param variant - * @param featureStart - * @param featureEnd - * @param forwardStrand - */ - protected int addVariantFeature(SequenceI seq, VariantContext variant, - int featureStart, int featureEnd, boolean forwardStrand) - { - byte[] reference = variant.getReference().getBases(); - if (reference.length != 1) - { - /* - * sorry, we don't handle INDEL variants - */ - return 0; - } - - if (FEATURE_PER_ALLELE) - { - return addAlleleFeatures(seq, variant, featureStart, featureEnd, - forwardStrand); - } - - /* - * for now we extract allele frequency as feature score; note - * this attribute is String for a simple SNP, but List if - * multiple alleles at the locus; we extract for the simple case only - */ - float score = getAlleleFrequency(variant, 0); - - StringBuilder sb = new StringBuilder(); - sb.append(forwardStrand ? (char) reference[0] : complement(reference)); - - /* - * inspect alleles and record SNP variants (as the variant - * record could be MIXED and include INDEL and SNP alleles) - * warning: getAlleles gives no guarantee as to the order - * in which they are returned - */ - for (Allele allele : variant.getAlleles()) - { - if (!allele.isReference()) - { - byte[] alleleBase = allele.getBases(); - if (alleleBase.length == 1) - { - sb.append(COMMA).append( - forwardStrand ? (char) alleleBase[0] - : complement(alleleBase)); - } - } - } - String alleles = sb.toString(); // e.g. G,A,C - - String type = SequenceOntologyI.SEQUENCE_VARIANT; - - SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, - featureEnd, score, FEATURE_GROUP_VCF); - - sf.setValue(Gff3Helper.ALLELES, alleles); - - Map atts = variant.getAttributes(); - for (Entry att : atts.entrySet()) - { - sf.setValue(att.getKey(), att.getValue()); - } - seq.addSequenceFeature(sf); - - return 1; - } - - /** * A convenience method to get the AF value for the given alternate allele * index * @@ -579,7 +511,7 @@ public class VCFLoader } /** - * Adds one variant feature for each SNP allele in the VCF variant record, and + * Adds one variant feature for each allele in the VCF variant record, and * returns the number of features added. * * @param seq @@ -609,13 +541,14 @@ public class VCFLoader /** * Inspects one allele and attempts to add a variant feature for it to the - * sequence. Only SNP variants are added as features. We extract as much as - * possible of the additional data associated with this allele to store in the - * feature's key-value map. Answers the number of features added (0 or 1). + * sequence. We extract as much as possible of the additional data associated + * with this allele to store in the feature's key-value map. Answers the + * number of features added (0 or 1). * * @param seq * @param variant * @param altAlleleIndex + * (0, 1..) * @param featureStart * @param featureEnd * @param forwardStrand @@ -625,24 +558,26 @@ public class VCFLoader int altAlleleIndex, int featureStart, int featureEnd, boolean forwardStrand) { - byte[] reference = variant.getReference().getBases(); + String reference = variant.getReference().getBaseString(); Allele alt = variant.getAlternateAllele(altAlleleIndex); - byte[] allele = alt.getBases(); - if (allele.length != 1) + String allele = alt.getBaseString(); + if (allele.length() != 1) { /* * not a SNP variant */ - return 0; + // return 0; } /* - * build the ref,alt allele description e.g. "G,A" + * build the ref,alt allele description e.g. "G,A", using the base + * complement if the sequence is on the reverse strand */ + // TODO check how structural variants are shown on reverse strand StringBuilder sb = new StringBuilder(); - sb.append(forwardStrand ? (char) reference[0] : complement(reference)); + sb.append(forwardStrand ? reference : Dna.reverseComplement(reference)); sb.append(COMMA); - sb.append(forwardStrand ? (char) allele[0] : complement(allele)); + sb.append(forwardStrand ? allele : Dna.reverseComplement(allele)); String alleles = sb.toString(); // e.g. G,A String type = SequenceOntologyI.SEQUENCE_VARIANT; @@ -667,6 +602,7 @@ public class VCFLoader * @param seq * @param sf * @param altAlelleIndex + * (0, 1..) */ protected void addAlleleProperties(VariantContext variant, SequenceI seq, SequenceFeature sf, final int altAlelleIndex) @@ -681,9 +617,9 @@ public class VCFLoader * extract Consequence data (if present) that we are able to * associated with the allele for this variant feature */ - if (CSQ.equals(key) && csqAlleleNumberFieldIndex > -1) + if (CSQ.equals(key)) { - addConsequences(att.getValue(), seq, sf, altAlelleIndex + 1); + addConsequences(variant, seq, sf, altAlelleIndex); return; } @@ -735,9 +671,9 @@ public class VCFLoader * Inspects CSQ data blocks (consequences) and adds attributes on the sequence * feature for the current allele (and transcript if applicable) *

- * Allele matching: we require field ALLELE_NUM to match altAlleleIndex. If - * the CSQ data does not include ALLELE_NUM values then no data is added to - * the variant feature. + * Allele matching: if field ALLELE_NUM is present, it must match + * altAlleleIndex. If not present, then field Allele value must match the VCF + * Allele. *

* Transcript matching: if sequence name can be identified to at least one of * the consequences' Feature values, then select only consequences that match @@ -745,16 +681,18 @@ public class VCFLoader * take all consequences (this is the case when adding features to the gene * sequence). * - * @param value + * @param variant * @param seq * @param sf * @param altAlelleIndex - * (1=first alternative allele...) + * (0, 1..) */ - protected void addConsequences(Object value, SequenceI seq, + protected void addConsequences(VariantContext variant, SequenceI seq, SequenceFeature sf, int altAlelleIndex) { - if (!(value instanceof ArrayList)) + Object value = variant.getAttribute(CSQ); + + if (value == null || !(value instanceof ArrayList)) { return; } @@ -763,12 +701,11 @@ public class VCFLoader /* * if CSQ data includes 'Feature', and any value matches the sequence name, - * then restrict consequence data to the matching value (transcript) + * then restrict consequence data to only the matching value (transcript) * i.e. just pick out consequences for the transcript the variant feature is on */ String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase(); - boolean matchFeature = false; - String matchFeatureValue = null; + String matchFeature = null; if (csqFeatureFieldIndex > -1) { for (String consequence : consequences) @@ -780,8 +717,7 @@ public class VCFLoader if (featureIdentifier.length() > 4 && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) { - matchFeature = true; - matchFeatureValue = featureIdentifier; + matchFeature = featureIdentifier; } } } @@ -794,41 +730,88 @@ public class VCFLoader { String[] csqFields = consequence.split(PIPE_REGEX); - /* - * check consequence is for the current transcript - */ - if (matchFeature) + if (includeConsequence(csqFields, matchFeature, variant, + altAlelleIndex)) { - if (csqFields.length <= csqFeatureFieldIndex) - { - continue; - } - String featureIdentifier = csqFields[csqFeatureFieldIndex]; - if (!featureIdentifier.equals(matchFeatureValue)) + if (found) { - continue; // consequence is for a different transcript + sb.append(COMMA); } + found = true; + sb.append(consequence); } + } + + if (found) + { + sf.setValue(CSQ, sb.toString()); + } + } - if (csqFields.length > csqAlleleNumberFieldIndex) + /** + * Answers true if we want to associate this block of consequence data with + * the specified alternate allele of the VCF variant. + *

+ * If consequence data includes the ALLELE_NUM field, then this has to match + * altAlleleIndex. Otherwise the Allele field of the consequence data has to + * match the allele value. + *

+ * Optionally (if matchFeature is not null), restrict to only include + * consequences whose Feature value matches. This allows us to attach + * consequences to their respective transcripts. + * + * @param csqFields + * @param matchFeature + * @param variant + * @param altAlelleIndex + * (0, 1..) + * @return + */ + protected boolean includeConsequence(String[] csqFields, + String matchFeature, VariantContext variant, int altAlelleIndex) + { + /* + * check consequence is for the current transcript + */ + if (matchFeature != null) + { + if (csqFields.length <= csqFeatureFieldIndex) { - String alleleNum = csqFields[csqAlleleNumberFieldIndex]; - if (String.valueOf(altAlelleIndex).equals(alleleNum)) - { - if (found) - { - sb.append(COMMA); - } - found = true; - sb.append(consequence); - } + return false; + } + String featureIdentifier = csqFields[csqFeatureFieldIndex]; + if (!featureIdentifier.equals(matchFeature)) + { + return false; // consequence is for a different transcript } } - if (found) + /* + * if ALLELE_NUM is present, it must match altAlleleIndex + * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex + */ + if (csqAlleleNumberFieldIndex > -1) { - sf.setValue(CSQ, sb.toString()); + if (csqFields.length <= csqAlleleNumberFieldIndex) + { + return false; + } + String alleleNum = csqFields[csqAlleleNumberFieldIndex]; + return String.valueOf(altAlelleIndex + 1).equals(alleleNum); } + + /* + * else consequence allele must match variant allele + */ + if (csqAlleleFieldIndex > -1 && csqFields.length > csqAlleleFieldIndex) + { + String csqAllele = csqFields[csqAlleleFieldIndex]; + String vcfAllele = variant.getAlternateAllele(altAlelleIndex) + .getBaseString(); + return csqAllele.equals(vcfAllele); + } + + return false; } /** -- 1.7.10.2