From 91d83d4836ba3f8c6c395d46d607faf693946e66 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 26 Jul 2019 17:58:06 +0200 Subject: [PATCH] JAL-3376 capture VCF POS, ID, QUAL, FILTER as feature attributes --- src/jalview/io/vcf/VCFLoader.java | 78 ++++++++++++++++++++++++++++++-- test/jalview/io/vcf/VCFLoaderTest.java | 10 +++- 2 files changed, 83 insertions(+), 5 deletions(-) diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 5544bd6..decff23 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -26,6 +26,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -55,6 +56,17 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + /* + * Jalview feature attributes for VCF fixed column data + */ + private static final String VCF_POS = "POS"; + + private static final String VCF_ID = "ID"; + + private static final String VCF_QUAL = "QUAL"; + + private static final String VCF_FILTER = "FILTER"; + private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' private static final String DEFAULT_SPECIES = "homo_sapiens"; @@ -900,7 +912,7 @@ public class VCFLoader if (att instanceof String) { - return NO_VALUE.equals(att) ? null : (String) att; + return (String) att; } else if (att instanceof ArrayList) { @@ -1009,7 +1021,20 @@ public class VCFLoader featureEnd, FEATURE_GROUP_VCF); sf.setSource(sourceId); - sf.setValue(Gff3Helper.ALLELES, alleles); + /* + * save the derived alleles as a named attribute; this will be + * needed when Jalview computes derived peptide variants + */ + addFeatureAttribute(sf, Gff3Helper.ALLELES, alleles); + + /* + * add selected VCF fixed column data as feature attributes + */ + addFeatureAttribute(sf, VCF_POS, String.valueOf(variant.getStart())); + addFeatureAttribute(sf, VCF_ID, variant.getID()); + addFeatureAttribute(sf, VCF_QUAL, + String.valueOf(variant.getPhredScaledQual())); + addFeatureAttribute(sf, VCF_FILTER, getFilter(variant)); addAlleleProperties(variant, sf, altAlleleIndex, consequence); @@ -1019,6 +1044,53 @@ public class VCFLoader } /** + * Answers the VCF FILTER value for the variant - or an approximation to it. + * This field is either PASS, or a semi-colon separated list of filters not + * passed. htsjdk saves filters as a HashSet, so the order when reassembled into + * a list may be different. + * + * @param variant + * @return + */ + String getFilter(VariantContext variant) + { + Set filters = variant.getFilters(); + if (filters.isEmpty()) + { + return NO_VALUE; + } + Iterator iterator = filters.iterator(); + String first = iterator.next(); + if (filters.size() == 1) + { + return first; + } + + StringBuilder sb = new StringBuilder(first); + while (iterator.hasNext()) + { + sb.append(";").append(iterator.next()); + } + + return sb.toString(); + } + + /** + * Adds one feature attribute unless the value is null, empty or '.' + * + * @param sf + * @param key + * @param value + */ + void addFeatureAttribute(SequenceFeature sf, String key, String value) + { + if (value != null && !value.isEmpty() && !NO_VALUE.equals(value)) + { + sf.setValue(key, value); + } + } + + /** * Determines the Sequence Ontology term to use for the variant feature type in * Jalview. The default is 'sequence_variant', but a more specific term is used * if: @@ -1258,7 +1330,7 @@ public class VCFLoader String value = getAttributeValue(variant, key, index); if (value != null && isValid(variant, key, value)) { - sf.setValue(key, value); + addFeatureAttribute(sf, key, value); } } } diff --git a/test/jalview/io/vcf/VCFLoaderTest.java b/test/jalview/io/vcf/VCFLoaderTest.java index 808fe86..1e88665 100644 --- a/test/jalview/io/vcf/VCFLoaderTest.java +++ b/test/jalview/io/vcf/VCFLoaderTest.java @@ -71,10 +71,10 @@ public class VCFLoaderTest // A/T,C variants in position 2 of gene sequence (precedes transcript) // should create 2 variant features with respective AF values // malformed values for AC_Female and AF_AFR should be ignored - "17\t45051611\t.\tA\tT,C\t1666.64\tRF\tAC=15;AF=5.0e-03,4.0e-03;AC_Female=12,3d;AF_AFR=low,2.3e-4", + "17\t45051611\trs384765\tA\tT,C\t1666.64\tRF;XYZ\tAC=15;AF=5.0e-03,4.0e-03;AC_Female=12,3d;AF_AFR=low,2.3e-4", // SNP G/C in position 4 of gene sequence, position 2 of transcript // insertion G/GA is transferred to nucleotide but not to peptide - "17\t45051613\t.\tG\tGA,C\t1666.65\tRF\tAC=15;AF=3.0e-03,2.0e-03", + "17\t45051613\t.\tG\tGA,C\t1666.65\t.\tAC=15;AF=3.0e-03,2.0e-03", // '.' in INFO field should be ignored "17\t45051615\t.\tG\tC\t1666.66\tRF\tAC=16;AF=." }; @@ -130,6 +130,10 @@ public class VCFLoaderTest assertEquals(sf.getValue("AF_AFR"), "2.3e-4"); assertEquals(sf.getValue(Gff3Helper.ALLELES), "A,C"); assertEquals(sf.getType(), SEQUENCE_VARIANT); + assertEquals(sf.getValue("POS"), "45051611"); + assertEquals(sf.getValue("ID"), "rs384765"); + assertEquals(sf.getValue("QUAL"), "1666.64"); + assertEquals(sf.getValue("FILTER"), "RF;XYZ"); // malformed integer for AC_Female is ignored (JAL-3375) assertNull(sf.getValue("AC_Female")); @@ -165,6 +169,8 @@ public class VCFLoaderTest assertEquals(Float.parseFloat((String) sf.getValue("AF")), 3.0e-03, DELTA); assertEquals(sf.getValue(Gff3Helper.ALLELES), "G,GA"); + assertNull(sf.getValue("ID")); // '.' is ignored + assertNull(sf.getValue("FILTER")); // '.' is ignored sf = geneFeatures.get(4); assertEquals(sf.getFeatureGroup(), "VCF"); -- 1.7.10.2