X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=ac707d8f01cea046948f2809c07256b0739c0ae2;hb=456e5c765ac1e85336fd9f9b1a35453069bb2298;hp=5544bd6ceaafcee00e74d6069cfff8fe37f524fa;hpb=195aaaebc7c27996d1db214494025edfd1505d63;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 5544bd6..ac707d8 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -1,6 +1,5 @@ package jalview.io.vcf; -import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; import jalview.bin.Cache; @@ -23,9 +22,12 @@ import jalview.util.MessageManager; import java.io.File; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -55,6 +57,29 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + private static final String ENCODED_COMMA = "%2C"; + + private static final String ENCODED_PERCENT = "%25"; + + private static final String ENCODED_EQUALS = "%3D"; + + private static final String ENCODED_SEMICOLON = "%3B"; + + private static final String ENCODED_COLON = "%3A"; + + private static final String UTF_8 = "UTF-8"; + + /* + * Jalview feature attributes for VCF fixed column data + */ + private static final String VCF_POS = "POS"; + + private static final String VCF_ID = "ID"; + + private static final String VCF_QUAL = "QUAL"; + + private static final String VCF_FILTER = "FILTER"; + private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' private static final String DEFAULT_SPECIES = "homo_sapiens"; @@ -666,7 +691,8 @@ public class VCFLoader /* * dna-to-peptide product mapping */ - AlignmentUtils.computeProteinFeatures(seq, mapTo, map); + // JAL-3187 render on the fly instead + // AlignmentUtils.computeProteinFeatures(seq, mapTo, map); } else { @@ -900,7 +926,7 @@ public class VCFLoader if (att instanceof String) { - return NO_VALUE.equals(att) ? null : (String) att; + return (String) att; } else if (att instanceof ArrayList) { @@ -1009,7 +1035,20 @@ public class VCFLoader featureEnd, FEATURE_GROUP_VCF); sf.setSource(sourceId); - sf.setValue(Gff3Helper.ALLELES, alleles); + /* + * save the derived alleles as a named attribute; this will be + * needed when Jalview computes derived peptide variants + */ + addFeatureAttribute(sf, Gff3Helper.ALLELES, alleles); + + /* + * add selected VCF fixed column data as feature attributes + */ + addFeatureAttribute(sf, VCF_POS, String.valueOf(variant.getStart())); + addFeatureAttribute(sf, VCF_ID, variant.getID()); + addFeatureAttribute(sf, VCF_QUAL, + String.valueOf(variant.getPhredScaledQual())); + addFeatureAttribute(sf, VCF_FILTER, getFilter(variant)); addAlleleProperties(variant, sf, altAlleleIndex, consequence); @@ -1019,6 +1058,53 @@ public class VCFLoader } /** + * Answers the VCF FILTER value for the variant - or an approximation to it. + * This field is either PASS, or a semi-colon separated list of filters not + * passed. htsjdk saves filters as a HashSet, so the order when reassembled into + * a list may be different. + * + * @param variant + * @return + */ + String getFilter(VariantContext variant) + { + Set filters = variant.getFilters(); + if (filters.isEmpty()) + { + return NO_VALUE; + } + Iterator iterator = filters.iterator(); + String first = iterator.next(); + if (filters.size() == 1) + { + return first; + } + + StringBuilder sb = new StringBuilder(first); + while (iterator.hasNext()) + { + sb.append(";").append(iterator.next()); + } + + return sb.toString(); + } + + /** + * Adds one feature attribute unless the value is null, empty or '.' + * + * @param sf + * @param key + * @param value + */ + void addFeatureAttribute(SequenceFeature sf, String key, String value) + { + if (value != null && !value.isEmpty() && !NO_VALUE.equals(value)) + { + sf.setValue(key, value); + } + } + + /** * Determines the Sequence Ontology term to use for the variant feature type in * Jalview. The default is 'sequence_variant', but a more specific term is used * if: @@ -1212,14 +1298,6 @@ public class VCFLoader } /* - * filter out fields we don't want to capture - */ - if (!vcfFieldsOfInterest.contains(key)) - { - continue; - } - - /* * we extract values for other data which are allele-specific; * these may be per alternate allele (INFO[key].Number = 'A') * or per allele including reference (INFO[key].Number = 'R') @@ -1258,12 +1336,42 @@ public class VCFLoader String value = getAttributeValue(variant, key, index); if (value != null && isValid(variant, key, value)) { - sf.setValue(key, value); + value = decodeSpecialCharacters(value); + addFeatureAttribute(sf, key, value); } } } /** + * Decodes colon, semicolon, equals sign, percent sign, comma to their decoded + * form. The VCF specification (para 1.2) requires these to be encoded where not + * used with their special meaning in the VCF syntax. Note that general URL + * decoding should not be applied, since this would incorrectly decode (for + * example) a '+' sign. + * + * @param value + * @return + */ + protected static String decodeSpecialCharacters(String value) + { + /* + * avoid regex compilation if it is not needed! + */ + if (!value.contains(ENCODED_COLON) && !value.contains(ENCODED_SEMICOLON) + && !value.contains(ENCODED_EQUALS) + && !value.contains(ENCODED_PERCENT) + && !value.contains(ENCODED_COMMA)) + { + return value; + } + + value = value.replace(ENCODED_COLON, ":") + .replace(ENCODED_SEMICOLON, ";").replace(ENCODED_EQUALS, "=") + .replace(ENCODED_PERCENT, "%").replace(ENCODED_COMMA, ","); + return value; + } + + /** * Answers true for '.', null, or an empty value, or if the INFO type is String. * If the INFO type is Integer or Float, answers false if the value is not in * valid format. @@ -1377,6 +1485,16 @@ public class VCFLoader String id = vepFieldsOfInterest.get(i); if (id != null) { + /* + * VCF spec requires encoding of special characters e.g. '=' + * so decode them here before storing + */ + try + { + field = URLDecoder.decode(field, UTF_8); + } catch (UnsupportedEncodingException e) + { + } csqValues.put(id, field); } }