X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=ea5b8e06f723f3ac512a903c780508e2158184f5;hb=57738a1f3c19b1c3a00bd3ac5108f8cd0af32f99;hp=5544bd6ceaafcee00e74d6069cfff8fe37f524fa;hpb=195aaaebc7c27996d1db214494025edfd1505d63;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 5544bd6..ea5b8e0 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -1,31 +1,33 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.io.vcf; -import jalview.analysis.AlignmentUtils; -import jalview.analysis.Dna; -import jalview.api.AlignViewControllerGuiI; -import jalview.bin.Cache; -import jalview.datamodel.DBRefEntry; -import jalview.datamodel.GeneLociI; -import jalview.datamodel.Mapping; -import jalview.datamodel.SequenceFeature; -import jalview.datamodel.SequenceI; -import jalview.datamodel.features.FeatureAttributeType; -import jalview.datamodel.features.FeatureSource; -import jalview.datamodel.features.FeatureSources; -import jalview.ext.ensembl.EnsemblMap; -import jalview.ext.htsjdk.HtsContigDb; -import jalview.ext.htsjdk.VCFReader; -import jalview.io.gff.Gff3Helper; -import jalview.io.gff.SequenceOntologyI; -import jalview.util.MapList; -import jalview.util.MappingUtils; -import jalview.util.MessageManager; +import java.util.Locale; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -46,6 +48,27 @@ import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; +import jalview.analysis.Dna; +import jalview.api.AlignViewControllerGuiI; +import jalview.bin.Cache; +import jalview.bin.Console; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.GeneLociI; +import jalview.datamodel.Mapping; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.datamodel.features.FeatureAttributeType; +import jalview.datamodel.features.FeatureSource; +import jalview.datamodel.features.FeatureSources; +import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.HtsContigDb; +import jalview.ext.htsjdk.VCFReader; +import jalview.io.gff.Gff3Helper; +import jalview.io.gff.SequenceOntologyI; +import jalview.util.MapList; +import jalview.util.MappingUtils; +import jalview.util.MessageManager; +import jalview.util.StringUtils; /** * A class to read VCF data (using the htsjdk) and add variants as sequence @@ -55,18 +78,32 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + private static final String VCF_ENCODABLE = ":;=%,"; + + /* + * Jalview feature attributes for VCF fixed column data + */ + private static final String VCF_POS = "POS"; + + private static final String VCF_ID = "ID"; + + private static final String VCF_QUAL = "QUAL"; + + private static final String VCF_FILTER = "FILTER"; + private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' private static final String DEFAULT_SPECIES = "homo_sapiens"; /** - * A class to model the mapping from sequence to VCF coordinates. Cases include + * A class to model the mapping from sequence to VCF coordinates. Cases + * include * */ class VCFMap @@ -110,15 +147,20 @@ public class VCFLoader private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human - private static final String DEFAULT_REFERENCE = "grch37"; // fallback default is human GRCh37 + private static final String DEFAULT_REFERENCE = "grch37"; // fallback default + // is human GRCh37 /* * keys to fields of VEP CSQ consequence data * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ private static final String CSQ_CONSEQUENCE_KEY = "Consequence"; + private static final String CSQ_ALLELE_KEY = "Allele"; - private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... + + private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), + // 1... + private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id /* @@ -189,8 +231,11 @@ public class VCFLoader * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ private int csqConsequenceFieldIndex = -1; + private int csqAlleleFieldIndex = -1; + private int csqAlleleNumberFieldIndex = -1; + private int csqFeatureFieldIndex = -1; // todo the same fields for SnpEff ANN data if wanted @@ -276,10 +321,11 @@ public class VCFLoader */ public SequenceI loadVCFContig(String contig) { - VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + VCFHeaderLine headerLine = header + .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); if (headerLine == null) { - Cache.log.error("VCF reference header not found"); + Console.error("VCF reference header not found"); return null; } String ref = headerLine.getValue(); @@ -301,7 +347,7 @@ public class VCFLoader } else { - Cache.log.error("VCF reference not found: " + ref); + Console.error("VCF reference not found: " + ref); } return seq; @@ -377,9 +423,9 @@ public class VCFLoader /** * Attempts to determine and save the species and genome assembly version to - * which the VCF data applies. This may be done by parsing the {@code reference} - * header line, configured in a property file, or (potentially) confirmed - * interactively by the user. + * which the VCF data applies. This may be done by parsing the + * {@code reference} header line, configured in a property file, or + * (potentially) confirmed interactively by the user. *

* The saved values should be identifiers valid for Ensembl's REST service * {@code map} endpoint, so they can be used (if necessary) to retrieve the @@ -394,11 +440,11 @@ public class VCFLoader { if (reference == null) { - Cache.log.error("No VCF ##reference found, defaulting to " + Console.error("No VCF ##reference found, defaulting to " + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES); reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified } - reference = reference.toLowerCase(); + reference = reference.toLowerCase(Locale.ROOT); /* * for a non-human species, or other assembly identifier, @@ -413,7 +459,7 @@ public class VCFLoader String[] tokens = token.split("="); if (tokens.length == 2) { - if (reference.contains(tokens[0].trim().toLowerCase())) + if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT))) { vcfAssembly = tokens[1].trim(); break; @@ -430,7 +476,7 @@ public class VCFLoader String[] tokens = token.split("="); if (tokens.length == 2) { - if (reference.contains(tokens[0].trim().toLowerCase())) + if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT))) { vcfSpecies = tokens[1].trim(); break; @@ -535,7 +581,7 @@ public class VCFLoader { for (Pattern p : filters) { - if (p.matcher(id.toUpperCase()).matches()) + if (p.matcher(id.toUpperCase(Locale.ROOT)).matches()) { return true; } @@ -629,7 +675,7 @@ public class VCFLoader { try { - patterns.add(Pattern.compile(token.toUpperCase())); + patterns.add(Pattern.compile(token.toUpperCase(Locale.ROOT))); } catch (PatternSyntaxException e) { System.err.println("Invalid pattern ignored: " + token); @@ -640,13 +686,12 @@ public class VCFLoader /** * Transfers VCF features to sequences to which this sequence has a mapping. - * If the mapping is 3:1, computes peptide variants from nucleotide variants. * * @param seq */ protected void transferAddedFeatures(SequenceI seq) { - DBRefEntry[] dbrefs = seq.getDBRefs(); + List dbrefs = seq.getDBRefs(); if (dbrefs == null) { return; @@ -666,7 +711,8 @@ public class VCFLoader /* * dna-to-peptide product mapping */ - AlignmentUtils.computeProteinFeatures(seq, mapTo, map); + // JAL-3187 render on the fly instead + // AlignmentUtils.computeProteinFeatures(seq, mapTo, map); } else { @@ -687,8 +733,8 @@ public class VCFLoader } /** - * Tries to add overlapping variants read from a VCF file to the given sequence, - * and returns the number of variant features added + * Tries to add overlapping variants read from a VCF file to the given + * sequence, and returns the number of variant features added * * @param seq * @return @@ -740,7 +786,7 @@ public class VCFLoader GeneLociI seqCoords = seq.getGeneLoci(); if (seqCoords == null) { - Cache.log.warn(String.format( + Console.warn(String.format( "Can't query VCF for %s as chromosome coordinates not known", seq.getName())); return null; @@ -755,7 +801,7 @@ public class VCFLoader // returned with the Ensembl sequence; todo: support aliases? if (!vcfSpecies.equalsIgnoreCase(species)) { - Cache.log.warn("No VCF loaded to " + seq.getName() + Console.warn("No VCF loaded to " + seq.getName() + " as species not matched"); return null; } @@ -785,10 +831,9 @@ public class VCFLoader vcfAssembly); if (newRange == null) { - Cache.log.error( - String.format("Failed to map %s:%s:%s:%d:%d to %s", species, - chromosome, seqRef, range[0], range[1], - vcfAssembly)); + Console.error(String.format("Failed to map %s:%s:%s:%d:%d to %s", + species, chromosome, seqRef, range[0], range[1], + vcfAssembly)); continue; } else @@ -862,12 +907,19 @@ public class VCFLoader int[] featureRange = map.map.locateInFrom(variant.getStart(), variant.getEnd()); + /* + * only take features whose range is fully mappable to sequence positions + */ if (featureRange != null) { int featureStart = Math.min(featureRange[0], featureRange[1]); int featureEnd = Math.max(featureRange[0], featureRange[1]); - count += addAlleleFeatures(seq, variant, featureStart, - featureEnd, forwardStrand); + if (featureEnd - featureStart == variant.getEnd() + - variant.getStart()) + { + count += addAlleleFeatures(seq, variant, featureStart, + featureEnd, forwardStrand); + } } } variants.close(); @@ -877,8 +929,8 @@ public class VCFLoader * RuntimeException throwable by htsjdk */ String msg = String.format("Error reading VCF for %s:%d-%d: %s ", - map.chromosome, vcfStart, vcfEnd); - Cache.log.error(msg); + map.chromosome, vcfStart, vcfEnd, e.getLocalizedMessage()); + Console.error(msg); } } @@ -900,7 +952,7 @@ public class VCFLoader if (att instanceof String) { - return NO_VALUE.equals(att) ? null : (String) att; + return (String) att; } else if (att instanceof ArrayList) { @@ -942,8 +994,8 @@ public class VCFLoader /** * Inspects one allele and attempts to add a variant feature for it to the * sequence. The additional data associated with this allele is extracted to - * store in the feature's key-value map. Answers the number of features added (0 - * or 1). + * store in the feature's key-value map. Answers the number of features added + * (0 or 1). * * @param seq * @param variant @@ -991,10 +1043,10 @@ public class VCFLoader * pick out the consequence data (if any) that is for the current allele * and feature (transcript) that matches the current sequence */ - String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD, - altAlleleIndex, csqAlleleFieldIndex, - csqAlleleNumberFieldIndex, seq.getName().toLowerCase(), - csqFeatureFieldIndex); + String consequence = getConsequenceForAlleleAndFeature(variant, + CSQ_FIELD, altAlleleIndex, csqAlleleFieldIndex, + csqAlleleNumberFieldIndex, + seq.getName().toLowerCase(Locale.ROOT), csqFeatureFieldIndex); /* * pick out the ontology term for the consequence type @@ -1009,7 +1061,20 @@ public class VCFLoader featureEnd, FEATURE_GROUP_VCF); sf.setSource(sourceId); - sf.setValue(Gff3Helper.ALLELES, alleles); + /* + * save the derived alleles as a named attribute; this will be + * needed when Jalview computes derived peptide variants + */ + addFeatureAttribute(sf, Gff3Helper.ALLELES, alleles); + + /* + * add selected VCF fixed column data as feature attributes + */ + addFeatureAttribute(sf, VCF_POS, String.valueOf(variant.getStart())); + addFeatureAttribute(sf, VCF_ID, variant.getID()); + addFeatureAttribute(sf, VCF_QUAL, + String.valueOf(variant.getPhredScaledQual())); + addFeatureAttribute(sf, VCF_FILTER, getFilter(variant)); addAlleleProperties(variant, sf, altAlleleIndex, consequence); @@ -1019,9 +1084,56 @@ public class VCFLoader } /** - * Determines the Sequence Ontology term to use for the variant feature type in - * Jalview. The default is 'sequence_variant', but a more specific term is used - * if: + * Answers the VCF FILTER value for the variant - or an approximation to it. + * This field is either PASS, or a semi-colon separated list of filters not + * passed. htsjdk saves filters as a HashSet, so the order when reassembled + * into a list may be different. + * + * @param variant + * @return + */ + String getFilter(VariantContext variant) + { + Set filters = variant.getFilters(); + if (filters.isEmpty()) + { + return NO_VALUE; + } + Iterator iterator = filters.iterator(); + String first = iterator.next(); + if (filters.size() == 1) + { + return first; + } + + StringBuilder sb = new StringBuilder(first); + while (iterator.hasNext()) + { + sb.append(";").append(iterator.next()); + } + + return sb.toString(); + } + + /** + * Adds one feature attribute unless the value is null, empty or '.' + * + * @param sf + * @param key + * @param value + */ + void addFeatureAttribute(SequenceFeature sf, String key, String value) + { + if (value != null && !value.isEmpty() && !NO_VALUE.equals(value)) + { + sf.setValue(key, value); + } + } + + /** + * Determines the Sequence Ontology term to use for the variant feature type + * in Jalview. The default is 'sequence_variant', but a more specific term is + * used if: *