From c7261c53692f99b3ee1081bc2756f194961a8df7 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 21 Sep 2017 16:28:09 +0100 Subject: [PATCH] JAL-2738 compute VCF on peptide, extract AF (allele frequency) score --- src/jalview/analysis/AlignmentUtils.java | 40 +++++++++++------- src/jalview/io/vcf/VCFLoader.java | 65 ++++++++++++++++++++++++------ 2 files changed, 78 insertions(+), 27 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 2b9b9f9..41fe77e 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -105,6 +105,15 @@ public class AlignmentUtils { return variant == null ? null : variant.getFeatureGroup(); } + + /** + * toString for aid in the debugger only + */ + @Override + public String toString() + { + return base + ":" + (variant == null ? "" : variant.getDescription()); + } } /** @@ -2542,6 +2551,22 @@ public class AlignmentUtils // not handling multi-locus variant features continue; } + + /* + * extract dna variants to a string array + */ + String alls = (String) sf.getValue("alleles"); + if (alls == null) + { + continue; // non-SNP VCF variant perhaps - can't process this + } + String[] alleles = alls.toUpperCase().split(","); + int i = 0; + for (String allele : alleles) + { + alleles[i++] = allele.trim(); // lose any space characters "A, G" + } + int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); if (mapsTo == null) { @@ -2560,21 +2585,6 @@ public class AlignmentUtils } /* - * extract dna variants to a string array - */ - String alls = (String) sf.getValue("alleles"); - if (alls == null) - { - continue; - } - String[] alleles = alls.toUpperCase().split(","); - int i = 0; - for (String allele : alleles) - { - alleles[i++] = allele.trim(); // lose any space characters "A, G" - } - - /* * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10] */ int[] codon = peptidePosition == lastPeptidePostion ? lastCodon diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index d70ffdf..0b13143 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -6,8 +6,11 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; +import jalview.analysis.AlignmentUtils; import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLoci; +import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -45,6 +48,7 @@ public class VCFLoader try { + long start = System.currentTimeMillis(); reader = new VCFReader(filePath); VCFHeader header = reader.getFileHeader(); @@ -58,6 +62,9 @@ public class VCFLoader int varCount = 0; int seqCount = 0; + /* + * query for VCF overlapping each sequence in turn + */ for (SequenceI seq : al.getSequences()) { int added = loadVCF(seq, reader, isRefGrch37); @@ -65,11 +72,13 @@ public class VCFLoader { seqCount++; varCount += added; + computePeptideVariants(seq); } } + long elapsed = System.currentTimeMillis() - start; System.out.println(String.format( -"Added %d VCF variants to %d sequence(s)", varCount, - seqCount)); + "Added %d VCF variants to %d sequence(s) (%dms)", varCount, + seqCount, elapsed)); reader.close(); } catch (Throwable e) @@ -80,6 +89,34 @@ public class VCFLoader } /** + * (Re-)computes peptide variants from dna variants, for any protein sequence + * to which the dna sequence has a mapping. Note that although duplicate + * features may get computed, they will not be added, since duplicate sequence + * features are ignored in Sequence.addSequenceFeature. + * + * @param dnaSeq + */ + protected void computePeptideVariants(SequenceI dnaSeq) + { + DBRefEntry[] dbrefs = dnaSeq.getDBRefs(); + if (dbrefs == null) + { + return; + } + for (DBRefEntry dbref : dbrefs) + { + Mapping mapping = dbref.getMap(); + if (mapping == null || mapping.getTo() == null + || mapping.getMap().getFromRatio() != 3) + { + continue; + } + AlignmentUtils.computeProteinFeatures(dnaSeq, mapping.getTo(), + mapping.getMap()); + } + } + + /** * Tries to add overlapping variants read from a VCF file to the given * sequence, and returns the number of overlapping variants found. Note that * this requires the sequence to hold information as to its chromosomal @@ -100,16 +137,7 @@ public class VCFLoader return 0; } - /* - * fudge - ensure chromosomal mapping from range is sequence start/end - * (in case end == 0 when the mapping is first created) - */ MapList mapping = seqCoords.getMapping(); - if (mapping.getFromRanges().get(0)[1] == 0) - { - mapping.getFromRanges().get(0)[1] = seq.getEnd(); - } - List seqChromosomalContigs = mapping.getToRanges(); for (int[] range : seqChromosomalContigs) { @@ -202,18 +230,31 @@ public class VCFLoader StringBuilder sb = new StringBuilder(); sb.append(variant.getReference().getBaseString()); + int alleleCount = 0; for (Allele allele : variant.getAlleles()) { if (!allele.isReference()) { sb.append(",").append(allele.getBaseString()); + alleleCount++; } } String alleles = sb.toString(); // e.g. G,A,C String type = SequenceOntologyI.SEQUENCE_VARIANT; + float score = 0f; + if (alleleCount == 1) + { + try + { + score = (float) variant.getAttributeAsDouble("AF", 0d); + } catch (NumberFormatException e) + { + // leave score as 0 + } + } SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, - featureEnd, "VCF"); + featureEnd, score, "VCF"); /* * only add 'alleles' property if a SNP, as we can -- 1.7.10.2