src/jalview/io/vcf/VCFLoader.java

   1 package jalview.io.vcf;
   2
   3 import jalview.analysis.AlignmentUtils;
   4 import jalview.analysis.Dna;
   5 import jalview.api.AlignViewControllerGuiI;
   6 import jalview.bin.Cache;
   7 import jalview.datamodel.AlignmentI;
   8 import jalview.datamodel.DBRefEntry;
   9 import jalview.datamodel.GeneLociI;
  10 import jalview.datamodel.Mapping;
  11 import jalview.datamodel.SequenceFeature;
  12 import jalview.datamodel.SequenceI;
  13 import jalview.datamodel.features.FeatureAttributeType;
  14 import jalview.datamodel.features.FeatureSource;
  15 import jalview.datamodel.features.FeatureSources;
  16 import jalview.ext.ensembl.EnsemblMap;
  17 import jalview.ext.htsjdk.VCFReader;
  18 import jalview.io.gff.Gff3Helper;
  19 import jalview.io.gff.SequenceOntologyI;
  20 import jalview.util.MapList;
  21 import jalview.util.MappingUtils;
  22 import jalview.util.MessageManager;
  23
  24 import java.io.IOException;
  25 import java.util.ArrayList;
  26 import java.util.HashMap;
  27 import java.util.List;
  28 import java.util.Map;
  29 import java.util.Map.Entry;
  30 import java.util.regex.Pattern;
  31 import java.util.regex.PatternSyntaxException;
  32
  33 import htsjdk.samtools.util.CloseableIterator;
  34 import htsjdk.variant.variantcontext.Allele;
  35 import htsjdk.variant.variantcontext.VariantContext;
  36 import htsjdk.variant.vcf.VCFHeader;
  37 import htsjdk.variant.vcf.VCFHeaderLine;
  38 import htsjdk.variant.vcf.VCFHeaderLineCount;
  39 import htsjdk.variant.vcf.VCFHeaderLineType;
  40 import htsjdk.variant.vcf.VCFInfoHeaderLine;
  41
  42 /**
  43  * A class to read VCF data (using the htsjdk) and add variants as sequence
  44  * features on dna and any related protein product sequences
  45  *
  46  * @author gmcarstairs
  47  */
  48 public class VCFLoader
  49 {
  50   /*
  51    * Lookup keys, and default values, for Preference entries that describe
  52    * patterns for VCF and VEP fields to capture
  53    */
  54   private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
  55
  56   private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
  57
  58   private static final String DEFAULT_VCF_FIELDS = "AF,AC*";
  59
  60   private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
  61
  62   /*
  63    * keys to fields of VEP CSQ consequence data
  64    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
  65    */
  66   private static final String ALLELE_KEY = "Allele";
  67
  68   private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
  69   private static final String FEATURE_KEY = "Feature"; // Ensembl stable id
  70
  71   /*
  72    * default VCF INFO key for VEP consequence data
  73    * NB this can be overridden running VEP with --vcf_info_field
  74    * - we don't handle this case (require identifier to be CSQ)
  75    */
  76   private static final String CSQ_FIELD = "CSQ";
  77
  78   /*
  79    * separator for fields in consequence data is '|'
  80    */
  81   private static final String PIPE_REGEX = "\\|";
  82
  83   /*
  84    * key for Allele Frequency output by VEP
  85    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
  86    */
  87   private static final String ALLELE_FREQUENCY_KEY = "AF";
  88
  89   /*
  90    * delimiter that separates multiple consequence data blocks
  91    */
  92   private static final String COMMA = ",";
  93
  94   /*
  95    * the feature group assigned to a VCF variant in Jalview
  96    */
  97   private static final String FEATURE_GROUP_VCF = "VCF";
  98
  99   /*
 100    * internal delimiter used to build keys for assemblyMappings
 101    *
 102    */
 103   private static final String EXCL = "!";
 104
 105   /*
 106    * the alignment we are associating VCF data with
 107    */
 108   private AlignmentI al;
 109
 110   /*
 111    * mappings between VCF and sequence reference assembly regions, as
 112    * key = "species!chromosome!fromAssembly!toAssembly
 113    * value = Map{fromRange, toRange}
 114    */
 115   private Map<String, Map<int[], int[]>> assemblyMappings;
 116
 117   /*
 118    * holds details of the VCF header lines (metadata)
 119    */
 120   private VCFHeader header;
 121
 122   /*
 123    * the position (0...) of field in each block of
 124    * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
 125    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 126    */
 127   private int csqAlleleFieldIndex = -1;
 128   private int csqAlleleNumberFieldIndex = -1;
 129   private int csqFeatureFieldIndex = -1;
 130
 131   /*
 132    * a unique identifier under which to save metadata about feature
 133    * attributes (selected INFO field data)
 134    */
 135   private String sourceId;
 136
 137   /*
 138    * The INFO IDs of data that is both present in the VCF file, and
 139    * also matched by any filters for data of interest
 140    */
 141   List<String> vcfFieldsOfInterest;
 142
 143   /*
 144    * The field offsets and identifiers for VEP (CSQ) data that is both present
 145    * in the VCF file, and also matched by any filters for data of interest
 146    * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
 147    */
 148   Map<Integer, String> vepFieldsOfInterest;
 149
 150   /**
 151    * Constructor given an alignment context
 152    *
 153    * @param alignment
 154    */
 155   public VCFLoader(AlignmentI alignment)
 156   {
 157     al = alignment;
 158
 159     // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
 160     assemblyMappings = new HashMap<>();
 161   }
 162
 163   /**
 164    * Starts a new thread to query and load VCF variant data on to the alignment
 165    * <p>
 166    * This method is not thread safe - concurrent threads should use separate
 167    * instances of this class.
 168    *
 169    * @param filePath
 170    * @param gui
 171    */
 172   public void loadVCF(final String filePath,
 173           final AlignViewControllerGuiI gui)
 174   {
 175     if (gui != null)
 176     {
 177       gui.setStatus(MessageManager.getString("label.searching_vcf"));
 178     }
 179
 180     new Thread()
 181     {
 182
 183       @Override
 184       public void run()
 185       {
 186         VCFLoader.this.doLoad(filePath, gui);
 187       }
 188
 189     }.start();
 190   }
 191
 192   /**
 193    * Loads VCF on to an alignment - provided it can be related to one or more
 194    * sequence's chromosomal coordinates
 195    *
 196    * @param filePath
 197    * @param gui
 198    *          optional callback handler for messages
 199    */
 200   protected void doLoad(String filePath, AlignViewControllerGuiI gui)
 201   {
 202     VCFReader reader = null;
 203     try
 204     {
 205       // long start = System.currentTimeMillis();
 206       reader = new VCFReader(filePath);
 207
 208       header = reader.getFileHeader();
 209
 210       sourceId = filePath;
 211
 212       saveMetadata(sourceId);
 213
 214       /*
 215        * get offset of CSQ ALLELE_NUM and Feature if declared
 216        */
 217       parseCsqHeader();
 218
 219       VCFHeaderLine ref = header
 220               .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 221       String vcfAssembly = ref.getValue();
 222
 223       int varCount = 0;
 224       int seqCount = 0;
 225
 226       /*
 227        * query for VCF overlapping each sequence in turn
 228        */
 229       for (SequenceI seq : al.getSequences())
 230       {
 231         int added = loadSequenceVCF(seq, reader, vcfAssembly);
 232         if (added > 0)
 233         {
 234           seqCount++;
 235           varCount += added;
 236           transferAddedFeatures(seq);
 237         }
 238       }
 239       if (gui != null)
 240       {
 241         // long elapsed = System.currentTimeMillis() - start;
 242         String msg = MessageManager.formatMessage("label.added_vcf",
 243                 varCount, seqCount);
 244         gui.setStatus(msg);
 245         if (gui.getFeatureSettingsUI() != null)
 246         {
 247           gui.getFeatureSettingsUI().discoverAllFeatureData();
 248         }
 249       }
 250     } catch (Throwable e)
 251     {
 252       System.err.println("Error processing VCF: " + e.getMessage());
 253       e.printStackTrace();
 254       if (gui != null)
 255       {
 256         gui.setStatus("Error occurred - see console for details");
 257       }
 258     } finally
 259     {
 260       if (reader != null)
 261       {
 262         try
 263         {
 264           reader.close();
 265         } catch (IOException e)
 266         {
 267           // ignore
 268         }
 269       }
 270     }
 271   }
 272
 273   /**
 274    * Reads metadata (such as INFO field descriptions and datatypes) and saves
 275    * them for future reference
 276    *
 277    * @param theSourceId
 278    */
 279   void saveMetadata(String theSourceId)
 280   {
 281     List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
 282             DEFAULT_VCF_FIELDS);
 283     vcfFieldsOfInterest = new ArrayList<>();
 284
 285     FeatureSource metadata = new FeatureSource(theSourceId);
 286
 287     for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
 288     {
 289       String attributeId = info.getID();
 290       String desc = info.getDescription();
 291       VCFHeaderLineType type = info.getType();
 292       FeatureAttributeType attType = null;
 293       switch (type)
 294       {
 295       case Character:
 296         attType = FeatureAttributeType.Character;
 297         break;
 298       case Flag:
 299         attType = FeatureAttributeType.Flag;
 300         break;
 301       case Float:
 302         attType = FeatureAttributeType.Float;
 303         break;
 304       case Integer:
 305         attType = FeatureAttributeType.Integer;
 306         break;
 307       case String:
 308         attType = FeatureAttributeType.String;
 309         break;
 310       }
 311       metadata.setAttributeName(attributeId, desc);
 312       metadata.setAttributeType(attributeId, attType);
 313
 314       if (isFieldWanted(attributeId, vcfFieldPatterns))
 315       {
 316         vcfFieldsOfInterest.add(attributeId);
 317       }
 318     }
 319
 320     FeatureSources.getInstance().addSource(theSourceId, metadata);
 321   }
 322
 323   /**
 324    * Answers true if the field id is matched by any of the filter patterns, else
 325    * false. Matching is against regular expression patterns, and is not
 326    * case-sensitive.
 327    *
 328    * @param id
 329    * @param filters
 330    * @return
 331    */
 332   private boolean isFieldWanted(String id, List<Pattern> filters)
 333   {
 334     for (Pattern p : filters)
 335     {
 336       if (p.matcher(id.toUpperCase()).matches())
 337       {
 338         return true;
 339       }
 340     }
 341     return false;
 342   }
 343
 344   /**
 345    * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
 346    * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
 347    * required for processing.
 348    * <p>
 349    * CSQ fields are declared in the CSQ INFO Description e.g.
 350    * <p>
 351    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
 352    */
 353   protected void parseCsqHeader()
 354   {
 355     List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
 356             DEFAULT_VEP_FIELDS);
 357     vepFieldsOfInterest = new HashMap<>();
 358
 359     VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
 360     if (csqInfo == null)
 361     {
 362       return;
 363     }
 364
 365     /*
 366      * parse out the pipe-separated list of CSQ fields; we assume here that
 367      * these form the last part of the description, and contain no spaces
 368      */
 369     String desc = csqInfo.getDescription();
 370     int spacePos = desc.lastIndexOf(" ");
 371     desc = desc.substring(spacePos + 1);
 372
 373     if (desc != null)
 374     {
 375       String[] format = desc.split(PIPE_REGEX);
 376       int index = 0;
 377       for (String field : format)
 378       {
 379         if (ALLELE_NUM_KEY.equals(field))
 380         {
 381           csqAlleleNumberFieldIndex = index;
 382         }
 383         if (ALLELE_KEY.equals(field))
 384         {
 385           csqAlleleFieldIndex = index;
 386         }
 387         if (FEATURE_KEY.equals(field))
 388         {
 389           csqFeatureFieldIndex = index;
 390         }
 391
 392         if (isFieldWanted(field, vepFieldFilters))
 393         {
 394           vepFieldsOfInterest.put(index, field);
 395         }
 396
 397         index++;
 398       }
 399     }
 400   }
 401
 402   /**
 403    * Reads the Preference value for the given key, with default specified if no
 404    * preference set. The value is interpreted as a comma-separated list of
 405    * regular expressions, and converted into a list of compiled patterns ready
 406    * for matching. Patterns are forced to upper-case for non-case-sensitive
 407    * matching.
 408    * <p>
 409    * This supports user-defined filters for fields of interest to capture while
 410    * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
 411    * fields with an ID of AF, or starting with AC, would be matched.
 412    *
 413    * @param key
 414    * @param def
 415    * @return
 416    */
 417   private List<Pattern> getFieldMatchers(String key, String def)
 418   {
 419     String pref = Cache.getDefault(key, def);
 420     List<Pattern> patterns = new ArrayList<>();
 421     String[] tokens = pref.split(",");
 422     for (String token : tokens)
 423     {
 424       try
 425       {
 426       patterns.add(Pattern.compile(token.toUpperCase()));
 427       } catch (PatternSyntaxException e)
 428       {
 429         System.err.println("Invalid pattern ignored: " + token);
 430       }
 431     }
 432     return patterns;
 433   }
 434
 435   /**
 436    * Transfers VCF features to sequences to which this sequence has a mapping.
 437    * If the mapping is 3:1, computes peptide variants from nucleotide variants.
 438    *
 439    * @param seq
 440    */
 441   protected void transferAddedFeatures(SequenceI seq)
 442   {
 443     DBRefEntry[] dbrefs = seq.getDBRefs();
 444     if (dbrefs == null)
 445     {
 446       return;
 447     }
 448     for (DBRefEntry dbref : dbrefs)
 449     {
 450       Mapping mapping = dbref.getMap();
 451       if (mapping == null || mapping.getTo() == null)
 452       {
 453         continue;
 454       }
 455
 456       SequenceI mapTo = mapping.getTo();
 457       MapList map = mapping.getMap();
 458       if (map.getFromRatio() == 3)
 459       {
 460         /*
 461          * dna-to-peptide product mapping
 462          */
 463         AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
 464       }
 465       else
 466       {
 467         /*
 468          * nucleotide-to-nucleotide mapping e.g. transcript to CDS
 469          */
 470         List<SequenceFeature> features = seq.getFeatures()
 471                 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
 472         for (SequenceFeature sf : features)
 473         {
 474           if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
 475           {
 476             transferFeature(sf, mapTo, map);
 477           }
 478         }
 479       }
 480     }
 481   }
 482
 483   /**
 484    * Tries to add overlapping variants read from a VCF file to the given
 485    * sequence, and returns the number of variant features added. Note that this
 486    * requires the sequence to hold information as to its species, chromosomal
 487    * positions and reference assembly, in order to be able to map the VCF
 488    * variants to the sequence (or not)
 489    *
 490    * @param seq
 491    * @param reader
 492    * @param vcfAssembly
 493    * @return
 494    */
 495   protected int loadSequenceVCF(SequenceI seq, VCFReader reader,
 496           String vcfAssembly)
 497   {
 498     int count = 0;
 499     GeneLociI seqCoords = seq.getGeneLoci();
 500     if (seqCoords == null)
 501     {
 502       System.out.println(String.format(
 503               "Can't query VCF for %s as chromosome coordinates not known",
 504               seq.getName()));
 505       return 0;
 506     }
 507
 508     if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId()))
 509     {
 510       return 0;
 511     }
 512
 513     List<int[]> seqChromosomalContigs = seqCoords.getMap().getToRanges();
 514     for (int[] range : seqChromosomalContigs)
 515     {
 516       count += addVcfVariants(seq, reader, range, vcfAssembly);
 517     }
 518
 519     return count;
 520   }
 521
 522   /**
 523    * Answers true if the species inferred from the VCF reference identifier
 524    * matches that for the sequence
 525    *
 526    * @param vcfAssembly
 527    * @param speciesId
 528    * @return
 529    */
 530   boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId)
 531   {
 532     // PROBLEM 1
 533     // there are many aliases for species - how to equate one with another?
 534     // PROBLEM 2
 535     // VCF ##reference header is an unstructured URI - how to extract species?
 536     // perhaps check if ref includes any (Ensembl) alias of speciesId??
 537     // TODO ask the user to confirm this??
 538
 539     if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example
 540             && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id
 541     {
 542       return true;
 543     }
 544
 545     if (vcfAssembly.contains("c_elegans") // VEP VCF response example
 546             && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl
 547     {
 548       return true;
 549     }
 550
 551     // this is not a sustainable solution...
 552
 553     return false;
 554   }
 555
 556   /**
 557    * Queries the VCF reader for any variants that overlap the given chromosome
 558    * region of the sequence, and adds as variant features. Returns the number of
 559    * overlapping variants found.
 560    *
 561    * @param seq
 562    * @param reader
 563    * @param range
 564    *          start-end range of a sequence region in its chromosomal
 565    *          coordinates
 566    * @param vcfAssembly
 567    *          the '##reference' identifier for the VCF reference assembly
 568    * @return
 569    */
 570   protected int addVcfVariants(SequenceI seq, VCFReader reader,
 571           int[] range, String vcfAssembly)
 572   {
 573     GeneLociI seqCoords = seq.getGeneLoci();
 574
 575     String chromosome = seqCoords.getChromosomeId();
 576     String seqRef = seqCoords.getAssemblyId();
 577     String species = seqCoords.getSpeciesId();
 578
 579     /*
 580      * map chromosomal coordinates from sequence to VCF if the VCF
 581      * data has a different reference assembly to the sequence
 582      */
 583     // TODO generalise for non-human species
 584     // - or get the user to choose in a dialog
 585
 586     int offset = 0;
 587     if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
 588             && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
 589     {
 590       String toRef = "GRCh37";
 591       int[] newRange = mapReferenceRange(range, chromosome, "human",
 592               seqRef, toRef);
 593       if (newRange == null)
 594       {
 595         System.err.println(String.format(
 596                 "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome,
 597                 seqRef, range[0], range[1], toRef));
 598         return 0;
 599       }
 600       offset = newRange[0] - range[0];
 601       range = newRange;
 602     }
 603
 604     boolean forwardStrand = range[0] <= range[1];
 605
 606     /*
 607      * query the VCF for overlaps
 608      * (convert a reverse strand range to forwards)
 609      */
 610     int count = 0;
 611     MapList mapping = seqCoords.getMap();
 612
 613     int fromLocus = Math.min(range[0], range[1]);
 614     int toLocus = Math.max(range[0], range[1]);
 615     CloseableIterator<VariantContext> variants = reader.query(chromosome,
 616             fromLocus, toLocus);
 617     while (variants.hasNext())
 618     {
 619       /*
 620        * get variant location in sequence chromosomal coordinates
 621        */
 622       VariantContext variant = variants.next();
 623
 624       int start = variant.getStart() - offset;
 625       int end = variant.getEnd() - offset;
 626
 627       /*
 628        * convert chromosomal location to sequence coordinates
 629        * - may be reverse strand (convert to forward for sequence feature)
 630        * - null if a partially overlapping feature
 631        */
 632       int[] seqLocation = mapping.locateInFrom(start, end);
 633       if (seqLocation != null)
 634       {
 635         int featureStart = Math.min(seqLocation[0], seqLocation[1]);
 636         int featureEnd = Math.max(seqLocation[0], seqLocation[1]);
 637         count += addAlleleFeatures(seq, variant, featureStart, featureEnd,
 638                 forwardStrand);
 639       }
 640     }
 641
 642     variants.close();
 643
 644     return count;
 645   }
 646
 647   /**
 648    * A convenience method to get the AF value for the given alternate allele
 649    * index
 650    *
 651    * @param variant
 652    * @param alleleIndex
 653    * @return
 654    */
 655   protected float getAlleleFrequency(VariantContext variant, int alleleIndex)
 656   {
 657     float score = 0f;
 658     String attributeValue = getAttributeValue(variant,
 659             ALLELE_FREQUENCY_KEY, alleleIndex);
 660     if (attributeValue != null)
 661     {
 662       try
 663       {
 664         score = Float.parseFloat(attributeValue);
 665       } catch (NumberFormatException e)
 666       {
 667         // leave as 0
 668       }
 669     }
 670
 671     return score;
 672   }
 673
 674   /**
 675    * A convenience method to get an attribute value for an alternate allele
 676    *
 677    * @param variant
 678    * @param attributeName
 679    * @param alleleIndex
 680    * @return
 681    */
 682   protected String getAttributeValue(VariantContext variant,
 683           String attributeName, int alleleIndex)
 684   {
 685     Object att = variant.getAttribute(attributeName);
 686
 687     if (att instanceof String)
 688     {
 689       return (String) att;
 690     }
 691     else if (att instanceof ArrayList)
 692     {
 693       return ((List<String>) att).get(alleleIndex);
 694     }
 695
 696     return null;
 697   }
 698
 699   /**
 700    * Adds one variant feature for each allele in the VCF variant record, and
 701    * returns the number of features added.
 702    *
 703    * @param seq
 704    * @param variant
 705    * @param featureStart
 706    * @param featureEnd
 707    * @param forwardStrand
 708    * @return
 709    */
 710   protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
 711           int featureStart, int featureEnd, boolean forwardStrand)
 712   {
 713     int added = 0;
 714
 715     /*
 716      * Javadoc says getAlternateAlleles() imposes no order on the list returned
 717      * so we proceed defensively to get them in strict order
 718      */
 719     int altAlleleCount = variant.getAlternateAlleles().size();
 720     for (int i = 0; i < altAlleleCount; i++)
 721     {
 722       added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
 723               forwardStrand);
 724     }
 725     return added;
 726   }
 727
 728   /**
 729    * Inspects one allele and attempts to add a variant feature for it to the
 730    * sequence. We extract as much as possible of the additional data associated
 731    * with this allele to store in the feature's key-value map. Answers the
 732    * number of features added (0 or 1).
 733    *
 734    * @param seq
 735    * @param variant
 736    * @param altAlleleIndex
 737    *          (0, 1..)
 738    * @param featureStart
 739    * @param featureEnd
 740    * @param forwardStrand
 741    * @return
 742    */
 743   protected int addAlleleFeature(SequenceI seq, VariantContext variant,
 744           int altAlleleIndex, int featureStart, int featureEnd,
 745           boolean forwardStrand)
 746   {
 747     String reference = variant.getReference().getBaseString();
 748     Allele alt = variant.getAlternateAllele(altAlleleIndex);
 749     String allele = alt.getBaseString();
 750
 751     /*
 752      * build the ref,alt allele description e.g. "G,A", using the base
 753      * complement if the sequence is on the reverse strand
 754      */
 755     // TODO check how structural variants are shown on reverse strand
 756     StringBuilder sb = new StringBuilder();
 757     sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
 758     sb.append(COMMA);
 759     sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
 760     String alleles = sb.toString(); // e.g. G,A
 761
 762     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 763     float score = getAlleleFrequency(variant, altAlleleIndex);
 764
 765     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
 766             featureEnd, score, FEATURE_GROUP_VCF);
 767     sf.setSource(sourceId);
 768
 769     sf.setValue(Gff3Helper.ALLELES, alleles);
 770
 771     addAlleleProperties(variant, seq, sf, altAlleleIndex);
 772
 773     seq.addSequenceFeature(sf);
 774
 775     return 1;
 776   }
 777
 778   /**
 779    * Add any allele-specific VCF key-value data to the sequence feature
 780    *
 781    * @param variant
 782    * @param seq
 783    * @param sf
 784    * @param altAlelleIndex
 785    *          (0, 1..)
 786    */
 787   protected void addAlleleProperties(VariantContext variant, SequenceI seq,
 788           SequenceFeature sf, final int altAlelleIndex)
 789   {
 790     Map<String, Object> atts = variant.getAttributes();
 791
 792     for (Entry<String, Object> att : atts.entrySet())
 793     {
 794       String key = att.getKey();
 795
 796       /*
 797        * extract Consequence data (if present) that we are able to
 798        * associated with the allele for this variant feature
 799        */
 800       if (CSQ_FIELD.equals(key))
 801       {
 802         addConsequences(variant, seq, sf, altAlelleIndex);
 803         continue;
 804       }
 805
 806       /*
 807        * filter out fields we don't want to capture
 808        */
 809       if (!vcfFieldsOfInterest.contains(key))
 810       {
 811         continue;
 812       }
 813
 814       /*
 815        * we extract values for other data which are allele-specific;
 816        * these may be per alternate allele (INFO[key].Number = 'A')
 817        * or per allele including reference (INFO[key].Number = 'R')
 818        */
 819       VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key);
 820       if (infoHeader == null)
 821       {
 822         /*
 823          * can't be sure what data belongs to this allele, so
 824          * play safe and don't take any
 825          */
 826         continue;
 827       }
 828
 829       VCFHeaderLineCount number = infoHeader.getCountType();
 830       int index = altAlelleIndex;
 831       if (number == VCFHeaderLineCount.R)
 832       {
 833         /*
 834          * one value per allele including reference, so bump index
 835          * e.g. the 3rd value is for the  2nd alternate allele
 836          */
 837         index++;
 838       }
 839       else if (number != VCFHeaderLineCount.A)
 840       {
 841         /*
 842          * don't save other values as not allele-related
 843          */
 844         continue;
 845       }
 846
 847       /*
 848        * take the index'th value
 849        */
 850       String value = getAttributeValue(variant, key, index);
 851       if (value != null)
 852       {
 853         sf.setValue(key, value);
 854       }
 855     }
 856   }
 857
 858   /**
 859    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
 860    * feature for the current allele (and transcript if applicable)
 861    * <p>
 862    * Allele matching: if field ALLELE_NUM is present, it must match
 863    * altAlleleIndex. If not present, then field Allele value must match the VCF
 864    * Allele.
 865    * <p>
 866    * Transcript matching: if sequence name can be identified to at least one of
 867    * the consequences' Feature values, then select only consequences that match
 868    * the value (i.e. consequences for the current transcript sequence). If not,
 869    * take all consequences (this is the case when adding features to the gene
 870    * sequence).
 871    *
 872    * @param variant
 873    * @param seq
 874    * @param sf
 875    * @param altAlelleIndex
 876    *          (0, 1..)
 877    */
 878   protected void addConsequences(VariantContext variant, SequenceI seq,
 879           SequenceFeature sf, int altAlelleIndex)
 880   {
 881     Object value = variant.getAttribute(CSQ_FIELD);
 882
 883     if (value == null || !(value instanceof ArrayList<?>))
 884     {
 885       return;
 886     }
 887
 888     List<String> consequences = (List<String>) value;
 889
 890     /*
 891      * if CSQ data includes 'Feature', and any value matches the sequence name,
 892      * then restrict consequence data to only the matching value (transcript)
 893      * i.e. just pick out consequences for the transcript the variant feature is on
 894      */
 895     String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase();
 896     String matchFeature = null;
 897     if (csqFeatureFieldIndex > -1)
 898     {
 899       for (String consequence : consequences)
 900       {
 901         String[] csqFields = consequence.split(PIPE_REGEX);
 902         if (csqFields.length > csqFeatureFieldIndex)
 903         {
 904           String featureIdentifier = csqFields[csqFeatureFieldIndex];
 905           if (featureIdentifier.length() > 4
 906                   && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
 907           {
 908             matchFeature = featureIdentifier;
 909           }
 910         }
 911       }
 912     }
 913
 914     /*
 915      * inspect CSQ consequences; where possible restrict to the consequence
 916      * associated with the current transcript (Feature)
 917      */
 918     Map<String, String> csqValues = new HashMap<>();
 919
 920     for (String consequence : consequences)
 921     {
 922       String[] csqFields = consequence.split(PIPE_REGEX);
 923
 924       if (includeConsequence(csqFields, matchFeature, variant,
 925               altAlelleIndex))
 926       {
 927         /*
 928          * inspect individual fields of this consequence, copying non-null
 929          * values which are 'fields of interest'
 930          */
 931         int i = 0;
 932         for (String field : csqFields)
 933         {
 934           if (field != null && field.length() > 0)
 935           {
 936             String id = vepFieldsOfInterest.get(i);
 937             if (id != null)
 938             {
 939               csqValues.put(id, field);
 940             }
 941           }
 942           i++;
 943         }
 944       }
 945     }
 946
 947     if (!csqValues.isEmpty())
 948     {
 949       sf.setValue(CSQ_FIELD, csqValues);
 950     }
 951   }
 952
 953   /**
 954    * Answers true if we want to associate this block of consequence data with
 955    * the specified alternate allele of the VCF variant.
 956    * <p>
 957    * If consequence data includes the ALLELE_NUM field, then this has to match
 958    * altAlleleIndex. Otherwise the Allele field of the consequence data has to
 959    * match the allele value.
 960    * <p>
 961    * Optionally (if matchFeature is not null), restrict to only include
 962    * consequences whose Feature value matches. This allows us to attach
 963    * consequences to their respective transcripts.
 964    *
 965    * @param csqFields
 966    * @param matchFeature
 967    * @param variant
 968    * @param altAlelleIndex
 969    *          (0, 1..)
 970    * @return
 971    */
 972   protected boolean includeConsequence(String[] csqFields,
 973           String matchFeature, VariantContext variant, int altAlelleIndex)
 974   {
 975     /*
 976      * check consequence is for the current transcript
 977      */
 978     if (matchFeature != null)
 979     {
 980       if (csqFields.length <= csqFeatureFieldIndex)
 981       {
 982         return false;
 983       }
 984       String featureIdentifier = csqFields[csqFeatureFieldIndex];
 985       if (!featureIdentifier.equals(matchFeature))
 986       {
 987         return false; // consequence is for a different transcript
 988       }
 989     }
 990
 991     /*
 992      * if ALLELE_NUM is present, it must match altAlleleIndex
 993      * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
 994      */
 995     if (csqAlleleNumberFieldIndex > -1)
 996     {
 997       if (csqFields.length <= csqAlleleNumberFieldIndex)
 998       {
 999         return false;
1000       }
1001       String alleleNum = csqFields[csqAlleleNumberFieldIndex];
1002       return String.valueOf(altAlelleIndex + 1).equals(alleleNum);
1003     }
1004
1005     /*
1006      * else consequence allele must match variant allele
1007      */
1008     if (csqAlleleFieldIndex > -1 && csqFields.length > csqAlleleFieldIndex)
1009     {
1010       String csqAllele = csqFields[csqAlleleFieldIndex];
1011       String vcfAllele = variant.getAlternateAllele(altAlelleIndex)
1012               .getBaseString();
1013       return csqAllele.equals(vcfAllele);
1014     }
1015
1016     return false;
1017   }
1018
1019   /**
1020    * A convenience method to complement a dna base and return the string value
1021    * of its complement
1022    *
1023    * @param reference
1024    * @return
1025    */
1026   protected String complement(byte[] reference)
1027   {
1028     return String.valueOf(Dna.getComplement((char) reference[0]));
1029   }
1030
1031   /**
1032    * Determines the location of the query range (chromosome positions) in a
1033    * different reference assembly.
1034    * <p>
1035    * If the range is just a subregion of one for which we already have a mapping
1036    * (for example, an exon sub-region of a gene), then the mapping is just
1037    * computed arithmetically.
1038    * <p>
1039    * Otherwise, calls the Ensembl REST service that maps from one assembly
1040    * reference's coordinates to another's
1041    *
1042    * @param queryRange
1043    *          start-end chromosomal range in 'fromRef' coordinates
1044    * @param chromosome
1045    * @param species
1046    * @param fromRef
1047    *          assembly reference for the query coordinates
1048    * @param toRef
1049    *          assembly reference we wish to translate to
1050    * @return the start-end range in 'toRef' coordinates
1051    */
1052   protected int[] mapReferenceRange(int[] queryRange, String chromosome,
1053           String species, String fromRef, String toRef)
1054   {
1055     /*
1056      * first try shorcut of computing the mapping as a subregion of one
1057      * we already have (e.g. for an exon, if we have the gene mapping)
1058      */
1059     int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
1060             species, fromRef, toRef);
1061     if (mappedRange != null)
1062     {
1063       return mappedRange;
1064     }
1065
1066     /*
1067      * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
1068      */
1069     EnsemblMap mapper = new EnsemblMap();
1070     int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
1071             toRef, queryRange);
1072
1073     if (mapping == null)
1074     {
1075       // mapping service failure
1076       return null;
1077     }
1078
1079     /*
1080      * save mapping for possible future re-use
1081      */
1082     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1083     if (!assemblyMappings.containsKey(key))
1084     {
1085       assemblyMappings.put(key, new HashMap<int[], int[]>());
1086     }
1087
1088     assemblyMappings.get(key).put(queryRange, mapping);
1089
1090     return mapping;
1091   }
1092
1093   /**
1094    * If we already have a 1:1 contiguous mapping which subsumes the given query
1095    * range, this method just calculates and returns the subset of that mapping,
1096    * else it returns null. In practical terms, if a gene has a contiguous
1097    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
1098    * subsidiary exons occupy unchanged relative positions, and just compute
1099    * these as offsets, rather than do another lookup of the mapping.
1100    * <p>
1101    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
1102    * simply remove this method or let it always return null.
1103    * <p>
1104    * Warning: many rapid calls to the /map service map result in a 429 overload
1105    * error response
1106    *
1107    * @param queryRange
1108    * @param chromosome
1109    * @param species
1110    * @param fromRef
1111    * @param toRef
1112    * @return
1113    */
1114   protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
1115           String species, String fromRef, String toRef)
1116   {
1117     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1118     if (assemblyMappings.containsKey(key))
1119     {
1120       Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
1121       for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
1122       {
1123         int[] fromRange = mappedRange.getKey();
1124         int[] toRange = mappedRange.getValue();
1125         if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
1126         {
1127           /*
1128            * mapping is 1:1 in length, so we trust it to have no discontinuities
1129            */
1130           if (MappingUtils.rangeContains(fromRange, queryRange))
1131           {
1132             /*
1133              * fromRange subsumes our query range
1134              */
1135             int offset = queryRange[0] - fromRange[0];
1136             int mappedRangeFrom = toRange[0] + offset;
1137             int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
1138             return new int[] { mappedRangeFrom, mappedRangeTo };
1139           }
1140         }
1141       }
1142     }
1143     return null;
1144   }
1145
1146   /**
1147    * Transfers the sequence feature to the target sequence, locating its start
1148    * and end range based on the mapping. Features which do not overlap the
1149    * target sequence are ignored.
1150    *
1151    * @param sf
1152    * @param targetSequence
1153    * @param mapping
1154    *          mapping from the feature's coordinates to the target sequence
1155    */
1156   protected void transferFeature(SequenceFeature sf,
1157           SequenceI targetSequence, MapList mapping)
1158   {
1159     int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
1160
1161     if (mappedRange != null)
1162     {
1163       String group = sf.getFeatureGroup();
1164       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
1165       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
1166       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
1167               group, sf.getScore());
1168       targetSequence.addSequenceFeature(copy);
1169     }
1170   }
1171
1172   /**
1173    * Formats a ranges map lookup key
1174    *
1175    * @param chromosome
1176    * @param species
1177    * @param fromRef
1178    * @param toRef
1179    * @return
1180    */
1181   protected static String makeRangesKey(String chromosome, String species,
1182           String fromRef, String toRef)
1183   {
1184     return species + EXCL + chromosome + EXCL + fromRef + EXCL
1185             + toRef;
1186   }
1187 }