src/jalview/io/vcf/VCFLoader.java

   1 package jalview.io.vcf;
   2
   3 import jalview.analysis.AlignmentUtils;
   4 import jalview.analysis.Dna;
   5 import jalview.api.AlignViewControllerGuiI;
   6 import jalview.bin.Cache;
   7 import jalview.datamodel.AlignmentI;
   8 import jalview.datamodel.DBRefEntry;
   9 import jalview.datamodel.GeneLociI;
  10 import jalview.datamodel.Mapping;
  11 import jalview.datamodel.SequenceFeature;
  12 import jalview.datamodel.SequenceI;
  13 import jalview.datamodel.features.FeatureAttributeType;
  14 import jalview.datamodel.features.FeatureSource;
  15 import jalview.datamodel.features.FeatureSources;
  16 import jalview.ext.ensembl.EnsemblMap;
  17 import jalview.ext.htsjdk.VCFReader;
  18 import jalview.io.gff.Gff3Helper;
  19 import jalview.io.gff.SequenceOntologyI;
  20 import jalview.util.MapList;
  21 import jalview.util.MappingUtils;
  22 import jalview.util.MessageManager;
  23
  24 import java.io.IOException;
  25 import java.util.ArrayList;
  26 import java.util.HashMap;
  27 import java.util.List;
  28 import java.util.Map;
  29 import java.util.Map.Entry;
  30 import java.util.SortedMap;
  31 import java.util.TreeMap;
  32 import java.util.regex.Pattern;
  33 import java.util.regex.PatternSyntaxException;
  34
  35 import htsjdk.samtools.util.CloseableIterator;
  36 import htsjdk.variant.variantcontext.Allele;
  37 import htsjdk.variant.variantcontext.VariantContext;
  38 import htsjdk.variant.vcf.VCFHeader;
  39 import htsjdk.variant.vcf.VCFHeaderLine;
  40 import htsjdk.variant.vcf.VCFHeaderLineCount;
  41 import htsjdk.variant.vcf.VCFHeaderLineType;
  42 import htsjdk.variant.vcf.VCFInfoHeaderLine;
  43
  44 /**
  45  * A class to read VCF data (using the htsjdk) and add variants as sequence
  46  * features on dna and any related protein product sequences
  47  *
  48  * @author gmcarstairs
  49  */
  50 public class VCFLoader
  51 {
  52   /*
  53    * Lookup keys, and default values, for Preference entries that describe
  54    * patterns for VCF and VEP fields to capture
  55    */
  56   private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
  57
  58   private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
  59
  60   private static final String DEFAULT_VCF_FIELDS = ".*";
  61
  62   private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
  63
  64   /*
  65    * keys to fields of VEP CSQ consequence data
  66    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
  67    */
  68   private static final String ALLELE_KEY = "Allele";
  69
  70   private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
  71   private static final String FEATURE_KEY = "Feature"; // Ensembl stable id
  72
  73   /*
  74    * default VCF INFO key for VEP consequence data
  75    * NB this can be overridden running VEP with --vcf_info_field
  76    * - we don't handle this case (require identifier to be CSQ)
  77    */
  78   private static final String CSQ_FIELD = "CSQ";
  79
  80   /*
  81    * separator for fields in consequence data is '|'
  82    */
  83   private static final String PIPE_REGEX = "\\|";
  84
  85   /*
  86    * key for Allele Frequency output by VEP
  87    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
  88    */
  89   private static final String ALLELE_FREQUENCY_KEY = "AF";
  90
  91   /*
  92    * delimiter that separates multiple consequence data blocks
  93    */
  94   private static final String COMMA = ",";
  95
  96   /*
  97    * the feature group assigned to a VCF variant in Jalview
  98    */
  99   private static final String FEATURE_GROUP_VCF = "VCF";
 100
 101   /*
 102    * internal delimiter used to build keys for assemblyMappings
 103    *
 104    */
 105   private static final String EXCL = "!";
 106
 107   /*
 108    * the alignment we are associating VCF data with
 109    */
 110   private AlignmentI al;
 111
 112   /*
 113    * mappings between VCF and sequence reference assembly regions, as
 114    * key = "species!chromosome!fromAssembly!toAssembly
 115    * value = Map{fromRange, toRange}
 116    */
 117   private Map<String, Map<int[], int[]>> assemblyMappings;
 118
 119   /*
 120    * holds details of the VCF header lines (metadata)
 121    */
 122   private VCFHeader header;
 123
 124   /*
 125    * the position (0...) of field in each block of
 126    * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
 127    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 128    */
 129   private int csqAlleleFieldIndex = -1;
 130   private int csqAlleleNumberFieldIndex = -1;
 131   private int csqFeatureFieldIndex = -1;
 132
 133   /*
 134    * a unique identifier under which to save metadata about feature
 135    * attributes (selected INFO field data)
 136    */
 137   private String sourceId;
 138
 139   /*
 140    * The INFO IDs of data that is both present in the VCF file, and
 141    * also matched by any filters for data of interest
 142    */
 143   List<String> vcfFieldsOfInterest;
 144
 145   /*
 146    * The field offsets and identifiers for VEP (CSQ) data that is both present
 147    * in the VCF file, and also matched by any filters for data of interest
 148    * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
 149    */
 150   Map<Integer, String> vepFieldsOfInterest;
 151
 152   /**
 153    * Constructor given an alignment context
 154    *
 155    * @param alignment
 156    */
 157   public VCFLoader(AlignmentI alignment)
 158   {
 159     al = alignment;
 160
 161     // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
 162     assemblyMappings = new HashMap<>();
 163   }
 164
 165   /**
 166    * Starts a new thread to query and load VCF variant data on to the alignment
 167    * <p>
 168    * This method is not thread safe - concurrent threads should use separate
 169    * instances of this class.
 170    *
 171    * @param filePath
 172    * @param gui
 173    */
 174   public void loadVCF(final String filePath,
 175           final AlignViewControllerGuiI gui)
 176   {
 177     if (gui != null)
 178     {
 179       gui.setStatus(MessageManager.getString("label.searching_vcf"));
 180     }
 181
 182     new Thread()
 183     {
 184
 185       @Override
 186       public void run()
 187       {
 188         VCFLoader.this.doLoad(filePath, gui);
 189       }
 190
 191     }.start();
 192   }
 193
 194   /**
 195    * Loads VCF on to an alignment - provided it can be related to one or more
 196    * sequence's chromosomal coordinates
 197    *
 198    * @param filePath
 199    * @param gui
 200    *          optional callback handler for messages
 201    */
 202   protected void doLoad(String filePath, AlignViewControllerGuiI gui)
 203   {
 204     VCFReader reader = null;
 205     try
 206     {
 207       // long start = System.currentTimeMillis();
 208       reader = new VCFReader(filePath);
 209
 210       header = reader.getFileHeader();
 211
 212       sourceId = filePath;
 213
 214       saveMetadata(sourceId);
 215
 216       /*
 217        * get offset of CSQ ALLELE_NUM and Feature if declared
 218        */
 219       parseCsqHeader();
 220
 221       VCFHeaderLine ref = header
 222               .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 223       String vcfAssembly = ref.getValue();
 224
 225       int varCount = 0;
 226       int seqCount = 0;
 227
 228       /*
 229        * query for VCF overlapping each sequence in turn
 230        */
 231       for (SequenceI seq : al.getSequences())
 232       {
 233         int added = loadSequenceVCF(seq, reader, vcfAssembly);
 234         if (added > 0)
 235         {
 236           seqCount++;
 237           varCount += added;
 238           transferAddedFeatures(seq);
 239         }
 240       }
 241       if (gui != null)
 242       {
 243         // long elapsed = System.currentTimeMillis() - start;
 244         String msg = MessageManager.formatMessage("label.added_vcf",
 245                 varCount, seqCount);
 246         gui.setStatus(msg);
 247         if (gui.getFeatureSettingsUI() != null)
 248         {
 249           gui.getFeatureSettingsUI().discoverAllFeatureData();
 250         }
 251       }
 252     } catch (Throwable e)
 253     {
 254       System.err.println("Error processing VCF: " + e.getMessage());
 255       e.printStackTrace();
 256       if (gui != null)
 257       {
 258         gui.setStatus("Error occurred - see console for details");
 259       }
 260     } finally
 261     {
 262       if (reader != null)
 263       {
 264         try
 265         {
 266           reader.close();
 267         } catch (IOException e)
 268         {
 269           // ignore
 270         }
 271       }
 272     }
 273   }
 274
 275   /**
 276    * Reads metadata (such as INFO field descriptions and datatypes) and saves
 277    * them for future reference
 278    *
 279    * @param theSourceId
 280    */
 281   void saveMetadata(String theSourceId)
 282   {
 283     List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
 284             DEFAULT_VCF_FIELDS);
 285     vcfFieldsOfInterest = new ArrayList<>();
 286
 287     FeatureSource metadata = new FeatureSource(theSourceId);
 288
 289     for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
 290     {
 291       String attributeId = info.getID();
 292       String desc = info.getDescription();
 293       VCFHeaderLineType type = info.getType();
 294       FeatureAttributeType attType = null;
 295       switch (type)
 296       {
 297       case Character:
 298         attType = FeatureAttributeType.Character;
 299         break;
 300       case Flag:
 301         attType = FeatureAttributeType.Flag;
 302         break;
 303       case Float:
 304         attType = FeatureAttributeType.Float;
 305         break;
 306       case Integer:
 307         attType = FeatureAttributeType.Integer;
 308         break;
 309       case String:
 310         attType = FeatureAttributeType.String;
 311         break;
 312       }
 313       metadata.setAttributeName(attributeId, desc);
 314       metadata.setAttributeType(attributeId, attType);
 315
 316       if (isFieldWanted(attributeId, vcfFieldPatterns))
 317       {
 318         vcfFieldsOfInterest.add(attributeId);
 319       }
 320     }
 321
 322     FeatureSources.getInstance().addSource(theSourceId, metadata);
 323   }
 324
 325   /**
 326    * Answers true if the field id is matched by any of the filter patterns, else
 327    * false. Matching is against regular expression patterns, and is not
 328    * case-sensitive.
 329    *
 330    * @param id
 331    * @param filters
 332    * @return
 333    */
 334   private boolean isFieldWanted(String id, List<Pattern> filters)
 335   {
 336     for (Pattern p : filters)
 337     {
 338       if (p.matcher(id.toUpperCase()).matches())
 339       {
 340         return true;
 341       }
 342     }
 343     return false;
 344   }
 345
 346   /**
 347    * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
 348    * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
 349    * required for processing.
 350    * <p>
 351    * CSQ fields are declared in the CSQ INFO Description e.g.
 352    * <p>
 353    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
 354    */
 355   protected void parseCsqHeader()
 356   {
 357     List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
 358             DEFAULT_VEP_FIELDS);
 359     vepFieldsOfInterest = new HashMap<>();
 360
 361     VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
 362     if (csqInfo == null)
 363     {
 364       return;
 365     }
 366
 367     /*
 368      * parse out the pipe-separated list of CSQ fields; we assume here that
 369      * these form the last part of the description, and contain no spaces
 370      */
 371     String desc = csqInfo.getDescription();
 372     int spacePos = desc.lastIndexOf(" ");
 373     desc = desc.substring(spacePos + 1);
 374
 375     if (desc != null)
 376     {
 377       String[] format = desc.split(PIPE_REGEX);
 378       int index = 0;
 379       for (String field : format)
 380       {
 381         if (ALLELE_NUM_KEY.equals(field))
 382         {
 383           csqAlleleNumberFieldIndex = index;
 384         }
 385         if (ALLELE_KEY.equals(field))
 386         {
 387           csqAlleleFieldIndex = index;
 388         }
 389         if (FEATURE_KEY.equals(field))
 390         {
 391           csqFeatureFieldIndex = index;
 392         }
 393
 394         if (isFieldWanted(field, vepFieldFilters))
 395         {
 396           vepFieldsOfInterest.put(index, field);
 397         }
 398
 399         index++;
 400       }
 401     }
 402   }
 403
 404   /**
 405    * Reads the Preference value for the given key, with default specified if no
 406    * preference set. The value is interpreted as a comma-separated list of
 407    * regular expressions, and converted into a list of compiled patterns ready
 408    * for matching. Patterns are forced to upper-case for non-case-sensitive
 409    * matching.
 410    * <p>
 411    * This supports user-defined filters for fields of interest to capture while
 412    * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
 413    * fields with an ID of AF, or starting with AC, would be matched.
 414    *
 415    * @param key
 416    * @param def
 417    * @return
 418    */
 419   private List<Pattern> getFieldMatchers(String key, String def)
 420   {
 421     String pref = Cache.getDefault(key, def);
 422     List<Pattern> patterns = new ArrayList<>();
 423     String[] tokens = pref.split(",");
 424     for (String token : tokens)
 425     {
 426       try
 427       {
 428       patterns.add(Pattern.compile(token.toUpperCase()));
 429       } catch (PatternSyntaxException e)
 430       {
 431         System.err.println("Invalid pattern ignored: " + token);
 432       }
 433     }
 434     return patterns;
 435   }
 436
 437   /**
 438    * Transfers VCF features to sequences to which this sequence has a mapping.
 439    * If the mapping is 3:1, computes peptide variants from nucleotide variants.
 440    *
 441    * @param seq
 442    */
 443   protected void transferAddedFeatures(SequenceI seq)
 444   {
 445     DBRefEntry[] dbrefs = seq.getDBRefs();
 446     if (dbrefs == null)
 447     {
 448       return;
 449     }
 450     for (DBRefEntry dbref : dbrefs)
 451     {
 452       Mapping mapping = dbref.getMap();
 453       if (mapping == null || mapping.getTo() == null)
 454       {
 455         continue;
 456       }
 457
 458       SequenceI mapTo = mapping.getTo();
 459       MapList map = mapping.getMap();
 460       if (map.getFromRatio() == 3)
 461       {
 462         /*
 463          * dna-to-peptide product mapping
 464          */
 465         AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
 466       }
 467       else
 468       {
 469         /*
 470          * nucleotide-to-nucleotide mapping e.g. transcript to CDS
 471          */
 472         List<SequenceFeature> features = seq.getFeatures()
 473                 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
 474         for (SequenceFeature sf : features)
 475         {
 476           if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
 477           {
 478             transferFeature(sf, mapTo, map);
 479           }
 480         }
 481       }
 482     }
 483   }
 484
 485   /**
 486    * Tries to add overlapping variants read from a VCF file to the given
 487    * sequence, and returns the number of variant features added. Note that this
 488    * requires the sequence to hold information as to its species, chromosomal
 489    * positions and reference assembly, in order to be able to map the VCF
 490    * variants to the sequence (or not)
 491    *
 492    * @param seq
 493    * @param reader
 494    * @param vcfAssembly
 495    * @return
 496    */
 497   protected int loadSequenceVCF(SequenceI seq, VCFReader reader,
 498           String vcfAssembly)
 499   {
 500     int count = 0;
 501     GeneLociI seqCoords = seq.getGeneLoci();
 502     if (seqCoords == null)
 503     {
 504       System.out.println(String.format(
 505               "Can't query VCF for %s as chromosome coordinates not known",
 506               seq.getName()));
 507       return 0;
 508     }
 509
 510     if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId()))
 511     {
 512       return 0;
 513     }
 514
 515     List<int[]> seqChromosomalContigs = seqCoords.getMap().getToRanges();
 516     for (int[] range : seqChromosomalContigs)
 517     {
 518       count += addVcfVariants(seq, reader, range, vcfAssembly);
 519     }
 520
 521     return count;
 522   }
 523
 524   /**
 525    * Answers true if the species inferred from the VCF reference identifier
 526    * matches that for the sequence
 527    *
 528    * @param vcfAssembly
 529    * @param speciesId
 530    * @return
 531    */
 532   boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId)
 533   {
 534     // PROBLEM 1
 535     // there are many aliases for species - how to equate one with another?
 536     // PROBLEM 2
 537     // VCF ##reference header is an unstructured URI - how to extract species?
 538     // perhaps check if ref includes any (Ensembl) alias of speciesId??
 539     // TODO ask the user to confirm this??
 540
 541     if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example
 542             && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id
 543     {
 544       return true;
 545     }
 546
 547     if (vcfAssembly.contains("c_elegans") // VEP VCF response example
 548             && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl
 549     {
 550       return true;
 551     }
 552
 553     // this is not a sustainable solution...
 554
 555     return false;
 556   }
 557
 558   /**
 559    * Queries the VCF reader for any variants that overlap the given chromosome
 560    * region of the sequence, and adds as variant features. Returns the number of
 561    * overlapping variants found.
 562    *
 563    * @param seq
 564    * @param reader
 565    * @param range
 566    *          start-end range of a sequence region in its chromosomal
 567    *          coordinates
 568    * @param vcfAssembly
 569    *          the '##reference' identifier for the VCF reference assembly
 570    * @return
 571    */
 572   protected int addVcfVariants(SequenceI seq, VCFReader reader,
 573           int[] range, String vcfAssembly)
 574   {
 575     GeneLociI seqCoords = seq.getGeneLoci();
 576
 577     String chromosome = seqCoords.getChromosomeId();
 578     String seqRef = seqCoords.getAssemblyId();
 579     String species = seqCoords.getSpeciesId();
 580
 581     /*
 582      * map chromosomal coordinates from sequence to VCF if the VCF
 583      * data has a different reference assembly to the sequence
 584      */
 585     // TODO generalise for non-human species
 586     // - or get the user to choose in a dialog
 587
 588     int offset = 0;
 589     if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
 590             && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
 591     {
 592       String toRef = "GRCh37";
 593       int[] newRange = mapReferenceRange(range, chromosome, "human",
 594               seqRef, toRef);
 595       if (newRange == null)
 596       {
 597         System.err.println(String.format(
 598                 "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome,
 599                 seqRef, range[0], range[1], toRef));
 600         return 0;
 601       }
 602       offset = newRange[0] - range[0];
 603       range = newRange;
 604     }
 605
 606     boolean forwardStrand = range[0] <= range[1];
 607
 608     /*
 609      * query the VCF for overlaps
 610      * (convert a reverse strand range to forwards)
 611      */
 612     int count = 0;
 613     MapList mapping = seqCoords.getMap();
 614
 615     int fromLocus = Math.min(range[0], range[1]);
 616     int toLocus = Math.max(range[0], range[1]);
 617     CloseableIterator<VariantContext> variants = reader.query(chromosome,
 618             fromLocus, toLocus);
 619     while (variants.hasNext())
 620     {
 621       /*
 622        * get variant location in sequence chromosomal coordinates
 623        */
 624       VariantContext variant = variants.next();
 625
 626       int start = variant.getStart() - offset;
 627       int end = variant.getEnd() - offset;
 628
 629       /*
 630        * convert chromosomal location to sequence coordinates
 631        * - may be reverse strand (convert to forward for sequence feature)
 632        * - null if a partially overlapping feature
 633        */
 634       int[] seqLocation = mapping.locateInFrom(start, end);
 635       if (seqLocation != null)
 636       {
 637         int featureStart = Math.min(seqLocation[0], seqLocation[1]);
 638         int featureEnd = Math.max(seqLocation[0], seqLocation[1]);
 639         count += addAlleleFeatures(seq, variant, featureStart, featureEnd,
 640                 forwardStrand);
 641       }
 642     }
 643
 644     variants.close();
 645
 646     return count;
 647   }
 648
 649   /**
 650    * A convenience method to get the AF value for the given alternate allele
 651    * index
 652    *
 653    * @param variant
 654    * @param alleleIndex
 655    * @return
 656    */
 657   protected float getAlleleFrequency(VariantContext variant, int alleleIndex)
 658   {
 659     float score = 0f;
 660     String attributeValue = getAttributeValue(variant,
 661             ALLELE_FREQUENCY_KEY, alleleIndex);
 662     if (attributeValue != null)
 663     {
 664       try
 665       {
 666         score = Float.parseFloat(attributeValue);
 667       } catch (NumberFormatException e)
 668       {
 669         // leave as 0
 670       }
 671     }
 672
 673     return score;
 674   }
 675
 676   /**
 677    * A convenience method to get an attribute value for an alternate allele
 678    *
 679    * @param variant
 680    * @param attributeName
 681    * @param alleleIndex
 682    * @return
 683    */
 684   protected String getAttributeValue(VariantContext variant,
 685           String attributeName, int alleleIndex)
 686   {
 687     Object att = variant.getAttribute(attributeName);
 688
 689     if (att instanceof String)
 690     {
 691       return (String) att;
 692     }
 693     else if (att instanceof ArrayList)
 694     {
 695       return ((List<String>) att).get(alleleIndex);
 696     }
 697
 698     return null;
 699   }
 700
 701   /**
 702    * Adds one variant feature for each allele in the VCF variant record, and
 703    * returns the number of features added.
 704    *
 705    * @param seq
 706    * @param variant
 707    * @param featureStart
 708    * @param featureEnd
 709    * @param forwardStrand
 710    * @return
 711    */
 712   protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
 713           int featureStart, int featureEnd, boolean forwardStrand)
 714   {
 715     int added = 0;
 716
 717     /*
 718      * Javadoc says getAlternateAlleles() imposes no order on the list returned
 719      * so we proceed defensively to get them in strict order
 720      */
 721     int altAlleleCount = variant.getAlternateAlleles().size();
 722     for (int i = 0; i < altAlleleCount; i++)
 723     {
 724       added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
 725               forwardStrand);
 726     }
 727     return added;
 728   }
 729
 730   /**
 731    * Inspects one allele and attempts to add a variant feature for it to the
 732    * sequence. We extract as much as possible of the additional data associated
 733    * with this allele to store in the feature's key-value map. Answers the
 734    * number of features added (0 or 1).
 735    *
 736    * @param seq
 737    * @param variant
 738    * @param altAlleleIndex
 739    *          (0, 1..)
 740    * @param featureStart
 741    * @param featureEnd
 742    * @param forwardStrand
 743    * @return
 744    */
 745   protected int addAlleleFeature(SequenceI seq, VariantContext variant,
 746           int altAlleleIndex, int featureStart, int featureEnd,
 747           boolean forwardStrand)
 748   {
 749     String reference = variant.getReference().getBaseString();
 750     Allele alt = variant.getAlternateAllele(altAlleleIndex);
 751     String allele = alt.getBaseString();
 752
 753     /*
 754      * build the ref,alt allele description e.g. "G,A", using the base
 755      * complement if the sequence is on the reverse strand
 756      */
 757     // TODO check how structural variants are shown on reverse strand
 758     StringBuilder sb = new StringBuilder();
 759     sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
 760     sb.append(COMMA);
 761     sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
 762     String alleles = sb.toString(); // e.g. G,A
 763
 764     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 765     float score = getAlleleFrequency(variant, altAlleleIndex);
 766
 767     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
 768             featureEnd, score, FEATURE_GROUP_VCF);
 769     sf.setSource(sourceId);
 770
 771     sf.setValue(Gff3Helper.ALLELES, alleles);
 772
 773     addAlleleProperties(variant, seq, sf, altAlleleIndex);
 774
 775     seq.addSequenceFeature(sf);
 776
 777     return 1;
 778   }
 779
 780   /**
 781    * Add any allele-specific VCF key-value data to the sequence feature
 782    *
 783    * @param variant
 784    * @param seq
 785    * @param sf
 786    * @param altAlelleIndex
 787    *          (0, 1..)
 788    */
 789   protected void addAlleleProperties(VariantContext variant, SequenceI seq,
 790           SequenceFeature sf, final int altAlelleIndex)
 791   {
 792     Map<String, Object> atts = variant.getAttributes();
 793
 794     for (Entry<String, Object> att : atts.entrySet())
 795     {
 796       String key = att.getKey();
 797
 798       /*
 799        * extract Consequence data (if present) that we are able to
 800        * associated with the allele for this variant feature
 801        */
 802       if (CSQ_FIELD.equals(key))
 803       {
 804         addConsequences(variant, seq, sf, altAlelleIndex);
 805         continue;
 806       }
 807
 808       /*
 809        * filter out fields we don't want to capture
 810        */
 811       if (!vcfFieldsOfInterest.contains(key))
 812       {
 813         continue;
 814       }
 815
 816       /*
 817        * we extract values for other data which are allele-specific;
 818        * these may be per alternate allele (INFO[key].Number = 'A')
 819        * or per allele including reference (INFO[key].Number = 'R')
 820        */
 821       VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key);
 822       if (infoHeader == null)
 823       {
 824         /*
 825          * can't be sure what data belongs to this allele, so
 826          * play safe and don't take any
 827          */
 828         continue;
 829       }
 830
 831       VCFHeaderLineCount number = infoHeader.getCountType();
 832       int index = altAlelleIndex;
 833       if (number == VCFHeaderLineCount.R)
 834       {
 835         /*
 836          * one value per allele including reference, so bump index
 837          * e.g. the 3rd value is for the  2nd alternate allele
 838          */
 839         index++;
 840       }
 841       else if (number != VCFHeaderLineCount.A)
 842       {
 843         /*
 844          * don't save other values as not allele-related
 845          */
 846         continue;
 847       }
 848
 849       /*
 850        * take the index'th value
 851        */
 852       String value = getAttributeValue(variant, key, index);
 853       if (value != null)
 854       {
 855         sf.setValue(key, value);
 856       }
 857     }
 858   }
 859
 860   /**
 861    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
 862    * feature for the current allele (and transcript if applicable)
 863    * <p>
 864    * Allele matching: if field ALLELE_NUM is present, it must match
 865    * altAlleleIndex. If not present, then field Allele value must match the VCF
 866    * Allele.
 867    * <p>
 868    * Transcript matching: if sequence name can be identified to at least one of
 869    * the consequences' Feature values, then select only consequences that match
 870    * the value (i.e. consequences for the current transcript sequence). If not,
 871    * take all consequences (this is the case when adding features to the gene
 872    * sequence).
 873    *
 874    * @param variant
 875    * @param seq
 876    * @param sf
 877    * @param altAlelleIndex
 878    *          (0, 1..)
 879    */
 880   protected void addConsequences(VariantContext variant, SequenceI seq,
 881           SequenceFeature sf, int altAlelleIndex)
 882   {
 883     Object value = variant.getAttribute(CSQ_FIELD);
 884
 885     if (value == null || !(value instanceof ArrayList<?>))
 886     {
 887       return;
 888     }
 889
 890     List<String> consequences = (List<String>) value;
 891
 892     /*
 893      * if CSQ data includes 'Feature', and any value matches the sequence name,
 894      * then restrict consequence data to only the matching value (transcript)
 895      * i.e. just pick out consequences for the transcript the variant feature is on
 896      */
 897     String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase();
 898     String matchFeature = null;
 899     if (csqFeatureFieldIndex > -1)
 900     {
 901       for (String consequence : consequences)
 902       {
 903         String[] csqFields = consequence.split(PIPE_REGEX);
 904         if (csqFields.length > csqFeatureFieldIndex)
 905         {
 906           String featureIdentifier = csqFields[csqFeatureFieldIndex];
 907           if (featureIdentifier.length() > 4
 908                   && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
 909           {
 910             matchFeature = featureIdentifier;
 911           }
 912         }
 913       }
 914     }
 915
 916     /*
 917      * inspect CSQ consequences; where possible restrict to the consequence
 918      * associated with the current transcript (Feature)
 919      */
 920     SortedMap<String, String> csqValues = new TreeMap<>(
 921             String.CASE_INSENSITIVE_ORDER);
 922
 923     for (String consequence : consequences)
 924     {
 925       String[] csqFields = consequence.split(PIPE_REGEX);
 926
 927       if (includeConsequence(csqFields, matchFeature, variant,
 928               altAlelleIndex))
 929       {
 930         /*
 931          * inspect individual fields of this consequence, copying non-null
 932          * values which are 'fields of interest'
 933          */
 934         int i = 0;
 935         for (String field : csqFields)
 936         {
 937           if (field != null && field.length() > 0)
 938           {
 939             String id = vepFieldsOfInterest.get(i);
 940             if (id != null)
 941             {
 942               csqValues.put(id, field);
 943             }
 944           }
 945           i++;
 946         }
 947       }
 948     }
 949
 950     if (!csqValues.isEmpty())
 951     {
 952       sf.setValue(CSQ_FIELD, csqValues);
 953     }
 954   }
 955
 956   /**
 957    * Answers true if we want to associate this block of consequence data with
 958    * the specified alternate allele of the VCF variant.
 959    * <p>
 960    * If consequence data includes the ALLELE_NUM field, then this has to match
 961    * altAlleleIndex. Otherwise the Allele field of the consequence data has to
 962    * match the allele value.
 963    * <p>
 964    * Optionally (if matchFeature is not null), restrict to only include
 965    * consequences whose Feature value matches. This allows us to attach
 966    * consequences to their respective transcripts.
 967    *
 968    * @param csqFields
 969    * @param matchFeature
 970    * @param variant
 971    * @param altAlelleIndex
 972    *          (0, 1..)
 973    * @return
 974    */
 975   protected boolean includeConsequence(String[] csqFields,
 976           String matchFeature, VariantContext variant, int altAlelleIndex)
 977   {
 978     /*
 979      * check consequence is for the current transcript
 980      */
 981     if (matchFeature != null)
 982     {
 983       if (csqFields.length <= csqFeatureFieldIndex)
 984       {
 985         return false;
 986       }
 987       String featureIdentifier = csqFields[csqFeatureFieldIndex];
 988       if (!featureIdentifier.equals(matchFeature))
 989       {
 990         return false; // consequence is for a different transcript
 991       }
 992     }
 993
 994     /*
 995      * if ALLELE_NUM is present, it must match altAlleleIndex
 996      * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
 997      */
 998     if (csqAlleleNumberFieldIndex > -1)
 999     {
1000       if (csqFields.length <= csqAlleleNumberFieldIndex)
1001       {
1002         return false;
1003       }
1004       String alleleNum = csqFields[csqAlleleNumberFieldIndex];
1005       return String.valueOf(altAlelleIndex + 1).equals(alleleNum);
1006     }
1007
1008     /*
1009      * else consequence allele must match variant allele
1010      */
1011     if (csqAlleleFieldIndex > -1 && csqFields.length > csqAlleleFieldIndex)
1012     {
1013       String csqAllele = csqFields[csqAlleleFieldIndex];
1014       String vcfAllele = variant.getAlternateAllele(altAlelleIndex)
1015               .getBaseString();
1016       return csqAllele.equals(vcfAllele);
1017     }
1018
1019     return false;
1020   }
1021
1022   /**
1023    * A convenience method to complement a dna base and return the string value
1024    * of its complement
1025    *
1026    * @param reference
1027    * @return
1028    */
1029   protected String complement(byte[] reference)
1030   {
1031     return String.valueOf(Dna.getComplement((char) reference[0]));
1032   }
1033
1034   /**
1035    * Determines the location of the query range (chromosome positions) in a
1036    * different reference assembly.
1037    * <p>
1038    * If the range is just a subregion of one for which we already have a mapping
1039    * (for example, an exon sub-region of a gene), then the mapping is just
1040    * computed arithmetically.
1041    * <p>
1042    * Otherwise, calls the Ensembl REST service that maps from one assembly
1043    * reference's coordinates to another's
1044    *
1045    * @param queryRange
1046    *          start-end chromosomal range in 'fromRef' coordinates
1047    * @param chromosome
1048    * @param species
1049    * @param fromRef
1050    *          assembly reference for the query coordinates
1051    * @param toRef
1052    *          assembly reference we wish to translate to
1053    * @return the start-end range in 'toRef' coordinates
1054    */
1055   protected int[] mapReferenceRange(int[] queryRange, String chromosome,
1056           String species, String fromRef, String toRef)
1057   {
1058     /*
1059      * first try shorcut of computing the mapping as a subregion of one
1060      * we already have (e.g. for an exon, if we have the gene mapping)
1061      */
1062     int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
1063             species, fromRef, toRef);
1064     if (mappedRange != null)
1065     {
1066       return mappedRange;
1067     }
1068
1069     /*
1070      * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
1071      */
1072     EnsemblMap mapper = new EnsemblMap();
1073     int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
1074             toRef, queryRange);
1075
1076     if (mapping == null)
1077     {
1078       // mapping service failure
1079       return null;
1080     }
1081
1082     /*
1083      * save mapping for possible future re-use
1084      */
1085     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1086     if (!assemblyMappings.containsKey(key))
1087     {
1088       assemblyMappings.put(key, new HashMap<int[], int[]>());
1089     }
1090
1091     assemblyMappings.get(key).put(queryRange, mapping);
1092
1093     return mapping;
1094   }
1095
1096   /**
1097    * If we already have a 1:1 contiguous mapping which subsumes the given query
1098    * range, this method just calculates and returns the subset of that mapping,
1099    * else it returns null. In practical terms, if a gene has a contiguous
1100    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
1101    * subsidiary exons occupy unchanged relative positions, and just compute
1102    * these as offsets, rather than do another lookup of the mapping.
1103    * <p>
1104    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
1105    * simply remove this method or let it always return null.
1106    * <p>
1107    * Warning: many rapid calls to the /map service map result in a 429 overload
1108    * error response
1109    *
1110    * @param queryRange
1111    * @param chromosome
1112    * @param species
1113    * @param fromRef
1114    * @param toRef
1115    * @return
1116    */
1117   protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
1118           String species, String fromRef, String toRef)
1119   {
1120     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1121     if (assemblyMappings.containsKey(key))
1122     {
1123       Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
1124       for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
1125       {
1126         int[] fromRange = mappedRange.getKey();
1127         int[] toRange = mappedRange.getValue();
1128         if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
1129         {
1130           /*
1131            * mapping is 1:1 in length, so we trust it to have no discontinuities
1132            */
1133           if (MappingUtils.rangeContains(fromRange, queryRange))
1134           {
1135             /*
1136              * fromRange subsumes our query range
1137              */
1138             int offset = queryRange[0] - fromRange[0];
1139             int mappedRangeFrom = toRange[0] + offset;
1140             int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
1141             return new int[] { mappedRangeFrom, mappedRangeTo };
1142           }
1143         }
1144       }
1145     }
1146     return null;
1147   }
1148
1149   /**
1150    * Transfers the sequence feature to the target sequence, locating its start
1151    * and end range based on the mapping. Features which do not overlap the
1152    * target sequence are ignored.
1153    *
1154    * @param sf
1155    * @param targetSequence
1156    * @param mapping
1157    *          mapping from the feature's coordinates to the target sequence
1158    */
1159   protected void transferFeature(SequenceFeature sf,
1160           SequenceI targetSequence, MapList mapping)
1161   {
1162     int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
1163
1164     if (mappedRange != null)
1165     {
1166       String group = sf.getFeatureGroup();
1167       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
1168       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
1169       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
1170               group, sf.getScore());
1171       targetSequence.addSequenceFeature(copy);
1172     }
1173   }
1174
1175   /**
1176    * Formats a ranges map lookup key
1177    *
1178    * @param chromosome
1179    * @param species
1180    * @param fromRef
1181    * @param toRef
1182    * @return
1183    */
1184   protected static String makeRangesKey(String chromosome, String species,
1185           String fromRef, String toRef)
1186   {
1187     return species + EXCL + chromosome + EXCL + fromRef + EXCL
1188             + toRef;
1189   }
1190 }