src/jalview/io/vcf/VCFLoader.java

   1 package jalview.io.vcf;
   2
   3 import jalview.analysis.AlignmentUtils;
   4 import jalview.analysis.Dna;
   5 import jalview.api.AlignViewControllerGuiI;
   6 import jalview.bin.Cache;
   7 import jalview.datamodel.AlignmentI;
   8 import jalview.datamodel.DBRefEntry;
   9 import jalview.datamodel.GeneLociI;
  10 import jalview.datamodel.Mapping;
  11 import jalview.datamodel.SequenceFeature;
  12 import jalview.datamodel.SequenceI;
  13 import jalview.datamodel.features.FeatureAttributeType;
  14 import jalview.datamodel.features.FeatureSource;
  15 import jalview.datamodel.features.FeatureSources;
  16 import jalview.ext.ensembl.EnsemblMap;
  17 import jalview.ext.htsjdk.VCFReader;
  18 import jalview.io.gff.Gff3Helper;
  19 import jalview.io.gff.SequenceOntologyI;
  20 import jalview.util.MapList;
  21 import jalview.util.MappingUtils;
  22 import jalview.util.MessageManager;
  23
  24 import java.io.IOException;
  25 import java.util.ArrayList;
  26 import java.util.HashMap;
  27 import java.util.List;
  28 import java.util.Map;
  29 import java.util.Map.Entry;
  30 import java.util.regex.Pattern;
  31 import java.util.regex.PatternSyntaxException;
  32
  33 import htsjdk.samtools.SAMException;
  34 import htsjdk.samtools.SAMSequenceDictionary;
  35 import htsjdk.samtools.SAMSequenceRecord;
  36 import htsjdk.samtools.util.CloseableIterator;
  37 import htsjdk.variant.variantcontext.Allele;
  38 import htsjdk.variant.variantcontext.VariantContext;
  39 import htsjdk.variant.vcf.VCFHeader;
  40 import htsjdk.variant.vcf.VCFHeaderLine;
  41 import htsjdk.variant.vcf.VCFHeaderLineCount;
  42 import htsjdk.variant.vcf.VCFHeaderLineType;
  43 import htsjdk.variant.vcf.VCFInfoHeaderLine;
  44
  45 /**
  46  * A class to read VCF data (using the htsjdk) and add variants as sequence
  47  * features on dna and any related protein product sequences
  48  *
  49  * @author gmcarstairs
  50  */
  51 public class VCFLoader
  52 {
  53   /**
  54    * A class to model the mapping from sequence to VCF coordinates. Cases include
  55    * <ul>
  56    * <li>a direct 1:1 mapping where the sequence is one of the VCF contigs</li>
  57    * <li>a mapping of sequence to chromosomal coordinates, where sequence and VCF
  58    * use the same reference assembly</li>
  59    * <li>a modified mapping of sequence to chromosomal coordinates, where sequence
  60    * and VCF use different reference assembles</li>
  61    * </ul>
  62    */
  63   class VCFMap
  64   {
  65     final String chromosome;
  66
  67     final MapList map;
  68
  69     VCFMap(String chr, MapList m)
  70     {
  71       chromosome = chr;
  72       map = m;
  73     }
  74
  75     @Override
  76     public String toString()
  77     {
  78       return chromosome + ":" + map.toString();
  79     }
  80   }
  81
  82   /*
  83    * Lookup keys, and default values, for Preference entries that describe
  84    * patterns for VCF and VEP fields to capture
  85    */
  86   private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
  87
  88   private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
  89
  90   private static final String DEFAULT_VCF_FIELDS = ".*";
  91
  92   private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
  93
  94   /*
  95    * keys to fields of VEP CSQ consequence data
  96    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
  97    */
  98   private static final String CSQ_CONSEQUENCE_KEY = "Consequence";
  99   private static final String CSQ_ALLELE_KEY = "Allele";
 100   private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
 101   private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id
 102
 103   /*
 104    * default VCF INFO key for VEP consequence data
 105    * NB this can be overridden running VEP with --vcf_info_field
 106    * - we don't handle this case (require identifier to be CSQ)
 107    */
 108   private static final String CSQ_FIELD = "CSQ";
 109
 110   /*
 111    * separator for fields in consequence data is '|'
 112    */
 113   private static final String PIPE_REGEX = "\\|";
 114
 115   /*
 116    * key for Allele Frequency output by VEP
 117    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 118    */
 119   private static final String ALLELE_FREQUENCY_KEY = "AF";
 120
 121   /*
 122    * delimiter that separates multiple consequence data blocks
 123    */
 124   private static final String COMMA = ",";
 125
 126   /*
 127    * the feature group assigned to a VCF variant in Jalview
 128    */
 129   private static final String FEATURE_GROUP_VCF = "VCF";
 130
 131   /*
 132    * internal delimiter used to build keys for assemblyMappings
 133    *
 134    */
 135   private static final String EXCL = "!";
 136
 137   /*
 138    * the alignment we are associating VCF data with
 139    */
 140   private AlignmentI al;
 141
 142   /*
 143    * mappings between VCF and sequence reference assembly regions, as
 144    * key = "species!chromosome!fromAssembly!toAssembly
 145    * value = Map{fromRange, toRange}
 146    */
 147   private Map<String, Map<int[], int[]>> assemblyMappings;
 148
 149   /*
 150    * holds details of the VCF header lines (metadata)
 151    */
 152   private VCFHeader header;
 153
 154   /*
 155    * a Dictionary of contigs (if present) referenced in the VCF file
 156    */
 157   private SAMSequenceDictionary dictionary;
 158
 159   /*
 160    * the position (0...) of field in each block of
 161    * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
 162    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 163    */
 164   private int csqConsequenceFieldIndex = -1;
 165   private int csqAlleleFieldIndex = -1;
 166   private int csqAlleleNumberFieldIndex = -1;
 167   private int csqFeatureFieldIndex = -1;
 168
 169   // todo the same fields for SnpEff ANN data if wanted
 170   // see http://snpeff.sourceforge.net/SnpEff_manual.html#input
 171
 172   /*
 173    * a unique identifier under which to save metadata about feature
 174    * attributes (selected INFO field data)
 175    */
 176   private String sourceId;
 177
 178   /*
 179    * The INFO IDs of data that is both present in the VCF file, and
 180    * also matched by any filters for data of interest
 181    */
 182   List<String> vcfFieldsOfInterest;
 183
 184   /*
 185    * The field offsets and identifiers for VEP (CSQ) data that is both present
 186    * in the VCF file, and also matched by any filters for data of interest
 187    * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
 188    */
 189   Map<Integer, String> vepFieldsOfInterest;
 190
 191   /**
 192    * Constructor given an alignment context
 193    *
 194    * @param alignment
 195    */
 196   public VCFLoader(AlignmentI alignment)
 197   {
 198     al = alignment;
 199
 200     // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
 201     assemblyMappings = new HashMap<>();
 202   }
 203
 204   /**
 205    * Starts a new thread to query and load VCF variant data on to the alignment
 206    * <p>
 207    * This method is not thread safe - concurrent threads should use separate
 208    * instances of this class.
 209    *
 210    * @param filePath
 211    * @param gui
 212    */
 213   public void loadVCF(final String filePath,
 214           final AlignViewControllerGuiI gui)
 215   {
 216     if (gui != null)
 217     {
 218       gui.setStatus(MessageManager.getString("label.searching_vcf"));
 219     }
 220
 221     new Thread()
 222     {
 223
 224       @Override
 225       public void run()
 226       {
 227         VCFLoader.this.doLoad(filePath, gui);
 228       }
 229
 230     }.start();
 231   }
 232
 233   /**
 234    * Loads VCF on to an alignment - provided it can be related to one or more
 235    * sequence's chromosomal coordinates
 236    *
 237    * @param filePath
 238    * @param gui
 239    *          optional callback handler for messages
 240    */
 241   protected void doLoad(String filePath, AlignViewControllerGuiI gui)
 242   {
 243     VCFReader reader = null;
 244     try
 245     {
 246       // long start = System.currentTimeMillis();
 247       reader = new VCFReader(filePath);
 248
 249       header = reader.getFileHeader();
 250
 251       try
 252       {
 253         dictionary = header.getSequenceDictionary();
 254       } catch (SAMException e)
 255       {
 256         // ignore - thrown if any contig line lacks length info
 257       }
 258
 259       sourceId = filePath;
 260
 261       saveMetadata(sourceId);
 262
 263       /*
 264        * get offset of CSQ ALLELE_NUM and Feature if declared
 265        */
 266       parseCsqHeader();
 267
 268       VCFHeaderLine ref = header
 269               .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 270       String vcfAssembly = ref.getValue();
 271
 272       int varCount = 0;
 273       int seqCount = 0;
 274
 275       /*
 276        * query for VCF overlapping each sequence in turn
 277        */
 278       for (SequenceI seq : al.getSequences())
 279       {
 280         int added = loadSequenceVCF(seq, reader, vcfAssembly);
 281         if (added > 0)
 282         {
 283           seqCount++;
 284           varCount += added;
 285           transferAddedFeatures(seq);
 286         }
 287       }
 288       if (gui != null)
 289       {
 290         // long elapsed = System.currentTimeMillis() - start;
 291         String msg = MessageManager.formatMessage("label.added_vcf",
 292                 varCount, seqCount);
 293         gui.setStatus(msg);
 294         if (gui.getFeatureSettingsUI() != null)
 295         {
 296           gui.getFeatureSettingsUI().discoverAllFeatureData();
 297         }
 298       }
 299     } catch (Throwable e)
 300     {
 301       System.err.println("Error processing VCF: " + e.getMessage());
 302       e.printStackTrace();
 303       if (gui != null)
 304       {
 305         gui.setStatus("Error occurred - see console for details");
 306       }
 307     } finally
 308     {
 309       if (reader != null)
 310       {
 311         try
 312         {
 313           reader.close();
 314         } catch (IOException e)
 315         {
 316           // ignore
 317         }
 318       }
 319       header = null;
 320       dictionary = null;
 321     }
 322   }
 323
 324   /**
 325    * Reads metadata (such as INFO field descriptions and datatypes) and saves
 326    * them for future reference
 327    *
 328    * @param theSourceId
 329    */
 330   void saveMetadata(String theSourceId)
 331   {
 332     List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
 333             DEFAULT_VCF_FIELDS);
 334     vcfFieldsOfInterest = new ArrayList<>();
 335
 336     FeatureSource metadata = new FeatureSource(theSourceId);
 337
 338     for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
 339     {
 340       String attributeId = info.getID();
 341       String desc = info.getDescription();
 342       VCFHeaderLineType type = info.getType();
 343       FeatureAttributeType attType = null;
 344       switch (type)
 345       {
 346       case Character:
 347         attType = FeatureAttributeType.Character;
 348         break;
 349       case Flag:
 350         attType = FeatureAttributeType.Flag;
 351         break;
 352       case Float:
 353         attType = FeatureAttributeType.Float;
 354         break;
 355       case Integer:
 356         attType = FeatureAttributeType.Integer;
 357         break;
 358       case String:
 359         attType = FeatureAttributeType.String;
 360         break;
 361       }
 362       metadata.setAttributeName(attributeId, desc);
 363       metadata.setAttributeType(attributeId, attType);
 364
 365       if (isFieldWanted(attributeId, vcfFieldPatterns))
 366       {
 367         vcfFieldsOfInterest.add(attributeId);
 368       }
 369     }
 370
 371     FeatureSources.getInstance().addSource(theSourceId, metadata);
 372   }
 373
 374   /**
 375    * Answers true if the field id is matched by any of the filter patterns, else
 376    * false. Matching is against regular expression patterns, and is not
 377    * case-sensitive.
 378    *
 379    * @param id
 380    * @param filters
 381    * @return
 382    */
 383   private boolean isFieldWanted(String id, List<Pattern> filters)
 384   {
 385     for (Pattern p : filters)
 386     {
 387       if (p.matcher(id.toUpperCase()).matches())
 388       {
 389         return true;
 390       }
 391     }
 392     return false;
 393   }
 394
 395   /**
 396    * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
 397    * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
 398    * required for processing.
 399    * <p>
 400    * CSQ fields are declared in the CSQ INFO Description e.g.
 401    * <p>
 402    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
 403    */
 404   protected void parseCsqHeader()
 405   {
 406     List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
 407             DEFAULT_VEP_FIELDS);
 408     vepFieldsOfInterest = new HashMap<>();
 409
 410     VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
 411     if (csqInfo == null)
 412     {
 413       return;
 414     }
 415
 416     /*
 417      * parse out the pipe-separated list of CSQ fields; we assume here that
 418      * these form the last part of the description, and contain no spaces
 419      */
 420     String desc = csqInfo.getDescription();
 421     int spacePos = desc.lastIndexOf(" ");
 422     desc = desc.substring(spacePos + 1);
 423
 424     if (desc != null)
 425     {
 426       String[] format = desc.split(PIPE_REGEX);
 427       int index = 0;
 428       for (String field : format)
 429       {
 430         if (CSQ_CONSEQUENCE_KEY.equals(field))
 431         {
 432           csqConsequenceFieldIndex = index;
 433         }
 434         if (CSQ_ALLELE_NUM_KEY.equals(field))
 435         {
 436           csqAlleleNumberFieldIndex = index;
 437         }
 438         if (CSQ_ALLELE_KEY.equals(field))
 439         {
 440           csqAlleleFieldIndex = index;
 441         }
 442         if (CSQ_FEATURE_KEY.equals(field))
 443         {
 444           csqFeatureFieldIndex = index;
 445         }
 446
 447         if (isFieldWanted(field, vepFieldFilters))
 448         {
 449           vepFieldsOfInterest.put(index, field);
 450         }
 451
 452         index++;
 453       }
 454     }
 455   }
 456
 457   /**
 458    * Reads the Preference value for the given key, with default specified if no
 459    * preference set. The value is interpreted as a comma-separated list of
 460    * regular expressions, and converted into a list of compiled patterns ready
 461    * for matching. Patterns are forced to upper-case for non-case-sensitive
 462    * matching.
 463    * <p>
 464    * This supports user-defined filters for fields of interest to capture while
 465    * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
 466    * fields with an ID of AF, or starting with AC, would be matched.
 467    *
 468    * @param key
 469    * @param def
 470    * @return
 471    */
 472   private List<Pattern> getFieldMatchers(String key, String def)
 473   {
 474     String pref = Cache.getDefault(key, def);
 475     List<Pattern> patterns = new ArrayList<>();
 476     String[] tokens = pref.split(",");
 477     for (String token : tokens)
 478     {
 479       try
 480       {
 481       patterns.add(Pattern.compile(token.toUpperCase()));
 482       } catch (PatternSyntaxException e)
 483       {
 484         System.err.println("Invalid pattern ignored: " + token);
 485       }
 486     }
 487     return patterns;
 488   }
 489
 490   /**
 491    * Transfers VCF features to sequences to which this sequence has a mapping.
 492    * If the mapping is 3:1, computes peptide variants from nucleotide variants.
 493    *
 494    * @param seq
 495    */
 496   protected void transferAddedFeatures(SequenceI seq)
 497   {
 498     DBRefEntry[] dbrefs = seq.getDBRefs();
 499     if (dbrefs == null)
 500     {
 501       return;
 502     }
 503     for (DBRefEntry dbref : dbrefs)
 504     {
 505       Mapping mapping = dbref.getMap();
 506       if (mapping == null || mapping.getTo() == null)
 507       {
 508         continue;
 509       }
 510
 511       SequenceI mapTo = mapping.getTo();
 512       MapList map = mapping.getMap();
 513       if (map.getFromRatio() == 3)
 514       {
 515         /*
 516          * dna-to-peptide product mapping
 517          */
 518         AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
 519       }
 520       else
 521       {
 522         /*
 523          * nucleotide-to-nucleotide mapping e.g. transcript to CDS
 524          */
 525         List<SequenceFeature> features = seq.getFeatures()
 526                 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
 527         for (SequenceFeature sf : features)
 528         {
 529           if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
 530           {
 531             transferFeature(sf, mapTo, map);
 532           }
 533         }
 534       }
 535     }
 536   }
 537
 538   /**
 539    * Tries to add overlapping variants read from a VCF file to the given
 540    * sequence, and returns the number of variant features added. Note that this
 541    * requires the sequence to hold information as to its species, chromosomal
 542    * positions and reference assembly, in order to be able to map the VCF
 543    * variants to the sequence (or not)
 544    *
 545    * @param seq
 546    * @param reader
 547    * @param vcfAssembly
 548    * @return
 549    */
 550   protected int loadSequenceVCF(SequenceI seq, VCFReader reader,
 551           String vcfAssembly)
 552   {
 553     VCFMap vcfMap = getVcfMap(seq, vcfAssembly);
 554     if (vcfMap == null)
 555     {
 556       return 0;
 557     }
 558
 559     /*
 560      * work with the dataset sequence here
 561      */
 562     SequenceI dss = seq.getDatasetSequence();
 563     if (dss == null)
 564     {
 565       dss = seq;
 566     }
 567     return addVcfVariants(dss, reader, vcfMap, vcfAssembly);
 568   }
 569
 570   /**
 571    * Answers a map from sequence coordinates to VCF chromosome ranges
 572    *
 573    * @param seq
 574    * @param vcfAssembly
 575    * @return
 576    */
 577   private VCFMap getVcfMap(SequenceI seq, String vcfAssembly)
 578   {
 579     /*
 580      * simplest case: sequence has id and length matching a VCF contig
 581      */
 582     VCFMap vcfMap = null;
 583     if (dictionary != null)
 584     {
 585       vcfMap = getContigMap(seq);
 586     }
 587     if (vcfMap != null)
 588     {
 589       return vcfMap;
 590     }
 591
 592     /*
 593      * otherwise, map to VCF from chromosomal coordinates
 594      * of the sequence (if known)
 595      */
 596     GeneLociI seqCoords = seq.getGeneLoci();
 597     if (seqCoords == null)
 598     {
 599       Cache.log.warn(String.format(
 600               "Can't query VCF for %s as chromosome coordinates not known",
 601               seq.getName()));
 602       return null;
 603     }
 604
 605     String species = seqCoords.getSpeciesId();
 606     String chromosome = seqCoords.getChromosomeId();
 607     String seqRef = seqCoords.getAssemblyId();
 608     MapList map = seqCoords.getMap();
 609
 610     if (!vcfSpeciesMatchesSequence(vcfAssembly, species))
 611     {
 612       return null;
 613     }
 614
 615     if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef))
 616     {
 617       return new VCFMap(chromosome, map);
 618     }
 619
 620     if (!"GRCh38".equalsIgnoreCase(seqRef) // Ensembl
 621             || !vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
 622     {
 623       return null;
 624     }
 625
 626     /*
 627      * map chromosomal coordinates from sequence to VCF if the VCF
 628      * data has a different reference assembly to the sequence
 629      */
 630     // TODO generalise for cases other than GRCh38 -> GRCh37 !
 631     // - or get the user to choose in a dialog
 632
 633     List<int[]> toVcfRanges = new ArrayList<>();
 634     List<int[]> fromSequenceRanges = new ArrayList<>();
 635     String toRef = "GRCh37";
 636
 637     for (int[] range : map.getToRanges())
 638     {
 639       int[] fromRange = map.locateInFrom(range[0], range[1]);
 640       if (fromRange == null)
 641       {
 642         // corrupted map?!?
 643         continue;
 644       }
 645
 646       int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
 647               toRef);
 648       if (newRange == null)
 649       {
 650         Cache.log.error(
 651                 String.format("Failed to map %s:%s:%s:%d:%d to %s", species,
 652                         chromosome, seqRef, range[0], range[1], toRef));
 653         continue;
 654       }
 655       else
 656       {
 657         toVcfRanges.add(newRange);
 658         fromSequenceRanges.add(fromRange);
 659       }
 660     }
 661
 662     return new VCFMap(chromosome,
 663             new MapList(fromSequenceRanges, toVcfRanges, 1, 1));
 664   }
 665
 666   /**
 667    * If the sequence id matches a contig declared in the VCF file, and the
 668    * sequence length matches the contig length, then returns a 1:1 map of the
 669    * sequence to the contig, else returns null
 670    *
 671    * @param seq
 672    * @return
 673    */
 674   private VCFMap getContigMap(SequenceI seq)
 675   {
 676     String id = seq.getName();
 677     SAMSequenceRecord contig = dictionary.getSequence(id);
 678     if (contig != null)
 679     {
 680       int len = seq.getLength();
 681       if (len == contig.getSequenceLength())
 682       {
 683         MapList map = new MapList(new int[] { 1, len },
 684                 new int[]
 685                 { 1, len }, 1, 1);
 686         return new VCFMap(id, map);
 687       }
 688     }
 689     return null;
 690   }
 691
 692   /**
 693    * Answers true if we determine that the VCF data uses the same reference
 694    * assembly as the sequence, else false
 695    *
 696    * @param vcfAssembly
 697    * @param seqRef
 698    * @return
 699    */
 700   private boolean vcfAssemblyMatchesSequence(String vcfAssembly,
 701           String seqRef)
 702   {
 703     // TODO improve on this stub, which handles gnomAD and
 704     // hopes for the best for other cases
 705
 706     if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
 707             && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
 708     {
 709       return false;
 710     }
 711     return true;
 712   }
 713
 714   /**
 715    * Answers true if the species inferred from the VCF reference identifier
 716    * matches that for the sequence
 717    *
 718    * @param vcfAssembly
 719    * @param speciesId
 720    * @return
 721    */
 722   boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId)
 723   {
 724     // PROBLEM 1
 725     // there are many aliases for species - how to equate one with another?
 726     // PROBLEM 2
 727     // VCF ##reference header is an unstructured URI - how to extract species?
 728     // perhaps check if ref includes any (Ensembl) alias of speciesId??
 729     // TODO ask the user to confirm this??
 730
 731     if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example
 732             && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id
 733     {
 734       return true;
 735     }
 736
 737     if (vcfAssembly.contains("c_elegans") // VEP VCF response example
 738             && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl
 739     {
 740       return true;
 741     }
 742
 743     // this is not a sustainable solution...
 744
 745     return false;
 746   }
 747
 748   /**
 749    * Queries the VCF reader for any variants that overlap the mapped chromosome
 750    * ranges of the sequence, and adds as variant features. Returns the number of
 751    * overlapping variants found.
 752    *
 753    * @param seq
 754    * @param reader
 755    * @param map
 756    *          mapping from sequence to VCF coordinates
 757    * @param vcfAssembly
 758    *          the '##reference' identifier for the VCF reference assembly
 759    * @return
 760    */
 761   protected int addVcfVariants(SequenceI seq, VCFReader reader,
 762           VCFMap map, String vcfAssembly)
 763   {
 764     boolean forwardStrand = map.map.isToForwardStrand();
 765
 766     /*
 767      * query the VCF for overlaps of each contiguous chromosomal region
 768      */
 769     int count = 0;
 770
 771     for (int[] range : map.map.getToRanges())
 772     {
 773       int vcfStart = Math.min(range[0], range[1]);
 774       int vcfEnd = Math.max(range[0], range[1]);
 775       CloseableIterator<VariantContext> variants = reader
 776               .query(map.chromosome, vcfStart, vcfEnd);
 777       while (variants.hasNext())
 778       {
 779         VariantContext variant = variants.next();
 780
 781         int[] featureRange = map.map.locateInFrom(variant.getStart(),
 782                 variant.getEnd());
 783
 784         if (featureRange != null)
 785         {
 786           int featureStart = Math.min(featureRange[0], featureRange[1]);
 787           int featureEnd = Math.max(featureRange[0], featureRange[1]);
 788           count += addAlleleFeatures(seq, variant, featureStart, featureEnd,
 789                   forwardStrand);
 790         }
 791       }
 792       variants.close();
 793     }
 794
 795     return count;
 796   }
 797
 798   /**
 799    * A convenience method to get the AF value for the given alternate allele
 800    * index
 801    *
 802    * @param variant
 803    * @param alleleIndex
 804    * @return
 805    */
 806   protected float getAlleleFrequency(VariantContext variant, int alleleIndex)
 807   {
 808     float score = 0f;
 809     String attributeValue = getAttributeValue(variant,
 810             ALLELE_FREQUENCY_KEY, alleleIndex);
 811     if (attributeValue != null)
 812     {
 813       try
 814       {
 815         score = Float.parseFloat(attributeValue);
 816       } catch (NumberFormatException e)
 817       {
 818         // leave as 0
 819       }
 820     }
 821
 822     return score;
 823   }
 824
 825   /**
 826    * A convenience method to get an attribute value for an alternate allele
 827    *
 828    * @param variant
 829    * @param attributeName
 830    * @param alleleIndex
 831    * @return
 832    */
 833   protected String getAttributeValue(VariantContext variant,
 834           String attributeName, int alleleIndex)
 835   {
 836     Object att = variant.getAttribute(attributeName);
 837
 838     if (att instanceof String)
 839     {
 840       return (String) att;
 841     }
 842     else if (att instanceof ArrayList)
 843     {
 844       return ((List<String>) att).get(alleleIndex);
 845     }
 846
 847     return null;
 848   }
 849
 850   /**
 851    * Adds one variant feature for each allele in the VCF variant record, and
 852    * returns the number of features added.
 853    *
 854    * @param seq
 855    * @param variant
 856    * @param featureStart
 857    * @param featureEnd
 858    * @param forwardStrand
 859    * @return
 860    */
 861   protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
 862           int featureStart, int featureEnd, boolean forwardStrand)
 863   {
 864     int added = 0;
 865
 866     /*
 867      * Javadoc says getAlternateAlleles() imposes no order on the list returned
 868      * so we proceed defensively to get them in strict order
 869      */
 870     int altAlleleCount = variant.getAlternateAlleles().size();
 871     for (int i = 0; i < altAlleleCount; i++)
 872     {
 873       added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
 874               forwardStrand);
 875     }
 876     return added;
 877   }
 878
 879   /**
 880    * Inspects one allele and attempts to add a variant feature for it to the
 881    * sequence. The additional data associated with this allele is extracted to
 882    * store in the feature's key-value map. Answers the number of features added (0
 883    * or 1).
 884    *
 885    * @param seq
 886    * @param variant
 887    * @param altAlleleIndex
 888    *          (0, 1..)
 889    * @param featureStart
 890    * @param featureEnd
 891    * @param forwardStrand
 892    * @return
 893    */
 894   protected int addAlleleFeature(SequenceI seq, VariantContext variant,
 895           int altAlleleIndex, int featureStart, int featureEnd,
 896           boolean forwardStrand)
 897   {
 898     String reference = variant.getReference().getBaseString();
 899     Allele alt = variant.getAlternateAllele(altAlleleIndex);
 900     String allele = alt.getBaseString();
 901
 902     /*
 903      * insertion after a genomic base, if on reverse strand, has to be
 904      * converted to insertion of complement after the preceding position
 905      */
 906     int referenceLength = reference.length();
 907     if (!forwardStrand && allele.length() > referenceLength
 908             && allele.startsWith(reference))
 909     {
 910       featureStart -= referenceLength;
 911       featureEnd = featureStart;
 912       char insertAfter = seq.getCharAt(featureStart - seq.getStart());
 913       reference = Dna.reverseComplement(String.valueOf(insertAfter));
 914       allele = allele.substring(referenceLength) + reference;
 915     }
 916
 917     /*
 918      * build the ref,alt allele description e.g. "G,A", using the base
 919      * complement if the sequence is on the reverse strand
 920      */
 921     StringBuilder sb = new StringBuilder();
 922     sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
 923     sb.append(COMMA);
 924     sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
 925     String alleles = sb.toString(); // e.g. G,A
 926
 927     /*
 928      * pick out the consequence data (if any) that is for the current allele
 929      * and feature (transcript) that matches the current sequence
 930      */
 931     String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD,
 932             altAlleleIndex, csqAlleleFieldIndex,
 933             csqAlleleNumberFieldIndex, seq.getName().toLowerCase(),
 934             csqFeatureFieldIndex);
 935
 936     /*
 937      * pick out the ontology term for the consequence type
 938      */
 939     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 940     if (consequence != null)
 941     {
 942       type = getOntologyTerm(seq, variant, altAlleleIndex,
 943             consequence);
 944     }
 945
 946     float score = getAlleleFrequency(variant, altAlleleIndex);
 947
 948     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
 949             featureEnd, score, FEATURE_GROUP_VCF);
 950     sf.setSource(sourceId);
 951
 952     sf.setValue(Gff3Helper.ALLELES, alleles);
 953
 954     addAlleleProperties(variant, seq, sf, altAlleleIndex, consequence);
 955
 956     seq.addSequenceFeature(sf);
 957
 958     return 1;
 959   }
 960
 961   /**
 962    * Determines the Sequence Ontology term to use for the variant feature type in
 963    * Jalview. The default is 'sequence_variant', but a more specific term is used
 964    * if:
 965    * <ul>
 966    * <li>VEP (or SnpEff) Consequence annotation is included in the VCF</li>
 967    * <li>sequence id can be matched to VEP Feature (or SnpEff Feature_ID)</li>
 968    * </ul>
 969    *
 970    * @param seq
 971    * @param variant
 972    * @param altAlleleIndex
 973    * @param consequence
 974    * @return
 975    * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
 976    */
 977   String getOntologyTerm(SequenceI seq, VariantContext variant,
 978           int altAlleleIndex, String consequence)
 979   {
 980     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 981
 982     if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
 983     {
 984       /*
 985        * no Consequence data so we can't refine the ontology term
 986        */
 987       return type;
 988     }
 989
 990     /*
 991      * can we associate Consequence data with this allele and feature (transcript)?
 992      * if so, prefer the consequence term from that data
 993      */
 994     if (consequence != null)
 995     {
 996       String[] csqFields = consequence.split(PIPE_REGEX);
 997       if (csqFields.length > csqConsequenceFieldIndex)
 998       {
 999         type = csqFields[csqConsequenceFieldIndex];
1000       }
1001     }
1002     else
1003     {
1004       // todo the same for SnpEff consequence data matching if wanted
1005     }
1006
1007     /*
1008      * if of the form (e.g.) missense_variant&splice_region_variant,
1009      * just take the first ('most severe') consequence
1010      */
1011     if (type != null)
1012     {
1013       int pos = type.indexOf('&');
1014       if (pos > 0)
1015       {
1016         type = type.substring(0, pos);
1017       }
1018     }
1019     return type;
1020   }
1021
1022   /**
1023    * Returns matched consequence data if it can be found, else null.
1024    * <ul>
1025    * <li>inspects the VCF data for key 'vcfInfoId'</li>
1026    * <li>splits this on comma (to distinct consequences)</li>
1027    * <li>returns the first consequence (if any) where</li>
1028    * <ul>
1029    * <li>the allele matches the altAlleleIndex'th allele of variant</li>
1030    * <li>the feature matches the sequence name (e.g. transcript id)</li>
1031    * </ul>
1032    * </ul>
1033    * If matched, the consequence is returned (as pipe-delimited fields).
1034    *
1035    * @param variant
1036    * @param vcfInfoId
1037    * @param altAlleleIndex
1038    * @param alleleFieldIndex
1039    * @param alleleNumberFieldIndex
1040    * @param seqName
1041    * @param featureFieldIndex
1042    * @return
1043    */
1044   private String getConsequenceForAlleleAndFeature(VariantContext variant,
1045           String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
1046           int alleleNumberFieldIndex,
1047           String seqName, int featureFieldIndex)
1048   {
1049     if (alleleFieldIndex == -1 || featureFieldIndex == -1)
1050     {
1051       return null;
1052     }
1053     Object value = variant.getAttribute(vcfInfoId);
1054
1055     if (value == null || !(value instanceof List<?>))
1056     {
1057       return null;
1058     }
1059
1060     /*
1061      * inspect each consequence in turn (comma-separated blocks
1062      * extracted by htsjdk)
1063      */
1064     List<String> consequences = (List<String>) value;
1065
1066     for (String consequence : consequences)
1067     {
1068       String[] csqFields = consequence.split(PIPE_REGEX);
1069       if (csqFields.length > featureFieldIndex)
1070       {
1071         String featureIdentifier = csqFields[featureFieldIndex];
1072         if (featureIdentifier.length() > 4
1073                 && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
1074         {
1075           /*
1076            * feature (transcript) matched - now check for allele match
1077            */
1078           if (matchAllele(variant, altAlleleIndex, csqFields,
1079                   alleleFieldIndex, alleleNumberFieldIndex))
1080           {
1081             return consequence;
1082           }
1083         }
1084       }
1085     }
1086     return null;
1087   }
1088
1089   private boolean matchAllele(VariantContext variant, int altAlleleIndex,
1090           String[] csqFields, int alleleFieldIndex,
1091           int alleleNumberFieldIndex)
1092   {
1093     /*
1094      * if ALLELE_NUM is present, it must match altAlleleIndex
1095      * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
1096      */
1097     if (alleleNumberFieldIndex > -1)
1098     {
1099       if (csqFields.length <= alleleNumberFieldIndex)
1100       {
1101         return false;
1102       }
1103       String alleleNum = csqFields[alleleNumberFieldIndex];
1104       return String.valueOf(altAlleleIndex + 1).equals(alleleNum);
1105     }
1106
1107     /*
1108      * else consequence allele must match variant allele
1109      */
1110     if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex)
1111     {
1112       String csqAllele = csqFields[alleleFieldIndex];
1113       String vcfAllele = variant.getAlternateAllele(altAlleleIndex)
1114               .getBaseString();
1115       return csqAllele.equals(vcfAllele);
1116     }
1117     return false;
1118   }
1119
1120   /**
1121    * Add any allele-specific VCF key-value data to the sequence feature
1122    *
1123    * @param variant
1124    * @param seq
1125    * @param sf
1126    * @param altAlelleIndex
1127    *          (0, 1..)
1128    * @param consequence
1129    *          if not null, the consequence specific to this sequence (transcript
1130    *          feature) and allele
1131    */
1132   protected void addAlleleProperties(VariantContext variant, SequenceI seq,
1133           SequenceFeature sf, final int altAlelleIndex, String consequence)
1134   {
1135     Map<String, Object> atts = variant.getAttributes();
1136
1137     for (Entry<String, Object> att : atts.entrySet())
1138     {
1139       String key = att.getKey();
1140
1141       /*
1142        * extract Consequence data (if present) that we are able to
1143        * associated with the allele for this variant feature
1144        */
1145       if (CSQ_FIELD.equals(key))
1146       {
1147         addConsequences(variant, seq, sf, consequence);
1148         continue;
1149       }
1150
1151       /*
1152        * filter out fields we don't want to capture
1153        */
1154       if (!vcfFieldsOfInterest.contains(key))
1155       {
1156         continue;
1157       }
1158
1159       /*
1160        * we extract values for other data which are allele-specific;
1161        * these may be per alternate allele (INFO[key].Number = 'A')
1162        * or per allele including reference (INFO[key].Number = 'R')
1163        */
1164       VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key);
1165       if (infoHeader == null)
1166       {
1167         /*
1168          * can't be sure what data belongs to this allele, so
1169          * play safe and don't take any
1170          */
1171         continue;
1172       }
1173
1174       VCFHeaderLineCount number = infoHeader.getCountType();
1175       int index = altAlelleIndex;
1176       if (number == VCFHeaderLineCount.R)
1177       {
1178         /*
1179          * one value per allele including reference, so bump index
1180          * e.g. the 3rd value is for the  2nd alternate allele
1181          */
1182         index++;
1183       }
1184       else if (number != VCFHeaderLineCount.A)
1185       {
1186         /*
1187          * don't save other values as not allele-related
1188          */
1189         continue;
1190       }
1191
1192       /*
1193        * take the index'th value
1194        */
1195       String value = getAttributeValue(variant, key, index);
1196       if (value != null)
1197       {
1198         sf.setValue(key, value);
1199       }
1200     }
1201   }
1202
1203   /**
1204    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
1205    * feature.
1206    * <p>
1207    * If <code>myConsequence</code> is not null, then this is the specific
1208    * consequence data (pipe-delimited fields) that is for the current allele and
1209    * transcript (sequence) being processed)
1210    *
1211    * @param variant
1212    * @param seq
1213    * @param sf
1214    * @param myConsequence
1215    */
1216   protected void addConsequences(VariantContext variant, SequenceI seq,
1217           SequenceFeature sf, String myConsequence)
1218   {
1219     Object value = variant.getAttribute(CSQ_FIELD);
1220     // TODO if CSQ not present, try ANN (for SnpEff consequence data)?
1221
1222     if (value == null || !(value instanceof List<?>))
1223     {
1224       return;
1225     }
1226
1227     List<String> consequences = (List<String>) value;
1228
1229     /*
1230      * inspect CSQ consequences; restrict to the consequence
1231      * associated with the current transcript (Feature)
1232      */
1233     Map<String, String> csqValues = new HashMap<>();
1234
1235     for (String consequence : consequences)
1236     {
1237       if (myConsequence == null || myConsequence.equals(consequence))
1238       {
1239         String[] csqFields = consequence.split(PIPE_REGEX);
1240
1241         /*
1242          * inspect individual fields of this consequence, copying non-null
1243          * values which are 'fields of interest'
1244          */
1245         int i = 0;
1246         for (String field : csqFields)
1247         {
1248           if (field != null && field.length() > 0)
1249           {
1250             String id = vepFieldsOfInterest.get(i);
1251             if (id != null)
1252             {
1253               csqValues.put(id, field);
1254             }
1255           }
1256           i++;
1257         }
1258       }
1259     }
1260
1261     if (!csqValues.isEmpty())
1262     {
1263       sf.setValue(CSQ_FIELD, csqValues);
1264     }
1265   }
1266
1267   /**
1268    * A convenience method to complement a dna base and return the string value
1269    * of its complement
1270    *
1271    * @param reference
1272    * @return
1273    */
1274   protected String complement(byte[] reference)
1275   {
1276     return String.valueOf(Dna.getComplement((char) reference[0]));
1277   }
1278
1279   /**
1280    * Determines the location of the query range (chromosome positions) in a
1281    * different reference assembly.
1282    * <p>
1283    * If the range is just a subregion of one for which we already have a mapping
1284    * (for example, an exon sub-region of a gene), then the mapping is just
1285    * computed arithmetically.
1286    * <p>
1287    * Otherwise, calls the Ensembl REST service that maps from one assembly
1288    * reference's coordinates to another's
1289    *
1290    * @param queryRange
1291    *          start-end chromosomal range in 'fromRef' coordinates
1292    * @param chromosome
1293    * @param species
1294    * @param fromRef
1295    *          assembly reference for the query coordinates
1296    * @param toRef
1297    *          assembly reference we wish to translate to
1298    * @return the start-end range in 'toRef' coordinates
1299    */
1300   protected int[] mapReferenceRange(int[] queryRange, String chromosome,
1301           String species, String fromRef, String toRef)
1302   {
1303     /*
1304      * first try shorcut of computing the mapping as a subregion of one
1305      * we already have (e.g. for an exon, if we have the gene mapping)
1306      */
1307     int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
1308             species, fromRef, toRef);
1309     if (mappedRange != null)
1310     {
1311       return mappedRange;
1312     }
1313
1314     /*
1315      * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
1316      */
1317     EnsemblMap mapper = new EnsemblMap();
1318     int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
1319             toRef, queryRange);
1320
1321     if (mapping == null)
1322     {
1323       // mapping service failure
1324       return null;
1325     }
1326
1327     /*
1328      * save mapping for possible future re-use
1329      */
1330     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1331     if (!assemblyMappings.containsKey(key))
1332     {
1333       assemblyMappings.put(key, new HashMap<int[], int[]>());
1334     }
1335
1336     assemblyMappings.get(key).put(queryRange, mapping);
1337
1338     return mapping;
1339   }
1340
1341   /**
1342    * If we already have a 1:1 contiguous mapping which subsumes the given query
1343    * range, this method just calculates and returns the subset of that mapping,
1344    * else it returns null. In practical terms, if a gene has a contiguous
1345    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
1346    * subsidiary exons occupy unchanged relative positions, and just compute
1347    * these as offsets, rather than do another lookup of the mapping.
1348    * <p>
1349    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
1350    * simply remove this method or let it always return null.
1351    * <p>
1352    * Warning: many rapid calls to the /map service map result in a 429 overload
1353    * error response
1354    *
1355    * @param queryRange
1356    * @param chromosome
1357    * @param species
1358    * @param fromRef
1359    * @param toRef
1360    * @return
1361    */
1362   protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
1363           String species, String fromRef, String toRef)
1364   {
1365     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1366     if (assemblyMappings.containsKey(key))
1367     {
1368       Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
1369       for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
1370       {
1371         int[] fromRange = mappedRange.getKey();
1372         int[] toRange = mappedRange.getValue();
1373         if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
1374         {
1375           /*
1376            * mapping is 1:1 in length, so we trust it to have no discontinuities
1377            */
1378           if (MappingUtils.rangeContains(fromRange, queryRange))
1379           {
1380             /*
1381              * fromRange subsumes our query range
1382              */
1383             int offset = queryRange[0] - fromRange[0];
1384             int mappedRangeFrom = toRange[0] + offset;
1385             int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
1386             return new int[] { mappedRangeFrom, mappedRangeTo };
1387           }
1388         }
1389       }
1390     }
1391     return null;
1392   }
1393
1394   /**
1395    * Transfers the sequence feature to the target sequence, locating its start
1396    * and end range based on the mapping. Features which do not overlap the
1397    * target sequence are ignored.
1398    *
1399    * @param sf
1400    * @param targetSequence
1401    * @param mapping
1402    *          mapping from the feature's coordinates to the target sequence
1403    */
1404   protected void transferFeature(SequenceFeature sf,
1405           SequenceI targetSequence, MapList mapping)
1406   {
1407     int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
1408
1409     if (mappedRange != null)
1410     {
1411       String group = sf.getFeatureGroup();
1412       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
1413       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
1414       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
1415               group, sf.getScore());
1416       targetSequence.addSequenceFeature(copy);
1417     }
1418   }
1419
1420   /**
1421    * Formats a ranges map lookup key
1422    *
1423    * @param chromosome
1424    * @param species
1425    * @param fromRef
1426    * @param toRef
1427    * @return
1428    */
1429   protected static String makeRangesKey(String chromosome, String species,
1430           String fromRef, String toRef)
1431   {
1432     return species + EXCL + chromosome + EXCL + fromRef + EXCL
1433             + toRef;
1434   }
1435 }