src/jalview/io/vcf/VCFLoader.java

   1 package jalview.io.vcf;
   2
   3 import jalview.analysis.AlignmentUtils;
   4 import jalview.analysis.Dna;
   5 import jalview.api.AlignViewControllerGuiI;
   6 import jalview.bin.Cache;
   7 import jalview.datamodel.AlignmentI;
   8 import jalview.datamodel.DBRefEntry;
   9 import jalview.datamodel.GeneLociI;
  10 import jalview.datamodel.Mapping;
  11 import jalview.datamodel.SequenceFeature;
  12 import jalview.datamodel.SequenceI;
  13 import jalview.datamodel.features.FeatureAttributeType;
  14 import jalview.datamodel.features.FeatureSource;
  15 import jalview.datamodel.features.FeatureSources;
  16 import jalview.ext.ensembl.EnsemblMap;
  17 import jalview.ext.htsjdk.VCFReader;
  18 import jalview.io.gff.Gff3Helper;
  19 import jalview.io.gff.SequenceOntologyI;
  20 import jalview.util.MapList;
  21 import jalview.util.MappingUtils;
  22 import jalview.util.MessageManager;
  23
  24 import java.io.IOException;
  25 import java.util.ArrayList;
  26 import java.util.HashMap;
  27 import java.util.List;
  28 import java.util.Map;
  29 import java.util.Map.Entry;
  30 import java.util.regex.Pattern;
  31 import java.util.regex.PatternSyntaxException;
  32
  33 import htsjdk.samtools.SAMException;
  34 import htsjdk.samtools.SAMSequenceDictionary;
  35 import htsjdk.samtools.SAMSequenceRecord;
  36 import htsjdk.samtools.util.CloseableIterator;
  37 import htsjdk.variant.variantcontext.Allele;
  38 import htsjdk.variant.variantcontext.VariantContext;
  39 import htsjdk.variant.vcf.VCFHeader;
  40 import htsjdk.variant.vcf.VCFHeaderLine;
  41 import htsjdk.variant.vcf.VCFHeaderLineCount;
  42 import htsjdk.variant.vcf.VCFHeaderLineType;
  43 import htsjdk.variant.vcf.VCFInfoHeaderLine;
  44
  45 /**
  46  * A class to read VCF data (using the htsjdk) and add variants as sequence
  47  * features on dna and any related protein product sequences
  48  *
  49  * @author gmcarstairs
  50  */
  51 public class VCFLoader
  52 {
  53   /**
  54    * A class to model the mapping from sequence to VCF coordinates. Cases include
  55    * <ul>
  56    * <li>a direct 1:1 mapping where the sequence is one of the VCF contigs</li>
  57    * <li>a mapping of sequence to chromosomal coordinates, where sequence and VCF
  58    * use the same reference assembly</li>
  59    * <li>a modified mapping of sequence to chromosomal coordinates, where sequence
  60    * and VCF use different reference assembles</li>
  61    * </ul>
  62    */
  63   class VCFMap
  64   {
  65     final String chromosome;
  66
  67     final MapList map;
  68
  69     VCFMap(String chr, MapList m)
  70     {
  71       chromosome = chr;
  72       map = m;
  73     }
  74
  75     @Override
  76     public String toString()
  77     {
  78       return chromosome + ":" + map.toString();
  79     }
  80   }
  81
  82   /*
  83    * Lookup keys, and default values, for Preference entries that describe
  84    * patterns for VCF and VEP fields to capture
  85    */
  86   private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
  87
  88   private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
  89
  90   private static final String DEFAULT_VCF_FIELDS = ".*";
  91
  92   private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
  93
  94   /*
  95    * keys to fields of VEP CSQ consequence data
  96    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
  97    */
  98   private static final String CSQ_CONSEQUENCE_KEY = "Consequence";
  99   private static final String CSQ_ALLELE_KEY = "Allele";
 100   private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
 101   private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id
 102
 103   /*
 104    * default VCF INFO key for VEP consequence data
 105    * NB this can be overridden running VEP with --vcf_info_field
 106    * - we don't handle this case (require identifier to be CSQ)
 107    */
 108   private static final String CSQ_FIELD = "CSQ";
 109
 110   /*
 111    * separator for fields in consequence data is '|'
 112    */
 113   private static final String PIPE_REGEX = "\\|";
 114
 115   /*
 116    * key for Allele Frequency output by VEP
 117    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 118    */
 119   private static final String ALLELE_FREQUENCY_KEY = "AF";
 120
 121   /*
 122    * delimiter that separates multiple consequence data blocks
 123    */
 124   private static final String COMMA = ",";
 125
 126   /*
 127    * the feature group assigned to a VCF variant in Jalview
 128    */
 129   private static final String FEATURE_GROUP_VCF = "VCF";
 130
 131   /*
 132    * internal delimiter used to build keys for assemblyMappings
 133    *
 134    */
 135   private static final String EXCL = "!";
 136
 137   /*
 138    * the alignment we are associating VCF data with
 139    */
 140   private AlignmentI al;
 141
 142   /*
 143    * mappings between VCF and sequence reference assembly regions, as
 144    * key = "species!chromosome!fromAssembly!toAssembly
 145    * value = Map{fromRange, toRange}
 146    */
 147   private Map<String, Map<int[], int[]>> assemblyMappings;
 148
 149   /*
 150    * holds details of the VCF header lines (metadata)
 151    */
 152   private VCFHeader header;
 153
 154   /*
 155    * a Dictionary of contigs (if present) referenced in the VCF file
 156    */
 157   private SAMSequenceDictionary dictionary;
 158
 159   /*
 160    * the position (0...) of field in each block of
 161    * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
 162    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 163    */
 164   private int csqConsequenceFieldIndex = -1;
 165   private int csqAlleleFieldIndex = -1;
 166   private int csqAlleleNumberFieldIndex = -1;
 167   private int csqFeatureFieldIndex = -1;
 168
 169   // todo the same fields for SnpEff ANN data if wanted
 170   // see http://snpeff.sourceforge.net/SnpEff_manual.html#input
 171
 172   /*
 173    * a unique identifier under which to save metadata about feature
 174    * attributes (selected INFO field data)
 175    */
 176   private String sourceId;
 177
 178   /*
 179    * The INFO IDs of data that is both present in the VCF file, and
 180    * also matched by any filters for data of interest
 181    */
 182   List<String> vcfFieldsOfInterest;
 183
 184   /*
 185    * The field offsets and identifiers for VEP (CSQ) data that is both present
 186    * in the VCF file, and also matched by any filters for data of interest
 187    * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
 188    */
 189   Map<Integer, String> vepFieldsOfInterest;
 190
 191   /**
 192    * Constructor given an alignment context
 193    *
 194    * @param alignment
 195    */
 196   public VCFLoader(AlignmentI alignment)
 197   {
 198     al = alignment;
 199
 200     // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
 201     assemblyMappings = new HashMap<>();
 202   }
 203
 204   /**
 205    * Starts a new thread to query and load VCF variant data on to the alignment
 206    * <p>
 207    * This method is not thread safe - concurrent threads should use separate
 208    * instances of this class.
 209    *
 210    * @param filePath
 211    * @param gui
 212    */
 213   public void loadVCF(final String filePath,
 214           final AlignViewControllerGuiI gui)
 215   {
 216     if (gui != null)
 217     {
 218       gui.setStatus(MessageManager.getString("label.searching_vcf"));
 219     }
 220
 221     new Thread()
 222     {
 223
 224       @Override
 225       public void run()
 226       {
 227         VCFLoader.this.doLoad(filePath, gui);
 228       }
 229
 230     }.start();
 231   }
 232
 233   /**
 234    * Loads VCF on to an alignment - provided it can be related to one or more
 235    * sequence's chromosomal coordinates
 236    *
 237    * @param filePath
 238    * @param gui
 239    *          optional callback handler for messages
 240    */
 241   protected void doLoad(String filePath, AlignViewControllerGuiI gui)
 242   {
 243     VCFReader reader = null;
 244     try
 245     {
 246       // long start = System.currentTimeMillis();
 247       reader = new VCFReader(filePath);
 248
 249       header = reader.getFileHeader();
 250
 251       try
 252       {
 253         dictionary = header.getSequenceDictionary();
 254       } catch (SAMException e)
 255       {
 256         // ignore - thrown if any contig line lacks length info
 257       }
 258
 259       sourceId = filePath;
 260
 261       saveMetadata(sourceId);
 262
 263       /*
 264        * get offset of CSQ ALLELE_NUM and Feature if declared
 265        */
 266       parseCsqHeader();
 267
 268       VCFHeaderLine ref = header
 269               .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 270       String vcfAssembly = ref.getValue();
 271
 272       int varCount = 0;
 273       int seqCount = 0;
 274
 275       /*
 276        * query for VCF overlapping each sequence in turn
 277        */
 278       for (SequenceI seq : al.getSequences())
 279       {
 280         int added = loadSequenceVCF(seq, reader, vcfAssembly);
 281         if (added > 0)
 282         {
 283           seqCount++;
 284           varCount += added;
 285           transferAddedFeatures(seq);
 286         }
 287       }
 288       if (gui != null)
 289       {
 290         // long elapsed = System.currentTimeMillis() - start;
 291         String msg = MessageManager.formatMessage("label.added_vcf",
 292                 varCount, seqCount);
 293         gui.setStatus(msg);
 294         if (gui.getFeatureSettingsUI() != null)
 295         {
 296           gui.getFeatureSettingsUI().discoverAllFeatureData();
 297         }
 298       }
 299     } catch (Throwable e)
 300     {
 301       System.err.println("Error processing VCF: " + e.getMessage());
 302       e.printStackTrace();
 303       if (gui != null)
 304       {
 305         gui.setStatus("Error occurred - see console for details");
 306       }
 307     } finally
 308     {
 309       if (reader != null)
 310       {
 311         try
 312         {
 313           reader.close();
 314         } catch (IOException e)
 315         {
 316           // ignore
 317         }
 318       }
 319       header = null;
 320       dictionary = null;
 321     }
 322   }
 323
 324   /**
 325    * Reads metadata (such as INFO field descriptions and datatypes) and saves
 326    * them for future reference
 327    *
 328    * @param theSourceId
 329    */
 330   void saveMetadata(String theSourceId)
 331   {
 332     List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
 333             DEFAULT_VCF_FIELDS);
 334     vcfFieldsOfInterest = new ArrayList<>();
 335
 336     FeatureSource metadata = new FeatureSource(theSourceId);
 337
 338     for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
 339     {
 340       String attributeId = info.getID();
 341       String desc = info.getDescription();
 342       VCFHeaderLineType type = info.getType();
 343       FeatureAttributeType attType = null;
 344       switch (type)
 345       {
 346       case Character:
 347         attType = FeatureAttributeType.Character;
 348         break;
 349       case Flag:
 350         attType = FeatureAttributeType.Flag;
 351         break;
 352       case Float:
 353         attType = FeatureAttributeType.Float;
 354         break;
 355       case Integer:
 356         attType = FeatureAttributeType.Integer;
 357         break;
 358       case String:
 359         attType = FeatureAttributeType.String;
 360         break;
 361       }
 362       metadata.setAttributeName(attributeId, desc);
 363       metadata.setAttributeType(attributeId, attType);
 364
 365       if (isFieldWanted(attributeId, vcfFieldPatterns))
 366       {
 367         vcfFieldsOfInterest.add(attributeId);
 368       }
 369     }
 370
 371     FeatureSources.getInstance().addSource(theSourceId, metadata);
 372   }
 373
 374   /**
 375    * Answers true if the field id is matched by any of the filter patterns, else
 376    * false. Matching is against regular expression patterns, and is not
 377    * case-sensitive.
 378    *
 379    * @param id
 380    * @param filters
 381    * @return
 382    */
 383   private boolean isFieldWanted(String id, List<Pattern> filters)
 384   {
 385     for (Pattern p : filters)
 386     {
 387       if (p.matcher(id.toUpperCase()).matches())
 388       {
 389         return true;
 390       }
 391     }
 392     return false;
 393   }
 394
 395   /**
 396    * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
 397    * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
 398    * required for processing.
 399    * <p>
 400    * CSQ fields are declared in the CSQ INFO Description e.g.
 401    * <p>
 402    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
 403    */
 404   protected void parseCsqHeader()
 405   {
 406     List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
 407             DEFAULT_VEP_FIELDS);
 408     vepFieldsOfInterest = new HashMap<>();
 409
 410     VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
 411     if (csqInfo == null)
 412     {
 413       return;
 414     }
 415
 416     /*
 417      * parse out the pipe-separated list of CSQ fields; we assume here that
 418      * these form the last part of the description, and contain no spaces
 419      */
 420     String desc = csqInfo.getDescription();
 421     int spacePos = desc.lastIndexOf(" ");
 422     desc = desc.substring(spacePos + 1);
 423
 424     if (desc != null)
 425     {
 426       String[] format = desc.split(PIPE_REGEX);
 427       int index = 0;
 428       for (String field : format)
 429       {
 430         if (CSQ_CONSEQUENCE_KEY.equals(field))
 431         {
 432           csqConsequenceFieldIndex = index;
 433         }
 434         if (CSQ_ALLELE_NUM_KEY.equals(field))
 435         {
 436           csqAlleleNumberFieldIndex = index;
 437         }
 438         if (CSQ_ALLELE_KEY.equals(field))
 439         {
 440           csqAlleleFieldIndex = index;
 441         }
 442         if (CSQ_FEATURE_KEY.equals(field))
 443         {
 444           csqFeatureFieldIndex = index;
 445         }
 446
 447         if (isFieldWanted(field, vepFieldFilters))
 448         {
 449           vepFieldsOfInterest.put(index, field);
 450         }
 451
 452         index++;
 453       }
 454     }
 455   }
 456
 457   /**
 458    * Reads the Preference value for the given key, with default specified if no
 459    * preference set. The value is interpreted as a comma-separated list of
 460    * regular expressions, and converted into a list of compiled patterns ready
 461    * for matching. Patterns are forced to upper-case for non-case-sensitive
 462    * matching.
 463    * <p>
 464    * This supports user-defined filters for fields of interest to capture while
 465    * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
 466    * fields with an ID of AF, or starting with AC, would be matched.
 467    *
 468    * @param key
 469    * @param def
 470    * @return
 471    */
 472   private List<Pattern> getFieldMatchers(String key, String def)
 473   {
 474     String pref = Cache.getDefault(key, def);
 475     List<Pattern> patterns = new ArrayList<>();
 476     String[] tokens = pref.split(",");
 477     for (String token : tokens)
 478     {
 479       try
 480       {
 481       patterns.add(Pattern.compile(token.toUpperCase()));
 482       } catch (PatternSyntaxException e)
 483       {
 484         System.err.println("Invalid pattern ignored: " + token);
 485       }
 486     }
 487     return patterns;
 488   }
 489
 490   /**
 491    * Transfers VCF features to sequences to which this sequence has a mapping.
 492    * If the mapping is 3:1, computes peptide variants from nucleotide variants.
 493    *
 494    * @param seq
 495    */
 496   protected void transferAddedFeatures(SequenceI seq)
 497   {
 498     DBRefEntry[] dbrefs = seq.getDBRefs();
 499     if (dbrefs == null)
 500     {
 501       return;
 502     }
 503     for (DBRefEntry dbref : dbrefs)
 504     {
 505       Mapping mapping = dbref.getMap();
 506       if (mapping == null || mapping.getTo() == null)
 507       {
 508         continue;
 509       }
 510
 511       SequenceI mapTo = mapping.getTo();
 512       MapList map = mapping.getMap();
 513       if (map.getFromRatio() == 3)
 514       {
 515         /*
 516          * dna-to-peptide product mapping
 517          */
 518         AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
 519       }
 520       else
 521       {
 522         /*
 523          * nucleotide-to-nucleotide mapping e.g. transcript to CDS
 524          */
 525         List<SequenceFeature> features = seq.getFeatures()
 526                 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
 527         for (SequenceFeature sf : features)
 528         {
 529           if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
 530           {
 531             transferFeature(sf, mapTo, map);
 532           }
 533         }
 534       }
 535     }
 536   }
 537
 538   /**
 539    * Tries to add overlapping variants read from a VCF file to the given
 540    * sequence, and returns the number of variant features added. Note that this
 541    * requires the sequence to hold information as to its species, chromosomal
 542    * positions and reference assembly, in order to be able to map the VCF
 543    * variants to the sequence (or not)
 544    *
 545    * @param seq
 546    * @param reader
 547    * @param vcfAssembly
 548    * @return
 549    */
 550   protected int loadSequenceVCF(SequenceI seq, VCFReader reader,
 551           String vcfAssembly)
 552   {
 553     VCFMap vcfMap = getVcfMap(seq, vcfAssembly);
 554     if (vcfMap == null)
 555     {
 556       return 0;
 557     }
 558
 559     return addVcfVariants(seq, reader, vcfMap, vcfAssembly);
 560   }
 561
 562   /**
 563    * Answers a map from sequence coordinates to VCF chromosome ranges
 564    *
 565    * @param seq
 566    * @param vcfAssembly
 567    * @return
 568    */
 569   private VCFMap getVcfMap(SequenceI seq, String vcfAssembly)
 570   {
 571     /*
 572      * simplest case: sequence has id and length matching a VCF contig
 573      */
 574     VCFMap vcfMap = null;
 575     if (dictionary != null)
 576     {
 577       vcfMap = getContigMap(seq);
 578     }
 579     if (vcfMap != null)
 580     {
 581       return vcfMap;
 582     }
 583
 584     /*
 585      * otherwise, map to VCF from chromosomal coordinates
 586      * of the sequence (if known)
 587      */
 588     GeneLociI seqCoords = seq.getGeneLoci();
 589     if (seqCoords == null)
 590     {
 591       Cache.log.warn(String.format(
 592               "Can't query VCF for %s as chromosome coordinates not known",
 593               seq.getName()));
 594       return null;
 595     }
 596
 597     String species = seqCoords.getSpeciesId();
 598     String chromosome = seqCoords.getChromosomeId();
 599     String seqRef = seqCoords.getAssemblyId();
 600     MapList map = seqCoords.getMap();
 601
 602     if (!vcfSpeciesMatchesSequence(vcfAssembly, species))
 603     {
 604       return null;
 605     }
 606
 607     if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef))
 608     {
 609       return new VCFMap(chromosome, map);
 610     }
 611
 612     if (!"GRCh38".equalsIgnoreCase(seqRef) // Ensembl
 613             || !vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
 614     {
 615       return null;
 616     }
 617
 618     /*
 619      * map chromosomal coordinates from sequence to VCF if the VCF
 620      * data has a different reference assembly to the sequence
 621      */
 622     // TODO generalise for cases other than GRCh38 -> GRCh37 !
 623     // - or get the user to choose in a dialog
 624
 625     List<int[]> toVcfRanges = new ArrayList<>();
 626     List<int[]> fromSequenceRanges = new ArrayList<>();
 627     String toRef = "GRCh37";
 628
 629     for (int[] range : map.getToRanges())
 630     {
 631       int[] fromRange = map.locateInFrom(range[0], range[1]);
 632       if (fromRange == null)
 633       {
 634         // corrupted map?!?
 635         continue;
 636       }
 637
 638       int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
 639               toRef);
 640       if (newRange == null)
 641       {
 642         Cache.log.error(
 643                 String.format("Failed to map %s:%s:%s:%d:%d to %s", species,
 644                         chromosome, seqRef, range[0], range[1], toRef));
 645         continue;
 646       }
 647       else
 648       {
 649         toVcfRanges.add(newRange);
 650         fromSequenceRanges.add(fromRange);
 651       }
 652     }
 653
 654     return new VCFMap(chromosome,
 655             new MapList(fromSequenceRanges, toVcfRanges, 1, 1));
 656   }
 657
 658   /**
 659    * If the sequence id matches a contig declared in the VCF file, and the
 660    * sequence length matches the contig length, then returns a 1:1 map of the
 661    * sequence to the contig, else returns null
 662    *
 663    * @param seq
 664    * @return
 665    */
 666   private VCFMap getContigMap(SequenceI seq)
 667   {
 668     String id = seq.getName();
 669     SAMSequenceRecord contig = dictionary.getSequence(id);
 670     if (contig != null)
 671     {
 672       int len = seq.getLength();
 673       if (len == contig.getSequenceLength())
 674       {
 675         MapList map = new MapList(new int[] { 1, len },
 676                 new int[]
 677                 { 1, len }, 1, 1);
 678         return new VCFMap(id, map);
 679       }
 680     }
 681     return null;
 682   }
 683
 684   /**
 685    * Answers true if we determine that the VCF data uses the same reference
 686    * assembly as the sequence, else false
 687    *
 688    * @param vcfAssembly
 689    * @param seqRef
 690    * @return
 691    */
 692   private boolean vcfAssemblyMatchesSequence(String vcfAssembly,
 693           String seqRef)
 694   {
 695     // TODO improve on this stub, which handles gnomAD and
 696     // hopes for the best for other cases
 697
 698     if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
 699             && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
 700     {
 701       return false;
 702     }
 703     return true;
 704   }
 705
 706   /**
 707    * Answers true if the species inferred from the VCF reference identifier
 708    * matches that for the sequence
 709    *
 710    * @param vcfAssembly
 711    * @param speciesId
 712    * @return
 713    */
 714   boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId)
 715   {
 716     // PROBLEM 1
 717     // there are many aliases for species - how to equate one with another?
 718     // PROBLEM 2
 719     // VCF ##reference header is an unstructured URI - how to extract species?
 720     // perhaps check if ref includes any (Ensembl) alias of speciesId??
 721     // TODO ask the user to confirm this??
 722
 723     if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example
 724             && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id
 725     {
 726       return true;
 727     }
 728
 729     if (vcfAssembly.contains("c_elegans") // VEP VCF response example
 730             && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl
 731     {
 732       return true;
 733     }
 734
 735     // this is not a sustainable solution...
 736
 737     return false;
 738   }
 739
 740   /**
 741    * Queries the VCF reader for any variants that overlap the mapped chromosome
 742    * ranges of the sequence, and adds as variant features. Returns the number of
 743    * overlapping variants found.
 744    *
 745    * @param seq
 746    * @param reader
 747    * @param map
 748    *          mapping from sequence to VCF coordinates
 749    * @param vcfAssembly
 750    *          the '##reference' identifier for the VCF reference assembly
 751    * @return
 752    */
 753   protected int addVcfVariants(SequenceI seq, VCFReader reader,
 754           VCFMap map, String vcfAssembly)
 755   {
 756     boolean forwardStrand = map.map.isToForwardStrand();
 757
 758     /*
 759      * query the VCF for overlaps of each contiguous chromosomal region
 760      */
 761     int count = 0;
 762
 763     for (int[] range : map.map.getToRanges())
 764     {
 765       int vcfStart = Math.min(range[0], range[1]);
 766       int vcfEnd = Math.max(range[0], range[1]);
 767       CloseableIterator<VariantContext> variants = reader
 768               .query(map.chromosome, vcfStart, vcfEnd);
 769       while (variants.hasNext())
 770       {
 771         VariantContext variant = variants.next();
 772
 773         int[] featureRange = map.map.locateInFrom(variant.getStart(),
 774                 variant.getEnd());
 775
 776         if (featureRange != null)
 777         {
 778           int featureStart = Math.min(featureRange[0], featureRange[1]);
 779           int featureEnd = Math.max(featureRange[0], featureRange[1]);
 780           count += addAlleleFeatures(seq, variant, featureStart, featureEnd,
 781                   forwardStrand);
 782         }
 783       }
 784       variants.close();
 785     }
 786
 787     return count;
 788   }
 789
 790   /**
 791    * A convenience method to get the AF value for the given alternate allele
 792    * index
 793    *
 794    * @param variant
 795    * @param alleleIndex
 796    * @return
 797    */
 798   protected float getAlleleFrequency(VariantContext variant, int alleleIndex)
 799   {
 800     float score = 0f;
 801     String attributeValue = getAttributeValue(variant,
 802             ALLELE_FREQUENCY_KEY, alleleIndex);
 803     if (attributeValue != null)
 804     {
 805       try
 806       {
 807         score = Float.parseFloat(attributeValue);
 808       } catch (NumberFormatException e)
 809       {
 810         // leave as 0
 811       }
 812     }
 813
 814     return score;
 815   }
 816
 817   /**
 818    * A convenience method to get an attribute value for an alternate allele
 819    *
 820    * @param variant
 821    * @param attributeName
 822    * @param alleleIndex
 823    * @return
 824    */
 825   protected String getAttributeValue(VariantContext variant,
 826           String attributeName, int alleleIndex)
 827   {
 828     Object att = variant.getAttribute(attributeName);
 829
 830     if (att instanceof String)
 831     {
 832       return (String) att;
 833     }
 834     else if (att instanceof ArrayList)
 835     {
 836       return ((List<String>) att).get(alleleIndex);
 837     }
 838
 839     return null;
 840   }
 841
 842   /**
 843    * Adds one variant feature for each allele in the VCF variant record, and
 844    * returns the number of features added.
 845    *
 846    * @param seq
 847    * @param variant
 848    * @param featureStart
 849    * @param featureEnd
 850    * @param forwardStrand
 851    * @return
 852    */
 853   protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
 854           int featureStart, int featureEnd, boolean forwardStrand)
 855   {
 856     int added = 0;
 857
 858     /*
 859      * Javadoc says getAlternateAlleles() imposes no order on the list returned
 860      * so we proceed defensively to get them in strict order
 861      */
 862     int altAlleleCount = variant.getAlternateAlleles().size();
 863     for (int i = 0; i < altAlleleCount; i++)
 864     {
 865       added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
 866               forwardStrand);
 867     }
 868     return added;
 869   }
 870
 871   /**
 872    * Inspects one allele and attempts to add a variant feature for it to the
 873    * sequence. The additional data associated with this allele is extracted to
 874    * store in the feature's key-value map. Answers the number of features added (0
 875    * or 1).
 876    *
 877    * @param seq
 878    * @param variant
 879    * @param altAlleleIndex
 880    *          (0, 1..)
 881    * @param featureStart
 882    * @param featureEnd
 883    * @param forwardStrand
 884    * @return
 885    */
 886   protected int addAlleleFeature(SequenceI seq, VariantContext variant,
 887           int altAlleleIndex, int featureStart, int featureEnd,
 888           boolean forwardStrand)
 889   {
 890     String reference = variant.getReference().getBaseString();
 891     Allele alt = variant.getAlternateAllele(altAlleleIndex);
 892     String allele = alt.getBaseString();
 893
 894     /*
 895      * build the ref,alt allele description e.g. "G,A", using the base
 896      * complement if the sequence is on the reverse strand
 897      */
 898     // FIXME correctly handle insertions on reverse strand JAL-2845
 899     StringBuilder sb = new StringBuilder();
 900     sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
 901     sb.append(COMMA);
 902     sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
 903     String alleles = sb.toString(); // e.g. G,A
 904
 905     /*
 906      * pick out the consequence data (if any) that is for the current allele
 907      * and feature (transcript) that matches the current sequence
 908      */
 909     String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD,
 910             altAlleleIndex, csqAlleleFieldIndex,
 911             csqAlleleNumberFieldIndex, seq.getName().toLowerCase(),
 912             csqFeatureFieldIndex);
 913
 914     /*
 915      * pick out the ontology term for the consequence type
 916      */
 917     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 918     if (consequence != null)
 919     {
 920       type = getOntologyTerm(seq, variant, altAlleleIndex,
 921             consequence);
 922     }
 923
 924     float score = getAlleleFrequency(variant, altAlleleIndex);
 925
 926     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
 927             featureEnd, score, FEATURE_GROUP_VCF);
 928     sf.setSource(sourceId);
 929
 930     sf.setValue(Gff3Helper.ALLELES, alleles);
 931
 932     addAlleleProperties(variant, seq, sf, altAlleleIndex, consequence);
 933
 934     seq.addSequenceFeature(sf);
 935
 936     return 1;
 937   }
 938
 939   /**
 940    * Determines the Sequence Ontology term to use for the variant feature type in
 941    * Jalview. The default is 'sequence_variant', but a more specific term is used
 942    * if:
 943    * <ul>
 944    * <li>VEP (or SnpEff) Consequence annotation is included in the VCF</li>
 945    * <li>sequence id can be matched to VEP Feature (or SnpEff Feature_ID)</li>
 946    * </ul>
 947    *
 948    * @param seq
 949    * @param variant
 950    * @param altAlleleIndex
 951    * @param consequence
 952    * @return
 953    * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
 954    */
 955   String getOntologyTerm(SequenceI seq, VariantContext variant,
 956           int altAlleleIndex, String consequence)
 957   {
 958     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 959
 960     if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
 961     {
 962       /*
 963        * no Consequence data so we can't refine the ontology term
 964        */
 965       return type;
 966     }
 967
 968     /*
 969      * can we associate Consequence data with this allele and feature (transcript)?
 970      * if so, prefer the consequence term from that data
 971      */
 972     if (consequence != null)
 973     {
 974       String[] csqFields = consequence.split(PIPE_REGEX);
 975       if (csqFields.length > csqConsequenceFieldIndex)
 976       {
 977         type = csqFields[csqConsequenceFieldIndex];
 978       }
 979     }
 980     else
 981     {
 982       // todo the same for SnpEff consequence data matching if wanted
 983     }
 984
 985     /*
 986      * if of the form (e.g.) missense_variant&splice_region_variant,
 987      * just take the first ('most severe') consequence
 988      */
 989     if (type != null)
 990     {
 991       int pos = type.indexOf('&');
 992       if (pos > 0)
 993       {
 994         type = type.substring(0, pos);
 995       }
 996     }
 997     return type;
 998   }
 999
1000   /**
1001    * Returns matched consequence data if it can be found, else null.
1002    * <ul>
1003    * <li>inspects the VCF data for key 'vcfInfoId'</li>
1004    * <li>splits this on comma (to distinct consequences)</li>
1005    * <li>returns the first consequence (if any) where</li>
1006    * <ul>
1007    * <li>the allele matches the altAlleleIndex'th allele of variant</li>
1008    * <li>the feature matches the sequence name (e.g. transcript id)</li>
1009    * </ul>
1010    * </ul>
1011    * If matched, the consequence is returned (as pipe-delimited fields).
1012    *
1013    * @param variant
1014    * @param vcfInfoId
1015    * @param altAlleleIndex
1016    * @param alleleFieldIndex
1017    * @param alleleNumberFieldIndex
1018    * @param seqName
1019    * @param featureFieldIndex
1020    * @return
1021    */
1022   private String getConsequenceForAlleleAndFeature(VariantContext variant,
1023           String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
1024           int alleleNumberFieldIndex,
1025           String seqName, int featureFieldIndex)
1026   {
1027     if (alleleFieldIndex == -1 || featureFieldIndex == -1)
1028     {
1029       return null;
1030     }
1031     Object value = variant.getAttribute(vcfInfoId);
1032
1033     if (value == null || !(value instanceof List<?>))
1034     {
1035       return null;
1036     }
1037
1038     /*
1039      * inspect each consequence in turn (comma-separated blocks
1040      * extracted by htsjdk)
1041      */
1042     List<String> consequences = (List<String>) value;
1043
1044     for (String consequence : consequences)
1045     {
1046       String[] csqFields = consequence.split(PIPE_REGEX);
1047       if (csqFields.length > featureFieldIndex)
1048       {
1049         String featureIdentifier = csqFields[featureFieldIndex];
1050         if (featureIdentifier.length() > 4
1051                 && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
1052         {
1053           /*
1054            * feature (transcript) matched - now check for allele match
1055            */
1056           if (matchAllele(variant, altAlleleIndex, csqFields,
1057                   alleleFieldIndex, alleleNumberFieldIndex))
1058           {
1059             return consequence;
1060           }
1061         }
1062       }
1063     }
1064     return null;
1065   }
1066
1067   private boolean matchAllele(VariantContext variant, int altAlleleIndex,
1068           String[] csqFields, int alleleFieldIndex,
1069           int alleleNumberFieldIndex)
1070   {
1071     /*
1072      * if ALLELE_NUM is present, it must match altAlleleIndex
1073      * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
1074      */
1075     if (alleleNumberFieldIndex > -1)
1076     {
1077       if (csqFields.length <= alleleNumberFieldIndex)
1078       {
1079         return false;
1080       }
1081       String alleleNum = csqFields[alleleNumberFieldIndex];
1082       return String.valueOf(altAlleleIndex + 1).equals(alleleNum);
1083     }
1084
1085     /*
1086      * else consequence allele must match variant allele
1087      */
1088     if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex)
1089     {
1090       String csqAllele = csqFields[alleleFieldIndex];
1091       String vcfAllele = variant.getAlternateAllele(altAlleleIndex)
1092               .getBaseString();
1093       return csqAllele.equals(vcfAllele);
1094     }
1095     return false;
1096   }
1097
1098   /**
1099    * Add any allele-specific VCF key-value data to the sequence feature
1100    *
1101    * @param variant
1102    * @param seq
1103    * @param sf
1104    * @param altAlelleIndex
1105    *          (0, 1..)
1106    * @param consequence
1107    *          if not null, the consequence specific to this sequence (transcript
1108    *          feature) and allele
1109    */
1110   protected void addAlleleProperties(VariantContext variant, SequenceI seq,
1111           SequenceFeature sf, final int altAlelleIndex, String consequence)
1112   {
1113     Map<String, Object> atts = variant.getAttributes();
1114
1115     for (Entry<String, Object> att : atts.entrySet())
1116     {
1117       String key = att.getKey();
1118
1119       /*
1120        * extract Consequence data (if present) that we are able to
1121        * associated with the allele for this variant feature
1122        */
1123       if (CSQ_FIELD.equals(key))
1124       {
1125         addConsequences(variant, seq, sf, consequence);
1126         continue;
1127       }
1128
1129       /*
1130        * filter out fields we don't want to capture
1131        */
1132       if (!vcfFieldsOfInterest.contains(key))
1133       {
1134         continue;
1135       }
1136
1137       /*
1138        * we extract values for other data which are allele-specific;
1139        * these may be per alternate allele (INFO[key].Number = 'A')
1140        * or per allele including reference (INFO[key].Number = 'R')
1141        */
1142       VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key);
1143       if (infoHeader == null)
1144       {
1145         /*
1146          * can't be sure what data belongs to this allele, so
1147          * play safe and don't take any
1148          */
1149         continue;
1150       }
1151
1152       VCFHeaderLineCount number = infoHeader.getCountType();
1153       int index = altAlelleIndex;
1154       if (number == VCFHeaderLineCount.R)
1155       {
1156         /*
1157          * one value per allele including reference, so bump index
1158          * e.g. the 3rd value is for the  2nd alternate allele
1159          */
1160         index++;
1161       }
1162       else if (number != VCFHeaderLineCount.A)
1163       {
1164         /*
1165          * don't save other values as not allele-related
1166          */
1167         continue;
1168       }
1169
1170       /*
1171        * take the index'th value
1172        */
1173       String value = getAttributeValue(variant, key, index);
1174       if (value != null)
1175       {
1176         sf.setValue(key, value);
1177       }
1178     }
1179   }
1180
1181   /**
1182    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
1183    * feature.
1184    * <p>
1185    * If <code>myConsequence</code> is not null, then this is the specific
1186    * consequence data (pipe-delimited fields) that is for the current allele and
1187    * transcript (sequence) being processed)
1188    *
1189    * @param variant
1190    * @param seq
1191    * @param sf
1192    * @param myConsequence
1193    */
1194   protected void addConsequences(VariantContext variant, SequenceI seq,
1195           SequenceFeature sf, String myConsequence)
1196   {
1197     Object value = variant.getAttribute(CSQ_FIELD);
1198     // TODO if CSQ not present, try ANN (for SnpEff consequence data)?
1199
1200     if (value == null || !(value instanceof List<?>))
1201     {
1202       return;
1203     }
1204
1205     List<String> consequences = (List<String>) value;
1206
1207     /*
1208      * inspect CSQ consequences; restrict to the consequence
1209      * associated with the current transcript (Feature)
1210      */
1211     Map<String, String> csqValues = new HashMap<>();
1212
1213     for (String consequence : consequences)
1214     {
1215       if (myConsequence == null || myConsequence.equals(consequence))
1216       {
1217         String[] csqFields = consequence.split(PIPE_REGEX);
1218
1219         /*
1220          * inspect individual fields of this consequence, copying non-null
1221          * values which are 'fields of interest'
1222          */
1223         int i = 0;
1224         for (String field : csqFields)
1225         {
1226           if (field != null && field.length() > 0)
1227           {
1228             String id = vepFieldsOfInterest.get(i);
1229             if (id != null)
1230             {
1231               csqValues.put(id, field);
1232             }
1233           }
1234           i++;
1235         }
1236       }
1237     }
1238
1239     if (!csqValues.isEmpty())
1240     {
1241       sf.setValue(CSQ_FIELD, csqValues);
1242     }
1243   }
1244
1245   /**
1246    * A convenience method to complement a dna base and return the string value
1247    * of its complement
1248    *
1249    * @param reference
1250    * @return
1251    */
1252   protected String complement(byte[] reference)
1253   {
1254     return String.valueOf(Dna.getComplement((char) reference[0]));
1255   }
1256
1257   /**
1258    * Determines the location of the query range (chromosome positions) in a
1259    * different reference assembly.
1260    * <p>
1261    * If the range is just a subregion of one for which we already have a mapping
1262    * (for example, an exon sub-region of a gene), then the mapping is just
1263    * computed arithmetically.
1264    * <p>
1265    * Otherwise, calls the Ensembl REST service that maps from one assembly
1266    * reference's coordinates to another's
1267    *
1268    * @param queryRange
1269    *          start-end chromosomal range in 'fromRef' coordinates
1270    * @param chromosome
1271    * @param species
1272    * @param fromRef
1273    *          assembly reference for the query coordinates
1274    * @param toRef
1275    *          assembly reference we wish to translate to
1276    * @return the start-end range in 'toRef' coordinates
1277    */
1278   protected int[] mapReferenceRange(int[] queryRange, String chromosome,
1279           String species, String fromRef, String toRef)
1280   {
1281     /*
1282      * first try shorcut of computing the mapping as a subregion of one
1283      * we already have (e.g. for an exon, if we have the gene mapping)
1284      */
1285     int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
1286             species, fromRef, toRef);
1287     if (mappedRange != null)
1288     {
1289       return mappedRange;
1290     }
1291
1292     /*
1293      * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
1294      */
1295     EnsemblMap mapper = new EnsemblMap();
1296     int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
1297             toRef, queryRange);
1298
1299     if (mapping == null)
1300     {
1301       // mapping service failure
1302       return null;
1303     }
1304
1305     /*
1306      * save mapping for possible future re-use
1307      */
1308     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1309     if (!assemblyMappings.containsKey(key))
1310     {
1311       assemblyMappings.put(key, new HashMap<int[], int[]>());
1312     }
1313
1314     assemblyMappings.get(key).put(queryRange, mapping);
1315
1316     return mapping;
1317   }
1318
1319   /**
1320    * If we already have a 1:1 contiguous mapping which subsumes the given query
1321    * range, this method just calculates and returns the subset of that mapping,
1322    * else it returns null. In practical terms, if a gene has a contiguous
1323    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
1324    * subsidiary exons occupy unchanged relative positions, and just compute
1325    * these as offsets, rather than do another lookup of the mapping.
1326    * <p>
1327    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
1328    * simply remove this method or let it always return null.
1329    * <p>
1330    * Warning: many rapid calls to the /map service map result in a 429 overload
1331    * error response
1332    *
1333    * @param queryRange
1334    * @param chromosome
1335    * @param species
1336    * @param fromRef
1337    * @param toRef
1338    * @return
1339    */
1340   protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
1341           String species, String fromRef, String toRef)
1342   {
1343     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1344     if (assemblyMappings.containsKey(key))
1345     {
1346       Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
1347       for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
1348       {
1349         int[] fromRange = mappedRange.getKey();
1350         int[] toRange = mappedRange.getValue();
1351         if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
1352         {
1353           /*
1354            * mapping is 1:1 in length, so we trust it to have no discontinuities
1355            */
1356           if (MappingUtils.rangeContains(fromRange, queryRange))
1357           {
1358             /*
1359              * fromRange subsumes our query range
1360              */
1361             int offset = queryRange[0] - fromRange[0];
1362             int mappedRangeFrom = toRange[0] + offset;
1363             int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
1364             return new int[] { mappedRangeFrom, mappedRangeTo };
1365           }
1366         }
1367       }
1368     }
1369     return null;
1370   }
1371
1372   /**
1373    * Transfers the sequence feature to the target sequence, locating its start
1374    * and end range based on the mapping. Features which do not overlap the
1375    * target sequence are ignored.
1376    *
1377    * @param sf
1378    * @param targetSequence
1379    * @param mapping
1380    *          mapping from the feature's coordinates to the target sequence
1381    */
1382   protected void transferFeature(SequenceFeature sf,
1383           SequenceI targetSequence, MapList mapping)
1384   {
1385     int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
1386
1387     if (mappedRange != null)
1388     {
1389       String group = sf.getFeatureGroup();
1390       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
1391       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
1392       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
1393               group, sf.getScore());
1394       targetSequence.addSequenceFeature(copy);
1395     }
1396   }
1397
1398   /**
1399    * Formats a ranges map lookup key
1400    *
1401    * @param chromosome
1402    * @param species
1403    * @param fromRef
1404    * @param toRef
1405    * @return
1406    */
1407   protected static String makeRangesKey(String chromosome, String species,
1408           String fromRef, String toRef)
1409   {
1410     return species + EXCL + chromosome + EXCL + fromRef + EXCL
1411             + toRef;
1412   }
1413 }