src/jalview/io/vcf/VCFLoader.java

   1 package jalview.io.vcf;
   2
   3 import jalview.analysis.Dna;
   4 import jalview.api.AlignViewControllerGuiI;
   5 import jalview.bin.Cache;
   6 import jalview.datamodel.DBRefEntry;
   7 import jalview.datamodel.GeneLociI;
   8 import jalview.datamodel.Mapping;
   9 import jalview.datamodel.SequenceFeature;
  10 import jalview.datamodel.SequenceI;
  11 import jalview.datamodel.features.FeatureAttributeType;
  12 import jalview.datamodel.features.FeatureSource;
  13 import jalview.datamodel.features.FeatureSources;
  14 import jalview.ext.ensembl.EnsemblMap;
  15 import jalview.ext.htsjdk.HtsContigDb;
  16 import jalview.ext.htsjdk.VCFReader;
  17 import jalview.io.gff.Gff3Helper;
  18 import jalview.io.gff.SequenceOntologyI;
  19 import jalview.util.MapList;
  20 import jalview.util.MappingUtils;
  21 import jalview.util.MessageManager;
  22
  23 import java.io.File;
  24 import java.io.IOException;
  25 import java.util.ArrayList;
  26 import java.util.HashMap;
  27 import java.util.List;
  28 import java.util.Map;
  29 import java.util.Map.Entry;
  30 import java.util.regex.Pattern;
  31 import java.util.regex.PatternSyntaxException;
  32
  33 import htsjdk.samtools.SAMException;
  34 import htsjdk.samtools.SAMSequenceDictionary;
  35 import htsjdk.samtools.SAMSequenceRecord;
  36 import htsjdk.samtools.util.CloseableIterator;
  37 import htsjdk.variant.variantcontext.Allele;
  38 import htsjdk.variant.variantcontext.VariantContext;
  39 import htsjdk.variant.vcf.VCFHeader;
  40 import htsjdk.variant.vcf.VCFHeaderLine;
  41 import htsjdk.variant.vcf.VCFHeaderLineCount;
  42 import htsjdk.variant.vcf.VCFHeaderLineType;
  43 import htsjdk.variant.vcf.VCFInfoHeaderLine;
  44
  45 /**
  46  * A class to read VCF data (using the htsjdk) and add variants as sequence
  47  * features on dna and any related protein product sequences
  48  *
  49  * @author gmcarstairs
  50  */
  51 public class VCFLoader
  52 {
  53   /**
  54    * A class to model the mapping from sequence to VCF coordinates. Cases include
  55    * <ul>
  56    * <li>a direct 1:1 mapping where the sequence is one of the VCF contigs</li>
  57    * <li>a mapping of sequence to chromosomal coordinates, where sequence and VCF
  58    * use the same reference assembly</li>
  59    * <li>a modified mapping of sequence to chromosomal coordinates, where sequence
  60    * and VCF use different reference assembles</li>
  61    * </ul>
  62    */
  63   class VCFMap
  64   {
  65     final String chromosome;
  66
  67     final MapList map;
  68
  69     VCFMap(String chr, MapList m)
  70     {
  71       chromosome = chr;
  72       map = m;
  73     }
  74
  75     @Override
  76     public String toString()
  77     {
  78       return chromosome + ":" + map.toString();
  79     }
  80   }
  81
  82   /*
  83    * Lookup keys, and default values, for Preference entries that describe
  84    * patterns for VCF and VEP fields to capture
  85    */
  86   private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
  87
  88   private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
  89
  90   private static final String DEFAULT_VCF_FIELDS = ".*";
  91
  92   private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
  93
  94   /*
  95    * keys to fields of VEP CSQ consequence data
  96    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
  97    */
  98   private static final String CSQ_CONSEQUENCE_KEY = "Consequence";
  99   private static final String CSQ_ALLELE_KEY = "Allele";
 100   private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
 101   private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id
 102
 103   /*
 104    * default VCF INFO key for VEP consequence data
 105    * NB this can be overridden running VEP with --vcf_info_field
 106    * - we don't handle this case (require identifier to be CSQ)
 107    */
 108   private static final String CSQ_FIELD = "CSQ";
 109
 110   /*
 111    * separator for fields in consequence data is '|'
 112    */
 113   private static final String PIPE_REGEX = "\\|";
 114
 115   /*
 116    * key for Allele Frequency output by VEP
 117    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 118    */
 119   private static final String ALLELE_FREQUENCY_KEY = "AF";
 120
 121   /*
 122    * delimiter that separates multiple consequence data blocks
 123    */
 124   private static final String COMMA = ",";
 125
 126   /*
 127    * the feature group assigned to a VCF variant in Jalview
 128    */
 129   private static final String FEATURE_GROUP_VCF = "VCF";
 130
 131   /*
 132    * internal delimiter used to build keys for assemblyMappings
 133    *
 134    */
 135   private static final String EXCL = "!";
 136
 137   /*
 138    * the VCF file we are processing
 139    */
 140   protected String vcfFilePath;
 141
 142   /*
 143    * mappings between VCF and sequence reference assembly regions, as
 144    * key = "species!chromosome!fromAssembly!toAssembly
 145    * value = Map{fromRange, toRange}
 146    */
 147   private Map<String, Map<int[], int[]>> assemblyMappings;
 148
 149   private VCFReader reader;
 150
 151   /*
 152    * holds details of the VCF header lines (metadata)
 153    */
 154   private VCFHeader header;
 155
 156   /*
 157    * a Dictionary of contigs (if present) referenced in the VCF file
 158    */
 159   private SAMSequenceDictionary dictionary;
 160
 161   /*
 162    * the position (0...) of field in each block of
 163    * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
 164    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 165    */
 166   private int csqConsequenceFieldIndex = -1;
 167   private int csqAlleleFieldIndex = -1;
 168   private int csqAlleleNumberFieldIndex = -1;
 169   private int csqFeatureFieldIndex = -1;
 170
 171   // todo the same fields for SnpEff ANN data if wanted
 172   // see http://snpeff.sourceforge.net/SnpEff_manual.html#input
 173
 174   /*
 175    * a unique identifier under which to save metadata about feature
 176    * attributes (selected INFO field data)
 177    */
 178   private String sourceId;
 179
 180   /*
 181    * The INFO IDs of data that is both present in the VCF file, and
 182    * also matched by any filters for data of interest
 183    */
 184   List<String> vcfFieldsOfInterest;
 185
 186   /*
 187    * The field offsets and identifiers for VEP (CSQ) data that is both present
 188    * in the VCF file, and also matched by any filters for data of interest
 189    * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
 190    */
 191   Map<Integer, String> vepFieldsOfInterest;
 192
 193   /**
 194    * Constructor given a VCF file
 195    *
 196    * @param alignment
 197    */
 198   public VCFLoader(String vcfFile)
 199   {
 200     try
 201     {
 202       initialise(vcfFile);
 203     } catch (IOException e)
 204     {
 205       System.err.println("Error opening VCF file: " + e.getMessage());
 206     }
 207
 208     // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
 209     assemblyMappings = new HashMap<>();
 210   }
 211
 212   /**
 213    * Starts a new thread to query and load VCF variant data on to the given
 214    * sequences
 215    * <p>
 216    * This method is not thread safe - concurrent threads should use separate
 217    * instances of this class.
 218    *
 219    * @param seqs
 220    * @param gui
 221    */
 222   public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui)
 223   {
 224     if (gui != null)
 225     {
 226       gui.setStatus(MessageManager.getString("label.searching_vcf"));
 227     }
 228
 229     new Thread()
 230     {
 231       @Override
 232       public void run()
 233       {
 234         VCFLoader.this.doLoad(seqs, gui);
 235       }
 236     }.start();
 237   }
 238
 239   /**
 240    * Reads the specified contig sequence and adds its VCF variants to it
 241    *
 242    * @param contig
 243    *          the id of a single sequence (contig) to load
 244    * @return
 245    */
 246   public SequenceI loadVCFContig(String contig)
 247   {
 248     String ref = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY)
 249             .getValue();
 250     if (ref.startsWith("file://"))
 251     {
 252       ref = ref.substring(7);
 253     }
 254
 255     SequenceI seq = null;
 256     File dbFile = new File(ref);
 257
 258     if (dbFile.exists())
 259     {
 260       HtsContigDb db = new HtsContigDb("", dbFile);
 261       seq = db.getSequenceProxy(contig);
 262       loadSequenceVCF(seq, ref);
 263       db.close();
 264     }
 265     else
 266     {
 267       System.err.println("VCF reference not found: " + ref);
 268     }
 269
 270     return seq;
 271   }
 272
 273   /**
 274    * Loads VCF on to one or more sequences
 275    *
 276    * @param seqs
 277    * @param gui
 278    *          optional callback handler for messages
 279    */
 280   protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui)
 281   {
 282     try
 283     {
 284       VCFHeaderLine ref = header
 285               .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 286       String vcfAssembly = ref.getValue();
 287
 288       int varCount = 0;
 289       int seqCount = 0;
 290
 291       /*
 292        * query for VCF overlapping each sequence in turn
 293        */
 294       for (SequenceI seq : seqs)
 295       {
 296         int added = loadSequenceVCF(seq, vcfAssembly);
 297         if (added > 0)
 298         {
 299           seqCount++;
 300           varCount += added;
 301           transferAddedFeatures(seq);
 302         }
 303       }
 304       if (gui != null)
 305       {
 306         String msg = MessageManager.formatMessage("label.added_vcf",
 307                 varCount, seqCount);
 308         gui.setStatus(msg);
 309         if (gui.getFeatureSettingsUI() != null)
 310         {
 311           gui.getFeatureSettingsUI().discoverAllFeatureData();
 312         }
 313       }
 314     } catch (Throwable e)
 315     {
 316       System.err.println("Error processing VCF: " + e.getMessage());
 317       e.printStackTrace();
 318       if (gui != null)
 319       {
 320         gui.setStatus("Error occurred - see console for details");
 321       }
 322     } finally
 323     {
 324       if (reader != null)
 325       {
 326         try
 327         {
 328           reader.close();
 329         } catch (IOException e)
 330         {
 331           // ignore
 332         }
 333       }
 334       header = null;
 335       dictionary = null;
 336     }
 337   }
 338
 339   /**
 340    * Opens the VCF file and parses header data
 341    *
 342    * @param filePath
 343    * @throws IOException
 344    */
 345   private void initialise(String filePath) throws IOException
 346   {
 347     vcfFilePath = filePath;
 348
 349     reader = new VCFReader(filePath);
 350
 351     header = reader.getFileHeader();
 352
 353     try
 354     {
 355       dictionary = header.getSequenceDictionary();
 356     } catch (SAMException e)
 357     {
 358       // ignore - thrown if any contig line lacks length info
 359     }
 360
 361     sourceId = filePath;
 362
 363     saveMetadata(sourceId);
 364
 365     /*
 366      * get offset of CSQ ALLELE_NUM and Feature if declared
 367      */
 368     parseCsqHeader();
 369   }
 370
 371   /**
 372    * Reads metadata (such as INFO field descriptions and datatypes) and saves
 373    * them for future reference
 374    *
 375    * @param theSourceId
 376    */
 377   void saveMetadata(String theSourceId)
 378   {
 379     List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
 380             DEFAULT_VCF_FIELDS);
 381     vcfFieldsOfInterest = new ArrayList<>();
 382
 383     FeatureSource metadata = new FeatureSource(theSourceId);
 384
 385     for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
 386     {
 387       String attributeId = info.getID();
 388       String desc = info.getDescription();
 389       VCFHeaderLineType type = info.getType();
 390       FeatureAttributeType attType = null;
 391       switch (type)
 392       {
 393       case Character:
 394         attType = FeatureAttributeType.Character;
 395         break;
 396       case Flag:
 397         attType = FeatureAttributeType.Flag;
 398         break;
 399       case Float:
 400         attType = FeatureAttributeType.Float;
 401         break;
 402       case Integer:
 403         attType = FeatureAttributeType.Integer;
 404         break;
 405       case String:
 406         attType = FeatureAttributeType.String;
 407         break;
 408       }
 409       metadata.setAttributeName(attributeId, desc);
 410       metadata.setAttributeType(attributeId, attType);
 411
 412       if (isFieldWanted(attributeId, vcfFieldPatterns))
 413       {
 414         vcfFieldsOfInterest.add(attributeId);
 415       }
 416     }
 417
 418     FeatureSources.getInstance().addSource(theSourceId, metadata);
 419   }
 420
 421   /**
 422    * Answers true if the field id is matched by any of the filter patterns, else
 423    * false. Matching is against regular expression patterns, and is not
 424    * case-sensitive.
 425    *
 426    * @param id
 427    * @param filters
 428    * @return
 429    */
 430   private boolean isFieldWanted(String id, List<Pattern> filters)
 431   {
 432     for (Pattern p : filters)
 433     {
 434       if (p.matcher(id.toUpperCase()).matches())
 435       {
 436         return true;
 437       }
 438     }
 439     return false;
 440   }
 441
 442   /**
 443    * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
 444    * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
 445    * required for processing.
 446    * <p>
 447    * CSQ fields are declared in the CSQ INFO Description e.g.
 448    * <p>
 449    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
 450    */
 451   protected void parseCsqHeader()
 452   {
 453     List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
 454             DEFAULT_VEP_FIELDS);
 455     vepFieldsOfInterest = new HashMap<>();
 456
 457     VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
 458     if (csqInfo == null)
 459     {
 460       return;
 461     }
 462
 463     /*
 464      * parse out the pipe-separated list of CSQ fields; we assume here that
 465      * these form the last part of the description, and contain no spaces
 466      */
 467     String desc = csqInfo.getDescription();
 468     int spacePos = desc.lastIndexOf(" ");
 469     desc = desc.substring(spacePos + 1);
 470
 471     if (desc != null)
 472     {
 473       String[] format = desc.split(PIPE_REGEX);
 474       int index = 0;
 475       for (String field : format)
 476       {
 477         if (CSQ_CONSEQUENCE_KEY.equals(field))
 478         {
 479           csqConsequenceFieldIndex = index;
 480         }
 481         if (CSQ_ALLELE_NUM_KEY.equals(field))
 482         {
 483           csqAlleleNumberFieldIndex = index;
 484         }
 485         if (CSQ_ALLELE_KEY.equals(field))
 486         {
 487           csqAlleleFieldIndex = index;
 488         }
 489         if (CSQ_FEATURE_KEY.equals(field))
 490         {
 491           csqFeatureFieldIndex = index;
 492         }
 493
 494         if (isFieldWanted(field, vepFieldFilters))
 495         {
 496           vepFieldsOfInterest.put(index, field);
 497         }
 498
 499         index++;
 500       }
 501     }
 502   }
 503
 504   /**
 505    * Reads the Preference value for the given key, with default specified if no
 506    * preference set. The value is interpreted as a comma-separated list of
 507    * regular expressions, and converted into a list of compiled patterns ready
 508    * for matching. Patterns are forced to upper-case for non-case-sensitive
 509    * matching.
 510    * <p>
 511    * This supports user-defined filters for fields of interest to capture while
 512    * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
 513    * fields with an ID of AF, or starting with AC, would be matched.
 514    *
 515    * @param key
 516    * @param def
 517    * @return
 518    */
 519   private List<Pattern> getFieldMatchers(String key, String def)
 520   {
 521     String pref = Cache.getDefault(key, def);
 522     List<Pattern> patterns = new ArrayList<>();
 523     String[] tokens = pref.split(",");
 524     for (String token : tokens)
 525     {
 526       try
 527       {
 528       patterns.add(Pattern.compile(token.toUpperCase()));
 529       } catch (PatternSyntaxException e)
 530       {
 531         System.err.println("Invalid pattern ignored: " + token);
 532       }
 533     }
 534     return patterns;
 535   }
 536
 537   /**
 538    * Transfers VCF features to sequences to which this sequence has a mapping.
 539    * If the mapping is 3:1, computes peptide variants from nucleotide variants.
 540    *
 541    * @param seq
 542    */
 543   protected void transferAddedFeatures(SequenceI seq)
 544   {
 545     DBRefEntry[] dbrefs = seq.getDBRefs();
 546     if (dbrefs == null)
 547     {
 548       return;
 549     }
 550     for (DBRefEntry dbref : dbrefs)
 551     {
 552       Mapping mapping = dbref.getMap();
 553       if (mapping == null || mapping.getTo() == null)
 554       {
 555         continue;
 556       }
 557
 558       SequenceI mapTo = mapping.getTo();
 559       MapList map = mapping.getMap();
 560       if (map.getFromRatio() == 3)
 561       {
 562         /*
 563          * dna-to-peptide product mapping
 564          */
 565         // JAL-3187 render on the fly instead
 566         // AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
 567       }
 568       else
 569       {
 570         /*
 571          * nucleotide-to-nucleotide mapping e.g. transcript to CDS
 572          */
 573         List<SequenceFeature> features = seq.getFeatures()
 574                 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
 575         for (SequenceFeature sf : features)
 576         {
 577           if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
 578           {
 579             transferFeature(sf, mapTo, map);
 580           }
 581         }
 582       }
 583     }
 584   }
 585
 586   /**
 587    * Tries to add overlapping variants read from a VCF file to the given sequence,
 588    * and returns the number of variant features added
 589    *
 590    * @param seq
 591    * @param vcfAssembly
 592    * @return
 593    */
 594   protected int loadSequenceVCF(SequenceI seq, String vcfAssembly)
 595   {
 596     VCFMap vcfMap = getVcfMap(seq, vcfAssembly);
 597     if (vcfMap == null)
 598     {
 599       return 0;
 600     }
 601
 602     /*
 603      * work with the dataset sequence here
 604      */
 605     SequenceI dss = seq.getDatasetSequence();
 606     if (dss == null)
 607     {
 608       dss = seq;
 609     }
 610     return addVcfVariants(dss, vcfMap);
 611   }
 612
 613   /**
 614    * Answers a map from sequence coordinates to VCF chromosome ranges
 615    *
 616    * @param seq
 617    * @param vcfAssembly
 618    * @return
 619    */
 620   private VCFMap getVcfMap(SequenceI seq, String vcfAssembly)
 621   {
 622     /*
 623      * simplest case: sequence has id and length matching a VCF contig
 624      */
 625     VCFMap vcfMap = null;
 626     if (dictionary != null)
 627     {
 628       vcfMap = getContigMap(seq);
 629     }
 630     if (vcfMap != null)
 631     {
 632       return vcfMap;
 633     }
 634
 635     /*
 636      * otherwise, map to VCF from chromosomal coordinates
 637      * of the sequence (if known)
 638      */
 639     GeneLociI seqCoords = seq.getGeneLoci();
 640     if (seqCoords == null)
 641     {
 642       Cache.log.warn(String.format(
 643               "Can't query VCF for %s as chromosome coordinates not known",
 644               seq.getName()));
 645       return null;
 646     }
 647
 648     String species = seqCoords.getSpeciesId();
 649     String chromosome = seqCoords.getChromosomeId();
 650     String seqRef = seqCoords.getAssemblyId();
 651     MapList map = seqCoords.getMap();
 652
 653     if (!vcfSpeciesMatchesSequence(vcfAssembly, species))
 654     {
 655       return null;
 656     }
 657
 658     if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef))
 659     {
 660       return new VCFMap(chromosome, map);
 661     }
 662
 663     if (!"GRCh38".equalsIgnoreCase(seqRef) // Ensembl
 664             || !vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
 665     {
 666       return null;
 667     }
 668
 669     /*
 670      * map chromosomal coordinates from sequence to VCF if the VCF
 671      * data has a different reference assembly to the sequence
 672      */
 673     // TODO generalise for cases other than GRCh38 -> GRCh37 !
 674     // - or get the user to choose in a dialog
 675
 676     List<int[]> toVcfRanges = new ArrayList<>();
 677     List<int[]> fromSequenceRanges = new ArrayList<>();
 678     String toRef = "GRCh37";
 679
 680     for (int[] range : map.getToRanges())
 681     {
 682       int[] fromRange = map.locateInFrom(range[0], range[1]);
 683       if (fromRange == null)
 684       {
 685         // corrupted map?!?
 686         continue;
 687       }
 688
 689       int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
 690               toRef);
 691       if (newRange == null)
 692       {
 693         Cache.log.error(
 694                 String.format("Failed to map %s:%s:%s:%d:%d to %s", species,
 695                         chromosome, seqRef, range[0], range[1], toRef));
 696         continue;
 697       }
 698       else
 699       {
 700         toVcfRanges.add(newRange);
 701         fromSequenceRanges.add(fromRange);
 702       }
 703     }
 704
 705     return new VCFMap(chromosome,
 706             new MapList(fromSequenceRanges, toVcfRanges, 1, 1));
 707   }
 708
 709   /**
 710    * If the sequence id matches a contig declared in the VCF file, and the
 711    * sequence length matches the contig length, then returns a 1:1 map of the
 712    * sequence to the contig, else returns null
 713    *
 714    * @param seq
 715    * @return
 716    */
 717   private VCFMap getContigMap(SequenceI seq)
 718   {
 719     String id = seq.getName();
 720     SAMSequenceRecord contig = dictionary.getSequence(id);
 721     if (contig != null)
 722     {
 723       int len = seq.getLength();
 724       if (len == contig.getSequenceLength())
 725       {
 726         MapList map = new MapList(new int[] { 1, len },
 727                 new int[]
 728                 { 1, len }, 1, 1);
 729         return new VCFMap(id, map);
 730       }
 731     }
 732     return null;
 733   }
 734
 735   /**
 736    * Answers true if we determine that the VCF data uses the same reference
 737    * assembly as the sequence, else false
 738    *
 739    * @param vcfAssembly
 740    * @param seqRef
 741    * @return
 742    */
 743   private boolean vcfAssemblyMatchesSequence(String vcfAssembly,
 744           String seqRef)
 745   {
 746     // TODO improve on this stub, which handles gnomAD and
 747     // hopes for the best for other cases
 748
 749     if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
 750             && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
 751     {
 752       return false;
 753     }
 754     return true;
 755   }
 756
 757   /**
 758    * Answers true if the species inferred from the VCF reference identifier
 759    * matches that for the sequence
 760    *
 761    * @param vcfAssembly
 762    * @param speciesId
 763    * @return
 764    */
 765   boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId)
 766   {
 767     // PROBLEM 1
 768     // there are many aliases for species - how to equate one with another?
 769     // PROBLEM 2
 770     // VCF ##reference header is an unstructured URI - how to extract species?
 771     // perhaps check if ref includes any (Ensembl) alias of speciesId??
 772     // TODO ask the user to confirm this??
 773
 774     if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example
 775             && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id
 776     {
 777       return true;
 778     }
 779
 780     if (vcfAssembly.contains("c_elegans") // VEP VCF response example
 781             && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl
 782     {
 783       return true;
 784     }
 785
 786     // this is not a sustainable solution...
 787
 788     return false;
 789   }
 790
 791   /**
 792    * Queries the VCF reader for any variants that overlap the mapped chromosome
 793    * ranges of the sequence, and adds as variant features. Returns the number of
 794    * overlapping variants found.
 795    *
 796    * @param seq
 797    * @param map
 798    *          mapping from sequence to VCF coordinates
 799    * @return
 800    */
 801   protected int addVcfVariants(SequenceI seq, VCFMap map)
 802   {
 803     boolean forwardStrand = map.map.isToForwardStrand();
 804
 805     /*
 806      * query the VCF for overlaps of each contiguous chromosomal region
 807      */
 808     int count = 0;
 809
 810     for (int[] range : map.map.getToRanges())
 811     {
 812       int vcfStart = Math.min(range[0], range[1]);
 813       int vcfEnd = Math.max(range[0], range[1]);
 814       CloseableIterator<VariantContext> variants = reader
 815               .query(map.chromosome, vcfStart, vcfEnd);
 816       while (variants.hasNext())
 817       {
 818         VariantContext variant = variants.next();
 819
 820         int[] featureRange = map.map.locateInFrom(variant.getStart(),
 821                 variant.getEnd());
 822
 823         if (featureRange != null)
 824         {
 825           int featureStart = Math.min(featureRange[0], featureRange[1]);
 826           int featureEnd = Math.max(featureRange[0], featureRange[1]);
 827           count += addAlleleFeatures(seq, variant, featureStart, featureEnd,
 828                   forwardStrand);
 829         }
 830       }
 831       variants.close();
 832     }
 833
 834     return count;
 835   }
 836
 837   /**
 838    * A convenience method to get the AF value for the given alternate allele
 839    * index
 840    *
 841    * @param variant
 842    * @param alleleIndex
 843    * @return
 844    */
 845   protected float getAlleleFrequency(VariantContext variant, int alleleIndex)
 846   {
 847     float score = 0f;
 848     String attributeValue = getAttributeValue(variant,
 849             ALLELE_FREQUENCY_KEY, alleleIndex);
 850     if (attributeValue != null)
 851     {
 852       try
 853       {
 854         score = Float.parseFloat(attributeValue);
 855       } catch (NumberFormatException e)
 856       {
 857         // leave as 0
 858       }
 859     }
 860
 861     return score;
 862   }
 863
 864   /**
 865    * A convenience method to get an attribute value for an alternate allele
 866    *
 867    * @param variant
 868    * @param attributeName
 869    * @param alleleIndex
 870    * @return
 871    */
 872   protected String getAttributeValue(VariantContext variant,
 873           String attributeName, int alleleIndex)
 874   {
 875     Object att = variant.getAttribute(attributeName);
 876
 877     if (att instanceof String)
 878     {
 879       return (String) att;
 880     }
 881     else if (att instanceof ArrayList)
 882     {
 883       return ((List<String>) att).get(alleleIndex);
 884     }
 885
 886     return null;
 887   }
 888
 889   /**
 890    * Adds one variant feature for each allele in the VCF variant record, and
 891    * returns the number of features added.
 892    *
 893    * @param seq
 894    * @param variant
 895    * @param featureStart
 896    * @param featureEnd
 897    * @param forwardStrand
 898    * @return
 899    */
 900   protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
 901           int featureStart, int featureEnd, boolean forwardStrand)
 902   {
 903     int added = 0;
 904
 905     /*
 906      * Javadoc says getAlternateAlleles() imposes no order on the list returned
 907      * so we proceed defensively to get them in strict order
 908      */
 909     int altAlleleCount = variant.getAlternateAlleles().size();
 910     for (int i = 0; i < altAlleleCount; i++)
 911     {
 912       added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
 913               forwardStrand);
 914     }
 915     return added;
 916   }
 917
 918   /**
 919    * Inspects one allele and attempts to add a variant feature for it to the
 920    * sequence. The additional data associated with this allele is extracted to
 921    * store in the feature's key-value map. Answers the number of features added (0
 922    * or 1).
 923    *
 924    * @param seq
 925    * @param variant
 926    * @param altAlleleIndex
 927    *          (0, 1..)
 928    * @param featureStart
 929    * @param featureEnd
 930    * @param forwardStrand
 931    * @return
 932    */
 933   protected int addAlleleFeature(SequenceI seq, VariantContext variant,
 934           int altAlleleIndex, int featureStart, int featureEnd,
 935           boolean forwardStrand)
 936   {
 937     String reference = variant.getReference().getBaseString();
 938     Allele alt = variant.getAlternateAllele(altAlleleIndex);
 939     String allele = alt.getBaseString();
 940
 941     /*
 942      * insertion after a genomic base, if on reverse strand, has to be
 943      * converted to insertion of complement after the preceding position
 944      */
 945     int referenceLength = reference.length();
 946     if (!forwardStrand && allele.length() > referenceLength
 947             && allele.startsWith(reference))
 948     {
 949       featureStart -= referenceLength;
 950       featureEnd = featureStart;
 951       char insertAfter = seq.getCharAt(featureStart - seq.getStart());
 952       reference = Dna.reverseComplement(String.valueOf(insertAfter));
 953       allele = allele.substring(referenceLength) + reference;
 954     }
 955
 956     /*
 957      * build the ref,alt allele description e.g. "G,A", using the base
 958      * complement if the sequence is on the reverse strand
 959      */
 960     StringBuilder sb = new StringBuilder();
 961     sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
 962     sb.append(COMMA);
 963     sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
 964     String alleles = sb.toString(); // e.g. G,A
 965
 966     /*
 967      * pick out the consequence data (if any) that is for the current allele
 968      * and feature (transcript) that matches the current sequence
 969      */
 970     String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD,
 971             altAlleleIndex, csqAlleleFieldIndex,
 972             csqAlleleNumberFieldIndex, seq.getName().toLowerCase(),
 973             csqFeatureFieldIndex);
 974
 975     /*
 976      * pick out the ontology term for the consequence type
 977      */
 978     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 979     if (consequence != null)
 980     {
 981       type = getOntologyTerm(consequence);
 982     }
 983
 984     float score = getAlleleFrequency(variant, altAlleleIndex);
 985
 986     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
 987             featureEnd, score, FEATURE_GROUP_VCF);
 988     sf.setSource(sourceId);
 989
 990     sf.setValue(Gff3Helper.ALLELES, alleles);
 991
 992     addAlleleProperties(variant, sf, altAlleleIndex, consequence);
 993
 994     seq.addSequenceFeature(sf);
 995
 996     return 1;
 997   }
 998
 999   /**
1000    * Determines the Sequence Ontology term to use for the variant feature type in
1001    * Jalview. The default is 'sequence_variant', but a more specific term is used
1002    * if:
1003    * <ul>
1004    * <li>VEP (or SnpEff) Consequence annotation is included in the VCF</li>
1005    * <li>sequence id can be matched to VEP Feature (or SnpEff Feature_ID)</li>
1006    * </ul>
1007    *
1008    * @param consequence
1009    * @return
1010    * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
1011    */
1012   String getOntologyTerm(String consequence)
1013   {
1014     String type = SequenceOntologyI.SEQUENCE_VARIANT;
1015
1016     /*
1017      * could we associate Consequence data with this allele and feature (transcript)?
1018      * if so, prefer the consequence term from that data
1019      */
1020     if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
1021     {
1022       /*
1023        * no Consequence data so we can't refine the ontology term
1024        */
1025       return type;
1026     }
1027
1028     if (consequence != null)
1029     {
1030       String[] csqFields = consequence.split(PIPE_REGEX);
1031       if (csqFields.length > csqConsequenceFieldIndex)
1032       {
1033         type = csqFields[csqConsequenceFieldIndex];
1034       }
1035     }
1036     else
1037     {
1038       // todo the same for SnpEff consequence data matching if wanted
1039     }
1040
1041     /*
1042      * if of the form (e.g.) missense_variant&splice_region_variant,
1043      * just take the first ('most severe') consequence
1044      */
1045     if (type != null)
1046     {
1047       int pos = type.indexOf('&');
1048       if (pos > 0)
1049       {
1050         type = type.substring(0, pos);
1051       }
1052     }
1053     return type;
1054   }
1055
1056   /**
1057    * Returns matched consequence data if it can be found, else null.
1058    * <ul>
1059    * <li>inspects the VCF data for key 'vcfInfoId'</li>
1060    * <li>splits this on comma (to distinct consequences)</li>
1061    * <li>returns the first consequence (if any) where</li>
1062    * <ul>
1063    * <li>the allele matches the altAlleleIndex'th allele of variant</li>
1064    * <li>the feature matches the sequence name (e.g. transcript id)</li>
1065    * </ul>
1066    * </ul>
1067    * If matched, the consequence is returned (as pipe-delimited fields).
1068    *
1069    * @param variant
1070    * @param vcfInfoId
1071    * @param altAlleleIndex
1072    * @param alleleFieldIndex
1073    * @param alleleNumberFieldIndex
1074    * @param seqName
1075    * @param featureFieldIndex
1076    * @return
1077    */
1078   private String getConsequenceForAlleleAndFeature(VariantContext variant,
1079           String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
1080           int alleleNumberFieldIndex,
1081           String seqName, int featureFieldIndex)
1082   {
1083     if (alleleFieldIndex == -1 || featureFieldIndex == -1)
1084     {
1085       return null;
1086     }
1087     Object value = variant.getAttribute(vcfInfoId);
1088
1089     if (value == null || !(value instanceof List<?>))
1090     {
1091       return null;
1092     }
1093
1094     /*
1095      * inspect each consequence in turn (comma-separated blocks
1096      * extracted by htsjdk)
1097      */
1098     List<String> consequences = (List<String>) value;
1099
1100     for (String consequence : consequences)
1101     {
1102       String[] csqFields = consequence.split(PIPE_REGEX);
1103       if (csqFields.length > featureFieldIndex)
1104       {
1105         String featureIdentifier = csqFields[featureFieldIndex];
1106         if (featureIdentifier.length() > 4
1107                 && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
1108         {
1109           /*
1110            * feature (transcript) matched - now check for allele match
1111            */
1112           if (matchAllele(variant, altAlleleIndex, csqFields,
1113                   alleleFieldIndex, alleleNumberFieldIndex))
1114           {
1115             return consequence;
1116           }
1117         }
1118       }
1119     }
1120     return null;
1121   }
1122
1123   private boolean matchAllele(VariantContext variant, int altAlleleIndex,
1124           String[] csqFields, int alleleFieldIndex,
1125           int alleleNumberFieldIndex)
1126   {
1127     /*
1128      * if ALLELE_NUM is present, it must match altAlleleIndex
1129      * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
1130      */
1131     if (alleleNumberFieldIndex > -1)
1132     {
1133       if (csqFields.length <= alleleNumberFieldIndex)
1134       {
1135         return false;
1136       }
1137       String alleleNum = csqFields[alleleNumberFieldIndex];
1138       return String.valueOf(altAlleleIndex + 1).equals(alleleNum);
1139     }
1140
1141     /*
1142      * else consequence allele must match variant allele
1143      */
1144     if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex)
1145     {
1146       String csqAllele = csqFields[alleleFieldIndex];
1147       String vcfAllele = variant.getAlternateAllele(altAlleleIndex)
1148               .getBaseString();
1149       return csqAllele.equals(vcfAllele);
1150     }
1151     return false;
1152   }
1153
1154   /**
1155    * Add any allele-specific VCF key-value data to the sequence feature
1156    *
1157    * @param variant
1158    * @param sf
1159    * @param altAlelleIndex
1160    *          (0, 1..)
1161    * @param consequence
1162    *          if not null, the consequence specific to this sequence (transcript
1163    *          feature) and allele
1164    */
1165   protected void addAlleleProperties(VariantContext variant,
1166           SequenceFeature sf, final int altAlelleIndex, String consequence)
1167   {
1168     Map<String, Object> atts = variant.getAttributes();
1169
1170     for (Entry<String, Object> att : atts.entrySet())
1171     {
1172       String key = att.getKey();
1173
1174       /*
1175        * extract Consequence data (if present) that we are able to
1176        * associated with the allele for this variant feature
1177        */
1178       if (CSQ_FIELD.equals(key))
1179       {
1180         addConsequences(variant, sf, consequence);
1181         continue;
1182       }
1183
1184       /*
1185        * filter out fields we don't want to capture
1186        */
1187       if (!vcfFieldsOfInterest.contains(key))
1188       {
1189         continue;
1190       }
1191
1192       /*
1193        * filter out fields we don't want to capture
1194        */
1195       if (!vcfFieldsOfInterest.contains(key))
1196       {
1197         continue;
1198       }
1199
1200       /*
1201        * we extract values for other data which are allele-specific;
1202        * these may be per alternate allele (INFO[key].Number = 'A')
1203        * or per allele including reference (INFO[key].Number = 'R')
1204        */
1205       VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key);
1206       if (infoHeader == null)
1207       {
1208         /*
1209          * can't be sure what data belongs to this allele, so
1210          * play safe and don't take any
1211          */
1212         continue;
1213       }
1214
1215       VCFHeaderLineCount number = infoHeader.getCountType();
1216       int index = altAlelleIndex;
1217       if (number == VCFHeaderLineCount.R)
1218       {
1219         /*
1220          * one value per allele including reference, so bump index
1221          * e.g. the 3rd value is for the  2nd alternate allele
1222          */
1223         index++;
1224       }
1225       else if (number != VCFHeaderLineCount.A)
1226       {
1227         /*
1228          * don't save other values as not allele-related
1229          */
1230         continue;
1231       }
1232
1233       /*
1234        * take the index'th value
1235        */
1236       String value = getAttributeValue(variant, key, index);
1237       if (value != null)
1238       {
1239         sf.setValue(key, value);
1240       }
1241     }
1242   }
1243
1244   /**
1245    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
1246    * feature.
1247    * <p>
1248    * If <code>myConsequence</code> is not null, then this is the specific
1249    * consequence data (pipe-delimited fields) that is for the current allele and
1250    * transcript (sequence) being processed)
1251    *
1252    * @param variant
1253    * @param sf
1254    * @param myConsequence
1255    */
1256   protected void addConsequences(VariantContext variant, SequenceFeature sf,
1257           String myConsequence)
1258   {
1259     Object value = variant.getAttribute(CSQ_FIELD);
1260
1261     if (value == null || !(value instanceof List<?>))
1262     {
1263       return;
1264     }
1265
1266     List<String> consequences = (List<String>) value;
1267
1268     /*
1269      * inspect CSQ consequences; restrict to the consequence
1270      * associated with the current transcript (Feature)
1271      */
1272     Map<String, String> csqValues = new HashMap<>();
1273
1274     for (String consequence : consequences)
1275     {
1276       if (myConsequence == null || myConsequence.equals(consequence))
1277       {
1278         String[] csqFields = consequence.split(PIPE_REGEX);
1279
1280         /*
1281          * inspect individual fields of this consequence, copying non-null
1282          * values which are 'fields of interest'
1283          */
1284         int i = 0;
1285         for (String field : csqFields)
1286         {
1287           if (field != null && field.length() > 0)
1288           {
1289             String id = vepFieldsOfInterest.get(i);
1290             if (id != null)
1291             {
1292               csqValues.put(id, field);
1293             }
1294           }
1295           i++;
1296         }
1297       }
1298     }
1299
1300     if (!csqValues.isEmpty())
1301     {
1302       sf.setValue(CSQ_FIELD, csqValues);
1303     }
1304   }
1305
1306   /**
1307    * A convenience method to complement a dna base and return the string value
1308    * of its complement
1309    *
1310    * @param reference
1311    * @return
1312    */
1313   protected String complement(byte[] reference)
1314   {
1315     return String.valueOf(Dna.getComplement((char) reference[0]));
1316   }
1317
1318   /**
1319    * Determines the location of the query range (chromosome positions) in a
1320    * different reference assembly.
1321    * <p>
1322    * If the range is just a subregion of one for which we already have a mapping
1323    * (for example, an exon sub-region of a gene), then the mapping is just
1324    * computed arithmetically.
1325    * <p>
1326    * Otherwise, calls the Ensembl REST service that maps from one assembly
1327    * reference's coordinates to another's
1328    *
1329    * @param queryRange
1330    *          start-end chromosomal range in 'fromRef' coordinates
1331    * @param chromosome
1332    * @param species
1333    * @param fromRef
1334    *          assembly reference for the query coordinates
1335    * @param toRef
1336    *          assembly reference we wish to translate to
1337    * @return the start-end range in 'toRef' coordinates
1338    */
1339   protected int[] mapReferenceRange(int[] queryRange, String chromosome,
1340           String species, String fromRef, String toRef)
1341   {
1342     /*
1343      * first try shorcut of computing the mapping as a subregion of one
1344      * we already have (e.g. for an exon, if we have the gene mapping)
1345      */
1346     int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
1347             species, fromRef, toRef);
1348     if (mappedRange != null)
1349     {
1350       return mappedRange;
1351     }
1352
1353     /*
1354      * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
1355      */
1356     EnsemblMap mapper = new EnsemblMap();
1357     int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
1358             toRef, queryRange);
1359
1360     if (mapping == null)
1361     {
1362       // mapping service failure
1363       return null;
1364     }
1365
1366     /*
1367      * save mapping for possible future re-use
1368      */
1369     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1370     if (!assemblyMappings.containsKey(key))
1371     {
1372       assemblyMappings.put(key, new HashMap<int[], int[]>());
1373     }
1374
1375     assemblyMappings.get(key).put(queryRange, mapping);
1376
1377     return mapping;
1378   }
1379
1380   /**
1381    * If we already have a 1:1 contiguous mapping which subsumes the given query
1382    * range, this method just calculates and returns the subset of that mapping,
1383    * else it returns null. In practical terms, if a gene has a contiguous
1384    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
1385    * subsidiary exons occupy unchanged relative positions, and just compute
1386    * these as offsets, rather than do another lookup of the mapping.
1387    * <p>
1388    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
1389    * simply remove this method or let it always return null.
1390    * <p>
1391    * Warning: many rapid calls to the /map service map result in a 429 overload
1392    * error response
1393    *
1394    * @param queryRange
1395    * @param chromosome
1396    * @param species
1397    * @param fromRef
1398    * @param toRef
1399    * @return
1400    */
1401   protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
1402           String species, String fromRef, String toRef)
1403   {
1404     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1405     if (assemblyMappings.containsKey(key))
1406     {
1407       Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
1408       for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
1409       {
1410         int[] fromRange = mappedRange.getKey();
1411         int[] toRange = mappedRange.getValue();
1412         if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
1413         {
1414           /*
1415            * mapping is 1:1 in length, so we trust it to have no discontinuities
1416            */
1417           if (MappingUtils.rangeContains(fromRange, queryRange))
1418           {
1419             /*
1420              * fromRange subsumes our query range
1421              */
1422             int offset = queryRange[0] - fromRange[0];
1423             int mappedRangeFrom = toRange[0] + offset;
1424             int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
1425             return new int[] { mappedRangeFrom, mappedRangeTo };
1426           }
1427         }
1428       }
1429     }
1430     return null;
1431   }
1432
1433   /**
1434    * Transfers the sequence feature to the target sequence, locating its start
1435    * and end range based on the mapping. Features which do not overlap the
1436    * target sequence are ignored.
1437    *
1438    * @param sf
1439    * @param targetSequence
1440    * @param mapping
1441    *          mapping from the feature's coordinates to the target sequence
1442    */
1443   protected void transferFeature(SequenceFeature sf,
1444           SequenceI targetSequence, MapList mapping)
1445   {
1446     int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
1447
1448     if (mappedRange != null)
1449     {
1450       String group = sf.getFeatureGroup();
1451       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
1452       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
1453       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
1454               group, sf.getScore());
1455       targetSequence.addSequenceFeature(copy);
1456     }
1457   }
1458
1459   /**
1460    * Formats a ranges map lookup key
1461    *
1462    * @param chromosome
1463    * @param species
1464    * @param fromRef
1465    * @param toRef
1466    * @return
1467    */
1468   protected static String makeRangesKey(String chromosome, String species,
1469           String fromRef, String toRef)
1470   {
1471     return species + EXCL + chromosome + EXCL + fromRef + EXCL
1472             + toRef;
1473   }
1474 }