src/jalview/io/vcf/VCFLoader.java

   1 package jalview.io.vcf;
   2
   3 import jalview.analysis.AlignmentUtils;
   4 import jalview.analysis.Dna;
   5 import jalview.api.AlignViewControllerGuiI;
   6 import jalview.bin.Cache;
   7 import jalview.datamodel.DBRefEntry;
   8 import jalview.datamodel.GeneLociI;
   9 import jalview.datamodel.Mapping;
  10 import jalview.datamodel.SequenceFeature;
  11 import jalview.datamodel.SequenceI;
  12 import jalview.datamodel.features.FeatureAttributeType;
  13 import jalview.datamodel.features.FeatureSource;
  14 import jalview.datamodel.features.FeatureSources;
  15 import jalview.ext.ensembl.EnsemblMap;
  16 import jalview.ext.htsjdk.HtsContigDb;
  17 import jalview.ext.htsjdk.VCFReader;
  18 import jalview.io.gff.Gff3Helper;
  19 import jalview.io.gff.SequenceOntologyI;
  20 import jalview.util.MapList;
  21 import jalview.util.MappingUtils;
  22 import jalview.util.MessageManager;
  23
  24 import java.io.File;
  25 import java.io.IOException;
  26 import java.util.ArrayList;
  27 import java.util.HashMap;
  28 import java.util.List;
  29 import java.util.Map;
  30 import java.util.Map.Entry;
  31 import java.util.regex.Pattern;
  32 import java.util.regex.PatternSyntaxException;
  33
  34 import htsjdk.samtools.SAMException;
  35 import htsjdk.samtools.SAMSequenceDictionary;
  36 import htsjdk.samtools.SAMSequenceRecord;
  37 import htsjdk.samtools.util.CloseableIterator;
  38 import htsjdk.variant.variantcontext.Allele;
  39 import htsjdk.variant.variantcontext.VariantContext;
  40 import htsjdk.variant.vcf.VCFHeader;
  41 import htsjdk.variant.vcf.VCFHeaderLine;
  42 import htsjdk.variant.vcf.VCFHeaderLineCount;
  43 import htsjdk.variant.vcf.VCFHeaderLineType;
  44 import htsjdk.variant.vcf.VCFInfoHeaderLine;
  45
  46 /**
  47  * A class to read VCF data (using the htsjdk) and add variants as sequence
  48  * features on dna and any related protein product sequences
  49  *
  50  * @author gmcarstairs
  51  */
  52 public class VCFLoader
  53 {
  54   /**
  55    * A class to model the mapping from sequence to VCF coordinates. Cases include
  56    * <ul>
  57    * <li>a direct 1:1 mapping where the sequence is one of the VCF contigs</li>
  58    * <li>a mapping of sequence to chromosomal coordinates, where sequence and VCF
  59    * use the same reference assembly</li>
  60    * <li>a modified mapping of sequence to chromosomal coordinates, where sequence
  61    * and VCF use different reference assembles</li>
  62    * </ul>
  63    */
  64   class VCFMap
  65   {
  66     final String chromosome;
  67
  68     final MapList map;
  69
  70     VCFMap(String chr, MapList m)
  71     {
  72       chromosome = chr;
  73       map = m;
  74     }
  75
  76     @Override
  77     public String toString()
  78     {
  79       return chromosome + ":" + map.toString();
  80     }
  81   }
  82
  83   /*
  84    * Lookup keys, and default values, for Preference entries that describe
  85    * patterns for VCF and VEP fields to capture
  86    */
  87   private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
  88
  89   private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
  90
  91   private static final String DEFAULT_VCF_FIELDS = ".*";
  92
  93   private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
  94
  95   /*
  96    * keys to fields of VEP CSQ consequence data
  97    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
  98    */
  99   private static final String CSQ_CONSEQUENCE_KEY = "Consequence";
 100   private static final String CSQ_ALLELE_KEY = "Allele";
 101   private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
 102   private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id
 103
 104   /*
 105    * default VCF INFO key for VEP consequence data
 106    * NB this can be overridden running VEP with --vcf_info_field
 107    * - we don't handle this case (require identifier to be CSQ)
 108    */
 109   private static final String CSQ_FIELD = "CSQ";
 110
 111   /*
 112    * separator for fields in consequence data is '|'
 113    */
 114   private static final String PIPE_REGEX = "\\|";
 115
 116   /*
 117    * key for Allele Frequency output by VEP
 118    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 119    */
 120   private static final String ALLELE_FREQUENCY_KEY = "AF";
 121
 122   /*
 123    * delimiter that separates multiple consequence data blocks
 124    */
 125   private static final String COMMA = ",";
 126
 127   /*
 128    * the feature group assigned to a VCF variant in Jalview
 129    */
 130   private static final String FEATURE_GROUP_VCF = "VCF";
 131
 132   /*
 133    * internal delimiter used to build keys for assemblyMappings
 134    *
 135    */
 136   private static final String EXCL = "!";
 137
 138   /*
 139    * the VCF file we are processing
 140    */
 141   protected String vcfFilePath;
 142
 143   /*
 144    * mappings between VCF and sequence reference assembly regions, as
 145    * key = "species!chromosome!fromAssembly!toAssembly
 146    * value = Map{fromRange, toRange}
 147    */
 148   private Map<String, Map<int[], int[]>> assemblyMappings;
 149
 150   private VCFReader reader;
 151
 152   /*
 153    * holds details of the VCF header lines (metadata)
 154    */
 155   private VCFHeader header;
 156
 157   /*
 158    * a Dictionary of contigs (if present) referenced in the VCF file
 159    */
 160   private SAMSequenceDictionary dictionary;
 161
 162   /*
 163    * the position (0...) of field in each block of
 164    * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
 165    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 166    */
 167   private int csqConsequenceFieldIndex = -1;
 168   private int csqAlleleFieldIndex = -1;
 169   private int csqAlleleNumberFieldIndex = -1;
 170   private int csqFeatureFieldIndex = -1;
 171
 172   // todo the same fields for SnpEff ANN data if wanted
 173   // see http://snpeff.sourceforge.net/SnpEff_manual.html#input
 174
 175   /*
 176    * a unique identifier under which to save metadata about feature
 177    * attributes (selected INFO field data)
 178    */
 179   private String sourceId;
 180
 181   /*
 182    * The INFO IDs of data that is both present in the VCF file, and
 183    * also matched by any filters for data of interest
 184    */
 185   List<String> vcfFieldsOfInterest;
 186
 187   /*
 188    * The field offsets and identifiers for VEP (CSQ) data that is both present
 189    * in the VCF file, and also matched by any filters for data of interest
 190    * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
 191    */
 192   Map<Integer, String> vepFieldsOfInterest;
 193
 194   /**
 195    * Constructor given a VCF file
 196    *
 197    * @param alignment
 198    */
 199   public VCFLoader(String vcfFile)
 200   {
 201     try
 202     {
 203       initialise(vcfFile);
 204     } catch (IOException e)
 205     {
 206       System.err.println("Error opening VCF file: " + e.getMessage());
 207     }
 208
 209     // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
 210     assemblyMappings = new HashMap<>();
 211   }
 212
 213   /**
 214    * Starts a new thread to query and load VCF variant data on to the given
 215    * sequences
 216    * <p>
 217    * This method is not thread safe - concurrent threads should use separate
 218    * instances of this class.
 219    *
 220    * @param seqs
 221    * @param gui
 222    */
 223   public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui)
 224   {
 225     if (gui != null)
 226     {
 227       gui.setStatus(MessageManager.getString("label.searching_vcf"));
 228     }
 229
 230     new Thread()
 231     {
 232       @Override
 233       public void run()
 234       {
 235         VCFLoader.this.doLoad(seqs, gui);
 236       }
 237     }.start();
 238   }
 239
 240   /**
 241    * Reads the specified contig sequence and adds its VCF variants to it
 242    *
 243    * @param contig
 244    *          the id of a single sequence (contig) to load
 245    * @return
 246    */
 247   public SequenceI loadVCFContig(String contig)
 248   {
 249     String ref = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY)
 250             .getValue();
 251     if (ref.startsWith("file://"))
 252     {
 253       ref = ref.substring(7);
 254     }
 255
 256     SequenceI seq = null;
 257     File dbFile = new File(ref);
 258
 259     if (dbFile.exists())
 260     {
 261       HtsContigDb db = new HtsContigDb("", dbFile);
 262       seq = db.getSequenceProxy(contig);
 263       loadSequenceVCF(seq, ref);
 264       db.close();
 265     }
 266     else
 267     {
 268       System.err.println("VCF reference not found: " + ref);
 269     }
 270
 271     return seq;
 272   }
 273
 274   /**
 275    * Loads VCF on to one or more sequences
 276    *
 277    * @param seqs
 278    * @param gui
 279    *          optional callback handler for messages
 280    */
 281   protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui)
 282   {
 283     try
 284     {
 285       VCFHeaderLine ref = header
 286               .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 287       String vcfAssembly = ref.getValue();
 288
 289       int varCount = 0;
 290       int seqCount = 0;
 291
 292       /*
 293        * query for VCF overlapping each sequence in turn
 294        */
 295       for (SequenceI seq : seqs)
 296       {
 297         int added = loadSequenceVCF(seq, vcfAssembly);
 298         if (added > 0)
 299         {
 300           seqCount++;
 301           varCount += added;
 302           transferAddedFeatures(seq);
 303         }
 304       }
 305       if (gui != null)
 306       {
 307         String msg = MessageManager.formatMessage("label.added_vcf",
 308                 varCount, seqCount);
 309         gui.setStatus(msg);
 310         if (gui.getFeatureSettingsUI() != null)
 311         {
 312           gui.getFeatureSettingsUI().discoverAllFeatureData();
 313         }
 314       }
 315     } catch (Throwable e)
 316     {
 317       System.err.println("Error processing VCF: " + e.getMessage());
 318       e.printStackTrace();
 319       if (gui != null)
 320       {
 321         gui.setStatus("Error occurred - see console for details");
 322       }
 323     } finally
 324     {
 325       if (reader != null)
 326       {
 327         try
 328         {
 329           reader.close();
 330         } catch (IOException e)
 331         {
 332           // ignore
 333         }
 334       }
 335       header = null;
 336       dictionary = null;
 337     }
 338   }
 339
 340   /**
 341    * Opens the VCF file and parses header data
 342    *
 343    * @param filePath
 344    * @throws IOException
 345    */
 346   private void initialise(String filePath) throws IOException
 347   {
 348     vcfFilePath = filePath;
 349
 350     reader = new VCFReader(filePath);
 351
 352     header = reader.getFileHeader();
 353
 354     try
 355     {
 356       dictionary = header.getSequenceDictionary();
 357     } catch (SAMException e)
 358     {
 359       // ignore - thrown if any contig line lacks length info
 360     }
 361
 362     sourceId = filePath;
 363
 364     saveMetadata(sourceId);
 365
 366     /*
 367      * get offset of CSQ ALLELE_NUM and Feature if declared
 368      */
 369     parseCsqHeader();
 370   }
 371
 372   /**
 373    * Reads metadata (such as INFO field descriptions and datatypes) and saves
 374    * them for future reference
 375    *
 376    * @param theSourceId
 377    */
 378   void saveMetadata(String theSourceId)
 379   {
 380     List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
 381             DEFAULT_VCF_FIELDS);
 382     vcfFieldsOfInterest = new ArrayList<>();
 383
 384     FeatureSource metadata = new FeatureSource(theSourceId);
 385
 386     for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
 387     {
 388       String attributeId = info.getID();
 389       String desc = info.getDescription();
 390       VCFHeaderLineType type = info.getType();
 391       FeatureAttributeType attType = null;
 392       switch (type)
 393       {
 394       case Character:
 395         attType = FeatureAttributeType.Character;
 396         break;
 397       case Flag:
 398         attType = FeatureAttributeType.Flag;
 399         break;
 400       case Float:
 401         attType = FeatureAttributeType.Float;
 402         break;
 403       case Integer:
 404         attType = FeatureAttributeType.Integer;
 405         break;
 406       case String:
 407         attType = FeatureAttributeType.String;
 408         break;
 409       }
 410       metadata.setAttributeName(attributeId, desc);
 411       metadata.setAttributeType(attributeId, attType);
 412
 413       if (isFieldWanted(attributeId, vcfFieldPatterns))
 414       {
 415         vcfFieldsOfInterest.add(attributeId);
 416       }
 417     }
 418
 419     FeatureSources.getInstance().addSource(theSourceId, metadata);
 420   }
 421
 422   /**
 423    * Answers true if the field id is matched by any of the filter patterns, else
 424    * false. Matching is against regular expression patterns, and is not
 425    * case-sensitive.
 426    *
 427    * @param id
 428    * @param filters
 429    * @return
 430    */
 431   private boolean isFieldWanted(String id, List<Pattern> filters)
 432   {
 433     for (Pattern p : filters)
 434     {
 435       if (p.matcher(id.toUpperCase()).matches())
 436       {
 437         return true;
 438       }
 439     }
 440     return false;
 441   }
 442
 443   /**
 444    * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
 445    * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
 446    * required for processing.
 447    * <p>
 448    * CSQ fields are declared in the CSQ INFO Description e.g.
 449    * <p>
 450    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
 451    */
 452   protected void parseCsqHeader()
 453   {
 454     List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
 455             DEFAULT_VEP_FIELDS);
 456     vepFieldsOfInterest = new HashMap<>();
 457
 458     VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
 459     if (csqInfo == null)
 460     {
 461       return;
 462     }
 463
 464     /*
 465      * parse out the pipe-separated list of CSQ fields; we assume here that
 466      * these form the last part of the description, and contain no spaces
 467      */
 468     String desc = csqInfo.getDescription();
 469     int spacePos = desc.lastIndexOf(" ");
 470     desc = desc.substring(spacePos + 1);
 471
 472     if (desc != null)
 473     {
 474       String[] format = desc.split(PIPE_REGEX);
 475       int index = 0;
 476       for (String field : format)
 477       {
 478         if (CSQ_CONSEQUENCE_KEY.equals(field))
 479         {
 480           csqConsequenceFieldIndex = index;
 481         }
 482         if (CSQ_ALLELE_NUM_KEY.equals(field))
 483         {
 484           csqAlleleNumberFieldIndex = index;
 485         }
 486         if (CSQ_ALLELE_KEY.equals(field))
 487         {
 488           csqAlleleFieldIndex = index;
 489         }
 490         if (CSQ_FEATURE_KEY.equals(field))
 491         {
 492           csqFeatureFieldIndex = index;
 493         }
 494
 495         if (isFieldWanted(field, vepFieldFilters))
 496         {
 497           vepFieldsOfInterest.put(index, field);
 498         }
 499
 500         index++;
 501       }
 502     }
 503   }
 504
 505   /**
 506    * Reads the Preference value for the given key, with default specified if no
 507    * preference set. The value is interpreted as a comma-separated list of
 508    * regular expressions, and converted into a list of compiled patterns ready
 509    * for matching. Patterns are forced to upper-case for non-case-sensitive
 510    * matching.
 511    * <p>
 512    * This supports user-defined filters for fields of interest to capture while
 513    * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
 514    * fields with an ID of AF, or starting with AC, would be matched.
 515    *
 516    * @param key
 517    * @param def
 518    * @return
 519    */
 520   private List<Pattern> getFieldMatchers(String key, String def)
 521   {
 522     String pref = Cache.getDefault(key, def);
 523     List<Pattern> patterns = new ArrayList<>();
 524     String[] tokens = pref.split(",");
 525     for (String token : tokens)
 526     {
 527       try
 528       {
 529       patterns.add(Pattern.compile(token.toUpperCase()));
 530       } catch (PatternSyntaxException e)
 531       {
 532         System.err.println("Invalid pattern ignored: " + token);
 533       }
 534     }
 535     return patterns;
 536   }
 537
 538   /**
 539    * Transfers VCF features to sequences to which this sequence has a mapping.
 540    * If the mapping is 3:1, computes peptide variants from nucleotide variants.
 541    *
 542    * @param seq
 543    */
 544   protected void transferAddedFeatures(SequenceI seq)
 545   {
 546     List<DBRefEntry> dbrefs = seq.getDBRefs();
 547     if (dbrefs == null)
 548     {
 549       return;
 550     }
 551     for (DBRefEntry dbref : dbrefs)
 552     {
 553       Mapping mapping = dbref.getMap();
 554       if (mapping == null || mapping.getTo() == null)
 555       {
 556         continue;
 557       }
 558
 559       SequenceI mapTo = mapping.getTo();
 560       MapList map = mapping.getMap();
 561       if (map.getFromRatio() == 3)
 562       {
 563         /*
 564          * dna-to-peptide product mapping
 565          */
 566         AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
 567       }
 568       else
 569       {
 570         /*
 571          * nucleotide-to-nucleotide mapping e.g. transcript to CDS
 572          */
 573         List<SequenceFeature> features = seq.getFeatures()
 574                 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
 575         for (SequenceFeature sf : features)
 576         {
 577           if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
 578           {
 579             transferFeature(sf, mapTo, map);
 580           }
 581         }
 582       }
 583     }
 584   }
 585
 586   /**
 587    * Tries to add overlapping variants read from a VCF file to the given sequence,
 588    * and returns the number of variant features added
 589    *
 590    * @param seq
 591    * @param vcfAssembly
 592    * @return
 593    */
 594   protected int loadSequenceVCF(SequenceI seq, String vcfAssembly)
 595   {
 596     VCFMap vcfMap = getVcfMap(seq, vcfAssembly);
 597     if (vcfMap == null)
 598     {
 599       return 0;
 600     }
 601
 602     /*
 603      * work with the dataset sequence here
 604      */
 605     SequenceI dss = seq.getDatasetSequence();
 606     if (dss == null)
 607     {
 608       dss = seq;
 609     }
 610     return addVcfVariants(dss, vcfMap);
 611   }
 612
 613   /**
 614    * Answers a map from sequence coordinates to VCF chromosome ranges
 615    *
 616    * @param seq
 617    * @param vcfAssembly
 618    * @return
 619    */
 620   private VCFMap getVcfMap(SequenceI seq, String vcfAssembly)
 621   {
 622     /*
 623      * simplest case: sequence has id and length matching a VCF contig
 624      */
 625     VCFMap vcfMap = null;
 626     if (dictionary != null)
 627     {
 628       vcfMap = getContigMap(seq);
 629     }
 630     if (vcfMap != null)
 631     {
 632       return vcfMap;
 633     }
 634
 635     /*
 636      * otherwise, map to VCF from chromosomal coordinates
 637      * of the sequence (if known)
 638      */
 639     GeneLociI seqCoords = seq.getGeneLoci();
 640     if (seqCoords == null)
 641     {
 642       Cache.log.warn(String.format(
 643               "Can't query VCF for %s as chromosome coordinates not known",
 644               seq.getName()));
 645       return null;
 646     }
 647
 648     String species = seqCoords.getSpeciesId();
 649     String chromosome = seqCoords.getChromosomeId();
 650     String seqRef = seqCoords.getAssemblyId();
 651     MapList map = seqCoords.getMap();
 652
 653     if (!vcfSpeciesMatchesSequence(vcfAssembly, species))
 654     {
 655       return null;
 656     }
 657
 658     if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef))
 659     {
 660       return new VCFMap(chromosome, map);
 661     }
 662
 663     if (!"GRCh38".equalsIgnoreCase(seqRef) // Ensembl
 664             || !vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
 665     {
 666       return null;
 667     }
 668
 669     /*
 670      * map chromosomal coordinates from sequence to VCF if the VCF
 671      * data has a different reference assembly to the sequence
 672      */
 673     // TODO generalise for cases other than GRCh38 -> GRCh37 !
 674     // - or get the user to choose in a dialog
 675
 676     List<int[]> toVcfRanges = new ArrayList<>();
 677     List<int[]> fromSequenceRanges = new ArrayList<>();
 678     String toRef = "GRCh37";
 679
 680     for (int[] range : map.getToRanges())
 681     {
 682       int[] fromRange = map.locateInFrom(range[0], range[1]);
 683       if (fromRange == null)
 684       {
 685         // corrupted map?!?
 686         continue;
 687       }
 688
 689       int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
 690               toRef);
 691       if (newRange == null)
 692       {
 693         Cache.log.error(
 694                 String.format("Failed to map %s:%s:%s:%d:%d to %s", species,
 695                         chromosome, seqRef, range[0], range[1], toRef));
 696         continue;
 697       }
 698       else
 699       {
 700         toVcfRanges.add(newRange);
 701         fromSequenceRanges.add(fromRange);
 702       }
 703     }
 704
 705     return new VCFMap(chromosome,
 706             new MapList(fromSequenceRanges, toVcfRanges, 1, 1));
 707   }
 708
 709   /**
 710    * If the sequence id matches a contig declared in the VCF file, and the
 711    * sequence length matches the contig length, then returns a 1:1 map of the
 712    * sequence to the contig, else returns null
 713    *
 714    * @param seq
 715    * @return
 716    */
 717   private VCFMap getContigMap(SequenceI seq)
 718   {
 719     String id = seq.getName();
 720     SAMSequenceRecord contig = dictionary.getSequence(id);
 721     if (contig != null)
 722     {
 723       int len = seq.getLength();
 724       if (len == contig.getSequenceLength())
 725       {
 726         MapList map = new MapList(new int[] { 1, len },
 727                 new int[]
 728                 { 1, len }, 1, 1);
 729         return new VCFMap(id, map);
 730       }
 731     }
 732     return null;
 733   }
 734
 735   /**
 736    * Answers true if we determine that the VCF data uses the same reference
 737    * assembly as the sequence, else false
 738    *
 739    * @param vcfAssembly
 740    * @param seqRef
 741    * @return
 742    */
 743   private boolean vcfAssemblyMatchesSequence(String vcfAssembly,
 744           String seqRef)
 745   {
 746     // TODO improve on this stub, which handles gnomAD and
 747     // hopes for the best for other cases
 748
 749     if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl
 750             && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD
 751     {
 752       return false;
 753     }
 754     return true;
 755   }
 756
 757   /**
 758    * Answers true if the species inferred from the VCF reference identifier
 759    * matches that for the sequence
 760    *
 761    * @param vcfAssembly
 762    * @param speciesId
 763    * @return
 764    */
 765   boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId)
 766   {
 767     // PROBLEM 1
 768     // there are many aliases for species - how to equate one with another?
 769     // PROBLEM 2
 770     // VCF ##reference header is an unstructured URI - how to extract species?
 771     // perhaps check if ref includes any (Ensembl) alias of speciesId??
 772     // TODO ask the user to confirm this??
 773
 774     if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example
 775             && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id
 776     {
 777       return true;
 778     }
 779
 780     if (vcfAssembly.contains("c_elegans") // VEP VCF response example
 781             && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl
 782     {
 783       return true;
 784     }
 785
 786     // this is not a sustainable solution...
 787
 788     return false;
 789   }
 790
 791   /**
 792    * Queries the VCF reader for any variants that overlap the mapped chromosome
 793    * ranges of the sequence, and adds as variant features. Returns the number of
 794    * overlapping variants found.
 795    *
 796    * @param seq
 797    * @param map
 798    *          mapping from sequence to VCF coordinates
 799    * @return
 800    */
 801   protected int addVcfVariants(SequenceI seq, VCFMap map)
 802   {
 803     boolean forwardStrand = map.map.isToForwardStrand();
 804
 805     /*
 806      * query the VCF for overlaps of each contiguous chromosomal region
 807      */
 808     int count = 0;
 809
 810     for (int[] range : map.map.getToRanges())
 811     {
 812       int vcfStart = Math.min(range[0], range[1]);
 813       int vcfEnd = Math.max(range[0], range[1]);
 814       CloseableIterator<VariantContext> variants = reader
 815               .query(map.chromosome, vcfStart, vcfEnd);
 816       while (variants.hasNext())
 817       {
 818         VariantContext variant = variants.next();
 819
 820         int[] featureRange = map.map.locateInFrom(variant.getStart(),
 821                 variant.getEnd());
 822
 823         if (featureRange != null)
 824         {
 825           int featureStart = Math.min(featureRange[0], featureRange[1]);
 826           int featureEnd = Math.max(featureRange[0], featureRange[1]);
 827           count += addAlleleFeatures(seq, variant, featureStart, featureEnd,
 828                   forwardStrand);
 829         }
 830       }
 831       variants.close();
 832     }
 833
 834     return count;
 835   }
 836
 837   /**
 838    * A convenience method to get an attribute value for an alternate allele
 839    *
 840    * @param variant
 841    * @param attributeName
 842    * @param alleleIndex
 843    * @return
 844    */
 845   protected String getAttributeValue(VariantContext variant,
 846           String attributeName, int alleleIndex)
 847   {
 848     Object att = variant.getAttribute(attributeName);
 849
 850     if (att instanceof String)
 851     {
 852       return (String) att;
 853     }
 854     else if (att instanceof ArrayList)
 855     {
 856       return ((List<String>) att).get(alleleIndex);
 857     }
 858
 859     return null;
 860   }
 861
 862   /**
 863    * Adds one variant feature for each allele in the VCF variant record, and
 864    * returns the number of features added.
 865    *
 866    * @param seq
 867    * @param variant
 868    * @param featureStart
 869    * @param featureEnd
 870    * @param forwardStrand
 871    * @return
 872    */
 873   protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
 874           int featureStart, int featureEnd, boolean forwardStrand)
 875   {
 876     int added = 0;
 877
 878     /*
 879      * Javadoc says getAlternateAlleles() imposes no order on the list returned
 880      * so we proceed defensively to get them in strict order
 881      */
 882     int altAlleleCount = variant.getAlternateAlleles().size();
 883     for (int i = 0; i < altAlleleCount; i++)
 884     {
 885       added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
 886               forwardStrand);
 887     }
 888     return added;
 889   }
 890
 891   /**
 892    * Inspects one allele and attempts to add a variant feature for it to the
 893    * sequence. The additional data associated with this allele is extracted to
 894    * store in the feature's key-value map. Answers the number of features added (0
 895    * or 1).
 896    *
 897    * @param seq
 898    * @param variant
 899    * @param altAlleleIndex
 900    *          (0, 1..)
 901    * @param featureStart
 902    * @param featureEnd
 903    * @param forwardStrand
 904    * @return
 905    */
 906   protected int addAlleleFeature(SequenceI seq, VariantContext variant,
 907           int altAlleleIndex, int featureStart, int featureEnd,
 908           boolean forwardStrand)
 909   {
 910     String reference = variant.getReference().getBaseString();
 911     Allele alt = variant.getAlternateAllele(altAlleleIndex);
 912     String allele = alt.getBaseString();
 913
 914     /*
 915      * insertion after a genomic base, if on reverse strand, has to be
 916      * converted to insertion of complement after the preceding position
 917      */
 918     int referenceLength = reference.length();
 919     if (!forwardStrand && allele.length() > referenceLength
 920             && allele.startsWith(reference))
 921     {
 922       featureStart -= referenceLength;
 923       featureEnd = featureStart;
 924       char insertAfter = seq.getCharAt(featureStart - seq.getStart());
 925       reference = Dna.reverseComplement(String.valueOf(insertAfter));
 926       allele = allele.substring(referenceLength) + reference;
 927     }
 928
 929     /*
 930      * build the ref,alt allele description e.g. "G,A", using the base
 931      * complement if the sequence is on the reverse strand
 932      */
 933     StringBuilder sb = new StringBuilder();
 934     sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
 935     sb.append(COMMA);
 936     sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
 937     String alleles = sb.toString(); // e.g. G,A
 938
 939     /*
 940      * pick out the consequence data (if any) that is for the current allele
 941      * and feature (transcript) that matches the current sequence
 942      */
 943     String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD,
 944             altAlleleIndex, csqAlleleFieldIndex,
 945             csqAlleleNumberFieldIndex, seq.getName().toLowerCase(),
 946             csqFeatureFieldIndex);
 947
 948     /*
 949      * pick out the ontology term for the consequence type
 950      */
 951     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 952     if (consequence != null)
 953     {
 954       type = getOntologyTerm(consequence);
 955     }
 956
 957     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
 958             featureEnd, FEATURE_GROUP_VCF);
 959     sf.setSource(sourceId);
 960
 961     sf.setValue(Gff3Helper.ALLELES, alleles);
 962
 963     addAlleleProperties(variant, sf, altAlleleIndex, consequence);
 964
 965     seq.addSequenceFeature(sf);
 966
 967     return 1;
 968   }
 969
 970   /**
 971    * Determines the Sequence Ontology term to use for the variant feature type in
 972    * Jalview. The default is 'sequence_variant', but a more specific term is used
 973    * if:
 974    * <ul>
 975    * <li>VEP (or SnpEff) Consequence annotation is included in the VCF</li>
 976    * <li>sequence id can be matched to VEP Feature (or SnpEff Feature_ID)</li>
 977    * </ul>
 978    *
 979    * @param consequence
 980    * @return
 981    * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
 982    */
 983   String getOntologyTerm(String consequence)
 984   {
 985     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 986
 987     /*
 988      * could we associate Consequence data with this allele and feature (transcript)?
 989      * if so, prefer the consequence term from that data
 990      */
 991     if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
 992     {
 993       /*
 994        * no Consequence data so we can't refine the ontology term
 995        */
 996       return type;
 997     }
 998
 999     if (consequence != null)
1000     {
1001       String[] csqFields = consequence.split(PIPE_REGEX);
1002       if (csqFields.length > csqConsequenceFieldIndex)
1003       {
1004         type = csqFields[csqConsequenceFieldIndex];
1005       }
1006     }
1007     else
1008     {
1009       // todo the same for SnpEff consequence data matching if wanted
1010     }
1011
1012     /*
1013      * if of the form (e.g.) missense_variant&splice_region_variant,
1014      * just take the first ('most severe') consequence
1015      */
1016     if (type != null)
1017     {
1018       int pos = type.indexOf('&');
1019       if (pos > 0)
1020       {
1021         type = type.substring(0, pos);
1022       }
1023     }
1024     return type;
1025   }
1026
1027   /**
1028    * Returns matched consequence data if it can be found, else null.
1029    * <ul>
1030    * <li>inspects the VCF data for key 'vcfInfoId'</li>
1031    * <li>splits this on comma (to distinct consequences)</li>
1032    * <li>returns the first consequence (if any) where</li>
1033    * <ul>
1034    * <li>the allele matches the altAlleleIndex'th allele of variant</li>
1035    * <li>the feature matches the sequence name (e.g. transcript id)</li>
1036    * </ul>
1037    * </ul>
1038    * If matched, the consequence is returned (as pipe-delimited fields).
1039    *
1040    * @param variant
1041    * @param vcfInfoId
1042    * @param altAlleleIndex
1043    * @param alleleFieldIndex
1044    * @param alleleNumberFieldIndex
1045    * @param seqName
1046    * @param featureFieldIndex
1047    * @return
1048    */
1049   private String getConsequenceForAlleleAndFeature(VariantContext variant,
1050           String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
1051           int alleleNumberFieldIndex,
1052           String seqName, int featureFieldIndex)
1053   {
1054     if (alleleFieldIndex == -1 || featureFieldIndex == -1)
1055     {
1056       return null;
1057     }
1058     Object value = variant.getAttribute(vcfInfoId);
1059
1060     if (value == null || !(value instanceof List<?>))
1061     {
1062       return null;
1063     }
1064
1065     /*
1066      * inspect each consequence in turn (comma-separated blocks
1067      * extracted by htsjdk)
1068      */
1069     List<String> consequences = (List<String>) value;
1070
1071     for (String consequence : consequences)
1072     {
1073       String[] csqFields = consequence.split(PIPE_REGEX);
1074       if (csqFields.length > featureFieldIndex)
1075       {
1076         String featureIdentifier = csqFields[featureFieldIndex];
1077         if (featureIdentifier.length() > 4
1078                 && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
1079         {
1080           /*
1081            * feature (transcript) matched - now check for allele match
1082            */
1083           if (matchAllele(variant, altAlleleIndex, csqFields,
1084                   alleleFieldIndex, alleleNumberFieldIndex))
1085           {
1086             return consequence;
1087           }
1088         }
1089       }
1090     }
1091     return null;
1092   }
1093
1094   private boolean matchAllele(VariantContext variant, int altAlleleIndex,
1095           String[] csqFields, int alleleFieldIndex,
1096           int alleleNumberFieldIndex)
1097   {
1098     /*
1099      * if ALLELE_NUM is present, it must match altAlleleIndex
1100      * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
1101      */
1102     if (alleleNumberFieldIndex > -1)
1103     {
1104       if (csqFields.length <= alleleNumberFieldIndex)
1105       {
1106         return false;
1107       }
1108       String alleleNum = csqFields[alleleNumberFieldIndex];
1109       return String.valueOf(altAlleleIndex + 1).equals(alleleNum);
1110     }
1111
1112     /*
1113      * else consequence allele must match variant allele
1114      */
1115     if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex)
1116     {
1117       String csqAllele = csqFields[alleleFieldIndex];
1118       String vcfAllele = variant.getAlternateAllele(altAlleleIndex)
1119               .getBaseString();
1120       return csqAllele.equals(vcfAllele);
1121     }
1122     return false;
1123   }
1124
1125   /**
1126    * Add any allele-specific VCF key-value data to the sequence feature
1127    *
1128    * @param variant
1129    * @param sf
1130    * @param altAlelleIndex
1131    *          (0, 1..)
1132    * @param consequence
1133    *          if not null, the consequence specific to this sequence (transcript
1134    *          feature) and allele
1135    */
1136   protected void addAlleleProperties(VariantContext variant,
1137           SequenceFeature sf, final int altAlelleIndex, String consequence)
1138   {
1139     Map<String, Object> atts = variant.getAttributes();
1140
1141     for (Entry<String, Object> att : atts.entrySet())
1142     {
1143       String key = att.getKey();
1144
1145       /*
1146        * extract Consequence data (if present) that we are able to
1147        * associated with the allele for this variant feature
1148        */
1149       if (CSQ_FIELD.equals(key))
1150       {
1151         addConsequences(variant, sf, consequence);
1152         continue;
1153       }
1154
1155       /*
1156        * filter out fields we don't want to capture
1157        */
1158       if (!vcfFieldsOfInterest.contains(key))
1159       {
1160         continue;
1161       }
1162
1163       /*
1164        * filter out fields we don't want to capture
1165        */
1166       if (!vcfFieldsOfInterest.contains(key))
1167       {
1168         continue;
1169       }
1170
1171       /*
1172        * we extract values for other data which are allele-specific;
1173        * these may be per alternate allele (INFO[key].Number = 'A')
1174        * or per allele including reference (INFO[key].Number = 'R')
1175        */
1176       VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key);
1177       if (infoHeader == null)
1178       {
1179         /*
1180          * can't be sure what data belongs to this allele, so
1181          * play safe and don't take any
1182          */
1183         continue;
1184       }
1185
1186       VCFHeaderLineCount number = infoHeader.getCountType();
1187       int index = altAlelleIndex;
1188       if (number == VCFHeaderLineCount.R)
1189       {
1190         /*
1191          * one value per allele including reference, so bump index
1192          * e.g. the 3rd value is for the  2nd alternate allele
1193          */
1194         index++;
1195       }
1196       else if (number != VCFHeaderLineCount.A)
1197       {
1198         /*
1199          * don't save other values as not allele-related
1200          */
1201         continue;
1202       }
1203
1204       /*
1205        * take the index'th value
1206        */
1207       String value = getAttributeValue(variant, key, index);
1208       if (value != null)
1209       {
1210         sf.setValue(key, value);
1211       }
1212     }
1213   }
1214
1215   /**
1216    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
1217    * feature.
1218    * <p>
1219    * If <code>myConsequence</code> is not null, then this is the specific
1220    * consequence data (pipe-delimited fields) that is for the current allele and
1221    * transcript (sequence) being processed)
1222    *
1223    * @param variant
1224    * @param sf
1225    * @param myConsequence
1226    */
1227   protected void addConsequences(VariantContext variant, SequenceFeature sf,
1228           String myConsequence)
1229   {
1230     Object value = variant.getAttribute(CSQ_FIELD);
1231
1232     if (value == null || !(value instanceof List<?>))
1233     {
1234       return;
1235     }
1236
1237     List<String> consequences = (List<String>) value;
1238
1239     /*
1240      * inspect CSQ consequences; restrict to the consequence
1241      * associated with the current transcript (Feature)
1242      */
1243     Map<String, String> csqValues = new HashMap<>();
1244
1245     for (String consequence : consequences)
1246     {
1247       if (myConsequence == null || myConsequence.equals(consequence))
1248       {
1249         String[] csqFields = consequence.split(PIPE_REGEX);
1250
1251         /*
1252          * inspect individual fields of this consequence, copying non-null
1253          * values which are 'fields of interest'
1254          */
1255         int i = 0;
1256         for (String field : csqFields)
1257         {
1258           if (field != null && field.length() > 0)
1259           {
1260             String id = vepFieldsOfInterest.get(i);
1261             if (id != null)
1262             {
1263               csqValues.put(id, field);
1264             }
1265           }
1266           i++;
1267         }
1268       }
1269     }
1270
1271     if (!csqValues.isEmpty())
1272     {
1273       sf.setValue(CSQ_FIELD, csqValues);
1274     }
1275   }
1276
1277   /**
1278    * A convenience method to complement a dna base and return the string value
1279    * of its complement
1280    *
1281    * @param reference
1282    * @return
1283    */
1284   protected String complement(byte[] reference)
1285   {
1286     return String.valueOf(Dna.getComplement((char) reference[0]));
1287   }
1288
1289   /**
1290    * Determines the location of the query range (chromosome positions) in a
1291    * different reference assembly.
1292    * <p>
1293    * If the range is just a subregion of one for which we already have a mapping
1294    * (for example, an exon sub-region of a gene), then the mapping is just
1295    * computed arithmetically.
1296    * <p>
1297    * Otherwise, calls the Ensembl REST service that maps from one assembly
1298    * reference's coordinates to another's
1299    *
1300    * @param queryRange
1301    *          start-end chromosomal range in 'fromRef' coordinates
1302    * @param chromosome
1303    * @param species
1304    * @param fromRef
1305    *          assembly reference for the query coordinates
1306    * @param toRef
1307    *          assembly reference we wish to translate to
1308    * @return the start-end range in 'toRef' coordinates
1309    */
1310   protected int[] mapReferenceRange(int[] queryRange, String chromosome,
1311           String species, String fromRef, String toRef)
1312   {
1313     /*
1314      * first try shorcut of computing the mapping as a subregion of one
1315      * we already have (e.g. for an exon, if we have the gene mapping)
1316      */
1317     int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
1318             species, fromRef, toRef);
1319     if (mappedRange != null)
1320     {
1321       return mappedRange;
1322     }
1323
1324     /*
1325      * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
1326      */
1327     EnsemblMap mapper = new EnsemblMap();
1328     int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
1329             toRef, queryRange);
1330
1331     if (mapping == null)
1332     {
1333       // mapping service failure
1334       return null;
1335     }
1336
1337     /*
1338      * save mapping for possible future re-use
1339      */
1340     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1341     if (!assemblyMappings.containsKey(key))
1342     {
1343       assemblyMappings.put(key, new HashMap<int[], int[]>());
1344     }
1345
1346     assemblyMappings.get(key).put(queryRange, mapping);
1347
1348     return mapping;
1349   }
1350
1351   /**
1352    * If we already have a 1:1 contiguous mapping which subsumes the given query
1353    * range, this method just calculates and returns the subset of that mapping,
1354    * else it returns null. In practical terms, if a gene has a contiguous
1355    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
1356    * subsidiary exons occupy unchanged relative positions, and just compute
1357    * these as offsets, rather than do another lookup of the mapping.
1358    * <p>
1359    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
1360    * simply remove this method or let it always return null.
1361    * <p>
1362    * Warning: many rapid calls to the /map service map result in a 429 overload
1363    * error response
1364    *
1365    * @param queryRange
1366    * @param chromosome
1367    * @param species
1368    * @param fromRef
1369    * @param toRef
1370    * @return
1371    */
1372   protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
1373           String species, String fromRef, String toRef)
1374   {
1375     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1376     if (assemblyMappings.containsKey(key))
1377     {
1378       Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
1379       for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
1380       {
1381         int[] fromRange = mappedRange.getKey();
1382         int[] toRange = mappedRange.getValue();
1383         if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
1384         {
1385           /*
1386            * mapping is 1:1 in length, so we trust it to have no discontinuities
1387            */
1388           if (MappingUtils.rangeContains(fromRange, queryRange))
1389           {
1390             /*
1391              * fromRange subsumes our query range
1392              */
1393             int offset = queryRange[0] - fromRange[0];
1394             int mappedRangeFrom = toRange[0] + offset;
1395             int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
1396             return new int[] { mappedRangeFrom, mappedRangeTo };
1397           }
1398         }
1399       }
1400     }
1401     return null;
1402   }
1403
1404   /**
1405    * Transfers the sequence feature to the target sequence, locating its start
1406    * and end range based on the mapping. Features which do not overlap the
1407    * target sequence are ignored.
1408    *
1409    * @param sf
1410    * @param targetSequence
1411    * @param mapping
1412    *          mapping from the feature's coordinates to the target sequence
1413    */
1414   protected void transferFeature(SequenceFeature sf,
1415           SequenceI targetSequence, MapList mapping)
1416   {
1417     int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
1418
1419     if (mappedRange != null)
1420     {
1421       String group = sf.getFeatureGroup();
1422       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
1423       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
1424       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
1425               group, sf.getScore());
1426       targetSequence.addSequenceFeature(copy);
1427     }
1428   }
1429
1430   /**
1431    * Formats a ranges map lookup key
1432    *
1433    * @param chromosome
1434    * @param species
1435    * @param fromRef
1436    * @param toRef
1437    * @return
1438    */
1439   protected static String makeRangesKey(String chromosome, String species,
1440           String fromRef, String toRef)
1441   {
1442     return species + EXCL + chromosome + EXCL + fromRef + EXCL
1443             + toRef;
1444   }
1445 }