src/jalview/io/vcf/VCFLoader.java

   1 package jalview.io.vcf;
   2
   3 import jalview.analysis.Dna;
   4 import jalview.api.AlignViewControllerGuiI;
   5 import jalview.bin.Cache;
   6 import jalview.datamodel.DBRefEntry;
   7 import jalview.datamodel.GeneLociI;
   8 import jalview.datamodel.Mapping;
   9 import jalview.datamodel.SequenceFeature;
  10 import jalview.datamodel.SequenceI;
  11 import jalview.datamodel.features.FeatureAttributeType;
  12 import jalview.datamodel.features.FeatureSource;
  13 import jalview.datamodel.features.FeatureSources;
  14 import jalview.ext.ensembl.EnsemblMap;
  15 import jalview.ext.htsjdk.HtsContigDb;
  16 import jalview.ext.htsjdk.VCFReader;
  17 import jalview.io.gff.Gff3Helper;
  18 import jalview.io.gff.SequenceOntologyI;
  19 import jalview.util.MapList;
  20 import jalview.util.MappingUtils;
  21 import jalview.util.MessageManager;
  22
  23 import java.io.File;
  24 import java.io.IOException;
  25 import java.util.ArrayList;
  26 import java.util.HashMap;
  27 import java.util.List;
  28 import java.util.Map;
  29 import java.util.Map.Entry;
  30 import java.util.regex.Pattern;
  31 import java.util.regex.PatternSyntaxException;
  32
  33 import htsjdk.samtools.SAMException;
  34 import htsjdk.samtools.SAMSequenceDictionary;
  35 import htsjdk.samtools.SAMSequenceRecord;
  36 import htsjdk.samtools.util.CloseableIterator;
  37 import htsjdk.variant.variantcontext.Allele;
  38 import htsjdk.variant.variantcontext.VariantContext;
  39 import htsjdk.variant.vcf.VCFHeader;
  40 import htsjdk.variant.vcf.VCFHeaderLine;
  41 import htsjdk.variant.vcf.VCFHeaderLineCount;
  42 import htsjdk.variant.vcf.VCFHeaderLineType;
  43 import htsjdk.variant.vcf.VCFInfoHeaderLine;
  44
  45 /**
  46  * A class to read VCF data (using the htsjdk) and add variants as sequence
  47  * features on dna and any related protein product sequences
  48  *
  49  * @author gmcarstairs
  50  */
  51 public class VCFLoader
  52 {
  53   private static final String DEFAULT_SPECIES = "homo_sapiens";
  54
  55   /**
  56    * A class to model the mapping from sequence to VCF coordinates. Cases include
  57    * <ul>
  58    * <li>a direct 1:1 mapping where the sequence is one of the VCF contigs</li>
  59    * <li>a mapping of sequence to chromosomal coordinates, where sequence and VCF
  60    * use the same reference assembly</li>
  61    * <li>a modified mapping of sequence to chromosomal coordinates, where sequence
  62    * and VCF use different reference assembles</li>
  63    * </ul>
  64    */
  65   class VCFMap
  66   {
  67     final String chromosome;
  68
  69     final MapList map;
  70
  71     VCFMap(String chr, MapList m)
  72     {
  73       chromosome = chr;
  74       map = m;
  75     }
  76
  77     @Override
  78     public String toString()
  79     {
  80       return chromosome + ":" + map.toString();
  81     }
  82   }
  83
  84   /*
  85    * Lookup keys, and default values, for Preference entries that describe
  86    * patterns for VCF and VEP fields to capture
  87    */
  88   private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
  89
  90   private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
  91
  92   private static final String DEFAULT_VCF_FIELDS = ".*";
  93
  94   private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
  95
  96   /*
  97    * Lookup keys, and default values, for Preference entries that give
  98    * mappings from tokens in the 'reference' header to species or assembly
  99    */
 100   private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY";
 101
 102   private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh37,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38";
 103
 104   private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human
 105
 106   private static final String DEFAULT_REFERENCE = "grch37"; // fallback default is human GRCh37
 107
 108   /*
 109    * keys to fields of VEP CSQ consequence data
 110    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 111    */
 112   private static final String CSQ_CONSEQUENCE_KEY = "Consequence";
 113   private static final String CSQ_ALLELE_KEY = "Allele";
 114   private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
 115   private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id
 116
 117   /*
 118    * default VCF INFO key for VEP consequence data
 119    * NB this can be overridden running VEP with --vcf_info_field
 120    * - we don't handle this case (require identifier to be CSQ)
 121    */
 122   private static final String CSQ_FIELD = "CSQ";
 123
 124   /*
 125    * separator for fields in consequence data is '|'
 126    */
 127   private static final String PIPE_REGEX = "\\|";
 128
 129   /*
 130    * delimiter that separates multiple consequence data blocks
 131    */
 132   private static final String COMMA = ",";
 133
 134   /*
 135    * the feature group assigned to a VCF variant in Jalview
 136    */
 137   private static final String FEATURE_GROUP_VCF = "VCF";
 138
 139   /*
 140    * internal delimiter used to build keys for assemblyMappings
 141    *
 142    */
 143   private static final String EXCL = "!";
 144
 145   /*
 146    * the VCF file we are processing
 147    */
 148   protected String vcfFilePath;
 149
 150   /*
 151    * mappings between VCF and sequence reference assembly regions, as
 152    * key = "species!chromosome!fromAssembly!toAssembly
 153    * value = Map{fromRange, toRange}
 154    */
 155   private Map<String, Map<int[], int[]>> assemblyMappings;
 156
 157   private VCFReader reader;
 158
 159   /*
 160    * holds details of the VCF header lines (metadata)
 161    */
 162   private VCFHeader header;
 163
 164   /*
 165    * species (as a valid Ensembl term) the VCF is for
 166    */
 167   private String vcfSpecies;
 168
 169   /*
 170    * genome assembly version (as a valid Ensembl identifier) the VCF is for
 171    */
 172   private String vcfAssembly;
 173
 174   /*
 175    * a Dictionary of contigs (if present) referenced in the VCF file
 176    */
 177   private SAMSequenceDictionary dictionary;
 178
 179   /*
 180    * the position (0...) of field in each block of
 181    * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
 182    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 183    */
 184   private int csqConsequenceFieldIndex = -1;
 185   private int csqAlleleFieldIndex = -1;
 186   private int csqAlleleNumberFieldIndex = -1;
 187   private int csqFeatureFieldIndex = -1;
 188
 189   // todo the same fields for SnpEff ANN data if wanted
 190   // see http://snpeff.sourceforge.net/SnpEff_manual.html#input
 191
 192   /*
 193    * a unique identifier under which to save metadata about feature
 194    * attributes (selected INFO field data)
 195    */
 196   private String sourceId;
 197
 198   /*
 199    * The INFO IDs of data that is both present in the VCF file, and
 200    * also matched by any filters for data of interest
 201    */
 202   List<String> vcfFieldsOfInterest;
 203
 204   /*
 205    * The field offsets and identifiers for VEP (CSQ) data that is both present
 206    * in the VCF file, and also matched by any filters for data of interest
 207    * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
 208    */
 209   Map<Integer, String> vepFieldsOfInterest;
 210
 211   /**
 212    * Constructor given a VCF file
 213    *
 214    * @param alignment
 215    */
 216   public VCFLoader(String vcfFile)
 217   {
 218     try
 219     {
 220       initialise(vcfFile);
 221     } catch (IOException e)
 222     {
 223       System.err.println("Error opening VCF file: " + e.getMessage());
 224     }
 225
 226     // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
 227     assemblyMappings = new HashMap<>();
 228   }
 229
 230   /**
 231    * Starts a new thread to query and load VCF variant data on to the given
 232    * sequences
 233    * <p>
 234    * This method is not thread safe - concurrent threads should use separate
 235    * instances of this class.
 236    *
 237    * @param seqs
 238    * @param gui
 239    */
 240   public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui)
 241   {
 242     if (gui != null)
 243     {
 244       gui.setStatus(MessageManager.getString("label.searching_vcf"));
 245     }
 246
 247     new Thread()
 248     {
 249       @Override
 250       public void run()
 251       {
 252         VCFLoader.this.doLoad(seqs, gui);
 253       }
 254     }.start();
 255   }
 256
 257   /**
 258    * Reads the specified contig sequence and adds its VCF variants to it
 259    *
 260    * @param contig
 261    *          the id of a single sequence (contig) to load
 262    * @return
 263    */
 264   public SequenceI loadVCFContig(String contig)
 265   {
 266     VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 267     if (headerLine == null)
 268     {
 269       Cache.log.error("VCF reference header not found");
 270       return null;
 271     }
 272     String ref = headerLine.getValue();
 273     if (ref.startsWith("file://"))
 274     {
 275       ref = ref.substring(7);
 276     }
 277     setSpeciesAndAssembly(ref);
 278
 279     SequenceI seq = null;
 280     File dbFile = new File(ref);
 281
 282     if (dbFile.exists())
 283     {
 284       HtsContigDb db = new HtsContigDb("", dbFile);
 285       seq = db.getSequenceProxy(contig);
 286       loadSequenceVCF(seq);
 287       db.close();
 288     }
 289     else
 290     {
 291       Cache.log.error("VCF reference not found: " + ref);
 292     }
 293
 294     return seq;
 295   }
 296
 297   /**
 298    * Loads VCF on to one or more sequences
 299    *
 300    * @param seqs
 301    * @param gui
 302    *          optional callback handler for messages
 303    */
 304   protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui)
 305   {
 306     try
 307     {
 308       VCFHeaderLine ref = header
 309               .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 310       String reference = ref == null ? null : ref.getValue();
 311
 312       setSpeciesAndAssembly(reference);
 313
 314       int varCount = 0;
 315       int seqCount = 0;
 316
 317       /*
 318        * query for VCF overlapping each sequence in turn
 319        */
 320       for (SequenceI seq : seqs)
 321       {
 322         int added = loadSequenceVCF(seq);
 323         if (added > 0)
 324         {
 325           seqCount++;
 326           varCount += added;
 327           transferAddedFeatures(seq);
 328         }
 329       }
 330       if (gui != null)
 331       {
 332         String msg = MessageManager.formatMessage("label.added_vcf",
 333                 varCount, seqCount);
 334         gui.setStatus(msg);
 335         if (gui.getFeatureSettingsUI() != null)
 336         {
 337           gui.getFeatureSettingsUI().discoverAllFeatureData();
 338         }
 339       }
 340     } catch (Throwable e)
 341     {
 342       System.err.println("Error processing VCF: " + e.getMessage());
 343       e.printStackTrace();
 344       if (gui != null)
 345       {
 346         gui.setStatus("Error occurred - see console for details");
 347       }
 348     } finally
 349     {
 350       if (reader != null)
 351       {
 352         try
 353         {
 354           reader.close();
 355         } catch (IOException e)
 356         {
 357           // ignore
 358         }
 359       }
 360       header = null;
 361       dictionary = null;
 362     }
 363   }
 364
 365   /**
 366    * Attempts to determine and save the species and genome assembly version to
 367    * which the VCF data applies. This may be done by parsing the {@code reference}
 368    * header line, configured in a property file, or (potentially) confirmed
 369    * interactively by the user.
 370    * <p>
 371    * The saved values should be identifiers valid for Ensembl's REST service
 372    * {@code map} endpoint, so they can be used (if necessary) to retrieve the
 373    * mapping between VCF coordinates and sequence coordinates.
 374    *
 375    * @param reference
 376    * @see https://rest.ensembl.org/documentation/info/assembly_map
 377    * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
 378    * @see https://rest.ensembl.org/info/species?content-type=text/xml
 379    */
 380   protected void setSpeciesAndAssembly(String reference)
 381   {
 382     if (reference == null)
 383     {
 384       Cache.log.error("No VCF ##reference found, defaulting to "
 385               + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES);
 386       reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified
 387     }
 388     reference = reference.toLowerCase();
 389
 390     /*
 391      * for a non-human species, or other assembly identifier,
 392      * specify as a Jalview property file entry e.g.
 393      * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
 394      * VCF_SPECIES = c_elegans=celegans
 395      * to map a token in the reference header to a value
 396      */
 397     String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
 398     for (String token : prop.split(","))
 399     {
 400       String[] tokens = token.split("=");
 401       if (tokens.length == 2)
 402       {
 403         if (reference.contains(tokens[0].trim().toLowerCase()))
 404         {
 405           vcfAssembly = tokens[1].trim();
 406           break;
 407         }
 408       }
 409     }
 410
 411     vcfSpecies = DEFAULT_SPECIES;
 412     prop = Cache.getProperty(VCF_SPECIES);
 413     if (prop != null)
 414     {
 415       for (String token : prop.split(","))
 416       {
 417         String[] tokens = token.split("=");
 418         if (tokens.length == 2)
 419         {
 420           if (reference.contains(tokens[0].trim().toLowerCase()))
 421           {
 422             vcfSpecies = tokens[1].trim();
 423             break;
 424           }
 425         }
 426       }
 427     }
 428   }
 429
 430   /**
 431    * Opens the VCF file and parses header data
 432    *
 433    * @param filePath
 434    * @throws IOException
 435    */
 436   private void initialise(String filePath) throws IOException
 437   {
 438     vcfFilePath = filePath;
 439
 440     reader = new VCFReader(filePath);
 441
 442     header = reader.getFileHeader();
 443
 444     try
 445     {
 446       dictionary = header.getSequenceDictionary();
 447     } catch (SAMException e)
 448     {
 449       // ignore - thrown if any contig line lacks length info
 450     }
 451
 452     sourceId = filePath;
 453
 454     saveMetadata(sourceId);
 455
 456     /*
 457      * get offset of CSQ ALLELE_NUM and Feature if declared
 458      */
 459     parseCsqHeader();
 460   }
 461
 462   /**
 463    * Reads metadata (such as INFO field descriptions and datatypes) and saves
 464    * them for future reference
 465    *
 466    * @param theSourceId
 467    */
 468   void saveMetadata(String theSourceId)
 469   {
 470     List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
 471             DEFAULT_VCF_FIELDS);
 472     vcfFieldsOfInterest = new ArrayList<>();
 473
 474     FeatureSource metadata = new FeatureSource(theSourceId);
 475
 476     for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
 477     {
 478       String attributeId = info.getID();
 479       String desc = info.getDescription();
 480       VCFHeaderLineType type = info.getType();
 481       FeatureAttributeType attType = null;
 482       switch (type)
 483       {
 484       case Character:
 485         attType = FeatureAttributeType.Character;
 486         break;
 487       case Flag:
 488         attType = FeatureAttributeType.Flag;
 489         break;
 490       case Float:
 491         attType = FeatureAttributeType.Float;
 492         break;
 493       case Integer:
 494         attType = FeatureAttributeType.Integer;
 495         break;
 496       case String:
 497         attType = FeatureAttributeType.String;
 498         break;
 499       }
 500       metadata.setAttributeName(attributeId, desc);
 501       metadata.setAttributeType(attributeId, attType);
 502
 503       if (isFieldWanted(attributeId, vcfFieldPatterns))
 504       {
 505         vcfFieldsOfInterest.add(attributeId);
 506       }
 507     }
 508
 509     FeatureSources.getInstance().addSource(theSourceId, metadata);
 510   }
 511
 512   /**
 513    * Answers true if the field id is matched by any of the filter patterns, else
 514    * false. Matching is against regular expression patterns, and is not
 515    * case-sensitive.
 516    *
 517    * @param id
 518    * @param filters
 519    * @return
 520    */
 521   private boolean isFieldWanted(String id, List<Pattern> filters)
 522   {
 523     for (Pattern p : filters)
 524     {
 525       if (p.matcher(id.toUpperCase()).matches())
 526       {
 527         return true;
 528       }
 529     }
 530     return false;
 531   }
 532
 533   /**
 534    * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
 535    * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
 536    * required for processing.
 537    * <p>
 538    * CSQ fields are declared in the CSQ INFO Description e.g.
 539    * <p>
 540    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
 541    */
 542   protected void parseCsqHeader()
 543   {
 544     List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
 545             DEFAULT_VEP_FIELDS);
 546     vepFieldsOfInterest = new HashMap<>();
 547
 548     VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
 549     if (csqInfo == null)
 550     {
 551       return;
 552     }
 553
 554     /*
 555      * parse out the pipe-separated list of CSQ fields; we assume here that
 556      * these form the last part of the description, and contain no spaces
 557      */
 558     String desc = csqInfo.getDescription();
 559     int spacePos = desc.lastIndexOf(" ");
 560     desc = desc.substring(spacePos + 1);
 561
 562     if (desc != null)
 563     {
 564       String[] format = desc.split(PIPE_REGEX);
 565       int index = 0;
 566       for (String field : format)
 567       {
 568         if (CSQ_CONSEQUENCE_KEY.equals(field))
 569         {
 570           csqConsequenceFieldIndex = index;
 571         }
 572         if (CSQ_ALLELE_NUM_KEY.equals(field))
 573         {
 574           csqAlleleNumberFieldIndex = index;
 575         }
 576         if (CSQ_ALLELE_KEY.equals(field))
 577         {
 578           csqAlleleFieldIndex = index;
 579         }
 580         if (CSQ_FEATURE_KEY.equals(field))
 581         {
 582           csqFeatureFieldIndex = index;
 583         }
 584
 585         if (isFieldWanted(field, vepFieldFilters))
 586         {
 587           vepFieldsOfInterest.put(index, field);
 588         }
 589
 590         index++;
 591       }
 592     }
 593   }
 594
 595   /**
 596    * Reads the Preference value for the given key, with default specified if no
 597    * preference set. The value is interpreted as a comma-separated list of
 598    * regular expressions, and converted into a list of compiled patterns ready
 599    * for matching. Patterns are forced to upper-case for non-case-sensitive
 600    * matching.
 601    * <p>
 602    * This supports user-defined filters for fields of interest to capture while
 603    * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
 604    * fields with an ID of AF, or starting with AC, would be matched.
 605    *
 606    * @param key
 607    * @param def
 608    * @return
 609    */
 610   private List<Pattern> getFieldMatchers(String key, String def)
 611   {
 612     String pref = Cache.getDefault(key, def);
 613     List<Pattern> patterns = new ArrayList<>();
 614     String[] tokens = pref.split(",");
 615     for (String token : tokens)
 616     {
 617       try
 618       {
 619       patterns.add(Pattern.compile(token.toUpperCase()));
 620       } catch (PatternSyntaxException e)
 621       {
 622         System.err.println("Invalid pattern ignored: " + token);
 623       }
 624     }
 625     return patterns;
 626   }
 627
 628   /**
 629    * Transfers VCF features to sequences to which this sequence has a mapping.
 630    * If the mapping is 3:1, computes peptide variants from nucleotide variants.
 631    *
 632    * @param seq
 633    */
 634   protected void transferAddedFeatures(SequenceI seq)
 635   {
 636     DBRefEntry[] dbrefs = seq.getDBRefs();
 637     if (dbrefs == null)
 638     {
 639       return;
 640     }
 641     for (DBRefEntry dbref : dbrefs)
 642     {
 643       Mapping mapping = dbref.getMap();
 644       if (mapping == null || mapping.getTo() == null)
 645       {
 646         continue;
 647       }
 648
 649       SequenceI mapTo = mapping.getTo();
 650       MapList map = mapping.getMap();
 651       if (map.getFromRatio() == 3)
 652       {
 653         /*
 654          * dna-to-peptide product mapping
 655          */
 656         // JAL-3187 render on the fly instead
 657         // AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
 658       }
 659       else
 660       {
 661         /*
 662          * nucleotide-to-nucleotide mapping e.g. transcript to CDS
 663          */
 664         List<SequenceFeature> features = seq.getFeatures()
 665                 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
 666         for (SequenceFeature sf : features)
 667         {
 668           if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
 669           {
 670             transferFeature(sf, mapTo, map);
 671           }
 672         }
 673       }
 674     }
 675   }
 676
 677   /**
 678    * Tries to add overlapping variants read from a VCF file to the given sequence,
 679    * and returns the number of variant features added
 680    *
 681    * @param seq
 682    * @return
 683    */
 684   protected int loadSequenceVCF(SequenceI seq)
 685   {
 686     VCFMap vcfMap = getVcfMap(seq);
 687     if (vcfMap == null)
 688     {
 689       return 0;
 690     }
 691
 692     /*
 693      * work with the dataset sequence here
 694      */
 695     SequenceI dss = seq.getDatasetSequence();
 696     if (dss == null)
 697     {
 698       dss = seq;
 699     }
 700     return addVcfVariants(dss, vcfMap);
 701   }
 702
 703   /**
 704    * Answers a map from sequence coordinates to VCF chromosome ranges
 705    *
 706    * @param seq
 707    * @return
 708    */
 709   private VCFMap getVcfMap(SequenceI seq)
 710   {
 711     /*
 712      * simplest case: sequence has id and length matching a VCF contig
 713      */
 714     VCFMap vcfMap = null;
 715     if (dictionary != null)
 716     {
 717       vcfMap = getContigMap(seq);
 718     }
 719     if (vcfMap != null)
 720     {
 721       return vcfMap;
 722     }
 723
 724     /*
 725      * otherwise, map to VCF from chromosomal coordinates
 726      * of the sequence (if known)
 727      */
 728     GeneLociI seqCoords = seq.getGeneLoci();
 729     if (seqCoords == null)
 730     {
 731       Cache.log.warn(String.format(
 732               "Can't query VCF for %s as chromosome coordinates not known",
 733               seq.getName()));
 734       return null;
 735     }
 736
 737     String species = seqCoords.getSpeciesId();
 738     String chromosome = seqCoords.getChromosomeId();
 739     String seqRef = seqCoords.getAssemblyId();
 740     MapList map = seqCoords.getMapping();
 741
 742     // note this requires the configured species to match that
 743     // returned with the Ensembl sequence; todo: support aliases?
 744     if (!vcfSpecies.equalsIgnoreCase(species))
 745     {
 746       Cache.log.warn("No VCF loaded to " + seq.getName()
 747               + " as species not matched");
 748       return null;
 749     }
 750
 751     if (seqRef.equalsIgnoreCase(vcfAssembly))
 752     {
 753       return new VCFMap(chromosome, map);
 754     }
 755
 756     /*
 757      * VCF data has a different reference assembly to the sequence:
 758      * query Ensembl to map chromosomal coordinates from sequence to VCF
 759      */
 760     List<int[]> toVcfRanges = new ArrayList<>();
 761     List<int[]> fromSequenceRanges = new ArrayList<>();
 762
 763     for (int[] range : map.getToRanges())
 764     {
 765       int[] fromRange = map.locateInFrom(range[0], range[1]);
 766       if (fromRange == null)
 767       {
 768         // corrupted map?!?
 769         continue;
 770       }
 771
 772       int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
 773               vcfAssembly);
 774       if (newRange == null)
 775       {
 776         Cache.log.error(
 777                 String.format("Failed to map %s:%s:%s:%d:%d to %s", species,
 778                         chromosome, seqRef, range[0], range[1],
 779                         vcfAssembly));
 780         continue;
 781       }
 782       else
 783       {
 784         toVcfRanges.add(newRange);
 785         fromSequenceRanges.add(fromRange);
 786       }
 787     }
 788
 789     return new VCFMap(chromosome,
 790             new MapList(fromSequenceRanges, toVcfRanges, 1, 1));
 791   }
 792
 793   /**
 794    * If the sequence id matches a contig declared in the VCF file, and the
 795    * sequence length matches the contig length, then returns a 1:1 map of the
 796    * sequence to the contig, else returns null
 797    *
 798    * @param seq
 799    * @return
 800    */
 801   private VCFMap getContigMap(SequenceI seq)
 802   {
 803     String id = seq.getName();
 804     SAMSequenceRecord contig = dictionary.getSequence(id);
 805     if (contig != null)
 806     {
 807       int len = seq.getLength();
 808       if (len == contig.getSequenceLength())
 809       {
 810         MapList map = new MapList(new int[] { 1, len },
 811                 new int[]
 812                 { 1, len }, 1, 1);
 813         return new VCFMap(id, map);
 814       }
 815     }
 816     return null;
 817   }
 818
 819   /**
 820    * Queries the VCF reader for any variants that overlap the mapped chromosome
 821    * ranges of the sequence, and adds as variant features. Returns the number of
 822    * overlapping variants found.
 823    *
 824    * @param seq
 825    * @param map
 826    *          mapping from sequence to VCF coordinates
 827    * @return
 828    */
 829   protected int addVcfVariants(SequenceI seq, VCFMap map)
 830   {
 831     boolean forwardStrand = map.map.isToForwardStrand();
 832
 833     /*
 834      * query the VCF for overlaps of each contiguous chromosomal region
 835      */
 836     int count = 0;
 837
 838     for (int[] range : map.map.getToRanges())
 839     {
 840       int vcfStart = Math.min(range[0], range[1]);
 841       int vcfEnd = Math.max(range[0], range[1]);
 842       CloseableIterator<VariantContext> variants = reader
 843               .query(map.chromosome, vcfStart, vcfEnd);
 844       while (variants.hasNext())
 845       {
 846         VariantContext variant = variants.next();
 847
 848         int[] featureRange = map.map.locateInFrom(variant.getStart(),
 849                 variant.getEnd());
 850
 851         if (featureRange != null)
 852         {
 853           int featureStart = Math.min(featureRange[0], featureRange[1]);
 854           int featureEnd = Math.max(featureRange[0], featureRange[1]);
 855           count += addAlleleFeatures(seq, variant, featureStart, featureEnd,
 856                   forwardStrand);
 857         }
 858       }
 859       variants.close();
 860     }
 861
 862     return count;
 863   }
 864
 865   /**
 866    * A convenience method to get an attribute value for an alternate allele
 867    *
 868    * @param variant
 869    * @param attributeName
 870    * @param alleleIndex
 871    * @return
 872    */
 873   protected String getAttributeValue(VariantContext variant,
 874           String attributeName, int alleleIndex)
 875   {
 876     Object att = variant.getAttribute(attributeName);
 877
 878     if (att instanceof String)
 879     {
 880       return (String) att;
 881     }
 882     else if (att instanceof ArrayList)
 883     {
 884       return ((List<String>) att).get(alleleIndex);
 885     }
 886
 887     return null;
 888   }
 889
 890   /**
 891    * Adds one variant feature for each allele in the VCF variant record, and
 892    * returns the number of features added.
 893    *
 894    * @param seq
 895    * @param variant
 896    * @param featureStart
 897    * @param featureEnd
 898    * @param forwardStrand
 899    * @return
 900    */
 901   protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
 902           int featureStart, int featureEnd, boolean forwardStrand)
 903   {
 904     int added = 0;
 905
 906     /*
 907      * Javadoc says getAlternateAlleles() imposes no order on the list returned
 908      * so we proceed defensively to get them in strict order
 909      */
 910     int altAlleleCount = variant.getAlternateAlleles().size();
 911     for (int i = 0; i < altAlleleCount; i++)
 912     {
 913       added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
 914               forwardStrand);
 915     }
 916     return added;
 917   }
 918
 919   /**
 920    * Inspects one allele and attempts to add a variant feature for it to the
 921    * sequence. The additional data associated with this allele is extracted to
 922    * store in the feature's key-value map. Answers the number of features added (0
 923    * or 1).
 924    *
 925    * @param seq
 926    * @param variant
 927    * @param altAlleleIndex
 928    *          (0, 1..)
 929    * @param featureStart
 930    * @param featureEnd
 931    * @param forwardStrand
 932    * @return
 933    */
 934   protected int addAlleleFeature(SequenceI seq, VariantContext variant,
 935           int altAlleleIndex, int featureStart, int featureEnd,
 936           boolean forwardStrand)
 937   {
 938     String reference = variant.getReference().getBaseString();
 939     Allele alt = variant.getAlternateAllele(altAlleleIndex);
 940     String allele = alt.getBaseString();
 941
 942     /*
 943      * insertion after a genomic base, if on reverse strand, has to be
 944      * converted to insertion of complement after the preceding position
 945      */
 946     int referenceLength = reference.length();
 947     if (!forwardStrand && allele.length() > referenceLength
 948             && allele.startsWith(reference))
 949     {
 950       featureStart -= referenceLength;
 951       featureEnd = featureStart;
 952       char insertAfter = seq.getCharAt(featureStart - seq.getStart());
 953       reference = Dna.reverseComplement(String.valueOf(insertAfter));
 954       allele = allele.substring(referenceLength) + reference;
 955     }
 956
 957     /*
 958      * build the ref,alt allele description e.g. "G,A", using the base
 959      * complement if the sequence is on the reverse strand
 960      */
 961     StringBuilder sb = new StringBuilder();
 962     sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
 963     sb.append(COMMA);
 964     sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
 965     String alleles = sb.toString(); // e.g. G,A
 966
 967     /*
 968      * pick out the consequence data (if any) that is for the current allele
 969      * and feature (transcript) that matches the current sequence
 970      */
 971     String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD,
 972             altAlleleIndex, csqAlleleFieldIndex,
 973             csqAlleleNumberFieldIndex, seq.getName().toLowerCase(),
 974             csqFeatureFieldIndex);
 975
 976     /*
 977      * pick out the ontology term for the consequence type
 978      */
 979     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 980     if (consequence != null)
 981     {
 982       type = getOntologyTerm(consequence);
 983     }
 984
 985     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
 986             featureEnd, FEATURE_GROUP_VCF);
 987     sf.setSource(sourceId);
 988
 989     sf.setValue(Gff3Helper.ALLELES, alleles);
 990
 991     addAlleleProperties(variant, sf, altAlleleIndex, consequence);
 992
 993     seq.addSequenceFeature(sf);
 994
 995     return 1;
 996   }
 997
 998   /**
 999    * Determines the Sequence Ontology term to use for the variant feature type in
1000    * Jalview. The default is 'sequence_variant', but a more specific term is used
1001    * if:
1002    * <ul>
1003    * <li>VEP (or SnpEff) Consequence annotation is included in the VCF</li>
1004    * <li>sequence id can be matched to VEP Feature (or SnpEff Feature_ID)</li>
1005    * </ul>
1006    *
1007    * @param consequence
1008    * @return
1009    * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
1010    */
1011   String getOntologyTerm(String consequence)
1012   {
1013     String type = SequenceOntologyI.SEQUENCE_VARIANT;
1014
1015     /*
1016      * could we associate Consequence data with this allele and feature (transcript)?
1017      * if so, prefer the consequence term from that data
1018      */
1019     if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
1020     {
1021       /*
1022        * no Consequence data so we can't refine the ontology term
1023        */
1024       return type;
1025     }
1026
1027     if (consequence != null)
1028     {
1029       String[] csqFields = consequence.split(PIPE_REGEX);
1030       if (csqFields.length > csqConsequenceFieldIndex)
1031       {
1032         type = csqFields[csqConsequenceFieldIndex];
1033       }
1034     }
1035     else
1036     {
1037       // todo the same for SnpEff consequence data matching if wanted
1038     }
1039
1040     /*
1041      * if of the form (e.g.) missense_variant&splice_region_variant,
1042      * just take the first ('most severe') consequence
1043      */
1044     if (type != null)
1045     {
1046       int pos = type.indexOf('&');
1047       if (pos > 0)
1048       {
1049         type = type.substring(0, pos);
1050       }
1051     }
1052     return type;
1053   }
1054
1055   /**
1056    * Returns matched consequence data if it can be found, else null.
1057    * <ul>
1058    * <li>inspects the VCF data for key 'vcfInfoId'</li>
1059    * <li>splits this on comma (to distinct consequences)</li>
1060    * <li>returns the first consequence (if any) where</li>
1061    * <ul>
1062    * <li>the allele matches the altAlleleIndex'th allele of variant</li>
1063    * <li>the feature matches the sequence name (e.g. transcript id)</li>
1064    * </ul>
1065    * </ul>
1066    * If matched, the consequence is returned (as pipe-delimited fields).
1067    *
1068    * @param variant
1069    * @param vcfInfoId
1070    * @param altAlleleIndex
1071    * @param alleleFieldIndex
1072    * @param alleleNumberFieldIndex
1073    * @param seqName
1074    * @param featureFieldIndex
1075    * @return
1076    */
1077   private String getConsequenceForAlleleAndFeature(VariantContext variant,
1078           String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
1079           int alleleNumberFieldIndex,
1080           String seqName, int featureFieldIndex)
1081   {
1082     if (alleleFieldIndex == -1 || featureFieldIndex == -1)
1083     {
1084       return null;
1085     }
1086     Object value = variant.getAttribute(vcfInfoId);
1087
1088     if (value == null || !(value instanceof List<?>))
1089     {
1090       return null;
1091     }
1092
1093     /*
1094      * inspect each consequence in turn (comma-separated blocks
1095      * extracted by htsjdk)
1096      */
1097     List<String> consequences = (List<String>) value;
1098
1099     for (String consequence : consequences)
1100     {
1101       String[] csqFields = consequence.split(PIPE_REGEX);
1102       if (csqFields.length > featureFieldIndex)
1103       {
1104         String featureIdentifier = csqFields[featureFieldIndex];
1105         if (featureIdentifier.length() > 4
1106                 && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
1107         {
1108           /*
1109            * feature (transcript) matched - now check for allele match
1110            */
1111           if (matchAllele(variant, altAlleleIndex, csqFields,
1112                   alleleFieldIndex, alleleNumberFieldIndex))
1113           {
1114             return consequence;
1115           }
1116         }
1117       }
1118     }
1119     return null;
1120   }
1121
1122   private boolean matchAllele(VariantContext variant, int altAlleleIndex,
1123           String[] csqFields, int alleleFieldIndex,
1124           int alleleNumberFieldIndex)
1125   {
1126     /*
1127      * if ALLELE_NUM is present, it must match altAlleleIndex
1128      * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
1129      */
1130     if (alleleNumberFieldIndex > -1)
1131     {
1132       if (csqFields.length <= alleleNumberFieldIndex)
1133       {
1134         return false;
1135       }
1136       String alleleNum = csqFields[alleleNumberFieldIndex];
1137       return String.valueOf(altAlleleIndex + 1).equals(alleleNum);
1138     }
1139
1140     /*
1141      * else consequence allele must match variant allele
1142      */
1143     if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex)
1144     {
1145       String csqAllele = csqFields[alleleFieldIndex];
1146       String vcfAllele = variant.getAlternateAllele(altAlleleIndex)
1147               .getBaseString();
1148       return csqAllele.equals(vcfAllele);
1149     }
1150     return false;
1151   }
1152
1153   /**
1154    * Add any allele-specific VCF key-value data to the sequence feature
1155    *
1156    * @param variant
1157    * @param sf
1158    * @param altAlelleIndex
1159    *          (0, 1..)
1160    * @param consequence
1161    *          if not null, the consequence specific to this sequence (transcript
1162    *          feature) and allele
1163    */
1164   protected void addAlleleProperties(VariantContext variant,
1165           SequenceFeature sf, final int altAlelleIndex, String consequence)
1166   {
1167     Map<String, Object> atts = variant.getAttributes();
1168
1169     for (Entry<String, Object> att : atts.entrySet())
1170     {
1171       String key = att.getKey();
1172
1173       /*
1174        * extract Consequence data (if present) that we are able to
1175        * associated with the allele for this variant feature
1176        */
1177       if (CSQ_FIELD.equals(key))
1178       {
1179         addConsequences(variant, sf, consequence);
1180         continue;
1181       }
1182
1183       /*
1184        * filter out fields we don't want to capture
1185        */
1186       if (!vcfFieldsOfInterest.contains(key))
1187       {
1188         continue;
1189       }
1190
1191       /*
1192        * filter out fields we don't want to capture
1193        */
1194       if (!vcfFieldsOfInterest.contains(key))
1195       {
1196         continue;
1197       }
1198
1199       /*
1200        * we extract values for other data which are allele-specific;
1201        * these may be per alternate allele (INFO[key].Number = 'A')
1202        * or per allele including reference (INFO[key].Number = 'R')
1203        */
1204       VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key);
1205       if (infoHeader == null)
1206       {
1207         /*
1208          * can't be sure what data belongs to this allele, so
1209          * play safe and don't take any
1210          */
1211         continue;
1212       }
1213
1214       VCFHeaderLineCount number = infoHeader.getCountType();
1215       int index = altAlelleIndex;
1216       if (number == VCFHeaderLineCount.R)
1217       {
1218         /*
1219          * one value per allele including reference, so bump index
1220          * e.g. the 3rd value is for the  2nd alternate allele
1221          */
1222         index++;
1223       }
1224       else if (number != VCFHeaderLineCount.A)
1225       {
1226         /*
1227          * don't save other values as not allele-related
1228          */
1229         continue;
1230       }
1231
1232       /*
1233        * take the index'th value
1234        */
1235       String value = getAttributeValue(variant, key, index);
1236       if (value != null)
1237       {
1238         sf.setValue(key, value);
1239       }
1240     }
1241   }
1242
1243   /**
1244    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
1245    * feature.
1246    * <p>
1247    * If <code>myConsequence</code> is not null, then this is the specific
1248    * consequence data (pipe-delimited fields) that is for the current allele and
1249    * transcript (sequence) being processed)
1250    *
1251    * @param variant
1252    * @param sf
1253    * @param myConsequence
1254    */
1255   protected void addConsequences(VariantContext variant, SequenceFeature sf,
1256           String myConsequence)
1257   {
1258     Object value = variant.getAttribute(CSQ_FIELD);
1259
1260     if (value == null || !(value instanceof List<?>))
1261     {
1262       return;
1263     }
1264
1265     List<String> consequences = (List<String>) value;
1266
1267     /*
1268      * inspect CSQ consequences; restrict to the consequence
1269      * associated with the current transcript (Feature)
1270      */
1271     Map<String, String> csqValues = new HashMap<>();
1272
1273     for (String consequence : consequences)
1274     {
1275       if (myConsequence == null || myConsequence.equals(consequence))
1276       {
1277         String[] csqFields = consequence.split(PIPE_REGEX);
1278
1279         /*
1280          * inspect individual fields of this consequence, copying non-null
1281          * values which are 'fields of interest'
1282          */
1283         int i = 0;
1284         for (String field : csqFields)
1285         {
1286           if (field != null && field.length() > 0)
1287           {
1288             String id = vepFieldsOfInterest.get(i);
1289             if (id != null)
1290             {
1291               csqValues.put(id, field);
1292             }
1293           }
1294           i++;
1295         }
1296       }
1297     }
1298
1299     if (!csqValues.isEmpty())
1300     {
1301       sf.setValue(CSQ_FIELD, csqValues);
1302     }
1303   }
1304
1305   /**
1306    * A convenience method to complement a dna base and return the string value
1307    * of its complement
1308    *
1309    * @param reference
1310    * @return
1311    */
1312   protected String complement(byte[] reference)
1313   {
1314     return String.valueOf(Dna.getComplement((char) reference[0]));
1315   }
1316
1317   /**
1318    * Determines the location of the query range (chromosome positions) in a
1319    * different reference assembly.
1320    * <p>
1321    * If the range is just a subregion of one for which we already have a mapping
1322    * (for example, an exon sub-region of a gene), then the mapping is just
1323    * computed arithmetically.
1324    * <p>
1325    * Otherwise, calls the Ensembl REST service that maps from one assembly
1326    * reference's coordinates to another's
1327    *
1328    * @param queryRange
1329    *          start-end chromosomal range in 'fromRef' coordinates
1330    * @param chromosome
1331    * @param species
1332    * @param fromRef
1333    *          assembly reference for the query coordinates
1334    * @param toRef
1335    *          assembly reference we wish to translate to
1336    * @return the start-end range in 'toRef' coordinates
1337    */
1338   protected int[] mapReferenceRange(int[] queryRange, String chromosome,
1339           String species, String fromRef, String toRef)
1340   {
1341     /*
1342      * first try shorcut of computing the mapping as a subregion of one
1343      * we already have (e.g. for an exon, if we have the gene mapping)
1344      */
1345     int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
1346             species, fromRef, toRef);
1347     if (mappedRange != null)
1348     {
1349       return mappedRange;
1350     }
1351
1352     /*
1353      * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
1354      */
1355     EnsemblMap mapper = new EnsemblMap();
1356     int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
1357             toRef, queryRange);
1358
1359     if (mapping == null)
1360     {
1361       // mapping service failure
1362       return null;
1363     }
1364
1365     /*
1366      * save mapping for possible future re-use
1367      */
1368     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1369     if (!assemblyMappings.containsKey(key))
1370     {
1371       assemblyMappings.put(key, new HashMap<int[], int[]>());
1372     }
1373
1374     assemblyMappings.get(key).put(queryRange, mapping);
1375
1376     return mapping;
1377   }
1378
1379   /**
1380    * If we already have a 1:1 contiguous mapping which subsumes the given query
1381    * range, this method just calculates and returns the subset of that mapping,
1382    * else it returns null. In practical terms, if a gene has a contiguous
1383    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
1384    * subsidiary exons occupy unchanged relative positions, and just compute
1385    * these as offsets, rather than do another lookup of the mapping.
1386    * <p>
1387    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
1388    * simply remove this method or let it always return null.
1389    * <p>
1390    * Warning: many rapid calls to the /map service map result in a 429 overload
1391    * error response
1392    *
1393    * @param queryRange
1394    * @param chromosome
1395    * @param species
1396    * @param fromRef
1397    * @param toRef
1398    * @return
1399    */
1400   protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
1401           String species, String fromRef, String toRef)
1402   {
1403     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1404     if (assemblyMappings.containsKey(key))
1405     {
1406       Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
1407       for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
1408       {
1409         int[] fromRange = mappedRange.getKey();
1410         int[] toRange = mappedRange.getValue();
1411         if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
1412         {
1413           /*
1414            * mapping is 1:1 in length, so we trust it to have no discontinuities
1415            */
1416           if (MappingUtils.rangeContains(fromRange, queryRange))
1417           {
1418             /*
1419              * fromRange subsumes our query range
1420              */
1421             int offset = queryRange[0] - fromRange[0];
1422             int mappedRangeFrom = toRange[0] + offset;
1423             int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
1424             return new int[] { mappedRangeFrom, mappedRangeTo };
1425           }
1426         }
1427       }
1428     }
1429     return null;
1430   }
1431
1432   /**
1433    * Transfers the sequence feature to the target sequence, locating its start
1434    * and end range based on the mapping. Features which do not overlap the
1435    * target sequence are ignored.
1436    *
1437    * @param sf
1438    * @param targetSequence
1439    * @param mapping
1440    *          mapping from the feature's coordinates to the target sequence
1441    */
1442   protected void transferFeature(SequenceFeature sf,
1443           SequenceI targetSequence, MapList mapping)
1444   {
1445     int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
1446
1447     if (mappedRange != null)
1448     {
1449       String group = sf.getFeatureGroup();
1450       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
1451       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
1452       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
1453               group, sf.getScore());
1454       targetSequence.addSequenceFeature(copy);
1455     }
1456   }
1457
1458   /**
1459    * Formats a ranges map lookup key
1460    *
1461    * @param chromosome
1462    * @param species
1463    * @param fromRef
1464    * @param toRef
1465    * @return
1466    */
1467   protected static String makeRangesKey(String chromosome, String species,
1468           String fromRef, String toRef)
1469   {
1470     return species + EXCL + chromosome + EXCL + fromRef + EXCL
1471             + toRef;
1472   }
1473 }