src/jalview/io/vcf/VCFLoader.java

   1 package jalview.io.vcf;
   2
   3 import jalview.analysis.AlignmentUtils;
   4 import jalview.analysis.Dna;
   5 import jalview.api.AlignViewControllerGuiI;
   6 import jalview.bin.Cache;
   7 import jalview.datamodel.DBRefEntry;
   8 import jalview.datamodel.GeneLociI;
   9 import jalview.datamodel.Mapping;
  10 import jalview.datamodel.SequenceFeature;
  11 import jalview.datamodel.SequenceI;
  12 import jalview.datamodel.features.FeatureAttributeType;
  13 import jalview.datamodel.features.FeatureSource;
  14 import jalview.datamodel.features.FeatureSources;
  15 import jalview.ext.ensembl.EnsemblMap;
  16 import jalview.ext.htsjdk.HtsContigDb;
  17 import jalview.ext.htsjdk.VCFReader;
  18 import jalview.io.gff.Gff3Helper;
  19 import jalview.io.gff.SequenceOntologyI;
  20 import jalview.util.MapList;
  21 import jalview.util.MappingUtils;
  22 import jalview.util.MessageManager;
  23
  24 import java.io.File;
  25 import java.io.IOException;
  26 import java.util.ArrayList;
  27 import java.util.HashMap;
  28 import java.util.List;
  29 import java.util.Map;
  30 import java.util.Map.Entry;
  31 import java.util.regex.Pattern;
  32 import java.util.regex.PatternSyntaxException;
  33
  34 import htsjdk.samtools.SAMException;
  35 import htsjdk.samtools.SAMSequenceDictionary;
  36 import htsjdk.samtools.SAMSequenceRecord;
  37 import htsjdk.samtools.util.CloseableIterator;
  38 import htsjdk.variant.variantcontext.Allele;
  39 import htsjdk.variant.variantcontext.VariantContext;
  40 import htsjdk.variant.vcf.VCFHeader;
  41 import htsjdk.variant.vcf.VCFHeaderLine;
  42 import htsjdk.variant.vcf.VCFHeaderLineCount;
  43 import htsjdk.variant.vcf.VCFHeaderLineType;
  44 import htsjdk.variant.vcf.VCFInfoHeaderLine;
  45
  46 /**
  47  * A class to read VCF data (using the htsjdk) and add variants as sequence
  48  * features on dna and any related protein product sequences
  49  *
  50  * @author gmcarstairs
  51  */
  52 public class VCFLoader
  53 {
  54   private static final String DEFAULT_SPECIES = "homo_sapiens";
  55
  56   /**
  57    * A class to model the mapping from sequence to VCF coordinates. Cases include
  58    * <ul>
  59    * <li>a direct 1:1 mapping where the sequence is one of the VCF contigs</li>
  60    * <li>a mapping of sequence to chromosomal coordinates, where sequence and VCF
  61    * use the same reference assembly</li>
  62    * <li>a modified mapping of sequence to chromosomal coordinates, where sequence
  63    * and VCF use different reference assembles</li>
  64    * </ul>
  65    */
  66   class VCFMap
  67   {
  68     final String chromosome;
  69
  70     final MapList map;
  71
  72     VCFMap(String chr, MapList m)
  73     {
  74       chromosome = chr;
  75       map = m;
  76     }
  77
  78     @Override
  79     public String toString()
  80     {
  81       return chromosome + ":" + map.toString();
  82     }
  83   }
  84
  85   /*
  86    * Lookup keys, and default values, for Preference entries that describe
  87    * patterns for VCF and VEP fields to capture
  88    */
  89   private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
  90
  91   private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
  92
  93   private static final String DEFAULT_VCF_FIELDS = ".*";
  94
  95   private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
  96
  97   /*
  98    * Lookup keys, and default values, for Preference entries that give
  99    * mappings from tokens in the 'reference' header to species or assembly
 100    */
 101   private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY";
 102
 103   private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh38,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38";
 104
 105   private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human
 106
 107   /*
 108    * keys to fields of VEP CSQ consequence data
 109    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 110    */
 111   private static final String CSQ_CONSEQUENCE_KEY = "Consequence";
 112   private static final String CSQ_ALLELE_KEY = "Allele";
 113   private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
 114   private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id
 115
 116   /*
 117    * default VCF INFO key for VEP consequence data
 118    * NB this can be overridden running VEP with --vcf_info_field
 119    * - we don't handle this case (require identifier to be CSQ)
 120    */
 121   private static final String CSQ_FIELD = "CSQ";
 122
 123   /*
 124    * separator for fields in consequence data is '|'
 125    */
 126   private static final String PIPE_REGEX = "\\|";
 127
 128   /*
 129    * delimiter that separates multiple consequence data blocks
 130    */
 131   private static final String COMMA = ",";
 132
 133   /*
 134    * the feature group assigned to a VCF variant in Jalview
 135    */
 136   private static final String FEATURE_GROUP_VCF = "VCF";
 137
 138   /*
 139    * internal delimiter used to build keys for assemblyMappings
 140    *
 141    */
 142   private static final String EXCL = "!";
 143
 144   /*
 145    * the VCF file we are processing
 146    */
 147   protected String vcfFilePath;
 148
 149   /*
 150    * mappings between VCF and sequence reference assembly regions, as
 151    * key = "species!chromosome!fromAssembly!toAssembly
 152    * value = Map{fromRange, toRange}
 153    */
 154   private Map<String, Map<int[], int[]>> assemblyMappings;
 155
 156   private VCFReader reader;
 157
 158   /*
 159    * holds details of the VCF header lines (metadata)
 160    */
 161   private VCFHeader header;
 162
 163   /*
 164    * species (as a valid Ensembl term) the VCF is for
 165    */
 166   private String vcfSpecies;
 167
 168   /*
 169    * genome assembly version (as a valid Ensembl identifier) the VCF is for
 170    */
 171   private String vcfAssembly;
 172
 173   /*
 174    * a Dictionary of contigs (if present) referenced in the VCF file
 175    */
 176   private SAMSequenceDictionary dictionary;
 177
 178   /*
 179    * the position (0...) of field in each block of
 180    * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
 181    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 182    */
 183   private int csqConsequenceFieldIndex = -1;
 184   private int csqAlleleFieldIndex = -1;
 185   private int csqAlleleNumberFieldIndex = -1;
 186   private int csqFeatureFieldIndex = -1;
 187
 188   // todo the same fields for SnpEff ANN data if wanted
 189   // see http://snpeff.sourceforge.net/SnpEff_manual.html#input
 190
 191   /*
 192    * a unique identifier under which to save metadata about feature
 193    * attributes (selected INFO field data)
 194    */
 195   private String sourceId;
 196
 197   /*
 198    * The INFO IDs of data that is both present in the VCF file, and
 199    * also matched by any filters for data of interest
 200    */
 201   List<String> vcfFieldsOfInterest;
 202
 203   /*
 204    * The field offsets and identifiers for VEP (CSQ) data that is both present
 205    * in the VCF file, and also matched by any filters for data of interest
 206    * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
 207    */
 208   Map<Integer, String> vepFieldsOfInterest;
 209
 210   /**
 211    * Constructor given a VCF file
 212    *
 213    * @param alignment
 214    */
 215   public VCFLoader(String vcfFile)
 216   {
 217     try
 218     {
 219       initialise(vcfFile);
 220     } catch (IOException e)
 221     {
 222       System.err.println("Error opening VCF file: " + e.getMessage());
 223     }
 224
 225     // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
 226     assemblyMappings = new HashMap<>();
 227   }
 228
 229   /**
 230    * Starts a new thread to query and load VCF variant data on to the given
 231    * sequences
 232    * <p>
 233    * This method is not thread safe - concurrent threads should use separate
 234    * instances of this class.
 235    *
 236    * @param seqs
 237    * @param gui
 238    */
 239   public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui)
 240   {
 241     if (gui != null)
 242     {
 243       gui.setStatus(MessageManager.getString("label.searching_vcf"));
 244     }
 245
 246     new Thread()
 247     {
 248       @Override
 249       public void run()
 250       {
 251         VCFLoader.this.doLoad(seqs, gui);
 252       }
 253     }.start();
 254   }
 255
 256   /**
 257    * Reads the specified contig sequence and adds its VCF variants to it
 258    *
 259    * @param contig
 260    *          the id of a single sequence (contig) to load
 261    * @return
 262    */
 263   public SequenceI loadVCFContig(String contig)
 264   {
 265     String ref = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY)
 266             .getValue();
 267     if (ref.startsWith("file://"))
 268     {
 269       ref = ref.substring(7);
 270     }
 271     setSpeciesAndAssembly(ref);
 272
 273     SequenceI seq = null;
 274     File dbFile = new File(ref);
 275
 276     if (dbFile.exists())
 277     {
 278       HtsContigDb db = new HtsContigDb("", dbFile);
 279       seq = db.getSequenceProxy(contig);
 280       loadSequenceVCF(seq);
 281       db.close();
 282     }
 283     else
 284     {
 285       System.err.println("VCF reference not found: " + ref);
 286     }
 287
 288     return seq;
 289   }
 290
 291   /**
 292    * Loads VCF on to one or more sequences
 293    *
 294    * @param seqs
 295    * @param gui
 296    *          optional callback handler for messages
 297    */
 298   protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui)
 299   {
 300     try
 301     {
 302       VCFHeaderLine ref = header
 303               .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 304       String reference = ref.getValue();
 305
 306       setSpeciesAndAssembly(reference);
 307
 308       int varCount = 0;
 309       int seqCount = 0;
 310
 311       /*
 312        * query for VCF overlapping each sequence in turn
 313        */
 314       for (SequenceI seq : seqs)
 315       {
 316         int added = loadSequenceVCF(seq);
 317         if (added > 0)
 318         {
 319           seqCount++;
 320           varCount += added;
 321           transferAddedFeatures(seq);
 322         }
 323       }
 324       if (gui != null)
 325       {
 326         String msg = MessageManager.formatMessage("label.added_vcf",
 327                 varCount, seqCount);
 328         gui.setStatus(msg);
 329         if (gui.getFeatureSettingsUI() != null)
 330         {
 331           gui.getFeatureSettingsUI().discoverAllFeatureData();
 332         }
 333       }
 334     } catch (Throwable e)
 335     {
 336       System.err.println("Error processing VCF: " + e.getMessage());
 337       e.printStackTrace();
 338       if (gui != null)
 339       {
 340         gui.setStatus("Error occurred - see console for details");
 341       }
 342     } finally
 343     {
 344       if (reader != null)
 345       {
 346         try
 347         {
 348           reader.close();
 349         } catch (IOException e)
 350         {
 351           // ignore
 352         }
 353       }
 354       header = null;
 355       dictionary = null;
 356     }
 357   }
 358
 359   /**
 360    * Attempts to determine and save the species and genome assembly version to
 361    * which the VCF data applies. This may be done by parsing the {@code reference}
 362    * header line, configured in a property file, or (potentially) confirmed
 363    * interactively by the user.
 364    * <p>
 365    * The saved values should be identifiers valid for Ensembl's REST service
 366    * {@code map} endpoint, so they can be used (if necessary) to retrieve the
 367    * mapping between VCF coordinates and sequence coordinates.
 368    *
 369    * @param reference
 370    * @see https://rest.ensembl.org/documentation/info/assembly_map
 371    * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
 372    * @see https://rest.ensembl.org/info/species?content-type=text/xml
 373    */
 374   protected void setSpeciesAndAssembly(String reference)
 375   {
 376     vcfSpecies = DEFAULT_SPECIES;
 377
 378     /*
 379      * for a non-human species, or other assembly identifier,
 380      * specify as a Jalview property file entry e.g.
 381      * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
 382      * VCF_SPECIES = c_elegans=celegans
 383      * to map a token in the reference header to a value
 384      */
 385     String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
 386     for (String token : prop.split(","))
 387     {
 388       String[] tokens = token.split("=");
 389       if (tokens.length == 2)
 390       {
 391         if (reference.contains(tokens[0].trim().toLowerCase()))
 392         {
 393           vcfAssembly = tokens[1].trim();
 394           break;
 395         }
 396       }
 397     }
 398
 399     prop = Cache.getProperty(VCF_SPECIES);
 400     if (prop != null)
 401     {
 402       for (String token : prop.split(","))
 403       {
 404         String[] tokens = token.split("=");
 405         if (tokens.length == 2)
 406         {
 407           if (reference.contains(tokens[0].trim().toLowerCase()))
 408           {
 409             vcfSpecies = tokens[1].trim();
 410             break;
 411           }
 412         }
 413       }
 414     }
 415   }
 416
 417   /**
 418    * Opens the VCF file and parses header data
 419    *
 420    * @param filePath
 421    * @throws IOException
 422    */
 423   private void initialise(String filePath) throws IOException
 424   {
 425     vcfFilePath = filePath;
 426
 427     reader = new VCFReader(filePath);
 428
 429     header = reader.getFileHeader();
 430
 431     try
 432     {
 433       dictionary = header.getSequenceDictionary();
 434     } catch (SAMException e)
 435     {
 436       // ignore - thrown if any contig line lacks length info
 437     }
 438
 439     sourceId = filePath;
 440
 441     saveMetadata(sourceId);
 442
 443     /*
 444      * get offset of CSQ ALLELE_NUM and Feature if declared
 445      */
 446     parseCsqHeader();
 447   }
 448
 449   /**
 450    * Reads metadata (such as INFO field descriptions and datatypes) and saves
 451    * them for future reference
 452    *
 453    * @param theSourceId
 454    */
 455   void saveMetadata(String theSourceId)
 456   {
 457     List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
 458             DEFAULT_VCF_FIELDS);
 459     vcfFieldsOfInterest = new ArrayList<>();
 460
 461     FeatureSource metadata = new FeatureSource(theSourceId);
 462
 463     for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
 464     {
 465       String attributeId = info.getID();
 466       String desc = info.getDescription();
 467       VCFHeaderLineType type = info.getType();
 468       FeatureAttributeType attType = null;
 469       switch (type)
 470       {
 471       case Character:
 472         attType = FeatureAttributeType.Character;
 473         break;
 474       case Flag:
 475         attType = FeatureAttributeType.Flag;
 476         break;
 477       case Float:
 478         attType = FeatureAttributeType.Float;
 479         break;
 480       case Integer:
 481         attType = FeatureAttributeType.Integer;
 482         break;
 483       case String:
 484         attType = FeatureAttributeType.String;
 485         break;
 486       }
 487       metadata.setAttributeName(attributeId, desc);
 488       metadata.setAttributeType(attributeId, attType);
 489
 490       if (isFieldWanted(attributeId, vcfFieldPatterns))
 491       {
 492         vcfFieldsOfInterest.add(attributeId);
 493       }
 494     }
 495
 496     FeatureSources.getInstance().addSource(theSourceId, metadata);
 497   }
 498
 499   /**
 500    * Answers true if the field id is matched by any of the filter patterns, else
 501    * false. Matching is against regular expression patterns, and is not
 502    * case-sensitive.
 503    *
 504    * @param id
 505    * @param filters
 506    * @return
 507    */
 508   private boolean isFieldWanted(String id, List<Pattern> filters)
 509   {
 510     for (Pattern p : filters)
 511     {
 512       if (p.matcher(id.toUpperCase()).matches())
 513       {
 514         return true;
 515       }
 516     }
 517     return false;
 518   }
 519
 520   /**
 521    * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
 522    * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
 523    * required for processing.
 524    * <p>
 525    * CSQ fields are declared in the CSQ INFO Description e.g.
 526    * <p>
 527    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
 528    */
 529   protected void parseCsqHeader()
 530   {
 531     List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
 532             DEFAULT_VEP_FIELDS);
 533     vepFieldsOfInterest = new HashMap<>();
 534
 535     VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
 536     if (csqInfo == null)
 537     {
 538       return;
 539     }
 540
 541     /*
 542      * parse out the pipe-separated list of CSQ fields; we assume here that
 543      * these form the last part of the description, and contain no spaces
 544      */
 545     String desc = csqInfo.getDescription();
 546     int spacePos = desc.lastIndexOf(" ");
 547     desc = desc.substring(spacePos + 1);
 548
 549     if (desc != null)
 550     {
 551       String[] format = desc.split(PIPE_REGEX);
 552       int index = 0;
 553       for (String field : format)
 554       {
 555         if (CSQ_CONSEQUENCE_KEY.equals(field))
 556         {
 557           csqConsequenceFieldIndex = index;
 558         }
 559         if (CSQ_ALLELE_NUM_KEY.equals(field))
 560         {
 561           csqAlleleNumberFieldIndex = index;
 562         }
 563         if (CSQ_ALLELE_KEY.equals(field))
 564         {
 565           csqAlleleFieldIndex = index;
 566         }
 567         if (CSQ_FEATURE_KEY.equals(field))
 568         {
 569           csqFeatureFieldIndex = index;
 570         }
 571
 572         if (isFieldWanted(field, vepFieldFilters))
 573         {
 574           vepFieldsOfInterest.put(index, field);
 575         }
 576
 577         index++;
 578       }
 579     }
 580   }
 581
 582   /**
 583    * Reads the Preference value for the given key, with default specified if no
 584    * preference set. The value is interpreted as a comma-separated list of
 585    * regular expressions, and converted into a list of compiled patterns ready
 586    * for matching. Patterns are forced to upper-case for non-case-sensitive
 587    * matching.
 588    * <p>
 589    * This supports user-defined filters for fields of interest to capture while
 590    * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
 591    * fields with an ID of AF, or starting with AC, would be matched.
 592    *
 593    * @param key
 594    * @param def
 595    * @return
 596    */
 597   private List<Pattern> getFieldMatchers(String key, String def)
 598   {
 599     String pref = Cache.getDefault(key, def);
 600     List<Pattern> patterns = new ArrayList<>();
 601     String[] tokens = pref.split(",");
 602     for (String token : tokens)
 603     {
 604       try
 605       {
 606       patterns.add(Pattern.compile(token.toUpperCase()));
 607       } catch (PatternSyntaxException e)
 608       {
 609         System.err.println("Invalid pattern ignored: " + token);
 610       }
 611     }
 612     return patterns;
 613   }
 614
 615   /**
 616    * Transfers VCF features to sequences to which this sequence has a mapping.
 617    * If the mapping is 3:1, computes peptide variants from nucleotide variants.
 618    *
 619    * @param seq
 620    */
 621   protected void transferAddedFeatures(SequenceI seq)
 622   {
 623     DBRefEntry[] dbrefs = seq.getDBRefs();
 624     if (dbrefs == null)
 625     {
 626       return;
 627     }
 628     for (DBRefEntry dbref : dbrefs)
 629     {
 630       Mapping mapping = dbref.getMap();
 631       if (mapping == null || mapping.getTo() == null)
 632       {
 633         continue;
 634       }
 635
 636       SequenceI mapTo = mapping.getTo();
 637       MapList map = mapping.getMap();
 638       if (map.getFromRatio() == 3)
 639       {
 640         /*
 641          * dna-to-peptide product mapping
 642          */
 643         AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
 644       }
 645       else
 646       {
 647         /*
 648          * nucleotide-to-nucleotide mapping e.g. transcript to CDS
 649          */
 650         List<SequenceFeature> features = seq.getFeatures()
 651                 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
 652         for (SequenceFeature sf : features)
 653         {
 654           if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
 655           {
 656             transferFeature(sf, mapTo, map);
 657           }
 658         }
 659       }
 660     }
 661   }
 662
 663   /**
 664    * Tries to add overlapping variants read from a VCF file to the given sequence,
 665    * and returns the number of variant features added
 666    *
 667    * @param seq
 668    * @return
 669    */
 670   protected int loadSequenceVCF(SequenceI seq)
 671   {
 672     VCFMap vcfMap = getVcfMap(seq);
 673     if (vcfMap == null)
 674     {
 675       return 0;
 676     }
 677
 678     /*
 679      * work with the dataset sequence here
 680      */
 681     SequenceI dss = seq.getDatasetSequence();
 682     if (dss == null)
 683     {
 684       dss = seq;
 685     }
 686     return addVcfVariants(dss, vcfMap);
 687   }
 688
 689   /**
 690    * Answers a map from sequence coordinates to VCF chromosome ranges
 691    *
 692    * @param seq
 693    * @return
 694    */
 695   private VCFMap getVcfMap(SequenceI seq)
 696   {
 697     /*
 698      * simplest case: sequence has id and length matching a VCF contig
 699      */
 700     VCFMap vcfMap = null;
 701     if (dictionary != null)
 702     {
 703       vcfMap = getContigMap(seq);
 704     }
 705     if (vcfMap != null)
 706     {
 707       return vcfMap;
 708     }
 709
 710     /*
 711      * otherwise, map to VCF from chromosomal coordinates
 712      * of the sequence (if known)
 713      */
 714     GeneLociI seqCoords = seq.getGeneLoci();
 715     if (seqCoords == null)
 716     {
 717       Cache.log.warn(String.format(
 718               "Can't query VCF for %s as chromosome coordinates not known",
 719               seq.getName()));
 720       return null;
 721     }
 722
 723     String species = seqCoords.getSpeciesId();
 724     String chromosome = seqCoords.getChromosomeId();
 725     String seqRef = seqCoords.getAssemblyId();
 726     MapList map = seqCoords.getMap();
 727
 728     // note this requires the configured species to match that
 729     // returned with the Ensembl sequence; todo: support aliases?
 730     if (!vcfSpecies.equalsIgnoreCase(species))
 731     {
 732       Cache.log.warn("No VCF loaded to " + seq.getName()
 733               + " as species not matched");
 734       return null;
 735     }
 736
 737     if (seqRef.equalsIgnoreCase(vcfAssembly))
 738     {
 739       return new VCFMap(chromosome, map);
 740     }
 741
 742     /*
 743      * VCF data has a different reference assembly to the sequence:
 744      * query Ensembl to map chromosomal coordinates from sequence to VCF
 745      */
 746     List<int[]> toVcfRanges = new ArrayList<>();
 747     List<int[]> fromSequenceRanges = new ArrayList<>();
 748
 749     for (int[] range : map.getToRanges())
 750     {
 751       int[] fromRange = map.locateInFrom(range[0], range[1]);
 752       if (fromRange == null)
 753       {
 754         // corrupted map?!?
 755         continue;
 756       }
 757
 758       int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
 759               vcfAssembly);
 760       if (newRange == null)
 761       {
 762         Cache.log.error(
 763                 String.format("Failed to map %s:%s:%s:%d:%d to %s", species,
 764                         chromosome, seqRef, range[0], range[1],
 765                         vcfAssembly));
 766         continue;
 767       }
 768       else
 769       {
 770         toVcfRanges.add(newRange);
 771         fromSequenceRanges.add(fromRange);
 772       }
 773     }
 774
 775     return new VCFMap(chromosome,
 776             new MapList(fromSequenceRanges, toVcfRanges, 1, 1));
 777   }
 778
 779   /**
 780    * If the sequence id matches a contig declared in the VCF file, and the
 781    * sequence length matches the contig length, then returns a 1:1 map of the
 782    * sequence to the contig, else returns null
 783    *
 784    * @param seq
 785    * @return
 786    */
 787   private VCFMap getContigMap(SequenceI seq)
 788   {
 789     String id = seq.getName();
 790     SAMSequenceRecord contig = dictionary.getSequence(id);
 791     if (contig != null)
 792     {
 793       int len = seq.getLength();
 794       if (len == contig.getSequenceLength())
 795       {
 796         MapList map = new MapList(new int[] { 1, len },
 797                 new int[]
 798                 { 1, len }, 1, 1);
 799         return new VCFMap(id, map);
 800       }
 801     }
 802     return null;
 803   }
 804
 805   /**
 806    * Answers true if the species inferred from the VCF reference identifier
 807    * matches that for the sequence
 808    *
 809    * @param vcfAssembly
 810    * @param speciesId
 811    * @return
 812    */
 813   boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId)
 814   {
 815     // PROBLEM 1
 816     // there are many aliases for species - how to equate one with another?
 817     // PROBLEM 2
 818     // VCF ##reference header is an unstructured URI - how to extract species?
 819     // perhaps check if ref includes any (Ensembl) alias of speciesId??
 820     // TODO ask the user to confirm this??
 821
 822     if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example
 823             && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id
 824     {
 825       return true;
 826     }
 827
 828     if (vcfAssembly.contains("c_elegans") // VEP VCF response example
 829             && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl
 830     {
 831       return true;
 832     }
 833
 834     // this is not a sustainable solution...
 835
 836     return false;
 837   }
 838
 839   /**
 840    * Queries the VCF reader for any variants that overlap the mapped chromosome
 841    * ranges of the sequence, and adds as variant features. Returns the number of
 842    * overlapping variants found.
 843    *
 844    * @param seq
 845    * @param map
 846    *          mapping from sequence to VCF coordinates
 847    * @return
 848    */
 849   protected int addVcfVariants(SequenceI seq, VCFMap map)
 850   {
 851     boolean forwardStrand = map.map.isToForwardStrand();
 852
 853     /*
 854      * query the VCF for overlaps of each contiguous chromosomal region
 855      */
 856     int count = 0;
 857
 858     for (int[] range : map.map.getToRanges())
 859     {
 860       int vcfStart = Math.min(range[0], range[1]);
 861       int vcfEnd = Math.max(range[0], range[1]);
 862       CloseableIterator<VariantContext> variants = reader
 863               .query(map.chromosome, vcfStart, vcfEnd);
 864       while (variants.hasNext())
 865       {
 866         VariantContext variant = variants.next();
 867
 868         int[] featureRange = map.map.locateInFrom(variant.getStart(),
 869                 variant.getEnd());
 870
 871         if (featureRange != null)
 872         {
 873           int featureStart = Math.min(featureRange[0], featureRange[1]);
 874           int featureEnd = Math.max(featureRange[0], featureRange[1]);
 875           count += addAlleleFeatures(seq, variant, featureStart, featureEnd,
 876                   forwardStrand);
 877         }
 878       }
 879       variants.close();
 880     }
 881
 882     return count;
 883   }
 884
 885   /**
 886    * A convenience method to get an attribute value for an alternate allele
 887    *
 888    * @param variant
 889    * @param attributeName
 890    * @param alleleIndex
 891    * @return
 892    */
 893   protected String getAttributeValue(VariantContext variant,
 894           String attributeName, int alleleIndex)
 895   {
 896     Object att = variant.getAttribute(attributeName);
 897
 898     if (att instanceof String)
 899     {
 900       return (String) att;
 901     }
 902     else if (att instanceof ArrayList)
 903     {
 904       return ((List<String>) att).get(alleleIndex);
 905     }
 906
 907     return null;
 908   }
 909
 910   /**
 911    * Adds one variant feature for each allele in the VCF variant record, and
 912    * returns the number of features added.
 913    *
 914    * @param seq
 915    * @param variant
 916    * @param featureStart
 917    * @param featureEnd
 918    * @param forwardStrand
 919    * @return
 920    */
 921   protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
 922           int featureStart, int featureEnd, boolean forwardStrand)
 923   {
 924     int added = 0;
 925
 926     /*
 927      * Javadoc says getAlternateAlleles() imposes no order on the list returned
 928      * so we proceed defensively to get them in strict order
 929      */
 930     int altAlleleCount = variant.getAlternateAlleles().size();
 931     for (int i = 0; i < altAlleleCount; i++)
 932     {
 933       added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
 934               forwardStrand);
 935     }
 936     return added;
 937   }
 938
 939   /**
 940    * Inspects one allele and attempts to add a variant feature for it to the
 941    * sequence. The additional data associated with this allele is extracted to
 942    * store in the feature's key-value map. Answers the number of features added (0
 943    * or 1).
 944    *
 945    * @param seq
 946    * @param variant
 947    * @param altAlleleIndex
 948    *          (0, 1..)
 949    * @param featureStart
 950    * @param featureEnd
 951    * @param forwardStrand
 952    * @return
 953    */
 954   protected int addAlleleFeature(SequenceI seq, VariantContext variant,
 955           int altAlleleIndex, int featureStart, int featureEnd,
 956           boolean forwardStrand)
 957   {
 958     String reference = variant.getReference().getBaseString();
 959     Allele alt = variant.getAlternateAllele(altAlleleIndex);
 960     String allele = alt.getBaseString();
 961
 962     /*
 963      * insertion after a genomic base, if on reverse strand, has to be
 964      * converted to insertion of complement after the preceding position
 965      */
 966     int referenceLength = reference.length();
 967     if (!forwardStrand && allele.length() > referenceLength
 968             && allele.startsWith(reference))
 969     {
 970       featureStart -= referenceLength;
 971       featureEnd = featureStart;
 972       char insertAfter = seq.getCharAt(featureStart - seq.getStart());
 973       reference = Dna.reverseComplement(String.valueOf(insertAfter));
 974       allele = allele.substring(referenceLength) + reference;
 975     }
 976
 977     /*
 978      * build the ref,alt allele description e.g. "G,A", using the base
 979      * complement if the sequence is on the reverse strand
 980      */
 981     StringBuilder sb = new StringBuilder();
 982     sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
 983     sb.append(COMMA);
 984     sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
 985     String alleles = sb.toString(); // e.g. G,A
 986
 987     /*
 988      * pick out the consequence data (if any) that is for the current allele
 989      * and feature (transcript) that matches the current sequence
 990      */
 991     String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD,
 992             altAlleleIndex, csqAlleleFieldIndex,
 993             csqAlleleNumberFieldIndex, seq.getName().toLowerCase(),
 994             csqFeatureFieldIndex);
 995
 996     /*
 997      * pick out the ontology term for the consequence type
 998      */
 999     String type = SequenceOntologyI.SEQUENCE_VARIANT;
1000     if (consequence != null)
1001     {
1002       type = getOntologyTerm(consequence);
1003     }
1004
1005     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
1006             featureEnd, FEATURE_GROUP_VCF);
1007     sf.setSource(sourceId);
1008
1009     sf.setValue(Gff3Helper.ALLELES, alleles);
1010
1011     addAlleleProperties(variant, sf, altAlleleIndex, consequence);
1012
1013     seq.addSequenceFeature(sf);
1014
1015     return 1;
1016   }
1017
1018   /**
1019    * Determines the Sequence Ontology term to use for the variant feature type in
1020    * Jalview. The default is 'sequence_variant', but a more specific term is used
1021    * if:
1022    * <ul>
1023    * <li>VEP (or SnpEff) Consequence annotation is included in the VCF</li>
1024    * <li>sequence id can be matched to VEP Feature (or SnpEff Feature_ID)</li>
1025    * </ul>
1026    *
1027    * @param consequence
1028    * @return
1029    * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
1030    */
1031   String getOntologyTerm(String consequence)
1032   {
1033     String type = SequenceOntologyI.SEQUENCE_VARIANT;
1034
1035     /*
1036      * could we associate Consequence data with this allele and feature (transcript)?
1037      * if so, prefer the consequence term from that data
1038      */
1039     if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
1040     {
1041       /*
1042        * no Consequence data so we can't refine the ontology term
1043        */
1044       return type;
1045     }
1046
1047     if (consequence != null)
1048     {
1049       String[] csqFields = consequence.split(PIPE_REGEX);
1050       if (csqFields.length > csqConsequenceFieldIndex)
1051       {
1052         type = csqFields[csqConsequenceFieldIndex];
1053       }
1054     }
1055     else
1056     {
1057       // todo the same for SnpEff consequence data matching if wanted
1058     }
1059
1060     /*
1061      * if of the form (e.g.) missense_variant&splice_region_variant,
1062      * just take the first ('most severe') consequence
1063      */
1064     if (type != null)
1065     {
1066       int pos = type.indexOf('&');
1067       if (pos > 0)
1068       {
1069         type = type.substring(0, pos);
1070       }
1071     }
1072     return type;
1073   }
1074
1075   /**
1076    * Returns matched consequence data if it can be found, else null.
1077    * <ul>
1078    * <li>inspects the VCF data for key 'vcfInfoId'</li>
1079    * <li>splits this on comma (to distinct consequences)</li>
1080    * <li>returns the first consequence (if any) where</li>
1081    * <ul>
1082    * <li>the allele matches the altAlleleIndex'th allele of variant</li>
1083    * <li>the feature matches the sequence name (e.g. transcript id)</li>
1084    * </ul>
1085    * </ul>
1086    * If matched, the consequence is returned (as pipe-delimited fields).
1087    *
1088    * @param variant
1089    * @param vcfInfoId
1090    * @param altAlleleIndex
1091    * @param alleleFieldIndex
1092    * @param alleleNumberFieldIndex
1093    * @param seqName
1094    * @param featureFieldIndex
1095    * @return
1096    */
1097   private String getConsequenceForAlleleAndFeature(VariantContext variant,
1098           String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
1099           int alleleNumberFieldIndex,
1100           String seqName, int featureFieldIndex)
1101   {
1102     if (alleleFieldIndex == -1 || featureFieldIndex == -1)
1103     {
1104       return null;
1105     }
1106     Object value = variant.getAttribute(vcfInfoId);
1107
1108     if (value == null || !(value instanceof List<?>))
1109     {
1110       return null;
1111     }
1112
1113     /*
1114      * inspect each consequence in turn (comma-separated blocks
1115      * extracted by htsjdk)
1116      */
1117     List<String> consequences = (List<String>) value;
1118
1119     for (String consequence : consequences)
1120     {
1121       String[] csqFields = consequence.split(PIPE_REGEX);
1122       if (csqFields.length > featureFieldIndex)
1123       {
1124         String featureIdentifier = csqFields[featureFieldIndex];
1125         if (featureIdentifier.length() > 4
1126                 && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
1127         {
1128           /*
1129            * feature (transcript) matched - now check for allele match
1130            */
1131           if (matchAllele(variant, altAlleleIndex, csqFields,
1132                   alleleFieldIndex, alleleNumberFieldIndex))
1133           {
1134             return consequence;
1135           }
1136         }
1137       }
1138     }
1139     return null;
1140   }
1141
1142   private boolean matchAllele(VariantContext variant, int altAlleleIndex,
1143           String[] csqFields, int alleleFieldIndex,
1144           int alleleNumberFieldIndex)
1145   {
1146     /*
1147      * if ALLELE_NUM is present, it must match altAlleleIndex
1148      * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
1149      */
1150     if (alleleNumberFieldIndex > -1)
1151     {
1152       if (csqFields.length <= alleleNumberFieldIndex)
1153       {
1154         return false;
1155       }
1156       String alleleNum = csqFields[alleleNumberFieldIndex];
1157       return String.valueOf(altAlleleIndex + 1).equals(alleleNum);
1158     }
1159
1160     /*
1161      * else consequence allele must match variant allele
1162      */
1163     if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex)
1164     {
1165       String csqAllele = csqFields[alleleFieldIndex];
1166       String vcfAllele = variant.getAlternateAllele(altAlleleIndex)
1167               .getBaseString();
1168       return csqAllele.equals(vcfAllele);
1169     }
1170     return false;
1171   }
1172
1173   /**
1174    * Add any allele-specific VCF key-value data to the sequence feature
1175    *
1176    * @param variant
1177    * @param sf
1178    * @param altAlelleIndex
1179    *          (0, 1..)
1180    * @param consequence
1181    *          if not null, the consequence specific to this sequence (transcript
1182    *          feature) and allele
1183    */
1184   protected void addAlleleProperties(VariantContext variant,
1185           SequenceFeature sf, final int altAlelleIndex, String consequence)
1186   {
1187     Map<String, Object> atts = variant.getAttributes();
1188
1189     for (Entry<String, Object> att : atts.entrySet())
1190     {
1191       String key = att.getKey();
1192
1193       /*
1194        * extract Consequence data (if present) that we are able to
1195        * associated with the allele for this variant feature
1196        */
1197       if (CSQ_FIELD.equals(key))
1198       {
1199         addConsequences(variant, sf, consequence);
1200         continue;
1201       }
1202
1203       /*
1204        * filter out fields we don't want to capture
1205        */
1206       if (!vcfFieldsOfInterest.contains(key))
1207       {
1208         continue;
1209       }
1210
1211       /*
1212        * filter out fields we don't want to capture
1213        */
1214       if (!vcfFieldsOfInterest.contains(key))
1215       {
1216         continue;
1217       }
1218
1219       /*
1220        * we extract values for other data which are allele-specific;
1221        * these may be per alternate allele (INFO[key].Number = 'A')
1222        * or per allele including reference (INFO[key].Number = 'R')
1223        */
1224       VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key);
1225       if (infoHeader == null)
1226       {
1227         /*
1228          * can't be sure what data belongs to this allele, so
1229          * play safe and don't take any
1230          */
1231         continue;
1232       }
1233
1234       VCFHeaderLineCount number = infoHeader.getCountType();
1235       int index = altAlelleIndex;
1236       if (number == VCFHeaderLineCount.R)
1237       {
1238         /*
1239          * one value per allele including reference, so bump index
1240          * e.g. the 3rd value is for the  2nd alternate allele
1241          */
1242         index++;
1243       }
1244       else if (number != VCFHeaderLineCount.A)
1245       {
1246         /*
1247          * don't save other values as not allele-related
1248          */
1249         continue;
1250       }
1251
1252       /*
1253        * take the index'th value
1254        */
1255       String value = getAttributeValue(variant, key, index);
1256       if (value != null)
1257       {
1258         sf.setValue(key, value);
1259       }
1260     }
1261   }
1262
1263   /**
1264    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
1265    * feature.
1266    * <p>
1267    * If <code>myConsequence</code> is not null, then this is the specific
1268    * consequence data (pipe-delimited fields) that is for the current allele and
1269    * transcript (sequence) being processed)
1270    *
1271    * @param variant
1272    * @param sf
1273    * @param myConsequence
1274    */
1275   protected void addConsequences(VariantContext variant, SequenceFeature sf,
1276           String myConsequence)
1277   {
1278     Object value = variant.getAttribute(CSQ_FIELD);
1279
1280     if (value == null || !(value instanceof List<?>))
1281     {
1282       return;
1283     }
1284
1285     List<String> consequences = (List<String>) value;
1286
1287     /*
1288      * inspect CSQ consequences; restrict to the consequence
1289      * associated with the current transcript (Feature)
1290      */
1291     Map<String, String> csqValues = new HashMap<>();
1292
1293     for (String consequence : consequences)
1294     {
1295       if (myConsequence == null || myConsequence.equals(consequence))
1296       {
1297         String[] csqFields = consequence.split(PIPE_REGEX);
1298
1299         /*
1300          * inspect individual fields of this consequence, copying non-null
1301          * values which are 'fields of interest'
1302          */
1303         int i = 0;
1304         for (String field : csqFields)
1305         {
1306           if (field != null && field.length() > 0)
1307           {
1308             String id = vepFieldsOfInterest.get(i);
1309             if (id != null)
1310             {
1311               csqValues.put(id, field);
1312             }
1313           }
1314           i++;
1315         }
1316       }
1317     }
1318
1319     if (!csqValues.isEmpty())
1320     {
1321       sf.setValue(CSQ_FIELD, csqValues);
1322     }
1323   }
1324
1325   /**
1326    * A convenience method to complement a dna base and return the string value
1327    * of its complement
1328    *
1329    * @param reference
1330    * @return
1331    */
1332   protected String complement(byte[] reference)
1333   {
1334     return String.valueOf(Dna.getComplement((char) reference[0]));
1335   }
1336
1337   /**
1338    * Determines the location of the query range (chromosome positions) in a
1339    * different reference assembly.
1340    * <p>
1341    * If the range is just a subregion of one for which we already have a mapping
1342    * (for example, an exon sub-region of a gene), then the mapping is just
1343    * computed arithmetically.
1344    * <p>
1345    * Otherwise, calls the Ensembl REST service that maps from one assembly
1346    * reference's coordinates to another's
1347    *
1348    * @param queryRange
1349    *          start-end chromosomal range in 'fromRef' coordinates
1350    * @param chromosome
1351    * @param species
1352    * @param fromRef
1353    *          assembly reference for the query coordinates
1354    * @param toRef
1355    *          assembly reference we wish to translate to
1356    * @return the start-end range in 'toRef' coordinates
1357    */
1358   protected int[] mapReferenceRange(int[] queryRange, String chromosome,
1359           String species, String fromRef, String toRef)
1360   {
1361     /*
1362      * first try shorcut of computing the mapping as a subregion of one
1363      * we already have (e.g. for an exon, if we have the gene mapping)
1364      */
1365     int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
1366             species, fromRef, toRef);
1367     if (mappedRange != null)
1368     {
1369       return mappedRange;
1370     }
1371
1372     /*
1373      * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
1374      */
1375     EnsemblMap mapper = new EnsemblMap();
1376     int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
1377             toRef, queryRange);
1378
1379     if (mapping == null)
1380     {
1381       // mapping service failure
1382       return null;
1383     }
1384
1385     /*
1386      * save mapping for possible future re-use
1387      */
1388     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1389     if (!assemblyMappings.containsKey(key))
1390     {
1391       assemblyMappings.put(key, new HashMap<int[], int[]>());
1392     }
1393
1394     assemblyMappings.get(key).put(queryRange, mapping);
1395
1396     return mapping;
1397   }
1398
1399   /**
1400    * If we already have a 1:1 contiguous mapping which subsumes the given query
1401    * range, this method just calculates and returns the subset of that mapping,
1402    * else it returns null. In practical terms, if a gene has a contiguous
1403    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
1404    * subsidiary exons occupy unchanged relative positions, and just compute
1405    * these as offsets, rather than do another lookup of the mapping.
1406    * <p>
1407    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
1408    * simply remove this method or let it always return null.
1409    * <p>
1410    * Warning: many rapid calls to the /map service map result in a 429 overload
1411    * error response
1412    *
1413    * @param queryRange
1414    * @param chromosome
1415    * @param species
1416    * @param fromRef
1417    * @param toRef
1418    * @return
1419    */
1420   protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
1421           String species, String fromRef, String toRef)
1422   {
1423     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1424     if (assemblyMappings.containsKey(key))
1425     {
1426       Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
1427       for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
1428       {
1429         int[] fromRange = mappedRange.getKey();
1430         int[] toRange = mappedRange.getValue();
1431         if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
1432         {
1433           /*
1434            * mapping is 1:1 in length, so we trust it to have no discontinuities
1435            */
1436           if (MappingUtils.rangeContains(fromRange, queryRange))
1437           {
1438             /*
1439              * fromRange subsumes our query range
1440              */
1441             int offset = queryRange[0] - fromRange[0];
1442             int mappedRangeFrom = toRange[0] + offset;
1443             int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
1444             return new int[] { mappedRangeFrom, mappedRangeTo };
1445           }
1446         }
1447       }
1448     }
1449     return null;
1450   }
1451
1452   /**
1453    * Transfers the sequence feature to the target sequence, locating its start
1454    * and end range based on the mapping. Features which do not overlap the
1455    * target sequence are ignored.
1456    *
1457    * @param sf
1458    * @param targetSequence
1459    * @param mapping
1460    *          mapping from the feature's coordinates to the target sequence
1461    */
1462   protected void transferFeature(SequenceFeature sf,
1463           SequenceI targetSequence, MapList mapping)
1464   {
1465     int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
1466
1467     if (mappedRange != null)
1468     {
1469       String group = sf.getFeatureGroup();
1470       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
1471       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
1472       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
1473               group, sf.getScore());
1474       targetSequence.addSequenceFeature(copy);
1475     }
1476   }
1477
1478   /**
1479    * Formats a ranges map lookup key
1480    *
1481    * @param chromosome
1482    * @param species
1483    * @param fromRef
1484    * @param toRef
1485    * @return
1486    */
1487   protected static String makeRangesKey(String chromosome, String species,
1488           String fromRef, String toRef)
1489   {
1490     return species + EXCL + chromosome + EXCL + fromRef + EXCL
1491             + toRef;
1492   }
1493 }