src/jalview/io/vcf/VCFLoader.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.vcf;
  22
  23 import java.io.File;
  24 import java.io.IOException;
  25 import java.util.ArrayList;
  26 import java.util.HashMap;
  27 import java.util.HashSet;
  28 import java.util.Iterator;
  29 import java.util.List;
  30 import java.util.Map;
  31 import java.util.Map.Entry;
  32 import java.util.Set;
  33 import java.util.regex.Pattern;
  34 import java.util.regex.PatternSyntaxException;
  35
  36 import htsjdk.samtools.SAMException;
  37 import htsjdk.samtools.SAMSequenceDictionary;
  38 import htsjdk.samtools.SAMSequenceRecord;
  39 import htsjdk.samtools.util.CloseableIterator;
  40 import htsjdk.tribble.TribbleException;
  41 import htsjdk.variant.variantcontext.Allele;
  42 import htsjdk.variant.variantcontext.VariantContext;
  43 import htsjdk.variant.vcf.VCFConstants;
  44 import htsjdk.variant.vcf.VCFHeader;
  45 import htsjdk.variant.vcf.VCFHeaderLine;
  46 import htsjdk.variant.vcf.VCFHeaderLineCount;
  47 import htsjdk.variant.vcf.VCFHeaderLineType;
  48 import htsjdk.variant.vcf.VCFInfoHeaderLine;
  49 import jalview.analysis.Dna;
  50 import jalview.api.AlignViewControllerGuiI;
  51 import jalview.bin.Cache;
  52 import jalview.datamodel.DBRefEntry;
  53 import jalview.datamodel.GeneLociI;
  54 import jalview.datamodel.Mapping;
  55 import jalview.datamodel.SequenceFeature;
  56 import jalview.datamodel.SequenceI;
  57 import jalview.datamodel.features.FeatureAttributeType;
  58 import jalview.datamodel.features.FeatureSource;
  59 import jalview.datamodel.features.FeatureSources;
  60 import jalview.ext.ensembl.EnsemblMap;
  61 import jalview.ext.htsjdk.HtsContigDb;
  62 import jalview.ext.htsjdk.VCFReader;
  63 import jalview.io.gff.Gff3Helper;
  64 import jalview.io.gff.SequenceOntologyI;
  65 import jalview.util.MapList;
  66 import jalview.util.MappingUtils;
  67 import jalview.util.MessageManager;
  68 import jalview.util.StringUtils;
  69
  70 /**
  71  * A class to read VCF data (using the htsjdk) and add variants as sequence
  72  * features on dna and any related protein product sequences
  73  *
  74  * @author gmcarstairs
  75  */
  76 public class VCFLoader
  77 {
  78   private static final String VCF_ENCODABLE = ":;=%,";
  79
  80   /*
  81    * Jalview feature attributes for VCF fixed column data
  82    */
  83   private static final String VCF_POS = "POS";
  84
  85   private static final String VCF_ID = "ID";
  86
  87   private static final String VCF_QUAL = "QUAL";
  88
  89   private static final String VCF_FILTER = "FILTER";
  90
  91   private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.'
  92
  93   private static final String DEFAULT_SPECIES = "homo_sapiens";
  94
  95   /**
  96    * A class to model the mapping from sequence to VCF coordinates. Cases include
  97    * <ul>
  98    * <li>a direct 1:1 mapping where the sequence is one of the VCF contigs</li>
  99    * <li>a mapping of sequence to chromosomal coordinates, where sequence and VCF
 100    * use the same reference assembly</li>
 101    * <li>a modified mapping of sequence to chromosomal coordinates, where sequence
 102    * and VCF use different reference assembles</li>
 103    * </ul>
 104    */
 105   class VCFMap
 106   {
 107     final String chromosome;
 108
 109     final MapList map;
 110
 111     VCFMap(String chr, MapList m)
 112     {
 113       chromosome = chr;
 114       map = m;
 115     }
 116
 117     @Override
 118     public String toString()
 119     {
 120       return chromosome + ":" + map.toString();
 121     }
 122   }
 123
 124   /*
 125    * Lookup keys, and default values, for Preference entries that describe
 126    * patterns for VCF and VEP fields to capture
 127    */
 128   private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
 129
 130   private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
 131
 132   private static final String DEFAULT_VCF_FIELDS = ".*";
 133
 134   private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
 135
 136   /*
 137    * Lookup keys, and default values, for Preference entries that give
 138    * mappings from tokens in the 'reference' header to species or assembly
 139    */
 140   private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY";
 141
 142   private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh37,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38";
 143
 144   private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human
 145
 146   private static final String DEFAULT_REFERENCE = "grch37"; // fallback default is human GRCh37
 147
 148   /*
 149    * keys to fields of VEP CSQ consequence data
 150    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 151    */
 152   private static final String CSQ_CONSEQUENCE_KEY = "Consequence";
 153   private static final String CSQ_ALLELE_KEY = "Allele";
 154   private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
 155   private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id
 156
 157   /*
 158    * default VCF INFO key for VEP consequence data
 159    * NB this can be overridden running VEP with --vcf_info_field
 160    * - we don't handle this case (require identifier to be CSQ)
 161    */
 162   private static final String CSQ_FIELD = "CSQ";
 163
 164   /*
 165    * separator for fields in consequence data is '|'
 166    */
 167   private static final String PIPE_REGEX = "\\|";
 168
 169   /*
 170    * delimiter that separates multiple consequence data blocks
 171    */
 172   private static final String COMMA = ",";
 173
 174   /*
 175    * the feature group assigned to a VCF variant in Jalview
 176    */
 177   private static final String FEATURE_GROUP_VCF = "VCF";
 178
 179   /*
 180    * internal delimiter used to build keys for assemblyMappings
 181    *
 182    */
 183   private static final String EXCL = "!";
 184
 185   /*
 186    * the VCF file we are processing
 187    */
 188   protected String vcfFilePath;
 189
 190   /*
 191    * mappings between VCF and sequence reference assembly regions, as
   * key = "species!chromosome!fromAssembly!toAssembly"
 193    * value = Map{fromRange, toRange}
 194    */
 195   private Map<String, Map<int[], int[]>> assemblyMappings;
 196
 197   private VCFReader reader;
 198
 199   /*
 200    * holds details of the VCF header lines (metadata)
 201    */
 202   private VCFHeader header;
 203
 204   /*
 205    * species (as a valid Ensembl term) the VCF is for
 206    */
 207   private String vcfSpecies;
 208
 209   /*
 210    * genome assembly version (as a valid Ensembl identifier) the VCF is for
 211    */
 212   private String vcfAssembly;
 213
 214   /*
 215    * a Dictionary of contigs (if present) referenced in the VCF file
 216    */
 217   private SAMSequenceDictionary dictionary;
 218
 219   /*
   * the positions (0...) of selected fields within each block of
   * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
 222    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 223    */
 224   private int csqConsequenceFieldIndex = -1;
 225   private int csqAlleleFieldIndex = -1;
 226   private int csqAlleleNumberFieldIndex = -1;
 227   private int csqFeatureFieldIndex = -1;
 228
 229   // todo the same fields for SnpEff ANN data if wanted
 230   // see http://snpeff.sourceforge.net/SnpEff_manual.html#input
 231
 232   /*
 233    * a unique identifier under which to save metadata about feature
 234    * attributes (selected INFO field data)
 235    */
 236   private String sourceId;
 237
 238   /*
 239    * The INFO IDs of data that is both present in the VCF file, and
 240    * also matched by any filters for data of interest
 241    */
 242   List<String> vcfFieldsOfInterest;
 243
 244   /*
 245    * The field offsets and identifiers for VEP (CSQ) data that is both present
 246    * in the VCF file, and also matched by any filters for data of interest
 247    * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
 248    */
 249   Map<Integer, String> vepFieldsOfInterest;
 250
 251   /*
 252    * key:value for which rejected data has been seen
 253    * (the error is logged only once for each combination)
 254    */
 255   private Set<String> badData;
 256
 257   /**
 258    * Constructor given a VCF file
 259    *
 260    * @param alignment
 261    */
 262   public VCFLoader(String vcfFile)
 263   {
 264     try
 265     {
 266       initialise(vcfFile);
 267     } catch (IOException e)
 268     {
 269       System.err.println("Error opening VCF file: " + e.getMessage());
 270     }
 271
 272     // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
 273     assemblyMappings = new HashMap<>();
 274   }
 275
 276   /**
 277    * Starts a new thread to query and load VCF variant data on to the given
 278    * sequences
 279    * <p>
 280    * This method is not thread safe - concurrent threads should use separate
 281    * instances of this class.
 282    *
 283    * @param seqs
 284    * @param gui
 285    */
 286   public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui)
 287   {
 288     if (gui != null)
 289     {
 290       gui.setStatus(MessageManager.getString("label.searching_vcf"));
 291     }
 292
 293     new Thread()
 294     {
 295       @Override
 296       public void run()
 297       {
 298         VCFLoader.this.doLoad(seqs, gui);
 299       }
 300     }.start();
 301   }
 302
 303   /**
 304    * Reads the specified contig sequence and adds its VCF variants to it
 305    *
 306    * @param contig
 307    *          the id of a single sequence (contig) to load
 308    * @return
 309    */
 310   public SequenceI loadVCFContig(String contig)
 311   {
 312     VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 313     if (headerLine == null)
 314     {
 315       Cache.log.error("VCF reference header not found");
 316       return null;
 317     }
 318     String ref = headerLine.getValue();
 319     if (ref.startsWith("file://"))
 320     {
 321       ref = ref.substring(7);
 322     }
 323     setSpeciesAndAssembly(ref);
 324
 325     SequenceI seq = null;
 326     File dbFile = new File(ref);
 327
 328     if (dbFile.exists())
 329     {
 330       HtsContigDb db = new HtsContigDb("", dbFile);
 331       seq = db.getSequenceProxy(contig);
 332       loadSequenceVCF(seq);
 333       db.close();
 334     }
 335     else
 336     {
 337       Cache.log.error("VCF reference not found: " + ref);
 338     }
 339
 340     return seq;
 341   }
 342
 343   /**
 344    * Loads VCF on to one or more sequences
 345    *
 346    * @param seqs
 347    * @param gui
 348    *          optional callback handler for messages
 349    */
 350   protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui)
 351   {
 352     try
 353     {
 354       VCFHeaderLine ref = header
 355               .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 356       String reference = ref == null ? null : ref.getValue();
 357
 358       setSpeciesAndAssembly(reference);
 359
 360       int varCount = 0;
 361       int seqCount = 0;
 362
 363       /*
 364        * query for VCF overlapping each sequence in turn
 365        */
 366       for (SequenceI seq : seqs)
 367       {
 368         int added = loadSequenceVCF(seq);
 369         if (added > 0)
 370         {
 371           seqCount++;
 372           varCount += added;
 373           transferAddedFeatures(seq);
 374         }
 375       }
 376       if (gui != null)
 377       {
 378         String msg = MessageManager.formatMessage("label.added_vcf",
 379                 varCount, seqCount);
 380         gui.setStatus(msg);
 381         if (gui.getFeatureSettingsUI() != null)
 382         {
 383           gui.getFeatureSettingsUI().discoverAllFeatureData();
 384         }
 385       }
 386     } catch (Throwable e)
 387     {
 388       System.err.println("Error processing VCF: " + e.getMessage());
 389       e.printStackTrace();
 390       if (gui != null)
 391       {
 392         gui.setStatus("Error occurred - see console for details");
 393       }
 394     } finally
 395     {
 396       if (reader != null)
 397       {
 398         try
 399         {
 400           reader.close();
 401         } catch (IOException e)
 402         {
 403           // ignore
 404         }
 405       }
 406       header = null;
 407       dictionary = null;
 408     }
 409   }
 410
 411   /**
 412    * Attempts to determine and save the species and genome assembly version to
 413    * which the VCF data applies. This may be done by parsing the {@code reference}
 414    * header line, configured in a property file, or (potentially) confirmed
 415    * interactively by the user.
 416    * <p>
 417    * The saved values should be identifiers valid for Ensembl's REST service
 418    * {@code map} endpoint, so they can be used (if necessary) to retrieve the
 419    * mapping between VCF coordinates and sequence coordinates.
 420    *
 421    * @param reference
 422    * @see https://rest.ensembl.org/documentation/info/assembly_map
 423    * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
 424    * @see https://rest.ensembl.org/info/species?content-type=text/xml
 425    */
 426   protected void setSpeciesAndAssembly(String reference)
 427   {
 428     if (reference == null)
 429     {
 430       Cache.log.error("No VCF ##reference found, defaulting to "
 431               + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES);
 432       reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified
 433     }
 434     reference = reference.toLowerCase();
 435
 436     /*
 437      * for a non-human species, or other assembly identifier,
 438      * specify as a Jalview property file entry e.g.
 439      * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
 440      * VCF_SPECIES = c_elegans=celegans
 441      * to map a token in the reference header to a value
 442      */
 443     String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
 444     for (String token : prop.split(","))
 445     {
 446       String[] tokens = token.split("=");
 447       if (tokens.length == 2)
 448       {
 449         if (reference.contains(tokens[0].trim().toLowerCase()))
 450         {
 451           vcfAssembly = tokens[1].trim();
 452           break;
 453         }
 454       }
 455     }
 456
 457     vcfSpecies = DEFAULT_SPECIES;
 458     prop = Cache.getProperty(VCF_SPECIES);
 459     if (prop != null)
 460     {
 461       for (String token : prop.split(","))
 462       {
 463         String[] tokens = token.split("=");
 464         if (tokens.length == 2)
 465         {
 466           if (reference.contains(tokens[0].trim().toLowerCase()))
 467           {
 468             vcfSpecies = tokens[1].trim();
 469             break;
 470           }
 471         }
 472       }
 473     }
 474   }
 475
 476   /**
 477    * Opens the VCF file and parses header data
 478    *
 479    * @param filePath
 480    * @throws IOException
 481    */
 482   private void initialise(String filePath) throws IOException
 483   {
 484     vcfFilePath = filePath;
 485
 486     reader = new VCFReader(filePath);
 487
 488     header = reader.getFileHeader();
 489
 490     try
 491     {
 492       dictionary = header.getSequenceDictionary();
 493     } catch (SAMException e)
 494     {
 495       // ignore - thrown if any contig line lacks length info
 496     }
 497
 498     sourceId = filePath;
 499
 500     saveMetadata(sourceId);
 501
 502     /*
 503      * get offset of CSQ ALLELE_NUM and Feature if declared
 504      */
 505     parseCsqHeader();
 506   }
 507
 508   /**
 509    * Reads metadata (such as INFO field descriptions and datatypes) and saves
 510    * them for future reference
 511    *
 512    * @param theSourceId
 513    */
 514   void saveMetadata(String theSourceId)
 515   {
 516     List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
 517             DEFAULT_VCF_FIELDS);
 518     vcfFieldsOfInterest = new ArrayList<>();
 519
 520     FeatureSource metadata = new FeatureSource(theSourceId);
 521
 522     for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
 523     {
 524       String attributeId = info.getID();
 525       String desc = info.getDescription();
 526       VCFHeaderLineType type = info.getType();
 527       FeatureAttributeType attType = null;
 528       switch (type)
 529       {
 530       case Character:
 531         attType = FeatureAttributeType.Character;
 532         break;
 533       case Flag:
 534         attType = FeatureAttributeType.Flag;
 535         break;
 536       case Float:
 537         attType = FeatureAttributeType.Float;
 538         break;
 539       case Integer:
 540         attType = FeatureAttributeType.Integer;
 541         break;
 542       case String:
 543         attType = FeatureAttributeType.String;
 544         break;
 545       }
 546       metadata.setAttributeName(attributeId, desc);
 547       metadata.setAttributeType(attributeId, attType);
 548
 549       if (isFieldWanted(attributeId, vcfFieldPatterns))
 550       {
 551         vcfFieldsOfInterest.add(attributeId);
 552       }
 553     }
 554
 555     FeatureSources.getInstance().addSource(theSourceId, metadata);
 556   }
 557
 558   /**
 559    * Answers true if the field id is matched by any of the filter patterns, else
 560    * false. Matching is against regular expression patterns, and is not
 561    * case-sensitive.
 562    *
 563    * @param id
 564    * @param filters
 565    * @return
 566    */
 567   private boolean isFieldWanted(String id, List<Pattern> filters)
 568   {
 569     for (Pattern p : filters)
 570     {
 571       if (p.matcher(id.toUpperCase()).matches())
 572       {
 573         return true;
 574       }
 575     }
 576     return false;
 577   }
 578
 579   /**
 580    * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
 581    * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
 582    * required for processing.
 583    * <p>
 584    * CSQ fields are declared in the CSQ INFO Description e.g.
 585    * <p>
 586    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
 587    */
 588   protected void parseCsqHeader()
 589   {
 590     List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
 591             DEFAULT_VEP_FIELDS);
 592     vepFieldsOfInterest = new HashMap<>();
 593
 594     VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
 595     if (csqInfo == null)
 596     {
 597       return;
 598     }
 599
 600     /*
 601      * parse out the pipe-separated list of CSQ fields; we assume here that
 602      * these form the last part of the description, and contain no spaces
 603      */
 604     String desc = csqInfo.getDescription();
 605     int spacePos = desc.lastIndexOf(" ");
 606     desc = desc.substring(spacePos + 1);
 607
 608     if (desc != null)
 609     {
 610       String[] format = desc.split(PIPE_REGEX);
 611       int index = 0;
 612       for (String field : format)
 613       {
 614         if (CSQ_CONSEQUENCE_KEY.equals(field))
 615         {
 616           csqConsequenceFieldIndex = index;
 617         }
 618         if (CSQ_ALLELE_NUM_KEY.equals(field))
 619         {
 620           csqAlleleNumberFieldIndex = index;
 621         }
 622         if (CSQ_ALLELE_KEY.equals(field))
 623         {
 624           csqAlleleFieldIndex = index;
 625         }
 626         if (CSQ_FEATURE_KEY.equals(field))
 627         {
 628           csqFeatureFieldIndex = index;
 629         }
 630
 631         if (isFieldWanted(field, vepFieldFilters))
 632         {
 633           vepFieldsOfInterest.put(index, field);
 634         }
 635
 636         index++;
 637       }
 638     }
 639   }
 640
 641   /**
 642    * Reads the Preference value for the given key, with default specified if no
 643    * preference set. The value is interpreted as a comma-separated list of
 644    * regular expressions, and converted into a list of compiled patterns ready
 645    * for matching. Patterns are forced to upper-case for non-case-sensitive
 646    * matching.
 647    * <p>
 648    * This supports user-defined filters for fields of interest to capture while
 649    * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
 650    * fields with an ID of AF, or starting with AC, would be matched.
 651    *
 652    * @param key
 653    * @param def
 654    * @return
 655    */
 656   private List<Pattern> getFieldMatchers(String key, String def)
 657   {
 658     String pref = Cache.getDefault(key, def);
 659     List<Pattern> patterns = new ArrayList<>();
 660     String[] tokens = pref.split(",");
 661     for (String token : tokens)
 662     {
 663       try
 664       {
 665       patterns.add(Pattern.compile(token.toUpperCase()));
 666       } catch (PatternSyntaxException e)
 667       {
 668         System.err.println("Invalid pattern ignored: " + token);
 669       }
 670     }
 671     return patterns;
 672   }
 673
 674   /**
 675    * Transfers VCF features to sequences to which this sequence has a mapping.
 676    *
 677    * @param seq
 678    */
 679   protected void transferAddedFeatures(SequenceI seq)
 680   {
 681     List<DBRefEntry> dbrefs = seq.getDBRefs();
 682     if (dbrefs == null)
 683     {
 684       return;
 685     }
 686     for (DBRefEntry dbref : dbrefs)
 687     {
 688       Mapping mapping = dbref.getMap();
 689       if (mapping == null || mapping.getTo() == null)
 690       {
 691         continue;
 692       }
 693
 694       SequenceI mapTo = mapping.getTo();
 695       MapList map = mapping.getMap();
 696       if (map.getFromRatio() == 3)
 697       {
 698         /*
 699          * dna-to-peptide product mapping
 700          */
 701         // JAL-3187 render on the fly instead
 702         // AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
 703       }
 704       else
 705       {
 706         /*
 707          * nucleotide-to-nucleotide mapping e.g. transcript to CDS
 708          */
 709         List<SequenceFeature> features = seq.getFeatures()
 710                 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
 711         for (SequenceFeature sf : features)
 712         {
 713           if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
 714           {
 715             transferFeature(sf, mapTo, map);
 716           }
 717         }
 718       }
 719     }
 720   }
 721
 722   /**
 723    * Tries to add overlapping variants read from a VCF file to the given sequence,
 724    * and returns the number of variant features added
 725    *
 726    * @param seq
 727    * @return
 728    */
 729   protected int loadSequenceVCF(SequenceI seq)
 730   {
 731     VCFMap vcfMap = getVcfMap(seq);
 732     if (vcfMap == null)
 733     {
 734       return 0;
 735     }
 736
 737     /*
 738      * work with the dataset sequence here
 739      */
 740     SequenceI dss = seq.getDatasetSequence();
 741     if (dss == null)
 742     {
 743       dss = seq;
 744     }
 745     return addVcfVariants(dss, vcfMap);
 746   }
 747
 748   /**
 749    * Answers a map from sequence coordinates to VCF chromosome ranges
 750    *
 751    * @param seq
 752    * @return
 753    */
 754   private VCFMap getVcfMap(SequenceI seq)
 755   {
 756     /*
 757      * simplest case: sequence has id and length matching a VCF contig
 758      */
 759     VCFMap vcfMap = null;
 760     if (dictionary != null)
 761     {
 762       vcfMap = getContigMap(seq);
 763     }
 764     if (vcfMap != null)
 765     {
 766       return vcfMap;
 767     }
 768
 769     /*
 770      * otherwise, map to VCF from chromosomal coordinates
 771      * of the sequence (if known)
 772      */
 773     GeneLociI seqCoords = seq.getGeneLoci();
 774     if (seqCoords == null)
 775     {
 776       Cache.log.warn(String.format(
 777               "Can't query VCF for %s as chromosome coordinates not known",
 778               seq.getName()));
 779       return null;
 780     }
 781
 782     String species = seqCoords.getSpeciesId();
 783     String chromosome = seqCoords.getChromosomeId();
 784     String seqRef = seqCoords.getAssemblyId();
 785     MapList map = seqCoords.getMapping();
 786
 787     // note this requires the configured species to match that
 788     // returned with the Ensembl sequence; todo: support aliases?
 789     if (!vcfSpecies.equalsIgnoreCase(species))
 790     {
 791       Cache.log.warn("No VCF loaded to " + seq.getName()
 792               + " as species not matched");
 793       return null;
 794     }
 795
 796     if (seqRef.equalsIgnoreCase(vcfAssembly))
 797     {
 798       return new VCFMap(chromosome, map);
 799     }
 800
 801     /*
 802      * VCF data has a different reference assembly to the sequence:
 803      * query Ensembl to map chromosomal coordinates from sequence to VCF
 804      */
 805     List<int[]> toVcfRanges = new ArrayList<>();
 806     List<int[]> fromSequenceRanges = new ArrayList<>();
 807
 808     for (int[] range : map.getToRanges())
 809     {
 810       int[] fromRange = map.locateInFrom(range[0], range[1]);
 811       if (fromRange == null)
 812       {
 813         // corrupted map?!?
 814         continue;
 815       }
 816
 817       int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
 818               vcfAssembly);
 819       if (newRange == null)
 820       {
 821         Cache.log.error(
 822                 String.format("Failed to map %s:%s:%s:%d:%d to %s", species,
 823                         chromosome, seqRef, range[0], range[1],
 824                         vcfAssembly));
 825         continue;
 826       }
 827       else
 828       {
 829         toVcfRanges.add(newRange);
 830         fromSequenceRanges.add(fromRange);
 831       }
 832     }
 833
 834     return new VCFMap(chromosome,
 835             new MapList(fromSequenceRanges, toVcfRanges, 1, 1));
 836   }
 837
 838   /**
 839    * If the sequence id matches a contig declared in the VCF file, and the
 840    * sequence length matches the contig length, then returns a 1:1 map of the
 841    * sequence to the contig, else returns null
 842    *
 843    * @param seq
 844    * @return
 845    */
 846   private VCFMap getContigMap(SequenceI seq)
 847   {
 848     String id = seq.getName();
 849     SAMSequenceRecord contig = dictionary.getSequence(id);
 850     if (contig != null)
 851     {
 852       int len = seq.getLength();
 853       if (len == contig.getSequenceLength())
 854       {
 855         MapList map = new MapList(new int[] { 1, len },
 856                 new int[]
 857                 { 1, len }, 1, 1);
 858         return new VCFMap(id, map);
 859       }
 860     }
 861     return null;
 862   }
 863
 864   /**
 865    * Queries the VCF reader for any variants that overlap the mapped chromosome
 866    * ranges of the sequence, and adds as variant features. Returns the number of
 867    * overlapping variants found.
 868    *
 869    * @param seq
 870    * @param map
 871    *          mapping from sequence to VCF coordinates
 872    * @return
 873    */
 874   protected int addVcfVariants(SequenceI seq, VCFMap map)
 875   {
 876     boolean forwardStrand = map.map.isToForwardStrand();
 877
 878     /*
 879      * query the VCF for overlaps of each contiguous chromosomal region
 880      */
 881     int count = 0;
 882
 883     for (int[] range : map.map.getToRanges())
 884     {
 885       int vcfStart = Math.min(range[0], range[1]);
 886       int vcfEnd = Math.max(range[0], range[1]);
 887       try
 888       {
 889         CloseableIterator<VariantContext> variants = reader
 890                 .query(map.chromosome, vcfStart, vcfEnd);
 891         while (variants.hasNext())
 892         {
 893           VariantContext variant = variants.next();
 894
 895           int[] featureRange = map.map.locateInFrom(variant.getStart(),
 896                   variant.getEnd());
 897
 898           /*
 899            * only take features whose range is fully mappable to sequence positions
 900            */
 901           if (featureRange != null)
 902           {
 903             int featureStart = Math.min(featureRange[0], featureRange[1]);
 904             int featureEnd = Math.max(featureRange[0], featureRange[1]);
 905             if (featureEnd - featureStart == variant.getEnd()
 906                     - variant.getStart())
 907             {
 908               count += addAlleleFeatures(seq, variant, featureStart,
 909                       featureEnd, forwardStrand);
 910             }
 911           }
 912         }
 913         variants.close();
 914       } catch (TribbleException e)
 915       {
 916         /*
 917          * RuntimeException throwable by htsjdk
 918          */
 919         String msg = String.format("Error reading VCF for %s:%d-%d: %s ",
 920                 map.chromosome, vcfStart, vcfEnd,e.getLocalizedMessage());
 921         Cache.log.error(msg);
 922       }
 923     }
 924
 925     return count;
 926   }
 927
 928   /**
 929    * A convenience method to get an attribute value for an alternate allele
 930    *
 931    * @param variant
 932    * @param attributeName
 933    * @param alleleIndex
 934    * @return
 935    */
 936   protected String getAttributeValue(VariantContext variant,
 937           String attributeName, int alleleIndex)
 938   {
 939     Object att = variant.getAttribute(attributeName);
 940
 941     if (att instanceof String)
 942     {
 943       return (String) att;
 944     }
 945     else if (att instanceof ArrayList)
 946     {
 947       return ((List<String>) att).get(alleleIndex);
 948     }
 949
 950     return null;
 951   }
 952
 953   /**
 954    * Adds one variant feature for each allele in the VCF variant record, and
 955    * returns the number of features added.
 956    *
 957    * @param seq
 958    * @param variant
 959    * @param featureStart
 960    * @param featureEnd
 961    * @param forwardStrand
 962    * @return
 963    */
 964   protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
 965           int featureStart, int featureEnd, boolean forwardStrand)
 966   {
 967     int added = 0;
 968
 969     /*
 970      * Javadoc says getAlternateAlleles() imposes no order on the list returned
 971      * so we proceed defensively to get them in strict order
 972      */
 973     int altAlleleCount = variant.getAlternateAlleles().size();
 974     for (int i = 0; i < altAlleleCount; i++)
 975     {
 976       added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
 977               forwardStrand);
 978     }
 979     return added;
 980   }
 981
 982   /**
 983    * Inspects one allele and attempts to add a variant feature for it to the
 984    * sequence. The additional data associated with this allele is extracted to
 985    * store in the feature's key-value map. Answers the number of features added (0
 986    * or 1).
 987    *
 988    * @param seq
 989    * @param variant
 990    * @param altAlleleIndex
 991    *          (0, 1..)
 992    * @param featureStart
 993    * @param featureEnd
 994    * @param forwardStrand
 995    * @return
 996    */
 997   protected int addAlleleFeature(SequenceI seq, VariantContext variant,
 998           int altAlleleIndex, int featureStart, int featureEnd,
 999           boolean forwardStrand)
1000   {
1001     String reference = variant.getReference().getBaseString();
1002     Allele alt = variant.getAlternateAllele(altAlleleIndex);
1003     String allele = alt.getBaseString();
1004
1005     /*
1006      * insertion after a genomic base, if on reverse strand, has to be
1007      * converted to insertion of complement after the preceding position
1008      */
1009     int referenceLength = reference.length();
1010     if (!forwardStrand && allele.length() > referenceLength
1011             && allele.startsWith(reference))
1012     {
1013       featureStart -= referenceLength;
1014       featureEnd = featureStart;
1015       char insertAfter = seq.getCharAt(featureStart - seq.getStart());
1016       reference = Dna.reverseComplement(String.valueOf(insertAfter));
1017       allele = allele.substring(referenceLength) + reference;
1018     }
1019
1020     /*
1021      * build the ref,alt allele description e.g. "G,A", using the base
1022      * complement if the sequence is on the reverse strand
1023      */
1024     StringBuilder sb = new StringBuilder();
1025     sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
1026     sb.append(COMMA);
1027     sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
1028     String alleles = sb.toString(); // e.g. G,A
1029
1030     /*
1031      * pick out the consequence data (if any) that is for the current allele
1032      * and feature (transcript) that matches the current sequence
1033      */
1034     String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD,
1035             altAlleleIndex, csqAlleleFieldIndex,
1036             csqAlleleNumberFieldIndex, seq.getName().toLowerCase(),
1037             csqFeatureFieldIndex);
1038
1039     /*
1040      * pick out the ontology term for the consequence type
1041      */
1042     String type = SequenceOntologyI.SEQUENCE_VARIANT;
1043     if (consequence != null)
1044     {
1045       type = getOntologyTerm(consequence);
1046     }
1047
1048     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
1049             featureEnd, FEATURE_GROUP_VCF);
1050     sf.setSource(sourceId);
1051
1052     /*
1053      * save the derived alleles as a named attribute; this will be
1054      * needed when Jalview computes derived peptide variants
1055      */
1056     addFeatureAttribute(sf, Gff3Helper.ALLELES, alleles);
1057
1058     /*
1059      * add selected VCF fixed column data as feature attributes
1060      */
1061     addFeatureAttribute(sf, VCF_POS, String.valueOf(variant.getStart()));
1062     addFeatureAttribute(sf, VCF_ID, variant.getID());
1063     addFeatureAttribute(sf, VCF_QUAL,
1064             String.valueOf(variant.getPhredScaledQual()));
1065     addFeatureAttribute(sf, VCF_FILTER, getFilter(variant));
1066
1067     addAlleleProperties(variant, sf, altAlleleIndex, consequence);
1068
1069     seq.addSequenceFeature(sf);
1070
1071     return 1;
1072   }
1073
1074   /**
1075    * Answers the VCF FILTER value for the variant - or an approximation to it.
1076    * This field is either PASS, or a semi-colon separated list of filters not
1077    * passed. htsjdk saves filters as a HashSet, so the order when reassembled into
1078    * a list may be different.
1079    *
1080    * @param variant
1081    * @return
1082    */
1083   String getFilter(VariantContext variant)
1084   {
1085     Set<String> filters = variant.getFilters();
1086     if (filters.isEmpty())
1087     {
1088       return NO_VALUE;
1089     }
1090     Iterator<String> iterator = filters.iterator();
1091     String first = iterator.next();
1092     if (filters.size() == 1)
1093     {
1094       return first;
1095     }
1096
1097     StringBuilder sb = new StringBuilder(first);
1098     while (iterator.hasNext())
1099     {
1100       sb.append(";").append(iterator.next());
1101     }
1102
1103     return sb.toString();
1104   }
1105
1106   /**
1107    * Adds one feature attribute unless the value is null, empty or '.'
1108    *
1109    * @param sf
1110    * @param key
1111    * @param value
1112    */
1113   void addFeatureAttribute(SequenceFeature sf, String key, String value)
1114   {
1115     if (value != null && !value.isEmpty() && !NO_VALUE.equals(value))
1116     {
1117       sf.setValue(key, value);
1118     }
1119   }
1120
1121   /**
1122    * Determines the Sequence Ontology term to use for the variant feature type in
1123    * Jalview. The default is 'sequence_variant', but a more specific term is used
1124    * if:
1125    * <ul>
1126    * <li>VEP (or SnpEff) Consequence annotation is included in the VCF</li>
1127    * <li>sequence id can be matched to VEP Feature (or SnpEff Feature_ID)</li>
1128    * </ul>
1129    *
1130    * @param consequence
1131    * @return
1132    * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
1133    */
1134   String getOntologyTerm(String consequence)
1135   {
1136     String type = SequenceOntologyI.SEQUENCE_VARIANT;
1137
1138     /*
1139      * could we associate Consequence data with this allele and feature (transcript)?
1140      * if so, prefer the consequence term from that data
1141      */
1142     if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
1143     {
1144       /*
1145        * no Consequence data so we can't refine the ontology term
1146        */
1147       return type;
1148     }
1149
1150     if (consequence != null)
1151     {
1152       String[] csqFields = consequence.split(PIPE_REGEX);
1153       if (csqFields.length > csqConsequenceFieldIndex)
1154       {
1155         type = csqFields[csqConsequenceFieldIndex];
1156       }
1157     }
1158     else
1159     {
1160       // todo the same for SnpEff consequence data matching if wanted
1161     }
1162
1163     /*
1164      * if of the form (e.g.) missense_variant&splice_region_variant,
1165      * just take the first ('most severe') consequence
1166      */
1167     if (type != null)
1168     {
1169       int pos = type.indexOf('&');
1170       if (pos > 0)
1171       {
1172         type = type.substring(0, pos);
1173       }
1174     }
1175     return type;
1176   }
1177
1178   /**
1179    * Returns matched consequence data if it can be found, else null.
1180    * <ul>
1181    * <li>inspects the VCF data for key 'vcfInfoId'</li>
1182    * <li>splits this on comma (to distinct consequences)</li>
1183    * <li>returns the first consequence (if any) where</li>
1184    * <ul>
1185    * <li>the allele matches the altAlleleIndex'th allele of variant</li>
1186    * <li>the feature matches the sequence name (e.g. transcript id)</li>
1187    * </ul>
1188    * </ul>
1189    * If matched, the consequence is returned (as pipe-delimited fields).
1190    *
1191    * @param variant
1192    * @param vcfInfoId
1193    * @param altAlleleIndex
1194    * @param alleleFieldIndex
1195    * @param alleleNumberFieldIndex
1196    * @param seqName
1197    * @param featureFieldIndex
1198    * @return
1199    */
1200   private String getConsequenceForAlleleAndFeature(VariantContext variant,
1201           String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
1202           int alleleNumberFieldIndex,
1203           String seqName, int featureFieldIndex)
1204   {
1205     if (alleleFieldIndex == -1 || featureFieldIndex == -1)
1206     {
1207       return null;
1208     }
1209     Object value = variant.getAttribute(vcfInfoId);
1210
1211     if (value == null || !(value instanceof List<?>))
1212     {
1213       return null;
1214     }
1215
1216     /*
1217      * inspect each consequence in turn (comma-separated blocks
1218      * extracted by htsjdk)
1219      */
1220     List<String> consequences = (List<String>) value;
1221
1222     for (String consequence : consequences)
1223     {
1224       String[] csqFields = consequence.split(PIPE_REGEX);
1225       if (csqFields.length > featureFieldIndex)
1226       {
1227         String featureIdentifier = csqFields[featureFieldIndex];
1228         if (featureIdentifier.length() > 4
1229                 && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
1230         {
1231           /*
1232            * feature (transcript) matched - now check for allele match
1233            */
1234           if (matchAllele(variant, altAlleleIndex, csqFields,
1235                   alleleFieldIndex, alleleNumberFieldIndex))
1236           {
1237             return consequence;
1238           }
1239         }
1240       }
1241     }
1242     return null;
1243   }
1244
1245   private boolean matchAllele(VariantContext variant, int altAlleleIndex,
1246           String[] csqFields, int alleleFieldIndex,
1247           int alleleNumberFieldIndex)
1248   {
1249     /*
1250      * if ALLELE_NUM is present, it must match altAlleleIndex
1251      * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
1252      */
1253     if (alleleNumberFieldIndex > -1)
1254     {
1255       if (csqFields.length <= alleleNumberFieldIndex)
1256       {
1257         return false;
1258       }
1259       String alleleNum = csqFields[alleleNumberFieldIndex];
1260       return String.valueOf(altAlleleIndex + 1).equals(alleleNum);
1261     }
1262
1263     /*
1264      * else consequence allele must match variant allele
1265      */
1266     if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex)
1267     {
1268       String csqAllele = csqFields[alleleFieldIndex];
1269       String vcfAllele = variant.getAlternateAllele(altAlleleIndex)
1270               .getBaseString();
1271       return csqAllele.equals(vcfAllele);
1272     }
1273     return false;
1274   }
1275
1276   /**
1277    * Add any allele-specific VCF key-value data to the sequence feature
1278    *
1279    * @param variant
1280    * @param sf
1281    * @param altAlelleIndex
1282    *          (0, 1..)
1283    * @param consequence
1284    *          if not null, the consequence specific to this sequence (transcript
1285    *          feature) and allele
1286    */
1287   protected void addAlleleProperties(VariantContext variant,
1288           SequenceFeature sf, final int altAlelleIndex, String consequence)
1289   {
1290     Map<String, Object> atts = variant.getAttributes();
1291
1292     for (Entry<String, Object> att : atts.entrySet())
1293     {
1294       String key = att.getKey();
1295
1296       /*
1297        * extract Consequence data (if present) that we are able to
1298        * associated with the allele for this variant feature
1299        */
1300       if (CSQ_FIELD.equals(key))
1301       {
1302         addConsequences(variant, sf, consequence);
1303         continue;
1304       }
1305
1306       /*
1307        * filter out fields we don't want to capture
1308        */
1309       if (!vcfFieldsOfInterest.contains(key))
1310       {
1311         continue;
1312       }
1313
1314       /*
1315        * we extract values for other data which are allele-specific;
1316        * these may be per alternate allele (INFO[key].Number = 'A')
1317        * or per allele including reference (INFO[key].Number = 'R')
1318        */
1319       VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key);
1320       if (infoHeader == null)
1321       {
1322         /*
1323          * can't be sure what data belongs to this allele, so
1324          * play safe and don't take any
1325          */
1326         continue;
1327       }
1328
1329       VCFHeaderLineCount number = infoHeader.getCountType();
1330       int index = altAlelleIndex;
1331       if (number == VCFHeaderLineCount.R)
1332       {
1333         /*
1334          * one value per allele including reference, so bump index
1335          * e.g. the 3rd value is for the  2nd alternate allele
1336          */
1337         index++;
1338       }
1339       else if (number != VCFHeaderLineCount.A)
1340       {
1341         /*
1342          * don't save other values as not allele-related
1343          */
1344         continue;
1345       }
1346
1347       /*
1348        * take the index'th value
1349        */
1350       String value = getAttributeValue(variant, key, index);
1351       if (value != null && isValid(variant, key, value))
1352       {
1353         /*
1354          * decode colon, semicolon, equals sign, percent sign, comma (only)
1355          * as required by the VCF specification (para 1.2)
1356          */
1357         value = StringUtils.urlDecode(value, VCF_ENCODABLE);
1358         addFeatureAttribute(sf, key, value);
1359       }
1360     }
1361   }
1362
1363   /**
1364    * Answers true for '.', null, or an empty value, or if the INFO type is String.
1365    * If the INFO type is Integer or Float, answers false if the value is not in
1366    * valid format.
1367    *
1368    * @param variant
1369    * @param infoId
1370    * @param value
1371    * @return
1372    */
1373   protected boolean isValid(VariantContext variant, String infoId,
1374           String value)
1375   {
1376     if (value == null || value.isEmpty() || NO_VALUE.equals(value))
1377     {
1378       return true;
1379     }
1380     VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(infoId);
1381     if (infoHeader == null)
1382     {
1383       Cache.log.error("Field " + infoId + " has no INFO header");
1384       return false;
1385     }
1386     VCFHeaderLineType infoType = infoHeader.getType();
1387     try
1388     {
1389       if (infoType == VCFHeaderLineType.Integer)
1390       {
1391         Integer.parseInt(value);
1392       }
1393       else if (infoType == VCFHeaderLineType.Float)
1394       {
1395         Float.parseFloat(value);
1396       }
1397     } catch (NumberFormatException e)
1398     {
1399       logInvalidValue(variant, infoId, value);
1400       return false;
1401     }
1402     return true;
1403   }
1404
1405   /**
1406    * Logs an error message for malformed data; duplicate messages (same id and
1407    * value) are not logged
1408    *
1409    * @param variant
1410    * @param infoId
1411    * @param value
1412    */
1413   private void logInvalidValue(VariantContext variant, String infoId,
1414           String value)
1415   {
1416     if (badData == null)
1417     {
1418       badData = new HashSet<>();
1419     }
1420     String token = infoId + ":" + value;
1421     if (!badData.contains(token))
1422     {
1423       badData.add(token);
1424       Cache.log.error(String.format("Invalid VCF data at %s:%d %s=%s",
1425               variant.getContig(), variant.getStart(), infoId, value));
1426     }
1427   }
1428
1429   /**
1430    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
1431    * feature.
1432    * <p>
1433    * If <code>myConsequence</code> is not null, then this is the specific
1434    * consequence data (pipe-delimited fields) that is for the current allele and
1435    * transcript (sequence) being processed)
1436    *
1437    * @param variant
1438    * @param sf
1439    * @param myConsequence
1440    */
1441   protected void addConsequences(VariantContext variant, SequenceFeature sf,
1442           String myConsequence)
1443   {
1444     Object value = variant.getAttribute(CSQ_FIELD);
1445
1446     if (value == null || !(value instanceof List<?>))
1447     {
1448       return;
1449     }
1450
1451     List<String> consequences = (List<String>) value;
1452
1453     /*
1454      * inspect CSQ consequences; restrict to the consequence
1455      * associated with the current transcript (Feature)
1456      */
1457     Map<String, String> csqValues = new HashMap<>();
1458
1459     for (String consequence : consequences)
1460     {
1461       if (myConsequence == null || myConsequence.equals(consequence))
1462       {
1463         String[] csqFields = consequence.split(PIPE_REGEX);
1464
1465         /*
1466          * inspect individual fields of this consequence, copying non-null
1467          * values which are 'fields of interest'
1468          */
1469         int i = 0;
1470         for (String field : csqFields)
1471         {
1472           if (field != null && field.length() > 0)
1473           {
1474             String id = vepFieldsOfInterest.get(i);
1475             if (id != null)
1476             {
1477               /*
1478                * VCF spec requires encoding of special characters e.g. '='
1479                * so decode them here before storing
1480                */
1481               field = StringUtils.urlDecode(field, VCF_ENCODABLE);
1482               csqValues.put(id, field);
1483             }
1484           }
1485           i++;
1486         }
1487       }
1488     }
1489
1490     if (!csqValues.isEmpty())
1491     {
1492       sf.setValue(CSQ_FIELD, csqValues);
1493     }
1494   }
1495
1496   /**
1497    * A convenience method to complement a dna base and return the string value
1498    * of its complement
1499    *
1500    * @param reference
1501    * @return
1502    */
1503   protected String complement(byte[] reference)
1504   {
1505     return String.valueOf(Dna.getComplement((char) reference[0]));
1506   }
1507
1508   /**
1509    * Determines the location of the query range (chromosome positions) in a
1510    * different reference assembly.
1511    * <p>
1512    * If the range is just a subregion of one for which we already have a mapping
1513    * (for example, an exon sub-region of a gene), then the mapping is just
1514    * computed arithmetically.
1515    * <p>
1516    * Otherwise, calls the Ensembl REST service that maps from one assembly
1517    * reference's coordinates to another's
1518    *
1519    * @param queryRange
1520    *          start-end chromosomal range in 'fromRef' coordinates
1521    * @param chromosome
1522    * @param species
1523    * @param fromRef
1524    *          assembly reference for the query coordinates
1525    * @param toRef
1526    *          assembly reference we wish to translate to
1527    * @return the start-end range in 'toRef' coordinates
1528    */
1529   protected int[] mapReferenceRange(int[] queryRange, String chromosome,
1530           String species, String fromRef, String toRef)
1531   {
1532     /*
1533      * first try shorcut of computing the mapping as a subregion of one
1534      * we already have (e.g. for an exon, if we have the gene mapping)
1535      */
1536     int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
1537             species, fromRef, toRef);
1538     if (mappedRange != null)
1539     {
1540       return mappedRange;
1541     }
1542
1543     /*
1544      * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
1545      */
1546     EnsemblMap mapper = new EnsemblMap();
1547     int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
1548             toRef, queryRange);
1549
1550     if (mapping == null)
1551     {
1552       // mapping service failure
1553       return null;
1554     }
1555
1556     /*
1557      * save mapping for possible future re-use
1558      */
1559     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1560     if (!assemblyMappings.containsKey(key))
1561     {
1562       assemblyMappings.put(key, new HashMap<int[], int[]>());
1563     }
1564
1565     assemblyMappings.get(key).put(queryRange, mapping);
1566
1567     return mapping;
1568   }
1569
1570   /**
1571    * If we already have a 1:1 contiguous mapping which subsumes the given query
1572    * range, this method just calculates and returns the subset of that mapping,
1573    * else it returns null. In practical terms, if a gene has a contiguous
1574    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
1575    * subsidiary exons occupy unchanged relative positions, and just compute
1576    * these as offsets, rather than do another lookup of the mapping.
1577    * <p>
1578    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
1579    * simply remove this method or let it always return null.
1580    * <p>
1581    * Warning: many rapid calls to the /map service map result in a 429 overload
1582    * error response
1583    *
1584    * @param queryRange
1585    * @param chromosome
1586    * @param species
1587    * @param fromRef
1588    * @param toRef
1589    * @return
1590    */
1591   protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
1592           String species, String fromRef, String toRef)
1593   {
1594     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1595     if (assemblyMappings.containsKey(key))
1596     {
1597       Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
1598       for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
1599       {
1600         int[] fromRange = mappedRange.getKey();
1601         int[] toRange = mappedRange.getValue();
1602         if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
1603         {
1604           /*
1605            * mapping is 1:1 in length, so we trust it to have no discontinuities
1606            */
1607           if (MappingUtils.rangeContains(fromRange, queryRange))
1608           {
1609             /*
1610              * fromRange subsumes our query range
1611              */
1612             int offset = queryRange[0] - fromRange[0];
1613             int mappedRangeFrom = toRange[0] + offset;
1614             int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
1615             return new int[] { mappedRangeFrom, mappedRangeTo };
1616           }
1617         }
1618       }
1619     }
1620     return null;
1621   }
1622
1623   /**
1624    * Transfers the sequence feature to the target sequence, locating its start
1625    * and end range based on the mapping. Features which do not overlap the
1626    * target sequence are ignored.
1627    *
1628    * @param sf
1629    * @param targetSequence
1630    * @param mapping
1631    *          mapping from the feature's coordinates to the target sequence
1632    */
1633   protected void transferFeature(SequenceFeature sf,
1634           SequenceI targetSequence, MapList mapping)
1635   {
1636     int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
1637
1638     if (mappedRange != null)
1639     {
1640       String group = sf.getFeatureGroup();
1641       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
1642       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
1643       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
1644               group, sf.getScore());
1645       targetSequence.addSequenceFeature(copy);
1646     }
1647   }
1648
1649   /**
1650    * Formats a ranges map lookup key
1651    *
1652    * @param chromosome
1653    * @param species
1654    * @param fromRef
1655    * @param toRef
1656    * @return
1657    */
1658   protected static String makeRangesKey(String chromosome, String species,
1659           String fromRef, String toRef)
1660   {
1661     return species + EXCL + chromosome + EXCL + fromRef + EXCL
1662             + toRef;
1663   }
1664 }