src/jalview/io/vcf/VCFLoader.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.vcf;
  22
  23 import java.util.Locale;
  24
  25 import java.io.File;
  26 import java.io.IOException;
  27 import java.util.ArrayList;
  28 import java.util.HashMap;
  29 import java.util.HashSet;
  30 import java.util.Iterator;
  31 import java.util.List;
  32 import java.util.Map;
  33 import java.util.Map.Entry;
  34 import java.util.Set;
  35 import java.util.regex.Pattern;
  36 import java.util.regex.PatternSyntaxException;
  37
  38 import htsjdk.samtools.SAMException;
  39 import htsjdk.samtools.SAMSequenceDictionary;
  40 import htsjdk.samtools.SAMSequenceRecord;
  41 import htsjdk.samtools.util.CloseableIterator;
  42 import htsjdk.tribble.TribbleException;
  43 import htsjdk.variant.variantcontext.Allele;
  44 import htsjdk.variant.variantcontext.VariantContext;
  45 import htsjdk.variant.vcf.VCFConstants;
  46 import htsjdk.variant.vcf.VCFHeader;
  47 import htsjdk.variant.vcf.VCFHeaderLine;
  48 import htsjdk.variant.vcf.VCFHeaderLineCount;
  49 import htsjdk.variant.vcf.VCFHeaderLineType;
  50 import htsjdk.variant.vcf.VCFInfoHeaderLine;
  51 import jalview.analysis.Dna;
  52 import jalview.api.AlignViewControllerGuiI;
  53 import jalview.bin.Cache;
  54 import jalview.bin.Console;
  55 import jalview.datamodel.DBRefEntry;
  56 import jalview.datamodel.GeneLociI;
  57 import jalview.datamodel.Mapping;
  58 import jalview.datamodel.SequenceFeature;
  59 import jalview.datamodel.SequenceI;
  60 import jalview.datamodel.features.FeatureAttributeType;
  61 import jalview.datamodel.features.FeatureSource;
  62 import jalview.datamodel.features.FeatureSources;
  63 import jalview.ext.ensembl.EnsemblMap;
  64 import jalview.ext.htsjdk.HtsContigDb;
  65 import jalview.ext.htsjdk.VCFReader;
  66 import jalview.io.gff.Gff3Helper;
  67 import jalview.io.gff.SequenceOntologyI;
  68 import jalview.util.MapList;
  69 import jalview.util.MappingUtils;
  70 import jalview.util.MessageManager;
  71 import jalview.util.StringUtils;
  72
  73 /**
  74  * A class to read VCF data (using the htsjdk) and add variants as sequence
  75  * features on dna and any related protein product sequences
  76  *
  77  * @author gmcarstairs
  78  */
  79 public class VCFLoader
  80 {
  81   private static final String VCF_ENCODABLE = ":;=%,";
  82
  83   /*
  84    * Jalview feature attributes for VCF fixed column data
  85    */
  86   private static final String VCF_POS = "POS";
  87
  88   private static final String VCF_ID = "ID";
  89
  90   private static final String VCF_QUAL = "QUAL";
  91
  92   private static final String VCF_FILTER = "FILTER";
  93
  94   private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.'
  95
  96   private static final String DEFAULT_SPECIES = "homo_sapiens";
  97
  98   /**
  99    * A class to model the mapping from sequence to VCF coordinates. Cases
 100    * include
 101    * <ul>
 102    * <li>a direct 1:1 mapping where the sequence is one of the VCF contigs</li>
 103    * <li>a mapping of sequence to chromosomal coordinates, where sequence and
 104    * VCF use the same reference assembly</li>
 105    * <li>a modified mapping of sequence to chromosomal coordinates, where
 106    * sequence and VCF use different reference assembles</li>
 107    * </ul>
 108    */
 109   class VCFMap
 110   {
 111     final String chromosome;
 112
 113     final MapList map;
 114
 115     VCFMap(String chr, MapList m)
 116     {
 117       chromosome = chr;
 118       map = m;
 119     }
 120
 121     @Override
 122     public String toString()
 123     {
 124       return chromosome + ":" + map.toString();
 125     }
 126   }
 127
 128   /*
 129    * Lookup keys, and default values, for Preference entries that describe
 130    * patterns for VCF and VEP fields to capture
 131    */
 132   private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
 133
 134   private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
 135
 136   private static final String DEFAULT_VCF_FIELDS = ".*";
 137
 138   private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
 139
 140   /*
 141    * Lookup keys, and default values, for Preference entries that give
 142    * mappings from tokens in the 'reference' header to species or assembly
 143    */
 144   private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY";
 145
 146   private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh37,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38";
 147
 148   private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human
 149
 150   private static final String DEFAULT_REFERENCE = "grch37"; // fallback default
 151                                                             // is human GRCh37
 152
 153   /*
 154    * keys to fields of VEP CSQ consequence data
 155    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 156    */
 157   private static final String CSQ_CONSEQUENCE_KEY = "Consequence";
 158
 159   private static final String CSQ_ALLELE_KEY = "Allele";
 160
 161   private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref),
 162                                                                  // 1...
 163
 164   private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id
 165
 166   /*
 167    * default VCF INFO key for VEP consequence data
 168    * NB this can be overridden running VEP with --vcf_info_field
 169    * - we don't handle this case (require identifier to be CSQ)
 170    */
 171   private static final String CSQ_FIELD = "CSQ";
 172
 173   /*
 174    * separator for fields in consequence data is '|'
 175    */
 176   private static final String PIPE_REGEX = "\\|";
 177
 178   /*
 179    * delimiter that separates multiple consequence data blocks
 180    */
 181   private static final String COMMA = ",";
 182
 183   /*
 184    * the feature group assigned to a VCF variant in Jalview
 185    */
 186   private static final String FEATURE_GROUP_VCF = "VCF";
 187
 188   /*
 189    * internal delimiter used to build keys for assemblyMappings
 190    *
 191    */
 192   private static final String EXCL = "!";
 193
 194   /*
 195    * the VCF file we are processing
 196    */
 197   protected String vcfFilePath;
 198
 199   /*
 200    * mappings between VCF and sequence reference assembly regions, as
 201    * key = "species!chromosome!fromAssembly!toAssembly
 202    * value = Map{fromRange, toRange}
 203    */
 204   private Map<String, Map<int[], int[]>> assemblyMappings;
 205
 206   private VCFReader reader;
 207
 208   /*
 209    * holds details of the VCF header lines (metadata)
 210    */
 211   private VCFHeader header;
 212
 213   /*
 214    * species (as a valid Ensembl term) the VCF is for
 215    */
 216   private String vcfSpecies;
 217
 218   /*
 219    * genome assembly version (as a valid Ensembl identifier) the VCF is for
 220    */
 221   private String vcfAssembly;
 222
 223   /*
 224    * a Dictionary of contigs (if present) referenced in the VCF file
 225    */
 226   private SAMSequenceDictionary dictionary;
 227
 228   /*
 229    * the position (0...) of field in each block of
 230    * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
 231    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 232    */
 233   private int csqConsequenceFieldIndex = -1;
 234
 235   private int csqAlleleFieldIndex = -1;
 236
 237   private int csqAlleleNumberFieldIndex = -1;
 238
 239   private int csqFeatureFieldIndex = -1;
 240
 241   // todo the same fields for SnpEff ANN data if wanted
 242   // see http://snpeff.sourceforge.net/SnpEff_manual.html#input
 243
 244   /*
 245    * a unique identifier under which to save metadata about feature
 246    * attributes (selected INFO field data)
 247    */
 248   private String sourceId;
 249
 250   /*
 251    * The INFO IDs of data that is both present in the VCF file, and
 252    * also matched by any filters for data of interest
 253    */
 254   List<String> vcfFieldsOfInterest;
 255
 256   /*
 257    * The field offsets and identifiers for VEP (CSQ) data that is both present
 258    * in the VCF file, and also matched by any filters for data of interest
 259    * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
 260    */
 261   Map<Integer, String> vepFieldsOfInterest;
 262
 263   /*
 264    * key:value for which rejected data has been seen
 265    * (the error is logged only once for each combination)
 266    */
 267   private Set<String> badData;
 268
 269   /**
 270    * Constructor given a VCF file
 271    *
 272    * @param alignment
 273    */
 274   public VCFLoader(String vcfFile)
 275   {
 276     try
 277     {
 278       initialise(vcfFile);
 279     } catch (IOException e)
 280     {
 281       System.err.println("Error opening VCF file: " + e.getMessage());
 282     }
 283
 284     // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
 285     assemblyMappings = new HashMap<>();
 286   }
 287
 288   /**
 289    * Starts a new thread to query and load VCF variant data on to the given
 290    * sequences
 291    * <p>
 292    * This method is not thread safe - concurrent threads should use separate
 293    * instances of this class.
 294    *
 295    * @param seqs
 296    * @param gui
 297    */
 298   public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui)
 299   {
 300     if (gui != null)
 301     {
 302       gui.setStatus(MessageManager.getString("label.searching_vcf"));
 303     }
 304
 305     new Thread()
 306     {
 307       @Override
 308       public void run()
 309       {
 310         VCFLoader.this.doLoad(seqs, gui);
 311       }
 312     }.start();
 313   }
 314
 315   /**
 316    * Reads the specified contig sequence and adds its VCF variants to it
 317    *
 318    * @param contig
 319    *          the id of a single sequence (contig) to load
 320    * @return
 321    */
 322   public SequenceI loadVCFContig(String contig)
 323   {
 324     VCFHeaderLine headerLine = header
 325             .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 326     if (headerLine == null)
 327     {
 328       Console.error("VCF reference header not found");
 329       return null;
 330     }
 331     String ref = headerLine.getValue();
 332     if (ref.startsWith("file://"))
 333     {
 334       ref = ref.substring(7);
 335     }
 336     setSpeciesAndAssembly(ref);
 337
 338     SequenceI seq = null;
 339     File dbFile = new File(ref);
 340
 341     if (dbFile.exists())
 342     {
 343       HtsContigDb db = new HtsContigDb("", dbFile);
 344       seq = db.getSequenceProxy(contig);
 345       loadSequenceVCF(seq);
 346       db.close();
 347     }
 348     else
 349     {
 350       Console.error("VCF reference not found: " + ref);
 351     }
 352
 353     return seq;
 354   }
 355
 356   /**
 357    * Loads VCF on to one or more sequences
 358    *
 359    * @param seqs
 360    * @param gui
 361    *          optional callback handler for messages
 362    */
 363   protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui)
 364   {
 365     try
 366     {
 367       VCFHeaderLine ref = header
 368               .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 369       String reference = ref == null ? null : ref.getValue();
 370
 371       setSpeciesAndAssembly(reference);
 372
 373       int varCount = 0;
 374       int seqCount = 0;
 375
 376       /*
 377        * query for VCF overlapping each sequence in turn
 378        */
 379       for (SequenceI seq : seqs)
 380       {
 381         int added = loadSequenceVCF(seq);
 382         if (added > 0)
 383         {
 384           seqCount++;
 385           varCount += added;
 386           transferAddedFeatures(seq);
 387         }
 388       }
 389       if (gui != null)
 390       {
 391         String msg = MessageManager.formatMessage("label.added_vcf",
 392                 varCount, seqCount);
 393         gui.setStatus(msg);
 394         if (gui.getFeatureSettingsUI() != null)
 395         {
 396           gui.getFeatureSettingsUI().discoverAllFeatureData();
 397         }
 398       }
 399     } catch (Throwable e)
 400     {
 401       System.err.println("Error processing VCF: " + e.getMessage());
 402       e.printStackTrace();
 403       if (gui != null)
 404       {
 405         gui.setStatus("Error occurred - see console for details");
 406       }
 407     } finally
 408     {
 409       if (reader != null)
 410       {
 411         try
 412         {
 413           reader.close();
 414         } catch (IOException e)
 415         {
 416           // ignore
 417         }
 418       }
 419       header = null;
 420       dictionary = null;
 421     }
 422   }
 423
 424   /**
 425    * Attempts to determine and save the species and genome assembly version to
 426    * which the VCF data applies. This may be done by parsing the
 427    * {@code reference} header line, configured in a property file, or
 428    * (potentially) confirmed interactively by the user.
 429    * <p>
 430    * The saved values should be identifiers valid for Ensembl's REST service
 431    * {@code map} endpoint, so they can be used (if necessary) to retrieve the
 432    * mapping between VCF coordinates and sequence coordinates.
 433    *
 434    * @param reference
 435    * @see https://rest.ensembl.org/documentation/info/assembly_map
 436    * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
 437    * @see https://rest.ensembl.org/info/species?content-type=text/xml
 438    */
 439   protected void setSpeciesAndAssembly(String reference)
 440   {
 441     if (reference == null)
 442     {
 443       Console.error("No VCF ##reference found, defaulting to "
 444               + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES);
 445       reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified
 446     }
 447     reference = reference.toLowerCase(Locale.ROOT);
 448
 449     /*
 450      * for a non-human species, or other assembly identifier,
 451      * specify as a Jalview property file entry e.g.
 452      * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
 453      * VCF_SPECIES = c_elegans=celegans
 454      * to map a token in the reference header to a value
 455      */
 456     String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
 457     for (String token : prop.split(","))
 458     {
 459       String[] tokens = token.split("=");
 460       if (tokens.length == 2)
 461       {
 462         if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT)))
 463         {
 464           vcfAssembly = tokens[1].trim();
 465           break;
 466         }
 467       }
 468     }
 469
 470     vcfSpecies = DEFAULT_SPECIES;
 471     prop = Cache.getProperty(VCF_SPECIES);
 472     if (prop != null)
 473     {
 474       for (String token : prop.split(","))
 475       {
 476         String[] tokens = token.split("=");
 477         if (tokens.length == 2)
 478         {
 479           if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT)))
 480           {
 481             vcfSpecies = tokens[1].trim();
 482             break;
 483           }
 484         }
 485       }
 486     }
 487   }
 488
 489   /**
 490    * Opens the VCF file and parses header data
 491    *
 492    * @param filePath
 493    * @throws IOException
 494    */
 495   private void initialise(String filePath) throws IOException
 496   {
 497     vcfFilePath = filePath;
 498
 499     reader = new VCFReader(filePath);
 500
 501     header = reader.getFileHeader();
 502
 503     try
 504     {
 505       dictionary = header.getSequenceDictionary();
 506     } catch (SAMException e)
 507     {
 508       // ignore - thrown if any contig line lacks length info
 509     }
 510
 511     sourceId = filePath;
 512
 513     saveMetadata(sourceId);
 514
 515     /*
 516      * get offset of CSQ ALLELE_NUM and Feature if declared
 517      */
 518     parseCsqHeader();
 519   }
 520
 521   /**
 522    * Reads metadata (such as INFO field descriptions and datatypes) and saves
 523    * them for future reference
 524    *
 525    * @param theSourceId
 526    */
 527   void saveMetadata(String theSourceId)
 528   {
 529     List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
 530             DEFAULT_VCF_FIELDS);
 531     vcfFieldsOfInterest = new ArrayList<>();
 532
 533     FeatureSource metadata = new FeatureSource(theSourceId);
 534
 535     for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
 536     {
 537       String attributeId = info.getID();
 538       String desc = info.getDescription();
 539       VCFHeaderLineType type = info.getType();
 540       FeatureAttributeType attType = null;
 541       switch (type)
 542       {
 543       case Character:
 544         attType = FeatureAttributeType.Character;
 545         break;
 546       case Flag:
 547         attType = FeatureAttributeType.Flag;
 548         break;
 549       case Float:
 550         attType = FeatureAttributeType.Float;
 551         break;
 552       case Integer:
 553         attType = FeatureAttributeType.Integer;
 554         break;
 555       case String:
 556         attType = FeatureAttributeType.String;
 557         break;
 558       }
 559       metadata.setAttributeName(attributeId, desc);
 560       metadata.setAttributeType(attributeId, attType);
 561
 562       if (isFieldWanted(attributeId, vcfFieldPatterns))
 563       {
 564         vcfFieldsOfInterest.add(attributeId);
 565       }
 566     }
 567
 568     FeatureSources.getInstance().addSource(theSourceId, metadata);
 569   }
 570
 571   /**
 572    * Answers true if the field id is matched by any of the filter patterns, else
 573    * false. Matching is against regular expression patterns, and is not
 574    * case-sensitive.
 575    *
 576    * @param id
 577    * @param filters
 578    * @return
 579    */
 580   private boolean isFieldWanted(String id, List<Pattern> filters)
 581   {
 582     for (Pattern p : filters)
 583     {
 584       if (p.matcher(id.toUpperCase(Locale.ROOT)).matches())
 585       {
 586         return true;
 587       }
 588     }
 589     return false;
 590   }
 591
 592   /**
 593    * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
 594    * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
 595    * required for processing.
 596    * <p>
 597    * CSQ fields are declared in the CSQ INFO Description e.g.
 598    * <p>
 599    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
 600    */
 601   protected void parseCsqHeader()
 602   {
 603     List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
 604             DEFAULT_VEP_FIELDS);
 605     vepFieldsOfInterest = new HashMap<>();
 606
 607     VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
 608     if (csqInfo == null)
 609     {
 610       return;
 611     }
 612
 613     /*
 614      * parse out the pipe-separated list of CSQ fields; we assume here that
 615      * these form the last part of the description, and contain no spaces
 616      */
 617     String desc = csqInfo.getDescription();
 618     int spacePos = desc.lastIndexOf(" ");
 619     desc = desc.substring(spacePos + 1);
 620
 621     if (desc != null)
 622     {
 623       String[] format = desc.split(PIPE_REGEX);
 624       int index = 0;
 625       for (String field : format)
 626       {
 627         if (CSQ_CONSEQUENCE_KEY.equals(field))
 628         {
 629           csqConsequenceFieldIndex = index;
 630         }
 631         if (CSQ_ALLELE_NUM_KEY.equals(field))
 632         {
 633           csqAlleleNumberFieldIndex = index;
 634         }
 635         if (CSQ_ALLELE_KEY.equals(field))
 636         {
 637           csqAlleleFieldIndex = index;
 638         }
 639         if (CSQ_FEATURE_KEY.equals(field))
 640         {
 641           csqFeatureFieldIndex = index;
 642         }
 643
 644         if (isFieldWanted(field, vepFieldFilters))
 645         {
 646           vepFieldsOfInterest.put(index, field);
 647         }
 648
 649         index++;
 650       }
 651     }
 652   }
 653
 654   /**
 655    * Reads the Preference value for the given key, with default specified if no
 656    * preference set. The value is interpreted as a comma-separated list of
 657    * regular expressions, and converted into a list of compiled patterns ready
 658    * for matching. Patterns are forced to upper-case for non-case-sensitive
 659    * matching.
 660    * <p>
 661    * This supports user-defined filters for fields of interest to capture while
 662    * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
 663    * fields with an ID of AF, or starting with AC, would be matched.
 664    *
 665    * @param key
 666    * @param def
 667    * @return
 668    */
 669   private List<Pattern> getFieldMatchers(String key, String def)
 670   {
 671     String pref = Cache.getDefault(key, def);
 672     List<Pattern> patterns = new ArrayList<>();
 673     String[] tokens = pref.split(",");
 674     for (String token : tokens)
 675     {
 676       try
 677       {
 678         patterns.add(Pattern.compile(token.toUpperCase(Locale.ROOT)));
 679       } catch (PatternSyntaxException e)
 680       {
 681         System.err.println("Invalid pattern ignored: " + token);
 682       }
 683     }
 684     return patterns;
 685   }
 686
 687   /**
 688    * Transfers VCF features to sequences to which this sequence has a mapping.
 689    *
 690    * @param seq
 691    */
 692   protected void transferAddedFeatures(SequenceI seq)
 693   {
 694     List<DBRefEntry> dbrefs = seq.getDBRefs();
 695     if (dbrefs == null)
 696     {
 697       return;
 698     }
 699     for (DBRefEntry dbref : dbrefs)
 700     {
 701       Mapping mapping = dbref.getMap();
 702       if (mapping == null || mapping.getTo() == null)
 703       {
 704         continue;
 705       }
 706
 707       SequenceI mapTo = mapping.getTo();
 708       MapList map = mapping.getMap();
 709       if (map.getFromRatio() == 3)
 710       {
 711         /*
 712          * dna-to-peptide product mapping
 713          */
 714         // JAL-3187 render on the fly instead
 715         // AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
 716       }
 717       else
 718       {
 719         /*
 720          * nucleotide-to-nucleotide mapping e.g. transcript to CDS
 721          */
 722         List<SequenceFeature> features = seq.getFeatures()
 723                 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
 724         for (SequenceFeature sf : features)
 725         {
 726           if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
 727           {
 728             transferFeature(sf, mapTo, map);
 729           }
 730         }
 731       }
 732     }
 733   }
 734
 735   /**
 736    * Tries to add overlapping variants read from a VCF file to the given
 737    * sequence, and returns the number of variant features added
 738    *
 739    * @param seq
 740    * @return
 741    */
 742   protected int loadSequenceVCF(SequenceI seq)
 743   {
 744     VCFMap vcfMap = getVcfMap(seq);
 745     if (vcfMap == null)
 746     {
 747       return 0;
 748     }
 749
 750     /*
 751      * work with the dataset sequence here
 752      */
 753     SequenceI dss = seq.getDatasetSequence();
 754     if (dss == null)
 755     {
 756       dss = seq;
 757     }
 758     return addVcfVariants(dss, vcfMap);
 759   }
 760
 761   /**
 762    * Answers a map from sequence coordinates to VCF chromosome ranges
 763    *
 764    * @param seq
 765    * @return
 766    */
 767   private VCFMap getVcfMap(SequenceI seq)
 768   {
 769     /*
 770      * simplest case: sequence has id and length matching a VCF contig
 771      */
 772     VCFMap vcfMap = null;
 773     if (dictionary != null)
 774     {
 775       vcfMap = getContigMap(seq);
 776     }
 777     if (vcfMap != null)
 778     {
 779       return vcfMap;
 780     }
 781
 782     /*
 783      * otherwise, map to VCF from chromosomal coordinates
 784      * of the sequence (if known)
 785      */
 786     GeneLociI seqCoords = seq.getGeneLoci();
 787     if (seqCoords == null)
 788     {
 789       Console.warn(String.format(
 790               "Can't query VCF for %s as chromosome coordinates not known",
 791               seq.getName()));
 792       return null;
 793     }
 794
 795     String species = seqCoords.getSpeciesId();
 796     String chromosome = seqCoords.getChromosomeId();
 797     String seqRef = seqCoords.getAssemblyId();
 798     MapList map = seqCoords.getMapping();
 799
 800     // note this requires the configured species to match that
 801     // returned with the Ensembl sequence; todo: support aliases?
 802     if (!vcfSpecies.equalsIgnoreCase(species))
 803     {
 804       Console.warn("No VCF loaded to " + seq.getName()
 805               + " as species not matched");
 806       return null;
 807     }
 808
 809     if (seqRef.equalsIgnoreCase(vcfAssembly))
 810     {
 811       return new VCFMap(chromosome, map);
 812     }
 813
 814     /*
 815      * VCF data has a different reference assembly to the sequence:
 816      * query Ensembl to map chromosomal coordinates from sequence to VCF
 817      */
 818     List<int[]> toVcfRanges = new ArrayList<>();
 819     List<int[]> fromSequenceRanges = new ArrayList<>();
 820
 821     for (int[] range : map.getToRanges())
 822     {
 823       int[] fromRange = map.locateInFrom(range[0], range[1]);
 824       if (fromRange == null)
 825       {
 826         // corrupted map?!?
 827         continue;
 828       }
 829
 830       int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
 831               vcfAssembly);
 832       if (newRange == null)
 833       {
 834         Console.error(String.format("Failed to map %s:%s:%s:%d:%d to %s",
 835                 species, chromosome, seqRef, range[0], range[1],
 836                 vcfAssembly));
 837         continue;
 838       }
 839       else
 840       {
 841         toVcfRanges.add(newRange);
 842         fromSequenceRanges.add(fromRange);
 843       }
 844     }
 845
 846     return new VCFMap(chromosome,
 847             new MapList(fromSequenceRanges, toVcfRanges, 1, 1));
 848   }
 849
 850   /**
 851    * If the sequence id matches a contig declared in the VCF file, and the
 852    * sequence length matches the contig length, then returns a 1:1 map of the
 853    * sequence to the contig, else returns null
 854    *
 855    * @param seq
 856    * @return
 857    */
 858   private VCFMap getContigMap(SequenceI seq)
 859   {
 860     String id = seq.getName();
 861     SAMSequenceRecord contig = dictionary.getSequence(id);
 862     if (contig != null)
 863     {
 864       int len = seq.getLength();
 865       if (len == contig.getSequenceLength())
 866       {
 867         MapList map = new MapList(new int[] { 1, len },
 868                 new int[]
 869                 { 1, len }, 1, 1);
 870         return new VCFMap(id, map);
 871       }
 872     }
 873     return null;
 874   }
 875
 876   /**
 877    * Queries the VCF reader for any variants that overlap the mapped chromosome
 878    * ranges of the sequence, and adds as variant features. Returns the number of
 879    * overlapping variants found.
 880    *
 881    * @param seq
 882    * @param map
 883    *          mapping from sequence to VCF coordinates
 884    * @return
 885    */
 886   protected int addVcfVariants(SequenceI seq, VCFMap map)
 887   {
 888     boolean forwardStrand = map.map.isToForwardStrand();
 889
 890     /*
 891      * query the VCF for overlaps of each contiguous chromosomal region
 892      */
 893     int count = 0;
 894
 895     for (int[] range : map.map.getToRanges())
 896     {
 897       int vcfStart = Math.min(range[0], range[1]);
 898       int vcfEnd = Math.max(range[0], range[1]);
 899       try
 900       {
 901         CloseableIterator<VariantContext> variants = reader
 902                 .query(map.chromosome, vcfStart, vcfEnd);
 903         while (variants.hasNext())
 904         {
 905           VariantContext variant = variants.next();
 906
 907           int[] featureRange = map.map.locateInFrom(variant.getStart(),
 908                   variant.getEnd());
 909
 910           /*
 911            * only take features whose range is fully mappable to sequence positions
 912            */
 913           if (featureRange != null)
 914           {
 915             int featureStart = Math.min(featureRange[0], featureRange[1]);
 916             int featureEnd = Math.max(featureRange[0], featureRange[1]);
 917             if (featureEnd - featureStart == variant.getEnd()
 918                     - variant.getStart())
 919             {
 920               count += addAlleleFeatures(seq, variant, featureStart,
 921                       featureEnd, forwardStrand);
 922             }
 923           }
 924         }
 925         variants.close();
 926       } catch (TribbleException e)
 927       {
 928         /*
 929          * RuntimeException throwable by htsjdk
 930          */
 931         String msg = String.format("Error reading VCF for %s:%d-%d: %s ",
 932                 map.chromosome, vcfStart, vcfEnd, e.getLocalizedMessage());
 933         Console.error(msg);
 934       }
 935     }
 936
 937     return count;
 938   }
 939
 940   /**
 941    * A convenience method to get an attribute value for an alternate allele
 942    *
 943    * @param variant
 944    * @param attributeName
 945    * @param alleleIndex
 946    * @return
 947    */
 948   protected String getAttributeValue(VariantContext variant,
 949           String attributeName, int alleleIndex)
 950   {
 951     Object att = variant.getAttribute(attributeName);
 952
 953     if (att instanceof String)
 954     {
 955       return (String) att;
 956     }
 957     else if (att instanceof ArrayList)
 958     {
 959       return ((List<String>) att).get(alleleIndex);
 960     }
 961
 962     return null;
 963   }
 964
 965   /**
 966    * Adds one variant feature for each allele in the VCF variant record, and
 967    * returns the number of features added.
 968    *
 969    * @param seq
 970    * @param variant
 971    * @param featureStart
 972    * @param featureEnd
 973    * @param forwardStrand
 974    * @return
 975    */
 976   protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
 977           int featureStart, int featureEnd, boolean forwardStrand)
 978   {
 979     int added = 0;
 980
 981     /*
 982      * Javadoc says getAlternateAlleles() imposes no order on the list returned
 983      * so we proceed defensively to get them in strict order
 984      */
 985     int altAlleleCount = variant.getAlternateAlleles().size();
 986     for (int i = 0; i < altAlleleCount; i++)
 987     {
 988       added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
 989               forwardStrand);
 990     }
 991     return added;
 992   }
 993
 994   /**
 995    * Inspects one allele and attempts to add a variant feature for it to the
 996    * sequence. The additional data associated with this allele is extracted to
 997    * store in the feature's key-value map. Answers the number of features added
 998    * (0 or 1).
 999    *
1000    * @param seq
1001    * @param variant
1002    * @param altAlleleIndex
1003    *          (0, 1..)
1004    * @param featureStart
1005    * @param featureEnd
1006    * @param forwardStrand
1007    * @return
1008    */
1009   protected int addAlleleFeature(SequenceI seq, VariantContext variant,
1010           int altAlleleIndex, int featureStart, int featureEnd,
1011           boolean forwardStrand)
1012   {
1013     String reference = variant.getReference().getBaseString();
1014     Allele alt = variant.getAlternateAllele(altAlleleIndex);
1015     String allele = alt.getBaseString();
1016
1017     /*
1018      * insertion after a genomic base, if on reverse strand, has to be
1019      * converted to insertion of complement after the preceding position
1020      */
1021     int referenceLength = reference.length();
1022     if (!forwardStrand && allele.length() > referenceLength
1023             && allele.startsWith(reference))
1024     {
1025       featureStart -= referenceLength;
1026       featureEnd = featureStart;
1027       char insertAfter = seq.getCharAt(featureStart - seq.getStart());
1028       reference = Dna.reverseComplement(String.valueOf(insertAfter));
1029       allele = allele.substring(referenceLength) + reference;
1030     }
1031
1032     /*
1033      * build the ref,alt allele description e.g. "G,A", using the base
1034      * complement if the sequence is on the reverse strand
1035      */
1036     StringBuilder sb = new StringBuilder();
1037     sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
1038     sb.append(COMMA);
1039     sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
1040     String alleles = sb.toString(); // e.g. G,A
1041
1042     /*
1043      * pick out the consequence data (if any) that is for the current allele
1044      * and feature (transcript) that matches the current sequence
1045      */
1046     String consequence = getConsequenceForAlleleAndFeature(variant,
1047             CSQ_FIELD, altAlleleIndex, csqAlleleFieldIndex,
1048             csqAlleleNumberFieldIndex,
1049             seq.getName().toLowerCase(Locale.ROOT), csqFeatureFieldIndex);
1050
1051     /*
1052      * pick out the ontology term for the consequence type
1053      */
1054     String type = SequenceOntologyI.SEQUENCE_VARIANT;
1055     if (consequence != null)
1056     {
1057       type = getOntologyTerm(consequence);
1058     }
1059
1060     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
1061             featureEnd, FEATURE_GROUP_VCF);
1062     sf.setSource(sourceId);
1063
1064     /*
1065      * save the derived alleles as a named attribute; this will be
1066      * needed when Jalview computes derived peptide variants
1067      */
1068     addFeatureAttribute(sf, Gff3Helper.ALLELES, alleles);
1069
1070     /*
1071      * add selected VCF fixed column data as feature attributes
1072      */
1073     addFeatureAttribute(sf, VCF_POS, String.valueOf(variant.getStart()));
1074     addFeatureAttribute(sf, VCF_ID, variant.getID());
1075     addFeatureAttribute(sf, VCF_QUAL,
1076             String.valueOf(variant.getPhredScaledQual()));
1077     addFeatureAttribute(sf, VCF_FILTER, getFilter(variant));
1078
1079     addAlleleProperties(variant, sf, altAlleleIndex, consequence);
1080
1081     seq.addSequenceFeature(sf);
1082
1083     return 1;
1084   }
1085
1086   /**
1087    * Answers the VCF FILTER value for the variant - or an approximation to it.
1088    * This field is either PASS, or a semi-colon separated list of filters not
1089    * passed. htsjdk saves filters as a HashSet, so the order when reassembled
1090    * into a list may be different.
1091    *
1092    * @param variant
1093    * @return
1094    */
1095   String getFilter(VariantContext variant)
1096   {
1097     Set<String> filters = variant.getFilters();
1098     if (filters.isEmpty())
1099     {
1100       return NO_VALUE;
1101     }
1102     Iterator<String> iterator = filters.iterator();
1103     String first = iterator.next();
1104     if (filters.size() == 1)
1105     {
1106       return first;
1107     }
1108
1109     StringBuilder sb = new StringBuilder(first);
1110     while (iterator.hasNext())
1111     {
1112       sb.append(";").append(iterator.next());
1113     }
1114
1115     return sb.toString();
1116   }
1117
1118   /**
1119    * Adds one feature attribute unless the value is null, empty or '.'
1120    *
1121    * @param sf
1122    * @param key
1123    * @param value
1124    */
1125   void addFeatureAttribute(SequenceFeature sf, String key, String value)
1126   {
1127     if (value != null && !value.isEmpty() && !NO_VALUE.equals(value))
1128     {
1129       sf.setValue(key, value);
1130     }
1131   }
1132
1133   /**
1134    * Determines the Sequence Ontology term to use for the variant feature type
1135    * in Jalview. The default is 'sequence_variant', but a more specific term is
1136    * used if:
1137    * <ul>
1138    * <li>VEP (or SnpEff) Consequence annotation is included in the VCF</li>
1139    * <li>sequence id can be matched to VEP Feature (or SnpEff Feature_ID)</li>
1140    * </ul>
1141    *
1142    * @param consequence
1143    * @return
1144    * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
1145    */
1146   String getOntologyTerm(String consequence)
1147   {
1148     String type = SequenceOntologyI.SEQUENCE_VARIANT;
1149
1150     /*
1151      * could we associate Consequence data with this allele and feature (transcript)?
1152      * if so, prefer the consequence term from that data
1153      */
1154     if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
1155     {
1156       /*
1157        * no Consequence data so we can't refine the ontology term
1158        */
1159       return type;
1160     }
1161
1162     if (consequence != null)
1163     {
1164       String[] csqFields = consequence.split(PIPE_REGEX);
1165       if (csqFields.length > csqConsequenceFieldIndex)
1166       {
1167         type = csqFields[csqConsequenceFieldIndex];
1168       }
1169     }
1170     else
1171     {
1172       // todo the same for SnpEff consequence data matching if wanted
1173     }
1174
1175     /*
1176      * if of the form (e.g.) missense_variant&splice_region_variant,
1177      * just take the first ('most severe') consequence
1178      */
1179     if (type != null)
1180     {
1181       int pos = type.indexOf('&');
1182       if (pos > 0)
1183       {
1184         type = type.substring(0, pos);
1185       }
1186     }
1187     return type;
1188   }
1189
1190   /**
1191    * Returns matched consequence data if it can be found, else null.
1192    * <ul>
1193    * <li>inspects the VCF data for key 'vcfInfoId'</li>
1194    * <li>splits this on comma (to distinct consequences)</li>
1195    * <li>returns the first consequence (if any) where</li>
1196    * <ul>
1197    * <li>the allele matches the altAlleleIndex'th allele of variant</li>
1198    * <li>the feature matches the sequence name (e.g. transcript id)</li>
1199    * </ul>
1200    * </ul>
1201    * If matched, the consequence is returned (as pipe-delimited fields).
1202    *
1203    * @param variant
1204    * @param vcfInfoId
1205    * @param altAlleleIndex
1206    * @param alleleFieldIndex
1207    * @param alleleNumberFieldIndex
1208    * @param seqName
1209    * @param featureFieldIndex
1210    * @return
1211    */
1212   private String getConsequenceForAlleleAndFeature(VariantContext variant,
1213           String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
1214           int alleleNumberFieldIndex, String seqName, int featureFieldIndex)
1215   {
1216     if (alleleFieldIndex == -1 || featureFieldIndex == -1)
1217     {
1218       return null;
1219     }
1220     Object value = variant.getAttribute(vcfInfoId);
1221
1222     if (value == null || !(value instanceof List<?>))
1223     {
1224       return null;
1225     }
1226
1227     /*
1228      * inspect each consequence in turn (comma-separated blocks
1229      * extracted by htsjdk)
1230      */
1231     List<String> consequences = (List<String>) value;
1232
1233     for (String consequence : consequences)
1234     {
1235       String[] csqFields = consequence.split(PIPE_REGEX);
1236       if (csqFields.length > featureFieldIndex)
1237       {
1238         String featureIdentifier = csqFields[featureFieldIndex];
1239         if (featureIdentifier.length() > 4 && seqName
1240                 .indexOf(featureIdentifier.toLowerCase(Locale.ROOT)) > -1)
1241         {
1242           /*
1243            * feature (transcript) matched - now check for allele match
1244            */
1245           if (matchAllele(variant, altAlleleIndex, csqFields,
1246                   alleleFieldIndex, alleleNumberFieldIndex))
1247           {
1248             return consequence;
1249           }
1250         }
1251       }
1252     }
1253     return null;
1254   }
1255
1256   private boolean matchAllele(VariantContext variant, int altAlleleIndex,
1257           String[] csqFields, int alleleFieldIndex,
1258           int alleleNumberFieldIndex)
1259   {
1260     /*
1261      * if ALLELE_NUM is present, it must match altAlleleIndex
1262      * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
1263      */
1264     if (alleleNumberFieldIndex > -1)
1265     {
1266       if (csqFields.length <= alleleNumberFieldIndex)
1267       {
1268         return false;
1269       }
1270       String alleleNum = csqFields[alleleNumberFieldIndex];
1271       return String.valueOf(altAlleleIndex + 1).equals(alleleNum);
1272     }
1273
1274     /*
1275      * else consequence allele must match variant allele
1276      */
1277     if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex)
1278     {
1279       String csqAllele = csqFields[alleleFieldIndex];
1280       String vcfAllele = variant.getAlternateAllele(altAlleleIndex)
1281               .getBaseString();
1282       return csqAllele.equals(vcfAllele);
1283     }
1284     return false;
1285   }
1286
1287   /**
1288    * Add any allele-specific VCF key-value data to the sequence feature
1289    *
1290    * @param variant
1291    * @param sf
1292    * @param altAlelleIndex
1293    *          (0, 1..)
1294    * @param consequence
1295    *          if not null, the consequence specific to this sequence (transcript
1296    *          feature) and allele
1297    */
1298   protected void addAlleleProperties(VariantContext variant,
1299           SequenceFeature sf, final int altAlelleIndex, String consequence)
1300   {
1301     Map<String, Object> atts = variant.getAttributes();
1302
1303     for (Entry<String, Object> att : atts.entrySet())
1304     {
1305       String key = att.getKey();
1306
1307       /*
1308        * extract Consequence data (if present) that we are able to
1309        * associated with the allele for this variant feature
1310        */
1311       if (CSQ_FIELD.equals(key))
1312       {
1313         addConsequences(variant, sf, consequence);
1314         continue;
1315       }
1316
1317       /*
1318        * filter out fields we don't want to capture
1319        */
1320       if (!vcfFieldsOfInterest.contains(key))
1321       {
1322         continue;
1323       }
1324
1325       /*
1326        * we extract values for other data which are allele-specific;
1327        * these may be per alternate allele (INFO[key].Number = 'A')
1328        * or per allele including reference (INFO[key].Number = 'R')
1329        */
1330       VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key);
1331       if (infoHeader == null)
1332       {
1333         /*
1334          * can't be sure what data belongs to this allele, so
1335          * play safe and don't take any
1336          */
1337         continue;
1338       }
1339
1340       VCFHeaderLineCount number = infoHeader.getCountType();
1341       int index = altAlelleIndex;
1342       if (number == VCFHeaderLineCount.R)
1343       {
1344         /*
1345          * one value per allele including reference, so bump index
1346          * e.g. the 3rd value is for the  2nd alternate allele
1347          */
1348         index++;
1349       }
1350       else if (number != VCFHeaderLineCount.A)
1351       {
1352         /*
1353          * don't save other values as not allele-related
1354          */
1355         continue;
1356       }
1357
1358       /*
1359        * take the index'th value
1360        */
1361       String value = getAttributeValue(variant, key, index);
1362       if (value != null && isValid(variant, key, value))
1363       {
1364         /*
1365          * decode colon, semicolon, equals sign, percent sign, comma (only)
1366          * as required by the VCF specification (para 1.2)
1367          */
1368         value = StringUtils.urlDecode(value, VCF_ENCODABLE);
1369         addFeatureAttribute(sf, key, value);
1370       }
1371     }
1372   }
1373
1374   /**
1375    * Answers true for '.', null, or an empty value, or if the INFO type is
1376    * String. If the INFO type is Integer or Float, answers false if the value is
1377    * not in valid format.
1378    *
1379    * @param variant
1380    * @param infoId
1381    * @param value
1382    * @return
1383    */
1384   protected boolean isValid(VariantContext variant, String infoId,
1385           String value)
1386   {
1387     if (value == null || value.isEmpty() || NO_VALUE.equals(value))
1388     {
1389       return true;
1390     }
1391     VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(infoId);
1392     if (infoHeader == null)
1393     {
1394       Console.error("Field " + infoId + " has no INFO header");
1395       return false;
1396     }
1397     VCFHeaderLineType infoType = infoHeader.getType();
1398     try
1399     {
1400       if (infoType == VCFHeaderLineType.Integer)
1401       {
1402         Integer.parseInt(value);
1403       }
1404       else if (infoType == VCFHeaderLineType.Float)
1405       {
1406         Float.parseFloat(value);
1407       }
1408     } catch (NumberFormatException e)
1409     {
1410       logInvalidValue(variant, infoId, value);
1411       return false;
1412     }
1413     return true;
1414   }
1415
1416   /**
1417    * Logs an error message for malformed data; duplicate messages (same id and
1418    * value) are not logged
1419    *
1420    * @param variant
1421    * @param infoId
1422    * @param value
1423    */
1424   private void logInvalidValue(VariantContext variant, String infoId,
1425           String value)
1426   {
1427     if (badData == null)
1428     {
1429       badData = new HashSet<>();
1430     }
1431     String token = infoId + ":" + value;
1432     if (!badData.contains(token))
1433     {
1434       badData.add(token);
1435       Console.error(String.format("Invalid VCF data at %s:%d %s=%s",
1436               variant.getContig(), variant.getStart(), infoId, value));
1437     }
1438   }
1439
1440   /**
1441    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
1442    * feature.
1443    * <p>
1444    * If <code>myConsequence</code> is not null, then this is the specific
1445    * consequence data (pipe-delimited fields) that is for the current allele and
1446    * transcript (sequence) being processed)
1447    *
1448    * @param variant
1449    * @param sf
1450    * @param myConsequence
1451    */
1452   protected void addConsequences(VariantContext variant, SequenceFeature sf,
1453           String myConsequence)
1454   {
1455     Object value = variant.getAttribute(CSQ_FIELD);
1456
1457     if (value == null || !(value instanceof List<?>))
1458     {
1459       return;
1460     }
1461
1462     List<String> consequences = (List<String>) value;
1463
1464     /*
1465      * inspect CSQ consequences; restrict to the consequence
1466      * associated with the current transcript (Feature)
1467      */
1468     Map<String, String> csqValues = new HashMap<>();
1469
1470     for (String consequence : consequences)
1471     {
1472       if (myConsequence == null || myConsequence.equals(consequence))
1473       {
1474         String[] csqFields = consequence.split(PIPE_REGEX);
1475
1476         /*
1477          * inspect individual fields of this consequence, copying non-null
1478          * values which are 'fields of interest'
1479          */
1480         int i = 0;
1481         for (String field : csqFields)
1482         {
1483           if (field != null && field.length() > 0)
1484           {
1485             String id = vepFieldsOfInterest.get(i);
1486             if (id != null)
1487             {
1488               /*
1489                * VCF spec requires encoding of special characters e.g. '='
1490                * so decode them here before storing
1491                */
1492               field = StringUtils.urlDecode(field, VCF_ENCODABLE);
1493               csqValues.put(id, field);
1494             }
1495           }
1496           i++;
1497         }
1498       }
1499     }
1500
1501     if (!csqValues.isEmpty())
1502     {
1503       sf.setValue(CSQ_FIELD, csqValues);
1504     }
1505   }
1506
1507   /**
1508    * A convenience method to complement a dna base and return the string value
1509    * of its complement
1510    *
1511    * @param reference
1512    * @return
1513    */
1514   protected String complement(byte[] reference)
1515   {
1516     return String.valueOf(Dna.getComplement((char) reference[0]));
1517   }
1518
1519   /**
1520    * Determines the location of the query range (chromosome positions) in a
1521    * different reference assembly.
1522    * <p>
1523    * If the range is just a subregion of one for which we already have a mapping
1524    * (for example, an exon sub-region of a gene), then the mapping is just
1525    * computed arithmetically.
1526    * <p>
1527    * Otherwise, calls the Ensembl REST service that maps from one assembly
1528    * reference's coordinates to another's
1529    *
1530    * @param queryRange
1531    *          start-end chromosomal range in 'fromRef' coordinates
1532    * @param chromosome
1533    * @param species
1534    * @param fromRef
1535    *          assembly reference for the query coordinates
1536    * @param toRef
1537    *          assembly reference we wish to translate to
1538    * @return the start-end range in 'toRef' coordinates
1539    */
1540   protected int[] mapReferenceRange(int[] queryRange, String chromosome,
1541           String species, String fromRef, String toRef)
1542   {
1543     /*
1544      * first try shorcut of computing the mapping as a subregion of one
1545      * we already have (e.g. for an exon, if we have the gene mapping)
1546      */
1547     int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
1548             species, fromRef, toRef);
1549     if (mappedRange != null)
1550     {
1551       return mappedRange;
1552     }
1553
1554     /*
1555      * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
1556      */
1557     EnsemblMap mapper = new EnsemblMap();
1558     int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
1559             toRef, queryRange);
1560
1561     if (mapping == null)
1562     {
1563       // mapping service failure
1564       return null;
1565     }
1566
1567     /*
1568      * save mapping for possible future re-use
1569      */
1570     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1571     if (!assemblyMappings.containsKey(key))
1572     {
1573       assemblyMappings.put(key, new HashMap<int[], int[]>());
1574     }
1575
1576     assemblyMappings.get(key).put(queryRange, mapping);
1577
1578     return mapping;
1579   }
1580
1581   /**
1582    * If we already have a 1:1 contiguous mapping which subsumes the given query
1583    * range, this method just calculates and returns the subset of that mapping,
1584    * else it returns null. In practical terms, if a gene has a contiguous
1585    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
1586    * subsidiary exons occupy unchanged relative positions, and just compute
1587    * these as offsets, rather than do another lookup of the mapping.
1588    * <p>
1589    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
1590    * simply remove this method or let it always return null.
1591    * <p>
1592    * Warning: many rapid calls to the /map service map result in a 429 overload
1593    * error response
1594    *
1595    * @param queryRange
1596    * @param chromosome
1597    * @param species
1598    * @param fromRef
1599    * @param toRef
1600    * @return
1601    */
1602   protected int[] findSubsumedRangeMapping(int[] queryRange,
1603           String chromosome, String species, String fromRef, String toRef)
1604   {
1605     String key = makeRangesKey(chromosome, species, fromRef, toRef);
1606     if (assemblyMappings.containsKey(key))
1607     {
1608       Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
1609       for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
1610       {
1611         int[] fromRange = mappedRange.getKey();
1612         int[] toRange = mappedRange.getValue();
1613         if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
1614         {
1615           /*
1616            * mapping is 1:1 in length, so we trust it to have no discontinuities
1617            */
1618           if (MappingUtils.rangeContains(fromRange, queryRange))
1619           {
1620             /*
1621              * fromRange subsumes our query range
1622              */
1623             int offset = queryRange[0] - fromRange[0];
1624             int mappedRangeFrom = toRange[0] + offset;
1625             int mappedRangeTo = mappedRangeFrom
1626                     + (queryRange[1] - queryRange[0]);
1627             return new int[] { mappedRangeFrom, mappedRangeTo };
1628           }
1629         }
1630       }
1631     }
1632     return null;
1633   }
1634
1635   /**
1636    * Transfers the sequence feature to the target sequence, locating its start
1637    * and end range based on the mapping. Features which do not overlap the
1638    * target sequence are ignored.
1639    *
1640    * @param sf
1641    * @param targetSequence
1642    * @param mapping
1643    *          mapping from the feature's coordinates to the target sequence
1644    */
1645   protected void transferFeature(SequenceFeature sf,
1646           SequenceI targetSequence, MapList mapping)
1647   {
1648     int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
1649
1650     if (mappedRange != null)
1651     {
1652       String group = sf.getFeatureGroup();
1653       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
1654       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
1655       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
1656               group, sf.getScore());
1657       targetSequence.addSequenceFeature(copy);
1658     }
1659   }
1660
1661   /**
1662    * Formats a ranges map lookup key
1663    *
1664    * @param chromosome
1665    * @param species
1666    * @param fromRef
1667    * @param toRef
1668    * @return
1669    */
1670   protected static String makeRangesKey(String chromosome, String species,
1671           String fromRef, String toRef)
1672   {
1673     return species + EXCL + chromosome + EXCL + fromRef + EXCL + toRef;
1674   }
1675 }