src/jalview/io/vcf/VCFLoader.java

   1 package jalview.io.vcf;
   2
   3 import htsjdk.samtools.util.CloseableIterator;
   4 import htsjdk.variant.variantcontext.Allele;
   5 import htsjdk.variant.variantcontext.VariantContext;
   6 import htsjdk.variant.vcf.VCFHeader;
   7 import htsjdk.variant.vcf.VCFHeaderLine;
   8 import htsjdk.variant.vcf.VCFHeaderLineCount;
   9 import htsjdk.variant.vcf.VCFInfoHeaderLine;
  10
  11 import jalview.analysis.AlignmentUtils;
  12 import jalview.analysis.Dna;
  13 import jalview.api.AlignViewControllerGuiI;
  14 import jalview.datamodel.AlignmentI;
  15 import jalview.datamodel.DBRefEntry;
  16 import jalview.datamodel.GeneLociI;
  17 import jalview.datamodel.Mapping;
  18 import jalview.datamodel.SequenceFeature;
  19 import jalview.datamodel.SequenceI;
  20 import jalview.ext.ensembl.EnsemblMap;
  21 import jalview.ext.htsjdk.VCFReader;
  22 import jalview.io.gff.Gff3Helper;
  23 import jalview.io.gff.SequenceOntologyI;
  24 import jalview.util.MapList;
  25 import jalview.util.MappingUtils;
  26 import jalview.util.MessageManager;
  27
  28 import java.io.IOException;
  29 import java.util.ArrayList;
  30 import java.util.HashMap;
  31 import java.util.List;
  32 import java.util.Map;
  33 import java.util.Map.Entry;
  34
  35 /**
  36  * A class to read VCF data (using the htsjdk) and add variants as sequence
  37  * features on dna and any related protein product sequences
  38  *
  39  * @author gmcarstairs
  40  */
  41 public class VCFLoader
  42 {
  43   /*
  44    * keys to fields of VEP CSQ consequence data
  45    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
  46    */
  47   private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
  48
  49   private static final String FEATURE_KEY = "Feature"; // Ensembl stable id
  50
  51   /*
  52    * default VCF INFO key for VEP consequence data
  53    * NB this can be overridden running VEP with --vcf_info_field
  54    * - we don't handle this case (require CSQ identifier)
  55    */
  56   private static final String CSQ = "CSQ";
  57
  58   /*
  59    * separator for fields in consequence data
  60    */
  61   private static final String PIPE = "|";
  62
  63   private static final String PIPE_REGEX = "\\" + PIPE;
  64
  65   /*
  66    * key for Allele Frequency output by VEP
  67    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
  68    */
  69   private static final String ALLELE_FREQUENCY_KEY = "AF";
  70
  71   /*
  72    * delimiter that separates multiple consequence data blocks
  73    */
  74   private static final String COMMA = ",";
  75
  76   /*
  77    * (temporary) flag that determines whether Jalview adds one feature
  78    * per VCF record, or one per allele (preferred)
  79    */
  80   private static final boolean FEATURE_PER_ALLELE = true;
  81
  82   /*
  83    * the feature group assigned to a VCF variant in Jalview
  84    */
  85   private static final String FEATURE_GROUP_VCF = "VCF";
  86
  87   /*
  88    * internal delimiter used to build keys for assemblyMappings
  89    *
  90    */
  91   private static final String EXCL = "!";
  92
  93   /*
  94    * the alignment we are associating VCF data with
  95    */
  96   private AlignmentI al;
  97
  98   /*
  99    * mappings between VCF and sequence reference assembly regions, as
 100    * key = "species!chromosome!fromAssembly!toAssembly
 101    * value = Map{fromRange, toRange}
 102    */
 103   private Map<String, Map<int[], int[]>> assemblyMappings;
 104
 105   /*
 106    * holds details of the VCF header lines (metadata)
 107    */
 108   private VCFHeader header;
 109
 110   /*
 111    * the position (0...) of the ALLELE_NUM field in each block of
 112    * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
 113    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
 114    */
 115   private int csqAlleleNumberFieldIndex = -1;
 116
 117   private int csqFeatureFieldIndex = -1;
 118
 119   /**
 120    * Constructor given an alignment context
 121    *
 122    * @param alignment
 123    */
 124   public VCFLoader(AlignmentI alignment)
 125   {
 126     al = alignment;
 127
 128     // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
 129     assemblyMappings = new HashMap<String, Map<int[], int[]>>();
 130   }
 131
 132   /**
 133    * Starts a new thread to query and load VCF variant data on to the alignment
 134    * <p>
 135    * This method is not thread safe - concurrent threads should use separate
 136    * instances of this class.
 137    *
 138    * @param filePath
 139    * @param gui
 140    */
 141   public void loadVCF(final String filePath,
 142           final AlignViewControllerGuiI gui)
 143   {
 144     if (gui != null)
 145     {
 146       gui.setStatus(MessageManager.getString("label.searching_vcf"));
 147     }
 148
 149     new Thread()
 150     {
 151
 152       @Override
 153       public void run()
 154       {
 155         VCFLoader.this.doLoad(filePath, gui);
 156       }
 157
 158     }.start();
 159   }
 160
 161   /**
 162    * Loads VCF on to an alignment - provided it can be related to one or more
 163    * sequence's chromosomal coordinates
 164    *
 165    * @param filePath
 166    * @param gui
 167    *          optional callback handler for messages
 168    */
 169   protected void doLoad(String filePath, AlignViewControllerGuiI gui)
 170   {
 171     VCFReader reader = null;
 172     try
 173     {
 174       // long start = System.currentTimeMillis();
 175       reader = new VCFReader(filePath);
 176
 177       header = reader.getFileHeader();
 178       VCFHeaderLine ref = header
 179               .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
 180
 181       /*
 182        * note offset of CSQ ALLELE_NUM field if it is declared
 183        */
 184       findAlleleNumberFieldIndex();
 185
 186       // check if reference is wrt assembly19 (GRCh37)
 187       // todo may need to allow user to specify reference assembly?
 188       boolean isRefGrch37 = (ref != null && ref.getValue().contains(
 189               "assembly19"));
 190
 191       int varCount = 0;
 192       int seqCount = 0;
 193
 194       /*
 195        * query for VCF overlapping each sequence in turn
 196        */
 197       for (SequenceI seq : al.getSequences())
 198       {
 199         int added = loadVCF(seq, reader, isRefGrch37);
 200         if (added > 0)
 201         {
 202           seqCount++;
 203           varCount += added;
 204           transferAddedFeatures(seq);
 205         }
 206       }
 207       if (gui != null)
 208       {
 209         // long elapsed = System.currentTimeMillis() - start;
 210         String msg = MessageManager.formatMessage("label.added_vcf",
 211                 varCount, seqCount);
 212         gui.setStatus(msg);
 213         if (gui.getFeatureSettingsUI() != null)
 214         {
 215           gui.getFeatureSettingsUI().discoverAllFeatureData();
 216         }
 217       }
 218     } catch (Throwable e)
 219     {
 220       System.err.println("Error processing VCF: " + e.getMessage());
 221       e.printStackTrace();
 222       if (gui != null)
 223       {
 224         gui.setStatus("Error occurred - see console for details");
 225       }
 226     } finally
 227     {
 228       if (reader != null)
 229       {
 230         try
 231         {
 232           reader.close();
 233         } catch (IOException e)
 234         {
 235           // ignore
 236         }
 237       }
 238     }
 239   }
 240
 241   /**
 242    * If the CSQ INFO header declares that ALLELE_NUM is included in the data,
 243    * record its (pipe-delimited) offset in each (comma-delimited) consequence
 244    * block; CSQ fields are declared in the CSQ INFO Description e.g.
 245    * <p>
 246    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
 247    */
 248   protected void findAlleleNumberFieldIndex()
 249   {
 250     VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ);
 251     String desc = csqInfo.getDescription();
 252     if (desc != null)
 253     {
 254       String[] format = desc.split(PIPE_REGEX);
 255       int index = 0;
 256       for (String field : format)
 257       {
 258         if (ALLELE_NUM_KEY.equals(field))
 259         {
 260           csqAlleleNumberFieldIndex = index;
 261         }
 262         if (FEATURE_KEY.equals(field))
 263         {
 264           csqFeatureFieldIndex = index;
 265         }
 266         index++;
 267       }
 268     }
 269   }
 270
 271   /**
 272    * Transfers VCF features to sequences to which this sequence has a mapping.
 273    * If the mapping is 1:3, computes peptide variants from nucleotide variants.
 274    *
 275    * @param seq
 276    */
 277   protected void transferAddedFeatures(SequenceI seq)
 278   {
 279     DBRefEntry[] dbrefs = seq.getDBRefs();
 280     if (dbrefs == null)
 281     {
 282       return;
 283     }
 284     for (DBRefEntry dbref : dbrefs)
 285     {
 286       Mapping mapping = dbref.getMap();
 287       if (mapping == null || mapping.getTo() == null)
 288       {
 289         continue;
 290       }
 291
 292       SequenceI mapTo = mapping.getTo();
 293       MapList map = mapping.getMap();
 294       if (map.getFromRatio() == 3)
 295       {
 296         /*
 297          * dna-to-peptide product mapping
 298          */
 299         AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
 300       }
 301       else
 302       {
 303         /*
 304          * nucleotide-to-nucleotide mapping e.g. transcript to CDS
 305          */
 306         // TODO no DBRef to CDS is added to transcripts
 307         List<SequenceFeature> features = seq.getFeatures()
 308                 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
 309         for (SequenceFeature sf : features)
 310         {
 311           if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
 312           {
 313             transferFeature(sf, mapTo, map);
 314           }
 315         }
 316       }
 317     }
 318   }
 319
 320   /**
 321    * Tries to add overlapping variants read from a VCF file to the given
 322    * sequence, and returns the number of variant features added. Note that this
 323    * requires the sequence to hold information as to its chromosomal positions
 324    * and reference, in order to be able to map the VCF variants to the sequence.
 325    *
 326    * @param seq
 327    * @param reader
 328    * @param isVcfRefGrch37
 329    * @return
 330    */
 331   protected int loadVCF(SequenceI seq, VCFReader reader,
 332           boolean isVcfRefGrch37)
 333   {
 334     int count = 0;
 335     GeneLociI seqCoords = seq.getGeneLoci();
 336     if (seqCoords == null)
 337     {
 338       return 0;
 339     }
 340
 341     List<int[]> seqChromosomalContigs = seqCoords.getMap().getToRanges();
 342     for (int[] range : seqChromosomalContigs)
 343     {
 344       count += addVcfVariants(seq, reader, range, isVcfRefGrch37);
 345     }
 346
 347     return count;
 348   }
 349
 350   /**
 351    * Queries the VCF reader for any variants that overlap the given chromosome
 352    * region of the sequence, and adds as variant features. Returns the number of
 353    * overlapping variants found.
 354    *
 355    * @param seq
 356    * @param reader
 357    * @param range
 358    *          start-end range of a sequence region in its chromosomal
 359    *          coordinates
 360    * @param isVcfRefGrch37
 361    *          true if the VCF is with reference to GRCh37
 362    * @return
 363    */
 364   protected int addVcfVariants(SequenceI seq, VCFReader reader,
 365           int[] range, boolean isVcfRefGrch37)
 366   {
 367     GeneLociI seqCoords = seq.getGeneLoci();
 368
 369     String chromosome = seqCoords.getChromosomeId();
 370     String seqRef = seqCoords.getAssemblyId();
 371     String species = seqCoords.getSpeciesId();
 372
 373     /*
 374      * map chromosomal coordinates from GRCh38 (sequence) to
 375      * GRCh37 (VCF) if necessary
 376      */
 377     // TODO generalise for other assemblies and species
 378     int offset = 0;
 379     String fromRef = "GRCh38";
 380     if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37)
 381     {
 382       String toRef = "GRCh37";
 383       int[] newRange = mapReferenceRange(range, chromosome, "human",
 384               fromRef, toRef);
 385       if (newRange == null)
 386       {
 387         System.err.println(String.format(
 388                 "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome,
 389                 fromRef, range[0], range[1], toRef));
 390         return 0;
 391       }
 392       offset = newRange[0] - range[0];
 393       range = newRange;
 394     }
 395
 396     boolean forwardStrand = range[0] <= range[1];
 397
 398     /*
 399      * query the VCF for overlaps
 400      * (convert a reverse strand range to forwards)
 401      */
 402     int count = 0;
 403     MapList mapping = seqCoords.getMap();
 404
 405     int fromLocus = Math.min(range[0], range[1]);
 406     int toLocus = Math.max(range[0], range[1]);
 407     CloseableIterator<VariantContext> variants = reader.query(chromosome,
 408             fromLocus, toLocus);
 409     while (variants.hasNext())
 410     {
 411       /*
 412        * get variant location in sequence chromosomal coordinates
 413        */
 414       VariantContext variant = variants.next();
 415
 416       /*
 417        * we can only process SNP variants (which can be reported
 418        * as part of a MIXED variant record
 419        */
 420       if (!variant.isSNP() && !variant.isMixed())
 421       {
 422         continue;
 423       }
 424
 425       int start = variant.getStart() - offset;
 426       int end = variant.getEnd() - offset;
 427
 428       /*
 429        * convert chromosomal location to sequence coordinates
 430        * - null if a partially overlapping feature
 431        */
 432       int[] seqLocation = mapping.locateInFrom(start, end);
 433       if (seqLocation != null)
 434       {
 435         count += addVariantFeature(seq, variant, seqLocation[0],
 436                 seqLocation[1], forwardStrand);
 437       }
 438     }
 439
 440     variants.close();
 441
 442     return count;
 443   }
 444
 445   /**
 446    * Inspects the VCF variant record, and adds variant features to the sequence.
 447    * Only SNP variants are added, not INDELs. Returns the number of features
 448    * added.
 449    * <p>
 450    * If the sequence maps to the reverse strand of the chromosome, reference and
 451    * variant bases are recorded as their complements (C/G, A/T).
 452    *
 453    * @param seq
 454    * @param variant
 455    * @param featureStart
 456    * @param featureEnd
 457    * @param forwardStrand
 458    */
 459   protected int addVariantFeature(SequenceI seq, VariantContext variant,
 460           int featureStart, int featureEnd, boolean forwardStrand)
 461   {
 462     byte[] reference = variant.getReference().getBases();
 463     if (reference.length != 1)
 464     {
 465       /*
 466        * sorry, we don't handle INDEL variants
 467        */
 468       return 0;
 469     }
 470
 471     if (FEATURE_PER_ALLELE)
 472     {
 473       return addAlleleFeatures(seq, variant, featureStart, featureEnd,
 474               forwardStrand);
 475     }
 476
 477     /*
 478      * for now we extract allele frequency as feature score; note
 479      * this attribute is String for a simple SNP, but List<String> if
 480      * multiple alleles at the locus; we extract for the simple case only
 481      */
 482     float score = getAlleleFrequency(variant, 0);
 483
 484     StringBuilder sb = new StringBuilder();
 485     sb.append(forwardStrand ? (char) reference[0] : complement(reference));
 486
 487     /*
 488      * inspect alleles and record SNP variants (as the variant
 489      * record could be MIXED and include INDEL and SNP alleles)
 490      * warning: getAlleles gives no guarantee as to the order
 491      * in which they are returned
 492      */
 493     for (Allele allele : variant.getAlleles())
 494     {
 495       if (!allele.isReference())
 496       {
 497         byte[] alleleBase = allele.getBases();
 498         if (alleleBase.length == 1)
 499         {
 500           sb.append(COMMA).append(
 501                   forwardStrand ? (char) alleleBase[0]
 502                           : complement(alleleBase));
 503         }
 504       }
 505     }
 506     String alleles = sb.toString(); // e.g. G,A,C
 507
 508     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 509
 510     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
 511             featureEnd, score, FEATURE_GROUP_VCF);
 512
 513     sf.setValue(Gff3Helper.ALLELES, alleles);
 514
 515     Map<String, Object> atts = variant.getAttributes();
 516     for (Entry<String, Object> att : atts.entrySet())
 517     {
 518       sf.setValue(att.getKey(), att.getValue());
 519     }
 520     seq.addSequenceFeature(sf);
 521
 522     return 1;
 523   }
 524
 525   /**
 526    * A convenience method to get the AF value for the given alternate allele
 527    * index
 528    *
 529    * @param variant
 530    * @param alleleIndex
 531    * @return
 532    */
 533   protected float getAlleleFrequency(VariantContext variant, int alleleIndex)
 534   {
 535     float score = 0f;
 536     String attributeValue = getAttributeValue(variant,
 537             ALLELE_FREQUENCY_KEY, alleleIndex);
 538     if (attributeValue != null)
 539     {
 540       try
 541       {
 542         score = Float.parseFloat(attributeValue);
 543       } catch (NumberFormatException e)
 544       {
 545         // leave as 0
 546       }
 547     }
 548
 549     return score;
 550   }
 551
 552   /**
 553    * A convenience method to get an attribute value for an alternate allele
 554    *
 555    * @param variant
 556    * @param attributeName
 557    * @param alleleIndex
 558    * @return
 559    */
 560   protected String getAttributeValue(VariantContext variant,
 561           String attributeName, int alleleIndex)
 562   {
 563     Object att = variant.getAttribute(attributeName);
 564
 565     if (att instanceof String)
 566     {
 567       return (String) att;
 568     }
 569     else if (att instanceof ArrayList)
 570     {
 571       return ((List<String>) att).get(alleleIndex);
 572     }
 573
 574     return null;
 575   }
 576
 577   /**
 578    * Adds one variant feature for each SNP allele in the VCF variant record, and
 579    * returns the number of features added.
 580    *
 581    * @param seq
 582    * @param variant
 583    * @param featureStart
 584    * @param featureEnd
 585    * @param forwardStrand
 586    * @return
 587    */
 588   protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
 589           int featureStart, int featureEnd, boolean forwardStrand)
 590   {
 591     int added = 0;
 592
 593     /*
 594      * Javadoc says getAlternateAlleles() imposes no order on the list returned
 595      * so we proceed defensively to get them in strict order
 596      */
 597     int altAlleleCount = variant.getAlternateAlleles().size();
 598     for (int i = 0; i < altAlleleCount; i++)
 599     {
 600       added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
 601               forwardStrand);
 602     }
 603     return added;
 604   }
 605
 606   /**
 607    * Inspects one allele and attempts to add a variant feature for it to the
 608    * sequence. Only SNP variants are added as features. We extract as much as
 609    * possible of the additional data associated with this allele to store in the
 610    * feature's key-value map. Answers the number of features added (0 or 1).
 611    *
 612    * @param seq
 613    * @param variant
 614    * @param altAlleleIndex
 615    * @param featureStart
 616    * @param featureEnd
 617    * @param forwardStrand
 618    * @return
 619    */
 620   protected int addAlleleFeature(SequenceI seq, VariantContext variant,
 621           int altAlleleIndex, int featureStart, int featureEnd,
 622           boolean forwardStrand)
 623   {
 624     byte[] reference = variant.getReference().getBases();
 625     Allele alt = variant.getAlternateAllele(altAlleleIndex);
 626     byte[] allele = alt.getBases();
 627     if (allele.length != 1)
 628     {
 629       /*
 630        * not a SNP variant
 631        */
 632       return 0;
 633     }
 634
 635     /*
 636      * build the ref,alt allele description e.g. "G,A"
 637      */
 638     StringBuilder sb = new StringBuilder();
 639     sb.append(forwardStrand ? (char) reference[0] : complement(reference));
 640     sb.append(COMMA);
 641     sb.append(forwardStrand ? (char) allele[0] : complement(allele));
 642     String alleles = sb.toString(); // e.g. G,A
 643
 644     String type = SequenceOntologyI.SEQUENCE_VARIANT;
 645     float score = getAlleleFrequency(variant, altAlleleIndex);
 646
 647     SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
 648             featureEnd, score, FEATURE_GROUP_VCF);
 649
 650     sf.setValue(Gff3Helper.ALLELES, alleles);
 651
 652     addAlleleProperties(variant, seq, sf, altAlleleIndex);
 653
 654     seq.addSequenceFeature(sf);
 655
 656     return 1;
 657   }
 658
 659   /**
 660    * Add any allele-specific VCF key-value data to the sequence feature
 661    *
 662    * @param variant
 663    * @param seq
 664    * @param sf
 665    * @param altAlelleIndex
 666    */
 667   protected void addAlleleProperties(VariantContext variant,
 668  SequenceI seq,
 669           SequenceFeature sf, final int altAlelleIndex)
 670   {
 671     Map<String, Object> atts = variant.getAttributes();
 672
 673     for (Entry<String, Object> att : atts.entrySet())
 674     {
 675       String key = att.getKey();
 676
 677       /*
 678        * extract Consequence data (if present) that we are able to
 679        * associated with the allele for this variant feature
 680        */
 681       if (CSQ.equals(key) && csqAlleleNumberFieldIndex > -1)
 682       {
 683         addConsequences(att.getValue(), seq, sf, altAlelleIndex + 1);
 684         return;
 685       }
 686
 687       /*
 688        * we extract values for other data which are allele-specific;
 689        * these may be per alternate allele (INFO[key].Number = 'A')
 690        * or per allele including reference (INFO[key].Number = 'R')
 691        */
 692       VCFHeaderLineCount number = header.getInfoHeaderLine(key)
 693               .getCountType();
 694       int index = altAlelleIndex;
 695       if (number == VCFHeaderLineCount.R)
 696       {
 697         /*
 698          * one value per allele including reference, so bump index
 699          * e.g. the 3rd value is for the  2nd alternate allele
 700          */
 701         index++;
 702       }
 703       else if (number != VCFHeaderLineCount.A)
 704       {
 705         /*
 706          * don't save other values as not allele-related
 707          */
 708         continue;
 709       }
 710
 711       /*
 712        * take the index'th value
 713        */
 714       String value = getAttributeValue(variant, key, index);
 715       if (value != null)
 716       {
 717         sf.setValue(key, value);
 718       }
 719     }
 720   }
 721
 722   /**
 723    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
 724    * feature for the current allele (and transcript if applicable)
 725    * <p>
 726    * Allele matching: we require field ALLELE_NUM to match altAlleleIndex. If
 727    * the CSQ data does not include ALLELE_NUM values then no data is added to
 728    * the variant feature.
 729    * <p>
 730    * Transcript matching: if sequence name can be identified to at least one of
 731    * the consequences' Feature values, then select only consequences that match
 732    * the value (i.e. consequences for the current transcript sequence). If not,
 733    * take all consequences (this is the case when adding features to the gene
 734    * sequence).
 735    *
 736    * @param value
 737    * @param seq
 738    * @param sf
 739    * @param altAlelleIndex
 740    *          (1=first alternative allele...)
 741    */
 742   protected void addConsequences(Object value, SequenceI seq,
 743           SequenceFeature sf,
 744           int altAlelleIndex)
 745   {
 746     if (!(value instanceof ArrayList<?>))
 747     {
 748       return;
 749     }
 750
 751     List<String> consequences = (List<String>) value;
 752
 753     /*
 754      * if CSQ data includes 'Feature', and any value matches the sequence name,
 755      * then restrict consequence data to the matching value (transcript)
 756      * i.e. just pick out consequences for the transcript the variant feature is on
 757      */
 758     String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase();
 759     boolean matchFeature = false;
 760     String matchFeatureValue = null;
 761     if (csqFeatureFieldIndex > -1)
 762     {
 763       for (String consequence : consequences)
 764       {
 765         String[] csqFields = consequence.split(PIPE_REGEX);
 766         if (csqFields.length > csqFeatureFieldIndex)
 767         {
 768           String featureIdentifier = csqFields[csqFeatureFieldIndex];
 769           if (featureIdentifier.length() > 4
 770                   && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
 771           {
 772             matchFeature = true;
 773             matchFeatureValue = featureIdentifier;
 774           }
 775         }
 776       }
 777     }
 778
 779     StringBuilder sb = new StringBuilder(128);
 780     boolean found = false;
 781
 782     for (String consequence : consequences)
 783     {
 784       String[] csqFields = consequence.split(PIPE_REGEX);
 785
 786       /*
 787        * check consequence is for the current transcript
 788        */
 789       if (matchFeature)
 790       {
 791         if (csqFields.length <= csqFeatureFieldIndex)
 792         {
 793           continue;
 794         }
 795         String featureIdentifier = csqFields[csqFeatureFieldIndex];
 796         if (!featureIdentifier.equals(matchFeatureValue))
 797         {
 798           continue; // consequence is for a different transcript
 799         }
 800       }
 801
 802       if (csqFields.length > csqAlleleNumberFieldIndex)
 803       {
 804         String alleleNum = csqFields[csqAlleleNumberFieldIndex];
 805         if (String.valueOf(altAlelleIndex).equals(alleleNum))
 806         {
 807           if (found)
 808           {
 809             sb.append(COMMA);
 810           }
 811           found = true;
 812           sb.append(consequence);
 813         }
 814       }
 815     }
 816
 817     if (found)
 818     {
 819       sf.setValue(CSQ, sb.toString());
 820     }
 821   }
 822
 823   /**
 824    * A convenience method to complement a dna base and return the string value
 825    * of its complement
 826    *
 827    * @param reference
 828    * @return
 829    */
 830   protected String complement(byte[] reference)
 831   {
 832     return String.valueOf(Dna.getComplement((char) reference[0]));
 833   }
 834
 835   /**
 836    * Determines the location of the query range (chromosome positions) in a
 837    * different reference assembly.
 838    * <p>
 839    * If the range is just a subregion of one for which we already have a mapping
 840    * (for example, an exon sub-region of a gene), then the mapping is just
 841    * computed arithmetically.
 842    * <p>
 843    * Otherwise, calls the Ensembl REST service that maps from one assembly
 844    * reference's coordinates to another's
 845    *
 846    * @param queryRange
 847    *          start-end chromosomal range in 'fromRef' coordinates
 848    * @param chromosome
 849    * @param species
 850    * @param fromRef
 851    *          assembly reference for the query coordinates
 852    * @param toRef
 853    *          assembly reference we wish to translate to
 854    * @return the start-end range in 'toRef' coordinates
 855    */
 856   protected int[] mapReferenceRange(int[] queryRange, String chromosome,
 857           String species, String fromRef, String toRef)
 858   {
 859     /*
 860      * first try shorcut of computing the mapping as a subregion of one
 861      * we already have (e.g. for an exon, if we have the gene mapping)
 862      */
 863     int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
 864             species, fromRef, toRef);
 865     if (mappedRange != null)
 866     {
 867       return mappedRange;
 868     }
 869
 870     /*
 871      * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
 872      */
 873     EnsemblMap mapper = new EnsemblMap();
 874     int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef,
 875             queryRange);
 876
 877     if (mapping == null)
 878     {
 879       // mapping service failure
 880       return null;
 881     }
 882
 883     /*
 884      * save mapping for possible future re-use
 885      */
 886     String key = makeRangesKey(chromosome, species, fromRef, toRef);
 887     if (!assemblyMappings.containsKey(key))
 888     {
 889       assemblyMappings.put(key, new HashMap<int[], int[]>());
 890     }
 891
 892     assemblyMappings.get(key).put(queryRange, mapping);
 893
 894     return mapping;
 895   }
 896
 897   /**
 898    * If we already have a 1:1 contiguous mapping which subsumes the given query
 899    * range, this method just calculates and returns the subset of that mapping,
 900    * else it returns null. In practical terms, if a gene has a contiguous
 901    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
 902    * subsidiary exons occupy unchanged relative positions, and just compute
 903    * these as offsets, rather than do another lookup of the mapping.
 904    * <p>
 905    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
 906    * simply remove this method or let it always return null.
 907    * <p>
 908    * Warning: many rapid calls to the /map service map result in a 429 overload
 909    * error response
 910    *
 911    * @param queryRange
 912    * @param chromosome
 913    * @param species
 914    * @param fromRef
 915    * @param toRef
 916    * @return
 917    */
 918   protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
 919           String species, String fromRef, String toRef)
 920   {
 921     String key = makeRangesKey(chromosome, species, fromRef, toRef);
 922     if (assemblyMappings.containsKey(key))
 923     {
 924       Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
 925       for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
 926       {
 927         int[] fromRange = mappedRange.getKey();
 928         int[] toRange = mappedRange.getValue();
 929         if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
 930         {
 931           /*
 932            * mapping is 1:1 in length, so we trust it to have no discontinuities
 933            */
 934           if (MappingUtils.rangeContains(fromRange, queryRange))
 935           {
 936             /*
 937              * fromRange subsumes our query range
 938              */
 939             int offset = queryRange[0] - fromRange[0];
 940             int mappedRangeFrom = toRange[0] + offset;
 941             int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
 942             return new int[] { mappedRangeFrom, mappedRangeTo };
 943           }
 944         }
 945       }
 946     }
 947     return null;
 948   }
 949
 950   /**
 951    * Transfers the sequence feature to the target sequence, locating its start
 952    * and end range based on the mapping. Features which do not overlap the
 953    * target sequence are ignored.
 954    *
 955    * @param sf
 956    * @param targetSequence
 957    * @param mapping
 958    *          mapping from the feature's coordinates to the target sequence
 959    */
 960   protected void transferFeature(SequenceFeature sf,
 961           SequenceI targetSequence, MapList mapping)
 962   {
 963     int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
 964
 965     if (mappedRange != null)
 966     {
 967       String group = sf.getFeatureGroup();
 968       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
 969       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
 970       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
 971               group, sf.getScore());
 972       targetSequence.addSequenceFeature(copy);
 973     }
 974   }
 975
 976   /**
 977    * Formats a ranges map lookup key
 978    *
 979    * @param chromosome
 980    * @param species
 981    * @param fromRef
 982    * @param toRef
 983    * @return
 984    */
 985   protected static String makeRangesKey(String chromosome, String species,
 986           String fromRef, String toRef)
 987   {
 988     return species + EXCL + chromosome + EXCL + fromRef + EXCL
 989             + toRef;
 990   }
 991 }