src/jalview/ext/ensembl/EnsemblSeqProxy.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ext.ensembl;
  22
  23 import java.io.IOException;
  24 import java.net.MalformedURLException;
  25 import java.net.URL;
  26 import java.util.ArrayList;
  27 import java.util.Arrays;
  28 import java.util.Collections;
  29 import java.util.List;
  30 import java.util.Map;
  31
  32 import org.json.simple.parser.ParseException;
  33
  34 import jalview.analysis.AlignmentUtils;
  35 import jalview.analysis.Dna;
  36 import jalview.bin.Cache;
  37 import jalview.datamodel.Alignment;
  38 import jalview.datamodel.AlignmentI;
  39 import jalview.datamodel.DBRefEntry;
  40 import jalview.datamodel.DBRefSource;
  41 import jalview.datamodel.Mapping;
  42 import jalview.datamodel.Sequence;
  43 import jalview.datamodel.SequenceFeature;
  44 import jalview.datamodel.SequenceI;
  45 import jalview.datamodel.features.SequenceFeatures;
  46 import jalview.exceptions.JalviewException;
  47 import jalview.io.gff.Gff3Helper;
  48 import jalview.io.gff.SequenceOntologyFactory;
  49 import jalview.io.gff.SequenceOntologyI;
  50 import jalview.util.Comparison;
  51 import jalview.util.DBRefUtils;
  52 import jalview.util.IntRangeComparator;
  53 import jalview.util.MapList;
  54
  55 /**
  56  * Base class for Ensembl sequence fetchers
  57  *
  58  * @see http://rest.ensembl.org/documentation/info/sequence_id
  59  * @author gmcarstairs
  60  */
  61 public abstract class EnsemblSeqProxy extends EnsemblRestClient
  62 {
  63   protected static final String DESCRIPTION = "description";
  64
  65   /*
  66    * enum for 'type' parameter to the /sequence REST service
  67    */
  68   public enum EnsemblSeqType
  69   {
  70     /**
  71      * type=genomic to fetch full dna including introns
  72      */
  73     GENOMIC("genomic"),
  74
  75     /**
  76      * type=cdna to fetch coding dna including UTRs
  77      */
  78     CDNA("cdna"),
  79
  80     /**
  81      * type=cds to fetch coding dna excluding UTRs
  82      */
  83     CDS("cds"),
  84
  85     /**
  86      * type=protein to fetch peptide product sequence
  87      */
  88     PROTEIN("protein");
  89
  90     /*
  91      * the value of the 'type' parameter to fetch this version of
  92      * an Ensembl sequence
  93      */
  94     private String type;
  95
  96     EnsemblSeqType(String t)
  97     {
  98       type = t;
  99     }
 100
 101     public String getType()
 102     {
 103       return type;
 104     }
 105
 106   }
 107
 108   /**
 109    * Default constructor (to use rest.ensembl.org)
 110    */
 111   public EnsemblSeqProxy()
 112   {
 113     super();
 114   }
 115
 116   /**
 117    * Constructor given the target domain to fetch data from
 118    */
 119   public EnsemblSeqProxy(String d)
 120   {
 121     super(d);
 122   }
 123
 124   /**
 125    * Makes the sequence queries to Ensembl's REST service and returns an
 126    * alignment consisting of the returned sequences.
 127    */
 128   @Override
 129   public AlignmentI getSequenceRecords(String query) throws Exception
 130   {
 131     // TODO use a String... query vararg instead?
 132
 133     // danger: accession separator used as a regex here, a string elsewhere
 134     // in this case it is ok (it is just a space), but (e.g.) '\' would not be
 135     List<String> allIds = Arrays
 136             .asList(query.split(getAccessionSeparator()));
 137     AlignmentI alignment = null;
 138     inProgress = true;
 139
 140     /*
 141      * execute queries, if necessary in batches of the
 142      * maximum allowed number of ids
 143      */
 144     int maxQueryCount = getMaximumQueryCount();
 145     for (int v = 0, vSize = allIds.size(); v < vSize; v += maxQueryCount)
 146     {
 147       int p = Math.min(vSize, v + maxQueryCount);
 148       List<String> ids = allIds.subList(v, p);
 149       try
 150       {
 151         alignment = fetchSequences(ids, alignment);
 152       } catch (Throwable r)
 153       {
 154         inProgress = false;
 155         String msg = "Aborting ID retrieval after " + v
 156                 + " chunks. Unexpected problem (" + r.getLocalizedMessage()
 157                 + ")";
 158         System.err.println(msg);
 159         r.printStackTrace();
 160         break;
 161       }
 162     }
 163
 164     if (alignment == null)
 165     {
 166       return null;
 167     }
 168
 169     /*
 170      * fetch and transfer genomic sequence features,
 171      * fetch protein product and add as cross-reference
 172      */
 173     for (int i = 0, n = allIds.size(); i < n; i++)
 174     {
 175       addFeaturesAndProduct(allIds.get(i), alignment);
 176     }
 177
 178     List<SequenceI> seqs = alignment.getSequences();
 179     for (int i = 0, n = seqs.size(); i < n; i++)
 180     {
 181       getCrossReferences(seqs.get(i));
 182     }
 183
 184     return alignment;
 185   }
 186
 187   /**
 188    * Fetches Ensembl features using the /overlap REST endpoint, and adds them to
 189    * the sequence in the alignment. Also fetches the protein product, maps it
 190    * from the CDS features of the sequence, and saves it as a cross-reference of
 191    * the dna sequence.
 192    *
 193    * @param accId
 194    * @param alignment
 195    */
 196   protected void addFeaturesAndProduct(String accId, AlignmentI alignment)
 197   {
 198     if (alignment == null)
 199     {
 200       return;
 201     }
 202
 203     try
 204     {
 205       /*
 206        * get 'dummy' genomic sequence with gene, transcript,
 207        * exon, cds and variation features
 208        */
 209       SequenceI genomicSequence = null;
 210       EnsemblFeatures gffFetcher = new EnsemblFeatures(getDomain());
 211       EnsemblFeatureType[] features = getFeaturesToFetch();
 212
 213       // Platform.timeCheck("ESP.getsequencerec1", Platform.TIME_MARK);
 214
 215       AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
 216               features);
 217       if (geneFeatures != null && geneFeatures.getHeight() > 0)
 218       {
 219         genomicSequence = geneFeatures.getSequenceAt(0);
 220       }
 221
 222       // Platform.timeCheck("ESP.getsequencerec2", Platform.TIME_MARK);
 223
 224       if (genomicSequence != null)
 225       {
 226         /*
 227          * transfer features to the query sequence
 228          */
 229         SequenceI querySeq = alignment.findName(accId, true);
 230         if (transferFeatures(accId, genomicSequence, querySeq))
 231         {
 232
 233           /*
 234            * fetch and map protein product, and add it as a cross-reference
 235            * of the retrieved sequence
 236            */
 237           // Platform.timeCheck("ESP.transferFeatures", Platform.TIME_MARK);
 238           addProteinProduct(querySeq);
 239         }
 240       }
 241     } catch (IOException e)
 242     {
 243       System.err.println(
 244               "Error transferring Ensembl features: " + e.getMessage());
 245     }
 246     // Platform.timeCheck("ESP.addfeat done", Platform.TIME_MARK);
 247   }
 248
 249   /**
 250    * Returns those sequence feature types to fetch from Ensembl. We may want
 251    * features either because they are of interest to the user, or as means to
 252    * identify the locations of the sequence on the genomic sequence (CDS
 253    * features identify CDS, exon features identify cDNA etc).
 254    *
 255    * @return
 256    */
 257   protected abstract EnsemblFeatureType[] getFeaturesToFetch();
 258
 259   /**
 260    * Fetches and maps the protein product, and adds it as a cross-reference of
 261    * the retrieved sequence
 262    */
 263   protected void addProteinProduct(SequenceI querySeq)
 264   {
 265     String accId = querySeq.getName();
 266     try
 267     {
 268       System.out.println("Adding protein product for " + accId);
 269       AlignmentI protein = new EnsemblProtein(getDomain())
 270               .getSequenceRecords(accId);
 271       if (protein == null || protein.getHeight() == 0)
 272       {
 273         System.out.println("No protein product found for " + accId);
 274         return;
 275       }
 276       SequenceI proteinSeq = protein.getSequenceAt(0);
 277
 278       /*
 279        * need dataset sequences (to be the subject of mappings)
 280        */
 281       proteinSeq.createDatasetSequence();
 282       querySeq.createDatasetSequence();
 283
 284       MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq,
 285               proteinSeq);
 286       if (mapList != null)
 287       {
 288         // clunky: ensure Uniprot xref if we have one is on mapped sequence
 289         SequenceI ds = proteinSeq.getDatasetSequence();
 290         // TODO: Verify ensp primary ref is on proteinSeq.getDatasetSequence()
 291         Mapping map = new Mapping(ds, mapList);
 292         DBRefEntry dbr = new DBRefEntry(getDbSource(),
 293                 getEnsemblDataVersion(), proteinSeq.getName(), map);
 294         querySeq.getDatasetSequence().addDBRef(dbr);
 295         List<DBRefEntry> uprots = DBRefUtils.selectRefs(ds.getDBRefs(),
 296                 new String[]
 297                 { DBRefSource.UNIPROT });
 298         List<DBRefEntry> upxrefs = DBRefUtils.selectRefs(querySeq.getDBRefs(),
 299                 new String[]
 300                 { DBRefSource.UNIPROT });
 301         if (uprots != null)
 302         {
 303           for (DBRefEntry up : uprots)
 304           {
 305             // locate local uniprot ref and map
 306             List<DBRefEntry> upx = DBRefUtils.searchRefs(upxrefs,
 307                     up.getAccessionId());
 308             DBRefEntry upxref;
 309             if (upx.size() != 0)
 310             {
 311               upxref = upx.get(0);
 312
 313               if (upx.size() > 1)
 314               {
 315                 Cache.warn(
 316                         "Implementation issue - multiple uniprot acc on product sequence.");
 317               }
 318             }
 319             else
 320             {
 321               upxref = new DBRefEntry(DBRefSource.UNIPROT,
 322                       getEnsemblDataVersion(), up.getAccessionId());
 323             }
 324
 325             Mapping newMap = new Mapping(ds, mapList);
 326             upxref.setVersion(getEnsemblDataVersion());
 327             upxref.setMap(newMap);
 328             if (upx.size() == 0)
 329             {
 330               // add the new uniprot ref
 331               querySeq.getDatasetSequence().addDBRef(upxref);
 332             }
 333
 334           }
 335         }
 336
 337         /*
 338          * copy exon features to protein, compute peptide variants from dna
 339          * variants and add as features on the protein sequence ta-da
 340          */
 341         // JAL-3187 render on the fly instead
 342         // AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList);
 343       }
 344     } catch (Exception e)
 345     {
 346       System.err
 347               .println(String.format("Error retrieving protein for %s: %s",
 348                       accId, e.getMessage()));
 349     }
 350   }
 351
 352   /**
 353    * Get database xrefs from Ensembl, and attach them to the sequence
 354    *
 355    * @param seq
 356    */
 357   protected void getCrossReferences(SequenceI seq)
 358   {
 359
 360     // Platform.timeCheck("ESP. getdataseq ", Platform.TIME_MARK);
 361
 362
 363     while (seq.getDatasetSequence() != null)
 364     {
 365       seq = seq.getDatasetSequence();
 366     }
 367
 368     // Platform.timeCheck("ESP. getxref ", Platform.TIME_MARK);
 369
 370     EnsemblXref xrefFetcher = new EnsemblXref(getDomain(), getDbSource(),
 371             getEnsemblDataVersion());
 372     List<DBRefEntry> xrefs = xrefFetcher.getCrossReferences(seq.getName());
 373
 374     for (int i = 0, n = xrefs.size(); i < n; i++)
 375     {
 376 //        Platform.timeCheck("ESP. getxref + " + (i) + "/" + n, Platform.TIME_MARK);
 377         // BH 2019.01.25 this next method was taking 174 ms PER addition for a 266-reference example.
 378         //    DBRefUtils.ensurePrimaries(seq)
 379         //        was at the end of seq.addDBRef, so executed after ever addition!
 380         //        This method was moved to     seq.getPrimaryDBRefs()
 381       seq.addDBRef(xrefs.get(i));
 382     }
 383
 384 //    System.out.println("primaries are " + seq.getPrimaryDBRefs().toString());
 385     /*
 386      * and add a reference to itself
 387      */
 388
 389 //    Platform.timeCheck("ESP. getxref self ", Platform.TIME_MARK);
 390
 391     DBRefEntry self = new DBRefEntry(getDbSource(), getEnsemblDataVersion(),
 392     seq.getName());
 393
 394 //    Platform.timeCheck("ESP. getxref self add ", Platform.TIME_MARK);
 395
 396     seq.addDBRef(self);
 397
 398     // Platform.timeCheck("ESP. seqprox done ", Platform.TIME_MARK);
 399   }
 400
 401   /**
 402    * Fetches sequences for the list of accession ids and adds them to the
 403    * alignment. Returns the extended (or created) alignment.
 404    *
 405    * @param ids
 406    * @param alignment
 407    * @return
 408    * @throws JalviewException
 409    * @throws IOException
 410    */
 411   protected AlignmentI fetchSequences(List<String> ids,
 412           AlignmentI alignment) throws JalviewException, IOException
 413   {
 414     if (!isEnsemblAvailable())
 415     {
 416       inProgress = false;
 417       throw new JalviewException("ENSEMBL Rest API not available.");
 418     }
 419     // Platform.timeCheck("EnsemblSeqProx.fetchSeq ", Platform.TIME_MARK);
 420
 421     List<SequenceI> seqs = parseSequenceJson(ids);
 422     if (seqs == null)
 423         return alignment;
 424
 425     if (seqs.isEmpty())
 426     {
 427       throw new IOException("No data returned for " + ids);
 428     }
 429
 430     if (seqs.size() != ids.size())
 431     {
 432       System.out.println(String.format(
 433               "Only retrieved %d sequences for %d query strings",
 434               seqs.size(), ids.size()));
 435     }
 436
 437     if (!seqs.isEmpty())
 438     {
 439       AlignmentI seqal = new Alignment(
 440               seqs.toArray(new SequenceI[seqs.size()]));
 441       for (SequenceI seq : seqs)
 442       {
 443         if (seq.getDescription() == null)
 444         {
 445           seq.setDescription(getDbName());
 446         }
 447         String name = seq.getName();
 448         if (ids.contains(name)
 449                 || ids.contains(name.replace("ENSP", "ENST")))
 450         {
 451           // TODO JAL-3077 use true accession version in dbref
 452           DBRefEntry dbref = DBRefUtils.parseToDbRef(seq, getDbSource(),
 453                   getEnsemblDataVersion(), name);
 454           seq.addDBRef(dbref);
 455         }
 456       }
 457       if (alignment == null)
 458       {
 459         alignment = seqal;
 460       }
 461       else
 462       {
 463         alignment.append(seqal);
 464       }
 465     }
 466     return alignment;
 467   }
 468
 469   /**
 470    * Parses a JSON response for a single sequence ID query
 471    *
 472    * @param br
 473    * @return a single jalview.datamodel.Sequence
 474    * @see http://rest.ensembl.org/documentation/info/sequence_id
 475    */
 476   @SuppressWarnings("unchecked")
 477   protected List<SequenceI> parseSequenceJson(List<String> ids)
 478   {
 479     List<SequenceI> result = new ArrayList<>();
 480     try
 481     {
 482       /*
 483        * for now, assumes only one sequence returned; refactor if needed
 484        * in future to handle a JSONArray with more than one
 485        */
 486       // Platform.timeCheck("ENS seqproxy", Platform.TIME_MARK);
 487       Map<String, Object> val = (Map<String, Object>) getJSON(null, ids, -1, MODE_MAP, null);
 488       if (val == null)
 489           return null;
 490       Object s = val.get("desc");
 491       String desc = s == null ? null : s.toString();
 492       s = val.get("id");
 493       String id = s == null ? null : s.toString();
 494       s = val.get("seq");
 495       String seq = s == null ? null : s.toString();
 496       Sequence sequence = new Sequence(id, seq);
 497       if (desc != null)
 498       {
 499         sequence.setDescription(desc);
 500       }
 501       // todo JAL-3077 make a DBRefEntry with true accession version
 502       // s = val.get("version");
 503       // String version = s == null ? "0" : s.toString();
 504       // DBRefEntry dbref = new DBRefEntry(getDbSource(), version, id);
 505       // sequence.addDBRef(dbref);
 506       result.add(sequence);
 507     } catch (ParseException | IOException e)
 508     {
 509       System.err.println("Error processing JSON response: " + e.toString());
 510       // ignore
 511     }
 512     // Platform.timeCheck("ENS seqproxy2", Platform.TIME_MARK);
 513     return result;
 514   }
 515
 516   /**
 517    * Returns the URL for the REST call
 518    *
 519    * @return
 520    * @throws MalformedURLException
 521    */
 522   @Override
 523   protected URL getUrl(List<String> ids) throws MalformedURLException
 524   {
 525     /*
 526      * a single id is included in the URL path
 527      * multiple ids go in the POST body instead
 528      */
 529     StringBuffer urlstring = new StringBuffer(128);
 530     urlstring.append(getDomain() + "/sequence/id");
 531     if (ids.size() == 1)
 532     {
 533       urlstring.append("/").append(ids.get(0));
 534     }
 535     // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
 536     urlstring.append("?type=").append(getSourceEnsemblType().getType());
 537     urlstring.append(("&Accept=application/json"));
 538     urlstring.append(("&content-type=application/json"));
 539
 540     String objectType = getObjectType();
 541     if (objectType != null)
 542     {
 543       urlstring.append("&").append(OBJECT_TYPE).append("=")
 544               .append(objectType);
 545     }
 546
 547     URL url = new URL(urlstring.toString());
 548     return url;
 549   }
 550
 551   /**
 552    * Override this method to specify object_type request parameter
 553    *
 554    * @return
 555    */
 556   protected String getObjectType()
 557   {
 558     return null;
 559   }
 560
 561   /**
 562    * A sequence/id POST request currently allows up to 50 queries
 563    *
 564    * @see http://rest.ensembl.org/documentation/info/sequence_id_post
 565    */
 566   @Override
 567   public int getMaximumQueryCount()
 568   {
 569     return 50;
 570   }
 571
 572   @Override
 573   protected boolean useGetRequest()
 574   {
 575     return false;
 576   }
 577
 578   /**
 579    *
 580    * @return the configured sequence return type for this source
 581    */
 582   protected abstract EnsemblSeqType getSourceEnsemblType();
 583
 584   /**
 585    * Returns a list of [start, end] genomic ranges corresponding to the sequence
 586    * being retrieved.
 587    *
 588    * The correspondence between the frames of reference is made by locating
 589    * those features on the genomic sequence which identify the retrieved
 590    * sequence. Specifically
 591    * <ul>
 592    * <li>genomic sequence is identified by "transcript" features with
 593    * ID=transcript:transcriptId</li>
 594    * <li>cdna sequence is identified by "exon" features with
 595    * Parent=transcript:transcriptId</li>
 596    * <li>cds sequence is identified by "CDS" features with
 597    * Parent=transcript:transcriptId</li>
 598    * </ul>
 599    *
 600    * The returned ranges are sorted to run forwards (for positive strand) or
 601    * backwards (for negative strand). Aborts and returns null if both positive
 602    * and negative strand are found (this should not normally happen).
 603    *
 604    * @param sourceSequence
 605    * @param accId
 606    * @param start
 607    *          the start position of the sequence we are mapping to
 608    * @return
 609    */
 610   protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence,
 611           String accId, int start)
 612   {
 613     List<SequenceFeature> sfs = getIdentifyingFeatures(sourceSequence,
 614             accId);
 615     if (sfs.isEmpty())
 616     {
 617       return null;
 618     }
 619
 620     /*
 621      * generously initial size for number of cds regions
 622      * (worst case titin Q8WZ42 has c. 313 exons)
 623      */
 624     List<int[]> regions = new ArrayList<>(100);
 625     int mappedLength = 0;
 626     int direction = 1; // forward
 627     boolean directionSet = false;
 628
 629     for (SequenceFeature sf : sfs)
 630     {
 631       int strand = sf.getStrand();
 632       strand = strand == 0 ? 1 : strand; // treat unknown as forward
 633
 634       if (directionSet && strand != direction)
 635       {
 636         // abort - mix of forward and backward
 637         System.err
 638                 .println("Error: forward and backward strand for " + accId);
 639         return null;
 640       }
 641       direction = strand;
 642       directionSet = true;
 643
 644       /*
 645        * add to CDS ranges, semi-sorted forwards/backwards
 646        */
 647       if (strand < 0)
 648       {
 649         regions.add(0, new int[] { sf.getEnd(), sf.getBegin() });
 650       }
 651       else
 652       {
 653         regions.add(new int[] { sf.getBegin(), sf.getEnd() });
 654       }
 655       mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1);
 656     }
 657
 658     if (regions.isEmpty())
 659     {
 660       System.out.println("Failed to identify target sequence for " + accId
 661               + " from genomic features");
 662       return null;
 663     }
 664
 665     /*
 666      * a final sort is needed since Ensembl returns CDS sorted within source
 667      * (havana / ensembl_havana)
 668      */
 669     Collections.sort(regions, direction == 1 ? IntRangeComparator.ASCENDING
 670             : IntRangeComparator.DESCENDING);
 671
 672     List<int[]> to = Arrays
 673             .asList(new int[]
 674             { start, start + mappedLength - 1 });
 675
 676     return new MapList(regions, to, 1, 1);
 677   }
 678
 679   /**
 680    * Answers a list of sequence features that mark positions of the genomic
 681    * sequence feature which are within the sequence being retrieved. For
 682    * example, an 'exon' feature whose parent is the target transcript marks the
 683    * cdna positions of the transcript. For a gene sequence, this is trivially
 684    * just the 'gene' feature with matching gene id.
 685    *
 686    * @param seq
 687    * @param accId
 688    * @return
 689    */
 690   protected abstract List<SequenceFeature> getIdentifyingFeatures(
 691           SequenceI seq, String accId);
 692
 693   int bhtest = 0;
 694
 695   /**
 696    * Transfers the sequence feature to the target sequence, locating its start
 697    * and end range based on the mapping. Features which do not overlap the
 698    * target sequence are ignored.
 699    *
 700    * @param sf
 701    * @param targetSequence
 702    * @param mapping
 703    *          mapping from the sequence feature's coordinates to the target
 704    *          sequence
 705    * @param forwardStrand
 706    */
 707   protected void transferFeature(SequenceFeature sf,
 708           SequenceI targetSequence, MapList mapping, boolean forwardStrand)
 709   {
 710     int start = sf.getBegin();
 711     int end = sf.getEnd();
 712     int[] mappedRange = mapping.locateInTo(start, end);
 713
 714     if (mappedRange != null)
 715     {
 716 //      Platform.timeCheck(null, Platform.TIME_SET);
 717       String group = sf.getFeatureGroup();
 718       if (".".equals(group))
 719       {
 720         group = getDbSource();
 721       }
 722       int newBegin = Math.min(mappedRange[0], mappedRange[1]);
 723       int newEnd = Math.max(mappedRange[0], mappedRange[1]);
 724 //      Platform.timeCheck(null, Platform.TIME_MARK);
 725       bhtest++;
 726       // 280 ms/1000 here:
 727       SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd, group, sf.getScore());
 728       // 0.175 ms here:
 729       targetSequence.addSequenceFeature(copy);
 730
 731       /*
 732        * for sequence_variant on reverse strand, have to convert the allele
 733        * values to their complements
 734        */
 735       if (!forwardStrand && SequenceOntologyFactory.getInstance()
 736               .isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
 737       {
 738         reverseComplementAlleles(copy);
 739       }
 740     }
 741   }
 742
 743   /**
 744    * Change the 'alleles' value of a feature by converting to complementary
 745    * bases, and also update the feature description to match
 746    *
 747    * @param sf
 748    */
 749   static void reverseComplementAlleles(SequenceFeature sf)
 750   {
 751     final String alleles = (String) sf.getValue(Gff3Helper.ALLELES);
 752     if (alleles == null)
 753     {
 754       return;
 755     }
 756     StringBuilder complement = new StringBuilder(alleles.length());
 757     for (String allele : alleles.split(","))
 758     {
 759       reverseComplementAllele(complement, allele);
 760     }
 761     String comp = complement.toString();
 762     sf.setValue(Gff3Helper.ALLELES, comp);
 763     sf.setDescription(comp);
 764   }
 765
 766   /**
 767    * Makes the 'reverse complement' of the given allele and appends it to the
 768    * buffer, after a comma separator if not the first
 769    *
 770    * @param complement
 771    * @param allele
 772    */
 773   static void reverseComplementAllele(StringBuilder complement,
 774           String allele)
 775   {
 776     if (complement.length() > 0)
 777     {
 778       complement.append(",");
 779     }
 780
 781     /*
 782      * some 'alleles' are actually descriptive terms
 783      * e.g. HGMD_MUTATION, PhenCode_variation
 784      * - we don't want to 'reverse complement' these
 785      */
 786     if (!Comparison.isNucleotideSequence(allele, true))
 787     {
 788       complement.append(allele);
 789     }
 790     else
 791     {
 792       for (int i = allele.length() - 1; i >= 0; i--)
 793       {
 794         complement.append(Dna.getComplement(allele.charAt(i)));
 795       }
 796     }
 797   }
 798
 799   /**
 800    * Transfers features from sourceSequence to targetSequence
 801    *
 802    * @param accessionId
 803    * @param sourceSequence
 804    * @param targetSequence
 805    * @return true if any features were transferred, else false
 806    */
 807   protected boolean transferFeatures(String accessionId,
 808           SequenceI sourceSequence, SequenceI targetSequence)
 809   {
 810     if (sourceSequence == null || targetSequence == null)
 811     {
 812       return false;
 813     }
 814
 815 //    long start = System.currentTimeMillis();
 816     List<SequenceFeature> sfs = sourceSequence.getFeatures()
 817             .getPositionalFeatures();
 818     MapList mapping = getGenomicRangesFromFeatures(sourceSequence,
 819             accessionId, targetSequence.getStart());
 820     if (mapping == null)
 821     {
 822       return false;
 823     }
 824
 825     // Platform.timeCheck("ESP. xfer " + sfs.size(), Platform.TIME_MARK);
 826
 827     boolean result = transferFeatures(sfs, targetSequence, mapping,
 828             accessionId);
 829 //    System.out.println("transferFeatures (" + (sfs.size()) + " --> "
 830 //            + targetSequence.getFeatures().getFeatureCount(true) + ") to "
 831 //            + targetSequence.getName() + " took "
 832 //            + (System.currentTimeMillis() - start) + "ms");
 833     return result;
 834   }
 835
 836   /**
 837    * Transfer features to the target sequence. The start/end positions are
 838    * converted using the mapping. Features which do not overlap are ignored.
 839    * Features whose parent is not the specified identifier are also ignored.
 840    *
 841    * @param sfs
 842    * @param targetSequence
 843    * @param mapping
 844    * @param parentId
 845    * @return
 846    */
 847   protected boolean transferFeatures(List<SequenceFeature> sfs,
 848           SequenceI targetSequence, MapList mapping, String parentId)
 849   {
 850     final boolean forwardStrand = mapping.isFromForwardStrand();
 851
 852     /*
 853      * sort features by start position (which corresponds to end
 854      * position descending if reverse strand) so as to add them in
 855      * 'forwards' order to the target sequence
 856      */
 857     SequenceFeatures.sortFeatures(sfs, forwardStrand);
 858
 859     boolean transferred = false;
 860
 861     for (int i = 0, n = sfs.size(); i < n; i++)
 862     {
 863
 864 //      if ((i%1000) == 0) {
 865 ////                Platform.timeCheck("Feature " + bhtest, Platform.TIME_GET);
 866 //      Platform.timeCheck("ESP. xferFeature + " + (i) + "/" + n, Platform.TIME_MARK);
 867 //      }
 868
 869       SequenceFeature sf = sfs.get(i);
 870       if (retainFeature(sf, parentId))
 871       {
 872         transferFeature(sf, targetSequence, mapping, forwardStrand);
 873         transferred = true;
 874       }
 875     }
 876
 877     return transferred;
 878   }
 879
 880   /**
 881    * Answers true if the feature type is one we want to keep for the sequence.
 882    * Some features are only retrieved in order to identify the sequence range,
 883    * and may then be discarded as redundant information (e.g. "CDS" feature for
 884    * a CDS sequence).
 885    */
 886   @SuppressWarnings("unused")
 887   protected boolean retainFeature(SequenceFeature sf, String accessionId)
 888   {
 889     return true; // override as required
 890   }
 891
 892   /**
 893    * Answers true if the feature has a Parent which refers to the given
 894    * accession id, or if the feature has no parent. Answers false if the
 895    * feature's Parent is for a different accession id.
 896    *
 897    * @param sf
 898    * @param identifier
 899    * @return
 900    */
 901   protected boolean featureMayBelong(SequenceFeature sf, String identifier)
 902   {
 903     String parent = (String) sf.getValue(PARENT);
 904     if (parent != null
 905             && !parent.equalsIgnoreCase(identifier))
 906     {
 907       // this genomic feature belongs to a different transcript
 908       return false;
 909     }
 910     return true;
 911   }
 912
 913   /**
 914    * Answers a short description of the sequence fetcher
 915    */
 916   @Override
 917   public String getDescription()
 918   {
 919     return "Ensembl " + getSourceEnsemblType().getType()
 920             + " sequence with variant features";
 921   }
 922
 923   /**
 924    * Returns a (possibly empty) list of features on the sequence which have the
 925    * specified sequence ontology term (or a sub-type of it), and the given
 926    * identifier as parent
 927    *
 928    * @param sequence
 929    * @param term
 930    * @param parentId
 931    * @return
 932    */
 933   protected List<SequenceFeature> findFeatures(SequenceI sequence,
 934           String term, String parentId)
 935   {
 936     List<SequenceFeature> result = new ArrayList<>();
 937
 938     List<SequenceFeature> sfs = sequence.getFeatures()
 939             .getFeaturesByOntology(term);
 940     for (SequenceFeature sf : sfs)
 941     {
 942       String parent = (String) sf.getValue(PARENT);
 943       if (parent != null && parent.equalsIgnoreCase(parentId))
 944       {
 945         result.add(sf);
 946       }
 947     }
 948
 949     return result;
 950   }
 951
 952   /**
 953    * Answers true if the feature type is either 'NMD_transcript_variant' or
 954    * 'transcript' (or one of its sub-types in the Sequence Ontology). This is
 955    * because NMD_transcript_variant behaves like 'transcript' in Ensembl
 956    * although strictly speaking it is not (it is a sub-type of
 957    * sequence_variant).
 958    * <p>
 959    * (This test was needed when fetching transcript features as GFF. As we are
 960    * now fetching as JSON, all features have type 'transcript' so the check for
 961    * NMD_transcript_variant is redundant. Left in for any future case arising.)
 962    *
 963    * @param featureType
 964    * @return
 965    */
 966   public static boolean isTranscript(String featureType)
 967   {
 968     return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType)
 969             || SequenceOntologyFactory.getInstance().isA(featureType,
 970                     SequenceOntologyI.TRANSCRIPT);
 971   }
 972 }