src/jalview/ext/ensembl/EnsemblSeqProxy.java

   1 package jalview.ext.ensembl;
   2
   3 import jalview.analysis.AlignmentUtils;
   4 import jalview.analysis.Dna;
   5 import jalview.datamodel.Alignment;
   6 import jalview.datamodel.AlignmentI;
   7 import jalview.datamodel.DBRefEntry;
   8 import jalview.datamodel.DBRefSource;
   9 import jalview.datamodel.Mapping;
  10 import jalview.datamodel.SequenceFeature;
  11 import jalview.datamodel.SequenceI;
  12 import jalview.exceptions.JalviewException;
  13 import jalview.io.FastaFile;
  14 import jalview.io.FileParse;
  15 import jalview.io.gff.SequenceOntologyFactory;
  16 import jalview.io.gff.SequenceOntologyI;
  17 import jalview.util.DBRefUtils;
  18 import jalview.util.MapList;
  19
  20 import java.io.IOException;
  21 import java.net.MalformedURLException;
  22 import java.net.URL;
  23 import java.util.ArrayList;
  24 import java.util.Arrays;
  25 import java.util.Collections;
  26 import java.util.Comparator;
  27 import java.util.List;
  28
  29 /**
  30  * Base class for Ensembl sequence fetchers
  31  *
  32  * @see http://rest.ensembl.org/documentation/info/sequence_id
  33  * @author gmcarstairs
  34  */
  35 public abstract class EnsemblSeqProxy extends EnsemblRestClient
  36 {
  37   private static final String ALLELES = "alleles";
  38
  39   protected static final String CONSEQUENCE_TYPE = "consequence_type";
  40
  41   protected static final String PARENT = "Parent";
  42
  43   protected static final String ID = "ID";
  44
  45   protected static final String NAME = "Name";
  46
  47   protected static final String DESCRIPTION = "description";
  48
  /**
   * The values supported for the 'type' parameter of the /sequence REST
   * service. Each constant carries the literal parameter value to place in the
   * request.
   */
  public enum EnsemblSeqType
  {
    /**
     * type=genomic to fetch full dna including introns
     */
    GENOMIC("genomic"),

    /**
     * type=cdna to fetch dna including UTRs
     */
    CDNA("cdna"),

    /**
     * type=cds to fetch coding dna excluding UTRs
     */
    CDS("cds"),

    /**
     * type=protein to fetch peptide product sequence
     */
    PROTEIN("protein");

    /*
     * the value of the 'type' parameter to fetch this version of
     * an Ensembl sequence; final since enum constants are shared
     * singletons and should never be mutated
     */
    private final String type;

    EnsemblSeqType(String t)
    {
      type = t;
    }

    /**
     * Answers the value to supply as the 'type' request parameter
     * 
     * @return
     */
    public String getType()
    {
      return type;
    }

  }
  91
  92   /**
  93    * A comparator to sort ranges into ascending start position order
  94    */
  95   private class RangeSorter implements Comparator<int[]>
  96   {
  97     boolean forwards;
  98
  99     RangeSorter(boolean forward)
 100     {
 101       forwards = forward;
 102     }
 103
 104     @Override
 105     public int compare(int[] o1, int[] o2)
 106     {
 107       return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]);
 108     }
 109
 110   }
 111
 112   /**
 113    * Default constructor (to use rest.ensembl.org)
 114    */
 115   public EnsemblSeqProxy()
 116   {
 117     super();
 118   }
 119
 120   /**
 121    * Constructor given the target domain to fetch data from
 122    */
 123   public EnsemblSeqProxy(String d)
 124   {
 125     super(d);
 126   }
 127
 128   /**
 129    * Makes the sequence queries to Ensembl's REST service and returns an
 130    * alignment consisting of the returned sequences.
 131    */
 132   @Override
 133   public AlignmentI getSequenceRecords(String query) throws Exception
 134   {
 135     // TODO use a String... query vararg instead?
 136
 137     // danger: accession separator used as a regex here, a string elsewhere
 138     // in this case it is ok (it is just a space), but (e.g.) '\' would not be
 139     List<String> allIds = Arrays.asList(query
 140             .split(getAccessionSeparator()));
 141     AlignmentI alignment = null;
 142     inProgress = true;
 143
 144     /*
 145      * execute queries, if necessary in batches of the
 146      * maximum allowed number of ids
 147      */
 148     int maxQueryCount = getMaximumQueryCount();
 149     for (int v = 0, vSize = allIds.size(); v < vSize; v += maxQueryCount)
 150     {
 151       int p = Math.min(vSize, v + maxQueryCount);
 152       List<String> ids = allIds.subList(v, p);
 153       try
 154       {
 155         alignment = fetchSequences(ids, alignment);
 156       } catch (Throwable r)
 157       {
 158         inProgress = false;
 159         String msg = "Aborting ID retrieval after " + v
 160                 + " chunks. Unexpected problem (" + r.getLocalizedMessage()
 161                 + ")";
 162         System.err.println(msg);
 163         break;
 164       }
 165     }
 166
 167     if (alignment == null)
 168     {
 169       return null;
 170     }
 171
 172     /*
 173      * fetch and transfer genomic sequence features,
 174      * fetch protein product and add as cross-reference
 175      */
 176     for (String accId : allIds)
 177     {
 178       addFeaturesAndProduct(accId, alignment);
 179     }
 180
 181     for (SequenceI seq : alignment.getSequences())
 182     {
 183       getCrossReferences(seq);
 184     }
 185
 186     return alignment;
 187   }
 188
 189   /**
 190    * Fetches Ensembl features using the /overlap REST endpoint, and adds them to
 191    * the sequence in the alignment. Also fetches the protein product, maps it
 192    * from the CDS features of the sequence, and saves it as a cross-reference of
 193    * the dna sequence.
 194    *
 195    * @param accId
 196    * @param alignment
 197    */
 198   protected void addFeaturesAndProduct(String accId, AlignmentI alignment)
 199   {
 200     if (alignment == null)
 201     {
 202       return;
 203     }
 204
 205     try
 206     {
 207       /*
 208        * get 'dummy' genomic sequence with exon, cds and variation features
 209        */
 210       SequenceI genomicSequence = null;
 211       EnsemblFeatures gffFetcher = new EnsemblFeatures(getDomain());
 212       EnsemblFeatureType[] features = getFeaturesToFetch();
 213       AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
 214               features);
 215       if (geneFeatures.getHeight() > 0)
 216       {
 217         genomicSequence = geneFeatures.getSequenceAt(0);
 218       }
 219       if (genomicSequence != null)
 220       {
 221         /*
 222          * transfer features to the query sequence
 223          */
 224         SequenceI querySeq = alignment.findName(accId);
 225         if (transferFeatures(accId, genomicSequence, querySeq))
 226         {
 227
 228           /*
 229            * fetch and map protein product, and add it as a cross-reference
 230            * of the retrieved sequence
 231            */
 232           addProteinProduct(querySeq);
 233         }
 234       }
 235     } catch (IOException e)
 236     {
 237       System.err.println("Error transferring Ensembl features: "
 238               + e.getMessage());
 239     }
 240   }
 241
 242   /**
 243    * Returns those sequence feature types to fetch from Ensembl. We may want
 244    * features either because they are of interest to the user, or as means to
 245    * identify the locations of the sequence on the genomic sequence (CDS
 246    * features identify CDS, exon features identify cDNA etc).
 247    *
 248    * @return
 249    */
 250   protected abstract EnsemblFeatureType[] getFeaturesToFetch();
 251
 252   /**
 253    * Fetches and maps the protein product, and adds it as a cross-reference of
 254    * the retrieved sequence
 255    */
 256   protected void addProteinProduct(SequenceI querySeq)
 257   {
 258     String accId = querySeq.getName();
 259     try
 260     {
 261       AlignmentI protein = new EnsemblProtein(getDomain())
 262               .getSequenceRecords(accId);
 263       if (protein == null || protein.getHeight() == 0)
 264       {
 265         System.out.println("No protein product found for " + accId);
 266         return;
 267       }
 268       SequenceI proteinSeq = protein.getSequenceAt(0);
 269
 270       /*
 271        * need dataset sequences (to be the subject of mappings)
 272        */
 273       proteinSeq.createDatasetSequence();
 274       querySeq.createDatasetSequence();
 275
 276       MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, proteinSeq);
 277       if (mapList != null)
 278       {
 279         // clunky: ensure Uniprot xref if we have one is on mapped sequence
 280         SequenceI ds = proteinSeq.getDatasetSequence();
 281         ds.setSourceDBRef(proteinSeq.getSourceDBRef());
 282
 283         Mapping map = new Mapping(ds, mapList);
 284         DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(),
 285                 proteinSeq.getName(), map);
 286         querySeq.getDatasetSequence().addDBRef(dbr);
 287
 288         /*
 289          * copy exon features to protein, compute peptide variants from dna
 290          * variants and add as features on the protein sequence ta-da
 291          */
 292         AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList);
 293       }
 294     } catch (Exception e)
 295     {
 296       System.err
 297               .println(String.format("Error retrieving protein for %s: %s",
 298                       accId, e.getMessage()));
 299     }
 300   }
 301
 302   /**
 303    * Get database xrefs from Ensembl, and attach them to the sequence
 304    *
 305    * @param seq
 306    */
 307   protected void getCrossReferences(SequenceI seq)
 308   {
 309     while (seq.getDatasetSequence() != null)
 310     {
 311       seq = seq.getDatasetSequence();
 312     }
 313
 314     EnsemblXref xrefFetcher = new EnsemblXref(getDomain());
 315     List<DBRefEntry> xrefs = xrefFetcher.getCrossReferences(seq.getName());
 316     for (DBRefEntry xref : xrefs)
 317     {
 318       seq.addDBRef(xref);
 319       /*
 320        * Save any Uniprot xref to be the reference for SIFTS mapping
 321        */
 322       if (DBRefSource.UNIPROT.equals(xref.getSource()))
 323       {
 324         seq.setSourceDBRef(xref);
 325       }
 326     }
 327
 328     /*
 329      * and add a reference to itself
 330      */
 331     DBRefEntry self = new DBRefEntry(getDbSource(), "0", seq.getName());
 332     seq.addDBRef(self);
 333   }
 334
 335   /**
 336    * Fetches sequences for the list of accession ids and adds them to the
 337    * alignment. Returns the extended (or created) alignment.
 338    *
 339    * @param ids
 340    * @param alignment
 341    * @return
 342    * @throws JalviewException
 343    * @throws IOException
 344    */
 345   protected AlignmentI fetchSequences(List<String> ids, AlignmentI alignment)
 346           throws JalviewException, IOException
 347   {
 348     if (!isEnsemblAvailable())
 349     {
 350       inProgress = false;
 351       throw new JalviewException("ENSEMBL Rest API not available.");
 352     }
 353     FileParse fp = getSequenceReader(ids);
 354     FastaFile fr = new FastaFile(fp);
 355     if (fr.hasWarningMessage())
 356     {
 357       System.out.println(String.format(
 358               "Warning when retrieving %d ids %s\n%s", ids.size(),
 359               ids.toString(), fr.getWarningMessage()));
 360     }
 361     else if (fr.getSeqs().size() != ids.size())
 362     {
 363       System.out.println(String.format(
 364               "Only retrieved %d sequences for %d query strings", fr
 365                       .getSeqs().size(), ids.size()));
 366     }
 367
 368     if (fr.getSeqs().size() == 1 && fr.getSeqs().get(0).getLength() == 0)
 369     {
 370       /*
 371        * POST request has returned an empty FASTA file e.g. for invalid id
 372        */
 373       throw new IOException("No data returned for " + ids);
 374     }
 375
 376     if (fr.getSeqs().size() > 0)
 377     {
 378       AlignmentI seqal = new Alignment(
 379               fr.getSeqsAsArray());
 380       for (SequenceI sq:seqal.getSequences())
 381       {
 382         if (sq.getDescription() == null)
 383         {
 384           sq.setDescription(getDbName());
 385         }
 386         String name = sq.getName();
 387         if (ids.contains(name)
 388                 || ids.contains(name.replace("ENSP", "ENST")))
 389         {
 390           DBRefUtils.parseToDbRef(sq, DBRefSource.ENSEMBL, "0", name);
 391         }
 392       }
 393       if (alignment == null)
 394       {
 395         alignment = seqal;
 396       }
 397       else
 398       {
 399         alignment.append(seqal);
 400       }
 401     }
 402     return alignment;
 403   }
 404
 405   /**
 406    * Returns the URL for the REST call
 407    *
 408    * @return
 409    * @throws MalformedURLException
 410    */
 411   @Override
 412   protected URL getUrl(List<String> ids) throws MalformedURLException
 413   {
 414     /*
 415      * a single id is included in the URL path
 416      * multiple ids go in the POST body instead
 417      */
 418     StringBuffer urlstring = new StringBuffer(128);
 419     urlstring.append(getDomain() + "/sequence/id");
 420     if (ids.size() == 1)
 421     {
 422       urlstring.append("/").append(ids.get(0));
 423     }
 424     // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
 425     urlstring.append("?type=").append(getSourceEnsemblType().getType());
 426     urlstring.append(("&Accept=text/x-fasta"));
 427
 428     URL url = new URL(urlstring.toString());
 429     return url;
 430   }
 431
 432   /**
 433    * A sequence/id POST request currently allows up to 50 queries
 434    *
 435    * @see http://rest.ensembl.org/documentation/info/sequence_id_post
 436    */
 437   @Override
 438   public int getMaximumQueryCount()
 439   {
 440     return 50;
 441   }
 442
 443   @Override
 444   protected boolean useGetRequest()
 445   {
 446     return false;
 447   }
 448
 449   @Override
 450   protected String getRequestMimeType(boolean multipleIds)
 451   {
 452     return multipleIds ? "application/json" : "text/x-fasta";
 453   }
 454
 455   @Override
 456   protected String getResponseMimeType()
 457   {
 458     return "text/x-fasta";
 459   }
 460
 461   /**
 462    *
 463    * @return the configured sequence return type for this source
 464    */
 465   protected abstract EnsemblSeqType getSourceEnsemblType();
 466
 467   /**
 468    * Returns a list of [start, end] genomic ranges corresponding to the sequence
 469    * being retrieved.
 470    *
 471    * The correspondence between the frames of reference is made by locating
 472    * those features on the genomic sequence which identify the retrieved
 473    * sequence. Specifically
 474    * <ul>
 475    * <li>genomic sequence is identified by "transcript" features with
 476    * ID=transcript:transcriptId</li>
 477    * <li>cdna sequence is identified by "exon" features with
 478    * Parent=transcript:transcriptId</li>
 479    * <li>cds sequence is identified by "CDS" features with
 480    * Parent=transcript:transcriptId</li>
 481    * </ul>
 482    *
 483    * The returned ranges are sorted to run forwards (for positive strand) or
 484    * backwards (for negative strand). Aborts and returns null if both positive
 485    * and negative strand are found (this should not normally happen).
 486    *
 487    * @param sourceSequence
 488    * @param accId
 489    * @param start
 490    *          the start position of the sequence we are mapping to
 491    * @return
 492    */
 493   protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence,
 494           String accId, int start)
 495   {
 496     SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
 497     if (sfs == null)
 498     {
 499       return null;
 500     }
 501
 502     /*
 503      * generously initial size for number of cds regions
 504      * (worst case titin Q8WZ42 has c. 313 exons)
 505      */
 506     List<int[]> regions = new ArrayList<int[]>(100);
 507     int mappedLength = 0;
 508     int direction = 1; // forward
 509     boolean directionSet = false;
 510
 511     for (SequenceFeature sf : sfs)
 512     {
 513       /*
 514        * accept the target feature type or a specialisation of it
 515        * (e.g. coding_exon for exon)
 516        */
 517       if (identifiesSequence(sf, accId))
 518       {
 519         int strand = sf.getStrand();
 520         strand = strand == 0 ? 1 : strand; // treat unknown as forward
 521
 522         if (directionSet && strand != direction)
 523         {
 524           // abort - mix of forward and backward
 525           System.err.println("Error: forward and backward strand for "
 526                   + accId);
 527             return null;
 528           }
 529           direction = strand;
 530           directionSet = true;
 531
 532           /*
 533            * add to CDS ranges, semi-sorted forwards/backwards
 534            */
 535           if (strand < 0)
 536           {
 537             regions.add(0, new int[] { sf.getEnd(), sf.getBegin() });
 538           }
 539           else
 540           {
 541           regions.add(new int[] { sf.getBegin(), sf.getEnd() });
 542         }
 543         mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1);
 544
 545         if (!isSpliceable())
 546         {
 547           /*
 548            * 'gene' sequence is contiguous so we can stop as soon as its
 549            * identifying feature has been found
 550            */
 551           break;
 552         }
 553       }
 554     }
 555
 556     if (regions.isEmpty())
 557     {
 558       System.out.println("Failed to identify target sequence for " + accId
 559               + " from genomic features");
 560       return null;
 561     }
 562
 563     /*
 564      * a final sort is needed since Ensembl returns CDS sorted within source
 565      * (havana / ensembl_havana)
 566      */
 567     Collections.sort(regions, new RangeSorter(direction == 1));
 568
 569     List<int[]> to = Arrays.asList(new int[] { start,
 570         start + mappedLength - 1 });
 571
 572     return new MapList(regions, to, 1, 1);
 573   }
 574
 575   /**
 576    * Answers true if the sequence being retrieved may occupy discontiguous
 577    * regions on the genomic sequence.
 578    */
 579   protected boolean isSpliceable()
 580   {
 581     return true;
 582   }
 583
 584   /**
 585    * Returns true if the sequence feature marks positions of the genomic
 586    * sequence feature which are within the sequence being retrieved. For
 587    * example, an 'exon' feature whose parent is the target transcript marks the
 588    * cdna positions of the transcript.
 589    *
 590    * @param sf
 591    * @param accId
 592    * @return
 593    */
 594   protected abstract boolean identifiesSequence(SequenceFeature sf,
 595           String accId);
 596
 597   /**
 598    * Transfers the sequence feature to the target sequence, locating its start
 599    * and end range based on the mapping. Features which do not overlap the
 600    * target sequence are ignored.
 601    *
 602    * @param sf
 603    * @param targetSequence
 604    * @param mapping
 605    *          mapping from the sequence feature's coordinates to the target
 606    *          sequence
 607    * @param forwardStrand
 608    */
 609   protected void transferFeature(SequenceFeature sf,
 610           SequenceI targetSequence, MapList mapping, boolean forwardStrand)
 611   {
 612     int start = sf.getBegin();
 613     int end = sf.getEnd();
 614     int[] mappedRange = mapping.locateInTo(start, end);
 615
 616     if (mappedRange != null)
 617     {
 618       SequenceFeature copy = new SequenceFeature(sf);
 619       copy.setBegin(Math.min(mappedRange[0], mappedRange[1]));
 620       copy.setEnd(Math.max(mappedRange[0], mappedRange[1]));
 621       targetSequence.addSequenceFeature(copy);
 622
 623       /*
 624        * for sequence_variant on reverse strand, have to convert the allele
 625        * values to their complements
 626        */
 627       if (!forwardStrand
 628               && SequenceOntologyFactory.getInstance().isA(sf.getType(),
 629                       SequenceOntologyI.SEQUENCE_VARIANT))
 630       {
 631         reverseComplementAlleles(copy);
 632       }
 633     }
 634   }
 635
 636   /**
 637    * Change the 'alleles' value of a feature by converting to complementary
 638    * bases, and also update the feature description to match
 639    *
 640    * @param sf
 641    */
 642   static void reverseComplementAlleles(SequenceFeature sf)
 643   {
 644     final String alleles = (String) sf.getValue(ALLELES);
 645     if (alleles == null)
 646     {
 647       return;
 648     }
 649     StringBuilder complement = new StringBuilder(alleles.length());
 650     for (String allele : alleles.split(","))
 651     {
 652       reverseComplementAllele(complement, allele);
 653     }
 654     String comp = complement.toString();
 655     sf.setValue(ALLELES, comp);
 656     sf.setDescription(comp);
 657
 658     /*
 659      * replace value of "alleles=" in sf.ATTRIBUTES as well
 660      * so 'output as GFF' shows reverse complement alleles
 661      */
 662     String atts = sf.getAttributes();
 663     if (atts != null)
 664     {
 665       atts = atts.replace(ALLELES + "=" + alleles, ALLELES + "=" + comp);
 666       sf.setAttributes(atts);
 667     }
 668   }
 669
 670   /**
 671    * Makes the 'reverse complement' of the given allele and appends it to the
 672    * buffer, after a comma separator if not the first
 673    *
 674    * @param complement
 675    * @param allele
 676    */
 677   static void reverseComplementAllele(StringBuilder complement,
 678           String allele)
 679   {
 680     if (complement.length() > 0)
 681     {
 682       complement.append(",");
 683     }
 684     if ("HGMD_MUTATION".equalsIgnoreCase(allele))
 685     {
 686       complement.append(allele);
 687     }
 688     else
 689     {
 690       char[] alleles = allele.toCharArray();
 691       for (int i = alleles.length - 1; i >= 0; i--)
 692       {
 693         complement.append(Dna.getComplement(alleles[i]));
 694       }
 695     }
 696   }
 697
 698   /**
 699    * Transfers features from sourceSequence to targetSequence
 700    *
 701    * @param accessionId
 702    * @param sourceSequence
 703    * @param targetSequence
 704    * @return true if any features were transferred, else false
 705    */
 706   protected boolean transferFeatures(String accessionId,
 707           SequenceI sourceSequence, SequenceI targetSequence)
 708   {
 709     if (sourceSequence == null || targetSequence == null)
 710     {
 711       return false;
 712     }
 713
 714     // long start = System.currentTimeMillis();
 715     SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
 716     MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId,
 717             targetSequence.getStart());
 718     if (mapping == null)
 719     {
 720       return false;
 721     }
 722
 723     boolean result = transferFeatures(sfs, targetSequence, mapping,
 724             accessionId);
 725     // System.out.println("transferFeatures (" + (sfs.length) + " --> "
 726     // + targetSequence.getSequenceFeatures().length + ") to "
 727     // + targetSequence.getName()
 728     // + " took " + (System.currentTimeMillis() - start) + "ms");
 729     return result;
 730   }
 731
 732   /**
 733    * Transfer features to the target sequence. The start/end positions are
 734    * converted using the mapping. Features which do not overlap are ignored.
 735    * Features whose parent is not the specified identifier are also ignored.
 736    *
 737    * @param features
 738    * @param targetSequence
 739    * @param mapping
 740    * @param parentId
 741    * @return
 742    */
 743   protected boolean transferFeatures(SequenceFeature[] features,
 744           SequenceI targetSequence, MapList mapping, String parentId)
 745   {
 746     final boolean forwardStrand = mapping.isFromForwardStrand();
 747
 748     /*
 749      * sort features by start position (which corresponds to end
 750      * position descending if reverse strand) so as to add them in
 751      * 'forwards' order to the target sequence
 752      */
 753     sortFeatures(features, forwardStrand);
 754
 755     boolean transferred = false;
 756     for (SequenceFeature sf : features)
 757     {
 758       if (retainFeature(sf, parentId))
 759       {
 760         transferFeature(sf, targetSequence, mapping, forwardStrand);
 761         transferred = true;
 762       }
 763     }
 764     return transferred;
 765   }
 766
 767   /**
 768    * Sort features by start position ascending (if on forward strand), or end
 769    * position descending (if on reverse strand)
 770    *
 771    * @param features
 772    * @param forwardStrand
 773    */
 774   protected static void sortFeatures(SequenceFeature[] features,
 775           final boolean forwardStrand)
 776   {
 777     Arrays.sort(features, new Comparator<SequenceFeature>()
 778     {
 779       @Override
 780       public int compare(SequenceFeature o1, SequenceFeature o2)
 781       {
 782         if (forwardStrand)
 783         {
 784           return Integer.compare(o1.getBegin(), o2.getBegin());
 785         }
 786         else
 787         {
 788           return Integer.compare(o2.getEnd(), o1.getEnd());
 789         }
 790       }
 791     });
 792   }
 793
 794   /**
 795    * Answers true if the feature type is one we want to keep for the sequence.
 796    * Some features are only retrieved in order to identify the sequence range,
 797    * and may then be discarded as redundant information (e.g. "CDS" feature for
 798    * a CDS sequence).
 799    */
 800   @SuppressWarnings("unused")
 801   protected boolean retainFeature(SequenceFeature sf, String accessionId)
 802   {
 803     return true; // override as required
 804   }
 805
 806   /**
 807    * Answers true if the feature has a Parent which refers to the given
 808    * accession id, or if the feature has no parent. Answers false if the
 809    * feature's Parent is for a different accession id.
 810    *
 811    * @param sf
 812    * @param identifier
 813    * @return
 814    */
 815   protected boolean featureMayBelong(SequenceFeature sf, String identifier)
 816   {
 817     String parent = (String) sf.getValue(PARENT);
 818     // using contains to allow for prefix "gene:", "transcript:" etc
 819     if (parent != null && !parent.contains(identifier))
 820     {
 821       // this genomic feature belongs to a different transcript
 822       return false;
 823     }
 824     return true;
 825   }
 826
 827   @Override
 828   public String getDescription()
 829   {
 830     return "Ensembl " + getSourceEnsemblType().getType()
 831             + " sequence with variant features";
 832   }
 833
 834   /**
 835    * Returns a (possibly empty) list of features on the sequence which have the
 836    * specified sequence ontology type (or a sub-type of it), and the given
 837    * identifier as parent
 838    *
 839    * @param sequence
 840    * @param type
 841    * @param parentId
 842    * @return
 843    */
 844   protected List<SequenceFeature> findFeatures(SequenceI sequence,
 845           String type, String parentId)
 846   {
 847     List<SequenceFeature> result = new ArrayList<SequenceFeature>();
 848
 849     SequenceFeature[] sfs = sequence.getSequenceFeatures();
 850     if (sfs != null) {
 851       SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 852       for (SequenceFeature sf :sfs) {
 853         if (so.isA(sf.getType(), type))
 854         {
 855           String parent = (String) sf.getValue(PARENT);
 856           if (parent.equals(parentId))
 857           {
 858             result.add(sf);
 859           }
 860         }
 861       }
 862     }
 863     return result;
 864   }
 865
 866   /**
 867    * Answers true if the feature type is either 'NMD_transcript_variant' or
 868    * 'transcript' or one of its sub-types in the Sequence Ontology. This is
 869    * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl
 870    * although strictly speaking it is not (it is a sub-type of
 871    * sequence_variant).
 872    *
 873    * @param featureType
 874    * @return
 875    */
 876   public static boolean isTranscript(String featureType)
 877   {
 878     return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType)
 879             || SequenceOntologyFactory.getInstance().isA(featureType,
 880                     SequenceOntologyI.TRANSCRIPT);
 881   }
 882 }