src/jalview/ext/ensembl/EnsemblGene.java

   1 package jalview.ext.ensembl;
   2
import jalview.datamodel.AlignmentI;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.io.gff.SequenceOntology;
import jalview.util.MapList;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
  13
  14 /**
  15  * A class that fetches genomic sequence and all transcripts for an Ensembl gene
  16  *
  17  * @author gmcarstairs
  18  */
  19 public class EnsemblGene extends EnsemblSeqProxy
  20 {
  21   private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
  22       EnsemblFeatureType.gene, EnsemblFeatureType.transcript,
  23       EnsemblFeatureType.exon, EnsemblFeatureType.cds,
  24       EnsemblFeatureType.variation };
  25
  26   @Override
  27   public String getDbName()
  28   {
  29     return "ENSEMBL (GENE)";
  30   }
  31
  32   @Override
  33   protected EnsemblFeatureType[] getFeaturesToFetch()
  34   {
  35     return FEATURES_TO_FETCH;
  36   }
  37
  38   @Override
  39   protected EnsemblSeqType getSourceEnsemblType()
  40   {
  41     return EnsemblSeqType.GENOMIC;
  42   }
  43
  44   /**
  45    * Builds an alignment of all transcripts for the requested gene:
  46    * <ul>
  47    * <li>fetches the gene sequence</li>
  48    * <li>fetches features on the sequence</li>
  49    * <li>identifies "transcript" features whose Parent is the requested gene</li>
  50    * <li>fetches the transcript sequence for each transcript</li>
  51    * <li>makes a mapping from the gene to each transcript</li>
  52    * <li>copies features from gene to transcript sequences</li>
  53    * <li>fetches the protein sequence for each transcript, maps and saves it as
  54    * a cross-reference</li>
  55    * <li>aligns each transcript against the gene sequence based on the position
  56    * mappings</li>
  57    * </ul>
  58    */
  59   @Override
  60   public AlignmentI getSequenceRecords(String query) throws Exception
  61   {
  62     // TODO ? if an ENST identifier is supplied, convert to ENSG?
  63     AlignmentI al = super.getSequenceRecords(query);
  64     if (al.getHeight() > 0)
  65     {
  66       getTranscripts(al, query);
  67     }
  68
  69     return al;
  70   }
  71
  72   /**
  73    * Constructs all transcripts for the gene, as identified by "transcript"
  74    * features whose Parent is the requested gene. The coding transcript
  75    * sequences (i.e. with introns omitted) are added to the alignment.
  76    *
  77    * @param al
  78    * @param accId
  79    * @throws Exception
  80    */
  81   protected void getTranscripts(AlignmentI al, String accId)
  82           throws Exception
  83   {
  84     SequenceI gene = al.getSequenceAt(0);
  85     List<SequenceFeature> transcriptFeatures = getTranscriptFeatures(accId,
  86             gene);
  87
  88     for (SequenceFeature transcriptFeature : transcriptFeatures)
  89     {
  90       makeTranscript(transcriptFeature, al, gene);
  91     }
  92   }
  93
  94   /**
  95    * Constructs a spliced transcript sequence by finding 'exon' features for the
  96    * given id (or failing that 'CDS'). Copies features on to the new sequence.
  97    * 'Aligns' the new sequence against the gene sequence by padding with gaps,
  98    * and adds it to the alignment.
  99    *
 100    * @param transcriptFeature
 101    * @param al
 102    *          the alignment to which to add the new sequence
 103    * @param gene
 104    *          the parent gene sequence, with features
 105    * @return
 106    */
 107   SequenceI makeTranscript(SequenceFeature transcriptFeature,
 108           AlignmentI al, SequenceI gene)
 109   {
 110     String accId = (String) transcriptFeature.getValue("transcript_id");
 111     if (accId == null)
 112     {
 113       return null;
 114     }
 115
 116     /*
 117      * NB we are mapping from gene sequence (not genome), so do not
 118      * need to check for reverse strand (gene and transcript sequences
 119      * are in forward sense)
 120      */
 121
 122     /*
 123      * make a gene-length sequence filled with gaps
 124      * we will fill in the bases for transcript regions
 125      */
 126     char[] seqChars = new char[gene.getLength()];
 127     Arrays.fill(seqChars, al.getGapCharacter());
 128
 129     /*
 130      * look for exon features of the transcript, failing that for CDS
 131      * (for example ENSG00000124610 has 1 CDS but no exon features)
 132      */
 133     String parentId = "transcript:" + accId;
 134     List<SequenceFeature> splices = findFeatures(gene,
 135             SequenceOntology.EXON, parentId);
 136     if (splices.isEmpty())
 137     {
 138       splices = findFeatures(gene, SequenceOntology.CDS, parentId);
 139     }
 140
 141     int transcriptLength = 0;
 142     final char[] geneChars = gene.getSequence();
 143     int offset = gene.getStart(); // to convert to 0-based positions
 144     List<int[]> mappedFrom = new ArrayList<int[]>();
 145
 146     for (SequenceFeature sf : splices)
 147     {
 148       int start = sf.getBegin() - offset;
 149       int end = sf.getEnd() - offset;
 150       int spliceLength = end - start + 1;
 151       System.arraycopy(geneChars, start, seqChars, start, spliceLength);
 152       transcriptLength += spliceLength;
 153       mappedFrom.add(new int[] { sf.getBegin(), sf.getEnd() });
 154     }
 155
 156     Sequence transcript = new Sequence(accId, seqChars, 1, transcriptLength);
 157     transcript.createDatasetSequence();
 158
 159     al.addSequence(transcript);
 160
 161     /*
 162      * transfer features to the new sequence; we use EnsemblCdna to do this,
 163      * to filter out unwanted features types (see method retainFeature)
 164      */
 165     List<int[]> mapTo = new ArrayList<int[]>();
 166     mapTo.add(new int[] { 1, transcriptLength });
 167     MapList mapping = new MapList(mappedFrom, mapTo, 1, 1);
 168     new EnsemblCdna().transferFeatures(gene.getSequenceFeatures(),
 169             transcript.getDatasetSequence(), mapping, parentId);
 170
 171     /*
 172      * and finally fetch the protein product and save as a cross-reference
 173      */
 174     addProteinProduct(transcript);
 175
 176     return transcript;
 177   }
 178
 179   /**
 180    * Returns a list of the transcript features on the sequence whose Parent is
 181    * the gene for the accession id. Also removes all transcript features from
 182    * the gene sequence, as we have no further need for them and they obscure
 183    * more useful features on the display.
 184    *
 185    * @param accId
 186    * @param geneSequence
 187    * @return
 188    */
 189   protected List<SequenceFeature> getTranscriptFeatures(String accId,
 190           SequenceI geneSequence)
 191   {
 192     List<SequenceFeature> transcriptFeatures = new ArrayList<SequenceFeature>();
 193
 194     List<SequenceFeature> keptFeatures = new ArrayList<SequenceFeature>();
 195     String parentIdentifier = "gene:" + accId;
 196     SequenceFeature[] sfs = geneSequence.getSequenceFeatures();
 197
 198     if (sfs != null)
 199     {
 200       for (SequenceFeature sf : sfs)
 201       {
 202         if (isTranscript(sf.getType()))
 203         {
 204           String parent = (String) sf.getValue(PARENT);
 205           if (parentIdentifier.equals(parent))
 206           {
 207             transcriptFeatures.add(sf);
 208           }
 209         }
 210         else
 211         {
 212           keptFeatures.add(sf);
 213         }
 214       }
 215     }
 216     SequenceFeature[] featuresRetained = keptFeatures.toArray(new SequenceFeature[keptFeatures.size()]);
 217     geneSequence.getDatasetSequence().setSequenceFeatures(featuresRetained);
 218
 219     return transcriptFeatures;
 220   }
 221
 222   @Override
 223   public String getDescription()
 224   {
 225     return "Fetches all transcripts and variant features for a gene";
 226   }
 227
 228   /**
 229    * Default test query is a transcript
 230    */
 231   @Override
 232   public String getTestQuery()
 233   {
 234     return "ENSG00000157764"; // BRAF, 5 transcripts, reverse strand
 235     // ENSG00000090266 // NDUFB2, 15 transcripts, forward strand
 236     // ENSG00000101812 // H2BFM histone, 3 transcripts, forward strand
 237     // ENSG00000123569 // H2BFWT histone, 2 transcripts, reverse strand
 238   }
 239
 240   /**
 241    * Answers true for a feature of type 'gene' (or a sub-type of gene in the
 242    * Sequence Ontology), whose ID is the accession we are retrieving
 243    */
 244   @Override
 245   protected boolean identifiesSequence(SequenceFeature sf, String accId)
 246   {
 247     if (SequenceOntology.getInstance().isA(sf.getType(),
 248             SequenceOntology.GENE))
 249     {
 250       String id = (String) sf.getValue(ID);
 251       if (("gene:" + accId).equals(id))
 252       {
 253         return true;
 254       }
 255     }
 256     return false;
 257   }
 258
 259   /**
 260    * Answers true unless feature type is 'gene', or 'transcript' with a parent
 261    * which is a different gene. We need the gene features to identify the range,
 262    * but it is redundant information on the gene sequence. Checking the parent
 263    * allows us to drop transcript features which belong to different
 264    * (overlapping) genes.
 265    */
 266   @Override
 267   protected boolean retainFeature(SequenceFeature sf, String accessionId)
 268   {
 269     if (SequenceOntology.getInstance().isA(sf.getType(),
 270             SequenceOntology.GENE))
 271     {
 272       return false;
 273     }
 274
 275     if (isTranscript(sf.getType()))
 276     {
 277       String parent = (String) sf.getValue(PARENT);
 278       if (!("gene:" + accessionId).equals(parent))
 279       {
 280         return false;
 281       }
 282     }
 283     return true;
 284   }
 285
 286   /**
 287    * Answers false. This allows an optimisation - a single 'gene' feature is all
 288    * that is needed to identify the positions of the gene on the genomic
 289    * sequence.
 290    */
 291   @Override
 292   protected boolean isSpliceable()
 293   {
 294     return false;
 295   }
 296
 297 }