src/jalview/ext/ensembl/EnsemblGene.java

   1 package jalview.ext.ensembl;
   2
import jalview.datamodel.AlignmentI;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.io.gff.SequenceOntology;
import jalview.util.MapList;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
  13
  14 /**
  15  * A class that fetches genomic sequence and all transcripts for an Ensembl gene
  16  *
  17  * @author gmcarstairs
  18  */
  19 public class EnsemblGene extends EnsemblSeqProxy
  20 {
  21   private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
  22       EnsemblFeatureType.gene, EnsemblFeatureType.transcript,
  23       EnsemblFeatureType.exon, EnsemblFeatureType.cds,
  24       EnsemblFeatureType.variation };
  25
  26   @Override
  27   public String getDbName()
  28   {
  29     return "ENSEMBL (GENE)";
  30   }
  31
  32   @Override
  33   protected EnsemblFeatureType[] getFeaturesToFetch()
  34   {
  35     return FEATURES_TO_FETCH;
  36   }
  37
  38   @Override
  39   protected EnsemblSeqType getSourceEnsemblType()
  40   {
  41     return EnsemblSeqType.GENOMIC;
  42   }
  43
  44   /**
  45    * Builds an alignment of all transcripts for the requested gene:
  46    * <ul>
  47    * <li>fetches the gene sequence</li>
  48    * <li>fetches features on the sequence</li>
  49    * <li>identifies "transcript" features whose Parent is the requested gene</li>
  50    * <li>fetches the transcript sequence for each transcript</li>
  51    * <li>makes a mapping from the gene to each transcript</li>
  52    * <li>copies features from gene to transcript sequences</li>
  53    * <li>fetches the protein sequence for each transcript, maps and saves it as
  54    * a cross-reference</li>
  55    * <li>aligns each transcript against the gene sequence based on the position
  56    * mappings</li>
  57    * </ul>
  58    */
  59   @Override
  60   public AlignmentI getSequenceRecords(String query) throws Exception
  61   {
  62     // TODO ? if an ENST identifier is supplied, convert to ENSG?
  63     AlignmentI al = super.getSequenceRecords(query);
  64     if (al.getHeight() > 0)
  65     {
  66       getTranscripts(al, query);
  67     }
  68
  69     return al;
  70   }
  71
  72   /**
  73    * Constructs all transcripts for the gene, as identified by "transcript"
  74    * features whose Parent is the requested gene. The coding transcript
  75    * sequences (i.e. with introns omitted) are added to the alignment.
  76    *
  77    * @param al
  78    * @param accId
  79    * @throws Exception
  80    */
  81   protected void getTranscripts(AlignmentI al, String accId)
  82           throws Exception
  83   {
  84     SequenceI gene = al.getSequenceAt(0);
  85     List<SequenceFeature> transcriptFeatures = getTranscriptFeatures(accId,
  86             gene);
  87
  88     for (SequenceFeature transcriptFeature : transcriptFeatures)
  89     {
  90       makeTranscript(transcriptFeature, al, gene);
  91     }
  92   }
  93
  94   /**
  95    * Constructs a spliced transcript sequence by finding 'exon' features for the
  96    * given id (or failing that 'CDS'). Copies features on to the new sequence.
  97    * 'Aligns' the new sequence against the gene sequence by padding with gaps,
  98    * and adds it to the alignment.
  99    *
 100    * @param transcriptFeature
 101    * @param al
 102    *          the alignment to which to add the new sequence
 103    * @param gene
 104    *          the parent gene sequence, with features
 105    * @return
 106    */
 107   SequenceI makeTranscript(SequenceFeature transcriptFeature,
 108           AlignmentI al, SequenceI gene)
 109   {
 110     String accId = (String) transcriptFeature.getValue("transcript_id");
 111     if (accId == null)
 112     {
 113       return null;
 114     }
 115
 116     /*
 117      * NB we are mapping from gene sequence (not genome), so do not
 118      * need to check for reverse strand (gene and transcript sequences
 119      * are in forward sense)
 120      */
 121
 122     /*
 123      * make a gene-length sequence filled with gaps
 124      * we will fill in the bases for transcript regions
 125      */
 126     char[] seqChars = new char[gene.getLength()];
 127     Arrays.fill(seqChars, al.getGapCharacter());
 128
 129     /*
 130      * look for exon features of the transcript, failing that for CDS
 131      * (for example ENSG00000124610 has 1 CDS but no exon features)
 132      */
 133     String parentId = "transcript:" + accId;
 134     List<SequenceFeature> splices = findFeatures(gene,
 135             SequenceOntology.EXON, parentId);
 136     if (splices.isEmpty())
 137     {
 138       splices = findFeatures(gene, SequenceOntology.CDS, parentId);
 139     }
 140
 141     int transcriptLength = 0;
 142     final char[] geneChars = gene.getSequence();
 143     int offset = gene.getStart(); // to convert to 0-based positions
 144     List<int[]> mappedFrom = new ArrayList<int[]>();
 145
 146     for (SequenceFeature sf : splices)
 147     {
 148       int start = sf.getBegin() - offset;
 149       int end = sf.getEnd() - offset;
 150       int spliceLength = end - start + 1;
 151       System.arraycopy(geneChars, start, seqChars, start, spliceLength);
 152       transcriptLength += spliceLength;
 153       mappedFrom.add(new int[] { sf.getBegin(), sf.getEnd() });
 154     }
 155
 156     Sequence transcript = new Sequence(accId, seqChars, 1, transcriptLength);
 157     String geneName = (String) transcriptFeature.getValue(NAME);
 158     if (geneName != null)
 159     {
 160       transcript.setDescription(geneName);
 161     }
 162     transcript.createDatasetSequence();
 163
 164     al.addSequence(transcript);
 165
 166     /*
 167      * transfer features to the new sequence; we use EnsemblCdna to do this,
 168      * to filter out unwanted features types (see method retainFeature)
 169      */
 170     List<int[]> mapTo = new ArrayList<int[]>();
 171     mapTo.add(new int[] { 1, transcriptLength });
 172     MapList mapping = new MapList(mappedFrom, mapTo, 1, 1);
 173     new EnsemblCdna().transferFeatures(gene.getSequenceFeatures(),
 174             transcript.getDatasetSequence(), mapping, parentId);
 175
 176     /*
 177      * and finally fetch the protein product and save as a cross-reference
 178      */
 179     addProteinProduct(transcript);
 180
 181     return transcript;
 182   }
 183
 184   /**
 185    * Returns a list of the transcript features on the sequence whose Parent is
 186    * the gene for the accession id.
 187    *
 188    * @param accId
 189    * @param geneSequence
 190    * @return
 191    */
 192   protected List<SequenceFeature> getTranscriptFeatures(String accId,
 193           SequenceI geneSequence)
 194   {
 195     List<SequenceFeature> transcriptFeatures = new ArrayList<SequenceFeature>();
 196
 197     String parentIdentifier = "gene:" + accId;
 198     SequenceFeature[] sfs = geneSequence.getSequenceFeatures();
 199
 200     if (sfs != null)
 201     {
 202       for (SequenceFeature sf : sfs)
 203       {
 204         if (isTranscript(sf.getType()))
 205         {
 206           String parent = (String) sf.getValue(PARENT);
 207           if (parentIdentifier.equals(parent))
 208           {
 209             transcriptFeatures.add(sf);
 210           }
 211         }
 212       }
 213     }
 214
 215     return transcriptFeatures;
 216   }
 217
 218   @Override
 219   public String getDescription()
 220   {
 221     return "Fetches all transcripts and variant features for a gene";
 222   }
 223
 224   /**
 225    * Default test query is a transcript
 226    */
 227   @Override
 228   public String getTestQuery()
 229   {
 230     return "ENSG00000157764"; // BRAF, 5 transcripts, reverse strand
 231     // ENSG00000090266 // NDUFB2, 15 transcripts, forward strand
 232     // ENSG00000101812 // H2BFM histone, 3 transcripts, forward strand
 233     // ENSG00000123569 // H2BFWT histone, 2 transcripts, reverse strand
 234   }
 235
 236   /**
 237    * Answers true for a feature of type 'gene' (or a sub-type of gene in the
 238    * Sequence Ontology), whose ID is the accession we are retrieving
 239    */
 240   @Override
 241   protected boolean identifiesSequence(SequenceFeature sf, String accId)
 242   {
 243     if (SequenceOntology.getInstance().isA(sf.getType(),
 244             SequenceOntology.GENE))
 245     {
 246       String id = (String) sf.getValue(ID);
 247       if (("gene:" + accId).equals(id))
 248       {
 249         return true;
 250       }
 251     }
 252     return false;
 253   }
 254
 255   /**
 256    * Answers true unless feature type is 'gene', or 'transcript' with a parent
 257    * which is a different gene. We need the gene features to identify the range,
 258    * but it is redundant information on the gene sequence. Checking the parent
 259    * allows us to drop transcript features which belong to different
 260    * (overlapping) genes.
 261    */
 262   @Override
 263   protected boolean retainFeature(SequenceFeature sf, String accessionId)
 264   {
 265     if (SequenceOntology.getInstance().isA(sf.getType(),
 266             SequenceOntology.GENE))
 267     {
 268       return false;
 269     }
 270
 271     if (isTranscript(sf.getType()))
 272     {
 273       String parent = (String) sf.getValue(PARENT);
 274       if (!("gene:" + accessionId).equals(parent))
 275       {
 276         return false;
 277       }
 278     }
 279     return true;
 280   }
 281
 282   /**
 283    * Answers false. This allows an optimisation - a single 'gene' feature is all
 284    * that is needed to identify the positions of the gene on the genomic
 285    * sequence.
 286    */
 287   @Override
 288   protected boolean isSpliceable()
 289   {
 290     return false;
 291   }
 292
 293   @Override
 294   protected List<String> getCrossReferenceDatabases()
 295   {
 296     // found these for ENSG00000157764 on 30/01/2016:
 297     // return new String[] {"Vega_gene", "OTTG", "ENS_LRG_gene", "ArrayExpress",
 298     // "EntrezGene", "HGNC", "MIM_GENE", "MIM_MORBID", "WikiGene"};
 299     return super.getCrossReferenceDatabases();
 300   }
 301
 302 }