src/jalview/ext/ensembl/EnsemblGene.java

   1 package jalview.ext.ensembl;
   2
   3 import jalview.datamodel.AlignmentI;
   4 import jalview.datamodel.Sequence;
   5 import jalview.datamodel.SequenceFeature;
   6 import jalview.datamodel.SequenceI;
   7 import jalview.io.gff.SequenceOntologyFactory;
   8 import jalview.io.gff.SequenceOntologyI;
   9 import jalview.util.MapList;
  10
  11 import java.io.IOException;
  12 import java.util.ArrayList;
  13 import java.util.Arrays;
  14 import java.util.List;
  15
  16 import com.stevesoft.pat.Regex;
  17
  18 /**
  19  * A class that fetches genomic sequence and all transcripts for an Ensembl gene
  20  *
  21  * @author gmcarstairs
  22  */
  23 public class EnsemblGene extends EnsemblSeqProxy
  24 {
  25   private static final String GENE_PREFIX = "gene:";
  26
  27   // TODO modify to accept other species e.g. ENSMUSGnnn
  28   private static final Regex ACCESSION_REGEX = new Regex(
  29           "(ENSG|ENST)[0-9]{11}$");
  30
  31   private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
  32       EnsemblFeatureType.gene, EnsemblFeatureType.transcript,
  33       EnsemblFeatureType.exon, EnsemblFeatureType.cds,
  34       EnsemblFeatureType.variation };
  35
  36   @Override
  37   public String getDbName()
  38   {
  39     return "ENSEMBL (GENE)";
  40   }
  41
  42   @Override
  43   protected EnsemblFeatureType[] getFeaturesToFetch()
  44   {
  45     return FEATURES_TO_FETCH;
  46   }
  47
  48   @Override
  49   protected EnsemblSeqType getSourceEnsemblType()
  50   {
  51     return EnsemblSeqType.GENOMIC;
  52   }
  53
  54   /**
  55    * Builds an alignment of all transcripts for the requested gene:
  56    * <ul>
  57    * <li>fetches the gene sequence</li>
  58    * <li>fetches features on the sequence</li>
  59    * <li>identifies "transcript" features whose Parent is the requested gene</li>
  60    * <li>fetches the transcript sequence for each transcript</li>
  61    * <li>makes a mapping from the gene to each transcript</li>
  62    * <li>copies features from gene to transcript sequences</li>
  63    * <li>fetches the protein sequence for each transcript, maps and saves it as
  64    * a cross-reference</li>
  65    * <li>aligns each transcript against the gene sequence based on the position
  66    * mappings</li>
  67    * </ul>
  68    */
  69   @Override
  70   public AlignmentI getSequenceRecords(String query) throws Exception
  71   {
  72     List<String> transcriptsWanted = null;
  73
  74     if (isTranscriptIdentifier(query))
  75     {
  76       transcriptsWanted = Arrays.asList(query
  77               .split(getAccessionSeparator()));
  78       query = getGeneForTranscript(query);
  79       if (query == null)
  80       {
  81         return null;
  82       }
  83     }
  84
  85     AlignmentI al = super.getSequenceRecords(query);
  86     if (al.getHeight() > 0)
  87     {
  88       getTranscripts(al, query, transcriptsWanted);
  89     }
  90
  91     return al;
  92   }
  93
  94   /**
  95    * Gets the parent gene identifier for a given transcript identifier, by
  96    * retrieving 'transcript' features overlapping the transcript, and finding
  97    * the Parent property of the feature whose id is the given identifier.
  98    *
  99    * @param query
 100    * @return
 101    */
 102   protected String getGeneForTranscript(String transcriptId)
 103   {
 104     String geneId = null;
 105
 106     /*
 107      * reduce multiple transcripts (e.g. from Uniprot x-ref) to the first
 108      * one only as representative (they should all have the same gene)
 109      */
 110     transcriptId = transcriptId.split(getAccessionSeparator())[0];
 111
 112     try
 113     {
 114       EnsemblFeatureType[] geneFeature = new EnsemblFeatureType[] { EnsemblFeatureType.transcript };
 115       AlignmentI al = new EnsemblFeatures().getSequenceRecords(
 116               transcriptId, geneFeature);
 117       if (al != null && al.getHeight() > 0)
 118       {
 119         SequenceFeature[] sfs = al.getSequenceAt(0).getSequenceFeatures();
 120         if (sfs != null)
 121         {
 122           for (SequenceFeature sf : sfs)
 123           {
 124             if (transcriptId.equals(getTranscriptId(sf)))
 125             {
 126               String parent = (String) sf.getValue(PARENT);
 127               if (parent != null && parent.startsWith(GENE_PREFIX))
 128               {
 129                 geneId = parent.substring(5);
 130               }
 131               break;
 132             }
 133           }
 134         }
 135       }
 136       return geneId;
 137     } catch (IOException e)
 138     {
 139       System.err.println("Error retrieving gene id for " + transcriptId
 140               + ": " + e.getMessage());
 141       return null;
 142     }
 143   }
 144
 145   /**
 146    * Constructs all transcripts for the gene, as identified by "transcript"
 147    * features whose Parent is the requested gene. The coding transcript
 148    * sequences (i.e. with introns omitted) are added to the alignment.
 149    *
 150    * @param al
 151    * @param accId
 152    * @param transcriptsWanted
 153    *          optional list of transcript ids to filter by
 154    * @throws Exception
 155    */
 156   protected void getTranscripts(AlignmentI al, String accId,
 157           List<String> transcriptsWanted)
 158           throws Exception
 159   {
 160     SequenceI gene = al.getSequenceAt(0);
 161     List<SequenceFeature> transcriptFeatures = getTranscriptFeatures(accId,
 162             gene, transcriptsWanted);
 163
 164     for (SequenceFeature transcriptFeature : transcriptFeatures)
 165     {
 166       makeTranscript(transcriptFeature, al, gene);
 167     }
 168   }
 169
 170   /**
 171    * Constructs a spliced transcript sequence by finding 'exon' features for the
 172    * given id (or failing that 'CDS'). Copies features on to the new sequence.
 173    * 'Aligns' the new sequence against the gene sequence by padding with gaps,
 174    * and adds it to the alignment.
 175    *
 176    * @param transcriptFeature
 177    * @param al
 178    *          the alignment to which to add the new sequence
 179    * @param gene
 180    *          the parent gene sequence, with features
 181    * @return
 182    */
 183   SequenceI makeTranscript(SequenceFeature transcriptFeature,
 184           AlignmentI al, SequenceI gene)
 185   {
 186     String accId = getTranscriptId(transcriptFeature);
 187     if (accId == null)
 188     {
 189       return null;
 190     }
 191
 192     /*
 193      * NB we are mapping from gene sequence (not genome), so do not
 194      * need to check for reverse strand (gene and transcript sequences
 195      * are in forward sense)
 196      */
 197
 198     /*
 199      * make a gene-length sequence filled with gaps
 200      * we will fill in the bases for transcript regions
 201      */
 202     char[] seqChars = new char[gene.getLength()];
 203     Arrays.fill(seqChars, al.getGapCharacter());
 204
 205     /*
 206      * look for exon features of the transcript, failing that for CDS
 207      * (for example ENSG00000124610 has 1 CDS but no exon features)
 208      */
 209     String parentId = "transcript:" + accId;
 210     List<SequenceFeature> splices = findFeatures(gene,
 211             SequenceOntologyI.EXON, parentId);
 212     if (splices.isEmpty())
 213     {
 214       splices = findFeatures(gene, SequenceOntologyI.CDS, parentId);
 215     }
 216
 217     int transcriptLength = 0;
 218     final char[] geneChars = gene.getSequence();
 219     int offset = gene.getStart(); // to convert to 0-based positions
 220     List<int[]> mappedFrom = new ArrayList<int[]>();
 221
 222     for (SequenceFeature sf : splices)
 223     {
 224       int start = sf.getBegin() - offset;
 225       int end = sf.getEnd() - offset;
 226       int spliceLength = end - start + 1;
 227       System.arraycopy(geneChars, start, seqChars, start, spliceLength);
 228       transcriptLength += spliceLength;
 229       mappedFrom.add(new int[] { sf.getBegin(), sf.getEnd() });
 230     }
 231
 232     Sequence transcript = new Sequence(accId, seqChars, 1, transcriptLength);
 233     String geneName = (String) transcriptFeature.getValue(NAME);
 234     if (geneName != null)
 235     {
 236       transcript.setDescription(geneName);
 237     }
 238     transcript.createDatasetSequence();
 239
 240     al.addSequence(transcript);
 241
 242     /*
 243      * transfer features to the new sequence; we use EnsemblCdna to do this,
 244      * to filter out unwanted features types (see method retainFeature)
 245      */
 246     List<int[]> mapTo = new ArrayList<int[]>();
 247     mapTo.add(new int[] { 1, transcriptLength });
 248     MapList mapping = new MapList(mappedFrom, mapTo, 1, 1);
 249     new EnsemblCdna().transferFeatures(gene.getSequenceFeatures(),
 250             transcript.getDatasetSequence(), mapping, parentId);
 251
 252     /*
 253      * and finally fetch the protein product and save as a cross-reference
 254      */
 255     new EnsemblCdna().addProteinProduct(transcript);
 256
 257     return transcript;
 258   }
 259
 260   /**
 261    * Returns the 'transcript_id' property of the sequence feature (or null)
 262    *
 263    * @param feature
 264    * @return
 265    */
 266   protected String getTranscriptId(SequenceFeature feature)
 267   {
 268     return (String) feature.getValue("transcript_id");
 269   }
 270
 271   /**
 272    * Returns a list of the transcript features on the sequence whose Parent is
 273    * the gene for the accession id.
 274    *
 275    * @param accId
 276    * @param geneSequence
 277    * @param transcriptsWanted
 278    *          optional list of ids to filter on
 279    * @return
 280    */
 281   protected List<SequenceFeature> getTranscriptFeatures(String accId,
 282           SequenceI geneSequence, List<String> transcriptsWanted)
 283   {
 284     List<SequenceFeature> transcriptFeatures = new ArrayList<SequenceFeature>();
 285
 286     String parentIdentifier = GENE_PREFIX + accId;
 287     SequenceFeature[] sfs = geneSequence.getSequenceFeatures();
 288
 289     if (sfs != null)
 290     {
 291       for (SequenceFeature sf : sfs)
 292       {
 293         if (isTranscript(sf.getType()))
 294         {
 295           if (transcriptsWanted != null)
 296           {
 297             String transcriptId = (String) sf.getValue("transcript_id");
 298             if (!transcriptsWanted.contains(transcriptId))
 299             {
 300               // continue;
 301             }
 302           }
 303           String parent = (String) sf.getValue(PARENT);
 304           if (parentIdentifier.equals(parent))
 305           {
 306             transcriptFeatures.add(sf);
 307           }
 308         }
 309       }
 310     }
 311
 312     return transcriptFeatures;
 313   }
 314
 315   @Override
 316   public String getDescription()
 317   {
 318     return "Fetches all transcripts and variant features for a gene or transcript";
 319   }
 320
 321   /**
 322    * Default test query is a gene id (can also enter a transcript id)
 323    */
 324   @Override
 325   public String getTestQuery()
 326   {
 327     return "ENSG00000157764"; // BRAF, 5 transcripts, reverse strand
 328     // ENSG00000090266 // NDUFB2, 15 transcripts, forward strand
 329     // ENSG00000101812 // H2BFM histone, 3 transcripts, forward strand
 330     // ENSG00000123569 // H2BFWT histone, 2 transcripts, reverse strand
 331   }
 332
 333   /**
 334    * Answers true for a feature of type 'gene' (or a sub-type of gene in the
 335    * Sequence Ontology), whose ID is the accession we are retrieving
 336    */
 337   @Override
 338   protected boolean identifiesSequence(SequenceFeature sf, String accId)
 339   {
 340     if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
 341             SequenceOntologyI.GENE))
 342     {
 343       String id = (String) sf.getValue(ID);
 344       if ((GENE_PREFIX + accId).equals(id))
 345       {
 346         return true;
 347       }
 348     }
 349     return false;
 350   }
 351
 352   /**
 353    * Answers true unless feature type is 'gene', or 'transcript' with a parent
 354    * which is a different gene. We need the gene features to identify the range,
 355    * but it is redundant information on the gene sequence. Checking the parent
 356    * allows us to drop transcript features which belong to different
 357    * (overlapping) genes.
 358    */
 359   @Override
 360   protected boolean retainFeature(SequenceFeature sf, String accessionId)
 361   {
 362     if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
 363             SequenceOntologyI.GENE))
 364     {
 365       return false;
 366     }
 367
 368     if (isTranscript(sf.getType()))
 369     {
 370       String parent = (String) sf.getValue(PARENT);
 371       if (!(GENE_PREFIX + accessionId).equals(parent))
 372       {
 373         return false;
 374       }
 375     }
 376     return true;
 377   }
 378
 379   /**
 380    * Answers false. This allows an optimisation - a single 'gene' feature is all
 381    * that is needed to identify the positions of the gene on the genomic
 382    * sequence.
 383    */
 384   @Override
 385   protected boolean isSpliceable()
 386   {
 387     return false;
 388   }
 389
 390   @Override
 391   protected List<String> getCrossReferenceDatabases()
 392   {
 393     // found these for ENSG00000157764 on 30/01/2016:
 394     // return new String[] {"Vega_gene", "OTTG", "ENS_LRG_gene", "ArrayExpress",
 395     // "EntrezGene", "HGNC", "MIM_GENE", "MIM_MORBID", "WikiGene"};
 396     return super.getCrossReferenceDatabases();
 397   }
 398
 399   /**
 400    * Override to do nothing as Ensembl doesn't return a protein sequence for a
 401    * gene identifier
 402    */
 403   @Override
 404   protected void addProteinProduct(SequenceI querySeq)
 405   {
 406   }
 407
 408   @Override
 409   public Regex getAccessionValidator()
 410   {
 411     return ACCESSION_REGEX;
 412   }
 413
 414 }