X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblGene.java;h=24e3e955fe442972c96f99ae6257cfaa0fecfa14;hb=604cbee405a837565ba1a74aa9bddd62aed685ab;hp=73649b43727341000723abd11d59d546a045d991;hpb=ef9282b464dc189faf9ce40a4b7420a204266668;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index 73649b4..24e3e95 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -1,14 +1,40 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see <http://www.gnu.org/licenses/>. + * The Jalview Authors are detailed in the 'AUTHORS' file. 
+ */ package jalview.ext.ensembl; +import jalview.api.FeatureColourI; +import jalview.api.FeatureSettingsModelI; import jalview.datamodel.AlignmentI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; +import jalview.schemes.FeatureColour; +import jalview.schemes.FeatureSettingsAdapter; import jalview.util.MapList; -import java.io.IOException; +import java.awt.Color; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -24,19 +50,39 @@ public class EnsemblGene extends EnsemblSeqProxy { private static final String GENE_PREFIX = "gene:"; - // TODO modify to accept other species e.g. ENSMUSGnnn - private static final Regex ACCESSION_REGEX = new Regex( - "(ENSG|ENST)[0-9]{11}$"); + /* + * accepts anything as we will attempt lookup of gene or + * transcript id or gene name + */ + private static final Regex ACCESSION_REGEX = new Regex(".*"); private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { EnsemblFeatureType.gene, EnsemblFeatureType.transcript, EnsemblFeatureType.exon, EnsemblFeatureType.cds, EnsemblFeatureType.variation }; + /** + * Default constructor (to use rest.ensembl.org) + */ + public EnsemblGene() + { + super(); + } + + /** + * Constructor given the target domain to fetch data from + * + * @param d + */ + public EnsemblGene(String d) + { + super(d); + } + @Override public String getDbName() { - return "ENSEMBL (GENE)"; + return "ENSEMBL"; } @Override @@ -52,8 +98,15 @@ public class EnsemblGene extends EnsemblSeqProxy } /** - * Builds an alignment of all transcripts for the requested gene: + * Returns an alignment containing the gene(s) for the given gene or + * transcript identifier, or external identifier (e.g. Uniprot id). 
If given a + * gene name or external identifier, returns any related gene sequences found + * for model organisms. If only a single gene is queried for, then its + * transcripts are also retrieved and added to the alignment.
+ * Method: * + * + * @param query + * a single gene or transcript identifier or gene name + * @return an alignment containing a gene, and possibly transcripts, or null */ @Override public AlignmentI getSequenceRecords(String query) throws Exception { - List transcriptsWanted = null; + /* + * convert to a non-duplicated list of gene identifiers + */ + List geneIds = getGeneIds(query); - if (isTranscriptIdentifier(query)) + AlignmentI al = null; + for (String geneId : geneIds) { - transcriptsWanted = Arrays.asList(query - .split(getAccessionSeparator())); - query = getGeneForTranscript(query); - if (query == null) + /* + * fetch the gene sequence(s) with features and xrefs + */ + AlignmentI geneAlignment = super.getSequenceRecords(geneId); + if (geneAlignment == null) { - return null; + continue; + } + if (geneAlignment.getHeight() == 1) + { + getTranscripts(geneAlignment, geneId); + } + if (al == null) + { + al = geneAlignment; + } + else + { + al.append(geneAlignment); } } - - AlignmentI al = super.getSequenceRecords(query); - if (al.getHeight() > 0) - { - getTranscripts(al, query, transcriptsWanted); - } - return al; } /** - * Gets the parent gene identifier for a given transcript identifier, by - * retrieving 'transcript' features overlapping the transcript, and finding - * the Parent property of the feature whose id is the given identifier. + * Converts a query, which may contain one or more gene or transcript + * identifiers, into a non-redundant list of gene identifiers. * - * @param query + * @param accessions * @return */ - protected String getGeneForTranscript(String transcriptId) + List getGeneIds(String accessions) { - String geneId = null; - - /* - * reduce multiple transcripts (e.g. 
from Uniprot x-ref) to the first - * one only as representative (they should all have the same gene) - */ - transcriptId = transcriptId.split(getAccessionSeparator())[0]; + List geneIds = new ArrayList(); - try + for (String acc : accessions.split(getAccessionSeparator())) { - EnsemblFeatureType[] geneFeature = new EnsemblFeatureType[] { EnsemblFeatureType.transcript }; - AlignmentI al = new EnsemblFeatures().getSequenceRecords( - transcriptId, geneFeature); - if (al != null && al.getHeight() > 0) + if (isGeneIdentifier(acc)) { - SequenceFeature[] sfs = al.getSequenceAt(0).getSequenceFeatures(); - if (sfs != null) + if (!geneIds.contains(acc)) { - for (SequenceFeature sf : sfs) + geneIds.add(acc); + } + } + + /* + * if given a transcript id, look up its gene parent + */ + else if (isTranscriptIdentifier(acc)) + { + String geneId = new EnsemblLookup(getDomain()).getParent(acc); + if (geneId != null && !geneIds.contains(geneId)) + { + geneIds.add(geneId); + } + } + + /* + * if given a gene or other external name, lookup and fetch + * the corresponding gene for all model organisms + */ + else + { + List ids = new EnsemblSymbol(getDomain(), getDbSource(), + getDbVersion()).getIds(acc); + for (String geneId : ids) + { + if (!geneIds.contains(geneId)) { - if (transcriptId.equals(getTranscriptId(sf))) - { - String parent = (String) sf.getValue(PARENT); - if (parent != null && parent.startsWith(GENE_PREFIX)) - { - geneId = parent.substring(5); - } - break; - } + geneIds.add(geneId); } } } - return geneId; - } catch (IOException e) + } + return geneIds; + } + + /** + * Attempts to get Ensembl stable identifiers for model organisms for a gene + * name by calling the xrefs symbol REST service to resolve the gene name. 
+ * + * @param query + * @return + */ + protected String getGeneIdentifiersForName(String query) + { + List ids = new EnsemblSymbol(getDomain(), getDbSource(), + getDbVersion()).getIds(query); + if (ids != null) { - System.err.println("Error retrieving gene id for " + transcriptId - + ": " + e.getMessage()); - return null; + for (String id : ids) + { + if (isGeneIdentifier(id)) + { + return id; + } + } } + return null; } /** @@ -149,22 +242,48 @@ public class EnsemblGene extends EnsemblSeqProxy * * @param al * @param accId - * @param transcriptsWanted - * optional list of transcript ids to filter by * @throws Exception */ - protected void getTranscripts(AlignmentI al, String accId, - List transcriptsWanted) + protected void getTranscripts(AlignmentI al, String accId) throws Exception { SequenceI gene = al.getSequenceAt(0); List transcriptFeatures = getTranscriptFeatures(accId, - gene, transcriptsWanted); + gene); for (SequenceFeature transcriptFeature : transcriptFeatures) { makeTranscript(transcriptFeature, al, gene); } + + clearGeneFeatures(gene); + } + + /** + * Remove unwanted features (transcript, exon, CDS) from the gene sequence + * after we have used them to derive transcripts and transfer features + * + * @param gene + */ + protected void clearGeneFeatures(SequenceI gene) + { + SequenceFeature[] sfs = gene.getSequenceFeatures(); + if (sfs != null) + { + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + List filtered = new ArrayList(); + for (SequenceFeature sf : sfs) + { + String type = sf.getType(); + if (!isTranscript(type) && !so.isA(type, SequenceOntologyI.EXON) + && !so.isA(type, SequenceOntologyI.CDS)) + { + filtered.add(sf); + } + } + gene.setSequenceFeatures(filtered + .toArray(new SequenceFeature[filtered.size()])); + } } /** @@ -230,10 +349,25 @@ public class EnsemblGene extends EnsemblSeqProxy } Sequence transcript = new Sequence(accId, seqChars, 1, transcriptLength); - String geneName = (String) transcriptFeature.getValue(NAME); - 
if (geneName != null) + + /* + * Ensembl has gene name as transcript Name + * EnsemblGenomes doesn't, but has a url-encoded description field + */ + String description = (String) transcriptFeature.getValue(NAME); + if (description == null) + { + description = (String) transcriptFeature.getValue(DESCRIPTION); + } + if (description != null) { - transcript.setDescription(geneName); + try + { + transcript.setDescription(URLDecoder.decode(description, "UTF-8")); + } catch (UnsupportedEncodingException e) + { + e.printStackTrace(); // as if + } } transcript.createDatasetSequence(); @@ -246,13 +380,19 @@ public class EnsemblGene extends EnsemblSeqProxy List mapTo = new ArrayList(); mapTo.add(new int[] { 1, transcriptLength }); MapList mapping = new MapList(mappedFrom, mapTo, 1, 1); - new EnsemblCdna().transferFeatures(gene.getSequenceFeatures(), + EnsemblCdna cdna = new EnsemblCdna(getDomain()); + cdna.transferFeatures(gene.getSequenceFeatures(), transcript.getDatasetSequence(), mapping, parentId); /* + * fetch and save cross-references + */ + cdna.getCrossReferences(transcript); + + /* * and finally fetch the protein product and save as a cross-reference */ - new EnsemblCdna().addProteinProduct(transcript); + cdna.addProteinProduct(transcript); return transcript; } @@ -274,12 +414,10 @@ public class EnsemblGene extends EnsemblSeqProxy * * @param accId * @param geneSequence - * @param transcriptsWanted - * optional list of ids to filter on * @return */ protected List getTranscriptFeatures(String accId, - SequenceI geneSequence, List transcriptsWanted) + SequenceI geneSequence) { List transcriptFeatures = new ArrayList(); @@ -292,14 +430,6 @@ public class EnsemblGene extends EnsemblSeqProxy { if (isTranscript(sf.getType())) { - if (transcriptsWanted != null) - { - String transcriptId = (String) sf.getValue("transcript_id"); - if (!transcriptsWanted.contains(transcriptId)) - { - // continue; - } - } String parent = (String) sf.getValue(PARENT); if 
(parentIdentifier.equals(parent)) { @@ -359,13 +489,13 @@ public class EnsemblGene extends EnsemblSeqProxy @Override protected boolean retainFeature(SequenceFeature sf, String accessionId) { - if (SequenceOntologyFactory.getInstance().isA(sf.getType(), - SequenceOntologyI.GENE)) + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + String type = sf.getType(); + if (so.isA(type, SequenceOntologyI.GENE)) { return false; } - - if (isTranscript(sf.getType())) + if (isTranscript(type)) { String parent = (String) sf.getValue(PARENT); if (!(GENE_PREFIX + accessionId).equals(parent)) @@ -387,15 +517,6 @@ public class EnsemblGene extends EnsemblSeqProxy return false; } - @Override - protected List getCrossReferenceDatabases() - { - // found these for ENSG00000157764 on 30/01/2016: - // return new String[] {"Vega_gene", "OTTG", "ENS_LRG_gene", "ArrayExpress", - // "EntrezGene", "HGNC", "MIM_GENE", "MIM_MORBID", "WikiGene"}; - return super.getCrossReferenceDatabases(); - } - /** * Override to do nothing as Ensembl doesn't return a protein sequence for a * gene identifier @@ -411,4 +532,84 @@ public class EnsemblGene extends EnsemblSeqProxy return ACCESSION_REGEX; } + /** + * Returns a descriptor for suitable feature display settings with + *
<ul>
+ * <li>only exon or sequence_variant features (or their subtypes in the
+ * Sequence Ontology) visible</li>
+ * <li>variant features coloured red</li>
+ * <li>exon features coloured by label (exon name)</li>
+ * <li>variants displayed above (on top of) exons</li>
+ * </ul>
+ */ + @Override + public FeatureSettingsModelI getFeatureColourScheme() + { + return new FeatureSettingsAdapter() + { + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + + @Override + public boolean isFeatureDisplayed(String type) + { + return (so.isA(type, SequenceOntologyI.EXON) || so.isA(type, + SequenceOntologyI.SEQUENCE_VARIANT)); + } + + @Override + public FeatureColourI getFeatureColour(String type) + { + if (so.isA(type, SequenceOntologyI.EXON)) + { + return new FeatureColour() + { + @Override + public boolean isColourByLabel() + { + return true; + } + }; + } + if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT)) + { + return new FeatureColour() + { + + @Override + public Color getColour() + { + return Color.RED; + } + }; + } + return null; + } + + /** + * order to render sequence_variant after exon after the rest + */ + @Override + public int compare(String feature1, String feature2) + { + if (so.isA(feature1, SequenceOntologyI.SEQUENCE_VARIANT)) + { + return +1; + } + if (so.isA(feature2, SequenceOntologyI.SEQUENCE_VARIANT)) + { + return -1; + } + if (so.isA(feature1, SequenceOntologyI.EXON)) + { + return +1; + } + if (so.isA(feature2, SequenceOntologyI.EXON)) + { + return -1; + } + return 0; + } + }; + } + }