X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblGene.java;h=75a7154595a919bf1f7454224813c8e50a519e22;hb=fdde9a078d7bdb46ed9fb7fe115ea83c84a19c81;hp=bbdab55e316ef518dce4bf86106e8949f019d348;hpb=f9b80711054b61e8c2257488a1637e15616cb9c9;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index bbdab55..75a7154 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -1,15 +1,49 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.ext.ensembl; -import jalview.datamodel.AlignedCodonFrame; +import jalview.api.FeatureColourI; +import jalview.api.FeatureSettingsModelI; import jalview.datamodel.AlignmentI; +import jalview.datamodel.GeneLociI; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.io.gff.SequenceOntology; +import jalview.datamodel.features.SequenceFeatures; +import jalview.io.gff.SequenceOntologyFactory; +import jalview.io.gff.SequenceOntologyI; +import jalview.schemes.FeatureColour; +import jalview.schemes.FeatureSettingsAdapter; import jalview.util.MapList; +import jalview.util.Platform; +import java.awt.Color; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; +import com.stevesoft.pat.Regex; + /** * A class that fetches genomic sequence and all transcripts for an Ensembl gene * @@ -17,15 +51,41 @@ import java.util.List; */ public class EnsemblGene extends EnsemblSeqProxy { + /* + * accepts anything as we will attempt lookup of gene or + * transcript id or gene name + */ + private static final Regex ACCESSION_REGEX = new Regex(".*"); + private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { EnsemblFeatureType.gene, EnsemblFeatureType.transcript, EnsemblFeatureType.exon, EnsemblFeatureType.cds, EnsemblFeatureType.variation }; + private static final String CHROMOSOME = "chromosome"; + + /** + * Default constructor (to use rest.ensembl.org) + */ + public EnsemblGene() + { + super(); + } + + /** + * Constructor given the target domain to fetch data from + * + * @param d + */ + public EnsemblGene(String d) + { + super(d); + } + @Override public String getDbName() { - return "ENSEMBL (GENE)"; + return "ENSEMBL"; } @Override @@ -40,12 +100,26 @@ public class EnsemblGene extends EnsemblSeqProxy return EnsemblSeqType.GENOMIC; } + @Override + protected String getObjectType() + { + return OBJECT_TYPE_GENE; + } + /** - * Builds an alignment of all transcripts for the requested gene: + * Returns an alignment containing the gene(s) for the given gene or + * transcript identifier, or external identifier (e.g. Uniprot id). If given a + * gene name or external identifier, returns any related gene sequences found + * for model organisms. If only a single gene is queried for, then its + * transcripts are also retrieved and added to the alignment.
+ * Method: * + * + * @param query + * a single gene or transcript identifier or gene name + * @return an alignment containing a gene, and possibly transcripts, or null */ @Override public AlignmentI getSequenceRecords(String query) throws Exception { - AlignmentI al = super.getSequenceRecords(query); - if (al.getHeight() > 0) + /* + * convert to a non-duplicated list of gene identifiers + */ + List geneIds = getGeneIds(query); + AlignmentI al = null; + for (String geneId : geneIds) { - getTranscripts(al, query); - } + /* + * fetch the gene sequence(s) with features and xrefs + */ + AlignmentI geneAlignment = super.getSequenceRecords(geneId); + if (geneAlignment == null) + { + continue; + } + if (geneAlignment.getHeight() == 1) + { + // ensure id has 'correct' case for the Ensembl identifier + geneId = geneAlignment.getSequenceAt(0).getName(); + findGeneLoci(geneAlignment.getSequenceAt(0), geneId); + getTranscripts(geneAlignment, geneId); + } + if (al == null) + { + al = geneAlignment; + } + else + { + al.append(geneAlignment); + } + } return al; } /** - * Find and fetch all transcripts for the gene, as identified by "transcript" - * features whose Parent is the requested gene + * Calls the /lookup/id REST service, parses the response for gene + * coordinates, and if successful, adds these to the sequence. If this fails, + * fall back on trying to parse the sequence description in case it is in + * Ensembl-gene format e.g. chromosome:GRCh38:17:45051610:45109016:1. + * + * @param seq + * @param geneId + */ + void findGeneLoci(SequenceI seq, String geneId) + { + GeneLociI geneLoci = new EnsemblLookup(getDomain()).getGeneLoci(geneId); + if (geneLoci != null) + { + seq.setGeneLoci(geneLoci.getSpeciesId(), geneLoci.getAssemblyId(), + geneLoci.getChromosomeId(), geneLoci.getMapping()); + } + else + { + parseChromosomeLocations(seq); + } + } + + /** + * Parses and saves fields of an Ensembl-style description e.g. + * chromosome:GRCh38:17:45051610:45109016:1 + * + * @param seq + */ + boolean parseChromosomeLocations(SequenceI seq) + { + String description = seq.getDescription(); + if (description == null) + { + return false; + } + String[] tokens = description.split(":"); + if (tokens.length == 6 && tokens[0].startsWith(CHROMOSOME)) + { + String ref = tokens[1]; + String chrom = tokens[2]; + try + { + int chStart = Integer.parseInt(tokens[3]); + int chEnd = Integer.parseInt(tokens[4]); + boolean forwardStrand = "1".equals(tokens[5]); + String species = ""; // not known here + int[] from = new int[] { seq.getStart(), seq.getEnd() }; + int[] to = new int[] { forwardStrand ? chStart : chEnd, + forwardStrand ? chEnd : chStart }; + MapList map = new MapList(from, to, 1, 1); + seq.setGeneLoci(species, ref, chrom, map); + return true; + } catch (NumberFormatException e) + { + System.err.println("Bad integers in description " + description); + } + } + return false; + } + + /** + * Converts a query, which may contain one or more gene, transcript, or + * external (to Ensembl) identifiers, into a non-redundant list of gene + * identifiers. + * + * @param accessions + * @return + */ + List getGeneIds(String accessions) + { + List geneIds = new ArrayList<>(); + + for (String acc : accessions.split(getAccessionSeparator())) + { + /* + * First try lookup as an Ensembl (gene or transcript) identifier + */ + String geneId = new EnsemblLookup(getDomain()).getGeneId(acc); + if (geneId != null) + { + if (!geneIds.contains(geneId)) + { + geneIds.add(geneId); + } + } + else + { + /* + * if given a gene or other external name, lookup and fetch + * the corresponding gene for all model organisms + */ + List ids = new EnsemblSymbol(getDomain(), getDbSource(), + getDbVersion()).getGeneIds(acc); + for (String id : ids) + { + if (!geneIds.contains(id)) + { + geneIds.add(id); + } + } + } + } + return geneIds; + } + + /** + * Constructs all transcripts for the gene, as identified by "transcript" + * features whose Parent is the requested gene. The coding transcript + * sequences (i.e. with introns omitted) are added to the alignment. * * @param al * @param accId @@ -79,136 +290,405 @@ public class EnsemblGene extends EnsemblSeqProxy throws Exception { SequenceI gene = al.getSequenceAt(0); - List transcriptIds = getTranscriptIds(accId, gene); + List transcriptFeatures = getTranscriptFeatures(accId, + gene); - // TODO: could just use features and genomic sequence - // to generate the transcript sequences - faster - // could also grab "Name" as transcript description (gene name) - for (String transcriptId : transcriptIds) + for (SequenceFeature transcriptFeature : transcriptFeatures) { - /* - * fetch and map the transcript sequence; we can pass in the gene - * sequence with features marked to save fetching it again - */ - EnsemblCdna cdnaFetcher = new EnsemblCdna(); - AlignmentI al2 = cdnaFetcher.getSequenceRecords(transcriptId, - gene); - for (SequenceI seq : al2.getSequences()) - { - /* - * build mapping from gene sequence to transcript - */ - MapList mapping = cdnaFetcher.getGenomicRanges(gene, transcriptId, - seq.getStart()); + makeTranscript(transcriptFeature, al, gene); + } - /* - * align the transcript to the gene - */ - AlignedCodonFrame acf = new AlignedCodonFrame(); - acf.addMap(gene, seq, mapping); - char gap = al.getGapCharacter(); - // AlignmentUtils.alignSequenceAs(seq, gene, acf, String.valueOf(gap), - // gap, false, false); + clearGeneFeatures(gene); + } - al.addSequence(seq); - } + /** + * Remove unwanted features (transcript, exon, CDS) from the gene sequence + * after we have used them to derive transcripts and transfer features + * + * @param gene + */ + protected void clearGeneFeatures(SequenceI gene) + { + /* + * Note we include NMD_transcript_variant here because it behaves like + * 'transcript' in Ensembl, although strictly speaking it is not + * (it is a sub-type of sequence_variant) + */ + String[] soTerms = new String[] { + SequenceOntologyI.NMD_TRANSCRIPT_VARIANT, + SequenceOntologyI.TRANSCRIPT, SequenceOntologyI.EXON, + SequenceOntologyI.CDS }; + List sfs = gene.getFeatures() + .getFeaturesByOntology(soTerms); + for (SequenceFeature sf : sfs) + { + gene.deleteFeature(sf); } } /** - * Returns a list of the ids of transcript features on the sequence whose - * Parent is the gene for the accession id + * Constructs a spliced transcript sequence by finding 'exon' features for the + * given id (or failing that 'CDS'). Copies features on to the new sequence. + * 'Aligns' the new sequence against the gene sequence by padding with gaps, + * and adds it to the alignment. * - * @param accId - * @param geneSequence + * @param transcriptFeature + * @param al + * the alignment to which to add the new sequence + * @param gene + * the parent gene sequence, with features * @return */ - protected List getTranscriptIds(String accId, SequenceI geneSequence) + SequenceI makeTranscript(SequenceFeature transcriptFeature, AlignmentI al, + SequenceI gene) { - SequenceOntology so = SequenceOntology.getInstance(); - List transcriptIds = new ArrayList(); + String accId = getTranscriptId(transcriptFeature); + if (accId == null) + { + return null; + } /* - * scan for transcript features belonging to our gene; - * also remove any which belong to other genes + * NB we are mapping from gene sequence (not genome), so do not + * need to check for reverse strand (gene and transcript sequences + * are in forward sense) */ - SequenceFeature[] sfs = geneSequence.getSequenceFeatures(); - List keptFeatures = new ArrayList(); - boolean featureDropped = false; - String parentIdentifier = "gene:" + accId; - for (SequenceFeature sf : sfs) + + /* + * make a gene-length sequence filled with gaps + * we will fill in the bases for transcript regions + */ + char[] seqChars = new char[gene.getLength()]; + Arrays.fill(seqChars, al.getGapCharacter()); + + /* + * look for exon features of the transcript, failing that for CDS + * (for example ENSG00000124610 has 1 CDS but no exon features) + */ + String parentId = accId; + List splices = findFeatures(gene, + SequenceOntologyI.EXON, parentId); + if (splices.isEmpty()) + { + splices = findFeatures(gene, SequenceOntologyI.CDS, parentId); + } + SequenceFeatures.sortFeatures(splices, true); + + int transcriptLength = 0; + final char[] geneChars = gene.getSequence(); + int offset = gene.getStart(); // to convert to 0-based positions + List mappedFrom = new ArrayList<>(); + + for (SequenceFeature sf : splices) { - if (so.isA(sf.getType(), SequenceOntology.TRANSCRIPT)) + int start = sf.getBegin() - offset; + int end = sf.getEnd() - offset; + int spliceLength = end - start + 1; + System.arraycopy(geneChars, start, seqChars, start, spliceLength); + transcriptLength += spliceLength; + mappedFrom.add(new int[] { sf.getBegin(), sf.getEnd() }); + } + + Sequence transcript = new Sequence(accId, seqChars, 1, + transcriptLength); + + /* + * Ensembl has gene name as transcript Name + * EnsemblGenomes doesn't, but has a url-encoded description field + */ + String description = transcriptFeature.getDescription(); + if (description == null) + { + description = (String) transcriptFeature.getValue(DESCRIPTION); + } + if (description != null) + { + try { - String parent = (String) sf.getValue(PARENT); - if (parentIdentifier.equals(parent)) - { - transcriptIds.add((String) sf.getValue("transcript_id")); - keptFeatures.add(sf); - } - else - { - featureDropped = true; - } - } - else + transcript.setDescription(URLDecoder.decode(description, "UTF-8")); + } catch (UnsupportedEncodingException e) { - keptFeatures.add(sf); + e.printStackTrace(); // as if } } - if (featureDropped) + transcript.createDatasetSequence(); + + al.addSequence(transcript); + + /* + * transfer features to the new sequence; we use EnsemblCdna to do this, + * to filter out unwanted features types (see method retainFeature) + */ + List mapTo = new ArrayList<>(); + mapTo.add(new int[] { 1, transcriptLength }); + MapList mapping = new MapList(mappedFrom, mapTo, 1, 1); + EnsemblCdna cdna = new EnsemblCdna(getDomain()); + cdna.transferFeatures(gene.getFeatures().getPositionalFeatures(), + transcript.getDatasetSequence(), mapping, parentId); + + mapTranscriptToChromosome(transcript, gene, mapping); + + /* + * fetch and save cross-references + */ + cdna.getCrossReferences(transcript); + + /* + * and finally fetch the protein product and save as a cross-reference + */ + cdna.addProteinProduct(transcript); + + return transcript; + } + + /** + * If the gene has a mapping to chromosome coordinates, derive the transcript + * chromosome regions and save on the transcript sequence + * + * @param transcript + * @param gene + * @param mapping + * the mapping from gene to transcript positions + */ + protected void mapTranscriptToChromosome(SequenceI transcript, + SequenceI gene, MapList mapping) + { + GeneLociI loci = gene.getGeneLoci(); + if (loci == null) + { + return; + } + + MapList geneMapping = loci.getMapping(); + + List exons = mapping.getFromRanges(); + List transcriptLoci = new ArrayList<>(); + + for (int[] exon : exons) + { + transcriptLoci.add(geneMapping.locateInTo(exon[0], exon[1])); + } + + List transcriptRange = Arrays + .asList(new int[] + { transcript.getStart(), transcript.getEnd() }); + MapList mapList = new MapList(transcriptRange, transcriptLoci, 1, 1); + + transcript.setGeneLoci(loci.getSpeciesId(), loci.getAssemblyId(), + loci.getChromosomeId(), mapList); + } + + /** + * Returns the 'transcript_id' property of the sequence feature (or null) + * + * @param feature + * @return + */ + protected String getTranscriptId(SequenceFeature feature) + { + return (String) feature.getValue(JSON_ID); + } + + /** + * Returns a list of the transcript features on the sequence whose Parent is + * the gene for the accession id. + *

+ * Transcript features are those of type "transcript", or any of its sub-types + * in the Sequence Ontology e.g. "mRNA", "processed_transcript". We also + * include "NMD_transcript_variant", because this type behaves like a + * transcript identifier in Ensembl, although strictly speaking it is not in + * the SO. + * + * @param accId + * @param geneSequence + * @return + */ + protected List getTranscriptFeatures(String accId, + SequenceI geneSequence) + { + List transcriptFeatures = new ArrayList<>(); + + String parentIdentifier = accId; + + List sfs = geneSequence.getFeatures() + .getFeaturesByOntology(SequenceOntologyI.TRANSCRIPT); + sfs.addAll(geneSequence.getFeatures().getPositionalFeatures( + SequenceOntologyI.NMD_TRANSCRIPT_VARIANT)); + + for (SequenceFeature sf : sfs) { - geneSequence.getDatasetSequence().setSequenceFeatures( - keptFeatures.toArray(new SequenceFeature[keptFeatures - .size()])); + String parent = (String) sf.getValue(PARENT); + if (parentIdentifier.equalsIgnoreCase(parent)) + { + transcriptFeatures.add(sf); + } } - return transcriptIds; + + return transcriptFeatures; } @Override public String getDescription() { - return "Fetches all transcripts and variant features for a gene"; + return "Fetches all transcripts and variant features for a gene or transcript"; } /** - * Default test query is a transcript + * Default test query is a gene id (can also enter a transcript id) */ @Override public String getTestQuery() { - return "ENSG00000157764"; // reverse strand - // ENSG00000090266 // forward strand + return Platform.isJS() ? "ENSG00000123569" : "ENSG00000157764"; + // ENSG00000123569 // H2BFWT histone, 2 transcripts, reverse strand + // ENSG00000157764 // BRAF, 5 transcripts, reverse strand + // ENSG00000090266 // NDUFB2, 15 transcripts, forward strand + // ENSG00000101812 // H2BFM histone, 3 transcripts, forward strand } /** - * Answers true for a feature of type 'gene' (or a sub-type of gene in the - * Sequence Ontology), whose ID is the accession we are retrieving + * Answers a list of sequence features (if any) whose type is 'gene' (or a + * subtype of gene in the Sequence Ontology), and whose ID is the accession we + * are retrieving */ @Override - protected boolean identifiesSequence(SequenceFeature sf, String accId) + protected List getIdentifyingFeatures(SequenceI seq, + String accId) { - if (SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.GENE)) + List result = new ArrayList<>(); + List sfs = seq.getFeatures() + .getFeaturesByOntology(SequenceOntologyI.GENE); + for (SequenceFeature sf : sfs) { - String id = (String) sf.getValue(ID); - if (("gene:" + accId).equals(id)) + String id = (String) sf.getValue(JSON_ID); + if (accId.equalsIgnoreCase(id)) { - return true; + result.add(sf); } } - return false; + return result; } /** - * Answers true unless feature type is 'gene'. We need the gene features to - * identify the range, but it is redundant information on the gene sequence. + * Answers true unless feature type is 'gene', or 'transcript' with a parent + * which is a different gene. We need the gene features to identify the range, + * but it is redundant information on the gene sequence. Checking the parent + * allows us to drop transcript features which belong to different + * (overlapping) genes. */ @Override protected boolean retainFeature(SequenceFeature sf, String accessionId) { - return !SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.GENE); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + String type = sf.getType(); + if (so.isA(type, SequenceOntologyI.GENE)) + { + return false; + } + if (isTranscript(type)) + { + String parent = (String) sf.getValue(PARENT); + if (!accessionId.equalsIgnoreCase(parent)) + { + return false; + } + } + return true; + } + + /** + * Override to do nothing as Ensembl doesn't return a protein sequence for a + * gene identifier + */ + @Override + protected void addProteinProduct(SequenceI querySeq) + { + } + + @Override + public Regex getAccessionValidator() + { + return ACCESSION_REGEX; + } + + /** + * Returns a descriptor for suitable feature display settings with + *

    + *
  • only exon or sequence_variant features (or their subtypes in the + * Sequence Ontology) visible
  • + *
  • variant features coloured red
  • + *
  • exon features coloured by label (exon name)
  • + *
  • variants displayed above (on top of) exons
  • + *
+ */ + @Override + public FeatureSettingsModelI getFeatureColourScheme() + { + return new FeatureSettingsAdapter() + { + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + + @Override + public boolean isFeatureHidden(String type) + { + return (!so.isA(type, SequenceOntologyI.EXON) + && !so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT)); + } + + @Override + public FeatureColourI getFeatureColour(String type) + { + if (so.isA(type, SequenceOntologyI.EXON)) + { + return new FeatureColour() + { + @Override + public boolean isColourByLabel() + { + return true; + } + }; + } + if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT)) + { + return new FeatureColour() + { + + @Override + public Color getColour() + { + return Color.RED; + } + }; + } + return null; + } + + /** + * order to render sequence_variant after exon after the rest + */ + @Override + public int compare(String feature1, String feature2) + { + if (so.isA(feature1, SequenceOntologyI.SEQUENCE_VARIANT)) + { + return +1; + } + if (so.isA(feature2, SequenceOntologyI.SEQUENCE_VARIANT)) + { + return -1; + } + if (so.isA(feature1, SequenceOntologyI.EXON)) + { + return +1; + } + if (so.isA(feature2, SequenceOntologyI.EXON)) + { + return -1; + } + return 0; + } + }; } }