/**
* Answers true unless the feature type is 'exon' (or a sub-type of exon in
* the Sequence Ontology). Exon features are only retrieved in order to
- * identify the exon sequence range, and are redundant information on the exon
+ * identify the exon sequence loci, and are redundant information on the exon
* sequence itself.
*/
@Override
- protected boolean retainFeature(String type)
+ protected boolean retainFeature(SequenceFeature sf, String accessionId)
{
- return !SequenceOntology.getInstance().isA(type, SequenceOntology.EXON);
+ if (SequenceOntology.getInstance().isA(sf.getType(),
+ SequenceOntology.EXON))
+ {
+ return false;
+ }
+ return super.retainFeature(sf, accessionId);
}
/**
* itself.
*/
@Override
- protected boolean retainFeature(String type)
+ protected boolean retainFeature(SequenceFeature sf, String accessionId)
{
- return !SequenceOntology.getInstance().isA(type, SequenceOntology.CDS);
+ if (SequenceOntology.getInstance().isA(sf.getType(),
+ SequenceOntology.CDS))
+ {
+ return false;
+ }
+ return super.retainFeature(sf, accessionId);
}
/**
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.io.gff.SequenceOntology;
+import jalview.util.MapList;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A class that fetches genomic sequence and all transcripts for an Ensembl gene
+ *
+ * @author gmcarstairs
+ */
+public class EnsemblGene extends EnsemblSeqProxy
+{
+ private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
+ EnsemblFeatureType.gene, EnsemblFeatureType.transcript,
+ EnsemblFeatureType.exon, EnsemblFeatureType.cds,
+ EnsemblFeatureType.variation };
+
+ @Override
+ public String getDbName()
+ {
+ return "ENSEMBL (GENE)";
+ }
+
+ @Override
+ protected EnsemblFeatureType[] getFeaturesToFetch()
+ {
+ return FEATURES_TO_FETCH;
+ }
+
+ @Override
+ protected EnsemblSeqType getSourceEnsemblType()
+ {
+ return EnsemblSeqType.GENOMIC;
+ }
+
+ /**
+ * Builds an alignment of all transcripts for the requested gene:
+ * <ul>
+ * <li>fetches the gene sequence</li>
+ * <li>fetches features on the sequence</li>
+ * <li>identifies "transcript" features whose Parent is the requested gene</li>
+ * <li>fetches the transcript sequence for each transcript</li>
+ * <li>makes a mapping from the gene to each transcript</li>
+ * <li>copies features from gene to transcript sequences</li>
+ * <li>fetches the protein sequence for each transcript, maps and saves it as
+ * a cross-reference</li>
+ * <li>aligns each transcript against the gene sequence based on the position
+ * mappings</li>
+ * </ul>
+ */
+ @Override
+ public AlignmentI getSequenceRecords(String query) throws Exception
+ {
+ AlignmentI al = super.getSequenceRecords(query);
+ if (al.getHeight() > 0)
+ {
+ getTranscripts(al, query);
+ }
+
+ return al;
+ }
+
+ /**
+ * Find and fetch all transcripts for the gene, as identified by "transcript"
+ * features whose Parent is the requested gene
+ *
+ * @param al
+ * @param accId
+ * @throws Exception
+ */
+ protected void getTranscripts(AlignmentI al, String accId)
+ throws Exception
+ {
+ SequenceI gene = al.getSequenceAt(0);
+ List<String> transcriptIds = getTranscriptIds(accId, gene);
+
+ // TODO: could just use features and genomic sequence
+ // to generate the transcript sequences - faster
+ // could also grab "Name" as transcript description (gene name)
+ for (String transcriptId : transcriptIds)
+ {
+ /*
+ * fetch and map the transcript sequence; we can pass in the gene
+ * sequence with features marked to save fetching it again
+ */
+ EnsemblCdna cdnaFetcher = new EnsemblCdna();
+ AlignmentI al2 = cdnaFetcher.getSequenceRecords(transcriptId,
+ gene);
+ for (SequenceI seq : al2.getSequences())
+ {
+ /*
+ * build mapping from gene sequence to transcript
+ */
+ MapList mapping = cdnaFetcher.getGenomicRanges(gene, transcriptId,
+ seq.getStart());
+
+ /*
+ * align the transcript to the gene
+ */
+ AlignedCodonFrame acf = new AlignedCodonFrame();
+ acf.addMap(gene, seq, mapping);
+ char gap = al.getGapCharacter();
+ // AlignmentUtils.alignSequenceAs(seq, gene, acf, String.valueOf(gap),
+ // gap, false, false);
+
+ al.addSequence(seq);
+ }
+ }
+ }
+
+ /**
+ * Returns a list of the ids of transcript features on the sequence whose
+ * Parent is the gene for the accession id
+ *
+ * @param accId
+ * @param geneSequence
+ * @return
+ */
+ protected List<String> getTranscriptIds(String accId, SequenceI geneSequence)
+ {
+ SequenceOntology so = SequenceOntology.getInstance();
+ List<String> transcriptIds = new ArrayList<String>();
+
+ /*
+ * scan for transcript features belonging to our gene;
+ * also remove any which belong to other genes
+ */
+ SequenceFeature[] sfs = geneSequence.getSequenceFeatures();
+ List<SequenceFeature> keptFeatures = new ArrayList<SequenceFeature>();
+ boolean featureDropped = false;
+ String parentIdentifier = "gene:" + accId;
+ for (SequenceFeature sf : sfs)
+ {
+ if (so.isA(sf.getType(), SequenceOntology.TRANSCRIPT))
+ {
+ String parent = (String) sf.getValue(PARENT);
+ if (parentIdentifier.equals(parent))
+ {
+ transcriptIds.add((String) sf.getValue("transcript_id"));
+ keptFeatures.add(sf);
+ }
+ else
+ {
+ featureDropped = true;
+ }
+ }
+ else
+ {
+ keptFeatures.add(sf);
+ }
+ }
+ if (featureDropped)
+ {
+ geneSequence.getDatasetSequence().setSequenceFeatures(
+ keptFeatures.toArray(new SequenceFeature[keptFeatures
+ .size()]));
+ }
+ return transcriptIds;
+ }
+
+ @Override
+ public String getDescription()
+ {
+ return "Fetches all transcripts and variant features for a gene";
+ }
+
+ /**
+ * Default test query is a transcript
+ */
+ @Override
+ public String getTestQuery()
+ {
+ return "ENSG00000157764"; // reverse strand
+ // ENSG00000090266 // forward strand
+ }
+
+ /**
+ * Answers true for a feature of type 'gene' (or a sub-type of gene in the
+ * Sequence Ontology), whose ID is the accession we are retrieving
+ */
+ @Override
+ protected boolean identifiesSequence(SequenceFeature sf, String accId)
+ {
+ if (SequenceOntology.getInstance().isA(sf.getType(),
+ SequenceOntology.GENE))
+ {
+ String id = (String) sf.getValue(ID);
+ if (("gene:" + accId).equals(id))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Answers true unless feature type is 'gene'. We need the gene features to
+ * identify the range, but it is redundant information on the gene sequence.
+ */
+ @Override
+ protected boolean retainFeature(SequenceFeature sf, String accessionId)
+ {
+ return !SequenceOntology.getInstance().isA(sf.getType(),
+ SequenceOntology.GENE);
+ }
+
+}
* redundant information on the transcript sequence itself.
*/
@Override
- protected boolean retainFeature(String type)
+ protected boolean retainFeature(SequenceFeature sf, String accessionId)
{
- return !SequenceOntology.getInstance().isA(type,
- SequenceOntology.TRANSCRIPT);
+ if (SequenceOntology.getInstance().isA(sf.getType(),
+ SequenceOntology.TRANSCRIPT))
+ {
+ return false;
+ }
+ return super.retainFeature(sf, accessionId);
}
/**
}
@Override
- public boolean useGetRequest()
+ protected boolean useGetRequest()
{
return true;
}
* describes the required encoding of the response.
*/
@Override
- public String getRequestMimeType()
+ protected String getRequestMimeType()
{
return "text/x-gff3";
}
* Returns the MIME type for GFF3.
*/
@Override
- public String getResponseMimeType()
+ protected String getResponseMimeType()
{
return "text/x-gff3";
}
* @return
* @throws IOException
*/
- public AlignmentI getSequenceRecords(String accId,
- EnsemblFeatureType[] features)
- throws IOException
+ protected AlignmentI getSequenceRecords(String accId,
+ EnsemblFeatureType[] features) throws IOException
{
featuresWanted = features;
return getSequenceRecords(accId);
*
* @return
*/
- public abstract boolean useGetRequest();
+ protected abstract boolean useGetRequest();
/**
* Return the desired value for the Content-Type request header
* @return
* @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers
*/
- public abstract String getRequestMimeType();
+ protected abstract String getRequestMimeType();
/**
* Return the desired value for the Accept request header
* @return
* @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers
*/
- public abstract String getResponseMimeType();
+ protected abstract String getResponseMimeType();
/**
* Tries to connect to Ensembl's REST 'ping' endpoint, and returns true if
* @return
* @throws IOException
*/
- public FileParse getSequenceReader(List<String> ids)
+ protected FileParse getSequenceReader(List<String> ids)
throws IOException
{
URL url = getUrl(ids);
*
* @return
*/
- public boolean isEnsemblAvailable()
+ protected boolean isEnsemblAvailable()
{
long now = System.currentTimeMillis();
boolean retest = now - lastCheck > RETEST_INTERVAL;
return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]);
}
- };
+ }
+
+ /*
+ * genomic sequence, with features retrieved from the REST overlap service
+ */
+ private SequenceI genomicSequence;
/**
* Constructor
/*
* get 'dummy' genomic sequence with exon, cds and variation features
*/
- EnsemblOverlap gffFetcher = new EnsemblOverlap();
- EnsemblFeatureType[] features = getFeaturesToFetch();
- AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
- features);
- if (geneFeatures.getHeight() > 0)
+ if (genomicSequence == null)
+ {
+ EnsemblOverlap gffFetcher = new EnsemblOverlap();
+ EnsemblFeatureType[] features = getFeaturesToFetch();
+ AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
+ features);
+ if (geneFeatures.getHeight() > 0)
+ {
+ /*
+ * transfer features to the query sequence
+ */
+ genomicSequence = geneFeatures.getSequenceAt(0);
+ }
+ }
+ if (genomicSequence != null)
{
- /*
- * transfer features to the query sequence
- */
- SequenceI genomicSequence = geneFeatures.getSequenceAt(0);
SequenceI querySeq = alignment.findName(accId);
- transferFeatures(accId, genomicSequence, querySeq);
+ if (transferFeatures(accId, genomicSequence, querySeq))
+ {
- /*
- * fetch and map protein product, and add it as a cross-reference
- * of the retrieved sequence
- */
- addProteinProduct(querySeq);
+ /*
+ * fetch and map protein product, and add it as a cross-reference
+ * of the retrieved sequence
+ */
+ addProteinProduct(querySeq);
+ }
}
} catch (IOException e)
{
}
@Override
- public boolean useGetRequest()
+ protected boolean useGetRequest()
{
return false;
}
@Override
- public String getRequestMimeType()
+ protected String getRequestMimeType()
{
return "application/json";
}
@Override
- public String getResponseMimeType()
+ protected String getResponseMimeType()
{
return "text/x-fasta";
}
* backwards (for negative strand). Aborts and returns null if both positive
* and negative strand are found (this should not normally happen).
*
- * @param sfs
+ * @param sourceSequence
* @param accId
+ * @param start
+ * the start position of the sequence we are mapping to
* @return
*/
- protected MapList getGenomicRanges(SequenceFeature[] sfs, String accId)
+ protected MapList getGenomicRanges(SequenceI sourceSequence,
+ String accId, int start)
{
+ SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
+ if (sfs == null)
+ {
+ return null;
+ }
+
/*
* generously size for initial number of cds regions
* (worst case titin Q8WZ42 has c. 313 exons)
*/
List<int[]> regions = new ArrayList<int[]>(100);
+ int sourceLength = sourceSequence.getLength();
int mappedLength = 0;
int direction = 1; // forward
boolean directionSet = false;
}
else
{
- regions.add(new int[] { sf.getBegin(), sf.getEnd() });
- }
- mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1);
+ regions.add(new int[] { sf.getBegin(), sf.getEnd() });
}
+ mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1);
+
+ if (mappedLength >= sourceLength)
+ {
+ /*
+ * break for the case of matching gene features v gene sequence
+ * - only need to locate the 'gene' feature for accId
+ */
+ break;
+ }
+ }
}
+ if (regions.isEmpty())
+ {
+ System.out.println("Failed to identify target sequence for " + accId
+ + " from genomic features");
+ return null;
+ }
+
/*
* a final sort is needed since Ensembl returns CDS sorted within source
* (havana / ensembl_havana)
Collections.sort(regions, new RangeSorter(direction == 1));
List<int[]> to = new ArrayList<int[]>();
- to.add(new int[] { 1, mappedLength });
+ to.add(new int[] { start, start + mappedLength - 1 });
return new MapList(regions, to, 1, 1);
}
/**
- * Returns true if the sequence feature identifies positions of the genomic
- * sequence feature which are within the sequence being retrieved.
+ * Returns true if the sequence feature marks positions of the genomic
+ * sequence feature which are within the sequence being retrieved. For
+ * example, an 'exon' feature whose parent is the target transcript marks the
+ * cdna positions of the transcript.
*
* @param sf
* @param accId
protected void transferFeature(SequenceFeature sf,
SequenceI targetSequence, MapList overlap)
{
- String parent = (String) sf.getValue(PARENT);
- if (parent != null && !parent.contains(targetSequence.getName()))
- {
- // this genomic feature belongs to a different transcript
- return;
- }
-
int start = sf.getBegin();
int end = sf.getEnd();
int[] mappedRange = overlap.locateInTo(start, end);
if (mappedRange != null)
{
SequenceFeature copy = new SequenceFeature(sf);
- int offset = targetSequence.getStart() - 1;
- copy.setBegin(offset + Math.min(mappedRange[0], mappedRange[1]));
- copy.setEnd(offset + Math.max(mappedRange[0], mappedRange[1]));
+ copy.setBegin(Math.min(mappedRange[0], mappedRange[1]));
+ copy.setEnd(Math.max(mappedRange[0], mappedRange[1]));
targetSequence.addSequenceFeature(copy);
/*
* @param accessionId
* @param sourceSequence
* @param targetSequence
+ * @return true if any features were transferred, else false
*/
- protected void transferFeatures(String accessionId,
+ protected boolean transferFeatures(String accessionId,
SequenceI sourceSequence, SequenceI targetSequence)
{
if (sourceSequence == null || targetSequence == null)
{
- return;
+ return false;
}
SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
- MapList overlap = getGenomicRanges(sfs, accessionId);
+ MapList overlap = getGenomicRanges(sourceSequence, accessionId,
+ targetSequence.getStart());
+ if (overlap == null)
+ {
+ return false;
+ }
final boolean forwardStrand = overlap.isFromForwardStrand();
}
});
+ boolean transferred = false;
for (SequenceFeature sf : sfs)
{
- if (retainFeature(sf.getType()))
+ if (retainFeature(sf, accessionId))
{
transferFeature(sf, targetSequence, overlap);
+ transferred = true;
}
}
+ return transferred;
}
/**
- * Answers true if the feature type is one to attach to the retrieved sequence
+ * Answers true if the feature is one to attach to the retrieved sequence
*
* @param type
* @return
*/
- protected boolean retainFeature(@SuppressWarnings("unused") String type)
+ protected boolean retainFeature(SequenceFeature sf, String accessionId)
+ {
+ String parent = (String) sf.getValue(PARENT);
+ if (parent != null && !parent.contains(accessionId))
+ {
+ // this genomic feature belongs to a different transcript
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String getDescription()
+ {
+ return "Ensembl " + getSourceEnsemblType().getType()
+ + " sequence with variant features";
+ }
+
+ public AlignmentI getSequenceRecords(String transcriptId,
+ SequenceI geneSeq) throws Exception
{
- return true; // default is to keep all
+ this.genomicSequence = geneSeq;
+ return getSequenceRecords(transcriptId);
}
}
desc = StringUtils.listToDelimitedString(
attributes.get("alleles"), ",");
}
+
+ /*
+ * Ensembl returns gene name as 'Name' for a transcript
+ */
+ if (SequenceOntology.getInstance().isA(sf.getType(),
+ SequenceOntology.TRANSCRIPT))
+ {
+ desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
+ }
return desc;
}
}
// SO:0000673
public static final String TRANSCRIPT = "transcript";
+ // SO:0000704
+ public static final String GENE = "gene";
+
/*
* singleton instance of this class
*/
import jalview.ext.ensembl.EnsemblCdna;
import jalview.ext.ensembl.EnsemblCds;
+import jalview.ext.ensembl.EnsemblGene;
import jalview.ext.ensembl.EnsemblGenome;
import jalview.ext.ensembl.EnsemblProtein;
import jalview.ws.dbsources.EmblCdsSource;
addDBRefSourceImpl(EnsemblProtein.class);
addDBRefSourceImpl(EnsemblCds.class);
addDBRefSourceImpl(EnsemblGenome.class);
+ addDBRefSourceImpl(EnsemblGene.class);
addDBRefSourceImpl(EnsemblCdna.class);
addDBRefSourceImpl(EmblSource.class);
addDBRefSourceImpl(EmblCdsSource.class);
}
@Override
- public boolean useGetRequest()
+ protected boolean useGetRequest()
{
return false;
}
@Override
- public String getRequestMimeType()
+ protected String getRequestMimeType()
{
return null;
}
@Override
- public String getResponseMimeType()
+ protected String getResponseMimeType()
{
return null;
}
}
@Override
- public boolean useGetRequest()
+ protected boolean useGetRequest()
{
// TODO Auto-generated method stub
return false;
}
@Override
- public String getRequestMimeType()
+ protected String getRequestMimeType()
{
// TODO Auto-generated method stub
return null;
}
@Override
- public String getResponseMimeType()
+ protected String getResponseMimeType()
{
// TODO Auto-generated method stub
return null;
+ (isAvailable ? "UP!"
: "DOWN or unreachable ******************* BAD!"));
}
+ // todo lots of tests
}
\ No newline at end of file