JAL-2253 resolve differences between patches post-code review and diverging develop
}
/**
- * Converts a query, which may contain one or more gene or transcript
- * identifiers, into a non-redundant list of gene identifiers.
+ * Converts a query, which may contain one or more gene, transcript, or
+ * external (to Ensembl) identifiers, into a non-redundant list of gene
+ * identifiers.
*
* @param accessions
* @return
for (String acc : accessions.split(getAccessionSeparator()))
{
- if (isGeneIdentifier(acc))
- {
- if (!geneIds.contains(acc))
- {
- geneIds.add(acc);
- }
- }
-
/*
- * if given a transcript id, look up its gene parent
+ * First try lookup as an Ensembl (gene or transcript) identifier
*/
- else if (isTranscriptIdentifier(acc))
+ String geneId = new EnsemblLookup(getDomain()).getGeneId(acc);
+ if (geneId != null)
{
- String geneId = new EnsemblLookup(getDomain()).getParent(acc);
- if (geneId != null && !geneIds.contains(geneId))
+ if (!geneIds.contains(geneId))
{
geneIds.add(geneId);
}
}
- else if (isProteinIdentifier(acc))
- {
- String tscriptId = new EnsemblLookup(getDomain()).getParent(acc);
- if (tscriptId != null)
- {
- String geneId = new EnsemblLookup(getDomain())
- .getParent(tscriptId);
-
- if (geneId != null && !geneIds.contains(geneId))
- {
- geneIds.add(geneId);
- }
- }
- // NOTE - acc is lost if it resembles an ENS.+ ID but isn't actually
- // resolving to one... e.g. ENSMICP00000009241
- }
- /*
- * if given a gene or other external name, lookup and fetch
- * the corresponding gene for all model organisms
- */
else
{
+ /*
+ * if given a gene or other external name, lookup and fetch
+ * the corresponding gene for all model organisms
+ */
List<String> ids = new EnsemblSymbol(getDomain(), getDbSource(),
- getDbVersion()).getIds(acc);
- for (String geneId : ids)
+ getDbVersion()).getGeneIds(acc);
+ for (String id : ids)
{
- if (!geneIds.contains(geneId))
+ if (!geneIds.contains(id))
{
- geneIds.add(geneId);
+ geneIds.add(id);
}
}
}
}
/**
- * Attempts to get Ensembl stable identifiers for model organisms for a gene
- * name by calling the xrefs symbol REST service to resolve the gene name.
- *
- * @param query
- * @return
- */
- protected String getGeneIdentifiersForName(String query)
- {
- List<String> ids = new EnsemblSymbol(getDomain(), getDbSource(),
- getDbVersion()).getIds(query);
- if (ids != null)
- {
- for (String id : ids)
- {
- if (isGeneIdentifier(id))
- {
- return id;
- }
- }
- }
- return null;
- }
-
- /**
* Constructs all transcripts for the gene, as identified by "transcript"
* features whose Parent is the requested gene. The coding transcript
* sequences (i.e. with introns omitted) are added to the alignment.
}
@Override
- public boolean isGeneIdentifier(String query)
- {
- return true;
- }
-
- @Override
public String getDbName()
{
return "EnsemblGenomes";
}
+ private String Wrong[];
@Override
public String getTestQuery()
{
public class EnsemblLookup extends EnsemblRestClient
{
+ private static final String OBJECT_TYPE_TRANSLATION = "Translation";
+ private static final String PARENT = "Parent";
+ private static final String OBJECT_TYPE_TRANSCRIPT = "Transcript";
+ private static final String ID = "id";
+ private static final String OBJECT_TYPE_GENE = "Gene";
+ private static final String OBJECT_TYPE = "object_type";
+
/**
* Default constructor (to use rest.ensembl.org)
*/
protected URL getUrl(String identifier)
{
String url = getDomain() + "/lookup/id/" + identifier
- + "?content-type=application/json";
+ + CONTENT_TYPE_JSON;
try
{
return new URL(url);
* @param identifier
* @return
*/
- public String getParent(String identifier)
+ public String getGeneId(String identifier)
{
List<String> ids = Arrays.asList(new String[] { identifier });
}
/**
- * Parses "Parent" from the JSON response and returns the value, or null if
- * not found
+ * Parses the JSON response and returns the gene identifier, or null if not
+ * found. If the returned object_type is Gene, returns the id, if Transcript
+ * returns the Parent. If it is Translation (peptide identifier), then the
+ * Parent is the transcript identifier, so we redo the search with this value.
*
* @param br
* @return
*/
protected String parseResponse(BufferedReader br) throws IOException
{
- String parent = null;
+ String geneId = null;
JSONParser jp = new JSONParser();
try
{
JSONObject val = (JSONObject) jp.parse(br);
- parent = val.get("Parent").toString();
+ String type = val.get(OBJECT_TYPE).toString();
+ if (OBJECT_TYPE_GENE.equalsIgnoreCase(type))
+ {
+ geneId = val.get(ID).toString();
+ }
+ else if (OBJECT_TYPE_TRANSCRIPT.equalsIgnoreCase(type))
+ {
+ geneId = val.get(PARENT).toString();
+ }
+ else if (OBJECT_TYPE_TRANSLATION.equalsIgnoreCase(type))
+ {
+ String transcriptId = val.get(PARENT).toString();
+ try
+ {
+ geneId = getGeneId(transcriptId);
+ } catch (StackOverflowError e)
+ {
+ /*
+ * unlikely data condition error!
+ */
+ System.err
+ .println("** Ensembl lookup "
+ + getUrl(transcriptId).toString()
+ + " looping on Parent!");
+ }
+ }
} catch (ParseException e)
{
// ignore
}
- return parent;
+ return geneId;
}
}
import jalview.datamodel.AlignmentI;
import jalview.datamodel.SequenceFeature;
-import java.util.List;
-
import com.stevesoft.pat.Regex;
/**
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
-import com.stevesoft.pat.Regex;
-
/**
* Base class for Ensembl REST service clients
*
* @see https://github.com/Ensembl/ensembl-rest/wiki/Change-log
* @see http://rest.ensembl.org/info/rest?content-type=application/json
*/
- private static final String LATEST_ENSEMBLGENOMES_REST_VERSION = "5.0";
+ private static final String LATEST_ENSEMBLGENOMES_REST_VERSION = "6.0";
- private static final String LATEST_ENSEMBL_REST_VERSION = "5.0";
+ private static final String LATEST_ENSEMBL_REST_VERSION = "6.1";
private static final String REST_CHANGE_LOG = "https://github.com/Ensembl/ensembl-rest/wiki/Change-log";
private final static long VERSION_RETEST_INTERVAL = 1000L * 3600; // 1 hr
- private static final Regex PROTEIN_REGEX = new Regex(
- "(ENS)([A-Z]{3}|)P[0-9]{11}$");
-
- private static final Regex TRANSCRIPT_REGEX = new Regex(
- "(ENS)([A-Z]{3}|)T[0-9]{11}$");
-
- private static final Regex GENE_REGEX = new Regex(
- "(ENS)([A-Z]{3}|)G[0-9]{11}$");
+ protected static final String CONTENT_TYPE_JSON = "?content-type=application/json";
static
{
- domainData = new HashMap<String, EnsemblInfo>();
+ domainData = new HashMap<>();
domainData.put(ENSEMBL_REST,
new EnsemblInfo(ENSEMBL_REST, LATEST_ENSEMBL_REST_VERSION));
domainData.put(ENSEMBL_GENOMES_REST, new EnsemblInfo(
setDomain(d);
}
- /**
- * Answers true if the query matches the regular expression pattern for an
- * Ensembl transcript stable identifier
- *
- * @param query
- * @return
- */
- public boolean isTranscriptIdentifier(String query)
- {
- return query == null ? false : TRANSCRIPT_REGEX.search(query);
- }
-
- /**
- * Answers true if the query matches the regular expression pattern for an
- * Ensembl protein stable identifier
- *
- * @param query
- * @return
- */
- public boolean isProteinIdentifier(String query)
- {
- return query == null ? false : PROTEIN_REGEX.search(query);
- }
-
- /**
- * Answers true if the query matches the regular expression pattern for an
- * Ensembl gene stable identifier
- *
- * @param query
- * @return
- */
- public boolean isGeneIdentifier(String query)
- {
- return query == null ? false : GENE_REGEX.search(query);
- }
-
@Override
public boolean queryInProgress()
{
{
// note this format works for both ensembl and ensemblgenomes
// info/ping.json works for ensembl only (March 2016)
- URL ping = new URL(
- getDomain() + "/info/ping?content-type=application/json");
+ URL ping = new URL(getDomain() + "/info/ping" + CONTENT_TYPE_JSON);
/*
* expect {"ping":1} if ok
br = getHttpResponse(ping, null, 2 * 1000);
if (br == null)
{
+ // error response status
return false;
}
JSONParser jp = new JSONParser();
URL url = null;
try
{
- url = new URL(
- getDomain() + "/info/rest?content-type=application/json");
+ url = new URL(getDomain() + "/info/rest" + CONTENT_TYPE_JSON);
BufferedReader br = getHttpResponse(url, null);
+ if (br == null)
+ {
+ return;
+ }
JSONObject val = (JSONObject) jp.parse(br);
String version = val.get("release").toString();
String majorVersion = version.substring(0, version.indexOf("."));
try
{
- url = new URL(
- getDomain() + "/info/data?content-type=application/json");
+ url = new URL(getDomain() + "/info/data" + CONTENT_TYPE_JSON);
br = getHttpResponse(url, null);
if (br != null)
{
*/
public class EnsemblSymbol extends EnsemblXref
{
+ private static final String GENE = "gene";
+ private static final String TYPE = "type";
+ private static final String ID = "id";
+
/**
* Constructor given the target domain to fetch data from
*
while (rvals.hasNext())
{
JSONObject val = (JSONObject) rvals.next();
- String id = val.get("id").toString();
- if (id != null && isGeneIdentifier(id))
+ String id = val.get(ID).toString();
+ String type = val.get(TYPE).toString();
+ if (id != null && GENE.equals(type))
{
result = id;
break;
return result;
}
- protected URL getUrl(String id, Species species)
+ /**
+ * Constructs the URL for the REST symbol endpoint
+ *
+ * @param id
+ * the accession id (Ensembl or external)
+ * @param species
+ * a species name recognisable by Ensembl
+ * @param type
+ * an optional type to filter the response (gene, transcript,
+ * translation)
+ * @return
+ */
+ protected URL getUrl(String id, Species species, String... type)
{
- String url = getDomain() + "/xrefs/symbol/" + species.toString() + "/"
- + id + "?content-type=application/json";
+ StringBuilder sb = new StringBuilder();
+ sb.append(getDomain()).append("/xrefs/symbol/")
+ .append(species.toString()).append("/").append(id)
+ .append(CONTENT_TYPE_JSON);
+ for (String t : type)
+ {
+ sb.append("&object_type=").append(t);
+ }
try
{
+ String url = sb.toString();
return new URL(url);
} catch (MalformedURLException e)
{
* @param identifier
* @return
*/
- public List<String> getIds(String identifier)
+ public List<String> getGeneIds(String identifier)
{
List<String> result = new ArrayList<String>();
List<String> ids = new ArrayList<String>();
{
for (Species taxon : Species.getModelOrganisms())
{
- URL url = getUrl(query, taxon);
+ URL url = getUrl(query, taxon, GENE);
if (url != null)
{
br = getHttpResponse(url, ids);
if (br != null)
{
String geneId = parseSymbolResponse(br);
- if (geneId != null)
+ System.out.println(url + " returned " + geneId);
+ if (geneId != null && !result.contains(geneId))
{
result.add(geneId);
}
while (rvals.hasNext())
{
JSONObject val = (JSONObject) rvals.next();
- String dbname = val.get("dbname").toString();
- if (GO_GENE_ONTOLOGY.equals(dbname))
- {
- continue;
- }
+ String db = val.get("dbname").toString();
String id = val.get("primary_id").toString();
- if (dbname != null && id != null)
+ if (db != null && id != null
+ && !GO_GENE_ONTOLOGY.equals(db))
{
- dbname = DBRefUtils.getCanonicalName(dbname);
- DBRefEntry dbref = new DBRefEntry(dbname, getXRefVersion(), id);
+ db = DBRefUtils.getCanonicalName(db);
+ DBRefEntry dbref = new DBRefEntry(db, getXRefVersion(), id);
result.add(dbref);
}
}
protected URL getUrl(String identifier)
{
String url = getDomain() + "/xrefs/id/" + identifier
- + "?content-type=application/json&all_levels=1";
+ + CONTENT_TYPE_JSON + "&all_levels=1";
try
{
return new URL(url);
assertEquals(-1, fc.compare("coding_exon", "feature_variant"));
assertEquals(1f, fc.getTransparency());
}
+
+ @Test(groups = "Network")
+ public void testGetGeneIds()
+ {
+ /*
+ * ENSG00000158828 gene id PINK1 human
+ * ENST00000321556 transcript for the same gene - should not be duplicated
+ * P30419 Uniprot identifier for ENSG00000136448
+ * ENST00000592782 transcript for Uniprot gene - should not be duplicated
+ * BRAF - gene name resolvable (at time of writing) for 6 model species
+ */
+ String ids = "ENSG00000158828 ENST00000321556 P30419 ENST00000592782 BRAF";
+ EnsemblGene testee = new EnsemblGene();
+ List<String> geneIds = testee.getGeneIds(ids);
+ assertEquals(8, geneIds.size());
+ assertTrue(geneIds.contains("ENSG00000158828"));
+ assertTrue(geneIds.contains("ENSG00000136448"));
+ assertTrue(geneIds.contains("ENSG00000157764")); // BRAF human
+ assertTrue(geneIds.contains("ENSMUSG00000002413")); // mouse
+ assertTrue(geneIds.contains("ENSRNOG00000010957")); // rat
+ assertTrue(geneIds.contains("ENSXETG00000004845")); // xenopus
+ assertTrue(geneIds.contains("ENSDARG00000017661")); // zebrafish
+ assertTrue(geneIds.contains("ENSGALG00000012865")); // chicken
+ }
}
}
- @Test(groups = "Functional")
- public void testIsTranscriptIdentifier()
- {
- EnsemblSeqProxy testee = new EnsemblGene();
- assertFalse(testee.isTranscriptIdentifier(null));
- assertFalse(testee.isTranscriptIdentifier(""));
- assertFalse(testee.isTranscriptIdentifier("ENSG00000012345"));
- assertTrue(testee.isTranscriptIdentifier("ENST00000012345"));
- assertTrue(testee.isTranscriptIdentifier("ENSMUST00000012345"));
- assertFalse(testee.isTranscriptIdentifier("enst00000012345"));
- assertFalse(testee.isTranscriptIdentifier("ENST000000123456"));
- assertFalse(testee.isTranscriptIdentifier("ENST0000001234"));
- }
-
- @Test(groups = "Functional")
- public void testIsGeneIdentifier()
- {
- EnsemblSeqProxy testee = new EnsemblGene();
- assertFalse(testee.isGeneIdentifier(null));
- assertFalse(testee.isGeneIdentifier(""));
- assertFalse(testee.isGeneIdentifier("ENST00000012345"));
- assertTrue(testee.isGeneIdentifier("ENSG00000012345"));
- assertTrue(testee.isGeneIdentifier("ENSMUSG00000012345"));
- assertFalse(testee.isGeneIdentifier("ensg00000012345"));
- assertFalse(testee.isGeneIdentifier("ENSG000000123456"));
- assertFalse(testee.isGeneIdentifier("ENSG0000001234"));
- }
-
/**
* Test the method that appends a single allele's reverse complement to a
* string buffer