From d547268f85f377f14dbeb169072caebad45f9e2b Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 29 Apr 2016 15:59:23 +0100 Subject: [PATCH] JAL-2051 better checking of retrieved / duplicated accession ids --- src/jalview/analysis/CrossRef.java | 4 +- src/jalview/ext/ensembl/EnsemblGene.java | 102 +++++--- src/jalview/ext/ensembl/EnsemblRestClient.java | 14 ++ src/jalview/gui/SequenceFetcher.java | 302 ++++++++++++++---------- 4 files changed, 263 insertions(+), 159 deletions(-) diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index 3563eba..7e77fc1 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -327,8 +327,8 @@ public class CrossRef } if (l > 0) { - System.out - .println("Attempting to retrieve cross referenced sequences."); + // System.out + // .println("Attempting to retrieve cross referenced sequences."); DBRefEntry[] t = new DBRefEntry[l]; l = 0; for (int r = 0; r < xrfs.length; r++) diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index 84b5dcf..4dd1bba 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -11,7 +11,6 @@ import jalview.io.gff.SequenceOntologyI; import jalview.schemes.FeatureColourAdapter; import jalview.schemes.FeatureSettingsAdapter; import jalview.util.MapList; -import jalview.util.StringUtils; import java.awt.Color; import java.io.UnsupportedEncodingException; @@ -108,47 +107,84 @@ public class EnsemblGene extends EnsemblSeqProxy public AlignmentI getSequenceRecords(String query) throws Exception { /* - * if given a transcript id, look up its gene parent + * convert to a non-duplicated list of gene identifiers */ - if (isTranscriptIdentifier(query)) + List geneIds = getGeneIds(query); + + AlignmentI al = null; + for (String geneId : geneIds) { - query = new EnsemblLookup(getDomain()).getParent(query); - if (query == null) + /* + * fetch the gene sequence(s) with features and xrefs + */ + AlignmentI geneAlignment = super.getSequenceRecords(geneId); + + if (geneAlignment.getHeight() == 1) { - return null; + getTranscripts(geneAlignment, geneId); + } + if (al == null) + { + al = geneAlignment; + } + else + { + al.append(geneAlignment); } } + return al; + } - /* - * if given a gene or other external name, lookup and fetch - * the corresponding gene for all model organisms - */ - if (!isGeneIdentifier(query)) + /** + * Converts a query, which may contain one or more gene or transcript + * identifiers, into a non-redundant list of gene identifiers. + * + * @param accessions + * @return + */ + List getGeneIds(String accessions) + { + List geneIds = new ArrayList(); + + for (String acc : accessions.split(getAccessionSeparator())) { - List geneIds = new EnsemblSymbol(getDomain()).getIds(query); - if (geneIds.isEmpty()) + if (isGeneIdentifier(acc)) { - return null; + if (!geneIds.contains(acc)) + { + geneIds.add(acc); + } } - String theIds = StringUtils.listToDelimitedString(geneIds, - getAccessionSeparator()); - return getSequenceRecords(theIds); - } - /* - * fetch the gene sequence(s) with features and xrefs - */ - AlignmentI al = super.getSequenceRecords(query); + /* + * if given a transcript id, look up its gene parent + */ + else if (isTranscriptIdentifier(acc)) + { + String geneId = new EnsemblLookup(getDomain()).getParent(acc); + if (geneId != null && !geneIds.contains(geneId)) + { + geneIds.add(geneId); + } + } - /* - * if we retrieved a single gene, get its transcripts as well - */ - if (al.getHeight() == 1) - { - getTranscripts(al, query); + /* + * if given a gene or other external name, lookup and fetch + * the corresponding gene for all model organisms + */ + else + { + List ids = new EnsemblSymbol(getDomain()).getIds(acc); + for (String geneId : ids) + { + if (!geneIds.contains(geneId)) + { + geneIds.add(geneId); + } + } + } } - - return al; + return geneIds; } /** @@ -551,10 +587,4 @@ public class EnsemblGene extends EnsemblSeqProxy }; } - @Override - public int getMaximumQueryCount() - { - return 1; - } - } diff --git a/src/jalview/ext/ensembl/EnsemblRestClient.java b/src/jalview/ext/ensembl/EnsemblRestClient.java index 441ec7c..6a564f1 100644 --- a/src/jalview/ext/ensembl/EnsemblRestClient.java +++ b/src/jalview/ext/ensembl/EnsemblRestClient.java @@ -85,11 +85,25 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher domain = d; } + /** + * Answers true if the query matches the regular expression pattern for an + * Ensembl transcript stable identifier + * + * @param query + * @return + */ public boolean isTranscriptIdentifier(String query) { return query == null ? false : TRANSCRIPT_REGEX.search(query); } + /** + * Answers true if the query matches the regular expression pattern for an + * Ensembl gene stable identifier + * + * @param query + * @return + */ public boolean isGeneIdentifier(String query) { return query == null ? false : GENE_REGEX.search(query); diff --git a/src/jalview/gui/SequenceFetcher.java b/src/jalview/gui/SequenceFetcher.java index 7dcd2b9..812bf76 100755 --- a/src/jalview/gui/SequenceFetcher.java +++ b/src/jalview/gui/SequenceFetcher.java @@ -539,30 +539,31 @@ public class SequenceFetcher extends JPanel implements Runnable // TODO: Refactor to GUI independent code and write tests. // indicate if successive sources should be merged into one alignment. boolean addToLast = false; - ArrayList aresultq = new ArrayList(), presultTitle = new ArrayList(); - ArrayList presult = new ArrayList(), aresult = new ArrayList(); + List aresultq = new ArrayList(); + List presultTitle = new ArrayList(); + List presult = new ArrayList(); + List aresult = new ArrayList(); Iterator proxies = database.getSelectedSources() .iterator(); String[] qries; - List nextfetch = Arrays.asList(qries = textArea.getText() + List nextFetch = Arrays.asList(qries = textArea.getText() .split(";")); Iterator en = Arrays.asList(new String[0]).iterator(); int nqueries = qries.length; FeatureSettingsModelI preferredFeatureColours = null; - while (proxies.hasNext() && (en.hasNext() || nextfetch.size() > 0)) + while (proxies.hasNext() && (en.hasNext() || nextFetch.size() > 0)) { - if (!en.hasNext() && nextfetch.size() > 0) + if (!en.hasNext() && nextFetch.size() > 0) { - en = nextfetch.iterator(); - nqueries = nextfetch.size(); + en = nextFetch.iterator(); + nqueries = nextFetch.size(); // save the remaining queries in the original array - qries = nextfetch.toArray(new String[nqueries]); - nextfetch = new ArrayList(); + qries = nextFetch.toArray(new String[nqueries]); + nextFetch = new ArrayList(); } DbSourceProxy proxy = proxies.next(); - boolean isAliSource = false; try { // update status @@ -573,122 +574,27 @@ public class SequenceFetcher extends JPanel implements Runnable Integer.valueOf(nqueries).toString(), proxy.getDbName() }), Thread.currentThread() .hashCode()); - isAliSource = proxy.isAlignmentSource(); if (proxy.getMaximumQueryCount() == 1) { + /* + * proxy only handles one accession id at a time + */ while (en.hasNext()) { - String item = en.next(); - try + String acc = en.next(); + if (!fetchSingleAccession(proxy, acc, aresultq, aresult)) { - if (aresult != null) - { - try - { - // give the server a chance to breathe - Thread.sleep(5); - } catch (Exception e) - { - // - } - - } - - AlignmentI indres = null; - try - { - indres = proxy.getSequenceRecords(item); - } catch (OutOfMemoryError oome) - { - new OOMWarning("fetching " + item + " from " - + proxy.getDbName(), oome, this); - } - if (indres != null) - { - aresultq.add(item); - aresult.add(indres); - } - else - { - nextfetch.add(item); - } - } catch (Exception e) - { - Cache.log.info( - "Error retrieving " + item - + " from " + proxy.getDbName(), e); - nextfetch.add(item); + nextFetch.add(acc); } } } else { - StringBuffer multiacc = new StringBuffer(); - ArrayList tosend = new ArrayList(); - while (en.hasNext()) - { - String nel = en.next(); - tosend.add(nel); - multiacc.append(nel); - if (en.hasNext()) - { - multiacc.append(proxy.getAccessionSeparator()); - } - } - try - { - AlignmentI rslt; - SequenceI[] rs; - List nores = new ArrayList(); - rslt = proxy.getSequenceRecords(multiacc.toString()); - if (rslt == null || rslt.getHeight() == 0) - { - // no results - pass on all queries to next source - nextfetch.addAll(tosend); - } - else - { - aresultq.add(multiacc.toString()); - aresult.add(rslt); - - rs = rslt.getSequencesArray(); - // search for each query in the dbrefs associated with each - // sequence - // returned. - // ones we do not find will be used to query next source (if any) - for (String q : tosend) - { - DBRefEntry dbr = new DBRefEntry(), found[] = null; - dbr.setSource(proxy.getDbSource()); - dbr.setVersion(null); - String accId = proxy.getAccessionIdFromQuery(q); - dbr.setAccessionId(accId); - boolean rfound = false; - for (int r = 0; r < rs.length; r++) - { - if (rs[r] != null) - { - found = DBRefUtils.searchRefs(rs[r].getDBRefs(), accId); - if (found != null && found.length > 0) - { - rfound = true; - rs[r] = null; - } - } - } - if (!rfound) - { - nextfetch.add(q); - } - } - } - } catch (OutOfMemoryError oome) - { - new OOMWarning("fetching " + multiacc + " from " - + database.getSelectedItem(), oome, this); - } + /* + * proxy can fetch multiple accessions at one time + */ + fetchMultipleAccessions(proxy, en, aresultq, aresult, nextFetch); } - } catch (Exception e) { showErrorMessage("Error retrieving " + textArea.getText() @@ -701,7 +607,6 @@ public class SequenceFetcher extends JPanel implements Runnable e.printStackTrace(); } catch (OutOfMemoryError e) { - // resets dialog box - so we don't use OOMwarning here. showErrorMessage("Out of Memory when retrieving " + textArea.getText() + " from " @@ -714,6 +619,7 @@ public class SequenceFetcher extends JPanel implements Runnable + " from " + database.getSelectedItem()); e.printStackTrace(); } + // Stack results ready for opening in alignment windows if (aresult != null && aresult.size() > 0) { @@ -725,7 +631,7 @@ public class SequenceFetcher extends JPanel implements Runnable } AlignmentI ar = null; - if (isAliSource) + if (proxy.isAlignmentSource()) { addToLast = false; // new window for each result @@ -755,7 +661,6 @@ public class SequenceFetcher extends JPanel implements Runnable { ar.append(aresult.remove(0)); } - ; } addToLast = true; presult.add(ar); @@ -779,14 +684,14 @@ public class SequenceFetcher extends JPanel implements Runnable } // only remove visual delay after we finished parsing. guiWindow.setProgressBar(null, Thread.currentThread().hashCode()); - if (nextfetch.size() > 0) + if (nextFetch.size() > 0) { StringBuffer sb = new StringBuffer(); sb.append("Didn't retrieve the following " - + (nextfetch.size() == 1 ? "query" : nextfetch.size() + + (nextFetch.size() == 1 ? "query" : nextFetch.size() + " queries") + ": \n"); int l = sb.length(), lr = 0; - for (String s : nextfetch) + for (String s : nextFetch) { if (l != sb.length()) { @@ -804,6 +709,161 @@ public class SequenceFetcher extends JPanel implements Runnable } /** + * Tries to fetch one or more accession ids from the database proxy + * + * @param proxy + * @param accessions + * the queries to fetch + * @param aresultq + * a successful queries list to add to + * @param aresult + * a list of retrieved alignments to add to + * @param nextFetch + * failed queries are added to this list + * @throws Exception + */ + void fetchMultipleAccessions(DbSourceProxy proxy, + Iterator accessions, List aresultq, + List aresult, List nextFetch) + throws Exception + { + StringBuilder multiacc = new StringBuilder(); + List tosend = new ArrayList(); + while (accessions.hasNext()) + { + String nel = accessions.next(); + tosend.add(nel); + multiacc.append(nel); + if (accessions.hasNext()) + { + multiacc.append(proxy.getAccessionSeparator()); + } + } + + try + { + String query = multiacc.toString(); + AlignmentI rslt = proxy.getSequenceRecords(query); + if (rslt == null || rslt.getHeight() == 0) + { + // no results - pass on all queries to next source + nextFetch.addAll(tosend); + } + else + { + aresultq.add(query); + aresult.add(rslt); + if (tosend.size() > 1) + { + checkResultForQueries(rslt, tosend, nextFetch, proxy); + } + } + } catch (OutOfMemoryError oome) + { + new OOMWarning("fetching " + multiacc + " from " + + database.getSelectedItem(), oome, this); + } + } + + /** + * Query for a single accession id via the database proxy + * + * @param proxy + * @param accession + * @param aresultq + * a list of successful queries to add to + * @param aresult + * a list of retrieved alignments to add to + * @return true if the fetch was successful, else false + */ + boolean fetchSingleAccession(DbSourceProxy proxy, String accession, + List aresultq, List aresult) + { + boolean success = false; + try + { + if (aresult != null) + { + try + { + // give the server a chance to breathe + Thread.sleep(5); + } catch (Exception e) + { + // + } + } + + AlignmentI indres = null; + try + { + indres = proxy.getSequenceRecords(accession); + } catch (OutOfMemoryError oome) + { + new OOMWarning("fetching " + accession + " from " + + proxy.getDbName(), oome, this); + } + if (indres != null) + { + aresultq.add(accession); + aresult.add(indres); + success = true; + } + } catch (Exception e) + { + Cache.log.info( + "Error retrieving " + accession + + " from " + proxy.getDbName(), e); + } finally + { + return success; + } + } + + /** + * Checks which of the queries were successfully retrieved by searching the + * DBRefs of the retrieved sequences for a match. Any not found are added to + * the 'nextFetch' list. + * + * @param rslt + * @param queries + * @param nextFetch + * @param proxy + */ + void checkResultForQueries(AlignmentI rslt, List queries, + List nextFetch, DbSourceProxy proxy) + { + SequenceI[] rs = rslt.getSequencesArray(); + + for (String q : queries) + { + DBRefEntry[] found = null; + DBRefEntry dbr = new DBRefEntry(); + dbr.setSource(proxy.getDbSource()); + dbr.setVersion(null); + String accId = proxy.getAccessionIdFromQuery(q); + dbr.setAccessionId(accId); + boolean rfound = false; + for (int r = 0; r < rs.length; r++) + { + if (rs[r] != null) + { + found = DBRefUtils.searchRefs(rs[r].getDBRefs(), accId); + if (found != null && found.length > 0) + { + rfound = true; + break; + } + } + } + if (!rfound) + { + nextFetch.add(q); + } + } + } + + /** * * @return a standard title for any results retrieved using the currently * selected source and settings -- 1.7.10.2