2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.analysis.AlignSeq;
24 import jalview.bin.Cache;
25 import jalview.datamodel.AlignmentI;
26 import jalview.datamodel.DBRefEntry;
27 import jalview.datamodel.DBRefSource;
28 import jalview.datamodel.Mapping;
29 import jalview.datamodel.SequenceFeature;
30 import jalview.datamodel.SequenceI;
31 import jalview.gui.AlignFrame;
32 import jalview.gui.CutAndPasteTransfer;
33 import jalview.gui.DasSourceBrowser;
34 import jalview.gui.Desktop;
35 import jalview.gui.IProgressIndicator;
36 import jalview.gui.OOMWarning;
37 import jalview.util.DBRefUtils;
38 import jalview.util.MessageManager;
39 import jalview.ws.dbsources.das.api.jalviewSourceI;
40 import jalview.ws.dbsources.das.datamodel.DasSequenceSource;
41 import jalview.ws.seqfetcher.DbSourceProxy;
43 import java.util.ArrayList;
44 import java.util.Enumeration;
45 import java.util.Hashtable;
46 import java.util.List;
47 import java.util.StringTokenizer;
48 import java.util.Vector;
50 import uk.ac.ebi.picr.model.UPEntry;
51 import uk.ac.ebi.www.picr.AccessionMappingService.AccessionMapperInterface;
52 import uk.ac.ebi.www.picr.AccessionMappingService.AccessionMapperServiceLocator;
55 * Implements a runnable for validating a sequence against external databases
56 * and then propagating references and features onto the sequence(s)
61 public class DBRefFetcher implements Runnable
// Progress indicator; the fetch loop calls af.setProgressBar(...) on it.
65 IProgressIndicator af;
// Text window that displays the contents of sbuffer when it is non-empty
// after fetching.
67 CutAndPasteTransfer output = new CutAndPasteTransfer();
// Accumulates report lines (prefix notices and mismatch messages) appended
// by transferReferences.
69 StringBuffer sbuffer = new StringBuffer();
71 boolean running = false;
74 * picr client instance
76 AccessionMapperInterface picrClient = null;
78 // This will be a collection of Vectors of SequenceI refs.
79 // The key will be the seq name or accession id of the seq
// NOTE(review): the two comment lines above appear to describe the seqRefs
// table used by addSeqId; its declaration is not visible in this listing.
// Sources queried in array order by the fetch loop.
82 DbSourceProxy[] dbSources;
// Assigned in the constructor from
// jalview.gui.SequenceFetcher.getSequenceFetcherSingleton(af).
84 SequenceFetcher sfetcher;
// Allocated in the constructor with one slot per input sequence;
// transferReferences scans it for sequences whose dataset sequence was
// updated.
86 private SequenceI[] alseqs;
89 * when true - retrieved sequences will be trimmed to cover longest derived
// Overwritten in the constructor from the TRIM_FETCHED_DATASET_SEQS
// preference (default true).
92 private boolean trimDsSeqs = true;
99 * Creates a new DBRefFetcher object and fetches from the currently
100 * selected set of databases.
103 * fetch references for these sequences
105 * the parent alignframe for progress bar monitoring.
// Convenience constructor: delegates to the three-argument constructor,
// passing null for the sources argument.
107 public DBRefFetcher(SequenceI[] seqs, AlignFrame af)
109 this(seqs, af, null);
113 * Creates a new DBRefFetcher object and fetches from the currently
114 * selected set of databases.
117 * fetch references for these sequences
119 * the parent alignframe for progress bar monitoring.
121 * array of database source proxies to query references from
// Main constructor. Visible steps: size the alseqs and ds arrays to
// seqs.length, obtain the SequenceFetcher singleton for af, read the
// TRIM_FETCHED_DATASET_SEQS preference, then assemble dbSources from the
// default databases for the alignment type followed by the selected DAS
// sequence sources.
// NOTE(review): brace-only and other short lines are absent from this
// listing, so the exact branch structure (e.g. which statements run only
// when sources is null) cannot be confirmed from the visible lines.
123 public DBRefFetcher(SequenceI[] seqs, AlignFrame af,
124 DbSourceProxy[] sources)
127 alseqs = new SequenceI[seqs.length];
128 SequenceI[] ds = new SequenceI[seqs.length];
129 for (int i = 0; i < seqs.length; i++)
// use the underlying dataset sequence when the input sequence has one
132 if (seqs[i].getDatasetSequence() != null)
134 ds[i] = seqs[i].getDatasetSequence();
142 // TODO Jalview 2.5 lots of this code should be in the gui package!
143 sfetcher = jalview.gui.SequenceFetcher.getSequenceFetcherSingleton(af);
144 // set default behaviour for transferring excess sequence data to the
146 trimDsSeqs = Cache.getDefault("TRIM_FETCHED_DATASET_SEQS", true);
149 // af.featureSettings_actionPerformed(null);
// NOTE(review): otherdb is not referenced again in the visible lines of
// this constructor.
150 String[] defdb = null, otherdb = sfetcher
151 .getDbInstances(DasSequenceSource.class);
152 List<DbSourceProxy> selsources = new ArrayList<DbSourceProxy>();
// selected DAS sources come from the frame's feature settings when present,
// otherwise from a freshly constructed DasSourceBrowser
153 Vector dasselsrc = (af.featureSettings != null) ? af.featureSettings
154 .getSelectedSources() : new DasSourceBrowser()
155 .getSelectedSources();
156 Enumeration<jalviewSourceI> en = dasselsrc.elements();
157 while (en.hasMoreElements())
159 jalviewSourceI src = en.nextElement();
// collect the sequence source proxies offered by each selected DAS source
160 List<DbSourceProxy> sp = src.getSequenceSourceProxies();
163 selsources.addAll(sp);
166 Cache.log.debug("Added many Db Sources for :" + src.getTitle());
170 // select appropriate databases based on alignFrame context.
171 if (af.getViewport().getAlignment().isNucleotide())
173 defdb = DBRefSource.DNACODINGDBS;
// presumably the else branch (the "else" line is not visible here)
177 defdb = DBRefSource.PROTEINDBS;
179 List<DbSourceProxy> srces = new ArrayList<DbSourceProxy>();
180 for (String ddb : defdb)
// getSourceProxy may return null for a database name; those are skipped
182 List<DbSourceProxy> srcesfordb = sfetcher.getSourceProxy(ddb);
183 if (srcesfordb != null)
185 srces.addAll(srcesfordb);
189 // append the selected sequence sources to the default dbs
190 srces.addAll(selsources);
191 dbSources = srces.toArray(new DbSourceProxy[0]);
195 // we assume the caller knows what they're doing and ensured that all the
196 // db source names are valid
202 * retrieve all the das sequence sources and add them to the list of db
203 * sources to retrieve from
// Builds a combined array holding the current dbSources entries followed by
// every DasSequenceSource proxy known to sfetcher.
// NOTE(review): the line that stores newsrc back into dbSources is not
// visible in this listing; confirm against the full file.
205 public void appendAllDasSources()
// start from an empty array so the copy below works when nothing is set yet
207 if (dbSources == null)
209 dbSources = new DbSourceProxy[0];
211 // append additional sources
212 DbSourceProxy[] otherdb = sfetcher
213 .getDbSourceProxyInstances(DasSequenceSource.class);
// nothing to append when no DAS sequence sources are available
214 if (otherdb != null && otherdb.length > 0)
216 DbSourceProxy[] newsrc = new DbSourceProxy[dbSources.length
// existing sources first, then the DAS sources directly after them
218 System.arraycopy(dbSources, 0, newsrc, 0, dbSources.length);
219 System.arraycopy(otherdb, 0, newsrc, dbSources.length, otherdb.length);
225 * start the fetcher thread
227 * @param waitTillFinished
228 * true to block until the fetcher has finished
// Creates a Thread around this Runnable. According to the Javadoc above,
// the thread is started and, when waitTillFinished is true, the caller
// blocks until the fetcher has finished.
// NOTE(review): the start and wait statements are not visible in this
// listing; only the catch clause of the waiting code remains.
230 public void fetchDBRefs(boolean waitTillFinished)
232 Thread thread = new Thread(this);
236 if (waitTillFinished)
243 } catch (Exception ex)
251 * The sequence will be added to a vector of sequences belonging to key which
252 * could be either seq name or dbref id
// Registers seq under key in the seqRefs table. The key is upper-cased
// before use, so entries are always stored under upper-case keys.
// NOTE(review): the declaration of seqs and the statements that create a
// new Vector are not visible in this listing.
259 void addSeqId(SequenceI seq, String key)
261 key = key.toUpperCase();
// reuse the Vector already stored under this key, if there is one
264 if (seqRefs.containsKey(key))
266 seqs = (Vector) seqRefs.get(key);
// append only when seq is not already listed under this key
268 if (seqs != null && !seqs.contains(seq))
270 seqs.addElement(seq);
272 else if (seqs == null)
275 seqs.addElement(seq);
282 seqs.addElement(seq);
// store the Vector under the upper-cased key
285 seqRefs.put(key, seqs);
// Fetch loop; presumably the body of run(), whose header line is not visible
// in this listing. For each source in dbSources, while unverified sequences
// remain in sdataset, it builds accession queries from existing DBRefs or
// from the '|'-separated tokens of the sequence name, retrieves records in
// batches of at most maxqlen accessions, and hands each non-null result to
// transferReferences. Any text accumulated in sbuffer is shown in the output
// window at the end.
// NOTE(review): declarations of dataset, db and seqIndex, plus the try/else
// and brace lines, are absent from this listing.
293 if (dbSources == null)
295 throw new Error(MessageManager.getString("error.implementation_error_must_init_dbsources"));
298 long startTime = System.currentTimeMillis();
299 af.setProgressBar(MessageManager.getString("status.fetching_db_refs"), startTime);
// the PICR accession mapper is only created when the DBREFFETCH_USEPICR
// preference is set (default false)
302 if (Cache.getDefault("DBREFFETCH_USEPICR", false))
304 picrClient = new AccessionMapperServiceLocator()
305 .getAccessionMapperPort();
307 } catch (Exception e)
309 System.err.println("Couldn't locate PICR service instance.\n");
// sdataset holds the sequences still to be verified; transferReferences
// removes a sequence from it once it has been matched
313 Vector sdataset = new Vector();
314 for (int s = 0; s < dataset.length; s++)
316 sdataset.addElement(dataset[s]);
318 while (sdataset.size() > 0 && db < dbSources.length)
320 int maxqlen = 1; // default number of queries made at one time
321 System.err.println("Verifying against " + dbSources[db].getDbName());
324 // iterate through db for each remaining un-verified sequence
325 SequenceI[] currSeqs = new SequenceI[sdataset.size()];
326 sdataset.copyInto(currSeqs);// seqs that are to be validated against
328 Vector queries = new Vector(); // generated queries curSeq
// the accession -> sequences table is rebuilt for every source
329 seqRefs = new Hashtable();
333 DbSourceProxy dbsource = dbSources[db];
335 // for moment, we dumbly iterate over all retrieval sources for a
336 // particular database
337 // TODO: introduce multithread multisource queries and logic to remove a
338 // query from other sources if any source for a database returns a
// a source that declares the MULTIACC property supplies its own batch size
340 if (dbsource.getDbSourceProperties().containsKey(
341 DBRefSource.MULTIACC))
343 maxqlen = ((Integer) dbsource.getDbSourceProperties().get(
344 DBRefSource.MULTIACC)).intValue();
// alternate between draining pending queries and generating more of them
// until both are exhausted
350 while (queries.size() > 0 || seqIndex < currSeqs.length)
352 if (queries.size() > 0)
354 // Still queries to make for current seqIndex
355 StringBuffer queryString = new StringBuffer("");
// nqSize caps the batch at maxqlen or at the number of pending queries
356 int numq = 0, nqSize = (maxqlen > queries.size()) ? queries
359 while (queries.size() > 0 && numq < nqSize)
361 String query = (String) queries.elementAt(0);
// only accessions the source accepts are added to the batch
362 if (dbsource.isValidReference(query))
// the separator is inserted between accessions, not before the first one
364 queryString.append((numq == 0) ? "" : dbsource
365 .getAccessionSeparator());
366 queryString.append(query);
369 // remove the extracted query string
370 queries.removeElementAt(0);
372 // make the queries and process the response
373 AlignmentI retrieved = null;
376 if (Cache.log.isDebugEnabled())
378 Cache.log.debug("Querying "
379 + dbsource.getDbName() + " with : '"
380 + queryString.toString() + "'");
382 retrieved = dbsource.getSequenceRecords(queryString
// retrieval failures are reported; retrieved stays null and the null check
// below skips the transfer
384 } catch (Exception ex)
386 ex.printStackTrace();
387 } catch (OutOfMemoryError err)
389 new OOMWarning("retrieving database references ("
390 + queryString.toString() + ")", err);
392 if (retrieved != null)
394 transferReferences(sdataset, dbsource.getDbSource(),
395 retrieved, trimDsSeqs);
400 // make some more strings for use as queries
// NOTE(review): this loop bounds seqIndex by dataset.length and reads
// dataset[seqIndex], while the enclosing while-loop tests seqIndex against
// currSeqs.length - confirm which array is intended.
// at most 50 sequences are turned into queries per pass
401 for (int i = 0; (seqIndex < dataset.length) && (i < 50); seqIndex++, i++)
403 SequenceI sequence = dataset[seqIndex];
404 DBRefEntry[] uprefs = DBRefUtils.selectRefs(
405 sequence.getDBRef(), new String[]
406 { dbsource.getDbSource() }); // jalview.datamodel.DBRefSource.UNIPROT
408 // check for existing dbrefs to use
409 if (uprefs != null && uprefs.length > 0)
411 for (int j = 0; j < uprefs.length; j++)
413 addSeqId(sequence, uprefs[j].getAccessionId());
414 queries.addElement(uprefs[j].getAccessionId()
420 // generate queries from sequence ID string
421 StringTokenizer st = new StringTokenizer(
422 sequence.getName(), "|");
423 while (st.hasMoreTokens())
425 String token = st.nextToken();
426 UPEntry[] presp = null;
427 if (picrClient != null)
429 // resolve the string against PICR to recover valid IDs
432 presp = picrClient.getUPIForAccession(token, null,
433 picrClient.getMappedDatabaseNames(), null,
435 } catch (Exception e)
437 System.err.println("Exception with Picr for '"
442 if (presp != null && presp.length > 0)
444 for (int id = 0; id < presp.length; id++)
446 // construct sequences from response if sequences are
447 // present, and do a transferReferences
448 // otherwise transfer non sequence x-references directly.
451 .println("Validated ID against PICR... (for what its worth):"
// the token is registered and queued in upper case
453 addSeqId(sequence, token);
454 queries.addElement(token.toUpperCase());
459 // System.out.println("Not querying source with token="+token+"\n");
460 addSeqId(sequence, token);
461 queries.addElement(token.toUpperCase());
469 // advance to next database
471 } // all databases have been queried.
// show the accumulated report only when there is something to report
472 if (sbuffer.length() > 0)
474 output.setText(MessageManager
475 .getString("label.your_sequences_have_been_verified")
476 + sbuffer.toString());
477 Desktop.addInternalFrame(output,
478 MessageManager.getString("label.sequence_names_updated"),
480 // The above is the dataset, we must now find out the index
481 // of the viewed sequence
486 MessageManager.getString("label.dbref_search_completed"),
488 // promptBeforeBlast();
495 * Verify local sequences in seqRefs against the retrieved sequence database
498 * @param trimDatasetSeqs
// Matches each retrieved database record against the local sequences
// registered in seqRefs, transfers annotation from the record onto every
// sequence whose residues match, optionally updates start/end coordinates,
// and removes each annotated sequence from sdataset.
//
// sdataset        - sequences still awaiting verification; matched ones are
//                   removed from it
// dbSource        - source name used to select the DBRefs on each record
// retrievedAl     - records returned by the source
// trimDatasetSeqs - when false, the local sequence is replaced by the full
//                   record sequence
//
// NOTE(review): brace-only and other short lines (else, continue, return,
// some assignments and declarations such as mp) are absent from this
// listing, so the branch structure below is partly inferred.
501 void transferReferences(Vector sdataset, String dbSource,
502 AlignmentI retrievedAl, boolean trimDatasetSeqs) // File
// debug output written to stdout on every call
505 System.out.println("trimming ? " + trimDatasetSeqs);
// nothing to do for a missing or empty result
506 if (retrievedAl == null || retrievedAl.getHeight() == 0)
// extend the record list with any sequences their DBRef mappings point to
510 SequenceI[] retrieved = recoverDbSequences(retrievedAl
511 .getSequencesArray());
512 SequenceI sequence = null;
513 boolean transferred = false;
514 StringBuffer messages = new StringBuffer();
516 // Vector entries = new Uniprot().getUniprotEntries(file);
518 int i, iSize = retrieved.length; // entries == null ? 0 : entries.size();
519 // UniprotEntry entry;
520 for (i = 0; i < iSize; i++)
522 SequenceI entry = retrieved[i]; // (UniprotEntry) entries.elementAt(i);
524 // Work out which sequences this sequence matches,
525 // taking into account all accessionIds and names in the file
526 Vector sequenceMatches = new Vector();
527 // look for corresponding accession ids
528 DBRefEntry[] entryRefs = DBRefUtils.selectRefs(
529 entry.getDBRef(), new String[]
531 if (entryRefs == null)
534 .println("Dud dbSource string ? no entryrefs selected for "
535 + dbSource + " on " + entry.getName());
538 for (int j = 0; j < entryRefs.length; j++)
540 String accessionId = entryRefs[j].getAccessionId(); // .getAccession().elementAt(j).toString();
541 // match up on accessionId
// NOTE(review): containsKey is tested with the upper-cased accession, but
// get uses the accession as returned. addSeqId stores keys in upper case, so
// a mixed-case accession would pass the test and then yield null here.
// Confirm and align the two keys.
542 if (seqRefs.containsKey(accessionId.toUpperCase()))
544 Vector seqs = (Vector) seqRefs.get(accessionId);
545 for (int jj = 0; jj < seqs.size(); jj++)
547 sequence = (SequenceI) seqs.elementAt(jj);
548 if (!sequenceMatches.contains(sequence))
550 sequenceMatches.addElement(sequence);
555 if (sequenceMatches.size() == 0)
557 // failed to match directly on accessionId==query so just compare all
558 // sequences to entry
559 Enumeration e = seqRefs.keys();
560 while (e.hasMoreElements())
562 Vector sqs = (Vector) seqRefs.get(e.nextElement());
563 if (sqs != null && sqs.size() > 0)
565 Enumeration sqe = sqs.elements();
566 while (sqe.hasMoreElements())
568 sequenceMatches.addElement(sqe.nextElement());
573 // look for corresponding names
574 // this is uniprot specific ?
575 // could be useful to extend this so we try to find any 'significant'
576 // information in common between two sequence objects.
578 * DBRefEntry[] entryRefs =
579 * jalview.util.DBRefUtils.selectRefs(entry.getDBRef(), new String[] {
580 * dbSource }); for (int j = 0; j < entry.getName().size(); j++) { String
581 * name = entry.getName().elementAt(j).toString(); if
582 * (seqRefs.containsKey(name)) { Vector seqs = (Vector) seqRefs.get(name);
583 * for (int jj = 0; jj < seqs.size(); jj++) { sequence = (SequenceI)
584 * seqs.elementAt(jj); if (!sequenceMatches.contains(sequence)) {
585 * sequenceMatches.addElement(sequence); } } } }
587 // sequenceMatches now contains the set of all sequences associated with
588 // the returned db record
// residues are compared in upper case on both sides
589 String entrySeq = entry.getSequenceAsString().toUpperCase();
590 for (int m = 0; m < sequenceMatches.size(); m++)
592 sequence = (SequenceI) sequenceMatches.elementAt(m);
593 // only update start and end positions and shift features if there are
594 // no existing references
595 // TODO: test for legacy where uniprot or EMBL refs exist but no
596 // mappings are made (but content matches retrieved set)
597 boolean updateRefFrame = sequence.getDBRef() == null
598 || sequence.getDBRef().length == 0;
600 // verify sequence against the entry sequence
// gap characters '-', '.' and ' ' are stripped before comparison
602 String nonGapped = AlignSeq.extractGaps("-. ",
603 sequence.getSequenceAsString()).toUpperCase();
// first try: is the local sequence a substring of the record sequence?
605 int absStart = entrySeq.indexOf(nonGapped);
606 int mapStart = entry.getStart();
611 // Is local sequence contained in dataset sequence?
// second try: is the record sequence a substring of the local sequence?
612 absStart = nonGapped.indexOf(entrySeq);
614 { // verification failed.
615 messages.append(sequence.getName()
616 + " SEQUENCE NOT %100 MATCH \n");
620 sbuffer.append(sequence.getName() + " HAS " + absStart
621 + " PREFIXED RESIDUES COMPARED TO " + dbSource + "\n");
623 // + " - ANY SEQUENCE FEATURES"
624 // + " HAVE BEEN ADJUSTED ACCORDINGLY \n");
626 // create valid mapping between matching region of local sequence and
627 // the mapped sequence
628 mp = new Mapping(null, new int[]
629 { sequence.getStart() + absStart,
630 sequence.getStart() + absStart + entrySeq.length() - 1 },
633 entry.getStart() + entrySeq.length() - 1 }, 1, 1);
634 updateRefFrame = false; // mapping is based on current start/end so
635 // don't modify start and end
640 // update start and end of local sequence to place it in entry's
642 // apply identity map map from whole of local sequence to matching
643 // region of database
645 mp = null; // Mapping.getIdentityMap();
647 // new int[] { absStart+sequence.getStart(),
648 // absStart+sequence.getStart()+entrySeq.length()-1},
649 // new int[] { entry.getStart(), entry.getEnd() }, 1, 1);
650 // relocate local features for updated start
653 if (sequence.getSequenceFeatures() != null)
655 SequenceFeature[] sf = sequence.getSequenceFeatures();
656 int start = sequence.getStart();
657 int end = sequence.getEnd();
658 int startShift = 1 - absStart - start; // how much the features
661 for (int sfi = 0; sfi < sf.length; sfi++)
// only features lying wholly inside the current start..end range are moved
663 if (sf[sfi].getBegin() >= start && sf[sfi].getEnd() <= end)
665 // shift feature along by absstart
666 sf[sfi].setBegin(sf[sfi].getBegin() + startShift);
667 sf[sfi].setEnd(sf[sfi].getEnd() + startShift);
674 System.out.println("Adding dbrefs to " + sequence.getName()
675 + " from " + dbSource + " sequence : " + entry.getName());
676 sequence.transferAnnotation(entry, mp);
677 // unknownSequences.remove(sequence);
678 int absEnd = absStart + nonGapped.length();
680 if (!trimDatasetSeqs)
682 // insert full length sequence from record
683 sequence.setSequence(entry.getSequenceAsString());
684 sequence.setStart(entry.getStart());
688 // finally, update local sequence reference frame if we're allowed
691 // just fix start/end
692 sequence.setStart(absStart);
693 sequence.setEnd(absEnd);
695 // search for alignment sequences to update coordinate frame for
696 for (int alsq = 0; alsq < alseqs.length; alsq++)
// identity comparison: only alignment sequences backed by this exact
// dataset sequence object are touched
698 if (alseqs[alsq].getDatasetSequence() == sequence)
700 String ngAlsq = AlignSeq.extractGaps("-. ",
701 alseqs[alsq].getSequenceAsString()).toUpperCase();
702 int oldstrt = alseqs[alsq].getStart();
// new start = offset of the ungapped alignment sequence within the dataset
// sequence, plus the dataset sequence's start
703 alseqs[alsq].setStart(sequence.getSequenceAsString()
704 .toUpperCase().indexOf(ngAlsq)
705 + sequence.getStart());
// the end is recomputed only when the start actually moved
706 if (oldstrt != alseqs[alsq].getStart())
708 alseqs[alsq].setEnd(ngAlsq.length()
709 + alseqs[alsq].getStart() - 1);
713 // TODO: search for all other references to this dataset sequence, and
715 // TODO: update all AlCodonMappings which involve this alignment
716 // sequence (e.g. Q30167 cdna translation from exon2 product (vamsas
719 // and remove it from the rest
720 // TODO: decide if we should remove annotated sequence from set
721 sdataset.remove(sequence);
722 // TODO: should we make a note of sequences that have received new DB
723 // ids, so we can query all enabled DAS servers for them ?
728 // report the ID/sequence mismatches
729 sbuffer.append(messages);
734 * loop thru and collect additional sequences in Map.
736 * @param sequencesArray
// Returns an array containing every input sequence plus any sequence that
// one of its DBRef mappings points to (map.getTo()). Mapped sequences are
// added only if not already collected. A null input array skips the loop.
// NOTE(review): the declaration of map and any guard around the array
// rebuild are not visible in this listing.
739 private SequenceI[] recoverDbSequences(SequenceI[] sequencesArray)
741 Vector nseq = new Vector();
742 for (int i = 0; sequencesArray != null && i < sequencesArray.length; i++)
// the input sequence itself is always kept
744 nseq.addElement(sequencesArray[i]);
745 DBRefEntry dbr[] = sequencesArray[i].getDBRef();
// a sequence without DBRefs contributes nothing further
747 for (int r = 0; (dbr != null) && r < dbr.length; r++)
749 if ((map = dbr[r].getMap()) != null)
// add the mapping target once, skipping null targets and duplicates
751 if (map.getTo() != null && !nseq.contains(map.getTo()))
753 nseq.addElement(map.getTo());
// rebuild the array from the collected sequences
760 sequencesArray = new SequenceI[nseq.size()];
761 nseq.toArray(sequencesArray);
763 return sequencesArray;