From d11914fc106b8cfbcbf14176b49f9ae71992a8b8 Mon Sep 17 00:00:00 2001 From: Jim Procter Date: Mon, 29 Aug 2016 22:15:58 +0100 Subject: [PATCH] JAL-2154 filter out dbrefs that already match primary refs for sequences already retrieved. This is a patch for ENA records where protein products were not reconstructed (AC006347) resulting in duplicate sequence fetches --- src/jalview/analysis/CrossRef.java | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index 46256a6..1295b46 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -390,6 +390,37 @@ public class CrossRef SequenceI[] retrieved = null; SequenceI dss = seq.getDatasetSequence() == null ? seq : seq .getDatasetSequence(); + // first filter in case we are retrieving crossrefs that have already been + // retrieved. this happens for cases where a database record doesn't yield + // protein products for CDS + DBRefEntry[] dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]); + for (SequenceI sq : dataset.getSequences()) + { + boolean dupeFound = false; + // !fromDna means we are looking only for nucleotide sequences, not + // protein + if (sq.isProtein() == fromDna) + { + for (DBRefEntry dbr : sq.getPrimaryDBRefs()) + { + for (DBRefEntry found : DBRefUtils.searchRefs(dbrSourceSet, dbr)) + { + sourceRefs.remove(found); + dupeFound = true; + } + } + } + if (dupeFound) + { + dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]); + } + } + if (sourceRefs.size() == 0) + { + // no more work to do! We already had all requested sequence records in + // the dataset. + return; + } try { retrieved = sftch.getSequences(sourceRefs, !fromDna); -- 1.7.10.2