From: Jim Procter Date: Mon, 29 Aug 2016 21:15:58 +0000 (+0100) Subject: JAL-2154 filter out dbrefs that already match primary refs for sequences already... X-Git-Tag: Release_2_10_0~47^2~4^2~33 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=d11914fc106b8cfbcbf14176b49f9ae71992a8b8;p=jalview.git JAL-2154 filter out dbrefs that already match primary refs for sequences already retrieved. This is a patch for ENA records where protein products were not reconstructed (AC006347) resulting in duplicate sequence fetches --- diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index 46256a6..1295b46 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -390,6 +390,37 @@ public class CrossRef SequenceI[] retrieved = null; SequenceI dss = seq.getDatasetSequence() == null ? seq : seq .getDatasetSequence(); + // first filter in case we are retrieving crossrefs that have already been + // retrieved. this happens for cases where a database record doesn't yield + // protein products for CDS + DBRefEntry[] dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]); + for (SequenceI sq : dataset.getSequences()) + { + boolean dupeFound = false; + // !fromDna means we are looking only for nucleotide sequences, not + // protein + if (sq.isProtein() == fromDna) + { + for (DBRefEntry dbr : sq.getPrimaryDBRefs()) + { + for (DBRefEntry found : DBRefUtils.searchRefs(dbrSourceSet, dbr)) + { + sourceRefs.remove(found); + dupeFound = true; + } + } + } + if (dupeFound) + { + dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]); + } + } + if (sourceRefs.size() == 0) + { + // no more work to do! We already had all requested sequence records in + // the dataset. + return; + } try { retrieved = sftch.getSequences(sourceRefs, !fromDna);