From 6be186417258f0fd90a426a13032d3092a67ce6c Mon Sep 17 00:00:00 2001 From: Jim Procter Date: Sun, 2 Oct 2016 12:22:20 +0100 Subject: [PATCH] JAL-2210 try to merge all sequences referenced by a sequence with a matching xref with ones already in dataset (rather than just ones matching the source sequence) before adding the xrefed sequence to the returned sequence set. --- src/jalview/analysis/CrossRef.java | 110 +++++++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 34 deletions(-) diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index 46e2119..31b35ec 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -483,15 +483,28 @@ public class CrossRef * @param cf * @param sourceSequence * @param retrievedSequence + * @return true if retrievedSequence was imported */ - private void importCrossRefSeq(AlignedCodonFrame cf, + private boolean importCrossRefSeq(AlignedCodonFrame cf, SequenceI sourceSequence, SequenceI retrievedSequence) { + /** + * set when retrievedSequence has been verified as a crossreference for + * sourceSequence + */ + boolean imported = false; DBRefEntry[] dbr = retrievedSequence.getDBRefs(); + List<SequenceI> newDsSeqs = new ArrayList<SequenceI>(); if (dbr != null) { for (DBRefEntry dbref : dbr) { + SequenceI matched = findInDataset(dbref); + if (matched == sourceSequence) + { + // verified retrieved and source sequence cross-reference each other + imported = true; + } // find any entry where we should put in the sequence being // cross-referenced into the map Mapping map = dbref.getMap(); @@ -499,61 +512,70 @@ public class CrossRef { if (map.getTo() != null && map.getMap() != null) { - // TODO findInDataset requires exact sequence match but - // 'congruent' test is only for the mapped part - // maybe not a problem in practice since only ENA provide a - // mapping and it is to the full protein translation of CDS - SequenceI matched = findInDataset(dbref); - // 
matcher.findIdMatch(map.getTo()); - if (matched != null) + if (map.getTo() == sourceSequence) { - /* - * already got an xref to this sequence; update this - * map to point to the same sequence, and add - * any new dbrefs to it - */ - DBRefEntry[] toRefs = map.getTo().getDBRefs(); - if (toRefs != null) - { - for (DBRefEntry ref : toRefs) - { - matched.addDBRef(ref); // add or update mapping - } - } - map.setTo(matched); + // already called to import once, and most likely this sequence + // already imported ! + continue; } - else + if (matched == null) { - if (dataset.findIndex(map.getTo()) == -1) - { - dataset.addSequence(map.getTo()); - matcher.add(map.getTo()); - } + /* + * sequence is new to dataset, so save a reference so it can be added. + */ + newDsSeqs.add(map.getTo()); + continue; } + /* + * there was a matching sequence in dataset, so now, check to see if we can update the map.getTo() sequence to the existing one. + */ + try { // compare ms with dss and replace with dss in mapping // if map is congruent SequenceI ms = map.getTo(); + // TODO findInDataset requires exact sequence match but + // 'congruent' test is only for the mapped part + // maybe not a problem in practice since only ENA provide a + // mapping and it is to the full protein translation of CDS + // matcher.findIdMatch(map.getTo()); + // TODO addendum: if matched is shorter than getTo, this will fail + // - when it should really succeed. 
int sf = map.getMap().getToLowest(); int st = map.getMap().getToHighest(); SequenceI mappedrg = ms.getSubSequence(sf, st); if (mappedrg.getLength() > 0 && ms.getSequenceAsString().equals( - sourceSequence.getSequenceAsString())) + matched.getSequenceAsString())) { + /* + * sequences were a match, so point the mapping at the existing dataset sequence + */ String msg = "Mapping updated from " + ms.getName() + " to retrieved crossreference " - + sourceSequence.getName(); + + matched.getName(); System.out.println(msg); - map.setTo(sourceSequence); + + DBRefEntry[] toRefs = map.getTo().getDBRefs(); + if (toRefs != null) + { + /* + * transfer database refs + */ + for (DBRefEntry ref : toRefs) + { + matched.addDBRef(ref); // add or update mapping + } + } + map.setTo(matched); /* * give the reverse reference the inverse mapping * (if it doesn't have one already) */ - setReverseMapping(sourceSequence, dbref, cf); + setReverseMapping(matched, dbref, cf); /* * copy sequence features as well, avoiding @@ -579,9 +601,10 @@ public class CrossRef return super.equals(o, true); } }; - sourceSequence.addSequenceFeature(newFeature); + matched.addSequenceFeature(newFeature); } } + } cf.addMap(retrievedSequence, map.getTo(), map.getMap()); } catch (Exception e) @@ -594,7 +617,26 @@ public class CrossRef } } } - retrievedSequence.updatePDBIds(); + if (imported) + { + retrievedSequence.updatePDBIds(); + rseqs.add(retrievedSequence); + if (dataset.findIndex(retrievedSequence) == -1) + { + dataset.addSequence(retrievedSequence); + matcher.add(retrievedSequence); + } + for (SequenceI newToSeq : newDsSeqs) + { + + if (dataset.findIndex(newToSeq) == -1) + { + dataset.addSequence(newToSeq); + matcher.add(newToSeq); + } + } + } + return imported; } /** * Sets the inverse sequence mapping in the corresponding dbref of the mapped -- 1.7.10.2