From 63cead20b09743e899a22cb89f1c09e4d41cc8c0 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Tue, 12 Feb 2019 08:44:08 +0000 Subject: [PATCH] JAL-3195 revised SequenceIdMatcher (tests todo) --- Review note: findIdMatch now treats a missing exclusion list as "nothing excluded" (in this patch excludes is only populated in add() for ids whose dbref has a triplet map, so excludes.get(nam) is null for any other id). NOTE(review): the System.out.println calls in add/addMatchCandidate look like debug output, and findAllIdMatches does not consult excludes - confirm both are intended before merge. src/jalview/analysis/SequenceIdMatcher.java | 76 ++++++++++++++++++++------- src/jalview/io/FeaturesFile.java | 2 +- 2 files changed, 58 insertions(+), 20 deletions(-) diff --git a/src/jalview/analysis/SequenceIdMatcher.java b/src/jalview/analysis/SequenceIdMatcher.java index 3d4cbe7..9888647 100755 --- a/src/jalview/analysis/SequenceIdMatcher.java +++ b/src/jalview/analysis/SequenceIdMatcher.java @@ -27,7 +27,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; -import java.util.Vector; +import java.util.Map; /** * Routines for approximate Sequence Id resolution by name using string @@ -37,11 +37,14 @@ import java.util.Vector; */ public class SequenceIdMatcher { - private HashMap names; + private HashMap> names; + + private Map> excludes; public SequenceIdMatcher(List seqs) { - names = new HashMap(); + names = new HashMap<>(); + excludes = new HashMap<>(); addAll(seqs); } @@ -65,9 +68,8 @@ public class SequenceIdMatcher */ public void add(SequenceI seq) { - // TODO: deal with ID collisions - SequenceI should be appended to list - // associated with this key. 
- names.put(new SeqIdName(seq.getDisplayId(true)), seq); + SeqIdName key = new SeqIdName(seq.getDisplayId(true)); + addMatchCandidate(key, seq); SequenceI dbseq = seq; while (dbseq.getDatasetSequence() != null) { @@ -77,18 +79,45 @@ public class SequenceIdMatcher if (dbseq.getDBRefs() != null) { DBRefEntry dbr[] = dbseq.getDBRefs(); - SeqIdName sid = null; for (int r = 0; r < dbr.length; r++) { - sid = new SeqIdName(dbr[r].getAccessionId()); - if (!names.containsKey(sid)) + DBRefEntry dbref = dbr[r]; + SeqIdName sid = new SeqIdName(dbref.getAccessionId()); + if (dbref.getMap() != null + && dbref.getMap().getMap().isTripletMap()) { - names.put(sid, seq); + /* + * dbref with 3:1 or 1:3 mapping (e.g. CDS/protein); + * mark as not a valid match for this id + */ + List excluded = excludes.get(sid); + if (excluded == null) + { + excludes.put(sid, excluded = new ArrayList<>()); + } + excluded.add(seq); + System.out.println("Excluding " + sid + "->" + seq); + continue; } + addMatchCandidate(sid, seq); } } } + void addMatchCandidate(SeqIdName key, SequenceI seq) + { + List namesList = names.get(key); + if (namesList == null) + { + names.put(key, namesList = new ArrayList<>()); + } + if (!namesList.contains(seq)) + { + namesList.add(seq); + System.out.println("Adding " + key + "->" + seq); + } + } + /** * convenience method to make a matcher from concrete array * @@ -130,21 +159,21 @@ public class SequenceIdMatcher private List pickbestMatches(SeqIdName candName, List matches) { - ArrayList best = new ArrayList(); + List best = new ArrayList<>(); if (candName == null || matches == null || matches.size() == 0) { return null; } SequenceI match = matches.remove(0); best.add(match); - names.put(new SeqIdName(match.getName()), match); + addMatchCandidate(new SeqIdName(match.getName()), match); int matchlen = match.getName().length(); int namlen = candName.id.length(); while (matches.size() > 0) { // look through for a better one. 
SequenceI cand = matches.remove(0); - names.put(new SeqIdName(cand.getName()), cand); + addMatchCandidate(new SeqIdName(cand.getName()), cand); int q, w, candlen = cand.getName().length(); // keep the one with an id 'closer' to the given seqnam string if ((q = Math.abs(matchlen - namlen)) > (w = Math @@ -251,13 +280,22 @@ public class SequenceIdMatcher * SeqIdName * @return SequenceI */ - private SequenceI findIdMatch( - jalview.analysis.SequenceIdMatcher.SeqIdName nam) + private SequenceI findIdMatch(SeqIdName nam) { - Vector matches = new Vector(); + List matches = new ArrayList<>(); while (names.containsKey(nam)) { - matches.addElement(names.remove(nam)); + List candidates = names.remove(nam); + List except = excludes.get(nam); /* null when no exclusions were recorded for nam */ + int j = candidates.size(); + for (int i = 0; i < j; i++) + { + SequenceI candidate = candidates.get(i); + if (except == null || !except.contains(candidate)) + { + matches.add(candidate); + } + } } return pickbestMatch(nam, matches); } @@ -272,10 +310,10 @@ public class SequenceIdMatcher private List findAllIdMatches( jalview.analysis.SequenceIdMatcher.SeqIdName nam) { - ArrayList matches = new ArrayList(); + List matches = new ArrayList<>(); while (names.containsKey(nam)) { - matches.add(names.remove(nam)); + matches.addAll(names.remove(nam)); } List r = pickbestMatches(nam, matches); return r; diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java index 169da5a..559ca79 100755 --- a/src/jalview/io/FeaturesFile.java +++ b/src/jalview/io/FeaturesFile.java @@ -393,7 +393,7 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI } String desc = gffColumns[0]; String seqId = gffColumns[1]; - SequenceI seq = findSequence(seqId, alignment, null, relaxedIdMatching); + SequenceI seq; if (!ID_NOT_SPECIFIED.equals(seqId)) { -- 1.7.10.2