2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import java.util.Locale;
25 import jalview.datamodel.DBRefEntry;
26 import jalview.datamodel.SequenceI;
28 import java.util.ArrayList;
29 import java.util.Arrays;
30 import java.util.HashMap;
31 import java.util.List;
32 import java.util.Vector;
35 * Routines for approximate Sequence Id resolution by name using string
36 * containment (on word boundaries) rather than equivalence. It also attempts to
37 * resolve ties where no exact match is available by picking the id closest
40 public class SequenceIdMatcher
// Lookup table from a SeqIdName key to the sequence registered under that key.
42 private HashMap<SeqIdName, SequenceI> names;
// Builds a matcher for the given list of sequences.
44 public SequenceIdMatcher(List<SequenceI> seqs)
// start from an empty lookup table
46 names = new HashMap<SeqIdName, SequenceI>();
// NOTE(review): the statement(s) following this allocation are not visible in
// this listing; presumably seqs is registered there - confirm against the full file.
51 * Adds sequences to this matcher
// Iterates over every sequence in seqs in order to add it to this matcher.
55 public void addAll(List<SequenceI> seqs)
// NOTE(review): the loop body is not visible in this listing; presumably each
// element is passed to add(SequenceI) - confirm against the full file.
57 for (SequenceI seq : seqs)
64 * Adds one sequence to this matcher
// Registers one sequence under its display id, then inspects the database
// references of its root dataset sequence for additional keys.
68 public void add(SequenceI seq)
70 // TODO: deal with ID collisions - SequenceI should be appended to list
71 // associated with this key.
// primary key is getDisplayId(true); HashMap.put replaces the value of an
// existing equal key, which is the collision the TODO above refers to
72 names.put(new SeqIdName(seq.getDisplayId(true)), seq);
// walk the dataset-sequence chain until a sequence with no dataset sequence is
// reached; its DBRefs are the ones read below
73 SequenceI dbseq = seq;
74 while (dbseq.getDatasetSequence() != null)
76 dbseq = dbseq.getDatasetSequence();
78 // add in any interesting identifiers
79 List<DBRefEntry> dbr = dbseq.getDBRefs();
// NOTE(review): the declaration of sid, and any null check on dbr, are not
// visible in this listing - confirm against the full file.
83 for (int r = 0, nr = dbr.size(); r < nr; r++)
// candidate secondary key: the accession id of the r-th database reference
85 sid = new SeqIdName(dbr.get(r).getAccessionId());
// only accession keys that are not already present pass this test
// NOTE(review): the guarded statement is not visible in this listing;
// presumably it stores seq under sid - confirm.
86 if (!names.containsKey(sid))
/**
 * Convenience constructor: makes a matcher from a concrete array by viewing
 * the array as a list and delegating to the list-based constructor.
 *
 * @param sequences
 *          the sequences handed on to the list-based constructor
 */
public SequenceIdMatcher(SequenceI[] sequences)
{
  this(Arrays.asList(sequences));
}
105 * returns the closest SequenceI in matches to SeqIdName and returns all the
106 * matches to the names hash.
111 * List of SequenceI objects
112 * @return SequenceI closest SequenceI to SeqIdName
114 private SequenceI pickbestMatch(SeqIdName candName,
115 List<SequenceI> matches)
117 List<SequenceI> st = pickbestMatches(candName, matches);
118 return st == null || st.size() == 0 ? null : st.get(0);
122 * returns the closest SequenceI in matches to SeqIdName and returns all the
123 * matches to the names hash.
128 * List of SequenceI objects
129 * @return List of the SequenceI objects in matches closest to SeqIdName
// Scans the candidates in matches, comparing the length of each candidate's
// name with the length of the query id, and collects the preferred ones in best.
// Side effects visible here: matches is consumed element by element, and every
// consumed sequence is put into names keyed by its getName() value.
132 private List<SequenceI> pickbestMatches(SeqIdName candName,
133 List<SequenceI> matches)
135 ArrayList<SequenceI> best = new ArrayList<SequenceI>();
// guard against a missing query or an empty/missing candidate list
// NOTE(review): the guarded statement is not visible in this listing;
// presumably an early return - confirm against the full file.
136 if (candName == null || matches == null || matches.size() == 0)
// the first candidate is the initial reference for comparison
140 SequenceI match = matches.remove(0);
// re-register under getName(); note that add(SequenceI) keys by getDisplayId(true)
142 names.put(new SeqIdName(match.getName()), match);
143 int matchlen = match.getName().length();
144 int namlen = candName.id.length();
145 while (matches.size() > 0)
147 // look through for a better one.
148 SequenceI cand = matches.remove(0);
149 names.put(new SeqIdName(cand.getName()), cand);
// q and w are assigned inside the condition below: q is the length distance of
// the current reference from the query, w that of this candidate
150 int q, w, candlen = cand.getName().length();
151 // keep the one with an id 'closer' to the given seqnam string
// true only when the candidate is strictly closer in length to the query AND
// longer than the current reference
// NOTE(review): the branch body is not visible in this listing.
152 if ((q = Math.abs(matchlen - namlen)) > (w = Math
153 .abs(candlen - namlen)) && candlen > matchlen)
// same distance from the query and same length as the current reference
// NOTE(review): the branch body is not visible; presumably it records a tie - confirm.
160 if (q == w && candlen == matchlen)
// NOTE(review): the handling of an empty best list and the final return
// statement are not visible in this listing.
166 if (best.size() == 0)
175 * get SequenceI with closest SequenceI.getName() to seq.getName()
181 public SequenceI findIdMatch(SequenceI seq)
183 SeqIdName nam = new SeqIdName(seq.getName());
184 return findIdMatch(nam);
187 public SequenceI findIdMatch(String seqnam)
189 SeqIdName nam = new SeqIdName(seqnam);
190 return findIdMatch(nam);
194 * Find all matches for a given sequence name.
197 * string to query Matcher with.
198 * @return a new array or (possibly) null
// Public entry point: wraps the query string in a SeqIdName and delegates to
// the SeqIdName overload, then converts the resulting list to an array.
200 public SequenceI[] findAllIdMatches(String seqnam)
203 SeqIdName nam = new SeqIdName(seqnam);
204 List<SequenceI> m = findAllIdMatches(nam);
// copy the matches into a newly allocated array of exactly m.size() elements
// NOTE(review): any null check on m, and the alternative return value, are not
// visible in this listing - confirm against the full file.
207 return m.toArray(new SequenceI[m.size()]);
215 * Return pointers to sequences (or sequence object containers) which have
216 * same Id as a given set of different sequence objects
220 * @return SequenceI[]
// Looks up, for each element of seqs, a registered sequence with a matching
// name and stores it at the same index of the result array.
222 public SequenceI[] findIdMatch(SequenceI[] seqs)
224 SequenceI[] namedseqs = null;
// NOTE(review): the declarations of i and nam, any guard on seqs, and the
// start of the do-loop are not visible in this listing.
// the result array has one slot per input sequence
230 namedseqs = new SequenceI[seqs.length];
233 nam = new SeqIdName(seqs[i].getName());
// only run the core search when at least one key equals nam
235 if (names.containsKey(nam))
237 namedseqs[i] = findIdMatch(nam);
// NOTE(review): the branch taken when no key matches is not visible in this listing.
243 } while (++i < seqs.length);
250 * core findIdMatch search method
256 private SequenceI findIdMatch(
257 jalview.analysis.SequenceIdMatcher.SeqIdName nam)
259 Vector matches = new Vector();
260 while (names.containsKey(nam))
262 matches.addElement(names.remove(nam));
264 return pickbestMatch(nam, matches);
268 * core findIdMatch search method for finding all equivalent matches
272 * @return List of SequenceI
// Core search for all equivalent matches: removes every names entry whose key
// equals nam, then passes the removed sequences to pickbestMatches.
274 private List<SequenceI> findAllIdMatches(
275 jalview.analysis.SequenceIdMatcher.SeqIdName nam)
277 ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
// remove(nam) takes out one matching entry per iteration, so loop until no
// remaining key equals nam
278 while (names.containsKey(nam))
280 matches.add(names.remove(nam));
// pickbestMatches puts the removed sequences back into names (keyed by
// getName()) while it scans them
282 List<SequenceI> r = pickbestMatches(nam, matches);
// NOTE(review): the return statement is not visible in this listing;
// presumably r is returned - confirm against the full file.
// the wrapped id is stored lower-cased using Locale.ROOT, so the stored form
// does not depend on the default locale
// NOTE(review): the enclosing constructor header and any null handling for s
// are not visible in this listing.
294 id = s.toLowerCase(Locale.ROOT);
// Hash code for the wrapped id: when the id has at least four characters, only
// its first four characters are hashed.
// NOTE(review): the alternative used for ids shorter than four characters is on
// a line not visible in this listing. Because stringequals can treat a short id
// as equal to a longer one, check that equal ids always get equal hash codes.
303 public int hashCode()
305 return ((id.length() >= 4) ? id.substring(0, 4).hashCode()
// Equality is delegated to stringequals, for both SeqIdName and String arguments.
310 public boolean equals(Object s)
// NOTE(review): statements before this check (for example a null guard) and
// the final fall-through return are not visible in this listing.
316 if (s instanceof SeqIdName)
318 return this.stringequals(((SeqIdName) s).id);
// plain strings are accepted as well; they are lower-cased with Locale.ROOT
// before being compared with the wrapped id
322 if (s instanceof String)
324 return this.stringequals(((String) s).toLowerCase(Locale.ROOT));
332 * Characters that define the end of a unique sequence ID at the beginning
333 * of an arbitrary ID string. JBPNote: This is a heuristic that will fail for
334 * arbitrarily extended sequence ids (like portions of an aligned set of
335 * repeats from one sequence)
// separator characters tested by stringequals; (char) 0x00A4 is the currency sign
// NOTE(review): the initializer continues on a line not visible in this listing.
337 private String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
341 * matches if one ID properly contains another at a whitespace boundary.
342 * TODO: (JBPNote) These are not efficient. should use char[] for speed
343 * todo: (JBPNote) Set separator characters appropriately
// Compares the wrapped id with s: the shorter string must be a prefix of the
// longer one, and the character that follows the prefix in the longer string
// must be one of the WORD_SEP characters. Identical strings also match.
348 private boolean stringequals(String s)
// case 1: the wrapped id is the longer string
350 if (id.length() > s.length())
// s must start id, and the character of id just after s must be a separator
// NOTE(review): the else-arm of this conditional is not visible in this listing.
352 return id.startsWith(s)
353 ? (WORD_SEP.indexOf(id.charAt(s.length())) > -1)
// case 2: s is at least as long as the wrapped id; an exact match succeeds,
// otherwise the character of s just after id must be a separator
// NOTE(review): the outer else-arm of this conditional is not visible in this listing.
358 return s.startsWith(id)
359 ? (s.equals(id) ? true
360 : (WORD_SEP.indexOf(s.charAt(id.length())) > -1))
366 * toString method returns the wrapped sequence id. For debugging purposes
367 * only; this behaviour is not guaranteed to remain unchanged.
370 public String toString()