2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.4)
3 * Copyright (C) 2008 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
19 package jalview.analysis;
23 import jalview.datamodel.*;
28 * <p>Description: </p>
29 * Routine which does approximate Sequence Id resolution by name using
30 * string containment (on word boundaries) rather than equivalence. It also
31 * attempts to resolve ties where no exact match is available by picking the
32 * id closest to the query.
33 * <p>Copyright: Copyright (c) 2004</p>
35 * <p>Company: Dundee University</p>
37 * @author not attributable
40 public class SequenceIdMatcher
// Lookup table whose keys are SeqIdName objects and whose values are the
// SequenceI objects they were built from (filled in by the constructor).
42 private Hashtable names;
44 public SequenceIdMatcher(SequenceI[] seqs)
46 names = new Hashtable();
47 for (int i = 0; i < seqs.length; i++)
49 names.put(new SeqIdName(seqs[i].getName()), seqs[i]);
54 * returns the closest SequenceI in matches to SeqIdName and returns all the matches
56 * @param candName SeqIdName
57 * @param matches Vector of SequenceI objects
58 * @return SequenceI closest SequenceI to SeqIdName
// Chooses one sequence out of matches by comparing name lengths against the
// length of candName.id. Every sequence taken from matches here is also put
// back into the names table under a fresh SeqIdName key.
60 private SequenceI pickbestMatch(SeqIdName candName, Vector matches)
62 SequenceI match = null;
// Guard for a missing query or an empty/missing candidate list.
// NOTE(review): the guarded statement is not visible in this excerpt.
63 if (candName == null || matches == null || matches.size() == 0)
// Start with the first candidate as the current best: take it out of the
// vector and re-register it in names.
67 match = (SequenceI) matches.elementAt(0);
68 matches.removeElementAt(0);
69 names.put(new SeqIdName(match.getName()), match);
// Lengths used for the closeness comparison below.
70 int matchlen = match.getName().length();
71 int namlen = candName.id.length();
// NOTE(review): no removal of element 0 is visible inside this loop in the
// lines shown here; confirm that matches shrinks on every pass, otherwise the
// loop cannot terminate when more than one candidate was supplied.
72 while (matches.size() > 0)
74 // look through for a better one.
75 SequenceI cand = (SequenceI) matches.elementAt(0);
76 names.put(new SeqIdName(cand.getName()), cand);
77 int candlen = cand.getName().length();
78 // keep the one with an id 'closer' to the given seqnam string
// First half of the test: cand's name length is nearer to the query length
// than the current best's.
// NOTE(review): the remainder of this condition and the statements it guards
// are not visible in this excerpt.
79 if (Math.abs(matchlen - namlen) > Math.abs(candlen - namlen) &&
90 * get SequenceI with closest SequenceI.getName() to seq.getName()
91 * @param seq SequenceI
94 SequenceI findIdMatch(SequenceI seq)
96 SeqIdName nam = new SeqIdName(seq.getName());
97 return findIdMatch(nam);
100 SequenceI findIdMatch(String seqnam)
102 SeqIdName nam = new SeqIdName(seqnam);
103 return findIdMatch(nam);
109 * Return pointers to sequences (or sequence object containers)
110 * which have same Id as a given set of different sequence objects
112 * @param seqs SequenceI[]
113 * @return SequenceI[]
// Batch lookup: resolves each element of seqs by name against the names table
// and stores the result at the same index of the returned array.
115 SequenceI[] findIdMatch(SequenceI[] seqs)
// Starts out null; assigned a new array with one slot per input sequence below.
117 SequenceI[] namedseqs = null;
// NOTE(review): the declarations of i and nam, and any condition guarding this
// allocation, are on lines not visible in this excerpt.
123 namedseqs = new SequenceI[seqs.length];
// Build the lookup key from the i-th sequence's name.
126 nam = new SeqIdName(seqs[i].getName());
// Keys present in names are resolved through the core findIdMatch(SeqIdName)
// search. NOTE(review): what happens for absent keys is not visible here.
128 if (names.containsKey(nam))
130 namedseqs[i] = findIdMatch(nam);
// Advance i first, then continue while it is still a valid index into seqs.
137 while (++i < seqs.length);
144 * core findIdMatch search method
145 * @param nam SeqIdName
// Core lookup: gathers the values of all entries in names whose key matches
// nam, then lets pickbestMatch choose one of them.
// NOTE(review): the parameter name (used below as nam) is declared on a line
// not visible in this excerpt.
148 private SequenceI findIdMatch(jalview.analysis.SequenceIdMatcher.SeqIdName
151 Vector matches = new Vector();
// Each pass removes one matching entry from names and appends its value to
// matches; the loop stops once names holds no key equal to nam.
152 while (names.containsKey(nam))
154 matches.addElement(names.remove(nam));
// pickbestMatch re-inserts the sequences it examines into names, so the
// removals above are not meant to be permanent.
156 return pickbestMatch(nam, matches);
159 private class SeqIdName
175 public int hashCode()
177 return ( (id.length() >= 4) ? id.substring(0, 4).hashCode() : id.hashCode());
// Type-dispatching equality: forwards to the SeqIdName or the String overload
// of equals depending on the runtime type of s.
// NOTE(review): the result for any other argument type is not visible in this
// excerpt.
180 public boolean equals(Object s)
182 if (s instanceof SeqIdName)
184 return this.equals( (SeqIdName) s);
188 if (s instanceof String)
190 return this.equals( (String) s);
198 * Characters that define the end of a unique sequence ID at
199 * the beginning of an arbitrary ID string
200 * JBPNote: This is a heuristic that will fail for arbitrarily extended sequence ids
201 * (like portions of an aligned set of repeats from one sequence)
203 private String WORD_SEP = "~. |#\\/<>!\"£$%^*)}[@',?_";
206 * matches if one ID starts with the other and the next character of the longer ID is in WORD_SEP (identical IDs also match).
207 * TODO: (JBPNote) These are not efficient. should use char[] for speed
208 * todo: (JBPNote) Set separator characters appropriately
// Prefix comparison of this id against s.id, with the boundary character
// checked against WORD_SEP.
// NOTE(review): the results returned when startsWith fails are on lines not
// visible in this excerpt.
212 public boolean equals(SeqIdName s)
// Case 1: this id is strictly longer than s.id.
214 if (id.length() > s.id.length())
// If id begins with s.id, the match depends on whether the character of id
// immediately after that prefix is one of the WORD_SEP characters.
216 return id.startsWith(s.id) ?
217 (WORD_SEP.indexOf(id.charAt(s.id.length())) > -1)
// Case 2: s.id is at least as long as this id. If s.id begins with id, then
// identical ids match outright; otherwise the character of s.id immediately
// after the prefix must be one of the WORD_SEP characters.
222 return s.id.startsWith(id) ?
223 (s.id.equals(id) ? true :
224 (WORD_SEP.indexOf(s.id.charAt(id.length())) > -1))
// Same prefix comparison as equals(SeqIdName), but against a plain String s
// instead of another SeqIdName's id.
// NOTE(review): this method is truncated in this excerpt; the results returned
// when startsWith fails are not visible.
229 public boolean equals(String s)
// Case 1: this id is strictly longer than s.
231 if (id.length() > s.length())
// If id begins with s, the match depends on whether the character of id
// immediately after that prefix is one of the WORD_SEP characters.
233 return id.startsWith(s) ?
234 (WORD_SEP.indexOf(id.charAt(s.length())) > -1)
// Case 2: s is at least as long as this id. If s begins with id, then an
// identical string matches outright; otherwise the character of s immediately
// after the prefix must be one of the WORD_SEP characters.
239 return s.startsWith(id) ?
240 (s.equals(id) ? true :
241 (WORD_SEP.indexOf(s.charAt(id.length())) > -1))