2 * Jalview - A Sequence Alignment Editor and Viewer
\r
3 * Copyright (C) 2006 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
\r
5 * This program is free software; you can redistribute it and/or
\r
6 * modify it under the terms of the GNU General Public License
\r
7 * as published by the Free Software Foundation; either version 2
\r
8 * of the License, or (at your option) any later version.
\r
10 * This program is distributed in the hope that it will be useful,
\r
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
13 * GNU General Public License for more details.
\r
15 * You should have received a copy of the GNU General Public License
\r
16 * along with this program; if not, write to the Free Software
\r
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
\r
19 package jalview.analysis;
\r
23 import jalview.datamodel.*;
\r
28 * <p>Description: </p>
\r
29 * Routine which does approximate Sequence Id resolution by name using
\r
30 * string containment (on word boundaries) rather than equivalence. It also
\r
31 * attempts to resolve ties where no exact match is available by picking the
\r
32 * the id closest to the query.
\r
33 * <p>Copyright: Copyright (c) 2004</p>
\r
35 * <p>Company: Dundee University</p>
\r
37 * @author not attributable
\r
40 public class SequenceIdMatcher
\r
42 private Hashtable names;
\r
44 public SequenceIdMatcher(SequenceI[] seqs)
\r
46 names = new Hashtable();
\r
47 for (int i = 0; i < seqs.length; i++)
\r
49 names.put(new SeqIdName(seqs[i].getName()), seqs[i]);
\r
53 * returns the closest SequenceI in matches to SeqIdName and returns all the matches
\r
54 * to the names hash.
\r
55 * @param candName SeqIdName
\r
56 * @param matches Vector of SequenceI objects
\r
57 * @return SequenceI closest SequenceI to SeqIdName
\r
59 private SequenceI pickbestMatch(SeqIdName candName, Vector matches) {
\r
60 SequenceI match=null;
\r
61 if (candName==null || matches==null || matches.size()==0)
\r
63 match=(SequenceI) matches.elementAt(0);
\r
64 matches.removeElementAt(0);
\r
65 names.put(new SeqIdName(match.getName()), match);
\r
66 int matchlen=match.getName().length();
\r
67 int namlen=candName.id.length();
\r
68 while (matches.size()>0) {
\r
69 // look through for a better one.
\r
70 SequenceI cand=(SequenceI) matches.elementAt(0);
\r
71 names.put(new SeqIdName(cand.getName()), cand);
\r
72 int candlen = cand.getName().length();
\r
73 // keep the one with an id 'closer' to the given seqnam string
\r
74 if (Math.abs(matchlen-namlen)>Math.abs(candlen-namlen) && candlen>matchlen) {
\r
83 * get SequenceI with closest SequenceI.getName() to seq.getName()
\r
84 * @param seq SequenceI
\r
87 SequenceI findIdMatch(SequenceI seq)
\r
89 SeqIdName nam = new SeqIdName(seq.getName());
\r
90 return findIdMatch(nam);
\r
93 SequenceI findIdMatch(String seqnam)
\r
95 SeqIdName nam = new SeqIdName(seqnam);
\r
96 return findIdMatch(nam);
\r
101 * Return pointers to sequences (or sequence object containers)
\r
102 * which have same Id as a given set of different sequence objects
\r
104 * @param seqs SequenceI[]
\r
105 * @return SequenceI[]
\r
107 SequenceI[] findIdMatch(SequenceI[] seqs)
\r
109 SequenceI[] namedseqs = null;
\r
113 if (seqs.length > 0)
\r
115 namedseqs = new SequenceI[seqs.length];
\r
118 nam = new SeqIdName(seqs[i].getName());
\r
120 if (names.containsKey(nam))
\r
122 namedseqs[i] = findIdMatch(nam);
\r
126 namedseqs[i] = null;
\r
129 while (++i < seqs.length);
\r
136 * core findIdMatch search method
\r
137 * @param nam SeqIdName
\r
138 * @return SequenceI
\r
140 private SequenceI findIdMatch(jalview.analysis.SequenceIdMatcher.SeqIdName
\r
143 Vector matches=new Vector();
\r
144 while (names.containsKey(nam))
\r
146 matches.addElement(names.remove(nam));
\r
148 return pickbestMatch(nam, matches);
\r
151 private class SeqIdName
\r
155 SeqIdName(String s)
\r
158 id = new String(s);
\r
163 public int hashCode()
\r
165 return ((id.length()>=4) ? id.substring(0, 4).hashCode() : id.hashCode());
\r
168 public boolean equals(Object s)
\r
170 if (s instanceof SeqIdName)
\r
172 return this.equals( (SeqIdName) s);
\r
176 if (s instanceof String)
\r
178 return this.equals( (String) s);
\r
186 * Characters that define the end of a unique sequence ID at
\r
187 * the beginning of an arbitrary ID string
\r
188 * JBPNote: This is a heuristic that will fail for arbritrarily extended sequence id's
\r
189 * (like portions of an aligned set of repeats from one sequence)
\r
191 private String WORD_SEP="~. |#\\/<>!\"£$%^*)}[@',?_";
\r
194 * matches if one ID properly contains another at a whitespace boundary.
\r
195 * TODO: (JBPNote) These are not efficient. should use char[] for speed
\r
196 * todo: (JBPNote) Set separator characters appropriately
\r
197 * @param s SeqIdName
\r
200 public boolean equals(SeqIdName s)
\r
202 if (id.length()>s.id.length()) {
\r
203 return id.startsWith(s.id) ?
\r
204 (WORD_SEP.indexOf(id.charAt(s.id.length()))>-1)
\r
207 return s.id.startsWith(id) ?
\r
208 (s.id.equals(id) ? true :
\r
209 (WORD_SEP.indexOf(s.id.charAt(id.length()))>-1))
\r
213 public boolean equals(String s)
\r
215 if (id.length()>s.length()) {
\r
216 return id.startsWith(s) ?
\r
217 (WORD_SEP.indexOf(id.charAt(s.length()))>-1)
\r
220 return s.startsWith(id) ?
\r
221 (s.equals(id) ? true :
\r
222 (WORD_SEP.indexOf(s.charAt(id.length()))>-1))
\r