/*
- * Jalview - A Sequence Alignment Editor and Viewer (Development Version 2.4.1)
- * Copyright (C) 2009 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
+ * This file is part of Jalview.
*
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.analysis;
-import java.util.*;
+import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.SequenceI;
-import jalview.datamodel.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Vector;
/**
- * <p>
- * Title:
- * </p>
- * SequenceIdMatcher
- * <p>
- * Description:
- * </p>
- * Routine which does approximate Sequence Id resolution by name using string
+ * Routines for approximate Sequence Id resolution by name using string
* containment (on word boundaries) rather than equivalence. It also attempts to
* resolve ties where no exact match is available by picking the id closest
* to the query.
- * <p>
- * Copyright: Copyright (c) 2004
- * </p>
- *
- * <p>
- * Company: Dundee University
- * </p>
- *
- * @author not attributable
- * @version 1.0
*/
public class SequenceIdMatcher
{
- private Hashtable names;
+ private HashMap<SeqIdName, SequenceI> names;
+
/**
 * Creates a matcher over the given sequences: initialises the names map and
 * then registers every sequence by calling addAll.
 *
 * @param seqs
 * sequences to index; iterated by addAll, so must not be null
 */
+ public SequenceIdMatcher(List<SequenceI> seqs)
+ {
+ names = new HashMap<SeqIdName, SequenceI>();
+ addAll(seqs);
+ }
+
+ /**
+ * Adds sequences to this matcher by calling add(SequenceI) on each element
+ * of the list in iteration order.
+ *
+ * @param seqs
+ * sequences to register; iterated directly, so must not be null
+ */
+ public void addAll(List<SequenceI> seqs)
+ {
+ for (SequenceI seq : seqs)
+ {
+ add(seq);
+ }
+ }
- public SequenceIdMatcher(SequenceI[] seqs)
+ /**
+ * Adds one sequence to this matcher. The sequence is stored under a key
+ * built from seq.getDisplayId(true); a sequence already stored under an
+ * equal key is replaced (see the TODO in the body). The chain of
+ * getDatasetSequence() links is then followed to its end, and each DBRef
+ * accession id found there is registered as an extra key for seq unless an
+ * equal key is already present.
+ *
+ * @param seq
+ * sequence to register; dereferenced immediately, so must not be null
+ */
+ public void add(SequenceI seq)
{
- names = new Hashtable();
- for (int i = 0; i < seqs.length; i++)
+ // TODO: deal with ID collisions - SequenceI should be appended to list
+ // associated with this key.
+ names.put(new SeqIdName(seq.getDisplayId(true)), seq);
+ SequenceI dbseq = seq;
+ while (dbseq.getDatasetSequence() != null)
{
- names.put(new SeqIdName(seqs[i].getName()), seqs[i]);
- // add in any interesting identifiers
- if (seqs[i].getDBRef()!=null)
+ dbseq = dbseq.getDatasetSequence();
+ }
+ // add in any interesting identifiers
+ if (dbseq.getDBRefs() != null)
+ {
+ DBRefEntry dbr[] = dbseq.getDBRefs();
+ SeqIdName sid = null;
+ for (int r = 0; r < dbr.length; r++)
{
- DBRefEntry dbr[] = seqs[i].getDBRef();
- SeqIdName sid=null;
- for (int r=0;r<dbr.length;r++)
+ sid = new SeqIdName(dbr[r].getAccessionId());
+ if (!names.containsKey(sid))
{
- sid = new SeqIdName(dbr[r].getAccessionId());
- if (!names.contains(sid))
- {
- names.put(sid, seqs[i]);
- }
+ names.put(sid, seq);
}
}
}
}
/**
+ * Convenience constructor that makes a matcher from a concrete array by
+ * wrapping it with Arrays.asList and delegating to the List constructor.
+ *
+ * @param sequences
+ * array of sequences to index; must not be null
+ */
+ public SequenceIdMatcher(SequenceI[] sequences)
+ {
+ this(Arrays.asList(sequences));
+ }
+
+ /**
* returns the closest SequenceI in matches to SeqIdName and returns all the
* matches to the names hash. Delegates to pickbestMatches and keeps only the
* first element of its result.
*
* @param candName
- * SeqIdName
+ * SeqIdName
* @param matches
- * Vector of SequenceI objects
+ * List of SequenceI objects
* @return SequenceI closest SequenceI to SeqIdName, or null when
* pickbestMatches returns null or an empty list
*/
- private SequenceI pickbestMatch(SeqIdName candName, Vector matches)
+ private SequenceI pickbestMatch(SeqIdName candName,
+ List<SequenceI> matches)
+ {
+ List<SequenceI> st = pickbestMatches(candName, matches);
+ return st == null || st.size() == 0 ? null : st.get(0);
+ }
+
+ /**
+ * Selects the best candidate(s) in matches for candName by comparing name
+ * lengths, and returns every candidate to the names hash keyed by its
+ * getName(). The first candidate starts as the best; a later candidate
+ * replaces it only when its name length is both nearer to the query's
+ * length and longer than the current best's. Candidates whose name length
+ * equals the current best's are appended as ties. The supplied list is
+ * emptied in the process.
+ *
+ * @param candName
+ * SeqIdName being resolved
+ * @param matches
+ * List of SequenceI candidates; drained by this call
+ * @return List holding the current best match followed by any ties recorded
+ * for it, or null if candName is null or matches is null or empty
+ */
+ private List<SequenceI> pickbestMatches(SeqIdName candName,
+ List<SequenceI> matches)
{
- SequenceI match = null;
+ ArrayList<SequenceI> best = new ArrayList<SequenceI>();
if (candName == null || matches == null || matches.size() == 0)
{
return null;
}
- match = (SequenceI) matches.elementAt(0);
- matches.removeElementAt(0);
+ SequenceI match = matches.remove(0);
+ best.add(match);
names.put(new SeqIdName(match.getName()), match);
int matchlen = match.getName().length();
int namlen = candName.id.length();
while (matches.size() > 0)
{
// look through for a better one.
- SequenceI cand = (SequenceI) matches.elementAt(0);
+ SequenceI cand = matches.remove(0);
names.put(new SeqIdName(cand.getName()), cand);
- int candlen = cand.getName().length();
+ int q, w, candlen = cand.getName().length();
// keep the one with an id 'closer' to the given seqnam string
// (q and w are the length distances from the query of the current best
// and of cand respectively; both are assigned by the condition below)
- if (Math.abs(matchlen - namlen) > Math.abs(candlen - namlen)
- && candlen > matchlen)
+ if ((q = Math.abs(matchlen - namlen)) > (w = Math
+ .abs(candlen - namlen)) && candlen > matchlen)
{
+ best.clear();
match = cand;
matchlen = candlen;
+ best.add(match);
+ }
+ if (q == w && candlen == matchlen)
+ {
+ // record any ties
+ best.add(cand);
}
}
- return match;
// NOTE(review): best receives an element before the loop and is only
// cleared immediately before being refilled, so this guard looks
// unreachable - confirm before relying on it.
+ if (best.size() == 0)
+ {
+ return null;
+ }
+ ;
+ return best;
}
/**
* get SequenceI with closest SequenceI.getName() to seq.getName(). The name
* is wrapped in a SeqIdName and handed to the core findIdMatch search.
*
* @param seq
- * SequenceI
+ * SequenceI
* whose name is used as the query; must not be null because getName() is
* called on it
* @return SequenceI chosen by the core search (NOTE(review): presumably null
* when nothing matches - the core search body is not fully visible here)
*/
- SequenceI findIdMatch(SequenceI seq)
+ public SequenceI findIdMatch(SequenceI seq)
{
SeqIdName nam = new SeqIdName(seq.getName());
return findIdMatch(nam);
}
/**
 * get SequenceI whose id is closest to the given name. The string is wrapped
 * in a SeqIdName and handed to the core findIdMatch search.
 *
 * @param seqnam
 * name to query the matcher with
 * @return SequenceI chosen by the core search (NOTE(review): presumably null
 * when nothing matches - the core search body is not fully visible here)
 */
- SequenceI findIdMatch(String seqnam)
+ public SequenceI findIdMatch(String seqnam)
{
SeqIdName nam = new SeqIdName(seqnam);
return findIdMatch(nam);
}
/**
+ * Find all matches for a given sequence name. The name is wrapped in a
+ * SeqIdName and passed to the core findAllIdMatches search; a non-null
+ * result list is copied into a new array.
+ *
+ * @param seqnam
+ * string to query Matcher with.
+ * @return a new array holding the best match followed by its ties, or null
+ * when the core search returns null (no entry matched)
+ */
+ public SequenceI[] findAllIdMatches(String seqnam)
+ {
+
+ SeqIdName nam = new SeqIdName(seqnam);
+ List<SequenceI> m = findAllIdMatches(nam);
+ if (m != null)
+ {
+ return m.toArray(new SequenceI[m.size()]);
+ }
+ return null;
+ }
+
+ /**
* findIdMatch
*
* Return pointers to sequences (or sequence object containers) which have
* same Id as a given set of different sequence objects
*
* @param seqs
- * SequenceI[]
+ * SequenceI[]
* @return SequenceI[]
*/
- SequenceI[] findIdMatch(SequenceI[] seqs)
+ public SequenceI[] findIdMatch(SequenceI[] seqs)
{
SequenceI[] namedseqs = null;
int i = 0;
* core findIdMatch search method
*
* @param nam
- * SeqIdName
+ * SeqIdName
* @return SequenceI
*/
private SequenceI findIdMatch(
return pickbestMatch(nam, matches);
}
- private class SeqIdName
+ /**
+ * core findIdMatch search method for finding all equivalent matches.
+ * Repeatedly removes entries from the names hash while it still contains a
+ * key equal to nam, collecting the removed sequences, then passes them to
+ * pickbestMatches, which puts them back into the hash.
+ *
+ * NOTE(review): pickbestMatches re-registers each sequence under its
+ * getName(), whereas add() stores it under getDisplayId(true) and DBRef
+ * accession ids - confirm that the keys are meant to change after a query.
+ *
+ * @param nam
+ * SeqIdName
+ * @return List of SequenceI as returned by pickbestMatches (best match and
+ * ties), or null when no key matched
+ */
+ private List<SequenceI> findAllIdMatches(
+ jalview.analysis.SequenceIdMatcher.SeqIdName nam)
+ {
+ ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
+ while (names.containsKey(nam))
+ {
+ matches.add(names.remove(nam));
+ }
+ List<SequenceI> r = pickbestMatches(nam, matches);
+ return r;
+ }
+
+ class SeqIdName
{
String id;
{
if (s != null)
{
- id = new String(s);
+ id = s.toLowerCase();
}
else
{
}
}
/**
 * Hashes only the first four characters of the id, or the whole id when it
 * is shorter than four characters.
 *
 * NOTE(review): equals() also accepts an id that extends another at a word
 * separator. When the shorter id has fewer than four characters the two ids
 * hash differently, so a hash lookup may miss such a pair - confirm whether
 * this is intended.
 *
 * @return hash of the id's first four characters, or of the whole id
 */
+ @Override
public int hashCode()
{
- return ((id.length() >= 4) ? id.substring(0, 4).hashCode() : id
- .hashCode());
+ return ((id.length() >= 4) ? id.substring(0, 4).hashCode()
+ : id.hashCode());
}
+ @Override
public boolean equals(Object s)
{
+ if (s == null)
+ {
+ return false;
+ }
if (s instanceof SeqIdName)
{
- return this.equals((SeqIdName) s);
+ return this.stringequals(((SeqIdName) s).id);
}
else
{
if (s instanceof String)
{
- return this.equals((String) s);
+ return this.stringequals(((String) s).toLowerCase());
}
}
* arbitrarily extended sequence id's (like portions of an aligned set of
* repeats from one sequence)
*/
// Characters accepted by stringequals as the boundary that may follow a
// matched id prefix.
// NOTE(review): the removed literal shows a pound sign (U+00A3), but 0x00A4
// is the generic currency sign (U+00A4) - confirm which character is meant.
- private String WORD_SEP = "~. |#\\/<>!\"£$%^*)}[@',?_";
+ private String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
+ + "$%^*)}[@',?_";
/**
* matches if the two ids are identical, or if the shorter one is a prefix of
* the longer one and the character that follows the prefix in the longer id
* is one of the WORD_SEP characters.
* todo: (JBPNote) Set separator characters appropriately
*
* @param s
- * SeqIdName
* id string to compare against this id; equals(Object) lower-cases String
* arguments before passing them here
* @return boolean true on a match as described above, false otherwise
*/
- public boolean equals(SeqIdName s)
+ private boolean stringequals(String s)
{
- if (id.length() > s.id.length())
+ if (id.length() > s.length())
{
- return id.startsWith(s.id) ? (WORD_SEP.indexOf(id.charAt(s.id
- .length())) > -1) : false;
+ return id.startsWith(s)
+ ? (WORD_SEP.indexOf(id.charAt(s.length())) > -1)
+ : false;
}
else
{
- return s.id.startsWith(id) ? (s.id.equals(id) ? true : (WORD_SEP
- .indexOf(s.id.charAt(id.length())) > -1)) : false;
+ return s.startsWith(id)
+ ? (s.equals(id) ? true
+ : (WORD_SEP.indexOf(s.charAt(id.length())) > -1))
+ : false;
}
}
- public boolean equals(String s)
+ /**
+ * Returns the wrapped sequence id. Intended for debugging purposes only;
+ * this behaviour is not guaranteed and may change.
+ *
+ * @return the id held by this SeqIdName
+ */
+ @Override
+ public String toString()
{
- if (id.length() > s.length())
- {
- return id.startsWith(s) ? (WORD_SEP.indexOf(id.charAt(s.length())) > -1)
- : false;
- }
- else
- {
- return s.startsWith(id) ? (s.equals(id) ? true : (WORD_SEP
- .indexOf(s.charAt(id.length())) > -1)) : false;
- }
+ return id;
}
}
}