/*\r
* Jalview - A Sequence Alignment Editor and Viewer\r
- * Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
+ * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
*\r
* This program is free software; you can redistribute it and/or\r
* modify it under the terms of the GNU General Public License\r
* SequenceIdMatcher\r
* <p>Description: </p>\r
* Routine which does approximate Sequence Id resolution by name using\r
- * string containment (on word boundaries) rather than equivalence\r
+ * string containment (on word boundaries) rather than equivalence. It also\r
+ * attempts to resolve ties where no exact match is available by picking the\r
+ * the id closest to the query.\r
* <p>Copyright: Copyright (c) 2004</p>\r
*\r
* <p>Company: Dundee University</p>\r
}\r
}\r
\r
- SequenceI findIdMatch(SequenceI seq)\r
+ /**\r
+ * returns the closest SequenceI in matches to SeqIdName and returns all the matches\r
+ * to the names hash.\r
+ * @param candName SeqIdName\r
+ * @param matches Vector of SequenceI objects\r
+ * @return SequenceI closest SequenceI to SeqIdName\r
+ */\r
+ private SequenceI pickbestMatch(SeqIdName candName, Vector matches)\r
{\r
- SeqIdName nam = new SeqIdName(seq.getName());\r
-\r
- if (names.containsKey(nam))\r
+ SequenceI match = null;\r
+ if (candName == null || matches == null || matches.size() == 0)\r
{\r
- return (SequenceI) names.get(nam);\r
+ return null;\r
}\r
+ match = (SequenceI) matches.elementAt(0);\r
+ matches.removeElementAt(0);\r
+ names.put(new SeqIdName(match.getName()), match);\r
+ int matchlen = match.getName().length();\r
+ int namlen = candName.id.length();\r
+ while (matches.size() > 0)\r
+ {\r
+ // look through for a better one.\r
+ SequenceI cand = (SequenceI) matches.elementAt(0);\r
+ names.put(new SeqIdName(cand.getName()), cand);\r
+ int candlen = cand.getName().length();\r
+ // keep the one with an id 'closer' to the given seqnam string\r
+ if (Math.abs(matchlen - namlen) > Math.abs(candlen - namlen) &&\r
+ candlen > matchlen)\r
+ {\r
+ match = cand;\r
+ matchlen = candlen;\r
+ }\r
+ }\r
+ return match;\r
+ }\r
\r
- return null;\r
+ /**\r
+ * get SequenceI with closest SequenceI.getName() to seq.getName()\r
+ * @param seq SequenceI\r
+ * @return SequenceI\r
+ */\r
+ SequenceI findIdMatch(SequenceI seq)\r
+ {\r
+ SeqIdName nam = new SeqIdName(seq.getName());\r
+ return findIdMatch(nam);\r
}\r
\r
SequenceI findIdMatch(String seqnam)\r
{\r
SeqIdName nam = new SeqIdName(seqnam);\r
-\r
- if (names.containsKey(nam))\r
- {\r
- return (SequenceI) names.get(nam);\r
- }\r
-\r
- return null;\r
+ return findIdMatch(nam);\r
}\r
\r
/**\r
\r
if (names.containsKey(nam))\r
{\r
- namedseqs[i] = (SequenceI) names.get(nam);\r
+ namedseqs[i] = findIdMatch(nam);\r
}\r
else\r
{\r
return namedseqs;\r
}\r
\r
+ /**\r
+ * core findIdMatch search method\r
+ * @param nam SeqIdName\r
+ * @return SequenceI\r
+ */\r
+ private SequenceI findIdMatch(jalview.analysis.SequenceIdMatcher.SeqIdName\r
+ nam)\r
+ {\r
+ Vector matches = new Vector();\r
+ while (names.containsKey(nam))\r
+ {\r
+ matches.addElement(names.remove(nam));\r
+ }\r
+ return pickbestMatch(nam, matches);\r
+ }\r
+\r
private class SeqIdName\r
{\r
String id;\r
\r
SeqIdName(String s)\r
{\r
- id = new String(s);\r
+ if (s != null)\r
+ {\r
+ id = new String(s);\r
+ }\r
+ else\r
+ {\r
+ id = "";\r
+ }\r
}\r
\r
public int hashCode()\r
{\r
- return (id.substring(0, 4).hashCode());\r
+ return ( (id.length() >= 4) ? id.substring(0, 4).hashCode() : id.hashCode());\r
}\r
\r
public boolean equals(Object s)\r
}\r
\r
/**\r
+ * Characters that define the end of a unique sequence ID at\r
+ * the beginning of an arbitrary ID string\r
+ * JBPNote: This is a heuristic that will fail for arbritrarily extended sequence id's\r
+ * (like portions of an aligned set of repeats from one sequence)\r
+ */\r
+ private String WORD_SEP = "~. |#\\/<>!\"£$%^*)}[@',?_";\r
+\r
+ /**\r
* matches if one ID properly contains another at a whitespace boundary.\r
* TODO: (JBPNote) These are not efficient. should use char[] for speed\r
+ * todo: (JBPNote) Set separator characters appropriately\r
* @param s SeqIdName\r
* @return boolean\r
*/\r
public boolean equals(SeqIdName s)\r
{\r
- if (id.length()>s.id.length()) {\r
+ if (id.length() > s.id.length())\r
+ {\r
return id.startsWith(s.id) ?\r
- (id.equals(s.id) ? true : id.startsWith(s.id+" "))\r
+ (WORD_SEP.indexOf(id.charAt(s.id.length())) > -1)\r
: false;\r
- } else\r
+ }\r
+ else\r
+ {\r
return s.id.startsWith(id) ?\r
- (s.id.equals(id) ? true : s.id.startsWith(id+" "))\r
+ (s.id.equals(id) ? true :\r
+ (WORD_SEP.indexOf(s.id.charAt(id.length())) > -1))\r
: false;\r
+ }\r
}\r
\r
public boolean equals(String s)\r
{\r
- if (id.length()>s.length()) {\r
+ if (id.length() > s.length())\r
+ {\r
return id.startsWith(s) ?\r
- (id.equals(s) ? true : id.startsWith(s+" "))\r
+ (WORD_SEP.indexOf(id.charAt(s.length())) > -1)\r
: false;\r
- } else\r
+ }\r
+ else\r
+ {\r
return s.startsWith(id) ?\r
- (s.equals(id) ? true : s.startsWith(id+" "))\r
+ (s.equals(id) ? true :\r
+ (WORD_SEP.indexOf(s.charAt(id.length())) > -1))\r
: false;\r
+ }\r
}\r
}\r
}\r