* <p>Title: </p>\r
* SequenceIdMatcher\r
* <p>Description: </p>\r
- * Routine which does approximate Sequence Id resolution by name using string containment rather than equivalence\r
+ * Routine which does approximate Sequence Id resolution by name using\r
+ * string containment (on word boundaries) rather than equivalence\r
* <p>Copyright: Copyright (c) 2004</p>\r
*\r
* <p>Company: Dundee University</p>\r
public SequenceIdMatcher(SequenceI[] seqs)\r
{\r
names = new Hashtable();\r
-\r
for (int i = 0; i < seqs.length; i++)\r
{\r
names.put(new SeqIdName(seqs[i].getName()), seqs[i]);\r
*/\r
SequenceI[] findIdMatch(SequenceI[] seqs)\r
{\r
- SequenceI[] namedseqs = new SequenceI[seqs.length];\r
-\r
+ SequenceI[] namedseqs = null;\r
int i = 0;\r
SeqIdName nam;\r
\r
if (seqs.length > 0)\r
{\r
+ namedseqs = new SequenceI[seqs.length];\r
do\r
{\r
nam = new SeqIdName(seqs[i].getName());\r
namedseqs[i] = null;\r
}\r
}\r
- while (i++ < seqs.length);\r
+ while (++i < seqs.length);\r
}\r
\r
return namedseqs;\r
\r
SeqIdName(String s)\r
{\r
- id = new String(s);\r
+ if (s!=null)\r
+ id = new String(s);\r
+ else\r
+ id = "";\r
}\r
\r
public int hashCode()\r
{\r
- return (id.substring(0, 4).hashCode());\r
+ return ((id.length()>=4) ? id.substring(0, 4).hashCode() : id.hashCode());\r
}\r
\r
public boolean equals(Object s)\r
return false;\r
}\r
\r
+ /**\r
+ * Characters that define the end of a unique sequence ID at\r
+ * the beginning of an arbitrary ID string\r
+ * JBPNote: This is a heuristic that will fail for arbitrarily extended sequence IDs\r
+ * (like portions of an aligned set of repeats from one sequence)\r
+ */\r
+ private String WORD_SEP="~. |#\\/<>!\"£$%^*)}[@',?";\r
+\r
+ /**\r
+ * Matches if the two IDs are identical, or if one ID is a prefix of the other\r
+ * and the character following that prefix is one of the WORD_SEP separators.\r
+ * TODO: (JBPNote) These are not efficient. Should use char[] for speed.\r
+ * TODO: (JBPNote) Set separator characters appropriately.\r
+ * @param s SeqIdName\r
+ * @return boolean\r
+ */\r
public boolean equals(SeqIdName s)\r
{\r
- if (id.startsWith(s.id) || s.id.startsWith(id))\r
- {\r
- return true;\r
- }\r
-\r
- return false;\r
+ if (id.length()>s.id.length()) {\r
+ return id.startsWith(s.id) ?\r
+ (WORD_SEP.indexOf(id.charAt(s.id.length()))>-1)\r
+ : false;\r
+ } else\r
+ return s.id.startsWith(id) ?\r
+ (s.id.equals(id) ? true :\r
+ (WORD_SEP.indexOf(s.id.charAt(id.length()))>-1))\r
+ : false;\r
}\r
\r
public boolean equals(String s)\r
{\r
- if (id.startsWith(s) || s.startsWith(id))\r
- {\r
- return true;\r
- }\r
-\r
- return false;\r
+ if (id.length()>s.length()) {\r
+ return id.startsWith(s) ?\r
+ (WORD_SEP.indexOf(id.charAt(s.length()))>-1)\r
+ : false;\r
+ } else\r
+ return s.startsWith(id) ?\r
+ (s.equals(id) ? true :\r
+ (WORD_SEP.indexOf(s.charAt(id.length()))>-1))\r
+ : false;\r
}\r
}\r
}\r