X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=inline;f=src%2Fjalview%2Fanalysis%2FSequenceIdMatcher.java;h=9d33996333997931356a5f3ca6b7530e7cb0ce77;hb=b242374a54c1288647de25dcf02163da168044a0;hp=1efe4982810ecb2a79b57a85834178374108926a;hpb=efc31b4a8d5cee63555586804a2b79c06bdb5a14;p=jalview.git diff --git a/src/jalview/analysis/SequenceIdMatcher.java b/src/jalview/analysis/SequenceIdMatcher.java index 1efe498..9d33996 100755 --- a/src/jalview/analysis/SequenceIdMatcher.java +++ b/src/jalview/analysis/SequenceIdMatcher.java @@ -26,7 +26,8 @@ import jalview.datamodel.*; *

Title:

* SequenceIdMatcher *

Description:

- * Routine which does approximate Sequence Id resolution by name using string containment rather than equivalence + * Routine which does approximate Sequence Id resolution by name using + * string containment (on word boundaries) rather than equivalence *

Copyright: Copyright (c) 2004

* *

Company: Dundee University

@@ -41,7 +42,6 @@ public class SequenceIdMatcher public SequenceIdMatcher(SequenceI[] seqs) { names = new Hashtable(); - for (int i = 0; i < seqs.length; i++) { names.put(new SeqIdName(seqs[i].getName()), seqs[i]); @@ -83,13 +83,13 @@ public class SequenceIdMatcher */ SequenceI[] findIdMatch(SequenceI[] seqs) { - SequenceI[] namedseqs = new SequenceI[seqs.length]; - + SequenceI[] namedseqs = null; int i = 0; SeqIdName nam; if (seqs.length > 0) { + namedseqs = new SequenceI[seqs.length]; do { nam = new SeqIdName(seqs[i].getName()); @@ -103,7 +103,7 @@ public class SequenceIdMatcher namedseqs[i] = null; } } - while (i++ < seqs.length); + while (++i < seqs.length); } return namedseqs; @@ -115,12 +115,15 @@ public class SequenceIdMatcher SeqIdName(String s) { - id = new String(s); + if (s!=null) + id = new String(s); + else + id = ""; } public int hashCode() { - return (id.substring(0, 4).hashCode()); + return ((id.length()>=4) ? id.substring(0, 4).hashCode() : id.hashCode()); } public boolean equals(Object s) @@ -140,24 +143,45 @@ public class SequenceIdMatcher return false; } + /** + * Characters that define the end of a unique sequence ID at + * the beginning of an arbitrary ID string + * JBPNote: This is a heuristic that will fail for arbitrarily extended sequence IDs + * (like portions of an aligned set of repeats from one sequence) + */ + private String WORD_SEP="~. |#\\/<>!\"£$%^*)}[@',?"; + + /** + * matches if one ID properly contains another at a whitespace boundary. + * TODO: (JBPNote) These are not efficient. should use char[] for speed + * TODO: (JBPNote) Set separator characters appropriately + * @param s SeqIdName + * @return boolean + */ public boolean equals(SeqIdName s) { - if (id.startsWith(s.id) || s.id.startsWith(id)) - { - return true; - } - - return false; + if (id.length()>s.id.length()) { + return id.startsWith(s.id) ? + (WORD_SEP.indexOf(id.charAt(s.id.length()))>-1) + : false; + } else + return s.id.startsWith(id) ? + (s.id.equals(id) ? 
true : + (WORD_SEP.indexOf(s.id.charAt(id.length()))>-1)) + : false; } public boolean equals(String s) { - if (id.startsWith(s) || s.startsWith(id)) - { - return true; - } - - return false; + if (id.length()>s.length()) { + return id.startsWith(s) ? + (WORD_SEP.indexOf(id.charAt(s.length()))>-1) + : false; + } else + return s.startsWith(id) ? + (s.equals(id) ? true : + (WORD_SEP.indexOf(s.charAt(id.length()))>-1)) + : false; } } }