--- /dev/null
+package jalview.analysis;
+
+/**
+ * Key object wrapping a sequence ID for hash-based lookup. The hash code is
+ * computed from (at most) the first four characters of the lower-cased ID, and
+ * equality is a word-boundary containment match rather than strict string
+ * equality.
+ *
+ * NOTE(review): because equality is a containment match, two names that match
+ * may still have different hash codes (only the leading characters are
+ * hashed), and equals(Object) also accepts plain Strings, so it cannot be
+ * symmetric. Confirm that callers performing hashed lookups expect this.
+ *
+ * @author jprocter
+ *
+ */
+class SeqIdName
+{
+  /** ID normalised to lower case; used for hashing and case-blind matching. */
+  String id;
+
+  /** ID exactly as supplied; used by the case-sensitive equalsCase methods. */
+  String origid;
+
+  /**
+   * @param s
+   *          the sequence ID; null is treated as the empty string
+   */
+  SeqIdName(String s)
+  {
+    if (s != null)
+    {
+      // Locale.ROOT keeps the normalisation independent of the JVM's
+      // default locale (e.g. Turkish dotless-i mapping)
+      id = s.toLowerCase(java.util.Locale.ROOT);
+      origid = s;
+    }
+    else
+    {
+      id = "";
+      // keep origid non-null so that equalsCase cannot throw
+      origid = "";
+    }
+  }
+
+  /**
+   * @return hash of the first four characters of the lower-cased ID, or of
+   *         the whole ID when it is shorter than four characters
+   */
+  @Override
+  public int hashCode()
+  {
+    return ((id.length() >= 4) ? id.substring(0, 4).hashCode() : id
+            .hashCode());
+  }
+
+  /**
+   * Dispatches to the SeqIdName or String comparison according to the runtime
+   * type of the argument.
+   *
+   * @param s
+   *          object to compare with
+   * @return false for null or for any other type, otherwise the result of the
+   *         word match
+   */
+  @Override
+  public boolean equals(Object s)
+  {
+    // instanceof is false for null, so no separate null test is needed
+    if (s instanceof SeqIdName)
+    {
+      return this.equals((SeqIdName) s);
+    }
+    if (s instanceof String)
+    {
+      return this.equals((String) s);
+    }
+    return false;
+  }
+
+  /**
+   * matches if one ID properly contains another at a word separator boundary
+   * (case-insensitive).
+   * TODO: (JBPNote) These are not efficient. should use char[] for speed
+   * todo: (JBPNote) Set separator characters appropriately
+   *
+   * @param s
+   *          SeqIdName
+   * @return boolean - false if s is null
+   */
+  public boolean equals(SeqIdName s)
+  {
+    if (s == null)
+    {
+      return false;
+    }
+    // TODO: JAL-732 patch for cases when name includes a list of IDs, and the
+    // match contains one ID flanked
+    if (id.length() > s.id.length())
+    {
+      return check_wordmatch(id, s.id);
+    }
+    else
+    {
+      return check_wordmatch(s.id, id);
+    }
+  }
+
+  /**
+   * Tests whether the first occurrence of shorter within longer is delimited
+   * on both sides by either the end of the string or a character from
+   * SequenceIdMatcher.WORD_SEP.
+   *
+   * NOTE(review): only the first occurrence found by indexOf is examined; a
+   * later occurrence that does sit on word boundaries is not considered.
+   *
+   * @param longer
+   *          string that is at least as long as shorter
+   * @param shorter
+   *          candidate word
+   * @return true if the first occurrence is a whole-word match
+   */
+  private boolean check_wordmatch(String longer, String shorter)
+  {
+    int sp = longer.indexOf(shorter);
+    if (sp == -1)
+    {
+      return false;
+    }
+
+    if (sp > 0
+            && SequenceIdMatcher.WORD_SEP.indexOf(longer.charAt(sp - 1)) == -1)
+    {
+      // prefix of match is not a word boundary
+      return false;
+    }
+
+    // start of match is a word boundary: the end of the match must also be
+    // the end of the string or be followed by a separator
+    int end = sp + shorter.length();
+    return end == longer.length()
+            || SequenceIdMatcher.WORD_SEP.indexOf(longer.charAt(end)) > -1;
+  }
+
+  /**
+   * Case-insensitive word match against a plain string.
+   *
+   * @param s
+   *          string to compare with
+   * @return true if either string contains the other as a whole word; false
+   *         if s is null
+   */
+  public boolean equals(String s)
+  {
+    if (s == null)
+    {
+      return false;
+    }
+    // TODO: employ faster to lower case operation
+    s = s.toLowerCase(java.util.Locale.ROOT);
+    if (id.length() > s.length())
+    {
+      return check_wordmatch(id, s);
+    }
+    else
+    {
+      return check_wordmatch(s, id);
+    }
+  }
+
+  /**
+   * @return the lower-cased ID
+   */
+  @Override
+  public String toString()
+  {
+    return id;
+  }
+
+  /**
+   * Case-sensitive word match of the original ID against a plain string.
+   *
+   * @param s
+   *          string to compare with
+   * @return true if either string contains the other as a whole word; false
+   *         if s is null
+   */
+  public boolean equalsCase(String s)
+  {
+    if (s == null)
+    {
+      return false;
+    }
+    if (origid.length() > s.length())
+    {
+      return check_wordmatch(origid, s);
+    }
+    else
+    {
+      return check_wordmatch(s, origid);
+    }
+  }
+
+  /**
+   * Case-sensitive word match of the original IDs.
+   *
+   * @param sid
+   *          name to compare with
+   * @return true if either original ID contains the other as a whole word;
+   *         false if sid is null
+   */
+  public boolean equalsCase(SeqIdName sid)
+  {
+    return sid != null && equalsCase(sid.origid);
+  }
+}
\ No newline at end of file
* arbritrarily extended sequence id's (like portions of an aligned set of
* repeats from one sequence)
*/
- private static String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
+ static String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
+ "$%^*)}[@',?_";
/**
* @param seq
* @return list of split keys
*/
- public List<SeqIdName> getWordsFor(SequenceI seq)
+ public static List<SeqIdName> getWordsFor(SequenceI seq)
{
ArrayList<SeqIdName> keys = new ArrayList<SeqIdName>();
String name = seq.getName(), limits = "/" + seq.getStart() + "-"
// if not closer, then check if current best is actually identical in case
// as
// well
- if (is_closer || (!candName.equalsCase(best.get(0).getName())))
+ if (is_closer
+ || (candName.equalsCase(cand.getName()) && !candName
+ .equalsCase(best.get(0).getName())))
{
best.clear();
match = cand;
matchlen = candlen;
best.add(match);
}
- if (q == w && candlen == matchlen)
+ else
{
- // equivalently good, and matches with case as well. so
- // record any ties
- best.add(cand);
+ if (q == w && candlen == matchlen)
+ {
+ // equivalently good, and matches with case as well. so
+ // record any ties
+ best.add(cand);
+ }
}
}
if (best.size() == 0)
* @return SequenceI
*/
private SequenceI findIdMatch(
- jalview.analysis.SequenceIdMatcher.SeqIdName nam)
+ jalview.analysis.SeqIdName nam)
{
ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
while (names.containsKey(nam))
* @return SequenceI[]
*/
private List<SequenceI> findAllIdMatches(
- jalview.analysis.SequenceIdMatcher.SeqIdName nam)
+ jalview.analysis.SeqIdName nam)
{
ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
while (names.containsKey(nam))
List<SequenceI> r = pickbestMatches(nam, matches);
return r;
}
-
- private class SeqIdName
- {
- String id, origid;
-
- SeqIdName(String s)
- {
- if (s != null)
- {
- id = new String(s).toLowerCase();
- origid = new String(s);
- }
- else
- {
- id = "";
- }
- }
-
- @Override
- public int hashCode()
- {
- return ((id.length() >= 4) ? id.substring(0, 4).hashCode() : id
- .hashCode());
- }
-
- @Override
- public boolean equals(Object s)
- {
- if (s == null)
- {
- return false;
- }
- if (s instanceof SeqIdName)
- {
- return this.equals((SeqIdName) s);
- }
- else
- {
- if (s instanceof String)
- {
- return this.equals((String) s);
- }
- }
-
- return false;
- }
-
- /**
- * matches if one ID properly contains another at a whitespace boundary.
- * TODO: (JBPNote) These are not efficient. should use char[] for speed
- * todo: (JBPNote) Set separator characters appropriately
- *
- * @param s
- * SeqIdName
- * @return boolean
- */
- public boolean equals(SeqIdName s)
- {
- // TODO: JAL-732 patch for cases when name includes a list of IDs, and the
- // match contains one ID flanked
- if (id.length() > s.id.length())
- {
- return check_wordmatch(id, s.id);
- }
- else
- {
- return check_wordmatch(s.id, id);
- }
- }
-
- private boolean check_wordmatch(String longer, String shorter)
- {
- boolean elen = longer.length() == shorter.length();
- int sp = longer.indexOf(shorter);
- if (sp == -1)
- {
- return false;
- }
-
- if (sp == 0)
- {
- // end of match is word boundary
- return elen ? true : (WORD_SEP.indexOf(longer.charAt(shorter
- .length() + sp)) > -1);
- }
- if (WORD_SEP.indexOf(longer.charAt(sp - 1)) > -1)
- {
- if (sp + shorter.length() == longer.length())
- {
- return true;
- }
- else
- {
- // end of match is word boundary
- return elen ? false
- : sp + shorter.length() == longer.length() ? true
- : (WORD_SEP.indexOf(longer.charAt(shorter
- .length() + sp)) > -1);
- }
- }
- else
- {
- // prefix of match is not a word boundary
- return false;
- }
- }
-
- public boolean equals(String s)
- {
- s = s.toLowerCase(); // TODO: employ faster to lower case operation
- if (id.length() > s.length())
- {
- return check_wordmatch(id, s);
- }
- else
- {
- return check_wordmatch(s, id);
- }
- }
-
- @Override
- public String toString()
- {
- return id;
- }
-
- public boolean equalsCase(String s)
- {
- if (origid.length() > s.length())
- {
- return check_wordmatch(origid, s);
- }
- else
- {
- return check_wordmatch(s, origid);
- }
- }
-
- public boolean equalsCase(SeqIdName sid)
- {
- return equalsCase(sid.origid);
- }
- }
}
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.List;
import org.testng.Assert;
import org.testng.annotations.Test;
{
for (SequenceI sq : SequenceIdMatcherTest.someseqs)
{
+ System.out.println("Searching with '" + sq.getName() + "'");
SequenceI[] idmatches = getMatcher().findAllIdMatches(sq.getName());
Assert.assertTrue(
idmatches.length >= 1,
"Couldn't recover at least one sequence for string '"
+ sq.getName() + "'");
+ for (SequenceI f : idmatches)
+ {
+ System.out.println("For '" + sq.getName() + "' found '"
+ + f.getName() + "'");
+ }
+
SequenceI[] seqmatches = getMatcher().findIdMatch(
new SequenceI[] { sq });
Assert.assertEquals(1, seqmatches.length,
}
@Test(groups = { "Functional" })
+ public void testWordSplit()
+ {
+   // build a single name consisting of the words joined by '|'
+   String[] words = new String[] { "several", "words", "separated",
+       "fully" };
+   String full = "";
+   for (int w = 0; w < words.length; w++)
+   {
+     full = (full.length() > 0) ? full + "|" + words[w] : full + words[w];
+   }
+
+   List<SeqIdName> bits = SequenceIdMatcher.getWordsFor(new Sequence(full,
+           "dummy"));
+
+   // every individual word must be matched by at least one generated key
+   for (int w = 0; w < words.length; w++)
+   {
+     String word = words[w];
+     List<SeqIdName> matched = new ArrayList<SeqIdName>();
+     for (SeqIdName key : bits)
+     {
+       if (!key.equals(word))
+       {
+         continue;
+       }
+       matched.add(key);
+     }
+     Assert.assertTrue(matched.size() > 0,
+             "Word generation has broken. Expected at least one match for '"
+                     + word + "'");
+   }
+ }
+ @Test(groups = { "Functional" })
public void testFlankingMatch()
{
SequenceI[] match = getMatcher().findAllIdMatches("complexId");
.findAllIdMatches("complexId");
// should find 6 distinct sequences
Assert.assertNotNull(fmatch, "Flanking matches not found.");
+ for (SequenceI f:fmatch)
+ {
+ System.out.println("Flanking 'complexId' match: '" + f.getName()
+ + "'");
+ }
Assert.assertEquals(fmatch.length, 6,
"Couldn't find all entries with IDs containing 'complexId' word match");