From 215801b5d918191341ee80e71022b3e67f2dfeb4 Mon Sep 17 00:00:00 2001 From: Jim Procter Date: Fri, 13 Nov 2015 14:48:10 +0000 Subject: [PATCH] JAL-1965 refactored SeqIdName to public class, and more tests for word generation --- src/jalview/analysis/SeqIdName.java | 151 ++++++++++++++++++++ src/jalview/analysis/SequenceIdMatcher.java | 166 ++-------------------- test/jalview/analysis/SequenceIdMatcherTest.java | 45 ++++++ 3 files changed, 210 insertions(+), 152 deletions(-) create mode 100644 src/jalview/analysis/SeqIdName.java diff --git a/src/jalview/analysis/SeqIdName.java b/src/jalview/analysis/SeqIdName.java new file mode 100644 index 0000000..ddf63ba --- /dev/null +++ b/src/jalview/analysis/SeqIdName.java @@ -0,0 +1,151 @@ +package jalview.analysis; + +/** + * implements a weak hash map based on the first four characters of the given + * string, normalised to lower case + * + * @author jprocter + * + */ +class SeqIdName +{ + String id, origid; + + SeqIdName(String s) + { + if (s != null) + { + id = new String(s).toLowerCase(); + origid = new String(s); + } + else + { + id = ""; + } + } + + @Override + public int hashCode() + { + return ((id.length() >= 4) ? id.substring(0, 4).hashCode() : id + .hashCode()); + } + + @Override + public boolean equals(Object s) + { + if (s == null) + { + return false; + } + if (s instanceof SeqIdName) + { + return this.equals((SeqIdName) s); + } + else + { + if (s instanceof String) + { + return this.equals((String) s); + } + } + + return false; + } + + /** + * matches if one ID properly contains another at a whitespace boundary. + * TODO: (JBPNote) These are not efficient. should use char[] for speed + * todo: (JBPNote) Set separator characters appropriately + * + * @param s + * SeqIdName + * @return boolean + */ + public boolean equals(SeqIdName s) + { + // TODO: JAL-732 patch for cases when name includes a list of IDs, and the + // match contains one ID flanked + if (id.length() > s.id.length()) + { + return check_wordmatch(id, s.id); + } + else + { + return check_wordmatch(s.id, id); + } + } + + private boolean check_wordmatch(String longer, String shorter) + { + boolean elen = longer.length() == shorter.length(); + int sp = longer.indexOf(shorter); + if (sp == -1) + { + return false; + } + + if (sp == 0) + { + // end of match is word boundary + return elen ? true : (SequenceIdMatcher.WORD_SEP.indexOf(longer.charAt(shorter + .length() + sp)) > -1); + } + if (SequenceIdMatcher.WORD_SEP.indexOf(longer.charAt(sp - 1)) > -1) + { + if (sp + shorter.length() == longer.length()) + { + return true; + } + else + { + // end of match is word boundary + return elen ? false + : sp + shorter.length() == longer.length() ? true + : (SequenceIdMatcher.WORD_SEP.indexOf(longer.charAt(shorter + .length() + sp)) > -1); + } + } + else + { + // prefix of match is not a word boundary + return false; + } + } + + public boolean equals(String s) + { + s = s.toLowerCase(); // TODO: employ faster to lower case operation + if (id.length() > s.length()) + { + return check_wordmatch(id, s); + } + else + { + return check_wordmatch(s, id); + } + } + + @Override + public String toString() + { + return id; + } + + public boolean equalsCase(String s) + { + if (origid.length() > s.length()) + { + return check_wordmatch(origid, s); + } + else + { + return check_wordmatch(s, origid); + } + } + + public boolean equalsCase(SeqIdName sid) + { + return equalsCase(sid.origid); + } +} \ No newline at end of file diff --git a/src/jalview/analysis/SequenceIdMatcher.java b/src/jalview/analysis/SequenceIdMatcher.java index 0fde224..c0981a1 100755 --- a/src/jalview/analysis/SequenceIdMatcher.java +++ b/src/jalview/analysis/SequenceIdMatcher.java @@ -59,7 +59,7 @@ public class SequenceIdMatcher * arbritrarily extended sequence id's (like portions of an aligned set of * repeats from one sequence) */ - private static String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4) + static String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4) + "$%^*)}[@',?_"; /** @@ -161,7 +161,7 @@ public class SequenceIdMatcher * @param seq * @return list of split keys */ - public List getWordsFor(SequenceI seq) + public static List getWordsFor(SequenceI seq) { ArrayList keys = new ArrayList(); String name = seq.getName(), limits = "/" + seq.getStart() + "-" @@ -273,18 +273,23 @@ public class SequenceIdMatcher // if not closer, then check if current best is actually identical in case // as // well - if (is_closer || (!candName.equalsCase(best.get(0).getName()))) + if (is_closer + || (candName.equalsCase(cand.getName()) && !candName + .equalsCase(best.get(0).getName()))) { best.clear(); match = cand; matchlen = candlen; best.add(match); } - if (q == w && candlen == matchlen) + else { - // equivalently good, and matches with case as well. so - // record any ties - best.add(cand); + if (q == w && candlen == matchlen) + { + // equivalently good, and matches with case as well. so + // record any ties + best.add(cand); + } } } if (best.size() == 0) @@ -378,7 +383,7 @@ public class SequenceIdMatcher * @return SequenceI */ private SequenceI findIdMatch( - jalview.analysis.SequenceIdMatcher.SeqIdName nam) + jalview.analysis.SeqIdName nam) { ArrayList matches = new ArrayList(); while (names.containsKey(nam)) @@ -396,7 +401,7 @@ public class SequenceIdMatcher * @return SequenceI[] */ private List findAllIdMatches( - jalview.analysis.SequenceIdMatcher.SeqIdName nam) + jalview.analysis.SeqIdName nam) { ArrayList matches = new ArrayList(); while (names.containsKey(nam)) @@ -406,147 +411,4 @@ public class SequenceIdMatcher List r = pickbestMatches(nam, matches); return r; } - - private class SeqIdName - { - String id, origid; - - SeqIdName(String s) - { - if (s != null) - { - id = new String(s).toLowerCase(); - origid = new String(s); - } - else - { - id = ""; - } - } - - @Override - public int hashCode() - { - return ((id.length() >= 4) ? id.substring(0, 4).hashCode() : id - .hashCode()); - } - - @Override - public boolean equals(Object s) - { - if (s == null) - { - return false; - } - if (s instanceof SeqIdName) - { - return this.equals((SeqIdName) s); - } - else - { - if (s instanceof String) - { - return this.equals((String) s); - } - } - - return false; - } - - /** - * matches if one ID properly contains another at a whitespace boundary. - * TODO: (JBPNote) These are not efficient. should use char[] for speed - * todo: (JBPNote) Set separator characters appropriately - * - * @param s - * SeqIdName - * @return boolean - */ - public boolean equals(SeqIdName s) - { - // TODO: JAL-732 patch for cases when name includes a list of IDs, and the - // match contains one ID flanked - if (id.length() > s.id.length()) - { - return check_wordmatch(id, s.id); - } - else - { - return check_wordmatch(s.id, id); - } - } - - private boolean check_wordmatch(String longer, String shorter) - { - boolean elen = longer.length() == shorter.length(); - int sp = longer.indexOf(shorter); - if (sp == -1) - { - return false; - } - - if (sp == 0) - { - // end of match is word boundary - return elen ? true : (WORD_SEP.indexOf(longer.charAt(shorter - .length() + sp)) > -1); - } - if (WORD_SEP.indexOf(longer.charAt(sp - 1)) > -1) - { - if (sp + shorter.length() == longer.length()) - { - return true; - } - else - { - // end of match is word boundary - return elen ? false - : sp + shorter.length() == longer.length() ? true - : (WORD_SEP.indexOf(longer.charAt(shorter - .length() + sp)) > -1); - } - } - else - { - // prefix of match is not a word boundary - return false; - } - } - - public boolean equals(String s) - { - s = s.toLowerCase(); // TODO: employ faster to lower case operation - if (id.length() > s.length()) - { - return check_wordmatch(id, s); - } - else - { - return check_wordmatch(s, id); - } - } - - @Override - public String toString() - { - return id; - } - - public boolean equalsCase(String s) - { - if (origid.length() > s.length()) - { - return check_wordmatch(origid, s); - } - else - { - return check_wordmatch(s, origid); - } - } - - public boolean equalsCase(SeqIdName sid) - { - return equalsCase(sid.origid); - } - } } diff --git a/test/jalview/analysis/SequenceIdMatcherTest.java b/test/jalview/analysis/SequenceIdMatcherTest.java index 2a07ad5..325a0c4 100644 --- a/test/jalview/analysis/SequenceIdMatcherTest.java +++ b/test/jalview/analysis/SequenceIdMatcherTest.java @@ -3,7 +3,9 @@ package jalview.analysis; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import org.testng.Assert; import org.testng.annotations.Test; @@ -34,11 +36,18 @@ public class SequenceIdMatcherTest { for (SequenceI sq : SequenceIdMatcherTest.someseqs) { + System.out.println("Searching with '" + sq.getName() + "'"); SequenceI[] idmatches = getMatcher().findAllIdMatches(sq.getName()); Assert.assertTrue( idmatches.length >= 1, "Couldn't recover at least one sequence for string '" + sq.getName() + "'"); + for (SequenceI f : idmatches) + { + System.out.println("For '" + sq.getName() + "' found '" + + f.getName() + "'"); + } + SequenceI[] seqmatches = getMatcher().findIdMatch( new SequenceI[] { sq }); Assert.assertEquals(1, seqmatches.length, @@ -96,6 +105,37 @@ public class SequenceIdMatcherTest } @Test(groups = { "Functional" }) + public void testWordSplit() + { + String[] words = new String[] { "several", "words", "separated", + "fully" }; + String full = ""; + for (String word : words) + { + if (full.length() > 0) + { + full += "|"; + } + full += word; + } + List bits = SequenceIdMatcher.getWordsFor(new Sequence(full, + "dummy")); + for (String word : words) + { + List equals = new ArrayList(); + for (SeqIdName bit : bits) + { + if (bit.equals(word)) + { + equals.add(bit); + } + } + Assert.assertTrue(equals.size() > 0, + "Word generation has broken. Expected at least one match for '" + + word + "'"); + } + } + @Test(groups = { "Functional" }) public void testFlankingMatch() { SequenceI[] match = getMatcher().findAllIdMatches("complexId"); @@ -107,6 +147,11 @@ public class SequenceIdMatcherTest .findAllIdMatches("complexId"); // should find 6 distinct sequences Assert.assertNotNull(fmatch, "Flanking matches not found."); + for (SequenceI f:fmatch) + { + System.out.println("Flanking 'complexId' match: '" + f.getName() + + "'"); + } Assert.assertEquals(fmatch.length, 6, "Couldn't find all entries with IDs containing 'complexId' word match"); -- 1.7.10.2