* arbritrarily extended sequence id's (like portions of an aligned set of
* repeats from one sequence)
*/
- private static String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
+ static final String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
+ "$%^*)}[@',?_";
/**
* - when true, "myseq" matches "X|myseq" and "myseq"
* @param seqs
*/
- public SequenceIdMatcher(boolean wordBasedMatch,
- List<SequenceI> seqs)
+ public SequenceIdMatcher(boolean wordBasedMatch, List<SequenceI> seqs)
{
wordBased = wordBasedMatch;
names = new HashMap<SeqIdName, Set<SequenceI>>();
public void addSeq(SequenceI seq)
{
- // TODO: deal with ID collisions - SequenceI should be appended to list
- // associated with this key.
+ // TODO: deal with ID collisions - SequenceI should be appended to list
+ // associated with this key.
addSeqIdName(new SeqIdName(seq.getDisplayId(true)), seq);
- if (wordBased)
+ if (wordBased)
+ {
+ for (SeqIdName key : getWordsFor(seq))
{
- for (SeqIdName key : getWordsFor(seq))
- {
addSeqIdName(key, seq);
- }
}
- SequenceI dbseq = seq;
+ }
+ SequenceI dbseq = seq;
// TODO add test for database xref resolution
- while (dbseq.getDatasetSequence() != null)
- {
- dbseq = dbseq.getDatasetSequence();
- }
- // add in any interesting identifiers
- if (dbseq.getDBRefs() != null)
+ while (dbseq.getDatasetSequence() != null)
+ {
+ dbseq = dbseq.getDatasetSequence();
+ }
+ // add in any interesting identifiers
+ if (dbseq.getDBRefs() != null)
+ {
+ DBRefEntry dbr[] = dbseq.getDBRefs();
+ SeqIdName sid = null;
+ for (int r = 0; r < dbr.length; r++)
{
- DBRefEntry dbr[] = dbseq.getDBRefs();
- SeqIdName sid = null;
- for (int r = 0; r < dbr.length; r++)
+ sid = new SeqIdName(dbr[r].getAccessionId());
+ if (!names.containsKey(sid))
{
- sid = new SeqIdName(dbr[r].getAccessionId());
- if (!names.containsKey(sid))
- {
addSeqIdName(sid, seq);
- }
}
+ }
}
}
-
/**
* generate word based keys for the given sequence
*
* @param seq
* @return list of split keys
*/
- public List<SeqIdName> getWordsFor(SequenceI seq)
+ public static List<SeqIdName> getWordsFor(SequenceI seq)
{
ArrayList<SeqIdName> keys = new ArrayList<SeqIdName>();
String name = seq.getName(), limits = "/" + seq.getStart() + "-"
// if not closer, then check if current best is actually identical in case
// as
// well
- if (is_closer || (!candName.equalsCase(best.get(0).getName())))
+ if (is_closer
+ || (candName.equalsCase(cand.getName()) && !candName
+ .equalsCase(best.get(0).getName())))
{
best.clear();
match = cand;
matchlen = candlen;
best.add(match);
}
- if (q == w && candlen == matchlen)
+ else
{
- // equivalently good, and matches with case as well. so
- // record any ties
- best.add(cand);
+ if (q == w && candlen == matchlen)
+ {
+ // equivalently good, and matches with case as well. so
+ // record any ties
+ best.add(cand);
+ }
}
}
if (best.size() == 0)
* @return SequenceI
*/
private SequenceI findIdMatch(
- jalview.analysis.SequenceIdMatcher.SeqIdName nam)
+ jalview.analysis.SeqIdName nam)
{
ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
while (names.containsKey(nam))
* @return SequenceI[]
*/
private List<SequenceI> findAllIdMatches(
- jalview.analysis.SequenceIdMatcher.SeqIdName nam)
+ jalview.analysis.SeqIdName nam)
{
ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
while (names.containsKey(nam))
List<SequenceI> r = pickbestMatches(nam, matches);
return r;
}
-
- private class SeqIdName
- {
- String id, origid;
-
- SeqIdName(String s)
- {
- if (s != null)
- {
- id = new String(s).toLowerCase();
- origid = new String(s);
- }
- else
- {
- id = "";
- }
- }
-
- @Override
- public int hashCode()
- {
- return ((id.length() >= 4) ? id.substring(0, 4).hashCode() : id
- .hashCode());
- }
-
- @Override
- public boolean equals(Object s)
- {
- if (s == null)
- {
- return false;
- }
- if (s instanceof SeqIdName)
- {
- return this.equals((SeqIdName) s);
- }
- else
- {
- if (s instanceof String)
- {
- return this.equals((String) s);
- }
- }
-
- return false;
- }
-
- /**
- * matches if one ID properly contains another at a whitespace boundary.
- * TODO: (JBPNote) These are not efficient. should use char[] for speed
- * todo: (JBPNote) Set separator characters appropriately
- *
- * @param s
- * SeqIdName
- * @return boolean
- */
- public boolean equals(SeqIdName s)
- {
- // TODO: JAL-732 patch for cases when name includes a list of IDs, and the
- // match contains one ID flanked
- if (id.length() > s.id.length())
- {
- return check_wordmatch(id, s.id);
- }
- else
- {
- return check_wordmatch(s.id, id);
- }
- }
-
- private boolean check_wordmatch(String longer, String shorter)
- {
- boolean elen = longer.length() == shorter.length();
- int sp = longer.indexOf(shorter);
- if (sp == -1)
- {
- return false;
- }
-
- if (sp == 0)
- {
- // end of match is word boundary
- return elen ? true : (WORD_SEP.indexOf(longer.charAt(shorter
- .length() + sp)) > -1);
- }
- if (WORD_SEP.indexOf(longer.charAt(sp - 1)) > -1)
- {
- if (sp + shorter.length() == longer.length())
- {
- return true;
- }
- else
- {
- // end of match is word boundary
- return elen ? false
- : sp + shorter.length() == longer.length() ? true
- : (WORD_SEP.indexOf(longer.charAt(shorter
- .length() + sp)) > -1);
- }
- }
- else
- {
- // prefix of match is not a word boundary
- return false;
- }
- }
-
- public boolean equals(String s)
- {
- s = s.toLowerCase(); // TODO: employ faster to lower case operation
- if (id.length() > s.length())
- {
- return check_wordmatch(id, s);
- }
- else
- {
- return check_wordmatch(s, id);
- }
- }
-
- @Override
- public String toString()
- {
- return id;
- }
-
- public boolean equalsCase(String s)
- {
- if (origid.length() > s.length())
- {
- return check_wordmatch(origid, s);
- }
- else
- {
- return check_wordmatch(s, origid);
- }
- }
-
- public boolean equalsCase(SeqIdName sid)
- {
- return equalsCase(sid.origid);
- }
- }
}