/*
* Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
* Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jalview. If not, see .
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.analysis;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.SequenceI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* Routines for approximate Sequence Id resolution by name using string
* containment (on word boundaries) rather than equivalence. It also attempts to
* resolve ties where no exact match is available by picking the the id closest
* to the query.
*/
public class SequenceIdMatcher
{
/**
* weak hash for each sequence
*/
private HashMap> names;
// /**
// * cache of values removed for each query string.
// */
// private HashMap> resolved;
/**
* do we index sequences on all 'words' in ID string ?
*/
private boolean wordBased = false;
/**
* Characters that define the end of a unique sequence ID at the beginning of
* an arbitrary ID string JBPNote: This is a heuristic that will fail for
* arbritrarily extended sequence id's (like portions of an aligned set of
* repeats from one sequence)
*/
static String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
+ "$%^*)}[@',?_";
/**
* @return true if matcher is word-based (ie string key matches one of the
* words within the body of one or more sequence IDs)
*/
public boolean isWordBased()
{
return wordBased;
}
/**
* Construct a standard (non-word based) matcher. To configure word based
* matching, use the fully qualified constructor
*
* @param seqs
*/
public SequenceIdMatcher(List seqs)
{
this(false, seqs);
}
/**
* construct a new matcher for a set of sequences, configured as required.
* Note: enabling word based matching
*
* @param wordBasedMatch
* - when true, "myseq" matches "X|myseq" and "myseq"
* @param seqs
*/
public SequenceIdMatcher(boolean wordBasedMatch, List seqs)
{
wordBased = wordBasedMatch;
names = new HashMap>();
addAll(seqs);
}
/**
* add more sequences to this matcher - also used by the constructor
*
* @param seqs
*/
public void addAll(List seqs)
{
for (SequenceI seq : seqs)
{
addSeq(seq);
}
}
private void addSeqIdName(SeqIdName idname, SequenceI seq)
{
Set seqset = names.get(idname);
if (seqset == null)
{
seqset = new HashSet();
names.put(idname, seqset);
}
seqset.add(seq);
}
public void addSeq(SequenceI seq)
{
// TODO: deal with ID collisions - SequenceI should be appended to list
// associated with this key.
addSeqIdName(new SeqIdName(seq.getDisplayId(true)), seq);
if (wordBased)
{
for (SeqIdName key : getWordsFor(seq))
{
addSeqIdName(key, seq);
}
}
SequenceI dbseq = seq;
// TODO add test for database xref resolution
while (dbseq.getDatasetSequence() != null)
{
dbseq = dbseq.getDatasetSequence();
}
// add in any interesting identifiers
if (dbseq.getDBRefs() != null)
{
DBRefEntry dbr[] = dbseq.getDBRefs();
SeqIdName sid = null;
for (int r = 0; r < dbr.length; r++)
{
sid = new SeqIdName(dbr[r].getAccessionId());
if (!names.containsKey(sid))
{
addSeqIdName(sid, seq);
}
}
}
}
/**
* generate word based keys for the given sequence
*
* @param seq
* @return list of split keys
*/
public static List getWordsFor(SequenceI seq)
{
ArrayList keys = new ArrayList();
String name = seq.getName(), limits = "/" + seq.getStart() + "-"
+ seq.getEnd();
int namel = name.length();
char[] sep = new char[WORD_SEP.length()];
// find only the separators present in the ID.
for (int i = 0; i < sep.length; i++)
{
sep[i] = WORD_SEP.charAt(i);
if (seq.getName().indexOf("" + sep[i]) == -1)
{
sep[i] = 0;
}
}
;
// make words
for (int i = 0; i < sep.length; i++)
{
if (sep[i] > 0)
{
int p = 0, m = -1;
while ((m = name.indexOf(sep[i], p)) > p)
{
if (m > 0 && m - p > 5)
{
// split to end of word m with this delimiter
keys.add(new SeqIdName(name.substring(p, m) + limits));
}
if (namel - m > 5)
{
// index word after this delimiter m
keys.add(new SeqIdName(name.substring(m + 1) + limits));
}
p = m + 1;
}
if (namel - p > 4)
{
// index word after this delimiter m
keys.add(new SeqIdName(name.substring(p) + limits));
}
}
}
return keys;
}
/**
* convenience method to make a matcher from concrete array Note: in order to
* support word based matching, use the fully qualified constructor
*
* @param sequences
*/
public SequenceIdMatcher(SequenceI[] sequences)
{
this(Arrays.asList(sequences));
}
/**
* returns the closest SequenceI in matches to SeqIdName and returns all the
* matches to the names hash.
*
* @param candName
* SeqIdName
* @param matches
* List of SequenceI objects
* @return SequenceI closest SequenceI to SeqIdName
*/
private SequenceI pickbestMatch(SeqIdName candName,
List matches)
{
List st = pickbestMatches(candName, matches);
return st == null || st.size() == 0 ? null : st.get(0);
}
/**
* returns the SequenceI's with exact word matches to candName
*
* @param candName
* SeqIdName
* @param matches
* List of SequenceI objects - some of which may be duplicates
* @return { word matches to candName }
*/
private List pickwordMatches(SeqIdName candName,
List matches)
{
List best = new ArrayList();
for (SequenceI match : matches)
{
if (!best.contains(match))
{
if (candName.equalsCase(match.getDisplayId(true)))
{
// put the exact match at the beginning
best.add(0, match);
}
else
{
best.add(match);
}
addSeq(match);
}
}
return best;
}
/**
* returns the closest SequenceI in matches to SeqIdName and returns all the
* matches to the names hash.
*
* @param candName
* SeqIdName
* @param matches
* Vector of SequenceI objects
* @return Object[] { SequenceI closest SequenceI to SeqIdName, SequenceI[]
* ties }
*/
private List pickbestMatches(SeqIdName candName,
List matches)
{
ArrayList best = new ArrayList();
if (candName == null || matches == null || matches.size() == 0)
{
return null;
}
SequenceI match = matches.remove(0);
best.add(match);
addSeq(match);
int matchlen = match.getName().length();
int namlen = candName.id.length();
while (matches.size() > 0)
{
// look through for a better one.
SequenceI cand = matches.remove(0);
addSeq(cand);
int q, w, candlen = cand.getName().length();
// keep the one with an id 'closer' to the given seqnam string
boolean is_closer = ((q = Math.abs(matchlen - namlen)) > (w = Math
.abs(candlen - namlen)) && candlen > matchlen);
// if not closer, then check if current best is actually identical in case
// as
// well
if (is_closer
|| (candName.equalsCase(cand.getName()) && !candName
.equalsCase(best.get(0).getName())))
{
best.clear();
match = cand;
matchlen = candlen;
best.add(match);
}
else
{
if (q == w && candlen == matchlen)
{
// equivalently good, and matches with case as well. so
// record any ties
best.add(cand);
}
}
}
if (best.size() == 0)
{
return null;
}
;
return best;
}
/**
* get SequenceI with closest SequenceI.getName() to seq.getName()
*
* @param seq
* SequenceI
* @return SequenceI
*/
public SequenceI findIdMatch(SequenceI seq)
{
SeqIdName nam = new SeqIdName(seq.getName());
return findIdMatch(nam);
}
public SequenceI findIdMatch(String seqnam)
{
SeqIdName nam = new SeqIdName(seqnam);
return findIdMatch(nam);
}
/**
* Find all matches for a given sequence name.
*
* @param seqnam
* string to query Matcher with.
* @return a new array or null of no match exists
*/
public SequenceI[] findAllIdMatches(String seqnam)
{
SeqIdName nam = new SeqIdName(seqnam);
List m = findAllIdMatches(nam);
if (m != null && m.size() > 0)
{
return m.toArray(new SequenceI[m.size()]);
}
return null;
}
/**
* findIdMatch
*
* Return pointers to sequences (or sequence object containers) which have
* same Id as a given set of different sequence objects
*
* @param seqs
* SequenceI[]
* @return SequenceI[]
*/
public SequenceI[] findIdMatch(SequenceI[] seqs)
{
SequenceI[] namedseqs = null;
int i = 0;
SeqIdName nam;
if (seqs.length > 0)
{
namedseqs = new SequenceI[seqs.length];
do
{
nam = new SeqIdName(seqs[i].getName());
if (names.containsKey(nam))
{
namedseqs[i] = findIdMatch(nam);
}
else
{
namedseqs[i] = null;
}
} while (++i < seqs.length);
}
return namedseqs;
}
/**
* core findIdMatch search method
*
* @param nam
* SeqIdName
* @return SequenceI
*/
private SequenceI findIdMatch(
jalview.analysis.SeqIdName nam)
{
ArrayList matches = new ArrayList();
while (names.containsKey(nam))
{
matches.addAll(names.remove(nam));
}
return pickbestMatch(nam, matches);
}
/**
* core findIdMatch search method for finding all equivalent matches
*
* @param nam
* SeqIdName
* @return SequenceI[]
*/
private List findAllIdMatches(
jalview.analysis.SeqIdName nam)
{
ArrayList matches = new ArrayList();
while (names.containsKey(nam))
{
matches.addAll(names.remove(nam));
}
List r = (wordBased) ? pickwordMatches(nam, matches)
: pickbestMatches(nam, matches);
return r;
}
}