import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
-import java.util.Vector;
+import java.util.Map;
/**
* Routines for approximate Sequence Id resolution by name using string
*/
public class SequenceIdMatcher
{
- private HashMap<SeqIdName, SequenceI> names;
+ private HashMap<SeqIdName, List<SequenceI>> names;
+
+ private Map<SeqIdName, List<SequenceI>> excludes;
public SequenceIdMatcher(List<SequenceI> seqs)
{
- names = new HashMap<SeqIdName, SequenceI>();
+ names = new HashMap<>();
+ excludes = new HashMap<>();
addAll(seqs);
}
*/
public void add(SequenceI seq)
{
- // TODO: deal with ID collisions - SequenceI should be appended to list
- // associated with this key.
- names.put(new SeqIdName(seq.getDisplayId(true)), seq);
+ SeqIdName key = new SeqIdName(seq.getDisplayId(true));
+ addMatchCandidate(key, seq);
SequenceI dbseq = seq;
while (dbseq.getDatasetSequence() != null)
{
if (dbseq.getDBRefs() != null)
{
DBRefEntry dbr[] = dbseq.getDBRefs();
- SeqIdName sid = null;
for (int r = 0; r < dbr.length; r++)
{
- sid = new SeqIdName(dbr[r].getAccessionId());
- if (!names.containsKey(sid))
+ DBRefEntry dbref = dbr[r];
+ SeqIdName sid = new SeqIdName(dbref.getAccessionId());
+ if (dbref.getMap() != null
+ && dbref.getMap().getMap().isTripletMap())
{
- names.put(sid, seq);
+ /*
+ * dbref with 3:1 or 1:3 mapping (e.g. CDS/protein);
+ * mark as not a valid match for this id
+ */
+ List<SequenceI> excluded = excludes.get(sid);
+ if (excluded == null)
+ {
+ excludes.put(sid, excluded = new ArrayList<>());
+ }
+ excluded.add(seq);
+ System.out.println("Excluding " + sid + "->" + seq);
+ continue;
}
+ addMatchCandidate(sid, seq);
}
}
}
+ /**
+ * Records seq as a possible match for the given key. A list is created and
+ * stored in names for the key if none is present yet; a sequence that is
+ * already in the key's list is not added a second time.
+ *
+ * @param key
+ *          the id to register the sequence under
+ * @param seq
+ *          the sequence to register
+ */
+ void addMatchCandidate(SeqIdName key, SequenceI seq)
+ {
+ List<SequenceI> candidates = names.get(key);
+ if (candidates == null)
+ {
+ candidates = new ArrayList<>();
+ names.put(key, candidates);
+ }
+ if (candidates.contains(seq))
+ {
+ // already registered for this key - nothing to do
+ return;
+ }
+ candidates.add(seq);
+ System.out.println("Adding " + key + "->" + seq);
+ }
+
/**
* convenience method to make a matcher from concrete array
*
private List<SequenceI> pickbestMatches(SeqIdName candName,
List<SequenceI> matches)
{
- ArrayList<SequenceI> best = new ArrayList<SequenceI>();
+ List<SequenceI> best = new ArrayList<>();
if (candName == null || matches == null || matches.size() == 0)
{
return null;
}
SequenceI match = matches.remove(0);
best.add(match);
- names.put(new SeqIdName(match.getName()), match);
+ addMatchCandidate(new SeqIdName(match.getName()), match);
int matchlen = match.getName().length();
int namlen = candName.id.length();
while (matches.size() > 0)
{
// look through for a better one.
SequenceI cand = matches.remove(0);
- names.put(new SeqIdName(cand.getName()), cand);
+ addMatchCandidate(new SeqIdName(cand.getName()), cand);
int q, w, candlen = cand.getName().length();
// keep the one with an id 'closer' to the given seqnam string
if ((q = Math.abs(matchlen - namlen)) > (w = Math
* SeqIdName
* @return SequenceI
*/
- private SequenceI findIdMatch(
- jalview.analysis.SequenceIdMatcher.SeqIdName nam)
+ private SequenceI findIdMatch(SeqIdName nam)
{
- Vector matches = new Vector();
+ // collects every candidate registered for nam that is not excluded for it
+ List<SequenceI> matches = new ArrayList<>();
while (names.containsKey(nam))
{
- matches.addElement(names.remove(nam));
+ // remove the entry so the enclosing loop terminates once no key matches
+ List<SequenceI> candidates = names.remove(nam);
+ // null when no exclusion was ever recorded for this id
+ List<SequenceI> except = excludes.get(nam);
+ for (SequenceI candidate : candidates)
+ {
+ // guard against a missing exclusion list: every candidate is then valid
+ if (except == null || !except.contains(candidate))
+ {
+ matches.add(candidate);
+ }
+ }
}
return pickbestMatch(nam, matches);
}
private List<SequenceI> findAllIdMatches(
jalview.analysis.SequenceIdMatcher.SeqIdName nam)
{
- ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
+ List<SequenceI> matches = new ArrayList<>();
while (names.containsKey(nam))
{
- matches.add(names.remove(nam));
+ matches.addAll(names.remove(nam));
}
List<SequenceI> r = pickbestMatches(nam, matches);
return r;