/* * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.analysis; import jalview.datamodel.DBRefEntry; import jalview.datamodel.SequenceI; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Set; /** * Routines for approximate Sequence Id resolution by name using string * containment (on word boundaries) rather than equivalence. It also attempts to * resolve ties where no exact match is available by picking the the id closest * to the query. */ public class SequenceIdMatcher { /** * weak hash for each sequence */ private HashMap> names; // /** // * cache of values removed for each query string. // */ // private HashMap> resolved; /** * do we index sequences on all 'words' in ID string ? */ private boolean wordBased = false; /** * Characters that define the end of a unique sequence ID at the beginning of * an arbitrary ID string JBPNote: This is a heuristic that will fail for * arbritrarily extended sequence id's (like portions of an aligned set of * repeats from one sequence) */ static String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4) + "$%^*)}[@',?_"; /** * @return true if matcher is word-based (ie string key matches one of the * words within the body of one or more sequence IDs) */ public boolean isWordBased() { return wordBased; } /** * Construct a standard (non-word based) matcher. To configure word based * matching, use the fully qualified constructor * * @param seqs */ public SequenceIdMatcher(List seqs) { this(false, seqs); } /** * construct a new matcher for a set of sequences, configured as required. * Note: enabling word based matching * * @param wordBasedMatch * - when true, "myseq" matches "X|myseq" and "myseq" * @param seqs */ public SequenceIdMatcher(boolean wordBasedMatch, List seqs) { wordBased = wordBasedMatch; names = new HashMap>(); addAll(seqs); } /** * add more sequences to this matcher - also used by the constructor * * @param seqs */ public void addAll(List seqs) { for (SequenceI seq : seqs) { addSeq(seq); } } private void addSeqIdName(SeqIdName idname, SequenceI seq) { Set seqset = names.get(idname); if (seqset == null) { seqset = new HashSet(); names.put(idname, seqset); } seqset.add(seq); } public void addSeq(SequenceI seq) { // TODO: deal with ID collisions - SequenceI should be appended to list // associated with this key. addSeqIdName(new SeqIdName(seq.getDisplayId(true)), seq); if (wordBased) { for (SeqIdName key : getWordsFor(seq)) { addSeqIdName(key, seq); } } SequenceI dbseq = seq; // TODO add test for database xref resolution while (dbseq.getDatasetSequence() != null) { dbseq = dbseq.getDatasetSequence(); } // add in any interesting identifiers if (dbseq.getDBRefs() != null) { DBRefEntry dbr[] = dbseq.getDBRefs(); SeqIdName sid = null; for (int r = 0; r < dbr.length; r++) { sid = new SeqIdName(dbr[r].getAccessionId()); if (!names.containsKey(sid)) { addSeqIdName(sid, seq); } } } } /** * generate word based keys for the given sequence * * @param seq * @return list of split keys */ public static List getWordsFor(SequenceI seq) { ArrayList keys = new ArrayList(); String name = seq.getName(), limits = "/" + seq.getStart() + "-" + seq.getEnd(); int namel = name.length(); char[] sep = new char[WORD_SEP.length()]; // find only the separators present in the ID. for (int i = 0; i < sep.length; i++) { sep[i] = WORD_SEP.charAt(i); if (seq.getName().indexOf("" + sep[i]) == -1) { sep[i] = 0; } } ; // make words for (int i = 0; i < sep.length; i++) { if (sep[i] > 0) { int p = 0, m = -1; while ((m = name.indexOf(sep[i], p)) > p) { if (m > 0 && m - p > 5) { // split to end of word m with this delimiter keys.add(new SeqIdName(name.substring(p, m) + limits)); } if (namel - m > 5) { // index word after this delimiter m keys.add(new SeqIdName(name.substring(m + 1) + limits)); } p = m + 1; } if (namel - p > 4) { // index word after this delimiter m keys.add(new SeqIdName(name.substring(p) + limits)); } } } return keys; } /** * convenience method to make a matcher from concrete array Note: in order to * support word based matching, use the fully qualified constructor * * @param sequences */ public SequenceIdMatcher(SequenceI[] sequences) { this(Arrays.asList(sequences)); } /** * returns the closest SequenceI in matches to SeqIdName and returns all the * matches to the names hash. * * @param candName * SeqIdName * @param matches * List of SequenceI objects * @return SequenceI closest SequenceI to SeqIdName */ private SequenceI pickbestMatch(SeqIdName candName, List matches) { List st = pickbestMatches(candName, matches); return st == null || st.size() == 0 ? null : st.get(0); } /** * returns the SequenceI's with exact word matches to candName * * @param candName * SeqIdName * @param matches * List of SequenceI objects - some of which may be duplicates * @return { word matches to candName } */ private List pickwordMatches(SeqIdName candName, List matches) { List best = new ArrayList(); for (SequenceI match : matches) { if (!best.contains(match)) { if (candName.equalsCase(match.getDisplayId(true))) { // put the exact match at the beginning best.add(0, match); } else { best.add(match); } addSeq(match); } } return best; } /** * returns the closest SequenceI in matches to SeqIdName and returns all the * matches to the names hash. * * @param candName * SeqIdName * @param matches * Vector of SequenceI objects * @return Object[] { SequenceI closest SequenceI to SeqIdName, SequenceI[] * ties } */ private List pickbestMatches(SeqIdName candName, List matches) { ArrayList best = new ArrayList(); if (candName == null || matches == null || matches.size() == 0) { return null; } SequenceI match = matches.remove(0); best.add(match); addSeq(match); int matchlen = match.getName().length(); int namlen = candName.id.length(); while (matches.size() > 0) { // look through for a better one. SequenceI cand = matches.remove(0); addSeq(cand); int q, w, candlen = cand.getName().length(); // keep the one with an id 'closer' to the given seqnam string boolean is_closer = ((q = Math.abs(matchlen - namlen)) > (w = Math .abs(candlen - namlen)) && candlen > matchlen); // if not closer, then check if current best is actually identical in case // as // well if (is_closer || (candName.equalsCase(cand.getName()) && !candName .equalsCase(best.get(0).getName()))) { best.clear(); match = cand; matchlen = candlen; best.add(match); } else { if (q == w && candlen == matchlen) { // equivalently good, and matches with case as well. so // record any ties best.add(cand); } } } if (best.size() == 0) { return null; } ; return best; } /** * get SequenceI with closest SequenceI.getName() to seq.getName() * * @param seq * SequenceI * @return SequenceI */ public SequenceI findIdMatch(SequenceI seq) { SeqIdName nam = new SeqIdName(seq.getName()); return findIdMatch(nam); } public SequenceI findIdMatch(String seqnam) { SeqIdName nam = new SeqIdName(seqnam); return findIdMatch(nam); } /** * Find all matches for a given sequence name. * * @param seqnam * string to query Matcher with. * @return a new array or null of no match exists */ public SequenceI[] findAllIdMatches(String seqnam) { SeqIdName nam = new SeqIdName(seqnam); List m = findAllIdMatches(nam); if (m != null && m.size() > 0) { return m.toArray(new SequenceI[m.size()]); } return null; } /** * findIdMatch * * Return pointers to sequences (or sequence object containers) which have * same Id as a given set of different sequence objects * * @param seqs * SequenceI[] * @return SequenceI[] */ public SequenceI[] findIdMatch(SequenceI[] seqs) { SequenceI[] namedseqs = null; int i = 0; SeqIdName nam; if (seqs.length > 0) { namedseqs = new SequenceI[seqs.length]; do { nam = new SeqIdName(seqs[i].getName()); if (names.containsKey(nam)) { namedseqs[i] = findIdMatch(nam); } else { namedseqs[i] = null; } } while (++i < seqs.length); } return namedseqs; } /** * core findIdMatch search method * * @param nam * SeqIdName * @return SequenceI */ private SequenceI findIdMatch( jalview.analysis.SeqIdName nam) { ArrayList matches = new ArrayList(); while (names.containsKey(nam)) { matches.addAll(names.remove(nam)); } return pickbestMatch(nam, matches); } /** * core findIdMatch search method for finding all equivalent matches * * @param nam * SeqIdName * @return SequenceI[] */ private List findAllIdMatches( jalview.analysis.SeqIdName nam) { ArrayList matches = new ArrayList(); while (names.containsKey(nam)) { matches.addAll(names.remove(nam)); } List r = (wordBased) ? pickwordMatches(nam, matches) : pickbestMatches(nam, matches); return r; } }