src/jalview/analysis/SequenceIdMatcher.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.DBRefEntry;
  24 import jalview.datamodel.SequenceI;
  25
  26 import java.util.ArrayList;
  27 import java.util.Arrays;
  28 import java.util.HashMap;
  29 import java.util.HashSet;
  30 import java.util.List;
  31 import java.util.Set;
  32
  33 /**
  34  * Routines for approximate Sequence Id resolution by name using string
  35  * containment (on word boundaries) rather than equivalence. It also attempts to
 * resolve ties where no exact match is available by picking the id closest
 * to the query.
  38  */
  39 public class SequenceIdMatcher
  40 {
  41   /**
  42    * weak hash for each sequence
  43    */
  44   private HashMap<SeqIdName, Set<SequenceI>> names;
  45
  46   // /**
  47   // * cache of values removed for each query string.
  48   // */
  49   // private HashMap<String, List<SequenceI>> resolved;
  50
  51   /**
  52    * do we index sequences on all 'words' in ID string ?
  53    */
  54   private boolean wordBased = false;
  55
  56   /**
  57    * Characters that define the end of a unique sequence ID at the beginning of
  58    * an arbitrary ID string JBPNote: This is a heuristic that will fail for
  59    * arbritrarily extended sequence id's (like portions of an aligned set of
  60    * repeats from one sequence)
  61    */
  62   static String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
  63           + "$%^*)}[@',?_";
  64
  65   /**
  66    * @return true if matcher is word-based (ie string key matches one of the
  67    *         words within the body of one or more sequence IDs)
  68    */
  69   public boolean isWordBased()
  70   {
  71     return wordBased;
  72   }
  73
  /**
   * Build a standard matcher, i.e. one with word based matching switched off,
   * over the given sequences. To enable word based matching use
   * {@link #SequenceIdMatcher(boolean, List)} instead.
   *
   * @param seqs
   *          sequences to index
   */
  public SequenceIdMatcher(List<SequenceI> seqs)
  {
    this(false, seqs);
  }
  84
  /**
   * Build a matcher over the given sequences, choosing whether IDs are also
   * indexed on the words they contain.
   *
   * @param wordBasedMatch
   *          - when true, "myseq" matches "X|myseq" and "myseq"
   * @param seqs
   *          sequences to index
   */
  public SequenceIdMatcher(boolean wordBasedMatch, List<SequenceI> seqs)
  {
    this.names = new HashMap<SeqIdName, Set<SequenceI>>();
    this.wordBased = wordBasedMatch;
    // the index and the matching mode must be in place before indexing starts
    addAll(seqs);
  }
  99
 100   /**
 101    * add more sequences to this matcher - also used by the constructor
 102    *
 103    * @param seqs
 104    */
 105   public void addAll(List<SequenceI> seqs)
 106   {
 107     for (SequenceI seq : seqs)
 108     {
 109       addSeq(seq);
 110     }
 111   }
 112
 113   private void addSeqIdName(SeqIdName idname, SequenceI seq)
 114   {
 115     Set<SequenceI> seqset = names.get(idname);
 116     if (seqset == null)
 117     {
 118       seqset = new HashSet<SequenceI>();
 119       names.put(idname, seqset);
 120     }
 121     seqset.add(seq);
 122   }
 123
 124   public void addSeq(SequenceI seq)
 125   {
 126     // TODO: deal with ID collisions - SequenceI should be appended to list
 127     // associated with this key.
 128     addSeqIdName(new SeqIdName(seq.getDisplayId(true)), seq);
 129     if (wordBased)
 130     {
 131       for (SeqIdName key : getWordsFor(seq))
 132       {
 133         addSeqIdName(key, seq);
 134       }
 135     }
 136     SequenceI dbseq = seq;
 137     // TODO add test for database xref resolution
 138     while (dbseq.getDatasetSequence() != null)
 139     {
 140       dbseq = dbseq.getDatasetSequence();
 141     }
 142     // add in any interesting identifiers
 143     if (dbseq.getDBRefs() != null)
 144     {
 145       DBRefEntry dbr[] = dbseq.getDBRefs();
 146       SeqIdName sid = null;
 147       for (int r = 0; r < dbr.length; r++)
 148       {
 149         sid = new SeqIdName(dbr[r].getAccessionId());
 150         if (!names.containsKey(sid))
 151         {
 152           addSeqIdName(sid, seq);
 153         }
 154       }
 155     }
 156   }
 157
 158   /**
 159    * generate word based keys for the given sequence
 160    *
 161    * @param seq
 162    * @return list of split keys
 163    */
 164   public static List<SeqIdName> getWordsFor(SequenceI seq)
 165   {
 166     ArrayList<SeqIdName> keys = new ArrayList<SeqIdName>();
 167     String name = seq.getName(), limits = "/" + seq.getStart() + "-"
 168             + seq.getEnd();
 169     int namel = name.length();
 170     char[] sep = new char[WORD_SEP.length()];
 171     // find only the separators present in the ID.
 172     for (int i = 0; i < sep.length; i++)
 173     {
 174       sep[i] = WORD_SEP.charAt(i);
 175       if (seq.getName().indexOf("" + sep[i]) == -1)
 176       {
 177         sep[i] = 0;
 178       }
 179     }
 180     ;
 181     // make words
 182     for (int i = 0; i < sep.length; i++)
 183     {
 184       if (sep[i] > 0)
 185       {
 186         int p = 0, m = -1;
 187         while ((m = name.indexOf(sep[i], p)) > p)
 188         {
 189
 190           if (m > 0 && m - p > 5)
 191           {
 192             // split to end of word m with this delimiter
 193             keys.add(new SeqIdName(name.substring(p, m) + limits));
 194           }
 195           if (namel - m > 5)
 196           {
 197             // index word after this delimiter m
 198             keys.add(new SeqIdName(name.substring(m + 1) + limits));
 199           }
 200           p = m + 1;
 201         }
 202         if (namel - p > 4)
 203         {
 204           // index word after this delimiter m
 205           keys.add(new SeqIdName(name.substring(p) + limits));
 206         }
 207       }
 208     }
 209     return keys;
 210   }
 211
 212   /**
 213    * convenience method to make a matcher from concrete array Note: in order to
 214    * support word based matching, use the fully qualified constructor
 215    *
 216    * @param sequences
 217    */
 218   public SequenceIdMatcher(SequenceI[] sequences)
 219   {
 220     this(Arrays.asList(sequences));
 221   }
 222
 223   /**
 224    * returns the closest SequenceI in matches to SeqIdName and returns all the
 225    * matches to the names hash.
 226    *
 227    * @param candName
 228    *          SeqIdName
 229    * @param matches
 230    *          List of SequenceI objects
 231    * @return SequenceI closest SequenceI to SeqIdName
 232    */
 233   private SequenceI pickbestMatch(SeqIdName candName,
 234           List<SequenceI> matches)
 235   {
 236     List<SequenceI> st = pickbestMatches(candName, matches);
 237     return st == null || st.size() == 0 ? null : st.get(0);
 238   }
 239
 240   /**
 241    * returns the SequenceI's with exact word matches to candName
 242    *
 243    * @param candName
 244    *          SeqIdName
 245    * @param matches
 246    *          List of SequenceI objects - some of which may be duplicates
 247    * @return { word matches to candName }
 248    */
 249   private List<SequenceI> pickwordMatches(SeqIdName candName,
 250           List<SequenceI> matches)
 251   {
 252     List<SequenceI> best = new ArrayList<SequenceI>();
 253     for (SequenceI match : matches)
 254     {
 255       if (!best.contains(match))
 256       {
 257         if (candName.equalsCase(match.getDisplayId(true)))
 258         {
 259           // put the exact match at the beginning
 260           best.add(0, match);
 261         }
 262         else
 263         {
 264           best.add(match);
 265         }
 266         addSeq(match);
 267       }
 268     }
 269     return best;
 270   }
 271
 272   /**
 273    * returns the closest SequenceI in matches to SeqIdName and returns all the
 274    * matches to the names hash.
 275    *
 276    * @param candName
 277    *          SeqIdName
 278    * @param matches
 279    *          Vector of SequenceI objects
 280    * @return Object[] { SequenceI closest SequenceI to SeqIdName, SequenceI[]
 281    *         ties }
 282    */
 283   private List<SequenceI> pickbestMatches(SeqIdName candName,
 284           List<SequenceI> matches)
 285   {
 286     ArrayList<SequenceI> best = new ArrayList<SequenceI>();
 287     if (candName == null || matches == null || matches.size() == 0)
 288     {
 289       return null;
 290     }
 291     SequenceI match = matches.remove(0);
 292     best.add(match);
 293     addSeq(match);
 294     int matchlen = match.getName().length();
 295     int namlen = candName.id.length();
 296     while (matches.size() > 0)
 297     {
 298       // look through for a better one.
 299       SequenceI cand = matches.remove(0);
 300       addSeq(cand);
 301       int q, w, candlen = cand.getName().length();
 302       // keep the one with an id 'closer' to the given seqnam string
 303       boolean is_closer = ((q = Math.abs(matchlen - namlen)) > (w = Math
 304               .abs(candlen - namlen)) && candlen > matchlen);
 305       // if not closer, then check if current best is actually identical in case
 306       // as
 307       // well
 308       if (is_closer
 309               || (candName.equalsCase(cand.getName()) && !candName
 310                       .equalsCase(best.get(0).getName())))
 311       {
 312         best.clear();
 313         match = cand;
 314         matchlen = candlen;
 315         best.add(match);
 316       }
 317       else
 318       {
 319         if (q == w && candlen == matchlen)
 320         {
 321           // equivalently good, and matches with case as well. so
 322           // record any ties
 323           best.add(cand);
 324         }
 325       }
 326     }
 327     if (best.size() == 0)
 328     {
 329       return null;
 330     }
 331     ;
 332     return best;
 333   }
 334
 335   /**
 336    * get SequenceI with closest SequenceI.getName() to seq.getName()
 337    *
 338    * @param seq
 339    *          SequenceI
 340    * @return SequenceI
 341    */
 342   public SequenceI findIdMatch(SequenceI seq)
 343   {
 344     SeqIdName nam = new SeqIdName(seq.getName());
 345     return findIdMatch(nam);
 346   }
 347
 348   public SequenceI findIdMatch(String seqnam)
 349   {
 350     SeqIdName nam = new SeqIdName(seqnam);
 351     return findIdMatch(nam);
 352   }
 353
 354   /**
 355    * Find all matches for a given sequence name.
 356    *
 357    * @param seqnam
 358    *          string to query Matcher with.
 359    * @return a new array or null of no match exists
 360    */
 361   public SequenceI[] findAllIdMatches(String seqnam)
 362   {
 363
 364     SeqIdName nam = new SeqIdName(seqnam);
 365     List<SequenceI> m = findAllIdMatches(nam);
 366     if (m != null && m.size() > 0)
 367     {
 368       return m.toArray(new SequenceI[m.size()]);
 369     }
 370     return null;
 371   }
 372
 373   /**
 374    * findIdMatch
 375    *
 376    * Return pointers to sequences (or sequence object containers) which have
 377    * same Id as a given set of different sequence objects
 378    *
 379    * @param seqs
 380    *          SequenceI[]
 381    * @return SequenceI[]
 382    */
 383   public SequenceI[] findIdMatch(SequenceI[] seqs)
 384   {
 385     SequenceI[] namedseqs = null;
 386     int i = 0;
 387     SeqIdName nam;
 388
 389     if (seqs.length > 0)
 390     {
 391       namedseqs = new SequenceI[seqs.length];
 392       do
 393       {
 394         nam = new SeqIdName(seqs[i].getName());
 395
 396         if (names.containsKey(nam))
 397         {
 398           namedseqs[i] = findIdMatch(nam);
 399         }
 400         else
 401         {
 402           namedseqs[i] = null;
 403         }
 404       } while (++i < seqs.length);
 405     }
 406
 407     return namedseqs;
 408   }
 409
 410   /**
 411    * core findIdMatch search method
 412    *
 413    * @param nam
 414    *          SeqIdName
 415    * @return SequenceI
 416    */
 417   private SequenceI findIdMatch(
 418           jalview.analysis.SeqIdName nam)
 419   {
 420     ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
 421     while (names.containsKey(nam))
 422     {
 423       matches.addAll(names.remove(nam));
 424     }
 425     return pickbestMatch(nam, matches);
 426   }
 427
 428   /**
 429    * core findIdMatch search method for finding all equivalent matches
 430    *
 431    * @param nam
 432    *          SeqIdName
 433    * @return SequenceI[]
 434    */
 435   private List<SequenceI> findAllIdMatches(
 436           jalview.analysis.SeqIdName nam)
 437   {
 438     ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
 439     while (names.containsKey(nam))
 440     {
 441       matches.addAll(names.remove(nam));
 442     }
 443     List<SequenceI> r = (wordBased) ? pickwordMatches(nam, matches)
 444             : pickbestMatches(nam, matches);
 445     return r;
 446   }
 447 }