src/jalview/analysis/SequenceIdMatcher.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.DBRefEntry;
  24 import jalview.datamodel.SequenceI;
  25
  26 import java.util.ArrayList;
  27 import java.util.Arrays;
  28 import java.util.HashMap;
  29 import java.util.HashSet;
  30 import java.util.List;
  31 import java.util.Set;
  32
  33 /**
  34  * Routines for approximate Sequence Id resolution by name using string
  35  * containment (on word boundaries) rather than equivalence. It also attempts to
  36  * resolve ties where no exact match is available by picking the the id closest
  37  * to the query.
  38  */
  39 public class SequenceIdMatcher
  40 {
  41   /**
  42    * weak hash for each sequence
  43    */
  44   private HashMap<SeqIdName, Set<SequenceI>> names;
  45
  46   // /**
  47   // * cache of values removed for each query string.
  48   // */
  49   // private HashMap<String, List<SequenceI>> resolved;
  50
  51   /**
  52    * do we index sequences on all 'words' in ID string ?
  53    */
  54   private boolean wordBased = false;
  55
  56   /**
  57    * Characters that define the end of a unique sequence ID at the beginning of
  58    * an arbitrary ID string JBPNote: This is a heuristic that will fail for
  59    * arbritrarily extended sequence id's (like portions of an aligned set of
  60    * repeats from one sequence)
  61    */
  62   private static String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
  63           + "$%^*)}[@',?_";
  64
  65   /**
  66    * @return true if matcher is word-based (ie string key matches one of the
  67    *         words within the body of one or more sequence IDs)
  68    */
  69   public boolean isWordBased()
  70   {
  71     return wordBased;
  72   }
  73
  74   /**
  75    * Construct a standard (non-word based) matcher. To configure word based
  76    * matching, use the fully qualified constructor
  77    *
  78    * @param seqs
  79    */
  80   public SequenceIdMatcher(List<SequenceI> seqs)
  81   {
  82     this(false, seqs);
  83   }
  84
  85   /**
  86    * construct a new matcher for a set of sequences, configured as required.
  87    * Note: enabling word based matching
  88    *
  89    * @param wordBasedMatch
  90    *          - when true, "myseq" matches "X|myseq" and "myseq"
  91    * @param seqs
  92    */
  93   public SequenceIdMatcher(boolean wordBasedMatch,
  94           List<SequenceI> seqs)
  95   {
  96     wordBased = wordBasedMatch;
  97     names = new HashMap<SeqIdName, Set<SequenceI>>();
  98     addAll(seqs);
  99   }
 100
 101   /**
 102    * add more sequences to this matcher - also used by the constructor
 103    *
 104    * @param seqs
 105    */
 106   public void addAll(List<SequenceI> seqs)
 107   {
 108     for (SequenceI seq : seqs)
 109     {
 110       addSeq(seq);
 111     }
 112   }
 113
 114   private void addSeqIdName(SeqIdName idname, SequenceI seq)
 115   {
 116     Set<SequenceI> seqset = names.get(idname);
 117     if (seqset == null)
 118     {
 119       seqset = new HashSet<SequenceI>();
 120       names.put(idname, seqset);
 121     }
 122     seqset.add(seq);
 123   }
 124
 125   public void addSeq(SequenceI seq)
 126   {
 127       // TODO: deal with ID collisions - SequenceI should be appended to list
 128       // associated with this key.
 129     addSeqIdName(new SeqIdName(seq.getDisplayId(true)), seq);
 130       if (wordBased)
 131       {
 132         for (SeqIdName key : getWordsFor(seq))
 133         {
 134         addSeqIdName(key, seq);
 135         }
 136       }
 137       SequenceI dbseq = seq;
 138     // TODO add test for database xref resolution
 139       while (dbseq.getDatasetSequence() != null)
 140       {
 141         dbseq = dbseq.getDatasetSequence();
 142       }
 143       // add in any interesting identifiers
 144       if (dbseq.getDBRefs() != null)
 145       {
 146         DBRefEntry dbr[] = dbseq.getDBRefs();
 147         SeqIdName sid = null;
 148         for (int r = 0; r < dbr.length; r++)
 149         {
 150           sid = new SeqIdName(dbr[r].getAccessionId());
 151           if (!names.containsKey(sid))
 152           {
 153           addSeqIdName(sid, seq);
 154           }
 155         }
 156     }
 157   }
 158
 159
 160   /**
 161    * generate word based keys for the given sequence
 162    *
 163    * @param seq
 164    * @return list of split keys
 165    */
 166   public List<SeqIdName> getWordsFor(SequenceI seq)
 167   {
 168     ArrayList<SeqIdName> keys = new ArrayList<SeqIdName>();
 169     String name = seq.getName(), limits = "/" + seq.getStart() + "-"
 170             + seq.getEnd();
 171     int namel = name.length();
 172     char[] sep = new char[WORD_SEP.length()];
 173     // find only the separators present in the ID.
 174     for (int i = 0; i < sep.length; i++)
 175     {
 176       sep[i] = WORD_SEP.charAt(i);
 177       if (seq.getName().indexOf("" + sep[i]) == -1)
 178       {
 179         sep[i] = 0;
 180       }
 181     }
 182     ;
 183     // make words
 184     for (int i = 0; i < sep.length; i++)
 185     {
 186       if (sep[i] > 0)
 187       {
 188         int p = 0, m = -1;
 189         while ((m = name.indexOf(sep[i], p)) > p)
 190         {
 191
 192           if (m > 0 && m - p > 5)
 193           {
 194             // split to end of word m with this delimiter
 195             keys.add(new SeqIdName(name.substring(p, m - 1) + limits));
 196           }
 197           if (namel - m > 5)
 198           {
 199             // index word after this delimiter m
 200             keys.add(new SeqIdName(name.substring(m + 1) + limits));
 201           }
 202           p = m + 1;
 203         }
 204         if (namel - p > 4)
 205         {
 206           // index word after this delimiter m
 207           keys.add(new SeqIdName(name.substring(p) + limits));
 208         }
 209       }
 210     }
 211     return keys;
 212   }
 213
 214   /**
 215    * convenience method to make a matcher from concrete array Note: in order to
 216    * support word based matching, use the fully qualified constructor
 217    *
 218    * @param sequences
 219    */
 220   public SequenceIdMatcher(SequenceI[] sequences)
 221   {
 222     this(Arrays.asList(sequences));
 223   }
 224
 225   /**
 226    * returns the closest SequenceI in matches to SeqIdName and returns all the
 227    * matches to the names hash.
 228    *
 229    * @param candName
 230    *          SeqIdName
 231    * @param matches
 232    *          List of SequenceI objects
 233    * @return SequenceI closest SequenceI to SeqIdName
 234    */
 235   private SequenceI pickbestMatch(SeqIdName candName,
 236           List<SequenceI> matches)
 237   {
 238     List<SequenceI> st = pickbestMatches(candName, matches);
 239     return st == null || st.size() == 0 ? null : st.get(0);
 240   }
 241
 242   /**
 243    * returns the closest SequenceI in matches to SeqIdName and returns all the
 244    * matches to the names hash.
 245    *
 246    * @param candName
 247    *          SeqIdName
 248    * @param matches
 249    *          Vector of SequenceI objects
 250    * @return Object[] { SequenceI closest SequenceI to SeqIdName, SequenceI[]
 251    *         ties }
 252    */
 253   private List<SequenceI> pickbestMatches(SeqIdName candName,
 254           List<SequenceI> matches)
 255   {
 256     ArrayList<SequenceI> best = new ArrayList<SequenceI>();
 257     if (candName == null || matches == null || matches.size() == 0)
 258     {
 259       return null;
 260     }
 261     SequenceI match = matches.remove(0);
 262     best.add(match);
 263     addSeq(match);
 264     int matchlen = match.getName().length();
 265     int namlen = candName.id.length();
 266     while (matches.size() > 0)
 267     {
 268       // look through for a better one.
 269       SequenceI cand = matches.remove(0);
 270       addSeq(cand);
 271       int q, w, candlen = cand.getName().length();
 272       // keep the one with an id 'closer' to the given seqnam string
 273       if ((q = Math.abs(matchlen - namlen)) > (w = Math.abs(candlen
 274               - namlen))
 275               && candlen > matchlen)
 276       {
 277         best.clear();
 278         match = cand;
 279         matchlen = candlen;
 280         best.add(match);
 281       }
 282       if (q == w && candlen == matchlen)
 283       {
 284         // record any ties
 285         best.add(cand);
 286       }
 287     }
 288     if (best.size() == 0)
 289     {
 290       return null;
 291     }
 292     ;
 293     return best;
 294   }
 295
 296   /**
 297    * get SequenceI with closest SequenceI.getName() to seq.getName()
 298    *
 299    * @param seq
 300    *          SequenceI
 301    * @return SequenceI
 302    */
 303   public SequenceI findIdMatch(SequenceI seq)
 304   {
 305     SeqIdName nam = new SeqIdName(seq.getName());
 306     return findIdMatch(nam);
 307   }
 308
 309   public SequenceI findIdMatch(String seqnam)
 310   {
 311     SeqIdName nam = new SeqIdName(seqnam);
 312     return findIdMatch(nam);
 313   }
 314
 315   /**
 316    * Find all matches for a given sequence name.
 317    *
 318    * @param seqnam
 319    *          string to query Matcher with.
 320    * @return a new array or (possibly) null
 321    */
 322   public SequenceI[] findAllIdMatches(String seqnam)
 323   {
 324
 325     SeqIdName nam = new SeqIdName(seqnam);
 326     List<SequenceI> m = findAllIdMatches(nam);
 327     if (m != null)
 328     {
 329       return m.toArray(new SequenceI[m.size()]);
 330     }
 331     return null;
 332   }
 333
 334   /**
 335    * findIdMatch
 336    *
 337    * Return pointers to sequences (or sequence object containers) which have
 338    * same Id as a given set of different sequence objects
 339    *
 340    * @param seqs
 341    *          SequenceI[]
 342    * @return SequenceI[]
 343    */
 344   public SequenceI[] findIdMatch(SequenceI[] seqs)
 345   {
 346     SequenceI[] namedseqs = null;
 347     int i = 0;
 348     SeqIdName nam;
 349
 350     if (seqs.length > 0)
 351     {
 352       namedseqs = new SequenceI[seqs.length];
 353       do
 354       {
 355         nam = new SeqIdName(seqs[i].getName());
 356
 357         if (names.containsKey(nam))
 358         {
 359           namedseqs[i] = findIdMatch(nam);
 360         }
 361         else
 362         {
 363           namedseqs[i] = null;
 364         }
 365       } while (++i < seqs.length);
 366     }
 367
 368     return namedseqs;
 369   }
 370
 371   /**
 372    * core findIdMatch search method
 373    *
 374    * @param nam
 375    *          SeqIdName
 376    * @return SequenceI
 377    */
 378   private SequenceI findIdMatch(
 379           jalview.analysis.SequenceIdMatcher.SeqIdName nam)
 380   {
 381     ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
 382     while (names.containsKey(nam))
 383     {
 384       matches.addAll(names.remove(nam));
 385     }
 386     return pickbestMatch(nam, matches);
 387   }
 388
 389   /**
 390    * core findIdMatch search method for finding all equivalent matches
 391    *
 392    * @param nam
 393    *          SeqIdName
 394    * @return SequenceI[]
 395    */
 396   private List<SequenceI> findAllIdMatches(
 397           jalview.analysis.SequenceIdMatcher.SeqIdName nam)
 398   {
 399     ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
 400     while (names.containsKey(nam))
 401     {
 402       matches.addAll(names.remove(nam));
 403     }
 404     List<SequenceI> r = pickbestMatches(nam, matches);
 405     return r;
 406   }
 407
 408   private class SeqIdName
 409   {
 410     String id;
 411
 412     SeqIdName(String s)
 413     {
 414       if (s != null)
 415       {
 416         id = new String(s).toLowerCase();
 417       }
 418       else
 419       {
 420         id = "";
 421       }
 422     }
 423
 424     @Override
 425     public int hashCode()
 426     {
 427       return ((id.length() >= 4) ? id.substring(0, 4).hashCode() : id
 428               .hashCode());
 429     }
 430
 431     @Override
 432     public boolean equals(Object s)
 433     {
 434       if (s == null)
 435       {
 436         return false;
 437       }
 438       if (s instanceof SeqIdName)
 439       {
 440         return this.equals((SeqIdName) s);
 441       }
 442       else
 443       {
 444         if (s instanceof String)
 445         {
 446           return this.equals((String) s);
 447         }
 448       }
 449
 450       return false;
 451     }
 452
 453     /**
 454      * matches if one ID properly contains another at a whitespace boundary.
 455      * TODO: (JBPNote) These are not efficient. should use char[] for speed
 456      * todo: (JBPNote) Set separator characters appropriately
 457      *
 458      * @param s
 459      *          SeqIdName
 460      * @return boolean
 461      */
 462     public boolean equals(SeqIdName s)
 463     {
 464       // TODO: JAL-732 patch for cases when name includes a list of IDs, and the
 465       // match contains one ID flanked
 466       if (id.length() > s.id.length())
 467       {
 468         return check_wordmatch(id, s.id);
 469       }
 470       else
 471       {
 472         return check_wordmatch(s.id, id);
 473       }
 474     }
 475
 476     private boolean check_wordmatch(String longer, String shorter)
 477     {
 478       boolean elen = longer.length() == shorter.length();
 479       int sp = longer.indexOf(shorter);
 480       if (sp == -1)
 481       {
 482         return false;
 483       }
 484
 485       if (sp == 0)
 486       {
 487         // end of match is word boundary
 488         return elen ? true : (WORD_SEP.indexOf(longer.charAt(shorter
 489                 .length() + sp)) > -1);
 490       }
 491       if (WORD_SEP.indexOf(longer.charAt(sp - 1)) > -1)
 492       {
 493         if (sp + shorter.length() == longer.length())
 494         {
 495           return true;
 496         }
 497         else
 498         {
 499           // end of match is word boundary
 500           return elen ? false
 501                   : sp + shorter.length() == longer.length() ? true
 502                           : (WORD_SEP.indexOf(longer.charAt(shorter
 503                                   .length() + sp)) > -1);
 504         }
 505       }
 506       else
 507       {
 508         // prefix of match is not a word boundary
 509         return false;
 510       }
 511     }
 512
 513     public boolean equals(String s)
 514     {
 515       s = s.toLowerCase(); // TODO: employ faster to lower case operation
 516       if (id.length() > s.length())
 517       {
 518         return check_wordmatch(id, s);
 519       }
 520       else
 521       {
 522         return check_wordmatch(s, id);
 523       }
 524     }
 525
 526     @Override
 527     public String toString()
 528     {
 529       return id;
 530     }
 531   }
 532 }