src/jalview/analysis/SequenceIdMatcher.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.DBRefEntry;
  24 import jalview.datamodel.SequenceI;
  25
  26 import java.util.ArrayList;
  27 import java.util.Arrays;
  28 import java.util.HashMap;
  29 import java.util.HashSet;
  30 import java.util.List;
  31 import java.util.Set;
  32
  33 /**
  34  * Routines for approximate Sequence Id resolution by name using string
  35  * containment (on word boundaries) rather than equivalence. It also attempts to
  36  * resolve ties where no exact match is available by picking the the id closest
  37  * to the query.
  38  */
  39 public class SequenceIdMatcher
  40 {
  41   /**
  42    * weak hash for each sequence
  43    */
  44   private HashMap<SeqIdName, Set<SequenceI>> names;
  45
  46   // /**
  47   // * cache of values removed for each query string.
  48   // */
  49   // private HashMap<String, List<SequenceI>> resolved;
  50
  51   /**
  52    * do we index sequences on all 'words' in ID string ?
  53    */
  54   private boolean wordBased = false;
  55
  56   /**
  57    * Characters that define the end of a unique sequence ID at the beginning of
  58    * an arbitrary ID string JBPNote: This is a heuristic that will fail for
  59    * arbritrarily extended sequence id's (like portions of an aligned set of
  60    * repeats from one sequence)
  61    */
  62   private static String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
  63           + "$%^*)}[@',?_";
  64
  65   /**
  66    * @return true if matcher is word-based (ie string key matches one of the
  67    *         words within the body of one or more sequence IDs)
  68    */
  69   public boolean isWordBased()
  70   {
  71     return wordBased;
  72   }
  73
  74   /**
  75    * Construct a standard (non-word based) matcher. To configure word based
  76    * matching, use the fully qualified constructor
  77    *
  78    * @param seqs
  79    */
  80   public SequenceIdMatcher(List<SequenceI> seqs)
  81   {
  82     this(false, seqs);
  83   }
  84
  85   /**
  86    * construct a new matcher for a set of sequences, configured as required.
  87    * Note: enabling word based matching
  88    *
  89    * @param wordBasedMatch
  90    *          - when true, "myseq" matches "X|myseq" and "myseq"
  91    * @param seqs
  92    */
  93   public SequenceIdMatcher(boolean wordBasedMatch,
  94           List<SequenceI> seqs)
  95   {
  96     wordBased = wordBasedMatch;
  97     names = new HashMap<SeqIdName, Set<SequenceI>>();
  98     addAll(seqs);
  99   }
 100
 101   /**
 102    * add more sequences to this matcher - also used by the constructor
 103    *
 104    * @param seqs
 105    */
 106   public void addAll(List<SequenceI> seqs)
 107   {
 108     for (SequenceI seq : seqs)
 109     {
 110       addSeq(seq);
 111     }
 112   }
 113
 114   private void addSeqIdName(SeqIdName idname, SequenceI seq)
 115   {
 116     Set<SequenceI> seqset = names.get(idname);
 117     if (seqset == null)
 118     {
 119       seqset = new HashSet<SequenceI>();
 120       names.put(idname, seqset);
 121     }
 122     seqset.add(seq);
 123   }
 124
 125   public void addSeq(SequenceI seq)
 126   {
 127       // TODO: deal with ID collisions - SequenceI should be appended to list
 128       // associated with this key.
 129     addSeqIdName(new SeqIdName(seq.getDisplayId(true)), seq);
 130       if (wordBased)
 131       {
 132         for (SeqIdName key : getWordsFor(seq))
 133         {
 134         addSeqIdName(key, seq);
 135         }
 136       }
 137       SequenceI dbseq = seq;
 138     // TODO add test for database xref resolution
 139       while (dbseq.getDatasetSequence() != null)
 140       {
 141         dbseq = dbseq.getDatasetSequence();
 142       }
 143       // add in any interesting identifiers
 144       if (dbseq.getDBRefs() != null)
 145       {
 146         DBRefEntry dbr[] = dbseq.getDBRefs();
 147         SeqIdName sid = null;
 148         for (int r = 0; r < dbr.length; r++)
 149         {
 150           sid = new SeqIdName(dbr[r].getAccessionId());
 151           if (!names.containsKey(sid))
 152           {
 153           addSeqIdName(sid, seq);
 154           }
 155         }
 156     }
 157   }
 158
 159
 160   /**
 161    * generate word based keys for the given sequence
 162    *
 163    * @param seq
 164    * @return list of split keys
 165    */
 166   public List<SeqIdName> getWordsFor(SequenceI seq)
 167   {
 168     ArrayList<SeqIdName> keys = new ArrayList<SeqIdName>();
 169     String name = seq.getName(), limits = "/" + seq.getStart() + "-"
 170             + seq.getEnd();
 171     int namel = name.length();
 172     char[] sep = new char[WORD_SEP.length()];
 173     // find only the separators present in the ID.
 174     for (int i = 0; i < sep.length; i++)
 175     {
 176       sep[i] = WORD_SEP.charAt(i);
 177       if (seq.getName().indexOf("" + sep[i]) == -1)
 178       {
 179         sep[i] = 0;
 180       }
 181     }
 182     ;
 183     // make words
 184     for (int i = 0; i < sep.length; i++)
 185     {
 186       if (sep[i] > 0)
 187       {
 188         int p = 0, m = -1;
 189         while ((m = name.indexOf(sep[i], p)) > p)
 190         {
 191
 192           if (m > 0 && m - p > 5)
 193           {
 194             // split to end of word m with this delimiter
 195             keys.add(new SeqIdName(name.substring(p, m - 1) + limits));
 196           }
 197           if (namel - m > 5)
 198           {
 199             // index word after this delimiter m
 200             keys.add(new SeqIdName(name.substring(m + 1) + limits));
 201           }
 202           p = m + 1;
 203         }
 204         if (namel - p > 4)
 205         {
 206           // index word after this delimiter m
 207           keys.add(new SeqIdName(name.substring(p) + limits));
 208         }
 209       }
 210     }
 211     return keys;
 212   }
 213
 214   /**
 215    * convenience method to make a matcher from concrete array Note: in order to
 216    * support word based matching, use the fully qualified constructor
 217    *
 218    * @param sequences
 219    */
 220   public SequenceIdMatcher(SequenceI[] sequences)
 221   {
 222     this(Arrays.asList(sequences));
 223   }
 224
 225   /**
 226    * returns the closest SequenceI in matches to SeqIdName and returns all the
 227    * matches to the names hash.
 228    *
 229    * @param candName
 230    *          SeqIdName
 231    * @param matches
 232    *          List of SequenceI objects
 233    * @return SequenceI closest SequenceI to SeqIdName
 234    */
 235   private SequenceI pickbestMatch(SeqIdName candName,
 236           List<SequenceI> matches)
 237   {
 238     List<SequenceI> st = pickbestMatches(candName, matches);
 239     return st == null || st.size() == 0 ? null : st.get(0);
 240   }
 241
 242   /**
 243    * returns the closest SequenceI in matches to SeqIdName and returns all the
 244    * matches to the names hash.
 245    *
 246    * @param candName
 247    *          SeqIdName
 248    * @param matches
 249    *          Vector of SequenceI objects
 250    * @return Object[] { SequenceI closest SequenceI to SeqIdName, SequenceI[]
 251    *         ties }
 252    */
 253   private List<SequenceI> pickbestMatches(SeqIdName candName,
 254           List<SequenceI> matches)
 255   {
 256     ArrayList<SequenceI> best = new ArrayList<SequenceI>();
 257     if (candName == null || matches == null || matches.size() == 0)
 258     {
 259       return null;
 260     }
 261     SequenceI match = matches.remove(0);
 262     best.add(match);
 263     addSeq(match);
 264     int matchlen = match.getName().length();
 265     int namlen = candName.id.length();
 266     while (matches.size() > 0)
 267     {
 268       // look through for a better one.
 269       SequenceI cand = matches.remove(0);
 270       addSeq(cand);
 271       int q, w, candlen = cand.getName().length();
 272       // keep the one with an id 'closer' to the given seqnam string
 273       boolean is_closer = ((q = Math.abs(matchlen - namlen)) > (w = Math
 274               .abs(candlen - namlen)) && candlen > matchlen);
 275       // if not closer, then check if current best is actually identical in case
 276       // as
 277       // well
 278       if (is_closer || (!candName.equalsCase(best.get(0).getName())))
 279       {
 280         best.clear();
 281         match = cand;
 282         matchlen = candlen;
 283         best.add(match);
 284       }
 285       if (q == w && candlen == matchlen)
 286       {
 287         // equivalently good, and matches with case as well. so
 288         // record any ties
 289         best.add(cand);
 290       }
 291     }
 292     if (best.size() == 0)
 293     {
 294       return null;
 295     }
 296     ;
 297     return best;
 298   }
 299
 300   /**
 301    * get SequenceI with closest SequenceI.getName() to seq.getName()
 302    *
 303    * @param seq
 304    *          SequenceI
 305    * @return SequenceI
 306    */
 307   public SequenceI findIdMatch(SequenceI seq)
 308   {
 309     SeqIdName nam = new SeqIdName(seq.getName());
 310     return findIdMatch(nam);
 311   }
 312
 313   public SequenceI findIdMatch(String seqnam)
 314   {
 315     SeqIdName nam = new SeqIdName(seqnam);
 316     return findIdMatch(nam);
 317   }
 318
 319   /**
 320    * Find all matches for a given sequence name.
 321    *
 322    * @param seqnam
 323    *          string to query Matcher with.
 324    * @return a new array or (possibly) null
 325    */
 326   public SequenceI[] findAllIdMatches(String seqnam)
 327   {
 328
 329     SeqIdName nam = new SeqIdName(seqnam);
 330     List<SequenceI> m = findAllIdMatches(nam);
 331     if (m != null)
 332     {
 333       return m.toArray(new SequenceI[m.size()]);
 334     }
 335     return null;
 336   }
 337
 338   /**
 339    * findIdMatch
 340    *
 341    * Return pointers to sequences (or sequence object containers) which have
 342    * same Id as a given set of different sequence objects
 343    *
 344    * @param seqs
 345    *          SequenceI[]
 346    * @return SequenceI[]
 347    */
 348   public SequenceI[] findIdMatch(SequenceI[] seqs)
 349   {
 350     SequenceI[] namedseqs = null;
 351     int i = 0;
 352     SeqIdName nam;
 353
 354     if (seqs.length > 0)
 355     {
 356       namedseqs = new SequenceI[seqs.length];
 357       do
 358       {
 359         nam = new SeqIdName(seqs[i].getName());
 360
 361         if (names.containsKey(nam))
 362         {
 363           namedseqs[i] = findIdMatch(nam);
 364         }
 365         else
 366         {
 367           namedseqs[i] = null;
 368         }
 369       } while (++i < seqs.length);
 370     }
 371
 372     return namedseqs;
 373   }
 374
 375   /**
 376    * core findIdMatch search method
 377    *
 378    * @param nam
 379    *          SeqIdName
 380    * @return SequenceI
 381    */
 382   private SequenceI findIdMatch(
 383           jalview.analysis.SequenceIdMatcher.SeqIdName nam)
 384   {
 385     ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
 386     while (names.containsKey(nam))
 387     {
 388       matches.addAll(names.remove(nam));
 389     }
 390     return pickbestMatch(nam, matches);
 391   }
 392
 393   /**
 394    * core findIdMatch search method for finding all equivalent matches
 395    *
 396    * @param nam
 397    *          SeqIdName
 398    * @return SequenceI[]
 399    */
 400   private List<SequenceI> findAllIdMatches(
 401           jalview.analysis.SequenceIdMatcher.SeqIdName nam)
 402   {
 403     ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
 404     while (names.containsKey(nam))
 405     {
 406       matches.addAll(names.remove(nam));
 407     }
 408     List<SequenceI> r = pickbestMatches(nam, matches);
 409     return r;
 410   }
 411
 412   private class SeqIdName
 413   {
 414     String id, origid;
 415
 416     SeqIdName(String s)
 417     {
 418       if (s != null)
 419       {
 420         id = new String(s).toLowerCase();
 421         origid = new String(s);
 422       }
 423       else
 424       {
 425         id = "";
 426       }
 427     }
 428
 429     @Override
 430     public int hashCode()
 431     {
 432       return ((id.length() >= 4) ? id.substring(0, 4).hashCode() : id
 433               .hashCode());
 434     }
 435
 436     @Override
 437     public boolean equals(Object s)
 438     {
 439       if (s == null)
 440       {
 441         return false;
 442       }
 443       if (s instanceof SeqIdName)
 444       {
 445         return this.equals((SeqIdName) s);
 446       }
 447       else
 448       {
 449         if (s instanceof String)
 450         {
 451           return this.equals((String) s);
 452         }
 453       }
 454
 455       return false;
 456     }
 457
 458     /**
 459      * matches if one ID properly contains another at a whitespace boundary.
 460      * TODO: (JBPNote) These are not efficient. should use char[] for speed
 461      * todo: (JBPNote) Set separator characters appropriately
 462      *
 463      * @param s
 464      *          SeqIdName
 465      * @return boolean
 466      */
 467     public boolean equals(SeqIdName s)
 468     {
 469       // TODO: JAL-732 patch for cases when name includes a list of IDs, and the
 470       // match contains one ID flanked
 471       if (id.length() > s.id.length())
 472       {
 473         return check_wordmatch(id, s.id);
 474       }
 475       else
 476       {
 477         return check_wordmatch(s.id, id);
 478       }
 479     }
 480
 481     private boolean check_wordmatch(String longer, String shorter)
 482     {
 483       boolean elen = longer.length() == shorter.length();
 484       int sp = longer.indexOf(shorter);
 485       if (sp == -1)
 486       {
 487         return false;
 488       }
 489
 490       if (sp == 0)
 491       {
 492         // end of match is word boundary
 493         return elen ? true : (WORD_SEP.indexOf(longer.charAt(shorter
 494                 .length() + sp)) > -1);
 495       }
 496       if (WORD_SEP.indexOf(longer.charAt(sp - 1)) > -1)
 497       {
 498         if (sp + shorter.length() == longer.length())
 499         {
 500           return true;
 501         }
 502         else
 503         {
 504           // end of match is word boundary
 505           return elen ? false
 506                   : sp + shorter.length() == longer.length() ? true
 507                           : (WORD_SEP.indexOf(longer.charAt(shorter
 508                                   .length() + sp)) > -1);
 509         }
 510       }
 511       else
 512       {
 513         // prefix of match is not a word boundary
 514         return false;
 515       }
 516     }
 517
 518     public boolean equals(String s)
 519     {
 520       s = s.toLowerCase(); // TODO: employ faster to lower case operation
 521       if (id.length() > s.length())
 522       {
 523         return check_wordmatch(id, s);
 524       }
 525       else
 526       {
 527         return check_wordmatch(s, id);
 528       }
 529     }
 530
 531     @Override
 532     public String toString()
 533     {
 534       return id;
 535     }
 536
 537     public boolean equalsCase(String s)
 538     {
 539       if (origid.length() > s.length())
 540       {
 541         return check_wordmatch(origid, s);
 542       }
 543       else
 544       {
 545         return check_wordmatch(s, origid);
 546       }
 547     }
 548
 549     public boolean equalsCase(SeqIdName sid)
 550     {
 551       return equalsCase(sid.origid);
 552     }
 553   }
 554 }