src/jalview/analysis/SequenceIdMatcher.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.DBRefEntry;
  24 import jalview.datamodel.SequenceI;
  25
  26 import java.util.ArrayList;
  27 import java.util.Arrays;
  28 import java.util.HashMap;
  29 import java.util.HashSet;
  30 import java.util.List;
  31 import java.util.Set;
  32
  33 /**
  34  * Routines for approximate Sequence Id resolution by name using string
  35  * containment (on word boundaries) rather than equivalence. It also attempts to
   36  * resolve ties where no exact match is available by picking the id closest
  37  * to the query.
  38  */
  39 public class SequenceIdMatcher
  40 {
  41   /**
  42    * weak hash for each sequence
  43    */
  44   private HashMap<SeqIdName, Set<SequenceI>> names;
  45
  46   // /**
  47   // * cache of values removed for each query string.
  48   // */
  49   // private HashMap<String, List<SequenceI>> resolved;
  50
  51   /**
  52    * do we index sequences on all 'words' in ID string ?
  53    */
  54   private boolean wordBased = false;
  55
  56   /**
  57    * Characters that define the end of a unique sequence ID at the beginning of
  58    * an arbitrary ID string JBPNote: This is a heuristic that will fail for
  59    * arbritrarily extended sequence id's (like portions of an aligned set of
  60    * repeats from one sequence)
  61    */
  62   private static String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
  63           + "$%^*)}[@',?_";
  64
  65   /**
  66    * @return true if matcher is word-based (ie string key matches one of the
  67    *         words within the body of one or more sequence IDs)
  68    */
  69   public boolean isWordBased()
  70   {
  71     return wordBased;
  72   }
  73
  /**
   * Creates a matcher over the given sequences with word based matching
   * switched off. Use the (boolean, List) constructor to enable word based
   * matching.
   *
   * @param seqs
   *          sequences to index
   */
  public SequenceIdMatcher(List<SequenceI> seqs)
  {
    this(false, seqs);
  }
  84
  /**
   * Creates a matcher for a set of sequences, configured as required.
   *
   * @param wordBasedMatch
   *          when true, sequences are additionally indexed on the words of
   *          their ID, so that "myseq" matches "X|myseq" as well as "myseq"
   * @param seqs
   *          sequences to index
   */
  public SequenceIdMatcher(boolean wordBasedMatch, List<SequenceI> seqs)
  {
    this.names = new HashMap<SeqIdName, Set<SequenceI>>();
    this.wordBased = wordBasedMatch;
    // index and matching mode must both be in place before anything is added
    this.addAll(seqs);
  }
  99
 100   /**
 101    * add more sequences to this matcher - also used by the constructor
 102    *
 103    * @param seqs
 104    */
 105   public void addAll(List<SequenceI> seqs)
 106   {
 107     for (SequenceI seq : seqs)
 108     {
 109       addSeq(seq);
 110     }
 111   }
 112
 113   private void addSeqIdName(SeqIdName idname, SequenceI seq)
 114   {
 115     Set<SequenceI> seqset = names.get(idname);
 116     if (seqset == null)
 117     {
 118       seqset = new HashSet<SequenceI>();
 119       names.put(idname, seqset);
 120     }
 121     seqset.add(seq);
 122   }
 123
 124   public void addSeq(SequenceI seq)
 125   {
 126     // TODO: deal with ID collisions - SequenceI should be appended to list
 127     // associated with this key.
 128     addSeqIdName(new SeqIdName(seq.getDisplayId(true)), seq);
 129     if (wordBased)
 130     {
 131       for (SeqIdName key : getWordsFor(seq))
 132       {
 133         addSeqIdName(key, seq);
 134       }
 135     }
 136     SequenceI dbseq = seq;
 137     // TODO add test for database xref resolution
 138     while (dbseq.getDatasetSequence() != null)
 139     {
 140       dbseq = dbseq.getDatasetSequence();
 141     }
 142     // add in any interesting identifiers
 143     if (dbseq.getDBRefs() != null)
 144     {
 145       DBRefEntry dbr[] = dbseq.getDBRefs();
 146       SeqIdName sid = null;
 147       for (int r = 0; r < dbr.length; r++)
 148       {
 149         sid = new SeqIdName(dbr[r].getAccessionId());
 150         if (!names.containsKey(sid))
 151         {
 152           addSeqIdName(sid, seq);
 153         }
 154       }
 155     }
 156   }
 157
 158   /**
 159    * generate word based keys for the given sequence
 160    *
 161    * @param seq
 162    * @return list of split keys
 163    */
 164   public List<SeqIdName> getWordsFor(SequenceI seq)
 165   {
 166     ArrayList<SeqIdName> keys = new ArrayList<SeqIdName>();
 167     String name = seq.getName(), limits = "/" + seq.getStart() + "-"
 168             + seq.getEnd();
 169     int namel = name.length();
 170     char[] sep = new char[WORD_SEP.length()];
 171     // find only the separators present in the ID.
 172     for (int i = 0; i < sep.length; i++)
 173     {
 174       sep[i] = WORD_SEP.charAt(i);
 175       if (seq.getName().indexOf("" + sep[i]) == -1)
 176       {
 177         sep[i] = 0;
 178       }
 179     }
 180     ;
 181     // make words
 182     for (int i = 0; i < sep.length; i++)
 183     {
 184       if (sep[i] > 0)
 185       {
 186         int p = 0, m = -1;
 187         while ((m = name.indexOf(sep[i], p)) > p)
 188         {
 189
 190           if (m > 0 && m - p > 5)
 191           {
 192             // split to end of word m with this delimiter
 193             keys.add(new SeqIdName(name.substring(p, m - 1) + limits));
 194           }
 195           if (namel - m > 5)
 196           {
 197             // index word after this delimiter m
 198             keys.add(new SeqIdName(name.substring(m + 1) + limits));
 199           }
 200           p = m + 1;
 201         }
 202         if (namel - p > 4)
 203         {
 204           // index word after this delimiter m
 205           keys.add(new SeqIdName(name.substring(p) + limits));
 206         }
 207       }
 208     }
 209     return keys;
 210   }
 211
 212   /**
 213    * convenience method to make a matcher from concrete array Note: in order to
 214    * support word based matching, use the fully qualified constructor
 215    *
 216    * @param sequences
 217    */
 218   public SequenceIdMatcher(SequenceI[] sequences)
 219   {
 220     this(Arrays.asList(sequences));
 221   }
 222
 223   /**
 224    * returns the closest SequenceI in matches to SeqIdName and returns all the
 225    * matches to the names hash.
 226    *
 227    * @param candName
 228    *          SeqIdName
 229    * @param matches
 230    *          List of SequenceI objects
 231    * @return SequenceI closest SequenceI to SeqIdName
 232    */
 233   private SequenceI pickbestMatch(SeqIdName candName,
 234           List<SequenceI> matches)
 235   {
 236     List<SequenceI> st = pickbestMatches(candName, matches);
 237     return st == null || st.size() == 0 ? null : st.get(0);
 238   }
 239
 240   /**
 241    * returns the closest SequenceI in matches to SeqIdName and returns all the
 242    * matches to the names hash.
 243    *
 244    * @param candName
 245    *          SeqIdName
 246    * @param matches
 247    *          Vector of SequenceI objects
 248    * @return Object[] { SequenceI closest SequenceI to SeqIdName, SequenceI[]
 249    *         ties }
 250    */
 251   private List<SequenceI> pickbestMatches(SeqIdName candName,
 252           List<SequenceI> matches)
 253   {
 254     ArrayList<SequenceI> best = new ArrayList<SequenceI>();
 255     if (candName == null || matches == null || matches.size() == 0)
 256     {
 257       return null;
 258     }
 259     SequenceI match = matches.remove(0);
 260     best.add(match);
 261     addSeq(match);
 262     int matchlen = match.getName().length();
 263     int namlen = candName.id.length();
 264     while (matches.size() > 0)
 265     {
 266       // look through for a better one.
 267       SequenceI cand = matches.remove(0);
 268       addSeq(cand);
 269       int q, w, candlen = cand.getName().length();
 270       // keep the one with an id 'closer' to the given seqnam string
 271       boolean is_closer = ((q = Math.abs(matchlen - namlen)) > (w = Math
 272               .abs(candlen - namlen)) && candlen > matchlen);
 273       // if not closer, then check if current best is actually identical in case
 274       // as
 275       // well
 276       if (is_closer || (!candName.equalsCase(best.get(0).getName())))
 277       {
 278         best.clear();
 279         match = cand;
 280         matchlen = candlen;
 281         best.add(match);
 282       }
 283       if (q == w && candlen == matchlen)
 284       {
 285         // equivalently good, and matches with case as well. so
 286         // record any ties
 287         best.add(cand);
 288       }
 289     }
 290     if (best.size() == 0)
 291     {
 292       return null;
 293     }
 294     ;
 295     return best;
 296   }
 297
 298   /**
 299    * get SequenceI with closest SequenceI.getName() to seq.getName()
 300    *
 301    * @param seq
 302    *          SequenceI
 303    * @return SequenceI
 304    */
 305   public SequenceI findIdMatch(SequenceI seq)
 306   {
 307     SeqIdName nam = new SeqIdName(seq.getName());
 308     return findIdMatch(nam);
 309   }
 310
 311   public SequenceI findIdMatch(String seqnam)
 312   {
 313     SeqIdName nam = new SeqIdName(seqnam);
 314     return findIdMatch(nam);
 315   }
 316
 317   /**
 318    * Find all matches for a given sequence name.
 319    *
 320    * @param seqnam
 321    *          string to query Matcher with.
 322    * @return a new array or (possibly) null
 323    */
 324   public SequenceI[] findAllIdMatches(String seqnam)
 325   {
 326
 327     SeqIdName nam = new SeqIdName(seqnam);
 328     List<SequenceI> m = findAllIdMatches(nam);
 329     if (m != null)
 330     {
 331       return m.toArray(new SequenceI[m.size()]);
 332     }
 333     return null;
 334   }
 335
 336   /**
 337    * findIdMatch
 338    *
 339    * Return pointers to sequences (or sequence object containers) which have
 340    * same Id as a given set of different sequence objects
 341    *
 342    * @param seqs
 343    *          SequenceI[]
 344    * @return SequenceI[]
 345    */
 346   public SequenceI[] findIdMatch(SequenceI[] seqs)
 347   {
 348     SequenceI[] namedseqs = null;
 349     int i = 0;
 350     SeqIdName nam;
 351
 352     if (seqs.length > 0)
 353     {
 354       namedseqs = new SequenceI[seqs.length];
 355       do
 356       {
 357         nam = new SeqIdName(seqs[i].getName());
 358
 359         if (names.containsKey(nam))
 360         {
 361           namedseqs[i] = findIdMatch(nam);
 362         }
 363         else
 364         {
 365           namedseqs[i] = null;
 366         }
 367       } while (++i < seqs.length);
 368     }
 369
 370     return namedseqs;
 371   }
 372
 373   /**
 374    * core findIdMatch search method
 375    *
 376    * @param nam
 377    *          SeqIdName
 378    * @return SequenceI
 379    */
 380   private SequenceI findIdMatch(
 381           jalview.analysis.SequenceIdMatcher.SeqIdName nam)
 382   {
 383     ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
 384     while (names.containsKey(nam))
 385     {
 386       matches.addAll(names.remove(nam));
 387     }
 388     return pickbestMatch(nam, matches);
 389   }
 390
 391   /**
 392    * core findIdMatch search method for finding all equivalent matches
 393    *
 394    * @param nam
 395    *          SeqIdName
 396    * @return SequenceI[]
 397    */
 398   private List<SequenceI> findAllIdMatches(
 399           jalview.analysis.SequenceIdMatcher.SeqIdName nam)
 400   {
 401     ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
 402     while (names.containsKey(nam))
 403     {
 404       matches.addAll(names.remove(nam));
 405     }
 406     List<SequenceI> r = pickbestMatches(nam, matches);
 407     return r;
 408   }
 409
 410   private class SeqIdName
 411   {
 412     String id, origid;
 413
 414     SeqIdName(String s)
 415     {
 416       if (s != null)
 417       {
 418         id = new String(s).toLowerCase();
 419         origid = new String(s);
 420       }
 421       else
 422       {
 423         id = "";
 424       }
 425     }
 426
 427     @Override
 428     public int hashCode()
 429     {
 430       return ((id.length() >= 4) ? id.substring(0, 4).hashCode() : id
 431               .hashCode());
 432     }
 433
 434     @Override
 435     public boolean equals(Object s)
 436     {
 437       if (s == null)
 438       {
 439         return false;
 440       }
 441       if (s instanceof SeqIdName)
 442       {
 443         return this.equals((SeqIdName) s);
 444       }
 445       else
 446       {
 447         if (s instanceof String)
 448         {
 449           return this.equals((String) s);
 450         }
 451       }
 452
 453       return false;
 454     }
 455
 456     /**
 457      * matches if one ID properly contains another at a whitespace boundary.
 458      * TODO: (JBPNote) These are not efficient. should use char[] for speed
 459      * todo: (JBPNote) Set separator characters appropriately
 460      *
 461      * @param s
 462      *          SeqIdName
 463      * @return boolean
 464      */
 465     public boolean equals(SeqIdName s)
 466     {
 467       // TODO: JAL-732 patch for cases when name includes a list of IDs, and the
 468       // match contains one ID flanked
 469       if (id.length() > s.id.length())
 470       {
 471         return check_wordmatch(id, s.id);
 472       }
 473       else
 474       {
 475         return check_wordmatch(s.id, id);
 476       }
 477     }
 478
 479     private boolean check_wordmatch(String longer, String shorter)
 480     {
 481       boolean elen = longer.length() == shorter.length();
 482       int sp = longer.indexOf(shorter);
 483       if (sp == -1)
 484       {
 485         return false;
 486       }
 487
 488       if (sp == 0)
 489       {
 490         // end of match is word boundary
 491         return elen ? true : (WORD_SEP.indexOf(longer.charAt(shorter
 492                 .length() + sp)) > -1);
 493       }
 494       if (WORD_SEP.indexOf(longer.charAt(sp - 1)) > -1)
 495       {
 496         if (sp + shorter.length() == longer.length())
 497         {
 498           return true;
 499         }
 500         else
 501         {
 502           // end of match is word boundary
 503           return elen ? false
 504                   : sp + shorter.length() == longer.length() ? true
 505                           : (WORD_SEP.indexOf(longer.charAt(shorter
 506                                   .length() + sp)) > -1);
 507         }
 508       }
 509       else
 510       {
 511         // prefix of match is not a word boundary
 512         return false;
 513       }
 514     }
 515
 516     public boolean equals(String s)
 517     {
 518       s = s.toLowerCase(); // TODO: employ faster to lower case operation
 519       if (id.length() > s.length())
 520       {
 521         return check_wordmatch(id, s);
 522       }
 523       else
 524       {
 525         return check_wordmatch(s, id);
 526       }
 527     }
 528
 529     @Override
 530     public String toString()
 531     {
 532       return id;
 533     }
 534
 535     public boolean equalsCase(String s)
 536     {
 537       if (origid.length() > s.length())
 538       {
 539         return check_wordmatch(origid, s);
 540       }
 541       else
 542       {
 543         return check_wordmatch(s, origid);
 544       }
 545     }
 546
 547     public boolean equalsCase(SeqIdName sid)
 548     {
 549       return equalsCase(sid.origid);
 550     }
 551   }
 552 }