src/jalview/datamodel/ResidueCount.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.datamodel;
  22
  23 import jalview.util.Comparison;
  24 import jalview.util.Format;
  25 import jalview.util.QuickSort;
  26 import jalview.util.SparseCount;
  27
  28 /**
  29  * A class to count occurrences of residues in a profile, optimised for speed
  30  * and memory footprint.
  31  * @author gmcarstairs
  32  *
  33  */
  34 public class ResidueCount
  35 {
  36   /**
  37    * A data bean to hold the results of counting symbols
  38    */
  39   public class SymbolCounts
  40   {
  41     /**
  42      * the symbols seen (as char values), in no particular order
  43      */
  44     public final char[] symbols;
  45
  46     /**
  47      * the counts for each symbol, in the same order as the symbols
  48      */
  49     public final int[] values;
  50
  51     SymbolCounts(char[] s, int[] v)
  52     {
  53       symbols = s;
  54       values = v;
  55     }
  56   }
  57
  58   private static final int TOUPPERCASE = 'A' - 'a';
  59
  60   /*
  61    * nucleotide symbols to count (including N unknown)
  62    */
  63   private static final String NUCS = "ACGNTU";
  64
  65   /*
  66    * amino acid symbols to count (including X unknown)
  67    * NB we also include U so as to support counting of RNA bases
  68    * in the "don't know" case of nucleotide / peptide
  69    */
  70   private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
  71
  72   private static final int GAP_COUNT = 0;
  73
  74   /*
  75    * fast lookup tables holding the index into our count
  76    * arrays of each symbol; index 0 is reserved for gap counting
  77    */
  78   private static int[] NUC_INDEX = new int[26];
  79
  80   private static int[] AA_INDEX = new int[26];
  81   static
  82   {
  83     for (int i = 0; i < NUCS.length(); i++)
  84     {
  85       NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
  86     }
  87     for (int i = 0; i < AAS.length(); i++)
  88     {
  89       AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
  90     }
  91   }
  92
  93   /*
  94    * counts array, just big enough for the nucleotide or peptide
  95    * character set (plus gap counts in position 0)
  96    */
  97   private short[] counts;
  98
  99   /*
 100    * alternative array of int counts for use if any count
 101    * exceeds the maximum value of short (32767)
 102    */
 103   private int[] intCounts;
 104
 105   /*
 106    * flag set if we switch from short to int counts
 107    */
 108   private boolean useIntCounts;
 109
 110   /*
 111    * general-purpose counter, only for use for characters
 112    * that are not in the expected alphabet
 113    */
 114   private SparseCount otherData;
 115
 116   /*
 117    * keeps track of the maximum count value recorded
 118    * (if this class ever allows decrements, would need to
 119    * calculate this on request instead)
 120    */
 121   int maxCount;
 122
 123   /*
 124    * if we think we are counting nucleotide, can get by with smaller
 125    * array to hold counts
 126    */
 127   private boolean isNucleotide;
 128
 129   /**
 130    * Default constructor allocates arrays able to count either nucleotide or
 131    * peptide bases. Use this constructor if not sure which the data is.
 132    */
 133   public ResidueCount()
 134   {
 135     this(false);
 136   }
 137
 138   /**
 139    * Constructor that allocates an array just big enough for the anticipated
 140    * characters, plus one position to count gaps
 141    */
 142   public ResidueCount(boolean nucleotide)
 143   {
 144     isNucleotide = nucleotide;
 145     int charsToCount = nucleotide ? NUCS.length() : AAS.length();
 146     counts = new short[charsToCount + 1];
 147   }
 148
 149   /**
 150    * Increments the count for the given character. The supplied character may be
 151    * upper or lower case but counts are for the upper case only. Gap characters
 152    * (space, ., -) are all counted together.
 153    *
 154    * @param c
 155    * @return the new value of the count for the character
 156    */
 157   public int add(final char c)
 158   {
 159     char u = toUpperCase(c);
 160     int newValue = 0;
 161     int offset = getOffset(u);
 162
 163     /*
 164      * offset 0 is reserved for gap counting, so 0 here means either
 165      * an unexpected character, or a gap character passed in error
 166      */
 167     if (offset == 0)
 168     {
 169       if (Comparison.isGap(u))
 170       {
 171         newValue = addGap();
 172       }
 173       else
 174       {
 175         newValue = addOtherCharacter(u);
 176       }
 177     }
 178     else
 179     {
 180       newValue = increment(offset);
 181     }
 182     return newValue;
 183   }
 184
 185   /**
 186    * Increment the count at the specified offset. If this would result in short
 187    * overflow, promote to counting int values instead.
 188    *
 189    * @param offset
 190    * @return the new value of the count at this offset
 191    */
 192   int increment(int offset)
 193   {
 194     int newValue = 0;
 195     if (useIntCounts)
 196     {
 197       newValue = intCounts[offset];
 198       intCounts[offset] = ++newValue;
 199     }
 200     else
 201     {
 202       if (counts[offset] == Short.MAX_VALUE)
 203       {
 204         handleOverflow();
 205         newValue = intCounts[offset];
 206         intCounts[offset] = ++newValue;
 207       }
 208       else
 209       {
 210         newValue = counts[offset];
 211         counts[offset] = (short) ++newValue;
 212       }
 213     }
 214     maxCount = Math.max(maxCount, newValue);
 215     return newValue;
 216   }
 217
 218   /**
 219    * Switch from counting in short to counting in int
 220    */
 221   synchronized void handleOverflow()
 222   {
 223     intCounts = new int[counts.length];
 224     for (int i = 0; i < counts.length; i++)
 225     {
 226       intCounts[i] = counts[i];
 227     }
 228     counts = null;
 229     useIntCounts = true;
 230   }
 231
 232   /**
 233    * Returns this character's offset in the count array
 234    *
 235    * @param c
 236    * @return
 237    */
 238   int getOffset(char c)
 239   {
 240     int offset = 0;
 241     if ('A' <= c && c <= 'Z')
 242     {
 243       offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
 244     }
 245     return offset;
 246   }
 247
 248   /**
 249    * @param c
 250    * @return
 251    */
 252   protected char toUpperCase(final char c)
 253   {
 254     char u = c;
 255     if ('a' <= c && c <= 'z')
 256     {
 257       u = (char) (c + TOUPPERCASE);
 258     }
 259     return u;
 260   }
 261
 262   /**
 263    * Increment count for some unanticipated character. The first time this
 264    * called, a SparseCount is instantiated to hold these 'extra' counts.
 265    *
 266    * @param c
 267    * @return the new value of the count for the character
 268    */
 269   int addOtherCharacter(char c)
 270   {
 271     if (otherData == null)
 272     {
 273       otherData = new SparseCount();
 274     }
 275     int newValue = otherData.add(c, 1);
 276     maxCount = Math.max(maxCount, newValue);
 277     return newValue;
 278   }
 279
 280   /**
 281    * Set count for some unanticipated character. The first time this called, a
 282    * SparseCount is instantiated to hold these 'extra' counts.
 283    *
 284    * @param c
 285    * @param value
 286    */
 287   void setOtherCharacter(char c, int value)
 288   {
 289     if (otherData == null)
 290     {
 291       otherData = new SparseCount();
 292     }
 293     otherData.put(c, value);
 294   }
 295
 296   /**
 297    * Increment count of gap characters
 298    *
 299    * @return the new count of gaps
 300    */
 301   public int addGap()
 302   {
 303     int newValue;
 304     if (useIntCounts)
 305     {
 306       newValue = ++intCounts[GAP_COUNT];
 307     }
 308     else
 309     {
 310       newValue = ++counts[GAP_COUNT];
 311     }
 312     return newValue;
 313   }
 314
 315   /**
 316    * Answers true if we are counting ints (only after overflow of short counts)
 317    *
 318    * @return
 319    */
 320   boolean isCountingInts()
 321   {
 322     return useIntCounts;
 323   }
 324
 325   /**
 326    * Sets the count for the given character. The supplied character may be upper
 327    * or lower case but counts are for the upper case only.
 328    *
 329    * @param c
 330    * @param count
 331    */
 332   public void put(char c, int count)
 333   {
 334     char u = toUpperCase(c);
 335     int offset = getOffset(u);
 336
 337     /*
 338      * offset 0 is reserved for gap counting, so 0 here means either
 339      * an unexpected character, or a gap character passed in error
 340      */
 341     if (offset == 0)
 342     {
 343       if (Comparison.isGap(u))
 344       {
 345         set(0, count);
 346       }
 347       else
 348       {
 349         setOtherCharacter(u, count);
 350         maxCount = Math.max(maxCount, count);
 351       }
 352     }
 353     else
 354     {
 355       set(offset, count);
 356       maxCount = Math.max(maxCount, count);
 357     }
 358   }
 359
 360   /**
 361    * Sets the count at the specified offset. If this would result in short
 362    * overflow, promote to counting int values instead.
 363    *
 364    * @param offset
 365    * @param value
 366    */
 367   void set(int offset, int value)
 368   {
 369     if (useIntCounts)
 370     {
 371       intCounts[offset] = value;
 372     }
 373     else
 374     {
 375       if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
 376       {
 377         handleOverflow();
 378         intCounts[offset] = value;
 379       }
 380       else
 381       {
 382         counts[offset] = (short) value;
 383       }
 384     }
 385   }
 386
 387   /**
 388    * Returns the count for the given character, or zero if no count held
 389    *
 390    * @param c
 391    * @return
 392    */
 393   public int getCount(char c)
 394   {
 395     char u = toUpperCase(c);
 396     int offset = getOffset(u);
 397     if (offset == 0)
 398     {
 399       if (!Comparison.isGap(u))
 400       {
 401         // should have called getGapCount()
 402         return otherData == null ? 0 : otherData.get(u);
 403       }
 404     }
 405     return useIntCounts ? intCounts[offset] : counts[offset];
 406   }
 407
 408   public int getGapCount()
 409   {
 410     return useIntCounts ? intCounts[0] : counts[0];
 411   }
 412
 413   /**
 414    * Answers true if this object wraps a counter for unexpected characters
 415    *
 416    * @return
 417    */
 418   boolean isUsingOtherData()
 419   {
 420     return otherData != null;
 421   }
 422
 423   /**
 424    * Returns the character (or concatenated characters) for the symbol(s) with
 425    * the given count in the profile. Can be used to get the modal residue by
 426    * supplying the modal count value. Returns an empty string if no symbol has
 427    * the given count. The symbols are in alphabetic order of standard peptide or
 428    * nucleotide characters, followed by 'other' symbols if any.
 429    *
 430    * @return
 431    */
 432   public String getResiduesForCount(int count)
 433   {
 434     if (count == 0)
 435     {
 436       return "";
 437     }
 438
 439     /*
 440      * find counts for the given value and append the
 441      * corresponding symbol
 442      */
 443     StringBuilder modal = new StringBuilder();
 444     if (useIntCounts)
 445     {
 446       for (int i = 1; i < intCounts.length; i++)
 447       {
 448         if (intCounts[i] == count)
 449         {
 450           modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
 451                   .charAt(i - 1));
 452         }
 453       }
 454     }
 455     else
 456     {
 457       for (int i = 1; i < counts.length; i++)
 458       {
 459         if (counts[i] == count)
 460         {
 461           modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
 462                   .charAt(i - 1));
 463         }
 464       }
 465     }
 466     if (otherData != null)
 467     {
 468       for (int i = 0; i < otherData.size(); i++)
 469       {
 470         if (otherData.valueAt(i) == count)
 471         {
 472           modal.append((char) otherData.keyAt(i));
 473         }
 474       }
 475     }
 476     return modal.toString();
 477   }
 478
 479   /**
 480    * Returns the highest count for any symbol(s) in the profile (excluding gap)
 481    *
 482    * @return
 483    */
 484   public int getModalCount()
 485   {
 486     return maxCount;
 487   }
 488
 489   /**
 490    * Returns the number of distinct symbols with a non-zero count (excluding the
 491    * gap symbol)
 492    *
 493    * @return
 494    */
 495   public int size() {
 496     int size = 0;
 497     if (useIntCounts)
 498     {
 499       for (int i = 1; i < intCounts.length; i++)
 500       {
 501         if (intCounts[i] > 0)
 502         {
 503           size++;
 504         }
 505       }
 506     }
 507     else
 508     {
 509       for (int i = 1; i < counts.length; i++)
 510       {
 511         if (counts[i] > 0)
 512         {
 513           size++;
 514         }
 515       }
 516     }
 517
 518     /*
 519      * include 'other' characters recorded (even if count is zero
 520      * though that would be a strange use case)
 521      */
 522     if (otherData != null)
 523     {
 524       size += otherData.size();
 525     }
 526
 527     return size;
 528   }
 529
 530   /**
 531    * Returns a data bean holding those symbols that have a non-zero count
 532    * (excluding the gap symbol), with their counts.
 533    *
 534    * @return
 535    */
 536   public SymbolCounts getSymbolCounts()
 537   {
 538     int size = size();
 539     char[] symbols = new char[size];
 540     int[] values = new int[size];
 541     int j = 0;
 542
 543     if (useIntCounts)
 544     {
 545       for (int i = 1; i < intCounts.length; i++)
 546       {
 547         if (intCounts[i] > 0)
 548         {
 549           char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
 550                   .charAt(i - 1);
 551           symbols[j] = symbol;
 552           values[j] = intCounts[i];
 553           j++;
 554         }
 555       }
 556     }
 557     else
 558     {
 559       for (int i = 1; i < counts.length; i++)
 560       {
 561         if (counts[i] > 0)
 562         {
 563           char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
 564                   .charAt(i - 1);
 565           symbols[j] = symbol;
 566           values[j] = counts[i];
 567           j++;
 568         }
 569       }
 570     }
 571     if (otherData != null)
 572     {
 573       for (int i = 0; i < otherData.size(); i++)
 574       {
 575         symbols[j] = (char) otherData.keyAt(i);
 576         values[j] = otherData.valueAt(i);
 577         j++;
 578       }
 579     }
 580
 581     return new SymbolCounts(symbols, values);
 582   }
 583
 584   /**
 585    * Returns a tooltip string showing residues in descending order of their
 586    * percentage frequency in the profile
 587    *
 588    * @param normaliseBy
 589    *          the divisor for residue counts (may or may not include gapped
 590    *          sequence count)
 591    * @param percentageDecPl
 592    *          the number of decimal places to show in percentages
 593    * @return
 594    */
 595   public String getTooltip(int normaliseBy, int percentageDecPl)
 596   {
 597     SymbolCounts symbolCounts = getSymbolCounts();
 598     char[] ca = symbolCounts.symbols;
 599     int[] vl = symbolCounts.values;
 600
 601     /*
 602      * sort characters into ascending order of their counts
 603      */
 604     QuickSort.sort(vl, ca);
 605
 606     /*
 607      * traverse in reverse order (highest count first) to build tooltip
 608      */
 609     boolean first = true;
 610     StringBuilder sb = new StringBuilder(64);
 611     for (int c = ca.length - 1; c >= 0; c--)
 612     {
 613       final char residue = ca[c];
 614       // TODO combine residues which share a percentage
 615       // (see AAFrequency.completeCdnaConsensus)
 616       float tval = (vl[c] * 100f) / normaliseBy;
 617       sb.append(first ? "" : "; ").append(residue).append(" ");
 618       Format.appendPercentage(sb, tval, percentageDecPl);
 619       sb.append("%");
 620       first = false;
 621     }
 622     return sb.toString();
 623   }
 624
 625   /**
 626    * Returns a string representation of the symbol counts, for debug purposes.
 627    */
 628   @Override
 629   public String toString()
 630   {
 631     StringBuilder sb = new StringBuilder();
 632     sb.append("[ ");
 633     SymbolCounts sc = getSymbolCounts();
 634     for (int i = 0; i < sc.symbols.length; i++)
 635     {
 636       sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
 637     }
 638     sb.append("]");
 639     return sb.toString();
 640   }
 641 }