src/jalview/datamodel/ResidueCount.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.datamodel;
  22
  23 import jalview.util.Comparison;
  24 import jalview.util.Format;
  25 import jalview.util.QuickSort;
  26 import jalview.util.SparseCount;
  27
  28 /**
  29  * A class to count occurrences of residues in a profile, optimised for speed
  30  * and memory footprint.
  31  *
  32  * @author gmcarstairs
  33  *
  34  */
  35 public class ResidueCount
  36 {
  37   /**
  38    * A data bean to hold the results of counting symbols
  39    */
  40   public class SymbolCounts
  41   {
  42     /**
  43      * the symbols seen (as char values), in no particular order
  44      */
  45     public final char[] symbols;
  46
  47     /**
  48      * the counts for each symbol, in the same order as the symbols
  49      */
  50     public final int[] values;
  51
  52     SymbolCounts(char[] s, int[] v)
  53     {
  54       symbols = s;
  55       values = v;
  56     }
  57   }
  58
  59   private static final int TOUPPERCASE = 'A' - 'a';
  60
  61   /*
  62    * nucleotide symbols to count (including N unknown)
  63    */
  64   private static final String NUCS = "ACGNTU";
  65
  66   /*
  67    * amino acid symbols to count (including X unknown)
  68    * NB we also include U so as to support counting of RNA bases
  69    * in the "don't know" case of nucleotide / peptide
  70    */
  71   private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
  72
  73   private static final int GAP_COUNT = 0;
  74
  75   /*
  76    * fast lookup tables holding the index into our count
  77    * arrays of each symbol; index 0 is reserved for gap counting
  78    */
  79   private static int[] NUC_INDEX = new int[26];
  80
  81   private static int[] AA_INDEX = new int[26];
  82   static
  83   {
  84     for (int i = 0; i < NUCS.length(); i++)
  85     {
  86       NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
  87     }
  88     for (int i = 0; i < AAS.length(); i++)
  89     {
  90       AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
  91     }
  92   }
  93
  94   /*
  95    * counts array, just big enough for the nucleotide or peptide
  96    * character set (plus gap counts in position 0)
  97    */
  98   private short[] counts;
  99
 100   /*
 101    * alternative array of int counts for use if any count
 102    * exceeds the maximum value of short (32767)
 103    */
 104   private int[] intCounts;
 105
 106   /*
 107    * flag set if we switch from short to int counts
 108    */
 109   private boolean useIntCounts;
 110
 111   /*
 112    * general-purpose counter, only for use for characters
 113    * that are not in the expected alphabet
 114    */
 115   private SparseCount otherData;
 116
 117   /*
 118    * keeps track of the maximum count value recorded
 119    * (if this class ever allows decrements, would need to
 120    * calculate this on request instead)
 121    */
 122   int maxCount;
 123
 124   /*
 125    * if we think we are counting nucleotide, can get by with smaller
 126    * array to hold counts
 127    */
 128   private boolean isNucleotide;
 129
 130   /**
 131    * Default constructor allocates arrays able to count either nucleotide or
 132    * peptide bases. Use this constructor if not sure which the data is.
 133    */
 134   public ResidueCount()
 135   {
 136     this(false);
 137   }
 138
 139   /**
 140    * Constructor that allocates an array just big enough for the anticipated
 141    * characters, plus one position to count gaps
 142    */
 143   public ResidueCount(boolean nucleotide)
 144   {
 145     isNucleotide = nucleotide;
 146     int charsToCount = nucleotide ? NUCS.length() : AAS.length();
 147     counts = new short[charsToCount + 1];
 148   }
 149
 150   /**
 151    * Increments the count for the given character. The supplied character may be
 152    * upper or lower case but counts are for the upper case only. Gap characters
 153    * (space, ., -) are all counted together.
 154    *
 155    * @param c
 156    * @return the new value of the count for the character
 157    */
 158   public int add(final char c)
 159   {
 160     char u = toUpperCase(c);
 161     int newValue = 0;
 162     int offset = getOffset(u);
 163
 164     /*
 165      * offset 0 is reserved for gap counting, so 0 here means either
 166      * an unexpected character, or a gap character passed in error
 167      */
 168     if (offset == 0)
 169     {
 170       if (Comparison.isGap(u))
 171       {
 172         newValue = addGap();
 173       }
 174       else
 175       {
 176         newValue = addOtherCharacter(u);
 177       }
 178     }
 179     else
 180     {
 181       newValue = increment(offset);
 182     }
 183     return newValue;
 184   }
 185
 186   /**
 187    * Increment the count at the specified offset. If this would result in short
 188    * overflow, promote to counting int values instead.
 189    *
 190    * @param offset
 191    * @return the new value of the count at this offset
 192    */
 193   int increment(int offset)
 194   {
 195     int newValue = 0;
 196     if (useIntCounts)
 197     {
 198       newValue = intCounts[offset];
 199       intCounts[offset] = ++newValue;
 200     }
 201     else
 202     {
 203       if (counts[offset] == Short.MAX_VALUE)
 204       {
 205         handleOverflow();
 206         newValue = intCounts[offset];
 207         intCounts[offset] = ++newValue;
 208       }
 209       else
 210       {
 211         newValue = counts[offset];
 212         counts[offset] = (short) ++newValue;
 213       }
 214     }
 215     maxCount = Math.max(maxCount, newValue);
 216     return newValue;
 217   }
 218
 219   /**
 220    * Switch from counting in short to counting in int
 221    */
 222   synchronized void handleOverflow()
 223   {
 224     intCounts = new int[counts.length];
 225     for (int i = 0; i < counts.length; i++)
 226     {
 227       intCounts[i] = counts[i];
 228     }
 229     counts = null;
 230     useIntCounts = true;
 231   }
 232
 233   /**
 234    * Returns this character's offset in the count array
 235    *
 236    * @param c
 237    * @return
 238    */
 239   int getOffset(char c)
 240   {
 241     int offset = 0;
 242     if ('A' <= c && c <= 'Z')
 243     {
 244       offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
 245     }
 246     return offset;
 247   }
 248
 249   /**
 250    * @param c
 251    * @return
 252    */
 253   protected char toUpperCase(final char c)
 254   {
 255     char u = c;
 256     if ('a' <= c && c <= 'z')
 257     {
 258       u = (char) (c + TOUPPERCASE);
 259     }
 260     return u;
 261   }
 262
 263   /**
 264    * Increment count for some unanticipated character. The first time this
 265    * called, a SparseCount is instantiated to hold these 'extra' counts.
 266    *
 267    * @param c
 268    * @return the new value of the count for the character
 269    */
 270   int addOtherCharacter(char c)
 271   {
 272     if (otherData == null)
 273     {
 274       otherData = new SparseCount();
 275     }
 276     int newValue = otherData.add(c, 1);
 277     maxCount = Math.max(maxCount, newValue);
 278     return newValue;
 279   }
 280
 281   /**
 282    * Set count for some unanticipated character. The first time this called, a
 283    * SparseCount is instantiated to hold these 'extra' counts.
 284    *
 285    * @param c
 286    * @param value
 287    */
 288   void setOtherCharacter(char c, int value)
 289   {
 290     if (otherData == null)
 291     {
 292       otherData = new SparseCount();
 293     }
 294     otherData.put(c, value);
 295   }
 296
 297   /**
 298    * Increment count of gap characters
 299    *
 300    * @return the new count of gaps
 301    */
 302   public int addGap()
 303   {
 304     int newValue;
 305     if (useIntCounts)
 306     {
 307       newValue = ++intCounts[GAP_COUNT];
 308     }
 309     else
 310     {
 311       newValue = ++counts[GAP_COUNT];
 312     }
 313     return newValue;
 314   }
 315
 316   /**
 317    * Answers true if we are counting ints (only after overflow of short counts)
 318    *
 319    * @return
 320    */
 321   boolean isCountingInts()
 322   {
 323     return useIntCounts;
 324   }
 325
 326   /**
 327    * Sets the count for the given character. The supplied character may be upper
 328    * or lower case but counts are for the upper case only.
 329    *
 330    * @param c
 331    * @param count
 332    */
 333   public void put(char c, int count)
 334   {
 335     char u = toUpperCase(c);
 336     int offset = getOffset(u);
 337
 338     /*
 339      * offset 0 is reserved for gap counting, so 0 here means either
 340      * an unexpected character, or a gap character passed in error
 341      */
 342     if (offset == 0)
 343     {
 344       if (Comparison.isGap(u))
 345       {
 346         set(0, count);
 347       }
 348       else
 349       {
 350         setOtherCharacter(u, count);
 351         maxCount = Math.max(maxCount, count);
 352       }
 353     }
 354     else
 355     {
 356       set(offset, count);
 357       maxCount = Math.max(maxCount, count);
 358     }
 359   }
 360
 361   /**
 362    * Sets the count at the specified offset. If this would result in short
 363    * overflow, promote to counting int values instead.
 364    *
 365    * @param offset
 366    * @param value
 367    */
 368   void set(int offset, int value)
 369   {
 370     if (useIntCounts)
 371     {
 372       intCounts[offset] = value;
 373     }
 374     else
 375     {
 376       if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
 377       {
 378         handleOverflow();
 379         intCounts[offset] = value;
 380       }
 381       else
 382       {
 383         counts[offset] = (short) value;
 384       }
 385     }
 386   }
 387
 388   /**
 389    * Returns the count for the given character, or zero if no count held
 390    *
 391    * @param c
 392    * @return
 393    */
 394   public int getCount(char c)
 395   {
 396     char u = toUpperCase(c);
 397     int offset = getOffset(u);
 398     if (offset == 0)
 399     {
 400       if (!Comparison.isGap(u))
 401       {
 402         // should have called getGapCount()
 403         return otherData == null ? 0 : otherData.get(u);
 404       }
 405     }
 406     return useIntCounts ? intCounts[offset] : counts[offset];
 407   }
 408
 409   public int getGapCount()
 410   {
 411     return useIntCounts ? intCounts[0] : counts[0];
 412   }
 413
 414   /**
 415    * Answers true if this object wraps a counter for unexpected characters
 416    *
 417    * @return
 418    */
 419   boolean isUsingOtherData()
 420   {
 421     return otherData != null;
 422   }
 423
 424   /**
 425    * Returns the character (or concatenated characters) for the symbol(s) with
 426    * the given count in the profile. Can be used to get the modal residue by
 427    * supplying the modal count value. Returns an empty string if no symbol has
 428    * the given count. The symbols are in alphabetic order of standard peptide or
 429    * nucleotide characters, followed by 'other' symbols if any.
 430    *
 431    * @return
 432    */
 433   public String getResiduesForCount(int count)
 434   {
 435     if (count == 0)
 436     {
 437       return "";
 438     }
 439
 440     /*
 441      * find counts for the given value and append the
 442      * corresponding symbol
 443      */
 444     StringBuilder modal = new StringBuilder();
 445     if (useIntCounts)
 446     {
 447       for (int i = 1; i < intCounts.length; i++)
 448       {
 449         if (intCounts[i] == count)
 450         {
 451           modal.append(
 452                   isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
 453         }
 454       }
 455     }
 456     else
 457     {
 458       for (int i = 1; i < counts.length; i++)
 459       {
 460         if (counts[i] == count)
 461         {
 462           modal.append(
 463                   isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
 464         }
 465       }
 466     }
 467     if (otherData != null)
 468     {
 469       for (int i = 0; i < otherData.size(); i++)
 470       {
 471         if (otherData.valueAt(i) == count)
 472         {
 473           modal.append((char) otherData.keyAt(i));
 474         }
 475       }
 476     }
 477     return modal.toString();
 478   }
 479
 480   /**
 481    * Returns the highest count for any symbol(s) in the profile (excluding gap)
 482    *
 483    * @return
 484    */
 485   public int getModalCount()
 486   {
 487     return maxCount;
 488   }
 489
 490   /**
 491    * Returns the number of distinct symbols with a non-zero count (excluding the
 492    * gap symbol)
 493    *
 494    * @return
 495    */
 496   public int size()
 497   {
 498     int size = 0;
 499     if (useIntCounts)
 500     {
 501       for (int i = 1; i < intCounts.length; i++)
 502       {
 503         if (intCounts[i] > 0)
 504         {
 505           size++;
 506         }
 507       }
 508     }
 509     else
 510     {
 511       for (int i = 1; i < counts.length; i++)
 512       {
 513         if (counts[i] > 0)
 514         {
 515           size++;
 516         }
 517       }
 518     }
 519
 520     /*
 521      * include 'other' characters recorded (even if count is zero
 522      * though that would be a strange use case)
 523      */
 524     if (otherData != null)
 525     {
 526       size += otherData.size();
 527     }
 528
 529     return size;
 530   }
 531
 532   /**
 533    * Returns a data bean holding those symbols that have a non-zero count
 534    * (excluding the gap symbol), with their counts.
 535    *
 536    * @return
 537    */
 538   public SymbolCounts getSymbolCounts()
 539   {
 540     int size = size();
 541     char[] symbols = new char[size];
 542     int[] values = new int[size];
 543     int j = 0;
 544
 545     if (useIntCounts)
 546     {
 547       for (int i = 1; i < intCounts.length; i++)
 548       {
 549         if (intCounts[i] > 0)
 550         {
 551           char symbol = isNucleotide ? NUCS.charAt(i - 1)
 552                   : AAS.charAt(i - 1);
 553           symbols[j] = symbol;
 554           values[j] = intCounts[i];
 555           j++;
 556         }
 557       }
 558     }
 559     else
 560     {
 561       for (int i = 1; i < counts.length; i++)
 562       {
 563         if (counts[i] > 0)
 564         {
 565           char symbol = isNucleotide ? NUCS.charAt(i - 1)
 566                   : AAS.charAt(i - 1);
 567           symbols[j] = symbol;
 568           values[j] = counts[i];
 569           j++;
 570         }
 571       }
 572     }
 573     if (otherData != null)
 574     {
 575       for (int i = 0; i < otherData.size(); i++)
 576       {
 577         symbols[j] = (char) otherData.keyAt(i);
 578         values[j] = otherData.valueAt(i);
 579         j++;
 580       }
 581     }
 582
 583     return new SymbolCounts(symbols, values);
 584   }
 585
 586   /**
 587    * Returns a tooltip string showing residues in descending order of their
 588    * percentage frequency in the profile
 589    *
 590    * @param normaliseBy
 591    *          the divisor for residue counts (may or may not include gapped
 592    *          sequence count)
 593    * @param percentageDecPl
 594    *          the number of decimal places to show in percentages
 595    * @return
 596    */
 597   public String getTooltip(int normaliseBy, int percentageDecPl)
 598   {
 599     SymbolCounts symbolCounts = getSymbolCounts();
 600     char[] ca = symbolCounts.symbols;
 601     int[] vl = symbolCounts.values;
 602
 603     /*
 604      * sort characters into ascending order of their counts
 605      */
 606     QuickSort.sort(vl, ca);
 607
 608     /*
 609      * traverse in reverse order (highest count first) to build tooltip
 610      */
 611     boolean first = true;
 612     StringBuilder sb = new StringBuilder(64);
 613     for (int c = ca.length - 1; c >= 0; c--)
 614     {
 615       final char residue = ca[c];
 616       // TODO combine residues which share a percentage
 617       // (see AAFrequency.completeCdnaConsensus)
 618       float tval = (vl[c] * 100f) / normaliseBy;
 619       sb.append(first ? "" : "; ").append(residue).append(" ");
 620       Format.appendPercentage(sb, tval, percentageDecPl);
 621       sb.append("%");
 622       first = false;
 623     }
 624     return sb.toString();
 625   }
 626
 627   /**
 628    * Returns a string representation of the symbol counts, for debug purposes.
 629    */
 630   @Override
 631   public String toString()
 632   {
 633     StringBuilder sb = new StringBuilder();
 634     sb.append("[ ");
 635     SymbolCounts sc = getSymbolCounts();
 636     for (int i = 0; i < sc.symbols.length; i++)
 637     {
 638       sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
 639     }
 640     sb.append("]");
 641     return sb.toString();
 642   }
 643 }