src/jalview/datamodel/ResidueCount.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.datamodel;
  22
  23 import jalview.util.Comparison;
  24 import jalview.util.Format;
  25 import jalview.util.QuickSort;
  26 import jalview.util.SparseCount;
  27
  28 /**
  29  * A class to count occurrences of residues in a profile, optimised for speed
  30  * and memory footprint.
  31  *
  32  * @author gmcarstairs
  33  *
  34  */
  35 public class ResidueCount
  36 {
  37   /**
  38    * A data bean to hold the results of counting symbols
  39    */
  40   public class SymbolCounts
  41   {
  42     /**
  43      * the symbols seen (as char values), in no particular order
  44      */
  45     public final char[] symbols;
  46
  47     /**
  48      * the counts for each symbol, in the same order as the symbols
  49      */
  50     public final int[] values;
  51
  52     SymbolCounts(char[] s, int[] v)
  53     {
  54       symbols = s;
  55       values = v;
  56     }
  57   }
  58
  59   private static final int TOUPPERCASE = 'A' - 'a';
  60
  61   /*
  62    * nucleotide symbols to count (including N unknown)
  63    */
  64   private static final String NUCS = "ACGNTU";
  65
  66   /*
  67    * amino acid symbols to count (including X unknown)
  68    * NB we also include U so as to support counting of RNA bases
  69    * in the "don't know" case of nucleotide / peptide
  70    */
  71   private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
  72
  73   private static final int GAP_COUNT = 0;
  74
  75   /*
  76    * fast lookup tables holding the index into our count
  77    * arrays of each symbol; index 0 is reserved for gap counting
  78    */
  79   private final static int[] NUC_INDEX = new int[26];
  80
  81   private final static int[] AA_INDEX = new int[26];
  82
  83   static
  84   {
  85     for (int i = 0; i < NUCS.length(); i++)
  86     {
  87       NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
  88     }
  89     for (int i = 0; i < AAS.length(); i++)
  90     {
  91       AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
  92     }
  93   }
  94
  95   /*
  96    * counts array, just big enough for the nucleotide or peptide
  97    * character set (plus gap counts in position 0)
  98    */
  99   private short[] counts;
 100
 101   /*
 102    * alternative array of int counts for use if any count
 103    * exceeds the maximum value of short (32767)
 104    */
 105   private int[] intCounts;
 106
 107   /*
 108    * flag set if we switch from short to int counts
 109    */
 110   private boolean useIntCounts;
 111
 112   /*
 113    * general-purpose counter, only for use for characters
 114    * that are not in the expected alphabet
 115    */
 116   private SparseCount otherData;
 117
 118   /*
 119    * keeps track of the maximum count value recorded
 120    * (if this class ever allows decrements, would need to
 121    * calculate this on request instead)
 122    */
 123   int maxCount;
 124
 125   /*
 126    * if we think we are counting nucleotide, can get by with smaller
 127    * array to hold counts
 128    */
 129   private boolean isNucleotide;
 130
 131   /**
 132    * Default constructor allocates arrays able to count either nucleotide or
 133    * peptide bases. Use this constructor if not sure which the data is.
 134    */
 135   public ResidueCount()
 136   {
 137     this(false);
 138   }
 139
 140   /**
 141    * Constructor that allocates an array just big enough for the anticipated
 142    * characters, plus one position to count gaps
 143    */
 144   public ResidueCount(boolean nucleotide)
 145   {
 146     isNucleotide = nucleotide;
 147     int charsToCount = nucleotide ? NUCS.length() : AAS.length();
 148     counts = new short[charsToCount + 1];
 149   }
 150
 151   /**
 152    * Increments the count for the given character. The supplied character may be
 153    * upper or lower case but counts are for the upper case only. Gap characters
 154    * (space, ., -) are all counted together.
 155    *
 156    * @param c
 157    * @return the new value of the count for the character
 158    */
 159   public int add(final char c)
 160   {
 161     char u = toUpperCase(c);
 162     int newValue = 0;
 163     int offset = getOffset(u);
 164
 165     /*
 166      * offset 0 is reserved for gap counting, so 0 here means either
 167      * an unexpected character, or a gap character passed in error
 168      */
 169     if (offset == 0)
 170     {
 171       if (Comparison.isGap(u))
 172       {
 173         newValue = addGap();
 174       }
 175       else
 176       {
 177         newValue = addOtherCharacter(u);
 178       }
 179     }
 180     else
 181     {
 182       newValue = increment(offset);
 183     }
 184     return newValue;
 185   }
 186
 187   /**
 188    * Increment the count at the specified offset. If this would result in short
 189    * overflow, promote to counting int values instead.
 190    *
 191    * @param offset
 192    * @return the new value of the count at this offset
 193    */
 194   int increment(int offset)
 195   {
 196     int newValue = 0;
 197     if (useIntCounts)
 198     {
 199       newValue = intCounts[offset];
 200       intCounts[offset] = ++newValue;
 201     }
 202     else
 203     {
 204       if (counts[offset] == Short.MAX_VALUE)
 205       {
 206         handleOverflow();
 207         newValue = intCounts[offset];
 208         intCounts[offset] = ++newValue;
 209       }
 210       else
 211       {
 212         newValue = counts[offset];
 213         counts[offset] = (short) ++newValue;
 214       }
 215     }
 216     maxCount = Math.max(maxCount, newValue);
 217     return newValue;
 218   }
 219
 220   /**
 221    * Switch from counting in short to counting in int
 222    */
 223   synchronized void handleOverflow()
 224   {
 225     intCounts = new int[counts.length];
 226     for (int i = 0; i < counts.length; i++)
 227     {
 228       intCounts[i] = counts[i];
 229     }
 230     counts = null;
 231     useIntCounts = true;
 232   }
 233
 234   /**
 235    * Returns this character's offset in the count array
 236    *
 237    * @param c
 238    * @return
 239    */
 240   int getOffset(char c)
 241   {
 242     int offset = 0;
 243     if ('A' <= c && c <= 'Z')
 244     {
 245       offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
 246     }
 247     return offset;
 248   }
 249
 250   /**
 251    * @param c
 252    * @return
 253    */
 254   protected char toUpperCase(final char c)
 255   {
 256     char u = c;
 257     if ('a' <= c && c <= 'z')
 258     {
 259       u = (char) (c + TOUPPERCASE);
 260     }
 261     return u;
 262   }
 263
 264   /**
 265    * Increment count for some unanticipated character. The first time this
 266    * called, a SparseCount is instantiated to hold these 'extra' counts.
 267    *
 268    * @param c
 269    * @return the new value of the count for the character
 270    */
 271   int addOtherCharacter(char c)
 272   {
 273     if (otherData == null)
 274     {
 275       otherData = new SparseCount();
 276     }
 277     int newValue = otherData.add(c, 1);
 278     maxCount = Math.max(maxCount, newValue);
 279     return newValue;
 280   }
 281
 282   /**
 283    * Set count for some unanticipated character. The first time this called, a
 284    * SparseCount is instantiated to hold these 'extra' counts.
 285    *
 286    * @param c
 287    * @param value
 288    */
 289   void setOtherCharacter(char c, int value)
 290   {
 291     if (otherData == null)
 292     {
 293       otherData = new SparseCount();
 294     }
 295     otherData.put(c, value);
 296   }
 297
 298   /**
 299    * Increment count of gap characters
 300    *
 301    * @return the new count of gaps
 302    */
 303   public int addGap()
 304   {
 305     int newValue;
 306     if (useIntCounts)
 307     {
 308       newValue = ++intCounts[GAP_COUNT];
 309     }
 310     else
 311     {
 312       newValue = ++counts[GAP_COUNT];
 313     }
 314     return newValue;
 315   }
 316
 317   /**
 318    * Answers true if we are counting ints (only after overflow of short counts)
 319    *
 320    * @return
 321    */
 322   boolean isCountingInts()
 323   {
 324     return useIntCounts;
 325   }
 326
 327   /**
 328    * Sets the count for the given character. The supplied character may be upper
 329    * or lower case but counts are for the upper case only.
 330    *
 331    * @param c
 332    * @param count
 333    */
 334   public void put(char c, int count)
 335   {
 336     char u = toUpperCase(c);
 337     int offset = getOffset(u);
 338
 339     /*
 340      * offset 0 is reserved for gap counting, so 0 here means either
 341      * an unexpected character, or a gap character passed in error
 342      */
 343     if (offset == 0)
 344     {
 345       if (Comparison.isGap(u))
 346       {
 347         set(0, count);
 348       }
 349       else
 350       {
 351         setOtherCharacter(u, count);
 352         maxCount = Math.max(maxCount, count);
 353       }
 354     }
 355     else
 356     {
 357       set(offset, count);
 358       maxCount = Math.max(maxCount, count);
 359     }
 360   }
 361
 362   /**
 363    * Sets the count at the specified offset. If this would result in short
 364    * overflow, promote to counting int values instead.
 365    *
 366    * @param offset
 367    * @param value
 368    */
 369   void set(int offset, int value)
 370   {
 371     if (useIntCounts)
 372     {
 373       intCounts[offset] = value;
 374     }
 375     else
 376     {
 377       if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
 378       {
 379         handleOverflow();
 380         intCounts[offset] = value;
 381       }
 382       else
 383       {
 384         counts[offset] = (short) value;
 385       }
 386     }
 387   }
 388
 389   /**
 390    * Returns the count for the given character, or zero if no count held
 391    *
 392    * @param c
 393    * @return
 394    */
 395   public int getCount(char c)
 396   {
 397     char u = toUpperCase(c);
 398     int offset = getOffset(u);
 399     if (offset == 0)
 400     {
 401       if (!Comparison.isGap(u))
 402       {
 403         // should have called getGapCount()
 404         return otherData == null ? 0 : otherData.get(u);
 405       }
 406     }
 407     return useIntCounts ? intCounts[offset] : counts[offset];
 408   }
 409
 410   public int getGapCount()
 411   {
 412     return useIntCounts ? intCounts[0] : counts[0];
 413   }
 414
 415   /**
 416    * Answers true if this object wraps a counter for unexpected characters
 417    *
 418    * @return
 419    */
 420   boolean isUsingOtherData()
 421   {
 422     return otherData != null;
 423   }
 424
 425   /**
 426    * Returns the character (or concatenated characters) for the symbol(s) with
 427    * the given count in the profile. Can be used to get the modal residue by
 428    * supplying the modal count value. Returns an empty string if no symbol has
 429    * the given count. The symbols are in alphabetic order of standard peptide or
 430    * nucleotide characters, followed by 'other' symbols if any.
 431    *
 432    * @return
 433    */
 434   public String getResiduesForCount(int count)
 435   {
 436     if (count == 0)
 437     {
 438       return "";
 439     }
 440
 441     /*
 442      * find counts for the given value and append the
 443      * corresponding symbol
 444      */
 445     StringBuilder modal = new StringBuilder();
 446     if (useIntCounts)
 447     {
 448       for (int i = 1; i < intCounts.length; i++)
 449       {
 450         if (intCounts[i] == count)
 451         {
 452           modal.append(
 453                   isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
 454         }
 455       }
 456     }
 457     else
 458     {
 459       for (int i = 1; i < counts.length; i++)
 460       {
 461         if (counts[i] == count)
 462         {
 463           modal.append(
 464                   isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
 465         }
 466       }
 467     }
 468     if (otherData != null)
 469     {
 470       for (int i = 0; i < otherData.size(); i++)
 471       {
 472         if (otherData.valueAt(i) == count)
 473         {
 474           modal.append((char) otherData.keyAt(i));
 475         }
 476       }
 477     }
 478     return modal.toString();
 479   }
 480
 481   /**
 482    * Returns the highest count for any symbol(s) in the profile (excluding gap)
 483    *
 484    * @return
 485    */
 486   public int getModalCount()
 487   {
 488     return maxCount;
 489   }
 490
 491   /**
 492    * Returns the number of distinct symbols with a non-zero count (excluding the
 493    * gap symbol)
 494    *
 495    * @return
 496    */
 497   public int size()
 498   {
 499     int size = 0;
 500     if (useIntCounts)
 501     {
 502       for (int i = 1; i < intCounts.length; i++)
 503       {
 504         if (intCounts[i] > 0)
 505         {
 506           size++;
 507         }
 508       }
 509     }
 510     else
 511     {
 512       for (int i = 1; i < counts.length; i++)
 513       {
 514         if (counts[i] > 0)
 515         {
 516           size++;
 517         }
 518       }
 519     }
 520
 521     /*
 522      * include 'other' characters recorded (even if count is zero
 523      * though that would be a strange use case)
 524      */
 525     if (otherData != null)
 526     {
 527       size += otherData.size();
 528     }
 529
 530     return size;
 531   }
 532
 533   /**
 534    * Returns a data bean holding those symbols that have a non-zero count
 535    * (excluding the gap symbol), with their counts.
 536    *
 537    * @return
 538    */
 539   public SymbolCounts getSymbolCounts()
 540   {
 541     int size = size();
 542     char[] symbols = new char[size];
 543     int[] values = new int[size];
 544     int j = 0;
 545
 546     if (useIntCounts)
 547     {
 548       for (int i = 1; i < intCounts.length; i++)
 549       {
 550         if (intCounts[i] > 0)
 551         {
 552           char symbol = isNucleotide ? NUCS.charAt(i - 1)
 553                   : AAS.charAt(i - 1);
 554           symbols[j] = symbol;
 555           values[j] = intCounts[i];
 556           j++;
 557         }
 558       }
 559     }
 560     else
 561     {
 562       for (int i = 1; i < counts.length; i++)
 563       {
 564         if (counts[i] > 0)
 565         {
 566           char symbol = isNucleotide ? NUCS.charAt(i - 1)
 567                   : AAS.charAt(i - 1);
 568           symbols[j] = symbol;
 569           values[j] = counts[i];
 570           j++;
 571         }
 572       }
 573     }
 574     if (otherData != null)
 575     {
 576       for (int i = 0; i < otherData.size(); i++)
 577       {
 578         symbols[j] = (char) otherData.keyAt(i);
 579         values[j] = otherData.valueAt(i);
 580         j++;
 581       }
 582     }
 583
 584     return new SymbolCounts(symbols, values);
 585   }
 586
 587   /**
 588    * Returns a tooltip string showing residues in descending order of their
 589    * percentage frequency in the profile
 590    *
 591    * @param normaliseBy
 592    *          the divisor for residue counts (may or may not include gapped
 593    *          sequence count)
 594    * @param percentageDecPl
 595    *          the number of decimal places to show in percentages
 596    * @return
 597    */
 598   public String getTooltip(int normaliseBy, int percentageDecPl)
 599   {
 600     SymbolCounts symbolCounts = getSymbolCounts();
 601     char[] ca = symbolCounts.symbols;
 602     int[] vl = symbolCounts.values;
 603
 604     /*
 605      * sort characters into ascending order of their counts
 606      */
 607     QuickSort.sort(vl, ca);
 608
 609     /*
 610      * traverse in reverse order (highest count first) to build tooltip
 611      */
 612     boolean first = true;
 613     StringBuilder sb = new StringBuilder(64);
 614     for (int c = ca.length - 1; c >= 0; c--)
 615     {
 616       final char residue = ca[c];
 617       // TODO combine residues which share a percentage
 618       // (see AAFrequency.completeCdnaConsensus)
 619       float tval = (vl[c] * 100f) / normaliseBy;
 620       sb.append(first ? "" : "; ").append(residue).append(" ");
 621       Format.appendPercentage(sb, tval, percentageDecPl);
 622       sb.append("%");
 623       first = false;
 624     }
 625     return sb.toString();
 626   }
 627
 628   /**
 629    * Returns a string representation of the symbol counts, for debug purposes.
 630    */
 631   @Override
 632   public String toString()
 633   {
 634     StringBuilder sb = new StringBuilder();
 635     sb.append("[ ");
 636     SymbolCounts sc = getSymbolCounts();
 637     for (int i = 0; i < sc.symbols.length; i++)
 638     {
 639       sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
 640     }
 641     sb.append("]");
 642     return sb.toString();
 643   }
 644 }