src/jalview/analysis/ResidueCount.java

   1 package jalview.analysis;
   2
   3 import jalview.util.Comparison;
   4 import jalview.util.Format;
   5 import jalview.util.QuickSort;
   6 import jalview.util.SparseCount;
   7
   8 /**
   9  * A class to count occurrences of residues in a profile, optimised for speed
  10  * and memory footprint.
  11  * @author gmcarstairs
  12  *
  13  */
  14 public class ResidueCount
  15 {
  16   /**
  17    * A data bean to hold the results of counting symbols
  18    */
  19   public class SymbolCounts
  20   {
  21     /**
  22      * the symbols seen (as char values)
  23      */
  24     public final char[] symbols;
  25
  26     /**
  27      * the counts for each symbol, in the same order as the symbols
  28      */
  29     public final int[] values;
  30
  31     SymbolCounts(char[] s, int[] v)
  32     {
  33       symbols = s;
  34       values = v;
  35     }
  36   }
  37
  38   private static final int TOUPPERCASE = 'A' - 'a';
  39
  40   /*
  41    * nucleotide symbols to count (including N unknown)
  42    */
  43   private static final String NUCS = "ACGTUN";
  44
  45   /*
  46    * amino acid symbols to count (including X unknown)
  47    * NB we also include U so as to support counting of RNA bases
  48    * in the "don't know" case of nucleotide / peptide
  49    */
  50   private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
  51
  52   private static final int GAP_COUNT = 0;
  53
  54   /*
  55    * fast lookup tables holding the index into our count
  56    * arrays of each symbol; index 0 is reserved for gap counting
  57    */
  58   private static int[] NUC_INDEX = new int[26];
  59
  60   private static int[] AA_INDEX = new int[26];
  61   static
  62   {
  63     for (int i = 0; i < NUCS.length(); i++)
  64     {
  65       NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
  66     }
  67     for (int i = 0; i < AAS.length(); i++)
  68     {
  69       AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
  70     }
  71   }
  72
  73   /*
  74    * counts array, just big enough for the nucleotide or peptide
  75    * character set (plus gap counts in position 0)
  76    */
  77   private short[] counts;
  78
  79   /*
  80    * alternative array of int counts for use if any count
  81    * exceeds the maximum value of short (32767)
  82    */
  83   private int[] intCounts;
  84
  85   /*
  86    * flag set if we switch from short to int counts
  87    */
  88   private boolean useIntCounts;
  89
  90   /*
  91    * general-purpose counter, only for use for characters
  92    * that are not in the expected alphabet
  93    */
  94   private SparseCount otherData;
  95
  96   /*
  97    * keeps track of the maximum count value recorded
  98    * (if this class every allows decrements, would need to
  99    * calculate this on request instead)
 100    */
 101   int maxCount;
 102
 103   /*
 104    * if we think we are counting nucleotide, can get by with smaller
 105    * array to hold counts
 106    */
 107   private boolean isNucleotide;
 108
 109   /**
 110    * Default constructor allocates arrays able to count either nucleotide or
 111    * peptide bases. Use this constructor if not sure which the data is.
 112    */
 113   public ResidueCount()
 114   {
 115     this(false);
 116   }
 117
 118   /**
 119    * Constructor that allocates an array just big enough for the anticipated
 120    * characters, plus one position to count gaps
 121    */
 122   public ResidueCount(boolean nucleotide)
 123   {
 124     isNucleotide = nucleotide;
 125     int charsToCount = nucleotide ? NUCS.length() : AAS.length();
 126     counts = new short[charsToCount + 1];
 127   }
 128
 129   /**
 130    * Increments the count for the given character. The supplied character may be
 131    * upper or lower case but counts are for the upper case only. Gap characters
 132    * (space, ., -) are all counted together.
 133    *
 134    * @param c
 135    * @return the new value of the count for the character
 136    */
 137   public int add(char c)
 138   {
 139     int newValue = 0;
 140     int offset = getOffset(c);
 141
 142     /*
 143      * offset 0 is reserved for gap counting, so 0 here means either
 144      * an unexpected character, or a gap character passed in error
 145      */
 146     if (offset == 0)
 147     {
 148       if (Comparison.isGap(c))
 149       {
 150         newValue = addGap();
 151       }
 152       else
 153       {
 154         newValue = addOtherCharacter(c);
 155       }
 156     }
 157     else
 158     {
 159       newValue = increment(offset);
 160     }
 161     return newValue;
 162   }
 163
 164   /**
 165    * Increment the count at the specified offset. If this would result in short
 166    * overflow, promote to counting int values instead.
 167    *
 168    * @param offset
 169    * @return the new value of the count at this offset
 170    */
 171   int increment(int offset)
 172   {
 173     int newValue = 0;
 174     if (useIntCounts)
 175     {
 176       newValue = intCounts[offset];
 177       intCounts[offset] = ++newValue;
 178     }
 179     else
 180     {
 181       if (counts[offset] == Short.MAX_VALUE)
 182       {
 183         handleOverflow();
 184         newValue = intCounts[offset];
 185         intCounts[offset] = ++newValue;
 186       }
 187       else
 188       {
 189         newValue = counts[offset];
 190         counts[offset] = (short) ++newValue;
 191       }
 192     }
 193     maxCount = Math.max(maxCount, newValue);
 194     return newValue;
 195   }
 196
 197   /**
 198    * Switch from counting in short to counting in int
 199    */
 200   synchronized void handleOverflow()
 201   {
 202     intCounts = new int[counts.length];
 203     for (int i = 0; i < counts.length; i++)
 204     {
 205       intCounts[i] = counts[i];
 206     }
 207     counts = null;
 208     useIntCounts = true;
 209   }
 210
 211   /**
 212    * @param c
 213    * @return
 214    */
 215   int getOffset(char c)
 216   {
 217     /*
 218      * ensure upper-case (fails fast if it already is!)
 219      */
 220     if ('a' <= c && c <= 'z')
 221     {
 222       c = (char) (c + TOUPPERCASE);
 223     }
 224
 225     /*
 226      * locate this character's offset in the count array
 227      */
 228     int offset = 0;
 229     if ('A' <= c && c <= 'Z')
 230     {
 231       offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
 232     }
 233     return offset;
 234   }
 235
 236   /**
 237    * Increment count for some unanticipated character. The first time this
 238    * called, a SparseCount is instantiated to hold these 'extra' counts.
 239    *
 240    * @param c
 241    * @return the new value of the count for the character
 242    */
 243   int addOtherCharacter(char c)
 244   {
 245     if (otherData == null)
 246     {
 247       otherData = new SparseCount();
 248     }
 249     int newValue = otherData.add(c, 1);
 250     maxCount = Math.max(maxCount, newValue);
 251     return newValue;
 252   }
 253
 254   /**
 255    * Set count for some unanticipated character. The first time this called, a
 256    * SparseCount is instantiated to hold these 'extra' counts.
 257    *
 258    * @param c
 259    * @param value
 260    */
 261   void setOtherCharacter(char c, int value)
 262   {
 263     if (otherData == null)
 264     {
 265       otherData = new SparseCount();
 266     }
 267     otherData.put(c, value);
 268   }
 269
 270   /**
 271    * Increment count of gap characters
 272    *
 273    * @return the new count of gaps
 274    */
 275   public int addGap()
 276   {
 277     int newValue;
 278     if (useIntCounts)
 279     {
 280       newValue = ++intCounts[GAP_COUNT];
 281     }
 282     else
 283     {
 284       newValue = ++counts[GAP_COUNT];
 285     }
 286     return newValue;
 287   }
 288
 289   /**
 290    * Answers true if we are counting ints (only after overflow of short counts)
 291    *
 292    * @return
 293    */
 294   boolean isCountingInts()
 295   {
 296     return useIntCounts;
 297   }
 298
 299   /**
 300    * Sets the count for the given character. The supplied character may be upper
 301    * or lower case but counts are for the upper case only.
 302    *
 303    * @param c
 304    * @param count
 305    */
 306   public void put(char c, int count)
 307   {
 308     int offset = getOffset(c);
 309
 310     /*
 311      * offset 0 is reserved for gap counting, so 0 here means either
 312      * an unexpected character, or a gap character passed in error
 313      */
 314     if (offset == 0)
 315     {
 316       if (Comparison.isGap(c))
 317       {
 318         addGap();
 319       }
 320       else
 321       {
 322         setOtherCharacter(c, count);
 323         maxCount = Math.max(maxCount, count);
 324       }
 325     }
 326     else
 327     {
 328       set(offset, count);
 329       maxCount = Math.max(maxCount, count);
 330     }
 331   }
 332
 333   /**
 334    * Sets the count at the specified offset. If this would result in short
 335    * overflow, promote to counting int values instead.
 336    *
 337    * @param offset
 338    * @param value
 339    */
 340   void set(int offset, int value)
 341   {
 342     if (useIntCounts)
 343     {
 344       intCounts[offset] = value;
 345     }
 346     else
 347     {
 348       if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
 349       {
 350         handleOverflow();
 351         intCounts[offset] = value;
 352       }
 353       else
 354       {
 355         counts[offset] = (short) value;
 356       }
 357     }
 358   }
 359
 360   /**
 361    * Returns the count for the given character, or zero if no count held
 362    *
 363    * @param c
 364    * @return
 365    */
 366   public int getCount(char c)
 367   {
 368     int offset = getOffset(c);
 369     if (offset == 0)
 370     {
 371       if (!Comparison.isGap(c))
 372       {
 373         // should have called getGapCount()
 374         return otherData == null ? 0 : otherData.get(c);
 375       }
 376     }
 377     return useIntCounts ? intCounts[offset] : counts[offset];
 378   }
 379
 380   public int getGapCount()
 381   {
 382     return useIntCounts ? intCounts[0] : counts[0];
 383   }
 384
 385   /**
 386    * Answers true if this object wraps a counter for unexpected characters
 387    *
 388    * @return
 389    */
 390   boolean isUsingOtherData()
 391   {
 392     return otherData != null;
 393   }
 394
 395   /**
 396    * Returns the character (or concatenated characters) for the symbol(s) with
 397    * the given count in the profile. Can be used to get the modal residue by
 398    * supplying the modal count value. Returns an empty string if symbol has the
 399    * given count. The symbols are in alphabetic order of standard peptide or
 400    * nucleotide characters, followed by 'other' symbols if any.
 401    *
 402    * @return
 403    */
 404   public String getResiduesForCount(int count)
 405   {
 406     if (count == 0)
 407     {
 408       return "";
 409     }
 410
 411     /*
 412      * find counts for the given value and append the
 413      * corresponding symbol
 414      */
 415     StringBuilder modal = new StringBuilder();
 416     if (useIntCounts)
 417     {
 418       for (int i = 1; i < intCounts.length; i++)
 419       {
 420         if (intCounts[i] == count)
 421         {
 422           modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
 423                   .charAt(i - 1));
 424         }
 425       }
 426     }
 427     else
 428     {
 429       for (int i = 1; i < counts.length; i++)
 430       {
 431         if (counts[i] == count)
 432         {
 433           modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
 434                   .charAt(i - 1));
 435         }
 436       }
 437     }
 438     if (otherData != null)
 439     {
 440       for (int i = 0; i < otherData.size(); i++)
 441       {
 442         if (otherData.valueAt(i) == count)
 443         {
 444           modal.append((char) otherData.keyAt(i));
 445         }
 446       }
 447     }
 448     return modal.toString();
 449   }
 450
 451   /**
 452    * Returns the highest count for any symbol in the profile (excluding gap)
 453    *
 454    * @return
 455    */
 456   public int getModalCount()
 457   {
 458     return maxCount;
 459   }
 460
 461   /**
 462    * Returns the number of distinct symbols with a non-zero count (excluding the
 463    * gap symbol)
 464    *
 465    * @return
 466    */
 467   public int size() {
 468     int size = 0;
 469     if (useIntCounts)
 470     {
 471       for (int i = 1; i < intCounts.length; i++)
 472       {
 473         if (intCounts[i] > 0)
 474         {
 475           size++;
 476         }
 477       }
 478     }
 479     else
 480     {
 481       for (int i = 1; i < counts.length; i++)
 482       {
 483         if (counts[i] > 0)
 484         {
 485           size++;
 486         }
 487       }
 488     }
 489     if (otherData != null)
 490     {
 491       size += otherData.size();
 492     }
 493
 494     return size;
 495   }
 496
 497   /**
 498    * Returns those symbols that have a non-zero count (excluding the gap
 499    * symbol), with their counts. The symbols are in no special order. Returns an
 500    * array of size 2 whose first element is a char array of symbols, and second
 501    * element an int array of corresponding counts.
 502    *
 503    * @return an array [[char1, char2, ...] [char1Count, char2Count, ...] ... ]
 504    */
 505   public SymbolCounts getSymbolCounts()
 506   {
 507     char[] symbols = new char[size()];
 508     int[] values = new int[size()];
 509     int j = 0;
 510
 511     if (useIntCounts)
 512     {
 513       for (int i = 1; i < intCounts.length; i++)
 514       {
 515         if (intCounts[i] > 0)
 516         {
 517           char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
 518                   .charAt(i - 1);
 519           symbols[j] = symbol;
 520           values[j] = intCounts[i];
 521           j++;
 522         }
 523       }
 524     }
 525     else
 526     {
 527       for (int i = 1; i < counts.length; i++)
 528       {
 529         if (counts[i] > 0)
 530         {
 531           char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
 532                   .charAt(i - 1);
 533           symbols[j] = symbol;
 534           values[j] = counts[i];
 535           j++;
 536         }
 537       }
 538     }
 539     if (otherData != null)
 540     {
 541       for (int i = 0; i < otherData.size(); i++)
 542       {
 543         int value = otherData.valueAt(i);
 544         if (value > 0)
 545         {
 546           symbols[j] = (char) otherData.keyAt(i);
 547           values[j] = otherData.valueAt(i);
 548           j++;
 549         }
 550       }
 551     }
 552
 553     return new SymbolCounts(symbols, values);
 554   }
 555
 556   /**
 557    * Returns a tooltip string showing residues in descending order of their
 558    * percentage frequency in the profile
 559    *
 560    * @param normaliseBy
 561    *          the divisor for residue counts (may or may not include gapped
 562    *          sequence count)
 563    * @param percentageDecPl
 564    *          the number of decimal places to show in percentages
 565    * @return
 566    */
 567   public String getTooltip(int normaliseBy, int percentageDecPl)
 568   {
 569     StringBuilder sb = new StringBuilder(64);
 570     SymbolCounts symbolCounts = getSymbolCounts();
 571     char[] ca = symbolCounts.symbols;
 572     int[] vl = symbolCounts.values;
 573
 574     /*
 575      * sort characters into ascending order of their counts
 576      */
 577     QuickSort.sort(vl, ca);
 578
 579     /*
 580      * traverse in reverse order (highest count first) to build tooltip
 581      */
 582     for (int p = 0, c = ca.length - 1; c >= 0; c--)
 583     {
 584       final char residue = ca[c];
 585       if (residue != '-')
 586       {
 587         // TODO combine residues which share a percentage
 588         // (see AAFrequency.completeCdnaConsensus)
 589         float tval = (vl[c] * 100f) / normaliseBy;
 590         sb.append((((p == 0) ? "" : "; "))).append(residue)
 591                 .append(" ");
 592         Format.appendPercentage(sb, tval, percentageDecPl);
 593         sb.append("%");
 594         p++;
 595       }
 596     }
 597     return sb.toString();
 598   }
 599
 600   /**
 601    * Returns a string representation of the symbol counts, for debug purposes.
 602    */
 603   @Override
 604   public String toString()
 605   {
 606     StringBuilder sb = new StringBuilder();
 607     sb.append("[ ");
 608     SymbolCounts sc = getSymbolCounts();
 609     for (int i = 0; i < sc.symbols.length; i++)
 610     {
 611       sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
 612     }
 613     sb.append("]");
 614     return sb.toString();
 615   }
 616 }