1 package jalview.analysis;
3 import jalview.util.Comparison;
4 import jalview.util.Format;
5 import jalview.util.QuickSort;
6 import jalview.util.SparseCount;
9 * A class to count occurrences of residues in a profile, optimised for speed
10 * and memory footprint.
14 public class ResidueCount
17 * A data bean to hold the results of counting symbols
19 public class SymbolCounts
22 * the symbols seen (as char values)
24 public final char[] symbols;
27 * the counts for each symbol, in the same order as the symbols
29 public final int[] values;
31 SymbolCounts(char[] s, int[] v)
38 private static final int TOUPPERCASE = 'A' - 'a';
41 * nucleotide symbols to count (including N unknown)
43 private static final String NUCS = "ACGTUN";
46 * amino acid symbols to count (including X unknown)
47 * NB we also include U so as to support counting of RNA bases
48 * in the "don't know" case of nucleotide / peptide
50 private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
52 private static final int GAP_COUNT = 0;
55 * fast lookup tables holding the index into our count
56 * arrays of each symbol; index 0 is reserved for gap counting
58 private static int[] NUC_INDEX = new int[26];
60 private static int[] AA_INDEX = new int[26];
63 for (int i = 0; i < NUCS.length(); i++)
65 NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
67 for (int i = 0; i < AAS.length(); i++)
69 AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
74 * counts array, just big enough for the nucleotide or peptide
75 * character set (plus gap counts in position 0)
77 private short[] counts;
80 * alternative array of int counts for use if any count
81 * exceeds the maximum value of short (32767)
83 private int[] intCounts;
86 * flag set if we switch from short to int counts
88 private boolean useIntCounts;
91 * general-purpose counter, only for use for characters
92 * that are not in the expected alphabet
94 private SparseCount otherData;
97 * keeps track of the maximum count value recorded
98 * (if this class every allows decrements, would need to
99 * calculate this on request instead)
104 * if we think we are counting nucleotide, can get by with smaller
105 * array to hold counts
107 private boolean isNucleotide;
110 * Default constructor allocates arrays able to count either nucleotide or
111 * peptide bases. Use this constructor if not sure which the data is.
113 public ResidueCount()
119 * Constructor that allocates an array just big enough for the anticipated
120 * characters, plus one position to count gaps
122 public ResidueCount(boolean nucleotide)
124 isNucleotide = nucleotide;
125 int charsToCount = nucleotide ? NUCS.length() : AAS.length();
126 counts = new short[charsToCount + 1];
130 * Increments the count for the given character. The supplied character may be
131 * upper or lower case but counts are for the upper case only. Gap characters
132 * (space, ., -) are all counted together.
135 * @return the new value of the count for the character
137 public int add(char c)
140 int offset = getOffset(c);
143 * offset 0 is reserved for gap counting, so 0 here means either
144 * an unexpected character, or a gap character passed in error
148 if (Comparison.isGap(c))
154 newValue = addOtherCharacter(c);
159 newValue = increment(offset);
165 * Increment the count at the specified offset. If this would result in short
166 * overflow, promote to counting int values instead.
169 * @return the new value of the count at this offset
171 int increment(int offset)
176 newValue = intCounts[offset];
177 intCounts[offset] = ++newValue;
181 if (counts[offset] == Short.MAX_VALUE)
184 newValue = intCounts[offset];
185 intCounts[offset] = ++newValue;
189 newValue = counts[offset];
190 counts[offset] = (short) ++newValue;
193 maxCount = Math.max(maxCount, newValue);
198 * Switch from counting in short to counting in int
200 synchronized void handleOverflow()
202 intCounts = new int[counts.length];
203 for (int i = 0; i < counts.length; i++)
205 intCounts[i] = counts[i];
215 int getOffset(char c)
218 * ensure upper-case (fails fast if it already is!)
220 if ('a' <= c && c <= 'z')
222 c = (char) (c + TOUPPERCASE);
226 * locate this character's offset in the count array
229 if ('A' <= c && c <= 'Z')
231 offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
237 * Increment count for some unanticipated character. The first time this
238 * called, a SparseCount is instantiated to hold these 'extra' counts.
241 * @return the new value of the count for the character
243 int addOtherCharacter(char c)
245 if (otherData == null)
247 otherData = new SparseCount();
249 int newValue = otherData.add(c, 1);
250 maxCount = Math.max(maxCount, newValue);
255 * Set count for some unanticipated character. The first time this called, a
256 * SparseCount is instantiated to hold these 'extra' counts.
261 void setOtherCharacter(char c, int value)
263 if (otherData == null)
265 otherData = new SparseCount();
267 otherData.put(c, value);
271 * Increment count of gap characters
273 * @return the new count of gaps
280 newValue = ++intCounts[GAP_COUNT];
284 newValue = ++counts[GAP_COUNT];
290 * Answers true if we are counting ints (only after overflow of short counts)
294 boolean isCountingInts()
300 * Sets the count for the given character. The supplied character may be upper
301 * or lower case but counts are for the upper case only.
306 public void put(char c, int count)
308 int offset = getOffset(c);
311 * offset 0 is reserved for gap counting, so 0 here means either
312 * an unexpected character, or a gap character passed in error
316 if (Comparison.isGap(c))
322 setOtherCharacter(c, count);
323 maxCount = Math.max(maxCount, count);
329 maxCount = Math.max(maxCount, count);
334 * Sets the count at the specified offset. If this would result in short
335 * overflow, promote to counting int values instead.
340 void set(int offset, int value)
344 intCounts[offset] = value;
348 if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
351 intCounts[offset] = value;
355 counts[offset] = (short) value;
361 * Returns the count for the given character, or zero if no count held
366 public int getCount(char c)
368 int offset = getOffset(c);
371 if (!Comparison.isGap(c))
373 // should have called getGapCount()
374 return otherData == null ? 0 : otherData.get(c);
377 return useIntCounts ? intCounts[offset] : counts[offset];
380 public int getGapCount()
382 return useIntCounts ? intCounts[0] : counts[0];
386 * Answers true if this object wraps a counter for unexpected characters
390 boolean isUsingOtherData()
392 return otherData != null;
396 * Returns the character (or concatenated characters) for the symbol(s) with
397 * the given count in the profile. Can be used to get the modal residue by
398 * supplying the modal count value. Returns an empty string if symbol has the
399 * given count. The symbols are in alphabetic order of standard peptide or
400 * nucleotide characters, followed by 'other' symbols if any.
404 public String getResiduesForCount(int count)
412 * find counts for the given value and append the
413 * corresponding symbol
415 StringBuilder modal = new StringBuilder();
418 for (int i = 1; i < intCounts.length; i++)
420 if (intCounts[i] == count)
422 modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
429 for (int i = 1; i < counts.length; i++)
431 if (counts[i] == count)
433 modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
438 if (otherData != null)
440 for (int i = 0; i < otherData.size(); i++)
442 if (otherData.valueAt(i) == count)
444 modal.append((char) otherData.keyAt(i));
448 return modal.toString();
452 * Returns the highest count for any symbol in the profile (excluding gap)
456 public int getModalCount()
462 * Returns the number of distinct symbols with a non-zero count (excluding the
471 for (int i = 1; i < intCounts.length; i++)
473 if (intCounts[i] > 0)
481 for (int i = 1; i < counts.length; i++)
489 if (otherData != null)
491 size += otherData.size();
498 * Returns those symbols that have a non-zero count (excluding the gap
499 * symbol), with their counts. The symbols are in no special order. Returns an
500 * array of size 2 whose first element is a char array of symbols, and second
501 * element an int array of corresponding counts.
503 * @return an array [[char1, char2, ...] [char1Count, char2Count, ...] ... ]
505 public SymbolCounts getSymbolCounts()
507 char[] symbols = new char[size()];
508 int[] values = new int[size()];
513 for (int i = 1; i < intCounts.length; i++)
515 if (intCounts[i] > 0)
517 char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
520 values[j] = intCounts[i];
527 for (int i = 1; i < counts.length; i++)
531 char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
534 values[j] = counts[i];
539 if (otherData != null)
541 for (int i = 0; i < otherData.size(); i++)
543 int value = otherData.valueAt(i);
546 symbols[j] = (char) otherData.keyAt(i);
547 values[j] = otherData.valueAt(i);
553 return new SymbolCounts(symbols, values);
557 * Returns a tooltip string showing residues in descending order of their
558 * percentage frequency in the profile
561 * the divisor for residue counts (may or may not include gapped
563 * @param percentageDecPl
564 * the number of decimal places to show in percentages
567 public String getTooltip(int normaliseBy, int percentageDecPl)
569 StringBuilder sb = new StringBuilder(64);
570 SymbolCounts symbolCounts = getSymbolCounts();
571 char[] ca = symbolCounts.symbols;
572 int[] vl = symbolCounts.values;
575 * sort characters into ascending order of their counts
577 QuickSort.sort(vl, ca);
580 * traverse in reverse order (highest count first) to build tooltip
582 for (int p = 0, c = ca.length - 1; c >= 0; c--)
584 final char residue = ca[c];
587 // TODO combine residues which share a percentage
588 // (see AAFrequency.completeCdnaConsensus)
589 float tval = (vl[c] * 100f) / normaliseBy;
590 sb.append((((p == 0) ? "" : "; "))).append(residue)
592 Format.appendPercentage(sb, tval, percentageDecPl);
597 return sb.toString();
601 * Returns a string representation of the symbol counts, for debug purposes.
604 public String toString()
606 StringBuilder sb = new StringBuilder();
608 SymbolCounts sc = getSymbolCounts();
609 for (int i = 0; i < sc.symbols.length; i++)
611 sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
614 return sb.toString();