--- /dev/null
+package jalview.analysis;
+
+import jalview.util.Comparison;
+import jalview.util.Format;
+import jalview.util.QuickSort;
+import jalview.util.SparseCount;
+
+/**
+ * A class to count occurrences of residues in a profile, optimised for speed
+ * and memory footprint.
+ * @author gmcarstairs
+ *
+ */
+public class ResidueCount
+{
+ /**
+ * A data bean to hold the results of counting symbols
+ */
+ public class SymbolCounts
+ {
+ /**
+ * the symbols seen (as char values), in no particular order
+ */
+ public final char[] symbols;
+
+ /**
+ * the counts for each symbol, in the same order as the symbols
+ */
+ public final int[] values;
+
+ SymbolCounts(char[] s, int[] v)
+ {
+ symbols = s;
+ values = v;
+ }
+ }
+
+ private static final int TOUPPERCASE = 'A' - 'a';
+
+ /*
+ * nucleotide symbols to count (including N unknown)
+ */
+ private static final String NUCS = "ACGNTU";
+
+ /*
+ * amino acid symbols to count (including X unknown)
+ * NB we also include U so as to support counting of RNA bases
+ * in the "don't know" case of nucleotide / peptide
+ */
+ private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
+
+ private static final int GAP_COUNT = 0;
+
+ /*
+ * fast lookup tables holding the index into our count
+ * arrays of each symbol; index 0 is reserved for gap counting
+ */
+ private static int[] NUC_INDEX = new int[26];
+
+ private static int[] AA_INDEX = new int[26];
+ static
+ {
+ for (int i = 0; i < NUCS.length(); i++)
+ {
+ NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
+ }
+ for (int i = 0; i < AAS.length(); i++)
+ {
+ AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
+ }
+ }
+
+ /*
+ * counts array, just big enough for the nucleotide or peptide
+ * character set (plus gap counts in position 0)
+ */
+ private short[] counts;
+
+ /*
+ * alternative array of int counts for use if any count
+ * exceeds the maximum value of short (32767)
+ */
+ private int[] intCounts;
+
+ /*
+ * flag set if we switch from short to int counts
+ */
+ private boolean useIntCounts;
+
+ /*
+ * general-purpose counter, only for use for characters
+ * that are not in the expected alphabet
+ */
+ private SparseCount otherData;
+
+ /*
+ * keeps track of the maximum count value recorded
+ * (if this class every allows decrements, would need to
+ * calculate this on request instead)
+ */
+ int maxCount;
+
+ /*
+ * if we think we are counting nucleotide, can get by with smaller
+ * array to hold counts
+ */
+ private boolean isNucleotide;
+
+ /**
+ * Default constructor allocates arrays able to count either nucleotide or
+ * peptide bases. Use this constructor if not sure which the data is.
+ */
+ public ResidueCount()
+ {
+ this(false);
+ }
+
+ /**
+ * Constructor that allocates an array just big enough for the anticipated
+ * characters, plus one position to count gaps
+ */
+ public ResidueCount(boolean nucleotide)
+ {
+ isNucleotide = nucleotide;
+ int charsToCount = nucleotide ? NUCS.length() : AAS.length();
+ counts = new short[charsToCount + 1];
+ }
+
+ /**
+ * Increments the count for the given character. The supplied character may be
+ * upper or lower case but counts are for the upper case only. Gap characters
+ * (space, ., -) are all counted together.
+ *
+ * @param c
+ * @return the new value of the count for the character
+ */
+ public int add(final char c)
+ {
+ char u = toUpperCase(c);
+ int newValue = 0;
+ int offset = getOffset(u);
+
+ /*
+ * offset 0 is reserved for gap counting, so 0 here means either
+ * an unexpected character, or a gap character passed in error
+ */
+ if (offset == 0)
+ {
+ if (Comparison.isGap(u))
+ {
+ newValue = addGap();
+ }
+ else
+ {
+ newValue = addOtherCharacter(u);
+ }
+ }
+ else
+ {
+ newValue = increment(offset);
+ }
+ return newValue;
+ }
+
+ /**
+ * Increment the count at the specified offset. If this would result in short
+ * overflow, promote to counting int values instead.
+ *
+ * @param offset
+ * @return the new value of the count at this offset
+ */
+ int increment(int offset)
+ {
+ int newValue = 0;
+ if (useIntCounts)
+ {
+ newValue = intCounts[offset];
+ intCounts[offset] = ++newValue;
+ }
+ else
+ {
+ if (counts[offset] == Short.MAX_VALUE)
+ {
+ handleOverflow();
+ newValue = intCounts[offset];
+ intCounts[offset] = ++newValue;
+ }
+ else
+ {
+ newValue = counts[offset];
+ counts[offset] = (short) ++newValue;
+ }
+ }
+ maxCount = Math.max(maxCount, newValue);
+ return newValue;
+ }
+
+ /**
+ * Switch from counting in short to counting in int
+ */
+ synchronized void handleOverflow()
+ {
+ intCounts = new int[counts.length];
+ for (int i = 0; i < counts.length; i++)
+ {
+ intCounts[i] = counts[i];
+ }
+ counts = null;
+ useIntCounts = true;
+ }
+
+ /**
+ * Returns this character's offset in the count array
+ *
+ * @param c
+ * @return
+ */
+ int getOffset(char c)
+ {
+ int offset = 0;
+ if ('A' <= c && c <= 'Z')
+ {
+ offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
+ }
+ return offset;
+ }
+
+ /**
+ * @param c
+ * @return
+ */
+ protected char toUpperCase(final char c)
+ {
+ char u = c;
+ if ('a' <= c && c <= 'z')
+ {
+ u = (char) (c + TOUPPERCASE);
+ }
+ return u;
+ }
+
+ /**
+ * Increment count for some unanticipated character. The first time this
+ * called, a SparseCount is instantiated to hold these 'extra' counts.
+ *
+ * @param c
+ * @return the new value of the count for the character
+ */
+ int addOtherCharacter(char c)
+ {
+ if (otherData == null)
+ {
+ otherData = new SparseCount();
+ }
+ int newValue = otherData.add(c, 1);
+ maxCount = Math.max(maxCount, newValue);
+ return newValue;
+ }
+
+ /**
+ * Set count for some unanticipated character. The first time this called, a
+ * SparseCount is instantiated to hold these 'extra' counts.
+ *
+ * @param c
+ * @param value
+ */
+ void setOtherCharacter(char c, int value)
+ {
+ if (otherData == null)
+ {
+ otherData = new SparseCount();
+ }
+ otherData.put(c, value);
+ }
+
+ /**
+ * Increment count of gap characters
+ *
+ * @return the new count of gaps
+ */
+ public int addGap()
+ {
+ int newValue;
+ if (useIntCounts)
+ {
+ newValue = ++intCounts[GAP_COUNT];
+ }
+ else
+ {
+ newValue = ++counts[GAP_COUNT];
+ }
+ return newValue;
+ }
+
+ /**
+ * Answers true if we are counting ints (only after overflow of short counts)
+ *
+ * @return
+ */
+ boolean isCountingInts()
+ {
+ return useIntCounts;
+ }
+
+ /**
+ * Sets the count for the given character. The supplied character may be upper
+ * or lower case but counts are for the upper case only.
+ *
+ * @param c
+ * @param count
+ */
+ public void put(char c, int count)
+ {
+ char u = toUpperCase(c);
+ int offset = getOffset(u);
+
+ /*
+ * offset 0 is reserved for gap counting, so 0 here means either
+ * an unexpected character, or a gap character passed in error
+ */
+ if (offset == 0)
+ {
+ if (Comparison.isGap(u))
+ {
+ set(0, count);
+ }
+ else
+ {
+ setOtherCharacter(u, count);
+ maxCount = Math.max(maxCount, count);
+ }
+ }
+ else
+ {
+ set(offset, count);
+ maxCount = Math.max(maxCount, count);
+ }
+ }
+
+ /**
+ * Sets the count at the specified offset. If this would result in short
+ * overflow, promote to counting int values instead.
+ *
+ * @param offset
+ * @param value
+ */
+ void set(int offset, int value)
+ {
+ if (useIntCounts)
+ {
+ intCounts[offset] = value;
+ }
+ else
+ {
+ if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
+ {
+ handleOverflow();
+ intCounts[offset] = value;
+ }
+ else
+ {
+ counts[offset] = (short) value;
+ }
+ }
+ }
+
+ /**
+ * Returns the count for the given character, or zero if no count held
+ *
+ * @param c
+ * @return
+ */
+ public int getCount(char c)
+ {
+ char u = toUpperCase(c);
+ int offset = getOffset(u);
+ if (offset == 0)
+ {
+ if (!Comparison.isGap(u))
+ {
+ // should have called getGapCount()
+ return otherData == null ? 0 : otherData.get(u);
+ }
+ }
+ return useIntCounts ? intCounts[offset] : counts[offset];
+ }
+
+ public int getGapCount()
+ {
+ return useIntCounts ? intCounts[0] : counts[0];
+ }
+
+ /**
+ * Answers true if this object wraps a counter for unexpected characters
+ *
+ * @return
+ */
+ boolean isUsingOtherData()
+ {
+ return otherData != null;
+ }
+
+ /**
+ * Returns the character (or concatenated characters) for the symbol(s) with
+ * the given count in the profile. Can be used to get the modal residue by
+ * supplying the modal count value. Returns an empty string if no symbol has
+ * the given count. The symbols are in alphabetic order of standard peptide or
+ * nucleotide characters, followed by 'other' symbols if any.
+ *
+ * @return
+ */
+ public String getResiduesForCount(int count)
+ {
+ if (count == 0)
+ {
+ return "";
+ }
+
+ /*
+ * find counts for the given value and append the
+ * corresponding symbol
+ */
+ StringBuilder modal = new StringBuilder();
+ if (useIntCounts)
+ {
+ for (int i = 1; i < intCounts.length; i++)
+ {
+ if (intCounts[i] == count)
+ {
+ modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
+ .charAt(i - 1));
+ }
+ }
+ }
+ else
+ {
+ for (int i = 1; i < counts.length; i++)
+ {
+ if (counts[i] == count)
+ {
+ modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
+ .charAt(i - 1));
+ }
+ }
+ }
+ if (otherData != null)
+ {
+ for (int i = 0; i < otherData.size(); i++)
+ {
+ if (otherData.valueAt(i) == count)
+ {
+ modal.append((char) otherData.keyAt(i));
+ }
+ }
+ }
+ return modal.toString();
+ }
+
+ /**
+ * Returns the highest count for any symbol(s) in the profile (excluding gap)
+ *
+ * @return
+ */
+ public int getModalCount()
+ {
+ return maxCount;
+ }
+
+ /**
+ * Returns the number of distinct symbols with a non-zero count (excluding the
+ * gap symbol)
+ *
+ * @return
+ */
+ public int size() {
+ int size = 0;
+ if (useIntCounts)
+ {
+ for (int i = 1; i < intCounts.length; i++)
+ {
+ if (intCounts[i] > 0)
+ {
+ size++;
+ }
+ }
+ }
+ else
+ {
+ for (int i = 1; i < counts.length; i++)
+ {
+ if (counts[i] > 0)
+ {
+ size++;
+ }
+ }
+ }
+
+ /*
+ * include 'other' characters recorded (even if count is zero
+ * though that would be a strange use case)
+ */
+ if (otherData != null)
+ {
+ size += otherData.size();
+ }
+
+ return size;
+ }
+
+ /**
+ * Returns a data bean holding those symbols that have a non-zero count
+ * (excluding the gap symbol), with their counts.
+ *
+ * @return
+ */
+ public SymbolCounts getSymbolCounts()
+ {
+ int size = size();
+ char[] symbols = new char[size];
+ int[] values = new int[size];
+ int j = 0;
+
+ if (useIntCounts)
+ {
+ for (int i = 1; i < intCounts.length; i++)
+ {
+ if (intCounts[i] > 0)
+ {
+ char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
+ .charAt(i - 1);
+ symbols[j] = symbol;
+ values[j] = intCounts[i];
+ j++;
+ }
+ }
+ }
+ else
+ {
+ for (int i = 1; i < counts.length; i++)
+ {
+ if (counts[i] > 0)
+ {
+ char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
+ .charAt(i - 1);
+ symbols[j] = symbol;
+ values[j] = counts[i];
+ j++;
+ }
+ }
+ }
+ if (otherData != null)
+ {
+ for (int i = 0; i < otherData.size(); i++)
+ {
+ symbols[j] = (char) otherData.keyAt(i);
+ values[j] = otherData.valueAt(i);
+ j++;
+ }
+ }
+
+ return new SymbolCounts(symbols, values);
+ }
+
+ /**
+ * Returns a tooltip string showing residues in descending order of their
+ * percentage frequency in the profile
+ *
+ * @param normaliseBy
+ * the divisor for residue counts (may or may not include gapped
+ * sequence count)
+ * @param percentageDecPl
+ * the number of decimal places to show in percentages
+ * @return
+ */
+ public String getTooltip(int normaliseBy, int percentageDecPl)
+ {
+ SymbolCounts symbolCounts = getSymbolCounts();
+ char[] ca = symbolCounts.symbols;
+ int[] vl = symbolCounts.values;
+
+ /*
+ * sort characters into ascending order of their counts
+ */
+ QuickSort.sort(vl, ca);
+
+ /*
+ * traverse in reverse order (highest count first) to build tooltip
+ */
+ boolean first = true;
+ StringBuilder sb = new StringBuilder(64);
+ for (int c = ca.length - 1; c >= 0; c--)
+ {
+ final char residue = ca[c];
+ // TODO combine residues which share a percentage
+ // (see AAFrequency.completeCdnaConsensus)
+ float tval = (vl[c] * 100f) / normaliseBy;
+ sb.append(first ? "" : "; ").append(residue).append(" ");
+ Format.appendPercentage(sb, tval, percentageDecPl);
+ sb.append("%");
+ first = false;
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Returns a string representation of the symbol counts, for debug purposes.
+ */
+ @Override
+ public String toString()
+ {
+ StringBuilder sb = new StringBuilder();
+ sb.append("[ ");
+ SymbolCounts sc = getSymbolCounts();
+ for (int i = 0; i < sc.symbols.length; i++)
+ {
+ sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
+ }
+ sb.append("]");
+ return sb.toString();
+ }
+}