+++ /dev/null
-package jalview.datamodel;
-
-import jalview.util.Comparison;
-import jalview.util.Format;
-import jalview.util.QuickSort;
-import jalview.util.SparseCount;
-
-/**
- * A class to count occurrences of residues in a profile, optimised for speed
- * and memory footprint.
- * @author gmcarstairs
- *
- */
-public class ResidueCount
-{
- /**
- * A data bean to hold the results of counting symbols
- */
- public class SymbolCounts
- {
- /**
- * the symbols seen (as char values), in no particular order
- */
- public final char[] symbols;
-
- /**
- * the counts for each symbol, in the same order as the symbols
- */
- public final int[] values;
-
- SymbolCounts(char[] s, int[] v)
- {
- symbols = s;
- values = v;
- }
- }
-
- private static final int TOUPPERCASE = 'A' - 'a';
-
- /*
- * nucleotide symbols to count (including N unknown)
- */
- private static final String NUCS = "ACGNTU";
-
- /*
- * amino acid symbols to count (including X unknown)
- * NB we also include U so as to support counting of RNA bases
- * in the "don't know" case of nucleotide / peptide
- */
- private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
-
- private static final int GAP_COUNT = 0;
-
- /*
- * fast lookup tables holding the index into our count
- * arrays of each symbol; index 0 is reserved for gap counting
- */
- private static int[] NUC_INDEX = new int[26];
-
- private static int[] AA_INDEX = new int[26];
- static
- {
- for (int i = 0; i < NUCS.length(); i++)
- {
- NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
- }
- for (int i = 0; i < AAS.length(); i++)
- {
- AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
- }
- }
-
- /*
- * counts array, just big enough for the nucleotide or peptide
- * character set (plus gap counts in position 0)
- */
- private short[] counts;
-
- /*
- * alternative array of int counts for use if any count
- * exceeds the maximum value of short (32767)
- */
- private int[] intCounts;
-
- /*
- * flag set if we switch from short to int counts
- */
- private boolean useIntCounts;
-
- /*
- * general-purpose counter, only for use for characters
- * that are not in the expected alphabet
- */
- private SparseCount otherData;
-
- /*
- * keeps track of the maximum count value recorded
- * (if this class ever allows decrements, would need to
- * calculate this on request instead)
- */
- int maxCount;
-
- /*
- * if we think we are counting nucleotide, can get by with smaller
- * array to hold counts
- */
- private boolean isNucleotide;
-
- /**
- * Default constructor allocates arrays able to count either nucleotide or
- * peptide bases. Use this constructor if not sure which the data is.
- */
- public ResidueCount()
- {
- this(false);
- }
-
- /**
- * Constructor that allocates an array just big enough for the anticipated
- * characters, plus one position to count gaps
- */
- public ResidueCount(boolean nucleotide)
- {
- isNucleotide = nucleotide;
- int charsToCount = nucleotide ? NUCS.length() : AAS.length();
- counts = new short[charsToCount + 1];
- }
-
- /**
- * Increments the count for the given character. The supplied character may be
- * upper or lower case but counts are for the upper case only. Gap characters
- * (space, ., -) are all counted together.
- *
- * @param c
- * @return the new value of the count for the character
- */
- public int add(final char c)
- {
- char u = toUpperCase(c);
- int newValue = 0;
- int offset = getOffset(u);
-
- /*
- * offset 0 is reserved for gap counting, so 0 here means either
- * an unexpected character, or a gap character passed in error
- */
- if (offset == 0)
- {
- if (Comparison.isGap(u))
- {
- newValue = addGap();
- }
- else
- {
- newValue = addOtherCharacter(u);
- }
- }
- else
- {
- newValue = increment(offset);
- }
- return newValue;
- }
-
- /**
- * Increment the count at the specified offset. If this would result in short
- * overflow, promote to counting int values instead.
- *
- * @param offset
- * @return the new value of the count at this offset
- */
- int increment(int offset)
- {
- int newValue = 0;
- if (useIntCounts)
- {
- newValue = intCounts[offset];
- intCounts[offset] = ++newValue;
- }
- else
- {
- if (counts[offset] == Short.MAX_VALUE)
- {
- handleOverflow();
- newValue = intCounts[offset];
- intCounts[offset] = ++newValue;
- }
- else
- {
- newValue = counts[offset];
- counts[offset] = (short) ++newValue;
- }
- }
- maxCount = Math.max(maxCount, newValue);
- return newValue;
- }
-
- /**
- * Switch from counting in short to counting in int
- */
- synchronized void handleOverflow()
- {
- intCounts = new int[counts.length];
- for (int i = 0; i < counts.length; i++)
- {
- intCounts[i] = counts[i];
- }
- counts = null;
- useIntCounts = true;
- }
-
- /**
- * Returns this character's offset in the count array
- *
- * @param c
- * @return
- */
- int getOffset(char c)
- {
- int offset = 0;
- if ('A' <= c && c <= 'Z')
- {
- offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
- }
- return offset;
- }
-
- /**
- * @param c
- * @return
- */
- protected char toUpperCase(final char c)
- {
- char u = c;
- if ('a' <= c && c <= 'z')
- {
- u = (char) (c + TOUPPERCASE);
- }
- return u;
- }
-
- /**
- * Increment count for some unanticipated character. The first time this
- * called, a SparseCount is instantiated to hold these 'extra' counts.
- *
- * @param c
- * @return the new value of the count for the character
- */
- int addOtherCharacter(char c)
- {
- if (otherData == null)
- {
- otherData = new SparseCount();
- }
- int newValue = otherData.add(c, 1);
- maxCount = Math.max(maxCount, newValue);
- return newValue;
- }
-
- /**
- * Set count for some unanticipated character. The first time this called, a
- * SparseCount is instantiated to hold these 'extra' counts.
- *
- * @param c
- * @param value
- */
- void setOtherCharacter(char c, int value)
- {
- if (otherData == null)
- {
- otherData = new SparseCount();
- }
- otherData.put(c, value);
- }
-
- /**
- * Increment count of gap characters
- *
- * @return the new count of gaps
- */
- public int addGap()
- {
- int newValue;
- if (useIntCounts)
- {
- newValue = ++intCounts[GAP_COUNT];
- }
- else
- {
- newValue = ++counts[GAP_COUNT];
- }
- return newValue;
- }
-
- /**
- * Answers true if we are counting ints (only after overflow of short counts)
- *
- * @return
- */
- boolean isCountingInts()
- {
- return useIntCounts;
- }
-
- /**
- * Sets the count for the given character. The supplied character may be upper
- * or lower case but counts are for the upper case only.
- *
- * @param c
- * @param count
- */
- public void put(char c, int count)
- {
- char u = toUpperCase(c);
- int offset = getOffset(u);
-
- /*
- * offset 0 is reserved for gap counting, so 0 here means either
- * an unexpected character, or a gap character passed in error
- */
- if (offset == 0)
- {
- if (Comparison.isGap(u))
- {
- set(0, count);
- }
- else
- {
- setOtherCharacter(u, count);
- maxCount = Math.max(maxCount, count);
- }
- }
- else
- {
- set(offset, count);
- maxCount = Math.max(maxCount, count);
- }
- }
-
- /**
- * Sets the count at the specified offset. If this would result in short
- * overflow, promote to counting int values instead.
- *
- * @param offset
- * @param value
- */
- void set(int offset, int value)
- {
- if (useIntCounts)
- {
- intCounts[offset] = value;
- }
- else
- {
- if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
- {
- handleOverflow();
- intCounts[offset] = value;
- }
- else
- {
- counts[offset] = (short) value;
- }
- }
- }
-
- /**
- * Returns the count for the given character, or zero if no count held
- *
- * @param c
- * @return
- */
- public int getCount(char c)
- {
- char u = toUpperCase(c);
- int offset = getOffset(u);
- if (offset == 0)
- {
- if (!Comparison.isGap(u))
- {
- // should have called getGapCount()
- return otherData == null ? 0 : otherData.get(u);
- }
- }
- return useIntCounts ? intCounts[offset] : counts[offset];
- }
-
- public int getGapCount()
- {
- return useIntCounts ? intCounts[0] : counts[0];
- }
-
- /**
- * Answers true if this object wraps a counter for unexpected characters
- *
- * @return
- */
- boolean isUsingOtherData()
- {
- return otherData != null;
- }
-
- /**
- * Returns the character (or concatenated characters) for the symbol(s) with
- * the given count in the profile. Can be used to get the modal residue by
- * supplying the modal count value. Returns an empty string if no symbol has
- * the given count. The symbols are in alphabetic order of standard peptide or
- * nucleotide characters, followed by 'other' symbols if any.
- *
- * @return
- */
- public String getResiduesForCount(int count)
- {
- if (count == 0)
- {
- return "";
- }
-
- /*
- * find counts for the given value and append the
- * corresponding symbol
- */
- StringBuilder modal = new StringBuilder();
- if (useIntCounts)
- {
- for (int i = 1; i < intCounts.length; i++)
- {
- if (intCounts[i] == count)
- {
- modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
- .charAt(i - 1));
- }
- }
- }
- else
- {
- for (int i = 1; i < counts.length; i++)
- {
- if (counts[i] == count)
- {
- modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
- .charAt(i - 1));
- }
- }
- }
- if (otherData != null)
- {
- for (int i = 0; i < otherData.size(); i++)
- {
- if (otherData.valueAt(i) == count)
- {
- modal.append((char) otherData.keyAt(i));
- }
- }
- }
- return modal.toString();
- }
-
- /**
- * Returns the highest count for any symbol(s) in the profile (excluding gap)
- *
- * @return
- */
- public int getModalCount()
- {
- return maxCount;
- }
-
- /**
- * Returns the number of distinct symbols with a non-zero count (excluding the
- * gap symbol)
- *
- * @return
- */
- public int size() {
- int size = 0;
- if (useIntCounts)
- {
- for (int i = 1; i < intCounts.length; i++)
- {
- if (intCounts[i] > 0)
- {
- size++;
- }
- }
- }
- else
- {
- for (int i = 1; i < counts.length; i++)
- {
- if (counts[i] > 0)
- {
- size++;
- }
- }
- }
-
- /*
- * include 'other' characters recorded (even if count is zero
- * though that would be a strange use case)
- */
- if (otherData != null)
- {
- size += otherData.size();
- }
-
- return size;
- }
-
- /**
- * Returns a data bean holding those symbols that have a non-zero count
- * (excluding the gap symbol), with their counts.
- *
- * @return
- */
- public SymbolCounts getSymbolCounts()
- {
- int size = size();
- char[] symbols = new char[size];
- int[] values = new int[size];
- int j = 0;
-
- if (useIntCounts)
- {
- for (int i = 1; i < intCounts.length; i++)
- {
- if (intCounts[i] > 0)
- {
- char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
- .charAt(i - 1);
- symbols[j] = symbol;
- values[j] = intCounts[i];
- j++;
- }
- }
- }
- else
- {
- for (int i = 1; i < counts.length; i++)
- {
- if (counts[i] > 0)
- {
- char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
- .charAt(i - 1);
- symbols[j] = symbol;
- values[j] = counts[i];
- j++;
- }
- }
- }
- if (otherData != null)
- {
- for (int i = 0; i < otherData.size(); i++)
- {
- symbols[j] = (char) otherData.keyAt(i);
- values[j] = otherData.valueAt(i);
- j++;
- }
- }
-
- return new SymbolCounts(symbols, values);
- }
-
- /**
- * Returns a tooltip string showing residues in descending order of their
- * percentage frequency in the profile
- *
- * @param normaliseBy
- * the divisor for residue counts (may or may not include gapped
- * sequence count)
- * @param percentageDecPl
- * the number of decimal places to show in percentages
- * @return
- */
- public String getTooltip(int normaliseBy, int percentageDecPl)
- {
- SymbolCounts symbolCounts = getSymbolCounts();
- char[] ca = symbolCounts.symbols;
- int[] vl = symbolCounts.values;
-
- /*
- * sort characters into ascending order of their counts
- */
- QuickSort.sort(vl, ca);
-
- /*
- * traverse in reverse order (highest count first) to build tooltip
- */
- boolean first = true;
- StringBuilder sb = new StringBuilder(64);
- for (int c = ca.length - 1; c >= 0; c--)
- {
- final char residue = ca[c];
- // TODO combine residues which share a percentage
- // (see AAFrequency.completeCdnaConsensus)
- float tval = (vl[c] * 100f) / normaliseBy;
- sb.append(first ? "" : "; ").append(residue).append(" ");
- Format.appendPercentage(sb, tval, percentageDecPl);
- sb.append("%");
- first = false;
- }
- return sb.toString();
- }
-
- /**
- * Returns a string representation of the symbol counts, for debug purposes.
- */
- @Override
- public String toString()
- {
- StringBuilder sb = new StringBuilder();
- sb.append("[ ");
- SymbolCounts sc = getSymbolCounts();
- for (int i = 0; i < sc.symbols.length; i++)
- {
- sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
- }
- sb.append("]");
- return sb.toString();
- }
-}