X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2FResidueCount.java;fp=src%2Fjalview%2Fdatamodel%2FResidueCount.java;h=3e3a9666ddf2061e6d19e9f9fb30ca22b63136e2;hb=2595e9d4ee0dbbd3406a98c4e49a61ccde806479;hp=0000000000000000000000000000000000000000;hpb=e20075ba805d744d7cc4976e2b8d5e5840fb0a8d;p=jalview.git diff --git a/src/jalview/datamodel/ResidueCount.java b/src/jalview/datamodel/ResidueCount.java new file mode 100644 index 0000000..3e3a966 --- /dev/null +++ b/src/jalview/datamodel/ResidueCount.java @@ -0,0 +1,641 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ +package jalview.datamodel; + +import jalview.util.Comparison; +import jalview.util.Format; +import jalview.util.QuickSort; +import jalview.util.SparseCount; + +/** + * A class to count occurrences of residues in a profile, optimised for speed + * and memory footprint. + * @author gmcarstairs + * + */ +public class ResidueCount +{ + /** + * A data bean to hold the results of counting symbols + */ + public class SymbolCounts + { + /** + * the symbols seen (as char values), in no particular order + */ + public final char[] symbols; + + /** + * the counts for each symbol, in the same order as the symbols + */ + public final int[] values; + + SymbolCounts(char[] s, int[] v) + { + symbols = s; + values = v; + } + } + + private static final int TOUPPERCASE = 'A' - 'a'; + + /* + * nucleotide symbols to count (including N unknown) + */ + private static final String NUCS = "ACGNTU"; + + /* + * amino acid symbols to count (including X unknown) + * NB we also include U so as to support counting of RNA bases + * in the "don't know" case of nucleotide / peptide + */ + private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY"; + + private static final int GAP_COUNT = 0; + + /* + * fast lookup tables holding the index into our count + * arrays of each symbol; index 0 is reserved for gap counting + */ + private static int[] NUC_INDEX = new int[26]; + + private static int[] AA_INDEX = new int[26]; + static + { + for (int i = 0; i < NUCS.length(); i++) + { + NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1; + } + for (int i = 0; i < AAS.length(); i++) + { + AA_INDEX[AAS.charAt(i) - 'A'] = i + 1; + } + } + + /* + * counts array, just big enough for the nucleotide or peptide + * character set (plus gap counts in position 0) + */ + private short[] counts; + + /* + * alternative array of int counts for use if any count + * exceeds the maximum value of short (32767) + */ + private int[] intCounts; + + /* + * flag set if we switch from short to int counts + */ + private boolean useIntCounts; + + /* + * general-purpose counter, only for use for characters + * that are not in the expected alphabet + */ + private SparseCount otherData; + + /* + * keeps track of the maximum count value recorded + * (if this class ever allows decrements, would need to + * calculate this on request instead) + */ + int maxCount; + + /* + * if we think we are counting nucleotide, can get by with smaller + * array to hold counts + */ + private boolean isNucleotide; + + /** + * Default constructor allocates arrays able to count either nucleotide or + * peptide bases. Use this constructor if not sure which the data is. + */ + public ResidueCount() + { + this(false); + } + + /** + * Constructor that allocates an array just big enough for the anticipated + * characters, plus one position to count gaps + */ + public ResidueCount(boolean nucleotide) + { + isNucleotide = nucleotide; + int charsToCount = nucleotide ? NUCS.length() : AAS.length(); + counts = new short[charsToCount + 1]; + } + + /** + * Increments the count for the given character. The supplied character may be + * upper or lower case but counts are for the upper case only. Gap characters + * (space, ., -) are all counted together. + * + * @param c + * @return the new value of the count for the character + */ + public int add(final char c) + { + char u = toUpperCase(c); + int newValue = 0; + int offset = getOffset(u); + + /* + * offset 0 is reserved for gap counting, so 0 here means either + * an unexpected character, or a gap character passed in error + */ + if (offset == 0) + { + if (Comparison.isGap(u)) + { + newValue = addGap(); + } + else + { + newValue = addOtherCharacter(u); + } + } + else + { + newValue = increment(offset); + } + return newValue; + } + + /** + * Increment the count at the specified offset. If this would result in short + * overflow, promote to counting int values instead. + * + * @param offset + * @return the new value of the count at this offset + */ + int increment(int offset) + { + int newValue = 0; + if (useIntCounts) + { + newValue = intCounts[offset]; + intCounts[offset] = ++newValue; + } + else + { + if (counts[offset] == Short.MAX_VALUE) + { + handleOverflow(); + newValue = intCounts[offset]; + intCounts[offset] = ++newValue; + } + else + { + newValue = counts[offset]; + counts[offset] = (short) ++newValue; + } + } + maxCount = Math.max(maxCount, newValue); + return newValue; + } + + /** + * Switch from counting in short to counting in int + */ + synchronized void handleOverflow() + { + intCounts = new int[counts.length]; + for (int i = 0; i < counts.length; i++) + { + intCounts[i] = counts[i]; + } + counts = null; + useIntCounts = true; + } + + /** + * Returns this character's offset in the count array + * + * @param c + * @return + */ + int getOffset(char c) + { + int offset = 0; + if ('A' <= c && c <= 'Z') + { + offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A']; + } + return offset; + } + + /** + * @param c + * @return + */ + protected char toUpperCase(final char c) + { + char u = c; + if ('a' <= c && c <= 'z') + { + u = (char) (c + TOUPPERCASE); + } + return u; + } + + /** + * Increment count for some unanticipated character. The first time this + * called, a SparseCount is instantiated to hold these 'extra' counts. + * + * @param c + * @return the new value of the count for the character + */ + int addOtherCharacter(char c) + { + if (otherData == null) + { + otherData = new SparseCount(); + } + int newValue = otherData.add(c, 1); + maxCount = Math.max(maxCount, newValue); + return newValue; + } + + /** + * Set count for some unanticipated character. The first time this called, a + * SparseCount is instantiated to hold these 'extra' counts. + * + * @param c + * @param value + */ + void setOtherCharacter(char c, int value) + { + if (otherData == null) + { + otherData = new SparseCount(); + } + otherData.put(c, value); + } + + /** + * Increment count of gap characters + * + * @return the new count of gaps + */ + public int addGap() + { + int newValue; + if (useIntCounts) + { + newValue = ++intCounts[GAP_COUNT]; + } + else + { + newValue = ++counts[GAP_COUNT]; + } + return newValue; + } + + /** + * Answers true if we are counting ints (only after overflow of short counts) + * + * @return + */ + boolean isCountingInts() + { + return useIntCounts; + } + + /** + * Sets the count for the given character. The supplied character may be upper + * or lower case but counts are for the upper case only. + * + * @param c + * @param count + */ + public void put(char c, int count) + { + char u = toUpperCase(c); + int offset = getOffset(u); + + /* + * offset 0 is reserved for gap counting, so 0 here means either + * an unexpected character, or a gap character passed in error + */ + if (offset == 0) + { + if (Comparison.isGap(u)) + { + set(0, count); + } + else + { + setOtherCharacter(u, count); + maxCount = Math.max(maxCount, count); + } + } + else + { + set(offset, count); + maxCount = Math.max(maxCount, count); + } + } + + /** + * Sets the count at the specified offset. If this would result in short + * overflow, promote to counting int values instead. + * + * @param offset + * @param value + */ + void set(int offset, int value) + { + if (useIntCounts) + { + intCounts[offset] = value; + } + else + { + if (value > Short.MAX_VALUE || value < Short.MIN_VALUE) + { + handleOverflow(); + intCounts[offset] = value; + } + else + { + counts[offset] = (short) value; + } + } + } + + /** + * Returns the count for the given character, or zero if no count held + * + * @param c + * @return + */ + public int getCount(char c) + { + char u = toUpperCase(c); + int offset = getOffset(u); + if (offset == 0) + { + if (!Comparison.isGap(u)) + { + // should have called getGapCount() + return otherData == null ? 0 : otherData.get(u); + } + } + return useIntCounts ? intCounts[offset] : counts[offset]; + } + + public int getGapCount() + { + return useIntCounts ? intCounts[0] : counts[0]; + } + + /** + * Answers true if this object wraps a counter for unexpected characters + * + * @return + */ + boolean isUsingOtherData() + { + return otherData != null; + } + + /** + * Returns the character (or concatenated characters) for the symbol(s) with + * the given count in the profile. Can be used to get the modal residue by + * supplying the modal count value. Returns an empty string if no symbol has + * the given count. The symbols are in alphabetic order of standard peptide or + * nucleotide characters, followed by 'other' symbols if any. + * + * @return + */ + public String getResiduesForCount(int count) + { + if (count == 0) + { + return ""; + } + + /* + * find counts for the given value and append the + * corresponding symbol + */ + StringBuilder modal = new StringBuilder(); + if (useIntCounts) + { + for (int i = 1; i < intCounts.length; i++) + { + if (intCounts[i] == count) + { + modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS + .charAt(i - 1)); + } + } + } + else + { + for (int i = 1; i < counts.length; i++) + { + if (counts[i] == count) + { + modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS + .charAt(i - 1)); + } + } + } + if (otherData != null) + { + for (int i = 0; i < otherData.size(); i++) + { + if (otherData.valueAt(i) == count) + { + modal.append((char) otherData.keyAt(i)); + } + } + } + return modal.toString(); + } + + /** + * Returns the highest count for any symbol(s) in the profile (excluding gap) + * + * @return + */ + public int getModalCount() + { + return maxCount; + } + + /** + * Returns the number of distinct symbols with a non-zero count (excluding the + * gap symbol) + * + * @return + */ + public int size() { + int size = 0; + if (useIntCounts) + { + for (int i = 1; i < intCounts.length; i++) + { + if (intCounts[i] > 0) + { + size++; + } + } + } + else + { + for (int i = 1; i < counts.length; i++) + { + if (counts[i] > 0) + { + size++; + } + } + } + + /* + * include 'other' characters recorded (even if count is zero + * though that would be a strange use case) + */ + if (otherData != null) + { + size += otherData.size(); + } + + return size; + } + + /** + * Returns a data bean holding those symbols that have a non-zero count + * (excluding the gap symbol), with their counts. + * + * @return + */ + public SymbolCounts getSymbolCounts() + { + int size = size(); + char[] symbols = new char[size]; + int[] values = new int[size]; + int j = 0; + + if (useIntCounts) + { + for (int i = 1; i < intCounts.length; i++) + { + if (intCounts[i] > 0) + { + char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS + .charAt(i - 1); + symbols[j] = symbol; + values[j] = intCounts[i]; + j++; + } + } + } + else + { + for (int i = 1; i < counts.length; i++) + { + if (counts[i] > 0) + { + char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS + .charAt(i - 1); + symbols[j] = symbol; + values[j] = counts[i]; + j++; + } + } + } + if (otherData != null) + { + for (int i = 0; i < otherData.size(); i++) + { + symbols[j] = (char) otherData.keyAt(i); + values[j] = otherData.valueAt(i); + j++; + } + } + + return new SymbolCounts(symbols, values); + } + + /** + * Returns a tooltip string showing residues in descending order of their + * percentage frequency in the profile + * + * @param normaliseBy + * the divisor for residue counts (may or may not include gapped + * sequence count) + * @param percentageDecPl + * the number of decimal places to show in percentages + * @return + */ + public String getTooltip(int normaliseBy, int percentageDecPl) + { + SymbolCounts symbolCounts = getSymbolCounts(); + char[] ca = symbolCounts.symbols; + int[] vl = symbolCounts.values; + + /* + * sort characters into ascending order of their counts + */ + QuickSort.sort(vl, ca); + + /* + * traverse in reverse order (highest count first) to build tooltip + */ + boolean first = true; + StringBuilder sb = new StringBuilder(64); + for (int c = ca.length - 1; c >= 0; c--) + { + final char residue = ca[c]; + // TODO combine residues which share a percentage + // (see AAFrequency.completeCdnaConsensus) + float tval = (vl[c] * 100f) / normaliseBy; + sb.append(first ? "" : "; ").append(residue).append(" "); + Format.appendPercentage(sb, tval, percentageDecPl); + sb.append("%"); + first = false; + } + return sb.toString(); + } + + /** + * Returns a string representation of the symbol counts, for debug purposes. + */ + @Override + public String toString() + { + StringBuilder sb = new StringBuilder(); + sb.append("[ "); + SymbolCounts sc = getSymbolCounts(); + for (int i = 0; i < sc.symbols.length; i++) + { + sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" "); + } + sb.append("]"); + return sb.toString(); + } +}