2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.datamodel;
23 import jalview.util.Comparison;
24 import jalview.util.Format;
25 import jalview.util.QuickSort;
26 import jalview.util.SparseCount;
28 import java.util.List;
31 * A class to count occurrences of residues in a profile, optimised for speed
32 * and memory footprint.
37 public class ResidueCount
40 * A data bean to hold the results of counting symbols
42 public class SymbolCounts
45 * the symbols seen (as char values), in no particular order
47 public final char[] symbols;
50 * the counts for each symbol, in the same order as the symbols
52 public final int[] values;
54 SymbolCounts(char[] s, int[] v)
61 private static final int TOUPPERCASE = 'A' - 'a';
64 * nucleotide symbols to count (including N unknown)
66 private static final String NUCS = "ACGNTU";
69 * amino acid symbols to count (including X unknown)
70 * NB we also include U so as to support counting of RNA bases
71 * in the "don't know" case of nucleotide / peptide
73 private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
75 private static final int GAP_COUNT = 0;
78 * fast lookup tables holding the index into our count
79 * arrays of each symbol; index 0 is reserved for gap counting
81 private static int[] NUC_INDEX = new int[26];
83 private static int[] AA_INDEX = new int[26];
86 for (int i = 0; i < NUCS.length(); i++)
88 NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
90 for (int i = 0; i < AAS.length(); i++)
92 AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
97 * counts array, just big enough for the nucleotide or peptide
98 * character set (plus gap counts in position 0)
100 private short[] counts;
103 * alternative array of int counts for use if any count
104 * exceeds the maximum value of short (32767)
106 private int[] intCounts;
109 * flag set if we switch from short to int counts
111 private boolean useIntCounts;
114 * general-purpose counter, only for use for characters
115 * that are not in the expected alphabet
117 private SparseCount otherData;
120 * keeps track of the maximum count value recorded
121 * (if this class ever allows decrements, would need to
122 * calculate this on request instead)
127 * if we think we are counting nucleotide, can get by with smaller
128 * array to hold counts
130 private boolean isNucleotide;
133 * Default constructor allocates arrays able to count either nucleotide or
134 * peptide bases. Use this constructor if not sure which the data is.
136 public ResidueCount()
142 * Constructor that allocates an array just big enough for the anticipated
143 * characters, plus one position to count gaps
145 public ResidueCount(boolean nucleotide)
147 isNucleotide = nucleotide;
148 int charsToCount = nucleotide ? NUCS.length() : AAS.length();
149 counts = new short[charsToCount + 1];
153 * A constructor that counts frequency of all symbols (including gaps) in the
154 * sequences (not case-sensitive)
158 public ResidueCount(List<SequenceI> sequences)
161 for (SequenceI seq : sequences)
163 for (int i = 0; i < seq.getLength(); i++)
165 add(seq.getCharAt(i));
171 * Increments the count for the given character. The supplied character may be
172 * upper or lower case but counts are for the upper case only. Gap characters
173 * (space, ., -) are all counted together.
176 * @return the new value of the count for the character
178 public int add(final char c)
180 char u = toUpperCase(c);
182 int offset = getOffset(u);
185 * offset 0 is reserved for gap counting, so 0 here means either
186 * an unexpected character, or a gap character passed in error
190 if (Comparison.isGap(u))
196 newValue = addOtherCharacter(u);
201 newValue = increment(offset);
207 * Increment the count at the specified offset. If this would result in short
208 * overflow, promote to counting int values instead.
211 * @return the new value of the count at this offset
213 int increment(int offset)
218 newValue = intCounts[offset];
219 intCounts[offset] = ++newValue;
223 if (counts[offset] == Short.MAX_VALUE)
226 newValue = intCounts[offset];
227 intCounts[offset] = ++newValue;
231 newValue = counts[offset];
232 counts[offset] = (short) ++newValue;
235 maxCount = Math.max(maxCount, newValue);
240 * Switch from counting in short to counting in int
242 synchronized void handleOverflow()
244 intCounts = new int[counts.length];
245 for (int i = 0; i < counts.length; i++)
247 intCounts[i] = counts[i];
254 * Returns this character's offset in the count array
259 int getOffset(char c)
262 if ('A' <= c && c <= 'Z')
264 offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
273 protected char toUpperCase(final char c)
276 if ('a' <= c && c <= 'z')
278 u = (char) (c + TOUPPERCASE);
284 * Increment count for some unanticipated character. The first time this
285 * called, a SparseCount is instantiated to hold these 'extra' counts.
288 * @return the new value of the count for the character
290 int addOtherCharacter(char c)
292 if (otherData == null)
294 otherData = new SparseCount();
296 int newValue = otherData.add(c, 1);
297 maxCount = Math.max(maxCount, newValue);
302 * Set count for some unanticipated character. The first time this called, a
303 * SparseCount is instantiated to hold these 'extra' counts.
308 void setOtherCharacter(char c, int value)
310 if (otherData == null)
312 otherData = new SparseCount();
314 otherData.put(c, value);
318 * Increment count of gap characters
320 * @return the new count of gaps
327 newValue = ++intCounts[GAP_COUNT];
331 newValue = ++counts[GAP_COUNT];
337 * Answers true if we are counting ints (only after overflow of short counts)
341 boolean isCountingInts()
347 * Sets the count for the given character. The supplied character may be upper
348 * or lower case but counts are for the upper case only.
353 public void put(char c, int count)
355 char u = toUpperCase(c);
356 int offset = getOffset(u);
359 * offset 0 is reserved for gap counting, so 0 here means either
360 * an unexpected character, or a gap character passed in error
364 if (Comparison.isGap(u))
370 setOtherCharacter(u, count);
371 maxCount = Math.max(maxCount, count);
377 maxCount = Math.max(maxCount, count);
382 * Sets the count at the specified offset. If this would result in short
383 * overflow, promote to counting int values instead.
388 void set(int offset, int value)
392 intCounts[offset] = value;
396 if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
399 intCounts[offset] = value;
403 counts[offset] = (short) value;
409 * Returns the count for the given character, or zero if no count held
414 public int getCount(char c)
416 char u = toUpperCase(c);
417 int offset = getOffset(u);
420 if (!Comparison.isGap(u))
422 // should have called getGapCount()
423 return otherData == null ? 0 : otherData.get(u);
426 return useIntCounts ? intCounts[offset] : counts[offset];
429 public int getGapCount()
431 return useIntCounts ? intCounts[0] : counts[0];
435 * Answers true if this object wraps a counter for unexpected characters
439 boolean isUsingOtherData()
441 return otherData != null;
445 * Returns the character (or concatenated characters) for the symbol(s) with
446 * the given count in the profile. Can be used to get the modal residue by
447 * supplying the modal count value. Returns an empty string if no symbol has
448 * the given count. The symbols are in alphabetic order of standard peptide or
449 * nucleotide characters, followed by 'other' symbols if any.
453 public String getResiduesForCount(int count)
461 * find counts for the given value and append the
462 * corresponding symbol
464 StringBuilder modal = new StringBuilder();
467 for (int i = 1; i < intCounts.length; i++)
469 if (intCounts[i] == count)
472 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
478 for (int i = 1; i < counts.length; i++)
480 if (counts[i] == count)
483 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
487 if (otherData != null)
489 for (int i = 0; i < otherData.size(); i++)
491 if (otherData.valueAt(i) == count)
493 modal.append((char) otherData.keyAt(i));
497 return modal.toString();
501 * Returns the highest count for any symbol(s) in the profile (excluding gap)
505 public int getModalCount()
511 * Returns the number of distinct symbols with a non-zero count (excluding the
521 for (int i = 1; i < intCounts.length; i++)
523 if (intCounts[i] > 0)
531 for (int i = 1; i < counts.length; i++)
541 * include 'other' characters recorded (even if count is zero
542 * though that would be a strange use case)
544 if (otherData != null)
546 size += otherData.size();
553 * Returns a data bean holding those symbols that have a non-zero count
554 * (excluding the gap symbol), with their counts.
558 public SymbolCounts getSymbolCounts()
561 char[] symbols = new char[size];
562 int[] values = new int[size];
567 for (int i = 1; i < intCounts.length; i++)
569 if (intCounts[i] > 0)
571 char symbol = isNucleotide ? NUCS.charAt(i - 1)
574 values[j] = intCounts[i];
581 for (int i = 1; i < counts.length; i++)
585 char symbol = isNucleotide ? NUCS.charAt(i - 1)
588 values[j] = counts[i];
593 if (otherData != null)
595 for (int i = 0; i < otherData.size(); i++)
597 symbols[j] = (char) otherData.keyAt(i);
598 values[j] = otherData.valueAt(i);
603 return new SymbolCounts(symbols, values);
607 * Returns a tooltip string showing residues in descending order of their
608 * percentage frequency in the profile
611 * the divisor for residue counts (may or may not include gapped
613 * @param percentageDecPl
614 * the number of decimal places to show in percentages
617 public String getTooltip(int normaliseBy, int percentageDecPl)
619 SymbolCounts symbolCounts = getSymbolCounts();
620 char[] ca = symbolCounts.symbols;
621 int[] vl = symbolCounts.values;
624 * sort characters into ascending order of their counts
626 QuickSort.sort(vl, ca);
629 * traverse in reverse order (highest count first) to build tooltip
631 boolean first = true;
632 StringBuilder sb = new StringBuilder(64);
633 for (int c = ca.length - 1; c >= 0; c--)
635 final char residue = ca[c];
636 // TODO combine residues which share a percentage
637 // (see AAFrequency.completeCdnaConsensus)
638 float tval = (vl[c] * 100f) / normaliseBy;
639 sb.append(first ? "" : "; ").append(residue).append(" ");
640 Format.appendPercentage(sb, tval, percentageDecPl);
644 return sb.toString();
648 * Returns a string representation of the symbol counts, for debug purposes.
651 public String toString()
653 StringBuilder sb = new StringBuilder();
655 SymbolCounts sc = getSymbolCounts();
656 for (int i = 0; i < sc.symbols.length; i++)
658 sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
661 return sb.toString();
665 * Answers the total count for all symbols (excluding gaps)
669 public int getTotalResidueCount()
672 for (char symbol : this.getSymbolCounts().symbols)
674 total += getCount(symbol);