/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License 
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *  
 * Jalview is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
 * PURPOSE.  See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
21 package jalview.datamodel;
23 import jalview.util.Comparison;
24 import jalview.util.Format;
25 import jalview.util.QuickSort;
26 import jalview.util.SparseCount;
29 * A class to count occurrences of residues in a profile, optimised for speed
30 * and memory footprint.
35 public class SecondaryStructureCount
38 * A data bean to hold the results of counting symbols
40 public class SymbolCounts
43 * the symbols seen (as char values), in no particular order
45 public final char[] symbols;
48 * the counts for each symbol, in the same order as the symbols
50 public final int[] values;
52 SymbolCounts(char[] s, int[] v)
59 private static final int TOUPPERCASE = 'A' - 'a';
62 * nucleotide symbols to count (including N unknown)
64 private static final String SS_SYMBOLS = "HEC";
67 static final int GAP_COUNT = 0;
70 * fast lookup tables holding the index into our count
71 * arrays of each symbol; index 0 is reserved for gap counting
73 private static int[] SS_INDEX = new int[26];
77 for (int i = 0; i < SS_SYMBOLS.length(); i++)
79 SS_INDEX[SS_SYMBOLS.charAt(i) - 'A'] = i + 1;
84 * counts array, just big enough for the nucleotide or peptide
85 * character set (plus gap counts in position 0)
87 private short[] counts;
90 * alternative array of int counts for use if any count
91 * exceeds the maximum value of short (32767)
93 private int[] intCounts;
96 * flag set if we switch from short to int counts
98 private boolean useIntCounts;
101 * general-purpose counter, only for use for characters
102 * that are not in the expected alphabet
104 private SparseCount otherData;
107 * keeps track of the maximum count value recorded
108 * (if this class ever allows decrements, would need to
109 * calculate this on request instead)
114 * Constructor that allocates an array just big enough for the anticipated
115 * characters, plus one position to count gaps
117 public SecondaryStructureCount()
120 int charsToCount = SS_SYMBOLS.length();
121 counts = new short[charsToCount + 1];
125 * Increments the count for the given character. The supplied character may be
126 * upper or lower case but counts are for the upper case only. Gap characters
127 * (space, ., -) are all counted together.
130 * @return the new value of the count for the character
132 public int add(final char c)
134 char u = toUpperCase(c);
136 int offset = getOffset(u);
139 * offset 0 is reserved for gap counting, so 0 here means either
140 * an unexpected character, or a gap character passed in error
144 if (Comparison.isGap(u))
150 newValue = addOtherCharacter(u);
155 newValue = increment(offset);
161 * Increment the count at the specified offset. If this would result in short
162 * overflow, promote to counting int values instead.
165 * @return the new value of the count at this offset
167 int increment(int offset)
172 newValue = intCounts[offset];
173 intCounts[offset] = ++newValue;
177 if (counts[offset] == Short.MAX_VALUE)
180 newValue = intCounts[offset];
181 intCounts[offset] = ++newValue;
185 newValue = counts[offset];
186 counts[offset] = (short) ++newValue;
190 if (offset != GAP_COUNT)
192 // update modal residue count
193 maxCount = Math.max(maxCount, newValue);
199 * Switch from counting in short to counting in int
201 synchronized void handleOverflow()
203 intCounts = new int[counts.length];
204 for (int i = 0; i < counts.length; i++)
206 intCounts[i] = counts[i];
213 * Returns this character's offset in the count array
218 int getOffset(char c)
221 if ('A' <= c && c <= 'Z')
223 offset = SS_INDEX[c - 'A'];
232 protected char toUpperCase(final char c)
235 if ('a' <= c && c <= 'z')
237 u = (char) (c + TOUPPERCASE);
243 * Increment count for some unanticipated character. The first time this
244 * called, a SparseCount is instantiated to hold these 'extra' counts.
247 * @return the new value of the count for the character
249 int addOtherCharacter(char c)
251 if (otherData == null)
253 otherData = new SparseCount();
255 int newValue = otherData.add(c, 1);
256 maxCount = Math.max(maxCount, newValue);
261 * Set count for some unanticipated character. The first time this called, a
262 * SparseCount is instantiated to hold these 'extra' counts.
267 void setOtherCharacter(char c, int value)
269 if (otherData == null)
271 otherData = new SparseCount();
273 otherData.put(c, value);
277 * Increment count of gap characters
279 * @return the new count of gaps
283 int newValue = increment(GAP_COUNT);
288 * Answers true if we are counting ints (only after overflow of short counts)
292 boolean isCountingInts()
298 * Sets the count for the given character. The supplied character may be upper
299 * or lower case but counts are for the upper case only.
304 public void put(char c, int count)
306 char u = toUpperCase(c);
307 int offset = getOffset(u);
310 * offset 0 is reserved for gap counting, so 0 here means either
311 * an unexpected character, or a gap character passed in error
315 if (Comparison.isGap(u))
321 setOtherCharacter(u, count);
322 maxCount = Math.max(maxCount, count);
328 maxCount = Math.max(maxCount, count);
333 * Sets the count at the specified offset. If this would result in short
334 * overflow, promote to counting int values instead.
339 void set(int offset, int value)
343 intCounts[offset] = value;
347 if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
350 intCounts[offset] = value;
354 counts[offset] = (short) value;
360 * Returns the count for the given character, or zero if no count held
365 public int getCount(char c)
367 char u = toUpperCase(c);
368 int offset = getOffset(u);
371 if (!Comparison.isGap(u))
373 // should have called getGapCount()
374 return otherData == null ? 0 : otherData.get(u);
377 return useIntCounts ? intCounts[offset] : counts[offset];
380 public int getGapCount()
382 return useIntCounts ? intCounts[0] : counts[0];
386 * Answers true if this object wraps a counter for unexpected characters
390 boolean isUsingOtherData()
392 return otherData != null;
396 * Returns the character (or concatenated characters) for the symbol(s) with
397 * the given count in the profile. Can be used to get the modal residue by
398 * supplying the modal count value. Returns an empty string if no symbol has
399 * the given count. The symbols are in alphabetic order of standard peptide or
400 * nucleotide characters, followed by 'other' symbols if any.
404 public String getSSForCount(int count)
412 * find counts for the given value and append the
413 * corresponding symbol
415 StringBuilder modal = new StringBuilder();
418 for (int i = 1; i < intCounts.length; i++)
420 if (intCounts[i] == count)
423 SS_SYMBOLS.charAt(i - 1));
429 for (int i = 1; i < counts.length; i++)
431 if (counts[i] == count)
434 SS_SYMBOLS.charAt(i - 1));
438 if (otherData != null)
440 for (int i = 0; i < otherData.size(); i++)
442 if (otherData.valueAt(i) == count)
444 modal.append((char) otherData.keyAt(i));
448 return modal.toString();
452 * Returns the highest count for any symbol(s) in the profile (excluding gap)
456 public int getModalCount()
462 * Returns the number of distinct symbols with a non-zero count (excluding the
472 for (int i = 1; i < intCounts.length; i++)
474 if (intCounts[i] > 0)
482 for (int i = 1; i < counts.length; i++)
492 * include 'other' characters recorded (even if count is zero
493 * though that would be a strange use case)
495 if (otherData != null)
497 size += otherData.size();
504 * Returns a data bean holding those symbols that have a non-zero count
505 * (excluding the gap symbol), with their counts.
509 public SymbolCounts getSymbolCounts()
512 char[] symbols = new char[size];
513 int[] values = new int[size];
518 for (int i = 1; i < intCounts.length; i++)
520 if (intCounts[i] > 0)
522 char symbol = SS_SYMBOLS.charAt(i - 1);
524 values[j] = intCounts[i];
531 for (int i = 1; i < counts.length; i++)
535 char symbol = SS_SYMBOLS.charAt(i - 1);
537 values[j] = counts[i];
542 if (otherData != null)
544 for (int i = 0; i < otherData.size(); i++)
546 symbols[j] = (char) otherData.keyAt(i);
547 values[j] = otherData.valueAt(i);
552 return new SymbolCounts(symbols, values);
556 * Returns a tooltip string showing residues in descending order of their
557 * percentage frequency in the profile
560 * the divisor for residue counts (may or may not include gapped
562 * @param percentageDecPl
563 * the number of decimal places to show in percentages
566 public String getTooltip(int normaliseBy, int percentageDecPl)
568 SymbolCounts symbolCounts = getSymbolCounts();
569 char[] ca = symbolCounts.symbols;
570 int[] vl = symbolCounts.values;
573 * sort characters into ascending order of their counts
575 QuickSort.sort(vl, ca);
578 * traverse in reverse order (highest count first) to build tooltip
580 boolean first = true;
581 StringBuilder sb = new StringBuilder(64);
582 for (int c = ca.length - 1; c >= 0; c--)
584 final char residue = ca[c];
585 // TODO combine residues which share a percentage
586 // (see AAFrequency.completeCdnaConsensus)
587 float tval = (vl[c] * 100f) / normaliseBy;
588 sb.append(first ? "" : "; ").append(residue).append(" ");
589 Format.appendPercentage(sb, tval, percentageDecPl);
593 return sb.toString();
597 * Returns a string representation of the symbol counts, for debug purposes.
600 public String toString()
602 StringBuilder sb = new StringBuilder();
604 SymbolCounts sc = getSymbolCounts();
605 for (int i = 0; i < sc.symbols.length; i++)
607 sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
610 return sb.toString();