/*
* Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
* Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jalview. If not, see .
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.datamodel;
import jalview.util.Comparison;
import jalview.util.Format;
import jalview.util.QuickSort;
import jalview.util.SparseCount;
import java.util.List;
/**
* A class to count occurrences of residues in a profile, optimised for speed
* and memory footprint.
*
* @author gmcarstairs
*
*/
public class ResidueCount
{
/**
* A data bean to hold the results of counting symbols
*/
public class SymbolCounts
{
/**
* the symbols seen (as char values), in no particular order
*/
public final char[] symbols;
/**
* the counts for each symbol, in the same order as the symbols
*/
public final int[] values;
SymbolCounts(char[] s, int[] v)
{
symbols = s;
values = v;
}
}
private static final int TOUPPERCASE = 'A' - 'a';
/*
* nucleotide symbols to count (including N unknown)
*/
private static final String NUCS = "ACGNTU";
/*
* amino acid symbols to count (including X unknown)
* NB we also include U so as to support counting of RNA bases
* in the "don't know" case of nucleotide / peptide
*/
private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
static final int GAP_COUNT = 0;
/*
* fast lookup tables holding the index into our count
* arrays of each symbol; index 0 is reserved for gap counting
*/
private static int[] NUC_INDEX = new int[26];
private static int[] AA_INDEX = new int[26];
static
{
for (int i = 0; i < NUCS.length(); i++)
{
NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
}
for (int i = 0; i < AAS.length(); i++)
{
AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
}
}
/*
* counts array, just big enough for the nucleotide or peptide
* character set (plus gap counts in position 0)
*/
private short[] counts;
/*
* alternative array of int counts for use if any count
* exceeds the maximum value of short (32767)
*/
private int[] intCounts;
/*
* flag set if we switch from short to int counts
*/
private boolean useIntCounts;
/*
* general-purpose counter, only for use for characters
* that are not in the expected alphabet
*/
private SparseCount otherData;
/*
* keeps track of the maximum count value recorded
* (if this class ever allows decrements, would need to
* calculate this on request instead)
*/
int maxCount;
/*
* if we think we are counting nucleotide, can get by with smaller
* array to hold counts
*/
private boolean isNucleotide;
/**
* Default constructor allocates arrays able to count either nucleotide or
* peptide bases. Use this constructor if not sure which the data is.
*/
public ResidueCount()
{
this(false);
}
/**
* Constructor that allocates an array just big enough for the anticipated
* characters, plus one position to count gaps
*/
public ResidueCount(boolean nucleotide)
{
isNucleotide = nucleotide;
int charsToCount = nucleotide ? NUCS.length() : AAS.length();
counts = new short[charsToCount + 1];
}
/**
* A constructor that counts frequency of all symbols (including gaps) in the
* sequences (not case-sensitive)
*
* @param sequences
*/
public ResidueCount(List sequences)
{
this();
for (SequenceI seq : sequences)
{
for (int i = 0; i < seq.getLength(); i++)
{
add(seq.getCharAt(i));
}
}
}
/**
* Increments the count for the given character. The supplied character may be
* upper or lower case but counts are for the upper case only. Gap characters
* (space, ., -) are all counted together.
*
* @param c
* @return the new value of the count for the character
*/
public int add(final char c)
{
char u = toUpperCase(c);
int newValue = 0;
int offset = getOffset(u);
/*
* offset 0 is reserved for gap counting, so 0 here means either
* an unexpected character, or a gap character passed in error
*/
if (offset == 0)
{
if (Comparison.isGap(u))
{
newValue = addGap();
}
else
{
newValue = addOtherCharacter(u);
}
}
else
{
newValue = increment(offset);
}
return newValue;
}
/**
* Increment the count at the specified offset. If this would result in short
* overflow, promote to counting int values instead.
*
* @param offset
* @return the new value of the count at this offset
*/
int increment(int offset)
{
int newValue = 0;
if (useIntCounts)
{
newValue = intCounts[offset];
intCounts[offset] = ++newValue;
}
else
{
if (counts[offset] == Short.MAX_VALUE)
{
handleOverflow();
newValue = intCounts[offset];
intCounts[offset] = ++newValue;
}
else
{
newValue = counts[offset];
counts[offset] = (short) ++newValue;
}
}
if (offset != GAP_COUNT)
{
// update modal residue count
maxCount = Math.max(maxCount, newValue);
}
return newValue;
}
/**
* Switch from counting in short to counting in int
*/
synchronized void handleOverflow()
{
intCounts = new int[counts.length];
for (int i = 0; i < counts.length; i++)
{
intCounts[i] = counts[i];
}
counts = null;
useIntCounts = true;
}
/**
* Returns this character's offset in the count array
*
* @param c
* @return
*/
int getOffset(char c)
{
int offset = 0;
if ('A' <= c && c <= 'Z')
{
offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
}
return offset;
}
/**
* @param c
* @return
*/
protected char toUpperCase(final char c)
{
char u = c;
if ('a' <= c && c <= 'z')
{
u = (char) (c + TOUPPERCASE);
}
return u;
}
/**
* Increment count for some unanticipated character. The first time this
* called, a SparseCount is instantiated to hold these 'extra' counts.
*
* @param c
* @return the new value of the count for the character
*/
int addOtherCharacter(char c)
{
if (otherData == null)
{
otherData = new SparseCount();
}
int newValue = otherData.add(c, 1);
maxCount = Math.max(maxCount, newValue);
return newValue;
}
/**
* Set count for some unanticipated character. The first time this called, a
* SparseCount is instantiated to hold these 'extra' counts.
*
* @param c
* @param value
*/
void setOtherCharacter(char c, int value)
{
if (otherData == null)
{
otherData = new SparseCount();
}
otherData.put(c, value);
}
/**
* Increment count of gap characters
*
* @return the new count of gaps
*/
public int addGap()
{
int newValue = increment(GAP_COUNT);
return newValue;
}
/**
* Answers true if we are counting ints (only after overflow of short counts)
*
* @return
*/
boolean isCountingInts()
{
return useIntCounts;
}
/**
* Sets the count for the given character. The supplied character may be upper
* or lower case but counts are for the upper case only.
*
* @param c
* @param count
*/
public void put(char c, int count)
{
char u = toUpperCase(c);
int offset = getOffset(u);
/*
* offset 0 is reserved for gap counting, so 0 here means either
* an unexpected character, or a gap character passed in error
*/
if (offset == 0)
{
if (Comparison.isGap(u))
{
set(0, count);
}
else
{
setOtherCharacter(u, count);
maxCount = Math.max(maxCount, count);
}
}
else
{
set(offset, count);
maxCount = Math.max(maxCount, count);
}
}
/**
* Sets the count at the specified offset. If this would result in short
* overflow, promote to counting int values instead.
*
* @param offset
* @param value
*/
void set(int offset, int value)
{
if (useIntCounts)
{
intCounts[offset] = value;
}
else
{
if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
{
handleOverflow();
intCounts[offset] = value;
}
else
{
counts[offset] = (short) value;
}
}
}
/**
* Returns the count for the given character, or zero if no count held
*
* @param c
* @return
*/
public int getCount(char c)
{
char u = toUpperCase(c);
int offset = getOffset(u);
if (offset == 0)
{
if (!Comparison.isGap(u))
{
// should have called getGapCount()
return otherData == null ? 0 : otherData.get(u);
}
}
return useIntCounts ? intCounts[offset] : counts[offset];
}
public int getGapCount()
{
return useIntCounts ? intCounts[0] : counts[0];
}
/**
* Answers true if this object wraps a counter for unexpected characters
*
* @return
*/
boolean isUsingOtherData()
{
return otherData != null;
}
/**
* Returns the character (or concatenated characters) for the symbol(s) with
* the given count in the profile. Can be used to get the modal residue by
* supplying the modal count value. Returns an empty string if no symbol has
* the given count. The symbols are in alphabetic order of standard peptide or
* nucleotide characters, followed by 'other' symbols if any.
*
* @return
*/
public String getResiduesForCount(int count)
{
if (count == 0)
{
return "";
}
/*
* find counts for the given value and append the
* corresponding symbol
*/
StringBuilder modal = new StringBuilder();
if (useIntCounts)
{
for (int i = 1; i < intCounts.length; i++)
{
if (intCounts[i] == count)
{
modal.append(
isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
}
}
}
else
{
for (int i = 1; i < counts.length; i++)
{
if (counts[i] == count)
{
modal.append(
isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
}
}
}
if (otherData != null)
{
for (int i = 0; i < otherData.size(); i++)
{
if (otherData.valueAt(i) == count)
{
modal.append((char) otherData.keyAt(i));
}
}
}
return modal.toString();
}
/**
* Returns the highest count for any symbol(s) in the profile (excluding gap)
*
* @return
*/
public int getModalCount()
{
return maxCount;
}
/**
* Returns the number of distinct symbols with a non-zero count (excluding the
* gap symbol)
*
* @return
*/
public int size()
{
int size = 0;
if (useIntCounts)
{
for (int i = 1; i < intCounts.length; i++)
{
if (intCounts[i] > 0)
{
size++;
}
}
}
else
{
for (int i = 1; i < counts.length; i++)
{
if (counts[i] > 0)
{
size++;
}
}
}
/*
* include 'other' characters recorded (even if count is zero
* though that would be a strange use case)
*/
if (otherData != null)
{
size += otherData.size();
}
return size;
}
/**
* Returns a data bean holding those symbols that have a non-zero count
* (excluding the gap symbol), with their counts.
*
* @return
*/
public SymbolCounts getSymbolCounts()
{
int size = size();
char[] symbols = new char[size];
int[] values = new int[size];
int j = 0;
if (useIntCounts)
{
for (int i = 1; i < intCounts.length; i++)
{
if (intCounts[i] > 0)
{
char symbol = isNucleotide ? NUCS.charAt(i - 1)
: AAS.charAt(i - 1);
symbols[j] = symbol;
values[j] = intCounts[i];
j++;
}
}
}
else
{
for (int i = 1; i < counts.length; i++)
{
if (counts[i] > 0)
{
char symbol = isNucleotide ? NUCS.charAt(i - 1)
: AAS.charAt(i - 1);
symbols[j] = symbol;
values[j] = counts[i];
j++;
}
}
}
if (otherData != null)
{
for (int i = 0; i < otherData.size(); i++)
{
symbols[j] = (char) otherData.keyAt(i);
values[j] = otherData.valueAt(i);
j++;
}
}
return new SymbolCounts(symbols, values);
}
/**
* Returns a tooltip string showing residues in descending order of their
* percentage frequency in the profile
*
* @param normaliseBy
* the divisor for residue counts (may or may not include gapped
* sequence count)
* @param percentageDecPl
* the number of decimal places to show in percentages
* @return
*/
public String getTooltip(int normaliseBy, int percentageDecPl)
{
SymbolCounts symbolCounts = getSymbolCounts();
char[] ca = symbolCounts.symbols;
int[] vl = symbolCounts.values;
/*
* sort characters into ascending order of their counts
*/
QuickSort.sort(vl, ca);
/*
* traverse in reverse order (highest count first) to build tooltip
*/
boolean first = true;
StringBuilder sb = new StringBuilder(64);
for (int c = ca.length - 1; c >= 0; c--)
{
final char residue = ca[c];
// TODO combine residues which share a percentage
// (see AAFrequency.completeCdnaConsensus)
float tval = (vl[c] * 100f) / normaliseBy;
sb.append(first ? "" : "; ").append(residue).append(" ");
Format.appendPercentage(sb, tval, percentageDecPl);
sb.append("%");
first = false;
}
return sb.toString();
}
/**
* Returns a string representation of the symbol counts, for debug purposes.
*/
@Override
public String toString()
{
StringBuilder sb = new StringBuilder();
sb.append("[ ");
SymbolCounts sc = getSymbolCounts();
for (int i = 0; i < sc.symbols.length; i++)
{
sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
}
sb.append("]");
return sb.toString();
}
/**
* Answers the total count for all symbols (excluding gaps)
*
* @return
*/
public int getTotalResidueCount()
{
int total = 0;
for (char symbol : this.getSymbolCounts().symbols)
{
total += getCount(symbol);
}
return total;
}
}