import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.Annotation;
+import jalview.datamodel.ResidueCount;
+import jalview.datamodel.ResidueCount.SymbolCounts;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
-import jalview.ext.android.SparseIntArray;
import jalview.schemes.ResidueProperties;
+import jalview.util.Comparison;
import java.awt.Color;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
+import java.util.SortedMap;
import java.util.TreeMap;
import java.util.Vector;
/**
* Calculates conservation values for a given set of sequences
- *
- * @author $author$
- * @version $Revision$
*/
public class Conservation
{
+ /*
+ * need to have a minimum of 3% of sequences with a residue
+ * for it to be included in the conservation calculation
+ */
+ private static final int THRESHOLD_PERCENT = 3;
+
+ private static final int TOUPPERCASE = 'a' - 'A';
+
SequenceI[] sequences;
int start;
boolean seqNumsChanged = false; // updated after any change via calcSeqNum;
+ /*
+ * a map per column with {property, conservation} where conservation value is
+ * 1 (property is conserved), 0 (absence of property is conserved) or -1
+ * (property is not conserved i.e. column has residues with and without it)
+ */
Map<String, Integer>[] total;
boolean canonicaliseAa = true; // if true then conservation calculation will
private Sequence consSequence;
+ /*
+ * percentage of residues in a column to qualify for counting conservation
+ */
private int threshold;
private String name = "";
private String[] consSymbs;
/**
- * Creates a new Conservation object.
+ * Constructor using default threshold of 3%
*
* @param name
* Name of conservation
- * @param threshold
- * to count the residues in residueHash(). commonly used value is 3
* @param sequences
* sequences to be used in calculation
* @param start
* @param end
* end residue position
*/
+ public Conservation(String name, List<SequenceI> sequences, int start,
+ int end)
+ {
+ this(name, THRESHOLD_PERCENT, sequences, start, end);
+ }
+
+ /**
+ * Constructor
+ *
+ * @param name
+ * Name of conservation
+ * @param threshold
+ * percentage of sequences at or below which property conservation is
+ * ignored
+ * @param sequences
+ * sequences to be used in calculation
+ * @param start
+ * start column position
+ * @param end
+ * end column position
+ */
public Conservation(String name, int threshold,
List<SequenceI> sequences, int start, int end)
{
*/
public void calculate()
{
- int jSize = sequences.length;
- // int[] values; // Replaces residueHash
- SparseIntArray values = new SparseIntArray();
+ int height = sequences.length;
total = new Map[maxLength];
- for (int i = start; i <= end; i++)
+ for (int column = start; column <= end; column++)
{
- // values = new int[255];
- values.clear();
+ ResidueCount values = countResidues(column);
+
+ /*
+ * percentage count at or below which we ignore residues
+ */
+ int thresh = (threshold * height) / 100;
- for (int j = 0; j < jSize; j++)
+ /*
+ * check observed residues in column and record whether each
+ * physico-chemical property is conserved (+1), absence conserved (0),
+ * or not conserved (-1)
+ * Using TreeMap means properties are displayed in alphabetical order
+ */
+ SortedMap<String, Integer> resultHash = new TreeMap<String, Integer>();
+ SymbolCounts symbolCounts = values.getSymbolCounts();
+ char[] symbols = symbolCounts.symbols;
+ int[] counts = symbolCounts.values;
+ for (int j = 0; j < symbols.length; j++)
{
- if (sequences[j].getLength() > i)
+ char c = symbols[j];
+ if (counts[j] > thresh)
{
- char c = sequences[j].getCharAt(i);
-
- if (canonicaliseAa)
- { // lookup the base aa code symbol
- c = (char) ResidueProperties.aaIndex[sequences[j].getCharAt(i)];
- if (c > 20)
- {
- c = '-';
- }
- else
- {
- // recover canonical aa symbol
- c = ResidueProperties.aa[c].charAt(0);
- }
- }
- else
- {
- // original behaviour - operate on ascii symbols directly
- // No need to check if its a '-'
- if (c == '.' || c == ' ')
- {
- c = '-';
- }
-
- c = toUpperCase(c);
- }
- // values[c]++;
- values.add(c, 1);
+ recordConservation(resultHash, String.valueOf(c));
+ }
+ }
+ if (values.getGapCount() > thresh)
+ {
+ recordConservation(resultHash, "-");
+ }
+
+ if (total.length > 0)
+ {
+ total[column - start] = resultHash;
+ }
+ }
+ }
+
+ /**
+ * Updates the conservation results for an observed residue
+ *
+ * @param resultMap
+ * a map of {property, conservation} where conservation value is +1
+ * (all residues have the property), 0 (no residue has the property)
+ * or -1 (some do, some don't)
+ * @param res
+ */
+ protected static void recordConservation(Map<String, Integer> resultMap,
+ String res)
+ {
+ res = res.toUpperCase();
+ for (Entry<String, Map<String, Integer>> property : ResidueProperties.propHash
+ .entrySet())
+ {
+ String propertyName = property.getKey();
+ Integer residuePropertyValue = property.getValue().get(res);
+
+ if (!resultMap.containsKey(propertyName))
+ {
+ /*
+ * first time we've seen this residue - note whether it has this property
+ */
+ if (residuePropertyValue != null)
+ {
+ resultMap.put(propertyName, residuePropertyValue);
}
else
{
- // values['-']++;
- values.add('-', 1);
+ /*
+ * unrecognised residue - use default value for property
+ */
+ resultMap.put(propertyName, property.getValue().get("-"));
}
}
+ else
+ {
+ Integer currentResult = resultMap.get(propertyName);
+ if (currentResult.intValue() != -1
+ && !currentResult.equals(residuePropertyValue))
+ {
+ /*
+ * property is unconserved - residues seen both with and without it
+ */
+ resultMap.put(propertyName, Integer.valueOf(-1));
+ }
+ }
+ }
+ }
- // What is the count threshold to count the residues in residueHash()
- int thresh = (threshold * jSize) / 100;
+ /**
+ * Counts residues (upper-cased) and gaps in the given column
+ *
+ * @param column
+ * @return
+ */
+ protected ResidueCount countResidues(int column)
+ {
+ ResidueCount values = new ResidueCount(false);
- // loop over all the found residues
- // Hashtable<String, Integer> resultHash = new Hashtable<String,
- // Integer>();
- Map<String, Integer> resultHash = new TreeMap<String, Integer>();
- // for (char v = '-'; v < 'Z'; v++)
- for (int key = 0; key < values.size(); key++)
+ for (int row = 0; row < sequences.length; row++)
+ {
+ if (sequences[row].getLength() > column)
{
- char v = (char) values.keyAt(key);
- // if (values[v] > thresh)
- if (values.valueAt(key) > thresh)
+ char c = sequences[row].getCharAt(column);
+ if (canonicaliseAa)
{
- String res = String.valueOf(v);
-
- // Now loop over the properties
- for (String type : ResidueProperties.propHash.keySet())
- {
- Map<String, Integer> ht = ResidueProperties.propHash.get(type);
-
- // Have we ticked this before?
- if (!resultHash.containsKey(type))
- {
- if (ht.containsKey(res))
- {
- resultHash.put(type, ht.get(res));
- }
- else
- {
- resultHash.put(type, ht.get("-"));
- }
- }
- else if (!resultHash.get(type).equals(ht.get(res)))
- {
- resultHash.put(type, new Integer(-1));
- }
- }
+ int index = ResidueProperties.aaIndex[c];
+ c = index > 20 ? '-' : ResidueProperties.aa[index].charAt(0);
+ }
+ else
+ {
+ c = toUpperCase(c);
+ }
+ if (Comparison.isGap(c))
+ {
+ values.addGap();
+ }
+ else
+ {
+ values.add(c);
}
}
-
- if (total.length > 0)
+ else
{
- total[i - start] = resultHash;
+ values.addGap();
}
}
+ return values;
}
- /*****************************************************************************
- * count conservation for the j'th column of the alignment
+ /**
+ * Counts conservation and gaps for a column of the alignment
*
- * @return { gap count, conserved residue count}
+ * @return { 1 if fully conserved, else 0, gap count }
*/
- public int[] countConsNGaps(int j)
+ public int[] countConservationAndGaps(int column)
{
- int count = 0;
- int cons = 0;
- int nres = 0;
- int[] r = new int[2];
- char f = '$';
- int i, iSize = sequences.length;
- char c;
+ int gapCount = 0;
+ boolean fullyConserved = true;
+ int iSize = sequences.length;
- for (i = 0; i < iSize; i++)
+ if (iSize == 0)
+ {
+ return new int[] { 0, 0 };
+ }
+
+ char lastRes = '0';
+ for (int i = 0; i < iSize; i++)
{
- if (j >= sequences[i].getLength())
+ if (column >= sequences[i].getLength())
{
- count++;
+ gapCount++;
continue;
}
- c = sequences[i].getCharAt(j); // gaps do not have upper/lower case
+ char c = sequences[i].getCharAt(column); // gaps do not have upper/lower case
- if (jalview.util.Comparison.isGap((c)))
+ if (Comparison.isGap((c)))
{
- count++;
+ gapCount++;
}
else
{
c = toUpperCase(c);
- nres++;
-
- if (nres == 1)
+ if (lastRes == '0')
{
- f = c;
- cons++;
+ lastRes = c;
}
- else if (f == c)
+ if (c != lastRes)
{
- cons++;
+ fullyConserved = false;
}
}
}
- r[0] = (nres == cons) ? 1 : 0;
- r[1] = count;
-
+ int[] r = new int[] { fullyConserved ? 1 : 0, gapCount };
return r;
}
{
if ('a' <= c && c <= 'z')
{
- c -= (32); // 32 = 'a' - 'A'
+ c -= TOUPPERCASE;
}
return c;
}
/**
* Calculates the conservation sequence
*
- * @param consflag
- * if true, positive conservation; false calculates negative
- * conservation
- * @param percentageGaps
- * commonly used value is 25
+ * @param positiveOnly
+ * if true, calculate positive conservation; else calculate both
+ * positive and negative conservation
+ * @param maxPercentageGaps
+ * the percentage of gaps in a column, at or above which no
+ * conservation is asserted
*/
- public void verdict(boolean consflag, float percentageGaps)
+ public void verdict(boolean positiveOnly, float maxPercentageGaps)
{
+ // TODO call this at the end of calculate(), should not be a public method
+
StringBuilder consString = new StringBuilder(end);
// NOTE THIS SHOULD CHECK IF THE CONSEQUENCE ALREADY
consSymbs = new String[end - start + 1];
for (int i = start; i <= end; i++)
{
- int[] gapcons = countConsNGaps(i);
+ int[] gapcons = countConservationAndGaps(i);
+ boolean fullyConserved = gapcons[0] == 1;
int totGaps = gapcons[1];
- float pgaps = ((float) totGaps * 100) / sequences.length;
- StringBuilder positives = new StringBuilder(64);
- StringBuilder negatives = new StringBuilder(32);
- // consSymbs[i - start] = "";
+ float pgaps = (totGaps * 100f) / sequences.length;
- if (percentageGaps > pgaps)
+ if (maxPercentageGaps > pgaps)
{
Map<String, Integer> resultHash = total[i - start];
- // Now find the verdict
int count = 0;
+ StringBuilder positives = new StringBuilder(64);
+ StringBuilder negatives = new StringBuilder(32);
for (String type : resultHash.keySet())
{
int result = resultHash.get(type).intValue();
- // Do we want to count +ve conservation or +ve and -ve cons.?
- if (consflag)
+ if (result == -1)
{
- if (result == 1)
- {
- // consSymbs[i - start] = type + " " + consSymbs[i - start];
- positives.append(positives.length() == 0 ? "" : " ");
- positives.append(type);
- count++;
- }
+ /*
+ * not conserved (present or absent)
+ */
+ continue;
}
- else
+ count++;
+ if (result == 1)
{
- if (result != -1)
- {
- if (result == 0)
- {
- /*
- * add negatively conserved properties on the end
- */
- // consSymbs[i - start] = consSymbs[i - start] + " !" + type;
- negatives.append(negatives.length() == 0 ? "" : " ");
- negatives.append("!").append(type);
- }
- else
- {
- /*
- * put positively conserved properties on the front
- */
- // consSymbs[i - start] = type + " " + consSymbs[i - start];
- positives.append(positives.length() == 0 ? "" : " ");
- positives.append(type);
- }
- count++;
- }
+ /*
+ * positively conserved property (all residues have it)
+ */
+ positives.append(positives.length() == 0 ? "" : " ");
+ positives.append(type);
+ }
+ if (result == 0 && !positiveOnly)
+ {
+ /*
+ * absense of property is conserved (all residues lack it)
+ */
+ negatives.append(negatives.length() == 0 ? "" : " ");
+ negatives.append("!").append(type);
}
}
if (negatives.length() > 0)
}
else
{
- consString.append((gapcons[0] == 1) ? "*" : "+");
+ consString.append(fullyConserved ? "*" : "+");
}
}
else
*
* @param name
* - name of conservation
- * @param threshold
- * - minimum number of conserved residues needed to indicate
- * conservation (typically 3)
* @param seqs
* @param start
* first column in calculation window
* @param end
* last column in calculation window
- * @param posOrNeg
- * positive (true) or negative (false) conservation
- * @param consPercGaps
+ * @param positiveOnly
+ * calculate positive (true) or positive and negative (false)
+ * conservation
+ * @param maxPercentGaps
* percentage of gaps tolerated in column
* @param calcQuality
* flag indicating if alignment quality should be calculated
* @return Conservation object ready for use in visualization
*/
public static Conservation calculateConservation(String name,
- int threshold, List<SequenceI> seqs, int start, int end,
- boolean posOrNeg, int consPercGaps, boolean calcQuality)
+ List<SequenceI> seqs, int start, int end, boolean positiveOnly,
+ int maxPercentGaps, boolean calcQuality)
{
- Conservation cons = new Conservation(name, threshold, seqs, start, end);
+ Conservation cons = new Conservation(name, seqs, start, end);
cons.calculate();
- cons.verdict(posOrNeg, consPercGaps);
+ cons.verdict(positiveOnly, maxPercentGaps);
if (calcQuality)
{
return cons;
}
+
+ /**
+ * Returns the computed tooltip (annotation description) for a given column.
+ * The tip is empty if the conservation score is zero, otherwise holds the
+ * conserved properties (and, optionally, properties whose absence is
+ * conserved).
+ *
+ * @param column
+ * @return
+ */
+ String getTooltip(int column)
+ {
+ char[] sequence = getConsSequence().getSequence();
+ char val = column < sequence.length ? sequence[column] : '-';
+ boolean hasConservation = val != '-' && val != '0';
+ int consp = column - start;
+ String tip = (hasConservation && consp > -1 && consp < consSymbs.length) ? consSymbs[consp]
+ : "";
+ return tip;
+ }
}