import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.Annotation;
-import jalview.datamodel.HiddenMarkovModel;
import jalview.datamodel.Profile;
import jalview.datamodel.ProfileI;
import jalview.datamodel.Profiles;
import jalview.datamodel.ResidueCount.SymbolCounts;
import jalview.datamodel.SequenceI;
import jalview.ext.android.SparseIntArray;
-import jalview.schemes.ResidueProperties;
import jalview.util.Comparison;
import jalview.util.Format;
import jalview.util.MappingUtils;
* This class is used extensively in calculating alignment colourschemes that
* depend on the amount of conservation in each alignment column.
*
+ * @author $author$
+ * @version $Revision$
*/
public class AAFrequency
{
- private static final double LOG2 = Math.log(2);
-
public static final String PROFILE = "P";
+ /*
+ * Quick look-up of String value of char 'A' to 'Z'
+ */
+ private static final String[] CHARS = new String['Z' - 'A' + 1];
+
+ static
+ {
+ for (char c = 'A'; c <= 'Z'; c++)
+ {
+ CHARS[c - 'A'] = String.valueOf(c);
+ }
+ }
+
public static final ProfilesI calculate(List<SequenceI> list, int start,
int end)
{
}
/**
- * Returns the full set of profiles for a hidden Markov model. The underlying
- * data is the raw probabilities of a residue being emitted at each node,
- * however the profiles returned by this function contain the percentage
- * chance of a residue emission.
- *
- * @param hmm
- * @param width
- * The width of the Profile array (Profiles) to be returned.
- * @param start
- * The alignment column on which the first profile is based.
- * @param end
- * The alignment column on which the last profile is based.
- * @param removeBelowBackground
- * if true, symbols with a match emission probability less than
- * background frequency are ignored
- * @return
- */
- public static ProfilesI calculateHMMProfiles(final HiddenMarkovModel hmm,
- int width, int start, int end, boolean removeBelowBackground,
- boolean infoLetterHeight)
- {
- ProfileI[] result = new ProfileI[width];
- char[] symbols = hmm.getSymbols().toCharArray();
- int symbolCount = symbols.length;
- for (int column = start; column < end; column++)
- {
- ResidueCount counts = new ResidueCount();
- for (char symbol : symbols)
- {
- int value = getAnalogueCount(hmm, column, symbol,
- removeBelowBackground, infoLetterHeight);
- counts.put(symbol, value);
- }
- int maxCount = counts.getModalCount();
- String maxResidue = counts.getResiduesForCount(maxCount);
- int gapCount = counts.getGapCount();
- ProfileI profile = new Profile(symbolCount, gapCount, maxCount,
- maxResidue);
- profile.setCounts(counts);
-
- result[column] = profile;
- }
- return new Profiles(result);
- }
-
- /**
* Make an estimate of the profile size we are going to compute i.e. how many
* different characters may be present in it. Overestimating has a cost of
* using more memory than necessary. Underestimating has a cost of needing to
}
/**
- * Derive the information annotations to be added to the alignment for
- * display. This does not recompute the raw data, but may be called on a
- * change in display options, such as 'ignore below background frequency',
- * which may in turn result in a change in the derived values.
+ * Derive the gap count annotation row.
*
- * @param information
- * the annotation row to add annotations to
- * @param profiles
- * the source information data
- * @param startCol
- * start column (inclusive)
- * @param endCol
- * end column (exclusive)
- * @param ignoreGaps
- * if true, normalise residue percentages
- * @param showSequenceLogo
- * if true include all information symbols, else just show modal
- * residue
- */
- public static float completeInformation(AlignmentAnnotation information,
- ProfilesI profiles, int startCol, int endCol)
- {
- // long now = System.currentTimeMillis();
- if (information == null || information.annotations == null)
- {
- /*
- * called with a bad alignment annotation row
- * wait for it to be initialised properly
- */
- return 0;
- }
-
- float max = 0f;
- SequenceI hmmSeq = information.sequenceRef;
-
- int seqLength = hmmSeq.getLength();
- if (information.annotations.length < seqLength)
- {
- return 0;
- }
-
- HiddenMarkovModel hmm = hmmSeq.getHMM();
-
- for (int column = startCol; column < endCol; column++)
- {
- if (column >= seqLength)
- {
- // hmm consensus sequence is shorter than the alignment
- break;
- }
-
- float value = hmm.getInformationContent(column);
- boolean isNaN = Float.isNaN(value);
- if (!isNaN)
- {
- max = Math.max(max, value);
- }
-
- String description = isNaN ? null
- : String.format("%.4f bits", value);
- information.annotations[column] = new Annotation(
- Character.toString(
- Character.toUpperCase(hmmSeq.getCharAt(column))),
- description, ' ', value);
- }
-
- information.graphMax = max;
- return max;
- }
-
- /**
- * Derive the occupancy count annotation
- *
- * @param occupancy
+ * @param gaprow
* the annotation row to add annotations to
* @param profiles
* the source consensus data
* @param endCol
* end column (exclusive)
*/
- public static void completeOccupancyAnnot(AlignmentAnnotation occupancy,
+ public static void completeGapAnnot(AlignmentAnnotation gaprow,
ProfilesI profiles, int startCol, int endCol, long nseq)
{
- if (occupancy == null || occupancy.annotations == null
- || occupancy.annotations.length < endCol)
+ if (gaprow == null || gaprow.annotations == null
+ || gaprow.annotations.length < endCol)
{
/*
* called with a bad alignment annotation row
return;
}
// always set ranges again
- occupancy.graphMax = nseq;
- occupancy.graphMin = 0;
+ gaprow.graphMax = nseq;
+ gaprow.graphMin = 0;
double scale = 0.8 / nseq;
for (int i = startCol; i < endCol; i++)
{
* happens if sequences calculated over were
* shorter than alignment width
*/
- occupancy.annotations[i] = null;
+ gaprow.annotations[i] = null;
return;
}
String description = "" + gapped;
- occupancy.annotations[i] = new Annotation("", description, '\0',
- gapped,
+ gaprow.annotations[i] = new Annotation("", description, '\0', gapped,
jalview.util.ColorUtils.bleachColour(Color.DARK_GRAY,
(float) scale * gapped));
}
return result;
}
-
/**
* Extract a sorted extract of cDNA codon profile data. The returned array
* contains
for (int col = 0; col < cols; col++)
{
// todo would prefer a Java bean for consensus data
- Hashtable<String, int[]> columnHash = new Hashtable<>();
+ Hashtable<String, int[]> columnHash = new Hashtable<String, int[]>();
// #seqs, #ungapped seqs, counts indexed by (codon encoded + 1)
int[] codonCounts = new int[66];
codonCounts[0] = alignment.getSequences().size();
}
return scale;
}
-
- /**
- * Returns the sorted HMM profile for the given column of the alignment. The
- * returned array contains
- *
- * <pre>
- * [profileType=0, numberOfValues, 100, charValue1, percentage1, charValue2, percentage2, ...]
- * in descending order of percentage value
- * </pre>
- *
- * @param hmm
- * @param column
- * @param removeBelowBackground
- * if true, ignores residues with probability less than their
- * background frequency
- * @param infoHeight
- * if true, uses the log ratio 'information' measure to scale the
- * value
- * @return
- */
- public static int[] extractHMMProfile(HiddenMarkovModel hmm, int column,
- boolean removeBelowBackground, boolean infoHeight)
- {
- if (hmm == null)
- {
- return null;
- }
- String alphabet = hmm.getSymbols();
- int size = alphabet.length();
- char symbols[] = new char[size];
- int values[] = new int[size];
- int totalCount = 0;
-
- for (int i = 0; i < size; i++)
- {
- char symbol = alphabet.charAt(i);
- symbols[i] = symbol;
- int value = getAnalogueCount(hmm, column, symbol,
- removeBelowBackground, infoHeight);
- values[i] = value;
- totalCount += value;
- }
-
- /*
- * sort symbols by increasing emission probability
- */
- QuickSort.sort(values, symbols);
-
- int[] profile = new int[3 + size * 2];
-
- profile[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
- profile[1] = size;
- profile[2] = 100;
-
- /*
- * order symbol/count profile by decreasing emission probability
- */
- if (totalCount != 0)
- {
- int arrayPos = 3;
- for (int k = size - 1; k >= 0; k--)
- {
- Float percentage;
- int value = values[k];
- if (removeBelowBackground)
- {
- percentage = ((float) value) / totalCount * 100f;
- }
- else
- {
- percentage = value / 100f;
- }
- int intPercent = Math.round(percentage);
- profile[arrayPos] = symbols[k];
- profile[arrayPos + 1] = intPercent;
- arrayPos += 2;
- }
- }
- return profile;
- }
-
- /**
- * Converts the emission probability of a residue at a column in the alignment
- * to a 'count', suitable for rendering as an annotation value
- *
- * @param hmm
- * @param column
- * @param symbol
- * @param removeBelowBackground
- * if true, returns 0 for any symbol with a match emission
- * probability less than the background frequency
- * @infoHeight if true, uses the log ratio 'information content' to scale the
- * value
- * @return
- */
- static int getAnalogueCount(HiddenMarkovModel hmm, int column,
- char symbol, boolean removeBelowBackground, boolean infoHeight)
- {
- double value = hmm.getMatchEmissionProbability(column, symbol);
- double freq = ResidueProperties.backgroundFrequencies
- .get(hmm.getAlphabetType()).get(symbol);
- if (value < freq && removeBelowBackground)
- {
- return 0;
- }
-
- if (infoHeight)
- {
- value = value * (Math.log(value / freq) / LOG2);
- }
-
- value = value * 10000d;
- return Math.round((float) value);
- }
}