/* * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.analysis; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.Annotation; import jalview.datamodel.HiddenMarkovModel; import jalview.datamodel.Profile; import jalview.datamodel.ProfileI; import jalview.datamodel.Profiles; import jalview.datamodel.ProfilesI; import jalview.datamodel.ResidueCount; import jalview.datamodel.ResidueCount.SymbolCounts; import jalview.datamodel.SequenceI; import jalview.ext.android.SparseIntArray; import jalview.schemes.ResidueProperties; import jalview.util.Comparison; import jalview.util.Format; import jalview.util.MappingUtils; import jalview.util.QuickSort; import java.awt.Color; import java.util.Arrays; import java.util.Hashtable; import java.util.List; /** * Takes in a vector or array of sequences and column start and column end and * returns a new Hashtable[] of size maxSeqLength, if Hashtable not supplied. * This class is used extensively in calculating alignment colourschemes that * depend on the amount of conservation in each alignment column. * */ public class AAFrequency { private static final double LOG2 = Math.log(2); public static final String PROFILE = "P"; public static final ProfilesI calculate(List list, int start, int end) { return calculate(list, start, end, false); } public static final ProfilesI calculate(List sequences, int start, int end, boolean profile) { SequenceI[] seqs = new SequenceI[sequences.size()]; int width = 0; synchronized (sequences) { for (int i = 0; i < sequences.size(); i++) { seqs[i] = sequences.get(i); int length = seqs[i].getLength(); if (length > width) { width = length; } } if (end >= width) { end = width; } ProfilesI reply = calculate(seqs, width, start, end, profile); return reply; } } /** * Calculate the consensus symbol(s) for each column in the given range. * * @param sequences * @param width * the full width of the alignment * @param start * start column (inclusive, base zero) * @param end * end column (exclusive) * @param saveFullProfile * if true, store all symbol counts */ public static final ProfilesI calculate(final SequenceI[] sequences, int width, int start, int end, boolean saveFullProfile) { // long now = System.currentTimeMillis(); int seqCount = sequences.length; boolean nucleotide = false; int nucleotideCount = 0; int peptideCount = 0; ProfileI[] result = new ProfileI[width]; for (int column = start; column < end; column++) { /* * Apply a heuristic to detect nucleotide data (which can * be counted in more compact arrays); here we test for * more than 90% nucleotide; recheck every 10 columns in case * of misleading data e.g. highly conserved Alanine in peptide! * Mistakenly guessing nucleotide has a small performance cost, * as it will result in counting in sparse arrays. * Mistakenly guessing peptide has a small space cost, * as it will use a larger than necessary array to hold counts. */ if (nucleotideCount > 100 && column % 10 == 0) { nucleotide = (9 * peptideCount < nucleotideCount); } ResidueCount residueCounts = new ResidueCount(nucleotide); for (int row = 0; row < seqCount; row++) { if (sequences[row] == null) { System.err.println( "WARNING: Consensus skipping null sequence - possible race condition."); continue; } if (sequences[row].getLength() > column) { char c = sequences[row].getCharAt(column); residueCounts.add(c); if (Comparison.isNucleotide(c)) { nucleotideCount++; } else if (!Comparison.isGap(c)) { peptideCount++; } } else { /* * count a gap if the sequence doesn't reach this column */ residueCounts.addGap(); } } int maxCount = residueCounts.getModalCount(); String maxResidue = residueCounts.getResiduesForCount(maxCount); int gapCount = residueCounts.getGapCount(); ProfileI profile = new Profile(seqCount, gapCount, maxCount, maxResidue); if (saveFullProfile) { profile.setCounts(residueCounts); } result[column] = profile; } return new Profiles(result); // long elapsed = System.currentTimeMillis() - now; // System.out.println(elapsed); } /** * Returns the full set of profiles for a hidden Markov model. The underlying * data is the raw probabilities of a residue being emitted at each node, * however the profiles returned by this function contain the percentage * chance of a residue emission. * * @param hmm * @param width * The width of the Profile array (Profiles) to be returned. * @param start * The alignment column on which the first profile is based. * @param end * The alignment column on which the last profile is based. * @param removeBelowBackground * if true, symbols with a match emission probability less than * background frequency are ignored * @return */ public static ProfilesI calculateHMMProfiles(final HiddenMarkovModel hmm, int width, int start, int end, boolean removeBelowBackground, boolean infoLetterHeight) { ProfileI[] result = new ProfileI[width]; char[] symbols = hmm.getSymbols().toCharArray(); int symbolCount = symbols.length; for (int column = start; column < end; column++) { ResidueCount counts = new ResidueCount(); for (char symbol : symbols) { int value = getAnalogueCount(hmm, column, symbol, removeBelowBackground, infoLetterHeight); counts.put(symbol, value); } int maxCount = counts.getModalCount(); String maxResidue = counts.getResiduesForCount(maxCount); int gapCount = counts.getGapCount(); ProfileI profile = new Profile(symbolCount, gapCount, maxCount, maxResidue); profile.setCounts(counts); result[column] = profile; } return new Profiles(result); } /** * Make an estimate of the profile size we are going to compute i.e. how many * different characters may be present in it. Overestimating has a cost of * using more memory than necessary. Underestimating has a cost of needing to * extend the SparseIntArray holding the profile counts. * * @param profileSizes * counts of sizes of profiles so far encountered * @return */ static int estimateProfileSize(SparseIntArray profileSizes) { if (profileSizes.size() == 0) { return 4; } /* * could do a statistical heuristic here e.g. 75%ile * for now just return the largest value */ return profileSizes.keyAt(profileSizes.size() - 1); } /** * Derive the consensus annotations to be added to the alignment for display. * This does not recompute the raw data, but may be called on a change in * display options, such as 'ignore gaps', which may in turn result in a * change in the derived values. * * @param consensus * the annotation row to add annotations to * @param profiles * the source consensus data * @param startCol * start column (inclusive) * @param endCol * end column (exclusive) * @param ignoreGaps * if true, normalise residue percentages ignoring gaps * @param showSequenceLogo * if true include all consensus symbols, else just show modal * residue * @param nseq * number of sequences */ public static void completeConsensus(AlignmentAnnotation consensus, ProfilesI profiles, int startCol, int endCol, boolean ignoreGaps, boolean showSequenceLogo, long nseq) { // long now = System.currentTimeMillis(); if (consensus == null || consensus.annotations == null || consensus.annotations.length < endCol) { /* * called with a bad alignment annotation row * wait for it to be initialised properly */ return; } for (int i = startCol; i < endCol; i++) { ProfileI profile = profiles.get(i); if (profile == null) { /* * happens if sequences calculated over were * shorter than alignment width */ consensus.annotations[i] = null; return; } final int dp = getPercentageDp(nseq); float value = profile.getPercentageIdentity(ignoreGaps); String description = getTooltip(profile, value, showSequenceLogo, ignoreGaps, dp); String modalResidue = profile.getModalResidue(); if ("".equals(modalResidue)) { modalResidue = "-"; } else if (modalResidue.length() > 1) { modalResidue = "+"; } consensus.annotations[i] = new Annotation(modalResidue, description, ' ', value); } // long elapsed = System.currentTimeMillis() - now; // System.out.println(-elapsed); } /** * Derive the information annotations to be added to the alignment for * display. This does not recompute the raw data, but may be called on a * change in display options, such as 'ignore below background frequency', * which may in turn result in a change in the derived values. * * @param information * the annotation row to add annotations to * @param profiles * the source information data * @param startCol * start column (inclusive) * @param endCol * end column (exclusive) * @param ignoreGaps * if true, normalise residue percentages * @param showSequenceLogo * if true include all information symbols, else just show modal * residue */ public static float completeInformation(AlignmentAnnotation information, ProfilesI profiles, int startCol, int endCol) { // long now = System.currentTimeMillis(); if (information == null || information.annotations == null) { /* * called with a bad alignment annotation row * wait for it to be initialised properly */ return 0; } float max = 0f; SequenceI hmmSeq = information.sequenceRef; int seqLength = hmmSeq.getLength(); if (information.annotations.length < seqLength) { return 0; } HiddenMarkovModel hmm = hmmSeq.getHMM(); for (int column = startCol; column < endCol; column++) { if (column >= seqLength) { // hmm consensus sequence is shorter than the alignment break; } float value = hmm.getInformationContent(column); boolean isNaN = Float.isNaN(value); if (!isNaN) { max = Math.max(max, value); } String description = isNaN ? null : String.format("%.4f bits", value); information.annotations[column] = new Annotation( Character.toString( Character.toUpperCase(hmmSeq.getCharAt(column))), description, ' ', value); } information.graphMax = max; return max; } /** * Derive the occupancy count annotation * * @param occupancy * the annotation row to add annotations to * @param profiles * the source consensus data * @param startCol * start column (inclusive) * @param endCol * end column (exclusive) */ public static void completeOccupancyAnnot(AlignmentAnnotation occupancy, ProfilesI profiles, int startCol, int endCol, long nseq) { if (occupancy == null || occupancy.annotations == null || occupancy.annotations.length < endCol) { /* * called with a bad alignment annotation row * wait for it to be initialised properly */ return; } // always set ranges again occupancy.graphMax = nseq; occupancy.graphMin = 0; double scale = 0.8 / nseq; for (int i = startCol; i < endCol; i++) { ProfileI profile = profiles.get(i); if (profile == null) { /* * happens if sequences calculated over were * shorter than alignment width */ occupancy.annotations[i] = null; return; } final int gapped = profile.getNonGapped(); String description = "" + gapped; occupancy.annotations[i] = new Annotation("", description, '\0', gapped, jalview.util.ColorUtils.bleachColour(Color.DARK_GRAY, (float) scale * gapped)); } } /** * Returns a tooltip showing either *

the full profile (percentages of all residues present), if * showSequenceLogo is true, or
just the modal (most common) residue(s), if showSequenceLogo is * false

* Percentages are as a fraction of all sequence, or only ungapped sequences * if ignoreGaps is true. * * @param profile * @param pid * @param showSequenceLogo * @param ignoreGaps * @param dp * the number of decimal places to format percentages to * @return */ static String getTooltip(ProfileI profile, float pid, boolean showSequenceLogo, boolean ignoreGaps, int dp) { ResidueCount counts = profile.getCounts(); String description = null; if (counts != null && showSequenceLogo) { int normaliseBy = ignoreGaps ? profile.getNonGapped() : profile.getHeight(); description = counts.getTooltip(normaliseBy, dp); } else { StringBuilder sb = new StringBuilder(64); String maxRes = profile.getModalResidue(); if (maxRes.length() > 1) { sb.append("[").append(maxRes).append("]"); } else { sb.append(maxRes); } if (maxRes.length() > 0) { sb.append(" "); Format.appendPercentage(sb, pid, dp); sb.append("%"); } description = sb.toString(); } return description; } /** * Returns the sorted profile for the given consensus data. The returned array * contains * *

   *    [profileType, numberOfValues, nonGapCount, charValue1, percentage1, charValue2, percentage2, ...]
   * in descending order of percentage value
   *

* * @param profile * the data object from which to extract and sort values * @param ignoreGaps * if true, only non-gapped values are included in percentage * calculations * @return */ public static int[] extractProfile(ProfileI profile, boolean ignoreGaps) { int[] rtnval = new int[64]; ResidueCount counts = profile.getCounts(); if (counts == null) { return null; } SymbolCounts symbolCounts = counts.getSymbolCounts(); char[] symbols = symbolCounts.symbols; int[] values = symbolCounts.values; QuickSort.sort(values, symbols); int nextArrayPos = 2; int totalPercentage = 0; final int divisor = ignoreGaps ? profile.getNonGapped() : profile.getHeight(); /* * traverse the arrays in reverse order (highest counts first) */ for (int i = symbols.length - 1; i >= 0; i--) { int theChar = symbols[i]; int charCount = values[i]; rtnval[nextArrayPos++] = theChar; final int percentage = (charCount * 100) / divisor; rtnval[nextArrayPos++] = percentage; totalPercentage += percentage; } rtnval[0] = symbols.length; rtnval[1] = totalPercentage; int[] result = new int[rtnval.length + 1]; result[0] = AlignmentAnnotation.SEQUENCE_PROFILE; System.arraycopy(rtnval, 0, result, 1, rtnval.length); return result; } /** * Extract a sorted extract of cDNA codon profile data. The returned array * contains * *

   *    [profileType, numberOfValues, totalCount, charValue1, percentage1, charValue2, percentage2, ...]
   * in descending order of percentage value, where the character values encode codon triplets
   *

* * @param hashtable * @return */ public static int[] extractCdnaProfile(Hashtable hashtable, boolean ignoreGaps) { // this holds #seqs, #ungapped, and then codon count, indexed by encoded // codon triplet int[] codonCounts = (int[]) hashtable.get(PROFILE); int[] sortedCounts = new int[codonCounts.length - 2]; System.arraycopy(codonCounts, 2, sortedCounts, 0, codonCounts.length - 2); int[] result = new int[3 + 2 * sortedCounts.length]; // first value is just the type of profile data result[0] = AlignmentAnnotation.CDNA_PROFILE; char[] codons = new char[sortedCounts.length]; for (int i = 0; i < codons.length; i++) { codons[i] = (char) i; } QuickSort.sort(sortedCounts, codons); int totalPercentage = 0; int distinctValuesCount = 0; int j = 3; int divisor = ignoreGaps ? codonCounts[1] : codonCounts[0]; for (int i = codons.length - 1; i >= 0; i--) { final int codonCount = sortedCounts[i]; if (codonCount == 0) { break; // nothing else of interest here } distinctValuesCount++; result[j++] = codons[i]; final int percentage = codonCount * 100 / divisor; result[j++] = percentage; totalPercentage += percentage; } result[2] = totalPercentage; /* * Just return the non-zero values */ // todo next value is redundant if we limit the array to non-zero counts result[1] = distinctValuesCount; return Arrays.copyOfRange(result, 0, j); } /** * Compute a consensus for the cDNA coding for a protein alignment. * * @param alignment * the protein alignment (which should hold mappings to cDNA * sequences) * @param hconsensus * the consensus data stores to be populated (one per column) */ public static void calculateCdna(AlignmentI alignment, Hashtable[] hconsensus) { final char gapCharacter = alignment.getGapCharacter(); List mappings = alignment.getCodonFrames(); if (mappings == null || mappings.isEmpty()) { return; } int cols = alignment.getWidth(); for (int col = 0; col < cols; col++) { // todo would prefer a Java bean for consensus data Hashtable columnHash = new Hashtable<>(); // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1) int[] codonCounts = new int[66]; codonCounts[0] = alignment.getSequences().size(); int ungappedCount = 0; for (SequenceI seq : alignment.getSequences()) { if (seq.getCharAt(col) == gapCharacter) { continue; } List codons = MappingUtils.findCodonsFor(seq, col, mappings); for (char[] codon : codons) { int codonEncoded = CodingUtils.encodeCodon(codon); if (codonEncoded >= 0) { codonCounts[codonEncoded + 2]++; ungappedCount++; } } } codonCounts[1] = ungappedCount; // todo: sort values here, save counts and codons? columnHash.put(PROFILE, codonCounts); hconsensus[col] = columnHash; } } /** * Derive displayable cDNA consensus annotation from computed consensus data. * * @param consensusAnnotation * the annotation row to be populated for display * @param consensusData * the computed consensus data * @param showProfileLogo * if true show all symbols present at each position, else only the * modal value * @param nseqs * the number of sequences in the alignment */ public static void completeCdnaConsensus( AlignmentAnnotation consensusAnnotation, Hashtable[] consensusData, boolean showProfileLogo, int nseqs) { if (consensusAnnotation == null || consensusAnnotation.annotations == null || consensusAnnotation.annotations.length < consensusData.length) { // called with a bad alignment annotation row - wait for it to be // initialised properly return; } // ensure codon triplet scales with font size consensusAnnotation.scaleColLabel = true; for (int col = 0; col < consensusData.length; col++) { Hashtable hci = consensusData[col]; if (hci == null) { // gapped protein column? continue; } // array holds #seqs, #ungapped, then codon counts indexed by codon final int[] codonCounts = (int[]) hci.get(PROFILE); int totalCount = 0; /* * First pass - get total count and find the highest */ final char[] codons = new char[codonCounts.length - 2]; for (int j = 2; j < codonCounts.length; j++) { final int codonCount = codonCounts[j]; codons[j - 2] = (char) (j - 2); totalCount += codonCount; } /* * Sort array of encoded codons by count ascending - so the modal value * goes to the end; start by copying the count (dropping the first value) */ int[] sortedCodonCounts = new int[codonCounts.length - 2]; System.arraycopy(codonCounts, 2, sortedCodonCounts, 0, codonCounts.length - 2); QuickSort.sort(sortedCodonCounts, codons); int modalCodonEncoded = codons[codons.length - 1]; int modalCodonCount = sortedCodonCounts[codons.length - 1]; String modalCodon = String .valueOf(CodingUtils.decodeCodon(modalCodonEncoded)); if (sortedCodonCounts.length > 1 && sortedCodonCounts[codons.length - 2] == sortedCodonCounts[codons.length - 1]) { /* * two or more codons share the modal count */ modalCodon = "+"; } float pid = sortedCodonCounts[sortedCodonCounts.length - 1] * 100 / (float) totalCount; /* * todo ? Replace consensus hashtable with sorted arrays of codons and * counts (non-zero only). Include total count in count array [0]. */ /* * Scan sorted array backwards for most frequent values first. Show * repeated values compactly. */ StringBuilder mouseOver = new StringBuilder(32); StringBuilder samePercent = new StringBuilder(); String percent = null; String lastPercent = null; int percentDecPl = getPercentageDp(nseqs); for (int j = codons.length - 1; j >= 0; j--) { int codonCount = sortedCodonCounts[j]; if (codonCount == 0) { /* * remaining codons are 0% - ignore, but finish off the last one if * necessary */ if (samePercent.length() > 0) { mouseOver.append(samePercent).append(": ").append(percent) .append("% "); } break; } int codonEncoded = codons[j]; final int pct = codonCount * 100 / totalCount; String codon = String .valueOf(CodingUtils.decodeCodon(codonEncoded)); StringBuilder sb = new StringBuilder(); Format.appendPercentage(sb, pct, percentDecPl); percent = sb.toString(); if (showProfileLogo || codonCount == modalCodonCount) { if (percent.equals(lastPercent) && j > 0) { samePercent.append(samePercent.length() == 0 ? "" : ", "); samePercent.append(codon); } else { if (samePercent.length() > 0) { mouseOver.append(samePercent).append(": ").append(lastPercent) .append("% "); } samePercent.setLength(0); samePercent.append(codon); } lastPercent = percent; } } consensusAnnotation.annotations[col] = new Annotation(modalCodon, mouseOver.toString(), ' ', pid); } } /** * Returns the number of decimal places to show for profile percentages. For * less than 100 sequences, returns zero (the integer percentage value will be * displayed). For 100-999 sequences, returns 1, for 1000-9999 returns 2, etc. * * @param nseq * @return */ protected static int getPercentageDp(long nseq) { int scale = 0; while (nseq >= 100) { scale++; nseq /= 10; } return scale; } /** * Returns the sorted HMM profile for the given column of the alignment. The * returned array contains * *

   *    [profileType=0, numberOfValues, 100, charValue1, percentage1, charValue2, percentage2, ...]
   * in descending order of percentage value
   *

* * @param hmm * @param column * @param removeBelowBackground * if true, ignores residues with probability less than their * background frequency * @param infoHeight * if true, uses the log ratio 'information' measure to scale the * value * @return */ public static int[] extractHMMProfile(HiddenMarkovModel hmm, int column, boolean removeBelowBackground, boolean infoHeight) { if (hmm == null) { return null; } String alphabet = hmm.getSymbols(); int size = alphabet.length(); char symbols[] = new char[size]; int values[] = new int[size]; int totalCount = 0; for (int i = 0; i < size; i++) { char symbol = alphabet.charAt(i); symbols[i] = symbol; int value = getAnalogueCount(hmm, column, symbol, removeBelowBackground, infoHeight); values[i] = value; totalCount += value; } /* * sort symbols by increasing emission probability */ QuickSort.sort(values, symbols); int[] profile = new int[3 + size * 2]; profile[0] = AlignmentAnnotation.SEQUENCE_PROFILE; profile[1] = size; profile[2] = 100; /* * order symbol/count profile by decreasing emission probability */ if (totalCount != 0) { int arrayPos = 3; for (int k = size - 1; k >= 0; k--) { Float percentage; int value = values[k]; if (removeBelowBackground) { percentage = ((float) value) / totalCount * 100f; } else { percentage = value / 100f; } int intPercent = Math.round(percentage); profile[arrayPos] = symbols[k]; profile[arrayPos + 1] = intPercent; arrayPos += 2; } } return profile; } /** * Converts the emission probability of a residue at a column in the alignment * to a 'count', suitable for rendering as an annotation value * * @param hmm * @param column * @param symbol * @param removeBelowBackground * if true, returns 0 for any symbol with a match emission * probability less than the background frequency * @infoHeight if true, uses the log ratio 'information content' to scale the * value * @return */ static int getAnalogueCount(HiddenMarkovModel hmm, int column, char symbol, boolean removeBelowBackground, boolean infoHeight) { double value = hmm.getMatchEmissionProbability(column, symbol); double freq = ResidueProperties.backgroundFrequencies .get(hmm.getAlphabetType()).get(symbol); if (value < freq && removeBelowBackground) { return 0; } if (infoHeight) { value = value * (Math.log(value / freq) / LOG2); } value = value * 10000d; return Math.round((float) value); } }