2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.datamodel.AlignedCodonFrame;
24 import jalview.datamodel.AlignmentAnnotation;
25 import jalview.datamodel.AlignmentI;
26 import jalview.datamodel.Annotation;
27 import jalview.datamodel.Profile;
28 import jalview.datamodel.ProfileI;
29 import jalview.datamodel.Profiles;
30 import jalview.datamodel.ProfilesI;
31 import jalview.datamodel.ResidueCount;
32 import jalview.datamodel.ResidueCount.SymbolCounts;
33 import jalview.datamodel.SecondaryStructureCount;
34 import jalview.datamodel.SeqCigar;
35 import jalview.datamodel.SequenceI;
36 import jalview.ext.android.SparseIntArray;
37 import jalview.util.Comparison;
38 import jalview.util.Constants;
39 import jalview.util.Format;
40 import jalview.util.MappingUtils;
41 import jalview.util.QuickSort;
43 import java.awt.Color;
44 import java.util.Arrays;
45 import java.util.Hashtable;
46 import java.util.List;
49 * Takes in a vector or array of sequences and column start and column end and
50 * returns a new Hashtable[] of size maxSeqLength, if Hashtable not supplied.
51 * This class is used extensively in calculating alignment colourschemes that
52 * depend on the amount of conservation in each alignment column.
57 public class AAFrequency
59 public static final String PROFILE = "P";
62 * Quick look-up of String value of char 'A' to 'Z'
64 private static final String[] CHARS = new String['Z' - 'A' + 1];
68 for (char c = 'A'; c <= 'Z'; c++)
70 CHARS[c - 'A'] = String.valueOf(c);
74 public static final ProfilesI calculate(List<SequenceI> list, int start,
77 return calculate(list, start, end, false);
80 public static final ProfilesI calculate(List<SequenceI> sequences,
81 int start, int end, boolean profile)
83 SequenceI[] seqs = new SequenceI[sequences.size()];
85 synchronized (sequences)
87 for (int i = 0; i < sequences.size(); i++)
89 seqs[i] = sequences.get(i);
90 int length = seqs[i].getLength();
102 ProfilesI reply = calculate(seqs, width, start, end, profile);
108 * Calculate the consensus symbol(s) for each column in the given range.
112 * the full width of the alignment
114 * start column (inclusive, base zero)
116 * end column (exclusive)
117 * @param saveFullProfile
118 * if true, store all symbol counts
120 public static final ProfilesI calculate(final SequenceI[] sequences,
121 int width, int start, int end, boolean saveFullProfile)
123 // long now = System.currentTimeMillis();
124 int seqCount = sequences.length;
125 boolean nucleotide = false;
126 int nucleotideCount = 0;
127 int peptideCount = 0;
129 ProfileI[] result = new ProfileI[width];
131 for (int column = start; column < end; column++)
134 * Apply a heuristic to detect nucleotide data (which can
135 * be counted in more compact arrays); here we test for
136 * more than 90% nucleotide; recheck every 10 columns in case
137 * of misleading data e.g. highly conserved Alanine in peptide!
138 * Mistakenly guessing nucleotide has a small performance cost,
139 * as it will result in counting in sparse arrays.
140 * Mistakenly guessing peptide has a small space cost,
141 * as it will use a larger than necessary array to hold counts.
143 if (nucleotideCount > 100 && column % 10 == 0)
145 nucleotide = (9 * peptideCount < nucleotideCount);
147 ResidueCount residueCounts = new ResidueCount(nucleotide);
149 for (int row = 0; row < seqCount; row++)
151 if (sequences[row] == null)
153 jalview.bin.Console.errPrintln(
154 "WARNING: Consensus skipping null sequence - possible race condition.");
157 if (sequences[row].getLength() > column)
159 char c = sequences[row].getCharAt(column);
160 residueCounts.add(c);
161 if (Comparison.isNucleotide(c))
165 else if (!Comparison.isGap(c))
173 * count a gap if the sequence doesn't reach this column
175 residueCounts.addGap();
179 int maxCount = residueCounts.getModalCount();
180 String maxResidue = residueCounts.getResiduesForCount(maxCount);
181 int gapCount = residueCounts.getGapCount();
182 ProfileI profile = new Profile(seqCount, gapCount, maxCount,
187 profile.setCounts(residueCounts);
190 result[column] = profile;
192 return new Profiles(result);
193 // long elapsed = System.currentTimeMillis() - now;
194 // jalview.bin.Console.outPrintln(elapsed);
198 public static final ProfilesI calculateSS(List<SequenceI> list, int start,
201 return calculateSS(list, start, end, false);
204 public static final ProfilesI calculateSS(List<SequenceI> sequences,
205 int start, int end, boolean profile)
207 SequenceI[] seqs = new SequenceI[sequences.size()];
209 synchronized (sequences)
211 for (int i = 0; i < sequences.size(); i++)
213 seqs[i] = sequences.get(i);
214 int length = seqs[i].getLength();
226 ProfilesI reply = calculateSS(seqs, width, start, end, profile);
231 public static final ProfilesI calculateSS(final SequenceI[] sequences,
232 int width, int start, int end, boolean saveFullProfile)
235 int seqCount = sequences.length;
237 ProfileI[] result = new ProfileI[width];
239 for (int column = start; column < end; column++)
244 SecondaryStructureCount ssCounts = new SecondaryStructureCount();
246 for (int row = 0; row < seqCount; row++)
248 if (sequences[row] == null)
250 jalview.bin.Console.errPrintln(
251 "WARNING: Consensus skipping null sequence - possible race condition.");
255 char c = sequences[row].getCharAt(column);
256 AlignmentAnnotation aa = AlignmentUtils.getDisplayedAlignmentAnnotation(sequences[row]);
261 if (sequences[row].getLength() > column && !Comparison.isGap(c) && aa !=null)
264 int seqPosition = sequences[row].findPosition(column);
266 char ss = AlignmentUtils.findSSAnnotationForGivenSeqposition(
273 else if(Comparison.isGap(c) && aa!=null) {
278 int maxSSCount = ssCounts.getModalCount();
279 String maxSS = ssCounts.getSSForCount(maxSSCount);
280 int gapCount = ssCounts.getGapCount();
281 ProfileI profile = new Profile(maxSS, ssCount, gapCount,
286 profile.setSSCounts(ssCounts);
289 result[column] = profile;
291 return new Profiles(result);
295 * Make an estimate of the profile size we are going to compute i.e. how many
296 * different characters may be present in it. Overestimating has a cost of
297 * using more memory than necessary. Underestimating has a cost of needing to
298 * extend the SparseIntArray holding the profile counts.
300 * @param profileSizes
301 * counts of sizes of profiles so far encountered
304 static int estimateProfileSize(SparseIntArray profileSizes)
306 if (profileSizes.size() == 0)
312 * could do a statistical heuristic here e.g. 75%ile
313 * for now just return the largest value
315 return profileSizes.keyAt(profileSizes.size() - 1);
319 * Derive the consensus annotations to be added to the alignment for display.
320 * This does not recompute the raw data, but may be called on a change in
321 * display options, such as 'ignore gaps', which may in turn result in a
322 * change in the derived values.
325 * the annotation row to add annotations to
327 * the source consensus data
329 * start column (inclusive)
331 * end column (exclusive)
333 * if true, normalise residue percentages ignoring gaps
334 * @param showSequenceLogo
335 * if true include all consensus symbols, else just show modal
338 * number of sequences
340 public static void completeConsensus(AlignmentAnnotation consensus,
341 ProfilesI profiles, int startCol, int endCol, boolean ignoreGaps,
342 boolean showSequenceLogo, long nseq)
344 // long now = System.currentTimeMillis();
345 if (consensus == null || consensus.annotations == null
346 || consensus.annotations.length < endCol)
349 * called with a bad alignment annotation row
350 * wait for it to be initialised properly
355 for (int i = startCol; i < endCol; i++)
357 ProfileI profile = profiles.get(i);
361 * happens if sequences calculated over were
362 * shorter than alignment width
364 consensus.annotations[i] = null;
368 final int dp = getPercentageDp(nseq);
370 float value = profile.getPercentageIdentity(ignoreGaps);
372 String description = getTooltip(profile, value, showSequenceLogo,
375 String modalResidue = profile.getModalResidue();
376 if ("".equals(modalResidue))
380 else if (modalResidue.length() > 1)
384 consensus.annotations[i] = new Annotation(modalResidue, description,
387 // long elapsed = System.currentTimeMillis() - now;
388 // jalview.bin.Console.outPrintln(-elapsed);
392 public static void completeSSConsensus(AlignmentAnnotation ssConsensus,
393 ProfilesI profiles, int startCol, int endCol, boolean ignoreGaps,
394 boolean showSequenceLogo, long nseq)
396 // long now = System.currentTimeMillis();
397 if (ssConsensus == null || ssConsensus.annotations == null
398 || ssConsensus.annotations.length < endCol)
401 * called with a bad alignment annotation row
402 * wait for it to be initialised properly
407 for (int i = startCol; i < endCol; i++)
409 ProfileI profile = profiles.get(i);
413 * happens if sequences calculated over were
414 * shorter than alignment width
416 ssConsensus.annotations[i] = null;
420 final int dp = getPercentageDp(nseq);
422 float value = profile.getSSPercentageIdentity(ignoreGaps);
424 String description = getSSTooltip(profile, value, showSequenceLogo,
427 String modalSS = profile.getModalSS();
428 if ("".equals(modalSS))
432 else if (modalSS.length() > 1)
436 ssConsensus.annotations[i] = new Annotation(modalSS, description,
439 // long elapsed = System.currentTimeMillis() - now;
440 // jalview.bin.Console.outPrintln(-elapsed);
444 * Derive the gap count annotation row.
447 * the annotation row to add annotations to
449 * the source consensus data
451 * start column (inclusive)
453 * end column (exclusive)
455 public static void completeGapAnnot(AlignmentAnnotation gaprow,
456 ProfilesI profiles, int startCol, int endCol, long nseq)
458 if (gaprow == null || gaprow.annotations == null
459 || gaprow.annotations.length < endCol)
462 * called with a bad alignment annotation row
463 * wait for it to be initialised properly
467 // always set ranges again
468 gaprow.graphMax = nseq;
470 double scale = 0.8 / nseq;
471 for (int i = startCol; i < endCol; i++)
473 ProfileI profile = profiles.get(i);
477 * happens if sequences calculated over were
478 * shorter than alignment width
480 gaprow.annotations[i] = null;
484 final int gapped = profile.getNonGapped();
486 String description = "" + gapped;
488 gaprow.annotations[i] = new Annotation("", description, '\0', gapped,
489 jalview.util.ColorUtils.bleachColour(Color.DARK_GRAY,
490 (float) scale * gapped));
495 * Returns a tooltip showing either
497 * <li>the full profile (percentages of all residues present), if
498 * showSequenceLogo is true, or</li>
499 * <li>just the modal (most common) residue(s), if showSequenceLogo is
502 * Percentages are as a fraction of all sequence, or only ungapped sequences
503 * if ignoreGaps is true.
507 * @param showSequenceLogo
510 * the number of decimal places to format percentages to
513 static String getTooltip(ProfileI profile, float pid,
514 boolean showSequenceLogo, boolean ignoreGaps, int dp)
516 ResidueCount counts = profile.getCounts();
518 String description = null;
519 if (counts != null && showSequenceLogo)
521 int normaliseBy = ignoreGaps ? profile.getNonGapped()
522 : profile.getHeight();
523 description = counts.getTooltip(normaliseBy, dp);
527 StringBuilder sb = new StringBuilder(64);
528 String maxRes = profile.getModalResidue();
529 if (maxRes.length() > 1)
531 sb.append("[").append(maxRes).append("]");
537 if (maxRes.length() > 0)
540 Format.appendPercentage(sb, pid, dp);
543 description = sb.toString();
548 static String getSSTooltip(ProfileI profile, float pid,
549 boolean showSequenceLogo, boolean ignoreGaps, int dp)
551 SecondaryStructureCount counts = profile.getSSCounts();
553 String description = null;
554 if (counts != null && showSequenceLogo)
556 int normaliseBy = ignoreGaps ? profile.getNonGapped()
557 : profile.getHeight();
558 description = counts.getTooltip(normaliseBy, dp);
562 StringBuilder sb = new StringBuilder(64);
563 String maxSS = profile.getModalSS();
564 if (maxSS.length() > 1)
566 sb.append("[").append(maxSS).append("]");
572 if (maxSS.length() > 0)
575 Format.appendPercentage(sb, pid, dp);
578 description = sb.toString();
584 * Returns the sorted profile for the given consensus data. The returned array
588 * [profileType, numberOfValues, totalPercent, charValue1, percentage1, charValue2, percentage2, ...]
589 * in descending order of percentage value
593 * the data object from which to extract and sort values
595 * if true, only non-gapped values are included in percentage
599 public static int[] extractProfile(ProfileI profile, boolean ignoreGaps)
604 if (profile.getCounts() != null)
606 ResidueCount counts = profile.getCounts();
607 SymbolCounts symbolCounts = counts.getSymbolCounts();
608 symbols = symbolCounts.symbols;
609 values = symbolCounts.values;
612 else if(profile.getSSCounts() != null)
614 SecondaryStructureCount counts = profile.getSSCounts();
616 SecondaryStructureCount.SymbolCounts symbolCounts = counts.getSymbolCounts();
617 symbols = symbolCounts.symbols;
618 values = symbolCounts.values;
625 QuickSort.sort(values, symbols);
626 int totalPercentage = 0;
627 final int divisor = ignoreGaps ? profile.getNonGapped()
628 : profile.getHeight();
631 * traverse the arrays in reverse order (highest counts first)
633 int[] result = new int[3 + 2 * symbols.length];
634 int nextArrayPos = 3;
635 int nonZeroCount = 0;
637 for (int i = symbols.length - 1; i >= 0; i--)
639 int theChar = symbols[i];
640 int charCount = values[i];
641 final int percentage = (charCount * 100) / divisor;
645 * this count (and any remaining) round down to 0% - discard
650 result[nextArrayPos++] = theChar;
651 result[nextArrayPos++] = percentage;
652 totalPercentage += percentage;
656 * truncate array if any zero values were discarded
658 if (nonZeroCount < symbols.length)
660 int[] tmp = new int[3 + 2 * nonZeroCount];
661 System.arraycopy(result, 0, tmp, 0, tmp.length);
666 * fill in 'header' values
668 result[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
669 result[1] = nonZeroCount;
670 result[2] = totalPercentage;
676 * Extract a sorted extract of cDNA codon profile data. The returned array
680 * [profileType, numberOfValues, totalPercentage, charValue1, percentage1, charValue2, percentage2, ...]
681 * in descending order of percentage value, where the character values encode codon triplets
687 public static int[] extractCdnaProfile(
688 Hashtable<String, Object> hashtable, boolean ignoreGaps)
690 // this holds #seqs, #ungapped, and then codon count, indexed by encoded
692 int[] codonCounts = (int[]) hashtable.get(PROFILE);
693 int[] sortedCounts = new int[codonCounts.length - 2];
694 System.arraycopy(codonCounts, 2, sortedCounts, 0,
695 codonCounts.length - 2);
697 int[] result = new int[3 + 2 * sortedCounts.length];
698 // first value is just the type of profile data
699 result[0] = AlignmentAnnotation.CDNA_PROFILE;
701 char[] codons = new char[sortedCounts.length];
702 for (int i = 0; i < codons.length; i++)
704 codons[i] = (char) i;
706 QuickSort.sort(sortedCounts, codons);
707 int totalPercentage = 0;
708 int distinctValuesCount = 0;
710 int divisor = ignoreGaps ? codonCounts[1] : codonCounts[0];
711 for (int i = codons.length - 1; i >= 0; i--)
713 final int codonCount = sortedCounts[i];
716 break; // nothing else of interest here
718 final int percentage = codonCount * 100 / divisor;
722 * this (and any remaining) values rounded down to 0 - discard
726 distinctValuesCount++;
727 result[j++] = codons[i];
728 result[j++] = percentage;
729 totalPercentage += percentage;
731 result[2] = totalPercentage;
734 * Just return the non-zero values
736 // todo next value is redundant if we limit the array to non-zero counts
737 result[1] = distinctValuesCount;
738 return Arrays.copyOfRange(result, 0, j);
742 * Compute a consensus for the cDNA coding for a protein alignment.
745 * the protein alignment (which should hold mappings to cDNA
748 * the consensus data stores to be populated (one per column)
750 public static void calculateCdna(AlignmentI alignment,
751 Hashtable<String, Object>[] hconsensus)
753 final char gapCharacter = alignment.getGapCharacter();
754 List<AlignedCodonFrame> mappings = alignment.getCodonFrames();
755 if (mappings == null || mappings.isEmpty())
760 int cols = alignment.getWidth();
761 for (int col = 0; col < cols; col++)
763 // todo would prefer a Java bean for consensus data
764 Hashtable<String, Object> columnHash = new Hashtable<>();
765 // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1)
766 int[] codonCounts = new int[66];
767 codonCounts[0] = alignment.getSequences().size();
768 int ungappedCount = 0;
769 for (SequenceI seq : alignment.getSequences())
771 if (seq.getCharAt(col) == gapCharacter)
775 List<char[]> codons = MappingUtils.findCodonsFor(seq, col,
777 for (char[] codon : codons)
779 int codonEncoded = CodingUtils.encodeCodon(codon);
780 if (codonEncoded >= 0)
782 codonCounts[codonEncoded + 2]++;
788 codonCounts[1] = ungappedCount;
789 // todo: sort values here, save counts and codons?
790 columnHash.put(PROFILE, codonCounts);
791 hconsensus[col] = columnHash;
796 * Derive displayable cDNA consensus annotation from computed consensus data.
798 * @param consensusAnnotation
799 * the annotation row to be populated for display
800 * @param consensusData
801 * the computed consensus data
802 * @param showProfileLogo
803 * if true show all symbols present at each position, else only the
806 * the number of sequences in the alignment
808 public static void completeCdnaConsensus(
809 AlignmentAnnotation consensusAnnotation,
810 Hashtable<String, Object>[] consensusData,
811 boolean showProfileLogo, int nseqs)
813 if (consensusAnnotation == null
814 || consensusAnnotation.annotations == null
815 || consensusAnnotation.annotations.length < consensusData.length)
817 // called with a bad alignment annotation row - wait for it to be
818 // initialised properly
822 // ensure codon triplet scales with font size
823 consensusAnnotation.scaleColLabel = true;
824 for (int col = 0; col < consensusData.length; col++)
826 Hashtable<String, Object> hci = consensusData[col];
829 // gapped protein column?
832 // array holds #seqs, #ungapped, then codon counts indexed by codon
833 final int[] codonCounts = (int[]) hci.get(PROFILE);
837 * First pass - get total count and find the highest
839 final char[] codons = new char[codonCounts.length - 2];
840 for (int j = 2; j < codonCounts.length; j++)
842 final int codonCount = codonCounts[j];
843 codons[j - 2] = (char) (j - 2);
844 totalCount += codonCount;
848 * Sort array of encoded codons by count ascending - so the modal value
849 * goes to the end; start by copying the count (dropping the first value)
851 int[] sortedCodonCounts = new int[codonCounts.length - 2];
852 System.arraycopy(codonCounts, 2, sortedCodonCounts, 0,
853 codonCounts.length - 2);
854 QuickSort.sort(sortedCodonCounts, codons);
856 int modalCodonEncoded = codons[codons.length - 1];
857 int modalCodonCount = sortedCodonCounts[codons.length - 1];
858 String modalCodon = String
859 .valueOf(CodingUtils.decodeCodon(modalCodonEncoded));
860 if (sortedCodonCounts.length > 1 && sortedCodonCounts[codons.length
861 - 2] == sortedCodonCounts[codons.length - 1])
864 * two or more codons share the modal count
868 float pid = sortedCodonCounts[sortedCodonCounts.length - 1] * 100
869 / (float) totalCount;
872 * todo ? Replace consensus hashtable with sorted arrays of codons and
873 * counts (non-zero only). Include total count in count array [0].
877 * Scan sorted array backwards for most frequent values first. Show
878 * repeated values compactly.
880 StringBuilder mouseOver = new StringBuilder(32);
881 StringBuilder samePercent = new StringBuilder();
882 String percent = null;
883 String lastPercent = null;
884 int percentDecPl = getPercentageDp(nseqs);
886 for (int j = codons.length - 1; j >= 0; j--)
888 int codonCount = sortedCodonCounts[j];
892 * remaining codons are 0% - ignore, but finish off the last one if
895 if (samePercent.length() > 0)
897 mouseOver.append(samePercent).append(": ").append(percent)
902 int codonEncoded = codons[j];
903 final int pct = codonCount * 100 / totalCount;
904 String codon = String
905 .valueOf(CodingUtils.decodeCodon(codonEncoded));
906 StringBuilder sb = new StringBuilder();
907 Format.appendPercentage(sb, pct, percentDecPl);
908 percent = sb.toString();
909 if (showProfileLogo || codonCount == modalCodonCount)
911 if (percent.equals(lastPercent) && j > 0)
913 samePercent.append(samePercent.length() == 0 ? "" : ", ");
914 samePercent.append(codon);
918 if (samePercent.length() > 0)
920 mouseOver.append(samePercent).append(": ").append(lastPercent)
923 samePercent.setLength(0);
924 samePercent.append(codon);
926 lastPercent = percent;
930 consensusAnnotation.annotations[col] = new Annotation(modalCodon,
931 mouseOver.toString(), ' ', pid);
936 * Returns the number of decimal places to show for profile percentages. For
937 * less than 100 sequences, returns zero (the integer percentage value will be
938 * displayed). For 100-999 sequences, returns 1, for 1000-9999 returns 2, etc.
943 protected static int getPercentageDp(long nseq)