2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.analysis.scoremodels.ScoreMatrix;
24 import jalview.analysis.scoremodels.ScoreModels;
25 import jalview.datamodel.AlignmentAnnotation;
26 import jalview.datamodel.Annotation;
27 import jalview.datamodel.ResidueCount;
28 import jalview.datamodel.ResidueCount.SymbolCounts;
29 import jalview.datamodel.Sequence;
30 import jalview.datamodel.SequenceI;
31 import jalview.schemes.ResidueProperties;
32 import jalview.util.Comparison;
34 import java.awt.Color;
35 import java.util.List;
37 import java.util.Map.Entry;
38 import java.util.SortedMap;
39 import java.util.TreeMap;
40 import java.util.Vector;
43 * Calculates conservation values for a given set of sequences
45 public class Conservation
48 * need to have a minimum of 3% of sequences with a residue
49 * for it to be included in the conservation calculation
51 private static final int THRESHOLD_PERCENT = 3;
53 private static final int TOUPPERCASE = 'a' - 'A';
55 SequenceI[] sequences;
61 Vector<int[]> seqNums; // vector of int vectors where first is sequence
64 int maxLength = 0; // used by quality calcs
66 boolean seqNumsChanged = false; // updated after any change via calcSeqNum;
69 * a map per column with {property, conservation} where conservation value is
70 * 1 (property is conserved), 0 (absence of property is conserved) or -1
71 * (property is not conserved i.e. column has residues with and without it)
73 Map<String, Integer>[] total;
75 boolean canonicaliseAa = true; // if true then conservation calculation will
77 // map all symbols to canonical aa numbering
78 // rather than consider conservation of that
81 /** Stores calculated quality values */
82 private Vector<Double> quality;
84 /** Stores maximum and minimum values of quality values */
85 private double[] qualityRange = new double[2];
87 private Sequence consSequence;
90 * percentage of residues in a column to qualify for counting conservation
92 private int threshold;
94 private String name = "";
96 private int[][] cons2;
98 private String[] consSymbs;
101 * Constructor using default threshold of 3%
104 * Name of conservation
106 * sequences to be used in calculation
108 * start residue position
110 * end residue position
112 public Conservation(String name, List<SequenceI> sequences, int start,
115 this(name, THRESHOLD_PERCENT, sequences, start, end);
122 * Name of conservation
124 * percentage of sequences at or below which property conservation is
127 * sequences to be used in calculation
129 * start column position
131 * end column position
133 public Conservation(String name, int threshold,
134 List<SequenceI> sequences, int start, int end)
137 this.threshold = threshold;
141 maxLength = end - start + 1; // default width includes bounds of
144 int s, sSize = sequences.size();
145 SequenceI[] sarray = new SequenceI[sSize];
146 this.sequences = sarray;
149 for (s = 0; s < sSize; s++)
151 sarray[s] = sequences.get(s);
152 if (sarray[s].getLength() > maxLength)
154 maxLength = sarray[s].getLength();
157 } catch (ArrayIndexOutOfBoundsException ex)
159 // bail - another thread has modified the sequence array, so the current
160 // calculation is probably invalid.
161 this.sequences = new SequenceI[0];
167 * Translate sequence i into score matrix indices and store it in the i'th
168 * position of the seqNums array.
173 private void calcSeqNum(int i, ScoreMatrix sm)
175 int gapIndex = sm.getMatrixIndex(' ');
176 int sSize = sequences.length;
178 if ((i > -1) && (i < sSize))
180 String sq = sequences[i].getSequenceAsString();
182 if (seqNums.size() <= i)
184 seqNums.addElement(new int[sq.length() + 1]);
188 * the first entry in the array is the sequence's hashcode,
189 * following entries are matrix indices of sequence characters
191 if (sq.hashCode() != seqNums.elementAt(i)[0])
195 seqNumsChanged = true;
203 int[] sqnum = new int[len + 1]; // better to always make a new array -
204 // sequence can change its length
205 sqnum[0] = sq.hashCode();
207 for (j = 1; j <= len; j++)
209 // sqnum[j] = ResidueProperties.aaIndex[sq.charAt(j - 1)];
210 sqnum[j] = sm.getMatrixIndex(sq.charAt(j - 1));
217 seqNums.setElementAt(sqnum, i);
221 System.out.println("SEQUENCE HAS BEEN DELETED!!!");
226 // JBPNote INFO level debug
228 .println("ERROR: calcSeqNum called with out of range sequence index for Alignment\n");
233 * Calculates the conservation values for given set of sequences
235 public void calculate()
237 int height = sequences.length;
239 total = new Map[maxLength];
241 for (int column = start; column <= end; column++)
243 ResidueCount values = countResidues(column);
246 * percentage count at or below which we ignore residues
248 int thresh = (threshold * height) / 100;
251 * check observed residues in column and record whether each
252 * physico-chemical property is conserved (+1), absence conserved (0),
253 * or not conserved (-1)
254 * Using TreeMap means properties are displayed in alphabetical order
256 SortedMap<String, Integer> resultHash = new TreeMap<String, Integer>();
257 SymbolCounts symbolCounts = values.getSymbolCounts();
258 char[] symbols = symbolCounts.symbols;
259 int[] counts = symbolCounts.values;
260 for (int j = 0; j < symbols.length; j++)
263 if (counts[j] > thresh)
265 recordConservation(resultHash, String.valueOf(c));
268 if (values.getGapCount() > thresh)
270 recordConservation(resultHash, "-");
273 if (total.length > 0)
275 total[column - start] = resultHash;
281 * Updates the conservation results for an observed residue
284 * a map of {property, conservation} where conservation value is +1
285 * (all residues have the property), 0 (no residue has the property)
286 * or -1 (some do, some don't)
289 protected static void recordConservation(Map<String, Integer> resultMap,
292 res = res.toUpperCase();
293 for (Entry<String, Map<String, Integer>> property : ResidueProperties.propHash
296 String propertyName = property.getKey();
297 Integer residuePropertyValue = property.getValue().get(res);
299 if (!resultMap.containsKey(propertyName))
302 * first time we've seen this residue - note whether it has this property
304 if (residuePropertyValue != null)
306 resultMap.put(propertyName, residuePropertyValue);
311 * unrecognised residue - use default value for property
313 resultMap.put(propertyName, property.getValue().get("-"));
318 Integer currentResult = resultMap.get(propertyName);
319 if (currentResult.intValue() != -1
320 && !currentResult.equals(residuePropertyValue))
323 * property is unconserved - residues seen both with and without it
325 resultMap.put(propertyName, Integer.valueOf(-1));
332 * Counts residues (upper-cased) and gaps in the given column
337 protected ResidueCount countResidues(int column)
339 ResidueCount values = new ResidueCount(false);
341 for (int row = 0; row < sequences.length; row++)
343 if (sequences[row].getLength() > column)
345 char c = sequences[row].getCharAt(column);
348 int index = ResidueProperties.aaIndex[c];
349 c = index > 20 ? '-' : ResidueProperties.aa[index].charAt(0);
355 if (Comparison.isGap(c))
373 * Counts conservation and gaps for a column of the alignment
375 * @return { 1 if fully conserved, else 0, gap count }
377 public int[] countConservationAndGaps(int column)
380 boolean fullyConserved = true;
381 int iSize = sequences.length;
385 return new int[] { 0, 0 };
389 for (int i = 0; i < iSize; i++)
391 if (column >= sequences[i].getLength())
397 char c = sequences[i].getCharAt(column); // gaps do not have upper/lower case
399 if (Comparison.isGap((c)))
412 fullyConserved = false;
417 int[] r = new int[] { fullyConserved ? 1 : 0, gapCount };
422 * Returns the upper-cased character if between 'a' and 'z', else the
428 char toUpperCase(char c)
430 if ('a' <= c && c <= 'z')
438 * Calculates the conservation sequence
440 * @param positiveOnly
441 * if true, calculate positive conservation; else calculate both
442 * positive and negative conservation
443 * @param maxPercentageGaps
444 * the percentage of gaps in a column, at or above which no
445 * conservation is asserted
447 public void verdict(boolean positiveOnly, float maxPercentageGaps)
449 // TODO call this at the end of calculate(), should not be a public method
451 StringBuilder consString = new StringBuilder(end);
453 // NOTE THIS SHOULD CHECK IF THE CONSEQUENCE ALREADY
454 // EXISTS AND NOT OVERWRITE WITH '-', BUT THIS CASE
455 // DOES NOT EXIST IN JALVIEW 2.1.2
456 for (int i = 0; i < start; i++)
458 consString.append('-');
460 consSymbs = new String[end - start + 1];
461 for (int i = start; i <= end; i++)
463 int[] gapcons = countConservationAndGaps(i);
464 boolean fullyConserved = gapcons[0] == 1;
465 int totGaps = gapcons[1];
466 float pgaps = (totGaps * 100f) / sequences.length;
468 if (maxPercentageGaps > pgaps)
470 Map<String, Integer> resultHash = total[i - start];
472 StringBuilder positives = new StringBuilder(64);
473 StringBuilder negatives = new StringBuilder(32);
474 for (String type : resultHash.keySet())
476 int result = resultHash.get(type).intValue();
480 * not conserved (present or absent)
488 * positively conserved property (all residues have it)
490 positives.append(positives.length() == 0 ? "" : " ");
491 positives.append(type);
493 if (result == 0 && !positiveOnly)
496 * absense of property is conserved (all residues lack it)
498 negatives.append(negatives.length() == 0 ? "" : " ");
499 negatives.append("!").append(type);
502 if (negatives.length() > 0)
504 positives.append(" ").append(negatives);
506 consSymbs[i - start] = positives.toString();
510 consString.append(count); // Conserved props!=Identity
514 consString.append(fullyConserved ? "*" : "+");
519 consString.append('-');
523 consSequence = new Sequence(name, consString.toString(), start, end);
529 * @return Conservation sequence
531 public SequenceI getConsSequence()
536 // From Alignment.java in jalview118
537 public void findQuality()
539 findQuality(0, maxLength - 1);
547 private void percentIdentity(ScoreMatrix sm)
549 seqNums = new Vector<int[]>();
550 int i = 0, iSize = sequences.length;
551 // Do we need to calculate this again?
552 for (i = 0; i < iSize; i++)
557 int gapIndex = sm.getMatrixIndex(' ');
559 if ((cons2 == null) || seqNumsChanged)
561 // FIXME remove magic number 24 without changing calc
562 // sm.getSize() returns 25 so doesn't quite do it...
563 cons2 = new int[maxLength][24];
567 while (j < sequences.length)
569 int[] sqnum = seqNums.elementAt(j);
571 for (i = 1; i < sqnum.length; i++)
573 cons2[i - 1][sqnum[i]]++;
576 for (i = sqnum.length - 1; i < maxLength; i++)
578 cons2[i][gapIndex]++; // gap count
586 * Calculates the quality of the set of sequences
593 public void findQuality(int startRes, int endRes)
595 quality = new Vector<Double>();
597 double max = -Double.MAX_VALUE;
598 ScoreMatrix blosum62 = ScoreModels.getInstance().getBlosum62();
599 float[][] blosumScores = blosum62.getMatrix();
600 int gapIndex = blosum62.getMatrixIndex(' ');
602 // Loop over columns // JBPNote Profiling info
603 // long ts = System.currentTimeMillis();
604 // long te = System.currentTimeMillis();
605 percentIdentity(blosum62);
607 int size = seqNums.size();
608 int[] lengths = new int[size];
609 double tot, bigtot, sr, tmp;
611 int l, j, i, ii, i2, k, seqNum;
613 for (l = 0; l < size; l++)
615 lengths[l] = seqNums.elementAt(l).length - 1;
618 // todo ? remove '*' (unused?) from score matrix and
619 // use getSize() here instead of getSize() - 1 ??
620 final int symbolCount = blosum62.getSize() - 1; // 24;
622 for (j = startRes; j <= endRes; j++)
626 // First Xr = depends on column only
627 x = new double[symbolCount];
629 for (ii = 0; ii < symbolCount; ii++)
633 for (i2 = 0; i2 < symbolCount; i2++)
635 x[ii] += (((double) cons2[j][i2] * blosumScores[ii][i2]) + 4);
641 // Now calculate D for each position and sum
642 for (k = 0; k < size; k++)
645 xx = new double[symbolCount];
646 seqNum = (j < lengths[k]) ? seqNums.elementAt(k)[j + 1] : gapIndex;
647 // Sequence, or gap at the end
649 // This is a loop over r
650 for (i = 0; i < symbolCount - 1; i++)
654 sr = (double) blosumScores[i][seqNum] + 4;
656 // Calculate X with another loop over residues
657 // System.out.println("Xi " + i + " " + x[i] + " " + sr);
660 tot += (xx[i] * xx[i]);
663 bigtot += Math.sqrt(tot);
666 // This is the quality for one column
672 // bigtot = bigtot * (size-cons2[j][23])/size;
673 quality.addElement(new Double(bigtot));
675 // Need to normalize by gaps
678 double newmax = -Double.MAX_VALUE;
680 for (j = startRes; j <= endRes; j++)
682 tmp = quality.elementAt(j).doubleValue();
683 // tmp = ((max - tmp) * (size - cons2[j][23])) / size;
684 tmp = ((max - tmp) * (size - cons2[j][gapIndex])) / size;
686 // System.out.println(tmp+ " " + j);
687 quality.setElementAt(new Double(tmp), j);
695 // System.out.println("Quality " + s);
696 qualityRange[0] = 0D;
697 qualityRange[1] = newmax;
701 * Complete the given consensus and quuality annotation rows. Note: currently
702 * this method will enlarge the given annotation row if it is too small,
703 * otherwise will leave its length unchanged.
705 * @param conservation
706 * conservation annotation row
708 * (optional - may be null)
710 * first column for conservation
712 * extent of conservation
714 public void completeAnnotations(AlignmentAnnotation conservation,
715 AlignmentAnnotation quality2, int istart, int alWidth)
717 char[] sequence = getConsSequence().getSequence();
729 maxB = 0f - minB; // scalable range for colouring both Conservation and
739 if (conservation != null && conservation.annotations != null
740 && conservation.annotations.length < alWidth)
742 conservation.annotations = new Annotation[alWidth];
745 if (quality2 != null)
747 quality2.graphMax = (float) qualityRange[1];
748 if (quality2.annotations != null
749 && quality2.annotations.length < alWidth)
751 quality2.annotations = new Annotation[alWidth];
753 qmin = (float) qualityRange[0];
754 qmax = (float) qualityRange[1];
757 for (int i = istart; i < alWidth; i++)
763 if (Character.isDigit(c))
776 if (conservation != null)
778 float vprop = value - min;
780 int consp = i - start;
781 String conssym = (value > 0 && consp > -1 && consp < consSymbs.length) ? consSymbs[consp]
783 conservation.annotations[i] = new Annotation(String.valueOf(c),
784 conssym, ' ', value, new Color(minR + (maxR * vprop), minG
785 + (maxG * vprop), minB + (maxB * vprop)));
789 if (quality2 != null)
791 value = quality.elementAt(i).floatValue();
792 float vprop = value - qmin;
794 quality2.annotations[i] = new Annotation(" ",
795 String.valueOf(value), ' ', value, new Color(minR
796 + (maxR * vprop), minG + (maxG * vprop), minB
803 * construct and call the calculation methods on a new Conservation object
806 * - name of conservation
809 * first column in calculation window
811 * last column in calculation window
812 * @param positiveOnly
813 * calculate positive (true) or positive and negative (false)
815 * @param maxPercentGaps
816 * percentage of gaps tolerated in column
818 * flag indicating if alignment quality should be calculated
819 * @return Conservation object ready for use in visualization
821 public static Conservation calculateConservation(String name,
822 List<SequenceI> seqs, int start, int end, boolean positiveOnly,
823 int maxPercentGaps, boolean calcQuality)
825 Conservation cons = new Conservation(name, seqs, start, end);
827 cons.verdict(positiveOnly, maxPercentGaps);
838 * Returns the computed tooltip (annotation description) for a given column.
839 * The tip is empty if the conservation score is zero, otherwise holds the
840 * conserved properties (and, optionally, properties whose absence is
846 String getTooltip(int column)
848 char[] sequence = getConsSequence().getSequence();
849 char val = column < sequence.length ? sequence[column] : '-';
850 boolean hasConservation = val != '-' && val != '0';
851 int consp = column - start;
852 String tip = (hasConservation && consp > -1 && consp < consSymbs.length) ? consSymbs[consp]