src/jalview/analysis/AAFrequency.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.9)
   3  * Copyright (C) 2015 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.Annotation;
  27 import jalview.datamodel.SequenceI;
  28 import jalview.util.Format;
  29 import jalview.util.MappingUtils;
  30 import jalview.util.QuickSort;
  31
  32 import java.util.Arrays;
  33 import java.util.Hashtable;
  34 import java.util.List;
  35 import java.util.Set;
  36
  37 /**
 * Computes per-column residue frequency (consensus) data for a list or array
 * of sequences over a given range of columns. The list-based methods return a
 * new Hashtable[] with one entry per column of the longest sequence; the
 * array-based method fills a Hashtable[] supplied by the caller. This class is
 * used extensively in calculating alignment colourschemes that depend on the
 * amount of conservation in each alignment column.
  42  *
  43  * @author $author$
  44  * @version $Revision$
  45  */
  46 public class AAFrequency
  47 {
  48   private static final int TO_UPPER_CASE = 'A' - 'a'; // -32
  49
  50   public static final String MAXCOUNT = "C";
  51
  52   public static final String MAXRESIDUE = "R";
  53
  54   public static final String PID_GAPS = "G";
  55
  56   public static final String PID_NOGAPS = "N";
  57
  58   public static final String PROFILE = "P";
  59
  60   public static final String ENCODED_CHARS = "E";
  61
  62   /*
  63    * Quick look-up of String value of char 'A' to 'Z'
  64    */
  65   private static final String[] CHARS = new String['Z' - 'A' + 1];
  66
  67   static
  68   {
  69     for (char c = 'A'; c <= 'Z'; c++)
  70     {
  71       CHARS[c - 'A'] = String.valueOf(c);
  72     }
  73   }
  74
  75   public static final Hashtable[] calculate(List<SequenceI> list,
  76           int start, int end)
  77   {
  78     return calculate(list, start, end, false);
  79   }
  80
  81   public static final Hashtable[] calculate(List<SequenceI> sequences,
  82           int start, int end, boolean profile)
  83   {
  84     SequenceI[] seqs = new SequenceI[sequences.size()];
  85     int width = 0;
  86     synchronized (sequences)
  87     {
  88       for (int i = 0; i < sequences.size(); i++)
  89       {
  90         seqs[i] = sequences.get(i);
  91         if (seqs[i].getLength() > width)
  92         {
  93           width = seqs[i].getLength();
  94         }
  95       }
  96
  97       Hashtable[] reply = new Hashtable[width];
  98
  99       if (end >= width)
 100       {
 101         end = width;
 102       }
 103
 104       calculate(seqs, start, end, reply, profile);
 105       return reply;
 106     }
 107   }
 108
 109   public static final void calculate(SequenceI[] sequences, int start,
 110           int end, Hashtable[] result, boolean profile)
 111   {
 112     Hashtable residueHash;
 113     int maxCount, nongap, i, j, v;
 114     int jSize = sequences.length;
 115     String maxResidue;
 116     char c = '-';
 117     float percentage;
 118
 119     int[] values = new int[255];
 120
 121     char[] seq;
 122
 123     for (i = start; i < end; i++)
 124     {
 125       residueHash = new Hashtable();
 126       maxCount = 0;
 127       maxResidue = "";
 128       nongap = 0;
 129       values = new int[255];
 130
 131       for (j = 0; j < jSize; j++)
 132       {
 133         if (sequences[j] == null)
 134         {
 135           System.err
 136                   .println("WARNING: Consensus skipping null sequence - possible race condition.");
 137           continue;
 138         }
 139         seq = sequences[j].getSequence();
 140         if (seq.length > i)
 141         {
 142           c = seq[i];
 143
 144           if (c == '.' || c == ' ')
 145           {
 146             c = '-';
 147           }
 148
 149           if (c == '-')
 150           {
 151             values['-']++;
 152             continue;
 153           }
 154           else if ('a' <= c && c <= 'z')
 155           {
 156             c += TO_UPPER_CASE;
 157           }
 158
 159           nongap++;
 160           values[c]++;
 161
 162         }
 163         else
 164         {
 165           values['-']++;
 166         }
 167       }
 168       if (jSize == 1)
 169       {
 170         maxResidue = String.valueOf(c);
 171         maxCount = 1;
 172       }
 173       else
 174       {
 175         for (v = 'A'; v <= 'Z'; v++)
 176         {
 177           // TODO why ignore values[v] == 1?
 178           if (values[v] < 1 /* 2 */|| values[v] < maxCount)
 179           {
 180             continue;
 181           }
 182
 183           if (values[v] > maxCount)
 184           {
 185             maxResidue = CHARS[v - 'A'];
 186           }
 187           else if (values[v] == maxCount)
 188           {
 189             maxResidue += CHARS[v - 'A'];
 190           }
 191           maxCount = values[v];
 192         }
 193       }
 194       if (maxResidue.length() == 0)
 195       {
 196         maxResidue = "-";
 197       }
 198       if (profile)
 199       {
 200         // TODO use a 1-dimensional array with jSize, nongap in [0] and [1]
 201         residueHash.put(PROFILE, new int[][] { values,
 202             new int[] { jSize, nongap } });
 203       }
 204       residueHash.put(MAXCOUNT, new Integer(maxCount));
 205       residueHash.put(MAXRESIDUE, maxResidue);
 206
 207       percentage = ((float) maxCount * 100) / jSize;
 208       residueHash.put(PID_GAPS, new Float(percentage));
 209
 210       if (nongap > 0)
 211       {
 212         // calculate for non-gapped too
 213         percentage = ((float) maxCount * 100) / nongap;
 214       }
 215       residueHash.put(PID_NOGAPS, new Float(percentage));
 216
 217       result[i] = residueHash;
 218     }
 219   }
 220
 221   /**
 222    * Compute all or part of the annotation row from the given consensus
 223    * hashtable
 224    *
 225    * @param consensus
 226    *          - pre-allocated annotation row
 227    * @param hconsensus
 228    * @param iStart
 229    * @param width
 230    * @param ignoreGapsInConsensusCalculation
 231    * @param includeAllConsSymbols
 232    * @param nseq
 233    */
 234   public static void completeConsensus(AlignmentAnnotation consensus,
 235           Hashtable[] hconsensus, int iStart, int width,
 236           boolean ignoreGapsInConsensusCalculation,
 237           boolean includeAllConsSymbols, long nseq)
 238   {
 239     completeConsensus(consensus, hconsensus, iStart, width,
 240             ignoreGapsInConsensusCalculation, includeAllConsSymbols, null,
 241             nseq);
 242   }
 243
 244   /**
 245    * Derive the consensus annotations to be added to the alignment for display.
 246    * This does not recompute the raw data, but may be called on a change in
 247    * display options, such as 'show logo', which may in turn result in a change
 248    * in the derived values.
 249    *
 250    * @param consensus
 251    *          the annotation row to add annotations to
 252    * @param hconsensus
 253    *          the source consensus data
 254    * @param iStart
 255    *          start column
 256    * @param width
 257    *          end column
 258    * @param ignoreGapsInConsensusCalculation
 259    *          if true, use the consensus calculated ignoring gaps
 260    * @param includeAllConsSymbols
 261    *          if true include all consensus symbols, else just show modal
 262    *          residue
 263    * @param alphabet
 264    * @param nseq
 265    *          number of sequences
 266    */
 267   public static void completeConsensus(AlignmentAnnotation consensus,
 268           Hashtable[] hconsensus, int iStart, int width,
 269           boolean ignoreGapsInConsensusCalculation,
 270           boolean includeAllConsSymbols, char[] alphabet, long nseq)
 271   {
 272     if (consensus == null || consensus.annotations == null
 273             || consensus.annotations.length < width)
 274     {
 275       // called with a bad alignment annotation row - wait for it to be
 276       // initialised properly
 277       return;
 278     }
 279
 280     final Format fmt = getPercentageFormat(nseq);
 281
 282     for (int i = iStart; i < width; i++)
 283     {
 284       Hashtable hci;
 285       if (i >= hconsensus.length || ((hci = hconsensus[i]) == null))
 286       {
 287         // happens if sequences calculated over were shorter than alignment
 288         // width
 289         consensus.annotations[i] = null;
 290         continue;
 291       }
 292       Float fv = (Float) hci
 293               .get(ignoreGapsInConsensusCalculation ? PID_NOGAPS : PID_GAPS);
 294       if (fv == null)
 295       {
 296         consensus.annotations[i] = null;
 297         // data has changed below us .. give up and
 298         continue;
 299       }
 300       float value = fv.floatValue();
 301       String maxRes = hci.get(AAFrequency.MAXRESIDUE).toString();
 302       StringBuilder mouseOver = new StringBuilder(64);
 303       if (maxRes.length() > 1)
 304       {
 305         mouseOver.append("[").append(maxRes).append("] ");
 306         maxRes = "+";
 307       }
 308       else
 309       {
 310         mouseOver.append(hci.get(AAFrequency.MAXRESIDUE) + " ");
 311       }
 312       int[][] profile = (int[][]) hci.get(AAFrequency.PROFILE);
 313       if (profile != null && includeAllConsSymbols)
 314       {
 315         int sequenceCount = profile[1][0];
 316         int nonGappedCount = profile[1][1];
 317         int normalisedBy = ignoreGapsInConsensusCalculation ? nonGappedCount
 318                 : sequenceCount;
 319         mouseOver.setLength(0);
 320         if (alphabet != null)
 321         {
 322           for (int c = 0; c < alphabet.length; c++)
 323           {
 324             float tval = profile[0][alphabet[c]] * 100f / normalisedBy;
 325             mouseOver
 326                     .append(((c == 0) ? "" : "; "))
 327                     .append(alphabet[c])
 328                     .append(" ")
 329                     .append(((fmt != null) ? fmt.form(tval) : ((int) tval)))
 330                     .append("%");
 331           }
 332         }
 333         else
 334         {
 335           // TODO do this sort once only in calculate()?
 336           // char[][] ca = new char[profile[0].length][];
 337           char[] ca = new char[profile[0].length];
 338           float[] vl = new float[profile[0].length];
 339           for (int c = 0; c < ca.length; c++)
 340           {
 341             ca[c] = (char) c;
 342             // ca[c] = new char[]
 343             // { (char) c };
 344             vl[c] = profile[0][c];
 345           }
 346           QuickSort.sort(vl, ca);
 347           for (int p = 0, c = ca.length - 1; profile[0][ca[c]] > 0; c--)
 348           {
 349             final char residue = ca[c];
 350             if (residue != '-')
 351             {
 352               float tval = profile[0][residue] * 100f / normalisedBy;
 353               mouseOver
 354                       .append((((p == 0) ? "" : "; ")))
 355                       .append(residue)
 356                       .append(" ")
 357                       .append(((fmt != null) ? fmt.form(tval)
 358                               : ((int) tval))).append("%");
 359               p++;
 360             }
 361           }
 362         }
 363       }
 364       else
 365       {
 366         mouseOver.append(
 367                 (((fmt != null) ? fmt.form(value) : ((int) value))))
 368                 .append("%");
 369       }
 370       consensus.annotations[i] = new Annotation(maxRes,
 371               mouseOver.toString(), ' ', value);
 372     }
 373   }
 374
 375   /**
 376    * Returns a Format designed to show all significant figures for profile
 377    * percentages. For less than 100 sequences, returns null (the integer
 378    * percentage value will be displayed). For 100-999 sequences, returns "%3.1f"
 379    *
 380    * @param nseq
 381    * @return
 382    */
 383   protected static Format getPercentageFormat(long nseq)
 384   {
 385     int scale = 0;
 386     while (nseq >= 10)
 387     {
 388       scale++;
 389       nseq /= 10;
 390     }
 391     return scale <= 1 ? null : new Format("%3." + (scale - 1) + "f");
 392   }
 393
 394   /**
 395    * Returns the sorted profile for the given consensus data. The returned array
 396    * contains
 397    *
 398    * <pre>
 399    *    [profileType, numberOfValues, nonGapCount, charValue1, percentage1, charValue2, percentage2, ...]
 400    * in descending order of percentage value
 401    * </pre>
 402    *
 403    * @param hconsensus
 404    *          the data table from which to extract and sort values
 405    * @param ignoreGaps
 406    *          if true, only non-gapped values are included in percentage
 407    *          calculations
 408    * @return
 409    */
 410   public static int[] extractProfile(Hashtable hconsensus,
 411           boolean ignoreGaps)
 412   {
 413     int[] rtnval = new int[64];
 414     int[][] profile = (int[][]) hconsensus.get(AAFrequency.PROFILE);
 415     if (profile == null)
 416     {
 417       return null;
 418     }
 419     char[] ca = new char[profile[0].length];
 420     float[] vl = new float[profile[0].length];
 421     for (int c = 0; c < ca.length; c++)
 422     {
 423       ca[c] = (char) c;
 424       vl[c] = profile[0][c];
 425     }
 426     QuickSort.sort(vl, ca);
 427     int nextArrayPos = 2;
 428     int totalPercentage = 0;
 429     int distinctValuesCount = 0;
 430     final int divisor = profile[1][ignoreGaps ? 1 : 0];
 431     for (int c = ca.length - 1; profile[0][ca[c]] > 0; c--)
 432     {
 433       if (ca[c] != '-')
 434       {
 435         rtnval[nextArrayPos++] = ca[c];
 436         final int percentage = (int) (profile[0][ca[c]] * 100f / divisor);
 437         rtnval[nextArrayPos++] = percentage;
 438         totalPercentage += percentage;
 439         distinctValuesCount++;
 440       }
 441     }
 442     rtnval[0] = distinctValuesCount;
 443     rtnval[1] = totalPercentage;
 444     int[] result = new int[rtnval.length + 1];
 445     result[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
 446     System.arraycopy(rtnval, 0, result, 1, rtnval.length);
 447
 448     return result;
 449   }
 450
 451   /**
 452    * Extract a sorted extract of cDNA codon profile data. The returned array
 453    * contains
 454    *
 455    * <pre>
 456    *    [profileType, numberOfValues, totalCount, charValue1, percentage1, charValue2, percentage2, ...]
 457    * in descending order of percentage value, where the character values encode codon triplets
 458    * </pre>
 459    *
 460    * @param hashtable
 461    * @return
 462    */
 463   public static int[] extractCdnaProfile(Hashtable hashtable,
 464           boolean ignoreGaps)
 465   {
 466     // this holds #seqs, #ungapped, and then codon count, indexed by encoded
 467     // codon triplet
 468     int[] codonCounts = (int[]) hashtable.get(PROFILE);
 469     int[] sortedCounts = new int[codonCounts.length - 2];
 470     System.arraycopy(codonCounts, 2, sortedCounts, 0,
 471             codonCounts.length - 2);
 472
 473     int[] result = new int[3 + 2 * sortedCounts.length];
 474     // first value is just the type of profile data
 475     result[0] = AlignmentAnnotation.CDNA_PROFILE;
 476
 477     char[] codons = new char[sortedCounts.length];
 478     for (int i = 0; i < codons.length; i++)
 479     {
 480       codons[i] = (char) i;
 481     }
 482     QuickSort.sort(sortedCounts, codons);
 483     int totalPercentage = 0;
 484     int distinctValuesCount = 0;
 485     int j = 3;
 486     int divisor = ignoreGaps ? codonCounts[1] : codonCounts[0];
 487     for (int i = codons.length - 1; i >= 0; i--)
 488     {
 489       final int codonCount = sortedCounts[i];
 490       if (codonCount == 0)
 491       {
 492         break; // nothing else of interest here
 493       }
 494       distinctValuesCount++;
 495       result[j++] = codons[i];
 496       final int percentage = codonCount * 100 / divisor;
 497       result[j++] = percentage;
 498       totalPercentage += percentage;
 499     }
 500     result[2] = totalPercentage;
 501
 502     /*
 503      * Just return the non-zero values
 504      */
 505     // todo next value is redundant if we limit the array to non-zero counts
 506     result[1] = distinctValuesCount;
 507     return Arrays.copyOfRange(result, 0, j);
 508   }
 509
 510   /**
 511    * Compute a consensus for the cDNA coding for a protein alignment.
 512    *
 513    * @param alignment
 514    *          the protein alignment (which should hold mappings to cDNA
 515    *          sequences)
 516    * @param hconsensus
 517    *          the consensus data stores to be populated (one per column)
 518    */
 519   public static void calculateCdna(AlignmentI alignment,
 520           Hashtable[] hconsensus)
 521   {
 522     final char gapCharacter = alignment.getGapCharacter();
 523     Set<AlignedCodonFrame> mappings = alignment.getCodonFrames();
 524     if (mappings == null || mappings.isEmpty())
 525     {
 526       return;
 527     }
 528
 529     int cols = alignment.getWidth();
 530     for (int col = 0; col < cols; col++)
 531     {
 532       // todo would prefer a Java bean for consensus data
 533       Hashtable<String, int[]> columnHash = new Hashtable<String, int[]>();
 534       // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1)
 535       int[] codonCounts = new int[66];
 536       codonCounts[0] = alignment.getSequences().size();
 537       int ungappedCount = 0;
 538       for (SequenceI seq : alignment.getSequences())
 539       {
 540         if (seq.getCharAt(col) == gapCharacter)
 541         {
 542           continue;
 543         }
 544         char[] codon = MappingUtils.findCodonFor(seq, col, mappings);
 545         int codonEncoded = CodingUtils.encodeCodon(codon);
 546         if (codonEncoded >= 0)
 547         {
 548           codonCounts[codonEncoded + 2]++;
 549           ungappedCount++;
 550         }
 551       }
 552       codonCounts[1] = ungappedCount;
 553       // todo: sort values here, save counts and codons?
 554       columnHash.put(PROFILE, codonCounts);
 555       hconsensus[col] = columnHash;
 556     }
 557   }
 558
 559   /**
 560    * Derive displayable cDNA consensus annotation from computed consensus data.
 561    *
 562    * @param consensusAnnotation
 563    *          the annotation row to be populated for display
 564    * @param consensusData
 565    *          the computed consensus data
 566    * @param showProfileLogo
 567    *          if true show all symbols present at each position, else only the
 568    *          modal value
 569    * @param nseqs
 570    *          the number of sequences in the alignment
 571    */
 572   public static void completeCdnaConsensus(
 573           AlignmentAnnotation consensusAnnotation,
 574           Hashtable[] consensusData, boolean showProfileLogo, int nseqs)
 575   {
 576     if (consensusAnnotation == null
 577             || consensusAnnotation.annotations == null
 578             || consensusAnnotation.annotations.length < consensusData.length)
 579     {
 580       // called with a bad alignment annotation row - wait for it to be
 581       // initialised properly
 582       return;
 583     }
 584
 585     // ensure codon triplet scales with font size
 586     consensusAnnotation.scaleColLabel = true;
 587     for (int col = 0; col < consensusData.length; col++)
 588     {
 589       Hashtable hci = consensusData[col];
 590       if (hci == null)
 591       {
 592         // gapped protein column?
 593         continue;
 594       }
 595       // array holds #seqs, #ungapped, then codon counts indexed by codon
 596       final int[] codonCounts = (int[]) hci.get(PROFILE);
 597       int totalCount = 0;
 598
 599       /*
 600        * First pass - get total count and find the highest
 601        */
 602       final char[] codons = new char[codonCounts.length - 2];
 603       for (int j = 2; j < codonCounts.length; j++)
 604       {
 605         final int codonCount = codonCounts[j];
 606         codons[j - 2] = (char) (j - 2);
 607         totalCount += codonCount;
 608       }
 609
 610       /*
 611        * Sort array of encoded codons by count ascending - so the modal value
 612        * goes to the end; start by copying the count (dropping the first value)
 613        */
 614       int[] sortedCodonCounts = new int[codonCounts.length - 2];
 615       System.arraycopy(codonCounts, 2, sortedCodonCounts, 0,
 616               codonCounts.length - 2);
 617       QuickSort.sort(sortedCodonCounts, codons);
 618
 619       int modalCodonEncoded = codons[codons.length - 1];
 620       int modalCodonCount = sortedCodonCounts[codons.length - 1];
 621       String modalCodon = String.valueOf(CodingUtils
 622               .decodeCodon(modalCodonEncoded));
 623       if (sortedCodonCounts.length > 1
 624               && sortedCodonCounts[codons.length - 2] == modalCodonEncoded)
 625       {
 626         modalCodon = "+";
 627       }
 628       float pid = sortedCodonCounts[sortedCodonCounts.length - 1] * 100
 629               / (float) totalCount;
 630
 631       /*
 632        * todo ? Replace consensus hashtable with sorted arrays of codons and
 633        * counts (non-zero only). Include total count in count array [0].
 634        */
 635
 636       /*
 637        * Scan sorted array backwards for most frequent values first. Show
 638        * repeated values compactly.
 639        */
 640       StringBuilder mouseOver = new StringBuilder(32);
 641       StringBuilder samePercent = new StringBuilder();
 642       String percent = null;
 643       String lastPercent = null;
 644       Format fmt = getPercentageFormat(nseqs);
 645
 646       for (int j = codons.length - 1; j >= 0; j--)
 647       {
 648         int codonCount = sortedCodonCounts[j];
 649         if (codonCount == 0)
 650         {
 651           /*
 652            * remaining codons are 0% - ignore, but finish off the last one if
 653            * necessary
 654            */
 655           if (samePercent.length() > 0)
 656           {
 657             mouseOver.append(samePercent).append(": ").append(percent)
 658                     .append("% ");
 659           }
 660           break;
 661         }
 662         int codonEncoded = codons[j];
 663         final int pct = codonCount * 100 / totalCount;
 664         String codon = String
 665                 .valueOf(CodingUtils.decodeCodon(codonEncoded));
 666         percent = fmt == null ? Integer.toString(pct) : fmt.form(pct);
 667         if (showProfileLogo || codonCount == modalCodonCount)
 668         {
 669           if (percent.equals(lastPercent) && j > 0)
 670           {
 671             samePercent.append(samePercent.length() == 0 ? "" : ", ");
 672             samePercent.append(codon);
 673           }
 674           else
 675           {
 676             if (samePercent.length() > 0)
 677             {
 678               mouseOver.append(samePercent).append(": ")
 679                       .append(lastPercent).append("% ");
 680             }
 681             samePercent.setLength(0);
 682             samePercent.append(codon);
 683           }
 684           lastPercent = percent;
 685         }
 686       }
 687
 688       consensusAnnotation.annotations[col] = new Annotation(modalCodon,
 689               mouseOver.toString(), ' ', pid);
 690     }
 691   }
 692 }