src/jalview/analysis/AAFrequency.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.Annotation;
  27 import jalview.datamodel.SequenceI;
  28 import jalview.util.Format;
  29 import jalview.util.MappingUtils;
  30 import jalview.util.QuickSort;
  31
  32 import java.util.Arrays;
  33 import java.util.Hashtable;
  34 import java.util.List;
  35
  36 /**
  37  * Takes in a vector or array of sequences and column start and column end and
  38  * returns a new Hashtable[] of size maxSeqLength, if Hashtable not supplied.
  39  * This class is used extensively in calculating alignment colourschemes that
  40  * depend on the amount of conservation in each alignment column.
  41  *
  42  * @author $author$
  43  * @version $Revision$
  44  */
  45 public class AAFrequency
  46 {
  47   private static final int TO_UPPER_CASE = 'A' - 'a'; // -32
  48
  49   public static final String MAXCOUNT = "C";
  50
  51   public static final String MAXRESIDUE = "R";
  52
  53   public static final String PID_GAPS = "G";
  54
  55   public static final String PID_NOGAPS = "N";
  56
  57   public static final String PROFILE = "P";
  58
  59   public static final String ENCODED_CHARS = "E";
  60
  61   /*
  62    * Quick look-up of String value of char 'A' to 'Z'
  63    */
  64   private static final String[] CHARS = new String['Z' - 'A' + 1];
  65
  66   static
  67   {
  68     for (char c = 'A'; c <= 'Z'; c++)
  69     {
  70       CHARS[c - 'A'] = String.valueOf(c);
  71     }
  72   }
  73
  74   public static final Hashtable[] calculate(List<SequenceI> list,
  75           int start, int end)
  76   {
  77     return calculate(list, start, end, false);
  78   }
  79
  80   public static final Hashtable[] calculate(List<SequenceI> sequences,
  81           int start, int end, boolean profile)
  82   {
  83     SequenceI[] seqs = new SequenceI[sequences.size()];
  84     int width = 0;
  85     synchronized (sequences)
  86     {
  87       for (int i = 0; i < sequences.size(); i++)
  88       {
  89         seqs[i] = sequences.get(i);
  90         if (seqs[i].getLength() > width)
  91         {
  92           width = seqs[i].getLength();
  93         }
  94       }
  95
  96       Hashtable[] reply = new Hashtable[width];
  97
  98       if (end >= width)
  99       {
 100         end = width;
 101       }
 102
 103       calculate(seqs, start, end, reply, profile);
 104       return reply;
 105     }
 106   }
 107
 108   public static final void calculate(SequenceI[] sequences, int start,
 109           int end, Hashtable[] result, boolean profile)
 110   {
 111     Hashtable residueHash;
 112     int maxCount, nongap, i, j, v;
 113     int jSize = sequences.length;
 114     String maxResidue;
 115     char c = '-';
 116     float percentage;
 117
 118     int[] values = new int[255];
 119
 120     char[] seq;
 121
 122     for (i = start; i < end; i++)
 123     {
 124       residueHash = new Hashtable();
 125       maxCount = 0;
 126       maxResidue = "";
 127       nongap = 0;
 128       values = new int[255];
 129
 130       for (j = 0; j < jSize; j++)
 131       {
 132         if (sequences[j] == null)
 133         {
 134           System.err
 135                   .println("WARNING: Consensus skipping null sequence - possible race condition.");
 136           continue;
 137         }
 138         seq = sequences[j].getSequence();
 139         if (seq.length > i)
 140         {
 141           c = seq[i];
 142
 143           if (c == '.' || c == ' ')
 144           {
 145             c = '-';
 146           }
 147
 148           if (c == '-')
 149           {
 150             values['-']++;
 151             continue;
 152           }
 153           else if ('a' <= c && c <= 'z')
 154           {
 155             c += TO_UPPER_CASE;
 156           }
 157
 158           nongap++;
 159           values[c]++;
 160
 161         }
 162         else
 163         {
 164           values['-']++;
 165         }
 166       }
 167       if (jSize == 1)
 168       {
 169         maxResidue = String.valueOf(c);
 170         maxCount = 1;
 171       }
 172       else
 173       {
 174         for (v = 'A'; v <= 'Z'; v++)
 175         {
 176           // TODO why ignore values[v] == 1?
 177           if (values[v] < 1 /* 2 */|| values[v] < maxCount)
 178           {
 179             continue;
 180           }
 181
 182           if (values[v] > maxCount)
 183           {
 184             maxResidue = CHARS[v - 'A'];
 185           }
 186           else if (values[v] == maxCount)
 187           {
 188             maxResidue += CHARS[v - 'A'];
 189           }
 190           maxCount = values[v];
 191         }
 192       }
 193       if (maxResidue.length() == 0)
 194       {
 195         maxResidue = "-";
 196       }
 197       if (profile)
 198       {
 199         // TODO use a 1-dimensional array with jSize, nongap in [0] and [1]
 200         residueHash.put(PROFILE, new int[][] { values,
 201             new int[] { jSize, nongap } });
 202       }
 203       residueHash.put(MAXCOUNT, new Integer(maxCount));
 204       residueHash.put(MAXRESIDUE, maxResidue);
 205
 206       percentage = ((float) maxCount * 100) / jSize;
 207       residueHash.put(PID_GAPS, new Float(percentage));
 208
 209       if (nongap > 0)
 210       {
 211         // calculate for non-gapped too
 212         percentage = ((float) maxCount * 100) / nongap;
 213       }
 214       residueHash.put(PID_NOGAPS, new Float(percentage));
 215
 216       result[i] = residueHash;
 217     }
 218   }
 219
 220   /**
 221    * Compute all or part of the annotation row from the given consensus
 222    * hashtable
 223    *
 224    * @param consensus
 225    *          - pre-allocated annotation row
 226    * @param hconsensus
 227    * @param iStart
 228    * @param width
 229    * @param ignoreGapsInConsensusCalculation
 230    * @param includeAllConsSymbols
 231    * @param nseq
 232    */
 233   public static void completeConsensus(AlignmentAnnotation consensus,
 234           Hashtable[] hconsensus, int iStart, int width,
 235           boolean ignoreGapsInConsensusCalculation,
 236           boolean includeAllConsSymbols, long nseq)
 237   {
 238     completeConsensus(consensus, hconsensus, iStart, width,
 239             ignoreGapsInConsensusCalculation, includeAllConsSymbols, null,
 240             nseq);
 241   }
 242
 243   /**
 244    * Derive the consensus annotations to be added to the alignment for display.
 245    * This does not recompute the raw data, but may be called on a change in
 246    * display options, such as 'show logo', which may in turn result in a change
 247    * in the derived values.
 248    *
 249    * @param consensus
 250    *          the annotation row to add annotations to
 251    * @param hconsensus
 252    *          the source consensus data
 253    * @param iStart
 254    *          start column
 255    * @param width
 256    *          end column
 257    * @param ignoreGapsInConsensusCalculation
 258    *          if true, use the consensus calculated ignoring gaps
 259    * @param includeAllConsSymbols
 260    *          if true include all consensus symbols, else just show modal
 261    *          residue
 262    * @param alphabet
 263    * @param nseq
 264    *          number of sequences
 265    */
 266   public static void completeConsensus(AlignmentAnnotation consensus,
 267           Hashtable[] hconsensus, int iStart, int width,
 268           boolean ignoreGapsInConsensusCalculation,
 269           boolean includeAllConsSymbols, char[] alphabet, long nseq)
 270   {
 271     if (consensus == null || consensus.annotations == null
 272             || consensus.annotations.length < width)
 273     {
 274       // called with a bad alignment annotation row - wait for it to be
 275       // initialised properly
 276       return;
 277     }
 278
 279     final Format fmt = getPercentageFormat(nseq);
 280
 281     for (int i = iStart; i < width; i++)
 282     {
 283       Hashtable hci;
 284       if (i >= hconsensus.length || ((hci = hconsensus[i]) == null))
 285       {
 286         // happens if sequences calculated over were shorter than alignment
 287         // width
 288         consensus.annotations[i] = null;
 289         continue;
 290       }
 291       Float fv = (Float) hci
 292               .get(ignoreGapsInConsensusCalculation ? PID_NOGAPS : PID_GAPS);
 293       if (fv == null)
 294       {
 295         consensus.annotations[i] = null;
 296         // data has changed below us .. give up and
 297         continue;
 298       }
 299       float value = fv.floatValue();
 300       String maxRes = hci.get(AAFrequency.MAXRESIDUE).toString();
 301       StringBuilder mouseOver = new StringBuilder(64);
 302       if (maxRes.length() > 1)
 303       {
 304         mouseOver.append("[").append(maxRes).append("] ");
 305         maxRes = "+";
 306       }
 307       else
 308       {
 309         mouseOver.append(hci.get(AAFrequency.MAXRESIDUE) + " ");
 310       }
 311       int[][] profile = (int[][]) hci.get(AAFrequency.PROFILE);
 312       if (profile != null && includeAllConsSymbols)
 313       {
 314         int sequenceCount = profile[1][0];
 315         int nonGappedCount = profile[1][1];
 316         int normalisedBy = ignoreGapsInConsensusCalculation ? nonGappedCount
 317                 : sequenceCount;
 318         mouseOver.setLength(0);
 319         if (alphabet != null)
 320         {
 321           for (int c = 0; c < alphabet.length; c++)
 322           {
 323             float tval = profile[0][alphabet[c]] * 100f / normalisedBy;
 324             mouseOver
 325                     .append(((c == 0) ? "" : "; "))
 326                     .append(alphabet[c])
 327                     .append(" ")
 328                     .append(((fmt != null) ? fmt.form(tval) : ((int) tval)))
 329                     .append("%");
 330           }
 331         }
 332         else
 333         {
 334           // TODO do this sort once only in calculate()?
 335           // char[][] ca = new char[profile[0].length][];
 336           char[] ca = new char[profile[0].length];
 337           float[] vl = new float[profile[0].length];
 338           for (int c = 0; c < ca.length; c++)
 339           {
 340             ca[c] = (char) c;
 341             // ca[c] = new char[]
 342             // { (char) c };
 343             vl[c] = profile[0][c];
 344           }
 345           QuickSort.sort(vl, ca);
 346           for (int p = 0, c = ca.length - 1; profile[0][ca[c]] > 0; c--)
 347           {
 348             final char residue = ca[c];
 349             if (residue != '-')
 350             {
 351               float tval = profile[0][residue] * 100f / normalisedBy;
 352               mouseOver
 353                       .append((((p == 0) ? "" : "; ")))
 354                       .append(residue)
 355                       .append(" ")
 356                       .append(((fmt != null) ? fmt.form(tval)
 357                               : ((int) tval))).append("%");
 358               p++;
 359             }
 360           }
 361         }
 362       }
 363       else
 364       {
 365         mouseOver.append(
 366                 (((fmt != null) ? fmt.form(value) : ((int) value))))
 367                 .append("%");
 368       }
 369       consensus.annotations[i] = new Annotation(maxRes,
 370               mouseOver.toString(), ' ', value);
 371     }
 372   }
 373
 374   /**
 375    * Returns a Format designed to show all significant figures for profile
 376    * percentages. For less than 100 sequences, returns null (the integer
 377    * percentage value will be displayed). For 100-999 sequences, returns "%3.1f"
 378    *
 379    * @param nseq
 380    * @return
 381    */
 382   protected static Format getPercentageFormat(long nseq)
 383   {
 384     int scale = 0;
 385     while (nseq >= 10)
 386     {
 387       scale++;
 388       nseq /= 10;
 389     }
 390     return scale <= 1 ? null : new Format("%3." + (scale - 1) + "f");
 391   }
 392
 393   /**
 394    * Returns the sorted profile for the given consensus data. The returned array
 395    * contains
 396    *
 397    * <pre>
 398    *    [profileType, numberOfValues, nonGapCount, charValue1, percentage1, charValue2, percentage2, ...]
 399    * in descending order of percentage value
 400    * </pre>
 401    *
 402    * @param hconsensus
 403    *          the data table from which to extract and sort values
 404    * @param ignoreGaps
 405    *          if true, only non-gapped values are included in percentage
 406    *          calculations
 407    * @return
 408    */
 409   public static int[] extractProfile(Hashtable hconsensus,
 410           boolean ignoreGaps)
 411   {
 412     int[] rtnval = new int[64];
 413     int[][] profile = (int[][]) hconsensus.get(AAFrequency.PROFILE);
 414     if (profile == null)
 415     {
 416       return null;
 417     }
 418     char[] ca = new char[profile[0].length];
 419     float[] vl = new float[profile[0].length];
 420     for (int c = 0; c < ca.length; c++)
 421     {
 422       ca[c] = (char) c;
 423       vl[c] = profile[0][c];
 424     }
 425     QuickSort.sort(vl, ca);
 426     int nextArrayPos = 2;
 427     int totalPercentage = 0;
 428     int distinctValuesCount = 0;
 429     final int divisor = profile[1][ignoreGaps ? 1 : 0];
 430     for (int c = ca.length - 1; profile[0][ca[c]] > 0; c--)
 431     {
 432       if (ca[c] != '-')
 433       {
 434         rtnval[nextArrayPos++] = ca[c];
 435         final int percentage = (int) (profile[0][ca[c]] * 100f / divisor);
 436         rtnval[nextArrayPos++] = percentage;
 437         totalPercentage += percentage;
 438         distinctValuesCount++;
 439       }
 440     }
 441     rtnval[0] = distinctValuesCount;
 442     rtnval[1] = totalPercentage;
 443     int[] result = new int[rtnval.length + 1];
 444     result[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
 445     System.arraycopy(rtnval, 0, result, 1, rtnval.length);
 446
 447     return result;
 448   }
 449
 450   /**
 451    * Extract a sorted extract of cDNA codon profile data. The returned array
 452    * contains
 453    *
 454    * <pre>
 455    *    [profileType, numberOfValues, totalCount, charValue1, percentage1, charValue2, percentage2, ...]
 456    * in descending order of percentage value, where the character values encode codon triplets
 457    * </pre>
 458    *
 459    * @param hashtable
 460    * @return
 461    */
 462   public static int[] extractCdnaProfile(Hashtable hashtable,
 463           boolean ignoreGaps)
 464   {
 465     // this holds #seqs, #ungapped, and then codon count, indexed by encoded
 466     // codon triplet
 467     int[] codonCounts = (int[]) hashtable.get(PROFILE);
 468     int[] sortedCounts = new int[codonCounts.length - 2];
 469     System.arraycopy(codonCounts, 2, sortedCounts, 0,
 470             codonCounts.length - 2);
 471
 472     int[] result = new int[3 + 2 * sortedCounts.length];
 473     // first value is just the type of profile data
 474     result[0] = AlignmentAnnotation.CDNA_PROFILE;
 475
 476     char[] codons = new char[sortedCounts.length];
 477     for (int i = 0; i < codons.length; i++)
 478     {
 479       codons[i] = (char) i;
 480     }
 481     QuickSort.sort(sortedCounts, codons);
 482     int totalPercentage = 0;
 483     int distinctValuesCount = 0;
 484     int j = 3;
 485     int divisor = ignoreGaps ? codonCounts[1] : codonCounts[0];
 486     for (int i = codons.length - 1; i >= 0; i--)
 487     {
 488       final int codonCount = sortedCounts[i];
 489       if (codonCount == 0)
 490       {
 491         break; // nothing else of interest here
 492       }
 493       distinctValuesCount++;
 494       result[j++] = codons[i];
 495       final int percentage = codonCount * 100 / divisor;
 496       result[j++] = percentage;
 497       totalPercentage += percentage;
 498     }
 499     result[2] = totalPercentage;
 500
 501     /*
 502      * Just return the non-zero values
 503      */
 504     // todo next value is redundant if we limit the array to non-zero counts
 505     result[1] = distinctValuesCount;
 506     return Arrays.copyOfRange(result, 0, j);
 507   }
 508
 509   /**
 510    * Compute a consensus for the cDNA coding for a protein alignment.
 511    *
 512    * @param alignment
 513    *          the protein alignment (which should hold mappings to cDNA
 514    *          sequences)
 515    * @param hconsensus
 516    *          the consensus data stores to be populated (one per column)
 517    */
 518   public static void calculateCdna(AlignmentI alignment,
 519           Hashtable[] hconsensus)
 520   {
 521     final char gapCharacter = alignment.getGapCharacter();
 522     List<AlignedCodonFrame> mappings = alignment.getCodonFrames();
 523     if (mappings == null || mappings.isEmpty())
 524     {
 525       return;
 526     }
 527
 528     int cols = alignment.getWidth();
 529     for (int col = 0; col < cols; col++)
 530     {
 531       // todo would prefer a Java bean for consensus data
 532       Hashtable<String, int[]> columnHash = new Hashtable<String, int[]>();
 533       // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1)
 534       int[] codonCounts = new int[66];
 535       codonCounts[0] = alignment.getSequences().size();
 536       int ungappedCount = 0;
 537       for (SequenceI seq : alignment.getSequences())
 538       {
 539         if (seq.getCharAt(col) == gapCharacter)
 540         {
 541           continue;
 542         }
 543         List<char[]> codons = MappingUtils
 544                 .findCodonsFor(seq, col, mappings);
 545         for (char[] codon : codons)
 546         {
 547           int codonEncoded = CodingUtils.encodeCodon(codon);
 548           if (codonEncoded >= 0)
 549           {
 550             codonCounts[codonEncoded + 2]++;
 551             ungappedCount++;
 552           }
 553         }
 554       }
 555       codonCounts[1] = ungappedCount;
 556       // todo: sort values here, save counts and codons?
 557       columnHash.put(PROFILE, codonCounts);
 558       hconsensus[col] = columnHash;
 559     }
 560   }
 561
 562   /**
 563    * Derive displayable cDNA consensus annotation from computed consensus data.
 564    *
 565    * @param consensusAnnotation
 566    *          the annotation row to be populated for display
 567    * @param consensusData
 568    *          the computed consensus data
 569    * @param showProfileLogo
 570    *          if true show all symbols present at each position, else only the
 571    *          modal value
 572    * @param nseqs
 573    *          the number of sequences in the alignment
 574    */
 575   public static void completeCdnaConsensus(
 576           AlignmentAnnotation consensusAnnotation,
 577           Hashtable[] consensusData, boolean showProfileLogo, int nseqs)
 578   {
 579     if (consensusAnnotation == null
 580             || consensusAnnotation.annotations == null
 581             || consensusAnnotation.annotations.length < consensusData.length)
 582     {
 583       // called with a bad alignment annotation row - wait for it to be
 584       // initialised properly
 585       return;
 586     }
 587
 588     // ensure codon triplet scales with font size
 589     consensusAnnotation.scaleColLabel = true;
 590     for (int col = 0; col < consensusData.length; col++)
 591     {
 592       Hashtable hci = consensusData[col];
 593       if (hci == null)
 594       {
 595         // gapped protein column?
 596         continue;
 597       }
 598       // array holds #seqs, #ungapped, then codon counts indexed by codon
 599       final int[] codonCounts = (int[]) hci.get(PROFILE);
 600       int totalCount = 0;
 601
 602       /*
 603        * First pass - get total count and find the highest
 604        */
 605       final char[] codons = new char[codonCounts.length - 2];
 606       for (int j = 2; j < codonCounts.length; j++)
 607       {
 608         final int codonCount = codonCounts[j];
 609         codons[j - 2] = (char) (j - 2);
 610         totalCount += codonCount;
 611       }
 612
 613       /*
 614        * Sort array of encoded codons by count ascending - so the modal value
 615        * goes to the end; start by copying the count (dropping the first value)
 616        */
 617       int[] sortedCodonCounts = new int[codonCounts.length - 2];
 618       System.arraycopy(codonCounts, 2, sortedCodonCounts, 0,
 619               codonCounts.length - 2);
 620       QuickSort.sort(sortedCodonCounts, codons);
 621
 622       int modalCodonEncoded = codons[codons.length - 1];
 623       int modalCodonCount = sortedCodonCounts[codons.length - 1];
 624       String modalCodon = String.valueOf(CodingUtils
 625               .decodeCodon(modalCodonEncoded));
 626       if (sortedCodonCounts.length > 1
 627               && sortedCodonCounts[codons.length - 2] == sortedCodonCounts[codons.length - 1])
 628       {
 629         /*
 630          * two or more codons share the modal count
 631          */
 632         modalCodon = "+";
 633       }
 634       float pid = sortedCodonCounts[sortedCodonCounts.length - 1] * 100
 635               / (float) totalCount;
 636
 637       /*
 638        * todo ? Replace consensus hashtable with sorted arrays of codons and
 639        * counts (non-zero only). Include total count in count array [0].
 640        */
 641
 642       /*
 643        * Scan sorted array backwards for most frequent values first. Show
 644        * repeated values compactly.
 645        */
 646       StringBuilder mouseOver = new StringBuilder(32);
 647       StringBuilder samePercent = new StringBuilder();
 648       String percent = null;
 649       String lastPercent = null;
 650       Format fmt = getPercentageFormat(nseqs);
 651
 652       for (int j = codons.length - 1; j >= 0; j--)
 653       {
 654         int codonCount = sortedCodonCounts[j];
 655         if (codonCount == 0)
 656         {
 657           /*
 658            * remaining codons are 0% - ignore, but finish off the last one if
 659            * necessary
 660            */
 661           if (samePercent.length() > 0)
 662           {
 663             mouseOver.append(samePercent).append(": ").append(percent)
 664                     .append("% ");
 665           }
 666           break;
 667         }
 668         int codonEncoded = codons[j];
 669         final int pct = codonCount * 100 / totalCount;
 670         String codon = String
 671                 .valueOf(CodingUtils.decodeCodon(codonEncoded));
 672         percent = fmt == null ? Integer.toString(pct) : fmt.form(pct);
 673         if (showProfileLogo || codonCount == modalCodonCount)
 674         {
 675           if (percent.equals(lastPercent) && j > 0)
 676           {
 677             samePercent.append(samePercent.length() == 0 ? "" : ", ");
 678             samePercent.append(codon);
 679           }
 680           else
 681           {
 682             if (samePercent.length() > 0)
 683             {
 684               mouseOver.append(samePercent).append(": ")
 685                       .append(lastPercent).append("% ");
 686             }
 687             samePercent.setLength(0);
 688             samePercent.append(codon);
 689           }
 690           lastPercent = percent;
 691         }
 692       }
 693
 694       consensusAnnotation.annotations[col] = new Annotation(modalCodon,
 695               mouseOver.toString(), ' ', pid);
 696     }
 697   }
 698 }