src/jalview/analysis/AAFrequency.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.Annotation;
  27 import jalview.datamodel.SequenceI;
  28 import jalview.ext.android.SparseIntArray;
  29 import jalview.util.Format;
  30 import jalview.util.MappingUtils;
  31 import jalview.util.QuickSort;
  32
  33 import java.util.Arrays;
  34 import java.util.Hashtable;
  35 import java.util.List;
  36
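/**
 * Calculates per-column residue frequencies and consensus data for a set of
 * aligned sequences, and derives displayable consensus annotation from them.
 * The calculate methods take a list or array of sequences plus a start and
 * end column. The list variants return a new Hashtable[] sized to the longest
 * sequence; the array variant fills in a Hashtable[] supplied by the caller.
 * This class is used extensively in calculating alignment colourschemes that
 * depend on the amount of conservation in each alignment column.
 *
 * @author $author$
 * @version $Revision$
 */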
  46 public class AAFrequency
  47 {
  48   private static final int TO_UPPER_CASE = 'A' - 'a'; // -32
  49
  50   public static final String MAXCOUNT = "C";
  51
  52   public static final String MAXRESIDUE = "R";
  53
  54   public static final String PID_GAPS = "G";
  55
  56   public static final String PID_NOGAPS = "N";
  57
  58   public static final String PROFILE = "P";
  59
  60   public static final String ENCODED_CHARS = "E";
  61
  62   /*
  63    * Quick look-up of String value of char 'A' to 'Z'
  64    */
  65   private static final String[] CHARS = new String['Z' - 'A' + 1];
  66
  67   static
  68   {
  69     for (char c = 'A'; c <= 'Z'; c++)
  70     {
  71       CHARS[c - 'A'] = String.valueOf(c);
  72     }
  73   }
  74
  75   public static final Hashtable[] calculate(List<SequenceI> list,
  76           int start, int end)
  77   {
  78     return calculate(list, start, end, false);
  79   }
  80
  81   public static final Hashtable[] calculate(List<SequenceI> sequences,
  82           int start, int end, boolean profile)
  83   {
  84     SequenceI[] seqs = new SequenceI[sequences.size()];
  85     int width = 0;
  86     synchronized (sequences)
  87     {
  88       for (int i = 0; i < sequences.size(); i++)
  89       {
  90         seqs[i] = sequences.get(i);
  91         if (seqs[i].getLength() > width)
  92         {
  93           width = seqs[i].getLength();
  94         }
  95       }
  96
  97       Hashtable[] reply = new Hashtable[width];
  98
  99       if (end >= width)
 100       {
 101         end = width;
 102       }
 103
 104       calculate(seqs, start, end, reply, profile);
 105       return reply;
 106     }
 107   }
 108
 109   public static final void calculate(SequenceI[] sequences, int start,
 110           int end, Hashtable[] result, boolean profile)
 111   {
 112     Hashtable residueHash;
 113     int maxCount, nongap, i, j, v;
 114     int jSize = sequences.length;
 115     String maxResidue;
 116     char c = '-';
 117     float percentage;
 118
 119     // int[] values = new int[255];
 120
 121     char[] seq;
 122
 123     for (i = start; i < end; i++)
 124     {
 125       residueHash = new Hashtable();
 126       maxCount = 0;
 127       maxResidue = "";
 128       nongap = 0;
 129       // values = new int[255];
 130       SparseIntArray values = new SparseIntArray();
 131
 132       for (j = 0; j < jSize; j++)
 133       {
 134         if (sequences[j] == null)
 135         {
 136           System.err
 137                   .println("WARNING: Consensus skipping null sequence - possible race condition.");
 138           continue;
 139         }
 140         seq = sequences[j].getSequence();
 141         if (seq.length > i)
 142         {
 143           c = seq[i];
 144
 145           if (c == '.' || c == ' ')
 146           {
 147             c = '-';
 148           }
 149
 150           if (c == '-')
 151           {
 152             // values['-']++;
 153             values.put('-', values.get('-') + 1);
 154             continue;
 155           }
 156           else if ('a' <= c && c <= 'z')
 157           {
 158             c += TO_UPPER_CASE;
 159           }
 160
 161           nongap++;
 162           // values[c]++;
 163           values.put(c, values.get(c) + 1);
 164
 165         }
 166         else
 167         {
 168           // values['-']++;
 169           values.put('-', values.get('-') + 1);
 170         }
 171       }
 172       if (jSize == 1)
 173       {
 174         maxResidue = String.valueOf(c);
 175         maxCount = 1;
 176       }
 177       else
 178       {
 179         for (v = 'A'; v <= 'Z'; v++)
 180         {
 181           // TODO why ignore values[v] == 1?
 182           int count = values.get(v); // values[v];
 183           if (count < 1 /* 2 */|| count < maxCount)
 184           {
 185             continue;
 186           }
 187
 188           if (count > maxCount)
 189           {
 190             maxResidue = CHARS[v - 'A'];
 191           }
 192           else if (count == maxCount)
 193           {
 194             maxResidue += CHARS[v - 'A'];
 195           }
 196           maxCount = count;
 197         }
 198       }
 199       if (maxResidue.length() == 0)
 200       {
 201         maxResidue = "-";
 202       }
 203       if (profile)
 204       {
 205         // TODO use a 1-dimensional array with jSize, nongap in [0] and [1]
 206         // residueHash.put(PROFILE, new int[][] { values,
 207         // new int[] { jSize, nongap } });
 208         residueHash.put(PROFILE, new Profile(values, jSize, nongap));
 209       }
 210       residueHash.put(MAXCOUNT, new Integer(maxCount));
 211       residueHash.put(MAXRESIDUE, maxResidue);
 212
 213       percentage = ((float) maxCount * 100) / jSize;
 214       residueHash.put(PID_GAPS, new Float(percentage));
 215
 216       if (nongap > 0)
 217       {
 218         // calculate for non-gapped too
 219         percentage = ((float) maxCount * 100) / nongap;
 220       }
 221       residueHash.put(PID_NOGAPS, new Float(percentage));
 222
 223       result[i] = residueHash;
 224     }
 225   }
 226
 227   /**
 228    * Compute all or part of the annotation row from the given consensus
 229    * hashtable
 230    *
 231    * @param consensus
 232    *          - pre-allocated annotation row
 233    * @param hconsensus
 234    * @param iStart
 235    * @param width
 236    * @param ignoreGapsInConsensusCalculation
 237    * @param includeAllConsSymbols
 238    * @param nseq
 239    */
 240   public static void completeConsensus(AlignmentAnnotation consensus,
 241           Hashtable[] hconsensus, int iStart, int width,
 242           boolean ignoreGapsInConsensusCalculation,
 243           boolean includeAllConsSymbols, long nseq)
 244   {
 245     completeConsensus(consensus, hconsensus, iStart, width,
 246             ignoreGapsInConsensusCalculation, includeAllConsSymbols, null,
 247             nseq);
 248   }
 249
 250   /**
 251    * Derive the consensus annotations to be added to the alignment for display.
 252    * This does not recompute the raw data, but may be called on a change in
 253    * display options, such as 'show logo', which may in turn result in a change
 254    * in the derived values.
 255    *
 256    * @param consensus
 257    *          the annotation row to add annotations to
 258    * @param hconsensus
 259    *          the source consensus data
 260    * @param iStart
 261    *          start column
 262    * @param width
 263    *          end column
 264    * @param ignoreGapsInConsensusCalculation
 265    *          if true, use the consensus calculated ignoring gaps
 266    * @param includeAllConsSymbols
 267    *          if true include all consensus symbols, else just show modal
 268    *          residue
 269    * @param alphabet
 270    * @param nseq
 271    *          number of sequences
 272    */
 273   public static void completeConsensus(AlignmentAnnotation consensus,
 274           Hashtable[] hconsensus, int iStart, int width,
 275           boolean ignoreGapsInConsensusCalculation,
 276           boolean includeAllConsSymbols, char[] alphabet, long nseq)
 277   {
 278     if (consensus == null || consensus.annotations == null
 279             || consensus.annotations.length < width)
 280     {
 281       // called with a bad alignment annotation row - wait for it to be
 282       // initialised properly
 283       return;
 284     }
 285
 286     final Format fmt = getPercentageFormat(nseq);
 287
 288     for (int i = iStart; i < width; i++)
 289     {
 290       Hashtable hci;
 291       if (i >= hconsensus.length || ((hci = hconsensus[i]) == null))
 292       {
 293         // happens if sequences calculated over were shorter than alignment
 294         // width
 295         consensus.annotations[i] = null;
 296         continue;
 297       }
 298       Float fv = (Float) hci
 299               .get(ignoreGapsInConsensusCalculation ? PID_NOGAPS : PID_GAPS);
 300       if (fv == null)
 301       {
 302         consensus.annotations[i] = null;
 303         // data has changed below us .. give up and
 304         continue;
 305       }
 306       float value = fv.floatValue();
 307       String maxRes = hci.get(AAFrequency.MAXRESIDUE).toString();
 308       StringBuilder mouseOver = new StringBuilder(64);
 309       if (maxRes.length() > 1)
 310       {
 311         mouseOver.append("[").append(maxRes).append("] ");
 312         maxRes = "+";
 313       }
 314       else
 315       {
 316         mouseOver.append(hci.get(AAFrequency.MAXRESIDUE) + " ");
 317       }
 318       int[][] profile = (int[][]) hci.get(AAFrequency.PROFILE);
 319       if (profile != null && includeAllConsSymbols)
 320       {
 321         int sequenceCount = profile[1][0];
 322         int nonGappedCount = profile[1][1];
 323         int normalisedBy = ignoreGapsInConsensusCalculation ? nonGappedCount
 324                 : sequenceCount;
 325         mouseOver.setLength(0);
 326         if (alphabet != null)
 327         {
 328           for (int c = 0; c < alphabet.length; c++)
 329           {
 330             float tval = profile[0][alphabet[c]] * 100f / normalisedBy;
 331             mouseOver
 332                     .append(((c == 0) ? "" : "; "))
 333                     .append(alphabet[c])
 334                     .append(" ")
 335                     .append(((fmt != null) ? fmt.form(tval) : ((int) tval)))
 336                     .append("%");
 337           }
 338         }
 339         else
 340         {
 341           // TODO do this sort once only in calculate()?
 342           // char[][] ca = new char[profile[0].length][];
 343           char[] ca = new char[profile[0].length];
 344           float[] vl = new float[profile[0].length];
 345           for (int c = 0; c < ca.length; c++)
 346           {
 347             ca[c] = (char) c;
 348             // ca[c] = new char[]
 349             // { (char) c };
 350             vl[c] = profile[0][c];
 351           }
 352           QuickSort.sort(vl, ca);
 353           for (int p = 0, c = ca.length - 1; profile[0][ca[c]] > 0; c--)
 354           {
 355             final char residue = ca[c];
 356             if (residue != '-')
 357             {
 358               float tval = profile[0][residue] * 100f / normalisedBy;
 359               mouseOver
 360                       .append((((p == 0) ? "" : "; ")))
 361                       .append(residue)
 362                       .append(" ")
 363                       .append(((fmt != null) ? fmt.form(tval)
 364                               : ((int) tval))).append("%");
 365               p++;
 366             }
 367           }
 368         }
 369       }
 370       else
 371       {
 372         mouseOver.append(
 373                 (((fmt != null) ? fmt.form(value) : ((int) value))))
 374                 .append("%");
 375       }
 376       consensus.annotations[i] = new Annotation(maxRes,
 377               mouseOver.toString(), ' ', value);
 378     }
 379   }
 380
 381   /**
 382    * Returns a Format designed to show all significant figures for profile
 383    * percentages. For less than 100 sequences, returns null (the integer
 384    * percentage value will be displayed). For 100-999 sequences, returns "%3.1f"
 385    *
 386    * @param nseq
 387    * @return
 388    */
 389   protected static Format getPercentageFormat(long nseq)
 390   {
 391     int scale = 0;
 392     while (nseq >= 10)
 393     {
 394       scale++;
 395       nseq /= 10;
 396     }
 397     return scale <= 1 ? null : new Format("%3." + (scale - 1) + "f");
 398   }
 399
 400   /**
 401    * Returns the sorted profile for the given consensus data. The returned array
 402    * contains
 403    *
 404    * <pre>
 405    *    [profileType, numberOfValues, nonGapCount, charValue1, percentage1, charValue2, percentage2, ...]
 406    * in descending order of percentage value
 407    * </pre>
 408    *
 409    * @param hconsensus
 410    *          the data table from which to extract and sort values
 411    * @param ignoreGaps
 412    *          if true, only non-gapped values are included in percentage
 413    *          calculations
 414    * @return
 415    */
 416   public static int[] extractProfile(Hashtable hconsensus,
 417           boolean ignoreGaps)
 418   {
 419     int[] rtnval = new int[64];
 420     // int[][] profile = (int[][]) hconsensus.get(AAFrequency.PROFILE);
 421     Profile profile = (Profile) hconsensus.get(AAFrequency.PROFILE);
 422     if (profile == null)
 423     {
 424       return null;
 425     }
 426     // int profileLength = profile[0].length;
 427     int profileLength = profile.profile.size();
 428     char[] ca = new char[profileLength];
 429     float[] vl = new float[profileLength];
 430     // for (int c = 0; c < ca.length; c++)
 431     // {
 432     // ca[c] = (char) c;
 433     // vl[c] = profile[0][c];
 434     // }
 435     for (int i = 0; i < profileLength; i++)
 436     {
 437       int c = profile.profile.keyAt(i);
 438       ca[i] = (char) c;
 439       vl[i] = profile.profile.get(c);
 440     }
 441     QuickSort.sort(vl, ca);
 442     int nextArrayPos = 2;
 443     int totalPercentage = 0;
 444     int distinctValuesCount = 0;
 445     final int divisor = profile[1][ignoreGaps ? 1 : 0];
 446     for (int c = ca.length - 1; profile[0][ca[c]] > 0; c--)
 447     {
 448       if (ca[c] != '-')
 449       {
 450         rtnval[nextArrayPos++] = ca[c];
 451         final int percentage = (int) (profile[0][ca[c]] * 100f / divisor);
 452         rtnval[nextArrayPos++] = percentage;
 453         totalPercentage += percentage;
 454         distinctValuesCount++;
 455       }
 456     }
 457     rtnval[0] = distinctValuesCount;
 458     rtnval[1] = totalPercentage;
 459     int[] result = new int[rtnval.length + 1];
 460     result[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
 461     System.arraycopy(rtnval, 0, result, 1, rtnval.length);
 462
 463     return result;
 464   }
 465
 466   /**
 467    * Extract a sorted extract of cDNA codon profile data. The returned array
 468    * contains
 469    *
 470    * <pre>
 471    *    [profileType, numberOfValues, totalCount, charValue1, percentage1, charValue2, percentage2, ...]
 472    * in descending order of percentage value, where the character values encode codon triplets
 473    * </pre>
 474    *
 475    * @param hashtable
 476    * @return
 477    */
 478   public static int[] extractCdnaProfile(Hashtable hashtable,
 479           boolean ignoreGaps)
 480   {
 481     // this holds #seqs, #ungapped, and then codon count, indexed by encoded
 482     // codon triplet
 483     int[] codonCounts = (int[]) hashtable.get(PROFILE);
 484     int[] sortedCounts = new int[codonCounts.length - 2];
 485     System.arraycopy(codonCounts, 2, sortedCounts, 0,
 486             codonCounts.length - 2);
 487
 488     int[] result = new int[3 + 2 * sortedCounts.length];
 489     // first value is just the type of profile data
 490     result[0] = AlignmentAnnotation.CDNA_PROFILE;
 491
 492     char[] codons = new char[sortedCounts.length];
 493     for (int i = 0; i < codons.length; i++)
 494     {
 495       codons[i] = (char) i;
 496     }
 497     QuickSort.sort(sortedCounts, codons);
 498     int totalPercentage = 0;
 499     int distinctValuesCount = 0;
 500     int j = 3;
 501     int divisor = ignoreGaps ? codonCounts[1] : codonCounts[0];
 502     for (int i = codons.length - 1; i >= 0; i--)
 503     {
 504       final int codonCount = sortedCounts[i];
 505       if (codonCount == 0)
 506       {
 507         break; // nothing else of interest here
 508       }
 509       distinctValuesCount++;
 510       result[j++] = codons[i];
 511       final int percentage = codonCount * 100 / divisor;
 512       result[j++] = percentage;
 513       totalPercentage += percentage;
 514     }
 515     result[2] = totalPercentage;
 516
 517     /*
 518      * Just return the non-zero values
 519      */
 520     // todo next value is redundant if we limit the array to non-zero counts
 521     result[1] = distinctValuesCount;
 522     return Arrays.copyOfRange(result, 0, j);
 523   }
 524
 525   /**
 526    * Compute a consensus for the cDNA coding for a protein alignment.
 527    *
 528    * @param alignment
 529    *          the protein alignment (which should hold mappings to cDNA
 530    *          sequences)
 531    * @param hconsensus
 532    *          the consensus data stores to be populated (one per column)
 533    */
 534   public static void calculateCdna(AlignmentI alignment,
 535           Hashtable[] hconsensus)
 536   {
 537     final char gapCharacter = alignment.getGapCharacter();
 538     List<AlignedCodonFrame> mappings = alignment.getCodonFrames();
 539     if (mappings == null || mappings.isEmpty())
 540     {
 541       return;
 542     }
 543
 544     int cols = alignment.getWidth();
 545     for (int col = 0; col < cols; col++)
 546     {
 547       // todo would prefer a Java bean for consensus data
 548       Hashtable<String, int[]> columnHash = new Hashtable<String, int[]>();
 549       // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1)
 550       int[] codonCounts = new int[66];
 551       codonCounts[0] = alignment.getSequences().size();
 552       int ungappedCount = 0;
 553       for (SequenceI seq : alignment.getSequences())
 554       {
 555         if (seq.getCharAt(col) == gapCharacter)
 556         {
 557           continue;
 558         }
 559         List<char[]> codons = MappingUtils
 560                 .findCodonsFor(seq, col, mappings);
 561         for (char[] codon : codons)
 562         {
 563           int codonEncoded = CodingUtils.encodeCodon(codon);
 564           if (codonEncoded >= 0)
 565           {
 566             codonCounts[codonEncoded + 2]++;
 567             ungappedCount++;
 568           }
 569         }
 570       }
 571       codonCounts[1] = ungappedCount;
 572       // todo: sort values here, save counts and codons?
 573       columnHash.put(PROFILE, codonCounts);
 574       hconsensus[col] = columnHash;
 575     }
 576   }
 577
 578   /**
 579    * Derive displayable cDNA consensus annotation from computed consensus data.
 580    *
 581    * @param consensusAnnotation
 582    *          the annotation row to be populated for display
 583    * @param consensusData
 584    *          the computed consensus data
 585    * @param showProfileLogo
 586    *          if true show all symbols present at each position, else only the
 587    *          modal value
 588    * @param nseqs
 589    *          the number of sequences in the alignment
 590    */
 591   public static void completeCdnaConsensus(
 592           AlignmentAnnotation consensusAnnotation,
 593           Hashtable[] consensusData, boolean showProfileLogo, int nseqs)
 594   {
 595     if (consensusAnnotation == null
 596             || consensusAnnotation.annotations == null
 597             || consensusAnnotation.annotations.length < consensusData.length)
 598     {
 599       // called with a bad alignment annotation row - wait for it to be
 600       // initialised properly
 601       return;
 602     }
 603
 604     // ensure codon triplet scales with font size
 605     consensusAnnotation.scaleColLabel = true;
 606     for (int col = 0; col < consensusData.length; col++)
 607     {
 608       Hashtable hci = consensusData[col];
 609       if (hci == null)
 610       {
 611         // gapped protein column?
 612         continue;
 613       }
 614       // array holds #seqs, #ungapped, then codon counts indexed by codon
 615       final int[] codonCounts = (int[]) hci.get(PROFILE);
 616       int totalCount = 0;
 617
 618       /*
 619        * First pass - get total count and find the highest
 620        */
 621       final char[] codons = new char[codonCounts.length - 2];
 622       for (int j = 2; j < codonCounts.length; j++)
 623       {
 624         final int codonCount = codonCounts[j];
 625         codons[j - 2] = (char) (j - 2);
 626         totalCount += codonCount;
 627       }
 628
 629       /*
 630        * Sort array of encoded codons by count ascending - so the modal value
 631        * goes to the end; start by copying the count (dropping the first value)
 632        */
 633       int[] sortedCodonCounts = new int[codonCounts.length - 2];
 634       System.arraycopy(codonCounts, 2, sortedCodonCounts, 0,
 635               codonCounts.length - 2);
 636       QuickSort.sort(sortedCodonCounts, codons);
 637
 638       int modalCodonEncoded = codons[codons.length - 1];
 639       int modalCodonCount = sortedCodonCounts[codons.length - 1];
 640       String modalCodon = String.valueOf(CodingUtils
 641               .decodeCodon(modalCodonEncoded));
 642       if (sortedCodonCounts.length > 1
 643               && sortedCodonCounts[codons.length - 2] == sortedCodonCounts[codons.length - 1])
 644       {
 645         /*
 646          * two or more codons share the modal count
 647          */
 648         modalCodon = "+";
 649       }
 650       float pid = sortedCodonCounts[sortedCodonCounts.length - 1] * 100
 651               / (float) totalCount;
 652
 653       /*
 654        * todo ? Replace consensus hashtable with sorted arrays of codons and
 655        * counts (non-zero only). Include total count in count array [0].
 656        */
 657
 658       /*
 659        * Scan sorted array backwards for most frequent values first. Show
 660        * repeated values compactly.
 661        */
 662       StringBuilder mouseOver = new StringBuilder(32);
 663       StringBuilder samePercent = new StringBuilder();
 664       String percent = null;
 665       String lastPercent = null;
 666       Format fmt = getPercentageFormat(nseqs);
 667
 668       for (int j = codons.length - 1; j >= 0; j--)
 669       {
 670         int codonCount = sortedCodonCounts[j];
 671         if (codonCount == 0)
 672         {
 673           /*
 674            * remaining codons are 0% - ignore, but finish off the last one if
 675            * necessary
 676            */
 677           if (samePercent.length() > 0)
 678           {
 679             mouseOver.append(samePercent).append(": ").append(percent)
 680                     .append("% ");
 681           }
 682           break;
 683         }
 684         int codonEncoded = codons[j];
 685         final int pct = codonCount * 100 / totalCount;
 686         String codon = String
 687                 .valueOf(CodingUtils.decodeCodon(codonEncoded));
 688         percent = fmt == null ? Integer.toString(pct) : fmt.form(pct);
 689         if (showProfileLogo || codonCount == modalCodonCount)
 690         {
 691           if (percent.equals(lastPercent) && j > 0)
 692           {
 693             samePercent.append(samePercent.length() == 0 ? "" : ", ");
 694             samePercent.append(codon);
 695           }
 696           else
 697           {
 698             if (samePercent.length() > 0)
 699             {
 700               mouseOver.append(samePercent).append(": ")
 701                       .append(lastPercent).append("% ");
 702             }
 703             samePercent.setLength(0);
 704             samePercent.append(codon);
 705           }
 706           lastPercent = percent;
 707         }
 708       }
 709
 710       consensusAnnotation.annotations[col] = new Annotation(modalCodon,
 711               mouseOver.toString(), ' ', pid);
 712     }
 713   }
 714 }