src/jalview/analysis/AAFrequency.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.Annotation;
  27 import jalview.datamodel.SequenceI;
  28 import jalview.util.Format;
  29 import jalview.util.MappingUtils;
  30 import jalview.util.QuickSort;
  31
  32 import java.util.Arrays;
  33 import java.util.Hashtable;
  34 import java.util.List;
  35 import java.util.Set;
  36
  37 /**
  38  * Takes in a vector or array of sequences and column start and column end and
  39  * returns a new Hashtable[] of size maxSeqLength, if Hashtable not supplied.
  40  * This class is used extensively in calculating alignment colourschemes that
  41  * depend on the amount of conservation in each alignment column.
  42  *
  43  * @author $author$
  44  * @version $Revision$
  45  */
  46 public class AAFrequency
  47 {
  48   private static final int TO_UPPER_CASE = 'A' - 'a'; // -32
  49
  50   public static final String MAXCOUNT = "C";
  51
  52   public static final String MAXRESIDUE = "R";
  53
  54   public static final String PID_GAPS = "G";
  55
  56   public static final String PID_NOGAPS = "N";
  57
  58   public static final String PROFILE = "P";
  59
  60   public static final String ENCODED_CHARS = "E";
  61
  62   /*
  63    * Quick look-up of String value of char 'A' to 'Z'
  64    */
  65   private static final String[] CHARS = new String['Z' - 'A' + 1];
  66
  67   static
  68   {
  69     for (char c = 'A'; c <= 'Z'; c++)
  70     {
  71       CHARS[c - 'A'] = String.valueOf(c);
  72     }
  73   }
  74
  75   public static final Hashtable[] calculate(List<SequenceI> list,
  76           int start, int end)
  77   {
  78     return calculate(list, start, end, false);
  79   }
  80
  81   public static final Hashtable[] calculate(List<SequenceI> sequences,
  82           int start, int end, boolean profile)
  83   {
  84     SequenceI[] seqs = new SequenceI[sequences.size()];
  85     int width = 0;
  86     synchronized (sequences)
  87     {
  88       for (int i = 0; i < sequences.size(); i++)
  89       {
  90         seqs[i] = sequences.get(i);
  91         if (seqs[i].getLength() > width)
  92         {
  93           width = seqs[i].getLength();
  94         }
  95       }
  96
  97       Hashtable[] reply = new Hashtable[width];
  98
  99       if (end >= width)
 100       {
 101         end = width;
 102       }
 103
 104       calculate(seqs, start, end, reply, profile);
 105       return reply;
 106     }
 107   }
 108
 109   public static final void calculate(SequenceI[] sequences, int start,
 110           int end, Hashtable[] result, boolean profile)
 111   {
 112     Hashtable residueHash;
 113     int maxCount, nongap, i, j, v;
 114     int jSize = sequences.length;
 115     String maxResidue;
 116     char c = '-';
 117     float percentage;
 118
 119     int[] values = new int[255];
 120
 121     char[] seq;
 122
 123     for (i = start; i < end; i++)
 124     {
 125       residueHash = new Hashtable();
 126       maxCount = 0;
 127       maxResidue = "";
 128       nongap = 0;
 129       values = new int[255];
 130
 131       for (j = 0; j < jSize; j++)
 132       {
 133         if (sequences[j] == null)
 134         {
 135           System.err
 136                   .println("WARNING: Consensus skipping null sequence - possible race condition.");
 137           continue;
 138         }
 139         seq = sequences[j].getSequence();
 140         if (seq.length > i)
 141         {
 142           c = seq[i];
 143
 144           if (c == '.' || c == ' ')
 145           {
 146             c = '-';
 147           }
 148
 149           if (c == '-')
 150           {
 151             values['-']++;
 152             continue;
 153           }
 154           else if ('a' <= c && c <= 'z')
 155           {
 156             c += TO_UPPER_CASE;
 157           }
 158
 159           nongap++;
 160           values[c]++;
 161
 162         }
 163         else
 164         {
 165           values['-']++;
 166         }
 167       }
 168       if (jSize == 1)
 169       {
 170         maxResidue = String.valueOf(c);
 171         maxCount = 1;
 172       }
 173       else
 174       {
 175         for (v = 'A'; v <= 'Z'; v++)
 176         {
 177           // TODO why ignore values[v] == 1?
 178           if (values[v] < 1 /* 2 */|| values[v] < maxCount)
 179           {
 180             continue;
 181           }
 182
 183           if (values[v] > maxCount)
 184           {
 185             maxResidue = CHARS[v - 'A'];
 186           }
 187           else if (values[v] == maxCount)
 188           {
 189             maxResidue += CHARS[v - 'A'];
 190           }
 191           maxCount = values[v];
 192         }
 193       }
 194       if (maxResidue.length() == 0)
 195       {
 196         maxResidue = "-";
 197       }
 198       if (profile)
 199       {
 200         // TODO use a 1-dimensional array with jSize, nongap in [0] and [1]
 201         residueHash.put(PROFILE, new int[][]
 202         { values, new int[]
 203         { jSize, nongap } });
 204       }
 205       residueHash.put(MAXCOUNT, new Integer(maxCount));
 206       residueHash.put(MAXRESIDUE, maxResidue);
 207
 208       percentage = ((float) maxCount * 100) / jSize;
 209       residueHash.put(PID_GAPS, new Float(percentage));
 210
 211       if (nongap > 0)
 212       {
 213         // calculate for non-gapped too
 214         percentage = ((float) maxCount * 100) / nongap;
 215       }
 216       residueHash.put(PID_NOGAPS, new Float(percentage));
 217
 218       result[i] = residueHash;
 219     }
 220   }
 221
 222   /**
 223    * Compute all or part of the annotation row from the given consensus
 224    * hashtable
 225    *
 226    * @param consensus
 227    *          - pre-allocated annotation row
 228    * @param hconsensus
 229    * @param iStart
 230    * @param width
 231    * @param ignoreGapsInConsensusCalculation
 232    * @param includeAllConsSymbols
 233    * @param nseq
 234    */
 235   public static void completeConsensus(AlignmentAnnotation consensus,
 236           Hashtable[] hconsensus, int iStart, int width,
 237           boolean ignoreGapsInConsensusCalculation,
 238           boolean includeAllConsSymbols, long nseq)
 239   {
 240     completeConsensus(consensus, hconsensus, iStart, width,
 241             ignoreGapsInConsensusCalculation, includeAllConsSymbols, null,
 242             nseq);
 243   }
 244
 245   /**
 246    * Derive the consensus annotations to be added to the alignment for display.
 247    * This does not recompute the raw data, but may be called on a change in
 248    * display options, such as 'show logo', which may in turn result in a change
 249    * in the derived values.
 250    *
 251    * @param consensus
 252    *          the annotation row to add annotations to
 253    * @param hconsensus
 254    *          the source consensus data
 255    * @param iStart
 256    *          start column
 257    * @param width
 258    *          end column
 259    * @param ignoreGapsInConsensusCalculation
 260    *          if true, use the consensus calculated ignoring gaps
 261    * @param includeAllConsSymbols
 262    *          if true include all consensus symbols, else just show modal
 263    *          residue
 264    * @param alphabet
 265    * @param nseq
 266    *          number of sequences
 267    */
 268   public static void completeConsensus(AlignmentAnnotation consensus,
 269           Hashtable[] hconsensus, int iStart, int width,
 270           boolean ignoreGapsInConsensusCalculation,
 271           boolean includeAllConsSymbols, char[] alphabet, long nseq)
 272   {
 273     if (consensus == null || consensus.annotations == null
 274             || consensus.annotations.length < width)
 275     {
 276       // called with a bad alignment annotation row - wait for it to be
 277       // initialised properly
 278       return;
 279     }
 280
 281     final Format fmt = getPercentageFormat(nseq);
 282
 283     for (int i = iStart; i < width; i++)
 284     {
 285       Hashtable hci;
 286       if (i >= hconsensus.length || ((hci = hconsensus[i]) == null))
 287       {
 288         // happens if sequences calculated over were shorter than alignment
 289         // width
 290         consensus.annotations[i] = null;
 291         continue;
 292       }
 293       Float fv = (Float) hci
 294               .get(ignoreGapsInConsensusCalculation ? PID_NOGAPS : PID_GAPS);
 295       if (fv == null)
 296       {
 297         consensus.annotations[i] = null;
 298         // data has changed below us .. give up and
 299         continue;
 300       }
 301       float value = fv.floatValue();
 302       String maxRes = hci.get(AAFrequency.MAXRESIDUE).toString();
 303       StringBuilder mouseOver = new StringBuilder(64);
 304       if (maxRes.length() > 1)
 305       {
 306         mouseOver.append("[").append(maxRes).append("] ");
 307         maxRes = "+";
 308       }
 309       else
 310       {
 311         mouseOver.append(hci.get(AAFrequency.MAXRESIDUE) + " ");
 312       }
 313       int[][] profile = (int[][]) hci.get(AAFrequency.PROFILE);
 314       if (profile != null && includeAllConsSymbols)
 315       {
 316         int sequenceCount = profile[1][0];
 317         int nonGappedCount = profile[1][1];
 318         int normalisedBy = ignoreGapsInConsensusCalculation ? nonGappedCount
 319                 : sequenceCount;
 320         mouseOver.setLength(0);
 321         if (alphabet != null)
 322         {
 323           for (int c = 0; c < alphabet.length; c++)
 324           {
 325             float tval = profile[0][alphabet[c]] * 100f / normalisedBy;
 326             mouseOver
 327                     .append(((c == 0) ? "" : "; "))
 328                     .append(alphabet[c])
 329                     .append(" ")
 330                     .append(((fmt != null) ? fmt.formDouble(tval) : ((int) tval)))
 331                     .append("%");
 332           }
 333         }
 334         else
 335         {
 336           // TODO do this sort once only in calculate()?
 337           // char[][] ca = new char[profile[0].length][];
 338           char[] ca = new char[profile[0].length];
 339           float[] vl = new float[profile[0].length];
 340           for (int c = 0; c < ca.length; c++)
 341           {
 342             ca[c] = (char) c;
 343             // ca[c] = new char[]
 344             // { (char) c };
 345             vl[c] = profile[0][c];
 346           }
 347           QuickSort.sortFloatChar(vl, ca);
 348           for (int p = 0, c = ca.length - 1; profile[0][ca[c]] > 0; c--)
 349           {
 350             final char residue = ca[c];
 351             if (residue != '-')
 352             {
 353               float tval = profile[0][residue] * 100f / normalisedBy;
 354               mouseOver
 355                       .append((((p == 0) ? "" : "; ")))
 356                       .append(residue)
 357                       .append(" ")
 358                       .append(((fmt != null) ? fmt.formDouble(tval)
 359                               : ((int) tval))).append("%");
 360               p++;
 361             }
 362           }
 363         }
 364       }
 365       else
 366       {
 367         mouseOver.append(
 368                 (((fmt != null) ? fmt.formDouble(value) : ((int) value))))
 369                 .append("%");
 370       }
 371       consensus.annotations[i] = new Annotation(maxRes,
 372               mouseOver.toString(), ' ',
 373               value);
 374     }
 375   }
 376
 377   /**
 378    * Returns a Format designed to show all significant figures for profile
 379    * percentages. For less than 100 sequences, returns null (the integer
 380    * percentage value will be displayed). For 100-999 sequences, returns "%3.1f"
 381    *
 382    * @param nseq
 383    * @return
 384    */
 385   protected static Format getPercentageFormat(long nseq)
 386   {
 387     int scale = 0;
 388     while (nseq >= 10)
 389     {
 390       scale++;
 391       nseq /= 10;
 392     }
 393     return scale <= 1 ? null : new Format("%3." + (scale - 1) + "f");
 394   }
 395
 396   /**
 397    * Returns the sorted profile for the given consensus data. The returned array
 398    * contains
 399    *
 400    * <pre>
 401    *    [profileType, numberOfValues, nonGapCount, charValue1, percentage1, charValue2, percentage2, ...]
 402    * in descending order of percentage value
 403    * </pre>
 404    *
 405    * @param hconsensus
 406    *          the data table from which to extract and sort values
 407    * @param ignoreGaps
 408    *          if true, only non-gapped values are included in percentage
 409    *          calculations
 410    * @return
 411    */
 412   public static int[] extractProfile(Hashtable hconsensus,
 413           boolean ignoreGaps)
 414   {
 415     int[] rtnval = new int[64];
 416     int[][] profile = (int[][]) hconsensus.get(AAFrequency.PROFILE);
 417     if (profile == null)
 418     {
 419       return null;
 420     }
 421     char[] ca = new char[profile[0].length];
 422     float[] vl = new float[profile[0].length];
 423     for (int c = 0; c < ca.length; c++)
 424     {
 425       ca[c] = (char) c;
 426       vl[c] = profile[0][c];
 427     }
 428     QuickSort.sortFloatChar(vl, ca);
 429     int nextArrayPos = 2;
 430     int totalPercentage = 0;
 431     int distinctValuesCount = 0;
 432     final int divisor = profile[1][ignoreGaps ? 1 : 0];
 433     for (int c = ca.length - 1; profile[0][ca[c]] > 0; c--)
 434     {
 435       if (ca[c] != '-')
 436       {
 437         rtnval[nextArrayPos++] = ca[c];
 438         final int percentage = (int) (profile[0][ca[c]] * 100f / divisor);
 439         rtnval[nextArrayPos++] = percentage;
 440         totalPercentage += percentage;
 441         distinctValuesCount++;
 442       }
 443     }
 444     rtnval[0] = distinctValuesCount;
 445     rtnval[1] = totalPercentage;
 446     int[] result = new int[rtnval.length + 1];
 447     result[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
 448     System.arraycopy(rtnval, 0, result, 1, rtnval.length);
 449
 450     return result;
 451   }
 452
 453   /**
 454    * Extract a sorted extract of cDNA codon profile data. The returned array
 455    * contains
 456    *
 457    * <pre>
 458    *    [profileType, numberOfValues, totalCount, charValue1, percentage1, charValue2, percentage2, ...]
 459    * in descending order of percentage value, where the character values encode codon triplets
 460    * </pre>
 461    *
 462    * @param hashtable
 463    * @return
 464    */
 465   public static int[] extractCdnaProfile(Hashtable hashtable, boolean ignoreGaps)
 466   {
 467     // this holds #seqs, #ungapped, and then codon count, indexed by encoded
 468     // codon triplet
 469     int[] codonCounts = (int[]) hashtable.get(PROFILE);
 470     int[] sortedCounts = new int[codonCounts.length - 2];
 471     System.arraycopy(codonCounts, 2, sortedCounts, 0,
 472             codonCounts.length - 2);
 473
 474     int[] result = new int[3 + 2 * sortedCounts.length];
 475     // first value is just the type of profile data
 476     result[0] = AlignmentAnnotation.CDNA_PROFILE;
 477
 478     char[] codons = new char[sortedCounts.length];
 479     for (int i = 0; i < codons.length; i++)
 480     {
 481       codons[i] = (char) i;
 482     }
 483     QuickSort.sortIntChar(sortedCounts, codons);
 484     int totalPercentage = 0;
 485     int distinctValuesCount = 0;
 486     int j = 3;
 487     int divisor = ignoreGaps ? codonCounts[1] : codonCounts[0];
 488     for (int i = codons.length - 1; i >= 0; i--)
 489     {
 490       final int codonCount = sortedCounts[i];
 491       if (codonCount == 0)
 492       {
 493         break; // nothing else of interest here
 494       }
 495       distinctValuesCount++;
 496       result[j++] = codons[i];
 497       final int percentage = codonCount * 100 / divisor;
 498       result[j++] = percentage;
 499       totalPercentage += percentage;
 500     }
 501     result[2] = totalPercentage;
 502
 503     /*
 504      * Just return the non-zero values
 505      */
 506     // todo next value is redundant if we limit the array to non-zero counts
 507     result[1] = distinctValuesCount;
 508     return Arrays.copyOfRange(result, 0, j);
 509   }
 510
 511         /**
 512          * Compute a consensus for the cDNA coding for a protein alignment.
 513          *
 514          * @param alignment
 515          *          the protein alignment (which should hold mappings to cDNA
 516          *          sequences)
 517          * @param hconsensus
 518          *          the consensus data stores to be populated (one per column)
 519          * @param mappings
 520          * @param col0  first column
 521          * @param co11  last column
 522          */
 523         public static void calculateCdna(AlignmentI alignment,
 524                         Set<AlignedCodonFrame> mappings, Hashtable[] hconsensus, int col0,
 525                         int col1) {
 526                 final char gapCharacter = alignment.getGapCharacter();
 527                 for (int col = col0; col < col1; col++) {
 528                         // todo would prefer a Java bean for consensus data
 529                         Hashtable<String, int[]> columnHash = new Hashtable<String, int[]>();
 530                         // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1)
 531                         int[] codonCounts = new int[66];
 532                         codonCounts[0] = alignment.getSequences().size();
 533                         int ungappedCount = 0;
 534                         for (SequenceI seq : alignment.getSequences()) {
 535                                 if (seq.getCharAt(col) == gapCharacter) {
 536                                         continue;
 537                                 }
 538                                 char[] codon = MappingUtils.findCodonFor(seq, col, mappings);
 539                                 int codonEncoded = CodingUtils.encodeCodon(codon);
 540                                 if (codonEncoded >= 0) {
 541                                         codonCounts[codonEncoded + 2]++;
 542                                         ungappedCount++;
 543                                 }
 544                         }
 545                         codonCounts[1] = ungappedCount;
 546                         // todo: sort values here, save counts and codons?
 547                         columnHash.put(PROFILE, codonCounts);
 548                         hconsensus[col] = columnHash;
 549                 }
 550         }
 551
 552   /**
 553    * Derive displayable cDNA consensus annotation from computed consensus data.
 554    *
 555    * @param consensusAnnotation
 556    *          the annotation row to be populated for display
 557    * @param consensusData
 558    *          the computed consensus data
 559    * @param showProfileLogo
 560    *          if true show all symbols present at each position, else only the
 561    *          modal value
 562    * @param nseqs
 563    *          the number of sequences in the alignment
 564    */
 565   public static void completeCdnaConsensus(
 566           AlignmentAnnotation consensusAnnotation,
 567           Hashtable[] consensusData, boolean showProfileLogo, int nseqs)
 568   {
 569     if (consensusAnnotation == null
 570             || consensusAnnotation.annotations == null
 571             || consensusAnnotation.annotations.length < consensusData.length)
 572     {
 573       // called with a bad alignment annotation row - wait for it to be
 574       // initialised properly
 575       return;
 576     }
 577
 578     // ensure codon triplet scales with font size
 579     consensusAnnotation.scaleColLabel = true;
 580     for (int col = 0; col < consensusData.length; col++)
 581     {
 582       Hashtable hci = consensusData[col];
 583       if (hci == null)
 584       {
 585         // gapped protein column?
 586         continue;
 587       }
 588       // array holds #seqs, #ungapped, then codon counts indexed by codon
 589       final int[] codonCounts = (int[]) hci.get(PROFILE);
 590       int totalCount = 0;
 591
 592       /*
 593        * First pass - get total count and find the highest
 594        */
 595       final char[] codons = new char[codonCounts.length - 2];
 596       for (int j = 2; j < codonCounts.length; j++)
 597       {
 598         final int codonCount = codonCounts[j];
 599         codons[j - 2] = (char) (j - 2);
 600         totalCount += codonCount;
 601       }
 602
 603       /*
 604        * Sort array of encoded codons by count ascending - so the modal value
 605        * goes to the end; start by copying the count (dropping the first value)
 606        */
 607       int[] sortedCodonCounts = new int[codonCounts.length - 2];
 608       System.arraycopy(codonCounts, 2, sortedCodonCounts, 0,
 609               codonCounts.length - 2);
 610       QuickSort.sortIntChar(sortedCodonCounts, codons);
 611
 612       int modalCodonEncoded = codons[codons.length - 1];
 613       int modalCodonCount = sortedCodonCounts[codons.length - 1];
 614       String modalCodon = String.valueOf(CodingUtils
 615               .decodeCodon(modalCodonEncoded));
 616       if (sortedCodonCounts.length > 1
 617               && sortedCodonCounts[codons.length - 2] == modalCodonEncoded)
 618       {
 619         modalCodon = "+";
 620       }
 621       float pid = sortedCodonCounts[sortedCodonCounts.length - 1] * 100
 622               / (float) totalCount;
 623
 624       /*
 625        * todo ? Replace consensus hashtable with sorted arrays of codons and
 626        * counts (non-zero only). Include total count in count array [0].
 627        */
 628
 629       /*
 630        * Scan sorted array backwards for most frequent values first. Show
 631        * repeated values compactly.
 632        */
 633       StringBuilder mouseOver = new StringBuilder(32);
 634       StringBuilder samePercent = new StringBuilder();
 635       String percent = null;
 636       String lastPercent = null;
 637       Format fmt = getPercentageFormat(nseqs);
 638
 639       for (int j = codons.length - 1; j >= 0; j--)
 640       {
 641         int codonCount = sortedCodonCounts[j];
 642         if (codonCount == 0)
 643         {
 644           /*
 645            * remaining codons are 0% - ignore, but finish off the last one if
 646            * necessary
 647            */
 648           if (samePercent.length() > 0)
 649           {
 650             mouseOver.append(samePercent).append(": ").append(percent)
 651                     .append("% ");
 652           }
 653           break;
 654         }
 655         int codonEncoded = codons[j];
 656         final int pct = codonCount * 100 / totalCount;
 657         String codon = String
 658                 .valueOf(CodingUtils.decodeCodon(codonEncoded));
 659         percent = fmt == null ? Integer.toString(pct) : fmt
 660                 .formLong(pct);
 661         if (showProfileLogo || codonCount == modalCodonCount)
 662         {
 663           if (percent.equals(lastPercent) && j > 0)
 664           {
 665             samePercent.append(samePercent.length() == 0 ? "" : ", ");
 666             samePercent.append(codon);
 667           }
 668           else
 669           {
 670             if (samePercent.length() > 0)
 671             {
 672               mouseOver.append(samePercent).append(": ")
 673                       .append(lastPercent)
 674                       .append("% ");
 675             }
 676             samePercent.setLength(0);
 677             samePercent.append(codon);
 678           }
 679           lastPercent = percent;
 680         }
 681       }
 682
 683       consensusAnnotation.annotations[col] = new Annotation(modalCodon,
 684               mouseOver.toString(), ' ', pid);
 685     }
 686   }
 687 }