src/jalview/analysis/AAFrequency.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.Annotation;
  27 import jalview.datamodel.SequenceI;
  28 import jalview.util.Format;
  29 import jalview.util.MappingUtils;
  30 import jalview.util.QuickSort;
  31
  32 import java.util.Arrays;
  33 import java.util.Hashtable;
  34 import java.util.List;
  35 import java.util.Set;
  36
  37 /**
  38  * Takes in a vector or array of sequences and column start and column end and
  39  * returns a new Hashtable[] of size maxSeqLength, if Hashtable not supplied.
  40  * This class is used extensively in calculating alignment colourschemes that
  41  * depend on the amount of conservation in each alignment column.
  42  *
  43  * @author $author$
  44  * @version $Revision$
  45  */
  46 public class AAFrequency
  47 {
  48   private static final int TO_UPPER_CASE = 'A' - 'a'; // -32
  49
  50   public static final String MAXCOUNT = "C";
  51
  52   public static final String MAXRESIDUE = "R";
  53
  54   public static final String PID_GAPS = "G";
  55
  56   public static final String PID_NOGAPS = "N";
  57
  58   public static final String PROFILE = "P";
  59
  60   public static final String ENCODED_CHARS = "E";
  61
  62   /*
  63    * Quick look-up of String value of char 'A' to 'Z'
  64    */
  65   private static final String[] CHARS = new String['Z' - 'A' + 1];
  66
  67   static
  68   {
  69     for (char c = 'A'; c <= 'Z'; c++)
  70     {
  71       CHARS[c - 'A'] = String.valueOf(c);
  72     }
  73   }
  74
  75   public static final Hashtable[] calculate(List<SequenceI> list,
  76           int start, int end)
  77   {
  78     return calculate(list, start, end, false);
  79   }
  80
  81   public static final Hashtable[] calculate(List<SequenceI> sequences,
  82           int start, int end, boolean profile)
  83   {
  84     SequenceI[] seqs = new SequenceI[sequences.size()];
  85     int width = 0;
  86     synchronized (sequences)
  87     {
  88       for (int i = 0; i < sequences.size(); i++)
  89       {
  90         seqs[i] = sequences.get(i);
  91         if (seqs[i].getLength() > width)
  92         {
  93           width = seqs[i].getLength();
  94         }
  95       }
  96
  97       Hashtable[] reply = new Hashtable[width];
  98
  99       if (end >= width)
 100       {
 101         end = width;
 102       }
 103
 104       calculate(seqs, start, end, reply, profile);
 105       return reply;
 106     }
 107   }
 108
 109   public static final void calculate(SequenceI[] sequences, int start,
 110           int end, Hashtable[] result, boolean profile)
 111   {
 112     Hashtable residueHash;
 113     int maxCount, nongap, i, j, v;
 114     int jSize = sequences.length;
 115     String maxResidue;
 116     char c = '-';
 117     float percentage;
 118
 119     int[] values = new int[255];
 120
 121     char[] seq;
 122
 123     for (i = start; i < end; i++)
 124     {
 125       residueHash = new Hashtable();
 126       maxCount = 0;
 127       maxResidue = "";
 128       nongap = 0;
 129       values = new int[255];
 130
 131       for (j = 0; j < jSize; j++)
 132       {
 133         if (sequences[j] == null)
 134         {
 135           System.err
 136                   .println("WARNING: Consensus skipping null sequence - possible race condition.");
 137           continue;
 138         }
 139         seq = sequences[j].getSequence();
 140         if (seq.length > i)
 141         {
 142           c = seq[i];
 143
 144           if (c == '.' || c == ' ')
 145           {
 146             c = '-';
 147           }
 148
 149           if (c == '-')
 150           {
 151             values['-']++;
 152             continue;
 153           }
 154           else if ('a' <= c && c <= 'z')
 155           {
 156             c += TO_UPPER_CASE;
 157           }
 158
 159           nongap++;
 160           values[c]++;
 161
 162         }
 163         else
 164         {
 165           values['-']++;
 166         }
 167       }
 168       if (jSize == 1)
 169       {
 170         maxResidue = String.valueOf(c);
 171         maxCount = 1;
 172       }
 173       else
 174       {
 175         for (v = 'A'; v <= 'Z'; v++)
 176         {
 177           // TODO why ignore values[v] == 1?
 178           if (values[v] < 1 /* 2 */|| values[v] < maxCount)
 179           {
 180             continue;
 181           }
 182
 183           if (values[v] > maxCount)
 184           {
 185             maxResidue = CHARS[v - 'A'];
 186           }
 187           else if (values[v] == maxCount)
 188           {
 189             maxResidue += CHARS[v - 'A'];
 190           }
 191           maxCount = values[v];
 192         }
 193       }
 194       if (maxResidue.length() == 0)
 195       {
 196         maxResidue = "-";
 197       }
 198       if (profile)
 199       {
 200         // TODO use a 1-dimensional array with jSize, nongap in [0] and [1]
 201         residueHash.put(PROFILE, new int[][]
 202         { values, new int[]
 203         { jSize, nongap } });
 204       }
 205       residueHash.put(MAXCOUNT, new Integer(maxCount));
 206       residueHash.put(MAXRESIDUE, maxResidue);
 207
 208       percentage = ((float) maxCount * 100) / jSize;
 209       residueHash.put(PID_GAPS, new Float(percentage));
 210
 211       if (nongap > 0)
 212       {
 213         // calculate for non-gapped too
 214         percentage = ((float) maxCount * 100) / nongap;
 215       }
 216       residueHash.put(PID_NOGAPS, new Float(percentage));
 217
 218       result[i] = residueHash;
 219     }
 220   }
 221
 222   /**
 223    * Compute all or part of the annotation row from the given consensus
 224    * hashtable
 225    *
 226    * @param consensus
 227    *          - pre-allocated annotation row
 228    * @param hconsensus
 229    * @param iStart
 230    * @param width
 231    * @param ignoreGapsInConsensusCalculation
 232    * @param includeAllConsSymbols
 233    * @param nseq
 234    */
 235   public static void completeConsensus(AlignmentAnnotation consensus,
 236           Hashtable[] hconsensus, int iStart, int width,
 237           boolean ignoreGapsInConsensusCalculation,
 238           boolean includeAllConsSymbols, long nseq)
 239   {
 240     completeConsensus(consensus, hconsensus, iStart, width,
 241             ignoreGapsInConsensusCalculation, includeAllConsSymbols, null,
 242             nseq);
 243   }
 244
 245   /**
 246    * Derive the consensus annotations to be added to the alignment for display.
 247    * This does not recompute the raw data, but may be called on a change in
 248    * display options, such as 'show logo', which may in turn result in a change
 249    * in the derived values.
 250    *
 251    * @param consensus
 252    *          the annotation row to add annotations to
 253    * @param hconsensus
 254    *          the source consensus data
 255    * @param iStart
 256    *          start column
 257    * @param width
 258    *          end column
 259    * @param ignoreGapsInConsensusCalculation
 260    *          if true, use the consensus calculated ignoring gaps
 261    * @param includeAllConsSymbols
 262    *          if true include all consensus symbols, else just show modal
 263    *          residue
 264    * @param alphabet
 265    * @param nseq
 266    *          number of sequences
 267    */
 268   public static void completeConsensus(AlignmentAnnotation consensus,
 269           Hashtable[] hconsensus, int iStart, int width,
 270           boolean ignoreGapsInConsensusCalculation,
 271           boolean includeAllConsSymbols, char[] alphabet, long nseq)
 272   {
 273     if (consensus == null || consensus.annotations == null
 274             || consensus.annotations.length < width)
 275     {
 276       // called with a bad alignment annotation row - wait for it to be
 277       // initialised properly
 278       return;
 279     }
 280
 281     final Format fmt = getPercentageFormat(nseq);
 282
 283     for (int i = iStart; i < width; i++)
 284     {
 285       Hashtable hci;
 286       if (i >= hconsensus.length || ((hci = hconsensus[i]) == null))
 287       {
 288         // happens if sequences calculated over were shorter than alignment
 289         // width
 290         consensus.annotations[i] = null;
 291         continue;
 292       }
 293       Float fv = (Float) hci
 294               .get(ignoreGapsInConsensusCalculation ? PID_NOGAPS : PID_GAPS);
 295       if (fv == null)
 296       {
 297         consensus.annotations[i] = null;
 298         // data has changed below us .. give up and
 299         continue;
 300       }
 301       float value = fv.floatValue();
 302       String maxRes = hci.get(AAFrequency.MAXRESIDUE).toString();
 303       StringBuilder mouseOver = new StringBuilder(64);
 304       if (maxRes.length() > 1)
 305       {
 306         mouseOver.append("[").append(maxRes).append("] ");
 307         maxRes = "+";
 308       }
 309       else
 310       {
 311         mouseOver.append(hci.get(AAFrequency.MAXRESIDUE) + " ");
 312       }
 313       int[][] profile = (int[][]) hci.get(AAFrequency.PROFILE);
 314       if (profile != null && includeAllConsSymbols)
 315       {
 316         int sequenceCount = profile[1][0];
 317         int nonGappedCount = profile[1][1];
 318         int normalisedBy = ignoreGapsInConsensusCalculation ? nonGappedCount
 319                 : sequenceCount;
 320         mouseOver.setLength(0);
 321         if (alphabet != null)
 322         {
 323           for (int c = 0; c < alphabet.length; c++)
 324           {
 325             float tval = profile[0][alphabet[c]] * 100f / normalisedBy;
 326             mouseOver
 327                     .append(((c == 0) ? "" : "; "))
 328                     .append(alphabet[c])
 329                     .append(" ")
 330                     .append(((fmt != null) ? fmt.form(tval) : ((int) tval)))
 331                     .append("%");
 332           }
 333         }
 334         else
 335         {
 336           // TODO do this sort once only in calculate()?
 337           // char[][] ca = new char[profile[0].length][];
 338           char[] ca = new char[profile[0].length];
 339           float[] vl = new float[profile[0].length];
 340           for (int c = 0; c < ca.length; c++)
 341           {
 342             ca[c] = (char) c;
 343             // ca[c] = new char[]
 344             // { (char) c };
 345             vl[c] = profile[0][c];
 346           }
 347           QuickSort.sort(vl, ca);
 348           for (int p = 0, c = ca.length - 1; profile[0][ca[c]] > 0; c--)
 349           {
 350             final char residue = ca[c];
 351             if (residue != '-')
 352             {
 353               float tval = profile[0][residue] * 100f / normalisedBy;
 354               mouseOver
 355                       .append((((p == 0) ? "" : "; ")))
 356                       .append(residue)
 357                       .append(" ")
 358                       .append(((fmt != null) ? fmt.form(tval)
 359                               : ((int) tval))).append("%");
 360               p++;
 361             }
 362           }
 363         }
 364       }
 365       else
 366       {
 367         mouseOver.append(
 368                 (((fmt != null) ? fmt.form(value) : ((int) value))))
 369                 .append("%");
 370       }
 371       consensus.annotations[i] = new Annotation(maxRes,
 372               mouseOver.toString(), ' ',
 373               value);
 374     }
 375   }
 376
 377   /**
 378    * Returns a Format designed to show all significant figures for profile
 379    * percentages. For less than 100 sequences, returns null (the integer
 380    * percentage value will be displayed). For 100-999 sequences, returns "%3.1f"
 381    *
 382    * @param nseq
 383    * @return
 384    */
 385   protected static Format getPercentageFormat(long nseq)
 386   {
 387     int scale = 0;
 388     while (nseq >= 10)
 389     {
 390       scale++;
 391       nseq /= 10;
 392     }
 393     return scale <= 1 ? null : new Format("%3." + (scale - 1) + "f");
 394   }
 395
 396   /**
 397    * Returns the sorted profile for the given consensus data. The returned array
 398    * contains
 399    *
 400    * <pre>
 401    *    [profileType, numberOfValues, nonGapCount, charValue1, percentage1, charValue2, percentage2, ...]
 402    * in descending order of percentage value
 403    * </pre>
 404    *
 405    * @param hconsensus
 406    *          the data table from which to extract and sort values
 407    * @param ignoreGaps
 408    *          if true, only non-gapped values are included in percentage
 409    *          calculations
 410    * @return
 411    */
 412   public static int[] extractProfile(Hashtable hconsensus,
 413           boolean ignoreGaps)
 414   {
 415     int[] rtnval = new int[64];
 416     int[][] profile = (int[][]) hconsensus.get(AAFrequency.PROFILE);
 417     if (profile == null)
 418     {
 419       return null;
 420     }
 421     char[] ca = new char[profile[0].length];
 422     float[] vl = new float[profile[0].length];
 423     for (int c = 0; c < ca.length; c++)
 424     {
 425       ca[c] = (char) c;
 426       vl[c] = profile[0][c];
 427     }
 428     QuickSort.sort(vl, ca);
 429     int nextArrayPos = 2;
 430     int totalPercentage = 0;
 431     int distinctValuesCount = 0;
 432     final int divisor = profile[1][ignoreGaps ? 1 : 0];
 433     for (int c = ca.length - 1; profile[0][ca[c]] > 0; c--)
 434     {
 435       if (ca[c] != '-')
 436       {
 437         rtnval[nextArrayPos++] = ca[c];
 438         final int percentage = (int) (profile[0][ca[c]] * 100f / divisor);
 439         rtnval[nextArrayPos++] = percentage;
 440         totalPercentage += percentage;
 441         distinctValuesCount++;
 442       }
 443     }
 444     rtnval[0] = distinctValuesCount;
 445     rtnval[1] = totalPercentage;
 446     int[] result = new int[rtnval.length + 1];
 447     result[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
 448     System.arraycopy(rtnval, 0, result, 1, rtnval.length);
 449
 450     return result;
 451   }
 452
 453   /**
 454    * Extract a sorted extract of cDNA codon profile data. The returned array
 455    * contains
 456    *
 457    * <pre>
 458    *    [profileType, numberOfValues, totalCount, charValue1, percentage1, charValue2, percentage2, ...]
 459    * in descending order of percentage value, where the character values encode codon triplets
 460    * </pre>
 461    *
 462    * @param hashtable
 463    * @return
 464    */
 465   public static int[] extractCdnaProfile(Hashtable hashtable, boolean ignoreGaps)
 466   {
 467     // this holds #seqs, #ungapped, and then codon count, indexed by encoded
 468     // codon triplet
 469     int[] codonCounts = (int[]) hashtable.get(PROFILE);
 470     int[] sortedCounts = new int[codonCounts.length - 2];
 471     System.arraycopy(codonCounts, 2, sortedCounts, 0,
 472             codonCounts.length - 2);
 473
 474     int[] result = new int[3 + 2 * sortedCounts.length];
 475     // first value is just the type of profile data
 476     result[0] = AlignmentAnnotation.CDNA_PROFILE;
 477
 478     char[] codons = new char[sortedCounts.length];
 479     for (int i = 0; i < codons.length; i++)
 480     {
 481       codons[i] = (char) i;
 482     }
 483     QuickSort.sort(sortedCounts, codons);
 484     int totalPercentage = 0;
 485     int distinctValuesCount = 0;
 486     int j = 3;
 487     int divisor = ignoreGaps ? codonCounts[1] : codonCounts[0];
 488     for (int i = codons.length - 1; i >= 0; i--)
 489     {
 490       final int codonCount = sortedCounts[i];
 491       if (codonCount == 0)
 492       {
 493         break; // nothing else of interest here
 494       }
 495       distinctValuesCount++;
 496       result[j++] = codons[i];
 497       final int percentage = codonCount * 100 / divisor;
 498       result[j++] = percentage;
 499       totalPercentage += percentage;
 500     }
 501     result[2] = totalPercentage;
 502
 503     /*
 504      * Just return the non-zero values
 505      */
 506     // todo next value is redundant if we limit the array to non-zero counts
 507     result[1] = distinctValuesCount;
 508     return Arrays.copyOfRange(result, 0, j);
 509   }
 510
 511   /**
 512    * Compute a consensus for the cDNA coding for a protein alignment.
 513    *
 514    * @param alignment
 515    *          the protein alignment (which should hold mappings to cDNA
 516    *          sequences)
 517    * @param hconsensus
 518    *          the consensus data stores to be populated (one per column)
 519    */
 520   public static void calculateCdna(AlignmentI alignment,
 521           Hashtable[] hconsensus)
 522   {
 523     final char gapCharacter = alignment.getGapCharacter();
 524     Set<AlignedCodonFrame> mappings = alignment.getCodonFrames();
 525     if (mappings == null || mappings.isEmpty())
 526     {
 527       return;
 528     }
 529
 530     int cols = alignment.getWidth();
 531     for (int col = 0; col < cols; col++)
 532     {
 533       // todo would prefer a Java bean for consensus data
 534       Hashtable<String, int[]> columnHash = new Hashtable<String, int[]>();
 535       // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1)
 536       int[] codonCounts = new int[66];
 537       codonCounts[0] = alignment.getSequences().size();
 538       int ungappedCount = 0;
 539       for (SequenceI seq : alignment.getSequences())
 540       {
 541         if (seq.getCharAt(col) == gapCharacter)
 542         {
 543           continue;
 544         }
 545         char[] codon = MappingUtils.findCodonFor(seq, col, mappings);
 546         int codonEncoded = CodingUtils.encodeCodon(codon);
 547         if (codonEncoded >= 0)
 548         {
 549           codonCounts[codonEncoded + 2]++;
 550           ungappedCount++;
 551         }
 552       }
 553       codonCounts[1] = ungappedCount;
 554       // todo: sort values here, save counts and codons?
 555       columnHash.put(PROFILE, codonCounts);
 556       hconsensus[col] = columnHash;
 557     }
 558   }
 559
 560   /**
 561    * Derive displayable cDNA consensus annotation from computed consensus data.
 562    *
 563    * @param consensusAnnotation
 564    *          the annotation row to be populated for display
 565    * @param consensusData
 566    *          the computed consensus data
 567    * @param showProfileLogo
 568    *          if true show all symbols present at each position, else only the
 569    *          modal value
 570    * @param nseqs
 571    *          the number of sequences in the alignment
 572    */
 573   public static void completeCdnaConsensus(
 574           AlignmentAnnotation consensusAnnotation,
 575           Hashtable[] consensusData, boolean showProfileLogo, int nseqs)
 576   {
 577     if (consensusAnnotation == null
 578             || consensusAnnotation.annotations == null
 579             || consensusAnnotation.annotations.length < consensusData.length)
 580     {
 581       // called with a bad alignment annotation row - wait for it to be
 582       // initialised properly
 583       return;
 584     }
 585
 586     // ensure codon triplet scales with font size
 587     consensusAnnotation.scaleColLabel = true;
 588     for (int col = 0; col < consensusData.length; col++)
 589     {
 590       Hashtable hci = consensusData[col];
 591       if (hci == null)
 592       {
 593         // gapped protein column?
 594         continue;
 595       }
 596       // array holds #seqs, #ungapped, then codon counts indexed by codon
 597       final int[] codonCounts = (int[]) hci.get(PROFILE);
 598       int totalCount = 0;
 599       StringBuilder mouseOver = new StringBuilder(32);
 600
 601       /*
 602        * First pass - get total count and find the highest
 603        */
 604       final char[] codons = new char[codonCounts.length - 2];
 605       for (int j = 2; j < codonCounts.length; j++)
 606       {
 607         final int codonCount = codonCounts[j];
 608         codons[j - 2] = (char) (j - 2);
 609         totalCount += codonCount;
 610       }
 611
 612       /*
 613        * Sort array of encoded codons by count ascending - so the modal value
 614        * goes to the end; start by copying the count (dropping the first value)
 615        */
 616       int[] sortedCodonCounts = new int[codonCounts.length - 2];
 617       System.arraycopy(codonCounts, 2, sortedCodonCounts, 0,
 618               codonCounts.length - 2);
 619       QuickSort.sort(sortedCodonCounts, codons);
 620
 621       int modalCodonEncoded = codons[codons.length - 1];
 622       int modalCodonCount = sortedCodonCounts[codons.length - 1];
 623       String modalCodon = String.valueOf(CodingUtils
 624               .decodeCodon(modalCodonEncoded));
 625       if (sortedCodonCounts.length > 1
 626               && sortedCodonCounts[codons.length - 2] == modalCodonEncoded)
 627       {
 628         modalCodon = "+";
 629       }
 630       float pid = sortedCodonCounts[sortedCodonCounts.length - 1] * 100
 631               / (float) totalCount;
 632
 633       /*
 634        * todo ? Replace consensus hashtable with sorted arrays of codons and
 635        * counts (non-zero only). Include total count in count array [0].
 636        */
 637
 638       /*
 639        * Scan sorted array backwards for most frequent values first.
 640        */
 641       for (int j = codons.length - 1; j >= 0; j--)
 642       {
 643         int codonCount = sortedCodonCounts[j];
 644         if (codonCount == 0)
 645         {
 646           break;
 647         }
 648         int codonEncoded = codons[j];
 649         final int pct = codonCount * 100 / totalCount;
 650         String codon = String
 651                 .valueOf(CodingUtils.decodeCodon(codonEncoded));
 652         Format fmt = getPercentageFormat(nseqs);
 653         String formatted = fmt == null ? Integer.toString(pct) : fmt
 654                 .form(pct);
 655         if (showProfileLogo || codonCount == modalCodonCount)
 656         {
 657           mouseOver.append(codon).append(": ").append(formatted)
 658                   .append("% ");
 659         }
 660       }
 661
 662       consensusAnnotation.annotations[col] = new Annotation(modalCodon,
 663               mouseOver.toString(), ' ', pid);
 664     }
 665   }
 666 }