src/jalview/datamodel/ResidueCount.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.datamodel;
  22
  23 import jalview.util.Comparison;
  24 import jalview.util.Format;
  25 import jalview.util.QuickSort;
  26 import jalview.util.SparseCount;
  27
  28 import java.util.List;
  29
  30 /**
  31  * A class to count occurrences of residues in a profile, optimised for speed
  32  * and memory footprint.
  33  *
  34  * @author gmcarstairs
  35  *
  36  */
  37 public class ResidueCount
  38 {
  39   /**
  40    * A data bean to hold the results of counting symbols
  41    */
  42   public class SymbolCounts
  43   {
  44     /**
  45      * the symbols seen (as char values), in no particular order
  46      */
  47     public final char[] symbols;
  48
  49     /**
  50      * the counts for each symbol, in the same order as the symbols
  51      */
  52     public final int[] values;
  53
  54     SymbolCounts(char[] s, int[] v)
  55     {
  56       symbols = s;
  57       values = v;
  58     }
  59   }
  60
  61   private static final int TOUPPERCASE = 'A' - 'a';
  62
  63   /*
  64    * nucleotide symbols to count (including N unknown)
  65    */
  66   private static final String NUCS = "ACGNTU";
  67
  68   /*
  69    * amino acid symbols to count (including X unknown)
  70    * NB we also include U so as to support counting of RNA bases
  71    * in the "don't know" case of nucleotide / peptide
  72    */
  73   private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
  74
  75   private static final int GAP_COUNT = 0;
  76
  77   /*
  78    * fast lookup tables holding the index into our count
  79    * arrays of each symbol; index 0 is reserved for gap counting
  80    */
  81   private static int[] NUC_INDEX = new int[26];
  82
  83   private static int[] AA_INDEX = new int[26];
  84   static
  85   {
  86     for (int i = 0; i < NUCS.length(); i++)
  87     {
  88       NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
  89     }
  90     for (int i = 0; i < AAS.length(); i++)
  91     {
  92       AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
  93     }
  94   }
  95
  96   /*
  97    * counts array, just big enough for the nucleotide or peptide
  98    * character set (plus gap counts in position 0)
  99    */
 100   private short[] counts;
 101
 102   /*
 103    * alternative array of int counts for use if any count
 104    * exceeds the maximum value of short (32767)
 105    */
 106   private int[] intCounts;
 107
 108   /*
 109    * flag set if we switch from short to int counts
 110    */
 111   private boolean useIntCounts;
 112
 113   /*
 114    * general-purpose counter, only for use for characters
 115    * that are not in the expected alphabet
 116    */
 117   private SparseCount otherData;
 118
 119   /*
 120    * keeps track of the maximum count value recorded
 121    * (if this class ever allows decrements, would need to
 122    * calculate this on request instead)
 123    */
 124   int maxCount;
 125
 126   /*
 127    * if we think we are counting nucleotide, can get by with smaller
 128    * array to hold counts
 129    */
 130   private boolean isNucleotide;
 131
 132   /**
 133    * Default constructor allocates arrays able to count either nucleotide or
 134    * peptide bases. Use this constructor if not sure which the data is.
 135    */
 136   public ResidueCount()
 137   {
 138     this(false);
 139   }
 140
 141   /**
 142    * Constructor that allocates an array just big enough for the anticipated
 143    * characters, plus one position to count gaps
 144    */
 145   public ResidueCount(boolean nucleotide)
 146   {
 147     isNucleotide = nucleotide;
 148     int charsToCount = nucleotide ? NUCS.length() : AAS.length();
 149     counts = new short[charsToCount + 1];
 150   }
 151
 152   /**
 153    * A constructor that counts frequency of all symbols (including gaps) in the
 154    * sequences (not case-sensitive)
 155    *
 156    * @param sequences
 157    */
 158   public ResidueCount(List<SequenceI> sequences)
 159   {
 160     this();
 161     for (SequenceI seq : sequences)
 162     {
 163       for (int i = 0; i < seq.getLength(); i++)
 164       {
 165         add(seq.getCharAt(i));
 166       }
 167     }
 168   }
 169
 170   /**
 171    * Increments the count for the given character. The supplied character may be
 172    * upper or lower case but counts are for the upper case only. Gap characters
 173    * (space, ., -) are all counted together.
 174    *
 175    * @param c
 176    * @return the new value of the count for the character
 177    */
 178   public int add(final char c)
 179   {
 180     char u = toUpperCase(c);
 181     int newValue = 0;
 182     int offset = getOffset(u);
 183
 184     /*
 185      * offset 0 is reserved for gap counting, so 0 here means either
 186      * an unexpected character, or a gap character passed in error
 187      */
 188     if (offset == 0)
 189     {
 190       if (Comparison.isGap(u))
 191       {
 192         newValue = addGap();
 193       }
 194       else
 195       {
 196         newValue = addOtherCharacter(u);
 197       }
 198     }
 199     else
 200     {
 201       newValue = increment(offset);
 202     }
 203     return newValue;
 204   }
 205
 206   /**
 207    * Increment the count at the specified offset. If this would result in short
 208    * overflow, promote to counting int values instead.
 209    *
 210    * @param offset
 211    * @return the new value of the count at this offset
 212    */
 213   int increment(int offset)
 214   {
 215     int newValue = 0;
 216     if (useIntCounts)
 217     {
 218       newValue = intCounts[offset];
 219       intCounts[offset] = ++newValue;
 220     }
 221     else
 222     {
 223       if (counts[offset] == Short.MAX_VALUE)
 224       {
 225         handleOverflow();
 226         newValue = intCounts[offset];
 227         intCounts[offset] = ++newValue;
 228       }
 229       else
 230       {
 231         newValue = counts[offset];
 232         counts[offset] = (short) ++newValue;
 233       }
 234     }
 235     maxCount = Math.max(maxCount, newValue);
 236     return newValue;
 237   }
 238
 239   /**
 240    * Switch from counting in short to counting in int
 241    */
 242   synchronized void handleOverflow()
 243   {
 244     intCounts = new int[counts.length];
 245     for (int i = 0; i < counts.length; i++)
 246     {
 247       intCounts[i] = counts[i];
 248     }
 249     counts = null;
 250     useIntCounts = true;
 251   }
 252
 253   /**
 254    * Returns this character's offset in the count array
 255    *
 256    * @param c
 257    * @return
 258    */
 259   int getOffset(char c)
 260   {
 261     int offset = 0;
 262     if ('A' <= c && c <= 'Z')
 263     {
 264       offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
 265     }
 266     return offset;
 267   }
 268
 269   /**
 270    * @param c
 271    * @return
 272    */
 273   protected char toUpperCase(final char c)
 274   {
 275     char u = c;
 276     if ('a' <= c && c <= 'z')
 277     {
 278       u = (char) (c + TOUPPERCASE);
 279     }
 280     return u;
 281   }
 282
 283   /**
 284    * Increment count for some unanticipated character. The first time this
 285    * called, a SparseCount is instantiated to hold these 'extra' counts.
 286    *
 287    * @param c
 288    * @return the new value of the count for the character
 289    */
 290   int addOtherCharacter(char c)
 291   {
 292     if (otherData == null)
 293     {
 294       otherData = new SparseCount();
 295     }
 296     int newValue = otherData.add(c, 1);
 297     maxCount = Math.max(maxCount, newValue);
 298     return newValue;
 299   }
 300
 301   /**
 302    * Set count for some unanticipated character. The first time this called, a
 303    * SparseCount is instantiated to hold these 'extra' counts.
 304    *
 305    * @param c
 306    * @param value
 307    */
 308   void setOtherCharacter(char c, int value)
 309   {
 310     if (otherData == null)
 311     {
 312       otherData = new SparseCount();
 313     }
 314     otherData.put(c, value);
 315   }
 316
 317   /**
 318    * Increment count of gap characters
 319    *
 320    * @return the new count of gaps
 321    */
 322   public int addGap()
 323   {
 324     int newValue;
 325     if (useIntCounts)
 326     {
 327       newValue = ++intCounts[GAP_COUNT];
 328     }
 329     else
 330     {
 331       newValue = ++counts[GAP_COUNT];
 332     }
 333     return newValue;
 334   }
 335
 336   /**
 337    * Answers true if we are counting ints (only after overflow of short counts)
 338    *
 339    * @return
 340    */
 341   boolean isCountingInts()
 342   {
 343     return useIntCounts;
 344   }
 345
 346   /**
 347    * Sets the count for the given character. The supplied character may be upper
 348    * or lower case but counts are for the upper case only.
 349    *
 350    * @param c
 351    * @param count
 352    */
 353   public void put(char c, int count)
 354   {
 355     char u = toUpperCase(c);
 356     int offset = getOffset(u);
 357
 358     /*
 359      * offset 0 is reserved for gap counting, so 0 here means either
 360      * an unexpected character, or a gap character passed in error
 361      */
 362     if (offset == 0)
 363     {
 364       if (Comparison.isGap(u))
 365       {
 366         set(0, count);
 367       }
 368       else
 369       {
 370         setOtherCharacter(u, count);
 371         maxCount = Math.max(maxCount, count);
 372       }
 373     }
 374     else
 375     {
 376       set(offset, count);
 377       maxCount = Math.max(maxCount, count);
 378     }
 379   }
 380
 381   /**
 382    * Sets the count at the specified offset. If this would result in short
 383    * overflow, promote to counting int values instead.
 384    *
 385    * @param offset
 386    * @param value
 387    */
 388   void set(int offset, int value)
 389   {
 390     if (useIntCounts)
 391     {
 392       intCounts[offset] = value;
 393     }
 394     else
 395     {
 396       if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
 397       {
 398         handleOverflow();
 399         intCounts[offset] = value;
 400       }
 401       else
 402       {
 403         counts[offset] = (short) value;
 404       }
 405     }
 406   }
 407
 408   /**
 409    * Returns the count for the given character, or zero if no count held
 410    *
 411    * @param c
 412    * @return
 413    */
 414   public int getCount(char c)
 415   {
 416     char u = toUpperCase(c);
 417     int offset = getOffset(u);
 418     if (offset == 0)
 419     {
 420       if (!Comparison.isGap(u))
 421       {
 422         // should have called getGapCount()
 423         return otherData == null ? 0 : otherData.get(u);
 424       }
 425     }
 426     return useIntCounts ? intCounts[offset] : counts[offset];
 427   }
 428
 429   public int getGapCount()
 430   {
 431     return useIntCounts ? intCounts[0] : counts[0];
 432   }
 433
 434   /**
 435    * Answers true if this object wraps a counter for unexpected characters
 436    *
 437    * @return
 438    */
 439   boolean isUsingOtherData()
 440   {
 441     return otherData != null;
 442   }
 443
 444   /**
 445    * Returns the character (or concatenated characters) for the symbol(s) with
 446    * the given count in the profile. Can be used to get the modal residue by
 447    * supplying the modal count value. Returns an empty string if no symbol has
 448    * the given count. The symbols are in alphabetic order of standard peptide or
 449    * nucleotide characters, followed by 'other' symbols if any.
 450    *
 451    * @return
 452    */
 453   public String getResiduesForCount(int count)
 454   {
 455     if (count == 0)
 456     {
 457       return "";
 458     }
 459
 460     /*
 461      * find counts for the given value and append the
 462      * corresponding symbol
 463      */
 464     StringBuilder modal = new StringBuilder();
 465     if (useIntCounts)
 466     {
 467       for (int i = 1; i < intCounts.length; i++)
 468       {
 469         if (intCounts[i] == count)
 470         {
 471           modal.append(
 472                   isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
 473         }
 474       }
 475     }
 476     else
 477     {
 478       for (int i = 1; i < counts.length; i++)
 479       {
 480         if (counts[i] == count)
 481         {
 482           modal.append(
 483                   isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
 484         }
 485       }
 486     }
 487     if (otherData != null)
 488     {
 489       for (int i = 0; i < otherData.size(); i++)
 490       {
 491         if (otherData.valueAt(i) == count)
 492         {
 493           modal.append((char) otherData.keyAt(i));
 494         }
 495       }
 496     }
 497     return modal.toString();
 498   }
 499
 500   /**
 501    * Returns the highest count for any symbol(s) in the profile (excluding gap)
 502    *
 503    * @return
 504    */
 505   public int getModalCount()
 506   {
 507     return maxCount;
 508   }
 509
 510   /**
 511    * Returns the number of distinct symbols with a non-zero count (excluding the
 512    * gap symbol)
 513    *
 514    * @return
 515    */
 516   public int size()
 517   {
 518     int size = 0;
 519     if (useIntCounts)
 520     {
 521       for (int i = 1; i < intCounts.length; i++)
 522       {
 523         if (intCounts[i] > 0)
 524         {
 525           size++;
 526         }
 527       }
 528     }
 529     else
 530     {
 531       for (int i = 1; i < counts.length; i++)
 532       {
 533         if (counts[i] > 0)
 534         {
 535           size++;
 536         }
 537       }
 538     }
 539
 540     /*
 541      * include 'other' characters recorded (even if count is zero
 542      * though that would be a strange use case)
 543      */
 544     if (otherData != null)
 545     {
 546       size += otherData.size();
 547     }
 548
 549     return size;
 550   }
 551
 552   /**
 553    * Returns a data bean holding those symbols that have a non-zero count
 554    * (excluding the gap symbol), with their counts.
 555    *
 556    * @return
 557    */
 558   public SymbolCounts getSymbolCounts()
 559   {
 560     int size = size();
 561     char[] symbols = new char[size];
 562     int[] values = new int[size];
 563     int j = 0;
 564
 565     if (useIntCounts)
 566     {
 567       for (int i = 1; i < intCounts.length; i++)
 568       {
 569         if (intCounts[i] > 0)
 570         {
 571           char symbol = isNucleotide ? NUCS.charAt(i - 1)
 572                   : AAS.charAt(i - 1);
 573           symbols[j] = symbol;
 574           values[j] = intCounts[i];
 575           j++;
 576         }
 577       }
 578     }
 579     else
 580     {
 581       for (int i = 1; i < counts.length; i++)
 582       {
 583         if (counts[i] > 0)
 584         {
 585           char symbol = isNucleotide ? NUCS.charAt(i - 1)
 586                   : AAS.charAt(i - 1);
 587           symbols[j] = symbol;
 588           values[j] = counts[i];
 589           j++;
 590         }
 591       }
 592     }
 593     if (otherData != null)
 594     {
 595       for (int i = 0; i < otherData.size(); i++)
 596       {
 597         symbols[j] = (char) otherData.keyAt(i);
 598         values[j] = otherData.valueAt(i);
 599         j++;
 600       }
 601     }
 602
 603     return new SymbolCounts(symbols, values);
 604   }
 605
 606   /**
 607    * Returns a tooltip string showing residues in descending order of their
 608    * percentage frequency in the profile
 609    *
 610    * @param normaliseBy
 611    *          the divisor for residue counts (may or may not include gapped
 612    *          sequence count)
 613    * @param percentageDecPl
 614    *          the number of decimal places to show in percentages
 615    * @return
 616    */
 617   public String getTooltip(int normaliseBy, int percentageDecPl)
 618   {
 619     SymbolCounts symbolCounts = getSymbolCounts();
 620     char[] ca = symbolCounts.symbols;
 621     int[] vl = symbolCounts.values;
 622
 623     /*
 624      * sort characters into ascending order of their counts
 625      */
 626     QuickSort.sort(vl, ca);
 627
 628     /*
 629      * traverse in reverse order (highest count first) to build tooltip
 630      */
 631     boolean first = true;
 632     StringBuilder sb = new StringBuilder(64);
 633     for (int c = ca.length - 1; c >= 0; c--)
 634     {
 635       final char residue = ca[c];
 636       // TODO combine residues which share a percentage
 637       // (see AAFrequency.completeCdnaConsensus)
 638       float tval = (vl[c] * 100f) / normaliseBy;
 639       sb.append(first ? "" : "; ").append(residue).append(" ");
 640       Format.appendPercentage(sb, tval, percentageDecPl);
 641       sb.append("%");
 642       first = false;
 643     }
 644     return sb.toString();
 645   }
 646
 647   /**
 648    * Returns a string representation of the symbol counts, for debug purposes.
 649    */
 650   @Override
 651   public String toString()
 652   {
 653     StringBuilder sb = new StringBuilder();
 654     sb.append("[ ");
 655     SymbolCounts sc = getSymbolCounts();
 656     for (int i = 0; i < sc.symbols.length; i++)
 657     {
 658       sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
 659     }
 660     sb.append("]");
 661     return sb.toString();
 662   }
 663
 664   /**
 665    * Answers the total count for all symbols (excluding gaps)
 666    *
 667    * @return
 668    */
 669   public int getTotalResidueCount()
 670   {
 671     int total = 0;
 672     for (char symbol : this.getSymbolCounts().symbols)
 673     {
 674       total += getCount(symbol);
 675     }
 676     return total;
 677   }
 678 }