src/jalview/datamodel/ResidueCount.java

   1 package jalview.datamodel;
   2
   3 import jalview.util.Comparison;
   4 import jalview.util.Format;
   5 import jalview.util.QuickSort;
   6 import jalview.util.SparseCount;
   7
   8 /**
   9  * A class to count occurrences of residues in a profile, optimised for speed
  10  * and memory footprint.
  11  * @author gmcarstairs
  12  *
  13  */
  14 public class ResidueCount
  15 {
  16   /**
  17    * A data bean to hold the results of counting symbols
  18    */
  19   public class SymbolCounts
  20   {
  21     /**
  22      * the symbols seen (as char values), in no particular order
  23      */
  24     public final char[] symbols;
  25
  26     /**
  27      * the counts for each symbol, in the same order as the symbols
  28      */
  29     public final int[] values;
  30
  31     SymbolCounts(char[] s, int[] v)
  32     {
  33       symbols = s;
  34       values = v;
  35     }
  36   }
  37
  38   private static final int TOUPPERCASE = 'A' - 'a';
  39
  40   /*
  41    * nucleotide symbols to count (including N unknown)
  42    */
  43   private static final String NUCS = "ACGNTU";
  44
  45   /*
  46    * amino acid symbols to count (including X unknown)
  47    * NB we also include U so as to support counting of RNA bases
  48    * in the "don't know" case of nucleotide / peptide
  49    */
  50   private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
  51
  52   private static final int GAP_COUNT = 0;
  53
  54   /*
  55    * fast lookup tables holding the index into our count
  56    * arrays of each symbol; index 0 is reserved for gap counting
  57    */
  58   private static int[] NUC_INDEX = new int[26];
  59
  60   private static int[] AA_INDEX = new int[26];
  61   static
  62   {
  63     for (int i = 0; i < NUCS.length(); i++)
  64     {
  65       NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
  66     }
  67     for (int i = 0; i < AAS.length(); i++)
  68     {
  69       AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
  70     }
  71   }
  72
  73   /*
  74    * counts array, just big enough for the nucleotide or peptide
  75    * character set (plus gap counts in position 0)
  76    */
  77   private short[] counts;
  78
  79   /*
  80    * alternative array of int counts for use if any count
  81    * exceeds the maximum value of short (32767)
  82    */
  83   private int[] intCounts;
  84
  85   /*
  86    * flag set if we switch from short to int counts
  87    */
  88   private boolean useIntCounts;
  89
  90   /*
  91    * general-purpose counter, only for use for characters
  92    * that are not in the expected alphabet
  93    */
  94   private SparseCount otherData;
  95
  96   /*
  97    * keeps track of the maximum count value recorded
  98    * (if this class ever allows decrements, would need to
  99    * calculate this on request instead)
 100    */
 101   int maxCount;
 102
 103   /*
 104    * if we think we are counting nucleotide, can get by with smaller
 105    * array to hold counts
 106    */
 107   private boolean isNucleotide;
 108
 109   /**
 110    * Default constructor allocates arrays able to count either nucleotide or
 111    * peptide bases. Use this constructor if not sure which the data is.
 112    */
 113   public ResidueCount()
 114   {
 115     this(false);
 116   }
 117
 118   /**
 119    * Constructor that allocates an array just big enough for the anticipated
 120    * characters, plus one position to count gaps
 121    */
 122   public ResidueCount(boolean nucleotide)
 123   {
 124     isNucleotide = nucleotide;
 125     int charsToCount = nucleotide ? NUCS.length() : AAS.length();
 126     counts = new short[charsToCount + 1];
 127   }
 128
 129   /**
 130    * Increments the count for the given character. The supplied character may be
 131    * upper or lower case but counts are for the upper case only. Gap characters
 132    * (space, ., -) are all counted together.
 133    *
 134    * @param c
 135    * @return the new value of the count for the character
 136    */
 137   public int add(final char c)
 138   {
 139     char u = toUpperCase(c);
 140     int newValue = 0;
 141     int offset = getOffset(u);
 142
 143     /*
 144      * offset 0 is reserved for gap counting, so 0 here means either
 145      * an unexpected character, or a gap character passed in error
 146      */
 147     if (offset == 0)
 148     {
 149       if (Comparison.isGap(u))
 150       {
 151         newValue = addGap();
 152       }
 153       else
 154       {
 155         newValue = addOtherCharacter(u);
 156       }
 157     }
 158     else
 159     {
 160       newValue = increment(offset);
 161     }
 162     return newValue;
 163   }
 164
 165   /**
 166    * Increment the count at the specified offset. If this would result in short
 167    * overflow, promote to counting int values instead.
 168    *
 169    * @param offset
 170    * @return the new value of the count at this offset
 171    */
 172   int increment(int offset)
 173   {
 174     int newValue = 0;
 175     if (useIntCounts)
 176     {
 177       newValue = intCounts[offset];
 178       intCounts[offset] = ++newValue;
 179     }
 180     else
 181     {
 182       if (counts[offset] == Short.MAX_VALUE)
 183       {
 184         handleOverflow();
 185         newValue = intCounts[offset];
 186         intCounts[offset] = ++newValue;
 187       }
 188       else
 189       {
 190         newValue = counts[offset];
 191         counts[offset] = (short) ++newValue;
 192       }
 193     }
 194     maxCount = Math.max(maxCount, newValue);
 195     return newValue;
 196   }
 197
 198   /**
 199    * Switch from counting in short to counting in int
 200    */
 201   synchronized void handleOverflow()
 202   {
 203     intCounts = new int[counts.length];
 204     for (int i = 0; i < counts.length; i++)
 205     {
 206       intCounts[i] = counts[i];
 207     }
 208     counts = null;
 209     useIntCounts = true;
 210   }
 211
 212   /**
 213    * Returns this character's offset in the count array
 214    *
 215    * @param c
 216    * @return
 217    */
 218   int getOffset(char c)
 219   {
 220     int offset = 0;
 221     if ('A' <= c && c <= 'Z')
 222     {
 223       offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
 224     }
 225     return offset;
 226   }
 227
 228   /**
 229    * @param c
 230    * @return
 231    */
 232   protected char toUpperCase(final char c)
 233   {
 234     char u = c;
 235     if ('a' <= c && c <= 'z')
 236     {
 237       u = (char) (c + TOUPPERCASE);
 238     }
 239     return u;
 240   }
 241
 242   /**
 243    * Increment count for some unanticipated character. The first time this
 244    * called, a SparseCount is instantiated to hold these 'extra' counts.
 245    *
 246    * @param c
 247    * @return the new value of the count for the character
 248    */
 249   int addOtherCharacter(char c)
 250   {
 251     if (otherData == null)
 252     {
 253       otherData = new SparseCount();
 254     }
 255     int newValue = otherData.add(c, 1);
 256     maxCount = Math.max(maxCount, newValue);
 257     return newValue;
 258   }
 259
 260   /**
 261    * Set count for some unanticipated character. The first time this called, a
 262    * SparseCount is instantiated to hold these 'extra' counts.
 263    *
 264    * @param c
 265    * @param value
 266    */
 267   void setOtherCharacter(char c, int value)
 268   {
 269     if (otherData == null)
 270     {
 271       otherData = new SparseCount();
 272     }
 273     otherData.put(c, value);
 274   }
 275
 276   /**
 277    * Increment count of gap characters
 278    *
 279    * @return the new count of gaps
 280    */
 281   public int addGap()
 282   {
 283     int newValue;
 284     if (useIntCounts)
 285     {
 286       newValue = ++intCounts[GAP_COUNT];
 287     }
 288     else
 289     {
 290       newValue = ++counts[GAP_COUNT];
 291     }
 292     return newValue;
 293   }
 294
 295   /**
 296    * Answers true if we are counting ints (only after overflow of short counts)
 297    *
 298    * @return
 299    */
 300   boolean isCountingInts()
 301   {
 302     return useIntCounts;
 303   }
 304
 305   /**
 306    * Sets the count for the given character. The supplied character may be upper
 307    * or lower case but counts are for the upper case only.
 308    *
 309    * @param c
 310    * @param count
 311    */
 312   public void put(char c, int count)
 313   {
 314     char u = toUpperCase(c);
 315     int offset = getOffset(u);
 316
 317     /*
 318      * offset 0 is reserved for gap counting, so 0 here means either
 319      * an unexpected character, or a gap character passed in error
 320      */
 321     if (offset == 0)
 322     {
 323       if (Comparison.isGap(u))
 324       {
 325         set(0, count);
 326       }
 327       else
 328       {
 329         setOtherCharacter(u, count);
 330         maxCount = Math.max(maxCount, count);
 331       }
 332     }
 333     else
 334     {
 335       set(offset, count);
 336       maxCount = Math.max(maxCount, count);
 337     }
 338   }
 339
 340   /**
 341    * Sets the count at the specified offset. If this would result in short
 342    * overflow, promote to counting int values instead.
 343    *
 344    * @param offset
 345    * @param value
 346    */
 347   void set(int offset, int value)
 348   {
 349     if (useIntCounts)
 350     {
 351       intCounts[offset] = value;
 352     }
 353     else
 354     {
 355       if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
 356       {
 357         handleOverflow();
 358         intCounts[offset] = value;
 359       }
 360       else
 361       {
 362         counts[offset] = (short) value;
 363       }
 364     }
 365   }
 366
 367   /**
 368    * Returns the count for the given character, or zero if no count held
 369    *
 370    * @param c
 371    * @return
 372    */
 373   public int getCount(char c)
 374   {
 375     char u = toUpperCase(c);
 376     int offset = getOffset(u);
 377     if (offset == 0)
 378     {
 379       if (!Comparison.isGap(u))
 380       {
 381         // should have called getGapCount()
 382         return otherData == null ? 0 : otherData.get(u);
 383       }
 384     }
 385     return useIntCounts ? intCounts[offset] : counts[offset];
 386   }
 387
 388   public int getGapCount()
 389   {
 390     return useIntCounts ? intCounts[0] : counts[0];
 391   }
 392
 393   /**
 394    * Answers true if this object wraps a counter for unexpected characters
 395    *
 396    * @return
 397    */
 398   boolean isUsingOtherData()
 399   {
 400     return otherData != null;
 401   }
 402
 403   /**
 404    * Returns the character (or concatenated characters) for the symbol(s) with
 405    * the given count in the profile. Can be used to get the modal residue by
 406    * supplying the modal count value. Returns an empty string if no symbol has
 407    * the given count. The symbols are in alphabetic order of standard peptide or
 408    * nucleotide characters, followed by 'other' symbols if any.
 409    *
 410    * @return
 411    */
 412   public String getResiduesForCount(int count)
 413   {
 414     if (count == 0)
 415     {
 416       return "";
 417     }
 418
 419     /*
 420      * find counts for the given value and append the
 421      * corresponding symbol
 422      */
 423     StringBuilder modal = new StringBuilder();
 424     if (useIntCounts)
 425     {
 426       for (int i = 1; i < intCounts.length; i++)
 427       {
 428         if (intCounts[i] == count)
 429         {
 430           modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
 431                   .charAt(i - 1));
 432         }
 433       }
 434     }
 435     else
 436     {
 437       for (int i = 1; i < counts.length; i++)
 438       {
 439         if (counts[i] == count)
 440         {
 441           modal.append(isNucleotide ? NUCS.charAt(i - 1) : AAS
 442                   .charAt(i - 1));
 443         }
 444       }
 445     }
 446     if (otherData != null)
 447     {
 448       for (int i = 0; i < otherData.size(); i++)
 449       {
 450         if (otherData.valueAt(i) == count)
 451         {
 452           modal.append((char) otherData.keyAt(i));
 453         }
 454       }
 455     }
 456     return modal.toString();
 457   }
 458
 459   /**
 460    * Returns the highest count for any symbol(s) in the profile (excluding gap)
 461    *
 462    * @return
 463    */
 464   public int getModalCount()
 465   {
 466     return maxCount;
 467   }
 468
 469   /**
 470    * Returns the number of distinct symbols with a non-zero count (excluding the
 471    * gap symbol)
 472    *
 473    * @return
 474    */
 475   public int size() {
 476     int size = 0;
 477     if (useIntCounts)
 478     {
 479       for (int i = 1; i < intCounts.length; i++)
 480       {
 481         if (intCounts[i] > 0)
 482         {
 483           size++;
 484         }
 485       }
 486     }
 487     else
 488     {
 489       for (int i = 1; i < counts.length; i++)
 490       {
 491         if (counts[i] > 0)
 492         {
 493           size++;
 494         }
 495       }
 496     }
 497
 498     /*
 499      * include 'other' characters recorded (even if count is zero
 500      * though that would be a strange use case)
 501      */
 502     if (otherData != null)
 503     {
 504       size += otherData.size();
 505     }
 506
 507     return size;
 508   }
 509
 510   /**
 511    * Returns a data bean holding those symbols that have a non-zero count
 512    * (excluding the gap symbol), with their counts.
 513    *
 514    * @return
 515    */
 516   public SymbolCounts getSymbolCounts()
 517   {
 518     int size = size();
 519     char[] symbols = new char[size];
 520     int[] values = new int[size];
 521     int j = 0;
 522
 523     if (useIntCounts)
 524     {
 525       for (int i = 1; i < intCounts.length; i++)
 526       {
 527         if (intCounts[i] > 0)
 528         {
 529           char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
 530                   .charAt(i - 1);
 531           symbols[j] = symbol;
 532           values[j] = intCounts[i];
 533           j++;
 534         }
 535       }
 536     }
 537     else
 538     {
 539       for (int i = 1; i < counts.length; i++)
 540       {
 541         if (counts[i] > 0)
 542         {
 543           char symbol = isNucleotide ? NUCS.charAt(i - 1) : AAS
 544                   .charAt(i - 1);
 545           symbols[j] = symbol;
 546           values[j] = counts[i];
 547           j++;
 548         }
 549       }
 550     }
 551     if (otherData != null)
 552     {
 553       for (int i = 0; i < otherData.size(); i++)
 554       {
 555         symbols[j] = (char) otherData.keyAt(i);
 556         values[j] = otherData.valueAt(i);
 557         j++;
 558       }
 559     }
 560
 561     return new SymbolCounts(symbols, values);
 562   }
 563
 564   /**
 565    * Returns a tooltip string showing residues in descending order of their
 566    * percentage frequency in the profile
 567    *
 568    * @param normaliseBy
 569    *          the divisor for residue counts (may or may not include gapped
 570    *          sequence count)
 571    * @param percentageDecPl
 572    *          the number of decimal places to show in percentages
 573    * @return
 574    */
 575   public String getTooltip(int normaliseBy, int percentageDecPl)
 576   {
 577     SymbolCounts symbolCounts = getSymbolCounts();
 578     char[] ca = symbolCounts.symbols;
 579     int[] vl = symbolCounts.values;
 580
 581     /*
 582      * sort characters into ascending order of their counts
 583      */
 584     QuickSort.sort(vl, ca);
 585
 586     /*
 587      * traverse in reverse order (highest count first) to build tooltip
 588      */
 589     boolean first = true;
 590     StringBuilder sb = new StringBuilder(64);
 591     for (int c = ca.length - 1; c >= 0; c--)
 592     {
 593       final char residue = ca[c];
 594       // TODO combine residues which share a percentage
 595       // (see AAFrequency.completeCdnaConsensus)
 596       float tval = (vl[c] * 100f) / normaliseBy;
 597       sb.append(first ? "" : "; ").append(residue).append(" ");
 598       Format.appendPercentage(sb, tval, percentageDecPl);
 599       sb.append("%");
 600       first = false;
 601     }
 602     return sb.toString();
 603   }
 604
 605   /**
 606    * Returns a string representation of the symbol counts, for debug purposes.
 607    */
 608   @Override
 609   public String toString()
 610   {
 611     StringBuilder sb = new StringBuilder();
 612     sb.append("[ ");
 613     SymbolCounts sc = getSymbolCounts();
 614     for (int i = 0; i < sc.symbols.length; i++)
 615     {
 616       sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
 617     }
 618     sb.append("]");
 619     return sb.toString();
 620   }
 621 }