src/jalview/datamodel/ResidueCount.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.datamodel;
  22
  23 import jalview.util.Comparison;
  24 import jalview.util.Format;
  25 import jalview.util.QuickSort;
  26 import jalview.util.SparseCount;
  27
  28 import java.util.List;
  29
  30 /**
  31  * A class to count occurrences of residues in a profile, optimised for speed
  32  * and memory footprint.
  33  *
  34  * @author gmcarstairs
  35  *
  36  */
  37 public class ResidueCount
  38 {
  39   /**
  40    * A data bean to hold the results of counting symbols
  41    */
  42   public class SymbolCounts
  43   {
  44     /**
  45      * the symbols seen (as char values), in no particular order
  46      */
  47     public final char[] symbols;
  48
  49     /**
  50      * the counts for each symbol, in the same order as the symbols
  51      */
  52     public final int[] values;
  53
  54     SymbolCounts(char[] s, int[] v)
  55     {
  56       symbols = s;
  57       values = v;
  58     }
  59   }
  60
  61   private static final int TOUPPERCASE = 'A' - 'a';
  62
  63   /*
  64    * nucleotide symbols to count (including N unknown)
  65    */
  66   private static final String NUCS = "ACGNTU";
  67
  68   /*
  69    * amino acid symbols to count (including X unknown)
  70    * NB we also include U so as to support counting of RNA bases
  71    * in the "don't know" case of nucleotide / peptide
  72    */
  73   private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
  74
  75   static final int GAP_COUNT = 0;
  76
  77   /*
  78    * fast lookup tables holding the index into our count
  79    * arrays of each symbol; index 0 is reserved for gap counting
  80    */
  81   private static int[] NUC_INDEX = new int[26];
  82
  83   private static int[] AA_INDEX = new int[26];
  84   static
  85   {
  86     for (int i = 0; i < NUCS.length(); i++)
  87     {
  88       NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
  89     }
  90     for (int i = 0; i < AAS.length(); i++)
  91     {
  92       AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
  93     }
  94   }
  95
  96   /*
  97    * counts array, just big enough for the nucleotide or peptide
  98    * character set (plus gap counts in position 0)
  99    */
 100   private short[] counts;
 101
 102   /*
 103    * alternative array of int counts for use if any count
 104    * exceeds the maximum value of short (32767)
 105    */
 106   private int[] intCounts;
 107
 108   /*
 109    * flag set if we switch from short to int counts
 110    */
 111   private boolean useIntCounts;
 112
 113   /*
 114    * general-purpose counter, only for use for characters
 115    * that are not in the expected alphabet
 116    */
 117   private SparseCount otherData;
 118
 119   /*
 120    * keeps track of the maximum count value recorded
 121    * (if this class ever allows decrements, would need to
 122    * calculate this on request instead)
 123    */
 124   int maxCount;
 125
 126   /*
 127    * if we think we are counting nucleotide, can get by with smaller
 128    * array to hold counts
 129    */
 130   private boolean isNucleotide;
 131
 132   /**
 133    * Default constructor allocates arrays able to count either nucleotide or
 134    * peptide bases. Use this constructor if not sure which the data is.
 135    */
 136   public ResidueCount()
 137   {
 138     this(false);
 139   }
 140
 141   /**
 142    * Constructor that allocates an array just big enough for the anticipated
 143    * characters, plus one position to count gaps
 144    */
 145   public ResidueCount(boolean nucleotide)
 146   {
 147     isNucleotide = nucleotide;
 148     int charsToCount = nucleotide ? NUCS.length() : AAS.length();
 149     counts = new short[charsToCount + 1];
 150   }
 151
 152   /**
 153    * A constructor that counts frequency of all symbols (including gaps) in the
 154    * sequences (not case-sensitive)
 155    *
 156    * @param sequences
 157    */
 158   public ResidueCount(List<SequenceI> sequences)
 159   {
 160     this();
 161     for (SequenceI seq : sequences)
 162     {
 163       for (int i = 0; i < seq.getLength(); i++)
 164       {
 165         add(seq.getCharAt(i));
 166       }
 167     }
 168   }
 169
 170   /**
 171    * Increments the count for the given character. The supplied character may be
 172    * upper or lower case but counts are for the upper case only. Gap characters
 173    * (space, ., -) are all counted together.
 174    *
 175    * @param c
 176    * @return the new value of the count for the character
 177    */
 178   public int add(final char c)
 179   {
 180     char u = toUpperCase(c);
 181     int newValue = 0;
 182     int offset = getOffset(u);
 183
 184     /*
 185      * offset 0 is reserved for gap counting, so 0 here means either
 186      * an unexpected character, or a gap character passed in error
 187      */
 188     if (offset == 0)
 189     {
 190       if (Comparison.isGap(u))
 191       {
 192         newValue = addGap();
 193       }
 194       else
 195       {
 196         newValue = addOtherCharacter(u);
 197       }
 198     }
 199     else
 200     {
 201       newValue = increment(offset);
 202     }
 203     return newValue;
 204   }
 205
 206   /**
 207    * Increment the count at the specified offset. If this would result in short
 208    * overflow, promote to counting int values instead.
 209    *
 210    * @param offset
 211    * @return the new value of the count at this offset
 212    */
 213   int increment(int offset)
 214   {
 215     int newValue = 0;
 216     if (useIntCounts)
 217     {
 218       newValue = intCounts[offset];
 219       intCounts[offset] = ++newValue;
 220     }
 221     else
 222     {
 223       if (counts[offset] == Short.MAX_VALUE)
 224       {
 225         handleOverflow();
 226         newValue = intCounts[offset];
 227         intCounts[offset] = ++newValue;
 228       }
 229       else
 230       {
 231         newValue = counts[offset];
 232         counts[offset] = (short) ++newValue;
 233       }
 234     }
 235
 236     if (offset != GAP_COUNT)
 237     {
 238       // update modal residue count
 239       maxCount = Math.max(maxCount, newValue);
 240     }
 241     return newValue;
 242   }
 243
 244   /**
 245    * Switch from counting in short to counting in int
 246    */
 247   synchronized void handleOverflow()
 248   {
 249     intCounts = new int[counts.length];
 250     for (int i = 0; i < counts.length; i++)
 251     {
 252       intCounts[i] = counts[i];
 253     }
 254     counts = null;
 255     useIntCounts = true;
 256   }
 257
 258   /**
 259    * Returns this character's offset in the count array
 260    *
 261    * @param c
 262    * @return
 263    */
 264   int getOffset(char c)
 265   {
 266     int offset = 0;
 267     if ('A' <= c && c <= 'Z')
 268     {
 269       offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
 270     }
 271     return offset;
 272   }
 273
 274   /**
 275    * @param c
 276    * @return
 277    */
 278   protected char toUpperCase(final char c)
 279   {
 280     char u = c;
 281     if ('a' <= c && c <= 'z')
 282     {
 283       u = (char) (c + TOUPPERCASE);
 284     }
 285     return u;
 286   }
 287
 288   /**
 289    * Increment count for some unanticipated character. The first time this
 290    * called, a SparseCount is instantiated to hold these 'extra' counts.
 291    *
 292    * @param c
 293    * @return the new value of the count for the character
 294    */
 295   int addOtherCharacter(char c)
 296   {
 297     if (otherData == null)
 298     {
 299       otherData = new SparseCount();
 300     }
 301     int newValue = otherData.add(c, 1);
 302     maxCount = Math.max(maxCount, newValue);
 303     return newValue;
 304   }
 305
 306   /**
 307    * Set count for some unanticipated character. The first time this called, a
 308    * SparseCount is instantiated to hold these 'extra' counts.
 309    *
 310    * @param c
 311    * @param value
 312    */
 313   void setOtherCharacter(char c, int value)
 314   {
 315     if (otherData == null)
 316     {
 317       otherData = new SparseCount();
 318     }
 319     otherData.put(c, value);
 320   }
 321
 322   /**
 323    * Increment count of gap characters
 324    *
 325    * @return the new count of gaps
 326    */
 327   public int addGap()
 328   {
 329     int newValue = increment(GAP_COUNT);
 330     return newValue;
 331   }
 332
 333   /**
 334    * Answers true if we are counting ints (only after overflow of short counts)
 335    *
 336    * @return
 337    */
 338   boolean isCountingInts()
 339   {
 340     return useIntCounts;
 341   }
 342
 343   /**
 344    * Sets the count for the given character. The supplied character may be upper
 345    * or lower case but counts are for the upper case only.
 346    *
 347    * @param c
 348    * @param count
 349    */
 350   public void put(char c, int count)
 351   {
 352     char u = toUpperCase(c);
 353     int offset = getOffset(u);
 354
 355     /*
 356      * offset 0 is reserved for gap counting, so 0 here means either
 357      * an unexpected character, or a gap character passed in error
 358      */
 359     if (offset == 0)
 360     {
 361       if (Comparison.isGap(u))
 362       {
 363         set(0, count);
 364       }
 365       else
 366       {
 367         setOtherCharacter(u, count);
 368         maxCount = Math.max(maxCount, count);
 369       }
 370     }
 371     else
 372     {
 373       set(offset, count);
 374       maxCount = Math.max(maxCount, count);
 375     }
 376   }
 377
 378   /**
 379    * Sets the count at the specified offset. If this would result in short
 380    * overflow, promote to counting int values instead.
 381    *
 382    * @param offset
 383    * @param value
 384    */
 385   void set(int offset, int value)
 386   {
 387     if (useIntCounts)
 388     {
 389       intCounts[offset] = value;
 390     }
 391     else
 392     {
 393       if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
 394       {
 395         handleOverflow();
 396         intCounts[offset] = value;
 397       }
 398       else
 399       {
 400         counts[offset] = (short) value;
 401       }
 402     }
 403   }
 404
 405   /**
 406    * Returns the count for the given character, or zero if no count held
 407    *
 408    * @param c
 409    * @return
 410    */
 411   public int getCount(char c)
 412   {
 413     char u = toUpperCase(c);
 414     int offset = getOffset(u);
 415     if (offset == 0)
 416     {
 417       if (!Comparison.isGap(u))
 418       {
 419         // should have called getGapCount()
 420         return otherData == null ? 0 : otherData.get(u);
 421       }
 422     }
 423     return useIntCounts ? intCounts[offset] : counts[offset];
 424   }
 425
 426   public int getGapCount()
 427   {
 428     return useIntCounts ? intCounts[0] : counts[0];
 429   }
 430
 431   /**
 432    * Answers true if this object wraps a counter for unexpected characters
 433    *
 434    * @return
 435    */
 436   boolean isUsingOtherData()
 437   {
 438     return otherData != null;
 439   }
 440
 441   /**
 442    * Returns the character (or concatenated characters) for the symbol(s) with
 443    * the given count in the profile. Can be used to get the modal residue by
 444    * supplying the modal count value. Returns an empty string if no symbol has
 445    * the given count. The symbols are in alphabetic order of standard peptide or
 446    * nucleotide characters, followed by 'other' symbols if any.
 447    *
 448    * @return
 449    */
 450   public String getResiduesForCount(int count)
 451   {
 452     if (count == 0)
 453     {
 454       return "";
 455     }
 456
 457     /*
 458      * find counts for the given value and append the
 459      * corresponding symbol
 460      */
 461     StringBuilder modal = new StringBuilder();
 462     if (useIntCounts)
 463     {
 464       for (int i = 1; i < intCounts.length; i++)
 465       {
 466         if (intCounts[i] == count)
 467         {
 468           modal.append(
 469                   isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
 470         }
 471       }
 472     }
 473     else
 474     {
 475       for (int i = 1; i < counts.length; i++)
 476       {
 477         if (counts[i] == count)
 478         {
 479           modal.append(
 480                   isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
 481         }
 482       }
 483     }
 484     if (otherData != null)
 485     {
 486       for (int i = 0; i < otherData.size(); i++)
 487       {
 488         if (otherData.valueAt(i) == count)
 489         {
 490           modal.append((char) otherData.keyAt(i));
 491         }
 492       }
 493     }
 494     return modal.toString();
 495   }
 496
 497   /**
 498    * Returns the highest count for any symbol(s) in the profile (excluding gap)
 499    *
 500    * @return
 501    */
 502   public int getModalCount()
 503   {
 504     return maxCount;
 505   }
 506
 507   /**
 508    * Returns the number of distinct symbols with a non-zero count (excluding the
 509    * gap symbol)
 510    *
 511    * @return
 512    */
 513   public int size()
 514   {
 515     int size = 0;
 516     if (useIntCounts)
 517     {
 518       for (int i = 1; i < intCounts.length; i++)
 519       {
 520         if (intCounts[i] > 0)
 521         {
 522           size++;
 523         }
 524       }
 525     }
 526     else
 527     {
 528       for (int i = 1; i < counts.length; i++)
 529       {
 530         if (counts[i] > 0)
 531         {
 532           size++;
 533         }
 534       }
 535     }
 536
 537     /*
 538      * include 'other' characters recorded (even if count is zero
 539      * though that would be a strange use case)
 540      */
 541     if (otherData != null)
 542     {
 543       size += otherData.size();
 544     }
 545
 546     return size;
 547   }
 548
 549   /**
 550    * Returns a data bean holding those symbols that have a non-zero count
 551    * (excluding the gap symbol), with their counts.
 552    *
 553    * @return
 554    */
 555   public SymbolCounts getSymbolCounts()
 556   {
 557     int size = size();
 558     char[] symbols = new char[size];
 559     int[] values = new int[size];
 560     int j = 0;
 561
 562     if (useIntCounts)
 563     {
 564       for (int i = 1; i < intCounts.length; i++)
 565       {
 566         if (intCounts[i] > 0)
 567         {
 568           char symbol = isNucleotide ? NUCS.charAt(i - 1)
 569                   : AAS.charAt(i - 1);
 570           symbols[j] = symbol;
 571           values[j] = intCounts[i];
 572           j++;
 573         }
 574       }
 575     }
 576     else
 577     {
 578       for (int i = 1; i < counts.length; i++)
 579       {
 580         if (counts[i] > 0)
 581         {
 582           char symbol = isNucleotide ? NUCS.charAt(i - 1)
 583                   : AAS.charAt(i - 1);
 584           symbols[j] = symbol;
 585           values[j] = counts[i];
 586           j++;
 587         }
 588       }
 589     }
 590     if (otherData != null)
 591     {
 592       for (int i = 0; i < otherData.size(); i++)
 593       {
 594         symbols[j] = (char) otherData.keyAt(i);
 595         values[j] = otherData.valueAt(i);
 596         j++;
 597       }
 598     }
 599
 600     return new SymbolCounts(symbols, values);
 601   }
 602
 603   /**
 604    * Returns a tooltip string showing residues in descending order of their
 605    * percentage frequency in the profile
 606    *
 607    * @param normaliseBy
 608    *          the divisor for residue counts (may or may not include gapped
 609    *          sequence count)
 610    * @param percentageDecPl
 611    *          the number of decimal places to show in percentages
 612    * @return
 613    */
 614   public String getTooltip(int normaliseBy, int percentageDecPl)
 615   {
 616     SymbolCounts symbolCounts = getSymbolCounts();
 617     char[] ca = symbolCounts.symbols;
 618     int[] vl = symbolCounts.values;
 619
 620     /*
 621      * sort characters into ascending order of their counts
 622      */
 623     QuickSort.sort(vl, ca);
 624
 625     /*
 626      * traverse in reverse order (highest count first) to build tooltip
 627      */
 628     boolean first = true;
 629     StringBuilder sb = new StringBuilder(64);
 630     for (int c = ca.length - 1; c >= 0; c--)
 631     {
 632       final char residue = ca[c];
 633       // TODO combine residues which share a percentage
 634       // (see AAFrequency.completeCdnaConsensus)
 635       float tval = (vl[c] * 100f) / normaliseBy;
 636       sb.append(first ? "" : "; ").append(residue).append(" ");
 637       Format.appendPercentage(sb, tval, percentageDecPl);
 638       sb.append("%");
 639       first = false;
 640     }
 641     return sb.toString();
 642   }
 643
 644   /**
 645    * Returns a string representation of the symbol counts, for debug purposes.
 646    */
 647   @Override
 648   public String toString()
 649   {
 650     StringBuilder sb = new StringBuilder();
 651     sb.append("[ ");
 652     SymbolCounts sc = getSymbolCounts();
 653     for (int i = 0; i < sc.symbols.length; i++)
 654     {
 655       sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
 656     }
 657     sb.append("]");
 658     return sb.toString();
 659   }
 660
 661   /**
 662    * Answers the total count for all symbols (excluding gaps)
 663    *
 664    * @return
 665    */
 666   public int getTotalResidueCount()
 667   {
 668     int total = 0;
 669     for (char symbol : this.getSymbolCounts().symbols)
 670     {
 671       total += getCount(symbol);
 672     }
 673     return total;
 674   }
 675 }