src/jalview/analysis/AAFrequency.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.Annotation;
  27 import jalview.datamodel.HiddenMarkovModel;
  28 import jalview.datamodel.Profile;
  29 import jalview.datamodel.ProfileI;
  30 import jalview.datamodel.Profiles;
  31 import jalview.datamodel.ProfilesI;
  32 import jalview.datamodel.ResidueCount;
  33 import jalview.datamodel.ResidueCount.SymbolCounts;
  34 import jalview.datamodel.SequenceI;
  35 import jalview.ext.android.SparseIntArray;
  36 import jalview.schemes.ResidueProperties;
  37 import jalview.util.Comparison;
  38 import jalview.util.Format;
  39 import jalview.util.MappingUtils;
  40 import jalview.util.QuickSort;
  41
  42 import java.awt.Color;
  43 import java.util.Arrays;
  44 import java.util.Hashtable;
  45 import java.util.List;
  46
  47 /**
 * Computes per-column residue frequency profiles (returned as a ProfilesI)
 * for a list or array of sequences over a range of columns, and derives
 * consensus, gap-count and cDNA codon consensus annotations from such data.
 * This class is used extensively in calculating alignment colourschemes that
 * depend on the amount of conservation in each alignment column.
  52  *
  53  * @author $author$
  54  * @version $Revision$
  55  */
  56 public class AAFrequency
  57 {
  58   public static final String PROFILE = "P";
  59
  60   private static final String AMINO = "amino";
  61
  62   private static final String DNA = "DNA";
  63
  64   /*
  65    * Quick look-up of String value of char 'A' to 'Z'
  66    */
  67   private static final String[] CHARS = new String['Z' - 'A' + 1];
  68
  69   static
  70   {
  71     for (char c = 'A'; c <= 'Z'; c++)
  72     {
  73       CHARS[c - 'A'] = String.valueOf(c);
  74     }
  75   }
  76
  77   public static final ProfilesI calculate(List<SequenceI> list, int start,
  78           int end)
  79   {
  80     return calculate(list, start, end, false);
  81   }
  82
  83   public static final ProfilesI calculate(List<SequenceI> sequences,
  84           int start, int end, boolean profile)
  85   {
  86     SequenceI[] seqs = new SequenceI[sequences.size()];
  87     int width = 0;
  88     synchronized (sequences)
  89     {
  90       for (int i = 0; i < sequences.size(); i++)
  91       {
  92         seqs[i] = sequences.get(i);
  93         int length = seqs[i].getLength();
  94         if (length > width)
  95         {
  96           width = length;
  97         }
  98       }
  99
 100       if (end >= width)
 101       {
 102         end = width;
 103       }
 104
 105       ProfilesI reply = calculate(seqs, width, start, end, profile);
 106       return reply;
 107     }
 108   }
 109
 110   /**
 111    * Calculate the consensus symbol(s) for each column in the given range.
 112    *
 113    * @param sequences
 114    * @param width
 115    *          the full width of the alignment
 116    * @param start
 117    *          start column (inclusive, base zero)
 118    * @param end
 119    *          end column (exclusive)
 120    * @param saveFullProfile
 121    *          if true, store all symbol counts
 122    */
 123   public static final ProfilesI calculate(final SequenceI[] sequences,
 124           int width, int start, int end, boolean saveFullProfile)
 125   {
 126     // long now = System.currentTimeMillis();
 127     int seqCount = sequences.length;
 128     boolean nucleotide = false;
 129     int nucleotideCount = 0;
 130     int peptideCount = 0;
 131
 132     ProfileI[] result = new ProfileI[width];
 133
 134     for (int column = start; column < end; column++)
 135     {
 136       /*
 137        * Apply a heuristic to detect nucleotide data (which can
 138        * be counted in more compact arrays); here we test for
 139        * more than 90% nucleotide; recheck every 10 columns in case
 140        * of misleading data e.g. highly conserved Alanine in peptide!
 141        * Mistakenly guessing nucleotide has a small performance cost,
 142        * as it will result in counting in sparse arrays.
 143        * Mistakenly guessing peptide has a small space cost,
 144        * as it will use a larger than necessary array to hold counts.
 145        */
 146       if (nucleotideCount > 100 && column % 10 == 0)
 147       {
 148         nucleotide = (9 * peptideCount < nucleotideCount);
 149       }
 150       ResidueCount residueCounts = new ResidueCount(nucleotide);
 151
 152       for (int row = 0; row < seqCount; row++)
 153       {
 154         if (sequences[row] == null)
 155         {
 156           System.err
 157                   .println("WARNING: Consensus skipping null sequence - possible race condition.");
 158           continue;
 159         }
 160         char[] seq = sequences[row].getSequence();
 161         if (seq.length > column)
 162         {
 163           char c = seq[column];
 164           residueCounts.add(c);
 165           if (Comparison.isNucleotide(c))
 166           {
 167             nucleotideCount++;
 168           }
 169           else if (!Comparison.isGap(c))
 170           {
 171             peptideCount++;
 172           }
 173         }
 174         else
 175         {
 176           /*
 177            * count a gap if the sequence doesn't reach this column
 178            */
 179           residueCounts.addGap();
 180         }
 181       }
 182
 183       int maxCount = residueCounts.getModalCount();
 184       String maxResidue = residueCounts.getResiduesForCount(maxCount);
 185       int gapCount = residueCounts.getGapCount();
 186       ProfileI profile = new Profile(seqCount, gapCount, maxCount,
 187               maxResidue);
 188
 189       if (saveFullProfile)
 190       {
 191         profile.setCounts(residueCounts);
 192       }
 193
 194       result[column] = profile;
 195     }
 196     return new Profiles(result);
 197     // long elapsed = System.currentTimeMillis() - now;
 198     // System.out.println(elapsed);
 199   }
 200
 201   /**
 202    * Make an estimate of the profile size we are going to compute i.e. how many
 203    * different characters may be present in it. Overestimating has a cost of
 204    * using more memory than necessary. Underestimating has a cost of needing to
 205    * extend the SparseIntArray holding the profile counts.
 206    *
 207    * @param profileSizes
 208    *          counts of sizes of profiles so far encountered
 209    * @return
 210    */
 211   static int estimateProfileSize(SparseIntArray profileSizes)
 212   {
 213     if (profileSizes.size() == 0)
 214     {
 215       return 4;
 216     }
 217
 218     /*
 219      * could do a statistical heuristic here e.g. 75%ile
 220      * for now just return the largest value
 221      */
 222     return profileSizes.keyAt(profileSizes.size() - 1);
 223   }
 224
 225   /**
 226    * Derive the consensus annotations to be added to the alignment for display.
 227    * This does not recompute the raw data, but may be called on a change in
 228    * display options, such as 'ignore gaps', which may in turn result in a
 229    * change in the derived values.
 230    *
 231    * @param consensus
 232    *          the annotation row to add annotations to
 233    * @param profiles
 234    *          the source consensus data
 235    * @param startCol
 236    *          start column (inclusive)
 237    * @param endCol
 238    *          end column (exclusive)
 239    * @param ignoreGaps
 240    *          if true, normalise residue percentages ignoring gaps
 241    * @param showSequenceLogo
 242    *          if true include all consensus symbols, else just show modal
 243    *          residue
 244    * @param nseq
 245    *          number of sequences
 246    */
 247   public static void completeConsensus(AlignmentAnnotation consensus,
 248           ProfilesI profiles, int startCol, int endCol, boolean ignoreGaps,
 249           boolean showSequenceLogo, long nseq)
 250   {
 251     // long now = System.currentTimeMillis();
 252     if (consensus == null || consensus.annotations == null
 253             || consensus.annotations.length < endCol)
 254     {
 255       /*
 256        * called with a bad alignment annotation row
 257        * wait for it to be initialised properly
 258        */
 259       return;
 260     }
 261
 262     for (int i = startCol; i < endCol; i++)
 263     {
 264       ProfileI profile = profiles.get(i);
 265       if (profile == null)
 266       {
 267         /*
 268          * happens if sequences calculated over were
 269          * shorter than alignment width
 270          */
 271         consensus.annotations[i] = null;
 272         return;
 273       }
 274
 275       final int dp = getPercentageDp(nseq);
 276
 277       float value = profile.getPercentageIdentity(ignoreGaps);
 278
 279       String description = getTooltip(profile, value, showSequenceLogo,
 280               ignoreGaps, dp);
 281
 282       String modalResidue = profile.getModalResidue();
 283       if ("".equals(modalResidue))
 284       {
 285         modalResidue = "-";
 286       }
 287       else if (modalResidue.length() > 1)
 288       {
 289         modalResidue = "+";
 290       }
 291       consensus.annotations[i] = new Annotation(modalResidue, description,
 292               ' ', value);
 293     }
 294     // long elapsed = System.currentTimeMillis() - now;
 295     // System.out.println(-elapsed);
 296   }
 297
 298   /**
 299    * Derive the gap count annotation row.
 300    *
 301    * @param gaprow
 302    *          the annotation row to add annotations to
 303    * @param profiles
 304    *          the source consensus data
 305    * @param startCol
 306    *          start column (inclusive)
 307    * @param endCol
 308    *          end column (exclusive)
 309    */
 310   public static void completeGapAnnot(AlignmentAnnotation gaprow,
 311           ProfilesI profiles, int startCol, int endCol, long nseq)
 312   {
 313     if (gaprow == null || gaprow.annotations == null
 314             || gaprow.annotations.length < endCol)
 315     {
 316       /*
 317        * called with a bad alignment annotation row
 318        * wait for it to be initialised properly
 319        */
 320       return;
 321     }
 322     // always set ranges again
 323     gaprow.graphMax = nseq;
 324     gaprow.graphMin = 0;
 325     double scale = 0.8/nseq;
 326     for (int i = startCol; i < endCol; i++)
 327     {
 328       ProfileI profile = profiles.get(i);
 329       if (profile == null)
 330       {
 331         /*
 332          * happens if sequences calculated over were
 333          * shorter than alignment width
 334          */
 335         gaprow.annotations[i] = null;
 336         return;
 337       }
 338
 339       final int gapped = profile.getNonGapped();
 340
 341       String description = "" + gapped;
 342
 343       gaprow.annotations[i] = new Annotation("", description,
 344               '\0', gapped, jalview.util.ColorUtils.bleachColour(
 345                       Color.DARK_GRAY, (float) scale * gapped));
 346     }
 347   }
 348
 349   /**
 350    * Returns a tooltip showing either
 351    * <ul>
 352    * <li>the full profile (percentages of all residues present), if
 353    * showSequenceLogo is true, or</li>
 354    * <li>just the modal (most common) residue(s), if showSequenceLogo is false</li>
 355    * </ul>
 356    * Percentages are as a fraction of all sequence, or only ungapped sequences
 357    * if ignoreGaps is true.
 358    *
 359    * @param profile
 360    * @param pid
 361    * @param showSequenceLogo
 362    * @param ignoreGaps
 363    * @param dp
 364    *          the number of decimal places to format percentages to
 365    * @return
 366    */
 367   static String getTooltip(ProfileI profile, float pid,
 368           boolean showSequenceLogo, boolean ignoreGaps, int dp)
 369   {
 370     ResidueCount counts = profile.getCounts();
 371
 372     String description = null;
 373     if (counts != null && showSequenceLogo)
 374     {
 375       int normaliseBy = ignoreGaps ? profile.getNonGapped() : profile
 376               .getHeight();
 377       description = counts.getTooltip(normaliseBy, dp);
 378     }
 379     else
 380     {
 381       StringBuilder sb = new StringBuilder(64);
 382       String maxRes = profile.getModalResidue();
 383       if (maxRes.length() > 1)
 384       {
 385         sb.append("[").append(maxRes).append("]");
 386       }
 387       else
 388       {
 389         sb.append(maxRes);
 390       }
 391       if (maxRes.length() > 0)
 392       {
 393         sb.append(" ");
 394         Format.appendPercentage(sb, pid, dp);
 395         sb.append("%");
 396       }
 397       description = sb.toString();
 398     }
 399     return description;
 400   }
 401
 402   /**
 403    * Returns the sorted profile for the given consensus data. The returned array
 404    * contains
 405    *
 406    * <pre>
 407    *    [profileType, numberOfValues, nonGapCount, charValue1, percentage1, charValue2, percentage2, ...]
 408    * in descending order of percentage value
 409    * </pre>
 410    *
 411    * @param profile
 412    *          the data object from which to extract and sort values
 413    * @param ignoreGaps
 414    *          if true, only non-gapped values are included in percentage
 415    *          calculations
 416    * @return
 417    */
 418   public static int[] extractProfile(ProfileI profile, boolean ignoreGaps)
 419   {
 420     int[] rtnval = new int[64];
 421     ResidueCount counts = profile.getCounts();
 422     if (counts == null)
 423     {
 424       return null;
 425     }
 426
 427     SymbolCounts symbolCounts = counts.getSymbolCounts();
 428     char[] symbols = symbolCounts.symbols;
 429     int[] values = symbolCounts.values;
 430     QuickSort.sort(values, symbols);
 431     int nextArrayPos = 2;
 432     int totalPercentage = 0;
 433     final int divisor = ignoreGaps ? profile.getNonGapped() : profile
 434             .getHeight();
 435
 436     /*
 437      * traverse the arrays in reverse order (highest counts first)
 438      */
 439     for (int i = symbols.length - 1; i >= 0; i--)
 440     {
 441       int theChar = symbols[i];
 442       int charCount = values[i];
 443
 444       rtnval[nextArrayPos++] = theChar;
 445       final int percentage = (charCount * 100) / divisor;
 446       rtnval[nextArrayPos++] = percentage;
 447       totalPercentage += percentage;
 448     }
 449     rtnval[0] = symbols.length;
 450     rtnval[1] = totalPercentage;
 451     int[] result = new int[rtnval.length + 1];
 452     result[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
 453     System.arraycopy(rtnval, 0, result, 1, rtnval.length);
 454
 455     return result;
 456   }
 457
 458
 459   /**
 460    * Extract a sorted extract of cDNA codon profile data. The returned array
 461    * contains
 462    *
 463    * <pre>
 464    *    [profileType, numberOfValues, totalCount, charValue1, percentage1, charValue2, percentage2, ...]
 465    * in descending order of percentage value, where the character values encode codon triplets
 466    * </pre>
 467    *
 468    * @param hashtable
 469    * @return
 470    */
 471   public static int[] extractCdnaProfile(Hashtable hashtable,
 472           boolean ignoreGaps)
 473   {
 474     // this holds #seqs, #ungapped, and then codon count, indexed by encoded
 475     // codon triplet
 476     int[] codonCounts = (int[]) hashtable.get(PROFILE);
 477     int[] sortedCounts = new int[codonCounts.length - 2];
 478     System.arraycopy(codonCounts, 2, sortedCounts, 0,
 479             codonCounts.length - 2);
 480
 481     int[] result = new int[3 + 2 * sortedCounts.length];
 482     // first value is just the type of profile data
 483     result[0] = AlignmentAnnotation.CDNA_PROFILE;
 484
 485     char[] codons = new char[sortedCounts.length];
 486     for (int i = 0; i < codons.length; i++)
 487     {
 488       codons[i] = (char) i;
 489     }
 490     QuickSort.sort(sortedCounts, codons);
 491     int totalPercentage = 0;
 492     int distinctValuesCount = 0;
 493     int j = 3;
 494     int divisor = ignoreGaps ? codonCounts[1] : codonCounts[0];
 495     for (int i = codons.length - 1; i >= 0; i--)
 496     {
 497       final int codonCount = sortedCounts[i];
 498       if (codonCount == 0)
 499       {
 500         break; // nothing else of interest here
 501       }
 502       distinctValuesCount++;
 503       result[j++] = codons[i];
 504       final int percentage = codonCount * 100 / divisor;
 505       result[j++] = percentage;
 506       totalPercentage += percentage;
 507     }
 508     result[2] = totalPercentage;
 509
 510     /*
 511      * Just return the non-zero values
 512      */
 513     // todo next value is redundant if we limit the array to non-zero counts
 514     result[1] = distinctValuesCount;
 515     return Arrays.copyOfRange(result, 0, j);
 516   }
 517
 518   /**
 519    * Compute a consensus for the cDNA coding for a protein alignment.
 520    *
 521    * @param alignment
 522    *          the protein alignment (which should hold mappings to cDNA
 523    *          sequences)
 524    * @param hconsensus
 525    *          the consensus data stores to be populated (one per column)
 526    */
 527   public static void calculateCdna(AlignmentI alignment,
 528           Hashtable[] hconsensus)
 529   {
 530     final char gapCharacter = alignment.getGapCharacter();
 531     List<AlignedCodonFrame> mappings = alignment.getCodonFrames();
 532     if (mappings == null || mappings.isEmpty())
 533     {
 534       return;
 535     }
 536
 537     int cols = alignment.getWidth();
 538     for (int col = 0; col < cols; col++)
 539     {
 540       // todo would prefer a Java bean for consensus data
 541       Hashtable<String, int[]> columnHash = new Hashtable<>();
 542       // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1)
 543       int[] codonCounts = new int[66];
 544       codonCounts[0] = alignment.getSequences().size();
 545       int ungappedCount = 0;
 546       for (SequenceI seq : alignment.getSequences())
 547       {
 548         if (seq.getCharAt(col) == gapCharacter)
 549         {
 550           continue;
 551         }
 552         List<char[]> codons = MappingUtils
 553                 .findCodonsFor(seq, col, mappings);
 554         for (char[] codon : codons)
 555         {
 556           int codonEncoded = CodingUtils.encodeCodon(codon);
 557           if (codonEncoded >= 0)
 558           {
 559             codonCounts[codonEncoded + 2]++;
 560             ungappedCount++;
 561           }
 562         }
 563       }
 564       codonCounts[1] = ungappedCount;
 565       // todo: sort values here, save counts and codons?
 566       columnHash.put(PROFILE, codonCounts);
 567       hconsensus[col] = columnHash;
 568     }
 569   }
 570
 571   /**
 572    * Derive displayable cDNA consensus annotation from computed consensus data.
 573    *
 574    * @param consensusAnnotation
 575    *          the annotation row to be populated for display
 576    * @param consensusData
 577    *          the computed consensus data
 578    * @param showProfileLogo
 579    *          if true show all symbols present at each position, else only the
 580    *          modal value
 581    * @param nseqs
 582    *          the number of sequences in the alignment
 583    */
 584   public static void completeCdnaConsensus(
 585           AlignmentAnnotation consensusAnnotation,
 586           Hashtable[] consensusData, boolean showProfileLogo, int nseqs)
 587   {
 588     if (consensusAnnotation == null
 589             || consensusAnnotation.annotations == null
 590             || consensusAnnotation.annotations.length < consensusData.length)
 591     {
 592       // called with a bad alignment annotation row - wait for it to be
 593       // initialised properly
 594       return;
 595     }
 596
 597     // ensure codon triplet scales with font size
 598     consensusAnnotation.scaleColLabel = true;
 599     for (int col = 0; col < consensusData.length; col++)
 600     {
 601       Hashtable hci = consensusData[col];
 602       if (hci == null)
 603       {
 604         // gapped protein column?
 605         continue;
 606       }
 607       // array holds #seqs, #ungapped, then codon counts indexed by codon
 608       final int[] codonCounts = (int[]) hci.get(PROFILE);
 609       int totalCount = 0;
 610
 611       /*
 612        * First pass - get total count and find the highest
 613        */
 614       final char[] codons = new char[codonCounts.length - 2];
 615       for (int j = 2; j < codonCounts.length; j++)
 616       {
 617         final int codonCount = codonCounts[j];
 618         codons[j - 2] = (char) (j - 2);
 619         totalCount += codonCount;
 620       }
 621
 622       /*
 623        * Sort array of encoded codons by count ascending - so the modal value
 624        * goes to the end; start by copying the count (dropping the first value)
 625        */
 626       int[] sortedCodonCounts = new int[codonCounts.length - 2];
 627       System.arraycopy(codonCounts, 2, sortedCodonCounts, 0,
 628               codonCounts.length - 2);
 629       QuickSort.sort(sortedCodonCounts, codons);
 630
 631       int modalCodonEncoded = codons[codons.length - 1];
 632       int modalCodonCount = sortedCodonCounts[codons.length - 1];
 633       String modalCodon = String.valueOf(CodingUtils
 634               .decodeCodon(modalCodonEncoded));
 635       if (sortedCodonCounts.length > 1
 636               && sortedCodonCounts[codons.length - 2] == sortedCodonCounts[codons.length - 1])
 637       {
 638         /*
 639          * two or more codons share the modal count
 640          */
 641         modalCodon = "+";
 642       }
 643       float pid = sortedCodonCounts[sortedCodonCounts.length - 1] * 100
 644               / (float) totalCount;
 645
 646       /*
 647        * todo ? Replace consensus hashtable with sorted arrays of codons and
 648        * counts (non-zero only). Include total count in count array [0].
 649        */
 650
 651       /*
 652        * Scan sorted array backwards for most frequent values first. Show
 653        * repeated values compactly.
 654        */
 655       StringBuilder mouseOver = new StringBuilder(32);
 656       StringBuilder samePercent = new StringBuilder();
 657       String percent = null;
 658       String lastPercent = null;
 659       int percentDecPl = getPercentageDp(nseqs);
 660
 661       for (int j = codons.length - 1; j >= 0; j--)
 662       {
 663         int codonCount = sortedCodonCounts[j];
 664         if (codonCount == 0)
 665         {
 666           /*
 667            * remaining codons are 0% - ignore, but finish off the last one if
 668            * necessary
 669            */
 670           if (samePercent.length() > 0)
 671           {
 672             mouseOver.append(samePercent).append(": ").append(percent)
 673                     .append("% ");
 674           }
 675           break;
 676         }
 677         int codonEncoded = codons[j];
 678         final int pct = codonCount * 100 / totalCount;
 679         String codon = String
 680                 .valueOf(CodingUtils.decodeCodon(codonEncoded));
 681         StringBuilder sb = new StringBuilder();
 682         Format.appendPercentage(sb, pct, percentDecPl);
 683         percent = sb.toString();
 684         if (showProfileLogo || codonCount == modalCodonCount)
 685         {
 686           if (percent.equals(lastPercent) && j > 0)
 687           {
 688             samePercent.append(samePercent.length() == 0 ? "" : ", ");
 689             samePercent.append(codon);
 690           }
 691           else
 692           {
 693             if (samePercent.length() > 0)
 694             {
 695               mouseOver.append(samePercent).append(": ")
 696                       .append(lastPercent).append("% ");
 697             }
 698             samePercent.setLength(0);
 699             samePercent.append(codon);
 700           }
 701           lastPercent = percent;
 702         }
 703       }
 704
 705       consensusAnnotation.annotations[col] = new Annotation(modalCodon,
 706               mouseOver.toString(), ' ', pid);
 707     }
 708   }
 709
 710   /**
 711    * Returns the number of decimal places to show for profile percentages. For
 712    * less than 100 sequences, returns zero (the integer percentage value will be
 713    * displayed). For 100-999 sequences, returns 1, for 1000-9999 returns 2, etc.
 714    *
 715    * @param nseq
 716    * @return
 717    */
 718   protected static int getPercentageDp(long nseq)
 719   {
 720     int scale = 0;
 721     while (nseq >= 100)
 722     {
 723       scale++;
 724       nseq /= 10;
 725     }
 726     return scale;
 727   }
 728
 729   /**
 730    * produces a HMM profile for a column in an alignment
 731    *
 732    * @param aa
 733    *          Alignment annotation for which the profile is being calculated
 734    * @param column
 735    *          column in the alignment the profile is being made for
 736    * @param removeBelowBackground
 737    *          boolean, indicating whether to ignore residues with probabilities
 738    *          less than their background frequencies
 739    * @return
 740    */
 741   public static int[] getHMMProfileFor(AlignmentAnnotation aa, int column,
 742           boolean removeBelowBackground)
 743   {
 744
 745     HiddenMarkovModel hmm;
 746     hmm = aa.getHMM();
 747     if (hmm != null)
 748     {
 749       String alph = hmm.getAlphabetType();
 750       int size = hmm.getNumberOfSymbols();
 751       char symbols[] = new char[size];
 752       int values[] = new int[size];
 753       List<Character> charList = hmm.getSymbols();
 754       Integer totalCount = 0;
 755
 756       for (int i = 0; i < size; i++)
 757       {
 758         char symbol = charList.get(i);
 759         symbols[i] = symbol;
 760         Double value;
 761
 762         value = hmm.getMatchEmissionProbability(column, symbol);
 763         double freq;
 764
 765         if (alph == AMINO && removeBelowBackground)
 766         {
 767           freq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
 768           if (value < freq)
 769           {
 770             value = 0d;
 771           }
 772         }
 773         else if (alph == DNA && removeBelowBackground)
 774         {
 775           freq = ResidueProperties.nucleotideBackgroundFrequencies
 776                   .get(symbol);
 777           if (value < freq)
 778           {
 779             value = 0d;
 780           }
 781         }
 782         value = value * 10000;
 783         values[i] = value.intValue();
 784         totalCount += value.intValue();
 785       }
 786
 787       QuickSort.sort(values, symbols);
 788
 789       int[] profile = new int[3 + size * 2];
 790
 791       profile[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
 792       profile[1] = size;
 793       profile[2] = totalCount / 100;
 794
 795       if (totalCount != 0)
 796       {
 797         int arrayPos = 3;
 798         for (int k = size - 1; k >= 0; k--)
 799         {
 800           Double percentage;
 801           Integer value = values[k];
 802           percentage = (value.doubleValue() / totalCount.doubleValue())
 803                   * 100d;
 804           profile[arrayPos] = symbols[k];
 805           profile[arrayPos + 1] = percentage.intValue();
 806           arrayPos += 2;
 807         }
 808       }
 809       return profile;
 810     }
 811     return null;
 812   }
 813 }