update spikes/mungo from JAL-3076 patch branch

[jalview.git] / src / jalview / analysis / AAFrequency.java
diff --git a/src/jalview/analysis/AAFrequency.java b/src/jalview/analysis/AAFrequency.java

index 11c8f72..e4f2dfa 100755 (executable)
--- a/src/jalview/analysis/AAFrequency.java
+++ b/src/jalview/analysis/AAFrequency.java
@@ -24,7 +24,6 @@ import jalview.datamodel.AlignedCodonFrame;
  import jalview.datamodel.AlignmentAnnotation;
  import jalview.datamodel.AlignmentI;
  import jalview.datamodel.Annotation;
-import jalview.datamodel.HiddenMarkovModel;
  import jalview.datamodel.Profile;
  import jalview.datamodel.ProfileI;
  import jalview.datamodel.Profiles;
@@ -33,7 +32,6 @@ import jalview.datamodel.ResidueCount;
  import jalview.datamodel.ResidueCount.SymbolCounts;
  import jalview.datamodel.SequenceI;
  import jalview.ext.android.SparseIntArray;
-import jalview.schemes.ResidueProperties;
  import jalview.util.Comparison;
  import jalview.util.Format;
  import jalview.util.MappingUtils;
@@ -50,13 +48,26 @@ import java.util.List;
   * This class is used extensively in calculating alignment colourschemes that
   * depend on the amount of conservation in each alignment column.
   * 
+ * @author $author$
+ * @version $Revision$
   */
  public class AAFrequency
  {
-  private static final double LOG2 = Math.log(2);
-
    public static final String PROFILE = "P";
  
+  /*
+   * Quick look-up of String value of char 'A' to 'Z'
+   */
+  private static final String[] CHARS = new String['Z' - 'A' + 1];
+
+  static
+  {
+    for (char c = 'A'; c <= 'Z'; c++)
+    {
+      CHARS[c - 'A'] = String.valueOf(c);
+    }
+  }
+
    public static final ProfilesI calculate(List<SequenceI> list, int start,
            int end)
    {
@@ -181,52 +192,6 @@ public class AAFrequency
    }
  
    /**
-   * Returns the full set of profiles for a hidden Markov model. The underlying
-   * data is the raw probabilities of a residue being emitted at each node,
-   * however the profiles returned by this function contain the percentage
-   * chance of a residue emission.
-   * 
-   * @param hmm
-   * @param width
-   *          The width of the Profile array (Profiles) to be returned.
-   * @param start
-   *          The alignment column on which the first profile is based.
-   * @param end
-   *          The alignment column on which the last profile is based.
-   * @param removeBelowBackground
-   *          if true, symbols with a match emission probability less than
-   *          background frequency are ignored
-   * @return
-   */
-  public static ProfilesI calculateHMMProfiles(final HiddenMarkovModel hmm,
-          int width, int start, int end, boolean removeBelowBackground,
-          boolean infoLetterHeight)
-  {
-    ProfileI[] result = new ProfileI[width];
-    char[] symbols = hmm.getSymbols().toCharArray();
-    int symbolCount = symbols.length;
-    for (int column = start; column < end; column++)
-    {
-      ResidueCount counts = new ResidueCount();
-      for (char symbol : symbols)
-      {
-        int value = getAnalogueCount(hmm, column, symbol,
-                removeBelowBackground, infoLetterHeight);
-        counts.put(symbol, value);
-      }
-      int maxCount = counts.getModalCount();
-      String maxResidue = counts.getResiduesForCount(maxCount);
-      int gapCount = counts.getGapCount();
-      ProfileI profile = new Profile(symbolCount, gapCount, maxCount,
-              maxResidue);
-      profile.setCounts(counts);
-
-      result[column] = profile;
-    }
-    return new Profiles(result);
-  }
-
-  /**
     * Make an estimate of the profile size we are going to compute i.e. how many
     * different characters may be present in it. Overestimating has a cost of
     * using more memory than necessary. Underestimating has a cost of needing to
@@ -324,80 +289,9 @@ public class AAFrequency
    }
  
    /**
-   * Derive the information annotations to be added to the alignment for
-   * display. This does not recompute the raw data, but may be called on a
-   * change in display options, such as 'ignore below background frequency',
-   * which may in turn result in a change in the derived values.
+   * Derive the gap count annotation row.
     * 
-   * @param information
-   *          the annotation row to add annotations to
-   * @param profiles
-   *          the source information data
-   * @param startCol
-   *          start column (inclusive)
-   * @param endCol
-   *          end column (exclusive)
-   * @param ignoreGaps
-   *          if true, normalise residue percentages
-   * @param showSequenceLogo
-   *          if true include all information symbols, else just show modal
-   *          residue
-   */
-  public static float completeInformation(AlignmentAnnotation information,
-          ProfilesI profiles, int startCol, int endCol)
-  {
-    // long now = System.currentTimeMillis();
-    if (information == null || information.annotations == null)
-    {
-      /*
-       * called with a bad alignment annotation row 
-       * wait for it to be initialised properly
-       */
-      return 0;
-    }
-
-    float max = 0f;
-    SequenceI hmmSeq = information.sequenceRef;
-
-    int seqLength = hmmSeq.getLength();
-    if (information.annotations.length < seqLength)
-    {
-      return 0;
-    }
-
-    HiddenMarkovModel hmm = hmmSeq.getHMM();
-
-    for (int column = startCol; column < endCol; column++)
-    {
-      if (column >= seqLength)
-      {
-        // hmm consensus sequence is shorter than the alignment
-        break;
-      }
-      
-      float value = hmm.getInformationContent(column);
-      boolean isNaN = Float.isNaN(value);
-      if (!isNaN)
-      {
-        max = Math.max(max, value);
-      }
-
-      String description = isNaN ? null
-              : String.format("%.4f bits", value);
-      information.annotations[column] = new Annotation(
-              Character.toString(
-                      Character.toUpperCase(hmmSeq.getCharAt(column))),
-              description, ' ', value);
-    }
-
-    information.graphMax = max;
-    return max;
-  }
-
-  /**
-   * Derive the occupancy count annotation
-   * 
-   * @param occupancy
+   * @param gaprow
     *          the annotation row to add annotations to
     * @param profiles
     *          the source consensus data
@@ -406,11 +300,11 @@ public class AAFrequency
     * @param endCol
     *          end column (exclusive)
     */
-  public static void completeOccupancyAnnot(AlignmentAnnotation occupancy,
+  public static void completeGapAnnot(AlignmentAnnotation gaprow,
            ProfilesI profiles, int startCol, int endCol, long nseq)
    {
-    if (occupancy == null || occupancy.annotations == null
-            || occupancy.annotations.length < endCol)
+    if (gaprow == null || gaprow.annotations == null
+            || gaprow.annotations.length < endCol)
      {
        /*
         * called with a bad alignment annotation row 
@@ -419,8 +313,8 @@ public class AAFrequency
        return;
      }
      // always set ranges again
-    occupancy.graphMax = nseq;
-    occupancy.graphMin = 0;
+    gaprow.graphMax = nseq;
+    gaprow.graphMin = 0;
      double scale = 0.8 / nseq;
      for (int i = startCol; i < endCol; i++)
      {
@@ -431,7 +325,7 @@ public class AAFrequency
           * happens if sequences calculated over were 
           * shorter than alignment width
           */
-        occupancy.annotations[i] = null;
+        gaprow.annotations[i] = null;
          return;
        }
  
@@ -439,8 +333,7 @@ public class AAFrequency
  
        String description = "" + gapped;
  
-      occupancy.annotations[i] = new Annotation("", description, '\0',
-              gapped,
+      gaprow.annotations[i] = new Annotation("", description, '\0', gapped,
                jalview.util.ColorUtils.bleachColour(Color.DARK_GRAY,
                        (float) scale * gapped));
      }
@@ -556,7 +449,6 @@ public class AAFrequency
      return result;
    }
  
-
    /**
     * Extract a sorted extract of cDNA codon profile data. The returned array
     * contains
@@ -639,7 +531,7 @@ public class AAFrequency
      for (int col = 0; col < cols; col++)
      {
        // todo would prefer a Java bean for consensus data
-      Hashtable<String, int[]> columnHash = new Hashtable<>();
+      Hashtable<String, int[]> columnHash = new Hashtable<String, int[]>();
        // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1)
        int[] codonCounts = new int[66];
        codonCounts[0] = alignment.getSequences().size();
@@ -826,118 +718,4 @@ public class AAFrequency
      }
      return scale;
    }
-
-  /**
-   * Returns the sorted HMM profile for the given column of the alignment. The
-   * returned array contains
-   * 
-   * <pre>
-   *    [profileType=0, numberOfValues, 100, charValue1, percentage1, charValue2, percentage2, ...]
-   * in descending order of percentage value
-   * </pre>
-   * 
-   * @param hmm
-   * @param column
-   * @param removeBelowBackground
-   *          if true, ignores residues with probability less than their
-   *          background frequency
-   * @param infoHeight
-   *          if true, uses the log ratio 'information' measure to scale the
-   *          value
-   * @return
-   */
-  public static int[] extractHMMProfile(HiddenMarkovModel hmm, int column,
-          boolean removeBelowBackground, boolean infoHeight)
-  {
-    if (hmm == null)
-    {
-      return null;
-    }
-    String alphabet = hmm.getSymbols();
-    int size = alphabet.length();
-    char symbols[] = new char[size];
-    int values[] = new int[size];
-    int totalCount = 0;
-
-    for (int i = 0; i < size; i++)
-    {
-      char symbol = alphabet.charAt(i);
-      symbols[i] = symbol;
-      int value = getAnalogueCount(hmm, column, symbol,
-              removeBelowBackground, infoHeight);
-      values[i] = value;
-      totalCount += value;
-    }
-
-    /*
-     * sort symbols by increasing emission probability
-     */
-    QuickSort.sort(values, symbols);
-
-    int[] profile = new int[3 + size * 2];
-
-    profile[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
-    profile[1] = size;
-    profile[2] = 100;
-
-    /*
-     * order symbol/count profile by decreasing emission probability
-     */
-    if (totalCount != 0)
-    {
-      int arrayPos = 3;
-      for (int k = size - 1; k >= 0; k--)
-      {
-        Float percentage;
-        int value = values[k];
-        if (removeBelowBackground)
-        {
-          percentage = ((float) value) / totalCount * 100f;
-        }
-        else
-        {
-          percentage = value / 100f;
-        }
-        int intPercent = Math.round(percentage);
-        profile[arrayPos] = symbols[k];
-        profile[arrayPos + 1] = intPercent;
-        arrayPos += 2;
-      }
-    }
-    return profile;
-  }
-
-  /**
-   * Converts the emission probability of a residue at a column in the alignment
-   * to a 'count', suitable for rendering as an annotation value
-   * 
-   * @param hmm
-   * @param column
-   * @param symbol
-   * @param removeBelowBackground
-   *          if true, returns 0 for any symbol with a match emission
-   *          probability less than the background frequency
-   * @infoHeight if true, uses the log ratio 'information content' to scale the
-   *             value
-   * @return
-   */
-  static int getAnalogueCount(HiddenMarkovModel hmm, int column,
-          char symbol, boolean removeBelowBackground, boolean infoHeight)
-  {
-    double value = hmm.getMatchEmissionProbability(column, symbol);
-    double freq = ResidueProperties.backgroundFrequencies
-            .get(hmm.getAlphabetType()).get(symbol);
-    if (value < freq && removeBelowBackground)
-    {
-      return 0;
-    }
-
-    if (infoHeight)
-    {
-      value = value * (Math.log(value / freq) / LOG2);
-    }
-
-    value = value * 10000d;
-    return Math.round((float) value);
-  }
  }