JAL-3202 discard zero percentages in extracted profile

author gmungoc <g.m.carstairs@dundee.ac.uk>

Wed, 27 Feb 2019 12:31:04 +0000 (12:31 +0000)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Wed, 27 Feb 2019 12:40:56 +0000 (12:40 +0000)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 27 Feb 2019 12:31:04 +0000 (12:31 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 27 Feb 2019 12:40:56 +0000 (12:40 +0000)
diff --git a/src/jalview/analysis/AAFrequency.java b/src/jalview/analysis/AAFrequency.java

index e4f2dfa..a1b0325 100755 (executable)
--- a/src/jalview/analysis/AAFrequency.java
+++ b/src/jalview/analysis/AAFrequency.java
@@ -398,7 +398,7 @@ public class AAFrequency
     * contains
     * 
     * <pre>
-   *    [profileType, numberOfValues, nonGapCount, charValue1, percentage1, charValue2, percentage2, ...]
+   *    [profileType, numberOfValues, totalPercent, charValue1, percentage1, charValue2, percentage2, ...]
     * in descending order of percentage value
     * </pre>
     * 
@@ -411,7 +411,6 @@ public class AAFrequency
     */
    public static int[] extractProfile(ProfileI profile, boolean ignoreGaps)
    {
-    int[] rtnval = new int[64];
      ResidueCount counts = profile.getCounts();
      if (counts == null)
      {
@@ -422,7 +421,6 @@ public class AAFrequency
      char[] symbols = symbolCounts.symbols;
      int[] values = symbolCounts.values;
      QuickSort.sort(values, symbols);
-    int nextArrayPos = 2;
      int totalPercentage = 0;
      final int divisor = ignoreGaps ? profile.getNonGapped()
              : profile.getHeight();
@@ -430,21 +428,44 @@ public class AAFrequency
      /*
       * traverse the arrays in reverse order (highest counts first)
       */
+    int[] result = new int[3 + 2 * symbols.length];
+    int nextArrayPos = 3;
+    int nonZeroCount = 0;
+
      for (int i = symbols.length - 1; i >= 0; i--)
      {
        int theChar = symbols[i];
        int charCount = values[i];
-
-      rtnval[nextArrayPos++] = theChar;
        final int percentage = (charCount * 100) / divisor;
-      rtnval[nextArrayPos++] = percentage;
+      if (percentage == 0)
+      {
+        /*
+         * this count (and any remaining) round down to 0% - discard
+         */
+        break;
+      }
+      nonZeroCount++;
+      result[nextArrayPos++] = theChar;
+      result[nextArrayPos++] = percentage;
        totalPercentage += percentage;
      }
-    rtnval[0] = symbols.length;
-    rtnval[1] = totalPercentage;
-    int[] result = new int[rtnval.length + 1];
+
+    /*
+     * truncate array if any zero values were discarded
+     */
+    if (nonZeroCount < symbols.length)
+    {
+      int[] tmp = new int[3 + 2 * nonZeroCount];
+      System.arraycopy(result, 0, tmp, 0, tmp.length);
+      result = tmp;
+    }
+
+    /*
+     * fill in 'header' values
+     */
      result[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
-    System.arraycopy(rtnval, 0, result, 1, rtnval.length);
+    result[1] = nonZeroCount;
+    result[2] = totalPercentage;
  
      return result;
    }
@@ -454,7 +475,7 @@ public class AAFrequency
     * contains
     * 
     * <pre>
-   *    [profileType, numberOfValues, totalCount, charValue1, percentage1, charValue2, percentage2, ...]
+   *    [profileType, numberOfValues, totalPercentage, charValue1, percentage1, charValue2, percentage2, ...]
     * in descending order of percentage value, where the character values encode codon triplets
     * </pre>
     * 
@@ -492,9 +513,16 @@ public class AAFrequency
        {
          break; // nothing else of interest here
        }
+      final int percentage = codonCount * 100 / divisor;
+      if (percentage == 0)
+      {
+        /*
+         * this (and any remaining) values rounded down to 0 - discard
+         */
+        break;
+      }
        distinctValuesCount++;
        result[j++] = codons[i];
-      final int percentage = codonCount * 100 / divisor;
        result[j++] = percentage;
        totalPercentage += percentage;
      }
@@ -531,7 +559,7 @@ public class AAFrequency
      for (int col = 0; col < cols; col++)
      {
        // todo would prefer a Java bean for consensus data
-      Hashtable<String, int[]> columnHash = new Hashtable<String, int[]>();
+      Hashtable<String, int[]> columnHash = new Hashtable<>();
        // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1)
        int[] codonCounts = new int[66];
        codonCounts[0] = alignment.getSequences().size();
diff --git a/src/jalview/renderer/AnnotationRenderer.java b/src/jalview/renderer/AnnotationRenderer.java

index ed266ae..17bc6df 100644 (file)
--- a/src/jalview/renderer/AnnotationRenderer.java
+++ b/src/jalview/renderer/AnnotationRenderer.java
@@ -470,8 +470,10 @@ public class AnnotationRenderer
              .getAlignmentStrucConsensusAnnotation();
      final AlignmentAnnotation complementConsensusAnnot = av
              .getComplementConsensusAnnotation();
-    boolean renderHistogram = true, renderProfile = true,
-            normaliseProfile = false, isRNA = rna;
+    boolean renderHistogram = true;
+    boolean renderProfile = false;
+    boolean normaliseProfile = false;
+    boolean isRNA = rna;
  
      BitSet graphGroupDrawn = new BitSet();
      int charOffset = 0; // offset for a label
@@ -1448,7 +1450,13 @@ public class AnnotationRenderer
              }
              // next profl[] position is profile % for the character(s)
              
-            double newHeight = normaliseFactor * scale * profl[c++];
+            int percent = profl[c++];
+            if (percent == 0)
+            {
+              // failsafe in case a count rounds down to 0%
+              continue;
+            }
+            double newHeight = normaliseFactor * scale * percent;
  
              /*
               * Set character colour as per alignment colour scheme; use the
diff --git a/test/jalview/analysis/AAFrequencyTest.java b/test/jalview/analysis/AAFrequencyTest.java

index 75fb39e..93c95ce 100644 (file)
--- a/test/jalview/analysis/AAFrequencyTest.java
+++ b/test/jalview/analysis/AAFrequencyTest.java
@@ -25,12 +25,16 @@ import static org.testng.AssertJUnit.assertNull;
  
  import jalview.datamodel.AlignmentAnnotation;
  import jalview.datamodel.Annotation;
+import jalview.datamodel.Profile;
  import jalview.datamodel.ProfileI;
  import jalview.datamodel.ProfilesI;
+import jalview.datamodel.ResidueCount;
  import jalview.datamodel.Sequence;
  import jalview.datamodel.SequenceI;
  import jalview.gui.JvOptionPane;
  
+import java.util.Hashtable;
+
  import org.testng.annotations.BeforeClass;
  import org.testng.annotations.Test;
  
@@ -232,4 +236,183 @@ public class AAFrequencyTest
      assertEquals("T 75%", ann.description);
      assertEquals("T", ann.displayCharacter);
    }
+
+  /**
+   * Tests profile extraction ignoring gaps, with 0% values dropped (JAL-3202)
+   */
+  @Test(groups = { "Functional" })
+  public void testExtractProfile()
+  {
+    /*
+     * 200 sequences of which 30 gapped (170 ungapped)
+     * max count 70 for modal residue 'G'
+     */
+    ProfileI profile = new Profile(200, 30, 70, "G");
+    ResidueCount counts = new ResidueCount();
+    counts.put('G', 70);
+    counts.put('R', 60);
+    counts.put('L', 38);
+    counts.put('H', 2);
+    profile.setCounts(counts);
+
+    /*
+     * [profileType, noOfValues, totalPercent, char1, percent1, ...]
+     * G: 70/170 = 41.2% truncated to 41%
+     * R: 60/170 = 35.3% truncated to 35%
+     * L: 38/170 = 22.4% truncated to 22%
+     * H: 2/170 = 1.2% truncated to 1%
+     * total of the truncated percentages = 99
+     */
+    int[] extracted = AAFrequency.extractProfile(profile, true);
+    int[] expected = new int[] { 0, 4, 99, 'G', 41, 'R', 35, 'L', 22, 'H',
+        1 };
+    org.testng.Assert.assertEquals(extracted, expected);
+
+    /*
+     * add two counts of 1 (1/170 = 0.6%); these truncate to 0% and are dropped
+     */
+    counts.put('G', 68); // 68/170 = 40% exactly (percentages now total 98)
+    counts.put('Q', 1);
+    counts.put('K', 1);
+    extracted = AAFrequency.extractProfile(profile, true);
+    expected = new int[] { 0, 4, 98, 'G', 40, 'R', 35, 'L', 22, 'H', 1 };
+    org.testng.Assert.assertEquals(extracted, expected);
+
+  }
+
+  /**
+   * Tests for the profile calculation where gaps are included i.e. the
+   * denominator is the total number of sequences in the column
+   */
+  @Test(groups = { "Functional" })
+  public void testExtractProfile_countGaps()
+  {
+    /*
+     * 200 sequences of which 30 gapped (170 ungapped)
+     * max count 70 for modal residue 'G'
+     */
+    ProfileI profile = new Profile(200, 30, 70, "G");
+    ResidueCount counts = new ResidueCount();
+    counts.put('G', 70);
+    counts.put('R', 60);
+    counts.put('L', 38);
+    counts.put('H', 2);
+    profile.setCounts(counts);
+  
+    /*
+     * [profileType, noOfValues, totalPercent, char1, percent1, ...]
+     * G: 70/200 = 35%
+     * R: 60/200 = 30%
+     * L: 38/200 = 19%
+     * H: 2/200 = 1%
+     * total of the percentages = 85 (the 30 gapped sequences account for 15%)
+     */
+    int[] extracted = AAFrequency.extractProfile(profile, false);
+    int[] expected = new int[] { AlignmentAnnotation.SEQUENCE_PROFILE, 4,
+        85, 'G', 35, 'R', 30, 'L', 19, 'H',
+        1 };
+    org.testng.Assert.assertEquals(extracted, expected);
+  
+    /*
+     * add two counts of 1 (1/200 = 0.5%); these truncate to 0% and are dropped
+     */
+    counts.put('G', 68); // 68/200 = 34%
+    counts.put('Q', 1);
+    counts.put('K', 1);
+    extracted = AAFrequency.extractProfile(profile, false);
+    expected = new int[] { AlignmentAnnotation.SEQUENCE_PROFILE, 4, 84, 'G',
+        34, 'R', 30, 'L', 19, 'H', 1 };
+    org.testng.Assert.assertEquals(extracted, expected);
+  
+  }
+
+  @Test(groups = { "Functional" })
+  public void testExtractCdnaProfile()
+  {
+    /*
+     * 120 sequences of which 10 gapped (110 ungapped)
+     * max count 70 for modal codon 'cCA' (codon2); gaps are ignored here
+     */
+    Hashtable profile = new Hashtable();
+
+    /*
+     * cdna profile is {seqCount, ungappedCount, codonCount1, ...codonCount64}
+     * where the count for a codon is held at index (2 + its encoded value)
+     * see CodingUtils.encodeCodon()
+     */
+    int[] codonCounts = new int[66];
+    char[] codon1 = new char[] { 'G', 'C', 'A' };
+    char[] codon2 = new char[] { 'c', 'C', 'A' };
+    char[] codon3 = new char[] { 't', 'g', 'A' };
+    char[] codon4 = new char[] { 'G', 'C', 't' };
+    int encoded1 = CodingUtils.encodeCodon(codon1);
+    int encoded2 = CodingUtils.encodeCodon(codon2);
+    int encoded3 = CodingUtils.encodeCodon(codon3);
+    int encoded4 = CodingUtils.encodeCodon(codon4);
+    codonCounts[2 + encoded1] = 30;
+    codonCounts[2 + encoded2] = 70;
+    codonCounts[2 + encoded3] = 9;
+    codonCounts[2 + encoded4] = 1;
+    codonCounts[0] = 120;
+    codonCounts[1] = 110;
+    profile.put(AAFrequency.PROFILE, codonCounts);
+  
+    /*
+     * [profileType, noOfValues, totalPercent, encodedCodon1, percent1, ...]
+     * codon1: 30/110 = 27.3% truncated to 27%
+     * codon2: 70/110 = 63.6% truncated to 63% (highest, so reported first)
+     * codon3: 9/110 = 8.2% truncated to 8%
+     * codon4: 1/110 = 0.9% truncated to 0% so should be discarded
+     * total of the truncated percentages = 98
+     */
+    int[] extracted = AAFrequency.extractCdnaProfile(profile, true);
+    int[] expected = new int[] { AlignmentAnnotation.CDNA_PROFILE, 3, 98,
+        encoded2, 63, encoded1, 27, encoded3, 8 };
+    org.testng.Assert.assertEquals(extracted, expected);
+  }
+
+  @Test(groups = { "Functional" })
+  public void testExtractCdnaProfile_countGaps()
+  {
+    /*
+     * 120 sequences of which 10 gapped (110 ungapped)
+     * max count 70 for modal codon 'cCA' (codon2); gaps are counted here
+     */
+    Hashtable profile = new Hashtable();
+  
+    /*
+     * cdna profile is {seqCount, ungappedCount, codonCount1, ...codonCount64}
+     * where the count for a codon is held at index (2 + its encoded value)
+     * see CodingUtils.encodeCodon()
+     */
+    int[] codonCounts = new int[66];
+    char[] codon1 = new char[] { 'G', 'C', 'A' };
+    char[] codon2 = new char[] { 'c', 'C', 'A' };
+    char[] codon3 = new char[] { 't', 'g', 'A' };
+    char[] codon4 = new char[] { 'G', 'C', 't' };
+    int encoded1 = CodingUtils.encodeCodon(codon1);
+    int encoded2 = CodingUtils.encodeCodon(codon2);
+    int encoded3 = CodingUtils.encodeCodon(codon3);
+    int encoded4 = CodingUtils.encodeCodon(codon4);
+    codonCounts[2 + encoded1] = 30;
+    codonCounts[2 + encoded2] = 70;
+    codonCounts[2 + encoded3] = 9;
+    codonCounts[2 + encoded4] = 1;
+    codonCounts[0] = 120;
+    codonCounts[1] = 110;
+    profile.put(AAFrequency.PROFILE, codonCounts);
+  
+    /*
+     * [profileType, noOfValues, totalPercent, encodedCodon1, percent1, ...]
+     * codon1: 30/120 = 25%
+     * codon2: 70/120 = 58.3% truncated to 58% (highest, so reported first)
+     * codon3: 9/120 = 7.5% truncated to 7%
+     * codon4: 1/120 = 0.8% truncated to 0% so should be discarded
+     * total of the truncated percentages = 90
+     */
+    int[] extracted = AAFrequency.extractCdnaProfile(profile, false);
+    int[] expected = new int[] { AlignmentAnnotation.CDNA_PROFILE, 3, 90,
+        encoded2, 58, encoded1, 25, encoded3, 7 };
+    org.testng.Assert.assertEquals(extracted, expected);
+  }
  }
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Wed, 27 Feb 2019 12:31:04 +0000 (12:31 +0000)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Wed, 27 Feb 2019 12:40:56 +0000 (12:40 +0000)
src/jalview/analysis/AAFrequency.java		patch \| blob \| history
src/jalview/renderer/AnnotationRenderer.java		patch \| blob \| history
test/jalview/analysis/AAFrequencyTest.java		patch \| blob \| history