Merge branch 'mmw/bug/JAL-4241-annotation-alignment-fix-slivka' into development...

author Mateusz Warowny <mmzwarowny@dundee.ac.uk>

Thu, 14 Sep 2023 15:10:43 +0000 (17:10 +0200)

committer Mateusz Warowny <mmzwarowny@dundee.ac.uk>

Thu, 14 Sep 2023 15:10:43 +0000 (17:10 +0200)
author Mateusz Warowny <mmzwarowny@dundee.ac.uk>
Thu, 14 Sep 2023 15:10:43 +0000 (17:10 +0200)
committer Mateusz Warowny <mmzwarowny@dundee.ac.uk>
Thu, 14 Sep 2023 15:10:43 +0000 (17:10 +0200)
diff --git a/src/jalview/analysis/SeqsetUtils.java b/src/jalview/analysis/SeqsetUtils.java

index adb70e3..5420aff 100755 (executable)
--- a/src/jalview/analysis/SeqsetUtils.java
+++ b/src/jalview/analysis/SeqsetUtils.java
@@ -30,6 +30,7 @@ import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceI;
  
  import java.util.ArrayList;
+import java.util.BitSet;
  import java.util.Enumeration;
  import java.util.HashMap;
  import java.util.Hashtable;
@@ -40,6 +41,8 @@ import java.util.Optional;
  import java.util.Vector;
  import static java.lang.String.format;
  
+import java.nio.CharBuffer;
+
  public class SeqsetUtils
  {
    public static class SequenceInfo {
@@ -90,6 +93,32 @@ public class SeqsetUtils
    }
  
    /**
+   * Filter the sequence through the mask leaving only characters at positions
+   * where the mask value was true. The length of the resulting array is
+   * the cardinality of the mask from 0 to sequence length.
+   *
+   * @param sequence
+   *          input sequence
+   * @param mask
+   *          mask used to filter the sequence characters
+   * @return input array filtered through the mask
+   */
+  public static char[] filterSequence(char[] sequence, BitSet mask)
+  {
+    mask = mask.get(0, sequence.length);
+    char[] result = new char[mask.cardinality()];
+    for (int i = mask.nextSetBit(0), j = 0; i >= 0;)
+    {
+      result[j++] = sequence[i];
+      if (i == Integer.MAX_VALUE)
+        // prevents integer overflow of (i + 1)
+        break;
+      i = mask.nextSetBit(i + 1);
+    }
+    return result;
+  }
+
+  /**
     * Recover essential properties of a sequence from a hashtable TODO: replace
     * these methods with something more elegant.
     * 
diff --git a/src/jalview/datamodel/Sequence.java b/src/jalview/datamodel/Sequence.java

index 4230366..89c55e6 100755 (executable)
--- a/src/jalview/datamodel/Sequence.java
+++ b/src/jalview/datamodel/Sequence.java
@@ -594,6 +594,14 @@ public class Sequence extends ASequence implements SequenceI
      return this.sequence.length;
    }
  
+  @Override
+  public void setSequence(char[] seq)
+  {
+    this.sequence = Arrays.copyOf(seq, seq.length);
+    checkValidRange();
+    sequenceChanged();
+  }
+
    /**
     * DOCUMENT ME!
     * 
diff --git a/src/jalview/datamodel/SequenceI.java b/src/jalview/datamodel/SequenceI.java

index 11aa4e6..82575ec 100755 (executable)
--- a/src/jalview/datamodel/SequenceI.java
+++ b/src/jalview/datamodel/SequenceI.java
@@ -101,6 +101,14 @@ public interface SequenceI extends ASequenceI
    public int getLength();
  
    /**
+   * Replace the sequence with the given characters
+   * 
+   * @param sequence
+   *          new sequence characters
+   */
+  public void setSequence(char[] sequence);
+
+  /**
     * Replace the sequence with the given string
     * 
     * @param sequence
diff --git a/src/jalview/ws2/actions/annotation/AnnotationJob.java b/src/jalview/ws2/actions/annotation/AnnotationJob.java

index 467dafd..6a1053e 100644 (file)
--- a/src/jalview/ws2/actions/annotation/AnnotationJob.java
+++ b/src/jalview/ws2/actions/annotation/AnnotationJob.java
@@ -1,6 +1,7 @@
  package jalview.ws2.actions.annotation;
  
  import java.util.ArrayList;
+import java.util.Arrays;
  import java.util.BitSet;
  import java.util.HashMap;
  import java.util.List;
@@ -26,7 +27,7 @@ public class AnnotationJob extends BaseJob
    final int minSize;
  
    public AnnotationJob(List<SequenceI> inputSeqs, boolean[] gapMap,
-          Map<String, SequenceI> seqNames, int start, int end, int minSize)
+      Map<String, SequenceI> seqNames, int start, int end, int minSize)
    {
      super(inputSeqs);
      this.gapMap = gapMap;
@@ -47,10 +48,10 @@ public class AnnotationJob extends BaseJob
    }
  
    public static AnnotationJob create(SequenceCollectionI inputSeqs,
-          boolean bySequence, boolean submitGaps, boolean requireAligned,
-          boolean filterNonStandardResidues, int minSize)
+      boolean bySequence, boolean submitGaps, boolean requireAligned,
+      boolean filterNonStandardResidues, int minSize)
    {
-    List<SequenceI> seqences = new ArrayList<>();
+    List<SequenceI> sequences = new ArrayList<>();
      int minlen = 10;
      int width = 0;
      Map<String, SequenceI> namesMap = bySequence ? new HashMap<>() : null;
@@ -64,87 +65,104 @@ public class AnnotationJob extends BaseJob
      for (SequenceI sq : inputSeqs.getSequences())
      {
        int sqLen = (bySequence)
-              ? sq.findPosition(end + 1) - sq.findPosition(start + 1)
-              : sq.getEnd() - sq.getStart();
+          ? sq.findPosition(end + 1) - sq.findPosition(start + 1)
+          : sq.getEnd() - sq.getStart();
        if (sqLen < minlen)
          continue;
-      String newName = SeqsetUtils.unique_name(seqences.size() + 1);
+      width = Math.max(width, sq.getLength());
+      String newName = SeqsetUtils.unique_name(sequences.size() + 1);
        if (namesMap != null)
          namesMap.put(newName, sq);
+      char[] seqChars = sq.getSequence(start, end + 1);
+      if (filterNonStandardResidues)
+        replaceNonStandardResidues(seqChars, Comparison.GAP_DASH,
+            sq.isProtein());
        Sequence seq;
        if (submitGaps)
        {
-        seq = new Sequence(newName, sq.getSequenceAsString());
-        updateResidueMap(residueMap, seq, filterNonStandardResidues);
-      }
-      else
+        seq = new Sequence(newName, seqChars);
+        updateResidueMap(residueMap, seq);
+      } else
        {
          // TODO: add ability to exclude hidden regions
          seq = new Sequence(newName,
-                AlignSeq.extractGaps(Comparison.GapChars,
-                        sq.getSequenceAsString(start, end + 1)));
+            AlignSeq.extractGaps(Comparison.GapChars, new String(seqChars)));
          // for annotation need to also record map to sequence start/end
          // position in range
          // then transfer back to original sequence on return.
        }
-      seqences.add(seq);
-      width = Math.max(width, seq.getLength());
-    }
-
-    if (requireAligned && submitGaps)
-    {
-      for (int i = 0; i < seqences.size(); i++)
-      {
-        SequenceI sq = seqences.get(i);
-        char[] padded = fitSequenceToResidueMap(sq.getSequence(),
-                residueMap);
-        seqences.set(i, new Sequence(sq.getName(), padded));
-      }
+      sequences.add(seq);
      }
      boolean[] gapMapArray = null;
      if (submitGaps)
      {
+      adjustColumns(sequences, residueMap, requireAligned);
        gapMapArray = new boolean[width];
        for (int i = 0; i < width; i++)
          gapMapArray[i] = residueMap.get(i);
      }
-    return new AnnotationJob(seqences, gapMapArray, namesMap, start, end,
-            minSize);
+    return new AnnotationJob(sequences, gapMapArray, namesMap, start, end,
+        minSize);
    }
  
-  private static void updateResidueMap(BitSet residueMap, SequenceI seq,
-          boolean filterNonStandardResidues)
+  static void replaceNonStandardResidues(char[] seq, char replacement,
+      boolean isProtein)
    {
-    for (int pos : seq.gapMap())
+    for (int i = 0; i < seq.length; i++)
      {
-      char sqchr = seq.getCharAt(pos);
-      boolean include = !filterNonStandardResidues;
-      include |= seq.isProtein() ? ResidueProperties.aaIndex[sqchr] < 20
-              : ResidueProperties.nucleotideIndex[sqchr] < 5;
-      if (include)
-        residueMap.set(pos);
+      char chr = seq[i];
+      if (isProtein ? ResidueProperties.aaIndex[chr] >= 20
+          : ResidueProperties.nucleotideIndex[chr] >= 5)
+      {
+        seq[i] = replacement;
+      }
      }
    }
  
    /**
-   * Fits the sequence to the residue map removing empty columns where residue
-   * map is unset and padding the sequence with gaps at the end if needed.
+   * Add residue positions of the given sequence to the residues map. Perform an
+   * "or" operation between the given residue map and the inverse of the gap map
+   * of the given sequence.
+   * 
+   * @param residueMap
+   *          mapping to be updated in-place
+   * @param seq
+   *          the sequence whose residue positions are added to the map
+   */
+  static void updateResidueMap(BitSet residueMap, SequenceI seq)
+  {
+    var gaps = seq.gapBitset();
+    gaps.flip(0, seq.getLength());
+    residueMap.or(gaps);
+  }
+
+  /**
+   * Remove columns not included in the mask from the sequences in-place. If
+   * {@code padToLength} is set, the shorter sequences are padded with gaps at
+   * the end.
+   * 
+   * @param sequences
+   *          list of sequences to be modified
+   * @param mask
+   *          mask of columns that will remain
+   * @param padToLength
+   *          if gaps should be added to the end of shorter sequences
     */
-  private static char[] fitSequenceToResidueMap(char[] sequence,
-          BitSet residueMap)
+  static void adjustColumns(List<SequenceI> sequences, BitSet mask,
+      boolean padToLength)
    {
-    int width = residueMap.cardinality();
-    char[] padded = new char[width];
-    for (int op = 0, pp = 0; pp < width; op++)
+    int width = mask.cardinality();
+    for (SequenceI seq : sequences)
      {
-      if (residueMap.get(op))
+      char[] chars = SeqsetUtils.filterSequence(seq.getSequence(), mask);
+      if (padToLength && chars.length < width)
        {
-        if (sequence.length > op)
-          padded[pp++] = sequence[op];
-        else
-          padded[pp++] = '-';
+        int limit = chars.length;
+        chars = Arrays.copyOf(chars, width);
+        Arrays.fill(chars, limit, chars.length, Comparison.GAP_DASH);
        }
+      seq.setEnd(seq.getStart());
+      seq.setSequence(chars);
      }
-    return padded;
    }
  }
diff --git a/src/jalview/ws2/actions/annotation/AnnotationTask.java b/src/jalview/ws2/actions/annotation/AnnotationTask.java

index 6cb5aa6..1d02264 100644 (file)
--- a/src/jalview/ws2/actions/annotation/AnnotationTask.java
+++ b/src/jalview/ws2/actions/annotation/AnnotationTask.java
@@ -162,6 +162,8 @@ public class AnnotationTask
      }
    }
  
+  // TODO: review and test
+  //       may produce wrong output if annotations longer than gapMap
    private Annotation[] createGappedAnnotations(Annotation[] annotations,
            boolean[] gapMap)
    {
diff --git a/test/jalview/analysis/SeqsetUtilsTest.java b/test/jalview/analysis/SeqsetUtilsTest.java

index 71e5bfd..fbd5812 100644 (file)
--- a/test/jalview/analysis/SeqsetUtilsTest.java
+++ b/test/jalview/analysis/SeqsetUtilsTest.java
@@ -27,11 +27,16 @@ import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceI;
  import jalview.gui.JvOptionPane;
  
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.equalTo;
+
+import java.util.BitSet;
  import java.util.Hashtable;
  import java.util.Map;
  
  import org.testng.Assert;
  import org.testng.annotations.BeforeClass;
+import org.testng.annotations.DataProvider;
  import org.testng.annotations.Test;
  
  /**
@@ -82,4 +87,29 @@ public class SeqsetUtilsTest
      Assert.assertSame(sqset[0].getSequenceFeatures().get(1),
              sqset2[0].getSequenceFeatures().get(1));
    }
+  
+  @DataProvider
+  public Object[][] sequenceAndMask()
+  {
+    return new Object[][] {
+      { "AAAABBBBCCCCDDDD", 0xFFFFL, "AAAABBBBCCCCDDDD" },
+      { "AAAABBBBCCCCDDDD", 0x000FL, "AAAA" },
+      { "---A---B---C---D", 0x8888L, "ABCD" },
+      { "---A---B---C---D", 0x9999L, "-A-B-C-D" },
+      { "ABCDABCDABCDABCD", 0xC5A3L, "ABBDACCD" },
+      { "", 0xFFFFL, "" },
+      { "AAAABBBBCCCCDDDD", 0x0000L, "" },
+      { "AAABBBCCC", 0xFFFF, "AAABBBCCC" },
+      { "AAAABBBB", 0xD000L, "" },
+      { "AAAABBBB", 0xAA0AL, "AA" },
+    };
+  }
+  
+  @Test(groups = {"Functional"}, dataProvider = "sequenceAndMask")
+  public void testFilterSequence(String sequence, long mask, String expected)
+  {
+    BitSet bitMask = BitSet.valueOf(new long[] {mask});
+    var result = SeqsetUtils.filterSequence(sequence.toCharArray(), bitMask);
+    assertThat(result, equalTo(expected.toCharArray()));
+  }
  }
diff --git a/test/jalview/ws2/actions/annotation/AnnotationJobTest.java b/test/jalview/ws2/actions/annotation/AnnotationJobTest.java

index 4e3767f..183d6f3 100644 (file)
--- a/test/jalview/ws2/actions/annotation/AnnotationJobTest.java
+++ b/test/jalview/ws2/actions/annotation/AnnotationJobTest.java
@@ -172,7 +172,7 @@ public class AnnotationJobTest
    }
  
    @Test(groups = { "Functional"} )
-  public void testCreate_ContainsNonStandardAndNoFilterNonStandard_NonStandardToGap()
+  public void testCreate_ContainsNonStandardAndNoFilterNonStandard_NonStandardRemain()
    {
      var alignment = new Alignment(new Sequence[] {
          new Sequence("test1", "ACACAOACACAC"),
@@ -180,6 +180,20 @@ public class AnnotationJobTest
      });
      var annotJob = AnnotationJob.create(alignment, true, true, true, false, 0);
      assertThat(annotJob.getInputSequences(), contains(
+            matchesSequenceString("ACACAOACACAC"),
+            matchesSequenceString("ABAVAVAVABAV")
+    ));
+  }
+
+  @Test(groups = { "Functional"} )
+  public void testCreate_ContainsNonStandardAndFilterNonStandard_NonStandardToGap()
+  {
+    var alignment = new Alignment(new Sequence[] {
+        new Sequence("test1", "ACACAOACACAC"),
+        new Sequence("test2", "ABAVAVAVABAV")
+    });
+    var annotJob = AnnotationJob.create(alignment, true, true, true, true, 0);
+    assertThat(annotJob.getInputSequences(), contains(
              matchesSequenceString("ACACA-ACACAC"),
              matchesSequenceString("A-AVAVAVA-AV")
      ));
author	Mateusz Warowny <mmzwarowny@dundee.ac.uk>
	Thu, 14 Sep 2023 15:10:43 +0000 (17:10 +0200)
committer	Mateusz Warowny <mmzwarowny@dundee.ac.uk>
	Thu, 14 Sep 2023 15:10:43 +0000 (17:10 +0200)
src/jalview/analysis/SeqsetUtils.java		patch \| blob \| history
src/jalview/datamodel/Sequence.java		patch \| blob \| history
src/jalview/datamodel/SequenceI.java		patch \| blob \| history
src/jalview/ws2/actions/annotation/AnnotationJob.java		patch \| blob \| history
src/jalview/ws2/actions/annotation/AnnotationTask.java		patch \| blob \| history
test/jalview/analysis/SeqsetUtilsTest.java		patch \| blob \| history
test/jalview/ws2/actions/annotation/AnnotationJobTest.java		patch \| blob \| history