From: Mateusz Warowny Date: Thu, 14 Sep 2023 15:10:43 +0000 (+0200) Subject: Merge branch 'mmw/bug/JAL-4241-annotation-alignment-fix-slivka' into development... X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=81682dafca2e79cf141b134496db0c3a84027806;hp=54844a3baecd8bd04d948897180e2bfc1a293e91;p=jalview.git Merge branch 'mmw/bug/JAL-4241-annotation-alignment-fix-slivka' into development/Release_2_12_Branch --- diff --git a/src/jalview/analysis/SeqsetUtils.java b/src/jalview/analysis/SeqsetUtils.java index adb70e3..5420aff 100755 --- a/src/jalview/analysis/SeqsetUtils.java +++ b/src/jalview/analysis/SeqsetUtils.java @@ -30,6 +30,7 @@ import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import java.util.ArrayList; +import java.util.BitSet; import java.util.Enumeration; import java.util.HashMap; import java.util.Hashtable; @@ -40,6 +41,8 @@ import java.util.Optional; import java.util.Vector; import static java.lang.String.format; +import java.nio.CharBuffer; + public class SeqsetUtils { public static class SequenceInfo { @@ -90,6 +93,32 @@ public class SeqsetUtils } /** + * Filter the sequence through the mask leaving only characters at positions + * where the mask value was true. The length of the resulting array is + * the cardinality of the mask from 0 to sequence length. + * + * @param sequence + * input sequence + * @param mask + * mask used to filter the sequence characters + * @return input array filtered through the mask + */ + public static char[] filterSequence(char[] sequence, BitSet mask) + { + mask = mask.get(0, sequence.length); + char[] result = new char[mask.cardinality()]; + for (int i = mask.nextSetBit(0), j = 0; i >= 0;) + { + result[j++] = sequence[i]; + if (i == Integer.MAX_VALUE) + // prevents integer overflow of (i + 1) + break; + i = mask.nextSetBit(i + 1); + } + return result; + } + + /** * Recover essential properties of a sequence from a hashtable TODO: replace * these methods with something more elegant. * diff --git a/src/jalview/datamodel/Sequence.java b/src/jalview/datamodel/Sequence.java index 4230366..89c55e6 100755 --- a/src/jalview/datamodel/Sequence.java +++ b/src/jalview/datamodel/Sequence.java @@ -594,6 +594,14 @@ public class Sequence extends ASequence implements SequenceI return this.sequence.length; } + @Override + public void setSequence(char[] seq) + { + this.sequence = Arrays.copyOf(seq, seq.length); + checkValidRange(); + sequenceChanged(); + } + /** * DOCUMENT ME! * diff --git a/src/jalview/datamodel/SequenceI.java b/src/jalview/datamodel/SequenceI.java index 11aa4e6..82575ec 100755 --- a/src/jalview/datamodel/SequenceI.java +++ b/src/jalview/datamodel/SequenceI.java @@ -101,6 +101,14 @@ public interface SequenceI extends ASequenceI public int getLength(); /** + * Replace the sequence with the given characters + * + * @param sequence + * new sequence characters + */ + public void setSequence(char[] sequence); + + /** * Replace the sequence with the given string * * @param sequence diff --git a/src/jalview/ws2/actions/annotation/AnnotationJob.java b/src/jalview/ws2/actions/annotation/AnnotationJob.java index 467dafd..6a1053e 100644 --- a/src/jalview/ws2/actions/annotation/AnnotationJob.java +++ b/src/jalview/ws2/actions/annotation/AnnotationJob.java @@ -1,6 +1,7 @@ package jalview.ws2.actions.annotation; import java.util.ArrayList; +import java.util.Arrays; import java.util.BitSet; import java.util.HashMap; import java.util.List; @@ -26,7 +27,7 @@ public class AnnotationJob extends BaseJob final int minSize; public AnnotationJob(List inputSeqs, boolean[] gapMap, - Map seqNames, int start, int end, int minSize) + Map seqNames, int start, int end, int minSize) { super(inputSeqs); this.gapMap = gapMap; @@ -47,10 +48,10 @@ public class AnnotationJob extends BaseJob } public static AnnotationJob create(SequenceCollectionI inputSeqs, - boolean bySequence, boolean submitGaps, boolean requireAligned, - boolean filterNonStandardResidues, int minSize) + boolean bySequence, boolean submitGaps, boolean requireAligned, + boolean filterNonStandardResidues, int minSize) { - List seqences = new ArrayList<>(); + List sequences = new ArrayList<>(); int minlen = 10; int width = 0; Map namesMap = bySequence ? new HashMap<>() : null; @@ -64,87 +65,104 @@ public class AnnotationJob extends BaseJob for (SequenceI sq : inputSeqs.getSequences()) { int sqLen = (bySequence) - ? sq.findPosition(end + 1) - sq.findPosition(start + 1) - : sq.getEnd() - sq.getStart(); + ? sq.findPosition(end + 1) - sq.findPosition(start + 1) + : sq.getEnd() - sq.getStart(); if (sqLen < minlen) continue; - String newName = SeqsetUtils.unique_name(seqences.size() + 1); + width = Math.max(width, sq.getLength()); + String newName = SeqsetUtils.unique_name(sequences.size() + 1); if (namesMap != null) namesMap.put(newName, sq); + char[] seqChars = sq.getSequence(start, end + 1); + if (filterNonStandardResidues) + replaceNonStandardResidues(seqChars, Comparison.GAP_DASH, + sq.isProtein()); Sequence seq; if (submitGaps) { - seq = new Sequence(newName, sq.getSequenceAsString()); - updateResidueMap(residueMap, seq, filterNonStandardResidues); - } - else + seq = new Sequence(newName, seqChars); + updateResidueMap(residueMap, seq); + } else { // TODO: add ability to exclude hidden regions seq = new Sequence(newName, - AlignSeq.extractGaps(Comparison.GapChars, - sq.getSequenceAsString(start, end + 1))); + AlignSeq.extractGaps(Comparison.GapChars, new String(seqChars))); // for annotation need to also record map to sequence start/end // position in range // then transfer back to original sequence on return. } - seqences.add(seq); - width = Math.max(width, seq.getLength()); - } - - if (requireAligned && submitGaps) - { - for (int i = 0; i < seqences.size(); i++) - { - SequenceI sq = seqences.get(i); - char[] padded = fitSequenceToResidueMap(sq.getSequence(), - residueMap); - seqences.set(i, new Sequence(sq.getName(), padded)); - } + sequences.add(seq); } boolean[] gapMapArray = null; if (submitGaps) { + adjustColumns(sequences, residueMap, requireAligned); gapMapArray = new boolean[width]; for (int i = 0; i < width; i++) gapMapArray[i] = residueMap.get(i); } - return new AnnotationJob(seqences, gapMapArray, namesMap, start, end, - minSize); + return new AnnotationJob(sequences, gapMapArray, namesMap, start, end, + minSize); } - private static void updateResidueMap(BitSet residueMap, SequenceI seq, - boolean filterNonStandardResidues) + static void replaceNonStandardResidues(char[] seq, char replacement, + boolean isProtein) { - for (int pos : seq.gapMap()) + for (int i = 0; i < seq.length; i++) { - char sqchr = seq.getCharAt(pos); - boolean include = !filterNonStandardResidues; - include |= seq.isProtein() ? ResidueProperties.aaIndex[sqchr] < 20 - : ResidueProperties.nucleotideIndex[sqchr] < 5; - if (include) - residueMap.set(pos); + char chr = seq[i]; + if (isProtein ? ResidueProperties.aaIndex[chr] >= 20 + : ResidueProperties.nucleotideIndex[chr] >= 5) + { + seq[i] = replacement; + } } } /** - * Fits the sequence to the residue map removing empty columns where residue - * map is unset and padding the sequence with gaps at the end if needed. + * Add residue positions of the given sequence to the residues map. Perform an + * "or" operation between the given residue map and the inverse of the gap map + * of the given sequence. + * + * @param residueMap + * mapping to be updated in-place + * @param seq + * the sequence whose residue positions are added to the map + */ + static void updateResidueMap(BitSet residueMap, SequenceI seq) + { + var gaps = seq.gapBitset(); + gaps.flip(0, seq.getLength()); + residueMap.or(gaps); + } + + /** + * Remove columns not included in the mask from the sequences in-place. If + * {@code padToLength} is set, the shorter sequences are padded with gaps at + * the end. + * + * @param sequences + * list of sequences to be modified + * @param mask + * mask of columns that will remain + * @param padToLength + * if gaps should be added to the end of shorter sequences */ - private static char[] fitSequenceToResidueMap(char[] sequence, - BitSet residueMap) + static void adjustColumns(List sequences, BitSet mask, + boolean padToLength) { - int width = residueMap.cardinality(); - char[] padded = new char[width]; - for (int op = 0, pp = 0; pp < width; op++) + int width = mask.cardinality(); + for (SequenceI seq : sequences) { - if (residueMap.get(op)) + char[] chars = SeqsetUtils.filterSequence(seq.getSequence(), mask); + if (padToLength && chars.length < width) { - if (sequence.length > op) - padded[pp++] = sequence[op]; - else - padded[pp++] = '-'; + int limit = chars.length; + chars = Arrays.copyOf(chars, width); + Arrays.fill(chars, limit, chars.length, Comparison.GAP_DASH); } + seq.setEnd(seq.getStart()); + seq.setSequence(chars); } - return padded; } } diff --git a/src/jalview/ws2/actions/annotation/AnnotationTask.java b/src/jalview/ws2/actions/annotation/AnnotationTask.java index 6cb5aa6..1d02264 100644 --- a/src/jalview/ws2/actions/annotation/AnnotationTask.java +++ b/src/jalview/ws2/actions/annotation/AnnotationTask.java @@ -162,6 +162,8 @@ public class AnnotationTask } } + // TODO: review and test + // may produce wrong output if annotations longer than gapMap private Annotation[] createGappedAnnotations(Annotation[] annotations, boolean[] gapMap) { diff --git a/test/jalview/analysis/SeqsetUtilsTest.java b/test/jalview/analysis/SeqsetUtilsTest.java index 71e5bfd..fbd5812 100644 --- a/test/jalview/analysis/SeqsetUtilsTest.java +++ b/test/jalview/analysis/SeqsetUtilsTest.java @@ -27,11 +27,16 @@ import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.gui.JvOptionPane; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.equalTo; + +import java.util.BitSet; import java.util.Hashtable; import java.util.Map; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; /** @@ -82,4 +87,29 @@ public class SeqsetUtilsTest Assert.assertSame(sqset[0].getSequenceFeatures().get(1), sqset2[0].getSequenceFeatures().get(1)); } + + @DataProvider + public Object[][] sequenceAndMask() + { + return new Object[][] { + { "AAAABBBBCCCCDDDD", 0xFFFFL, "AAAABBBBCCCCDDDD" }, + { "AAAABBBBCCCCDDDD", 0x000FL, "AAAA" }, + { "---A---B---C---D", 0x8888L, "ABCD" }, + { "---A---B---C---D", 0x9999L, "-A-B-C-D" }, + { "ABCDABCDABCDABCD", 0xC5A3L, "ABBDACCD" }, + { "", 0xFFFFL, "" }, + { "AAAABBBBCCCCDDDD", 0x0000L, "" }, + { "AAABBBCCC", 0xFFFF, "AAABBBCCC" }, + { "AAAABBBB", 0xD000L, "" }, + { "AAAABBBB", 0xAA0AL, "AA" }, + }; + } + + @Test(groups = {"Functional"}, dataProvider = "sequenceAndMask") + public void testFilterSequence(String sequence, long mask, String expected) + { + BitSet bitMask = BitSet.valueOf(new long[] {mask}); + var result = SeqsetUtils.filterSequence(sequence.toCharArray(), bitMask); + assertThat(result, equalTo(expected.toCharArray())); + } } diff --git a/test/jalview/ws2/actions/annotation/AnnotationJobTest.java b/test/jalview/ws2/actions/annotation/AnnotationJobTest.java index 4e3767f..183d6f3 100644 --- a/test/jalview/ws2/actions/annotation/AnnotationJobTest.java +++ b/test/jalview/ws2/actions/annotation/AnnotationJobTest.java @@ -172,7 +172,7 @@ public class AnnotationJobTest } @Test(groups = { "Functional"} ) - public void testCreate_ContainsNonStandardAndNoFilterNonStandard_NonStandardToGap() + public void testCreate_ContainsNonStandardAndNoFilterNonStandard_NonStandardRemain() { var alignment = new Alignment(new Sequence[] { new Sequence("test1", "ACACAOACACAC"), @@ -180,6 +180,20 @@ public class AnnotationJobTest }); var annotJob = AnnotationJob.create(alignment, true, true, true, false, 0); assertThat(annotJob.getInputSequences(), contains( + matchesSequenceString("ACACAOACACAC"), + matchesSequenceString("ABAVAVAVABAV") + )); + } + + @Test(groups = { "Functional"} ) + public void testCreate_ContainsNonStandardAndFilterNonStandard_NonStandardToGap() + { + var alignment = new Alignment(new Sequence[] { + new Sequence("test1", "ACACAOACACAC"), + new Sequence("test2", "ABAVAVAVABAV") + }); + var annotJob = AnnotationJob.create(alignment, true, true, true, true, 0); + assertThat(annotJob.getInputSequences(), contains( matchesSequenceString("ACACA-ACACAC"), matchesSequenceString("A-AVAVAVA-AV") ));