From: Mateusz Warowny Date: Thu, 14 Sep 2023 15:09:03 +0000 (+0200) Subject: JAL-4241 Flter empty columns from annotation service input X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=refs%2Fheads%2Fmmw%2Fbug%2FJAL-4241-annotation-alignment-fix-slivka;p=jalview.git JAL-4241 Flter empty columns from annotation service input --- diff --git a/src/jalview/ws2/actions/annotation/AnnotationJob.java b/src/jalview/ws2/actions/annotation/AnnotationJob.java index 187c14c..6a1053e 100644 --- a/src/jalview/ws2/actions/annotation/AnnotationJob.java +++ b/src/jalview/ws2/actions/annotation/AnnotationJob.java @@ -1,6 +1,7 @@ package jalview.ws2.actions.annotation; import java.util.ArrayList; +import java.util.Arrays; import java.util.BitSet; import java.util.HashMap; import java.util.List; @@ -26,7 +27,7 @@ public class AnnotationJob extends BaseJob final int minSize; public AnnotationJob(List inputSeqs, boolean[] gapMap, - Map seqNames, int start, int end, int minSize) + Map seqNames, int start, int end, int minSize) { super(inputSeqs); this.gapMap = gapMap; @@ -47,10 +48,10 @@ public class AnnotationJob extends BaseJob } public static AnnotationJob create(SequenceCollectionI inputSeqs, - boolean bySequence, boolean submitGaps, boolean requireAligned, - boolean filterNonStandardResidues, int minSize) + boolean bySequence, boolean submitGaps, boolean requireAligned, + boolean filterNonStandardResidues, int minSize) { - List seqences = new ArrayList<>(); + List sequences = new ArrayList<>(); int minlen = 10; int width = 0; Map namesMap = bySequence ? new HashMap<>() : null; @@ -64,63 +65,53 @@ public class AnnotationJob extends BaseJob for (SequenceI sq : inputSeqs.getSequences()) { int sqLen = (bySequence) - ? sq.findPosition(end + 1) - sq.findPosition(start + 1) - : sq.getEnd() - sq.getStart(); + ? sq.findPosition(end + 1) - sq.findPosition(start + 1) + : sq.getEnd() - sq.getStart(); if (sqLen < minlen) continue; - String newName = SeqsetUtils.unique_name(seqences.size() + 1); + width = Math.max(width, sq.getLength()); + String newName = SeqsetUtils.unique_name(sequences.size() + 1); if (namesMap != null) namesMap.put(newName, sq); char[] seqChars = sq.getSequence(start, end + 1); if (filterNonStandardResidues) - replaceNonStandardResidues(seqChars, Comparison.GAP_DASH, sq.isProtein()); + replaceNonStandardResidues(seqChars, Comparison.GAP_DASH, + sq.isProtein()); Sequence seq; if (submitGaps) { seq = new Sequence(newName, seqChars); - updateResidueMap(residueMap, seq, filterNonStandardResidues); - } - else + updateResidueMap(residueMap, seq); + } else { // TODO: add ability to exclude hidden regions seq = new Sequence(newName, - AlignSeq.extractGaps(Comparison.GapChars, new String(seqChars))); + AlignSeq.extractGaps(Comparison.GapChars, new String(seqChars))); // for annotation need to also record map to sequence start/end // position in range // then transfer back to original sequence on return. } - seqences.add(seq); - width = Math.max(width, seq.getLength()); - } - - if (requireAligned && submitGaps) - { - for (int i = 0; i < seqences.size(); i++) - { - SequenceI sq = seqences.get(i); - char[] padded = fitSequenceToResidueMap(sq.getSequence(), - residueMap); - seqences.set(i, new Sequence(sq.getName(), padded)); - } + sequences.add(seq); } boolean[] gapMapArray = null; if (submitGaps) { + adjustColumns(sequences, residueMap, requireAligned); gapMapArray = new boolean[width]; for (int i = 0; i < width; i++) gapMapArray[i] = residueMap.get(i); } - return new AnnotationJob(seqences, gapMapArray, namesMap, start, end, - minSize); + return new AnnotationJob(sequences, gapMapArray, namesMap, start, end, + minSize); } - private static void replaceNonStandardResidues(char[] seq, char replacement, boolean isProtein) + static void replaceNonStandardResidues(char[] seq, char replacement, + boolean isProtein) { for (int i = 0; i < seq.length; i++) { char chr = seq[i]; - if (isProtein - ? ResidueProperties.aaIndex[chr] >= 20 + if (isProtein ? ResidueProperties.aaIndex[chr] >= 20 : ResidueProperties.nucleotideIndex[chr] >= 5) { seq[i] = replacement; @@ -128,39 +119,50 @@ public class AnnotationJob extends BaseJob } } - private static void updateResidueMap(BitSet residueMap, SequenceI seq, - boolean filterNonStandardResidues) + /** + * Add residue positions of the given sequence to the residues map. Perform an + * "or" operation between the given residue map and the inverse of the gap map + * of the given sequence. + * + * @param residueMap + * mapping to be updated in-place + * @param seq + * the sequence whose residue positions are added to the map + */ + static void updateResidueMap(BitSet residueMap, SequenceI seq) { - for (int pos : seq.gapMap()) - { - char sqchr = seq.getCharAt(pos); - boolean include = !filterNonStandardResidues; - include |= seq.isProtein() ? ResidueProperties.aaIndex[sqchr] < 20 - : ResidueProperties.nucleotideIndex[sqchr] < 5; - if (include) - residueMap.set(pos); - } + var gaps = seq.gapBitset(); + gaps.flip(0, seq.getLength()); + residueMap.or(gaps); } /** - * Fits the sequence to the residue map removing empty columns where residue - * map is unset and padding the sequence with gaps at the end if needed. + * Remove columns not included in the mask from the sequences in-place. If + * {@code padToLength} is set, the shorter sequences are padded with gaps at + * the end. + * + * @param sequences + * list of sequences to be modified + * @param mask + * mask of columns that will remain + * @param padToLength + * if gaps should be added to the end of shorter sequences */ - private static char[] fitSequenceToResidueMap(char[] sequence, - BitSet residueMap) + static void adjustColumns(List sequences, BitSet mask, + boolean padToLength) { - int width = residueMap.cardinality(); - char[] padded = new char[width]; - for (int op = 0, pp = 0; pp < width; op++) + int width = mask.cardinality(); + for (SequenceI seq : sequences) { - if (residueMap.get(op)) + char[] chars = SeqsetUtils.filterSequence(seq.getSequence(), mask); + if (padToLength && chars.length < width) { - if (sequence.length > op) - padded[pp++] = sequence[op]; - else - padded[pp++] = '-'; + int limit = chars.length; + chars = Arrays.copyOf(chars, width); + Arrays.fill(chars, limit, chars.length, Comparison.GAP_DASH); } + seq.setEnd(seq.getStart()); + seq.setSequence(chars); } - return padded; } } diff --git a/src/jalview/ws2/actions/annotation/AnnotationTask.java b/src/jalview/ws2/actions/annotation/AnnotationTask.java index 6cb5aa6..1d02264 100644 --- a/src/jalview/ws2/actions/annotation/AnnotationTask.java +++ b/src/jalview/ws2/actions/annotation/AnnotationTask.java @@ -162,6 +162,8 @@ public class AnnotationTask } } + // TODO: review and test + // may produce wrong output if annotations longer than gapMap private Annotation[] createGappedAnnotations(Annotation[] annotations, boolean[] gapMap) {