import jalview.datamodel.SequenceI;
import java.util.ArrayList;
+import java.util.BitSet;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Vector;
import static java.lang.String.format;
+import java.nio.CharBuffer;
+
public class SeqsetUtils
{
public static class SequenceInfo {
}
/**
+ * Filter the sequence through the mask leaving only characters at positions
+ * where the mask value was true. The length of the resulting array is
+ * the cardinality of the mask from 0 to sequence length.
+ *
+ * @param sequence
+ * input sequence
+ * @param mask
+ * mask used to filter the sequence characters
+ * @return input array filtered through the mask
+ */
+ public static char[] filterSequence(char[] sequence, BitSet mask)
+ {
+ mask = mask.get(0, sequence.length);
+ char[] result = new char[mask.cardinality()];
+ for (int i = mask.nextSetBit(0), j = 0; i >= 0;)
+ {
+ result[j++] = sequence[i];
+ if (i == Integer.MAX_VALUE)
+ // prevents integer overflow of (i + 1)
+ break;
+ i = mask.nextSetBit(i + 1);
+ }
+ return result;
+ }
+
+ /**
* Recover essential properties of a sequence from a hashtable TODO: replace
* these methods with something more elegant.
*
package jalview.ws2.actions.annotation;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.List;
final int minSize;
public AnnotationJob(List<SequenceI> inputSeqs, boolean[] gapMap,
- Map<String, SequenceI> seqNames, int start, int end, int minSize)
+ Map<String, SequenceI> seqNames, int start, int end, int minSize)
{
super(inputSeqs);
this.gapMap = gapMap;
}
public static AnnotationJob create(SequenceCollectionI inputSeqs,
- boolean bySequence, boolean submitGaps, boolean requireAligned,
- boolean filterNonStandardResidues, int minSize)
+ boolean bySequence, boolean submitGaps, boolean requireAligned,
+ boolean filterNonStandardResidues, int minSize)
{
- List<SequenceI> seqences = new ArrayList<>();
+ List<SequenceI> sequences = new ArrayList<>();
int minlen = 10;
int width = 0;
Map<String, SequenceI> namesMap = bySequence ? new HashMap<>() : null;
for (SequenceI sq : inputSeqs.getSequences())
{
int sqLen = (bySequence)
- ? sq.findPosition(end + 1) - sq.findPosition(start + 1)
- : sq.getEnd() - sq.getStart();
+ ? sq.findPosition(end + 1) - sq.findPosition(start + 1)
+ : sq.getEnd() - sq.getStart();
if (sqLen < minlen)
continue;
- String newName = SeqsetUtils.unique_name(seqences.size() + 1);
+ width = Math.max(width, sq.getLength());
+ String newName = SeqsetUtils.unique_name(sequences.size() + 1);
if (namesMap != null)
namesMap.put(newName, sq);
+ char[] seqChars = sq.getSequence(start, end + 1);
+ if (filterNonStandardResidues)
+ replaceNonStandardResidues(seqChars, Comparison.GAP_DASH,
+ sq.isProtein());
Sequence seq;
if (submitGaps)
{
- seq = new Sequence(newName, sq.getSequenceAsString());
- updateResidueMap(residueMap, seq, filterNonStandardResidues);
- }
- else
+ seq = new Sequence(newName, seqChars);
+ updateResidueMap(residueMap, seq);
+ } else
{
// TODO: add ability to exclude hidden regions
seq = new Sequence(newName,
- AlignSeq.extractGaps(Comparison.GapChars,
- sq.getSequenceAsString(start, end + 1)));
+ AlignSeq.extractGaps(Comparison.GapChars, new String(seqChars)));
// for annotation need to also record map to sequence start/end
// position in range
// then transfer back to original sequence on return.
}
- seqences.add(seq);
- width = Math.max(width, seq.getLength());
- }
-
- if (requireAligned && submitGaps)
- {
- for (int i = 0; i < seqences.size(); i++)
- {
- SequenceI sq = seqences.get(i);
- char[] padded = fitSequenceToResidueMap(sq.getSequence(),
- residueMap);
- seqences.set(i, new Sequence(sq.getName(), padded));
- }
+ sequences.add(seq);
}
boolean[] gapMapArray = null;
if (submitGaps)
{
+ adjustColumns(sequences, residueMap, requireAligned);
gapMapArray = new boolean[width];
for (int i = 0; i < width; i++)
gapMapArray[i] = residueMap.get(i);
}
- return new AnnotationJob(seqences, gapMapArray, namesMap, start, end,
- minSize);
+ return new AnnotationJob(sequences, gapMapArray, namesMap, start, end,
+ minSize);
}
- private static void updateResidueMap(BitSet residueMap, SequenceI seq,
- boolean filterNonStandardResidues)
+ static void replaceNonStandardResidues(char[] seq, char replacement,
+ boolean isProtein)
{
- for (int pos : seq.gapMap())
+ for (int i = 0; i < seq.length; i++)
{
- char sqchr = seq.getCharAt(pos);
- boolean include = !filterNonStandardResidues;
- include |= seq.isProtein() ? ResidueProperties.aaIndex[sqchr] < 20
- : ResidueProperties.nucleotideIndex[sqchr] < 5;
- if (include)
- residueMap.set(pos);
+ char chr = seq[i];
+ if (isProtein ? ResidueProperties.aaIndex[chr] >= 20
+ : ResidueProperties.nucleotideIndex[chr] >= 5)
+ {
+ seq[i] = replacement;
+ }
}
}
/**
- * Fits the sequence to the residue map removing empty columns where residue
- * map is unset and padding the sequence with gaps at the end if needed.
+ * Add residue positions of the given sequence to the residues map. Perform an
+ * "or" operation between the given residue map and the inverse of the gap map
+ * of the given sequence.
+ *
+ * @param residueMap
+ * mapping to be updated in-place
+ * @param seq
+ * the sequence whose residue positions are added to the map
+ */
+ static void updateResidueMap(BitSet residueMap, SequenceI seq)
+ {
+ var gaps = seq.gapBitset();
+ gaps.flip(0, seq.getLength());
+ residueMap.or(gaps);
+ }
+
+ /**
+ * Remove columns not included in the mask from the sequences in-place. If
+ * {@code padToLength} is set, the shorter sequences are padded with gaps at
+ * the end.
+ *
+ * @param sequences
+ * list of sequences to be modified
+ * @param mask
+ * mask of columns that will remain
+ * @param padToLength
+ * if gaps should be added to the end of shorter sequences
*/
- private static char[] fitSequenceToResidueMap(char[] sequence,
- BitSet residueMap)
+ static void adjustColumns(List<SequenceI> sequences, BitSet mask,
+ boolean padToLength)
{
- int width = residueMap.cardinality();
- char[] padded = new char[width];
- for (int op = 0, pp = 0; pp < width; op++)
+ int width = mask.cardinality();
+ for (SequenceI seq : sequences)
{
- if (residueMap.get(op))
+ char[] chars = SeqsetUtils.filterSequence(seq.getSequence(), mask);
+ if (padToLength && chars.length < width)
{
- if (sequence.length > op)
- padded[pp++] = sequence[op];
- else
- padded[pp++] = '-';
+ int limit = chars.length;
+ chars = Arrays.copyOf(chars, width);
+ Arrays.fill(chars, limit, chars.length, Comparison.GAP_DASH);
}
+ seq.setEnd(seq.getStart());
+ seq.setSequence(chars);
}
- return padded;
}
}
import jalview.datamodel.SequenceI;
import jalview.gui.JvOptionPane;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.equalTo;
+
+import java.util.BitSet;
import java.util.Hashtable;
import java.util.Map;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
+import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
Assert.assertSame(sqset[0].getSequenceFeatures().get(1),
sqset2[0].getSequenceFeatures().get(1));
}
+
+ @DataProvider
+ public Object[][] sequenceAndMask()
+ {
+ return new Object[][] {
+ { "AAAABBBBCCCCDDDD", 0xFFFFL, "AAAABBBBCCCCDDDD" },
+ { "AAAABBBBCCCCDDDD", 0x000FL, "AAAA" },
+ { "---A---B---C---D", 0x8888L, "ABCD" },
+ { "---A---B---C---D", 0x9999L, "-A-B-C-D" },
+ { "ABCDABCDABCDABCD", 0xC5A3L, "ABBDACCD" },
+ { "", 0xFFFFL, "" },
+ { "AAAABBBBCCCCDDDD", 0x0000L, "" },
+ { "AAABBBCCC", 0xFFFF, "AAABBBCCC" },
+ { "AAAABBBB", 0xD000L, "" },
+ { "AAAABBBB", 0xAA0AL, "AA" },
+ };
+ }
+
+ @Test(groups = {"Functional"}, dataProvider = "sequenceAndMask")
+ public void testFilterSequence(String sequence, long mask, String expected)
+ {
+ BitSet bitMask = BitSet.valueOf(new long[] {mask});
+ var result = SeqsetUtils.filterSequence(sequence.toCharArray(), bitMask);
+ assertThat(result, equalTo(expected.toCharArray()));
+ }
}
}
@Test(groups = { "Functional"} )
- public void testCreate_ContainsNonStandardAndNoFilterNonStandard_NonStandardToGap()
+ public void testCreate_ContainsNonStandardAndNoFilterNonStandard_NonStandardRemain()
{
var alignment = new Alignment(new Sequence[] {
new Sequence("test1", "ACACAOACACAC"),
});
var annotJob = AnnotationJob.create(alignment, true, true, true, false, 0);
assertThat(annotJob.getInputSequences(), contains(
+ matchesSequenceString("ACACAOACACAC"),
+ matchesSequenceString("ABAVAVAVABAV")
+ ));
+ }
+
+ @Test(groups = { "Functional"} )
+ public void testCreate_ContainsNonStandardAndFilterNonStandard_NonStandardToGap()
+ {
+ var alignment = new Alignment(new Sequence[] {
+ new Sequence("test1", "ACACAOACACAC"),
+ new Sequence("test2", "ABAVAVAVABAV")
+ });
+ var annotJob = AnnotationJob.create(alignment, true, true, true, true, 0);
+ assertThat(annotJob.getInputSequences(), contains(
matchesSequenceString("ACACA-ACACAC"),
matchesSequenceString("A-AVAVAVA-AV")
));