4 Copyright (c) 2018, Mungo Carstairs
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
10 * Redistributions of source code must retain the above copyright notice, this
11 list of conditions and the following disclaimer.
13 * Redistributions in binary form must reproduce the above copyright notice,
14 this list of conditions and the following disclaimer in the documentation
15 and/or other materials provided with the distribution.
17 * Neither the name of the copyright holder nor the names of its
18 contributors may be used to endorse or promote products derived from
19 this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 package intervalstore.nonc;
34 import java.util.AbstractCollection;
35 import java.util.ArrayList;
36 import java.util.Arrays;
37 import java.util.BitSet;
38 import java.util.Collections;
39 import java.util.Comparator;
40 import java.util.Iterator;
41 import java.util.List;
42 import java.util.NoSuchElementException;
44 import intervalstore.api.IntervalI;
45 import intervalstore.api.IntervalStoreI;
49 * A fourth idea, implementing NCList as a pointer system identical in operation
50 * to IntervalStoreJ's implementation using ArrayLists but here using just two
51 * int[] arrays and a single IntervalI[] array that is in the proper order for
52 * holding all nested and unnested arrays.
54 * Use of unnesting is optional and can be experimented with by changing the
55 * createUnnested flag to false.
57 * Preliminary testing suggests that this implementation is about 10% faster for
58 * store interval size 50, store sequence factor 10, query width -1000 (fixed
59 * 1000-unit-wide window), and query count 100000.
61 * Origional note (Mungo Carstairs, IntervalStoreJ)
63 * A Collection class to store interval-associated data, with options for "lazy"
64 * sorting so as to speed incremental construction of the data prior to issuing
67 * Accepts duplicate entries but not null values.
71 * @author Bob Hanson 2019.09.01
74 * any type providing <code>getBegin()</code> and
75 * <code>getEnd()</code>, primarily
77 public class IntervalStore<T extends IntervalI>
78 extends AbstractCollection<T> implements IntervalStoreI<T>
82 * Search for the last interval that ends at or just after the specified
83 * position. In the situation that there are multiple intervals starting at
84 * pos, this method returns the first of those.
87 * the nest-ordered array from createArrays()
89 * the position at the start of the interval of interest
91 * the starting point for the subarray search
93 * the ending point for the subarray search
94 * @return index into the nests array or one greater than end if not found
96 public static int binarySearchFirstEndNotBefore(IntervalI[] nests, long from,
99 int matched = end + 1;
103 mid = (start + end) >>> 1;
104 if (nests[mid].getEnd() >= from)
118 * My preference is for a bigendian comparison, but you may differ.
120 private Comparator<? super IntervalI> icompare;
123 * bigendian is what NCList does; change icompare to switch to that
125 private boolean bigendian;
127 private final boolean DO_PRESORT;
129 private boolean isSorted;
131 private boolean createUnnested = true;
133 private int minStart = Integer.MAX_VALUE, maxStart = Integer.MIN_VALUE,
134 maxEnd = Integer.MAX_VALUE;
136 private boolean isTainted;
138 private int capacity = 8;
140 protected IntervalI[] intervals = new IntervalI[capacity];
142 private int[] offsets;
144 protected int intervalCount;
150 private BitSet bsDeleted;
153 * the key array that lists the intervals in sub-interval order so that the
154 * binary search can be isolated to a single subinterval just by indicating
155 * start and end within one array
157 private IntervalI[] nests;
160 * pointers to the starting positions in nests[] for a subinterval; the first
161 * element is the "unnested" pointer when unnesting (2) or the root level nest
162 * pointer when not unnesting (1); the second element is root level nest when
163 * unnesting or the start of nest data when not unnesting; after that, nests
164 * are in contiguous sets of binary-searchable blocks
167 private int[] nestStarts;
170 * the count of intervals within a nest
173 private int[] nestCounts;
175 public IntervalStore()
180 public IntervalStore(boolean presort)
186 * Constructor given a list of intervals. Note that the list may get sorted as
187 * a side-effect of calling this constructor.
189 public IntervalStore(List<T> intervals)
191 this(intervals, true);
195 * Allows a presort option, which can speed up initial loading of individual
196 * features but will delay the first findOverlap if set to true.
201 public IntervalStore(List<T> intervals, boolean presort)
203 // setting default to BIG_ENDIAN, meaning
204 // the order will be [10,100] before [10,80]
205 // this order doesn't really matter much.
206 this(intervals, presort, null, true);
212 * intervals to initialize with (others may still be added)
214 * whether or not to presort the list as additions are made
216 * IntervalI.COMPARATOR_LITTLEENDIAN or
217 * IntervalI.COMPARATOR_BIGENDIAN, but this could also be one that
218 * sorts by description as well, for example.
220 * true if the comparator sorts [10-100] before [10-80]; defaults to
223 public IntervalStore(List<T> intervals, boolean presort,
224 Comparator<? super IntervalI> comparator, boolean bigendian)
226 icompare = (comparator != null ? comparator
227 : bigendian ? IntervalI.COMPARE_BEGIN_ASC_END_DESC
228 : IntervalI.COMPARE_BEGIN_ASC_END_ASC);
229 this.bigendian = bigendian;
231 if (intervals != null)
233 // So, five hours later, we learn that all my timing has been thrown off
234 // because I used Array.sort, which if you look in the Java JDK is exactly
235 // what Collections.sort is, but for whatever reason, all my times were
236 // high by about 100-200 ms 100% reproducibly. Just one call to Array.sort
237 // prior to the nanotimer start messed it all up. Some sort of memory or
238 // garbage issue; I do not know. But using Collections.sort here fixes the
241 Collections.sort(intervals, icompare);
243 this.intervals = new IntervalI[capacity = intervalCount = intervals
246 DO_PRESORT = presort;
247 if (DO_PRESORT && intervalCount > 1)
256 isSorted = DO_PRESORT;
262 * Adds one interval to the store, allowing duplicates
267 public boolean add(T interval)
269 return add(interval, true);
273 * Adds one interval to the store, optionally checking for duplicates.
275 * This fast-adding algorithm uses a double-length int[] (offsets) to hold
276 * pointers into intervals[] that allows continual sorting of an expanding
277 * array buffer. When the time comes, this is cleaned up and packed back into
278 * a standard array, but in the mean time, it can be added to with no loss of
282 * @param allowDuplicates
285 public boolean add(T interval, boolean allowDuplicates)
287 if (interval == null)
302 synchronized (intervals)
304 int index = intervalCount;
305 int start = interval.getBegin();
307 if (intervalCount + added + 1 >= capacity)
309 intervals = finalizeAddition(
310 new IntervalI[capacity = capacity << 1]);
314 if (DO_PRESORT && isSorted)
316 if (intervalCount == 0)
322 index = findInterval(interval);
323 if (!allowDuplicates && index >= 0)
340 if (!allowDuplicates && findInterval(interval) >= 0)
347 if (index == intervalCount)
349 intervals[intervalCount++] = interval;
350 // System.out.println("added " + intervalCount + " " + interval);
354 int pt = capacity - ++added;
355 intervals[pt] = interval;
356 // System.out.println("stashed " + pt + " " + interval + " for "
357 // + index + " " + intervals[index]);
360 offsets = new int[capacity];
363 offsets[pt] = offsets[index];
368 minStart = Math.min(minStart, start);
369 maxStart = Math.max(maxStart, start);
375 * Clean up the intervals array into a simple ordered array.
380 private IntervalI[] finalizeAddition(IntervalI[] dest)
388 if (intervalCount > 0 && dest != intervals)
390 System.arraycopy(intervals, 0, dest, 0, intervalCount);
392 capacity = dest.length;
395 // System.out.println("finalizing " + intervalCount + " " + added);
397 // array is [(intervalCount)...null...(added)]
399 int ntotal = intervalCount + added;
400 for (int ptShift = ntotal, pt = intervalCount; pt >= 0;)
403 while (--pt >= 0 && offsets[pt] == 0)
412 // shift upper intervals right
416 System.arraycopy(intervals, pt, dest, ptShift, nOK);
422 for (int offset = offsets[pt]; offset > 0; offset = offsets[offset])
424 dest[--ptShift] = intervals[offset];
429 intervalCount = ntotal;
430 capacity = dest.length;
431 // System.out.println(Arrays.toString(dest));
436 * A binary search for a duplicate.
441 public int binaryIdentitySearch(IntervalI interval)
443 return binaryIdentitySearch(interval, null);
447 * for remove() and contains()
453 * @return index or, if not found, -1 - "would be here"
455 public int binaryIdentitySearch(IntervalI interval, BitSet bsIgnore)
458 int r0 = interval.getBegin();
459 int r1 = interval.getEnd();
460 int end = intervalCount - 1;
461 if (end < 0 || r0 < minStart)
467 return -1 - intervalCount;
471 int mid = (start + end) >>> 1;
472 IntervalI r = intervals[mid];
473 switch (compareRange(r, r0, r1))
482 IntervalI iv = intervals[mid];
483 if ((bsIgnore == null || !bsIgnore.get(mid))
484 && sameInterval(interval, iv))
487 // found one; just scan up and down now, first checking the range, but
488 // also checking other possible aspects of equivalence.
491 for (int i = mid; ++i <= end;)
493 if ((iv = intervals[i]).getBegin() != r0 || iv.getEnd() != r1)
497 if ((bsIgnore == null || !bsIgnore.get(i))
498 && sameInterval(interval, iv))
503 for (int i = mid; --i >= start;)
505 if ((iv = intervals[i]).getBegin() != r0
506 || (bigendian ? r1 < iv.getEnd() : iv.getEnd() < r1))
510 if ((bsIgnore == null || !bsIgnore.get(i))
511 && sameInterval(interval, iv))
523 * Answers true if the two intervals are equal (as determined by
524 * {@code i1.equals(i2)}, else false
530 static boolean sameInterval(IntervalI i1, IntervalI i2)
533 * for speed, do the fast check for begin/end equality before
534 * the equals check which includes type checking
536 return i1.equalsInterval(i2) && i1.equals(i2);
546 intervalCount = added = 0;
550 intervals = new IntervalI[8];
551 nestStarts = nestCounts = null;
553 minStart = maxEnd = Integer.MAX_VALUE;
554 maxStart = Integer.MIN_VALUE;
558 * Compare an interval t to a from/to range for insertion purposes
563 * @return 0 if same, 1 if start is after from, or start equals from and
564 * [bigendian: end is before to | littleendian: end is after to], else
567 private int compareRange(IntervalI t, long from, long to)
569 int order = Long.signum(t.getBegin() - from);
571 ? Long.signum(bigendian ? to - t.getEnd() : t.getEnd() - to)
576 public boolean contains(Object entry)
578 if (entry == null || intervalCount == 0 && added == 0 && deleted == 0)
582 if (!isSorted || deleted > 0)
586 int n = findInterval((IntervalI) entry);
591 * Check to see if a given interval is within another.
599 public boolean containsInterval(IntervalI outer, IntervalI inner)
601 return false; // not applicable
605 * Ensure that all addition, deletion, and sorting has been done, and that the
606 * nesting arrays have been created so that we are ready for findOverlaps().
610 private void ensureFinalized()
614 if (!isSorted || added > 0 || deleted > 0)
618 if (intervalCount > 0)
627 * Find all overlaps within the given range, inclusively.
629 * @return a list sorted in ascending order of start position
633 public List<T> findOverlaps(long from, long to)
635 return findOverlaps(from, to, null);
639 * Find all overlaps within the given range, inclusively.
641 * @return a list sorted in the order provided by the features list comparator
645 @SuppressWarnings("unchecked")
647 public List<T> findOverlaps(long from, long to, List<T> result)
651 result = new ArrayList<>();
653 switch (intervalCount + added)
658 IntervalI sf = intervals[0];
659 if (sf.getBegin() <= to && sf.getEnd() >= from)
668 if (from > maxEnd || to < minStart)
675 if (nestCounts[0] > 0)
677 searchNonnested(nestCounts[0], nests, from, to,
678 (List<IntervalI>) result);
682 if (nestCounts[root] > 0)
684 search(nests, from, to, root, result);
690 * A simpler search, since we know we don't have any subintervals. Not
691 * necessary, actually.
700 private static void searchNonnested(int n,
701 IntervalI[] nests, long from, long to, List<IntervalI> result)
704 for (int pt = binarySearchFirstEndNotBefore(nests, from, 2,
705 end); pt <= end; pt++)
707 IntervalI ival = nests[pt];
708 if (ival.getBegin() > to)
717 * The main search of the nests[] array's subarrays
725 @SuppressWarnings("unchecked")
726 private void search(IntervalI[] nests, long from, long to, int nest,
729 int start = nestStarts[nest];
730 int n = nestCounts[nest];
731 int end = start + n - 1;
732 IntervalI first = nests[start];
733 IntervalI last = nests[end];
735 // quick tests for common cases:
737 if (last.getEnd() < from || first.getBegin() > to)
745 // just one interval and hasn't failed begin/end test
749 // just two and didn't fail begin/end test
750 // so there is only one option: either the first or the second is our
752 pt = (first.getEnd() >= from ? start : end);
755 // do the binary search
756 pt = binarySearchFirstEndNotBefore(nests, from, start, end);
759 for (; pt <= end; pt++)
761 IntervalI ival = nests[pt];
762 // check for out of range
763 if (ival.getBegin() > to)
767 result.add((T) ival);
768 if (nestCounts[pt] > 0)
770 // check subintervals in this nest
771 search(nests, from, to, pt, result);
777 * Return the deepest level of nesting.
781 public int getDepth()
784 BitSet bsTested = new BitSet();
785 return Math.max((createUnnested ? getDepth(1, bsTested) : 0),
786 getDepth(0, bsTested));
790 * Iteratively dive deeply.
796 private int getDepth(int pt, BitSet bsTested)
800 int n = nestCounts[pt];
801 if (n == 0 || bsTested.get(pt))
806 for (int st = nestStarts[pt], i = st + n; --i >= st;)
808 if ((depth = getDepth(i, bsTested)) > maxDepth)
817 * Answers an iterator over the intervals in the store, with no particular
818 * ordering guaranteed. The iterator does not support the optional
819 * <code>remove</code> operation (throws
820 * <code>UnsupportedOperationException</code> if attempted).
823 public Iterator<T> iterator()
826 return new Iterator<T>()
832 public boolean hasNext()
834 return next < intervalCount;
837 @SuppressWarnings("unchecked")
841 if (next >= intervalCount)
843 throw new NoSuchElementException();
845 return (T) intervals[next++];
852 * Indented printing of the intervals.
856 public String prettyPrint()
859 StringBuffer sb = new StringBuffer();
862 sb.append("unnested:");
864 sb.append("\nnested:");
871 return sb.toString();
875 * Iterative nest dump.
881 private void dump(int nest, StringBuffer sb, String sep)
883 int pt = nestStarts[nest];
884 int n = nestCounts[nest];
887 for (int i = 0; i < n; i++)
889 sb.append(sep).append(nests[pt + i].toString());
890 if (nestCounts[pt + i] > 0)
892 dump(pt + i, sb, sep + " ");
898 public synchronized boolean remove(Object o)
900 return (o != null && intervalCount > 0
901 && removeInterval((IntervalI) o));
905 * Find the interval or return where it should go, possibly into the add
909 * @return index (nonnegative) or index where it would go (negative)
912 private int findInterval(IntervalI interval)
917 int pt = binaryIdentitySearch(interval, null);
918 // if (addPt == intervalCount || offsets[pt] == 0)
920 if (pt >= 0 || added == 0 || pt == -1 - intervalCount)
925 int start = interval.getBegin();
926 int end = interval.getEnd();
930 while ((pt = offsets[pt]) != 0)
932 IntervalI iv = intervals[pt];
933 switch (compareRange(iv, start, end))
938 if (sameInterval(interval, iv))
952 int i = intervalCount;
953 while (--i >= 0 && !sameInterval(intervals[i], interval))
962 * Uses a binary search to find the entry and removes it if found.
967 protected boolean removeInterval(IntervalI interval)
970 if (!isSorted || added > 0)
974 int i = binaryIdentitySearch(interval, bsDeleted);
981 if (bsDeleted == null)
983 bsDeleted = new BitSet(intervalCount);
992 return (isTainted = true);
996 * Fill in the gaps of the intervals array after one or more deletions.
999 private void finalizeDeletion()
1006 // ......xxx.....xxxx.....xxxxx....
1010 for (int i = bsDeleted.nextSetBit(0), pt = i; i >= 0;)
1012 i = bsDeleted.nextClearBit(i + 1);
1013 int pt1 = bsDeleted.nextSetBit(i + 1);
1016 pt1 = intervalCount;
1019 System.arraycopy(intervals, i, intervals, pt, n);
1021 if (pt1 == intervalCount)
1023 for (i = pt1; --i >= pt;)
1025 intervals[i] = null;
1027 intervalCount -= deleted;
1038 * Recreate the key nest arrays.
1041 public boolean revalidate()
1050 * Return the total number of intervals in the store.
1056 return intervalCount + added - deleted;
1060 * AbstractCollection override to ensure that we have finalized the store.
1063 public Object[] toArray()
1066 return super.toArray();
1070 * Sort intervals by start.
1076 intervals = finalizeAddition(new IntervalI[intervalCount + added]);
1078 else if (deleted > 0)
1084 // SOMETHING HAPPENS WHEN Arrays.sort is run that
1085 // adds 100 ms to a 150 ms run time.
1086 // I don't know why.
1087 Arrays.sort(intervals, 0, intervalCount, icompare);
1089 updateMinMaxStart();
1108 // cont [-1, -1, -1, -1, 3, 4, 4, 4, 7, 7, 7, 10, -1, 12]
1109 // nests [0, 0, 1, 2, 3, 12, 4, 5, 6, 7, 8, 9, 10, 11, 13]
1110 // starts [1, 0, 0, 0, 6, 14, 7, 0, 0, 10, 0, 0, 13, 0, 0]
1111 // counts [5, 0, 0, 0, 1, 1, 3, 0, 0, 3, 0, 0, 1, 0, 0]
1114 * Create the key arrays: nests, nestStarts, and nestCounts. The starting
1115 * point is getting the container array, which may hold -1 (top level nesting)
1116 * and -2 (unnested set, if doing that).
1118 * This is a pretty complicated method; it was way simpler before I decided to
1119 * support nesting as an option.
1122 private void createArrays()
1126 * When unnesting, we need a second top-level listing.
1129 int incr = (createUnnested ? 2 : 1);
1132 * The three key arrays produced by this method:
1135 nests = new IntervalI[intervalCount + incr];
1136 nestStarts = new int[intervalCount + incr];
1137 nestCounts = new int[intervalCount + incr];
1140 * a temporary array used in Phase Two.
1143 int[] counts = new int[intervalCount + incr];
1146 * the objective of Phase One
1148 int[] myContainer = new int[intervalCount];
1150 myContainer[0] = -incr;
1152 int beginLast = intervals[0].getBegin();
1153 int endLast = intervals[0].getEnd();
1154 int ptLastNot2 = -1;
1155 int endLast2 = endLast;
1156 int beginLast2 = beginLast;
1158 // Phase One: Get the temporary container array myContainer.
1159 for (int i = 1; i < intervalCount; i++)
1162 int end = intervals[i].getEnd();
1163 int begin = intervals[i].getBegin();
1165 // set the pointer to the element that is containing
1166 // this interval, or -2 (unnested) or -1 (root-level nest)
1168 myContainer[i] = -incr;
1170 // OK, now figure it all out...
1175 // Using a method isNested(...) here, because there are different
1176 // ways of defining "nested" when start or end are the
1177 // same. The definition used here would not be my first choice,
1178 // but it matches results for IntervalStoreJ
1179 // perfectly, down to the exact number of times that the
1180 // binary search runs through its start/mid/end loops in findOverlap.
1182 // beginLast2 and endLast2 refer to the root-level or unnested level
1184 if (!isNested(begin, end, beginLast2, endLast2))
1190 // this is tricky; making sure we properly get the
1191 // nests that are to be removed from the top-level
1192 // unnested list assigned a container -1, while all
1193 // top-level nests get -2.
1196 isNested = (pt < 0 || isNested(begin, end,
1197 intervals[pt].getBegin(), intervals[pt].getEnd()));
1200 myContainer[i] = -1;
1206 isNested = isNested(begin, end, beginLast, endLast);
1209 // ...almost done...
1213 myContainer[i] = pt;
1218 // monotonic -- find the parent that is doing the nesting
1220 while ((pt = myContainer[pt]) >= 0)
1222 if (isNested(begin, end, intervals[pt].getBegin(),
1223 intervals[pt].getEnd()))
1225 myContainer[i] = pt;
1226 // fully contained by a previous interval
1227 // System.out.println("mycontainer " + i + " = " + pt);
1233 // update the counts and pointers
1235 counts[myContainer[i] + incr]++;
1236 if (myContainer[i] == -2)
1249 // Phase Two: construct the nests[] array and its associated
1250 // starting pointer array and nest element counts. These counts
1251 // are actually produced above, but we reconstruct it as a set
1252 // of dynamic pointers during construction.
1254 // incr is either 1 (no separate unnested set) or 2 (include unnested)
1256 int nextStart = counts[0] + incr;
1258 * this array tracks the pointer within nestStarts to the nest block start
1261 int[] startPt = new int[intervalCount + incr];
1262 nestStarts[0] = incr;
1264 // When not unnesting, nestStarts[0] = 1, and the length
1265 // will start out here as 0 but increment as we go.
1266 // We do this even though we know its size already, because that
1267 // value serves as a dynamic pointer as well.
1272 // Unnesting requires two separate lists with proper pointers and counts.
1273 // The first, nestStarts[0] = 0, is for the unnested set (container -2);
1274 // the second (container -1, nestStarts[1]) is for the nest root.
1277 nestStarts[1] = nextStart;
1278 nextStart += counts[1];
1281 // Now get all the pointers right and set the nests[] pointer into intervals
1284 for (int i = 0; i < intervalCount; i++)
1286 int n = counts[i + incr];
1287 int ptNest = startPt[myContainer[i] + incr];
1288 int p = nestStarts[ptNest] + nestCounts[ptNest]++;
1289 nests[p] = intervals[i];
1292 startPt[i + incr] = p;
1293 nestStarts[p] = nextStart;
1298 // System.out.println("intervals " + Arrays.toString(intervals));
1299 // System.out.println("nests " + Arrays.toString(nests));
1300 // System.out.println("conts " + Arrays.toString(myContainer));
1301 // System.out.println("starts " + Arrays.toString(nestStarts));
1302 // System.out.println("counts " + Arrays.toString(nestCounts));
1303 // System.out.println("done " + nestCounts[0]);
1307 * Child-Parent relationships to match IntervalStoreJ. Perhaps a bit arcane?
1308 * Objective is to minimize the depth when we can.
1312 * @param parentStart
1316 private static boolean isNested(int childStart, int childEnd,
1317 int parentStart, int parentEnd)
1319 return (parentStart <= childStart && parentEnd > childEnd
1320 || parentStart < childStart && parentEnd == childEnd);
1324 * Just a couple of pointers to help speed findOverlaps along a bit.
1327 private void updateMinMaxStart()
1329 if (intervalCount > 0)
1331 minStart = intervals[0].getBegin();
1332 maxStart = intervals[intervalCount - 1].getBegin();
1336 minStart = Integer.MAX_VALUE;
1337 maxStart = Integer.MIN_VALUE;
1342 public String toString()
1344 return prettyPrint();