4 Copyright (c) 2018, Mungo Carstairs
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
10 * Redistributions of source code must retain the above copyright notice, this
11 list of conditions and the following disclaimer.
13 * Redistributions in binary form must reproduce the above copyright notice,
14 this list of conditions and the following disclaimer in the documentation
15 and/or other materials provided with the distribution.
17 * Neither the name of the copyright holder nor the names of its
18 contributors may be used to endorse or promote products derived from
19 this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 package intervalstore.impl;
34 import java.util.AbstractCollection;
35 import java.util.ArrayList;
36 import java.util.Collections;
37 import java.util.Iterator;
38 import java.util.List;
39 import java.util.NoSuchElementException;
41 import intervalstore.api.IntervalI;
44 * An adapted implementation of NCList as described in the paper
47 * Nested Containment List (NCList): a new algorithm for accelerating
48 * interval query of genome alignment and interval databases
49 * - Alexander V. Alekseyenko, Christopher J. Lee
50 * https://doi.org/10.1093/bioinformatics/btl647
53 public class NCList<T extends IntervalI> extends AbstractCollection<T>
56 // private static final boolean OPTION_FIND_ANY = false;
59 * A depth-first iterator over the elements stored in the NCList
61 private class NCListIterator implements Iterator<T>
65 Iterator<T> nodeIterator;
68 * Constructor bootstraps a pointer to an iterator over the first subrange
73 subrangeIndex = nextSubrange(-1);
77 * Moves the subrange iterator to the next subrange, and answers its index
78 * in the list of subranges. If there are no more, sets the iterator to null
83 private int nextSubrange(int after)
85 int nextIndex = after + 1;
86 if (nextIndex < subranges.size())
88 nodeIterator = subranges.get(nextIndex).iterator();
96 public boolean hasNext()
98 return nodeIterator != null && nodeIterator.hasNext();
102 * Answers the next element returned by the current NCNode's iterator, and
103 * advances the iterator (to the next NCNode if necessary)
108 if (nodeIterator == null || !nodeIterator.hasNext())
110 throw new NoSuchElementException();
112 T result = nodeIterator.next();
114 if (!nodeIterator.hasNext())
116 subrangeIndex = nextSubrange(subrangeIndex);
124 * the number of interval instances represented
129 * a list, in start position order, of sublists of ranges ordered so
130 * that each contains (or is the same as) the one that follows it
132 private List<NCNode<T>> subranges;
135 * Constructor given a list of things that are each located on a contiguous
136 * interval. Note that the constructor may reorder the list.
138 * We assume here that for each range, start <= end. Behaviour for reverse
139 * ordered ranges is undefined.
143 public NCList(List<T> ranges)
150 * Sorts and groups ranges into sublists where each sublist represents an
151 * interval and its contained subintervals
155 protected void build(List<T> ranges)
158 * sort and partition into subranges
159 * which have no mutual containment
161 List<IntervalI> sublists = partitionNestedSublists(ranges);
164 * convert each subrange to an NCNode consisting of a range and
165 * (possibly) its contained NCList
167 for (IntervalI sublist : sublists)
169 subranges.add(new NCNode<>(
170 ranges.subList(sublist.getBegin(), sublist.getEnd() + 1)));
173 size = ranges.size();
177 * Default constructor
181 subranges = new ArrayList<>();
185 * Sorts and traverses the ranges to identify sublists, whose start intervals
186 * are overlapping or disjoint but not mutually contained. Answers a list of
187 * start-end indices of the sorted list of ranges.
192 protected List<IntervalI> partitionNestedSublists(List<T> ranges)
194 List<IntervalI> sublists = new ArrayList<>();
196 if (ranges.isEmpty())
202 * sort by start ascending, length descending, so that
203 * contained intervals follow their containing interval
205 Collections.sort(ranges, IntervalI.COMPARATOR_BIGENDIAN);
207 int listStartIndex = 0;
209 IntervalI lastParent = ranges.get(0);
210 boolean first = true;
212 for (int i = 0; i < ranges.size(); i++)
214 IntervalI nextInterval = ranges.get(i);
215 if (!first && !lastParent.properlyContainsInterval(nextInterval))
218 * this interval is not properly contained in the parent;
219 * close off the last sublist
221 sublists.add(new Range(listStartIndex, i - 1));
223 lastParent = nextInterval;
228 sublists.add(new Range(listStartIndex, ranges.size() - 1));
233 * Adds one entry to the stored set
238 public synchronized boolean add(final T entry)
240 final NCNode<T> newNode = new NCNode<>(entry);
246 * Adds one NCNode to this NCList
248 * This method does not update the <code>size</code> (interval count) of this
249 * NCList, as it may be used to rearrange nodes without changing their count.
250 * Callers should increment the count if needed.
254 protected void addNode(final NCNode<T> newNode)
256 final long start = newNode.getBegin();
257 final long end = newNode.getEnd();
258 size += newNode.size();
262 * 1) precedes all subranges - add as NCNode on front of list
263 * 2) follows all subranges - add as NCNode on end of list
264 * 3) matches a subrange - add as a sibling in the list
265 * 4) properly enclosed by a subrange - add recursively to subrange
266 * 5) properly encloses one or more subranges - push them inside it
267 * 6) spans two subranges - insert between them
271 * find the first subrange whose end does not precede entry's start
273 int candidateIndex = findFirstOverlap(start);
276 * search for maximal span of subranges i-k that the new entry
277 * encloses; or a subrange that encloses the new entry
279 boolean enclosing = false;
280 int firstEnclosed = 0;
281 int lastEnclosed = 0;
283 for (int j = candidateIndex; j < subranges.size(); j++)
285 NCNode<T> subrange = subranges.get(j);
287 if (subrange.equalsInterval(newNode))
290 * matching interval - insert adjacent
292 subranges.add(j, newNode);
296 if (end < subrange.getBegin() && !enclosing)
299 * new entry lies between subranges j-1 j
301 subranges.add(j, newNode);
305 if (subrange.properlyContainsInterval(newNode))
308 * push new entry inside this subrange as it encloses it
310 subrange.addNode(newNode);
314 if (start <= subrange.getBegin())
316 if (end >= subrange.getEnd())
319 * new entry encloses this subrange (and possibly preceding ones);
320 * continue to find the maximal list it encloses
333 * entry spans from before this subrange to inside it
338 * entry encloses one or more preceding subranges
340 push(newNode, firstEnclosed, lastEnclosed);
345 * entry overlaps two subranges but doesn't enclose either
348 subranges.add(j, newNode);
356 * drops through to here if new range encloses all others
357 * or overlaps the last one
361 push(newNode, firstEnclosed, lastEnclosed);
365 subranges.add(newNode);
370 public boolean contains(Object entry)
372 if (!(entry instanceof IntervalI))
376 IntervalI interval = (IntervalI) entry;
379 * find the first sublist that might overlap, i.e.
380 * the first whose end position is >= from
382 int candidateIndex = findFirstOverlap(interval.getBegin());
384 int to = interval.getEnd();
386 for (int i = candidateIndex; i < subranges.size(); i++)
388 NCNode<T> candidate = subranges.get(i);
389 if (candidate.getBegin() > to)
392 * we are past the end of our target range
396 if (candidate.contains(interval))
405 * Update the tree so that the new node encloses current subranges i to j
406 * (inclusive). That is, replace subranges i-j (inclusive) with a new subrange
407 * that contains them.
412 * @throws IllegalArgumentException
413 * if any of the subranges is not contained by the node's start-end
416 protected synchronized void push(NCNode<T> node, final int i,
419 for (int k = i; k <= j; k++)
421 NCNode<T> n = subranges.get(k);
422 if (!node.containsInterval(n)) {
423 throw new IllegalArgumentException("Can't push " + n.toString()
424 + " inside " + node.toString());
429 for (int k = j; k >= i; k--)
433 subranges.add(i, node);
437 * Answers a list of contained intervals that overlap the given range
443 public List<T> findOverlaps(long from, long to)
445 List<T> result = new ArrayList<>();
447 findOverlaps(from, to, result);
453 * Recursively searches the NCList adding any items that overlap the from-to
454 * range to the result list
460 protected List<T> findOverlaps(long from, long to, List<T> result)
463 // if (OPTION_FIND_ANY)
465 // return findAnyOverlaps(from, to, result);
468 * find the first sublist that might overlap, i.e.
469 * the first whose end position is >= from
471 int candidateIndex = findFirstOverlap(from);
473 for (int i = candidateIndex; i < subranges.size(); i++)
475 NCNode<T> candidate = subranges.get(i);
476 if (candidate.getBegin() > to)
479 * we are past the end of our target range
483 candidate.findOverlaps(from, to, result);
489 // * Recursively searches the NCList adding any items that overlap the from-to
490 // * range to the result list
496 // protected List<T> findAnyOverlaps(long from, long to, List<T> result)
499 // // BH find ANY overlap
501 // int candidateIndex = findAnyOverlap(subranges, from, to);
503 // if (candidateIndex < 0)
505 // for (int i = candidateIndex, n = subranges.size(); i < n; i++)
507 // NCNode<T> candidate = subranges.get(i);
508 // if (candidate.getBegin() > to)
511 // * we are past the end of our target range
515 // candidate.findOverlaps(from, to, result);
518 // // BH adds dual-direction check
520 // for (int i = candidateIndex; --i >= 0;)
522 // NCNode<T> candidate = subranges.get(i);
523 // if (candidate.getEnd() < from)
527 // candidate.findOverlaps(from, to, result);
532 // private int findAnyOverlap(List<NCNode<T>> ranges, long from, long to)
535 // int end = ranges.size() - 1;
536 // while (start <= end)
538 // int mid = (start + end) >>> 1;
539 // NCNode<T> r = ranges.get(mid);
540 // if (r.getEnd() >= from)
542 // if (r.getBegin() <= to)
555 * Search subranges for the first one whose end position is not before the
556 * target range's start position, i.e. the first one that may overlap the
557 * target range. Returns the index in the list of the first such range found,
558 * or the length of the list if none found.
563 protected int findFirstOverlap(final long from)
565 return BinarySearcher.findFirst(subranges, (int) from,
566 BinarySearcher.fend);
570 * Formats the tree as a bracketed list e.g.
573 * [1-100 [10-30 [10-20]], 15-30 [20-20]]
577 public String toString()
579 return subranges.toString();
583 * Answers the NCList as an indented list
587 public String prettyPrint()
589 StringBuilder sb = new StringBuilder(512);
592 prettyPrint(sb, offset, indent);
593 sb.append('\n');// System.lineSeparator());
594 return sb.toString();
602 void prettyPrint(StringBuilder sb, int offset, int indent)
604 boolean first = true;
605 for (NCNode<T> subrange : subranges)
609 sb.append('\n');// System.lineSeparator());
612 subrange.prettyPrint(sb, offset, indent);
617 * Answers true if the store's structure is valid (nesting containment rules
618 * are obeyed), else false. For use in testing and debugging.
622 public boolean isValid()
625 for (NCNode<T> subrange : subranges)
627 count += subrange.size();
633 return isValid(Integer.MIN_VALUE, Integer.MAX_VALUE);
637 * Answers true if the data held satisfy the rules of construction of an
638 * NCList bounded within the given start-end range, else false.
640 * Each subrange must lie within start-end (inclusive). Subranges must be
641 * ordered by start position ascending, and within that by end position
649 boolean isValid(final int start, final int end)
651 NCNode<T> lastRange = null;
653 for (NCNode<T> subrange : subranges)
655 if (subrange.getBegin() < start)
657 System.err.println("error in NCList: range " + subrange.toString()
658 + " starts before " + end);
661 if (subrange.getEnd() > end)
663 System.err.println("error in NCList: range " + subrange.toString()
664 + " ends after " + end);
668 if (lastRange != null)
670 if (subrange.getBegin() < lastRange.getBegin())
672 System.err.println("error in NCList: range " + subrange.toString()
673 + " starts before " + lastRange.toString());
676 if (subrange.properlyContainsInterval(lastRange))
678 System.err.println("error in NCList: range " + subrange.toString()
679 + " encloses preceding: " + lastRange.toString());
682 if (lastRange.properlyContainsInterval(subrange))
684 System.err.println("error in NCList: range " + subrange.toString()
685 + " enclosed by preceding: " + lastRange.toString());
689 lastRange = subrange;
691 if (!subrange.isValid())
700 * Answers the number of intervals stored
711 * Answers a list of all entries stored, in no guaranteed order. This method
712 * is not synchronized, so is not thread-safe.
714 public List<T> getEntries()
716 List<T> result = new ArrayList<>();
722 * Adds all contained entries to the given list
726 void getEntries(List<T> result)
728 for (NCNode<T> subrange : subranges)
730 subrange.getEntries(result);
735 * Removes the first interval <code>I</code>found that is equal to T
736 * (<code>I.equals(T)</code>). Answers true if an interval is removed, false
737 * if no match is found. This method is synchronized so thread-safe.
742 public synchronized boolean remove(T entry)
748 int i = findFirstOverlap(entry.getBegin());
750 for (; i < subranges.size(); i++)
752 NCNode<T> subrange = subranges.get(i);
753 if (subrange.getBegin() > entry.getBegin())
760 NCList<T> subRegions = subrange.getSubRegions();
762 if (subrange.getRegion().equals(entry))
765 * if the subrange is rooted on this entry, remove it,
766 * and remove and promote its subregions (if any)
769 size -= subrange.size();
770 if (subRegions != null)
772 for (NCNode<T> r : subRegions.subranges)
781 if (subrange.remove(entry))
792 * Answers the depth of interval nesting of this object, where 1 means there
793 * are no nested sub-intervals
797 public int getDepth()
800 for (NCNode<T> subrange : subranges)
802 subDepth = Math.max(subDepth, subrange.getDepth());
809 * Answers an iterator over the contained intervals, with no particular order
810 * guaranteed. The iterator does not support the optional <code>remove</code>
811 * operation (throws <code>UnsupportedOperationException</code> if attempted).
814 public Iterator<T> iterator()
816 return new NCListIterator();
820 public synchronized void clear()
826 public int getWidth()
828 return subranges.size();