4 Copyright (c) 2018, Mungo Carstairs
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
10 * Redistributions of source code must retain the above copyright notice, this
11 list of conditions and the following disclaimer.
13 * Redistributions in binary form must reproduce the above copyright notice,
14 this list of conditions and the following disclaimer in the documentation
15 and/or other materials provided with the distribution.
17 * Neither the name of the copyright holder nor the names of its
18 contributors may be used to endorse or promote products derived from
19 this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 package intervalstore.impl;
34 import java.util.AbstractCollection;
35 import java.util.ArrayList;
36 import java.util.Collections;
37 import java.util.Iterator;
38 import java.util.List;
39 import java.util.NoSuchElementException;
41 import intervalstore.api.IntervalI;
44 * An adapted implementation of NCList as described in the paper
47 * Nested Containment List (NCList): a new algorithm for accelerating
48 * interval query of genome alignment and interval databases
49 * - Alexander V. Alekseyenko, Christopher J. Lee
50 * https://doi.org/10.1093/bioinformatics/btl647
53 public class NCList<T extends IntervalI> extends AbstractCollection<T>
56 * A depth-first iterator over the elements stored in the NCList
58 private class NCListIterator implements Iterator<T>
62 Iterator<T> nodeIterator;
65 * Constructor bootstraps a pointer to an iterator over the first subrange
70 subrangeIndex = nextSubrange(-1);
74 * Moves the subrange iterator to the next subrange, and answers its index
75 * in the list of subranges. If there are no more, sets the iterator to null
80 private int nextSubrange(int after)
82 int nextIndex = after + 1;
83 if (nextIndex < subranges.size())
85 nodeIterator = subranges.get(nextIndex).iterator();
93 public boolean hasNext()
95 return nodeIterator != null && nodeIterator.hasNext();
99 * Answers the next element returned by the current NCNode's iterator, and
100 * advances the iterator (to the next NCNode if necessary)
105 if (nodeIterator == null || !nodeIterator.hasNext())
107 throw new NoSuchElementException();
109 T result = nodeIterator.next();
111 if (!nodeIterator.hasNext())
113 subrangeIndex = nextSubrange(subrangeIndex);
121 * the number of interval instances represented
126 * a list, in start position order, of sublists of ranges ordered so
127 * that each contains (or is the same as) the one that follows it
129 private List<NCNode<T>> subranges;
132 * Constructor given a list of things that are each located on a contiguous
133 * interval. Note that the constructor may reorder the list.
135 * We assume here that for each range, start <= end. Behaviour for reverse
136 * ordered ranges is undefined.
140 public NCList(List<T> ranges)
147 * Sorts and groups ranges into sublists where each sublist represents an
148 * interval and its contained subintervals
152 protected void build(List<T> ranges)
155 * sort and partition into subranges
156 * which have no mutual containment
158 List<IntervalI> sublists = partitionNestedSublists(ranges);
161 * convert each subrange to an NCNode consisting of a range and
162 * (possibly) its contained NCList
164 for (IntervalI sublist : sublists)
166 subranges.add(new NCNode<>(
167 ranges.subList(sublist.getBegin(), sublist.getEnd() + 1)));
170 size = ranges.size();
174 * Default constructor
178 subranges = new ArrayList<>();
182 * Sorts and traverses the ranges to identify sublists, whose start intervals
183 * are overlapping or disjoint but not mutually contained. Answers a list of
184 * start-end indices of the sorted list of ranges.
189 protected List<IntervalI> partitionNestedSublists(List<T> ranges)
191 List<IntervalI> sublists = new ArrayList<>();
193 if (ranges.isEmpty())
199 * sort by start ascending, length descending, so that
200 * contained intervals follow their containing interval
202 Collections.sort(ranges, new NCListBuilder<>().getComparator());
204 int listStartIndex = 0;
206 IntervalI lastParent = ranges.get(0);
207 boolean first = true;
209 for (int i = 0; i < ranges.size(); i++)
211 IntervalI nextInterval = ranges.get(i);
212 if (!first && !lastParent.properlyContainsInterval(nextInterval))
215 * this interval is not properly contained in the parent;
216 * close off the last sublist
218 sublists.add(new Range(listStartIndex, i - 1));
220 lastParent = nextInterval;
225 sublists.add(new Range(listStartIndex, ranges.size() - 1));
230 * Adds one entry to the stored set
235 public synchronized boolean add(final T entry)
237 final NCNode<T> newNode = new NCNode<>(entry);
243 * Adds one NCNode to this NCList
245 * This method does not update the <code>size</code> (interval count) of this
246 * NCList, as it may be used to rearrange nodes without changing their count.
247 * Callers should increment the count if needed.
251 protected void addNode(final NCNode<T> newNode)
253 final long start = newNode.getBegin();
254 final long end = newNode.getEnd();
255 size += newNode.size();
259 * 1) precedes all subranges - add as NCNode on front of list
260 * 2) follows all subranges - add as NCNode on end of list
261 * 3) matches a subrange - add as a sibling in the list
262 * 4) properly enclosed by a subrange - add recursively to subrange
263 * 5) properly encloses one or more subranges - push them inside it
264 * 6) spans two subranges - insert between them
268 * find the first subrange whose end does not precede entry's start
270 int candidateIndex = findFirstOverlap(start);
273 * search for maximal span of subranges i-k that the new entry
274 * encloses; or a subrange that encloses the new entry
276 boolean enclosing = false;
277 int firstEnclosed = 0;
278 int lastEnclosed = 0;
280 for (int j = candidateIndex; j < subranges.size(); j++)
282 NCNode<T> subrange = subranges.get(j);
284 if (subrange.equalsInterval(newNode))
287 * matching interval - insert adjacent
289 subranges.add(j, newNode);
293 if (end < subrange.getBegin() && !enclosing)
296 * new entry lies between subranges j-1 j
298 subranges.add(j, newNode);
302 if (subrange.properlyContainsInterval(newNode))
305 * push new entry inside this subrange as it encloses it
307 subrange.addNode(newNode);
311 if (start <= subrange.getBegin())
313 if (end >= subrange.getEnd())
316 * new entry encloses this subrange (and possibly preceding ones);
317 * continue to find the maximal list it encloses
330 * entry spans from before this subrange to inside it
335 * entry encloses one or more preceding subranges
337 push(newNode, firstEnclosed, lastEnclosed);
342 * entry overlaps two subranges but doesn't enclose either
345 subranges.add(j, newNode);
353 * drops through to here if new range encloses all others
354 * or overlaps the last one
358 push(newNode, firstEnclosed, lastEnclosed);
362 subranges.add(newNode);
367 public boolean contains(Object entry)
369 if (!(entry instanceof IntervalI))
373 IntervalI interval = (IntervalI) entry;
376 * find the first sublist that might overlap, i.e.
377 * the first whose end position is >= from
379 int candidateIndex = findFirstOverlap(interval.getBegin());
381 int to = interval.getEnd();
383 for (int i = candidateIndex; i < subranges.size(); i++)
385 NCNode<T> candidate = subranges.get(i);
386 if (candidate.getBegin() > to)
389 * we are past the end of our target range
393 if (candidate.contains(interval))
402 * Update the tree so that the new node encloses current subranges i to j
403 * (inclusive). That is, replace subranges i-j (inclusive) with a new subrange
404 * that contains them.
409 * @throws IllegalArgumentException
410 * if any of the subranges is not contained by the node's start-end
413 protected synchronized void push(NCNode<T> node, final int i,
416 for (int k = i; k <= j; k++)
418 NCNode<T> n = subranges.get(k);
419 if (!node.containsInterval(n)) {
420 throw new IllegalArgumentException("Can't push " + n.toString()
421 + " inside " + node.toString());
426 for (int k = j; k >= i; k--)
430 subranges.add(i, node);
434 * Answers a list of contained intervals that overlap the given range
440 public List<T> findOverlaps(long from, long to)
442 List<T> result = new ArrayList<>();
444 findOverlaps(from, to, result);
450 * Recursively searches the NCList adding any items that overlap the from-to
451 * range to the result list
457 protected void findOverlaps(long from, long to, List<T> result)
460 * find the first sublist that might overlap, i.e.
461 * the first whose end position is >= from
463 int candidateIndex = findFirstOverlap(from);
465 for (int i = candidateIndex; i < subranges.size(); i++)
467 NCNode<T> candidate = subranges.get(i);
468 if (candidate.getBegin() > to)
471 * we are past the end of our target range
475 candidate.findOverlaps(from, to, result);
481 * Search subranges for the first one whose end position is not before the
482 * target range's start position, i.e. the first one that may overlap the
483 * target range. Returns the index in the list of the first such range found,
484 * or the length of the list if none found.
489 protected int findFirstOverlap(final long from)
491 return BinarySearcher.findFirst(subranges,
492 val -> val.getEnd() >= from);
496 * Formats the tree as a bracketed list e.g.
499 * [1-100 [10-30 [10-20]], 15-30 [20-20]]
503 public String toString()
505 return subranges.toString();
509 * Answers the NCList as an indented list
513 public String prettyPrint()
515 StringBuilder sb = new StringBuilder(512);
518 prettyPrint(sb, offset, indent);
519 sb.append(System.lineSeparator());
520 return sb.toString();
528 void prettyPrint(StringBuilder sb, int offset, int indent)
530 boolean first = true;
531 for (NCNode<T> subrange : subranges)
535 sb.append(System.lineSeparator());
538 subrange.prettyPrint(sb, offset, indent);
543 * Answers true if the store's structure is valid (nesting containment rules
544 * are obeyed), else false. For use in testing and debugging.
548 public boolean isValid()
551 for (NCNode<T> subrange : subranges)
553 count += subrange.size();
559 return isValid(Integer.MIN_VALUE, Integer.MAX_VALUE);
563 * Answers true if the data held satisfy the rules of construction of an
564 * NCList bounded within the given start-end range, else false.
566 * Each subrange must lie within start-end (inclusive). Subranges must be
567 * ordered by start position ascending, and within that by end position
575 boolean isValid(final int start, final int end)
577 NCNode<T> lastRange = null;
579 for (NCNode<T> subrange : subranges)
581 if (subrange.getBegin() < start)
583 System.err.println("error in NCList: range " + subrange.toString()
584 + " starts before " + end);
587 if (subrange.getEnd() > end)
589 System.err.println("error in NCList: range " + subrange.toString()
590 + " ends after " + end);
594 if (lastRange != null)
596 if (subrange.getBegin() < lastRange.getBegin())
598 System.err.println("error in NCList: range " + subrange.toString()
599 + " starts before " + lastRange.toString());
602 if (subrange.properlyContainsInterval(lastRange))
604 System.err.println("error in NCList: range " + subrange.toString()
605 + " encloses preceding: " + lastRange.toString());
608 if (lastRange.properlyContainsInterval(subrange))
610 System.err.println("error in NCList: range " + subrange.toString()
611 + " enclosed by preceding: " + lastRange.toString());
615 lastRange = subrange;
617 if (!subrange.isValid())
626 * Answers the number of intervals stored
637 * Answers a list of all entries stored, in no guaranteed order. This method
638 * is not synchronized, so is not thread-safe.
640 public List<T> getEntries()
642 List<T> result = new ArrayList<>();
648 * Adds all contained entries to the given list
652 void getEntries(List<T> result)
654 for (NCNode<T> subrange : subranges)
656 subrange.getEntries(result);
661 * Removes the first interval <code>I</code>found that is equal to T
662 * (<code>I.equals(T)</code>). Answers true if an interval is removed, false
663 * if no match is found. This method is synchronized so thread-safe.
668 public synchronized boolean remove(T entry)
674 int i = findFirstOverlap(entry.getBegin());
676 for (; i < subranges.size(); i++)
678 NCNode<T> subrange = subranges.get(i);
679 if (subrange.getBegin() > entry.getBegin())
686 NCList<T> subRegions = subrange.getSubRegions();
688 if (subrange.getRegion().equals(entry))
691 * if the subrange is rooted on this entry, remove it,
692 * and remove and promote its subregions (if any)
695 size -= subrange.size();
696 if (subRegions != null)
698 for (NCNode<T> r : subRegions.subranges)
707 if (subrange.remove(entry))
718 * Answers the depth of interval nesting of this object, where 1 means there
719 * are no nested sub-intervals
723 public int getDepth()
726 for (NCNode<T> subrange : subranges)
728 subDepth = Math.max(subDepth, subrange.getDepth());
735 * Answers an iterator over the contained intervals, with no particular order
736 * guaranteed. The iterator does not support the optional <code>remove</code>
737 * operation (throws <code>UnsupportedOperationException</code> if attempted).
740 public Iterator<T> iterator()
742 return new NCListIterator();
746 public synchronized void clear()