4 Copyright (c) 2018, Mungo Carstairs
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
10 * Redistributions of source code must retain the above copyright notice, this
11 list of conditions and the following disclaimer.
13 * Redistributions in binary form must reproduce the above copyright notice,
14 this list of conditions and the following disclaimer in the documentation
15 and/or other materials provided with the distribution.
17 * Neither the name of the copyright holder nor the names of its
18 contributors may be used to endorse or promote products derived from
19 this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 package intervalstore.impl;
34 import java.util.AbstractCollection;
35 import java.util.ArrayList;
36 import java.util.Collections;
37 import java.util.Iterator;
38 import java.util.List;
39 import java.util.NoSuchElementException;
41 import intervalstore.api.IntervalI;
42 import intervalstore.impl.BinarySearcher.Compare;
45 * An adapted implementation of NCList as described in the paper
48 * Nested Containment List (NCList): a new algorithm for accelerating
49 * interval query of genome alignment and interval databases
50 * - Alexander V. Alekseyenko, Christopher J. Lee
51 * https://doi.org/10.1093/bioinformatics/btl647
54 public class NCList<T extends IntervalI> extends AbstractCollection<T>
57 * A depth-first iterator over the elements stored in the NCList
59 private class NCListIterator implements Iterator<T>
63 Iterator<T> nodeIterator;
66 * Constructor bootstraps a pointer to an iterator over the first subrange
71 subrangeIndex = nextSubrange(-1);
75 * Moves the subrange iterator to the next subrange, and answers its index
76 * in the list of subranges. If there are no more, sets the iterator to null
81 private int nextSubrange(int after)
83 int nextIndex = after + 1;
84 if (nextIndex < subranges.size())
86 nodeIterator = subranges.get(nextIndex).iterator();
94 public boolean hasNext()
96 return nodeIterator != null && nodeIterator.hasNext();
100 * Answers the next element returned by the current NCNode's iterator, and
101 * advances the iterator (to the next NCNode if necessary)
106 if (nodeIterator == null || !nodeIterator.hasNext())
108 throw new NoSuchElementException();
110 T result = nodeIterator.next();
112 if (!nodeIterator.hasNext())
114 subrangeIndex = nextSubrange(subrangeIndex);
122 * the number of interval instances represented
127 * a list, in start position order, of sublists of ranges ordered so
128 * that each contains (or is the same as) the one that follows it
130 private List<NCNode<T>> subranges;
133 * Constructor given a list of things that are each located on a contiguous
134 * interval. Note that the constructor may reorder the list.
136 * We assume here that for each range, start <= end. Behaviour for reverse
137 * ordered ranges is undefined.
141 public NCList(List<T> ranges)
148 * Sorts and groups ranges into sublists where each sublist represents an
149 * interval and its contained subintervals
153 protected void build(List<T> ranges)
156 * sort and partition into subranges
157 * which have no mutual containment
159 List<IntervalI> sublists = partitionNestedSublists(ranges);
162 * convert each subrange to an NCNode consisting of a range and
163 * (possibly) its contained NCList
165 for (IntervalI sublist : sublists)
167 subranges.add(new NCNode<>(
168 ranges.subList(sublist.getBegin(), sublist.getEnd() + 1)));
171 size = ranges.size();
175 * Default constructor
179 subranges = new ArrayList<>();
183 * Sorts and traverses the ranges to identify sublists, whose start intervals
184 * are overlapping or disjoint but not mutually contained. Answers a list of
185 * start-end indices of the sorted list of ranges.
190 protected List<IntervalI> partitionNestedSublists(List<T> ranges)
192 List<IntervalI> sublists = new ArrayList<>();
194 if (ranges.isEmpty())
200 * sort by start ascending, length descending, so that
201 * contained intervals follow their containing interval
203 Collections.sort(ranges, IntervalI.COMPARE_BEGIN_ASC_END_DESC);
205 int listStartIndex = 0;
207 IntervalI lastParent = ranges.get(0);
208 boolean first = true;
210 for (int i = 0; i < ranges.size(); i++)
212 IntervalI nextInterval = ranges.get(i);
213 if (!first && !lastParent.properlyContainsInterval(nextInterval))
216 * this interval is not properly contained in the parent;
217 * close off the last sublist
219 sublists.add(new Range(listStartIndex, i - 1));
221 lastParent = nextInterval;
226 sublists.add(new Range(listStartIndex, ranges.size() - 1));
231 * Adds one entry to the stored set
236 public synchronized boolean add(final T entry)
238 final NCNode<T> newNode = new NCNode<>(entry);
244 * Adds one NCNode to this NCList
246 * This method does not update the <code>size</code> (interval count) of this
247 * NCList, as it may be used to rearrange nodes without changing their count.
248 * Callers should increment the count if needed.
252 protected void addNode(final NCNode<T> newNode)
254 final long start = newNode.getBegin();
255 final long end = newNode.getEnd();
256 size += newNode.size();
260 * 1) precedes all subranges - add as NCNode on front of list
261 * 2) follows all subranges - add as NCNode on end of list
262 * 3) matches a subrange - add as a sibling in the list
263 * 4) properly enclosed by a subrange - add recursively to subrange
264 * 5) properly encloses one or more subranges - push them inside it
265 * 6) spans two subranges - insert between them
269 * find the first subrange whose end does not precede entry's start
271 int candidateIndex = findFirstOverlap(start);
274 * search for maximal span of subranges i-k that the new entry
275 * encloses; or a subrange that encloses the new entry
277 boolean enclosing = false;
278 int firstEnclosed = 0;
279 int lastEnclosed = 0;
281 for (int j = candidateIndex; j < subranges.size(); j++)
283 NCNode<T> subrange = subranges.get(j);
285 if (subrange.equalsInterval(newNode))
288 * matching interval - insert adjacent
290 subranges.add(j, newNode);
294 if (end < subrange.getBegin() && !enclosing)
297 * new entry lies between subranges j-1 j
299 subranges.add(j, newNode);
303 if (subrange.properlyContainsInterval(newNode))
306 * push new entry inside this subrange as it encloses it
308 subrange.addNode(newNode);
312 if (start <= subrange.getBegin())
314 if (end >= subrange.getEnd())
317 * new entry encloses this subrange (and possibly preceding ones);
318 * continue to find the maximal list it encloses
331 * entry spans from before this subrange to inside it
336 * entry encloses one or more preceding subranges
338 push(newNode, firstEnclosed, lastEnclosed);
343 * entry overlaps two subranges but doesn't enclose either
346 subranges.add(j, newNode);
354 * drops through to here if new range encloses all others
355 * or overlaps the last one
359 push(newNode, firstEnclosed, lastEnclosed);
363 subranges.add(newNode);
368 public boolean contains(Object entry)
370 if (!(entry instanceof IntervalI))
374 IntervalI interval = (IntervalI) entry;
377 * find the first sublist that might overlap, i.e.
378 * the first whose end position is >= from
380 int candidateIndex = findFirstOverlap(interval.getBegin());
382 int to = interval.getEnd();
384 for (int i = candidateIndex; i < subranges.size(); i++)
386 NCNode<T> candidate = subranges.get(i);
387 if (candidate.getBegin() > to)
390 * we are past the end of our target range
394 if (candidate.contains(interval))
403 * Update the tree so that the new node encloses current subranges i to j
404 * (inclusive). That is, replace subranges i-j (inclusive) with a new subrange
405 * that contains them.
410 * @throws IllegalArgumentException
411 * if any of the subranges is not contained by the node's start-end
414 protected synchronized void push(NCNode<T> node, final int i,
417 for (int k = i; k <= j; k++)
419 NCNode<T> n = subranges.get(k);
420 if (!node.containsInterval(n)) {
421 throw new IllegalArgumentException("Can't push " + n.toString()
422 + " inside " + node.toString());
427 for (int k = j; k >= i; k--)
431 subranges.add(i, node);
435 * Answers a list of contained intervals that overlap the given range
441 public List<T> findOverlaps(long from, long to)
443 List<T> result = new ArrayList<>();
445 findOverlaps(from, to, result);
451 * Recursively searches the NCList adding any items that overlap the from-to
452 * range to the result list
458 protected void findOverlaps(long from, long to, List<T> result)
461 * find the first sublist that might overlap, i.e.
462 * the first whose end position is >= from
464 int candidateIndex = findFirstOverlap(from);
466 for (int i = candidateIndex; i < subranges.size(); i++)
468 NCNode<T> candidate = subranges.get(i);
469 if (candidate.getBegin() > to)
472 * we are past the end of our target range
476 candidate.findOverlaps(from, to, result);
482 * Search subranges for the first one whose end position is not before the
483 * target range's start position, i.e. the first one that may overlap the
484 * target range. Returns the index in the list of the first such range found,
485 * or the length of the list if none found.
490 protected int findFirstOverlap(final long from)
492 return BinarySearcher.findFirst(subranges, false, Compare.GE,
497 * Formats the tree as a bracketed list e.g.
500 * [1-100 [10-30 [10-20]], 15-30 [20-20]]
504 public String toString()
506 return subranges.toString();
510 * Answers the NCList as an indented list
514 public String prettyPrint()
516 StringBuilder sb = new StringBuilder(512);
519 prettyPrint(sb, offset, indent);
520 sb.append(System.lineSeparator());
521 return sb.toString();
529 void prettyPrint(StringBuilder sb, int offset, int indent)
531 boolean first = true;
532 for (NCNode<T> subrange : subranges)
536 sb.append(System.lineSeparator());
539 subrange.prettyPrint(sb, offset, indent);
544 * Answers true if the store's structure is valid (nesting containment rules
545 * are obeyed), else false. For use in testing and debugging.
549 public boolean isValid()
552 for (NCNode<T> subrange : subranges)
554 count += subrange.size();
560 return isValid(Integer.MIN_VALUE, Integer.MAX_VALUE);
564 * Answers true if the data held satisfy the rules of construction of an
565 * NCList bounded within the given start-end range, else false.
567 * Each subrange must lie within start-end (inclusive). Subranges must be
568 * ordered by start position ascending, and within that by end position
576 boolean isValid(final int start, final int end)
578 NCNode<T> lastRange = null;
580 for (NCNode<T> subrange : subranges)
582 if (subrange.getBegin() < start)
584 System.err.println("error in NCList: range " + subrange.toString()
585 + " starts before " + end);
588 if (subrange.getEnd() > end)
590 System.err.println("error in NCList: range " + subrange.toString()
591 + " ends after " + end);
595 if (lastRange != null)
597 if (subrange.getBegin() < lastRange.getBegin())
599 System.err.println("error in NCList: range " + subrange.toString()
600 + " starts before " + lastRange.toString());
603 if (subrange.properlyContainsInterval(lastRange))
605 System.err.println("error in NCList: range " + subrange.toString()
606 + " encloses preceding: " + lastRange.toString());
609 if (lastRange.properlyContainsInterval(subrange))
611 System.err.println("error in NCList: range " + subrange.toString()
612 + " enclosed by preceding: " + lastRange.toString());
616 lastRange = subrange;
618 if (!subrange.isValid())
627 * Answers the number of intervals stored
638 * Answers a list of all entries stored, in no guaranteed order. This method
639 * is not synchronized, so is not thread-safe.
641 public List<T> getEntries()
643 List<T> result = new ArrayList<>();
649 * Adds all contained entries to the given list
653 void getEntries(List<T> result)
655 for (NCNode<T> subrange : subranges)
657 subrange.getEntries(result);
662 * Removes the first interval <code>I</code>found that is equal to T
663 * (<code>I.equals(T)</code>). Answers true if an interval is removed, false
664 * if no match is found. This method is synchronized so thread-safe.
669 public synchronized boolean remove(T entry)
675 int i = findFirstOverlap(entry.getBegin());
677 for (; i < subranges.size(); i++)
679 NCNode<T> subrange = subranges.get(i);
680 if (subrange.getBegin() > entry.getBegin())
687 NCList<T> subRegions = subrange.getSubRegions();
689 if (subrange.getRegion().equals(entry))
692 * if the subrange is rooted on this entry, remove it,
693 * and remove and promote its subregions (if any)
696 size -= subrange.size();
697 if (subRegions != null)
699 for (NCNode<T> r : subRegions.subranges)
708 if (subrange.remove(entry))
719 * Answers the depth of interval nesting of this object, where 1 means there
720 * are no nested sub-intervals
724 public int getDepth()
727 for (NCNode<T> subrange : subranges)
729 subDepth = Math.max(subDepth, subrange.getDepth());
736 * Answers an iterator over the contained intervals, with no particular order
737 * guaranteed. The iterator does not support the optional <code>remove</code>
738 * operation (throws <code>UnsupportedOperationException</code> if attempted).
741 public Iterator<T> iterator()
743 return new NCListIterator();
747 public synchronized void clear()