1 package jalview.datamodel.features;
3 import jalview.datamodel.ContiguousI;
4 import jalview.datamodel.Range;
6 import java.util.ArrayList;
7 import java.util.Collections;
11 * An adapted implementation of NCList as described in the paper
14 * Nested Containment List (NCList): a new algorithm for accelerating
15 * interval query of genome alignment and interval databases
16 * - Alexander V. Alekseyenko, Christopher J. Lee
17 * https://doi.org/10.1093/bioinformatics/btl647
20 public class NCList<T extends ContiguousI>
23 * the number of ranges represented
28 * a list, in start position order, of sublists of ranges ordered so
29 * that each contains (or is the same as) the one that follows it
31 private List<NCNode<T>> subranges;
34 * Constructor given a list of things that are each located on a contiguous
35 * interval. Note that the constructor may reorder the list.
37 * We assume here that for each range, start <= end. Behaviour for reverse
38 * ordered ranges is undefined.
42 public NCList(List<T> ranges)
49 * Sort and group ranges into sublists where each sublist represents a region
50 * and its contained subregions
54 protected void build(List<T> ranges)
57 * sort by start ascending so that contained intervals
58 * follow their containing interval
60 Collections.sort(ranges, RangeComparator.BY_START_POSITION);
62 List<Range> sublists = buildSubranges(ranges);
65 * convert each subrange to an NCNode consisting of a range and
66 * (possibly) its contained NCList
68 for (Range sublist : sublists)
70 subranges.add(new NCNode<T>(ranges.subList(sublist.start,
77 public NCList(T entry)
80 subranges.add(new NCNode<>(entry));
86 subranges = new ArrayList<NCNode<T>>();
90 * Traverses the sorted ranges to identify sublists, within which each
91 * interval contains the one that follows it
96 protected List<Range> buildSubranges(List<T> ranges)
98 List<Range> sublists = new ArrayList<>();
100 if (ranges.isEmpty())
105 int listStartIndex = 0;
106 long lastEndPos = Long.MAX_VALUE;
108 for (int i = 0; i < ranges.size(); i++)
110 ContiguousI nextInterval = ranges.get(i);
111 long nextStart = nextInterval.getBegin();
112 long nextEnd = nextInterval.getEnd();
113 if (nextStart > lastEndPos || nextEnd > lastEndPos)
116 * this interval is not contained in the preceding one
117 * close off the last sublist
119 sublists.add(new Range(listStartIndex, i - 1));
122 lastEndPos = nextEnd;
125 sublists.add(new Range(listStartIndex, ranges.size() - 1));
130 * Adds one entry to the stored set (with duplicates allowed)
134 public void add(T entry)
140 * Adds one entry to the stored set, and returns true, unless allowDuplicates
141 * is set to false and it is already contained (by object equality test), in
142 * which case it is not added and this method returns false.
145 * @param allowDuplicates
148 public synchronized boolean add(T entry, boolean allowDuplicates)
150 if (!allowDuplicates && contains(entry))
156 long start = entry.getBegin();
157 long end = entry.getEnd();
161 * - precedes all subranges: add as NCNode on front of list
162 * - follows all subranges: add as NCNode on end of list
163 * - enclosed by a subrange - add recursively to subrange
164 * - encloses one or more subranges - push them inside it
165 * - none of the above - add as a new node and resort nodes list (?)
169 * find the first subrange whose end does not precede entry's start
171 int candidateIndex = findFirstOverlap(start);
172 if (candidateIndex == -1)
175 * all subranges precede this one - add it on the end
177 subranges.add(new NCNode<>(entry));
182 * search for maximal span of subranges i-k that the new entry
183 * encloses; or a subrange that encloses the new entry
185 boolean enclosing = false;
186 int firstEnclosed = 0;
187 int lastEnclosed = 0;
188 boolean overlapping = false;
190 for (int j = candidateIndex; j < subranges.size(); j++)
192 NCNode<T> subrange = subranges.get(j);
194 if (end < subrange.getBegin() && !overlapping && !enclosing)
197 * new entry lies between subranges j-1 j
199 subranges.add(j, new NCNode<>(entry));
203 if (subrange.getBegin() <= start && subrange.getEnd() >= end)
206 * push new entry inside this subrange as it encloses it
212 if (start <= subrange.getBegin())
214 if (end >= subrange.getEnd())
217 * new entry encloses this subrange (and possibly preceding ones);
218 * continue to find the maximal list it encloses
231 * entry spans from before this subrange to inside it
236 * entry encloses one or more preceding subranges
238 addEnclosingRange(entry, firstEnclosed, lastEnclosed);
244 * entry spans two subranges but doesn't enclose any
247 subranges.add(j, new NCNode<>(entry));
259 * drops through to here if new range encloses all others
260 * or overlaps the last one
264 addEnclosingRange(entry, firstEnclosed, lastEnclosed);
268 subranges.add(new NCNode<>(entry));
275 * Answers true if this NCList contains the given entry (by object equality
281 public boolean contains(T entry)
284 * find the first sublist that might overlap, i.e.
285 * the first whose end position is >= from
287 int candidateIndex = findFirstOverlap(entry.getBegin());
289 if (candidateIndex == -1)
294 int to = entry.getEnd();
296 for (int i = candidateIndex; i < subranges.size(); i++)
298 NCNode<T> candidate = subranges.get(i);
299 if (candidate.getBegin() > to)
302 * we are past the end of our target range
306 if (candidate.contains(entry))
315 * Update the tree so that the range of the new entry encloses subranges i to
316 * j (inclusive). That is, replace subranges i-j (inclusive) with a new
317 * subrange that contains them.
323 protected synchronized void addEnclosingRange(T entry, final int i,
326 NCList<T> newNCList = new NCList<>();
327 newNCList.addNodes(subranges.subList(i, j + 1));
328 NCNode<T> newNode = new NCNode<>(entry, newNCList);
329 for (int k = j; k >= i; k--)
333 subranges.add(i, newNode);
336 protected void addNodes(List<NCNode<T>> nodes)
338 for (NCNode<T> node : nodes)
346 * Returns a (possibly empty) list of items whose extent overlaps the given
350 * start of overlap range (inclusive)
352 * end of overlap range (inclusive)
355 public List<T> findOverlaps(long from, long to)
357 List<T> result = new ArrayList<>();
359 findOverlaps(from, to, result);
365 * Recursively searches the NCList adding any items that overlap the from-to
366 * range to the result list
372 protected void findOverlaps(long from, long to, List<T> result)
375 * find the first sublist that might overlap, i.e.
376 * the first whose end position is >= from
378 int candidateIndex = findFirstOverlap(from);
380 if (candidateIndex == -1)
385 for (int i = candidateIndex; i < subranges.size(); i++)
387 NCNode<T> candidate = subranges.get(i);
388 if (candidate.getBegin() > to)
391 * we are past the end of our target range
395 candidate.findOverlaps(from, to, result);
401 * Search subranges for the first one whose end position is not before the
402 * target range's start position, i.e. the first one that may overlap the
403 * target range. Returns the index in the list of the first such range found,
404 * or -1 if none found.
409 protected int findFirstOverlap(long from)
412 * The NCList paper describes binary search for this step,
413 * but this not implemented here as (a) I haven't understood it yet
414 * and (b) it seems to imply complications for adding to an NCList
418 if (subranges != null)
420 for (NCNode<T> subrange : subranges)
422 if (subrange.getEnd() >= from)
433 * Formats the tree as a bracketed list e.g.
436 * [1-100 [10-30 [10-20]], 15-30 [20-20]]
440 public String toString()
442 return subranges.toString();
446 * Returns a string representation of the data where containment is shown by
447 * indentation on new lines
451 public String prettyPrint()
453 StringBuilder sb = new StringBuilder(512);
456 prettyPrint(sb, offset, indent);
457 sb.append(System.lineSeparator());
458 return sb.toString();
466 void prettyPrint(StringBuilder sb, int offset, int indent)
468 boolean first = true;
469 for (NCNode<T> subrange : subranges)
473 sb.append(System.lineSeparator());
476 subrange.prettyPrint(sb, offset, indent);
481 * Answers true if the data held satisfy the rules of construction of an
482 * NCList, else false.
486 public boolean isValid()
488 return isValid(Integer.MIN_VALUE, Integer.MAX_VALUE);
492 * Answers true if the data held satisfy the rules of construction of an
493 * NCList bounded within the given start-end range, else false.
495 * Each subrange must lie within start-end (inclusive). Subranges must be
496 * ordered by start position ascending.
503 boolean isValid(final int start, final int end)
505 int lastStart = start;
506 for (NCNode<T> subrange : subranges)
508 if (subrange.getBegin() < lastStart)
510 System.err.println("error in NCList: range " + subrange.toString()
511 + " starts before " + lastStart);
514 if (subrange.getEnd() > end)
516 System.err.println("error in NCList: range " + subrange.toString()
517 + " ends after " + end);
520 lastStart = subrange.getBegin();
522 if (!subrange.isValid())
531 * Answers the lowest start position enclosed by the ranges
535 public int getStart()
537 return subranges.isEmpty() ? 0 : subranges.get(0).getBegin();
541 * Returns the number of ranges held (deep count)
551 * Returns a list of all entries stored
555 public List<T> getEntries()
557 List<T> result = new ArrayList<>();
563 * Adds all contained entries to the given list
567 void getEntries(List<T> result)
569 for (NCNode<T> subrange : subranges)
571 subrange.getEntries(result);
576 * Deletes the given entry from the store, returning true if it was found (and
577 * deleted), else false. This method makes no assumption that the entry is in
578 * the 'expected' place in the store, in case it has been modified since it
579 * was added. Only the first 'same object' match is deleted, not 'equal' or
584 public synchronized boolean delete(T entry)
590 for (int i = 0; i < subranges.size(); i++)
592 NCNode<T> subrange = subranges.get(i);
593 NCList<T> subRegions = subrange.getSubRegions();
595 if (subrange.getRegion() == entry)
598 * if the subrange is rooted on this entry, promote its
599 * subregions (if any) to replace the subrange here;
600 * NB have to resort subranges after doing this since e.g.
601 * [10-30 [12-20 [16-18], 13-19]]
602 * after deleting 12-20, 16-18 is promoted to sibling of 13-19
603 * but should follow it in the list of subranges of 10-30
606 if (subRegions != null)
608 subranges.addAll(subRegions.subranges);
609 Collections.sort(subranges, RangeComparator.BY_START_POSITION);
616 if (subRegions != null && subRegions.delete(entry))
619 subrange.deleteSubRegionsIfEmpty();