2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.datamodel.features;
23 import jalview.datamodel.ContiguousI;
24 import jalview.datamodel.Range;
26 import java.util.ArrayList;
27 import java.util.Collections;
28 import java.util.List;
31 * An adapted implementation of NCList as described in the paper
34 * Nested Containment List (NCList): a new algorithm for accelerating
35 * interval query of genome alignment and interval databases
36 * - Alexander V. Alekseyenko, Christopher J. Lee
37 * https://doi.org/10.1093/bioinformatics/btl647
40 public class NCList<T extends ContiguousI>
43 * the number of ranges represented
48 * a list, in start position order, of sublists of ranges ordered so
49 * that each contains (or is the same as) the one that follows it
51 private List<NCNode<T>> subranges;
54 * Constructor given a list of things that are each located on a contiguous
55 * interval. Note that the constructor may reorder the list.
57 * We assume here that for each range, start <= end. Behaviour for reverse
58 * ordered ranges is undefined.
62 public NCList(List<T> ranges)
69 * Sort and group ranges into sublists where each sublist represents a region
70 * and its contained subregions
74 protected void build(List<T> ranges)
77 * sort by start ascending so that contained intervals
78 * follow their containing interval
80 Collections.sort(ranges, RangeComparator.BY_START_POSITION);
82 List<Range> sublists = buildSubranges(ranges);
85 * convert each subrange to an NCNode consisting of a range and
86 * (possibly) its contained NCList
88 for (Range sublist : sublists)
90 subranges.add(new NCNode<T>(ranges.subList(sublist.start,
97 public NCList(T entry)
100 subranges.add(new NCNode<>(entry));
106 subranges = new ArrayList<NCNode<T>>();
110 * Traverses the sorted ranges to identify sublists, within which each
111 * interval contains the one that follows it
116 protected List<Range> buildSubranges(List<T> ranges)
118 List<Range> sublists = new ArrayList<>();
120 if (ranges.isEmpty())
125 int listStartIndex = 0;
126 long lastEndPos = Long.MAX_VALUE;
128 for (int i = 0; i < ranges.size(); i++)
130 ContiguousI nextInterval = ranges.get(i);
131 long nextStart = nextInterval.getBegin();
132 long nextEnd = nextInterval.getEnd();
133 if (nextStart > lastEndPos || nextEnd > lastEndPos)
136 * this interval is not contained in the preceding one
137 * close off the last sublist
139 sublists.add(new Range(listStartIndex, i - 1));
142 lastEndPos = nextEnd;
145 sublists.add(new Range(listStartIndex, ranges.size() - 1));
150 * Adds one entry to the stored set (with duplicates allowed)
154 public void add(T entry)
160 * Adds one entry to the stored set, and returns true, unless allowDuplicates
161 * is set to false and it is already contained (by object equality test), in
162 * which case it is not added and this method returns false.
165 * @param allowDuplicates
168 public synchronized boolean add(T entry, boolean allowDuplicates)
170 if (!allowDuplicates && contains(entry))
176 long start = entry.getBegin();
177 long end = entry.getEnd();
181 * - precedes all subranges: add as NCNode on front of list
182 * - follows all subranges: add as NCNode on end of list
183 * - enclosed by a subrange - add recursively to subrange
184 * - encloses one or more subranges - push them inside it
185 * - none of the above - add as a new node and resort nodes list (?)
189 * find the first subrange whose end does not precede entry's start
191 int candidateIndex = findFirstOverlap(start);
192 if (candidateIndex == -1)
195 * all subranges precede this one - add it on the end
197 subranges.add(new NCNode<>(entry));
202 * search for maximal span of subranges i-k that the new entry
203 * encloses; or a subrange that encloses the new entry
205 boolean enclosing = false;
206 int firstEnclosed = 0;
207 int lastEnclosed = 0;
208 boolean overlapping = false;
210 for (int j = candidateIndex; j < subranges.size(); j++)
212 NCNode<T> subrange = subranges.get(j);
214 if (end < subrange.getBegin() && !overlapping && !enclosing)
217 * new entry lies between subranges j-1 j
219 subranges.add(j, new NCNode<>(entry));
223 if (subrange.getBegin() <= start && subrange.getEnd() >= end)
226 * push new entry inside this subrange as it encloses it
232 if (start <= subrange.getBegin())
234 if (end >= subrange.getEnd())
237 * new entry encloses this subrange (and possibly preceding ones);
238 * continue to find the maximal list it encloses
251 * entry spans from before this subrange to inside it
256 * entry encloses one or more preceding subranges
258 addEnclosingRange(entry, firstEnclosed, lastEnclosed);
264 * entry spans two subranges but doesn't enclose any
267 subranges.add(j, new NCNode<>(entry));
279 * drops through to here if new range encloses all others
280 * or overlaps the last one
284 addEnclosingRange(entry, firstEnclosed, lastEnclosed);
288 subranges.add(new NCNode<>(entry));
295 * Answers true if this NCList contains the given entry (by object equality
301 public boolean contains(T entry)
304 * find the first sublist that might overlap, i.e.
305 * the first whose end position is >= from
307 int candidateIndex = findFirstOverlap(entry.getBegin());
309 if (candidateIndex == -1)
314 int to = entry.getEnd();
316 for (int i = candidateIndex; i < subranges.size(); i++)
318 NCNode<T> candidate = subranges.get(i);
319 if (candidate.getBegin() > to)
322 * we are past the end of our target range
326 if (candidate.contains(entry))
335 * Update the tree so that the range of the new entry encloses subranges i to
336 * j (inclusive). That is, replace subranges i-j (inclusive) with a new
337 * subrange that contains them.
343 protected synchronized void addEnclosingRange(T entry, final int i,
346 NCList<T> newNCList = new NCList<>();
347 newNCList.addNodes(subranges.subList(i, j + 1));
348 NCNode<T> newNode = new NCNode<>(entry, newNCList);
349 for (int k = j; k >= i; k--)
353 subranges.add(i, newNode);
356 protected void addNodes(List<NCNode<T>> nodes)
358 for (NCNode<T> node : nodes)
366 * Returns a (possibly empty) list of items whose extent overlaps the given
370 * start of overlap range (inclusive)
372 * end of overlap range (inclusive)
375 public List<T> findOverlaps(long from, long to)
377 List<T> result = new ArrayList<>();
379 findOverlaps(from, to, result);
385 * Recursively searches the NCList adding any items that overlap the from-to
386 * range to the result list
392 protected void findOverlaps(long from, long to, List<T> result)
395 * find the first sublist that might overlap, i.e.
396 * the first whose end position is >= from
398 int candidateIndex = findFirstOverlap(from);
400 if (candidateIndex == -1)
405 for (int i = candidateIndex; i < subranges.size(); i++)
407 NCNode<T> candidate = subranges.get(i);
408 if (candidate.getBegin() > to)
411 * we are past the end of our target range
415 candidate.findOverlaps(from, to, result);
421 * Search subranges for the first one whose end position is not before the
422 * target range's start position, i.e. the first one that may overlap the
423 * target range. Returns the index in the list of the first such range found,
424 * or -1 if none found.
429 protected int findFirstOverlap(long from)
432 * The NCList paper describes binary search for this step,
433 * but this not implemented here as (a) I haven't understood it yet
434 * and (b) it seems to imply complications for adding to an NCList
438 if (subranges != null)
440 for (NCNode<T> subrange : subranges)
442 if (subrange.getEnd() >= from)
453 * Formats the tree as a bracketed list e.g.
456 * [1-100 [10-30 [10-20]], 15-30 [20-20]]
460 public String toString()
462 return subranges.toString();
466 * Returns a string representation of the data where containment is shown by
467 * indentation on new lines
471 public String prettyPrint()
473 StringBuilder sb = new StringBuilder(512);
476 prettyPrint(sb, offset, indent);
477 sb.append(System.lineSeparator());
478 return sb.toString();
486 void prettyPrint(StringBuilder sb, int offset, int indent)
488 boolean first = true;
489 for (NCNode<T> subrange : subranges)
493 sb.append(System.lineSeparator());
496 subrange.prettyPrint(sb, offset, indent);
501 * Answers true if the data held satisfy the rules of construction of an
502 * NCList, else false.
506 public boolean isValid()
508 return isValid(Integer.MIN_VALUE, Integer.MAX_VALUE);
512 * Answers true if the data held satisfy the rules of construction of an
513 * NCList bounded within the given start-end range, else false.
515 * Each subrange must lie within start-end (inclusive). Subranges must be
516 * ordered by start position ascending.
523 boolean isValid(final int start, final int end)
525 int lastStart = start;
526 for (NCNode<T> subrange : subranges)
528 if (subrange.getBegin() < lastStart)
530 System.err.println("error in NCList: range " + subrange.toString()
531 + " starts before " + lastStart);
534 if (subrange.getEnd() > end)
536 System.err.println("error in NCList: range " + subrange.toString()
537 + " ends after " + end);
540 lastStart = subrange.getBegin();
542 if (!subrange.isValid())
551 * Answers the lowest start position enclosed by the ranges
555 public int getStart()
557 return subranges.isEmpty() ? 0 : subranges.get(0).getBegin();
561 * Returns the number of ranges held (deep count)
571 * Returns a list of all entries stored
575 public List<T> getEntries()
577 List<T> result = new ArrayList<>();
583 * Adds all contained entries to the given list
587 void getEntries(List<T> result)
589 for (NCNode<T> subrange : subranges)
591 subrange.getEntries(result);
596 * Deletes the given entry from the store, returning true if it was found (and
597 * deleted), else false. This method makes no assumption that the entry is in
598 * the 'expected' place in the store, in case it has been modified since it
599 * was added. Only the first 'same object' match is deleted, not 'equal' or
604 public synchronized boolean delete(T entry)
610 for (int i = 0; i < subranges.size(); i++)
612 NCNode<T> subrange = subranges.get(i);
613 NCList<T> subRegions = subrange.getSubRegions();
615 if (subrange.getRegion() == entry)
618 * if the subrange is rooted on this entry, promote its
619 * subregions (if any) to replace the subrange here;
620 * NB have to resort subranges after doing this since e.g.
621 * [10-30 [12-20 [16-18], 13-19]]
622 * after deleting 12-20, 16-18 is promoted to sibling of 13-19
623 * but should follow it in the list of subranges of 10-30
626 if (subRegions != null)
628 subranges.addAll(subRegions.subranges);
629 Collections.sort(subranges, RangeComparator.BY_START_POSITION);
636 if (subRegions != null && subRegions.delete(entry))
639 subrange.deleteSubRegionsIfEmpty();