X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2Ffeatures%2FNCList.java;h=b8160d3c7077516d528477ed10a3ea53edc2470a;hb=05d220bbea0e8ce667490219436b96ebdf9826df;hp=70466700a2eebd027cb0a7a117622d75be199038;hpb=51728d3951398f9c12d7017c776953f17322cc68;p=jalview.git diff --git a/src/jalview/datamodel/features/NCList.java b/src/jalview/datamodel/features/NCList.java index 7046670..b8160d3 100644 --- a/src/jalview/datamodel/features/NCList.java +++ b/src/jalview/datamodel/features/NCList.java @@ -1,19 +1,28 @@ package jalview.datamodel.features; +import jalview.datamodel.ContiguousI; +import jalview.datamodel.Range; + import java.util.ArrayList; import java.util.Collections; -import java.util.Comparator; import java.util.List; /** * An adapted implementation of NCList as described in the paper * *
- * todo
+ * Nested Containment List (NCList): a new algorithm for accelerating
+ * interval query of genome alignment and interval databases
+ * - Alexander V. Alekseyenko, Christopher J. Lee
+ * https://doi.org/10.1093/bioinformatics/btl647
  * 
*/ public class NCList { + /* + * the number of ranges represented + */ + private int size; /* * a list, in start position order, of sublists of ranges ordered so @@ -21,12 +30,6 @@ public class NCList */ private List> subranges; - /* - * a comparator to sort intervals by start position ascending, with - * longer (enclosing) intervals preceding those they enclose - */ - Comparator intervalSorter = new RangeComparator(true); - /** * Constructor given a list of things that are each located on a contiguous * interval. Note that the constructor may reorder the list. @@ -38,10 +41,14 @@ public class NCList */ public NCList(List ranges) { + this(); build(ranges); } /** + * Sort and group ranges into sublists where each sublist represents a region + * and its contained subregions + * * @param ranges */ protected void build(List ranges) @@ -50,29 +57,33 @@ public class NCList * sort by start ascending so that contained intervals * follow their containing interval */ - Collections.sort(ranges, intervalSorter); - - List sublists = findSubranges(ranges); + Collections.sort(ranges, RangeComparator.BY_START_POSITION); - subranges = new ArrayList>(); + List sublists = buildSubranges(ranges); /* * convert each subrange to an NCNode consisting of a range and * (possibly) its contained NCList */ - for (SubList sublist : sublists) + for (Range sublist : sublists) { - subranges.add(new NCNode(ranges.subList(sublist.startIndex, - sublist.endIndex + 1))); + subranges.add(new NCNode(ranges.subList(sublist.start, + sublist.end + 1))); } - sublists.clear(); + + size = ranges.size(); } public NCList(T entry) { - List ranges = new ArrayList(); - ranges.add(entry); - build(ranges); + this(); + subranges.add(new NCNode<>(entry)); + size = 1; + } + + public NCList() + { + subranges = new ArrayList>(); } /** @@ -82,10 +93,15 @@ public class NCList * @param ranges * @return */ - protected List findSubranges(List ranges) + protected List buildSubranges(List ranges) { - List sublists = new ArrayList(); + List sublists = new ArrayList<>(); + if (ranges.isEmpty()) + { + return sublists; + } + int listStartIndex = 0; long lastEndPos = Long.MAX_VALUE; @@ -100,23 +116,43 @@ public class NCList * this interval is not contained in the preceding one * close off the last sublist */ - sublists.add(new SubList(listStartIndex, i - 1)); + sublists.add(new Range(listStartIndex, i - 1)); listStartIndex = i; } lastEndPos = nextEnd; } - sublists.add(new SubList(listStartIndex, ranges.size() - 1)); + sublists.add(new Range(listStartIndex, ranges.size() - 1)); return sublists; } /** - * Adds one entry to the stored set + * Adds one entry to the stored set (with duplicates allowed) + * + * @param entry + */ + public void add(T entry) + { + add(entry, true); + } + + /** + * Adds one entry to the stored set, and returns true, unless allowDuplicates + * is set to false and it is already contained (by object equality test), in + * which case it is not added and this method returns false. * * @param entry + * @param allowDuplicates + * @return */ - public synchronized void add(T entry) + public synchronized boolean add(T entry, boolean allowDuplicates) { + if (!allowDuplicates && contains(entry)) + { + return false; + } + + size++; long start = entry.getBegin(); long end = entry.getEnd(); @@ -138,41 +174,42 @@ public class NCList /* * all subranges precede this one - add it on the end */ - subranges.add(new NCNode(entry)); - return; + subranges.add(new NCNode<>(entry)); + return true; } /* * search for maximal span of subranges i-k that the new entry * encloses; or a subrange that encloses the new entry */ - int i = candidateIndex; boolean enclosing = false; + int firstEnclosed = 0; + int lastEnclosed = 0; boolean overlapping = false; for (int j = candidateIndex; j < subranges.size(); j++) { NCNode subrange = subranges.get(j); - if (end < subrange.getStart() && !overlapping) + if (end < subrange.getBegin() && !overlapping && !enclosing) { /* * new entry lies between subranges j-1 j */ - subranges.add(j, new NCNode(entry)); - return; + subranges.add(j, new NCNode<>(entry)); + return true; } - if (subrange.getStart() <= start && subrange.getEnd() >= end) + if (subrange.getBegin() <= start && subrange.getEnd() >= end) { /* * push new entry inside this subrange as it encloses it */ subrange.add(entry); - return; + return true; } - if (start <= subrange.getStart()) + if (start <= subrange.getBegin()) { if (end >= subrange.getEnd()) { @@ -180,6 +217,11 @@ public class NCList * new entry encloses this subrange (and possibly preceding ones); * continue to find the maximal list it encloses */ + if (!enclosing) + { + firstEnclosed = j; + } + lastEnclosed = j; enclosing = true; continue; } @@ -193,8 +235,8 @@ public class NCList /* * entry encloses one or more preceding subranges */ - addEnclosingRange(entry, i, j - 1); - return; + addEnclosingRange(entry, firstEnclosed, lastEnclosed); + return true; } else { @@ -202,27 +244,102 @@ public class NCList * entry spans two subranges but doesn't enclose any * so just add it */ - subranges.add(j, new NCNode(entry)); - return; + subranges.add(j, new NCNode<>(entry)); + return true; } } } + else + { + overlapping = true; + } + } + + /* + * drops through to here if new range encloses all others + * or overlaps the last one + */ + if (enclosing) + { + addEnclosingRange(entry, firstEnclosed, lastEnclosed); } + else + { + subranges.add(new NCNode<>(entry)); + } + + return true; } /** + * Answers true if this NCList contains the given entry (by object equality + * test), else false + * + * @param entry + * @return + */ + public boolean contains(T entry) + { + /* + * find the first sublist that might overlap, i.e. + * the first whose end position is >= from + */ + int candidateIndex = findFirstOverlap(entry.getBegin()); + + if (candidateIndex == -1) + { + return false; + } + + int to = entry.getEnd(); + + for (int i = candidateIndex; i < subranges.size(); i++) + { + NCNode candidate = subranges.get(i); + if (candidate.getBegin() > to) + { + /* + * we are past the end of our target range + */ + break; + } + if (candidate.contains(entry)) + { + return true; + } + } + return false; + } + + /** * Update the tree so that the range of the new entry encloses subranges i to - * j (inclusive). That is, replace subranges i-j with a new subrange that - * contains them. + * j (inclusive). That is, replace subranges i-j (inclusive) with a new + * subrange that contains them. * * @param entry * @param i * @param j */ - protected synchronized void addEnclosingRange(T entry, int i, int j) + protected synchronized void addEnclosingRange(T entry, final int i, + final int j) { - // TODO how to rebuild the subranges as an NCList? + NCList newNCList = new NCList<>(); + newNCList.addNodes(subranges.subList(i, j + 1)); + NCNode newNode = new NCNode<>(entry, newNCList); + for (int k = j; k >= i; k--) + { + subranges.remove(k); + } + subranges.add(i, newNode); + } + protected void addNodes(List> nodes) + { + for (NCNode node : nodes) + { + subranges.add(node); + size += node.size(); + } } /** @@ -237,7 +354,7 @@ public class NCList */ public List findOverlaps(long from, long to) { - List result = new ArrayList(); + List result = new ArrayList<>(); findOverlaps(from, to, result); @@ -252,8 +369,7 @@ public class NCList * @param to * @param result */ - protected void findOverlaps(long from, long to, - List result) + protected void findOverlaps(long from, long to, List result) { /* * find the first sublist that might overlap, i.e. @@ -269,14 +385,14 @@ public class NCList for (int i = candidateIndex; i < subranges.size(); i++) { NCNode candidate = subranges.get(i); - if (candidate.getStart() > to) + if (candidate.getBegin() > to) { /* * we are past the end of our target range */ break; } - candidate.addOverlaps(from, to, result); + candidate.findOverlaps(from, to, result); } } @@ -292,8 +408,12 @@ public class NCList */ protected int findFirstOverlap(long from) { - // TODO binary search - // for now quick cheat linear search + /* + * The NCList paper describes binary search for this step, + * but this not implemented here as (a) I haven't understood it yet + * and (b) it seems to imply complications for adding to an NCList + */ + int i = 0; if (subranges != null) { @@ -321,4 +441,186 @@ public class NCList { return subranges.toString(); } + + /** + * Returns a string representation of the data where containment is shown by + * indentation on new lines + * + * @return + */ + public String prettyPrint() + { + StringBuilder sb = new StringBuilder(512); + int offset = 0; + int indent = 2; + prettyPrint(sb, offset, indent); + sb.append(System.lineSeparator()); + return sb.toString(); + } + + /** + * @param sb + * @param offset + * @param indent + */ + void prettyPrint(StringBuilder sb, int offset, int indent) + { + boolean first = true; + for (NCNode subrange : subranges) + { + if (!first) + { + sb.append(System.lineSeparator()); + } + first = false; + subrange.prettyPrint(sb, offset, indent); + } + } + + /** + * Answers true if the data held satisfy the rules of construction of an + * NCList, else false. + * + * @return + */ + public boolean isValid() + { + return isValid(Integer.MIN_VALUE, Integer.MAX_VALUE); + } + + /** + * Answers true if the data held satisfy the rules of construction of an + * NCList bounded within the given start-end range, else false. + *

+ * Each subrange must lie within start-end (inclusive). Subranges must be + * ordered by start position ascending. + *

+ * + * @param start + * @param end + * @return + */ + boolean isValid(final int start, final int end) + { + int lastStart = start; + for (NCNode subrange : subranges) + { + if (subrange.getBegin() < lastStart) + { + System.err.println("error in NCList: range " + subrange.toString() + + " starts before " + lastStart); + return false; + } + if (subrange.getEnd() > end) + { + System.err.println("error in NCList: range " + subrange.toString() + + " ends after " + end); + return false; + } + lastStart = subrange.getBegin(); + + if (!subrange.isValid()) + { + return false; + } + } + return true; + } + + /** + * Answers the lowest start position enclosed by the ranges + * + * @return + */ + public int getStart() + { + return subranges.isEmpty() ? 0 : subranges.get(0).getBegin(); + } + + /** + * Returns the number of ranges held (deep count) + * + * @return + */ + public int size() + { + return size; + } + + /** + * Returns a list of all entries stored + * + * @return + */ + public List getEntries() + { + List result = new ArrayList<>(); + getEntries(result); + return result; + } + + /** + * Adds all contained entries to the given list + * + * @param result + */ + void getEntries(List result) + { + for (NCNode subrange : subranges) + { + subrange.getEntries(result); + } + } + + /** + * Deletes the given entry from the store, returning true if it was found (and + * deleted), else false. This method makes no assumption that the entry is in + * the 'expected' place in the store, in case it has been modified since it + * was added. Only the first 'same object' match is deleted, not 'equal' or + * multiple objects. + * + * @param entry + */ + public synchronized boolean delete(T entry) + { + if (entry == null) + { + return false; + } + for (int i = 0; i < subranges.size(); i++) + { + NCNode subrange = subranges.get(i); + NCList subRegions = subrange.getSubRegions(); + + if (subrange.getRegion() == entry) + { + /* + * if the subrange is rooted on this entry, promote its + * subregions (if any) to replace the subrange here; + * NB have to resort subranges after doing this since e.g. + * [10-30 [12-20 [16-18], 13-19]] + * after deleting 12-20, 16-18 is promoted to sibling of 13-19 + * but should follow it in the list of subranges of 10-30 + */ + subranges.remove(i); + if (subRegions != null) + { + subranges.addAll(subRegions.subranges); + Collections.sort(subranges, RangeComparator.BY_START_POSITION); + } + size--; + return true; + } + else + { + if (subRegions != null && subRegions.delete(entry)) + { + size--; + subrange.deleteSubRegionsIfEmpty(); + return true; + } + } + } + return false; + } }