JAL-2446 FeatureStore with NCList - work in progress
[jalview.git] / src / jalview / datamodel / features / FeatureStore.java
diff --git a/src/jalview/datamodel/features/FeatureStore.java b/src/jalview/datamodel/features/FeatureStore.java
new file mode 100644 (file)
index 0000000..bd94c8a
--- /dev/null
@@ -0,0 +1,445 @@
+package jalview.datamodel.features;
+
+import jalview.datamodel.SequenceFeature;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+/**
+ * A data store for a set of sequence features that supports efficient lookup of
+ * features overlapping a given range.
+ * 
+ * @author gmcarstairs
+ *
+ */
+public class FeatureStore
+{
+  Comparator<ContiguousI> startOrdering = new RangeComparator(true);
+
+  Comparator<ContiguousI> endOrdering = new RangeComparator(false);
+
+  /*
+   * An ordered list of features, with the promise that no feature in the list 
+   * properly contains any other. This constraint allows bounded linear search
+   * of the list for features overlapping a region.
+   * Contact features are not included in this list.
+   */
+  List<SequenceFeature> nonNestedFeatures;
+
+  /*
+   * contact features ordered by first contact position
+   */
+  List<SequenceFeature> contactFeatureStarts;
+
+  /*
+   * contact features ordered by second contact position
+   */
+  List<SequenceFeature> contactFeatureEnds;
+
+  /*
+   * Nested Containment List is used to hold any features that are nested 
+   * within (properly contained by) any other feature. This is a recursive tree
+   * which supports depth-first scan for features overlapping a range.
+   * It is used here as a 'catch-all' fallback for features that cannot be put
+   * into a simple ordered list without invalidating the search methods.
+   */
+  NCList<SequenceFeature> nestedFeatures;
+
+  /**
+   * Constructor
+   */
+  public FeatureStore()
+  {
+    nonNestedFeatures = new ArrayList<SequenceFeature>();
+    // we only construct contactFeatures and the NCList if we need to
+  }
+
+  /**
+   * Add one entry to the data store
+   * 
+   * @param feature
+   */
+  public void addFeature(SequenceFeature feature)
+  {
+    if (feature.isContactFeature())
+    {
+      addContactFeature(feature);
+    }
+    else
+    {
+      boolean added = addNonNestedFeature(feature);
+      if (!added)
+      {
+        /*
+         * detected a nested feature - put it in the NCList structure
+         */
+        addNestedFeature(feature);
+      }
+    }
+  }
+
+  /**
+   * Adds one feature to the NCList that can manage nested features (creating
+   * the NCList if necessary)
+   */
+  protected synchronized void addNestedFeature(SequenceFeature feature)
+  {
+    if (nestedFeatures == null)
+    {
+      nestedFeatures = new NCList<SequenceFeature>(feature);
+    }
+    else
+    {
+      nestedFeatures.add(feature);
+    }
+  }
+
+  /**
+   * Add a feature to the list of non-nested features, maintaining the ordering
+   * of the list. A check is made for whether the feature is nested in (properly
+   * contained by) an existing feature. If there is no nesting, the feature is
+   * added to the list and the method returns true. If nesting is found, the
+   * feature is not added and the method returns false.
+   * <p>
+   * Contact features are added at the position of their first contact point
+   * 
+   * @param feature
+   * @return
+   */
+  protected boolean addNonNestedFeature(SequenceFeature feature)
+  {
+    synchronized (nonNestedFeatures)
+    {
+      int insertPosition = binarySearchForAdd(nonNestedFeatures, feature);
+
+      /*
+       * fail if we detect feature enclosure - of the new feature by
+       * the one preceding it, or of the next feature by the new one
+       */
+      if (insertPosition > 0)
+      {
+        if (encloses(nonNestedFeatures.get(insertPosition - 1), feature))
+        {
+          return false;
+        }
+      }
+      if (insertPosition < nonNestedFeatures.size())
+      {
+        if (encloses(feature, nonNestedFeatures.get(insertPosition)))
+        {
+          return false;
+        }
+      }
+
+      /*
+       * checks passed - add or append the feature
+       */
+      if (insertPosition == nonNestedFeatures.size())
+      {
+        nonNestedFeatures.add(feature);
+      }
+      else
+      {
+        nonNestedFeatures.add(insertPosition, feature);
+      }
+      return true;
+    }
+  }
+
+  /**
+   * Answers true if range1 properly encloses range2, else false
+   * 
+   * @param range1
+   * @param range2
+   * @return
+   */
+  protected boolean encloses(ContiguousI range1, ContiguousI range2)
+  {
+    int begin1 = range1.getBegin();
+    int begin2 = range2.getBegin();
+    int end1 = range1.getEnd();
+    int end2 = range2.getEnd();
+    if (begin1 == begin2 && end1 > end2)
+    {
+      return true;
+    }
+    if (begin1 < begin2 && end1 >= end2)
+    {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Answers the index of the first element in the given list which follows or
+   * matches the given feature in the sort order. If no such element, answers
+   * the length of the list.
+   * 
+   * @param list
+   * @param feature
+   * 
+   * @return
+   */
+  protected int binarySearchForAdd(List<SequenceFeature> list, SequenceFeature feature)
+  {
+    // TODO binary search!
+    int i = 0;
+    while (i < list.size())
+    {
+      if (startOrdering.compare(nonNestedFeatures.get(i), feature) >= 0)
+      {
+        break;
+      }
+      i++;
+    }
+    return i;
+  }
+
+  /**
+   * Add a contact feature to the lists that hold them ordered by start (first
+   * contact) and by end (second contact) position, ensuring the lists remain
+   * ordered
+   * 
+   * @param feature
+   */
+  protected synchronized void addContactFeature(SequenceFeature feature)
+  {
+    // TODO binary search for insertion points!
+    if (contactFeatureStarts == null)
+    {
+      contactFeatureStarts = new ArrayList<SequenceFeature>();
+    }
+    if (contactFeatureEnds == null)
+    {
+      contactFeatureEnds = new ArrayList<SequenceFeature>();
+    }
+    contactFeatureStarts.add(feature);
+    Collections.sort(contactFeatureStarts, startOrdering);
+    contactFeatureEnds.add(feature);
+    Collections.sort(contactFeatureEnds, endOrdering);
+  }
+
+  /**
+   * Returns a (possibly empty) list of entries whose range overlaps the given
+   * range. The returned list is not ordered. Contact features are included if
+   * either of the contact points lies within the range.
+   * 
+   * @param start
+   *          start position of overlap range (inclusive)
+   * @param end
+   *          end position of overlap range (inclusive)
+   * @return
+   */
+  public List<SequenceFeature> findOverlappingFeatures(long start, long end)
+  {
+    List<SequenceFeature> result = new ArrayList<SequenceFeature>();
+
+    findNonNestedFeatures(start, end, result);
+
+    findContactFeatures(start, end, result);
+
+    if (nestedFeatures != null)
+    {
+      result.addAll(nestedFeatures.findOverlaps(start, end));
+    }
+
+    return result;
+  }
+
+  /**
+   * Adds contact features to the result list where either the second or the
+   * first contact position lies within the target range.
+   * 
+   * @param from
+   * @param to
+   * @param result
+   */
+  protected void findContactFeatures(long from, long to,
+          List<SequenceFeature> result)
+  {
+    if (contactFeatureStarts != null)
+    {
+      findContactStartFeatures(from, to, result);
+    }
+    if (contactFeatureEnds != null)
+    {
+      findContactEndFeatures(from, to, result);
+    }
+  }
+
+  /**
+   * @param from
+   * @param to
+   * @param result
+   */
+  protected void findContactEndFeatures(long from, long to,
+          List<SequenceFeature> result)
+  {
+    // TODO binary search for startPosition
+    for (int startPosition = 0; startPosition < contactFeatureEnds.size(); startPosition++)
+    {
+      SequenceFeature sf = contactFeatureEnds.get(startPosition);
+      if (!sf.isContactFeature())
+      {
+        System.err.println("Error! non-contact feature type "
+                + sf.getType() + " in contact features list");
+        continue;
+      }
+      int begin = sf.getBegin();
+      if (begin >= from && begin <= to)
+      {
+        /*
+         * this feature's first contact position lies in the search range
+         * so we don't include it in results a second time
+         */
+        continue;
+      }
+      int end = sf.getEnd();
+      if (end >= from && end <= to)
+      {
+        result.add(sf);
+      }
+    }
+  }
+
+  /**
+   * Returns the index of the first contact feature found whose end (second
+   * contact position) is not before the given start position. If no such
+   * feature is found, returns the length of the contact features list.
+   * 
+   * @param start
+   * @return
+   */
+  protected int contactsBinarySearch(long start)
+  {
+    // TODO binary search!!
+    int i = 0;
+    while (i < contactFeatureEnds.size())
+    {
+      if (contactFeatureEnds.get(i).getEnd() >= start)
+      {
+        break;
+      }
+      i++;
+    }
+
+    return i;
+  }
+
+  /**
+   * Adds features to the result list that are at a single position which lies
+   * within the target range. Non-positional features (start=end=0) and contact
+   * features are excluded.
+   * 
+   * @param from
+   * @param to
+   * @param result
+   */
+  protected void findNonNestedFeatures(long from, long to,
+          List<SequenceFeature> result)
+  {
+    int startIndex = binarySearch(nonNestedFeatures, from);
+    findNonNestedFeatures(startIndex, from, to, result);
+  }
+
+  /**
+   * Scans the list of non-nested features, starting from startIndex, to find
+   * those that overlap the from-to range, and adds them to the result list.
+   * Returns the index of the first feature whose start position is after the
+   * target range (or the length of the whole list if none such feature exists).
+   * 
+   * @param startIndex
+   * @param from
+   * @param to
+   * @param result
+   * @return
+   */
+  protected int findNonNestedFeatures(final int startIndex, long from,
+          long to,
+          List<SequenceFeature> result)
+  {
+    int i = startIndex;
+    while (i < nonNestedFeatures.size())
+    {
+      SequenceFeature sf = nonNestedFeatures.get(i);
+      if (sf.getBegin() > to)
+      {
+        break;
+      }
+      int start = sf.getBegin();
+      int end = sf.getEnd();
+      if (sf.isContactFeature())
+      {
+        end = start;
+      }
+      if (start <= to && end >= from)
+      {
+        result.add(sf);
+      }
+      i++;
+    }
+    return i;
+  }
+
+  /**
+   * Performs a binary search of the (sorted) list to find the index of the
+   * first entry whose end position is not less than the target position (i.e.
+   * skip all features that properly precede the given position)
+   * 
+   * @param features
+   * @param target
+   * @return
+   */
+  protected int binarySearch(List<SequenceFeature> features, long target)
+  {
+    int width = features.size() / 2;
+    int lastpos = width;
+    while (width > 0)
+    {
+      int end = features.get(lastpos).getEnd();
+      width = width / 2;
+      if (end > target)
+      {
+        lastpos -= width;
+      }
+      else
+      {
+        lastpos += width;
+      }
+    }
+    // todo correct binary search
+    return lastpos > 1 ? lastpos - 2 : 0;
+    // return lastpos;
+  }
+
+  /**
+   * Adds contact features whose start position lies in the from-to range to the
+   * result list
+   * 
+   * @param from
+   * @param to
+   * @param result
+   */
+  protected void findContactStartFeatures(long from, long to,
+          List<SequenceFeature> result)
+  {
+    // TODO binary search for startPosition
+    for (int startPosition = 0; startPosition < contactFeatureStarts.size(); startPosition++)
+    {
+      SequenceFeature sf = contactFeatureStarts.get(startPosition);
+      if (!sf.isContactFeature())
+      {
+        System.err.println("Error! non-contact feature type "
+                + sf.getType() + " in contact features list");
+        continue;
+      }
+      int begin = sf.getBegin();
+      if (begin >= from && begin <= to)
+      {
+        result.add(sf);
+      }
+    }
+  }
+}