package jalview.datamodel.features; import jalview.datamodel.ContiguousI; import jalview.datamodel.SequenceFeature; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; /** * A data store for a set of sequence features that supports efficient lookup of * features overlapping a given range. Intended for (but not limited to) storage * of features for one sequence and feature type. * * @author gmcarstairs * */ public class FeatureStore { /** * a class providing criteria for performing a binary search of a list */ abstract static class SearchCriterion { /** * Answers true if the entry passes the search criterion test * * @param entry * @return */ abstract boolean compare(SequenceFeature entry); /** * serves a search condition for finding the first feature whose start * position follows a given target location * * @param target * @return */ static SearchCriterion byStart(final long target) { return new SearchCriterion() { @Override boolean compare(SequenceFeature entry) { return entry.getBegin() >= target; } }; } /** * serves a search condition for finding the first feature whose end * position is at or follows a given target location * * @param target * @return */ static SearchCriterion byEnd(final long target) { return new SearchCriterion() { @Override boolean compare(SequenceFeature entry) { return entry.getEnd() >= target; } }; } /** * serves a search condition for finding the first feature which follows the * given range as determined by a supplied comparator * * @param target * @return */ static SearchCriterion byFeature(final ContiguousI to, final Comparator rc) { return new SearchCriterion() { @Override boolean compare(SequenceFeature entry) { return rc.compare(entry, to) >= 0; } }; } } /* * Non-positional features have no (zero) start/end position. * Kept as a separate list in case this criterion changes in future. */ List nonPositionalFeatures; /* * An ordered list of features, with the promise that no feature in the list * properly contains any other. This constraint allows bounded linear search * of the list for features overlapping a region. * Contact features are not included in this list. */ List nonNestedFeatures; /* * contact features ordered by first contact position */ List contactFeatureStarts; /* * contact features ordered by second contact position */ List contactFeatureEnds; /* * Nested Containment List is used to hold any features that are nested * within (properly contained by) any other feature. This is a recursive tree * which supports depth-first scan for features overlapping a range. * It is used here as a 'catch-all' fallback for features that cannot be put * into a simple ordered list without invalidating the search methods. */ NCList nestedFeatures; /* * Feature groups represented in stored positional features * (possibly including null) */ Set positionalFeatureGroups; /* * Feature groups represented in stored non-positional features * (possibly including null) */ Set nonPositionalFeatureGroups; /* * the total length of all positional features; contact features count 1 to * the total and 1 to size(), consistent with an average 'feature length' of 1 */ int totalExtent; float positionalMinScore; float positionalMaxScore; float nonPositionalMinScore; float nonPositionalMaxScore; /** * Constructor */ public FeatureStore() { nonNestedFeatures = new ArrayList(); positionalFeatureGroups = new HashSet(); nonPositionalFeatureGroups = new HashSet(); positionalMinScore = Float.NaN; positionalMaxScore = Float.NaN; nonPositionalMinScore = Float.NaN; nonPositionalMaxScore = Float.NaN; // we only construct nonPositionalFeatures, contactFeatures // or the NCList if we need to } /** * Adds one sequence feature to the store, and returns true, unless the * feature is already contained in the store, in which case this method * returns false. Containment is determined by SequenceFeature.equals() * comparison. * * @param feature */ public boolean addFeature(SequenceFeature feature) { if (contains(feature)) { return false; } /* * keep a record of feature groups */ if (!feature.isNonPositional()) { positionalFeatureGroups.add(feature.getFeatureGroup()); } boolean added = false; if (feature.isContactFeature()) { added = addContactFeature(feature); } else if (feature.isNonPositional()) { added = addNonPositionalFeature(feature); } else { added = addNonNestedFeature(feature); if (!added) { /* * detected a nested feature - put it in the NCList structure */ added = addNestedFeature(feature); } } if (added) { /* * record the total extent of positional features, to make * getTotalFeatureLength possible; we count the length of a * contact feature as 1 */ totalExtent += getFeatureLength(feature); /* * record the minimum and maximum score for positional * and non-positional features */ float score = feature.getScore(); if (!Float.isNaN(score)) { if (feature.isNonPositional()) { nonPositionalMinScore = min(nonPositionalMinScore, score); nonPositionalMaxScore = max(nonPositionalMaxScore, score); } else { positionalMinScore = min(positionalMinScore, score); positionalMaxScore = max(positionalMaxScore, score); } } } return added; } /** * Answers true if this store contains the given feature (testing by * SequenceFeature.equals), else false * * @param feature * @return */ public boolean contains(SequenceFeature feature) { if (feature.isNonPositional()) { return nonPositionalFeatures == null ? false : nonPositionalFeatures .contains(feature); } if (feature.isContactFeature()) { return contactFeatureStarts == null ? false : listContains( contactFeatureStarts, feature); } if (listContains(nonNestedFeatures, feature)) { return true; } return nestedFeatures == null ? false : nestedFeatures .contains(feature); } /** * Answers the 'length' of the feature, counting 0 for non-positional features * and 1 for contact features * * @param feature * @return */ protected static int getFeatureLength(SequenceFeature feature) { if (feature.isNonPositional()) { return 0; } if (feature.isContactFeature()) { return 1; } return 1 + feature.getEnd() - feature.getBegin(); } /** * Adds the feature to the list of non-positional features (with lazy * instantiation of the list if it is null), and returns true. The feature * group is added to the set of distinct feature groups for non-positional * features. This method allows duplicate features, so test before calling to * prevent this. * * @param feature */ protected boolean addNonPositionalFeature(SequenceFeature feature) { if (nonPositionalFeatures == null) { nonPositionalFeatures = new ArrayList(); } nonPositionalFeatures.add(feature); nonPositionalFeatureGroups.add(feature.getFeatureGroup()); return true; } /** * Adds one feature to the NCList that can manage nested features (creating * the NCList if necessary), and returns true. If the feature is already * stored in the NCList (by equality test), then it is not added, and this * method returns false. */ protected synchronized boolean addNestedFeature(SequenceFeature feature) { if (nestedFeatures == null) { nestedFeatures = new NCList<>(feature); return true; } return nestedFeatures.add(feature, false); } /** * Add a feature to the list of non-nested features, maintaining the ordering * of the list. A check is made for whether the feature is nested in (properly * contained by) an existing feature. If there is no nesting, the feature is * added to the list and the method returns true. If nesting is found, the * feature is not added and the method returns false. * * @param feature * @return */ protected boolean addNonNestedFeature(SequenceFeature feature) { synchronized (nonNestedFeatures) { /* * find the first stored feature which doesn't precede the new one */ int insertPosition = binarySearch(nonNestedFeatures, SearchCriterion.byFeature(feature, RangeComparator.BY_START_POSITION)); /* * fail if we detect feature enclosure - of the new feature by * the one preceding it, or of the next feature by the new one */ if (insertPosition > 0) { if (encloses(nonNestedFeatures.get(insertPosition - 1), feature)) { return false; } } if (insertPosition < nonNestedFeatures.size()) { if (encloses(feature, nonNestedFeatures.get(insertPosition))) { return false; } } /* * checks passed - add the feature */ nonNestedFeatures.add(insertPosition, feature); return true; } } /** * Answers true if range1 properly encloses range2, else false * * @param range1 * @param range2 * @return */ protected boolean encloses(ContiguousI range1, ContiguousI range2) { int begin1 = range1.getBegin(); int begin2 = range2.getBegin(); int end1 = range1.getEnd(); int end2 = range2.getEnd(); if (begin1 == begin2 && end1 > end2) { return true; } if (begin1 < begin2 && end1 >= end2) { return true; } return false; } /** * Add a contact feature to the lists that hold them ordered by start (first * contact) and by end (second contact) position, ensuring the lists remain * ordered, and returns true. This method allows duplicate features to be * added, so test before calling to avoid this. * * @param feature * @return */ protected synchronized boolean addContactFeature(SequenceFeature feature) { if (contactFeatureStarts == null) { contactFeatureStarts = new ArrayList(); } if (contactFeatureEnds == null) { contactFeatureEnds = new ArrayList(); } /* * binary search the sorted list to find the insertion point */ int insertPosition = binarySearch(contactFeatureStarts, SearchCriterion.byFeature(feature, RangeComparator.BY_START_POSITION)); contactFeatureStarts.add(insertPosition, feature); // and resort to mak siccar...just in case insertion point not quite right Collections.sort(contactFeatureStarts, RangeComparator.BY_START_POSITION); insertPosition = binarySearch(contactFeatureStarts, SearchCriterion.byFeature(feature, RangeComparator.BY_END_POSITION)); contactFeatureEnds.add(feature); Collections.sort(contactFeatureEnds, RangeComparator.BY_END_POSITION); return true; } /** * Answers true if the list contains the feature, else false. This method is * optimised for the condition that the list is sorted on feature start * position ascending, and will give unreliable results if this does not hold. * * @param features * @param feature * @return */ protected static boolean listContains(List features, SequenceFeature feature) { if (features == null || feature == null) { return false; } /* * locate the first entry in the list which does not precede the feature */ int pos = binarySearch(features, SearchCriterion.byFeature(feature, RangeComparator.BY_START_POSITION)); int len = features.size(); while (pos < len) { SequenceFeature sf = features.get(pos); if (sf.getBegin() > feature.getBegin()) { return false; // no match found } if (sf.equals(feature)) { return true; } pos++; } return false; } /** * Returns a (possibly empty) list of features whose extent overlaps the given * range. The returned list is not ordered. Contact features are included if * either of the contact points lies within the range. * * @param start * start position of overlap range (inclusive) * @param end * end position of overlap range (inclusive) * @return */ public List findOverlappingFeatures(long start, long end) { List result = new ArrayList<>(); findNonNestedFeatures(start, end, result); findContactFeatures(start, end, result); if (nestedFeatures != null) { result.addAll(nestedFeatures.findOverlaps(start, end)); } return result; } /** * Adds contact features to the result list where either the second or the * first contact position lies within the target range * * @param from * @param to * @param result */ protected void findContactFeatures(long from, long to, List result) { if (contactFeatureStarts != null) { findContactStartFeatures(from, to, result); } if (contactFeatureEnds != null) { findContactEndFeatures(from, to, result); } } /** * Adds to the result list any contact features whose end (second contact * point), but not start (first contact point), lies in the query from-to * range * * @param from * @param to * @param result */ protected void findContactEndFeatures(long from, long to, List result) { /* * find the first contact feature (if any) that does not lie * entirely before the target range */ int startPosition = binarySearch(contactFeatureEnds, SearchCriterion.byEnd(from)); for (; startPosition < contactFeatureEnds.size(); startPosition++) { SequenceFeature sf = contactFeatureEnds.get(startPosition); if (!sf.isContactFeature()) { System.err.println("Error! non-contact feature type " + sf.getType() + " in contact features list"); continue; } int begin = sf.getBegin(); if (begin >= from && begin <= to) { /* * this feature's first contact position lies in the search range * so we don't include it in results a second time */ continue; } int end = sf.getEnd(); if (end >= from && end <= to) { result.add(sf); } if (end > to) { break; } } } /** * Adds non-nested features to the result list that lie within the target * range. Non-positional features (start=end=0), contact features and nested * features are excluded. * * @param from * @param to * @param result */ protected void findNonNestedFeatures(long from, long to, List result) { /* * find the first feature whose end position is * after the target range start */ int startIndex = binarySearch(nonNestedFeatures, SearchCriterion.byEnd(from)); final int startIndex1 = startIndex; int i = startIndex1; while (i < nonNestedFeatures.size()) { SequenceFeature sf = nonNestedFeatures.get(i); if (sf.getBegin() > to) { break; } if (sf.getBegin() <= to && sf.getEnd() >= from) { result.add(sf); } i++; } } /** * Adds contact features whose start position lies in the from-to range to the * result list * * @param from * @param to * @param result */ protected void findContactStartFeatures(long from, long to, List result) { int startPosition = binarySearch(contactFeatureStarts, SearchCriterion.byStart(from)); for (; startPosition < contactFeatureStarts.size(); startPosition++) { SequenceFeature sf = contactFeatureStarts.get(startPosition); if (!sf.isContactFeature()) { System.err.println("Error! non-contact feature type " + sf.getType() + " in contact features list"); continue; } int begin = sf.getBegin(); if (begin >= from && begin <= to) { result.add(sf); } } } /** * Answers a list of all positional features stored, in no guaranteed order * * @return */ public List getPositionalFeatures() { /* * add non-nested features (may be all features for many cases) */ List result = new ArrayList<>(); result.addAll(nonNestedFeatures); /* * add any contact features - from the list by start position */ if (contactFeatureStarts != null) { result.addAll(contactFeatureStarts); } /* * add any nested features */ if (nestedFeatures != null) { result.addAll(nestedFeatures.getEntries()); } return result; } /** * Answers a list of all contact features. If there are none, returns an * immutable empty list. * * @return */ public List getContactFeatures() { if (contactFeatureStarts == null) { return Collections.emptyList(); } return new ArrayList<>(contactFeatureStarts); } /** * Answers a list of all non-positional features. If there are none, returns * an immutable empty list. * * @return */ public List getNonPositionalFeatures() { if (nonPositionalFeatures == null) { return Collections.emptyList(); } return new ArrayList<>(nonPositionalFeatures); } /** * Deletes the given feature from the store, returning true if it was found * (and deleted), else false. This method makes no assumption that the feature * is in the 'expected' place in the store, in case it has been modified since * it was added. * * @param sf */ public synchronized boolean delete(SequenceFeature sf) { /* * try the non-nested positional features first */ boolean removed = nonNestedFeatures.remove(sf); /* * if not found, try contact positions (and if found, delete * from both lists of contact positions) */ if (!removed && contactFeatureStarts != null) { removed = contactFeatureStarts.remove(sf); if (removed) { contactFeatureEnds.remove(sf); } } boolean removedNonPositional = false; /* * if not found, try non-positional features */ if (!removed && nonPositionalFeatures != null) { removedNonPositional = nonPositionalFeatures.remove(sf); removed = removedNonPositional; } /* * if not found, try nested features */ if (!removed && nestedFeatures != null) { removed = nestedFeatures.delete(sf); } if (removed) { rescanAfterDelete(); } return removed; } /** * Rescan all features to recompute any cached values after an entry has been * deleted. This is expected to be an infrequent event, so performance here is * not critical. */ protected synchronized void rescanAfterDelete() { positionalFeatureGroups.clear(); nonPositionalFeatureGroups.clear(); totalExtent = 0; positionalMinScore = Float.NaN; positionalMaxScore = Float.NaN; nonPositionalMinScore = Float.NaN; nonPositionalMaxScore = Float.NaN; /* * scan non-positional features for groups and scores */ for (SequenceFeature sf : getNonPositionalFeatures()) { nonPositionalFeatureGroups.add(sf.getFeatureGroup()); float score = sf.getScore(); nonPositionalMinScore = min(nonPositionalMinScore, score); nonPositionalMaxScore = max(nonPositionalMaxScore, score); } /* * scan positional features for groups, scores and extents */ for (SequenceFeature sf : getPositionalFeatures()) { positionalFeatureGroups.add(sf.getFeatureGroup()); float score = sf.getScore(); positionalMinScore = min(positionalMinScore, score); positionalMaxScore = max(positionalMaxScore, score); totalExtent += getFeatureLength(sf); } } /** * A helper method to return the minimum of two floats, where a non-NaN value * is treated as 'less than' a NaN value (unlike Math.min which does the * opposite) * * @param f1 * @param f2 */ protected static float min(float f1, float f2) { if (Float.isNaN(f1)) { return Float.isNaN(f2) ? f1 : f2; } else { return Float.isNaN(f2) ? f1 : Math.min(f1, f2); } } /** * A helper method to return the maximum of two floats, where a non-NaN value * is treated as 'greater than' a NaN value (unlike Math.max which does the * opposite) * * @param f1 * @param f2 */ protected static float max(float f1, float f2) { if (Float.isNaN(f1)) { return Float.isNaN(f2) ? f1 : f2; } else { return Float.isNaN(f2) ? f1 : Math.max(f1, f2); } } /** * Answers true if this store has no features, else false * * @return */ public boolean isEmpty() { boolean hasFeatures = !nonNestedFeatures.isEmpty() || (contactFeatureStarts != null && !contactFeatureStarts .isEmpty()) || (nonPositionalFeatures != null && !nonPositionalFeatures .isEmpty()) || (nestedFeatures != null && nestedFeatures.size() > 0); return !hasFeatures; } /** * Answers the set of distinct feature groups stored, possibly including null, * as an unmodifiable view of the set. The parameter determines whether the * groups for positional or for non-positional features are returned. * * @param positionalFeatures * @return */ public Set getFeatureGroups(boolean positionalFeatures) { if (positionalFeatures) { return Collections.unmodifiableSet(positionalFeatureGroups); } else { return nonPositionalFeatureGroups == null ? Collections . emptySet() : Collections .unmodifiableSet(nonPositionalFeatureGroups); } } /** * Performs a binary search of the (sorted) list to find the index of the * first entry which returns true for the given comparator function. Returns * the length of the list if there is no such entry. * * @param features * @param sc * @return */ protected static int binarySearch(List features, SearchCriterion sc) { int start = 0; int end = features.size() - 1; int matched = features.size(); while (start <= end) { int mid = (start + end) / 2; SequenceFeature entry = features.get(mid); boolean compare = sc.compare(entry); if (compare) { matched = mid; end = mid - 1; } else { start = mid + 1; } } return matched; } /** * Answers the number of positional (or non-positional) features stored. * Contact features count as 1. * * @param positional * @return */ public int getFeatureCount(boolean positional) { if (!positional) { return nonPositionalFeatures == null ? 0 : nonPositionalFeatures .size(); } int size = nonNestedFeatures.size(); if (contactFeatureStarts != null) { // note a contact feature (start/end) counts as one size += contactFeatureStarts.size(); } if (nestedFeatures != null) { size += nestedFeatures.size(); } return size; } /** * Answers the total length of positional features (or zero if there are * none). Contact features contribute a value of 1 to the total. * * @return */ public int getTotalFeatureLength() { return totalExtent; } /** * Answers the minimum score held for positional or non-positional features. * This may be Float.NaN if there are no features, are none has a non-NaN * score. * * @param positional * @return */ public float getMinimumScore(boolean positional) { return positional ? positionalMinScore : nonPositionalMinScore; } /** * Answers the maximum score held for positional or non-positional features. * This may be Float.NaN if there are no features, are none has a non-NaN * score. * * @param positional * @return */ public float getMaximumScore(boolean positional) { return positional ? positionalMaxScore : nonPositionalMaxScore; } /** * Answers a list of all either positional or non-positional features whose * feature group matches the given group (which may be null) * * @param positional * @param group * @return */ public List getFeaturesForGroup(boolean positional, String group) { List result = new ArrayList<>(); /* * if we know features don't include the target group, no need * to inspect them for matches */ if (positional && !positionalFeatureGroups.contains(group) || !positional && !nonPositionalFeatureGroups.contains(group)) { return result; } List sfs = positional ? getPositionalFeatures() : getNonPositionalFeatures(); for (SequenceFeature sf : sfs) { String featureGroup = sf.getFeatureGroup(); if (group == null && featureGroup == null || group != null && group.equals(featureGroup)) { result.add(sf); } } return result; } /** * Adds the shift value to the start and end of all positional features. * Returns true if at least one feature was updated, else false. * * @param shift * @return */ public synchronized boolean shiftFeatures(int shift) { /* * Because begin and end are final fields (to ensure the data store's * integrity), we have to delete each feature and re-add it as amended. * (Although a simple shift of all values would preserve data integrity!) */ boolean modified = false; for (SequenceFeature sf : getPositionalFeatures()) { modified = true; int newBegin = sf.getBegin() + shift; int newEnd = sf.getEnd() + shift; /* * sanity check: don't shift left of the first residue */ if (newEnd > 0) { newBegin = Math.max(1, newBegin); SequenceFeature sf2 = new SequenceFeature(sf, newBegin, newEnd, sf.getFeatureGroup(), sf.getScore()); addFeature(sf2); } delete(sf); } return modified; } }