package jalview.datamodel.features;
+import jalview.bin.Cache;
+
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
+import java.util.Set;
+import java.util.StringTokenizer;
import java.util.TreeMap;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
/**
* A singleton class to hold the set of attributes known for each feature type
*/
public class FeatureAttributes
{
+ /**
+ * The kind of values recorded for an attribute, as assigned by
+ * AttributeData.addInstance: Number while every value parsed so far is
+ * numeric, Character when the first (non-null) value recorded is not
+ * numeric, and Mixed when a non-numeric value is recorded after numeric ones
+ */
+ public enum Datatype
+ {
+ Character, Number, Mixed
+ }
+
+ /*
+ * property key for lookup of a comma-separated list of regex patterns
+ * to match those attribute names for which distinct values should be cached
+ */
+ private static final String CACHED_ATTS_KEY = "CACHED_ATTRIBUTES";
+
+ /*
+ * default value if property is not specified
+ * (selected VCF/VEP terms which have 'categorical' value ranges)
+ */
+ private static final String CACHED_ATTS_DEFAULT = "AS_FilterStatus,clinical_significance,consequence_type,"
+ + "CSQ:Consequence,CSQ:CLIN_SIG,CSQ:DOMAIN,CSQ:IMPACT";
+
+ /*
+ * delimiters of terms in attribute values
+ */
+ private static final String TERM_DELIMITERS = ",&";
+
+ /*
+ * defensive limit to number of attribute values cached per attribute
+ */
+ private static final int MAX_ATT_VALS = 30;
+
private static FeatureAttributes instance = new FeatureAttributes();
/*
private Map<String, Map<String[], AttributeData>> attributes;
/*
+ * attribute names that have been seen and
+ * match the condition for caching distinct values
+ */
+ private Set<String> cachedAttributes;
+
+ /*
+ * attribute names that have been seen and do not
+ * match the condition for caching distinct values
+ */
+ private Set<String> uncachedAttributes;
+
+ private List<Pattern> cacheableNamePatterns;
+
+ /*
* a case-insensitive comparator so that attributes are ordered e.g.
* AC
* af
List<String> description;
/*
- * minimum value (of any numeric values recorded)
+ * minimum value (if only numeric values recorded)
*/
float min = 0f;
/*
- * maximum value (of any numeric values recorded)
+ * maximum value (if only numeric values recorded)
*/
float max = 0f;
/*
- * flag is set true if any numeric value is detected for this attribute
+ * flag is set true if only numeric values are detected for this attribute
*/
boolean hasValue = false;
+ Datatype type;
+
+ /*
+ * (for selected attributes), the set of distinct terms found in values
+ */
+ Set<String> terms;
+
/**
- * Note one instance of this attribute, recording unique, non-null names,
- * and the min/max of any numerical values
+ * Note one instance of this attribute, recording unique, non-null
+ * descriptions, and the min/max of numerical values (the range is
+ * discarded once a non-numeric value is seen).
+ * <p>
+ * Distinct value terms may also be recorded, if the attribute name is one
+ * for which this is configured
*
+ * @param attName the attribute name; converted to its display name to decide whether values are cached
* @param desc
* @param value
*/
- void addInstance(String desc, String value)
+ void addInstance(String[] attName, String desc, String value)
{
addDescription(desc);
if (value != null)
{
- try
+ value = value.trim();
+
+ String name = FeatureMatcher.toAttributeDisplayName(attName);
+ recordValue(name, value);
+
+ /*
+ * Parse numeric value unless we have previously
+ * seen text data for this attribute type
+ * (once type is Character or Mixed it is not changed again)
+ */
+ if (type == null || type == Datatype.Number)
+ {
+ try
+ {
+ float f = Float.valueOf(value);
+ min = hasValue ? Float.min(min, f) : f;
+ max = hasValue ? Float.max(max, f) : f;
+ hasValue = true;
+ type = (type == null || type == Datatype.Number)
+ ? Datatype.Number
+ : Datatype.Mixed;
+ } catch (NumberFormatException e)
+ {
+ /*
+ * non-numeric data: treat attribute as Character (or Mixed
+ * if numeric values were seen before), and reset the range
+ */
+ type = (type == null || type == Datatype.Character)
+ ? Datatype.Character
+ : Datatype.Mixed;
+ min = 0f;
+ max = 0f;
+ hasValue = false;
+ }
+ }
+ }
+ }
+
+ /**
+ * If attribute name is configured to cache distinct values, then parse out
+ * and store these. At most MAX_ATT_VALS distinct terms are held per
+ * attribute; further terms are silently ignored.
+ *
+ * @param attName the attribute (display) name, tested against the cacheable name patterns the first time it is seen
+ * @param value the attribute value, which may hold several terms separated by any of TERM_DELIMITERS
+ */
+ private void recordValue(String attName, String value)
+ {
+ /*
+ * quit if we've seen this attribute name before,
+ * and determined we are not caching its values
+ */
+ if (uncachedAttributes.contains(attName))
+ {
+ return;
+ }
+
+ /*
+ * if first time seen, check attribute name filters to
+ * see if we want to cache its value
+ */
+ if (!cachedAttributes.contains(attName))
+ {
+ if (!matches(attName, cacheableNamePatterns))
{
- float f = Float.valueOf(value);
- min = Float.min(min, f);
- max = Float.max(max, f);
- hasValue = true;
- } catch (NumberFormatException e)
+ uncachedAttributes.add(attName);
+ return;
+ }
+ else
{
- // ok, wasn't a number, ignore for min-max purposes
+ cachedAttributes.add(attName);
}
}
+
+ /*
+ * we want to cache distinct terms for this attribute;
+ * parse them out using comma or & delimiters
+ */
+ if (terms == null)
+ {
+ terms = new HashSet<>();
+ }
+
+ /*
+ * the limit applies to distinct terms held, so test the set size
+ * (repeated terms must not use up the allowance)
+ */
+ StringTokenizer st = new StringTokenizer(value, TERM_DELIMITERS);
+ while (st.hasMoreTokens() && terms.size() < MAX_ATT_VALS)
+ {
+ terms.add(st.nextToken().trim());
+ }
}
/**
- * Answers the description of the attribute, if recorded and unique, or null if either no, or more than description is recorded
+ * Answers true if any of the patterns matches the value, else false
+ *
+ * @param value the string to test; a pattern has to match the whole string
+ * @param filters the compiled patterns to try, in list order
+ * @return true as soon as one pattern matches, false if none does (or the list is empty)
+ */
+ private boolean matches(String value, List<Pattern> filters)
+ {
+ for (Pattern p : filters)
+ {
+ if (p.matcher(value).matches())
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Answers the description of the attribute, if recorded and unique, or null
+ * if either no description, or more than one description, is recorded
+ *
* @return
*/
public String getDescription()
return null;
}
+ /**
+ * Answers the datatype recorded for this attribute, or null if no (non-null)
+ * value has been noted yet
+ *
+ * @return
+ */
+ public Datatype getType()
+ {
+ return type;
+ }
+
/**
* Adds the given description to the list of known descriptions (without
* duplication)
}
}
}
+
+ /**
+ * Answers the distinct terms recorded for the attribute, or an empty set if
+ * it is not configured to cache values (or no value has been cached yet)
+ *
+ * @return the set of terms held for this attribute (the internal set, not a copy), or an immutable empty set if there is none
+ */
+ public Set<String> getDistinctTerms()
+ {
+ return terms == null ? Collections.<String> emptySet() : terms;
+ }
}
/**
return instance;
}
+ /**
+ * Private constructor to enforce singleton pattern. Creates the (empty)
+ * attribute lookups, and compiles the attribute name patterns for which
+ * distinct values are cached, from the CACHED_ATTRIBUTES preference or, if
+ * that is not set, its built-in default.
+ */
private FeatureAttributes()
{
attributes = new HashMap<>();
+ cachedAttributes = new HashSet<>();
+ uncachedAttributes = new HashSet<>();
+ cacheableNamePatterns = getFieldMatchers(CACHED_ATTS_KEY,
+ CACHED_ATTS_DEFAULT);
+ }
+
+ /**
+ * Reads the Preference value for the given key, with default specified if no
+ * preference set. The value is interpreted as a comma-separated list of
+ * regular expressions, and converted into a list of compiled patterns ready
+ * for matching. Patterns are set to non-case-sensitive matching.
+ * <p>
+ * Whitespace around each expression is ignored, empty entries are skipped,
+ * and an expression that fails to compile is reported on stderr and left out.
+ * <p>
+ * This supports user-defined filters for attributes of interest to capture
+ * distinct values for as instances are added.
+ *
+ * @param key the preference key to look up
+ * @param def the value to use if the preference is not set (or cannot be read)
+ * @return the compiled patterns, in the order listed; empty (never null) if there are none
+ */
+ public static List<Pattern> getFieldMatchers(String key, String def)
+ {
+ String pref = def;
+ try
+ {
+ // temporary for applet: handle class loading errors...
+ pref = Cache.getDefault(key, def);
+ } catch (Throwable t)
+ {
+ // deliberately ignored: fall back to the default value supplied
+ }
+ List<Pattern> patterns = new ArrayList<>();
+ if (pref == null)
+ {
+ return patterns;
+ }
+ for (String token : pref.split(","))
+ {
+ /*
+ * tolerate spaces after commas, and stray or trailing commas
+ */
+ String regex = token.trim();
+ if (regex.isEmpty())
+ {
+ continue;
+ }
+ try
+ {
+ patterns.add(Pattern.compile(regex, Pattern.CASE_INSENSITIVE));
+ } catch (PatternSyntaxException e)
+ {
+ System.err.println("Invalid pattern ignored: " + token);
+ }
+ }
+ return patterns;
}
/**
}
/**
+ * Answers the set of distinct terms recorded for the given feature type and
+ * attribute. Answers an empty set if values are not cached for this
+ * attribute.
+ *
+ * @param featureType
+ * @param attName
+ * @return
+ */
+ public Set<String> getDistinctTerms(String featureType, String... attName)
+ {
+ if (!attributes.containsKey(featureType)
+ || !attributes.get(featureType).containsKey(attName))
+ {
+ return Collections.<String> emptySet();
+ }
+
+ return attributes.get(featureType).get(attName).getDistinctTerms();
+ }
+
+ /**
* Answers true if at least one attribute is known for the given feature type,
* else false
*
attData = new AttributeData();
atts.put(attName, attData);
}
- attData.addInstance(description, valueAsString);
+ attData.addInstance(attName, description, valueAsString);
}
/**
/**
* Answers the [min, max] value range of the given attribute for the given
- * feature type, if known, else null. Attributes which only have text values
- * would normally return null, however text values which happen to be numeric
- * could result in a 'min-max' range.
+ * feature type, if known, else null. Attributes with a mixture of text and
+ * numeric values are considered text (do not return a min-max range).
*
* @param featureType
* @param attName
}
attData.addDescription(description);
}
+
+ /**
+ * Answers the datatype of the attribute, which is one of Character, Number or
+ * Mixed (or null if not known), as discovered from values recorded.
+ *
+ * @param featureType the feature type the attribute belongs to
+ * @param attName the attribute name, as one or more name parts
+ * @return the recorded datatype, or null if the feature type or attribute is not known, or no datatype has been recorded for it
+ */
+ public Datatype getDatatype(String featureType, String... attName)
+ {
+ Map<String[], AttributeData> atts = attributes.get(featureType);
+ if (atts != null)
+ {
+ AttributeData attData = atts.get(attName);
+ if (attData != null)
+ {
+ return attData.getType();
+ }
+ }
+ return null;
+ }
}