X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Futil%2FDBRefUtils.java;h=77a64e8323ee00e4a27864fbdc725d3c78bf091f;hb=4a3def9f59cefe629c9a33d87483283aee085928;hp=203b4e7ecc50bff607be1c469cead46a39e6e33f;hpb=914a9aaf5a2eeb3cdedf136c2e6b4b6e4b5d6174;p=jalview.git diff --git a/src/jalview/util/DBRefUtils.java b/src/jalview/util/DBRefUtils.java index 203b4e7..77a64e8 100755 --- a/src/jalview/util/DBRefUtils.java +++ b/src/jalview/util/DBRefUtils.java @@ -20,21 +20,23 @@ */ package jalview.util; -import jalview.datamodel.DBRefEntry; -import jalview.datamodel.DBRefSource; -import jalview.datamodel.PDBEntry; -import jalview.datamodel.SequenceI; +import java.util.Locale; import java.util.ArrayList; -import java.util.Arrays; +import java.util.BitSet; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; import com.stevesoft.pat.Regex; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.Mapping; +import jalview.datamodel.PDBEntry; +import jalview.datamodel.SequenceI; + /** * Utilities for handling DBRef objects and their collections. */ @@ -45,7 +47,18 @@ public class DBRefUtils */ private static Map canonicalSourceNameLookup = new HashMap<>(); - private static Map dasCoordinateSystemsLookup = new HashMap<>(); + public final static int DB_SOURCE = 1; + + public final static int DB_VERSION = 2; + + public final static int DB_ID = 4; + + public final static int DB_MAP = 8; + + public final static int SEARCH_MODE_NO_MAP_NO_VERSION = DB_SOURCE | DB_ID; + + public final static int SEARCH_MODE_FULL = DB_SOURCE | DB_VERSION | DB_ID + | DB_MAP; static { @@ -65,20 +78,15 @@ public class DBRefUtils canonicalSourceNameLookup.put("ensembl-tr", DBRefSource.ENSEMBL); canonicalSourceNameLookup.put("ensembl-gn", DBRefSource.ENSEMBL); + // TODO keep ? (in phyloviewer branch only) canonicalSourceNameLookup.put("pfam", DBRefSource.PFAM); - // Make sure we have lowercase entries for all canonical string lookups - Set keys = canonicalSourceNameLookup.keySet(); - for (String k : keys) + // guarantee we always have lowercase entries for canonical string lookups + for (String k : canonicalSourceNameLookup.keySet()) { - canonicalSourceNameLookup.put(k.toLowerCase(), + canonicalSourceNameLookup.put(k.toLowerCase(Locale.ROOT), canonicalSourceNameLookup.get(k)); } - - dasCoordinateSystemsLookup.put("pdbresnum", DBRefSource.PDB); - dasCoordinateSystemsLookup.put("uniprot", DBRefSource.UNIPROT); - dasCoordinateSystemsLookup.put("embl", DBRefSource.EMBL); - // dasCoordinateSystemsLookup.put("embl", DBRefSource.EMBLCDS); } /** @@ -92,58 +100,85 @@ public class DBRefUtils * array of sources to select * @return */ - public static DBRefEntry[] selectRefs(DBRefEntry[] dbrefs, + public static List selectRefs(List dbrefs, String[] sources) { if (dbrefs == null || sources == null) { return dbrefs; } - HashSet srcs = new HashSet<>(); + + // BH TODO (what?) + HashSet srcs = new HashSet(); for (String src : sources) { - srcs.add(src.toUpperCase()); + srcs.add(src.toUpperCase(Locale.ROOT)); } - List res = new ArrayList<>(); - for (DBRefEntry dbr : dbrefs) + int nrefs = dbrefs.size(); + List res = new ArrayList(); + for (int ib = 0; ib < nrefs; ib++) { + DBRefEntry dbr = dbrefs.get(ib); String source = getCanonicalName(dbr.getSource()); - if (srcs.contains(source.toUpperCase())) + if (srcs.contains(source.toUpperCase(Locale.ROOT))) { res.add(dbr); } } - if (res.size() > 0) { - DBRefEntry[] reply = new DBRefEntry[res.size()]; - return res.toArray(reply); + // List reply = new DBRefEntry[res.size()]; + return res;// .toArray(reply); } return null; } + private static boolean selectRefsBS(List dbrefs, + int sourceKeys, BitSet bsSelect) + { + if (dbrefs == null || sourceKeys == 0) + { + return false; + } + for (int i = 0, n = dbrefs.size(); i < n; i++) + { + DBRefEntry dbr = dbrefs.get(i); + if ((dbr.getSourceKey() & sourceKeys) != 0) + { + bsSelect.clear(i); + } + } + return !bsSelect.isEmpty(); + } + /** - * isDasCoordinateSystem + * Returns a (possibly empty) list of those references that match the given + * entry, according to the given comparator. * - * @param string - * String - * @param dBRefEntry - * DBRefEntry - * @return boolean true if Source DBRefEntry is compatible with DAS - * CoordinateSystem name + * @param refs + * an array of database references to search + * @param entry + * an entry to compare against + * @param comparator + * @return */ - - public static boolean isDasCoordinateSystem(String string, - DBRefEntry dBRefEntry) + static List searchRefs(DBRefEntry[] refs, DBRefEntry entry, + DbRefComp comparator) { - if (string == null || dBRefEntry == null) + List rfs = new ArrayList<>(); + if (refs == null || entry == null) { - return false; + return rfs; } - String coordsys = dasCoordinateSystemsLookup.get(string.toLowerCase()); - return coordsys == null ? false - : coordsys.equals(dBRefEntry.getSource()); + for (int i = 0; i < refs.length; i++) + { + if (comparator.matches(entry, refs[i])) + { + rfs.add(refs[i]); + } + } + return rfs; } /** @@ -161,7 +196,8 @@ public class DBRefUtils { return null; } - String canonical = canonicalSourceNameLookup.get(source.toLowerCase()); + String canonical = canonicalSourceNameLookup + .get(source.toLowerCase(Locale.ROOT)); return canonical == null ? source : canonical; } @@ -178,13 +214,15 @@ public class DBRefUtils * Set of references to search * @param entry * pattern to match + * @param mode + * SEARCH_MODE_FULL for all; SEARCH_MODE_NO_MAP_NO_VERSION optional * @return */ - public static List searchRefs(DBRefEntry[] ref, - DBRefEntry entry) + public static List searchRefs(List ref, + DBRefEntry entry, int mode) { return searchRefs(ref, entry, - matchDbAndIdAndEitherMapOrEquivalentMapList); + matchDbAndIdAndEitherMapOrEquivalentMapList, mode); } /** @@ -201,9 +239,25 @@ public class DBRefUtils * accession id to match * @return */ - public static List searchRefs(DBRefEntry[] refs, String accId) + public static List searchRefs(List refs, + String accId) { - return searchRefs(refs, new DBRefEntry("", "", accId), matchId); + List rfs = new ArrayList(); + if (refs == null || accId == null) + { + return rfs; + } + for (int i = 0, n = refs.size(); i < n; i++) + { + DBRefEntry e = refs.get(i); + if (accId.equals(e.getAccessionId())) + { + rfs.add(e); + } + } + return rfs; + // return searchRefs(refs, new DBRefEntry("", "", accId), matchId, + // SEARCH_MODE_FULL); } /** @@ -215,21 +269,24 @@ public class DBRefUtils * @param entry * an entry to compare against * @param comparator + * @param mode + * SEARCH_MODE_FULL for all; SEARCH_MODE_NO_MAP_NO_VERSION optional * @return */ - static List searchRefs(DBRefEntry[] refs, DBRefEntry entry, - DbRefComp comparator) + static List searchRefs(List refs, + DBRefEntry entry, DbRefComp comparator, int mode) { List rfs = new ArrayList<>(); if (refs == null || entry == null) { return rfs; } - for (int i = 0; i < refs.length; i++) + for (int i = 0, n = refs.size(); i < n; i++) { - if (comparator.matches(entry, refs[i])) + DBRefEntry e = refs.get(i); + if (comparator.matches(entry, e, SEARCH_MODE_FULL)) { - rfs.add(refs[i]); + rfs.add(e); } } return rfs; @@ -237,30 +294,36 @@ public class DBRefUtils interface DbRefComp { - public boolean matches(DBRefEntry refa, DBRefEntry refb); + default public boolean matches(DBRefEntry refa, DBRefEntry refb) + { + return matches(refa, refb, SEARCH_MODE_FULL); + }; + + public boolean matches(DBRefEntry refa, DBRefEntry refb, int mode); } /** * match on all non-null fields in refa */ - // TODO unused - remove? + // TODO unused - remove? would be broken by equating "" with null public static DbRefComp matchNonNullonA = new DbRefComp() { @Override - public boolean matches(DBRefEntry refa, DBRefEntry refb) + public boolean matches(DBRefEntry refa, DBRefEntry refb, int mode) { - if (refa.getSource() == null + if ((mode & DB_SOURCE) != 0 && (refa.getSource() == null || DBRefUtils.getCanonicalName(refb.getSource()).equals( - DBRefUtils.getCanonicalName(refa.getSource()))) + DBRefUtils.getCanonicalName(refa.getSource())))) { - if (refa.getVersion() == null - || refb.getVersion().equals(refa.getVersion())) + if ((mode & DB_VERSION) != 0 && (refa.getVersion() == null + || refb.getVersion().equals(refa.getVersion()))) { - if (refa.getAccessionId() == null - || refb.getAccessionId().equals(refa.getAccessionId())) + if ((mode & DB_ID) != 0 && (refa.getAccessionId() == null + || refb.getAccessionId().equals(refa.getAccessionId()))) { - if (refa.getMap() == null || (refb.getMap() != null - && refb.getMap().equals(refa.getMap()))) + if ((mode & DB_MAP) != 0 + && (refa.getMap() == null || (refb.getMap() != null + && refb.getMap().equals(refa.getMap())))) { return true; } @@ -279,7 +342,7 @@ public class DBRefUtils public static DbRefComp matchEitherNonNull = new DbRefComp() { @Override - public boolean matches(DBRefEntry refa, DBRefEntry refb) + public boolean matches(DBRefEntry refa, DBRefEntry refb, int mode) { if (nullOrEqualSource(refa.getSource(), refb.getSource()) && nullOrEqual(refa.getVersion(), refb.getVersion()) @@ -290,9 +353,79 @@ public class DBRefUtils } return false; } + }; /** + * Parses a DBRefEntry and adds it to the sequence, also a PDBEntry if the + * database is PDB. + *

+ * Used by file parsers to generate DBRefs from annotation within file (eg + * Stockholm) + * + * @param dbname + * @param version + * @param acn + * @param seq + * where to annotate with reference + * @return parsed version of entry that was added to seq (if any) + */ + public static DBRefEntry parseToDbRef(SequenceI seq, String dbname, + String version, String acn) + { + DBRefEntry ref = null; + if (dbname != null) + { + String locsrc = DBRefUtils.getCanonicalName(dbname); + if (locsrc.equals(DBRefSource.PDB)) + { + /* + * Check for PFAM style stockhom PDB accession id citation e.g. + * "1WRI A; 7-80;" + */ + Regex r = new com.stevesoft.pat.Regex( + "([0-9][0-9A-Za-z]{3})\\s*(.?)\\s*;\\s*([0-9]+)-([0-9]+)"); + if (r.search(acn.trim())) + { + String pdbid = r.stringMatched(1); + String chaincode = r.stringMatched(2); + if (chaincode == null) + { + chaincode = " "; + } + // String mapstart = r.stringMatched(3); + // String mapend = r.stringMatched(4); + if (chaincode.equals(" ")) + { + chaincode = "_"; + } + // construct pdb ref. + ref = new DBRefEntry(locsrc, version, pdbid + chaincode); + PDBEntry pdbr = new PDBEntry(); + pdbr.setId(pdbid); + pdbr.setType(PDBEntry.Type.PDB); + pdbr.setChainCode(chaincode); + seq.addPDBId(pdbr); + } + else + { + System.err.println("Malformed PDB DR line:" + acn); + } + } + else + { + // default: + ref = new DBRefEntry(locsrc, version, acn.trim()); + } + } + if (ref != null) + { + seq.addDBRef(ref); + } + return ref; + } + + /** * accession ID and DB must be identical. Version is ignored. Map is either * not defined or is a match (or is compatible?) */ @@ -300,7 +433,7 @@ public class DBRefUtils public static DbRefComp matchDbAndIdAndEitherMap = new DbRefComp() { @Override - public boolean matches(DBRefEntry refa, DBRefEntry refb) + public boolean matches(DBRefEntry refa, DBRefEntry refb, int mode) { if (refa.getSource() != null && refb.getSource() != null && DBRefUtils.getCanonicalName(refb.getSource()).equals( @@ -332,7 +465,7 @@ public class DBRefUtils public static DbRefComp matchDbAndIdAndComplementaryMapList = new DbRefComp() { @Override - public boolean matches(DBRefEntry refa, DBRefEntry refb) + public boolean matches(DBRefEntry refa, DBRefEntry refb, int mode) { if (refa.getSource() != null && refb.getSource() != null && DBRefUtils.getCanonicalName(refb.getSource()).equals( @@ -370,7 +503,7 @@ public class DBRefUtils public static DbRefComp matchDbAndIdAndEquivalentMapList = new DbRefComp() { @Override - public boolean matches(DBRefEntry refa, DBRefEntry refb) + public boolean matches(DBRefEntry refa, DBRefEntry refb, int mode) { if (refa.getSource() != null && refb.getSource() != null && DBRefUtils.getCanonicalName(refb.getSource()).equals( @@ -411,14 +544,13 @@ public class DBRefUtils public static DbRefComp matchDbAndIdAndEitherMapOrEquivalentMapList = new DbRefComp() { @Override - public boolean matches(DBRefEntry refa, DBRefEntry refb) + public boolean matches(DBRefEntry refa, DBRefEntry refb, int mode) { if (refa.getSource() != null && refb.getSource() != null && DBRefUtils.getCanonicalName(refb.getSource()).equals( DBRefUtils.getCanonicalName(refa.getSource()))) { // We dont care about version - if (refa.getAccessionId() == null || refa.getAccessionId().equals(refb.getAccessionId())) { @@ -443,89 +575,28 @@ public class DBRefUtils }; /** - * accession ID only must be identical. - */ - public static DbRefComp matchId = new DbRefComp() - { - @Override - public boolean matches(DBRefEntry refa, DBRefEntry refb) - { - if (refa.getAccessionId() != null && refb.getAccessionId() != null - && refb.getAccessionId().equals(refa.getAccessionId())) - { - return true; - } - return false; - } - }; - - /** - * Parses a DBRefEntry and adds it to the sequence, also a PDBEntry if the - * database is PDB. - *

- * Used by file parsers to generate DBRefs from annotation within file (eg - * Stockholm) + * Returns the (possibly empty) list of those supplied dbrefs which have the + * specified source database, with a case-insensitive match of source name * - * @param dbname - * @param version - * @param acn - * @param seq - * where to annotate with reference - * @return parsed version of entry that was added to seq (if any) + * @param dbRefs + * @param source + * @return */ - public static DBRefEntry parseToDbRef(SequenceI seq, String dbname, - String version, String acn) + public static List searchRefsForSource(DBRefEntry[] dbRefs, + String source) { - DBRefEntry ref = null; - if (dbname != null) + List matches = new ArrayList<>(); + if (dbRefs != null && source != null) { - String locsrc = DBRefUtils.getCanonicalName(dbname); - if (locsrc.equals(DBRefSource.PDB)) + for (DBRefEntry dbref : dbRefs) { - /* - * Check for PFAM style stockhom PDB accession id citation e.g. - * "1WRI A; 7-80;" - */ - Regex r = new com.stevesoft.pat.Regex( - "([0-9][0-9A-Za-z]{3})\\s*(.?)\\s*;\\s*([0-9]+)-([0-9]+)"); - if (r.search(acn.trim())) - { - String pdbid = r.stringMatched(1); - String chaincode = r.stringMatched(2); - if (chaincode == null) - { - chaincode = " "; - } - // String mapstart = r.stringMatched(3); - // String mapend = r.stringMatched(4); - if (chaincode.equals(" ")) - { - chaincode = "_"; - } - // construct pdb ref. - ref = new DBRefEntry(locsrc, version, pdbid + chaincode); - PDBEntry pdbr = new PDBEntry(); - pdbr.setId(pdbid); - pdbr.setType(PDBEntry.Type.PDB); - pdbr.setChainCode(chaincode); - seq.addPDBId(pdbr); - } - else + if (source.equalsIgnoreCase(dbref.getSource())) { - System.err.println("Malformed PDB DR line:" + acn); + matches.add(dbref); } } - else - { - // default: - ref = new DBRefEntry(locsrc, version, acn); - } - } - if (ref != null) - { - seq.addDBRef(ref); } - return ref; + return matches; } /** @@ -575,8 +646,8 @@ public class DBRefUtils * a set of references to select from * @return */ - public static DBRefEntry[] selectDbRefs(boolean selectDna, - DBRefEntry[] refs) + public static List selectDbRefs(boolean selectDna, + List refs) { return selectRefs(refs, selectDna ? DBRefSource.DNACODINGDBS : DBRefSource.PROTEINDBS); @@ -593,8 +664,8 @@ public class DBRefUtils * @param source * @return */ - public static List searchRefsForSource(DBRefEntry[] dbRefs, - String source) + public static List searchRefsForSource( + List dbRefs, String source) { List matches = new ArrayList<>(); if (dbRefs != null && source != null) @@ -638,89 +709,109 @@ public class DBRefUtils * * @param sequence */ - public static void ensurePrimaries(SequenceI sequence) + public static void ensurePrimaries(SequenceI sequence, + List pr) { - List pr = sequence.getPrimaryDBRefs(); if (pr.size() == 0) { // nothing to do return; } - List selfs = new ArrayList<>(); - { - DBRefEntry[] selfArray = selectDbRefs(!sequence.isProtein(), - sequence.getDBRefs()); - if (selfArray == null || selfArray.length == 0) - { - // nothing to do - return; - } - selfs.addAll(Arrays.asList(selfArray)); - } + int sstart = sequence.getStart(); + int send = sequence.getEnd(); + boolean isProtein = sequence.isProtein(); + BitSet bsSelect = new BitSet(); + + // List selfs = new ArrayList(); + // { + + // List selddfs = selectDbRefs(!isprot, sequence.getDBRefs()); + // if (selfs == null || selfs.size() == 0) + // { + // // nothing to do + // return; + // } + + List dbrefs = sequence.getDBRefs(); + bsSelect.set(0, dbrefs.size()); + + if (!selectRefsBS(dbrefs, isProtein ? DBRefSource.PROTEIN_MASK + : DBRefSource.DNA_CODING_MASK, bsSelect)) + return; + + // selfs.addAll(selfArray); + // } // filter non-primary refs - for (DBRefEntry p : pr) + for (int ip = pr.size(); --ip >= 0;) { - while (selfs.contains(p)) + DBRefEntry p = pr.get(ip); + for (int i = bsSelect.nextSetBit(0); i >= 0; i = bsSelect + .nextSetBit(i + 1)) { - selfs.remove(p); + if (dbrefs.get(i) == p) + bsSelect.clear(i); } + // while (selfs.contains(p)) + // { + // selfs.remove(p); + // } } - List toPromote = new ArrayList<>(); + // List toPromote = new ArrayList(); - for (DBRefEntry p : pr) + for (int ip = pr.size(), keys = 0; --ip >= 0 + && keys != DBRefSource.PRIMARY_MASK;) { - List promType = new ArrayList<>(); - if (sequence.isProtein()) + DBRefEntry p = pr.get(ip); + if (isProtein) { switch (getCanonicalName(p.getSource())) { case DBRefSource.UNIPROT: - // case DBRefSource.UNIPROTKB: - // case DBRefSource.UP_NAME: - // search for and promote ensembl - promType.add(DBRefSource.ENSEMBL); + keys |= DBRefSource.UNIPROT_MASK; break; case DBRefSource.ENSEMBL: - // search for and promote Uniprot - promType.add(DBRefSource.UNIPROT); + keys |= DBRefSource.ENSEMBL_MASK; break; } } else { - // TODO: promote transcript refs + // TODO: promote transcript refs ?? } - - // collate candidates and promote them - DBRefEntry[] candidates = selectRefs(selfs.toArray(new DBRefEntry[0]), - promType.toArray(new String[0])); - if (candidates != null) + if (keys == 0 || !selectRefsBS(dbrefs, keys, bsSelect)) + return; + // if (candidates != null) { - for (DBRefEntry cand : candidates) + for (int ic = bsSelect.nextSetBit(0); ic >= 0; ic = bsSelect + .nextSetBit(ic + 1)) + // for (int ic = 0, n = candidates.size(); ic < n; ic++) { + DBRefEntry cand = dbrefs.get(ic);// candidates.get(ic); if (cand.hasMap()) { - if (cand.getMap().getTo() != null - && cand.getMap().getTo() != sequence) + Mapping map = cand.getMap(); + SequenceI cto = map.getTo(); + if (cto != null && cto != sequence) { // can't promote refs with mappings to other sequences continue; } - if (cand.getMap().getMap().getFromLowest() != sequence - .getStart() - && cand.getMap().getMap().getFromHighest() != sequence - .getEnd()) + MapList mlist = map.getMap(); + if (mlist.getFromLowest() != sstart + && mlist.getFromHighest() != send) { // can't promote refs with mappings from a region of this sequence // - eg CDS continue; } } - // and promote + // and promote - not that version must be non-null here, + // as p must have passed isPrimaryCandidate() cand.setVersion(p.getVersion() + " (promoted)"); - selfs.remove(cand); - toPromote.add(cand); + bsSelect.clear(ic); + // selfs.remove(cand); + // toPromote.add(cand); if (!cand.isPrimaryCandidate()) { System.out.println(