2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.datamodel.DBRefEntry;
24 import jalview.datamodel.DBRefSource;
25 import jalview.datamodel.PDBEntry;
26 import jalview.datamodel.SequenceI;
28 import java.util.ArrayList;
29 import java.util.Arrays;
30 import java.util.HashMap;
31 import java.util.HashSet;
32 import java.util.List;
36 import com.stevesoft.pat.Regex;
39 * Utilities for handling DBRef objects and their collections.
41 public class DBRefUtils
44 * lookup from lower-case form of a name to its canonical (standardised) form
46 private static Map<String, String> canonicalSourceNameLookup = new HashMap<String, String>();
48 private static Map<String, String> dasCoordinateSystemsLookup = new HashMap<String, String>();
52 // TODO load these from a resource file?
53 canonicalSourceNameLookup.put("uniprotkb/swiss-prot",
55 canonicalSourceNameLookup.put("uniprotkb/trembl", DBRefSource.UNIPROT);
57 // Ensembl values for dbname in xref REST service:
58 canonicalSourceNameLookup.put("uniprot/sptrembl", DBRefSource.UNIPROT);
59 canonicalSourceNameLookup.put("uniprot/swissprot", DBRefSource.UNIPROT);
61 canonicalSourceNameLookup.put("pdb", DBRefSource.PDB);
62 canonicalSourceNameLookup.put("ensembl", DBRefSource.ENSEMBL);
63 // Ensembl Gn and Tr are for Ensembl genomic and transcript IDs as served
65 canonicalSourceNameLookup.put("ensembl-tr", DBRefSource.ENSEMBL);
66 canonicalSourceNameLookup.put("ensembl-gn", DBRefSource.ENSEMBL);
68 canonicalSourceNameLookup.put("ensemblgenomes",
69 DBRefSource.ENSEMBLGENOMES);
71 // Make sure we have lowercase entries for all canonical string lookups
72 Set<String> keys = canonicalSourceNameLookup.keySet();
75 canonicalSourceNameLookup.put(k.toLowerCase(),
76 canonicalSourceNameLookup.get(k));
79 dasCoordinateSystemsLookup.put("pdbresnum", DBRefSource.PDB);
80 dasCoordinateSystemsLookup.put("uniprot", DBRefSource.UNIPROT);
81 dasCoordinateSystemsLookup.put("embl", DBRefSource.EMBL);
82 // dasCoordinateSystemsLookup.put("embl", DBRefSource.EMBLCDS);
86 * Returns those DBRefEntry objects whose source identifier (once converted to
87 * Jalview's canonical form) is in the list of sources to search for. Returns
88 * null if no matches found.
91 * DBRefEntry objects to search
93 * array of sources to select
// Selects the entries of dbrefs whose canonicalised source name is one of
// the requested sources, comparing names case-insensitively.
// NOTE(review): this listing omits several lines of the method (the second
// parameter declaration, the result of the null guard, the statement that
// collects a match and the handling of an empty result) - confirm those
// against the full file before relying on this summary.
96 public static DBRefEntry[] selectRefs(DBRefEntry[] dbrefs,
// guard: nothing to filter when either argument is null
99 if (dbrefs == null || sources == null)
// requested source names, upper-cased so the comparison ignores case
103 HashSet<String> srcs = new HashSet<String>();
104 for (String src : sources)
106 srcs.add(src.toUpperCase());
// references accepted so far
109 List<DBRefEntry> res = new ArrayList<DBRefEntry>();
110 for (DBRefEntry dbr : dbrefs)
// convert the reference's source to its canonical name before comparing
112 String source = getCanonicalName(dbr.getSource());
113 if (srcs.contains(source.toUpperCase()))
// copy the accepted references into an array of exactly matching size
121 DBRefEntry[] reply = new DBRefEntry[res.size()];
122 return res.toArray(reply);
128 * isDasCoordinateSystem
134 * @return boolean true if Source DBRefEntry is compatible with DAS
135 * CoordinateSystem name
138 public static boolean isDasCoordinateSystem(String string,
139 DBRefEntry dBRefEntry)
141 if (string == null || dBRefEntry == null)
145 String coordsys = dasCoordinateSystemsLookup.get(string.toLowerCase());
146 return coordsys == null ? false
147 : coordsys.equals(dBRefEntry.getSource());
151 * look up source in an internal list of database reference sources and return
152 * the canonical jalview name for the source, or the original string if it has
156 * @return canonical jalview source (one of jalview.datamodel.DBRefSource.*)
// Maps a database source name to Jalview's canonical name for it, falling
// back to the name as given when no canonical form is found (see the final
// return).
// NOTE(review): this listing omits lines between the statements below (any
// null guard, and whatever condition wraps the "ensembl" fallback), so it is
// not possible to tell from here whether the fallback runs only when the
// table lookup fails - confirm against the full file.
159 public static String getCanonicalName(String source)
// table lookup is keyed on the lower-case form of the name
165 String canonical = canonicalSourceNameLookup.get(source.toLowerCase());
// fallback for names starting with "ensembl" (any letter case)
168 if (source.toLowerCase().startsWith("ensembl"))
170 canonical = DBRefSource.ENSEMBL;
// names ending (any letter case) with one of these suffixes are classed as
// ENSEMBLGENOMES rather than ENSEMBL
171 for (String ensembls: new String[] { "Protists","Plants","Bacteria","Fungi","Metazoa"})
173 if (source.toLowerCase().endsWith(ensembls.toLowerCase()))
175 canonical = DBRefSource.ENSEMBLGENOMES;
// no canonical form found: hand back the name unchanged
180 return canonical == null ? source : canonical;
184 * Returns a (possibly empty) list of those references that match the given
185 * entry. Currently uses a comparator which matches if
187 * <li>database sources are the same</li>
188 * <li>accession ids are the same</li>
189 * <li>both have no mapping, or the mappings are the same</li>
193 * Set of references to search
198 public static List<DBRefEntry> searchRefs(DBRefEntry[] ref,
201 return searchRefs(ref, entry,
202 matchDbAndIdAndEitherMapOrEquivalentMapList);
206 * Returns a list of those references that match the given accession id
208 * <li>database sources are the same</li>
209 * <li>accession ids are the same</li>
210 * <li>both have no mapping, or the mappings are the same</li>
214 * Set of references to search
216 * accession id to match
219 public static List<DBRefEntry> searchRefs(DBRefEntry[] refs, String accId)
221 return searchRefs(refs, new DBRefEntry("", "", accId), matchId);
225 * Returns a (possibly empty) list of those references that match the given
226 * entry, according to the given comparator.
229 * an array of database references to search
231 * an entry to compare against
235 static List<DBRefEntry> searchRefs(DBRefEntry[] refs, DBRefEntry entry,
236 DbRefComp comparator)
238 List<DBRefEntry> rfs = new ArrayList<DBRefEntry>();
239 if (refs == null || entry == null)
243 for (int i = 0; i < refs.length; i++)
245 if (comparator.matches(entry, refs[i]))
// Comparison contract used by searchRefs: returns true when refb is to be
// treated as a match for refa (searchRefs passes the search entry as refa
// and each candidate reference as refb).
// NOTE(review): the declaration that encloses this method is not visible in
// this listing.
255 public boolean matches(DBRefEntry refa, DBRefEntry refb);
259 * match on all non-null fields in refa
261 // TODO unused - remove?
262 public static DbRefComp matchNonNullonA = new DbRefComp()
265 public boolean matches(DBRefEntry refa, DBRefEntry refb)
267 if (refa.getSource() == null
268 || DBRefUtils.getCanonicalName(refb.getSource()).equals(
269 DBRefUtils.getCanonicalName(refa.getSource())))
271 if (refa.getVersion() == null
272 || refb.getVersion().equals(refa.getVersion()))
274 if (refa.getAccessionId() == null
275 || refb.getAccessionId().equals(refa.getAccessionId()))
277 if (refa.getMap() == null || (refb.getMap() != null
278 && refb.getMap().equals(refa.getMap())))
290 * either field is null or field matches for all of source, version, accession
293 // TODO unused - remove?
294 public static DbRefComp matchEitherNonNull = new DbRefComp()
297 public boolean matches(DBRefEntry refa, DBRefEntry refb)
299 if (nullOrEqualSource(refa.getSource(), refb.getSource())
300 && nullOrEqual(refa.getVersion(), refb.getVersion())
301 && nullOrEqual(refa.getAccessionId(), refb.getAccessionId())
302 && nullOrEqual(refa.getMap(), refb.getMap()))
311 * accession ID and DB must be identical. Version is ignored. Map is either
312 * not defined or is a match (or is compatible?)
314 // TODO unused - remove?
315 public static DbRefComp matchDbAndIdAndEitherMap = new DbRefComp()
318 public boolean matches(DBRefEntry refa, DBRefEntry refb)
320 if (refa.getSource() != null && refb.getSource() != null
321 && DBRefUtils.getCanonicalName(refb.getSource()).equals(
322 DBRefUtils.getCanonicalName(refa.getSource())))
324 // We dont care about version
325 if (refa.getAccessionId() != null && refb.getAccessionId() != null
326 // FIXME should be && not || here?
327 || refb.getAccessionId().equals(refa.getAccessionId()))
329 if ((refa.getMap() == null || refb.getMap() == null)
330 || (refa.getMap() != null && refb.getMap() != null
331 && refb.getMap().equals(refa.getMap())))
342 * accession ID and DB must be identical. Version is ignored. No map on either
343 * or map but no maplist on either or maplist of map on a is the complement of
344 * maplist of map on b.
346 // TODO unused - remove?
347 public static DbRefComp matchDbAndIdAndComplementaryMapList = new DbRefComp()
350 public boolean matches(DBRefEntry refa, DBRefEntry refb)
352 if (refa.getSource() != null && refb.getSource() != null
353 && DBRefUtils.getCanonicalName(refb.getSource()).equals(
354 DBRefUtils.getCanonicalName(refa.getSource())))
356 // We dont care about version
357 if (refa.getAccessionId() != null && refb.getAccessionId() != null
358 || refb.getAccessionId().equals(refa.getAccessionId()))
360 if ((refa.getMap() == null && refb.getMap() == null)
361 || (refa.getMap() != null && refb.getMap() != null))
363 if ((refb.getMap().getMap() == null
364 && refa.getMap().getMap() == null)
365 || (refb.getMap().getMap() != null
366 && refa.getMap().getMap() != null
367 && refb.getMap().getMap().getInverse()
368 .equals(refa.getMap().getMap())))
380 * accession ID and DB must be identical. Version is ignored. No map on both
381 * or or map but no maplist on either or maplist of map on a is equivalent to
382 * the maplist of map on b.
384 // TODO unused - remove?
385 public static DbRefComp matchDbAndIdAndEquivalentMapList = new DbRefComp()
388 public boolean matches(DBRefEntry refa, DBRefEntry refb)
390 if (refa.getSource() != null && refb.getSource() != null
391 && DBRefUtils.getCanonicalName(refb.getSource()).equals(
392 DBRefUtils.getCanonicalName(refa.getSource())))
394 // We dont care about version
395 // if ((refa.getVersion()==null || refb.getVersion()==null)
396 // || refb.getVersion().equals(refa.getVersion()))
398 if (refa.getAccessionId() != null && refb.getAccessionId() != null
399 || refb.getAccessionId().equals(refa.getAccessionId()))
401 if (refa.getMap() == null && refb.getMap() == null)
405 if (refa.getMap() != null && refb.getMap() != null
406 && ((refb.getMap().getMap() == null
407 && refa.getMap().getMap() == null)
408 || (refb.getMap().getMap() != null
409 && refa.getMap().getMap() != null
410 && refb.getMap().getMap()
411 .equals(refa.getMap().getMap()))))
422 * accession ID and DB must be identical, or null on a. Version is ignored. No
423 * map on either or map but no maplist on either or maplist of map on a is
424 * equivalent to the maplist of map on b.
426 public static DbRefComp matchDbAndIdAndEitherMapOrEquivalentMapList = new DbRefComp()
429 public boolean matches(DBRefEntry refa, DBRefEntry refb)
431 if (refa.getSource() != null && refb.getSource() != null
432 && DBRefUtils.getCanonicalName(refb.getSource()).equals(
433 DBRefUtils.getCanonicalName(refa.getSource())))
435 // We dont care about version
437 if (refa.getAccessionId() == null
438 || refa.getAccessionId().equals(refb.getAccessionId()))
440 if (refa.getMap() == null || refb.getMap() == null)
444 if ((refa.getMap() != null && refb.getMap() != null)
445 && (refb.getMap().getMap() == null
446 && refa.getMap().getMap() == null)
447 || (refb.getMap().getMap() != null
448 && refa.getMap().getMap() != null
449 && (refb.getMap().getMap()
450 .equals(refa.getMap().getMap()))))
461 * accession ID only must be identical.
463 public static DbRefComp matchId = new DbRefComp()
466 public boolean matches(DBRefEntry refa, DBRefEntry refb)
468 if (refa.getAccessionId() != null && refb.getAccessionId() != null
469 && refb.getAccessionId().equals(refa.getAccessionId()))
478 * Parses a DBRefEntry and adds it to the sequence, also a PDBEntry if the
481 * Used by file parsers to generate DBRefs from annotation within file (eg
488 * where to annotate with reference
489 * @return parsed version of entry that was added to seq (if any)
// Builds a DBRefEntry from a database name, version and accession string.
// The database name is first converted to its canonical form; for the PDB
// source the accession is additionally parsed with a regular expression and
// a PDBEntry is populated from the parsed pieces.
// NOTE(review): this listing omits a number of statements of the method
// (guards, branch bodies, the calls that attach the results to seq, and the
// return) - confirm the details against the full file.
491 public static DBRefEntry parseToDbRef(SequenceI seq, String dbname,
492 String version, String acn)
494 DBRefEntry ref = null;
// canonical form of the database name
497 String locsrc = DBRefUtils.getCanonicalName(dbname);
498 if (locsrc.equals(DBRefSource.PDB))
501 * Check for PFAM style stockhom PDB accession id citation e.g.
// pattern groups: 1 = a digit followed by three alphanumerics (the PDB id),
// 2 = at most one character (the chain code), 3 and 4 = the two numbers of
// a "start-end" range that follows a ';'
504 Regex r = new com.stevesoft.pat.Regex(
505 "([0-9][0-9A-Za-z]{3})\\s*(.?)\\s*;\\s*([0-9]+)-([0-9]+)");
506 if (r.search(acn.trim()))
508 String pdbid = r.stringMatched(1);
509 String chaincode = r.stringMatched(2);
510 if (chaincode == null)
// groups 3 and 4 (the residue range) are currently not used
514 // String mapstart = r.stringMatched(3);
515 // String mapend = r.stringMatched(4);
516 if (chaincode.equals(" "))
520 // construct pdb ref: the accession is the PDB id followed by the chain code
521 ref = new DBRefEntry(locsrc, version, pdbid + chaincode);
522 PDBEntry pdbr = new PDBEntry();
524 pdbr.setType(PDBEntry.Type.PDB);
525 pdbr.setChainCode(chaincode);
// presumably reached when the accession does not match the pattern above -
// TODO confirm
530 System.err.println("Malformed PDB DR line:" + acn);
// presumably the branch for sources other than PDB: the accession string is
// used unchanged - TODO confirm
536 ref = new DBRefEntry(locsrc, version, acn);
547 * Returns true if either object is null, or they are equal
553 public static boolean nullOrEqual(Object o1, Object o2)
555 if (o1 == null || o2 == null)
559 return o1.equals(o2);
563 * canonicalise source string before comparing. null is always wildcard
566 * - null or source string to compare
568 * - null or source string to compare
569 * @return true if either o1 or o2 are null, or o1 equals o2 under
570 * DBRefUtils.getCanonicalName
571 * (o1).equals(DBRefUtils.getCanonicalName(o2))
573 public static boolean nullOrEqualSource(String o1, String o2)
575 if (o1 == null || o2 == null)
579 return DBRefUtils.getCanonicalName(o1)
580 .equals(DBRefUtils.getCanonicalName(o2));
584 * Selects just the DNA or protein references from a set of references
587 * if true, select references to 'standard' DNA databases, else to
588 * 'standard' peptide databases
590 * a set of references to select from
593 public static DBRefEntry[] selectDbRefs(boolean selectDna,
596 return selectRefs(refs,
597 selectDna ? DBRefSource.DNACODINGDBS : DBRefSource.PROTEINDBS);
598 // could attempt to find other cross
599 // refs here - ie PDB xrefs
600 // (not dna, not protein seq)
604 * Returns the (possibly empty) list of those supplied dbrefs which have the
605 * specified source database, with a case-insensitive match of source name
611 public static List<DBRefEntry> searchRefsForSource(DBRefEntry[] dbRefs,
614 List<DBRefEntry> matches = new ArrayList<DBRefEntry>();
615 if (dbRefs != null && source != null)
617 for (DBRefEntry dbref : dbRefs)
619 if (source.equalsIgnoreCase(
620 DBRefUtils.getCanonicalName(dbref.getSource())))
630 * promote direct database references to primary for nucleotide or protein
631 * sequences if they have an appropriate primary ref
635 * <th>Primary DB</th>
636 * <th>Direct which will be promoted</th>
657 public static void ensurePrimaries(SequenceI sequence)
659 List<DBRefEntry> pr = sequence.getPrimaryDBRefs();
665 List<DBRefEntry> selfs = new ArrayList<DBRefEntry>();
667 DBRefEntry[] selfArray = selectDbRefs(!sequence.isProtein(),
668 sequence.getDBRefs());
669 if (selfArray == null || selfArray.length == 0)
674 selfs.addAll(Arrays.asList(selfArray));
677 // filter non-primary refs
678 for (DBRefEntry p : pr)
680 while (selfs.contains(p))
685 List<DBRefEntry> toPromote = new ArrayList<DBRefEntry>();
687 for (DBRefEntry p : pr)
689 List<String> promType = new ArrayList<String>();
690 if (sequence.isProtein())
692 switch (getCanonicalName(p.getSource()))
694 case DBRefSource.UNIPROT:
695 // case DBRefSource.UNIPROTKB:
696 // case DBRefSource.UP_NAME:
697 // search for and promote ensembl
698 promType.add(DBRefSource.ENSEMBL);
700 case DBRefSource.ENSEMBL:
701 // search for and promote Uniprot
702 promType.add(DBRefSource.UNIPROT);
708 // TODO: promote transcript refs
711 // collate candidates and promote them
712 DBRefEntry[] candidates = selectRefs(selfs.toArray(new DBRefEntry[0]),
713 promType.toArray(new String[0]));
714 if (candidates != null)
716 for (DBRefEntry cand : candidates)
720 if (cand.getMap().getTo() != null
721 && cand.getMap().getTo() != sequence)
723 // can't promote refs with mappings to other sequences
726 if (cand.getMap().getMap().getFromLowest() != sequence
728 && cand.getMap().getMap().getFromHighest() != sequence
731 // can't promote refs with mappings from a region of this sequence
737 cand.setVersion(p.getVersion() + " (promoted)");
740 if (!cand.isPrimaryCandidate())
743 "Warning: Couldn't promote dbref " + cand.toString()
744 + " for sequence " + sequence.toString());