1 package jalview.analysis;
\r
3 import java.util.Enumeration;
\r
4 import java.util.Vector;
\r
5 import java.util.Hashtable;
\r
7 import jalview.datamodel.AlignedCodonFrame;
\r
8 import jalview.datamodel.Alignment;
\r
9 import jalview.datamodel.AlignmentI;
\r
10 import jalview.datamodel.DBRefSource;
\r
11 import jalview.datamodel.DBRefEntry;
\r
12 import jalview.datamodel.Sequence;
\r
13 import jalview.datamodel.SequenceI;
\r
14 import jalview.ws.SequenceFetcher;
\r
15 import jalview.ws.seqfetcher.ASequenceFetcher;
\r
18 * Functions for cross-referencing sequence databases. The user must first specify
\r
19 * if cross-referencing from protein or dna (set dna==true)
\r
24 public class CrossRef
\r
27 * get the DNA or protein references for a protein or dna sequence
\r
33 public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)
\r
37 rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);
\r
41 rfs = jalview.util.DBRefUtils.selectRefs(rfs,
\r
42 DBRefSource.DNACODINGDBS); // could attempt to find other cross
\r
43 // refs and return here - ie PDB xrefs
\r
44 // (not dna, not protein seq)
\r
49 public static Hashtable classifyDbRefs(DBRefEntry[] rfs)
\r
51 Hashtable classes = new Hashtable();
\r
52 classes.put(DBRefSource.PROTEINDBS, jalview.util.DBRefUtils.selectRefs(
\r
53 rfs, DBRefSource.PROTEINDBS));
\r
54 classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils
\r
55 .selectRefs(rfs, DBRefSource.DNACODINGDBS));
\r
56 classes.put(DBRefSource.DOMAINDBS, jalview.util.DBRefUtils.selectRefs(
\r
57 rfs, DBRefSource.DOMAINDBS));
\r
58 // classes.put(OTHER, )
\r
64 * true if seqs are DNA seqs
\r
66 * @return a list of sequence database cross reference source types
\r
68 public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
\r
70 return findSequenceXrefTypes(dna, seqs, null);
\r
74 * Indirect references are references from other sequences from the dataset to
\r
75 * any of the direct DBRefEntrys on the given sequences.
\r
78 * true if seqs are DNA seqs
\r
80 * @return a list of sequence database cross reference source types
\r
82 public static String[] findSequenceXrefTypes(boolean dna,
\r
83 SequenceI[] seqs, AlignmentI dataset)
\r
85 String[] dbrefs = null;
\r
86 Vector refs = new Vector();
\r
87 for (int s = 0; s < seqs.length; s++)
\r
89 SequenceI dss = seqs[s];
\r
90 while (dss.getDatasetSequence() != null)
\r
92 dss = dss.getDatasetSequence();
\r
94 DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef());
\r
95 for (int r = 0; rfs != null && r < rfs.length; r++)
\r
97 if (!refs.contains(rfs[r].getSource()))
\r
99 refs.addElement(rfs[r].getSource());
\r
102 if (dataset != null)
\r
104 // search for references to this sequence's direct references.
\r
105 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef());
\r
106 Vector rseqs = new Vector();
\r
107 CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs,
\r
108 null); // don't need to specify codon frame for mapping here
\r
109 Enumeration lr = rseqs.elements();
\r
110 while (lr.hasMoreElements())
\r
112 SequenceI rs = (SequenceI) lr.nextElement();
\r
113 DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
\r
114 for (int r = 0; rfs != null && r < rfs.length; r++)
\r
116 if (!refs.contains(rfs[r].getSource()))
\r
118 refs.addElement(rfs[r].getSource());
\r
124 if (refs.size() > 0)
\r
126 dbrefs = new String[refs.size()];
\r
127 refs.copyInto(dbrefs);
\r
133 * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross
\r
134 * reference if (!refs.contains(rfs[r].getSource())) {
\r
135 * refs.addElement(rfs[r].getSource()); } } }
\r
137 public static boolean hasCdnaMap(SequenceI[] seqs)
\r
139 String[] reftypes = findSequenceXrefTypes(false, seqs);
\r
140 for (int s = 0; s < reftypes.length; s++)
\r
142 if (reftypes.equals(DBRefSource.EMBLCDS))
\r
151 public static SequenceI[] getCdnaMap(SequenceI[] seqs)
\r
153 Vector cseqs = new Vector();
\r
154 for (int s = 0; s < seqs.length; s++)
\r
156 DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef());
\r
157 for (int c = 0; c < cdna.length; c++)
\r
159 if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
\r
161 System.err.println("TODO: unimplemented sequence retrieval for coding region sequence.");
\r
162 // TODO: retrieve CDS dataset sequences
\r
163 // need global dataset sequence retriever/resolver to reuse refs
\r
164 // and construct Mapping entry.
\r
165 // insert gaps in CDS according to peptide gaps.
\r
166 // add gapped sequence to cseqs
\r
170 if (cseqs.size() > 0)
\r
172 SequenceI[] rsqs = new SequenceI[cseqs.size()];
\r
173 cseqs.copyInto(rsqs);
\r
186 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
189 return findXrefSequences(seqs, dna, source, null);
\r
198 * alignment to search for product sequences.
\r
199 * @return products (as dataset sequences)
\r
// findXrefSequences(seqs, dna, source, dataset): for each entry of seqs,
// gathers cross-referenced sequences into rseqs - from mappings held on the
// references, from the dataset, and from a SequenceFetcher - and wraps them
// in a new Alignment when rseqs is non-empty (ral is initialised to null).
// NOTE(review): this listing omits short lines (braces, returns and some
// statements); the comments below describe only what is visible.
201 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
202 String source, AlignmentI dataset)
\r
204 Vector rseqs = new Vector();
\r
205 Alignment ral = null;
\r
206 AlignedCodonFrame cf = new AlignedCodonFrame(0); // nominal width
\r
207 for (int s = 0; s < seqs.length; s++)
\r
// walk up to the root dataset sequence
209 SequenceI dss = seqs[s];
\r
210 while (dss.getDatasetSequence() != null)
\r
212 dss = dss.getDatasetSequence();
\r
214 boolean found = false;
\r
// direct cross-references of the requested kind held on the dataset sequence
215 DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef());
\r
// no direct cross-references: fall back to a dataset search using the
// references of the opposite kind (!dna)
216 if ((xrfs == null || xrfs.length == 0) && dataset != null)
\r
218 System.out.println("Attempting to find ds Xrefs refs.");
\r
// NOTE(review): reads seqs[s].getDBRef() whereas the lookup above used dss - confirm intended
219 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less
\r
227 // filter for desired source xref here
\r
228 found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
\r
231 for (int r = 0; xrfs != null && r < xrfs.length; r++)
\r
233 if (source != null && !source.equals(xrfs[r].getSource()))
\r
235 if (xrfs[r].hasMap())
\r
237 if (xrfs[r].getMap().getTo() != null)
\r
// the mapped-to sequence is copied and added to the results
239 Sequence rsq = new Sequence(xrfs[r].getMap().getTo());
\r
240 rseqs.addElement(rsq);
\r
// differing from/to ratios guard the codon-frame registration below
241 if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r]
\r
242 .getMap().getMap().getToRatio())
\r
244 // get sense of map correct for adding to product alignment.
\r
247 // map is from dna seq to a protein product
\r
248 cf.addMap(dss, rsq, xrfs[r].getMap().getMap());
\r
252 // map should be from protein seq to its coding dna
\r
253 cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());
\r
261 // do a bit more work - search for sequences with references matching
\r
262 // xrefs on this sequence.
\r
263 if (dataset != null)
\r
265 found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf);
\r
267 xrfs[r] = null; // we've recovered seqs for this one.
\r
273 if (xrfs != null && xrfs.length > 0)
\r
275 // Try and get the sequence reference...
\r
277 * Ideal world - we ask for a sequence fetcher implementation here if
\r
278 * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
\r
// a new SequenceFetcher is used to retrieve the remaining references
280 ASequenceFetcher sftch = new SequenceFetcher();
\r
281 SequenceI[] retrieved = null;
\r
282 int l = xrfs.length;
\r
283 for (int r = 0; r < xrfs.length; r++)
\r
285 // filter out any irrelevant or irretrievable references
\r
286 if (xrfs[r] == null
\r
287 || ((source != null && !source.equals(xrfs[r]
\r
288 .getSource())) || !sftch.isFetchable(xrfs[r]
\r
298 .println("Attempting to retrieve cross referenced sequences.");
\r
// t is sized by l; presumably it receives the surviving (non-null) entries - the filling lines are not visible here
299 DBRefEntry[] t = new DBRefEntry[l];
\r
301 for (int r = 0; r < xrfs.length; r++)
\r
303 if (xrfs[r] != null)
\r
309 retrieved = sftch.getSequences(xrfs); // problem here is we don't know which of xrfs resulted in which retrieved element
\r
310 } catch (Exception e)
\r
313 .println("Problem whilst retrieving cross references for Sequence : "
\r
314 + seqs[s].getName());
\r
315 e.printStackTrace();
\r
317 if (retrieved != null)
\r
319 for (int rs = 0; rs < retrieved.length; rs++)
\r
321 // TODO: examine each sequence for 'redundancy'
\r
322 jalview.datamodel.DBRefEntry[] dbr = retrieved[rs].getDBRef();
\r
323 if (dbr != null && dbr.length > 0)
\r
325 for (int di = 0; di < dbr.length; di++)
\r
327 // find any entry where we should put in the sequence being cross-referenced into the map
\r
328 jalview.datamodel.Mapping map = dbr[di].getMap();
\r
331 if (map.getTo() != null && map.getMap() != null)
\r
333 // should search the local dataset to find any existing candidates for To !
\r
336 // compare ms with dss and replace with dss in mapping if map is congruent
\r
337 SequenceI ms = map.getTo();
\r
// sf..st is the mapped 'to' range; the same range is cut from both ms and dss for comparison
338 int sf = map.getMap().getToLowest();
\r
339 int st = map.getMap().getToHighest();
\r
340 SequenceI mappedrg = ms.getSubSequence(sf, st);
\r
341 SequenceI loc = dss.getSubSequence(sf, st);
\r
342 if (mappedrg.getLength()>0 && mappedrg.getSequenceAsString().equals(
\r
343 loc.getSequenceAsString()))
\r
346 .println("Mapping updated for retrieved crossreference");
\r
347 // method to update all refs of existing To on retrieved sequence with dss and merge any props on To onto dss.
\r
350 } catch (Exception e)
\r
353 .println("Exception when consolidating Mapped sequence set...");
\r
354 e.printStackTrace(System.err);
\r
// every retrieved sequence is added to the results after updating its PDB ids
360 retrieved[rs].updatePDBIds();
\r
361 rseqs.addElement(retrieved[rs]);
\r
// build the result alignment only when something was collected
368 if (rseqs.size() > 0)
\r
370 SequenceI[] rsqs = new SequenceI[rseqs.size()];
\r
371 rseqs.copyInto(rsqs);
\r
372 ral = new Alignment(rsqs);
\r
// attach the codon frame only if it reports protein mappings
373 if (cf != null && cf.getProtMappings() != null)
\r
375 ral.addCodonFrame(cf);
\r
382 * find references to lrfs in the cross-reference set of each sequence in
\r
383 * dataset (that is not equal to sequenceI). Identifies matching DBRefEntry
\r
384 * based on source and accession string only - Map and Version are nulled.
\r
390 * @return true if matches were found.
\r
392 private static boolean searchDatasetXrefs(SequenceI sequenceI,
\r
393 boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs,
\r
394 AlignedCodonFrame cf)
\r
396 boolean found = false;
\r
399 for (int i = 0; i < lrfs.length; i++)
\r
401 DBRefEntry xref = new DBRefEntry(lrfs[i]);
\r
402 // add in wildcards
\r
403 xref.setVersion(null);
\r
405 found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
\r
411 * search a given sequence dataset for references matching cross-references to
\r
412 * the given sequence
\r
418 * set of unique sequences
\r
420 * @return true if one or more unique sequences were found and added
\r
422 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
423 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
\r
425 return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
\r
429 * TODO: generalise to different protein classifications. Search dataset for
\r
430 * DBRefEntrys matching the given one (xrf) and add the associated sequence to
\r
438 * search all references or only subset
\r
440 * search dna or protein xrefs (if direct=false)
\r
441 * @return true if relationship found and sequence added.
\r
// searchDataset(sequenceI, xrf, dataset, rseqs, cf, direct, dna): scans the
// sequences of dataset for DBRefEntrys matching xrf (via
// DBRefUtils.searchRefs), adds matching sequences not already in rseqs and,
// when cf is non-null, registers codon mappings between sequenceI and the
// match.
// NOTE(review): this listing omits short lines (braces, returns, branch
// headers); the comments below describe only what is visible.
443 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
444 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf,
\r
445 boolean direct, boolean dna)
\r
447 boolean found = false;
\r
448 if (dataset == null)
\r
450 if (dataset.getSequences() == null)
\r
452 System.err.println("Empty dataset sequence set - NO VECTOR");
\r
455 Enumeration e = dataset.getSequences().elements();
\r
456 while (e.hasMoreElements())
\r
458 SequenceI nxt = (SequenceI) e.nextElement();
\r
// a non-null dataset sequence on an entry of the dataset triggers the warning below
461 if (nxt.getDatasetSequence() != null)
\r
464 .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
\r
// skip the query sequence itself and its own dataset sequence
466 if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
\r
468 // look for direct or indirect references in common
\r
469 DBRefEntry[] poss = null, cands = null;
\r
// presumably the direct branch (continuation line not visible) - TODO confirm
472 cands = jalview.util.DBRefUtils.searchRefs(poss = nxt
\r
// this variant restricts the candidates to nxt's cross-references selected by the dna flag
477 cands = jalview.util.DBRefUtils.searchRefs(poss = CrossRef
\r
478 .findXDbRefs(dna, nxt.getDBRef()), xrf);
\r
// rseqs is kept free of duplicates
482 if (!rseqs.contains(nxt))
\r
484 rseqs.addElement(nxt);
\r
485 boolean foundmap = cf != null; // don't search if we aren't given
\r
486 // a codon map object
\r
487 for (int r = 0; foundmap && r < cands.length; r++)
\r
489 if (cands[r].hasMap())
\r
// requires a mapped-to sequence and differing from/to ratios before registering with cf
491 if (cands[r].getMap().getTo() != null
\r
492 && cands[r].getMap().getMap().getFromRatio() != cands[r]
\r
493 .getMap().getMap().getToRatio())
\r
496 // get sense of map correct for adding to product alignment.
\r
499 // map is from dna seq to a protein product
\r
500 cf.addMap(sequenceI, nxt, cands[r].getMap().getMap());
\r
504 // map should be from protein seq to its coding dna
\r
505 cf.addMap(nxt, sequenceI, cands[r].getMap().getMap()
\r
511 // TODO: add mapping between sequences if necessary
\r
523 * precalculate different products that can be found for seqs in dataset and
\r
530 * don't actually build lists - just get types
\r
531 * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
\r
532 * seqs, AlignmentI dataset, boolean fake) { String types[] =
\r
533 * jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
\r
534 * dataset); if (types != null) { System.out.println("Xref Types for:
\r
535 * "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
\r
536 * System.out.println("Type: " + types[t]); SequenceI[] prod =
\r
537 * jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
\r
538 * System.out.println("Found " + ((prod == null) ? "no" : "" +
\r
539 * prod.length) + " products"); if (prod!=null) { for (int p=0; p<prod.length;
\r
540 * p++) { System.out.println("Prod "+p+":
\r
541 * "+prod[p].getDisplayId(true)); } } }
\r
542 * } else { System.out.println("Trying getProducts for
\r
543 * "+al.getSequenceAt(0).getDisplayId(true)); System.out.println("Search DS
\r
544 * Xref for: "+(dna ? "dna" : "prot")); // have a bash at finding the products
\r
545 * amongst all the retrieved sequences. SequenceI[] prod =
\r
546 * jalview.analysis.CrossRef.findXrefSequences(al .getSequencesArray(), dna,
\r
547 * null, ds); System.out.println("Found " + ((prod == null) ? "no" : "" +
\r
548 * prod.length) + " products"); if (prod!=null) { // select non-equivalent
\r
549 * sequences from dataset list for (int p=0; p<prod.length; p++) {
\r
550 * System.out.println("Prod "+p+": "+prod[p].getDisplayId(true)); } }
\r