1 package jalview.analysis;
\r
3 import java.util.Enumeration;
\r
4 import java.util.Vector;
\r
5 import java.util.Hashtable;
\r
7 import jalview.datamodel.AlignedCodonFrame;
\r
8 import jalview.datamodel.Alignment;
\r
9 import jalview.datamodel.AlignmentI;
\r
10 import jalview.datamodel.DBRefSource;
\r
11 import jalview.datamodel.DBRefEntry;
\r
12 import jalview.datamodel.Sequence;
\r
13 import jalview.datamodel.SequenceI;
\r
14 import jalview.ws.SequenceFetcher;
\r
15 import jalview.ws.seqfetcher.ASequenceFetcher;
\r
18 * Functions for cross-referencing sequence databases. The caller must first
\r
19 * specify whether cross-referencing starts from protein or from DNA (pass dna==true for DNA)
\r
24 public class CrossRef
\r
27 * get the DNA or protein references for a protein or dna sequence
\r
33 public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)
\r
37 rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);
\r
41 rfs = jalview.util.DBRefUtils.selectRefs(rfs,
\r
42 DBRefSource.DNACODINGDBS); // could attempt to find other cross
\r
43 // refs and return here - ie PDB xrefs
\r
44 // (not dna, not protein seq)
\r
49 public static Hashtable classifyDbRefs(DBRefEntry[] rfs)
\r
51 Hashtable classes = new Hashtable();
\r
52 classes.put(DBRefSource.PROTEINDBS, jalview.util.DBRefUtils.selectRefs(
\r
53 rfs, DBRefSource.PROTEINDBS));
\r
54 classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils
\r
55 .selectRefs(rfs, DBRefSource.DNACODINGDBS));
\r
56 classes.put(DBRefSource.DOMAINDBS, jalview.util.DBRefUtils.selectRefs(
\r
57 rfs, DBRefSource.DOMAINDBS));
\r
58 // classes.put(OTHER, )
\r
64 * true if seqs are DNA seqs
\r
66 * @return a list of sequence database cross reference source types
\r
68 public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
\r
70 return findSequenceXrefTypes(dna, seqs, null);
\r
74 * Indirect references are references from other sequences from the dataset to
\r
75 * any of the direct DBRefEntrys on the given sequences.
\r
78 * true if seqs are DNA seqs
\r
80 * @return a list of sequence database cross reference source types
\r
82 public static String[] findSequenceXrefTypes(boolean dna,
\r
83 SequenceI[] seqs, AlignmentI dataset)
\r
85 String[] dbrefs = null;
\r
86 Vector refs = new Vector();
\r
87 for (int s = 0; s < seqs.length; s++)
\r
89 SequenceI dss = seqs[s];
\r
90 while (dss.getDatasetSequence() != null)
\r
92 dss = dss.getDatasetSequence();
\r
94 DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef());
\r
95 for (int r = 0; rfs != null && r < rfs.length; r++)
\r
97 if (!refs.contains(rfs[r].getSource()))
\r
99 refs.addElement(rfs[r].getSource());
\r
102 if (dataset != null)
\r
104 // search for references to this sequence's direct references.
\r
105 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef());
\r
106 Vector rseqs = new Vector();
\r
107 CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs,
\r
108 null); // don't need to specify codon frame for mapping here
\r
109 Enumeration lr = rseqs.elements();
\r
110 while (lr.hasMoreElements())
\r
112 SequenceI rs = (SequenceI) lr.nextElement();
\r
113 DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
\r
114 for (int r = 0; rfs != null && r < rfs.length; r++)
\r
116 if (!refs.contains(rfs[r].getSource()))
\r
118 refs.addElement(rfs[r].getSource());
\r
124 if (refs.size() > 0)
\r
126 dbrefs = new String[refs.size()];
\r
127 refs.copyInto(dbrefs);
\r
133 * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross
\r
134 * reference if (!refs.contains(rfs[r].getSource())) {
\r
135 * refs.addElement(rfs[r].getSource()); } } }
\r
137 public static boolean hasCdnaMap(SequenceI[] seqs)
\r
139 String[] reftypes = findSequenceXrefTypes(false, seqs);
\r
140 for (int s = 0; s < reftypes.length; s++)
\r
142 if (reftypes.equals(DBRefSource.EMBLCDS))
\r
151 public static SequenceI[] getCdnaMap(SequenceI[] seqs)
\r
153 Vector cseqs = new Vector();
\r
154 for (int s = 0; s < seqs.length; s++)
\r
156 DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef());
\r
157 for (int c = 0; c < cdna.length; c++)
\r
159 if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
\r
161 // retrieve CDS dataset sequences
\r
162 // need global dataset sequence retriever/resolver to reuse refs
\r
163 // and construct Mapping entry.
\r
164 // insert gaps in CDS according to peptide gaps.
\r
165 // add gapped sequence to cseqs
\r
169 if (cseqs.size() > 0)
\r
171 SequenceI[] rsqs = new SequenceI[cseqs.size()];
\r
172 cseqs.copyInto(rsqs);
\r
185 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
188 return findXrefSequences(seqs, dna, source, null);
\r
197 * alignment to search for product sequences.
\r
198 * @return products (as dataset sequences)
\r
/**
 * Collects the sequences cross-referenced from each entry of seqs and wraps
 * them in a new Alignment.
 *
 * Result sequences visibly come from three places: copies of the sequences
 * that mapped cross-references point to; sequences in dataset located through
 * searchDataset / searchDatasetXrefs; and sequences fetched through a
 * SequenceFetcher for references that were not nulled out earlier. Mappings
 * recorded on the way are attached to the alignment via an AlignedCodonFrame.
 *
 * NOTE(review): this extract has lost its short lines (braces, else/try
 * keywords, single-statement bodies). Comments below describe only the lines
 * that remain.
 *
 * @param seqs
 *          sequences whose cross-references are followed
 * @param dna
 *          passed to findXDbRefs to choose which references are followed
 * @param source
 *          if not null, references whose getSource() differs are filtered
 * @param dataset
 *          alignment searched for sequences already present; when null the
 *          dataset searches are not made
 * @return presumably ral, which stays null unless at least one sequence was
 *         collected (the return statement is not in this extract)
 */
200 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
201 String source, AlignmentI dataset)
\r
203 Vector rseqs = new Vector();
\r
204 Alignment ral = null;
\r
205 AlignedCodonFrame cf = new AlignedCodonFrame(0); // nominal width
\r
206 for (int s = 0; s < seqs.length; s++)
\r
208 SequenceI dss = seqs[s];
\r
// walk up to the sequence that has no dataset sequence of its own
209 while (dss.getDatasetSequence() != null)
\r
211 dss = dss.getDatasetSequence();
\r
213 boolean found = false;
\r
214 DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef());
\r
// no references of the wanted kind on this sequence: search dataset using
// the references selected with the opposite dna flag
215 if ((xrfs == null || xrfs.length == 0) && dataset != null)
\r
217 System.out.println("Attempting to find ds Xrefs refs.");
\r
218 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less
\r
226 // filter for desired source xref here
\r
227 found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
\r
230 for (int r = 0; xrfs != null && r < xrfs.length; r++)
\r
// source filter; the guarded statement is not visible in this extract
// (presumably it skips the reference - TODO confirm)
232 if (source != null && !source.equals(xrfs[r].getSource()))
\r
234 if (xrfs[r].hasMap())
\r
236 if (xrfs[r].getMap().getTo() != null)
\r
// a copy of the mapped target sequence is added to the results
238 Sequence rsq = new Sequence(xrfs[r].getMap().getTo());
\r
239 rseqs.addElement(rsq);
\r
240 if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r]
\r
241 .getMap().getMap().getToRatio())
\r
243 // get sense of map correct for adding to product alignment.
\r
246 // map is from dna seq to a protein product
\r
247 cf.addMap(dss, rsq, xrfs[r].getMap().getMap());
\r
251 // map should be from protein seq to its coding dna
\r
252 cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());
\r
260 // do a bit more work - search for sequences with references matching
\r
261 // xrefs on this sequence.
\r
262 if (dataset != null)
\r
264 found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf);
\r
266 xrfs[r] = null; // we've recovered seqs for this one.
\r
272 if (xrfs != null && xrfs.length > 0)
\r
274 // Try and get the sequence reference...
\r
276 * Ideal world - we ask for a sequence fetcher implementation here if
\r
277 * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
\r
279 ASequenceFetcher sftch = new SequenceFetcher();
\r
280 SequenceI[] retrieved = null;
\r
// l starts as the number of references and later sizes the array t;
// presumably it is reduced for each filtered reference - the statement
// doing so is not visible here, TODO confirm
281 int l = xrfs.length;
\r
282 for (int r = 0; r < xrfs.length; r++)
\r
284 // filter out any irrelevant or irretrievable references
\r
285 if (xrfs[r] == null
\r
286 || ((source != null && !source.equals(xrfs[r]
\r
287 .getSource())) || !sftch.isFetchable(xrfs[r]
\r
297 .println("Attempting to retrieve cross referenced sequences.");
\r
298 DBRefEntry[] t = new DBRefEntry[l];
\r
300 for (int r = 0; r < xrfs.length; r++)
\r
302 if (xrfs[r] != null)
\r
// NOTE(review): xrfs, not t, is handed to the fetcher; whether xrfs is
// reassigned to t beforehand is not visible in this extract - confirm
308 retrieved = sftch.getSequences(xrfs); // problem here is we don't know which of xrfs resulted in which retrieved element
\r
309 } catch (Exception e)
\r
312 .println("Problem whilst retrieving cross references for Sequence : "
\r
313 + seqs[s].getName());
\r
314 e.printStackTrace();
\r
316 if (retrieved != null)
\r
318 for (int rs = 0; rs < retrieved.length; rs++)
\r
320 // TODO: examine each sequence for 'redundancy'
\r
321 jalview.datamodel.DBRefEntry[] dbr = retrieved[rs].getDBRef();
\r
322 if (dbr != null && dbr.length > 0)
\r
324 for (int di = 0; di < dbr.length; di++)
\r
326 // find any entry where we should put in the sequence being cross-referenced into the map
\r
327 jalview.datamodel.Mapping map = dbr[di].getMap();
\r
330 if (map.getTo() != null && map.getMap() != null)
\r
332 // should search the local dataset to find any existing candidates for To !
\r
335 // compare ms with dss and replace with dss in mapping if map is congruent
\r
336 SequenceI ms = map.getTo();
\r
// the map's "to" range is cut out of both the mapped sequence and dss
337 int sf = map.getMap().getToLowest();
\r
338 int st = map.getMap().getToHighest();
\r
339 SequenceI mappedrg = ms.getSubSequence(sf, st);
\r
340 SequenceI loc = dss.getSubSequence(sf, st);
\r
// congruent means: non-empty and identical residue strings over that range
341 if (mappedrg.getLength()>0 && mappedrg.getSequenceAsString().equals(
\r
342 loc.getSequenceAsString()))
\r
345 .println("Mapping updated for retrieved crossreference");
\r
346 // method to update all refs of existing To on retrieved sequence with dss and merge any props on To onto dss.
\r
349 } catch (Exception e)
\r
352 .println("Exception when consolidating Mapped sequence set...");
\r
353 e.printStackTrace(System.err);
\r
// every retrieved sequence is added to the results
359 retrieved[rs].updatePDBIds();
\r
360 rseqs.addElement(retrieved[rs]);
\r
// build the result alignment only when something was collected
367 if (rseqs.size() > 0)
\r
369 SequenceI[] rsqs = new SequenceI[rseqs.size()];
\r
370 rseqs.copyInto(rsqs);
\r
371 ral = new Alignment(rsqs);
\r
372 if (cf != null && cf.getProtMappings() != null)
\r
374 ral.addCodonFrame(cf);
\r
381 * find references to lrfs in the cross-reference set of each sequence in
\r
382 * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
\r
383 * based on source and accession string only - Map and Version are nulled.
\r
389 * @return true if matches were found.
\r
391 private static boolean searchDatasetXrefs(SequenceI sequenceI,
\r
392 boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs,
\r
393 AlignedCodonFrame cf)
\r
395 boolean found = false;
\r
398 for (int i = 0; i < lrfs.length; i++)
\r
400 DBRefEntry xref = new DBRefEntry(lrfs[i]);
\r
401 // add in wildcards
\r
402 xref.setVersion(null);
\r
404 found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
\r
410 * search a given sequence dataset for references matching cross-references to
\r
411 * the given sequence
\r
417 * set of unique sequences
\r
419 * @return true if one or more unique sequences were found and added
\r
421 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
422 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
\r
424 return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
\r
428 * TODO: generalise to different protein classifications Search dataset for
\r
429 * DBRefEntrys matching the given one (xrf) and add the associated sequence to
\r
437 * search all references or only subset
\r
439 * search dna or protein xrefs (if direct=false)
\r
440 * @return true if relationship found and sequence added.
\r
/**
 * Scans dataset for sequences, other than sequenceI and its dataset
 * sequence, that carry a reference matching xrf, and adds each one not
 * already in rseqs. For a newly added sequence, mapped matches may also be
 * registered in the codon frame cf.
 *
 * NOTE(review): this extract has lost its short lines (braces, the
 * direct/dna branch keywords, return statements). Comments below describe
 * only the lines that remain.
 *
 * @param sequenceI
 *          sequence the cross-reference belongs to; never returned as a match
 * @param xrf
 *          reference handed to DBRefUtils.searchRefs as the search key
 * @param dataset
 *          alignment whose sequences are scanned
 * @param rseqs
 *          receives matching sequences (each at most once)
 * @param cf
 *          codon frame receiving mappings; when null no mappings are looked
 *          for
 * @param direct
 *          presumably selects between searching all references of a sequence
 *          and only the findXDbRefs subset - the branch line is not visible
 * @param dna
 *          passed to findXDbRefs when the subset is searched
 * @return true if relationship found and sequence added (the return
 *         statement is not in this extract)
 */
442 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
443 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf,
\r
444 boolean direct, boolean dna)
\r
446 boolean found = false;
\r
// the statement guarded by this null check is not visible in this extract
447 if (dataset == null)
\r
449 if (dataset.getSequences() == null)
\r
451 System.err.println("Empty dataset sequence set - NO VECTOR");
\r
454 Enumeration e = dataset.getSequences().elements();
\r
455 while (e.hasMoreElements())
\r
457 SequenceI nxt = (SequenceI) e.nextElement();
\r
// a dataset member that itself has a dataset sequence only triggers a warning
460 if (nxt.getDatasetSequence() != null)
\r
463 .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
\r
// identity comparison: skip the query sequence and its dataset sequence
465 if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
\r
467 // look for direct or indirect references in common
\r
// poss is assigned inside the calls below but not read in the visible lines
468 DBRefEntry[] poss = null, cands = null;
\r
471 cands = jalview.util.DBRefUtils.searchRefs(poss = nxt
\r
476 cands = jalview.util.DBRefUtils.searchRefs(poss = CrossRef
\r
477 .findXDbRefs(dna, nxt.getDBRef()), xrf);
\r
// each sequence is added to rseqs at most once
481 if (!rseqs.contains(nxt))
\r
483 rseqs.addElement(nxt);
\r
484 boolean foundmap = cf != null; // don't search if we aren't given
\r
485 // a codon map object
\r
486 for (int r = 0; foundmap && r < cands.length; r++)
\r
488 if (cands[r].hasMap())
\r
490 if (cands[r].getMap().getTo() != null
\r
491 && cands[r].getMap().getMap().getFromRatio() != cands[r]
\r
492 .getMap().getMap().getToRatio())
\r
495 // get sense of map correct for adding to product alignment.
\r
498 // map is from dna seq to a protein product
\r
499 cf.addMap(sequenceI, nxt, cands[r].getMap().getMap());
\r
503 // map should be from protein seq to its coding dna
\r
504 cf.addMap(nxt, sequenceI, cands[r].getMap().getMap()
\r
510 // TODO: add mapping between sequences if necessary
\r
522 * precalculate different products that can be found for seqs in dataset and
\r
529 * don't actually build lists - just get types
\r
530 * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
\r
531 * seqs, AlignmentI dataset, boolean fake) { String types[] =
\r
532 * jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
\r
533 * dataset); if (types != null) { System.out.println("Xref Types for:
\r
534 * "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
\r
535 * System.out.println("Type: " + types[t]); SequenceI[] prod =
\r
536 * jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
\r
537 * System.out.println("Found " + ((prod == null) ? "no" : "" +
\r
538 * prod.length) + " products"); if (prod!=null) { for (int p=0; p<prod.length;
\r
539 * p++) { System.out.println("Prod "+p+":
\r
540 * "+prod[p].getDisplayId(true)); } } }
\r
541 * } else { System.out.println("Trying getProducts for
\r
542 * "+al.getSequenceAt(0).getDisplayId(true)); System.out.println("Search DS
\r
543 * Xref for: "+(dna ? "dna" : "prot")); // have a bash at finding the products
\r
544 * amongst all the retrieved sequences. SequenceI[] prod =
\r
545 * jalview.analysis.CrossRef.findXrefSequences(al .getSequencesArray(), dna,
\r
546 * null, ds); System.out.println("Found " + ((prod == null) ? "no" : "" +
\r
547 * prod.length) + " products"); if (prod!=null) { // select non-equivalent
\r
548 * sequences from dataset list for (int p=0; p<prod.length; p++) {
\r
549 * System.out.println("Prod "+p+": "+prod[p].getDisplayId(true)); } }
\r