1 package jalview.analysis;
\r
3 import java.util.Enumeration;
\r
4 import java.util.Vector;
\r
5 import java.util.Hashtable;
\r
7 import jalview.datamodel.AlignedCodonFrame;
\r
8 import jalview.datamodel.Alignment;
\r
9 import jalview.datamodel.AlignmentI;
\r
10 import jalview.datamodel.DBRefSource;
\r
11 import jalview.datamodel.DBRefEntry;
\r
12 import jalview.datamodel.Sequence;
\r
13 import jalview.datamodel.SequenceI;
\r
14 import jalview.ws.ASequenceFetcher;
\r
15 import jalview.ws.SequenceFetcher;
\r
18 * Functions for cross-referencing sequence databases. The caller must first specify
\r
19 * whether it is cross-referencing from protein or from DNA (set dna==true for DNA).
\r
24 public class CrossRef
\r
27 * get the DNA or protein references for a protein or dna sequence
\r
33 public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)
\r
37 rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);
\r
41 rfs = jalview.util.DBRefUtils.selectRefs(rfs,
\r
42 DBRefSource.DNACODINGDBS); // could attempt to find other cross refs and return here - ie PDB xrefs (not dna, not protein seq)
\r
48 public static Hashtable classifyDbRefs(DBRefEntry[] rfs)
\r
50 Hashtable classes = new Hashtable();
\r
51 classes.put(DBRefSource.PROTEINDBS, jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS));
\r
52 classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils.selectRefs(rfs,
\r
53 DBRefSource.DNACODINGDBS));
\r
54 classes.put(DBRefSource.DOMAINDBS, jalview.util.DBRefUtils.selectRefs(rfs,
\r
55 DBRefSource.DOMAINDBS));
\r
56 // classes.put(OTHER, )
\r
62 * true if seqs are DNA seqs
\r
64 * @return a list of sequence database cross reference source types
\r
66 public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
\r
68 return findSequenceXrefTypes(dna, seqs, null);
\r
71 * Indirect references are references from other sequences from the dataset to any of the direct
\r
72 * DBRefEntrys on the given sequences.
\r
74 * true if seqs are DNA seqs
\r
76 * @return a list of sequence database cross reference source types
\r
78 public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs, AlignmentI dataset)
\r
80 String[] dbrefs = null;
\r
81 Vector refs = new Vector();
\r
82 for (int s = 0; s < seqs.length; s++)
\r
84 SequenceI dss = seqs[s];
\r
85 while (dss.getDatasetSequence()!=null)
\r
87 dss = dss.getDatasetSequence();
\r
89 DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef());
\r
90 for (int r = 0; rfs != null && r < rfs.length; r++)
\r
92 if (!refs.contains(rfs[r].getSource()))
\r
94 refs.addElement(rfs[r].getSource());
\r
99 // search for references to this sequence's direct references.
\r
100 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef());
\r
101 Vector rseqs = new Vector();
\r
102 CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs, null); // don't need to specify codon frame for mapping here
\r
103 Enumeration lr = rseqs.elements();
\r
104 while (lr.hasMoreElements())
\r
106 SequenceI rs = (SequenceI) lr.nextElement();
\r
107 DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
\r
108 for (int r=0; rfs != null && r < rfs.length; r++)
\r
110 if (!refs.contains(rfs[r].getSource()))
\r
112 refs.addElement(rfs[r].getSource());
\r
118 if (refs.size() > 0)
\r
120 dbrefs = new String[refs.size()];
\r
121 refs.copyInto(dbrefs);
\r
127 * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross
\r
128 * reference if (!refs.contains(rfs[r].getSource())) {
\r
129 * refs.addElement(rfs[r].getSource()); } } }
\r
131 public static boolean hasCdnaMap(SequenceI[] seqs)
\r
133 String[] reftypes = findSequenceXrefTypes(false, seqs);
\r
134 for (int s = 0; s < reftypes.length; s++)
\r
136 if (reftypes.equals(DBRefSource.EMBLCDS))
\r
145 public static SequenceI[] getCdnaMap(SequenceI[] seqs)
\r
147 Vector cseqs = new Vector();
\r
148 for (int s = 0; s < seqs.length; s++)
\r
150 DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef());
\r
151 for (int c = 0; c < cdna.length; c++)
\r
153 if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
\r
155 // retrieve CDS dataset sequences
\r
156 // need global dataset sequence retriever/resolver to reuse refs
\r
157 // and construct Mapping entry.
\r
158 // insert gaps in CDS according to peptide gaps.
\r
159 // add gapped sequence to cseqs
\r
163 if (cseqs.size() > 0)
\r
165 SequenceI[] rsqs = new SequenceI[cseqs.size()];
\r
166 cseqs.copyInto(rsqs);
\r
179 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
182 return findXrefSequences(seqs, dna, source, null);
\r
191 * alignment to search for product sequences.
\r
192 * @return products (as dataset sequences)
\r
// Builds an Alignment of the sequences cross-referenced from seqs. Products are
// gathered into rseqs from three visible sources: sequences attached to a
// DBRefEntry mapping, sequences found in dataset via searchDataset or
// searchDatasetXrefs, and sequences retrieved through a SequenceFetcher.
// Mappings with unequal from/to ratios are recorded in the AlignedCodonFrame cf,
// which is attached to the result when cf.getProtMappings() is non-null.
// NOTE(review): several guarding statements of this method are not visible in
// this view - confirm the exact conditions under which each stage runs.
194 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
195 String source, AlignmentI dataset)
\r
197 Vector rseqs = new Vector();
\r
198 Alignment ral = null;
\r
199 AlignedCodonFrame cf=new AlignedCodonFrame(0); // nominal width
\r
200 for (int s = 0; s < seqs.length; s++)
\r
// start from seqs[s] and climb to the sequence that has no dataset sequence
202 SequenceI dss = seqs[s];
\r
203 while (dss.getDatasetSequence()!=null)
\r
205 dss = dss.getDatasetSequence();
\r
207 boolean found = false;
\r
208 DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef());
\r
// no cross-references of the requested kind on dss and a dataset is available:
// search the dataset using the references selected with the inverted flag
209 if ((xrfs == null || xrfs.length == 0) && dataset!=null)
\r
211 System.out.println("Attempting to find ds Xrefs refs.");
\r
212 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less ambiguous would be a 'find primary dbRefEntry' method.
\r
213 // filter for desired source xref here
\r
214 found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset, rseqs, cf);
\r
216 for (int r = 0; xrfs!=null && r < xrfs.length; r++)
\r
// a non-null source restricts the references considered - presumably
// non-matching references are skipped here (TODO confirm)
218 if (source != null && !source.equals(xrfs[r].getSource()))
\r
220 if (xrfs[r].hasMap())
\r
// the mapping carries its target sequence: copy it and add it as a product
222 if (xrfs[r].getMap().getTo() != null)
\r
224 Sequence rsq = new Sequence(xrfs[r].getMap().getTo());
\r
225 rseqs.addElement(rsq);
\r
// only mappings whose from and to ratios differ are added to the codon frame
226 if (xrfs[r].getMap().getMap().getFromRatio()!=xrfs[r].getMap().getMap().getToRatio())
\r
228 // get sense of map correct for adding to product alignment.
\r
231 // map is from dna seq to a protein product
\r
232 cf.addMap(dss, rsq, xrfs[r].getMap().getMap());
\r
234 // map should be from protein seq to its coding dna
\r
235 cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());
\r
243 // do a bit more work - search for sequences with references matching
\r
244 // xrefs on this sequence.
\r
245 if (dataset != null)
\r
247 found = searchDataset(dss, xrfs[r], dataset, rseqs, cf);
\r
249 xrfs[r] = null; // we've recovered seqs for this one.
\r
255 if (xrfs != null && xrfs.length > 0)
\r
257 // Try and get the sequence reference...
\r
259 * Ideal world - we ask for a sequence fetcher implementation here if
\r
260 * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
\r
262 ASequenceFetcher sftch = new SequenceFetcher();
\r
263 SequenceI[] retrieved = null;
\r
// l sizes the compacted array t below - presumably it is decremented for each
// reference rejected by the filter (TODO confirm, the decrement is not visible)
264 int l = xrfs.length;
\r
265 for (int r = 0; r < xrfs.length; r++)
\r
267 // filter out any irrelevant or irretrievable references
\r
268 if (xrfs[r]==null || ((source != null && !source.equals(xrfs[r].getSource()))
\r
269 || !sftch.isFetchable(xrfs[r].getSource())))
\r
278 .println("Attempting to retrieve cross referenced sequences.");
\r
279 DBRefEntry[] t = new DBRefEntry[l];
\r
281 for (int r = 0; r < xrfs.length; r++)
\r
283 if (xrfs[r] != null)
\r
// NOTE(review): xrfs is passed to the fetcher, not t - confirm that xrfs is
// reassigned to the compacted array before this call
289 retrieved = sftch.getSequences(xrfs);
\r
290 } catch (Exception e)
\r
293 .println("Problem whilst retrieving cross references for Sequence : "
\r
294 + seqs[s].getName());
\r
295 e.printStackTrace();
\r
// a failed retrieval leaves retrieved null, so nothing is added for it
297 if (retrieved != null)
\r
299 for (int rs = 0; rs < retrieved.length; rs++)
\r
301 rseqs.addElement(retrieved[rs]);
\r
// package everything collected into an Alignment; ral stays null when rseqs is empty
308 if (rseqs.size() > 0)
\r
310 SequenceI[] rsqs = new SequenceI[rseqs.size()];
\r
311 rseqs.copyInto(rsqs);
\r
312 ral = new Alignment(rsqs);
\r
313 if (cf!=null && cf.getProtMappings()!=null)
\r
315 ral.addCodonFrame(cf);
\r
322 * find references to lrfs in the cross-reference set of each sequence in dataset (that is not equal to sequenceI)
\r
323 * Identifies matching DBRefEntry based on source and accession string only - Map and Version are nulled.
\r
328 * @return true if matches were found.
\r
330 private static boolean searchDatasetXrefs(SequenceI sequenceI, boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
\r
332 boolean found=false;
\r
335 for (int i=0;i<lrfs.length; i++)
\r
337 DBRefEntry xref = new DBRefEntry(lrfs[i]);
\r
338 // add in wildcards
\r
339 xref.setVersion(null);
\r
341 found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
\r
348 * search a given sequence dataset for references matching cross-references to
\r
349 * the given sequence
\r
356 * @return true if sequences were found and added
\r
358 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
359 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
\r
361 return searchDataset(sequenceI, xrf,
\r
362 dataset, rseqs, cf, true, false);
\r
365 * TODO: generalise to different protein classifications
\r
366 * Search dataset for DBRefEntrys matching the given one (xrf) and add
\r
367 * the associated sequence to rseq.
\r
372 * @param direct - search all references or only subset
\r
373 * @param dna search dna or protein xrefs (if direct=false)
\r
374 * @return true if relationship found and sequence added.
\r
// Scans dataset.getSequences() for sequences other than sequenceI (and other
// than its dataset sequence) whose database references match xrf according to
// DBRefUtils.searchRefs. Matching sequences are appended to rseqs and, when cf
// is non-null, qualifying mappings on the matching references are added to cf.
// NOTE(review): the branch and return statements of this method are not all
// visible in this view - confirm them before relying on these notes.
376 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
377 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf, boolean direct, boolean dna)
\r
379 boolean found = false;
\r
// a null dataset cannot be searched - presumably returns early (TODO confirm)
380 if (dataset==null)
\r
382 if (dataset.getSequences()==null)
\r
384 System.err.println("Empty dataset sequence set - NO VECTOR");
\r
387 Enumeration e = dataset.getSequences().elements();
\r
388 while (e.hasMoreElements())
\r
390 SequenceI nxt = (SequenceI) e.nextElement();
\r
// a member that itself has a dataset sequence triggers the warning below
393 if (nxt.getDatasetSequence() != null)
\r
396 .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
\r
// never match the query sequence against itself or its own dataset sequence
398 if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
\r
// poss holds the references that were searched, cands the ones matching xrf
400 DBRefEntry[] poss=null, cands=null;
\r
// two alternative searches follow; the second first narrows the references of
// nxt with findXDbRefs(dna, ...) - presumably selected by the direct flag
// (TODO confirm)
403 cands = jalview.util.DBRefUtils.searchRefs(poss=nxt
\r
406 cands = jalview.util.DBRefUtils.searchRefs(
\r
407 poss=CrossRef.findXDbRefs(dna, nxt.getDBRef()), xrf);
\r
411 rseqs.addElement(nxt);
\r
412 boolean foundmap= cf!=null; // don't search if we aren't given a codon map object
\r
413 for (int r=0; foundmap && r<cands.length; r++)
\r
415 if (cands[r].hasMap())
\r
// only mappings with a target sequence and unequal from/to ratios are added
417 if (cands[r].getMap().getTo()!=null && cands[r].getMap().getMap().getFromRatio()!=cands[r].getMap().getMap().getToRatio())
\r
420 // get sense of map correct for adding to product alignment.
\r
423 // map is from dna seq to a protein product
\r
424 cf.addMap(sequenceI, nxt, cands[r].getMap().getMap());
\r
426 // map should be from protein seq to its coding dna
\r
427 cf.addMap(nxt, sequenceI, cands[r].getMap().getMap().getInverse());
\r
432 // TODO: add mapping between sequences if necessary
\r
443 * precalculate different products that can be found for seqs in dataset
\r
448 * @param fake - don't actually build lists - just get types
\r
450 public static Object[] buildXProductsList(boolean dna, SequenceI[] seqs, AlignmentI dataset, boolean fake)
\r
452 String types[] = jalview.analysis.CrossRef.findSequenceXrefTypes(
\r
453 dna, seqs, dataset);
\r
456 System.out.println("Xref Types for: "+(dna ? "dna" : "prot"));
\r
457 for (int t = 0; t < types.length; t++)
\r
459 System.out.println("Type: " + types[t]);
\r
460 SequenceI[] prod =
\r
461 jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
\r
462 System.out.println("Found "
\r
463 + ((prod == null) ? "no" : "" + prod.length)
\r
467 for (int p=0; p<prod.length; p++)
\r
469 System.out.println("Prod "+p+": "+prod[p].getDisplayId(true));
\r
475 System.out.println("Trying getProducts for "+al.getSequenceAt(0).getDisplayId(true));
\r
476 System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot"));
\r
477 // have a bash at finding the products amongst all the retrieved sequences.
\r
478 SequenceI[] prod = jalview.analysis.CrossRef.findXrefSequences(al
\r
479 .getSequencesArray(), dna, null, ds);
\r
480 System.out.println("Found "
\r
481 + ((prod == null) ? "no" : "" + prod.length)
\r
485 // select non-equivalent sequences from dataset list
\r
486 for (int p=0; p<prod.length; p++)
\r
488 System.out.println("Prod "+p+": "+prod[p].getDisplayId(true));
\r