2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.4)
\r
3 * Copyright (C) 2008 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
\r
5 * This program is free software; you can redistribute it and/or
\r
6 * modify it under the terms of the GNU General Public License
\r
7 * as published by the Free Software Foundation; either version 2
\r
8 * of the License, or (at your option) any later version.
\r
10 * This program is distributed in the hope that it will be useful,
\r
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
13 * GNU General Public License for more details.
\r
15 * You should have received a copy of the GNU General Public License
\r
16 * along with this program; if not, write to the Free Software
\r
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
\r
19 package jalview.analysis;
\r
21 import java.util.Enumeration;
\r
22 import java.util.Vector;
\r
23 import java.util.Hashtable;
\r
25 import jalview.datamodel.AlignedCodonFrame;
\r
26 import jalview.datamodel.Alignment;
\r
27 import jalview.datamodel.AlignmentI;
\r
28 import jalview.datamodel.DBRefSource;
\r
29 import jalview.datamodel.DBRefEntry;
\r
30 import jalview.datamodel.Sequence;
\r
31 import jalview.datamodel.SequenceI;
\r
32 import jalview.ws.SequenceFetcher;
\r
33 import jalview.ws.seqfetcher.ASequenceFetcher;
\r
36 * Functions for cross-referencing sequence databases. user must first specify
\r
37 * if cross-referencing from protein or dna (set dna==true)
\r
42 public class CrossRef
\r
45 * get the DNA or protein references for a protein or dna sequence
\r
51 public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)
\r
55 rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);
\r
59 rfs = jalview.util.DBRefUtils.selectRefs(rfs,
\r
60 DBRefSource.DNACODINGDBS); // could attempt to find other cross
\r
61 // refs and return here - ie PDB xrefs
\r
62 // (not dna, not protein seq)
\r
67 public static Hashtable classifyDbRefs(DBRefEntry[] rfs)
\r
69 Hashtable classes = new Hashtable();
\r
70 classes.put(DBRefSource.PROTEINDBS, jalview.util.DBRefUtils.selectRefs(
\r
71 rfs, DBRefSource.PROTEINDBS));
\r
72 classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils
\r
73 .selectRefs(rfs, DBRefSource.DNACODINGDBS));
\r
74 classes.put(DBRefSource.DOMAINDBS, jalview.util.DBRefUtils.selectRefs(
\r
75 rfs, DBRefSource.DOMAINDBS));
\r
76 // classes.put(OTHER, )
\r
82 * true if seqs are DNA seqs
\r
84 * @return a list of sequence database cross reference source types
\r
86 public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
\r
88 return findSequenceXrefTypes(dna, seqs, null);
\r
92 * Indirect references are references from other sequences from the dataset to
\r
93 * any of the direct DBRefEntrys on the given sequences.
\r
96 * true if seqs are DNA seqs
\r
98 * @return a list of sequence database cross reference source types
\r
100 public static String[] findSequenceXrefTypes(boolean dna,
\r
101 SequenceI[] seqs, AlignmentI dataset)
\r
103 String[] dbrefs = null;
\r
104 Vector refs = new Vector();
\r
105 for (int s = 0; s < seqs.length; s++)
\r
107 SequenceI dss = seqs[s];
\r
108 while (dss.getDatasetSequence() != null)
\r
110 dss = dss.getDatasetSequence();
\r
112 DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef());
\r
113 for (int r = 0; rfs != null && r < rfs.length; r++)
\r
115 if (!refs.contains(rfs[r].getSource()))
\r
117 refs.addElement(rfs[r].getSource());
\r
120 if (dataset != null)
\r
122 // search for references to this sequence's direct references.
\r
123 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef());
\r
124 Vector rseqs = new Vector();
\r
125 CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs,
\r
126 null); // don't need to specify codon frame for mapping here
\r
127 Enumeration lr = rseqs.elements();
\r
128 while (lr.hasMoreElements())
\r
130 SequenceI rs = (SequenceI) lr.nextElement();
\r
131 DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
\r
132 for (int r = 0; rfs != null && r < rfs.length; r++)
\r
134 if (!refs.contains(rfs[r].getSource()))
\r
136 refs.addElement(rfs[r].getSource());
\r
142 if (refs.size() > 0)
\r
144 dbrefs = new String[refs.size()];
\r
145 refs.copyInto(dbrefs);
\r
151 * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross
\r
152 * reference if (!refs.contains(rfs[r].getSource())) {
\r
153 * refs.addElement(rfs[r].getSource()); } } }
\r
155 public static boolean hasCdnaMap(SequenceI[] seqs)
\r
157 String[] reftypes = findSequenceXrefTypes(false, seqs);
\r
158 for (int s = 0; s < reftypes.length; s++)
\r
160 if (reftypes.equals(DBRefSource.EMBLCDS))
\r
169 public static SequenceI[] getCdnaMap(SequenceI[] seqs)
\r
171 Vector cseqs = new Vector();
\r
172 for (int s = 0; s < seqs.length; s++)
\r
174 DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef());
\r
175 for (int c = 0; c < cdna.length; c++)
\r
177 if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
\r
180 .println("TODO: unimplemented sequence retrieval for coding region sequence.");
\r
181 // TODO: retrieve CDS dataset sequences
\r
182 // need global dataset sequence retriever/resolver to reuse refs
\r
183 // and construct Mapping entry.
\r
184 // insert gaps in CDS according to peptide gaps.
\r
185 // add gapped sequence to cseqs
\r
189 if (cseqs.size() > 0)
\r
191 SequenceI[] rsqs = new SequenceI[cseqs.size()];
\r
192 cseqs.copyInto(rsqs);
\r
205 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
208 return findXrefSequences(seqs, dna, source, null);
\r
217 * alignment to search for product sequences.
\r
218 * @return products (as dataset sequences)
\r
220 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
221 String source, AlignmentI dataset)
\r
223 Vector rseqs = new Vector();
\r
224 Alignment ral = null;
\r
225 AlignedCodonFrame cf = new AlignedCodonFrame(0); // nominal width
\r
226 for (int s = 0; s < seqs.length; s++)
\r
228 SequenceI dss = seqs[s];
\r
229 while (dss.getDatasetSequence() != null)
\r
231 dss = dss.getDatasetSequence();
\r
233 boolean found = false;
\r
234 DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef());
\r
235 if ((xrfs == null || xrfs.length == 0) && dataset != null)
\r
237 System.out.println("Attempting to find ds Xrefs refs.");
\r
238 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less
\r
246 // filter for desired source xref here
\r
247 found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
\r
250 for (int r = 0; xrfs != null && r < xrfs.length; r++)
\r
252 if (source != null && !source.equals(xrfs[r].getSource()))
\r
254 if (xrfs[r].hasMap())
\r
256 if (xrfs[r].getMap().getTo() != null)
\r
258 Sequence rsq = new Sequence(xrfs[r].getMap().getTo());
\r
259 rseqs.addElement(rsq);
\r
260 if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r]
\r
261 .getMap().getMap().getToRatio())
\r
263 // get sense of map correct for adding to product alignment.
\r
266 // map is from dna seq to a protein product
\r
267 cf.addMap(dss, rsq, xrfs[r].getMap().getMap());
\r
271 // map should be from protein seq to its coding dna
\r
272 cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());
\r
280 // do a bit more work - search for sequences with references matching
\r
281 // xrefs on this sequence.
\r
282 if (dataset != null)
\r
284 found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf);
\r
286 xrfs[r] = null; // we've recovered seqs for this one.
\r
292 if (xrfs != null && xrfs.length > 0)
\r
294 // Try and get the sequence reference...
\r
296 * Ideal world - we ask for a sequence fetcher implementation here if
\r
297 * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
\r
299 ASequenceFetcher sftch = new SequenceFetcher();
\r
300 SequenceI[] retrieved = null;
\r
301 int l = xrfs.length;
\r
302 for (int r = 0; r < xrfs.length; r++)
\r
304 // filter out any irrelevant or irretrievable references
\r
305 if (xrfs[r] == null
\r
306 || ((source != null && !source.equals(xrfs[r]
\r
307 .getSource())) || !sftch.isFetchable(xrfs[r]
\r
317 .println("Attempting to retrieve cross referenced sequences.");
\r
318 DBRefEntry[] t = new DBRefEntry[l];
\r
320 for (int r = 0; r < xrfs.length; r++)
\r
322 if (xrfs[r] != null)
\r
328 retrieved = sftch.getSequences(xrfs); // problem here is we don't
\r
329 // know which of xrfs
\r
330 // resulted in which
\r
331 // retrieved element
\r
332 } catch (Exception e)
\r
335 .println("Problem whilst retrieving cross references for Sequence : "
\r
336 + seqs[s].getName());
\r
337 e.printStackTrace();
\r
339 if (retrieved != null)
\r
341 for (int rs = 0; rs < retrieved.length; rs++)
\r
343 // TODO: examine each sequence for 'redundancy'
\r
344 jalview.datamodel.DBRefEntry[] dbr = retrieved[rs]
\r
346 if (dbr != null && dbr.length > 0)
\r
348 for (int di = 0; di < dbr.length; di++)
\r
350 // find any entry where we should put in the sequence being
\r
351 // cross-referenced into the map
\r
352 jalview.datamodel.Mapping map = dbr[di].getMap();
\r
355 if (map.getTo() != null && map.getMap() != null)
\r
357 // should search the local dataset to find any existing
\r
358 // candidates for To !
\r
361 // compare ms with dss and replace with dss in mapping
\r
362 // if map is congruent
\r
363 SequenceI ms = map.getTo();
\r
364 int sf = map.getMap().getToLowest();
\r
365 int st = map.getMap().getToHighest();
\r
366 SequenceI mappedrg = ms.getSubSequence(sf, st);
\r
367 SequenceI loc = dss.getSubSequence(sf, st);
\r
368 if (mappedrg.getLength() > 0
\r
369 && mappedrg.getSequenceAsString().equals(
\r
370 loc.getSequenceAsString()))
\r
373 .println("Mapping updated for retrieved crossreference");
\r
374 // method to update all refs of existing To on
\r
375 // retrieved sequence with dss and merge any props
\r
379 } catch (Exception e)
\r
382 .println("Exception when consolidating Mapped sequence set...");
\r
383 e.printStackTrace(System.err);
\r
389 retrieved[rs].updatePDBIds();
\r
390 rseqs.addElement(retrieved[rs]);
\r
397 if (rseqs.size() > 0)
\r
399 SequenceI[] rsqs = new SequenceI[rseqs.size()];
\r
400 rseqs.copyInto(rsqs);
\r
401 ral = new Alignment(rsqs);
\r
402 if (cf != null && cf.getProtMappings() != null)
\r
404 ral.addCodonFrame(cf);
\r
411 * find references to lrfs in the cross-reference set of each sequence in
\r
412 * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
\r
413 * based on source and accession string only - Map and Version are nulled.
\r
419 * @return true if matches were found.
\r
421 private static boolean searchDatasetXrefs(SequenceI sequenceI,
\r
422 boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs,
\r
423 AlignedCodonFrame cf)
\r
425 boolean found = false;
\r
428 for (int i = 0; i < lrfs.length; i++)
\r
430 DBRefEntry xref = new DBRefEntry(lrfs[i]);
\r
431 // add in wildcards
\r
432 xref.setVersion(null);
\r
434 found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
\r
440 * search a given sequence dataset for references matching cross-references to
\r
441 * the given sequence
\r
447 * set of unique sequences
\r
449 * @return true if one or more unique sequences were found and added
\r
451 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
452 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
\r
454 return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
\r
458 * TODO: generalise to different protein classifications Search dataset for
\r
459 * DBRefEntrys matching the given one (xrf) and add the associated sequence to
\r
467 * search all references or only subset
\r
469 * search dna or protein xrefs (if direct=false)
\r
470 * @return true if relationship found and sequence added.
\r
472 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
473 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf,
\r
474 boolean direct, boolean dna)
\r
476 boolean found = false;
\r
477 SequenceI[] typer = new SequenceI[1];
\r
478 if (dataset == null)
\r
480 if (dataset.getSequences() == null)
\r
482 System.err.println("Empty dataset sequence set - NO VECTOR");
\r
485 Enumeration e = dataset.getSequences().elements();
\r
486 while (e.hasMoreElements())
\r
488 SequenceI nxt = (SequenceI) e.nextElement();
\r
491 if (nxt.getDatasetSequence() != null)
\r
494 .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
\r
496 if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
\r
498 // check if this is the correct sequence type
\r
501 boolean isDna = jalview.util.Comparison.isNucleotide(typer);
\r
502 if ((direct && isDna == dna) || (!direct && isDna != dna))
\r
504 // skip this sequence because it is same molecule type
\r
509 // look for direct or indirect references in common
\r
510 DBRefEntry[] poss = null, cands = null;
\r
513 cands = jalview.util.DBRefUtils.searchRefs(poss = nxt
\r
518 cands = jalview.util.DBRefUtils.searchRefs(poss = CrossRef
\r
519 .findXDbRefs(dna, nxt.getDBRef()), xrf);
\r
523 if (!rseqs.contains(nxt))
\r
525 rseqs.addElement(nxt);
\r
526 boolean foundmap = cf != null; // don't search if we aren't given
\r
527 // a codon map object
\r
528 for (int r = 0; foundmap && r < cands.length; r++)
\r
530 if (cands[r].hasMap())
\r
532 if (cands[r].getMap().getTo() != null
\r
533 && cands[r].getMap().getMap().getFromRatio() != cands[r]
\r
534 .getMap().getMap().getToRatio())
\r
537 // get sense of map correct for adding to product alignment.
\r
540 // map is from dna seq to a protein product
\r
541 cf.addMap(sequenceI, nxt, cands[r].getMap().getMap());
\r
545 // map should be from protein seq to its coding dna
\r
546 cf.addMap(nxt, sequenceI, cands[r].getMap().getMap()
\r
552 // TODO: add mapping between sequences if necessary
\r
564 * precalculate different products that can be found for seqs in dataset and
\r
571 * don't actually build lists - just get types
\r
572 * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
\r
573 * seqs, AlignmentI dataset, boolean fake) { String types[] =
\r
574 * jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
\r
575 * dataset); if (types != null) { System.out.println("Xref Types for:
\r
576 * "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
\r
577 * System.out.println("Type: " + types[t]); SequenceI[] prod =
\r
578 * jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
\r
579 * System.out.println("Found " + ((prod == null) ? "no" : "" +
\r
580 * prod.length) + " products"); if (prod!=null) { for (int p=0; p<prod.length;
\r
581 * p++) { System.out.println("Prod "+p+":
\r
582 * "+prod[p].getDisplayId(true)); } } } } else {
\r
583 * System.out.println("Trying getProducts for
\r
584 * "+al.getSequenceAt(0).getDisplayId(true));
\r
585 * System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot")); //
\r
586 * have a bash at finding the products amongst all the retrieved
\r
587 * sequences. SequenceI[] prod =
\r
588 * jalview.analysis.CrossRef.findXrefSequences(al
\r
589 * .getSequencesArray(), dna, null, ds); System.out.println("Found " +
\r
590 * ((prod == null) ? "no" : "" + prod.length) + " products"); if
\r
591 * (prod!=null) { // select non-equivalent sequences from dataset list
\r
592 * for (int p=0; p<prod.length; p++) { System.out.println("Prod
\r
593 * "+p+": "+prod[p].getDisplayId(true)); } } } }
\r