2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.7)
\r
3 * Copyright (C) 2011 J Procter, AM Waterhouse, J Engelhardt, LM Lui, G Barton, M Clamp, S Searle
\r
5 * This file is part of Jalview.
\r
7 * Jalview is free software: you can redistribute it and/or
\r
8 * modify it under the terms of the GNU General Public License
\r
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
\r
11 * Jalview is distributed in the hope that it will be useful, but
\r
12 * WITHOUT ANY WARRANTY; without even the implied warranty
\r
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
\r
14 * PURPOSE. See the GNU General Public License for more details.
\r
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
\r
18 package jalview.analysis;
\r
20 import java.util.Enumeration;
\r
21 import java.util.Vector;
\r
22 import java.util.Hashtable;
\r
24 import jalview.datamodel.AlignedCodonFrame;
\r
25 import jalview.datamodel.Alignment;
\r
26 import jalview.datamodel.AlignmentI;
\r
27 import jalview.datamodel.DBRefSource;
\r
28 import jalview.datamodel.DBRefEntry;
\r
29 import jalview.datamodel.Sequence;
\r
30 import jalview.datamodel.SequenceI;
\r
31 import jalview.ws.SequenceFetcher;
\r
32 import jalview.ws.seqfetcher.ASequenceFetcher;
\r
35 * Functions for cross-referencing sequence databases. user must first specify
\r
36 * if cross-referencing from protein or dna (set dna==true)
\r
41 public class CrossRef
\r
44 * get the DNA or protein references for a protein or dna sequence
\r
50 public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)
\r
54 rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);
\r
58 rfs = jalview.util.DBRefUtils.selectRefs(rfs,
\r
59 DBRefSource.DNACODINGDBS); // could attempt to find other cross
\r
60 // refs and return here - ie PDB xrefs
\r
61 // (not dna, not protein seq)
\r
66 public static Hashtable classifyDbRefs(DBRefEntry[] rfs)
\r
68 Hashtable classes = new Hashtable();
\r
69 classes.put(DBRefSource.PROTEINDBS,
\r
70 jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS));
\r
71 classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils
\r
72 .selectRefs(rfs, DBRefSource.DNACODINGDBS));
\r
73 classes.put(DBRefSource.DOMAINDBS,
\r
74 jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DOMAINDBS));
\r
75 // classes.put(OTHER, )
\r
81 * true if seqs are DNA seqs
\r
83 * @return a list of sequence database cross reference source types
\r
85 public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
\r
87 return findSequenceXrefTypes(dna, seqs, null);
\r
91 * Indirect references are references from other sequences from the dataset to
\r
92 * any of the direct DBRefEntrys on the given sequences.
\r
95 * true if seqs are DNA seqs
\r
97 * @return a list of sequence database cross reference source types
\r
99 public static String[] findSequenceXrefTypes(boolean dna,
\r
100 SequenceI[] seqs, AlignmentI dataset)
\r
102 String[] dbrefs = null;
\r
103 Vector refs = new Vector();
\r
104 for (int s = 0; s < seqs.length; s++)
\r
106 if (seqs[s] != null)
\r
109 SequenceI dss = seqs[s];
\r
110 while (dss.getDatasetSequence() != null)
\r
112 dss = dss.getDatasetSequence();
\r
114 DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef());
\r
115 for (int r = 0; rfs != null && r < rfs.length; r++)
\r
117 if (!refs.contains(rfs[r].getSource()))
\r
119 refs.addElement(rfs[r].getSource());
\r
122 if (dataset != null)
\r
124 // search for references to this sequence's direct references.
\r
125 DBRefEntry[] lrfs = CrossRef
\r
126 .findXDbRefs(!dna, seqs[s].getDBRef());
\r
127 Vector rseqs = new Vector();
\r
128 CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs,
\r
129 null); // don't need to specify codon frame for mapping here
\r
130 Enumeration lr = rseqs.elements();
\r
131 while (lr.hasMoreElements())
\r
133 SequenceI rs = (SequenceI) lr.nextElement();
\r
134 DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
\r
135 for (int r = 0; rfs != null && r < rfs.length; r++)
\r
137 if (!refs.contains(rfs[r].getSource()))
\r
139 refs.addElement(rfs[r].getSource());
\r
146 if (refs.size() > 0)
\r
148 dbrefs = new String[refs.size()];
\r
149 refs.copyInto(dbrefs);
\r
155 * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross
\r
156 * reference if (!refs.contains(rfs[r].getSource())) {
\r
157 * refs.addElement(rfs[r].getSource()); } } }
\r
159 public static boolean hasCdnaMap(SequenceI[] seqs)
\r
161 String[] reftypes = findSequenceXrefTypes(false, seqs);
\r
162 for (int s = 0; s < reftypes.length; s++)
\r
164 if (reftypes.equals(DBRefSource.EMBLCDS))
\r
173 public static SequenceI[] getCdnaMap(SequenceI[] seqs)
\r
175 Vector cseqs = new Vector();
\r
176 for (int s = 0; s < seqs.length; s++)
\r
178 DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef());
\r
179 for (int c = 0; c < cdna.length; c++)
\r
181 if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
\r
184 .println("TODO: unimplemented sequence retrieval for coding region sequence.");
\r
185 // TODO: retrieve CDS dataset sequences
\r
186 // need global dataset sequence retriever/resolver to reuse refs
\r
187 // and construct Mapping entry.
\r
188 // insert gaps in CDS according to peptide gaps.
\r
189 // add gapped sequence to cseqs
\r
193 if (cseqs.size() > 0)
\r
195 SequenceI[] rsqs = new SequenceI[cseqs.size()];
\r
196 cseqs.copyInto(rsqs);
\r
209 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
212 return findXrefSequences(seqs, dna, source, null);
\r
221 * alignment to search for product sequences.
\r
222 * @return products (as dataset sequences)
\r
224 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
225 String source, AlignmentI dataset)
\r
227 Vector rseqs = new Vector();
\r
228 Alignment ral = null;
\r
229 AlignedCodonFrame cf = new AlignedCodonFrame(0); // nominal width
\r
230 for (int s = 0; s < seqs.length; s++)
\r
232 SequenceI dss = seqs[s];
\r
233 while (dss.getDatasetSequence() != null)
\r
235 dss = dss.getDatasetSequence();
\r
237 boolean found = false;
\r
238 DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef());
\r
239 if ((xrfs == null || xrfs.length == 0) && dataset != null)
\r
241 System.out.println("Attempting to find ds Xrefs refs.");
\r
242 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less
\r
250 // filter for desired source xref here
\r
251 found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
\r
254 for (int r = 0; xrfs != null && r < xrfs.length; r++)
\r
256 if (source != null && !source.equals(xrfs[r].getSource()))
\r
258 if (xrfs[r].hasMap())
\r
260 if (xrfs[r].getMap().getTo() != null)
\r
262 Sequence rsq = new Sequence(xrfs[r].getMap().getTo());
\r
263 rseqs.addElement(rsq);
\r
264 if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r]
\r
265 .getMap().getMap().getToRatio())
\r
267 // get sense of map correct for adding to product alignment.
\r
270 // map is from dna seq to a protein product
\r
271 cf.addMap(dss, rsq, xrfs[r].getMap().getMap());
\r
275 // map should be from protein seq to its coding dna
\r
276 cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());
\r
284 // do a bit more work - search for sequences with references matching
\r
285 // xrefs on this sequence.
\r
286 if (dataset != null)
\r
288 found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf); // ,false,!dna);
\r
290 xrfs[r] = null; // we've recovered seqs for this one.
\r
296 if (xrfs != null && xrfs.length > 0)
\r
298 // Try and get the sequence reference...
\r
300 * Ideal world - we ask for a sequence fetcher implementation here if
\r
301 * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
\r
303 ASequenceFetcher sftch = new SequenceFetcher();
\r
304 SequenceI[] retrieved = null;
\r
305 int l = xrfs.length;
\r
306 for (int r = 0; r < xrfs.length; r++)
\r
308 // filter out any irrelevant or irretrievable references
\r
309 if (xrfs[r] == null
\r
310 || ((source != null && !source.equals(xrfs[r]
\r
311 .getSource())) || !sftch.isFetchable(xrfs[r]
\r
321 .println("Attempting to retrieve cross referenced sequences.");
\r
322 DBRefEntry[] t = new DBRefEntry[l];
\r
324 for (int r = 0; r < xrfs.length; r++)
\r
326 if (xrfs[r] != null)
\r
332 retrieved = sftch.getSequences(xrfs); // problem here is we don't
\r
333 // know which of xrfs
\r
334 // resulted in which
\r
335 // retrieved element
\r
336 } catch (Exception e)
\r
339 .println("Problem whilst retrieving cross references for Sequence : "
\r
340 + seqs[s].getName());
\r
341 e.printStackTrace();
\r
343 if (retrieved != null)
\r
345 for (int rs = 0; rs < retrieved.length; rs++)
\r
347 // TODO: examine each sequence for 'redundancy'
\r
348 jalview.datamodel.DBRefEntry[] dbr = retrieved[rs]
\r
350 if (dbr != null && dbr.length > 0)
\r
352 for (int di = 0; di < dbr.length; di++)
\r
354 // find any entry where we should put in the sequence being
\r
355 // cross-referenced into the map
\r
356 jalview.datamodel.Mapping map = dbr[di].getMap();
\r
359 if (map.getTo() != null && map.getMap() != null)
\r
361 // should search the local dataset to find any existing
\r
362 // candidates for To !
\r
365 // compare ms with dss and replace with dss in mapping
\r
366 // if map is congruent
\r
367 SequenceI ms = map.getTo();
\r
368 int sf = map.getMap().getToLowest();
\r
369 int st = map.getMap().getToHighest();
\r
370 SequenceI mappedrg = ms.getSubSequence(sf, st);
\r
371 SequenceI loc = dss.getSubSequence(sf, st);
\r
372 if (mappedrg.getLength() > 0
\r
373 && mappedrg.getSequenceAsString().equals(
\r
374 loc.getSequenceAsString()))
\r
377 .println("Mapping updated for retrieved crossreference");
\r
378 // method to update all refs of existing To on
\r
379 // retrieved sequence with dss and merge any props
\r
383 } catch (Exception e)
\r
386 .println("Exception when consolidating Mapped sequence set...");
\r
387 e.printStackTrace(System.err);
\r
393 retrieved[rs].updatePDBIds();
\r
394 rseqs.addElement(retrieved[rs]);
\r
401 if (rseqs.size() > 0)
\r
403 SequenceI[] rsqs = new SequenceI[rseqs.size()];
\r
404 rseqs.copyInto(rsqs);
\r
405 ral = new Alignment(rsqs);
\r
406 if (cf != null && cf.getProtMappings() != null)
\r
408 ral.addCodonFrame(cf);
\r
415 * find references to lrfs in the cross-reference set of each sequence in
\r
416 * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
\r
417 * based on source and accession string only - Map and Version are nulled.
\r
423 * @return true if matches were found.
\r
425 private static boolean searchDatasetXrefs(SequenceI sequenceI,
\r
426 boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs,
\r
427 AlignedCodonFrame cf)
\r
429 boolean found = false;
\r
432 for (int i = 0; i < lrfs.length; i++)
\r
434 DBRefEntry xref = new DBRefEntry(lrfs[i]);
\r
435 // add in wildcards
\r
436 xref.setVersion(null);
\r
438 found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
\r
444 * search a given sequence dataset for references matching cross-references to
\r
445 * the given sequence
\r
451 * set of unique sequences
\r
453 * @return true if one or more unique sequences were found and added
\r
455 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
456 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
\r
458 return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
\r
462 * TODO: generalise to different protein classifications Search dataset for
\r
463 * DBRefEntrys matching the given one (xrf) and add the associated sequence to
\r
471 * - search all references or only subset
\r
473 * search dna or protein xrefs (if direct=false)
\r
474 * @return true if relationship found and sequence added.
\r
476 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
477 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf,
\r
478 boolean direct, boolean dna)
\r
480 boolean found = false;
\r
481 SequenceI[] typer = new SequenceI[1];
\r
482 if (dataset == null)
\r
484 if (dataset.getSequences() == null)
\r
486 System.err.println("Empty dataset sequence set - NO VECTOR");
\r
489 Enumeration e = dataset.getSequences().elements();
\r
490 while (e.hasMoreElements())
\r
492 SequenceI nxt = (SequenceI) e.nextElement();
\r
495 if (nxt.getDatasetSequence() != null)
\r
498 .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
\r
500 if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
\r
502 // check if this is the correct sequence type
\r
505 boolean isDna = jalview.util.Comparison.isNucleotide(typer);
\r
506 if ((direct && isDna == dna) || (!direct && isDna != dna))
\r
508 // skip this sequence because it is same molecule type
\r
513 // look for direct or indirect references in common
\r
514 DBRefEntry[] poss = nxt.getDBRef(), cands = null;
\r
517 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
\r
521 poss = CrossRef.findXDbRefs(dna, poss); //
\r
522 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
\r
526 if (!rseqs.contains(nxt))
\r
528 rseqs.addElement(nxt);
\r
529 boolean foundmap = cf != null; // don't search if we aren't given
\r
530 // a codon map object
\r
531 for (int r = 0; foundmap && r < cands.length; r++)
\r
533 if (cands[r].hasMap())
\r
535 if (cands[r].getMap().getTo() != null
\r
536 && cands[r].getMap().getMap().getFromRatio() != cands[r]
\r
537 .getMap().getMap().getToRatio())
\r
540 // get sense of map correct for adding to product alignment.
\r
543 // map is from dna seq to a protein product
\r
544 cf.addMap(sequenceI, nxt, cands[r].getMap().getMap());
\r
548 // map should be from protein seq to its coding dna
\r
549 cf.addMap(nxt, sequenceI, cands[r].getMap().getMap()
\r
555 // TODO: add mapping between sequences if necessary
\r
567 * precalculate different products that can be found for seqs in dataset and
\r
574 * - don't actually build lists - just get types
\r
575 * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
\r
576 * seqs, AlignmentI dataset, boolean fake) { String types[] =
\r
577 * jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
\r
578 * dataset); if (types != null) { System.out.println("Xref Types for:
\r
579 * "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
\r
580 * System.out.println("Type: " + types[t]); SequenceI[] prod =
\r
581 * jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
\r
582 * System.out.println("Found " + ((prod == null) ? "no" : "" +
\r
583 * prod.length) + " products"); if (prod!=null) { for (int p=0;
\r
584 * p<prod.length; p++) { System.out.println("Prod "+p+":
\r
585 * "+prod[p].getDisplayId(true)); } } } } else {
\r
586 * System.out.println("Trying getProducts for
\r
587 * "+al.getSequenceAt(0).getDisplayId(true));
\r
588 * System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot"));
\r
589 * // have a bash at finding the products amongst all the retrieved
\r
590 * sequences. SequenceI[] prod =
\r
591 * jalview.analysis.CrossRef.findXrefSequences(al
\r
592 * .getSequencesArray(), dna, null, ds); System.out.println("Found " +
\r
593 * ((prod == null) ? "no" : "" + prod.length) + " products"); if
\r
594 * (prod!=null) { // select non-equivalent sequences from dataset list
\r
595 * for (int p=0; p<prod.length; p++) { System.out.println("Prod "+p+":
\r
596 * "+prod[p].getDisplayId(true)); } } } }
\r