2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.7)
\r
3 * Copyright (C) 2011 J Procter, AM Waterhouse, J Engelhardt, LM Lui, G Barton, M Clamp, S Searle
\r
5 * This file is part of Jalview.
\r
7 * Jalview is free software: you can redistribute it and/or
\r
8 * modify it under the terms of the GNU General Public License
\r
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
\r
11 * Jalview is distributed in the hope that it will be useful, but
\r
12 * WITHOUT ANY WARRANTY; without even the implied warranty
\r
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
\r
14 * PURPOSE. See the GNU General Public License for more details.
\r
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
\r
18 package jalview.analysis;
\r
20 import java.util.Enumeration;
\r
21 import java.util.List;
\r
22 import java.util.Vector;
\r
23 import java.util.Hashtable;
\r
25 import jalview.datamodel.AlignedCodonFrame;
\r
26 import jalview.datamodel.Alignment;
\r
27 import jalview.datamodel.AlignmentI;
\r
28 import jalview.datamodel.DBRefSource;
\r
29 import jalview.datamodel.DBRefEntry;
\r
30 import jalview.datamodel.Sequence;
\r
31 import jalview.datamodel.SequenceI;
\r
32 import jalview.ws.SequenceFetcher;
\r
33 import jalview.ws.seqfetcher.ASequenceFetcher;
\r
36 * Functions for cross-referencing sequence databases. user must first specify
\r
37 * if cross-referencing from protein or dna (set dna==true)
\r
42 public class CrossRef
\r
45 * get the DNA or protein references for a protein or dna sequence
\r
51 public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)
\r
55 rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);
\r
59 rfs = jalview.util.DBRefUtils.selectRefs(rfs,
\r
60 DBRefSource.DNACODINGDBS); // could attempt to find other cross
\r
61 // refs and return here - ie PDB xrefs
\r
62 // (not dna, not protein seq)
\r
67 public static Hashtable classifyDbRefs(DBRefEntry[] rfs)
\r
69 Hashtable classes = new Hashtable();
\r
70 classes.put(DBRefSource.PROTEINDBS,
\r
71 jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS));
\r
72 classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils
\r
73 .selectRefs(rfs, DBRefSource.DNACODINGDBS));
\r
74 classes.put(DBRefSource.DOMAINDBS,
\r
75 jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DOMAINDBS));
\r
76 // classes.put(OTHER, )
\r
82 * true if seqs are DNA seqs
\r
84 * @return a list of sequence database cross reference source types
\r
86 public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
\r
88 return findSequenceXrefTypes(dna, seqs, null);
\r
92 * Indirect references are references from other sequences from the dataset to
\r
93 * any of the direct DBRefEntrys on the given sequences.
\r
96 * true if seqs are DNA seqs
\r
98 * @return a list of sequence database cross reference source types
\r
100 public static String[] findSequenceXrefTypes(boolean dna,
\r
101 SequenceI[] seqs, AlignmentI dataset)
\r
103 String[] dbrefs = null;
\r
104 Vector refs = new Vector();
\r
105 for (int s = 0; s < seqs.length; s++)
\r
107 if (seqs[s] != null)
\r
110 SequenceI dss = seqs[s];
\r
111 while (dss.getDatasetSequence() != null)
\r
113 dss = dss.getDatasetSequence();
\r
115 DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef());
\r
116 for (int r = 0; rfs != null && r < rfs.length; r++)
\r
118 if (!refs.contains(rfs[r].getSource()))
\r
120 refs.addElement(rfs[r].getSource());
\r
123 if (dataset != null)
\r
125 // search for references to this sequence's direct references.
\r
126 DBRefEntry[] lrfs = CrossRef
\r
127 .findXDbRefs(!dna, seqs[s].getDBRef());
\r
128 Vector rseqs = new Vector();
\r
129 CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs,
\r
130 null); // don't need to specify codon frame for mapping here
\r
131 Enumeration lr = rseqs.elements();
\r
132 while (lr.hasMoreElements())
\r
134 SequenceI rs = (SequenceI) lr.nextElement();
\r
135 DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
\r
136 for (int r = 0; rfs != null && r < rfs.length; r++)
\r
138 if (!refs.contains(rfs[r].getSource()))
\r
140 refs.addElement(rfs[r].getSource());
\r
147 if (refs.size() > 0)
\r
149 dbrefs = new String[refs.size()];
\r
150 refs.copyInto(dbrefs);
\r
156 * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross
\r
157 * reference if (!refs.contains(rfs[r].getSource())) {
\r
158 * refs.addElement(rfs[r].getSource()); } } }
\r
160 public static boolean hasCdnaMap(SequenceI[] seqs)
\r
162 String[] reftypes = findSequenceXrefTypes(false, seqs);
\r
163 for (int s = 0; s < reftypes.length; s++)
\r
165 if (reftypes.equals(DBRefSource.EMBLCDS))
\r
174 public static SequenceI[] getCdnaMap(SequenceI[] seqs)
\r
176 Vector cseqs = new Vector();
\r
177 for (int s = 0; s < seqs.length; s++)
\r
179 DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef());
\r
180 for (int c = 0; c < cdna.length; c++)
\r
182 if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
\r
185 .println("TODO: unimplemented sequence retrieval for coding region sequence.");
\r
186 // TODO: retrieve CDS dataset sequences
\r
187 // need global dataset sequence retriever/resolver to reuse refs
\r
188 // and construct Mapping entry.
\r
189 // insert gaps in CDS according to peptide gaps.
\r
190 // add gapped sequence to cseqs
\r
194 if (cseqs.size() > 0)
\r
196 SequenceI[] rsqs = new SequenceI[cseqs.size()];
\r
197 cseqs.copyInto(rsqs);
\r
210 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
213 return findXrefSequences(seqs, dna, source, null);
\r
222 * alignment to search for product sequences.
\r
223 * @return products (as dataset sequences)
\r
225 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
\r
226 String source, AlignmentI dataset)
\r
228 Vector rseqs = new Vector();
\r
229 Alignment ral = null;
\r
230 AlignedCodonFrame cf = new AlignedCodonFrame(0); // nominal width
\r
231 for (int s = 0; s < seqs.length; s++)
\r
233 SequenceI dss = seqs[s];
\r
234 while (dss.getDatasetSequence() != null)
\r
236 dss = dss.getDatasetSequence();
\r
238 boolean found = false;
\r
239 DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef());
\r
240 if ((xrfs == null || xrfs.length == 0) && dataset != null)
\r
242 System.out.println("Attempting to find ds Xrefs refs.");
\r
243 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less
\r
251 // filter for desired source xref here
\r
252 found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
\r
255 for (int r = 0; xrfs != null && r < xrfs.length; r++)
\r
257 if (source != null && !source.equals(xrfs[r].getSource()))
\r
259 if (xrfs[r].hasMap())
\r
261 if (xrfs[r].getMap().getTo() != null)
\r
263 Sequence rsq = new Sequence(xrfs[r].getMap().getTo());
\r
264 rseqs.addElement(rsq);
\r
265 if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r]
\r
266 .getMap().getMap().getToRatio())
\r
268 // get sense of map correct for adding to product alignment.
\r
271 // map is from dna seq to a protein product
\r
272 cf.addMap(dss, rsq, xrfs[r].getMap().getMap());
\r
276 // map should be from protein seq to its coding dna
\r
277 cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());
\r
285 // do a bit more work - search for sequences with references matching
\r
286 // xrefs on this sequence.
\r
287 if (dataset != null)
\r
289 found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf); // ,false,!dna);
\r
291 xrfs[r] = null; // we've recovered seqs for this one.
\r
297 if (xrfs != null && xrfs.length > 0)
\r
299 // Try and get the sequence reference...
\r
301 * Ideal world - we ask for a sequence fetcher implementation here if
\r
302 * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
\r
304 ASequenceFetcher sftch = new SequenceFetcher();
\r
305 SequenceI[] retrieved = null;
\r
306 int l = xrfs.length;
\r
307 for (int r = 0; r < xrfs.length; r++)
\r
309 // filter out any irrelevant or irretrievable references
\r
310 if (xrfs[r] == null
\r
311 || ((source != null && !source.equals(xrfs[r]
\r
312 .getSource())) || !sftch.isFetchable(xrfs[r]
\r
322 .println("Attempting to retrieve cross referenced sequences.");
\r
323 DBRefEntry[] t = new DBRefEntry[l];
\r
325 for (int r = 0; r < xrfs.length; r++)
\r
327 if (xrfs[r] != null)
\r
333 retrieved = sftch.getSequences(xrfs); // problem here is we don't
\r
334 // know which of xrfs
\r
335 // resulted in which
\r
336 // retrieved element
\r
337 } catch (Exception e)
\r
340 .println("Problem whilst retrieving cross references for Sequence : "
\r
341 + seqs[s].getName());
\r
342 e.printStackTrace();
\r
344 if (retrieved != null)
\r
346 for (int rs = 0; rs < retrieved.length; rs++)
\r
348 // TODO: examine each sequence for 'redundancy'
\r
349 jalview.datamodel.DBRefEntry[] dbr = retrieved[rs]
\r
351 if (dbr != null && dbr.length > 0)
\r
353 for (int di = 0; di < dbr.length; di++)
\r
355 // find any entry where we should put in the sequence being
\r
356 // cross-referenced into the map
\r
357 jalview.datamodel.Mapping map = dbr[di].getMap();
\r
360 if (map.getTo() != null && map.getMap() != null)
\r
362 // should search the local dataset to find any existing
\r
363 // candidates for To !
\r
366 // compare ms with dss and replace with dss in mapping
\r
367 // if map is congruent
\r
368 SequenceI ms = map.getTo();
\r
369 int sf = map.getMap().getToLowest();
\r
370 int st = map.getMap().getToHighest();
\r
371 SequenceI mappedrg = ms.getSubSequence(sf, st);
\r
372 SequenceI loc = dss.getSubSequence(sf, st);
\r
373 if (mappedrg.getLength() > 0
\r
374 && mappedrg.getSequenceAsString().equals(
\r
375 loc.getSequenceAsString()))
\r
378 .println("Mapping updated for retrieved crossreference");
\r
379 // method to update all refs of existing To on
\r
380 // retrieved sequence with dss and merge any props
\r
384 } catch (Exception e)
\r
387 .println("Exception when consolidating Mapped sequence set...");
\r
388 e.printStackTrace(System.err);
\r
394 retrieved[rs].updatePDBIds();
\r
395 rseqs.addElement(retrieved[rs]);
\r
402 if (rseqs.size() > 0)
\r
404 SequenceI[] rsqs = new SequenceI[rseqs.size()];
\r
405 rseqs.copyInto(rsqs);
\r
406 ral = new Alignment(rsqs);
\r
407 if (cf != null && cf.getProtMappings() != null)
\r
409 ral.addCodonFrame(cf);
\r
416 * find references to lrfs in the cross-reference set of each sequence in
\r
417 * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
\r
418 * based on source and accession string only - Map and Version are nulled.
\r
424 * @return true if matches were found.
\r
426 private static boolean searchDatasetXrefs(SequenceI sequenceI,
\r
427 boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs,
\r
428 AlignedCodonFrame cf)
\r
430 boolean found = false;
\r
433 for (int i = 0; i < lrfs.length; i++)
\r
435 DBRefEntry xref = new DBRefEntry(lrfs[i]);
\r
436 // add in wildcards
\r
437 xref.setVersion(null);
\r
439 found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
\r
445 * search a given sequence dataset for references matching cross-references to
\r
446 * the given sequence
\r
452 * set of unique sequences
\r
454 * @return true if one or more unique sequences were found and added
\r
456 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
457 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
\r
459 return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
\r
463 * TODO: generalise to different protein classifications Search dataset for
\r
464 * DBRefEntrys matching the given one (xrf) and add the associated sequence to
\r
472 * - search all references or only subset
\r
474 * search dna or protein xrefs (if direct=false)
\r
475 * @return true if relationship found and sequence added.
\r
477 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
\r
478 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf,
\r
479 boolean direct, boolean dna)
\r
481 boolean found = false;
\r
482 SequenceI[] typer = new SequenceI[1];
\r
483 if (dataset == null)
\r
485 if (dataset.getSequences() == null)
\r
487 System.err.println("Empty dataset sequence set - NO VECTOR");
\r
490 List<SequenceI> ds;
\r
491 synchronized (ds=dataset.getSequences())
\r
493 for (SequenceI nxt:ds)
\r
496 if (nxt.getDatasetSequence() != null)
\r
499 .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
\r
501 if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
\r
503 // check if this is the correct sequence type
\r
506 boolean isDna = jalview.util.Comparison.isNucleotide(typer);
\r
507 if ((direct && isDna == dna) || (!direct && isDna != dna))
\r
509 // skip this sequence because it is same molecule type
\r
514 // look for direct or indirect references in common
\r
515 DBRefEntry[] poss = nxt.getDBRef(), cands = null;
\r
518 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
\r
522 poss = CrossRef.findXDbRefs(dna, poss); //
\r
523 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
\r
527 if (!rseqs.contains(nxt))
\r
529 rseqs.addElement(nxt);
\r
530 boolean foundmap = cf != null; // don't search if we aren't given
\r
531 // a codon map object
\r
532 for (int r = 0; foundmap && r < cands.length; r++)
\r
534 if (cands[r].hasMap())
\r
536 if (cands[r].getMap().getTo() != null
\r
537 && cands[r].getMap().getMap().getFromRatio() != cands[r]
\r
538 .getMap().getMap().getToRatio())
\r
541 // get sense of map correct for adding to product alignment.
\r
544 // map is from dna seq to a protein product
\r
545 cf.addMap(sequenceI, nxt, cands[r].getMap().getMap());
\r
549 // map should be from protein seq to its coding dna
\r
550 cf.addMap(nxt, sequenceI, cands[r].getMap().getMap()
\r
556 // TODO: add mapping between sequences if necessary
\r
568 * precalculate different products that can be found for seqs in dataset and
\r
575 * - don't actually build lists - just get types
\r
576 * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
\r
577 * seqs, AlignmentI dataset, boolean fake) { String types[] =
\r
578 * jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
\r
579 * dataset); if (types != null) { System.out.println("Xref Types for:
\r
580 * "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
\r
581 * System.out.println("Type: " + types[t]); SequenceI[] prod =
\r
582 * jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
\r
583 * System.out.println("Found " + ((prod == null) ? "no" : "" +
\r
584 * prod.length) + " products"); if (prod!=null) { for (int p=0;
\r
585 * p<prod.length; p++) { System.out.println("Prod "+p+":
\r
586 * "+prod[p].getDisplayId(true)); } } } } else {
\r
587 * System.out.println("Trying getProducts for
\r
588 * "+al.getSequenceAt(0).getDisplayId(true));
\r
589 * System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot"));
\r
590 * // have a bash at finding the products amongst all the retrieved
\r
591 * sequences. SequenceI[] prod =
\r
592 * jalview.analysis.CrossRef.findXrefSequences(al
\r
593 * .getSequencesArray(), dna, null, ds); System.out.println("Found " +
\r
594 * ((prod == null) ? "no" : "" + prod.length) + " products"); if
\r
595 * (prod!=null) { // select non-equivalent sequences from dataset list
\r
596 * for (int p=0; p<prod.length; p++) { System.out.println("Prod "+p+":
\r
597 * "+prod[p].getDisplayId(true)); } } } }
\r