2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
3 * Copyright (C) 2014 The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import java.util.Enumeration;
24 import java.util.List;
25 import java.util.Vector;
26 import java.util.Hashtable;
28 import jalview.datamodel.AlignedCodonFrame;
29 import jalview.datamodel.Alignment;
30 import jalview.datamodel.AlignmentI;
31 import jalview.datamodel.DBRefSource;
32 import jalview.datamodel.DBRefEntry;
33 import jalview.datamodel.Sequence;
34 import jalview.datamodel.SequenceI;
35 import jalview.ws.SequenceFetcher;
36 import jalview.ws.seqfetcher.ASequenceFetcher;
39 * Functions for cross-referencing sequence databases. The caller must first
40 * specify whether cross-referencing from protein or DNA (dna==true for DNA).
48 * get the DNA or protein references for a protein or dna sequence
54 public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)
58 rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);
62 rfs = jalview.util.DBRefUtils.selectRefs(rfs,
63 DBRefSource.DNACODINGDBS); // could attempt to find other cross
64 // refs and return here - ie PDB xrefs
65 // (not dna, not protein seq)
70 public static Hashtable classifyDbRefs(DBRefEntry[] rfs)
72 Hashtable classes = new Hashtable();
73 classes.put(DBRefSource.PROTEINDBS,
74 jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS));
75 classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils
76 .selectRefs(rfs, DBRefSource.DNACODINGDBS));
77 classes.put(DBRefSource.DOMAINDBS,
78 jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DOMAINDBS));
79 // classes.put(OTHER, )
85 * true if seqs are DNA seqs
87 * @return a list of sequence database cross reference source types
89 public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
91 return findSequenceXrefTypes(dna, seqs, null);
95 * Indirect references are references from other sequences from the dataset to
96 * any of the direct DBRefEntrys on the given sequences.
99 * true if seqs are DNA seqs
101 * @return a list of sequence database cross reference source types
103 public static String[] findSequenceXrefTypes(boolean dna,
104 SequenceI[] seqs, AlignmentI dataset)
106 String[] dbrefs = null;
107 Vector refs = new Vector();
108 for (int s = 0; s < seqs.length; s++)
113 SequenceI dss = seqs[s];
114 while (dss.getDatasetSequence() != null)
116 dss = dss.getDatasetSequence();
118 DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef());
119 for (int r = 0; rfs != null && r < rfs.length; r++)
121 if (!refs.contains(rfs[r].getSource()))
123 refs.addElement(rfs[r].getSource());
128 // search for references to this sequence's direct references.
129 DBRefEntry[] lrfs = CrossRef
130 .findXDbRefs(!dna, seqs[s].getDBRef());
131 Vector rseqs = new Vector();
132 CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs,
133 null); // don't need to specify codon frame for mapping here
134 Enumeration lr = rseqs.elements();
135 while (lr.hasMoreElements())
137 SequenceI rs = (SequenceI) lr.nextElement();
138 DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
139 for (int r = 0; rfs != null && r < rfs.length; r++)
141 if (!refs.contains(rfs[r].getSource()))
143 refs.addElement(rfs[r].getSource());
152 dbrefs = new String[refs.size()];
153 refs.copyInto(dbrefs);
159 * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross
160 * reference if (!refs.contains(rfs[r].getSource())) {
161 * refs.addElement(rfs[r].getSource()); } } }
163 public static boolean hasCdnaMap(SequenceI[] seqs)
165 String[] reftypes = findSequenceXrefTypes(false, seqs);
166 for (int s = 0; s < reftypes.length; s++)
168 if (reftypes.equals(DBRefSource.EMBLCDS))
177 public static SequenceI[] getCdnaMap(SequenceI[] seqs)
179 Vector cseqs = new Vector();
180 for (int s = 0; s < seqs.length; s++)
182 DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef());
183 for (int c = 0; c < cdna.length; c++)
185 if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
188 .println("TODO: unimplemented sequence retrieval for coding region sequence.");
189 // TODO: retrieve CDS dataset sequences
190 // need global dataset sequence retriever/resolver to reuse refs
191 // and construct Mapping entry.
192 // insert gaps in CDS according to peptide gaps.
193 // add gapped sequence to cseqs
197 if (cseqs.size() > 0)
199 SequenceI[] rsqs = new SequenceI[cseqs.size()];
200 cseqs.copyInto(rsqs);
213 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
216 return findXrefSequences(seqs, dna, source, null);
225 * alignment to search for product sequences.
226 * @return products (as dataset sequences)
// Collects cross-referenced sequences for seqs into a new Alignment.
// For each input sequence the visible code: resolves the root dataset
// sequence, selects its cross-references with findXDbRefs, takes products
// attached to a reference's map, searches the dataset for sequences with
// matching references, and finally asks a SequenceFetcher for sequences.
// Codon mappings gathered on the way are added to the returned alignment.
// NOTE(review): short lines (braces, brief statements) are missing from this
// listing, so the exact nesting of the statements below cannot be confirmed.
228 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
229 String source, AlignmentI dataset)
// rseqs accumulates every sequence found; ral is built from it at the end
231 Vector rseqs = new Vector();
232 Alignment ral = null;
233 AlignedCodonFrame cf = new AlignedCodonFrame(0); // nominal width
234 for (int s = 0; s < seqs.length; s++)
// walk up to the root dataset sequence
236 SequenceI dss = seqs[s];
237 while (dss.getDatasetSequence() != null)
239 dss = dss.getDatasetSequence();
241 boolean found = false;
242 DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef());
// no direct cross-references but a dataset is available: look for sequences
// in the dataset that refer to this one instead
243 if ((xrfs == null || xrfs.length == 0) && dataset != null)
245 System.out.println("Attempting to find ds Xrefs refs.");
246 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less
254 // filter for desired source xref here
255 found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
258 for (int r = 0; xrfs != null && r < xrfs.length; r++)
// a non-null source restricts processing to references from that source
// (the statement taken for non-matching references is not visible here)
260 if (source != null && !source.equals(xrfs[r].getSource()))
262 if (xrfs[r].hasMap())
264 if (xrfs[r].getMap().getTo() != null)
// the mapped target is copied into a new Sequence and recorded as a product
266 Sequence rsq = new Sequence(xrfs[r].getMap().getTo());
267 rseqs.addElement(rsq);
// presumably unequal from/to ratios identify a dna/protein codon mapping
268 if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r]
269 .getMap().getMap().getToRatio())
271 // get sense of map correct for adding to product alignment.
274 // map is from dna seq to a protein product
275 cf.addMap(dss, rsq, xrfs[r].getMap().getMap());
279 // map should be from protein seq to its coding dna
280 cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());
288 // do a bit more work - search for sequences with references matching
289 // xrefs on this sequence.
292 found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf); // ,false,!dna);
294 xrfs[r] = null; // we've recovered seqs for this one.
300 if (xrfs != null && xrfs.length > 0)
302 // Try and get the sequence reference...
304 * Ideal world - we ask for a sequence fetcher implementation here if
305 * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
307 ASequenceFetcher sftch = new SequenceFetcher();
308 SequenceI[] retrieved = null;
310 for (int r = 0; r < xrfs.length; r++)
312 // filter out any irrelevant or irretrievable references
314 || ((source != null && !source.equals(xrfs[r]
315 .getSource())) || !sftch.isFetchable(xrfs[r]
325 .println("Attempting to retrieve cross referenced sequences.");
326 DBRefEntry[] t = new DBRefEntry[l];
328 for (int r = 0; r < xrfs.length; r++)
336 retrieved = sftch.getSequences(xrfs); // problem here is we don't
337 // know which of xrfs
// a failed retrieval is reported with the sequence name
340 } catch (Exception e)
343 .println("Problem whilst retrieving cross references for Sequence : "
344 + seqs[s].getName());
347 if (retrieved != null)
349 for (int rs = 0; rs < retrieved.length; rs++)
351 // TODO: examine each sequence for 'redundancy'
352 jalview.datamodel.DBRefEntry[] dbr = retrieved[rs]
354 if (dbr != null && dbr.length > 0)
356 for (int di = 0; di < dbr.length; di++)
358 // find any entry where we should put in the sequence being
359 // cross-referenced into the map
360 jalview.datamodel.Mapping map = dbr[di].getMap();
363 if (map.getTo() != null && map.getMap() != null)
365 // should search the local dataset to find any existing
366 // candidates for To !
369 // compare ms with dss and replace with dss in mapping
370 // if map is congruent
371 SequenceI ms = map.getTo();
// sf..st is the lowest..highest position on the "to" side of the map; the
// same range is cut from the mapped target and from dss for comparison
372 int sf = map.getMap().getToLowest();
373 int st = map.getMap().getToHighest();
374 SequenceI mappedrg = ms.getSubSequence(sf, st);
375 SequenceI loc = dss.getSubSequence(sf, st);
376 if (mappedrg.getLength() > 0
377 && mappedrg.getSequenceAsString().equals(
378 loc.getSequenceAsString()))
381 .println("Mapping updated for retrieved crossreference");
382 // method to update all refs of existing To on
383 // retrieved sequence with dss and merge any props
// problems while consolidating mappings are reported with a stack trace
387 } catch (Exception e)
390 .println("Exception when consolidating Mapped sequence set...");
391 e.printStackTrace(System.err);
// every retrieved sequence is added to the result after updatePDBIds()
397 retrieved[rs].updatePDBIds();
398 rseqs.addElement(retrieved[rs]);
// build the alignment only when something was found; ral starts as null
405 if (rseqs.size() > 0)
407 SequenceI[] rsqs = new SequenceI[rseqs.size()];
408 rseqs.copyInto(rsqs);
409 ral = new Alignment(rsqs);
// attach the codon frame only when it holds protein mappings
410 if (cf != null && cf.getProtMappings() != null)
412 ral.addCodonFrame(cf);
419 * find references to lrfs in the cross-reference set of each sequence in
420 * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
421 * based on source and accession string only - Map and Version are nulled.
427 * @return true if matches were found.
429 private static boolean searchDatasetXrefs(SequenceI sequenceI,
430 boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs,
431 AlignedCodonFrame cf)
433 boolean found = false;
436 for (int i = 0; i < lrfs.length; i++)
438 DBRefEntry xref = new DBRefEntry(lrfs[i]);
440 xref.setVersion(null);
442 found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
448 * search a given sequence dataset for references matching cross-references to
455 * set of unique sequences
457 * @return true if one or more unique sequences were found and added
459 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
460 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
462 return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
466 * TODO: generalise to different protein classifications Search dataset for
467 * DBRefEntrys matching the given one (xrf) and add the associated sequence to
475 * - search all references or only subset
477 * search dna or protein xrefs (if direct=false)
478 * @return true if relationship found and sequence added.
// Scans the sequences of dataset, other than sequenceI and its dataset
// sequence, for DBRefEntrys matching xrf. A matching sequence that is not
// already in rseqs is appended to it, and when cf is non-null the mappings
// carried by the matching references are registered on cf.
// direct and dna together decide which molecule type is skipped (see the
// condition below); presumably direct also chooses between matching all
// references and only the findXDbRefs(dna, ...) subset - the branch lines
// are not visible in this listing.
// NOTE(review): short lines (braces, brief statements) are missing here, so
// the exact nesting of the statements below cannot be confirmed.
480 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
481 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf,
482 boolean direct, boolean dna)
484 boolean found = false;
// one-element array handed to Comparison.isNucleotide below
485 SequenceI[] typer = new SequenceI[1];
// a dataset without a sequence list is reported on stderr
488 if (dataset.getSequences() == null)
490 System.err.println("Empty dataset sequence set - NO VECTOR");
// the sequence list is assigned to ds and locked while it is iterated
494 synchronized (ds = dataset.getSequences())
496 for (SequenceI nxt : ds)
// an entry that has its own dataset sequence triggers the warning below
499 if (nxt.getDatasetSequence() != null)
502 .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
// identity comparison: the query sequence and its dataset sequence are excluded
504 if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
506 // check if this is the correct sequence type
509 boolean isDna = jalview.util.Comparison.isNucleotide(typer);
510 if ((direct && isDna == dna) || (!direct && isDna != dna))
512 // skip this sequence because it is same molecule type
517 // look for direct or indirect references in common
518 DBRefEntry[] poss = nxt.getDBRef(), cands = null;
// either all references of nxt are matched against xrf ...
521 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
// ... or only those selected by findXDbRefs(dna, ...)
525 poss = CrossRef.findXDbRefs(dna, poss); //
526 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
// rseqs is kept free of duplicates
530 if (!rseqs.contains(nxt))
532 rseqs.addElement(nxt);
533 boolean foundmap = cf != null; // don't search if we aren't
535 // a codon map object
536 for (int r = 0; foundmap && r < cands.length; r++)
538 if (cands[r].hasMap())
// presumably unequal from/to ratios identify a dna/protein codon mapping
540 if (cands[r].getMap().getTo() != null
541 && cands[r].getMap().getMap().getFromRatio() != cands[r]
542 .getMap().getMap().getToRatio())
545 // get sense of map correct for adding to product
549 // map is from dna seq to a protein product
550 cf.addMap(sequenceI, nxt, cands[r].getMap()
555 // map should be from protein seq to its coding dna
556 cf.addMap(nxt, sequenceI, cands[r].getMap()
557 .getMap().getInverse());
562 // TODO: add mapping between sequences if necessary
574 * precalculate different products that can be found for seqs in dataset and
581 * - don't actually build lists - just get types
582 * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
583 * seqs, AlignmentI dataset, boolean fake) { String types[] =
584 * jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
585 * dataset); if (types != null) { System.out.println("Xref Types for:
586 * "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
587 * System.out.println("Type: " + types[t]); SequenceI[] prod =
588 * jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
589 * System.out.println("Found " + ((prod == null) ? "no" : "" +
590 * prod.length) + " products"); if (prod!=null) { for (int p=0;
591 * p<prod.length; p++) { System.out.println("Prod "+p+":
592 * "+prod[p].getDisplayId(true)); } } } } else {
593 * System.out.println("Trying getProducts for
594 * "+al.getSequenceAt(0).getDisplayId(true));
595 * System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot"));
596 * // have a bash at finding the products amongst all the retrieved
597 * sequences. SequenceI[] prod =
598 * jalview.analysis.CrossRef.findXrefSequences(al
599 * .getSequencesArray(), dna, null, ds); System.out.println("Found " +
600 * ((prod == null) ? "no" : "" + prod.length) + " products"); if
601 * (prod!=null) { // select non-equivalent sequences from dataset list
602 * for (int p=0; p<prod.length; p++) { System.out.println("Prod "+p+":
603 * "+prod[p].getDisplayId(true)); } } } }