2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.datamodel.AlignedCodonFrame;
24 import jalview.datamodel.Alignment;
25 import jalview.datamodel.AlignmentI;
26 import jalview.datamodel.DBRefEntry;
27 import jalview.datamodel.DBRefSource;
28 import jalview.datamodel.Sequence;
29 import jalview.datamodel.SequenceI;
30 import jalview.ws.SequenceFetcher;
31 import jalview.ws.seqfetcher.ASequenceFetcher;
33 import java.util.Enumeration;
34 import java.util.Hashtable;
35 import java.util.List;
36 import java.util.Vector;
39 * Functions for cross-referencing sequence databases. user must first specify
40 * if cross-referencing from protein or dna (set dna==true)
48 * get the DNA or protein references for a protein or dna sequence
54 public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)
58 rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);
62 rfs = jalview.util.DBRefUtils.selectRefs(rfs,
63 DBRefSource.DNACODINGDBS); // could attempt to find other cross
64 // refs and return here - ie PDB xrefs
65 // (not dna, not protein seq)
70 public static Hashtable classifyDbRefs(DBRefEntry[] rfs)
72 Hashtable classes = new Hashtable();
73 classes.put(DBRefSource.PROTEINDBS,
74 jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS));
75 classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils
76 .selectRefs(rfs, DBRefSource.DNACODINGDBS));
77 classes.put(DBRefSource.DOMAINDBS,
78 jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DOMAINDBS));
79 // classes.put(OTHER, )
85 * true if seqs are DNA seqs
87 * @return a list of sequence database cross reference source types
89 public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
91 return findSequenceXrefTypes(dna, seqs, null);
95 * Indirect references are references from other sequences from the dataset to
96 * any of the direct DBRefEntrys on the given sequences.
99 * true if seqs are DNA seqs
101 * @return a list of sequence database cross reference source types
103 public static String[] findSequenceXrefTypes(boolean dna,
104 SequenceI[] seqs, AlignmentI dataset)
106 String[] dbrefs = null;
107 Vector refs = new Vector();
108 for (int s = 0; s < seqs.length; s++)
113 SequenceI dss = seqs[s];
114 while (dss.getDatasetSequence() != null)
116 dss = dss.getDatasetSequence();
118 DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef());
119 for (int r = 0; rfs != null && r < rfs.length; r++)
121 if (!refs.contains(rfs[r].getSource()))
123 refs.addElement(rfs[r].getSource());
128 // search for references to this sequence's direct references.
129 DBRefEntry[] lrfs = CrossRef
130 .findXDbRefs(!dna, seqs[s].getDBRef());
131 Vector rseqs = new Vector();
132 CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs,
133 null); // don't need to specify codon frame for mapping here
134 Enumeration lr = rseqs.elements();
135 while (lr.hasMoreElements())
137 SequenceI rs = (SequenceI) lr.nextElement();
138 DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
139 for (int r = 0; rfs != null && r < rfs.length; r++)
141 if (!refs.contains(rfs[r].getSource()))
143 refs.addElement(rfs[r].getSource());
152 dbrefs = new String[refs.size()];
153 refs.copyInto(dbrefs);
159 * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross
160 * reference if (!refs.contains(rfs[r].getSource())) {
161 * refs.addElement(rfs[r].getSource()); } } }
163 public static boolean hasCdnaMap(SequenceI[] seqs)
165 String[] reftypes = findSequenceXrefTypes(false, seqs);
166 for (int s = 0; s < reftypes.length; s++)
168 if (reftypes.equals(DBRefSource.EMBLCDS))
177 public static SequenceI[] getCdnaMap(SequenceI[] seqs)
179 Vector cseqs = new Vector();
180 for (int s = 0; s < seqs.length; s++)
182 DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef());
183 for (int c = 0; c < cdna.length; c++)
185 if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
188 .println("TODO: unimplemented sequence retrieval for coding region sequence.");
189 // TODO: retrieve CDS dataset sequences
190 // need global dataset sequence retriever/resolver to reuse refs
191 // and construct Mapping entry.
192 // insert gaps in CDS according to peptide gaps.
193 // add gapped sequence to cseqs
197 if (cseqs.size() > 0)
199 SequenceI[] rsqs = new SequenceI[cseqs.size()];
200 cseqs.copyInto(rsqs);
213 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
216 return findXrefSequences(seqs, dna, source, null);
225 * alignment to search for product sequences.
226 * @return products (as dataset sequences)
228 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
229 String source, AlignmentI dataset)
231 Vector rseqs = new Vector();
232 Alignment ral = null;
233 AlignedCodonFrame cf = new AlignedCodonFrame(); // nominal width
234 for (int s = 0; s < seqs.length; s++)
236 SequenceI dss = seqs[s];
237 while (dss.getDatasetSequence() != null)
239 dss = dss.getDatasetSequence();
241 boolean found = false;
242 DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef());
243 if ((xrfs == null || xrfs.length == 0) && dataset != null)
245 System.out.println("Attempting to find ds Xrefs refs.");
246 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less
254 // filter for desired source xref here
255 found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
258 for (int r = 0; xrfs != null && r < xrfs.length; r++)
260 if (source != null && !source.equals(xrfs[r].getSource()))
264 if (xrfs[r].hasMap())
266 if (xrfs[r].getMap().getTo() != null)
268 Sequence rsq = new Sequence(xrfs[r].getMap().getTo());
269 rseqs.addElement(rsq);
270 if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r]
271 .getMap().getMap().getToRatio())
273 // get sense of map correct for adding to product alignment.
276 // map is from dna seq to a protein product
277 cf.addMap(dss, rsq, xrfs[r].getMap().getMap());
281 // map should be from protein seq to its coding dna
282 cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());
290 // do a bit more work - search for sequences with references matching
291 // xrefs on this sequence.
294 found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf); // ,false,!dna);
297 xrfs[r] = null; // we've recovered seqs for this one.
304 if (xrfs != null && xrfs.length > 0)
306 // Try and get the sequence reference...
308 * Ideal world - we ask for a sequence fetcher implementation here if
309 * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
311 ASequenceFetcher sftch = new SequenceFetcher();
312 SequenceI[] retrieved = null;
314 for (int r = 0; r < xrfs.length; r++)
316 // filter out any irrelevant or irretrievable references
318 || ((source != null && !source.equals(xrfs[r]
319 .getSource())) || !sftch.isFetchable(xrfs[r]
329 .println("Attempting to retrieve cross referenced sequences.");
330 DBRefEntry[] t = new DBRefEntry[l];
332 for (int r = 0; r < xrfs.length; r++)
342 retrieved = sftch.getSequences(xrfs); // problem here is we don't
343 // know which of xrfs
346 } catch (Exception e)
349 .println("Problem whilst retrieving cross references for Sequence : "
350 + seqs[s].getName());
353 if (retrieved != null)
355 for (int rs = 0; rs < retrieved.length; rs++)
357 // TODO: examine each sequence for 'redundancy'
358 jalview.datamodel.DBRefEntry[] dbr = retrieved[rs]
360 if (dbr != null && dbr.length > 0)
362 for (int di = 0; di < dbr.length; di++)
364 // find any entry where we should put in the sequence being
365 // cross-referenced into the map
366 jalview.datamodel.Mapping map = dbr[di].getMap();
369 if (map.getTo() != null && map.getMap() != null)
371 // should search the local dataset to find any existing
372 // candidates for To !
375 // compare ms with dss and replace with dss in mapping
376 // if map is congruent
377 SequenceI ms = map.getTo();
378 int sf = map.getMap().getToLowest();
379 int st = map.getMap().getToHighest();
380 SequenceI mappedrg = ms.getSubSequence(sf, st);
381 SequenceI loc = dss.getSubSequence(sf, st);
382 if (mappedrg.getLength() > 0
383 && mappedrg.getSequenceAsString().equals(
384 loc.getSequenceAsString()))
387 .println("Mapping updated for retrieved crossreference");
388 // method to update all refs of existing To on
389 // retrieved sequence with dss and merge any props
393 } catch (Exception e)
396 .println("Exception when consolidating Mapped sequence set...");
397 e.printStackTrace(System.err);
403 retrieved[rs].updatePDBIds();
404 rseqs.addElement(retrieved[rs]);
411 if (rseqs.size() > 0)
413 SequenceI[] rsqs = new SequenceI[rseqs.size()];
414 rseqs.copyInto(rsqs);
415 ral = new Alignment(rsqs);
416 if (cf != null && cf.getProtMappings() != null)
418 ral.addCodonFrame(cf);
425 * find references to lrfs in the cross-reference set of each sequence in
426 * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
427 * based on source and accession string only - Map and Version are nulled.
433 * @return true if matches were found.
435 private static boolean searchDatasetXrefs(SequenceI sequenceI,
436 boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs,
437 AlignedCodonFrame cf)
439 boolean found = false;
444 for (int i = 0; i < lrfs.length; i++)
446 DBRefEntry xref = new DBRefEntry(lrfs[i]);
448 xref.setVersion(null);
450 found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
456 * search a given sequence dataset for references matching cross-references to
463 * set of unique sequences
465 * @return true if one or more unique sequences were found and added
467 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
468 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
470 return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
474 * TODO: generalise to different protein classifications Search dataset for
475 * DBRefEntrys matching the given one (xrf) and add the associated sequence to
483 * - search all references or only subset
485 * search dna or protein xrefs (if direct=false)
486 * @return true if relationship found and sequence added.
488 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
489 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf,
490 boolean direct, boolean dna)
492 boolean found = false;
493 SequenceI[] typer = new SequenceI[1];
498 if (dataset.getSequences() == null)
500 System.err.println("Empty dataset sequence set - NO VECTOR");
504 synchronized (ds = dataset.getSequences())
506 for (SequenceI nxt : ds)
510 if (nxt.getDatasetSequence() != null)
513 .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
515 if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
517 // check if this is the correct sequence type
520 boolean isDna = jalview.util.Comparison.isNucleotide(typer);
521 if ((direct && isDna == dna) || (!direct && isDna != dna))
523 // skip this sequence because it is same molecule type
528 // look for direct or indirect references in common
529 DBRefEntry[] poss = nxt.getDBRef(), cands = null;
532 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
536 poss = CrossRef.findXDbRefs(dna, poss); //
537 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
541 if (!rseqs.contains(nxt))
543 rseqs.addElement(nxt);
544 boolean foundmap = cf != null; // don't search if we aren't
546 // a codon map object
547 for (int r = 0; foundmap && r < cands.length; r++)
549 if (cands[r].hasMap())
551 if (cands[r].getMap().getTo() != null
552 && cands[r].getMap().getMap().getFromRatio() != cands[r]
553 .getMap().getMap().getToRatio())
556 // get sense of map correct for adding to product
560 // map is from dna seq to a protein product
561 cf.addMap(sequenceI, nxt, cands[r].getMap()
566 // map should be from protein seq to its coding dna
567 cf.addMap(nxt, sequenceI, cands[r].getMap()
568 .getMap().getInverse());
573 // TODO: add mapping between sequences if necessary
586 * precalculate different products that can be found for seqs in dataset and
593 * - don't actually build lists - just get types
594 * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
595 * seqs, AlignmentI dataset, boolean fake) { String types[] =
596 * jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
597 * dataset); if (types != null) { System.out.println("Xref Types for:
598 * "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
599 * System.out.println("Type: " + types[t]); SequenceI[] prod =
600 * jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
601 * System.out.println("Found " + ((prod == null) ? "no" : "" +
602 * prod.length) + " products"); if (prod!=null) { for (int p=0;
603 * p<prod.length; p++) { System.out.println("Prod "+p+":
604 * "+prod[p].getDisplayId(true)); } } } } else {
605 * System.out.println("Trying getProducts for
606 * "+al.getSequenceAt(0).getDisplayId(true));
607 * System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot"));
608 * // have a bash at finding the products amongst all the retrieved
609 * sequences. SequenceI[] prod =
610 * jalview.analysis.CrossRef.findXrefSequences(al
611 * .getSequencesArray(), dna, null, ds); System.out.println("Found " +
612 * ((prod == null) ? "no" : "" + prod.length) + " products"); if
613 * (prod!=null) { // select non-equivalent sequences from dataset list
614 * for (int p=0; p<prod.length; p++) { System.out.println("Prod "+p+":
615 * "+prod[p].getDisplayId(true)); } } } }