2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.1)
3 * Copyright (C) 2014 The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 * Jalview is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
17 * The Jalview Authors are detailed in the 'AUTHORS' file.
19 package jalview.analysis;
21 import java.util.Enumeration;
22 import java.util.List;
23 import java.util.Vector;
24 import java.util.Hashtable;
26 import jalview.datamodel.AlignedCodonFrame;
27 import jalview.datamodel.Alignment;
28 import jalview.datamodel.AlignmentI;
29 import jalview.datamodel.DBRefSource;
30 import jalview.datamodel.DBRefEntry;
31 import jalview.datamodel.Sequence;
32 import jalview.datamodel.SequenceI;
33 import jalview.ws.SequenceFetcher;
34 import jalview.ws.seqfetcher.ASequenceFetcher;
/**
 * Functions for cross-referencing sequence databases. The caller must first
 * specify whether it is cross-referencing from DNA or from protein sequences
 * (pass dna == true when cross-referencing from DNA).
 */
46 * get the DNA or protein references for a protein or dna sequence
52 public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)
56 rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);
60 rfs = jalview.util.DBRefUtils.selectRefs(rfs,
61 DBRefSource.DNACODINGDBS); // could attempt to find other cross
62 // refs and return here - ie PDB xrefs
63 // (not dna, not protein seq)
68 public static Hashtable classifyDbRefs(DBRefEntry[] rfs)
70 Hashtable classes = new Hashtable();
71 classes.put(DBRefSource.PROTEINDBS,
72 jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS));
73 classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils
74 .selectRefs(rfs, DBRefSource.DNACODINGDBS));
75 classes.put(DBRefSource.DOMAINDBS,
76 jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DOMAINDBS));
77 // classes.put(OTHER, )
83 * true if seqs are DNA seqs
85 * @return a list of sequence database cross reference source types
87 public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
89 return findSequenceXrefTypes(dna, seqs, null);
93 * Indirect references are references from other sequences from the dataset to
94 * any of the direct DBRefEntrys on the given sequences.
97 * true if seqs are DNA seqs
99 * @return a list of sequence database cross reference source types
101 public static String[] findSequenceXrefTypes(boolean dna,
102 SequenceI[] seqs, AlignmentI dataset)
104 String[] dbrefs = null;
105 Vector refs = new Vector();
106 for (int s = 0; s < seqs.length; s++)
111 SequenceI dss = seqs[s];
112 while (dss.getDatasetSequence() != null)
114 dss = dss.getDatasetSequence();
116 DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef());
117 for (int r = 0; rfs != null && r < rfs.length; r++)
119 if (!refs.contains(rfs[r].getSource()))
121 refs.addElement(rfs[r].getSource());
126 // search for references to this sequence's direct references.
127 DBRefEntry[] lrfs = CrossRef
128 .findXDbRefs(!dna, seqs[s].getDBRef());
129 Vector rseqs = new Vector();
130 CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs,
131 null); // don't need to specify codon frame for mapping here
132 Enumeration lr = rseqs.elements();
133 while (lr.hasMoreElements())
135 SequenceI rs = (SequenceI) lr.nextElement();
136 DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
137 for (int r = 0; rfs != null && r < rfs.length; r++)
139 if (!refs.contains(rfs[r].getSource()))
141 refs.addElement(rfs[r].getSource());
150 dbrefs = new String[refs.size()];
151 refs.copyInto(dbrefs);
157 * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross
158 * reference if (!refs.contains(rfs[r].getSource())) {
159 * refs.addElement(rfs[r].getSource()); } } }
161 public static boolean hasCdnaMap(SequenceI[] seqs)
163 String[] reftypes = findSequenceXrefTypes(false, seqs);
164 for (int s = 0; s < reftypes.length; s++)
166 if (reftypes.equals(DBRefSource.EMBLCDS))
175 public static SequenceI[] getCdnaMap(SequenceI[] seqs)
177 Vector cseqs = new Vector();
178 for (int s = 0; s < seqs.length; s++)
180 DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef());
181 for (int c = 0; c < cdna.length; c++)
183 if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
186 .println("TODO: unimplemented sequence retrieval for coding region sequence.");
187 // TODO: retrieve CDS dataset sequences
188 // need global dataset sequence retriever/resolver to reuse refs
189 // and construct Mapping entry.
190 // insert gaps in CDS according to peptide gaps.
191 // add gapped sequence to cseqs
195 if (cseqs.size() > 0)
197 SequenceI[] rsqs = new SequenceI[cseqs.size()];
198 cseqs.copyInto(rsqs);
211 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
214 return findXrefSequences(seqs, dna, source, null);
// Collects the sequences cross-referenced by seqs into rseqs and, when any
// were collected, wraps them in a new Alignment (ral); the codon frame cf is
// attached to it when cf.getProtMappings() is non-null.
// NOTE(review): this view of the method omits lines (braces, short
// statements, the final return - presumably "return ral"). The comments
// added below describe only the statements that are visible.
223 * alignment to search for product sequences.
224 * @return products (as dataset sequences)
226 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
227 String source, AlignmentI dataset)
229 Vector rseqs = new Vector();
230 Alignment ral = null;
231 AlignedCodonFrame cf = new AlignedCodonFrame(0); // nominal width
232 for (int s = 0; s < seqs.length; s++)
234 SequenceI dss = seqs[s];
// follow getDatasetSequence() links until a sequence without one is reached
235 while (dss.getDatasetSequence() != null)
237 dss = dss.getDatasetSequence();
239 boolean found = false;
// references of dss selected by findXDbRefs for the given dna flag
240 DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef());
// no direct references, but a dataset was supplied: search the dataset using
// the references selected with the inverted dna flag
241 if ((xrfs == null || xrfs.length == 0) && dataset != null)
243 System.out.println("Attempting to find ds Xrefs refs.");
244 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less
252 // filter for desired source xref here
253 found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
256 for (int r = 0; xrfs != null && r < xrfs.length; r++)
// a requested source that differs from this reference's source is filtered
// (the branch body is not visible here; presumably the reference is skipped)
258 if (source != null && !source.equals(xrfs[r].getSource()))
260 if (xrfs[r].hasMap())
262 if (xrfs[r].getMap().getTo() != null)
// the mapping's target sequence is copied and the copy added to the products
264 Sequence rsq = new Sequence(xrfs[r].getMap().getTo());
265 rseqs.addElement(rsq);
// the addMap calls below follow this test of differing from/to ratios
266 if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r]
267 .getMap().getMap().getToRatio())
269 // get sense of map correct for adding to product alignment.
272 // map is from dna seq to a protein product
273 cf.addMap(dss, rsq, xrfs[r].getMap().getMap());
277 // map should be from protein seq to its coding dna
278 cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());
286 // do a bit more work - search for sequences with references matching
287 // xrefs on this sequence.
// uses the five-argument searchDataset overload; its result is OR-ed into found
290 found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf); // ,false,!dna);
292 xrfs[r] = null; // we've recovered seqs for this one.
298 if (xrfs != null && xrfs.length > 0)
300 // Try and get the sequence reference...
302 * Ideal world - we ask for a sequence fetcher implementation here if
303 * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
// a new SequenceFetcher is constructed at this point on each pass
305 ASequenceFetcher sftch = new SequenceFetcher();
306 SequenceI[] retrieved = null;
308 for (int r = 0; r < xrfs.length; r++)
310 // filter out any irrelevant or irretrievable references
312 || ((source != null && !source.equals(xrfs[r]
313 .getSource())) || !sftch.isFetchable(xrfs[r]
323 .println("Attempting to retrieve cross referenced sequences.");
324 DBRefEntry[] t = new DBRefEntry[l];
326 for (int r = 0; r < xrfs.length; r++)
// the xrfs array is handed to the fetcher in a single call
334 retrieved = sftch.getSequences(xrfs); // problem here is we don't
335 // know which of xrfs
338 } catch (Exception e)
341 .println("Problem whilst retrieving cross references for Sequence : "
342 + seqs[s].getName());
345 if (retrieved != null)
347 for (int rs = 0; rs < retrieved.length; rs++)
349 // TODO: examine each sequence for 'redundancy'
350 jalview.datamodel.DBRefEntry[] dbr = retrieved[rs]
352 if (dbr != null && dbr.length > 0)
354 for (int di = 0; di < dbr.length; di++)
356 // find any entry where we should put in the sequence being
357 // cross-referenced into the map
358 jalview.datamodel.Mapping map = dbr[di].getMap();
361 if (map.getTo() != null && map.getMap() != null)
363 // should search the local dataset to find any existing
364 // candidates for To !
367 // compare ms with dss and replace with dss in mapping
368 // if map is congruent
// the same range [sf, st] is extracted from the mapped sequence and from dss
369 SequenceI ms = map.getTo();
370 int sf = map.getMap().getToLowest();
371 int st = map.getMap().getToHighest();
372 SequenceI mappedrg = ms.getSubSequence(sf, st);
373 SequenceI loc = dss.getSubSequence(sf, st);
// congruent means: the mapped range is non-empty and textually equal to loc
374 if (mappedrg.getLength() > 0
375 && mappedrg.getSequenceAsString().equals(
376 loc.getSequenceAsString()))
379 .println("Mapping updated for retrieved crossreference");
380 // method to update all refs of existing To on
381 // retrieved sequence with dss and merge any props
385 } catch (Exception e)
388 .println("Exception when consolidating Mapped sequence set...");
389 e.printStackTrace(System.err);
// the retrieved sequence is added to the products after updatePDBIds()
395 retrieved[rs].updatePDBIds();
396 rseqs.addElement(retrieved[rs]);
// an Alignment is only built when at least one product was collected
403 if (rseqs.size() > 0)
405 SequenceI[] rsqs = new SequenceI[rseqs.size()];
406 rseqs.copyInto(rsqs);
407 ral = new Alignment(rsqs);
// cf is assigned once above and is not reassigned in the visible lines
408 if (cf != null && cf.getProtMappings() != null)
410 ral.addCodonFrame(cf);
417 * find references to lrfs in the cross-reference set of each sequence in
418 * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
419 * based on source and accession string only - Map and Version are nulled.
425 * @return true if matches were found.
427 private static boolean searchDatasetXrefs(SequenceI sequenceI,
428 boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs,
429 AlignedCodonFrame cf)
431 boolean found = false;
434 for (int i = 0; i < lrfs.length; i++)
436 DBRefEntry xref = new DBRefEntry(lrfs[i]);
438 xref.setVersion(null);
440 found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
446 * search a given sequence dataset for references matching cross-references to
453 * set of unique sequences
455 * @return true if one or more unique sequences were found and added
457 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
458 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
460 return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
// Scans the sequences of dataset for DBRefEntrys matching xrf; sequences that
// match and are not yet in rseqs are added to rseqs, and mappings are
// registered on cf when one was supplied.
// NOTE(review): this view of the method omits lines (braces, short
// statements, returns). The comments added below describe only the
// statements that are visible.
464 * TODO: generalise to different protein classifications Search dataset for
465 * DBRefEntrys matching the given one (xrf) and add the associated sequence to
473 * - search all references or only subset
475 * search dna or protein xrefs (if direct=false)
476 * @return true if relationship found and sequence added.
478 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
479 AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf,
480 boolean direct, boolean dna)
482 boolean found = false;
// one-element array handed to Comparison.isNucleotide further down
483 SequenceI[] typer = new SequenceI[1];
486 if (dataset.getSequences() == null)
488 System.err.println("Empty dataset sequence set - NO VECTOR");
// the loop runs while holding the monitor of the list from getSequences()
492 synchronized (ds = dataset.getSequences())
494 for (SequenceI nxt : ds)
// a dataset entry that itself has a dataset sequence triggers the warning
497 if (nxt.getDatasetSequence() != null)
500 .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
// identity comparison: the query sequence and its dataset sequence are excluded
502 if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
504 // check if this is the correct sequence type
507 boolean isDna = jalview.util.Comparison.isNucleotide(typer);
// direct search: condition holds when isDna equals dna; indirect search:
// condition holds when isDna differs from dna
508 if ((direct && isDna == dna) || (!direct && isDna != dna))
510 // skip this sequence because it is same molecule type
515 // look for direct or indirect references in common
516 DBRefEntry[] poss = nxt.getDBRef(), cands = null;
// first form searches all of nxt's references (presumably the direct branch);
// the second restricts them with findXDbRefs before searching
519 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
523 poss = CrossRef.findXDbRefs(dna, poss); //
524 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
// only sequences not already present in rseqs are added
528 if (!rseqs.contains(nxt))
530 rseqs.addElement(nxt);
531 boolean foundmap = cf != null; // don't search if we aren't
533 // a codon map object
// because foundmap starts as (cf != null), the loop body never runs without cf
534 for (int r = 0; foundmap && r < cands.length; r++)
536 if (cands[r].hasMap())
// requires a mapping target and differing from/to ratios
538 if (cands[r].getMap().getTo() != null
539 && cands[r].getMap().getMap().getFromRatio() != cands[r]
540 .getMap().getMap().getToRatio())
543 // get sense of map correct for adding to product
547 // map is from dna seq to a protein product
548 cf.addMap(sequenceI, nxt, cands[r].getMap()
553 // map should be from protein seq to its coding dna
554 cf.addMap(nxt, sequenceI, cands[r].getMap()
555 .getMap().getInverse());
560 // TODO: add mapping between sequences if necessary
572 * precalculate different products that can be found for seqs in dataset and
579 * - don't actually build lists - just get types
580 * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
581 * seqs, AlignmentI dataset, boolean fake) { String types[] =
582 * jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
583 * dataset); if (types != null) { System.out.println("Xref Types for:
584 * "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
585 * System.out.println("Type: " + types[t]); SequenceI[] prod =
586 * jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
587 * System.out.println("Found " + ((prod == null) ? "no" : "" +
588 * prod.length) + " products"); if (prod!=null) { for (int p=0;
589 * p<prod.length; p++) { System.out.println("Prod "+p+":
590 * "+prod[p].getDisplayId(true)); } } } } else {
591 * System.out.println("Trying getProducts for
592 * "+al.getSequenceAt(0).getDisplayId(true));
593 * System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot"));
594 * // have a bash at finding the products amongst all the retrieved
595 * sequences. SequenceI[] prod =
596 * jalview.analysis.CrossRef.findXrefSequences(al
597 * .getSequencesArray(), dna, null, ds); System.out.println("Found " +
598 * ((prod == null) ? "no" : "" + prod.length) + " products"); if
599 * (prod!=null) { // select non-equivalent sequences from dataset list
600 * for (int p=0; p<prod.length; p++) { System.out.println("Prod "+p+":
601 * "+prod[p].getDisplayId(true)); } } } }