2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.datamodel.AlignedCodonFrame;
24 import jalview.datamodel.Alignment;
25 import jalview.datamodel.AlignmentI;
26 import jalview.datamodel.DBRefEntry;
27 import jalview.datamodel.DBRefSource;
28 import jalview.datamodel.Mapping;
29 import jalview.datamodel.Sequence;
30 import jalview.datamodel.SequenceFeature;
31 import jalview.datamodel.SequenceI;
32 import jalview.io.gff.SequenceOntology;
33 import jalview.schemes.ResidueProperties;
34 import jalview.util.DBRefUtils;
35 import jalview.util.MapList;
36 import jalview.util.MappingUtils;
37 import jalview.util.StringUtils;
38 import jalview.ws.SequenceFetcher;
39 import jalview.ws.seqfetcher.ASequenceFetcher;
41 import java.util.ArrayList;
42 import java.util.Collections;
43 import java.util.LinkedHashMap;
44 import java.util.List;
45 import java.util.Map.Entry;
46 import java.util.Vector;
49 * Functions for cross-referencing sequence databases. user must first specify
50 * if cross-referencing from protein or dna (set dna==true)
58 * Select just the DNA or protein references for a protein or dna sequence
61 * if true, select references from DNA (i.e. Protein databases), else
62 * DNA database references
64 * a set of references to select from
67 public static DBRefEntry[] findXDbRefs(boolean fromDna, DBRefEntry[] refs)
69 return DBRefUtils.selectRefs(refs, fromDna ? DBRefSource.PROTEINDBS
70 : DBRefSource.DNACODINGDBS);
71 // could attempt to find other cross
72 // refs here - ie PDB xrefs
73 // (not dna, not protein seq)
78 * true if seqs are DNA seqs
80 * @return a list of sequence database cross reference source types
82 public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
84 return findSequenceXrefTypes(dna, seqs, null);
88 * Indirect references are references from other sequences from the dataset to
89 * any of the direct DBRefEntrys on the given sequences.
92 * true if seqs are DNA seqs
94 * @return a list of sequence database cross reference source types
96 public static String[] findSequenceXrefTypes(boolean dna,
97 SequenceI[] seqs, AlignmentI dataset)
99 String[] dbrefs = null;
100 List<String> refs = new ArrayList<String>();
101 for (SequenceI seq : seqs)
106 while (dss.getDatasetSequence() != null)
108 dss = dss.getDatasetSequence();
110 DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef());
113 for (DBRefEntry ref : rfs)
115 if (!refs.contains(ref.getSource()))
117 refs.add(ref.getSource());
123 // search for references to this sequence's direct references.
124 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRef());
125 List<SequenceI> rseqs = new ArrayList<SequenceI>();
126 CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs,
127 null); // don't need to specify codon frame for mapping here
128 for (SequenceI rs : rseqs)
130 DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
133 for (DBRefEntry ref : xrs)
135 if (!refs.contains(ref.getSource()))
137 refs.add(ref.getSource());
141 // looks like copy and paste - change rfs to xrs?
142 // for (int r = 0; rfs != null && r < rfs.length; r++)
144 // if (!refs.contains(rfs[r].getSource()))
146 // refs.add(rfs[r].getSource());
155 dbrefs = new String[refs.size()];
156 refs.toArray(dbrefs);
161 public static boolean hasCdnaMap(SequenceI[] seqs)
163 // TODO unused - remove?
164 String[] reftypes = findSequenceXrefTypes(false, seqs);
165 for (int s = 0; s < reftypes.length; s++)
167 if (reftypes.equals(DBRefSource.EMBLCDS))
176 public static SequenceI[] getCdnaMap(SequenceI[] seqs)
178 // TODO unused - remove?
179 Vector cseqs = new Vector();
180 for (int s = 0; s < seqs.length; s++)
182 DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef());
183 for (int c = 0; c < cdna.length; c++)
185 if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
188 .println("TODO: unimplemented sequence retrieval for coding region sequence.");
189 // TODO: retrieve CDS dataset sequences
190 // need global dataset sequence retriever/resolver to reuse refs
191 // and construct Mapping entry.
192 // insert gaps in CDS according to peptide gaps.
193 // add gapped sequence to cseqs
197 if (cseqs.size() > 0)
199 SequenceI[] rsqs = new SequenceI[cseqs.size()];
200 cseqs.copyInto(rsqs);
213 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
216 return findXrefSequences(seqs, dna, source, null);
222 * sequences whose xrefs are being retrieved
224 * true if sequences are nucleotide
227 * alignment to search for product sequences.
228 * @return products (as dataset sequences)
230 public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
231 String source, AlignmentI dataset)
233 List<SequenceI> rseqs = new ArrayList<SequenceI>();
234 AlignedCodonFrame cf = new AlignedCodonFrame();
235 for (SequenceI seq : seqs)
238 while (dss.getDatasetSequence() != null)
240 dss = dss.getDatasetSequence();
242 boolean found = false;
243 DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef());
244 if ((xrfs == null || xrfs.length == 0) && dataset != null)
246 System.out.println("Attempting to find ds Xrefs refs.");
247 // FIXME should be dss not seq here?
248 DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRef());
249 // less ambiguous would be a 'find primary dbRefEntry' method.
250 // filter for desired source xref here
251 found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
254 for (int r = 0; xrfs != null && r < xrfs.length; r++)
256 DBRefEntry xref = xrfs[r];
257 if (source != null && !source.equals(xref.getSource()))
263 if (xref.getMap().getTo() != null)
265 SequenceI rsq = new Sequence(xref.getMap().getTo());
267 if (xref.getMap().getMap().getFromRatio() != xref
268 .getMap().getMap().getToRatio())
270 // get sense of map correct for adding to product alignment.
273 // map is from dna seq to a protein product
274 cf.addMap(dss, rsq, xref.getMap().getMap());
278 // map should be from protein seq to its coding dna
279 cf.addMap(rsq, dss, xref.getMap().getMap().getInverse());
283 * compute peptide variants from dna variants
285 rsq.createDatasetSequence();
286 computeProteinVariants(seq, rsq, xref.getMap().getMap());
293 // do a bit more work - search for sequences with references matching
294 // xrefs on this sequence.
297 found |= searchDataset(dss, xref, dataset, rseqs, cf); // ,false,!dna);
300 xrfs[r] = null; // we've recovered seqs for this one.
307 if (xrfs != null && xrfs.length > 0)
309 // Try and get the sequence reference...
311 * Ideal world - we ask for a sequence fetcher implementation here if
312 * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
314 ASequenceFetcher sftch = new SequenceFetcher();
315 SequenceI[] retrieved = null;
317 for (int r = 0; r < xrfs.length; r++)
319 // filter out any irrelevant or irretrievable references
321 || ((source != null && !source.equals(xrfs[r]
322 .getSource())) || !sftch.isFetchable(xrfs[r]
332 .println("Attempting to retrieve cross referenced sequences.");
333 DBRefEntry[] t = new DBRefEntry[l];
335 for (int r = 0; r < xrfs.length; r++)
345 retrieved = sftch.getSequences(xrfs, !dna);
346 // problem here is we don't know which of xrfs resulted in which
348 } catch (Exception e)
351 .println("Problem whilst retrieving cross references for Sequence : "
355 if (retrieved != null)
357 for (int rs = 0; rs < retrieved.length; rs++)
359 // TODO: examine each sequence for 'redundancy'
360 DBRefEntry[] dbr = retrieved[rs].getDBRef();
361 if (dbr != null && dbr.length > 0)
363 for (int di = 0; di < dbr.length; di++)
365 // find any entry where we should put in the sequence being
366 // cross-referenced into the map
367 Mapping map = dbr[di].getMap();
370 if (map.getTo() != null && map.getMap() != null)
372 // should search the local dataset to find any existing
373 // candidates for To !
376 // compare ms with dss and replace with dss in mapping
377 // if map is congruent
378 SequenceI ms = map.getTo();
379 int sf = map.getMap().getToLowest();
380 int st = map.getMap().getToHighest();
381 SequenceI mappedrg = ms.getSubSequence(sf, st);
382 SequenceI loc = dss.getSubSequence(sf, st);
383 if (mappedrg.getLength() > 0
384 && mappedrg.getSequenceAsString().equals(
385 loc.getSequenceAsString()))
388 .println("Mapping updated for retrieved crossreference");
389 // method to update all refs of existing To on
390 // retrieved sequence with dss and merge any props
394 } catch (Exception e)
397 .println("Exception when consolidating Mapped sequence set...");
398 e.printStackTrace(System.err);
404 retrieved[rs].updatePDBIds();
405 rseqs.add(retrieved[rs]);
413 Alignment ral = null;
414 if (rseqs.size() > 0)
416 SequenceI[] rsqs = new SequenceI[rseqs.size()];
418 ral = new Alignment(rsqs);
419 if (cf != null && !cf.isEmpty())
421 ral.addCodonFrame(cf);
428 * find references to lrfs in the cross-reference set of each sequence in
429 * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
430 * based on source and accession string only - Map and Version are nulled.
436 * @return true if matches were found.
438 private static boolean searchDatasetXrefs(SequenceI sequenceI,
439 boolean dna, DBRefEntry[] lrfs, AlignmentI dataset,
440 List<SequenceI> rseqs, AlignedCodonFrame cf)
442 boolean found = false;
447 for (int i = 0; i < lrfs.length; i++)
449 DBRefEntry xref = new DBRefEntry(lrfs[i]);
451 xref.setVersion(null);
453 found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
459 * search a given sequence dataset for references matching cross-references to
466 * set of unique sequences
468 * @return true if one or more unique sequences were found and added
470 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
471 AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf)
473 return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
477 * TODO: generalise to different protein classifications Search dataset for
478 * DBRefEntrys matching the given one (xrf) and add the associated sequence to
486 * - search all references or only subset
488 * search dna or protein xrefs (if direct=false)
489 * @return true if relationship found and sequence added.
491 public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
492 AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf,
493 boolean direct, boolean dna)
495 boolean found = false;
496 SequenceI[] typer = new SequenceI[1];
501 if (dataset.getSequences() == null)
503 System.err.println("Empty dataset sequence set - NO VECTOR");
507 synchronized (ds = dataset.getSequences())
509 for (SequenceI nxt : ds)
513 if (nxt.getDatasetSequence() != null)
516 .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
518 if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
520 // check if this is the correct sequence type
523 boolean isDna = jalview.util.Comparison.isNucleotide(typer);
524 if ((direct && isDna == dna) || (!direct && isDna != dna))
526 // skip this sequence because it is same molecule type
531 // look for direct or indirect references in common
532 DBRefEntry[] poss = nxt.getDBRef(), cands = null;
535 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
539 poss = CrossRef.findXDbRefs(dna, poss); //
540 cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
544 if (!rseqs.contains(nxt))
547 boolean foundmap = cf != null;
548 // don't search if we aren't given a codon map object
549 for (int r = 0; foundmap && r < cands.length; r++)
551 if (cands[r].hasMap())
553 if (cands[r].getMap().getTo() != null
554 && cands[r].getMap().getMap().getFromRatio() != cands[r]
555 .getMap().getMap().getToRatio())
558 // get sense of map correct for adding to product
562 // map is from dna seq to a protein product
563 cf.addMap(sequenceI, nxt, cands[r].getMap()
568 // map should be from protein seq to its coding dna
569 cf.addMap(nxt, sequenceI, cands[r].getMap()
570 .getMap().getInverse());
575 // TODO: add mapping between sequences if necessary
588 * Computes variants in peptide product generated by variants in dna, and adds
589 * them as sequence_variant features on the protein sequence. Returns the
590 * number of variant features added.
594 * @param dnaToProtein
596 protected static int computeProteinVariants(SequenceI dnaSeq,
597 SequenceI peptide, MapList dnaToProtein)
600 * map from peptide position to all variant features of the codon for it
601 * LinkedHashMap ensures we add the peptide features in sequence order
603 LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
604 SequenceOntology so = SequenceOntology.getInstance();
606 SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
607 if (dnaFeatures == null)
612 int[] lastCodon = null;
613 int lastPeptidePostion = 0;
616 * build a map of codon variations for peptides
618 for (SequenceFeature sf : dnaFeatures)
620 int dnaCol = sf.getBegin();
621 if (dnaCol != sf.getEnd())
623 // not handling multi-locus variant features
626 if (so.isSequenceVariant(sf.getType()))
628 int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
631 // feature doesn't lie within coding region
634 int peptidePosition = mapsTo[0];
635 String[][] codonVariants = variants.get(peptidePosition);
636 if (codonVariants == null)
638 codonVariants = new String[3][];
639 variants.put(peptidePosition, codonVariants);
643 * extract dna variants to a string array
645 String alls = (String) sf.getValue("alleles");
650 String[] alleles = alls.split(",");
653 * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10]
655 int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
656 : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
657 peptidePosition, peptidePosition));
658 lastPeptidePostion = peptidePosition;
662 * save nucleotide (and this variant) for each codon position
664 for (int codonPos = 0; codonPos < 3; codonPos++)
666 String nucleotide = String.valueOf(dnaSeq
667 .getCharAt(codon[codonPos] - 1));
668 if (codon[codonPos] == dnaCol)
671 * record current dna base and its alleles
673 String[] dnaVariants = new String[alleles.length + 1];
674 dnaVariants[0] = nucleotide;
675 System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length);
676 codonVariants[codonPos] = dnaVariants;
678 else if (codonVariants[codonPos] == null)
681 * record current dna base only
682 * (at least until we find any variation and overwrite it)
684 codonVariants[codonPos] = new String[] { nucleotide };
691 * scan codon variations, compute peptide variants and add to peptide sequence
694 for (Entry<Integer, String[][]> variant : variants.entrySet())
696 int peptidePos = variant.getKey();
697 String[][] codonVariants = variant.getValue();
698 String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
699 List<String> peptideVariants = computePeptideVariants(codonVariants,
701 if (!peptideVariants.isEmpty())
703 Collections.sort(peptideVariants);
704 String desc = StringUtils.listToDelimitedString(peptideVariants,
706 SequenceFeature sf = new SequenceFeature(
707 SequenceOntology.SEQUENCE_VARIANT, desc, peptidePos,
708 peptidePos, Float.NaN, null);
709 peptide.getDatasetSequence().addSequenceFeature(sf);
717 * Returns a non-redundant list of all peptide translations generated by the
718 * given dna variants, excluding the current residue value
720 * @param codonVariants
721 * an array of base values for codon positions 1, 2, 3
723 * the current residue translation
726 protected static List<String> computePeptideVariants(
727 String[][] codonVariants, String residue)
729 List<String> result = new ArrayList<String>();
730 for (String base1 : codonVariants[0])
732 for (String base2 : codonVariants[1])
734 for (String base3 : codonVariants[2])
736 String codon = base1 + base2 + base3;
737 // TODO: report frameshift/insertion/deletion
738 // and multiple-base variants?!
739 String peptide = codon.contains("-") ? "-" : ResidueProperties
740 .codonTranslate(codon);
741 if (peptide != null && !result.contains(peptide)
742 && !peptide.equals(residue))
753 * Computes a list of all peptide variants given dna variants
756 * the coding dna sequence
757 * @param codonVariants
758 * variant features for each codon position (null if no variant)
760 * the canonical protein translation
763 protected static List<String> computePeptideVariants(SequenceI dnaSeq,
764 SequenceFeature[] codonVariants, String residue)
766 List<String> result = new ArrayList<String>();
767 int[][] dnaVariants = new int[3][];
768 for (int i = 0; i < 3; i++)
772 // TODO Auto-generated method stub
777 * precalculate different products that can be found for seqs in dataset and
784 * - don't actually build lists - just get types
785 * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
786 * seqs, AlignmentI dataset, boolean fake) { String types[] =
787 * jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
788 * dataset); if (types != null) { System.out.println("Xref Types for:
789 * "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
790 * System.out.println("Type: " + types[t]); SequenceI[] prod =
791 * jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
792 * System.out.println("Found " + ((prod == null) ? "no" : "" +
793 * prod.length) + " products"); if (prod!=null) { for (int p=0;
794 * p<prod.length; p++) { System.out.println("Prod "+p+":
795 * "+prod[p].getDisplayId(true)); } } } } else {
796 * System.out.println("Trying getProducts for
797 * "+al.getSequenceAt(0).getDisplayId(true));
798 * System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot"));
799 * // have a bash at finding the products amongst all the retrieved
800 * sequences. SequenceI[] prod =
801 * jalview.analysis.CrossRef.findXrefSequences(al
802 * .getSequencesArray(), dna, null, ds); System.out.println("Found " +
803 * ((prod == null) ? "no" : "" + prod.length) + " products"); if
804 * (prod!=null) { // select non-equivalent sequences from dataset list
805 * for (int p=0; p<prod.length; p++) { System.out.println("Prod "+p+":
806 * "+prod[p].getDisplayId(true)); } } } }