--- /dev/null
+package jalview.analysis;\r
+\r
+import java.util.Enumeration;\r
+import java.util.Vector;\r
+import java.util.Hashtable;\r
+\r
+import jalview.datamodel.AlignedCodonFrame;\r
+import jalview.datamodel.Alignment;\r
+import jalview.datamodel.AlignmentI;\r
+import jalview.datamodel.DBRefSource;\r
+import jalview.datamodel.DBRefEntry;\r
+import jalview.datamodel.Sequence;\r
+import jalview.datamodel.SequenceI;\r
+import jalview.ws.ASequenceFetcher;\r
+import jalview.ws.SequenceFetcher;\r
+\r
+/**\r
+ * Functions for cross-referencing sequence databases. The caller must state\r
+ * whether it is cross-referencing from protein or from DNA sequences (pass\r
+ * dna == true when the input sequences are DNA).\r
+ * \r
+ * @author JimP\r
+ * \r
+ */\r
+public class CrossRef\r
+{\r
+ /**\r
+  * Select the database references that point at the opposite molecule type:\r
+  * protein database references for a DNA sequence, coding DNA database\r
+  * references for a protein sequence.\r
+  * \r
+  * @param dna\r
+  *          true to select references whose source is in\r
+  *          DBRefSource.PROTEINDBS, false to select those in\r
+  *          DBRefSource.DNACODINGDBS\r
+  * @param rfs\r
+  *          the database references to filter\r
+  * @return the result of DBRefUtils.selectRefs for the chosen sources (other\r
+  *         methods in this class treat that result as possibly null)\r
+  */\r
+ public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)\r
+ {\r
+   if (dna)\r
+   {\r
+     return jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);\r
+   }\r
+   // could attempt to find other cross refs and return here - ie PDB xrefs\r
+   // (not dna, not protein seq)\r
+   return jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DNACODINGDBS);\r
+ }\r
+\r
+\r
+ /**\r
+  * Partition a set of database references by class of source database.\r
+  * \r
+  * @param rfs\r
+  *          the database references to classify\r
+  * @return a table keyed by DBRefSource.PROTEINDBS, DBRefSource.DNACODINGDBS\r
+  *         and DBRefSource.DOMAINDBS, each mapped to the DBRefEntry[] selected\r
+  *         for that class. A class for which selectRefs yields null has no\r
+  *         entry, because Hashtable rejects null values.\r
+  */\r
+ public static Hashtable classifyDbRefs(DBRefEntry[] rfs)\r
+ {\r
+   Hashtable classes = new Hashtable();\r
+   putIfSelected(classes, DBRefSource.PROTEINDBS,\r
+           jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS));\r
+   putIfSelected(classes, DBRefSource.DNACODINGDBS,\r
+           jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DNACODINGDBS));\r
+   putIfSelected(classes, DBRefSource.DOMAINDBS,\r
+           jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DOMAINDBS));\r
+   // classes.put(OTHER, )\r
+   return classes;\r
+ }\r
+\r
+ /**\r
+  * Store selected under dbClass unless selected is null (Hashtable.put throws\r
+  * NullPointerException for a null value).\r
+  */\r
+ private static void putIfSelected(Hashtable classes, Object dbClass,\r
+         DBRefEntry[] selected)\r
+ {\r
+   if (selected != null)\r
+   {\r
+     classes.put(dbClass, selected);\r
+   }\r
+ }\r
+\r
+ /**\r
+  * Convenience form of the three-argument findSequenceXrefTypes that supplies\r
+  * a null dataset.\r
+  * \r
+  * @param dna\r
+  *          true if seqs are DNA seqs\r
+  * @param seqs\r
+  *          the sequences whose cross-references are examined\r
+  * @return a list of sequence database cross reference source types\r
+  */\r
+ public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)\r
+ {\r
+   final AlignmentI noDataset = null;\r
+   return findSequenceXrefTypes(dna, seqs, noDataset);\r
+ }\r
+ /**\r
+  * Collect the distinct database sources that the given sequences are\r
+  * cross-referenced to. Direct references are read from the root dataset\r
+  * sequence of each entry in seqs. Indirect references are references from\r
+  * other sequences from the dataset to any of the direct DBRefEntrys on the\r
+  * given sequences; they are only searched for when dataset is not null.\r
+  * \r
+  * @param dna\r
+  *          true if seqs are DNA seqs\r
+  * @param seqs\r
+  *          the sequences whose cross-references are examined\r
+  * @param dataset\r
+  *          alignment searched for indirect references, or null to skip that\r
+  *          search\r
+  * @return a list of sequence database cross reference source types, or null\r
+  *         if none were found\r
+  */\r
+ public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs, AlignmentI dataset)\r
+ {\r
+   Vector refs = new Vector();\r
+   for (int s = 0; s < seqs.length; s++)\r
+   {\r
+     // walk up to the root dataset sequence\r
+     SequenceI dss = seqs[s];\r
+     while (dss.getDatasetSequence() != null)\r
+     {\r
+       dss = dss.getDatasetSequence();\r
+     }\r
+     addUniqueSources(refs, findXDbRefs(dna, dss.getDBRef()));\r
+     if (dataset != null)\r
+     {\r
+       // search for references to this sequence's direct references.\r
+       DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef());\r
+       Vector rseqs = new Vector();\r
+       // don't need to specify codon frame for mapping here\r
+       CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs, null);\r
+       Enumeration lr = rseqs.elements();\r
+       while (lr.hasMoreElements())\r
+       {\r
+         SequenceI rs = (SequenceI) lr.nextElement();\r
+         // use the referring sequence's own cross-references; scanning the\r
+         // outer sequence's refs again here would never add indirect sources\r
+         addUniqueSources(refs, findXDbRefs(dna, rs.getDBRef()));\r
+       }\r
+     }\r
+   }\r
+   if (refs.size() == 0)\r
+   {\r
+     return null;\r
+   }\r
+   String[] dbrefs = new String[refs.size()];\r
+   refs.copyInto(dbrefs);\r
+   return dbrefs;\r
+ }\r
+\r
+ /**\r
+  * Append the source of every entry to refs unless it is already present.\r
+  * A null entries array is ignored.\r
+  */\r
+ private static void addUniqueSources(Vector refs, DBRefEntry[] entries)\r
+ {\r
+   if (entries == null)\r
+   {\r
+     return;\r
+   }\r
+   for (int r = 0; r < entries.length; r++)\r
+   {\r
+     String src = entries[r].getSource();\r
+     if (!refs.contains(src))\r
+     {\r
+       refs.addElement(src);\r
+     }\r
+   }\r
+ }\r
+\r
+ /*\r
+ * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross\r
+ * reference if (!refs.contains(rfs[r].getSource())) {\r
+ * refs.addElement(rfs[r].getSource()); } } }\r
+ */\r
+ /**\r
+  * Test whether any of the given sequences has a cross-reference whose source\r
+  * is DBRefSource.EMBLCDS.\r
+  * \r
+  * @param seqs\r
+  *          sequences to examine; they are treated as non-DNA, since the\r
+  *          cross-reference types are looked up with dna == false\r
+  * @return true if an EMBLCDS cross-reference source was found\r
+  */\r
+ public static boolean hasCdnaMap(SequenceI[] seqs)\r
+ {\r
+   String[] reftypes = findSequenceXrefTypes(false, seqs);\r
+   if (reftypes == null)\r
+   {\r
+     // findSequenceXrefTypes returns null when no sources were found\r
+     return false;\r
+   }\r
+   for (int s = 0; s < reftypes.length; s++)\r
+   {\r
+     // compare each source name, not the array object itself\r
+     if (DBRefSource.EMBLCDS.equals(reftypes[s]))\r
+     {\r
+       return true;\r
+     }\r
+   }\r
+   return false;\r
+ }\r
+\r
+ /**\r
+  * Placeholder for retrieving the CDS sequences mapped to the given\r
+  * sequences. The retrieval step is not implemented: nothing is ever added to\r
+  * the result list, so this method currently always returns null.\r
+  * \r
+  * NOTE(review): the references are selected with findXDbRefs(true, ...),\r
+  * i.e. DBRefSource.PROTEINDBS, and then compared with DBRefSource.EMBLCDS,\r
+  * whereas hasCdnaMap looks EMBLCDS up with dna == false - confirm which flag\r
+  * is intended before implementing the body.\r
+  * \r
+  * @param seqs\r
+  *          sequences whose EMBLCDS cross-references would be resolved\r
+  * @return the retrieved sequences, or null if there are none\r
+  */\r
+ public static SequenceI[] getCdnaMap(SequenceI[] seqs)\r
+ {\r
+   Vector cseqs = new Vector();\r
+   for (int s = 0; s < seqs.length; s++)\r
+   {\r
+     DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef());\r
+     // findXDbRefs results are null-checked elsewhere in this class, so guard\r
+     // against null here as well\r
+     for (int c = 0; cdna != null && c < cdna.length; c++)\r
+     {\r
+       if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))\r
+       {\r
+         // retrieve CDS dataset sequences\r
+         // need global dataset sequence retriever/resolver to reuse refs\r
+         // and construct Mapping entry.\r
+         // insert gaps in CDS according to peptide gaps.\r
+         // add gapped sequence to cseqs\r
+       }\r
+     }\r
+   }\r
+   if (cseqs.size() > 0)\r
+   {\r
+     SequenceI[] rsqs = new SequenceI[cseqs.size()];\r
+     cseqs.copyInto(rsqs);\r
+     return rsqs;\r
+   }\r
+   return null;\r
+\r
+ }\r
+\r
+ /**\r
+  * Convenience form of the four-argument findXrefSequences that supplies a\r
+  * null dataset.\r
+  * \r
+  * @param seqs\r
+  *          sequences whose cross-referenced products are wanted\r
+  * @param dna\r
+  *          true if seqs are DNA seqs\r
+  * @param source\r
+  *          database source to restrict the search to, or null for any\r
+  * @return the result of the four-argument findXrefSequences\r
+  */\r
+ public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,\r
+         String source)\r
+ {\r
+   final AlignmentI noDataset = null;\r
+   return findXrefSequences(seqs, dna, source, noDataset);\r
+ }\r
+\r
+ /**\r
+  * Find the sequences cross-referenced from seqs. For each sequence the\r
+  * cross-references of its root dataset sequence are resolved in this order:\r
+  * a sequence attached to the reference's own map; matching sequences in\r
+  * dataset; and, if nothing was found, retrieval through a SequenceFetcher.\r
+  * \r
+  * @param seqs\r
+  *          sequences whose cross-referenced products are wanted\r
+  * @param dna\r
+  *          true if seqs are DNA seqs\r
+  * @param source\r
+  *          database source to restrict the search to, or null for any\r
+  * @param dataset\r
+  *          alignment to search for product sequences; may be null, in which\r
+  *          case no dataset search is made\r
+  * @return products (as dataset sequences), or null if none were found\r
+  */\r
+ public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,\r
+         String source, AlignmentI dataset)\r
+ {\r
+   Vector rseqs = new Vector();\r
+   // nominal width - dataset is optional, so do not dereference it when null\r
+   AlignedCodonFrame cf = new AlignedCodonFrame(\r
+           dataset == null ? 0 : dataset.getWidth());\r
+   for (int s = 0; s < seqs.length; s++)\r
+   {\r
+     SequenceI dss = seqs[s];\r
+     while (dss.getDatasetSequence() != null)\r
+     {\r
+       dss = dss.getDatasetSequence();\r
+     }\r
+     boolean found = false;\r
+     DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef());\r
+     if ((xrfs == null || xrfs.length == 0) && dataset != null)\r
+     {\r
+       System.out.println("Attempting to find ds Xrefs refs.");\r
+       // less ambiguous would be a 'find primary dbRefEntry' method.\r
+       DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef());\r
+       found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset, rseqs, cf);\r
+     }\r
+     for (int r = 0; xrfs != null && r < xrfs.length; r++)\r
+     {\r
+       if (source != null && !source.equals(xrfs[r].getSource()))\r
+       {\r
+         continue;\r
+       }\r
+       if (xrfs[r].hasMap())\r
+       {\r
+         if (xrfs[r].getMap().getTo() != null)\r
+         {\r
+           Sequence rsq = new Sequence(xrfs[r].getMap().getTo());\r
+           rseqs.addElement(rsq);\r
+           if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r].getMap().getMap().getToRatio())\r
+           {\r
+             // get sense of map correct for adding to product alignment.\r
+             if (dna)\r
+             {\r
+               // map is from dna seq to a protein product\r
+               cf.addMap(dss, rsq, xrfs[r].getMap().getMap());\r
+             }\r
+             else\r
+             {\r
+               // map should be from protein seq to its coding dna\r
+               cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());\r
+             }\r
+           }\r
+           found = true;\r
+         }\r
+       }\r
+       else if (dataset != null)\r
+       {\r
+         // do a bit more work - search for sequences with references matching\r
+         // xrefs on this sequence. Accumulate, so that a miss on a later\r
+         // reference does not discard an earlier hit.\r
+         found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf);\r
+       }\r
+     }\r
+     if (!found && xrfs != null && xrfs.length > 0)\r
+     {\r
+       // Try and get the sequence reference...\r
+       retrieveXrefs(seqs[s], xrfs, source, rseqs);\r
+     }\r
+   }\r
+   if (rseqs.size() == 0)\r
+   {\r
+     return null;\r
+   }\r
+   SequenceI[] rsqs = new SequenceI[rseqs.size()];\r
+   rseqs.copyInto(rsqs);\r
+   Alignment ral = new Alignment(rsqs);\r
+   if (cf.getProtMappings() != null)\r
+   {\r
+     ral.addCodonFrame(cf);\r
+   }\r
+   return ral;\r
+ }\r
+\r
+ /**\r
+  * Fetch the sequences for those entries of xrfs that match source (when it\r
+  * is not null) and are fetchable, and append them to rseqs. The xrfs array\r
+  * itself is left unmodified. A failure during retrieval is reported on\r
+  * System.err and otherwise ignored.\r
+  */\r
+ private static void retrieveXrefs(SequenceI seq, DBRefEntry[] xrfs,\r
+         String source, Vector rseqs)\r
+ {\r
+   /*\r
+    * Ideal world - we ask for a sequence fetcher implementation here if\r
+    * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (\r
+    */\r
+   ASequenceFetcher sftch = new SequenceFetcher();\r
+   // filter out any irrelevant or irretrievable references\r
+   Vector wanted = new Vector();\r
+   for (int r = 0; r < xrfs.length; r++)\r
+   {\r
+     if ((source == null || source.equals(xrfs[r].getSource()))\r
+             && sftch.isFetchable(xrfs[r].getSource()))\r
+     {\r
+       wanted.addElement(xrfs[r]);\r
+     }\r
+   }\r
+   if (wanted.size() == 0)\r
+   {\r
+     return;\r
+   }\r
+   System.out\r
+           .println("Attempting to retrieve cross referenced sequences.");\r
+   DBRefEntry[] t = new DBRefEntry[wanted.size()];\r
+   wanted.copyInto(t);\r
+   SequenceI[] retrieved = null;\r
+   try\r
+   {\r
+     retrieved = sftch.getSequences(t);\r
+   } catch (Exception e)\r
+   {\r
+     System.err\r
+             .println("Problem whilst retrieving cross references for Sequence : "\r
+                     + seq.getName());\r
+     e.printStackTrace();\r
+   }\r
+   if (retrieved != null)\r
+   {\r
+     for (int rs = 0; rs < retrieved.length; rs++)\r
+     {\r
+       rseqs.addElement(retrieved[rs]);\r
+     }\r
+   }\r
+ }\r
+\r
+ /**\r
+  * Find references to lrfs in the cross-reference set of each sequence in\r
+  * dataset (that is not equal to sequenceI). A matching DBRefEntry is\r
+  * identified on source and accession string only - the Map and Version of\r
+  * each query reference are nulled on a copy before searching.\r
+  * \r
+  * @param sequenceI\r
+  *          the sequence whose references are being looked up\r
+  * @param dna\r
+  *          passed to searchDataset to choose which class of cross-references\r
+  *          of each dataset sequence is searched\r
+  * @param lrfs\r
+  *          the references to look for; may be null\r
+  * @param dataset\r
+  *          the alignment to search\r
+  * @param rseqs\r
+  *          matching sequences are appended to this list\r
+  * @param cf\r
+  *          codon frame to receive any mappings found, or null\r
+  * @return true if matches were found for any of lrfs.\r
+  */\r
+ private static boolean searchDatasetXrefs(SequenceI sequenceI, boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)\r
+ {\r
+   if (lrfs == null)\r
+   {\r
+     return false;\r
+   }\r
+   boolean found = false;\r
+   for (int i = 0; i < lrfs.length; i++)\r
+   {\r
+     DBRefEntry xref = new DBRefEntry(lrfs[i]);\r
+     // add in wildcards\r
+     xref.setVersion(null);\r
+     xref.setMap(null);\r
+     // accumulate: a miss on a later reference must not hide an earlier hit\r
+     found |= searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);\r
+   }\r
+   return found;\r
+ }\r
+\r
+\r
+ /**\r
+  * Search a sequence dataset for references matching a cross-reference of the\r
+  * given sequence. Delegates to the seven-argument searchDataset with\r
+  * direct == true and dna == false.\r
+  * \r
+  * @param sequenceI\r
+  *          the sequence the cross-reference belongs to\r
+  * @param xrf\r
+  *          the cross-reference to look for\r
+  * @param dataset\r
+  *          the alignment to search\r
+  * @param rseqs\r
+  *          matching sequences are appended to this list\r
+  * @param cf\r
+  *          codon frame to receive any mappings found\r
+  * @return true if sequences were found and added\r
+  */\r
+ public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,\r
+         AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)\r
+ {\r
+   final boolean direct = true;\r
+   final boolean dna = false;\r
+   return searchDataset(sequenceI, xrf, dataset, rseqs, cf, direct, dna);\r
+ }\r
+ /**\r
+  * TODO: generalise to different protein classifications\r
+  * \r
+  * Search dataset for DBRefEntrys matching the given one (xrf) and add the\r
+  * associated sequence to rseqs. sequenceI itself and its dataset sequence\r
+  * are skipped.\r
+  * \r
+  * @param sequenceI\r
+  *          the sequence the cross-reference belongs to\r
+  * @param xrf\r
+  *          the cross-reference to look for\r
+  * @param dataset\r
+  *          the alignment to search; null gives false\r
+  * @param rseqs\r
+  *          matching sequences are appended to this list\r
+  * @param cf\r
+  *          codon frame to receive mappings held on matching references, or\r
+  *          null to skip the mapping step\r
+  * @param direct\r
+  *          true to search all references of each dataset sequence, false to\r
+  *          search only the subset chosen by findXDbRefs(dna, ...)\r
+  * @param dna\r
+  *          search dna or protein xrefs (if direct=false); also decides the\r
+  *          direction of any mapping added to cf\r
+  * @return true if relationship found and sequence added.\r
+  */\r
+ public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,\r
+         AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf, boolean direct, boolean dna)\r
+ {\r
+   if (dataset == null)\r
+   {\r
+     return false;\r
+   }\r
+   boolean found = false;\r
+   Enumeration e = dataset.getSequences().elements();\r
+   while (e.hasMoreElements())\r
+   {\r
+     SequenceI nxt = (SequenceI) e.nextElement();\r
+     if (nxt == null)\r
+     {\r
+       continue;\r
+     }\r
+     if (nxt.getDatasetSequence() != null)\r
+     {\r
+       System.err\r
+               .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");\r
+     }\r
+     if (nxt == sequenceI || nxt == sequenceI.getDatasetSequence())\r
+     {\r
+       continue;\r
+     }\r
+     DBRefEntry[] poss = direct ? nxt.getDBRef() : CrossRef.findXDbRefs(dna,\r
+             nxt.getDBRef());\r
+     DBRefEntry[] cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);\r
+     if (cands == null)\r
+     {\r
+       continue;\r
+     }\r
+     rseqs.addElement(nxt);\r
+     // only report success when a sequence was actually matched and added\r
+     found = true;\r
+     if (cf == null)\r
+     {\r
+       // don't search if we aren't given a codon map object\r
+       continue;\r
+     }\r
+     for (int r = 0; r < cands.length; r++)\r
+     {\r
+       if (cands[r].hasMap()\r
+               && cands[r].getMap().getTo() != null\r
+               && cands[r].getMap().getMap().getFromRatio() != cands[r].getMap().getMap().getToRatio())\r
+       {\r
+         // get sense of map correct for adding to product alignment.\r
+         if (dna)\r
+         {\r
+           // map is from dna seq to a protein product\r
+           cf.addMap(sequenceI, nxt, cands[r].getMap().getMap());\r
+         }\r
+         else\r
+         {\r
+           // map should be from protein seq to its coding dna\r
+           cf.addMap(nxt, sequenceI, cands[r].getMap().getMap().getInverse());\r
+         }\r
+       }\r
+     }\r
+     // TODO: add mapping between sequences if necessary\r
+   }\r
+   return found;\r
+ }\r
+\r
+ /**\r
+ * NOTE: the method below is commented out and is not compiled.\r
+ * Precalculate the different products that can be found for seqs in dataset\r
+ * and return them.\r
+ * @param dna\r
+ * @param seqs\r
+ * @param dataset\r
+ * @param fake - don't actually build lists - just get types\r
+ * @return\r
+ public static Object[] buildXProductsList(boolean dna, SequenceI[] seqs, AlignmentI dataset, boolean fake)\r
+ {\r
+ String types[] = jalview.analysis.CrossRef.findSequenceXrefTypes(\r
+ dna, seqs, dataset);\r
+ if (types != null)\r
+ {\r
+ System.out.println("Xref Types for: "+(dna ? "dna" : "prot"));\r
+ for (int t = 0; t < types.length; t++)\r
+ {\r
+ System.out.println("Type: " + types[t]);\r
+ SequenceI[] prod = \r
+ jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);\r
+ System.out.println("Found "\r
+ + ((prod == null) ? "no" : "" + prod.length)\r
+ + " products");\r
+ if (prod!=null)\r
+ {\r
+ for (int p=0; p<prod.length; p++)\r
+ {\r
+ System.out.println("Prod "+p+": "+prod[p].getDisplayId(true));\r
+ }\r
+ }\r
+ }\r
+\r
+ } else {\r
+ System.out.println("Trying getProducts for "+al.getSequenceAt(0).getDisplayId(true));\r
+ System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot"));\r
+ // have a bash at finding the products amongst all the retrieved sequences.\r
+ SequenceI[] prod = jalview.analysis.CrossRef.findXrefSequences(al\r
+ .getSequencesArray(), dna, null, ds);\r
+ System.out.println("Found "\r
+ + ((prod == null) ? "no" : "" + prod.length)\r
+ + " products");\r
+ if (prod!=null)\r
+ {\r
+ // select non-equivalent sequences from dataset list\r
+ for (int p=0; p<prod.length; p++)\r
+ {\r
+ System.out.println("Prod "+p+": "+prod[p].getDisplayId(true));\r
+ }\r
+ }\r
+\r
+ }\r
+ }\r
+ */\r
+}
\ No newline at end of file