From 2d8cd2896ca4ba3dcfab5f7a4a4c62f090006e51 Mon Sep 17 00:00:00 2001 From: jprocter Date: Fri, 13 Jul 2007 14:56:27 +0000 Subject: [PATCH] methods for traversing database reference space --- src/jalview/analysis/CrossRef.java | 488 ++++++++++++++++++++++++++++++++++++ 1 file changed, 488 insertions(+) create mode 100644 src/jalview/analysis/CrossRef.java diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java new file mode 100644 index 0000000..9e69ca4 --- /dev/null +++ b/src/jalview/analysis/CrossRef.java @@ -0,0 +1,488 @@ +package jalview.analysis; + +import java.util.Enumeration; +import java.util.Vector; +import java.util.Hashtable; + +import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceI; +import jalview.ws.ASequenceFetcher; +import jalview.ws.SequenceFetcher; + +/** + * Functions for cross-referencing sequence databases. user must first specify + * if cross-referencing from protein or dna (set dna==true) + * + * @author JimP + * + */ +public class CrossRef +{ + /** + * get the DNA or protein references for a protein or dna sequence + * + * @param dna + * @param rfs + * @return + */ + public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs) + { + if (dna) + { + rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS); + } + else + { + rfs = jalview.util.DBRefUtils.selectRefs(rfs, + DBRefSource.DNACODINGDBS); // could attempt to find other cross refs and return here - ie PDB xrefs (not dna, not protein seq) + } + return rfs; + } + + + public static Hashtable classifyDbRefs(DBRefEntry[] rfs) + { + Hashtable classes = new Hashtable(); + classes.put(DBRefSource.PROTEINDBS, jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS)); + classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils.selectRefs(rfs, + DBRefSource.DNACODINGDBS)); + classes.put(DBRefSource.DOMAINDBS, jalview.util.DBRefUtils.selectRefs(rfs, + DBRefSource.DOMAINDBS)); + // classes.put(OTHER, ) + return classes; + } + + /** + * @param dna + * true if seqs are DNA seqs + * @param seqs + * @return a list of sequence database cross reference source types + */ + public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs) + { + return findSequenceXrefTypes(dna, seqs, null); + } + /** + * Indirect references are references from other sequences from the dataset to any of the direct + * DBRefEntrys on the given sequences. + * @param dna + * true if seqs are DNA seqs + * @param seqs + * @return a list of sequence database cross reference source types + */ + public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs, AlignmentI dataset) + { + String[] dbrefs = null; + Vector refs = new Vector(); + for (int s = 0; s < seqs.length; s++) + { + SequenceI dss = seqs[s]; + while (dss.getDatasetSequence()!=null) + { + dss = dss.getDatasetSequence(); + } + DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef()); + for (int r = 0; rfs != null && r < rfs.length; r++) + { + if (!refs.contains(rfs[r].getSource())) + { + refs.addElement(rfs[r].getSource()); + } + } + if (dataset!=null) + { + // search for references to this sequence's direct references. + DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); + Vector rseqs = new Vector(); + CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs, null); // don't need to specify codon frame for mapping here + Enumeration lr = rseqs.elements(); + while (lr.hasMoreElements()) + { + SequenceI rs = (SequenceI) lr.nextElement(); + DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef()); + for (int r=0; rfs != null && r < rfs.length; r++) + { + if (!refs.contains(rfs[r].getSource())) + { + refs.addElement(rfs[r].getSource()); + } + } + } + } + } + if (refs.size() > 0) + { + dbrefs = new String[refs.size()]; + refs.copyInto(dbrefs); + } + return dbrefs; + } + + /* + * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross + * reference if (!refs.contains(rfs[r].getSource())) { + * refs.addElement(rfs[r].getSource()); } } } + */ + public static boolean hasCdnaMap(SequenceI[] seqs) + { + String[] reftypes = findSequenceXrefTypes(false, seqs); + for (int s = 0; s < reftypes.length; s++) + { + if (reftypes.equals(DBRefSource.EMBLCDS)) + { + return true; + // no map + } + } + return false; + } + + public static SequenceI[] getCdnaMap(SequenceI[] seqs) + { + Vector cseqs = new Vector(); + for (int s = 0; s < seqs.length; s++) + { + DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRef()); + for (int c = 0; c < cdna.length; c++) + { + if (cdna[c].getSource().equals(DBRefSource.EMBLCDS)) + { + // retrieve CDS dataset sequences + // need global dataset sequence retriever/resolver to reuse refs + // and construct Mapping entry. + // insert gaps in CDS according to peptide gaps. + // add gapped sequence to cseqs + } + } + } + if (cseqs.size() > 0) + { + SequenceI[] rsqs = new SequenceI[cseqs.size()]; + cseqs.copyInto(rsqs); + return rsqs; + } + return null; + + } + + /** + * + * @param dna + * @param seqs + * @return + */ + public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna, + String source) + { + return findXrefSequences(seqs, dna, source, null); + } + + /** + * + * @param seqs + * @param dna + * @param source + * @param dataset + * alignment to search for product sequences. + * @return products (as dataset sequences) + */ + public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna, + String source, AlignmentI dataset) + { + Vector rseqs = new Vector(); + Alignment ral = null; + AlignedCodonFrame cf=new AlignedCodonFrame(dataset.getWidth()); // nominal width + for (int s = 0; s < seqs.length; s++) + { + SequenceI dss = seqs[s]; + while (dss.getDatasetSequence()!=null) + { + dss = dss.getDatasetSequence(); + } + boolean found = false; + DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef()); + if ((xrfs == null || xrfs.length == 0) && dataset!=null) + { + System.out.println("Attempting to find ds Xrefs refs."); + DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less ambiguous would be a 'find primary dbRefEntry' method. + found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset, rseqs, cf); + } + for (int r = 0; xrfs!=null && r < xrfs.length; r++) + { + if (source != null && !source.equals(xrfs[r].getSource())) + continue; + if (xrfs[r].hasMap()) + { + if (xrfs[r].getMap().getTo() != null) + { + Sequence rsq = new Sequence(xrfs[r].getMap().getTo()); + rseqs.addElement(rsq); + if (xrfs[r].getMap().getMap().getFromRatio()!=xrfs[r].getMap().getMap().getToRatio()) + { + // get sense of map correct for adding to product alignment. + if (dna) + { + // map is from dna seq to a protein product + cf.addMap(dss, rsq, xrfs[r].getMap().getMap()); + } else { + // map should be from protein seq to its coding dna + cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse()); + } + } + found = true; + } + } + else + { + // do a bit more work - search for sequences with references matching + // xrefs on this sequence. + if (dataset != null) + { + found = searchDataset(dss, xrfs[r], dataset, rseqs, cf); + } + } + } + if (!found) + { + if (xrfs != null && xrfs.length > 0) + { + // Try and get the sequence reference... + /* + * Ideal world - we ask for a sequence fetcher implementation here if + * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) ( + */ + ASequenceFetcher sftch = new SequenceFetcher(); + SequenceI[] retrieved = null; + int l = xrfs.length; + for (int r = 0; r < xrfs.length; r++) + { + // filter out any irrelevant or irretrievable references + if ((source != null && !source.equals(xrfs[r].getSource())) + || !sftch.isFetchable(xrfs[r].getSource())) + { + l--; + xrfs[r] = null; + } + } + if (l > 0) + { + System.out + .println("Attempting to retrieve cross referenced sequences."); + DBRefEntry[] t = new DBRefEntry[l]; + l = 0; + for (int r = 0; r < xrfs.length; r++) + { + if (xrfs[r] != null) + t[l++] = xrfs[r]; + } + xrfs = t; + try + { + retrieved = sftch.getSequences(xrfs); + } catch (Exception e) + { + System.err + .println("Problem whilst retrieving cross references for Sequence : " + + seqs[s].getName()); + e.printStackTrace(); + } + if (retrieved != null) + { + for (int rs = 0; rs < retrieved.length; rs++) + { + rseqs.addElement(retrieved[rs]); + } + } + } + } + } + } + if (rseqs.size() > 0) + { + SequenceI[] rsqs = new SequenceI[rseqs.size()]; + rseqs.copyInto(rsqs); + ral = new Alignment(rsqs); + if (cf!=null && cf.getProtMappings()!=null) + { + ral.addCodonFrame(cf); + } + } + return ral; + } + + /** + * find references to lrfs in the cross-reference set of each sequence in dataset (that is not equal to sequenceI) + * Identifies matching DBRefEntry based on source and accession string only - Map and Version are nulled. + * @param sequenceI + * @param lrfs + * @param dataset + * @param rseqs + * @return true if matches were found. + */ + private static boolean searchDatasetXrefs(SequenceI sequenceI, boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf) + { + boolean found=false; + if (lrfs==null) + return false; + for (int i=0;i