import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
-import jalview.datamodel.DBRefSource;
import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
+import jalview.util.Comparison;
import jalview.util.DBRefUtils;
import jalview.util.MapList;
-import jalview.ws.SequenceFetcher;
+import jalview.ws.SequenceFetcherFactory;
import jalview.ws.seqfetcher.ASequenceFetcher;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
-import java.util.Vector;
/**
* Functions for cross-referencing sequence databases. user must first specify
}
/**
- * Select just the DNA or protein references for a protein or dna sequence
- *
- * @param fromDna
- * if true, select references from DNA (i.e. Protein databases), else
- * DNA database references
- * @param refs
- * a set of references to select from
- * @return
- */
- public static DBRefEntry[] findXDbRefs(boolean fromDna, DBRefEntry[] refs)
- {
- return DBRefUtils.selectRefs(refs, fromDna ? DBRefSource.PROTEINDBS
- : DBRefSource.DNACODINGDBS);
- // could attempt to find other cross
- // refs here - ie PDB xrefs
- // (not dna, not protein seq)
- }
-
- /**
- * @param dna
- * true if seqs are DNA seqs
- * @param seqs
- * @return a list of sequence database cross reference source types
- */
- public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
- {
- return findSequenceXrefTypes(dna, seqs, null);
- }
- /**
- * Indirect references are references from other sequences from the dataset to
- * any of the direct DBRefEntrys on the given sequences.
+ * Returns a list of distinct database sources for which sequences have either
+ * <ul>
+ * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
+ * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
+ * reference from another sequence in the dataset which has a cross-reference
+ * to a direct DBRefEntry on the given sequence</li>
+ * </ul>
*
* @param dna
- * true if seqs are DNA seqs
+ * true if seqs are nucleotide
* @param seqs
- * @return a list of sequence database cross reference source types
+ * sequences whose xrefs we are seeking (null entries are skipped)
+ * @param dataset
+ * an alignment to search for indirect references
+ * @return a list of distinct database source names, in order of first
+ * occurrence; empty (never null) if no cross-references are found
*/
- public static String[] findSequenceXrefTypes(boolean dna,
+ public static List<String> findXrefSourcesForSequences(boolean dna,
SequenceI[] seqs, AlignmentI dataset)
{
- String[] dbrefs = null;
- List<String> refs = new ArrayList<String>();
+ List<String> sources = new ArrayList<String>();
for (SequenceI seq : seqs)
{
if (seq != null)
{
- SequenceI dss = seq;
- while (dss.getDatasetSequence() != null)
- {
- dss = dss.getDatasetSequence();
- }
- DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRefs());
- if (rfs != null)
- {
- for (DBRefEntry ref : rfs)
- {
- if (!refs.contains(ref.getSource()))
- {
- refs.add(ref.getSource());
- }
- }
- }
- if (dataset != null)
- {
- // search for references to this sequence's direct references.
- DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRefs());
- List<SequenceI> rseqs = new ArrayList<SequenceI>();
- CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs,
- null); // don't need to specify codon frame for mapping here
- for (SequenceI rs : rseqs)
- {
- DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRefs());
- if (xrs != null)
- {
- for (DBRefEntry ref : xrs)
- {
- if (!refs.contains(ref.getSource()))
- {
- refs.add(ref.getSource());
- }
- }
- }
- // looks like copy and paste - change rfs to xrs?
- // for (int r = 0; rfs != null && r < rfs.length; r++)
- // {
- // if (!refs.contains(rfs[r].getSource()))
- // {
- // refs.add(rfs[r].getSource());
- // }
- // }
- }
- }
+ findXrefSourcesForSequence(seq, dna, dataset, sources);
}
}
- if (refs.size() > 0)
- {
- dbrefs = new String[refs.size()];
- refs.toArray(dbrefs);
- }
- return dbrefs;
+ return sources;
}
- public static boolean hasCdnaMap(SequenceI[] seqs)
+ /**
+ * Returns a list of distinct database sources for which a sequence has either
+ * <ul>
+ * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
+ * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
+ * reference from another sequence in the dataset which has a cross-reference
+ * to a direct DBRefEntry on the given sequence</li>
+ * </ul>
+ *
+ * @param seq
+ * the sequence whose dbrefs we are searching against (must not be
+ * null - its dbrefs are read without a null check)
+ * @param dna
+ * true if the sequence is nucleotide
+ * @param dataset
+ * an alignment to search for indirect references; if null, only
+ * the sequence's own cross-references are considered
+ * @param sources
+ * a list of sources to add matches to (modified in place; a source
+ * already in the list is not added again)
+ */
+ static void findXrefSourcesForSequence(SequenceI seq, boolean dna,
+ AlignmentI dataset, List<String> sources)
{
- // TODO unused - remove?
- String[] reftypes = findSequenceXrefTypes(false, seqs);
- for (int s = 0; s < reftypes.length; s++)
+ /*
+ * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
+ */
+ DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!dna, seq.getDBRefs());
+ addXrefsToSources(rfs, sources);
+ if (dataset != null)
{
- if (reftypes.equals(DBRefSource.EMBLCDS))
+ /*
+ * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
+ */
+ DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
+ List<SequenceI> rseqs = new ArrayList<SequenceI>();
+
+ /*
+ * find sequences in the alignment which xref one of these DBRefs
+ * i.e. is xref-ed to a common sequence identifier
+ * (the matches are collected in rseqs; the final argument is null
+ * because no codon frame mapping is needed just to list sources)
+ */
+ CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs, null);
+
+ /*
+ * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
+ */
+ for (SequenceI rs : rseqs)
{
- return true;
- // no map
+ DBRefEntry[] xrs = DBRefUtils.selectDbRefs(!dna, rs.getDBRefs());
+ addXrefsToSources(xrs, sources);
}
}
- return false;
}
- public static SequenceI[] getCdnaMap(SequenceI[] seqs)
+ /**
+ * Helper method that adds the source identifiers of some cross-references to
+ * a (non-redundant) list of database sources
+ *
+ * @param xrefs
+ * the cross-references whose sources should be added; may be null,
+ * in which case nothing is added
+ * @param sources
+ * the list to add to (modified in place); a source is appended only
+ * if the list does not already contain it
+ */
+ static void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
{
- // TODO unused - remove?
- Vector cseqs = new Vector();
- for (int s = 0; s < seqs.length; s++)
+ if (xrefs != null)
{
- DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRefs());
- for (int c = 0; c < cdna.length; c++)
+ for (DBRefEntry ref : xrefs)
{
- if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
+ String source = ref.getSource();
+ if (!sources.contains(source))
{
- System.err
- .println("TODO: unimplemented sequence retrieval for coding region sequence.");
- // TODO: retrieve CDS dataset sequences
- // need global dataset sequence retriever/resolver to reuse refs
- // and construct Mapping entry.
- // insert gaps in CDS according to peptide gaps.
- // add gapped sequence to cseqs
+ sources.add(source);
}
}
}
- if (cseqs.size() > 0)
- {
- SequenceI[] rsqs = new SequenceI[cseqs.size()];
- cseqs.copyInto(rsqs);
- return rsqs;
- }
- return null;
-
}
/**
* @param al
* alignment to search for cross-referenced sequences (and possibly
* add to)
- * @param addedPeers
- * a list of sequences to add to if 'peers' to the original sequences
- * are found e.g. alternative protein products for a protein's gene
* @return products (as dataset sequences)
*/
public static Alignment findXrefSequences(SequenceI[] seqs,
- final boolean dna, final String source, AlignmentI al,
- List<SequenceI> addedPeers)
+ final boolean dna, final String source, AlignmentI al)
{
AlignmentI dataset = al.getDataset() == null ? al : al.getDataset();
List<SequenceI> rseqs = new ArrayList<SequenceI>();
dss = dss.getDatasetSequence();
}
boolean found = false;
- DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRefs());
+ DBRefEntry[] xrfs = DBRefUtils.selectDbRefs(!dna, dss.getDBRefs());
if ((xrfs == null || xrfs.length == 0) && dataset != null)
{
- System.out.println("Attempting to find ds Xrefs refs.");
- // FIXME should be dss not seq here?
- DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRefs());
- // less ambiguous would be a 'find primary dbRefEntry' method.
- // filter for desired source xref here
+ /*
+ * found no suitable dbrefs on sequence - look for sequences in the
+ * alignment which share a dbref with this one
+ */
+ DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
+
+ /*
+ * find sequences (except this one!), of complementary type,
+ * which have a dbref to an accession id for this sequence,
+ * and add them to the results
+ */
found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
rseqs, cf);
}
{
if (xref.getMap().getTo() != null)
{
+ found = true;
SequenceI rsq = new Sequence(xref.getMap().getTo());
rseqs.add(rsq);
if (xref.getMap().getMap().getFromRatio() != xref
cf.addMap(rsq, dss, xref.getMap().getMap().getInverse());
}
}
- found = true;
}
}
if (!found)
// xrefs on this sequence.
if (dataset != null)
{
- found |= searchDataset(dss, xref, dataset, rseqs, cf, false,
+ found = searchDataset(dss, xref, dataset, rseqs, cf, false,/*true?*/
!dna);
- // ,false,!dna);
if (found)
{
xrfs[r] = null; // we've recovered seqs for this one.
{
if (xrfs != null && xrfs.length > 0)
{
- // Try and get the sequence reference...
- /*
- * Ideal world - we ask for a sequence fetcher implementation here if
- * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
- */
- ASequenceFetcher sftch = new SequenceFetcher();
+ ASequenceFetcher sftch = SequenceFetcherFactory
+ .getSequenceFetcher();
SequenceI[] retrieved = null;
int l = xrfs.length;
for (int r = 0; r < xrfs.length; r++)
}
if (l > 0)
{
- System.out
- .println("Attempting to retrieve cross referenced sequences.");
+ // System.out
+ // .println("Attempting to retrieve cross referenced sequences.");
DBRefEntry[] t = new DBRefEntry[l];
l = 0;
for (int r = 0; r < xrfs.length; r++)
xrfs = t;
try
{
- retrieved = sftch.getSequences(xrfs, !dna);
+ retrieved = sftch.getSequences(Arrays.asList(xrfs), !dna);
// problem here is we don't know which of xrfs resulted in which
// retrieved element
} catch (Exception e)
SequenceIdMatcher matcher = new SequenceIdMatcher(
dataset.getSequences());
- matcher.addAll(addedPeers);
List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
CrossRef me = new CrossRef();
for (int rs = 0; rs < retrieved.length; rs++)
.findIdMatch(map.getTo());
if (matched != null)
{
+ /*
+ * already got an xref to this sequence; update this
+ * map to point to the same sequence, and add
+ * any new dbrefs to it
+ */
+ for (DBRefEntry ref : map.getTo().getDBRefs())
+ {
+ matched.addDBRef(ref); // add or update mapping
+ }
map.setTo(matched);
}
else
map.setTo(dss);
/*
* copy sequence features as well, avoiding
- * duplication (e.g. from 2 transcripts)
+ * duplication (e.g. same variation from 2
+ * transcripts)
*/
SequenceFeature[] sfs = ms
.getSequenceFeatures();
cf.addMap(retrieved[rs].getDatasetSequence(),
dss, map.getMap());
}
+ // TODO remove this 'else' and the cf.addMap above?
else
{
- if (!addedPeers.contains(map.getTo()))
- {
- addedPeers.add(map.getTo());
- }
cf.addMap(retrieved[rs].getDatasetSequence(),
map.getTo(), map.getMap());
}
// add in wildcards
xref.setVersion(null);
xref.setMap(null);
- found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
+ found |= searchDataset(sequenceI, xref, dataset, rseqs, cf, false,
+ dna);
}
return found;
}
/**
- * search a given sequence dataset for references matching cross-references to
- * the given sequence
+ * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the
+ * associated sequence to rseqs
*
* @param sequenceI
+ * a sequence to ignore (start point of search)
* @param xrf
+ * a cross-reference to try to match
* @param dataset
+ * sequences to search in
* @param rseqs
- * set of unique sequences
+ * result list to add to
* @param cf
- * @return true if one or more unique sequences were found and added
- */
- public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
- AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf)
- {
- return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
- }
-
- /**
- * TODO: generalise to different protein classifications Search dataset for
- * DBRefEntrys matching the given one (xrf) and add the associated sequence to
- * rseq.
- *
- * @param sequenceI
- * @param xrf
- * @param dataset
- * @param rseqs
+ * a set of sequence mappings to add to
* @param direct
* - search all references or only subset
* @param dna
boolean direct, boolean dna)
{
boolean found = false;
- SequenceI[] typer = new SequenceI[1];
if (dataset == null)
{
return false;
System.err
.println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
}
- if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
+ if (nxt == sequenceI || nxt == sequenceI.getDatasetSequence())
+ {
+ continue;
+ }
+ // check if this is the correct sequence type
{
- // check if this is the correct sequence type
+ // TODO 'direct' is always set to false - remove?
+ // or should it be 'true' from findXrefSequences?
+ // also its Javadoc conflicts with its use:
+ // test below implies 'direct' means find complementary sequences,
+ // !direct means select same molecule type
+ boolean isDna = Comparison
+ .isNucleotide(new SequenceI[] { nxt });
+ if ((direct && isDna == dna) || (!direct && isDna != dna))
{
- typer[0] = nxt;
- boolean isDna = jalview.util.Comparison.isNucleotide(typer);
- if ((direct && isDna == dna) || (!direct && isDna != dna))
- {
- // skip this sequence because it is same molecule type
- continue;
- }
+ // skip this sequence because it is wrong molecule type
+ continue;
}
+ }
- // look for direct or indirect references in common
- DBRefEntry[] poss = nxt.getDBRefs(), cands = null;
- if (direct)
- {
- cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
- }
- else
- {
- poss = CrossRef.findXDbRefs(dna, poss); //
- cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
- }
- if (cands != null)
+ // look for direct or indirect references in common
+ DBRefEntry[] poss = nxt.getDBRefs();
+ List<DBRefEntry> cands = null;
+ /*
+ * TODO does this make any sense?
+ * if 'direct', search the dbrefs for xrf
+ * else, filter the dbrefs by type and then search for xrf
+ * - the result is the same isn't it?
+ */
+ if (direct)
+ {
+ cands = DBRefUtils.searchRefs(poss, xrf);
+ }
+ else
+ {
+ poss = DBRefUtils.selectDbRefs(!dna, poss);
+ cands = DBRefUtils.searchRefs(poss, xrf);
+ }
+ if (!cands.isEmpty())
+ {
+ if (!rseqs.contains(nxt))
{
- if (!rseqs.contains(nxt))
+ found = true;
+ rseqs.add(nxt);
+ if (cf != null)
{
- rseqs.add(nxt);
- boolean foundmap = cf != null;
// don't search if we aren't given a codon map object
- for (int r = 0; foundmap && r < cands.length; r++)
+ for (DBRefEntry candidate : cands)
{
- if (cands[r].hasMap())
+ Mapping mapping = candidate.getMap();
+ if (mapping != null)
{
- if (cands[r].getMap().getTo() != null
- && cands[r].getMap().getMap().getFromRatio() != cands[r]
- .getMap().getMap().getToRatio())
+ MapList map = mapping.getMap();
+ if (mapping.getTo() != null
+ && map.getFromRatio() != map.getToRatio())
{
- foundmap = true;
// get sense of map correct for adding to product
// alignment.
if (dna)
{
// map is from dna seq to a protein product
- cf.addMap(sequenceI, nxt, cands[r].getMap()
- .getMap());
+ cf.addMap(sequenceI, nxt, map);
}
else
{
// map should be from protein seq to its coding dna
- cf.addMap(nxt, sequenceI, cands[r].getMap()
- .getMap().getInverse());
+ cf.addMap(nxt, sequenceI, map.getInverse());
}
}
}
}
- // TODO: add mapping between sequences if necessary
- found = true;
}
+ // TODO: add mapping between sequences if necessary
}
-
}
}
}
}
return found;
}
-
- /**
- * precalculate different products that can be found for seqs in dataset and
- * return them.
- *
- * @param dna
- * @param seqs
- * @param dataset
- * @param fake
- * - don't actually build lists - just get types
- * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
- * seqs, AlignmentI dataset, boolean fake) { String types[] =
- * jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
- * dataset); if (types != null) { System.out.println("Xref Types for:
- * "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
- * System.out.println("Type: " + types[t]); SequenceI[] prod =
- * jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
- * System.out.println("Found " + ((prod == null) ? "no" : "" +
- * prod.length) + " products"); if (prod!=null) { for (int p=0;
- * p<prod.length; p++) { System.out.println("Prod "+p+":
- * "+prod[p].getDisplayId(true)); } } } } else {
- * System.out.println("Trying getProducts for
- * "+al.getSequenceAt(0).getDisplayId(true));
- * System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot"));
- * // have a bash at finding the products amongst all the retrieved
- * sequences. SequenceI[] prod =
- * jalview.analysis.CrossRef.findXrefSequences(al
- * .getSequencesArray(), dna, null, ds); System.out.println("Found " +
- * ((prod == null) ? "no" : "" + prod.length) + " products"); if
- * (prod!=null) { // select non-equivalent sequences from dataset list
- * for (int p=0; p<prod.length; p++) { System.out.println("Prod "+p+":
- * "+prod[p].getDisplayId(true)); } } } }
- */
}