import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
-import jalview.datamodel.DBRefSource;
import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
+import jalview.util.Comparison;
import jalview.util.DBRefUtils;
import jalview.util.MapList;
-import jalview.ws.SequenceFetcher;
+import jalview.ws.SequenceFetcherFactory;
import jalview.ws.seqfetcher.ASequenceFetcher;
import java.util.ArrayList;
import java.util.List;
-import java.util.Vector;
/**
* Functions for cross-referencing sequence databases. user must first specify
}
/**
- * Select just the DNA or protein references for a protein or dna sequence
- *
- * @param fromDna
- * if true, select references from DNA (i.e. Protein databases), else
- * DNA database references
- * @param refs
- * a set of references to select from
- * @return
- */
- public static DBRefEntry[] findXDbRefs(boolean fromDna, DBRefEntry[] refs)
- {
- return DBRefUtils.selectRefs(refs, fromDna ? DBRefSource.PROTEINDBS
- : DBRefSource.DNACODINGDBS);
- // could attempt to find other cross
- // refs here - ie PDB xrefs
- // (not dna, not protein seq)
- }
- /**
- * @param dna
- * true if seqs are DNA seqs
- * @param seqs
- * @return a list of sequence database cross reference source types
- */
- public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
- {
- return findSequenceXrefTypes(dna, seqs, null);
- }
-
- /**
- * Indirect references are references from other sequences from the dataset to
- * any of the direct DBRefEntrys on the given sequences.
+ * Returns a list of distinct database sources for which sequences have either
+ * <ul>
+ * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
+ * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
+ * reference from another sequence in the dataset which has a cross-reference
+ * to a direct DBRefEntry on the given sequence</li>
+ * </ul>
*
* @param dna
- * true if seqs are DNA seqs
+ * true if seqs are nucleotide
* @param seqs
- * @return a list of sequence database cross reference source types
+ * sequences whose xrefs we are seeking
+ * @param dataset
+ * an alignment to search for indirect references
+ * @return the distinct database source names found; an empty list (never
+ * null) if there are none
*/
- public static String[] findSequenceXrefTypes(boolean dna,
+ public static List<String> findXrefSourcesForSequences(boolean dna,
SequenceI[] seqs, AlignmentI dataset)
{
- String[] dbrefs = null;
- List<String> refs = new ArrayList<String>();
+ List<String> sources = new ArrayList<String>();
for (SequenceI seq : seqs)
{
if (seq != null)
{
- SequenceI dss = seq;
- while (dss.getDatasetSequence() != null)
- {
- dss = dss.getDatasetSequence();
- }
- DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRefs());
- if (rfs != null)
- {
- for (DBRefEntry ref : rfs)
- {
- if (!refs.contains(ref.getSource()))
- {
- refs.add(ref.getSource());
- }
- }
- }
- if (dataset != null)
- {
- // search for references to this sequence's direct references.
- DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRefs());
- List<SequenceI> rseqs = new ArrayList<SequenceI>();
- CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs,
- null); // don't need to specify codon frame for mapping here
- for (SequenceI rs : rseqs)
- {
- DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRefs());
- if (xrs != null)
- {
- for (DBRefEntry ref : xrs)
- {
- if (!refs.contains(ref.getSource()))
- {
- refs.add(ref.getSource());
- }
- }
- }
- // looks like copy and paste - change rfs to xrs?
- // for (int r = 0; rfs != null && r < rfs.length; r++)
- // {
- // if (!refs.contains(rfs[r].getSource()))
- // {
- // refs.add(rfs[r].getSource());
- // }
- // }
- }
- }
+ findXrefSourcesForSequence(seq, dna, dataset, sources);
}
}
- if (refs.size() > 0)
- {
- dbrefs = new String[refs.size()];
- refs.toArray(dbrefs);
- }
- return dbrefs;
+ return sources;
}
- public static boolean hasCdnaMap(SequenceI[] seqs)
+ /**
+ * Returns a list of distinct database sources for which a sequence has either
+ * <ul>
+ * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
+ * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
+ * reference from another sequence in the dataset which has a cross-reference
+ * to a direct DBRefEntry on the given sequence</li>
+ * </ul>
+ *
+ * @param seq
+ * the sequence whose dbrefs we are searching against
+ * @param dna
+ * true if the sequence is nucleotide
+ * @param dataset
+ * an alignment to search for indirect references
+ * @param sources
+ * a list of sources to add matches to
+ */
+ static void findXrefSourcesForSequence(SequenceI seq, boolean dna,
+ AlignmentI dataset, List<String> sources)
{
- // TODO unused - remove?
- String[] reftypes = findSequenceXrefTypes(false, seqs);
- for (int s = 0; s < reftypes.length; s++)
+ /*
+ * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
+ */
+ DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!dna, seq.getDBRefs());
+ addXrefsToSources(rfs, sources);
+ if (dataset != null)
{
- if (reftypes.equals(DBRefSource.EMBLCDS))
+ /*
+ * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
+ */
+ DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
+ List<SequenceI> rseqs = new ArrayList<SequenceI>();
+
+ /*
+ * find sequences in the alignment which xref one of these DBRefs
+ * i.e. is xref-ed to a common sequence identifier
+ */
+ CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs, null);
+
+ /*
+ * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
+ */
+ for (SequenceI rs : rseqs)
{
- return true;
- // no map
+ DBRefEntry[] xrs = DBRefUtils.selectDbRefs(!dna, rs.getDBRefs());
+ addXrefsToSources(xrs, sources);
}
}
- return false;
}
- public static SequenceI[] getCdnaMap(SequenceI[] seqs)
+ /**
+ * Helper method that adds the source identifiers of some cross-references to
+ * a (non-redundant) list of database sources
+ *
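+ * @param xrefs
+ * the cross-references whose sources are to be added (may be null,
+ * in which case nothing is added)
+ * @param sources
+ * the list to which any source not already present is appended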
+ */
+ static void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
{
- // TODO unused - remove?
- Vector cseqs = new Vector();
- for (int s = 0; s < seqs.length; s++)
+ if (xrefs != null)
{
- DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRefs());
- for (int c = 0; c < cdna.length; c++)
+ for (DBRefEntry ref : xrefs)
{
- if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
+ String source = ref.getSource();
+ if (!sources.contains(source))
{
- System.err
- .println("TODO: unimplemented sequence retrieval for coding region sequence.");
- // TODO: retrieve CDS dataset sequences
- // need global dataset sequence retriever/resolver to reuse refs
- // and construct Mapping entry.
- // insert gaps in CDS according to peptide gaps.
- // add gapped sequence to cseqs
+ sources.add(source);
}
}
}
- if (cseqs.size() > 0)
- {
- SequenceI[] rsqs = new SequenceI[cseqs.size()];
- cseqs.copyInto(rsqs);
- return rsqs;
- }
- return null;
-
}
/**
dss = dss.getDatasetSequence();
}
boolean found = false;
- DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRefs());
+ DBRefEntry[] xrfs = DBRefUtils.selectDbRefs(!dna, dss.getDBRefs());
if ((xrfs == null || xrfs.length == 0) && dataset != null)
{
- System.out.println("Attempting to find ds Xrefs refs.");
- // FIXME should be dss not seq here?
- DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRefs());
- // less ambiguous would be a 'find primary dbRefEntry' method.
- // filter for desired source xref here
+ /*
+ * found no suitable dbrefs on sequence - look for sequences in the
+ * alignment which share a dbref with this one
+ */
+ DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
+
+ /*
+ * find sequences (except this one!), of complementary type,
+ * which have a dbref to an accession id for this sequence,
+ * and add them to the results
+ */
found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
rseqs, cf);
}
{
if (xref.getMap().getTo() != null)
{
+ found = true;
SequenceI rsq = new Sequence(xref.getMap().getTo());
rseqs.add(rsq);
if (xref.getMap().getMap().getFromRatio() != xref
cf.addMap(rsq, dss, xref.getMap().getMap().getInverse());
}
}
- found = true;
}
}
if (!found)
// xrefs on this sequence.
if (dataset != null)
{
- found |= searchDataset(dss, xref, dataset, rseqs, cf, false,
+ found = searchDataset(dss, xref, dataset, rseqs, cf, false,/*true?*/
!dna);
if (found)
{
{
if (xrfs != null && xrfs.length > 0)
{
- // Try and get the sequence reference...
- /*
- * Ideal world - we ask for a sequence fetcher implementation here if
- * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
- */
- ASequenceFetcher sftch = new SequenceFetcher();
+ ASequenceFetcher sftch = SequenceFetcherFactory
+ .getSequenceFetcher();
SequenceI[] retrieved = null;
int l = xrfs.length;
for (int r = 0; r < xrfs.length; r++)
cf.addMap(retrieved[rs].getDatasetSequence(),
dss, map.getMap());
}
+ // TODO remove this 'else' and the cf.addMap above?
else
{
cf.addMap(retrieved[rs].getDatasetSequence(),
// add in wildcards
xref.setVersion(null);
xref.setMap(null);
- found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
+ found |= searchDataset(sequenceI, xref, dataset, rseqs, cf, false,
+ dna);
}
return found;
}
/**
- * search a given sequence dataset for references matching cross-references to
- * the given sequence
+ * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the
+ * associated sequence to rseqs
*
* @param sequenceI
+ * a sequence to ignore (start point of search)
* @param xrf
+ * a cross-reference to try to match
* @param dataset
+ * sequences to search in
* @param rseqs
- * set of unique sequences
+ * result list to add to
* @param cf
- * @return true if one or more unique sequences were found and added
- */
- public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
- AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf)
- {
- return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
- }
-
- /**
- * TODO: generalise to different protein classifications Search dataset for
- * DBRefEntrys matching the given one (xrf) and add the associated sequence to
- * rseq.
- *
- * @param sequenceI
- * @param xrf
- * @param dataset
- * @param rseqs
+ * a set of sequence mappings to add to
* @param direct
* - search all references or only subset
* @param dna
boolean direct, boolean dna)
{
boolean found = false;
- SequenceI[] typer = new SequenceI[1];
if (dataset == null)
{
return false;
System.err
.println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
}
- if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
+ if (nxt == sequenceI || nxt == sequenceI.getDatasetSequence())
+ {
+ continue;
+ }
+ // check if this is the correct sequence type
{
- // check if this is the correct sequence type
+ // TODO 'direct' is always set to false - remove?
+ // or should it be 'true' from findXrefSequences?
+ // also its Javadoc conflicts with its use:
+ // test below implies 'direct' means find complementary sequences,
+ // !direct means select same molecule type
+ boolean isDna = Comparison
+ .isNucleotide(new SequenceI[] { nxt });
+ if ((direct && isDna == dna) || (!direct && isDna != dna))
{
- typer[0] = nxt;
- boolean isDna = jalview.util.Comparison.isNucleotide(typer);
- if ((direct && isDna == dna) || (!direct && isDna != dna))
- {
- // skip this sequence because it is same molecule type
- continue;
- }
+ // skip this sequence because it is wrong molecule type
+ continue;
}
+ }
- // look for direct or indirect references in common
- DBRefEntry[] poss = nxt.getDBRefs(), cands = null;
- if (direct)
- {
- cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
- }
- else
- {
- poss = CrossRef.findXDbRefs(dna, poss); //
- cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
- }
- if (cands != null)
+ // look for direct or indirect references in common
+ DBRefEntry[] poss = nxt.getDBRefs();
+ DBRefEntry[] cands = null;
+ /*
+ * TODO does this make any sense?
+ * if 'direct', search the dbrefs for xrf
+ * else, filter the dbrefs by type and then search for xrf
+ * - the result is the same isn't it?
+ */
+ if (direct)
+ {
+ cands = DBRefUtils.searchRefs(poss, xrf);
+ }
+ else
+ {
+ poss = DBRefUtils.selectDbRefs(!dna, poss);
+ cands = DBRefUtils.searchRefs(poss, xrf);
+ }
+ if (cands != null)
+ {
+ if (!rseqs.contains(nxt))
{
- if (!rseqs.contains(nxt))
+ found = true;
+ rseqs.add(nxt);
+ boolean foundmap = cf != null;
+ // don't search if we aren't given a codon map object
+ for (int r = 0; foundmap && r < cands.length; r++)
{
- rseqs.add(nxt);
- boolean foundmap = cf != null;
- // don't search if we aren't given a codon map object
- for (int r = 0; foundmap && r < cands.length; r++)
+ if (cands[r].hasMap())
{
- if (cands[r].hasMap())
+ Mapping mapping = cands[r].getMap();
+ MapList map = mapping.getMap();
+ if (mapping.getTo() != null
+ && map.getFromRatio() != map.getToRatio())
{
- if (cands[r].getMap().getTo() != null
- && cands[r].getMap().getMap().getFromRatio() != cands[r]
- .getMap().getMap().getToRatio())
+ // get sense of map correct for adding to product
+ // alignment.
+ if (dna)
{
- foundmap = true;
- // get sense of map correct for adding to product
- // alignment.
- if (dna)
- {
- // map is from dna seq to a protein product
- cf.addMap(sequenceI, nxt, cands[r].getMap()
- .getMap());
- }
- else
- {
- // map should be from protein seq to its coding dna
- cf.addMap(nxt, sequenceI, cands[r].getMap()
- .getMap().getInverse());
- }
+ // map is from dna seq to a protein product
+ cf.addMap(sequenceI, nxt, map);
+ }
+ else
+ {
+ // map should be from protein seq to its coding dna
+ cf.addMap(nxt, sequenceI, map.getInverse());
}
}
}
- // TODO: add mapping between sequences if necessary
- found = true;
}
+ // TODO: add mapping between sequences if necessary
}
-
}
}
}
}
return found;
}
-
- /**
- * precalculate different products that can be found for seqs in dataset and
- * return them.
- *
- * @param dna
- * @param seqs
- * @param dataset
- * @param fake
- * - don't actually build lists - just get types
- * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
- * seqs, AlignmentI dataset, boolean fake) { String types[] =
- * jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
- * dataset); if (types != null) { System.out.println("Xref Types for:
- * "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
- * System.out.println("Type: " + types[t]); SequenceI[] prod =
- * jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
- * System.out.println("Found " + ((prod == null) ? "no" : "" +
- * prod.length) + " products"); if (prod!=null) { for (int p=0;
- * p<prod.length; p++) { System.out.println("Prod "+p+":
- * "+prod[p].getDisplayId(true)); } } } } else {
- * System.out.println("Trying getProducts for
- * "+al.getSequenceAt(0).getDisplayId(true));
- * System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot"));
- * // have a bash at finding the products amongst all the retrieved
- * sequences. SequenceI[] prod =
- * jalview.analysis.CrossRef.findXrefSequences(al
- * .getSequencesArray(), dna, null, ds); System.out.println("Found " +
- * ((prod == null) ? "no" : "" + prod.length) + " products"); if
- * (prod!=null) { // select non-equivalent sequences from dataset list
- * for (int p=0; p<prod.length; p++) { System.out.println("Prod "+p+":
- * "+prod[p].getDisplayId(true)); } } } }
- */
}
{
showProducts.removeAll();
final boolean dna = viewport.getAlignment().isNucleotide();
- String[] ptypes = (selection == null || selection.length == 0) ? null
- : CrossRef.findSequenceXrefTypes(dna, selection, dataset);
+ // use an empty list, not null, when nothing is selected: the for-each
+ // loop over ptypes below would otherwise throw a NullPointerException
+ List<String> ptypes = (selection == null || selection.length == 0) ? java.util.Collections
+ .<String> emptyList()
+ : CrossRef.findXrefSourcesForSequences(dna, selection, dataset);
- for (int t = 0; ptypes != null && t < ptypes.length; t++)
+ for (final String source : ptypes)
{
showp = true;
final AlignFrame af = this;
- final String source = ptypes[t];
- JMenuItem xtype = new JMenuItem(ptypes[t]);
+ JMenuItem xtype = new JMenuItem(source);
xtype.addActionListener(new ActionListener()
{
-
@Override
public void actionPerformed(ActionEvent e)
{
showProductsFor(af.viewport.getSequenceSelection(), dna, source);
}
-
});
showProducts.add(xtype);
}
showProducts.setEnabled(showp);
} catch (Exception e)
{
- jalview.bin.Cache.log
+ Cache.log
.warn("canShowProducts threw an exception - please report to help@jalview.org",
e);
return false;
}
/**
+ * Returns those DBRefEntry objects whose source identifier (once converted to
+ * Jalview's canonical form) is in the list of sources to search for. Returns
+ * null if no matches found.
*
* @param dbrefs
- * array of DBRef objects to search
+ * DBRefEntry objects to search
* @param sources
- * String[] array of source DBRef IDs to retrieve
+ * array of sources to select
* @return
*/
public static DBRefEntry[] selectRefs(DBRefEntry[] dbrefs,
};
/**
- * accession ID and DB must be identical. Version is ignored. No map on either
- * or map but no maplist on either or maplist of map on a is equivalent to the
- * maplist of map on b.
+ * accession ID and DB must be identical, or null on a. Version is ignored. No
+ * map on either or map but no maplist on either or maplist of map on a is
+ * equivalent to the maplist of map on b.
*/
public static DbRefComp matchDbAndIdAndEitherMapOrEquivalentMapList = new DbRefComp()
{
&& refb.getSource().equals(refa.getSource()))
{
// We dont care about version
- if (refa.getAccessionId() != null && refb.getAccessionId() != null
- && refb.getAccessionId().equals(refa.getAccessionId()))
+
+ if (refa.getAccessionId() == null
+ || refa.getAccessionId().equals(refb.getAccessionId()))
{
if (refa.getMap() == null || refb.getMap() == null)
{
|| (refb.getMap().getMap() != null
&& refa.getMap().getMap() != null && (refb
.getMap().getMap().equals(refa.getMap().getMap()))))
- { // getMap().getMap().containsEither(false,refa.getMap().getMap())
+ {
return true;
}
}
return (o1 == null ? o2.equals(o1) : o1.equals(o2));
}
+ /**
+ * Filters a set of references, keeping only those whose source is one of the
+ * 'standard' nucleotide databases, or one of the 'standard' peptide databases
+ *
+ * @param selectDna
+ * if true, keep references to the sources in
+ * DBRefSource.DNACODINGDBS, else to those in DBRefSource.PROTEINDBS
+ * @param refs
+ * the references to filter
+ * @return the result of selectRefs for the chosen list of sources
+ */
+ public static DBRefEntry[] selectDbRefs(boolean selectDna,
+ DBRefEntry[] refs)
+ {
+ // other kinds of cross-reference (e.g. PDB xrefs, which are neither dna
+ // nor protein sequence) are not looked for here
+ if (selectDna)
+ {
+ return selectRefs(refs, DBRefSource.DNACODINGDBS);
+ }
+ return selectRefs(refs, DBRefSource.PROTEINDBS);
+ }
+
}
--- /dev/null
+package jalview.ws;
+
+import jalview.ws.seqfetcher.ASequenceFetcher;
+
+public class SequenceFetcherFactory
+{
+
+ /*
+ * the fetcher to hand out instead of a new SequenceFetcher; stays null
+ * unless one has been supplied through setSequenceFetcher
+ */
+ private static ASequenceFetcher instance;
+
+ /**
+ * Returns the fetcher supplied through setSequenceFetcher if there is one
+ * (e.g. a mock object), otherwise a new SequenceFetcher object on each call
+ *
+ * @return a sequence fetcher, never null
+ */
+ public static ASequenceFetcher getSequenceFetcher()
+ {
+ return instance == null ? new SequenceFetcher() : instance;
+ }
+
+ /**
+ * Set the instance object to use (intended for unit testing with mock
+ * objects). Any ASequenceFetcher is accepted, so a test double does not
+ * have to extend the concrete SequenceFetcher class.
+ *
+ * Be sure to reset to null in the tearDown method of any tests!
+ *
+ * @param sf
+ * the fetcher that getSequenceFetcher should return, or null to go
+ * back to creating a new SequenceFetcher
+ */
+ public static void setSequenceFetcher(ASequenceFetcher sf)
+ {
+ instance = sf;
+ }
+}
/**
* Constructor
*/
- public ASequenceFetcher()
+ protected ASequenceFetcher()
{
super();
package jalview.analysis;
import static org.testng.AssertJUnit.assertEquals;
+import static org.testng.AssertJUnit.assertFalse;
+import static org.testng.AssertJUnit.assertNotSame;
+import static org.testng.AssertJUnit.assertNull;
import static org.testng.AssertJUnit.assertSame;
+import static org.testng.AssertJUnit.assertTrue;
+import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals;
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.Mapping;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.DBRefUtils;
+import jalview.util.MapList;
+import jalview.ws.SequenceFetcher;
+import jalview.ws.SequenceFetcherFactory;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.testng.annotations.AfterClass;
import org.testng.annotations.Test;
public class CrossRefTest
DBRefEntry ref6 = new DBRefEntry("emblCDS", "1", "A123");
DBRefEntry ref7 = new DBRefEntry("GeneDB", "1", "A123");
DBRefEntry ref8 = new DBRefEntry("PFAM", "1", "A123");
+ // ENSEMBL is a source of either dna or protein sequence data
+ DBRefEntry ref9 = new DBRefEntry("ENSEMBL", "1", "A123");
DBRefEntry[] refs = new DBRefEntry[] { ref1, ref2, ref3, ref4, ref5,
- ref6, ref7, ref8 };
+ ref6, ref7, ref8, ref9 };
/*
* Just the DNA refs:
*/
- DBRefEntry[] found = CrossRef.findXDbRefs(false, refs);
- assertEquals(3, found.length);
+ DBRefEntry[] found = DBRefUtils.selectDbRefs(true, refs);
+ assertEquals(4, found.length);
assertSame(ref5, found[0]);
assertSame(ref6, found[1]);
assertSame(ref7, found[2]);
+ assertSame(ref9, found[3]);
/*
* Just the protein refs:
*/
- found = CrossRef.findXDbRefs(true, refs);
- assertEquals(4, found.length);
+ found = DBRefUtils.selectDbRefs(false, refs);
+ assertEquals(5, found.length);
assertSame(ref1, found[0]);
assertSame(ref2, found[1]);
assertSame(ref3, found[2]);
assertSame(ref4, found[3]);
+ assertSame(ref9, found[4]);
+ }
+
+ /**
+ * Test the method that finds a sequence's "product" xref source databases,
+ * which may be direct (dbrefs on the sequence), or indirect (dbrefs on
+ * sequences which share a dbref with the sequence)
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSourcesForSequence_proteinToDna()
+ {
+ SequenceI seq = new Sequence("Seq1", "MGKYQARLSS");
+ List<String> sources = new ArrayList<String>();
+ AlignmentI al = new Alignment(new SequenceI[] {});
+
+ /*
+ * first with no dbrefs to search
+ */
+ CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+ assertTrue(sources.isEmpty());
+
+ /*
+ * add some dbrefs to sequence
+ * (the assertions below expect ENSEMBLGENOMES not to be reported)
+ */
+ // protein db is not a candidate for findXrefSources
+ seq.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
+ // dna coding databases are
+ seq.addDBRef(new DBRefEntry("EMBL", "0", "E2345"));
+ // a second EMBL xref should not result in a duplicate
+ seq.addDBRef(new DBRefEntry("EMBL", "0", "E2346"));
+ seq.addDBRef(new DBRefEntry("EMBLCDS", "0", "E2347"));
+ seq.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
+ seq.addDBRef(new DBRefEntry("ENSEMBL", "0", "E2349"));
+ seq.addDBRef(new DBRefEntry("ENSEMBLGENOMES", "0", "E2350"));
+ CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+ assertEquals(4, sources.size());
+ assertEquals("[EMBL, EMBLCDS, GENEDB, ENSEMBL]",
+ sources.toString());
+
+ /*
+ * add a sequence to the alignment which has a dbref to UNIPROT|A1234
+ * and others to dna coding databases
+ * - expect seq's own source (EMBLCDS) first, then those found via seq2
+ */
+ sources.clear();
+ seq.setDBRefs(null);
+ seq.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
+ seq.addDBRef(new DBRefEntry("EMBLCDS", "0", "E2347"));
+ SequenceI seq2 = new Sequence("Seq2", "MGKYQARLSS");
+ seq2.addDBRef(new DBRefEntry("UNIPROT", "0", "A1234"));
+ seq2.addDBRef(new DBRefEntry("EMBL", "0", "E2345"));
+ seq2.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
+ // TODO include ENSEMBLGENOMES in DBRefSource.DNACODINGDBS ?
+ al.addSequence(seq2);
+ CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+ assertEquals(3, sources.size());
+ assertEquals("[EMBLCDS, EMBL, GENEDB]", sources.toString());
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where only an indirect
+ * xref is found - not on the nucleotide sequence but on a peptide sequence in
+ * the alignment with which it shares a nucleotide dbref
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_indirectDbrefToProtein()
+ {
+ /*
+ * Alignment setup:
+ * - nucleotide dbref EMBL|AF039662
+ * - peptide dbrefs EMBL|AF039662, UNIPROT|Q9ZTS2
+ */
+ SequenceI emblSeq = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ emblSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+ SequenceI uniprotSeq = new Sequence("Q9ZTS2", "MASVSATMISTS");
+ uniprotSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+ uniprotSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+
+ /*
+ * Find UNIPROT xrefs for nucleotide
+ * - it has no UNIPROT dbref of its own
+ * - but peptide with matching nucleotide dbref does, so is returned
+ * (as the same object, not a copy)
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
+ Alignment xrefs = CrossRef.findXrefSequences(
+ new SequenceI[] { emblSeq }, true, "UNIPROT", al);
+ assertEquals(1, xrefs.getHeight());
+ assertSame(uniprotSeq, xrefs.getSequenceAt(0));
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where only an indirect
+ * xref is found - not on the peptide sequence but on a nucleotide sequence in
+ * the alignment with which it shares a protein dbref
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_indirectDbrefToNucleotide()
+ {
+ /*
+ * Alignment setup:
+ * - peptide dbref UNIPROT|Q9ZTS2
+ * - nucleotide dbref EMBL|AF039662, UNIPROT|Q9ZTS2
+ */
+ SequenceI uniprotSeq = new Sequence("Q9ZTS2", "MASVSATMISTS");
+ uniprotSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+ SequenceI emblSeq = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ emblSeq.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+ emblSeq.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+
+ /*
+ * the alignment to search holds both sequences (nucleotide first)
+ */
+ /*
+ * Find EMBL xrefs for peptide
+ * - it has no EMBL dbref of its own
+ * - but nucleotide with matching peptide dbref does, so is returned
+ * (as the same object, not a copy)
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
+ Alignment xrefs = CrossRef.findXrefSequences(
+ new SequenceI[] { uniprotSeq }, false, "EMBL", al);
+ assertEquals(1, xrefs.getHeight());
+ assertSame(emblSeq, xrefs.getSequenceAt(0));
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where the selected
+ * sequence has no dbref to the desired source, and there are no indirect
+ * references via another sequence in the alignment
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_noDbrefs()
+ {
+ /*
+ * two nucleotide sequences, one with UNIPROT dbref
+ */
+ SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+ SequenceI dna2 = new Sequence("AJ307031", "AAACCCTTT");
+
+ /*
+ * find UNIPROT xrefs for nucleotide sequence dna2 - it has no
+ * dbrefs, and the other sequence (which has a UNIPROT dbref) is not
+ * equatable to it, so no results found (a null result is expected)
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { dna1, dna2 });
+ Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna2 },
+ true, "UNIPROT", al);
+ assertNull(xrefs);
+ }
+
+ /**
+ * Tests for the method that searches an alignment (with one sequence
+ * excluded) for protein/nucleotide sequences with a given cross-reference.
+ * Every search here passes a null codon frame and direct=true.
+ */
+ @Test(groups = { "Functional" })
+ public void testSearchDataset()
+ {
+ /*
+ * nucleotide sequence with UNIPROT AND EMBL dbref
+ * peptide sequence with UNIPROT dbref
+ */
+ SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+ dna1.addDBRef(new DBRefEntry("EMBL", "0", "AF039662"));
+ SequenceI pep1 = new Sequence("Q9ZTS2", "MLAVSRGQ");
+ pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+ AlignmentI al = new Alignment(new SequenceI[] { dna1, pep1 });
+
+ List<SequenceI> result = new ArrayList<SequenceI>();
+
+ /*
+ * first search for a dbref nowhere on the alignment:
+ */
+ DBRefEntry dbref = new DBRefEntry("UNIPROT", "0", "P30419");
+ boolean found = CrossRef.searchDataset(dna1, dbref, al, result, null,
+ true, true);
+ assertFalse(found);
+ assertTrue(result.isEmpty());
+
+ // TODO we are setting direct=true here but it is set to
+ // false in Jalview code...
+
+ /*
+ * search for a protein sequence with dbref UNIPROT:Q9ZTS2
+ * (dna1 is the excluded start point of the search)
+ */
+ dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
+ found = CrossRef.searchDataset(dna1, dbref, al, result, null, true,
+ true);
+ assertTrue(found);
+ assertEquals(1, result.size());
+ assertSame(pep1, result.get(0));
+
+ /*
+ * search for a nucleotide sequence with dbref UNIPROT:Q9ZTS2
+ * (pep1 is the excluded start point of the search)
+ */
+ result.clear();
+ dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
+ found = CrossRef.searchDataset(pep1, dbref, al, result, null, true,
+ false);
+ assertTrue(found);
+ assertEquals(1, result.size());
+ assertSame(dna1, result.get(0));
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where the selected
+ * sequence has a dbref with a mapping to a sequence. The mapped-to
+ * sequences are expected back as copies, in the order their dbrefs were
+ * added.
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_fromDbRefMap()
+ {
+ /*
+ * two peptide sequences each with a DBRef and SequenceFeature
+ */
+ SequenceI pep1 = new Sequence("Q9ZTS2", "MALFQRSV");
+ pep1.addDBRef(new DBRefEntry("Pfam", "0", "PF00111"));
+ pep1.addSequenceFeature(new SequenceFeature("type", "desc", 12, 14, 1f,
+ "group"));
+ SequenceI pep2 = new Sequence("P30419", "MTRRSQIF");
+ pep2.addDBRef(new DBRefEntry("PDB", "0", "3JTK"));
+ pep2.addSequenceFeature(new SequenceFeature("type2", "desc2", 13, 15,
+ 12f, "group2"));
+
+ /*
+ * nucleotide sequence (to go in the alignment)
+ */
+ SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+
+ /*
+ * add DBRefEntry's to dna1 with mappings from dna to both peptides
+ * (each mapping uses a 3:1 dna-to-peptide ratio)
+ */
+ MapList mapList = new MapList(new int[] { 1, 24 }, new int[] { 1, 3 },
+ 3, 1);
+ Mapping map = new Mapping(pep1, mapList);
+ DBRefEntry dbRef1 = new DBRefEntry("UNIPROT", "0", "Q9ZTS2", map);
+ dna1.addDBRef(dbRef1);
+ mapList = new MapList(new int[] { 1, 24 }, new int[] { 1, 3 }, 3, 1);
+ map = new Mapping(pep2, mapList);
+ DBRefEntry dbRef2 = new DBRefEntry("UNIPROT", "0", "P30419", map);
+ dna1.addDBRef(dbRef2);
+
+ /*
+ * find UNIPROT xrefs for nucleotide sequence - it should pick up
+ * mapped sequences
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { dna1 });
+ Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna1 },
+ true, "UNIPROT", al);
+ assertEquals(2, xrefs.getHeight());
+
+ /*
+ * cross-refs alignment holds copies of the mapped sequences
+ * including copies of their dbrefs and features
+ */
+ checkCopySequence(pep1, xrefs.getSequenceAt(0));
+ checkCopySequence(pep2, xrefs.getSequenceAt(1));
+ }
+
+ /**
+ * Helper method to assert seq1 looks like a copy of seq2: a different
+ * object with equal name, start, end, sequence string, dbrefs and features.
+ * Both sequences are assumed to have non-null dbref and feature arrays.
+ *
+ * @param seq1
+ * one of the two sequences to compare (callers pass the original)
+ * @param seq2
+ * the other sequence (callers pass the supposed copy)
+ */
+ private void checkCopySequence(SequenceI seq1, SequenceI seq2)
+ {
+ assertNotSame(seq1, seq2);
+ assertEquals(seq1.getName(), seq2.getName());
+ assertEquals(seq1.getStart(), seq2.getStart());
+ assertEquals(seq1.getEnd(), seq2.getEnd());
+ assertEquals(seq1.getSequenceAsString(), seq2.getSequenceAsString());
+
+ /*
+ * compare dbrefs
+ */
+ assertArrayEquals(seq1.getDBRefs(), seq2.getDBRefs());
+ // check one to verify a copy, not the same object
+ if (seq1.getDBRefs().length > 0)
+ {
+ assertNotSame(seq1.getDBRefs()[0], seq2.getDBRefs()[0]);
+ }
+
+ /*
+ * compare features
+ * (again checking the first one is a copy, not the same object)
+ */
+ assertArrayEquals(seq1.getSequenceFeatures(),
+ seq2.getSequenceFeatures());
+ if (seq1.getSequenceFeatures().length > 0)
+ {
+ assertNotSame(seq1.getSequenceFeatures()[0],
+ seq2.getSequenceFeatures()[0]);
+ }
+ }
+
+ /**
+ * Test for finding 'product' sequences for the case where the selected
+ * sequence has a dbref with no mapping, triggering a fetch from database.
+ * A mock fetcher is installed through SequenceFetcherFactory; it reports
+ * every source as fetchable and always returns pep1 and pep2, whatever
+ * references are requested.
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_withFetch()
+ {
+ SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "P30419"));
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "P00314"));
+ final SequenceI pep1 = new Sequence("Q9ZTS2", "MYQLIRSSW");
+ final SequenceI pep2 = new Sequence("P00314", "MRKLLAASG");
+
+ SequenceFetcher mockFetcher = new SequenceFetcher()
+ {
+
+ @Override
+ public boolean isFetchable(String source)
+ {
+ return true;
+ }
+
+ @Override
+ public SequenceI[] getSequences(DBRefEntry[] refs, boolean dna)
+ {
+ return new SequenceI[] { pep1, pep2 };
+ }
+ };
+ SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+
+ /*
+ * find UNIPROT xrefs for nucleotide sequence
+ * - expect exactly the two sequences supplied by the mock fetcher
+ */
+ AlignmentI al = new Alignment(new SequenceI[] { dna1 });
+ Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna1 },
+ true, "UNIPROT", al);
+ assertEquals(2, xrefs.getHeight());
+ assertSame(pep1, xrefs.getSequenceAt(0));
+ assertSame(pep2, xrefs.getSequenceAt(1));
+ }
+
+ @AfterClass
+ public void tearDown()
+ {
+ SequenceFetcherFactory.setSequenceFetcher(null); // clear any mock set by a test
 }
}
ref5.setMap(new Mapping(new MapList(new int[] { 1, 1 }, new int[] { 1,
1 }, 1, 1)));
- DBRefEntry[] matches = DBRefUtils.searchRefs(new DBRefEntry[] { ref1,
- ref2, ref3, ref4, ref5 }, "A1234");
+ DBRefEntry[] dbrefs = new DBRefEntry[] { ref1,
+ ref2, ref3, ref4, ref5 };
+ DBRefEntry[] matches = DBRefUtils.searchRefs(dbrefs, "A1234");
assertEquals(3, matches.length);
assertSame(ref1, matches[0]);
assertSame(ref2, matches[1]);
assertSame(ref5, matches[2]);
}
+
+ /**
+ * Test the method that searches for matching references - case when we are
+ * matching a reference with null (any) accession id
+ */
+ @Test(groups = { "Functional" })
+ public void testSearchRefs_wildcardAccessionid()
+ {
+ DBRefEntry target = new DBRefEntry("EMBL", "2", null);
+
+ DBRefEntry ref1 = new DBRefEntry("EMBL", "1", "A1234"); // matches
+ // constructor changes embl to EMBL
+ DBRefEntry ref2 = new DBRefEntry("embl", "1", "A1235"); // matches
+ // constructor does not upper-case accession id
+ DBRefEntry ref3 = new DBRefEntry("EMBL", "1", "A1236"); // matches
+ DBRefEntry ref4 = new DBRefEntry("EMBLCDS", "1", "A1234"); // no match
+ // ref5 matches although it has a mapping - ignored
+ DBRefEntry ref5 = new DBRefEntry("EMBL", "1", "A1237");
+ ref5.setMap(new Mapping(new MapList(new int[] { 1, 1 }, new int[] { 1,
+ 1 }, 1, 1)));
+
+ DBRefEntry[] matches = DBRefUtils.searchRefs(new DBRefEntry[] { ref1,
+ ref2, ref3, ref4, ref5 }, target);
+ assertEquals(4, matches.length);
+ assertSame(ref1, matches[0]);
+ assertSame(ref2, matches[1]);
+ assertSame(ref3, matches[2]);
+ assertSame(ref5, matches[3]);
+ }
}
package jalview.ws;
+import jalview.analysis.CrossRef;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefSource;
{
boolean dna = sp.isDnaCoding();
// try and find products
- String types[] = jalview.analysis.CrossRef
- .findSequenceXrefTypes(dna, al.getSequencesArray());
+ List<String> types = CrossRef.findXrefSourcesForSequences(dna,
+ al.getSequencesArray(), null);
if (types != null)
{
System.out.println("Xref Types for: "
+ (dna ? "dna" : "prot"));
- for (int t = 0; t < types.length; t++)
+ for (String source : types)
{
- System.out.println("Type: " + types[t]);
+ System.out.println("Type: " + source);
SequenceI[] prod = jalview.analysis.CrossRef
.findXrefSequences(al.getSequencesArray(), dna,
- types[t], null)
+ source, null)
.getSequencesArray();
System.out.println("Found "
+ ((prod == null) ? "no" : "" + prod.length)