{
List<SequenceI> cdsSeqs = new ArrayList<SequenceI>();
+ /*
+ * construct CDS sequences from the (cds-to-protein) mappings made earlier;
+ * this makes it possible to model multiple products from dna (e.g. EMBL);
+ * however it does mean we don't have the EMBL protein_id (a property on
+ * the CDS features) in order to make the CDS sequence name :-(
+ */
for (SequenceI seq : dna)
{
AlignedCodonFrame cdsMappings = new AlignedCodonFrame();
* sort to get sequence features in start position order
* - would be better to store in Sequence as a TreeSet or NCList?
*/
- Arrays.sort(peptide.getSequenceFeatures(),
- new Comparator<SequenceFeature>()
- {
- @Override
- public int compare(SequenceFeature o1, SequenceFeature o2)
+ if (peptide.getSequenceFeatures() != null)
+ {
+ Arrays.sort(peptide.getSequenceFeatures(),
+ new Comparator<SequenceFeature>()
{
- int c = Integer.compare(o1.getBegin(), o2.getBegin());
- return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
- : c;
- }
- });
+ @Override
+ public int compare(SequenceFeature o1, SequenceFeature o2)
+ {
+ int c = Integer.compare(o1.getBegin(), o2.getBegin());
+ return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
+ : c;
+ }
+ });
+ }
return count;
}
import jalview.ws.seqfetcher.ASequenceFetcher;
import java.util.ArrayList;
-import java.util.Arrays;
+import java.util.Iterator;
import java.util.List;
/**
- * Functions for cross-referencing sequence databases. user must first specify
- * if cross-referencing from protein or dna (set dna==true)
+ * Functions for cross-referencing sequence databases.
*
* @author JimP
*
public class CrossRef
{
/*
- * A sub-class that ignores Parent attribute when comparing sequence
- * features. This avoids 'duplicate' CDS features that only
- * differ in their parent Transcript ids.
+ * the dataset of the alignment for which we are searching for
+ * cross-references; in some cases we may resolve xrefs by
+ * searching in the dataset
*/
- class MySequenceFeature extends SequenceFeature
- {
- private SequenceFeature feat;
+ private AlignmentI dataset;
- MySequenceFeature(SequenceFeature sf)
- {
- this.feat = sf;
- }
+ /*
+ * true if we are searching for cross-references from nucleotide,
+ * i.e. for protein sequences, false if the reverse
+ */
+ private boolean fromDna;
- @Override
- public boolean equals(Object o)
- {
- return feat.equals(o, true);
- }
- }
+ /*
+ * the sequences for which we are seeking cross-references
+ */
+ private SequenceI[] fromSeqs;
/**
+ * Constructor
+ *
+ * @param seqs
+ * the sequences for which we are seeking cross-references
+ * @param ds
+ * the containing alignment dataset (may be searched to resolve
+ * cross-references)
+ */
+ public CrossRef(SequenceI[] seqs, AlignmentI ds)
+ {
+ fromSeqs = seqs;
+ fromDna = ds.isNucleotide();
+ dataset = ds.getDataset() == null ? ds : ds.getDataset();
+ }
+ /**
* Returns a list of distinct database sources for which sequences have either
* <ul>
* <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
* reference from another sequence in the dataset which has a cross-reference
* to a direct DBRefEntry on the given sequence</li>
* </ul>
- *
- * @param dna
- * true if seqs are nucleotide
- * @param seqs
- * sequences whose xrefs we are seeking
- * @param dataset
- * an alignment to search for indirect references
* @return
*/
- public static List<String> findXrefSourcesForSequences(boolean dna,
- SequenceI[] seqs, AlignmentI dataset)
+ public List<String> findXrefSourcesForSequences()
{
List<String> sources = new ArrayList<String>();
- for (SequenceI seq : seqs)
+ for (SequenceI seq : fromSeqs)
{
if (seq != null)
{
- findXrefSourcesForSequence(seq, dna, dataset, sources);
+ findXrefSourcesForSequence(seq, sources);
}
}
return sources;
*
* @param seq
* the sequence whose dbrefs we are searching against
- * @param dna
- * true if the sequence is nucleotide
- * @param dataset
- * an alignment to search for indirect references
* @param sources
* a list of sources to add matches to
*/
- static void findXrefSourcesForSequence(SequenceI seq, boolean dna,
- AlignmentI dataset, List<String> sources)
+ void findXrefSourcesForSequence(SequenceI seq, List<String> sources)
{
/*
* first find seq's xrefs (dna-to-peptide or peptide-to-dna)
*/
- DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!dna, seq.getDBRefs());
+ DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!fromDna, seq.getDBRefs());
addXrefsToSources(rfs, sources);
if (dataset != null)
{
/*
* find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
*/
- DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
+ DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna, seq.getDBRefs());
List<SequenceI> rseqs = new ArrayList<SequenceI>();
/*
* find sequences in the alignment which xref one of these DBRefs
* i.e. is xref-ed to a common sequence identifier
*/
- CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs, null);
+ searchDatasetXrefs(seq, lrfs, rseqs, null);
/*
* add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
*/
for (SequenceI rs : rseqs)
{
- DBRefEntry[] xrs = DBRefUtils.selectDbRefs(!dna, rs.getDBRefs());
+ DBRefEntry[] xrs = DBRefUtils
+ .selectDbRefs(!fromDna, rs.getDBRefs());
addXrefsToSources(xrs, sources);
}
}
* @param xrefs
* @param sources
*/
- static void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
+ void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
{
if (xrefs != null)
{
* add to)
* @return products (as dataset sequences)
*/
- public static Alignment findXrefSequences(SequenceI[] seqs,
- final boolean dna, final String source, AlignmentI al)
+ public Alignment findXrefSequences(String source)
{
- AlignmentI dataset = al.getDataset() == null ? al : al.getDataset();
List<SequenceI> rseqs = new ArrayList<SequenceI>();
AlignedCodonFrame cf = new AlignedCodonFrame();
- for (SequenceI seq : seqs)
+ SequenceIdMatcher matcher = new SequenceIdMatcher(
+ dataset.getSequences());
+
+ for (SequenceI seq : fromSeqs)
{
SequenceI dss = seq;
while (dss.getDatasetSequence() != null)
dss = dss.getDatasetSequence();
}
boolean found = false;
- DBRefEntry[] xrfs = DBRefUtils.selectDbRefs(!dna, dss.getDBRefs());
+ DBRefEntry[] xrfs = DBRefUtils
+ .selectDbRefs(!fromDna, dss.getDBRefs());
if ((xrfs == null || xrfs.length == 0) && dataset != null)
{
/*
* found no suitable dbrefs on sequence - look for sequences in the
* alignment which share a dbref with this one
*/
- DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
+ DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna,
+ seq.getDBRefs());
/*
* find sequences (except this one!), of complementary type,
* which have a dbref to an accession id for this sequence,
* and add them to the results
*/
- found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
- rseqs, cf);
+ found = searchDatasetXrefs(dss, lrfs, rseqs, cf);
}
- for (int r = 0; xrfs != null && r < xrfs.length; r++)
+ if (xrfs == null && !found)
{
- DBRefEntry xref = xrfs[r];
- if (source != null && !source.equals(xref.getSource()))
- {
- continue;
- }
+ /*
+ * no dbref to source on this sequence or matched
+ * complementary sequence in the dataset
+ */
+ continue;
+ }
+ List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(xrfs,
+ source);
+ Iterator<DBRefEntry> refIterator = sourceRefs.iterator();
+ while (refIterator.hasNext())
+ {
+ DBRefEntry xref = refIterator.next();
+ found = false;
if (xref.hasMap())
{
- if (xref.getMap().getTo() != null)
+ SequenceI mappedTo = xref.getMap().getTo();
+ if (mappedTo != null)
{
+ /*
+ * dbref contains the sequence it maps to; add it to the
+ * results unless we have done so already (could happen if
+ * fetching xrefs for sequences which have xrefs in common)
+ * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707}
+ */
found = true;
- SequenceI rsq = new Sequence(xref.getMap().getTo());
+ SequenceI matchInDataset = findInDataset(mappedTo);// matcher.findIdMatch(mappedTo);
+ if (matchInDataset != null)
+ {
+ if (!rseqs.contains(matchInDataset))
+ {
+ rseqs.add(matchInDataset);
+ }
+ continue;
+ }
+ SequenceI rsq = new Sequence(mappedTo);
rseqs.add(rsq);
- if (xref.getMap().getMap().getFromRatio() != xref
- .getMap().getMap().getToRatio())
+ if (xref.getMap().getMap().getFromRatio() != xref.getMap()
+ .getMap().getToRatio())
{
// get sense of map correct for adding to product alignment.
- if (dna)
+ if (fromDna)
{
// map is from dna seq to a protein product
cf.addMap(dss, rsq, xref.getMap().getMap());
}
}
}
+
if (!found)
{
- // do a bit more work - search for sequences with references matching
- // xrefs on this sequence.
- if (dataset != null)
+ SequenceI matchedSeq = matcher.findIdMatch(xref.getSource() + "|"
+ + xref.getAccessionId());
+ if (matchedSeq != null)
{
- found = searchDataset(dss, xref, dataset, rseqs, cf, false,/*true?*/
- !dna);
- if (found)
+ if (constructMapping(seq, matchedSeq, xref, cf))
{
- xrfs[r] = null; // we've recovered seqs for this one.
+ found = true;
}
}
}
+
+ if (!found)
+ {
+ // do a bit more work - search for sequences with references matching
+ // xrefs on this sequence.
+ found = searchDataset(dss, xref, rseqs, cf, false);
+ }
+ if (found)
+ {
+ refIterator.remove();
+ }
}
- if (!found)
+
+ /*
+ * fetch from source database any dbrefs we haven't resolved up to here
+ */
+ if (!sourceRefs.isEmpty())
{
- if (xrfs != null && xrfs.length > 0)
+ ASequenceFetcher sftch = SequenceFetcherFactory
+ .getSequenceFetcher();
+ SequenceI[] retrieved = null;
+ try
{
- ASequenceFetcher sftch = SequenceFetcherFactory
- .getSequenceFetcher();
- SequenceI[] retrieved = null;
- int l = xrfs.length;
- for (int r = 0; r < xrfs.length; r++)
- {
- // filter out any irrelevant or irretrievable references
- if (xrfs[r] == null
- || ((source != null && !source.equals(xrfs[r]
- .getSource())) || !sftch.isFetchable(xrfs[r]
- .getSource())))
- {
- l--;
- xrfs[r] = null;
- }
- }
- if (l > 0)
- {
- // System.out
- // .println("Attempting to retrieve cross referenced sequences.");
- DBRefEntry[] t = new DBRefEntry[l];
- l = 0;
- for (int r = 0; r < xrfs.length; r++)
- {
- if (xrfs[r] != null)
- {
- t[l++] = xrfs[r];
- }
- }
- xrfs = t;
- try
- {
- retrieved = sftch.getSequences(Arrays.asList(xrfs), !dna);
- // problem here is we don't know which of xrfs resulted in which
- // retrieved element
- } catch (Exception e)
- {
- System.err
- .println("Problem whilst retrieving cross references for Sequence : "
- + seq.getName());
- e.printStackTrace();
- }
+ retrieved = sftch.getSequences(sourceRefs, !fromDna);
+ } catch (Exception e)
+ {
+ System.err
+ .println("Problem whilst retrieving cross references for Sequence : "
+ + seq.getName());
+ e.printStackTrace();
+ }
- if (retrieved != null)
+ if (retrieved != null)
+ {
+ updateDbrefMappings(seq, xrfs, retrieved, cf);
+ for (SequenceI retrievedSequence : retrieved)
+ {
+ SequenceI retrievedDss = retrievedSequence.getDatasetSequence() == null ? retrievedSequence
+ : retrievedSequence.getDatasetSequence();
+ DBRefEntry[] dbr = retrievedSequence.getDBRefs();
+ if (dbr != null)
{
- updateDbrefMappings(dna, seq, xrfs, retrieved, cf);
-
- SequenceIdMatcher matcher = new SequenceIdMatcher(
- dataset.getSequences());
- List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
- CrossRef me = new CrossRef();
- for (int rs = 0; rs < retrieved.length; rs++)
+ for (DBRefEntry dbref : dbr)
{
- // TODO: examine each sequence for 'redundancy'
- DBRefEntry[] dbr = retrieved[rs].getDBRefs();
- if (dbr != null && dbr.length > 0)
+ // find any entry where we should put in the sequence being
+ // cross-referenced into the map
+ Mapping map = dbref.getMap();
+ if (map != null)
{
- for (int di = 0; di < dbr.length; di++)
+ if (map.getTo() != null && map.getMap() != null)
{
- // find any entry where we should put in the sequence being
- // cross-referenced into the map
- Mapping map = dbr[di].getMap();
- if (map != null)
+ // TODO findInDataset requires exact sequence match but
+ // 'congruent' test only for the mapped part
+ SequenceI matched = findInDataset(map.getTo());// matcher.findIdMatch(map.getTo());
+ if (matched != null)
{
- if (map.getTo() != null && map.getMap() != null)
+ /*
+ * already got an xref to this sequence; update this
+ * map to point to the same sequence, and add
+ * any new dbrefs to it
+ */
+ DBRefEntry[] toRefs = map.getTo().getDBRefs();
+ if (toRefs != null)
{
- SequenceI matched = matcher
- .findIdMatch(map.getTo());
- if (matched != null)
- {
- /*
- * already got an xref to this sequence; update this
- * map to point to the same sequence, and add
- * any new dbrefs to it
- */
- for (DBRefEntry ref : map.getTo().getDBRefs())
- {
- matched.addDBRef(ref); // add or update mapping
- }
- map.setTo(matched);
- }
- else
+ for (DBRefEntry ref : toRefs)
{
- matcher.add(map.getTo());
+ matched.addDBRef(ref); // add or update mapping
}
- try
+ }
+ map.setTo(matched);
+ }
+ else
+ {
+ matcher.add(map.getTo());
+ }
+ try
+ {
+ // compare ms with dss and replace with dss in mapping
+ // if map is congruent
+ SequenceI ms = map.getTo();
+ int sf = map.getMap().getToLowest();
+ int st = map.getMap().getToHighest();
+ SequenceI mappedrg = ms.getSubSequence(sf, st);
+ // SequenceI loc = dss.getSubSequence(sf, st);
+ if (mappedrg.getLength() > 0
+ && ms.getSequenceAsString().equals(
+ dss.getSequenceAsString()))
+ // && mappedrg.getSequenceAsString().equals(
+ // loc.getSequenceAsString()))
+ {
+ String msg = "Mapping updated from " + ms.getName()
+ + " to retrieved crossreference "
+ + dss.getName();
+ System.out.println(msg);
+ // method to update all refs of existing To on
+ // retrieved sequence with dss and merge any props
+ // on To onto dss.
+ // TODO don't we have to change the mapped to ranges
+ // if not to the whole sequence?
+ map.setTo(dss);
+ /*
+ * copy sequence features as well, avoiding
+ * duplication (e.g. same variation from 2
+ * transcripts)
+ */
+ SequenceFeature[] sfs = ms.getSequenceFeatures();
+ if (sfs != null)
{
- // compare ms with dss and replace with dss in mapping
- // if map is congruent
- SequenceI ms = map.getTo();
- int sf = map.getMap().getToLowest();
- int st = map.getMap().getToHighest();
- SequenceI mappedrg = ms.getSubSequence(sf, st);
- // SequenceI loc = dss.getSubSequence(sf, st);
- if (mappedrg.getLength() > 0
- && ms.getSequenceAsString().equals(
- dss.getSequenceAsString()))
- // && mappedrg.getSequenceAsString().equals(
- // loc.getSequenceAsString()))
+ for (SequenceFeature feat : sfs)
{
- String msg = "Mapping updated from "
- + ms.getName()
- + " to retrieved crossreference "
- + dss.getName();
- System.out.println(msg);
- // method to update all refs of existing To on
- // retrieved sequence with dss and merge any props
- // on To onto dss.
- map.setTo(dss);
/*
- * copy sequence features as well, avoiding
- * duplication (e.g. same variation from 2
- * transcripts)
+ * make a flyweight feature object which ignores Parent
+ * attribute in equality test, to avoid creating many
+ * otherwise duplicate exon features on genomic sequence
*/
- SequenceFeature[] sfs = ms
- .getSequenceFeatures();
- if (sfs != null)
+ SequenceFeature newFeature = new SequenceFeature(
+ feat)
{
- for (SequenceFeature feat : sfs)
+ @Override
+ public boolean equals(Object o)
{
- /*
- * we override SequenceFeature.equals here (but
- * not elsewhere) to ignore Parent attribute
- * TODO not quite working yet!
- */
- if (!copiedFeatures
- .contains(me.new MySequenceFeature(
- feat)))
- {
- dss.addSequenceFeature(feat);
- copiedFeatures.add(feat);
- }
+ return super.equals(o, true);
}
- }
- cf.addMap(retrieved[rs].getDatasetSequence(),
- dss, map.getMap());
- }
- // TODO remove this 'else' and the cf.addMap above?
- else
- {
- cf.addMap(retrieved[rs].getDatasetSequence(),
- map.getTo(), map.getMap());
+ };
+ dss.addSequenceFeature(newFeature);
}
- } catch (Exception e)
- {
- System.err
- .println("Exception when consolidating Mapped sequence set...");
- e.printStackTrace(System.err);
}
}
+ cf.addMap(retrievedDss, map.getTo(), map.getMap());
+ } catch (Exception e)
+ {
+ System.err
+ .println("Exception when consolidating Mapped sequence set...");
+ e.printStackTrace(System.err);
}
}
}
- retrieved[rs].updatePDBIds();
- rseqs.add(retrieved[rs]);
}
}
+ retrievedSequence.updatePDBIds();
+ rseqs.add(retrievedSequence);
+ dataset.addSequence(retrievedDss);
+ matcher.add(retrievedSequence);
}
}
}
}
/**
+ * Returns the first identical sequence in the dataset if any, else null
+ *
+ * @param mappedTo
+ * @return
+ */
+ SequenceI findInDataset(SequenceI mappedTo)
+ {
+ if (mappedTo == null)
+ {
+ return null;
+ }
+ SequenceI dss = mappedTo.getDatasetSequence() == null ? mappedTo
+ : mappedTo.getDatasetSequence();
+ for (SequenceI seq : dataset.getSequences())
+ {
+ if (sameSequence(seq, dss))
+ {
+ return seq;
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Answers true if seq1 and seq2 contain exactly the same characters (ignoring
+ * case), else false. This method compares the lengths, then each character in
+ * turn, in order to 'fail fast'. For case-sensitive comparison, it would be
+ * possible to use Arrays.equals(seq1.getSequence(), seq2.getSequence()).
+ *
+ * @param seq1
+ * @param seq2
+ * @return
+ */
+ // TODO move to Sequence / SequenceI
+ static boolean sameSequence(SequenceI seq1, SequenceI seq2)
+ {
+ if (seq1 == seq2)
+ {
+ return true;
+ }
+ if (seq1 == null || seq2 == null)
+ {
+ return false;
+ }
+ char[] c1 = seq1.getSequence();
+ char[] c2 = seq2.getSequence();
+ if (c1.length != c2.length)
+ {
+ return false;
+ }
+ for (int i = 0; i < c1.length; i++)
+ {
+ int diff = c1[i] - c2[i];
+ /*
+ * same char or differ in case only ('a'-'A' == 32)
+ */
+ if (diff != 0 && diff != 32 && diff != -32)
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
* Updates any empty mappings in the cross-references with one to a compatible
* retrieved sequence if found, and adds any new mappings to the
* AlignedCodonFrame
*
- * @param dna
* @param mapFrom
* @param xrefs
* @param retrieved
* @param acf
*/
- static void updateDbrefMappings(boolean dna, SequenceI mapFrom,
+ void updateDbrefMappings(SequenceI mapFrom,
DBRefEntry[] xrefs, SequenceI[] retrieved, AlignedCodonFrame acf)
{
SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
}
for (SequenceI seq : matches)
{
- MapList mapping = null;
- if (dna)
- {
- mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom);
- }
- else
- {
- mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq);
- if (mapping != null)
- {
- mapping = mapping.getInverse();
- }
- }
- if (mapping != null)
- {
- xref.setMap(new Mapping(seq, mapping));
- if (dna)
- {
- AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping);
- }
- if (dna)
- {
- acf.addMap(mapFrom, seq, mapping);
- }
- else
- {
- acf.addMap(seq, mapFrom, mapping.getInverse());
- }
- continue;
- }
+ constructMapping(mapFrom, seq, xref, acf);
}
}
}
}
/**
+ * Tries to make a mapping from dna to protein. If successful, adds the
+ * mapping to the dbref and the mappings collection and answers true,
+ * otherwise answers false.
+ *
+ * @param mapFrom
+ * @param mapTo
+ * @param xref
+ * @param mappings
+ * @return
+ */
+ boolean constructMapping(SequenceI mapFrom, SequenceI mapTo,
+ DBRefEntry xref, AlignedCodonFrame mappings)
+ {
+ MapList mapping = null;
+ if (fromDna)
+ {
+ mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom);
+ }
+ else
+ {
+ mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, mapTo);
+ if (mapping != null)
+ {
+ mapping = mapping.getInverse();
+ }
+ }
+ if (mapping == null)
+ {
+ return false;
+ }
+ xref.setMap(new Mapping(mapTo, mapping));
+ if (fromDna)
+ {
+ AlignmentUtils.computeProteinFeatures(mapFrom, mapTo, mapping);
+ mappings.addMap(mapFrom, mapTo, mapping);
+ }
+ else
+ {
+ mappings.addMap(mapTo, mapFrom, mapping.getInverse());
+ }
+
+ return true;
+ }
+
+ /**
* find references to lrfs in the cross-reference set of each sequence in
* dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
* based on source and accession string only - Map and Version are nulled.
*
* @param sequenceI
* @param lrfs
- * @param dataset
* @param rseqs
* @return true if matches were found.
*/
- private static boolean searchDatasetXrefs(SequenceI sequenceI,
- boolean dna, DBRefEntry[] lrfs, AlignmentI dataset,
- List<SequenceI> rseqs, AlignedCodonFrame cf)
+ private boolean searchDatasetXrefs(SequenceI sequenceI,
+ DBRefEntry[] lrfs, List<SequenceI> rseqs, AlignedCodonFrame cf)
{
boolean found = false;
if (lrfs == null)
// add in wildcards
xref.setVersion(null);
xref.setMap(null);
- found |= searchDataset(sequenceI, xref, dataset, rseqs, cf, false,
- dna);
+ found |= searchDataset(sequenceI, xref, rseqs, cf, false);
}
return found;
}
* a sequence to ignore (start point of search)
* @param xrf
* a cross-reference to try to match
- * @param dataset
- * sequences to search in
* @param rseqs
* result list to add to
* @param cf
* a set of sequence mappings to add to
* @param direct
* - search all references or only subset
- * @param dna
- * search dna or protein xrefs (if direct=false)
* @return true if relationship found and sequence added.
*/
- public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
- AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf,
- boolean direct, boolean dna)
+ boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
+ List<SequenceI> rseqs, AlignedCodonFrame cf, boolean direct)
{
boolean found = false;
if (dataset == null)
{
continue;
}
- // check if this is the correct sequence type
+ /*
+ * only look at same molecule type if 'direct', or
+ * complementary type if !direct
+ */
{
- // TODO 'direct' is always set to false - remove?
- // or should it be 'true' from findXrefSequences?
- // also its Javadoc conflicts with its use:
- // test below implies 'direct' means find complementary sequences,
- // !direct means select same molecule type
boolean isDna = Comparison
.isNucleotide(new SequenceI[] { nxt });
- if ((direct && isDna == dna) || (!direct && isDna != dna))
+ if (direct ? (isDna != fromDna) : (isDna == fromDna))
{
// skip this sequence because it is wrong molecule type
continue;
}
else
{
- poss = DBRefUtils.selectDbRefs(!dna, poss);
+ poss = DBRefUtils.selectDbRefs(!fromDna, poss);
cands = DBRefUtils.searchRefs(poss, xrf);
}
if (!cands.isEmpty())
{
// get sense of map correct for adding to product
// alignment.
- if (dna)
+ if (fromDna)
{
// map is from dna seq to a protein product
cf.addMap(sequenceI, nxt, map);
package jalview.analysis;
+import jalview.analysis.CrossRef.MySequenceFeature;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
public class CrossRefs
{
+ /*
+ * A sub-class that ignores Parent attribute when comparing sequence
+ * features. This avoids 'duplicate' CDS features that only
+ * differ in their parent Transcript ids.
+ */
+ class MySequenceFeature extends SequenceFeature
+ {
+ private SequenceFeature feat;
+
+ MySequenceFeature(SequenceFeature sf)
+ {
+ this.feat = sf;
+ }
+
+ @Override
+ public boolean equals(Object o)
+ {
+ return feat.equals(o, true);
+ }
+ }
+
/**
* Finds cross-references for sequences from a specified source database.
* These may be found in four ways:
public static AlignmentI findXrefSequences(SequenceI[] seqs, boolean dna,
String source, AlignmentI dataset)
{
- List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
- AlignedCodonFrame mappings = new AlignedCodonFrame();
-
- List<DBRefEntry> sourceRefs = new ArrayList<DBRefEntry>();
-
+ /*
+ * filter to only those sequences of the right type (nucleotide/protein)
+ */
+ List<SequenceI> fromSeqs = new ArrayList<SequenceI>();
for (SequenceI seq : seqs)
{
- if (dna != Comparison.isNucleotide(seq))
+ if (dna == Comparison.isNucleotide(seq))
{
- /*
- * mixed alignment, and this sequence is of the wrong type
- */
- continue;
+ fromSeqs.add(seq);
}
+ }
+ return findXrefSequences(fromSeqs, dna, source, dataset);
+ }
+
+ /**
+ * Finds cross-references for sequences from a specified source database.
+ * These may be found in four ways:
+ * <ul>
+ * <li>as a DBRefEntry on the known sequence, which has a mapped-to sequence</li>
+ * <li>a sequence of complementary type in the alignment dataset, which has a
+ * DBRefEntry to one of the known sequence's 'direct' DBRefs</li>
+ * <li>a sequence of complementary type in the alignment, which has a
+ * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs</li>
+ * <li>by fetching the accession from the remote database</li>
+ * </ul>
+ *
+ * @param seqs
+ * the sequences whose cross-references we are searching for,
+ * filtered to only those which are of the type denoted by 'dna'
+ * @param dna
+ * true if the sequences are from a nucleotide alignment, else false
+ * @param source
+ * the database source we want cross-references to
+ * @param dataset
+ * the alignment dataset the sequences belong to
+ * @return an alignment containing cross-reference sequences, or null if none
+ * found
+ */
+ static AlignmentI findXrefSequences(List<SequenceI> fromSeqs,
+ boolean dna, String source, AlignmentI dataset)
+ {
+ List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
+ AlignedCodonFrame mappings = new AlignedCodonFrame();
- /*
- * get this sequence's dbrefs to source database (if any)
- */
- List<DBRefEntry> seqSourceRefs = DBRefUtils.searchRefsForSource(
- seq.getDBRefs(), source);
+ List<DBRefEntry> unresolvedRefs = new ArrayList<DBRefEntry>();
- /*
- * first extract any mapped sequences from sourceRefs
- */
- findMappedDbrefs(seq, seqSourceRefs, foundSeqs, mappings);
+ /*
+ * first extract any mapped sequences from sourceRefs
+ * if successful, sequence is removed from fromSeqs
+ * if unsuccessful, dbrefs are added to unresolvedRefs
+ */
+ findMappedDbrefs(fromSeqs, source, foundSeqs,
+ unresolvedRefs, mappings);
- /*
- * for remaining sourceRefs, try to match a
- * complementary sequence in the dataset
- */
- findIndirectCrossReferences(seq, source, seqSourceRefs, dataset,
- foundSeqs, mappings);
- }
+ /*
+ * then search the alignment dataset for dbref resolutions
+ */
+ findIndirectCrossReferences(fromSeqs, source, dataset, foundSeqs,
+ unresolvedRefs, mappings);
/*
* fetch any remaining sourceRefs from the source database
*/
- fetchCrossReferences(sourceRefs, foundSeqs, mappings, dna, dataset);
+ fetchCrossReferences(fromSeqs, unresolvedRefs, foundSeqs, mappings,
+ dna, dataset);
if (foundSeqs.isEmpty())
{
/**
* Looks for DBRefEntrys to 'source' which have a mapping to a sequence. If
* found, adds the sequence to foundSeqs and removes the dbref from the list.
+ * DBRefs with no mapping are added to the 'unresolvedRefs' list (setting
+ * version number to 0 i.e. use source and accession only).
*
- * @param seq
- * the dataset sequence we are searching from
- * @param sourceRefs
- * the sequence's dbrefs to 'source'
+ * @param fromSeqs
+ * the dataset sequences we are searching from
+ * @param source
+ * the database source we are searching dbrefs for
* @param foundSeqs
- * a list of cross-references to add to
+ * a list of found sequences to add to
+ * @param unresolvedRefs
+ * a list of unresolved cross-references to add to
* @param mappings
* a set of sequence mappings to add to
* @return
*/
- static void findMappedDbrefs(SequenceI seq, List<DBRefEntry> sourceRefs,
- List<SequenceI> foundSeqs, AlignedCodonFrame mappings)
+ static void findMappedDbrefs(List<SequenceI> fromSeqs, String source,
+ List<SequenceI> foundSeqs, List<DBRefEntry> unresolvedRefs,
+ AlignedCodonFrame mappings)
{
- Iterator<DBRefEntry> refs = sourceRefs.iterator();
- while (refs.hasNext())
+ Iterator<SequenceI> it = fromSeqs.iterator();
+ while (it.hasNext())
{
- DBRefEntry dbref = refs.next();
- Mapping map = dbref.getMap();
- if (map != null)
+ SequenceI seq = it.next();
+ SequenceI dss = seq.getDatasetSequence();
+ dss = dss == null ? seq : dss;
+
+ DBRefEntry[] dbRefs = seq.getDBRefs();
+ if (dbRefs == null)
+ {
+ continue;
+ }
+ boolean resolved = false;
+ for (DBRefEntry dbref : dbRefs)
{
- SequenceI mappedTo = map.getTo();
- if (mappedTo != null)
+ if (!source.equals(dbref.getSource()))
{
- foundSeqs.add(new Sequence(mappedTo));
- refs.remove();
-
- /*
- * check mapping is not 'direct' (it shouldn't be if we reach here)
- * and add mapping (dna-to-peptide or vice versa) to the set
- */
- MapList mapList = map.getMap();
- int fromRatio = mapList.getFromRatio();
- int toRatio = mapList.getToRatio();
- if (fromRatio != toRatio)
+ continue;
+ }
+ DBRefEntry todo = new DBRefEntry(dbref.getSource(), "0",
+ dbref.getAccessionId());
+ Mapping map = dbref.getMap();
+ if (map != null)
+ {
+ unresolvedRefs.remove(todo);
+ resolved = true;
+ SequenceI mappedTo = map.getTo();
+ if (mappedTo != null)
{
- if (fromRatio == 3)
- {
- mappings.addMap(seq, mappedTo, mapList);
- }
- else
+ foundSeqs.add(new Sequence(mappedTo));
+
+ /*
+ * check mapping is not 'direct' (it shouldn't be if we reach here)
+ * and add mapping (dna-to-peptide or vice versa) to the set
+ */
+ MapList mapList = map.getMap();
+ int fromRatio = mapList.getFromRatio();
+ int toRatio = mapList.getToRatio();
+ if (fromRatio != toRatio)
{
- mappings.addMap(mappedTo, seq, mapList.getInverse());
+ if (fromRatio == 3)
+ {
+ mappings.addMap(dss, mappedTo, mapList);
+ }
+ else
+ {
+ mappings.addMap(mappedTo, dss, mapList.getInverse());
+ }
}
}
}
+ else
+ {
+ /*
+ * no mapping to resolve dbref - add source+accession to list to resolve
+ */
+ if (!unresolvedRefs.contains(todo))
+ {
+ unresolvedRefs.add(todo);
+ }
+ }
+ }
+ if (resolved)
+ {
+ it.remove();
}
}
}
* to the foundSeqs list. If found, tries to make a mapping between seq and
* the retrieved sequence and insert it into the database reference.
*
- * @param seq
+ * @param fromSeqs
* @param sourceRefs
* @param foundSeqs
* @param mappings
* @param dna
*/
- static void fetchCrossReferences(SequenceI seq,
+ static void fetchCrossReferences(List<SequenceI> fromSeqs,
List<DBRefEntry> sourceRefs, List<SequenceI> foundSeqs,
AlignedCodonFrame mappings, boolean dna, AlignmentI dataset)
{
retrieved = sftch.getSequences(sourceRefs, !dna);
} catch (Exception e)
{
- System.err
- .println("Problem whilst retrieving cross references for Sequence : "
- + seq.getName());
+ System.err.println("Problem whilst retrieving cross references: "
+ + e.getMessage());
e.printStackTrace();
return;
}
- if (retrieved != null)
+ if (retrieved == null)
{
- updateDbrefMappings(dna, seq, sourceRefs, retrieved, mappings);
+ return;
+ }
+ updateDbrefMappings(dna, fromSeqs, sourceRefs, retrieved, mappings);
- SequenceIdMatcher matcher = new SequenceIdMatcher(
- dataset.getSequences());
- List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
- CrossRef me = new CrossRef();
- for (int rs = 0; rs < retrieved.length; rs++)
+ SequenceIdMatcher matcher = new SequenceIdMatcher(
+ dataset.getSequences());
+ List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
+ CrossRefs me = new CrossRefs();
+ for (int rs = 0; rs < retrieved.length; rs++)
+ {
+ // TODO: examine each sequence for 'redundancy'
+ DBRefEntry[] dbr = retrieved[rs].getDBRefs();
+ if (dbr != null && dbr.length > 0)
{
- // TODO: examine each sequence for 'redundancy'
- DBRefEntry[] dbr = retrieved[rs].getDBRefs();
- if (dbr != null && dbr.length > 0)
+ for (int di = 0; di < dbr.length; di++)
{
- for (int di = 0; di < dbr.length; di++)
+ // find any entry where we should put in the sequence being
+ // cross-referenced into the map
+ Mapping map = dbr[di].getMap();
+ if (map != null)
{
- // find any entry where we should put in the sequence being
- // cross-referenced into the map
- Mapping map = dbr[di].getMap();
- if (map != null)
+ if (map.getTo() != null && map.getMap() != null)
{
- if (map.getTo() != null && map.getMap() != null)
+ SequenceI matched = matcher.findIdMatch(map.getTo());
+ if (matched != null)
{
- SequenceI matched = matcher.findIdMatch(map.getTo());
- if (matched != null)
- {
- /*
- * already got an xref to this sequence; update this
- * map to point to the same sequence, and add
- * any new dbrefs to it
- */
- for (DBRefEntry ref : map.getTo().getDBRefs())
- {
- matched.addDBRef(ref); // add or update mapping
- }
- map.setTo(matched);
- }
- else
+ /*
+ * already got an xref to this sequence; update this
+ * map to point to the same sequence, and add
+ * any new dbrefs to it
+ */
+ for (DBRefEntry ref : map.getTo().getDBRefs())
{
- matcher.add(map.getTo());
+ matched.addDBRef(ref); // add or update mapping
}
- try
+ map.setTo(matched);
+ }
+ else
+ {
+ matcher.add(map.getTo());
+ }
+ try
+ {
+ // compare ms with dss and replace with dss in mapping
+ // if map is congruent
+ SequenceI ms = map.getTo();
+ int sf = map.getMap().getToLowest();
+ int st = map.getMap().getToHighest();
+ SequenceI mappedrg = ms.getSubSequence(sf, st);
+ // SequenceI loc = dss.getSubSequence(sf, st);
+ if (mappedrg.getLength() > 0
+ && ms.getSequenceAsString().equals(
+ fromSeqs.getSequenceAsString()))
+ // && mappedrg.getSequenceAsString().equals(
+ // loc.getSequenceAsString()))
{
- // compare ms with dss and replace with dss in mapping
- // if map is congruent
- SequenceI ms = map.getTo();
- int sf = map.getMap().getToLowest();
- int st = map.getMap().getToHighest();
- SequenceI mappedrg = ms.getSubSequence(sf, st);
- // SequenceI loc = dss.getSubSequence(sf, st);
- if (mappedrg.getLength() > 0
- && ms.getSequenceAsString().equals(
- seq.getSequenceAsString()))
- // && mappedrg.getSequenceAsString().equals(
- // loc.getSequenceAsString()))
+ String msg = "Mapping updated from " + ms.getName()
+ + " to retrieved crossreference "
+ + fromSeqs.getName();
+ System.out.println(msg);
+ // method to update all refs of existing To on
+ // retrieved sequence with dss and merge any props
+ // on To onto dss.
+ map.setTo(fromSeqs);
+ /*
+ * copy sequence features as well, avoiding
+ * duplication (e.g. same variation from 2
+ * transcripts)
+ */
+ SequenceFeature[] sfs = ms.getSequenceFeatures();
+ if (sfs != null)
{
- String msg = "Mapping updated from " + ms.getName()
- + " to retrieved crossreference "
- + seq.getName();
- System.out.println(msg);
- // method to update all refs of existing To on
- // retrieved sequence with dss and merge any props
- // on To onto dss.
- map.setTo(seq);
- /*
- * copy sequence features as well, avoiding
- * duplication (e.g. same variation from 2
- * transcripts)
- */
- SequenceFeature[] sfs = ms.getSequenceFeatures();
- if (sfs != null)
+ for (SequenceFeature feat : sfs)
{
- for (SequenceFeature feat : sfs)
+ /*
+ * we override SequenceFeature.equals here (but
+ * not elsewhere) to ignore Parent attribute
+ * TODO not quite working yet!
+ */
+ if (!copiedFeatures
+ .contains(me.new MySequenceFeature(feat)))
{
- /*
- * we override SequenceFeature.equals here (but
- * not elsewhere) to ignore Parent attribute
- * TODO not quite working yet!
- */
- if (!copiedFeatures
- .contains(me.new MySequenceFeature(feat)))
- {
- seq.addSequenceFeature(feat);
- copiedFeatures.add(feat);
- }
+ fromSeqs.addSequenceFeature(feat);
+ copiedFeatures.add(feat);
}
}
}
- mappings.addMap(retrieved[rs].getDatasetSequence(),
- map.getTo(), map.getMap());
- } catch (Exception e)
- {
- System.err
- .println("Exception when consolidating Mapped sequence set...");
- e.printStackTrace(System.err);
}
+ mappings.addMap(retrieved[rs].getDatasetSequence(),
+ map.getTo(), map.getMap());
+ } catch (Exception e)
+ {
+ System.err
+ .println("Exception when consolidating Mapped sequence set...");
+ e.printStackTrace(System.err);
}
}
}
}
- retrieved[rs].updatePDBIds();
- foundSeqs.add(retrieved[rs]);
}
+ retrieved[rs].updatePDBIds();
+ foundSeqs.add(retrieved[rs]);
}
}
* shares a DBRefEntry with it. If found, adds the sequence to foundSeqs and
* removes the resolved sourceRef from the search list.
*
- * @param seq
+ * @param fromSeqs
* @param source
- * @param sourceRefs
- * @param dataset
+ * @param unresolvedRefs
* @param foundSeqs
+ * @param unresolvedRefs
* @param mappings
* @return
*/
- static void findIndirectCrossReferences(SequenceI seq, String source,
- List<DBRefEntry> sourceRefs, AlignmentI dataset,
- List<SequenceI> foundSeqs, AlignedCodonFrame mappings)
+ static void findIndirectCrossReferences(List<SequenceI> fromSeqs,
+ String source, AlignmentI dataset,
+ List<SequenceI> foundSeqs, List<DBRefEntry> unresolvedRefs,
+ AlignedCodonFrame mappings)
{
- Iterator<DBRefEntry> refs = sourceRefs.iterator();
+ Iterator<DBRefEntry> refs = unresolvedRefs.iterator();
while (refs.hasNext())
{
DBRefEntry dbref = refs.next();
- boolean found = searchDatasetForCrossReference(seq, dbref, dataset,
- foundSeqs, mappings);
+ boolean found = false;
+ // boolean found = searchDatasetForCrossReference(fromSeqs, dbref,
+ // foundSeqs,
+ // unresolvedRefs, mappings);
if (found)
{
refs.remove();
* AlignedCodonFrame
*
* @param dna
- * @param mapFrom
+ * @param fromSeqs
* @param xrefs
* @param retrieved
* @param mappings
*/
- static void updateDbrefMappings(boolean dna, SequenceI mapFrom,
+ static void updateDbrefMappings(boolean dna, List<SequenceI> fromSeqs,
List<DBRefEntry> xrefs, SequenceI[] retrieved,
AlignedCodonFrame mappings)
{
MapList mapping = null;
if (dna)
{
- mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom);
+ mapping = AlignmentUtils.mapCdnaToProtein(seq, fromSeqs);
}
else
{
- mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq);
+ mapping = AlignmentUtils.mapCdnaToProtein(fromSeqs, seq);
if (mapping != null)
{
mapping = mapping.getInverse();
xref.setMap(new Mapping(seq, mapping));
if (dna)
{
- AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping);
+ AlignmentUtils.computeProteinFeatures(fromSeqs, seq, mapping);
}
if (dna)
{
- mappings.addMap(mapFrom, seq, mapping);
+ mappings.addMap(fromSeqs, seq, mapping);
}
else
{
- mappings.addMap(seq, mapFrom, mapping.getInverse());
+ mappings.addMap(seq, fromSeqs, mapping.getInverse());
}
continue;
}
{
if (s != null)
{
- id = new String(s.toLowerCase());
+ id = s.toLowerCase();
}
else
{
.indexOf(s.charAt(id.length())) > -1)) : false;
}
}
+
+ /**
+ * toString method returns the wrapped sequence id. For debugging purposes
+ * only, behaviour not guaranteed not to change.
+ */
+ @Override
+ public String toString()
+ {
+ return id;
+ }
}
}
{
for (DBRefEntry dbref : feature.dbRefs)
{
+ /*
+ * convert UniProtKB/Swiss-Prot to UNIPROT
+ */
+ dbref.setSource(DBRefUtils.getCanonicalName(dbref.getSource()));
dna.addDBRef(dbref);
}
}
}
/*
- * add dbRefs to sequence, and mappings for Uniprot xrefs
+ * add mappings for Uniprot xrefs
*/
if (feature.dbRefs != null)
{
boolean mappingUsed = false;
for (DBRefEntry ref : feature.dbRefs)
{
- ref.setSource(DBRefUtils.getCanonicalName(ref.getSource()));
if (ref.getSource().equals(DBRefSource.UNIPROT))
{
String proteinSeqName = DBRefSource.UNIPROT + "|"
}
}
}
- dna.addDBRef(ref);
}
if (noProteinDbref && product != null)
{
import jalview.analysis.AlignmentSorter;
import jalview.analysis.AlignmentUtils;
import jalview.analysis.CrossRef;
-import jalview.analysis.CrossRefs;
import jalview.analysis.Dna;
import jalview.analysis.ParseProperties;
import jalview.analysis.SequenceIdMatcher;
}
/**
- * Searches selected sequences for xRef products and builds the Show
- * Cross-References menu (formerly called Show Products)
+ * Searches the alignment sequences for xRefs and builds the Show
+ * Cross-References menu (formerly called Show Products), with database
+ * sources for which cross-references are found (protein sources for a
+ * nucleotide alignment and vice versa)
*
- * @return true if Show Cross-references menu should be enabled.
+ * @return true if Show Cross-references menu should be enabled
*/
public boolean canShowProducts()
{
- SequenceI[] selection = viewport.getSequenceSelection();
+ SequenceI[] seqs = viewport.getAlignment().getSequencesArray();
AlignmentI dataset = viewport.getAlignment().getDataset();
boolean showp = false;
try
{
showProducts.removeAll();
final boolean dna = viewport.getAlignment().isNucleotide();
- List<String> ptypes = (selection == null || selection.length == 0) ? null
- : CrossRef.findXrefSourcesForSequences(dna, selection, dataset);
+ List<String> ptypes = (seqs == null || seqs.length == 0) ? null
+ : new CrossRef(seqs, dataset)
+ .findXrefSourcesForSequences();
for (final String source : ptypes)
{
{
AlignmentI alignment = AlignFrame.this.getViewport()
.getAlignment();
- AlignmentI xrefs = CrossRefs.findXrefSequences(sel, dna, source,
- alignment);
+ AlignmentI xrefs = new CrossRef(sel, alignment)
+ .findXrefSequences(source);
if (xrefs != null)
{
/*
{
/*
* note lowest and highest values - bearing in mind the
- * direction may be revesed
+ * direction may be reversed
*/
fromLowest = Math.min(fromLowest, Math.min(from[i], from[i + 1]));
fromHighest = Math.max(fromHighest, Math.max(from[i], from[i + 1]));
import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertFalse;
+import static org.testng.AssertJUnit.assertNotNull;
import static org.testng.AssertJUnit.assertNotSame;
import static org.testng.AssertJUnit.assertNull;
import static org.testng.AssertJUnit.assertSame;
/*
* first with no dbrefs to search
*/
- CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+ sources = new CrossRef(new SequenceI[] { seq }, al)
+ .findXrefSourcesForSequences();
assertTrue(sources.isEmpty());
/*
seq.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
seq.addDBRef(new DBRefEntry("ENSEMBL", "0", "E2349"));
seq.addDBRef(new DBRefEntry("ENSEMBLGENOMES", "0", "E2350"));
- CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+ sources = new CrossRef(new SequenceI[] { seq }, al)
+ .findXrefSourcesForSequences();
assertEquals(4, sources.size());
assertEquals("[EMBL, EMBLCDS, GENEDB, ENSEMBL]",
sources.toString());
seq2.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
// TODO include ENSEMBLGENOMES in DBRefSource.DNACODINGDBS ?
al.addSequence(seq2);
- CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+ sources = new CrossRef(new SequenceI[] { seq }, al)
+ .findXrefSourcesForSequences();
assertEquals(3, sources.size());
assertEquals("[EMBLCDS, EMBL, GENEDB]", sources.toString());
}
* - but peptide with matching nucleotide dbref does, so is returned
*/
AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
- Alignment xrefs = CrossRef.findXrefSequences(
- new SequenceI[] { emblSeq }, true, "UNIPROT", al);
+ Alignment xrefs = new CrossRef(new SequenceI[] { emblSeq }, al)
+ .findXrefSequences("UNIPROT");
assertEquals(1, xrefs.getHeight());
assertSame(uniprotSeq, xrefs.getSequenceAt(0));
}
* - but nucleotide with matching peptide dbref does, so is returned
*/
AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
- Alignment xrefs = CrossRef.findXrefSequences(
- new SequenceI[] { uniprotSeq }, false, "EMBL", al);
+ Alignment xrefs = new CrossRef(new SequenceI[] { uniprotSeq },
+ al).findXrefSequences("EMBL");
assertEquals(1, xrefs.getHeight());
assertSame(emblSeq, xrefs.getSequenceAt(0));
}
* equatable to it, so no results found
*/
AlignmentI al = new Alignment(new SequenceI[] { dna1, dna2 });
- Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna2 },
- true, "UNIPROT", al);
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna2 }, al)
+ .findXrefSequences("UNIPROT");
assertNull(xrefs);
}
* first search for a dbref nowhere on the alignment:
*/
DBRefEntry dbref = new DBRefEntry("UNIPROT", "0", "P30419");
- boolean found = CrossRef.searchDataset(dna1, dbref, al, result, null,
- true, true);
+ CrossRef testee = new CrossRef(al.getSequencesArray(), al);
+ boolean found = testee.searchDataset(dna1, dbref, result, null, true);
assertFalse(found);
assertTrue(result.isEmpty());
* search for a protein sequence with dbref UNIPROT:Q9ZTS2
*/
dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
- found = CrossRef.searchDataset(dna1, dbref, al, result, null, true,
- true);
+ found = testee.searchDataset(dna1, dbref, result, null, true);
assertTrue(found);
assertEquals(1, result.size());
assertSame(pep1, result.get(0));
*/
result.clear();
dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
- found = CrossRef.searchDataset(pep1, dbref, al, result, null, true,
- false);
+ found = testee.searchDataset(pep1, dbref, result, null, false);
assertTrue(found);
assertEquals(1, result.size());
assertSame(dna1, result.get(0));
* mapped sequences
*/
AlignmentI al = new Alignment(new SequenceI[] { dna1 });
- Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna1 },
- true, "UNIPROT", al);
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
+ .findXrefSequences("UNIPROT");
assertEquals(2, xrefs.getHeight());
/*
final SequenceI pep1 = new Sequence("Q9ZTS2", "MYQLIRSSW");
final SequenceI pep2 = new Sequence("P00314", "MRKLLAASG");
- SequenceFetcher mockFetcher = new SequenceFetcher()
+ /*
+ * argument false suppresses adding DAS sources
+ * todo: define an interface type SequenceFetcherI and mock that
+ */
+ SequenceFetcher mockFetcher = new SequenceFetcher(false)
{
-
@Override
public boolean isFetchable(String source)
{
* find UNIPROT xrefs for nucleotide sequence
*/
AlignmentI al = new Alignment(new SequenceI[] { dna1 });
- Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna1 },
- true, "UNIPROT", al);
+ Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
+ .findXrefSequences("UNIPROT");
assertEquals(2, xrefs.getHeight());
assertSame(pep1, xrefs.getSequenceAt(0));
assertSame(pep2, xrefs.getSequenceAt(1));
SequenceFetcherFactory.setSequenceFetcher(null);
}
+ /**
+ * Test for finding 'product' sequences for the case where both gene and
+ * transcript sequences have dbrefs to Uniprot.
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_forGeneAndTranscripts()
+ {
+ /*
+ * 'gene' sequence
+ */
+ SequenceI gene = new Sequence("ENSG00000157764", "CGCCTCCCTTCCCC");
+ gene.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
+ gene.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
+
+ /*
+ * 'transcript' with CDS feature (supports mapping to protein)
+ */
+ SequenceI braf001 = new Sequence("ENST00000288602", "taagATGGCGGCGCTGa");
+ braf001.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
+ braf001.addSequenceFeature(new SequenceFeature("CDS", "", 5, 16, 0f,
+ null));
+
+ /*
+ * 'spliced transcript' with CDS ranges
+ */
+ SequenceI braf002 = new Sequence("ENST00000497784", "gCAGGCtaTCTGTTCaa");
+ braf002.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
+ braf002.addSequenceFeature(new SequenceFeature("CDS", "", 2, 6, 0f,
+ null));
+ braf002.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, 0f,
+ null));
+
+ /*
+ * TODO code is fragile - use of SequenceIdMatcher depends on fetched
+ * sequences having a name starting Source|Accession
+ * which happens to be true for Uniprot,PDB,EMBL but not Pfam,Rfam,Ensembl
+ */
+ final SequenceI pep1 = new Sequence("UNIPROT|P15056", "MAAL");
+ final SequenceI pep2 = new Sequence("UNIPROT|H7C5K3", "QALF");
+
+ /*
+ * argument false suppresses adding DAS sources
+ * todo: define an interface type SequenceFetcherI and mock that
+ */
+ SequenceFetcher mockFetcher = new SequenceFetcher(false)
+ {
+ @Override
+ public boolean isFetchable(String source)
+ {
+ return true;
+ }
+
+ @Override
+ public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
+ {
+ return new SequenceI[] { pep1, pep2 };
+ }
+ };
+ SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+
+ /*
+ * find UNIPROT xrefs for gene and transcripts
+ * verify that
+ * - the two proteins are retrieved but not duplicated
+ * - mappings are built from transcript (CDS) to proteins
+ * - no mappings from gene to proteins
+ */
+ SequenceI[] seqs = new SequenceI[] { gene, braf001, braf002 };
+ AlignmentI al = new Alignment(seqs);
+ Alignment xrefs = new CrossRef(seqs, al)
+ .findXrefSequences("UNIPROT");
+ assertEquals(2, xrefs.getHeight());
+ assertSame(pep1, xrefs.getSequenceAt(0));
+ assertSame(pep2, xrefs.getSequenceAt(1));
+ }
+
+ /**
+ * <pre>
+ * Test that emulates this (real but simplified) case:
+ * Alignment: DBrefs
+ * UNIPROT|P0CE19 EMBL|J03321, EMBL|X06707, EMBL|M19487
+ * UNIPROT|P0CE20 EMBL|J03321, EMBL|X06707, EMBL|X07547
+ * Find cross-references for EMBL. These are mocked here as
+ * EMBL|J03321 with mappings to P0CE18, P0CE19, P0CE20
+ * EMBL|X06707 with mappings to P0CE17, P0CE19, P0CE20
+ * EMBL|M19487 with mappings to P0CE19, Q46432
+ * EMBL|X07547 with mappings to P0CE20, B0BCM4
+ * EMBL sequences are first 'fetched' (mocked here) for P0CE19.
+ * The 3 EMBL sequences are added to the alignment dataset.
+ * Their dbrefs to Uniprot products P0CE19 and P0CE20 should be matched in the
+ * alignment dataset and updated to reference the original Uniprot sequences.
+ * For the second Uniprot sequence, the J03321 and X06707 xrefs should be
+ * resolved from the dataset, and only the X07547 dbref fetched.
+ * So the end state to verify is:
+ * - 4 cross-ref sequences returned: J03321, X06707, M19487, X07547
+ * - P0CE19/20 dbrefs to EMBL sequences now have mappings
+ * - J03321 dbrefs to P0CE19/20 mapped to original Uniprot sequences
+ * - X06707 dbrefs to P0CE19/20 mapped to original Uniprot sequences
+ * </pre>
+ */
+ @Test(groups = { "Functional" })
+ public void testFindXrefSequences_uniprotEmblManyToMany()
+ {
+ /*
+ * Uniprot sequences, both with xrefs to EMBL|J03321
+ * and EMBL|X07547
+ * Sequences faked to ensure dna translates to protein
+ * (so that mappings can be made)
+ */
+ SequenceI p0ce19 = new Sequence("UNIPROT|P0CE19", "KPFG");
+ p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
+ p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
+ p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "M19487"));
+ SequenceI p0ce20 = new Sequence("UNIPROT|P0CE20", "KPFG");
+ p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
+ p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
+ p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X07547"));
+
+ /*
+ * EMBL sequences to be 'fetched', complete with dbrefs and mappings
+ * to their protein products (CDS location and translations are provided
+ * in EMBL XML); these should be matched to, and replaced with,
+ * the corresponding uniprot sequences after fetching
+ */
+
+ /*
+ * J03321 with mappings to P0CE19 and P0CE20
+ */
+ final SequenceI j03321 = new Sequence("EMBL|J03321", "AAACCCTTTGGG");
+ DBRefEntry dbref1 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+ MapList mapList = new MapList(new int[] { 1, 18 },
+ new int[] { 1, 6 }, 3, 1);
+ Mapping map = new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), mapList);
+ // add a dbref to the mapped to sequence - should get copied to p0ce19
+ map.getTo().addDBRef(new DBRefEntry("PIR", "0", "S01875"));
+ dbref1.setMap(map);
+ j03321.addDBRef(dbref1);
+ DBRefEntry dbref2 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+ dbref2.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "KPFG"),
+ new MapList(mapList)));
+ j03321.addDBRef(dbref2);
+
+ /*
+ * X06707 with mappings to P0CE19 and P0CE20
+ */
+ final SequenceI x06707 = new Sequence("EMBL|X06707", "atgAAACCCTTTGGG");
+ // TODO CrossRef.constructMapping ignores the reverse mapping ??
+ // should it not use its inverse if available?
+ // how does this work for real?
+ DBRefEntry dbref3 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+ MapList map2 = new MapList(new int[] { 4, 21 }, new int[] { 1, 6 }, 3,
+ 1);
+ dbref3.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), map2));
+ x06707.addDBRef(dbref3);
+ DBRefEntry dbref4 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+ dbref4.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "KPFG"),
+ new MapList(mapList)));
+ x06707.addDBRef(dbref4);
+
+ /*
+ * M19487 with mapping to P0CE19 and Q46432
+ */
+ final SequenceI m19487 = new Sequence("EMBL|M19487", "AAACCCTTTGGG");
+ DBRefEntry dbref5 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+ dbref5.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"),
+ new MapList(mapList)));
+ m19487.addDBRef(dbref5);
+ DBRefEntry dbref6 = new DBRefEntry("UNIPROT", "0", "Q46432");
+ dbref6.setMap(new Mapping(new Sequence("UNIPROT|Q46432", "KPFG"),
+ new MapList(mapList)));
+ m19487.addDBRef(dbref6);
+
+ /*
+ * X07547 with mapping to P0CE20 and B0BCM4
+ */
+ final SequenceI x07547 = new Sequence("EMBL|X07547", "cccAAACCCTTTGGG");
+ DBRefEntry dbref7 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+ dbref7.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"),
+ new MapList(map2)));
+ x07547.addDBRef(dbref7);
+ DBRefEntry dbref8 = new DBRefEntry("UNIPROT", "0", "B0BCM4");
+ dbref8.setMap(new Mapping(new Sequence("UNIPROT|B0BCM4", "KPFG"),
+ new MapList(map2)));
+ x07547.addDBRef(dbref8);
+
+ /*
+ * mock sequence fetcher to 'return' the EMBL sequences
+ * TODO: Mockito would allow .thenReturn().thenReturn() here,
+ * and also capture and verification of the parameters
+ * passed in calls to getSequences()
+ */
+ SequenceFetcher mockFetcher = new SequenceFetcher(false)
+ {
+ int call = 0;
+ @Override
+ public boolean isFetchable(String source)
+ {
+ return true;
+ }
+ @Override
+ public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
+ {
+ call++;
+ return call == 1 ? new SequenceI[] { j03321, x06707, m19487 }
+ : new SequenceI[] { x07547 };
+ }
+ };
+ SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+
+ /*
+ * find EMBL xrefs for Uniprot seqs and verify that
+ * - the EMBL xref'd sequences are retrieved without duplicates
+ * - mappings are added to the Uniprot dbrefs
+ * - mappings in the EMBL-to-Uniprot dbrefs are updated to the
+ * alignment sequences
+ * - dbrefs on the EMBL sequences are added to the original dbrefs
+ */
+ SequenceI[] seqs = new SequenceI[] { p0ce19, p0ce20 };
+ AlignmentI al = new Alignment(seqs);
+ Alignment xrefs = new CrossRef(seqs, al)
+ .findXrefSequences("EMBL");
+
+ /*
+ * verify retrieved sequences
+ */
+ assertNotNull(xrefs);
+ assertEquals(4, xrefs.getHeight());
+ assertSame(j03321, xrefs.getSequenceAt(0));
+ assertSame(x06707, xrefs.getSequenceAt(1));
+ assertSame(m19487, xrefs.getSequenceAt(2));
+ assertSame(x07547, xrefs.getSequenceAt(3));
+
+ /*
+ * verify mappings added to Uniprot-to-EMBL dbrefs
+ */
+ Mapping mapping = p0ce19.getDBRefs()[0].getMap();
+ assertSame(j03321, mapping.getTo());
+ mapping = p0ce19.getDBRefs()[1].getMap();
+ assertSame(x06707, mapping.getTo());
+ mapping = p0ce20.getDBRefs()[0].getMap();
+ assertSame(j03321, mapping.getTo());
+ mapping = p0ce20.getDBRefs()[1].getMap();
+ assertSame(x06707, mapping.getTo());
+
+ /*
+ * verify dbrefs on EMBL are mapped to alignment seqs
+ */
+ assertSame(p0ce19, j03321.getDBRefs()[0].getMap().getTo());
+ assertSame(p0ce20, j03321.getDBRefs()[1].getMap().getTo());
+ assertSame(p0ce19, x06707.getDBRefs()[0].getMap().getTo());
+ assertSame(p0ce20, x06707.getDBRefs()[1].getMap().getTo());
+
+ /*
+ * verify new dbref on EMBL dbref mapping is copied to the
+ * original Uniprot sequence
+ */
+ assertEquals(4, p0ce19.getDBRefs().length);
+ assertEquals("PIR", p0ce19.getDBRefs()[3].getSource());
+ assertEquals("S01875", p0ce19.getDBRefs()[3].getAccessionId());
+ }
+
+ @Test(groups = "Functional")
+ public void testSameSequence()
+ {
+ assertTrue(CrossRef.sameSequence(null, null));
+ SequenceI seq1 = new Sequence("seq1", "ABCDEF");
+ assertFalse(CrossRef.sameSequence(seq1, null));
+ assertFalse(CrossRef.sameSequence(null, seq1));
+ assertTrue(CrossRef.sameSequence(seq1, new Sequence("seq2", "ABCDEF")));
+ assertTrue(CrossRef.sameSequence(seq1, new Sequence("seq2", "abcdef")));
+ assertFalse(CrossRef
+ .sameSequence(seq1, new Sequence("seq2", "ABCDE-F")));
+ assertFalse(CrossRef.sameSequence(seq1, new Sequence("seq2", "BCDEF")));
+ }
}
* case insensitive matching
*/
assertTrue(testee.equals("a12345"));
+
+ testee = sequenceIdMatcher.new SeqIdName("UNIPROT|A12345");
+ assertFalse(testee.equals("A12345"));
+ assertFalse(testee.equals("UNIPROT|B98765"));
+ assertFalse(testee.equals("UNIPROT|"));
+ assertTrue(testee.equals("UNIPROT"));
}
}
assertEquals(87064, ranges.get(1)[1]);
/*
- * beyond 5' or 3' locus
- */
- ranges = DnaUtils.parseLocation("<34..126");
- assertEquals(1, ranges.size());
- assertEquals(34, ranges.get(0)[0]);
- assertEquals(126, ranges.get(0)[1]);
- ranges = DnaUtils.parseLocation("35..>127");
- assertEquals(1, ranges.size());
- assertEquals(35, ranges.get(0)[0]);
- assertEquals(127, ranges.get(0)[1]);
-
- /*
* valid things we don't yet handle
*/
+ assertNull(DnaUtils.parseLocation("<34..126"));
+ assertNull(DnaUtils.parseLocation("35..>126"));
assertNull(DnaUtils.parseLocation("34.126"));
assertNull(DnaUtils.parseLocation("34^126"));
assertNull(DnaUtils.parseLocation("order(34..126,130..180)"));
{
boolean dna = sp.isDnaCoding();
// try and find products
- List<String> types = CrossRef.findXrefSourcesForSequences(dna,
- al.getSequencesArray(), null);
+ CrossRef crossRef = new CrossRef(al.getSequencesArray(),
+ al);
+ List<String> types = crossRef.findXrefSourcesForSequences();
if (types != null)
{
System.out.println("Xref Types for: "
for (String source : types)
{
System.out.println("Type: " + source);
- SequenceI[] prod = jalview.analysis.CrossRef
- .findXrefSequences(al.getSequencesArray(), dna,
- source, null)
+ SequenceI[] prod = crossRef.findXrefSequences(source)
.getSequencesArray();
System.out.println("Found "
+ ((prod == null) ? "no" : "" + prod.length)
// have a bash at finding the products amongst all the retrieved
// sequences.
SequenceI[] seqs = al.getSequencesArray();
- Alignment prodal = jalview.analysis.CrossRef.findXrefSequences(
- seqs, dna, null, ds);
+ Alignment prodal = new CrossRef(seqs, ds)
+ .findXrefSequences(null);
System.out.println("Found "
+ ((prodal == null) ? "no" : "" + prodal.getHeight())
+ " products");
.getMap().getMappedWidth(), 1);
assertEquals("Expected local reference map to be 3 nucleotides", dr[0]
.getMap().getWidth(), 3);
- AlignmentI sprods = CrossRef.findXrefSequences(
- alsq.getSequencesArray(), true, dr[0].getSource(), alsq);
+ AlignmentI sprods = new CrossRef(alsq.getSequencesArray(), alsq)
+ .findXrefSequences(dr[0].getSource());
assertNotNull(
"Couldn't recover cross reference sequence from dataset. Was it ever added ?",
sprods);