From 9c39e96af6b84257604da448101505361dced686 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 9 Jun 2016 14:55:45 +0100 Subject: [PATCH] JAL-2110 work in progress --- src/jalview/analysis/AlignmentUtils.java | 29 +- src/jalview/analysis/CrossRef.java | 587 ++++++++++++---------- src/jalview/analysis/CrossRefs.java | 410 +++++++++------ src/jalview/analysis/SequenceIdMatcher.java | 12 +- src/jalview/datamodel/xdb/embl/EmblEntry.java | 8 +- src/jalview/gui/AlignFrame.java | 20 +- src/jalview/util/MapList.java | 2 +- test/jalview/analysis/CrossRefTest.java | 321 +++++++++++- test/jalview/analysis/SequenceIdMatcherTest.java | 6 + test/jalview/util/DnaUtilsTest.java | 14 +- test/jalview/ws/SequenceFetcherTest.java | 13 +- test/jalview/ws/seqfetcher/DbRefFetcherTest.java | 4 +- 12 files changed, 944 insertions(+), 482 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 081f446..949c47a 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -1411,6 +1411,12 @@ public class AlignmentUtils { List cdsSeqs = new ArrayList(); + /* + * construct CDS sequences from the (cds-to-protein) mappings made earlier; + * this makes it possible to model multiple products from dna (e.g. EMBL); + * however it does mean we don't have the EMBL protein_id (a property on + * the CDS features) in order to make the CDS sequence name :-( + */ for (SequenceI seq : dna) { AlignedCodonFrame cdsMappings = new AlignedCodonFrame(); @@ -1795,17 +1801,20 @@ public class AlignmentUtils * sort to get sequence features in start position order * - would be better to store in Sequence as a TreeSet or NCList? */ - Arrays.sort(peptide.getSequenceFeatures(), - new Comparator() - { - @Override - public int compare(SequenceFeature o1, SequenceFeature o2) + if (peptide.getSequenceFeatures() != null) + { + Arrays.sort(peptide.getSequenceFeatures(), + new Comparator() { - int c = Integer.compare(o1.getBegin(), o2.getBegin()); - return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) - : c; - } - }); + @Override + public int compare(SequenceFeature o1, SequenceFeature o2) + { + int c = Integer.compare(o1.getBegin(), o2.getBegin()); + return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) + : c; + } + }); + } return count; } diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index cb664df..4e8f070 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -35,12 +35,11 @@ import jalview.ws.SequenceFetcherFactory; import jalview.ws.seqfetcher.ASequenceFetcher; import java.util.ArrayList; -import java.util.Arrays; +import java.util.Iterator; import java.util.List; /** - * Functions for cross-referencing sequence databases. user must first specify - * if cross-referencing from protein or dna (set dna==true) + * Functions for cross-referencing sequence databases. * * @author JimP * @@ -48,28 +47,40 @@ import java.util.List; public class CrossRef { /* - * A sub-class that ignores Parent attribute when comparing sequence - * features. This avoids 'duplicate' CDS features that only - * differ in their parent Transcript ids. + * the dataset of the alignment for which we are searching for + * cross-references; in some cases we may resolve xrefs by + * searching in the dataset */ - class MySequenceFeature extends SequenceFeature - { - private SequenceFeature feat; + private AlignmentI dataset; - MySequenceFeature(SequenceFeature sf) - { - this.feat = sf; - } + /* + * true if we are searching for cross-references from nucleotide, + * i.e. for protein sequences, false if the reverse + */ + private boolean fromDna; - @Override - public boolean equals(Object o) - { - return feat.equals(o, true); - } - } + /* + * the sequences for which we are seeking cross-references + */ + private SequenceI[] fromSeqs; /** + * Constructor + * + * @param seqs + * the sequences for which we are seeking cross-references + * @param ds + * the containing alignment dataset (may be searched to resolve + * cross-references) + */ + public CrossRef(SequenceI[] seqs, AlignmentI ds) + { + fromSeqs = seqs; + fromDna = ds.isNucleotide(); + dataset = ds.getDataset() == null ? ds : ds.getDataset(); + } + /** * Returns a list of distinct database sources for which sequences have either *
    *
  • a (dna-to-protein or protein-to-dna) cross-reference
  • @@ -77,24 +88,16 @@ public class CrossRef * reference from another sequence in the dataset which has a cross-reference * to a direct DBRefEntry on the given sequence *
- * - * @param dna - * true if seqs are nucleotide - * @param seqs - * sequences whose xrefs we are seeking - * @param dataset - * an alignment to search for indirect references * @return */ - public static List findXrefSourcesForSequences(boolean dna, - SequenceI[] seqs, AlignmentI dataset) + public List findXrefSourcesForSequences() { List sources = new ArrayList(); - for (SequenceI seq : seqs) + for (SequenceI seq : fromSeqs) { if (seq != null) { - findXrefSourcesForSequence(seq, dna, dataset, sources); + findXrefSourcesForSequence(seq, sources); } } return sources; @@ -111,41 +114,37 @@ public class CrossRef * * @param seq * the sequence whose dbrefs we are searching against - * @param dna - * true if the sequence is nucleotide - * @param dataset - * an alignment to search for indirect references * @param sources * a list of sources to add matches to */ - static void findXrefSourcesForSequence(SequenceI seq, boolean dna, - AlignmentI dataset, List sources) + void findXrefSourcesForSequence(SequenceI seq, List sources) { /* * first find seq's xrefs (dna-to-peptide or peptide-to-dna) */ - DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!dna, seq.getDBRefs()); + DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!fromDna, seq.getDBRefs()); addXrefsToSources(rfs, sources); if (dataset != null) { /* * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs */ - DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs()); + DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna, seq.getDBRefs()); List rseqs = new ArrayList(); /* * find sequences in the alignment which xref one of these DBRefs * i.e. is xref-ed to a common sequence identifier */ - CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs, null); + searchDatasetXrefs(seq, lrfs, rseqs, null); /* * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources */ for (SequenceI rs : rseqs) { - DBRefEntry[] xrs = DBRefUtils.selectDbRefs(!dna, rs.getDBRefs()); + DBRefEntry[] xrs = DBRefUtils + .selectDbRefs(!fromDna, rs.getDBRefs()); addXrefsToSources(xrs, sources); } } @@ -158,7 +157,7 @@ public class CrossRef * @param xrefs * @param sources */ - static void addXrefsToSources(DBRefEntry[] xrefs, List sources) + void addXrefsToSources(DBRefEntry[] xrefs, List sources) { if (xrefs != null) { @@ -185,13 +184,14 @@ public class CrossRef * add to) * @return products (as dataset sequences) */ - public static Alignment findXrefSequences(SequenceI[] seqs, - final boolean dna, final String source, AlignmentI al) + public Alignment findXrefSequences(String source) { - AlignmentI dataset = al.getDataset() == null ? al : al.getDataset(); List rseqs = new ArrayList(); AlignedCodonFrame cf = new AlignedCodonFrame(); - for (SequenceI seq : seqs) + SequenceIdMatcher matcher = new SequenceIdMatcher( + dataset.getSequences()); + + for (SequenceI seq : fromSeqs) { SequenceI dss = seq; while (dss.getDatasetSequence() != null) @@ -199,42 +199,67 @@ public class CrossRef dss = dss.getDatasetSequence(); } boolean found = false; - DBRefEntry[] xrfs = DBRefUtils.selectDbRefs(!dna, dss.getDBRefs()); + DBRefEntry[] xrfs = DBRefUtils + .selectDbRefs(!fromDna, dss.getDBRefs()); if ((xrfs == null || xrfs.length == 0) && dataset != null) { /* * found no suitable dbrefs on sequence - look for sequences in the * alignment which share a dbref with this one */ - DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs()); + DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna, + seq.getDBRefs()); /* * find sequences (except this one!), of complementary type, * which have a dbref to an accession id for this sequence, * and add them to the results */ - found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset, - rseqs, cf); + found = searchDatasetXrefs(dss, lrfs, rseqs, cf); } - for (int r = 0; xrfs != null && r < xrfs.length; r++) + if (xrfs == null && !found) { - DBRefEntry xref = xrfs[r]; - if (source != null && !source.equals(xref.getSource())) - { - continue; - } + /* + * no dbref to source on this sequence or matched + * complementary sequence in the dataset + */ + continue; + } + List sourceRefs = DBRefUtils.searchRefsForSource(xrfs, + source); + Iterator refIterator = sourceRefs.iterator(); + while (refIterator.hasNext()) + { + DBRefEntry xref = refIterator.next(); + found = false; if (xref.hasMap()) { - if (xref.getMap().getTo() != null) + SequenceI mappedTo = xref.getMap().getTo(); + if (mappedTo != null) { + /* + * dbref contains the sequence it maps to; add it to the + * results unless we have done so already (could happen if + * fetching xrefs for sequences which have xrefs in common) + * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707} + */ found = true; - SequenceI rsq = new Sequence(xref.getMap().getTo()); + SequenceI matchInDataset = findInDataset(mappedTo);// matcher.findIdMatch(mappedTo); + if (matchInDataset != null) + { + if (!rseqs.contains(matchInDataset)) + { + rseqs.add(matchInDataset); + } + continue; + } + SequenceI rsq = new Sequence(mappedTo); rseqs.add(rsq); - if (xref.getMap().getMap().getFromRatio() != xref - .getMap().getMap().getToRatio()) + if (xref.getMap().getMap().getFromRatio() != xref.getMap() + .getMap().getToRatio()) { // get sense of map correct for adding to product alignment. - if (dna) + if (fromDna) { // map is from dna seq to a protein product cf.addMap(dss, rsq, xref.getMap().getMap()); @@ -247,182 +272,162 @@ public class CrossRef } } } + if (!found) { - // do a bit more work - search for sequences with references matching - // xrefs on this sequence. - if (dataset != null) + SequenceI matchedSeq = matcher.findIdMatch(xref.getSource() + "|" + + xref.getAccessionId()); + if (matchedSeq != null) { - found = searchDataset(dss, xref, dataset, rseqs, cf, false,/*true?*/ - !dna); - if (found) + if (constructMapping(seq, matchedSeq, xref, cf)) { - xrfs[r] = null; // we've recovered seqs for this one. + found = true; } } } + + if (!found) + { + // do a bit more work - search for sequences with references matching + // xrefs on this sequence. + found = searchDataset(dss, xref, rseqs, cf, false); + } + if (found) + { + refIterator.remove(); + } } - if (!found) + + /* + * fetch from source database any dbrefs we haven't resolved up to here + */ + if (!sourceRefs.isEmpty()) { - if (xrfs != null && xrfs.length > 0) + ASequenceFetcher sftch = SequenceFetcherFactory + .getSequenceFetcher(); + SequenceI[] retrieved = null; + try { - ASequenceFetcher sftch = SequenceFetcherFactory - .getSequenceFetcher(); - SequenceI[] retrieved = null; - int l = xrfs.length; - for (int r = 0; r < xrfs.length; r++) - { - // filter out any irrelevant or irretrievable references - if (xrfs[r] == null - || ((source != null && !source.equals(xrfs[r] - .getSource())) || !sftch.isFetchable(xrfs[r] - .getSource()))) - { - l--; - xrfs[r] = null; - } - } - if (l > 0) - { - // System.out - // .println("Attempting to retrieve cross referenced sequences."); - DBRefEntry[] t = new DBRefEntry[l]; - l = 0; - for (int r = 0; r < xrfs.length; r++) - { - if (xrfs[r] != null) - { - t[l++] = xrfs[r]; - } - } - xrfs = t; - try - { - retrieved = sftch.getSequences(Arrays.asList(xrfs), !dna); - // problem here is we don't know which of xrfs resulted in which - // retrieved element - } catch (Exception e) - { - System.err - .println("Problem whilst retrieving cross references for Sequence : " - + seq.getName()); - e.printStackTrace(); - } + retrieved = sftch.getSequences(sourceRefs, !fromDna); + } catch (Exception e) + { + System.err + .println("Problem whilst retrieving cross references for Sequence : " + + seq.getName()); + e.printStackTrace(); + } - if (retrieved != null) + if (retrieved != null) + { + updateDbrefMappings(seq, xrfs, retrieved, cf); + for (SequenceI retrievedSequence : retrieved) + { + SequenceI retrievedDss = retrievedSequence.getDatasetSequence() == null ? retrievedSequence + : retrievedSequence.getDatasetSequence(); + DBRefEntry[] dbr = retrievedSequence.getDBRefs(); + if (dbr != null) { - updateDbrefMappings(dna, seq, xrfs, retrieved, cf); - - SequenceIdMatcher matcher = new SequenceIdMatcher( - dataset.getSequences()); - List copiedFeatures = new ArrayList(); - CrossRef me = new CrossRef(); - for (int rs = 0; rs < retrieved.length; rs++) + for (DBRefEntry dbref : dbr) { - // TODO: examine each sequence for 'redundancy' - DBRefEntry[] dbr = retrieved[rs].getDBRefs(); - if (dbr != null && dbr.length > 0) + // find any entry where we should put in the sequence being + // cross-referenced into the map + Mapping map = dbref.getMap(); + if (map != null) { - for (int di = 0; di < dbr.length; di++) + if (map.getTo() != null && map.getMap() != null) { - // find any entry where we should put in the sequence being - // cross-referenced into the map - Mapping map = dbr[di].getMap(); - if (map != null) + // TODO findInDataset requires exact sequence match but + // 'congruent' test only for the mapped part + SequenceI matched = findInDataset(map.getTo());// matcher.findIdMatch(map.getTo()); + if (matched != null) { - if (map.getTo() != null && map.getMap() != null) + /* + * already got an xref to this sequence; update this + * map to point to the same sequence, and add + * any new dbrefs to it + */ + DBRefEntry[] toRefs = map.getTo().getDBRefs(); + if (toRefs != null) { - SequenceI matched = matcher - .findIdMatch(map.getTo()); - if (matched != null) - { - /* - * already got an xref to this sequence; update this - * map to point to the same sequence, and add - * any new dbrefs to it - */ - for (DBRefEntry ref : map.getTo().getDBRefs()) - { - matched.addDBRef(ref); // add or update mapping - } - map.setTo(matched); - } - else + for (DBRefEntry ref : toRefs) { - matcher.add(map.getTo()); + matched.addDBRef(ref); // add or update mapping } - try + } + map.setTo(matched); + } + else + { + matcher.add(map.getTo()); + } + try + { + // compare ms with dss and replace with dss in mapping + // if map is congruent + SequenceI ms = map.getTo(); + int sf = map.getMap().getToLowest(); + int st = map.getMap().getToHighest(); + SequenceI mappedrg = ms.getSubSequence(sf, st); + // SequenceI loc = dss.getSubSequence(sf, st); + if (mappedrg.getLength() > 0 + && ms.getSequenceAsString().equals( + dss.getSequenceAsString())) + // && mappedrg.getSequenceAsString().equals( + // loc.getSequenceAsString())) + { + String msg = "Mapping updated from " + ms.getName() + + " to retrieved crossreference " + + dss.getName(); + System.out.println(msg); + // method to update all refs of existing To on + // retrieved sequence with dss and merge any props + // on To onto dss. + // TODO don't we have to change the mapped to ranges + // if not to the whole sequence? + map.setTo(dss); + /* + * copy sequence features as well, avoiding + * duplication (e.g. same variation from 2 + * transcripts) + */ + SequenceFeature[] sfs = ms.getSequenceFeatures(); + if (sfs != null) { - // compare ms with dss and replace with dss in mapping - // if map is congruent - SequenceI ms = map.getTo(); - int sf = map.getMap().getToLowest(); - int st = map.getMap().getToHighest(); - SequenceI mappedrg = ms.getSubSequence(sf, st); - // SequenceI loc = dss.getSubSequence(sf, st); - if (mappedrg.getLength() > 0 - && ms.getSequenceAsString().equals( - dss.getSequenceAsString())) - // && mappedrg.getSequenceAsString().equals( - // loc.getSequenceAsString())) + for (SequenceFeature feat : sfs) { - String msg = "Mapping updated from " - + ms.getName() - + " to retrieved crossreference " - + dss.getName(); - System.out.println(msg); - // method to update all refs of existing To on - // retrieved sequence with dss and merge any props - // on To onto dss. - map.setTo(dss); /* - * copy sequence features as well, avoiding - * duplication (e.g. same variation from 2 - * transcripts) + * make a flyweight feature object which ignores Parent + * attribute in equality test, to avoid creating many + * otherwise duplicate exon features on genomic sequence */ - SequenceFeature[] sfs = ms - .getSequenceFeatures(); - if (sfs != null) + SequenceFeature newFeature = new SequenceFeature( + feat) { - for (SequenceFeature feat : sfs) + @Override + public boolean equals(Object o) { - /* - * we override SequenceFeature.equals here (but - * not elsewhere) to ignore Parent attribute - * TODO not quite working yet! - */ - if (!copiedFeatures - .contains(me.new MySequenceFeature( - feat))) - { - dss.addSequenceFeature(feat); - copiedFeatures.add(feat); - } + return super.equals(o, true); } - } - cf.addMap(retrieved[rs].getDatasetSequence(), - dss, map.getMap()); - } - // TODO remove this 'else' and the cf.addMap above? - else - { - cf.addMap(retrieved[rs].getDatasetSequence(), - map.getTo(), map.getMap()); + }; + dss.addSequenceFeature(newFeature); } - } catch (Exception e) - { - System.err - .println("Exception when consolidating Mapped sequence set..."); - e.printStackTrace(System.err); } } + cf.addMap(retrievedDss, map.getTo(), map.getMap()); + } catch (Exception e) + { + System.err + .println("Exception when consolidating Mapped sequence set..."); + e.printStackTrace(System.err); } } } - retrieved[rs].updatePDBIds(); - rseqs.add(retrieved[rs]); } } + retrievedSequence.updatePDBIds(); + rseqs.add(retrievedSequence); + dataset.addSequence(retrievedDss); + matcher.add(retrievedSequence); } } } @@ -441,17 +446,81 @@ public class CrossRef } /** + * Returns the first identical sequence in the dataset if any, else null + * + * @param mappedTo + * @return + */ + SequenceI findInDataset(SequenceI mappedTo) + { + if (mappedTo == null) + { + return null; + } + SequenceI dss = mappedTo.getDatasetSequence() == null ? mappedTo + : mappedTo.getDatasetSequence(); + for (SequenceI seq : dataset.getSequences()) + { + if (sameSequence(seq, dss)) + { + return seq; + } + } + return null; + } + + /** + * Answers true if seq1 and seq2 contain exactly the same characters (ignoring + * case), else false. This method compares the lengths, then each character in + * turn, in order to 'fail fast'. For case-sensitive comparison, it would be + * possible to use Arrays.equals(seq1.getSequence(), seq2.getSequence()). + * + * @param seq1 + * @param seq2 + * @return + */ + // TODO move to Sequence / SequenceI + static boolean sameSequence(SequenceI seq1, SequenceI seq2) + { + if (seq1 == seq2) + { + return true; + } + if (seq1 == null || seq2 == null) + { + return false; + } + char[] c1 = seq1.getSequence(); + char[] c2 = seq2.getSequence(); + if (c1.length != c2.length) + { + return false; + } + for (int i = 0; i < c1.length; i++) + { + int diff = c1[i] - c2[i]; + /* + * same char or differ in case only ('a'-'A' == 32) + */ + if (diff != 0 && diff != 32 && diff != -32) + { + return false; + } + } + return true; + } + + /** * Updates any empty mappings in the cross-references with one to a compatible * retrieved sequence if found, and adds any new mappings to the * AlignedCodonFrame * - * @param dna * @param mapFrom * @param xrefs * @param retrieved * @param acf */ - static void updateDbrefMappings(boolean dna, SequenceI mapFrom, + void updateDbrefMappings(SequenceI mapFrom, DBRefEntry[] xrefs, SequenceI[] retrieved, AlignedCodonFrame acf) { SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved); @@ -468,55 +537,69 @@ public class CrossRef } for (SequenceI seq : matches) { - MapList mapping = null; - if (dna) - { - mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom); - } - else - { - mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq); - if (mapping != null) - { - mapping = mapping.getInverse(); - } - } - if (mapping != null) - { - xref.setMap(new Mapping(seq, mapping)); - if (dna) - { - AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping); - } - if (dna) - { - acf.addMap(mapFrom, seq, mapping); - } - else - { - acf.addMap(seq, mapFrom, mapping.getInverse()); - } - continue; - } + constructMapping(mapFrom, seq, xref, acf); } } } } /** + * Tries to make a mapping from dna to protein. If successful, adds the + * mapping to the dbref and the mappings collection and answers true, + * otherwise answers false. + * + * @param mapFrom + * @param mapTo + * @param xref + * @param mappings + * @return + */ + boolean constructMapping(SequenceI mapFrom, SequenceI mapTo, + DBRefEntry xref, AlignedCodonFrame mappings) + { + MapList mapping = null; + if (fromDna) + { + mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom); + } + else + { + mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, mapTo); + if (mapping != null) + { + mapping = mapping.getInverse(); + } + } + if (mapping == null) + { + return false; + } + xref.setMap(new Mapping(mapTo, mapping)); + if (fromDna) + { + AlignmentUtils.computeProteinFeatures(mapFrom, mapTo, mapping); + mappings.addMap(mapFrom, mapTo, mapping); + } + else + { + mappings.addMap(mapTo, mapFrom, mapping.getInverse()); + } + + return true; + } + + /** * find references to lrfs in the cross-reference set of each sequence in * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry * based on source and accession string only - Map and Version are nulled. * * @param sequenceI * @param lrfs - * @param dataset * @param rseqs * @return true if matches were found. */ - private static boolean searchDatasetXrefs(SequenceI sequenceI, - boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, - List rseqs, AlignedCodonFrame cf) + private boolean searchDatasetXrefs(SequenceI sequenceI, + DBRefEntry[] lrfs, List rseqs, AlignedCodonFrame cf) { boolean found = false; if (lrfs == null) @@ -529,8 +612,7 @@ public class CrossRef // add in wildcards xref.setVersion(null); xref.setMap(null); - found |= searchDataset(sequenceI, xref, dataset, rseqs, cf, false, - dna); + found |= searchDataset(sequenceI, xref, rseqs, cf, false); } return found; } @@ -543,21 +625,16 @@ public class CrossRef * a sequence to ignore (start point of search) * @param xrf * a cross-reference to try to match - * @param dataset - * sequences to search in * @param rseqs * result list to add to * @param cf * a set of sequence mappings to add to * @param direct * - search all references or only subset - * @param dna - * search dna or protein xrefs (if direct=false) * @return true if relationship found and sequence added. */ - public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf, - AlignmentI dataset, List rseqs, AlignedCodonFrame cf, - boolean direct, boolean dna) + boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf, + List rseqs, AlignedCodonFrame cf, boolean direct) { boolean found = false; if (dataset == null) @@ -585,16 +662,14 @@ public class CrossRef { continue; } - // check if this is the correct sequence type + /* + * only look at same molecule type if 'direct', or + * complementary type if !direct + */ { - // TODO 'direct' is always set to false - remove? - // or should it be 'true' from findXrefSequences? - // also its Javadoc conflicts with its use: - // test below implies 'direct' means find complementary sequences, - // !direct means select same molecule type boolean isDna = Comparison .isNucleotide(new SequenceI[] { nxt }); - if ((direct && isDna == dna) || (!direct && isDna != dna)) + if (direct ? (isDna != fromDna) : (isDna == fromDna)) { // skip this sequence because it is wrong molecule type continue; @@ -616,7 +691,7 @@ public class CrossRef } else { - poss = DBRefUtils.selectDbRefs(!dna, poss); + poss = DBRefUtils.selectDbRefs(!fromDna, poss); cands = DBRefUtils.searchRefs(poss, xrf); } if (!cands.isEmpty()) @@ -639,7 +714,7 @@ public class CrossRef { // get sense of map correct for adding to product // alignment. - if (dna) + if (fromDna) { // map is from dna seq to a protein product cf.addMap(sequenceI, nxt, map); diff --git a/src/jalview/analysis/CrossRefs.java b/src/jalview/analysis/CrossRefs.java index 0f3f425..691e972 100644 --- a/src/jalview/analysis/CrossRefs.java +++ b/src/jalview/analysis/CrossRefs.java @@ -1,5 +1,6 @@ package jalview.analysis; +import jalview.analysis.CrossRef.MySequenceFeature; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; @@ -20,6 +21,27 @@ import java.util.List; public class CrossRefs { + /* + * A sub-class that ignores Parent attribute when comparing sequence + * features. This avoids 'duplicate' CDS features that only + * differ in their parent Transcript ids. + */ + class MySequenceFeature extends SequenceFeature + { + private SequenceFeature feat; + + MySequenceFeature(SequenceFeature sf) + { + this.feat = sf; + } + + @Override + public boolean equals(Object o) + { + return feat.equals(o, true); + } + } + /** * Finds cross-references for sequences from a specified source database. * These may be found in four ways: @@ -46,44 +68,71 @@ public class CrossRefs public static AlignmentI findXrefSequences(SequenceI[] seqs, boolean dna, String source, AlignmentI dataset) { - List foundSeqs = new ArrayList(); - AlignedCodonFrame mappings = new AlignedCodonFrame(); - - List sourceRefs = new ArrayList(); - + /* + * filter to only those sequences of the right type (nucleotide/protein) + */ + List fromSeqs = new ArrayList(); for (SequenceI seq : seqs) { - if (dna != Comparison.isNucleotide(seq)) + if (dna == Comparison.isNucleotide(seq)) { - /* - * mixed alignment, and this sequence is of the wrong type - */ - continue; + fromSeqs.add(seq); } + } + return findXrefSequences(fromSeqs, dna, source, dataset); + } + + /** + * Finds cross-references for sequences from a specified source database. + * These may be found in four ways: + *
    + *
  • as a DBRefEntry on the known sequence, which has a mapped-to sequence
  • + *
  • a sequence of complementary type in the alignment dataset, which has a + * DBRefEntry to one of the known sequence's 'direct' DBRefs
  • + *
  • a sequence of complementary type in the alignment, which has a + * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs
  • + *
  • by fetching the accession from the remote database
  • + *
+ * + * @param seqs + * the sequences whose cross-references we are searching for, + * filtered to only those which are of the type denoted by 'dna' + * @param dna + * true if the sequences are from a nucleotide alignment, else false + * @param source + * the database source we want cross-references to + * @param dataset + * the alignment dataset the sequences belong to + * @return an alignment containing cross-reference sequences, or null if none + * found + */ + static AlignmentI findXrefSequences(List fromSeqs, + boolean dna, String source, AlignmentI dataset) + { + List foundSeqs = new ArrayList(); + AlignedCodonFrame mappings = new AlignedCodonFrame(); - /* - * get this sequence's dbrefs to source database (if any) - */ - List seqSourceRefs = DBRefUtils.searchRefsForSource( - seq.getDBRefs(), source); + List unresolvedRefs = new ArrayList(); - /* - * first extract any mapped sequences from sourceRefs - */ - findMappedDbrefs(seq, seqSourceRefs, foundSeqs, mappings); + /* + * first extract any mapped sequences from sourceRefs + * if successful, sequence is removed from fromSeqs + * if unsuccessful, dbrefs are added to unresolvedRefs + */ + findMappedDbrefs(fromSeqs, source, foundSeqs, + unresolvedRefs, mappings); - /* - * for remaining sourceRefs, try to match a - * complementary sequence in the dataset - */ - findIndirectCrossReferences(seq, source, seqSourceRefs, dataset, - foundSeqs, mappings); - } + /* + * then search the alignment dataset for dbref resolutions + */ + findIndirectCrossReferences(fromSeqs, source, dataset, foundSeqs, + unresolvedRefs, mappings); /* * fetch any remaining sourceRefs from the source database */ - fetchCrossReferences(sourceRefs, foundSeqs, mappings, dna, dataset); + fetchCrossReferences(fromSeqs, unresolvedRefs, foundSeqs, mappings, + dna, dataset); if (foundSeqs.isEmpty()) { @@ -98,52 +147,90 @@ public class CrossRefs /** * Looks for DBRefEntrys to 'source' which have a mapping to a sequence. If * found, adds the sequence to foundSeqs and removes the dbref from the list. + * DBRefs with no mapping are added to the 'unresolvedRefs' list (setting + * version number to 0 i.e. use source and accession only). * - * @param seq - * the dataset sequence we are searching from - * @param sourceRefs - * the sequence's dbrefs to 'source' + * @param fromSeqs + * the dataset sequences we are searching from + * @param source + * the database source we are searching dbrefs for * @param foundSeqs - * a list of cross-references to add to + * a list of found sequences to add to + * @param unresolvedRefs + * a list of unresolved cross-references to add to * @param mappings * a set of sequence mappings to add to * @return */ - static void findMappedDbrefs(SequenceI seq, List sourceRefs, - List foundSeqs, AlignedCodonFrame mappings) + static void findMappedDbrefs(List fromSeqs, String source, + List foundSeqs, List unresolvedRefs, + AlignedCodonFrame mappings) { - Iterator refs = sourceRefs.iterator(); - while (refs.hasNext()) + Iterator it = fromSeqs.iterator(); + while (it.hasNext()) { - DBRefEntry dbref = refs.next(); - Mapping map = dbref.getMap(); - if (map != null) + SequenceI seq = it.next(); + SequenceI dss = seq.getDatasetSequence(); + dss = dss == null ? seq : dss; + + DBRefEntry[] dbRefs = seq.getDBRefs(); + if (dbRefs == null) + { + continue; + } + boolean resolved = false; + for (DBRefEntry dbref : dbRefs) { - SequenceI mappedTo = map.getTo(); - if (mappedTo != null) + if (!source.equals(dbref.getSource())) { - foundSeqs.add(new Sequence(mappedTo)); - refs.remove(); - - /* - * check mapping is not 'direct' (it shouldn't be if we reach here) - * and add mapping (dna-to-peptide or vice versa) to the set - */ - MapList mapList = map.getMap(); - int fromRatio = mapList.getFromRatio(); - int toRatio = mapList.getToRatio(); - if (fromRatio != toRatio) + continue; + } + DBRefEntry todo = new DBRefEntry(dbref.getSource(), "0", + dbref.getAccessionId()); + Mapping map = dbref.getMap(); + if (map != null) + { + unresolvedRefs.remove(todo); + resolved = true; + SequenceI mappedTo = map.getTo(); + if (mappedTo != null) { - if (fromRatio == 3) - { - mappings.addMap(seq, mappedTo, mapList); - } - else + foundSeqs.add(new Sequence(mappedTo)); + + /* + * check mapping is not 'direct' (it shouldn't be if we reach here) + * and add mapping (dna-to-peptide or vice versa) to the set + */ + MapList mapList = map.getMap(); + int fromRatio = mapList.getFromRatio(); + int toRatio = mapList.getToRatio(); + if (fromRatio != toRatio) { - mappings.addMap(mappedTo, seq, mapList.getInverse()); + if (fromRatio == 3) + { + mappings.addMap(dss, mappedTo, mapList); + } + else + { + mappings.addMap(mappedTo, dss, mapList.getInverse()); + } } } } + else + { + /* + * no mapping to resolve dbref - add source+accession to list to resolve + */ + if (!unresolvedRefs.contains(todo)) + { + unresolvedRefs.add(todo); + } + } + } + if (resolved) + { + it.remove(); } } } @@ -153,13 +240,13 @@ public class CrossRefs * to the foundSeqs list. If found, tries to make a mapping between seq and * the retrieved sequence and insert it into the database reference. * - * @param seq + * @param fromSeqs * @param sourceRefs * @param foundSeqs * @param mappings * @param dna */ - static void fetchCrossReferences(SequenceI seq, + static void fetchCrossReferences(List fromSeqs, List sourceRefs, List foundSeqs, AlignedCodonFrame mappings, boolean dna, AlignmentI dataset) { @@ -170,116 +257,116 @@ public class CrossRefs retrieved = sftch.getSequences(sourceRefs, !dna); } catch (Exception e) { - System.err - .println("Problem whilst retrieving cross references for Sequence : " - + seq.getName()); + System.err.println("Problem whilst retrieving cross references: " + + e.getMessage()); e.printStackTrace(); return; } - if (retrieved != null) + if (retrieved == null) { - updateDbrefMappings(dna, seq, sourceRefs, retrieved, mappings); + return; + } + updateDbrefMappings(dna, fromSeqs, sourceRefs, retrieved, mappings); - SequenceIdMatcher matcher = new SequenceIdMatcher( - dataset.getSequences()); - List copiedFeatures = new ArrayList(); - CrossRef me = new CrossRef(); - for (int rs = 0; rs < retrieved.length; rs++) + SequenceIdMatcher matcher = new SequenceIdMatcher( + dataset.getSequences()); + List copiedFeatures = new ArrayList(); + CrossRefs me = new CrossRefs(); + for (int rs = 0; rs < retrieved.length; rs++) + { + // TODO: examine each sequence for 'redundancy' + DBRefEntry[] dbr = retrieved[rs].getDBRefs(); + if (dbr != null && dbr.length > 0) { - // TODO: examine each sequence for 'redundancy' - DBRefEntry[] dbr = retrieved[rs].getDBRefs(); - if (dbr != null && dbr.length > 0) + for (int di = 0; di < dbr.length; di++) { - for (int di = 0; di < dbr.length; di++) + // find any entry where we should put in the sequence being + // cross-referenced into the map + Mapping map = dbr[di].getMap(); + if (map != null) { - // find any entry where we should put in the sequence being - // cross-referenced into the map - Mapping map = dbr[di].getMap(); - if (map != null) + if (map.getTo() != null && map.getMap() != null) { - if (map.getTo() != null && map.getMap() != null) + SequenceI matched = matcher.findIdMatch(map.getTo()); + if (matched != null) { - SequenceI matched = matcher.findIdMatch(map.getTo()); - if (matched != null) - { - /* - * already got an xref to this sequence; update this - * map to point to the same sequence, and add - * any new dbrefs to it - */ - for (DBRefEntry ref : map.getTo().getDBRefs()) - { - matched.addDBRef(ref); // add or update mapping - } - map.setTo(matched); - } - else + /* + * already got an xref to this sequence; update this + * map to point to the same sequence, and add + * any new dbrefs to it + */ + for (DBRefEntry ref : map.getTo().getDBRefs()) { - matcher.add(map.getTo()); + matched.addDBRef(ref); // add or update mapping } - try + map.setTo(matched); + } + else + { + matcher.add(map.getTo()); + } + try + { + // compare ms with dss and replace with dss in mapping + // if map is congruent + SequenceI ms = map.getTo(); + int sf = map.getMap().getToLowest(); + int st = map.getMap().getToHighest(); + SequenceI mappedrg = ms.getSubSequence(sf, st); + // SequenceI loc = dss.getSubSequence(sf, st); + if (mappedrg.getLength() > 0 + && ms.getSequenceAsString().equals( + fromSeqs.getSequenceAsString())) + // && mappedrg.getSequenceAsString().equals( + // loc.getSequenceAsString())) { - // compare ms with dss and replace with dss in mapping - // if map is congruent - SequenceI ms = map.getTo(); - int sf = map.getMap().getToLowest(); - int st = map.getMap().getToHighest(); - SequenceI mappedrg = ms.getSubSequence(sf, st); - // SequenceI loc = dss.getSubSequence(sf, st); - if (mappedrg.getLength() > 0 - && ms.getSequenceAsString().equals( - seq.getSequenceAsString())) - // && mappedrg.getSequenceAsString().equals( - // loc.getSequenceAsString())) + String msg = "Mapping updated from " + ms.getName() + + " to retrieved crossreference " + + fromSeqs.getName(); + System.out.println(msg); + // method to update all refs of existing To on + // retrieved sequence with dss and merge any props + // on To onto dss. + map.setTo(fromSeqs); + /* + * copy sequence features as well, avoiding + * duplication (e.g. same variation from 2 + * transcripts) + */ + SequenceFeature[] sfs = ms.getSequenceFeatures(); + if (sfs != null) { - String msg = "Mapping updated from " + ms.getName() - + " to retrieved crossreference " - + seq.getName(); - System.out.println(msg); - // method to update all refs of existing To on - // retrieved sequence with dss and merge any props - // on To onto dss. - map.setTo(seq); - /* - * copy sequence features as well, avoiding - * duplication (e.g. same variation from 2 - * transcripts) - */ - SequenceFeature[] sfs = ms.getSequenceFeatures(); - if (sfs != null) + for (SequenceFeature feat : sfs) { - for (SequenceFeature feat : sfs) + /* + * we override SequenceFeature.equals here (but + * not elsewhere) to ignore Parent attribute + * TODO not quite working yet! + */ + if (!copiedFeatures + .contains(me.new MySequenceFeature(feat))) { - /* - * we override SequenceFeature.equals here (but - * not elsewhere) to ignore Parent attribute - * TODO not quite working yet! - */ - if (!copiedFeatures - .contains(me.new MySequenceFeature(feat))) - { - seq.addSequenceFeature(feat); - copiedFeatures.add(feat); - } + fromSeqs.addSequenceFeature(feat); + copiedFeatures.add(feat); } } } - mappings.addMap(retrieved[rs].getDatasetSequence(), - map.getTo(), map.getMap()); - } catch (Exception e) - { - System.err - .println("Exception when consolidating Mapped sequence set..."); - e.printStackTrace(System.err); } + mappings.addMap(retrieved[rs].getDatasetSequence(), + map.getTo(), map.getMap()); + } catch (Exception e) + { + System.err + .println("Exception when consolidating Mapped sequence set..."); + e.printStackTrace(System.err); } } } } - retrieved[rs].updatePDBIds(); - foundSeqs.add(retrieved[rs]); } + retrieved[rs].updatePDBIds(); + foundSeqs.add(retrieved[rs]); } } @@ -288,24 +375,27 @@ public class CrossRefs * shares a DBRefEntry with it. If found, adds the sequence to foundSeqs and * removes the resolved sourceRef from the search list. * - * @param seq + * @param fromSeqs * @param source - * @param sourceRefs - * @param dataset + * @param unresolvedRefs * @param foundSeqs + * @param unresolvedRefs * @param mappings * @return */ - static void findIndirectCrossReferences(SequenceI seq, String source, - List sourceRefs, AlignmentI dataset, - List foundSeqs, AlignedCodonFrame mappings) + static void findIndirectCrossReferences(List fromSeqs, + String source, AlignmentI dataset, + List foundSeqs, List unresolvedRefs, + AlignedCodonFrame mappings) { - Iterator refs = sourceRefs.iterator(); + Iterator refs = unresolvedRefs.iterator(); while (refs.hasNext()) { DBRefEntry dbref = refs.next(); - boolean found = searchDatasetForCrossReference(seq, dbref, dataset, - foundSeqs, mappings); + boolean found = false; + // boolean found = searchDatasetForCrossReference(fromSeqs, dbref, + // foundSeqs, + // unresolvedRefs, mappings); if (found) { refs.remove(); @@ -427,12 +517,12 @@ public class CrossRefs * AlignedCodonFrame * * @param dna - * @param mapFrom + * @param fromSeqs * @param xrefs * @param retrieved * @param mappings */ - static void updateDbrefMappings(boolean dna, SequenceI mapFrom, + static void updateDbrefMappings(boolean dna, List fromSeqs, List xrefs, SequenceI[] retrieved, AlignedCodonFrame mappings) { @@ -453,11 +543,11 @@ public class CrossRefs MapList mapping = null; if (dna) { - mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom); + mapping = AlignmentUtils.mapCdnaToProtein(seq, fromSeqs); } else { - mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq); + mapping = AlignmentUtils.mapCdnaToProtein(fromSeqs, seq); if (mapping != null) { mapping = mapping.getInverse(); @@ -468,15 +558,15 @@ public class CrossRefs xref.setMap(new Mapping(seq, mapping)); if (dna) { - AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping); + AlignmentUtils.computeProteinFeatures(fromSeqs, seq, mapping); } if (dna) { - mappings.addMap(mapFrom, seq, mapping); + mappings.addMap(fromSeqs, seq, mapping); } else { - mappings.addMap(seq, mapFrom, mapping.getInverse()); + mappings.addMap(seq, fromSeqs, mapping.getInverse()); } continue; } diff --git a/src/jalview/analysis/SequenceIdMatcher.java b/src/jalview/analysis/SequenceIdMatcher.java index 3fd0581..c12de4e 100755 --- a/src/jalview/analysis/SequenceIdMatcher.java +++ b/src/jalview/analysis/SequenceIdMatcher.java @@ -290,7 +290,7 @@ public class SequenceIdMatcher { if (s != null) { - id = new String(s.toLowerCase()); + id = s.toLowerCase(); } else { @@ -357,5 +357,15 @@ public class SequenceIdMatcher .indexOf(s.charAt(id.length())) > -1)) : false; } } + + /** + * toString method returns the wrapped sequence id. For debugging purposes + * only, behaviour not guaranteed not to change. + */ + @Override + public String toString() + { + return id; + } } } diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index cfe87d9..a0e1234 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -212,6 +212,10 @@ public class EmblEntry { for (DBRefEntry dbref : feature.dbRefs) { + /* + * convert UniProtKB/Swiss-Prot to UNIPROT + */ + dbref.setSource(DBRefUtils.getCanonicalName(dbref.getSource())); dna.addDBRef(dbref); } } @@ -419,14 +423,13 @@ public class EmblEntry } /* - * add dbRefs to sequence, and mappings for Uniprot xrefs + * add mappings for Uniprot xrefs */ if (feature.dbRefs != null) { boolean mappingUsed = false; for (DBRefEntry ref : feature.dbRefs) { - ref.setSource(DBRefUtils.getCanonicalName(ref.getSource())); if (ref.getSource().equals(DBRefSource.UNIPROT)) { String proteinSeqName = DBRefSource.UNIPROT + "|" @@ -482,7 +485,6 @@ public class EmblEntry } } } - dna.addDBRef(ref); } if (noProteinDbref && product != null) { diff --git a/src/jalview/gui/AlignFrame.java b/src/jalview/gui/AlignFrame.java index 39e4bcd..751bf4d 100644 --- a/src/jalview/gui/AlignFrame.java +++ b/src/jalview/gui/AlignFrame.java @@ -23,7 +23,6 @@ package jalview.gui; import jalview.analysis.AlignmentSorter; import jalview.analysis.AlignmentUtils; import jalview.analysis.CrossRef; -import jalview.analysis.CrossRefs; import jalview.analysis.Dna; import jalview.analysis.ParseProperties; import jalview.analysis.SequenceIdMatcher; @@ -4634,22 +4633,25 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener, } /** - * Searches selected sequences for xRef products and builds the Show - * Cross-References menu (formerly called Show Products) + * Searches the alignment sequences for xRefs and builds the Show + * Cross-References menu (formerly called Show Products), with database + * sources for which cross-references are found (protein sources for a + * nucleotide alignment and vice versa) * - * @return true if Show Cross-references menu should be enabled. + * @return true if Show Cross-references menu should be enabled */ public boolean canShowProducts() { - SequenceI[] selection = viewport.getSequenceSelection(); + SequenceI[] seqs = viewport.getAlignment().getSequencesArray(); AlignmentI dataset = viewport.getAlignment().getDataset(); boolean showp = false; try { showProducts.removeAll(); final boolean dna = viewport.getAlignment().isNucleotide(); - List ptypes = (selection == null || selection.length == 0) ? null - : CrossRef.findXrefSourcesForSequences(dna, selection, dataset); + List ptypes = (seqs == null || seqs.length == 0) ? null + : new CrossRef(seqs, dataset) + .findXrefSourcesForSequences(); for (final String source : ptypes) { @@ -4706,8 +4708,8 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener, { AlignmentI alignment = AlignFrame.this.getViewport() .getAlignment(); - AlignmentI xrefs = CrossRefs.findXrefSequences(sel, dna, source, - alignment); + AlignmentI xrefs = new CrossRef(sel, alignment) + .findXrefSequences(source); if (xrefs != null) { /* diff --git a/src/jalview/util/MapList.java b/src/jalview/util/MapList.java index e51442c..ab811a9 100644 --- a/src/jalview/util/MapList.java +++ b/src/jalview/util/MapList.java @@ -215,7 +215,7 @@ public class MapList { /* * note lowest and highest values - bearing in mind the - * direction may be revesed + * direction may be reversed */ fromLowest = Math.min(fromLowest, Math.min(from[i], from[i + 1])); fromHighest = Math.max(fromHighest, Math.max(from[i], from[i + 1])); diff --git a/test/jalview/analysis/CrossRefTest.java b/test/jalview/analysis/CrossRefTest.java index 31f9728..b2720f2 100644 --- a/test/jalview/analysis/CrossRefTest.java +++ b/test/jalview/analysis/CrossRefTest.java @@ -22,6 +22,7 @@ package jalview.analysis; import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertFalse; +import static org.testng.AssertJUnit.assertNotNull; import static org.testng.AssertJUnit.assertNotSame; import static org.testng.AssertJUnit.assertNull; import static org.testng.AssertJUnit.assertSame; @@ -101,7 +102,8 @@ public class CrossRefTest /* * first with no dbrefs to search */ - CrossRef.findXrefSourcesForSequence(seq, false, al, sources); + sources = new CrossRef(new SequenceI[] { seq }, al) + .findXrefSourcesForSequences(); assertTrue(sources.isEmpty()); /* @@ -117,7 +119,8 @@ public class CrossRefTest seq.addDBRef(new DBRefEntry("GENEDB", "0", "E2348")); seq.addDBRef(new DBRefEntry("ENSEMBL", "0", "E2349")); seq.addDBRef(new DBRefEntry("ENSEMBLGENOMES", "0", "E2350")); - CrossRef.findXrefSourcesForSequence(seq, false, al, sources); + sources = new CrossRef(new SequenceI[] { seq }, al) + .findXrefSourcesForSequences(); assertEquals(4, sources.size()); assertEquals("[EMBL, EMBLCDS, GENEDB, ENSEMBL]", sources.toString()); @@ -136,7 +139,8 @@ public class CrossRefTest seq2.addDBRef(new DBRefEntry("GENEDB", "0", "E2348")); // TODO include ENSEMBLGENOMES in DBRefSource.DNACODINGDBS ? al.addSequence(seq2); - CrossRef.findXrefSourcesForSequence(seq, false, al, sources); + sources = new CrossRef(new SequenceI[] { seq }, al) + .findXrefSourcesForSequences(); assertEquals(3, sources.size()); assertEquals("[EMBLCDS, EMBL, GENEDB]", sources.toString()); } @@ -166,8 +170,8 @@ public class CrossRefTest * - but peptide with matching nucleotide dbref does, so is returned */ AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq }); - Alignment xrefs = CrossRef.findXrefSequences( - new SequenceI[] { emblSeq }, true, "UNIPROT", al); + Alignment xrefs = new CrossRef(new SequenceI[] { emblSeq }, al) + .findXrefSequences("UNIPROT"); assertEquals(1, xrefs.getHeight()); assertSame(uniprotSeq, xrefs.getSequenceAt(0)); } @@ -201,8 +205,8 @@ public class CrossRefTest * - but nucleotide with matching peptide dbref does, so is returned */ AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq }); - Alignment xrefs = CrossRef.findXrefSequences( - new SequenceI[] { uniprotSeq }, false, "EMBL", al); + Alignment xrefs = new CrossRef(new SequenceI[] { uniprotSeq }, + al).findXrefSequences("EMBL"); assertEquals(1, xrefs.getHeight()); assertSame(emblSeq, xrefs.getSequenceAt(0)); } @@ -228,8 +232,8 @@ public class CrossRefTest * equatable to it, so no results found */ AlignmentI al = new Alignment(new SequenceI[] { dna1, dna2 }); - Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna2 }, - true, "UNIPROT", al); + Alignment xrefs = new CrossRef(new SequenceI[] { dna2 }, al) + .findXrefSequences("UNIPROT"); assertNull(xrefs); } @@ -257,8 +261,8 @@ public class CrossRefTest * first search for a dbref nowhere on the alignment: */ DBRefEntry dbref = new DBRefEntry("UNIPROT", "0", "P30419"); - boolean found = CrossRef.searchDataset(dna1, dbref, al, result, null, - true, true); + CrossRef testee = new CrossRef(al.getSequencesArray(), al); + boolean found = testee.searchDataset(dna1, dbref, result, null, true); assertFalse(found); assertTrue(result.isEmpty()); @@ -269,8 +273,7 @@ public class CrossRefTest * search for a protein sequence with dbref UNIPROT:Q9ZTS2 */ dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2"); - found = CrossRef.searchDataset(dna1, dbref, al, result, null, true, - true); + found = testee.searchDataset(dna1, dbref, result, null, true); assertTrue(found); assertEquals(1, result.size()); assertSame(pep1, result.get(0)); @@ -280,8 +283,7 @@ public class CrossRefTest */ result.clear(); dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2"); - found = CrossRef.searchDataset(pep1, dbref, al, result, null, true, - false); + found = testee.searchDataset(pep1, dbref, result, null, false); assertTrue(found); assertEquals(1, result.size()); assertSame(dna1, result.get(0)); @@ -329,8 +331,8 @@ public class CrossRefTest * mapped sequences */ AlignmentI al = new Alignment(new SequenceI[] { dna1 }); - Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna1 }, - true, "UNIPROT", al); + Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al) + .findXrefSequences("UNIPROT"); assertEquals(2, xrefs.getHeight()); /* @@ -391,9 +393,12 @@ public class CrossRefTest final SequenceI pep1 = new Sequence("Q9ZTS2", "MYQLIRSSW"); final SequenceI pep2 = new Sequence("P00314", "MRKLLAASG"); - SequenceFetcher mockFetcher = new SequenceFetcher() + /* + * argument false suppresses adding DAS sources + * todo: define an interface type SequenceFetcherI and mock that + */ + SequenceFetcher mockFetcher = new SequenceFetcher(false) { - @Override public boolean isFetchable(String source) { @@ -412,8 +417,8 @@ public class CrossRefTest * find UNIPROT xrefs for nucleotide sequence */ AlignmentI al = new Alignment(new SequenceI[] { dna1 }); - Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna1 }, - true, "UNIPROT", al); + Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al) + .findXrefSequences("UNIPROT"); assertEquals(2, xrefs.getHeight()); assertSame(pep1, xrefs.getSequenceAt(0)); assertSame(pep2, xrefs.getSequenceAt(1)); @@ -425,4 +430,278 @@ public class CrossRefTest SequenceFetcherFactory.setSequenceFetcher(null); } + /** + * Test for finding 'product' sequences for the case where both gene and + * transcript sequences have dbrefs to Uniprot. + */ + @Test(groups = { "Functional" }) + public void testFindXrefSequences_forGeneAndTranscripts() + { + /* + * 'gene' sequence + */ + SequenceI gene = new Sequence("ENSG00000157764", "CGCCTCCCTTCCCC"); + gene.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056")); + gene.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3")); + + /* + * 'transcript' with CDS feature (supports mapping to protein) + */ + SequenceI braf001 = new Sequence("ENST00000288602", "taagATGGCGGCGCTGa"); + braf001.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056")); + braf001.addSequenceFeature(new SequenceFeature("CDS", "", 5, 16, 0f, + null)); + + /* + * 'spliced transcript' with CDS ranges + */ + SequenceI braf002 = new Sequence("ENST00000497784", "gCAGGCtaTCTGTTCaa"); + braf002.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3")); + braf002.addSequenceFeature(new SequenceFeature("CDS", "", 2, 6, 0f, + null)); + braf002.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, 0f, + null)); + + /* + * TODO code is fragile - use of SequenceIdMatcher depends on fetched + * sequences having a name starting Source|Accession + * which happens to be true for Uniprot,PDB,EMBL but not Pfam,Rfam,Ensembl + */ + final SequenceI pep1 = new Sequence("UNIPROT|P15056", "MAAL"); + final SequenceI pep2 = new Sequence("UNIPROT|H7C5K3", "QALF"); + + /* + * argument false suppresses adding DAS sources + * todo: define an interface type SequenceFetcherI and mock that + */ + SequenceFetcher mockFetcher = new SequenceFetcher(false) + { + @Override + public boolean isFetchable(String source) + { + return true; + } + + @Override + public SequenceI[] getSequences(List refs, boolean dna) + { + return new SequenceI[] { pep1, pep2 }; + } + }; + SequenceFetcherFactory.setSequenceFetcher(mockFetcher); + + /* + * find UNIPROT xrefs for gene and transcripts + * verify that + * - the two proteins are retrieved but not duplicated + * - mappings are built from transcript (CDS) to proteins + * - no mappings from gene to proteins + */ + SequenceI[] seqs = new SequenceI[] { gene, braf001, braf002 }; + AlignmentI al = new Alignment(seqs); + Alignment xrefs = new CrossRef(seqs, al) + .findXrefSequences("UNIPROT"); + assertEquals(2, xrefs.getHeight()); + assertSame(pep1, xrefs.getSequenceAt(0)); + assertSame(pep2, xrefs.getSequenceAt(1)); + } + + /** + *
+   * Test that emulates this (real but simplified) case:
+   * Alignment:          DBrefs
+   *     UNIPROT|P0CE19  EMBL|J03321, EMBL|X06707, EMBL|M19487
+   *     UNIPROT|P0CE20  EMBL|J03321, EMBL|X06707, EMBL|X07547
+   * Find cross-references for EMBL. These are mocked here as
+   *     EMBL|J03321     with mappings to P0CE18, P0CE19, P0CE20
+   *     EMBL|X06707     with mappings to P0CE17, P0CE19, P0CE20
+   *     EMBL|M19487     with mappings to P0CE19, Q46432
+   *     EMBL|X07547     with mappings to P0CE20, B0BCM4
+   * EMBL sequences are first 'fetched' (mocked here) for P0CE19.
+   * The 3 EMBL sequences are added to the alignment dataset.
+   * Their dbrefs to Uniprot products P0CE19 and P0CE20 should be matched in the
+   * alignment dataset and updated to reference the original Uniprot sequences.
+   * For the second Uniprot sequence, the J03321 and X06707 xrefs should be 
+   * resolved from the dataset, and only the X07547 dbref fetched.
+   * So the end state to verify is:
+   * - 4 cross-ref sequences returned: J03321, X06707,  M19487, X07547
+   * - P0CE19/20 dbrefs to EMBL sequences now have mappings
+   * - J03321 dbrefs to P0CE19/20 mapped to original Uniprot sequences
+   * - X06707 dbrefs to P0CE19/20 mapped to original Uniprot sequences
+   * 
+ */ + @Test(groups = { "Functional" }) + public void testFindXrefSequences_uniprotEmblManyToMany() + { + /* + * Uniprot sequences, both with xrefs to EMBL|J03321 + * and EMBL|X07547 + * Sequences faked to ensure dna translates to protein + * (so that mappings can be made) + */ + SequenceI p0ce19 = new Sequence("UNIPROT|P0CE19", "KPFG"); + p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "J03321")); + p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "X06707")); + p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "M19487")); + SequenceI p0ce20 = new Sequence("UNIPROT|P0CE20", "KPFG"); + p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "J03321")); + p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X06707")); + p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X07547")); + + /* + * EMBL sequences to be 'fetched', complete with dbrefs and mappings + * to their protein products (CDS location and translations are provided + * in EMBL XML); these should be matched to, and replaced with, + * the corresponding uniprot sequences after fetching + */ + + /* + * J03321 with mappings to P0CE19 and P0CE20 + */ + final SequenceI j03321 = new Sequence("EMBL|J03321", "AAACCCTTTGGG"); + DBRefEntry dbref1 = new DBRefEntry("UNIPROT", "0", "P0CE19"); + MapList mapList = new MapList(new int[] { 1, 18 }, + new int[] { 1, 6 }, 3, 1); + Mapping map = new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), mapList); + // add a dbref to the mapped to sequence - should get copied to p0ce19 + map.getTo().addDBRef(new DBRefEntry("PIR", "0", "S01875")); + dbref1.setMap(map); + j03321.addDBRef(dbref1); + DBRefEntry dbref2 = new DBRefEntry("UNIPROT", "0", "P0CE20"); + dbref2.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "KPFG"), + new MapList(mapList))); + j03321.addDBRef(dbref2); + + /* + * X06707 with mappings to P0CE19 and P0CE20 + */ + final SequenceI x06707 = new Sequence("EMBL|X06707", "atgAAACCCTTTGGG"); + // TODO CrossRef.constructMapping ignores the reverse mapping ?? + // should it not use its inverse if available? + // how does this work for real? + DBRefEntry dbref3 = new DBRefEntry("UNIPROT", "0", "P0CE19"); + MapList map2 = new MapList(new int[] { 4, 21 }, new int[] { 1, 6 }, 3, + 1); + dbref3.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), map2)); + x06707.addDBRef(dbref3); + DBRefEntry dbref4 = new DBRefEntry("UNIPROT", "0", "P0CE20"); + dbref4.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "KPFG"), + new MapList(mapList))); + x06707.addDBRef(dbref4); + + /* + * M19487 with mapping to P0CE19 and Q46432 + */ + final SequenceI m19487 = new Sequence("EMBL|M19487", "AAACCCTTTGGG"); + DBRefEntry dbref5 = new DBRefEntry("UNIPROT", "0", "P0CE19"); + dbref5.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), + new MapList(mapList))); + m19487.addDBRef(dbref5); + DBRefEntry dbref6 = new DBRefEntry("UNIPROT", "0", "Q46432"); + dbref6.setMap(new Mapping(new Sequence("UNIPROT|Q46432", "KPFG"), + new MapList(mapList))); + m19487.addDBRef(dbref6); + + /* + * X07547 with mapping to P0CE20 and B0BCM4 + */ + final SequenceI x07547 = new Sequence("EMBL|X07547", "cccAAACCCTTTGGG"); + DBRefEntry dbref7 = new DBRefEntry("UNIPROT", "0", "P0CE20"); + dbref7.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), + new MapList(map2))); + x07547.addDBRef(dbref7); + DBRefEntry dbref8 = new DBRefEntry("UNIPROT", "0", "B0BCM4"); + dbref8.setMap(new Mapping(new Sequence("UNIPROT|B0BCM4", "KPFG"), + new MapList(map2))); + x07547.addDBRef(dbref8); + + /* + * mock sequence fetcher to 'return' the EMBL sequences + * TODO: Mockito would allow .thenReturn().thenReturn() here, + * and also capture and verification of the parameters + * passed in calls to getSequences() + */ + SequenceFetcher mockFetcher = new SequenceFetcher(false) + { + int call = 0; + @Override + public boolean isFetchable(String source) + { + return true; + } + @Override + public SequenceI[] getSequences(List refs, boolean dna) + { + call++; + return call == 1 ? new SequenceI[] { j03321, x06707, m19487 } + : new SequenceI[] { x07547 }; + } + }; + SequenceFetcherFactory.setSequenceFetcher(mockFetcher); + + /* + * find EMBL xrefs for Uniprot seqs and verify that + * - the EMBL xref'd sequences are retrieved without duplicates + * - mappings are added to the Uniprot dbrefs + * - mappings in the EMBL-to-Uniprot dbrefs are updated to the + * alignment sequences + * - dbrefs on the EMBL sequences are added to the original dbrefs + */ + SequenceI[] seqs = new SequenceI[] { p0ce19, p0ce20 }; + AlignmentI al = new Alignment(seqs); + Alignment xrefs = new CrossRef(seqs, al) + .findXrefSequences("EMBL"); + + /* + * verify retrieved sequences + */ + assertNotNull(xrefs); + assertEquals(4, xrefs.getHeight()); + assertSame(j03321, xrefs.getSequenceAt(0)); + assertSame(x06707, xrefs.getSequenceAt(1)); + assertSame(m19487, xrefs.getSequenceAt(2)); + assertSame(x07547, xrefs.getSequenceAt(3)); + + /* + * verify mappings added to Uniprot-to-EMBL dbrefs + */ + Mapping mapping = p0ce19.getDBRefs()[0].getMap(); + assertSame(j03321, mapping.getTo()); + mapping = p0ce19.getDBRefs()[1].getMap(); + assertSame(x06707, mapping.getTo()); + mapping = p0ce20.getDBRefs()[0].getMap(); + assertSame(j03321, mapping.getTo()); + mapping = p0ce20.getDBRefs()[1].getMap(); + assertSame(x06707, mapping.getTo()); + + /* + * verify dbrefs on EMBL are mapped to alignment seqs + */ + assertSame(p0ce19, j03321.getDBRefs()[0].getMap().getTo()); + assertSame(p0ce20, j03321.getDBRefs()[1].getMap().getTo()); + assertSame(p0ce19, x06707.getDBRefs()[0].getMap().getTo()); + assertSame(p0ce20, x06707.getDBRefs()[1].getMap().getTo()); + + /* + * verify new dbref on EMBL dbref mapping is copied to the + * original Uniprot sequence + */ + assertEquals(4, p0ce19.getDBRefs().length); + assertEquals("PIR", p0ce19.getDBRefs()[3].getSource()); + assertEquals("S01875", p0ce19.getDBRefs()[3].getAccessionId()); + } + + @Test(groups = "Functional") + public void testSameSequence() + { + assertTrue(CrossRef.sameSequence(null, null)); + SequenceI seq1 = new Sequence("seq1", "ABCDEF"); + assertFalse(CrossRef.sameSequence(seq1, null)); + assertFalse(CrossRef.sameSequence(null, seq1)); + assertTrue(CrossRef.sameSequence(seq1, new Sequence("seq2", "ABCDEF"))); + assertTrue(CrossRef.sameSequence(seq1, new Sequence("seq2", "abcdef"))); + assertFalse(CrossRef + .sameSequence(seq1, new Sequence("seq2", "ABCDE-F"))); + assertFalse(CrossRef.sameSequence(seq1, new Sequence("seq2", "BCDEF"))); + } } diff --git a/test/jalview/analysis/SequenceIdMatcherTest.java b/test/jalview/analysis/SequenceIdMatcherTest.java index 9d3e3b6..a17270d 100644 --- a/test/jalview/analysis/SequenceIdMatcherTest.java +++ b/test/jalview/analysis/SequenceIdMatcherTest.java @@ -90,5 +90,11 @@ public class SequenceIdMatcherTest * case insensitive matching */ assertTrue(testee.equals("a12345")); + + testee = sequenceIdMatcher.new SeqIdName("UNIPROT|A12345"); + assertFalse(testee.equals("A12345")); + assertFalse(testee.equals("UNIPROT|B98765")); + assertFalse(testee.equals("UNIPROT|")); + assertTrue(testee.equals("UNIPROT")); } } diff --git a/test/jalview/util/DnaUtilsTest.java b/test/jalview/util/DnaUtilsTest.java index 6623c13..d7a76b9 100644 --- a/test/jalview/util/DnaUtilsTest.java +++ b/test/jalview/util/DnaUtilsTest.java @@ -89,20 +89,10 @@ public class DnaUtilsTest assertEquals(87064, ranges.get(1)[1]); /* - * beyond 5' or 3' locus - */ - ranges = DnaUtils.parseLocation("<34..126"); - assertEquals(1, ranges.size()); - assertEquals(34, ranges.get(0)[0]); - assertEquals(126, ranges.get(0)[1]); - ranges = DnaUtils.parseLocation("35..>127"); - assertEquals(1, ranges.size()); - assertEquals(35, ranges.get(0)[0]); - assertEquals(127, ranges.get(0)[1]); - - /* * valid things we don't yet handle */ + assertNull(DnaUtils.parseLocation("<34..126")); + assertNull(DnaUtils.parseLocation("35..>126")); assertNull(DnaUtils.parseLocation("34.126")); assertNull(DnaUtils.parseLocation("34^126")); assertNull(DnaUtils.parseLocation("order(34..126,130..180)")); diff --git a/test/jalview/ws/SequenceFetcherTest.java b/test/jalview/ws/SequenceFetcherTest.java index 76ca69b..50fb3c0 100644 --- a/test/jalview/ws/SequenceFetcherTest.java +++ b/test/jalview/ws/SequenceFetcherTest.java @@ -106,8 +106,9 @@ public class SequenceFetcherTest { boolean dna = sp.isDnaCoding(); // try and find products - List types = CrossRef.findXrefSourcesForSequences(dna, - al.getSequencesArray(), null); + CrossRef crossRef = new CrossRef(al.getSequencesArray(), + al); + List types = crossRef.findXrefSourcesForSequences(); if (types != null) { System.out.println("Xref Types for: " @@ -115,9 +116,7 @@ public class SequenceFetcherTest for (String source : types) { System.out.println("Type: " + source); - SequenceI[] prod = jalview.analysis.CrossRef - .findXrefSequences(al.getSequencesArray(), dna, - source, null) + SequenceI[] prod = crossRef.findXrefSequences(source) .getSequencesArray(); System.out.println("Found " + ((prod == null) ? "no" : "" + prod.length) @@ -200,8 +199,8 @@ public class SequenceFetcherTest // have a bash at finding the products amongst all the retrieved // sequences. SequenceI[] seqs = al.getSequencesArray(); - Alignment prodal = jalview.analysis.CrossRef.findXrefSequences( - seqs, dna, null, ds); + Alignment prodal = new CrossRef(seqs, ds) + .findXrefSequences(null); System.out.println("Found " + ((prodal == null) ? "no" : "" + prodal.getHeight()) + " products"); diff --git a/test/jalview/ws/seqfetcher/DbRefFetcherTest.java b/test/jalview/ws/seqfetcher/DbRefFetcherTest.java index 63b1b9c..44cefe2 100644 --- a/test/jalview/ws/seqfetcher/DbRefFetcherTest.java +++ b/test/jalview/ws/seqfetcher/DbRefFetcherTest.java @@ -178,8 +178,8 @@ public class DbRefFetcherTest .getMap().getMappedWidth(), 1); assertEquals("Expected local reference map to be 3 nucleotides", dr[0] .getMap().getWidth(), 3); - AlignmentI sprods = CrossRef.findXrefSequences( - alsq.getSequencesArray(), true, dr[0].getSource(), alsq); + AlignmentI sprods = new CrossRef(alsq.getSequencesArray(), alsq) + .findXrefSequences(dr[0].getSource()); assertNotNull( "Couldn't recover cross reference sequence from dataset. Was it ever added ?", sprods); -- 1.7.10.2