From 3ef44bef1f825d26977dedd1608469712a87fe15 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 13 Apr 2015 16:22:35 +0100 Subject: [PATCH] JAL-1693 make exon alignment for get-xref splitframe (with CDS xref) --- resources/lang/Messages.properties | 2 +- src/jalview/analysis/AlignmentUtils.java | 126 ++++++++++++++ src/jalview/analysis/CrossRef.java | 102 ++++------- src/jalview/analysis/SeqsetUtils.java | 13 +- src/jalview/datamodel/Alignment.java | 15 +- src/jalview/datamodel/FeatureProperties.java | 30 +++- src/jalview/datamodel/Sequence.java | 10 +- src/jalview/datamodel/SequenceI.java | 6 +- src/jalview/util/DBRefUtils.java | 147 ++++++++-------- src/jalview/ws/dbsources/Uniprot.java | 2 +- test/jalview/analysis/AlignmentUtilsTests.java | 138 +++++++++++++++ test/jalview/analysis/CrossRefTest.java | 46 +++++ test/jalview/util/DBRefUtilsTest.java | 222 ++++++++++++++++++++++++ test/jalview/util/MappingUtilsTest.java | 2 +- 14 files changed, 702 insertions(+), 159 deletions(-) create mode 100644 test/jalview/analysis/CrossRefTest.java create mode 100644 test/jalview/util/DBRefUtilsTest.java diff --git a/resources/lang/Messages.properties b/resources/lang/Messages.properties index 027a9ce..7544a6a 100644 --- a/resources/lang/Messages.properties +++ b/resources/lang/Messages.properties @@ -697,7 +697,7 @@ label.translate_cDNA = Translate as cDNA label.linked_view_title = Linked cDNA and protein view label.align = Align label.extract_scores = Extract Scores -label.get_cross_refs = Get Cross References +label.get_cross_refs = Get Cross-References label.sort_alignment_new_tree = Sort Alignment With New Tree label.add_sequences = Add Sequences label.new_window = New Window diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index b78afeb..a811d84 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -35,16 +35,22 @@ import java.util.TreeMap; import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.FeatureProperties; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResults; import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import jalview.schemes.ResidueProperties; +import jalview.util.DBRefUtils; import jalview.util.MapList; +import jalview.util.MappingUtils; /** * grab bag of useful alignment manipulation operations Expect these to be @@ -1258,4 +1264,124 @@ public class AlignmentUtils } return false; } + + /** + * Constructs an alignment consisting of the mapped exon regions in the given + * nucleotide sequences, and updates mappings to match. + * + * @param dna + * aligned dna sequences + * @param mappings + * from dna to protein; these are replaced with new mappings + * @return an alignment whose sequences are the exon-only parts of the dna + * sequences (or null if no exons are found) + */ + public static AlignmentI makeExonAlignment(SequenceI[] dna, + Set mappings) + { + Set newMappings = new HashSet(); + List exonSequences = new ArrayList(); + + for (SequenceI dnaSeq : dna) + { + final SequenceI ds = dnaSeq.getDatasetSequence(); + List seqMappings = MappingUtils + .findMappingsForSequence(ds, mappings); + if (!seqMappings.isEmpty()) + { + /* + * We assume here that only one protein mapping is expected per dna + * sequence. Mapping to multiple protein sequences is conceivable but + * undefined. Splitting a mapping to one protein sequence across + * multiple mappings is possible but pathological. Need closer + * constraints on the contents of AlignedCodonFrame. + */ + AlignedCodonFrame newMapping = new AlignedCodonFrame(); + final SequenceI exonSequence = makeExonSequence(ds, + seqMappings.get(0), newMapping); + exonSequences.add(exonSequence); + newMappings.add(newMapping); + } + } + AlignmentI al = new Alignment( + exonSequences.toArray(new SequenceI[exonSequences.size()])); + al.setDataset(null); + + /* + * Replace the old mappings with the new ones + */ + mappings.clear(); + mappings.addAll(newMappings); + + return al; + } + + /** + * Helper method to make an exon-only sequence and populate its mapping to + * protein + *

+ * For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein + * then generate a sequence CCTTGA with mapping [1, 6] to the same protein + * residues + * + * @param dnaSeq + * a dna dataset sequence + * @param mapping + * the current mapping of the sequence to protein + * @param newMapping + * the new mapping to populate, from the exon-only sequence + * @return + */ + protected static SequenceI makeExonSequence(SequenceI dnaSeq, + AlignedCodonFrame acf, AlignedCodonFrame newMapping) + { + Mapping mapping = acf.getMappingForSequence(dnaSeq); + final char[] dna = dnaSeq.getSequence(); + StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); + + /* + * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + */ + List exonRanges = mapping.getMap().getFromRanges(); + for (int[] range : exonRanges) + { + for (int pos = range[0]; pos <= range[1]; pos++) + { + newSequence.append(dna[pos - 1]); + } + } + + SequenceI exon = new Sequence(dnaSeq.getName(), newSequence.toString()); + + /* + * Locate any xrefs to CDS database on the protein product and attach to the + * CDS sequence. Also add as a sub-token of the sequence name. + */ + // default to "CDS" if we can't locate an actual gene id + String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL); + DBRefEntry[] cdsRefs = DBRefUtils.selectRefs( + mapping.getTo().getDBRef(), DBRefSource.CODINGDBS); + if (cdsRefs != null) + { + for (DBRefEntry cdsRef : cdsRefs) + { + exon.addDBRef(new DBRefEntry(cdsRef)); + cdsAccId = cdsRef.getAccessionId(); + } + } + exon.setName(exon.getName() + "|" + cdsAccId); + exon.createDatasetSequence(); + + /* + * Build new mappings - from the same protein regions, but now to contiguous + * exons + */ + List exonRange = new ArrayList(); + exonRange.add(new int[] + { 1, newSequence.length() }); + MapList map = new MapList(exonRange, mapping.getMap().getToRanges(), 3, 1); + newMapping.addMap(exon.getDatasetSequence(), mapping.getTo(), map); + + return exon; + } } diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index 47bd7bc..7238239 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -20,6 +20,10 @@ */ package jalview.analysis; +import java.util.ArrayList; +import java.util.List; +import java.util.Vector; + import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; @@ -27,14 +31,10 @@ import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; +import jalview.util.DBRefUtils; import jalview.ws.SequenceFetcher; import jalview.ws.seqfetcher.ASequenceFetcher; -import java.util.Enumeration; -import java.util.Hashtable; -import java.util.List; -import java.util.Vector; - /** * Functions for cross-referencing sequence databases. user must first specify * if cross-referencing from protein or dna (set dna==true) @@ -45,39 +45,22 @@ import java.util.Vector; public class CrossRef { /** - * get the DNA or protein references for a protein or dna sequence + * Select just the DNA or protein references for a protein or dna sequence * - * @param dna - * @param rfs + * @param fromDna + * if true, select references from DNA (i.e. Protein databases), else + * DNA database references + * @param refs + * a set of references to select from * @return */ - public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs) + public static DBRefEntry[] findXDbRefs(boolean fromDna, DBRefEntry[] refs) { - if (dna) - { - rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS); - } - else - { - rfs = jalview.util.DBRefUtils.selectRefs(rfs, - DBRefSource.DNACODINGDBS); // could attempt to find other cross - // refs and return here - ie PDB xrefs - // (not dna, not protein seq) - } - return rfs; - } - - public static Hashtable classifyDbRefs(DBRefEntry[] rfs) - { - Hashtable classes = new Hashtable(); - classes.put(DBRefSource.PROTEINDBS, - jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS)); - classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils - .selectRefs(rfs, DBRefSource.DNACODINGDBS)); - classes.put(DBRefSource.DOMAINDBS, - jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DOMAINDBS)); - // classes.put(OTHER, ) - return classes; + return DBRefUtils.selectRefs(refs, fromDna ? DBRefSource.PROTEINDBS + : DBRefSource.DNACODINGDBS); + // could attempt to find other cross + // refs here - ie PDB xrefs + // (not dna, not protein seq) } /** @@ -104,12 +87,11 @@ public class CrossRef SequenceI[] seqs, AlignmentI dataset) { String[] dbrefs = null; - Vector refs = new Vector(); + List refs = new ArrayList(); for (int s = 0; s < seqs.length; s++) { if (seqs[s] != null) { - SequenceI dss = seqs[s]; while (dss.getDatasetSequence() != null) { @@ -120,7 +102,7 @@ public class CrossRef { if (!refs.contains(rfs[r].getSource())) { - refs.addElement(rfs[r].getSource()); + refs.add(rfs[r].getSource()); } } if (dataset != null) @@ -128,19 +110,17 @@ public class CrossRef // search for references to this sequence's direct references. DBRefEntry[] lrfs = CrossRef .findXDbRefs(!dna, seqs[s].getDBRef()); - Vector rseqs = new Vector(); + List rseqs = new ArrayList(); CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs, null); // don't need to specify codon frame for mapping here - Enumeration lr = rseqs.elements(); - while (lr.hasMoreElements()) + for (SequenceI rs : rseqs) { - SequenceI rs = (SequenceI) lr.nextElement(); - DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef()); + DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef()); // not used?? for (int r = 0; rfs != null && r < rfs.length; r++) { if (!refs.contains(rfs[r].getSource())) { - refs.addElement(rfs[r].getSource()); + refs.add(rfs[r].getSource()); } } } @@ -150,7 +130,7 @@ public class CrossRef if (refs.size() > 0) { dbrefs = new String[refs.size()]; - refs.copyInto(dbrefs); + refs.toArray(dbrefs); } return dbrefs; } @@ -228,7 +208,7 @@ public class CrossRef public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna, String source, AlignmentI dataset) { - Vector rseqs = new Vector(); + List rseqs = new ArrayList(); Alignment ral = null; AlignedCodonFrame cf = new AlignedCodonFrame(); // nominal width for (int s = 0; s < seqs.length; s++) @@ -243,14 +223,8 @@ public class CrossRef if ((xrfs == null || xrfs.length == 0) && dataset != null) { System.out.println("Attempting to find ds Xrefs refs."); - DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less - // ambiguous - // would - // be a - // 'find - // primary - // dbRefEntry' - // method. + DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); + // less ambiguous would be a 'find primary dbRefEntry' method. // filter for desired source xref here found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset, rseqs, cf); @@ -265,8 +239,8 @@ public class CrossRef { if (xrfs[r].getMap().getTo() != null) { - Sequence rsq = new Sequence(xrfs[r].getMap().getTo()); - rseqs.addElement(rsq); + SequenceI rsq = new Sequence(xrfs[r].getMap().getTo()); + rseqs.add(rsq); if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r] .getMap().getMap().getToRatio()) { @@ -401,7 +375,7 @@ public class CrossRef } } retrieved[rs].updatePDBIds(); - rseqs.addElement(retrieved[rs]); + rseqs.add(retrieved[rs]); } } } @@ -411,7 +385,7 @@ public class CrossRef if (rseqs.size() > 0) { SequenceI[] rsqs = new SequenceI[rseqs.size()]; - rseqs.copyInto(rsqs); + rseqs.toArray(rsqs); ral = new Alignment(rsqs); if (cf != null && cf.getProtMappings() != null) { @@ -433,7 +407,8 @@ public class CrossRef * @return true if matches were found. */ private static boolean searchDatasetXrefs(SequenceI sequenceI, - boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs, + boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, + List rseqs, AlignedCodonFrame cf) { boolean found = false; @@ -465,7 +440,7 @@ public class CrossRef * @return true if one or more unique sequences were found and added */ public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf, - AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf) + AlignmentI dataset, List rseqs, AlignedCodonFrame cf) { return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false); } @@ -486,7 +461,7 @@ public class CrossRef * @return true if relationship found and sequence added. */ public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf, - AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf, + AlignmentI dataset, List rseqs, AlignedCodonFrame cf, boolean direct, boolean dna) { boolean found = false; @@ -540,10 +515,9 @@ public class CrossRef { if (!rseqs.contains(nxt)) { - rseqs.addElement(nxt); - boolean foundmap = cf != null; // don't search if we aren't - // given - // a codon map object + rseqs.add(nxt); + boolean foundmap = cf != null; + // don't search if we aren't given a codon map object for (int r = 0; foundmap && r < cands.length; r++) { if (cands[r].hasMap()) diff --git a/src/jalview/analysis/SeqsetUtils.java b/src/jalview/analysis/SeqsetUtils.java index 27ad577..2ede9ed 100755 --- a/src/jalview/analysis/SeqsetUtils.java +++ b/src/jalview/analysis/SeqsetUtils.java @@ -20,9 +20,14 @@ */ package jalview.analysis; -import java.util.*; +import java.util.Enumeration; +import java.util.Hashtable; +import java.util.Vector; -import jalview.datamodel.*; +import jalview.datamodel.PDBEntry; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; public class SeqsetUtils { @@ -56,7 +61,7 @@ public class SeqsetUtils } sqinfo.put("SeqFeatures", sfeat); sqinfo.put("PdbId", (seq.getPDBId() != null) ? seq.getPDBId() - : new Vector()); + : new Vector()); sqinfo.put("datasetSequence", (seq.getDatasetSequence() != null) ? seq.getDatasetSequence() : new Sequence("THISISAPLACEHOLDER", "")); @@ -84,7 +89,7 @@ public class SeqsetUtils Integer start = (Integer) sqinfo.get("Start"); Integer end = (Integer) sqinfo.get("End"); Vector sfeatures = (Vector) sqinfo.get("SeqFeatures"); - Vector pdbid = (Vector) sqinfo.get("PdbId"); + Vector pdbid = (Vector) sqinfo.get("PdbId"); String description = (String) sqinfo.get("Description"); Sequence seqds = (Sequence) sqinfo.get("datasetSequence"); if (oldname == null) diff --git a/src/jalview/datamodel/Alignment.java b/src/jalview/datamodel/Alignment.java index 482df7f..81046f1 100755 --- a/src/jalview/datamodel/Alignment.java +++ b/src/jalview/datamodel/Alignment.java @@ -20,10 +20,6 @@ */ package jalview.datamodel; -import jalview.analysis.AlignmentUtils; -import jalview.io.FastaFile; -import jalview.util.MessageManager; - import java.util.ArrayList; import java.util.Enumeration; import java.util.HashSet; @@ -34,6 +30,10 @@ import java.util.Map; import java.util.Set; import java.util.Vector; +import jalview.analysis.AlignmentUtils; +import jalview.io.FastaFile; +import jalview.util.MessageManager; + /** * Data structure to hold and manipulate a multiple sequence alignment */ @@ -1664,8 +1664,8 @@ public class Alignment implements AlignmentI * identically. If this is nucleotide and the other is protein, make 3 gaps * for each gap in the protein sequences. If this is protein and the other is * nucleotide, insert a gap for each 3 gaps (or part thereof) between - * nucleotide bases. Does nothing if alignment of protein from cDNA is - * requested (not yet implemented). + * nucleotide bases. If this is protein and the other is nucleotide, gaps + * protein to match the relative ordering of codons in the nucleotide. * * Parameters control whether gaps in exon (mapped) and intron (unmapped) * regions are preserved. Gaps that connect introns to exons are treated @@ -1697,6 +1697,9 @@ public class Alignment implements AlignmentI { thisGapChar, thisGapChar, thisGapChar }) : String .valueOf(thisGapChar); + // TODO handle intron regions? Needs a 'holistic' alignment of dna, + // not just sequence by sequence. But how to 'gap' intron regions? + /* * Get mappings from 'that' alignment's sequences to this. */ diff --git a/src/jalview/datamodel/FeatureProperties.java b/src/jalview/datamodel/FeatureProperties.java index b940eb1..d25eb96 100644 --- a/src/jalview/datamodel/FeatureProperties.java +++ b/src/jalview/datamodel/FeatureProperties.java @@ -29,6 +29,8 @@ package jalview.datamodel; public class FeatureProperties { + private static final String EMBL_CODING_FEATURE = "CDS"; + public static final String EXONPOS = "exon number"; public static final String EXONPRODUCT = "product"; @@ -43,9 +45,29 @@ public class FeatureProperties */ public static boolean isCodingFeature(String dbrefsource, String type) { - return ((dbrefsource == null - || dbrefsource.equalsIgnoreCase(DBRefSource.EMBL) || dbrefsource - .equalsIgnoreCase(DBRefSource.EMBLCDS)) && type - .equalsIgnoreCase("CDS")); + if (type.equalsIgnoreCase(EMBL_CODING_FEATURE)) + { + return (dbrefsource == null + || dbrefsource.equalsIgnoreCase(DBRefSource.EMBL) || dbrefsource + .equalsIgnoreCase(DBRefSource.EMBLCDS)); + } + return false; + } + + /** + * Returns the coding feature name for a database source. Currently just + * hard-coded to return CDS for EMBL/EMBLCDS, else null. + * + * @param dbrefsource + * @return + */ + public static String getCodingFeature(String dbrefsource) + { + if (DBRefSource.EMBL.equalsIgnoreCase(dbrefsource) + || DBRefSource.EMBLCDS.equalsIgnoreCase(dbrefsource)) + { + return EMBL_CODING_FEATURE; + } + return null; } } diff --git a/src/jalview/datamodel/Sequence.java b/src/jalview/datamodel/Sequence.java index 5c1fba5..65d8179 100755 --- a/src/jalview/datamodel/Sequence.java +++ b/src/jalview/datamodel/Sequence.java @@ -20,9 +20,6 @@ */ package jalview.datamodel; -import jalview.analysis.AlignSeq; -import jalview.util.StringUtils; - import java.util.ArrayList; import java.util.Enumeration; import java.util.List; @@ -30,6 +27,9 @@ import java.util.Vector; import fr.orsay.lri.varna.models.rna.RNA; +import jalview.analysis.AlignSeq; +import jalview.util.StringUtils; + /** * * Implements the SequenceI interface for a char[] based sequence object. @@ -51,7 +51,7 @@ public class Sequence implements SequenceI int end; - Vector pdbIds; + Vector pdbIds; String vamsasId; @@ -353,7 +353,7 @@ public class Sequence implements SequenceI { if (pdbIds == null) { - pdbIds = new Vector(); + pdbIds = new Vector(); } if (!pdbIds.contains(entry)) { diff --git a/src/jalview/datamodel/SequenceI.java b/src/jalview/datamodel/SequenceI.java index a9a7589..04f3588 100755 --- a/src/jalview/datamodel/SequenceI.java +++ b/src/jalview/datamodel/SequenceI.java @@ -265,14 +265,14 @@ public interface SequenceI * @param id * DOCUMENT ME! */ - public void setPDBId(Vector ids); + public void setPDBId(Vector ids); /** - * DOCUMENT ME! + * Returns a list * * @return DOCUMENT ME! */ - public Vector getPDBId(); + public Vector getPDBId(); /** * add entry to the vector of PDBIds, if it isn't in the list already diff --git a/src/jalview/util/DBRefUtils.java b/src/jalview/util/DBRefUtils.java index 8163f05..9765a1a 100755 --- a/src/jalview/util/DBRefUtils.java +++ b/src/jalview/util/DBRefUtils.java @@ -20,18 +20,39 @@ */ package jalview.util; -import jalview.datamodel.DBRefEntry; -import jalview.datamodel.PDBEntry; -import jalview.datamodel.SequenceI; - import java.util.ArrayList; import java.util.HashMap; import java.util.Hashtable; +import java.util.List; import java.util.Map; -import java.util.Vector; + +import com.stevesoft.pat.Regex; + +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.PDBEntry; +import jalview.datamodel.SequenceI; public class DBRefUtils { + private static Map canonicalSourceNameLookup = new HashMap(); + + private static Map dasCoordinateSystemsLookup = new HashMap(); + + static + { + // TODO load these from a resource file? + canonicalSourceNameLookup.put("uniprotkb/swiss-prot", + DBRefSource.UNIPROT); + canonicalSourceNameLookup.put("uniprotkb/trembl", DBRefSource.UNIPROT); + canonicalSourceNameLookup.put("pdb", DBRefSource.PDB); + + dasCoordinateSystemsLookup.put("pdbresnum", DBRefSource.PDB); + dasCoordinateSystemsLookup.put("uniprot", DBRefSource.UNIPROT); + dasCoordinateSystemsLookup.put("embl", DBRefSource.EMBL); + // dasCoordinateSystemsLookup.put("embl", DBRefSource.EMBLCDS); + } + /** * Utilities for handling DBRef objects and their collections. */ @@ -89,37 +110,20 @@ public class DBRefUtils * @return boolean true if Source DBRefEntry is compatible with DAS * CoordinateSystem name */ - public static Hashtable DasCoordinateSystemsLookup = null; public static boolean isDasCoordinateSystem(String string, DBRefEntry dBRefEntry) { - if (DasCoordinateSystemsLookup == null) + if (string == null || dBRefEntry == null) { - // TODO: Make a DasCoordinateSystemsLookup properties resource - // Initialise - DasCoordinateSystemsLookup = new Hashtable(); - DasCoordinateSystemsLookup.put("pdbresnum", - jalview.datamodel.DBRefSource.PDB); - DasCoordinateSystemsLookup.put("uniprot", - jalview.datamodel.DBRefSource.UNIPROT); - DasCoordinateSystemsLookup.put("EMBL", - jalview.datamodel.DBRefSource.EMBL); - // DasCoordinateSystemsLookup.put("EMBL", - // jalview.datamodel.DBRefSource.EMBLCDS); + return false; } - - String coordsys = (String) DasCoordinateSystemsLookup.get(string + String coordsys = dasCoordinateSystemsLookup.get(string .toLowerCase()); - if (coordsys != null) - { - return coordsys.equals(dBRefEntry.getSource()); - } - return false; + return coordsys == null ? false : coordsys.equals(dBRefEntry + .getSource()); } - public static Hashtable CanonicalSourceNameLookup = null; - /** * look up source in an internal list of database reference sources and return * the canonical jalview name for the source, or the original string if it has @@ -131,34 +135,28 @@ public class DBRefUtils */ public static String getCanonicalName(String source) { - if (CanonicalSourceNameLookup == null) + if (source == null) { - CanonicalSourceNameLookup = new Hashtable(); - CanonicalSourceNameLookup.put("uniprotkb/swiss-prot", - jalview.datamodel.DBRefSource.UNIPROT); - CanonicalSourceNameLookup.put("uniprotkb/trembl", - jalview.datamodel.DBRefSource.UNIPROT); - CanonicalSourceNameLookup.put("pdb", - jalview.datamodel.DBRefSource.PDB); + return null; } - String canonical = (String) CanonicalSourceNameLookup.get(source + String canonical = canonicalSourceNameLookup.get(source .toLowerCase()); - if (canonical == null) - { - return source; - } - return canonical; + return canonical == null ? source : canonical; } /** - * find RefEntry corresponding to a particular pattern the equals method of - * each entry is used, from String attributes right down to Mapping - * attributes. + * Returns an array of those references that match the given entry, or null if + * no matches. Currently uses a comparator which matches if + *

    + *
  • database sources are the same
  • + *
  • accession ids are the same
  • + *
  • both have no mapping, or the mappings are the same
  • + *
* * @param ref * Set of references to search * @param entry - * pattern to collect - null any entry for wildcard match + * pattern to match * @return */ public static DBRefEntry[] searchRefs(DBRefEntry[] ref, DBRefEntry entry) @@ -167,32 +165,36 @@ public class DBRefUtils matchDbAndIdAndEitherMapOrEquivalentMapList); } - public static DBRefEntry[] searchRefs(DBRefEntry[] ref, DBRefEntry entry, + /** + * Returns an array of those references that match the given entry, according + * to the given comparator. Returns null if no matches. + * + * @param refs + * an array of database references to search + * @param entry + * an entry to compare against + * @param comparator + * @return + */ + static DBRefEntry[] searchRefs(DBRefEntry[] refs, DBRefEntry entry, DbRefComp comparator) { - if (ref == null || entry == null) + if (refs == null || entry == null) { return null; } - Vector rfs = new Vector(); - for (int i = 0; i < ref.length; i++) + List rfs = new ArrayList(); + for (int i = 0; i < refs.length; i++) { - if (comparator.matches(entry, ref[i])) + if (comparator.matches(entry, refs[i])) { - rfs.addElement(ref[i]); + rfs.add(refs[i]); } } - // TODO Auto-generated method stub - if (rfs.size() > 0) - { - DBRefEntry[] rf = new DBRefEntry[rfs.size()]; - rfs.copyInto(rf); - return rf; - } - return null; + return rfs.size() == 0 ? null : rfs.toArray(new DBRefEntry[rfs.size()]); } - public interface DbRefComp + interface DbRefComp { public boolean matches(DBRefEntry refa, DBRefEntry refb); } @@ -402,14 +404,17 @@ public class DBRefUtils }; /** - * used by file parsers to generate DBRefs from annotation within file (eg - * stockholm) + * Parses a DBRefEntry and adds it to the sequence, also a PDBEntry if the + * database is PDB. + *

+ * Used by file parsers to generate DBRefs from annotation within file (eg + * Stockholm) * * @param dbname * @param version * @param acn * @param seq - * where to anotate with reference + * where to annotate with reference * @return parsed version of entry that was added to seq (if any) */ public static DBRefEntry parseToDbRef(SequenceI seq, String dbname, @@ -418,12 +423,14 @@ public class DBRefUtils DBRefEntry ref = null; if (dbname != null) { - String locsrc = jalview.util.DBRefUtils.getCanonicalName(dbname); - if (locsrc.equals(jalview.datamodel.DBRefSource.PDB)) + String locsrc = DBRefUtils.getCanonicalName(dbname); + if (locsrc.equals(DBRefSource.PDB)) { - // check for chaincode and mapping - // PFAM style stockhom PDB citation - com.stevesoft.pat.Regex r = new com.stevesoft.pat.Regex( + /* + * Check for PFAM style stockhom PDB accession id citation e.g. + * "1WRI A; 7-80;" + */ + Regex r = new com.stevesoft.pat.Regex( "([0-9][0-9A-Za-z]{3})\\s*(.?)\\s*;\\s*([0-9]+)-([0-9]+)"); if (r.search(acn.trim())) { @@ -433,8 +440,8 @@ public class DBRefUtils { chaincode = " "; } - String mapstart = r.stringMatched(3); - String mapend = r.stringMatched(4); + // String mapstart = r.stringMatched(3); + // String mapend = r.stringMatched(4); if (chaincode.equals(" ")) { chaincode = "_"; diff --git a/src/jalview/ws/dbsources/Uniprot.java b/src/jalview/ws/dbsources/Uniprot.java index d204b99..f0e5de0 100644 --- a/src/jalview/ws/dbsources/Uniprot.java +++ b/src/jalview/ws/dbsources/Uniprot.java @@ -222,7 +222,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy { UniprotEntry entry = (UniprotEntry) entries.elementAt(i); Enumeration e = entry.getDbReference().elements(); - Vector onlyPdbEntries = new Vector(); + Vector onlyPdbEntries = new Vector(); Vector dbxrefs = new Vector(); while (e.hasMoreElements()) { diff --git a/test/jalview/analysis/AlignmentUtilsTests.java b/test/jalview/analysis/AlignmentUtilsTests.java index 71b1bcb..98d77d4 100644 --- a/test/jalview/analysis/AlignmentUtilsTests.java +++ b/test/jalview/analysis/AlignmentUtilsTests.java @@ -29,8 +29,10 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import org.junit.Test; @@ -41,11 +43,14 @@ import jalview.datamodel.AlignmentI; import jalview.datamodel.Annotation; import jalview.datamodel.DBRefEntry; import jalview.datamodel.Mapping; +import jalview.datamodel.SearchResults; +import jalview.datamodel.SearchResults.Match; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; import jalview.io.AppletFormatAdapter; import jalview.io.FormatAdapter; import jalview.util.MapList; +import jalview.util.MappingUtils; public class AlignmentUtilsTests { @@ -869,4 +874,137 @@ public class AlignmentUtilsTests assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2)); assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1)); } + + /** + * Test the method that extracts the exon-only part of a dna alignment. + */ + @Test + public void testMakeExonAlignment() + { + SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa"); + SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC"); + SequenceI pep1 = new Sequence("pep1", "GF"); + SequenceI pep2 = new Sequence("pep2", "GFP"); + dna1.createDatasetSequence(); + dna2.createDatasetSequence(); + pep1.createDatasetSequence(); + pep2.createDatasetSequence(); + + Set mappings = new HashSet(); + MapList map = new MapList(new int[] + { 4, 6, 10, 12 }, new int[] + { 1, 2 }, 3, 1); + AlignedCodonFrame acf = new AlignedCodonFrame(); + acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map); + mappings.add(acf); + map = new MapList(new int[] + { 1, 3, 7, 9, 13, 15 }, new int[] + { 1, 3 }, 3, 1); + acf = new AlignedCodonFrame(); + acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map); + mappings.add(acf); + + AlignmentI exons = AlignmentUtils.makeExonAlignment(new SequenceI[] + { dna1, dna2 }, mappings); + assertEquals(2, exons.getSequences().size()); + assertEquals("GGGTTT", exons.getSequenceAt(0).getSequenceAsString()); + assertEquals("GGGTTTCCC", exons.getSequenceAt(1).getSequenceAsString()); + + /* + * Verify updated mappings + */ + assertEquals(2, mappings.size()); + + /* + * Mapping from pep1 to GGGTTT in first new exon sequence + */ + List pep1Mapping = MappingUtils + .findMappingsForSequence(pep1, mappings); + assertEquals(1, pep1Mapping.size()); + // map G to GGG + SearchResults sr = MappingUtils.buildSearchResults(pep1, 1, mappings); + assertEquals(1, sr.getResults().size()); + Match m = sr.getResults().get(0); + assertEquals(exons.getSequenceAt(0).getDatasetSequence(), + m.getSequence()); + assertEquals(1, m.getStart()); + assertEquals(3, m.getEnd()); + // map F to TTT + sr = MappingUtils.buildSearchResults(pep1, 2, mappings); + m = sr.getResults().get(0); + assertEquals(exons.getSequenceAt(0).getDatasetSequence(), + m.getSequence()); + assertEquals(4, m.getStart()); + assertEquals(6, m.getEnd()); + + /* + * Mapping from pep2 to GGGTTTCCC in second new exon sequence + */ + List pep2Mapping = MappingUtils + .findMappingsForSequence(pep2, mappings); + assertEquals(1, pep2Mapping.size()); + // map G to GGG + sr = MappingUtils.buildSearchResults(pep2, 1, mappings); + assertEquals(1, sr.getResults().size()); + m = sr.getResults().get(0); + assertEquals(exons.getSequenceAt(1).getDatasetSequence(), + m.getSequence()); + assertEquals(1, m.getStart()); + assertEquals(3, m.getEnd()); + // map F to TTT + sr = MappingUtils.buildSearchResults(pep2, 2, mappings); + m = sr.getResults().get(0); + assertEquals(exons.getSequenceAt(1).getDatasetSequence(), + m.getSequence()); + assertEquals(4, m.getStart()); + assertEquals(6, m.getEnd()); + // map P to CCC + sr = MappingUtils.buildSearchResults(pep2, 3, mappings); + m = sr.getResults().get(0); + assertEquals(exons.getSequenceAt(1).getDatasetSequence(), + m.getSequence()); + assertEquals(7, m.getStart()); + assertEquals(9, m.getEnd()); + } + + /** + * Test the method that makes an exon-only sequence from a DNA sequence and + * its product mapping. Test includes the expected case that the DNA sequence + * already has a protein product (Uniprot translation) which in turn has an + * x-ref to the EMBLCDS record. + */ + @Test + public void testMakeExonSequence() + { + SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa"); + SequenceI pep1 = new Sequence("pep1", "GF"); + dna1.createDatasetSequence(); + pep1.createDatasetSequence(); + pep1.getDatasetSequence().addDBRef( + new DBRefEntry("EMBLCDS", "2", "A12345")); + + /* + * Make the mapping from dna to protein. The protein sequence has a DBRef to + * EMBLCDS|A12345. + */ + Set mappings = new HashSet(); + MapList map = new MapList(new int[] + { 4, 6, 10, 12 }, new int[] + { 1, 2 }, 3, 1); + AlignedCodonFrame acf = new AlignedCodonFrame(); + acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map); + mappings.add(acf); + + AlignedCodonFrame newMapping = new AlignedCodonFrame(); + SequenceI exon = AlignmentUtils.makeExonSequence(dna1, acf, newMapping); + + assertEquals("GGGTTT", exon.getSequenceAsString()); + assertEquals("dna1|A12345", exon.getName()); + assertEquals(1, exon.getDBRef().length); + DBRefEntry cdsRef = exon.getDBRef()[0]; + assertEquals("EMBLCDS", cdsRef.getSource()); + assertEquals("2", cdsRef.getVersion()); + assertEquals("A12345", cdsRef.getAccessionId()); + + } } diff --git a/test/jalview/analysis/CrossRefTest.java b/test/jalview/analysis/CrossRefTest.java new file mode 100644 index 0000000..35606f0 --- /dev/null +++ b/test/jalview/analysis/CrossRefTest.java @@ -0,0 +1,46 @@ +package jalview.analysis; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertSame; + +import org.junit.Test; + +import jalview.datamodel.DBRefEntry; + +public class CrossRefTest +{ + @Test + public void testFindXDbRefs() + { + DBRefEntry ref1 = new DBRefEntry("UNIPROT", "1", "A123"); + DBRefEntry ref2 = new DBRefEntry("UNIPROTKB/TREMBL", "1", "A123"); + DBRefEntry ref3 = new DBRefEntry("pdb", "1", "A123"); + DBRefEntry ref4 = new DBRefEntry("EMBLCDSPROTEIN", "1", "A123"); + DBRefEntry ref5 = new DBRefEntry("embl", "1", "A123"); + DBRefEntry ref6 = new DBRefEntry("emblCDS", "1", "A123"); + DBRefEntry ref7 = new DBRefEntry("GeneDB", "1", "A123"); + DBRefEntry ref8 = new DBRefEntry("PFAM", "1", "A123"); + DBRefEntry[] refs = new DBRefEntry[] + { ref1, ref2, ref3, ref4, ref5, ref6, ref7, ref8 }; + + /* + * Just the DNA refs: + */ + DBRefEntry[] found = CrossRef.findXDbRefs(false, refs); + assertEquals(3, found.length); + assertSame(ref5, found[0]); + assertSame(ref6, found[1]); + assertSame(ref7, found[2]); + + /* + * Just the protein refs: + */ + found = CrossRef.findXDbRefs(true, refs); + assertEquals(4, found.length); + assertSame(ref1, found[0]); + assertSame(ref2, found[1]); + assertSame(ref3, found[2]); + assertSame(ref4, found[3]); + } + +} diff --git a/test/jalview/util/DBRefUtilsTest.java b/test/jalview/util/DBRefUtilsTest.java new file mode 100644 index 0000000..e606665 --- /dev/null +++ b/test/jalview/util/DBRefUtilsTest.java @@ -0,0 +1,222 @@ +package jalview.util; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; + +import org.junit.Test; + +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.Mapping; +import jalview.datamodel.PDBEntry; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceI; + +public class DBRefUtilsTest +{ + + /** + * Test the method that selects DBRefEntry items whose source is in a supplied + * list + */ + @Test + public void testSelectRefs() + { + assertNull(DBRefUtils.selectRefs(null, null)); + assertNull(DBRefUtils.selectRefs(null, DBRefSource.CODINGDBS)); + + DBRefEntry ref1 = new DBRefEntry("EMBL", "1.2", "A12345"); + DBRefEntry ref2 = new DBRefEntry("UNIPROT", "1.2", "A12346"); + // Source is converted to upper-case by this constructor! + DBRefEntry ref3 = new DBRefEntry("Uniprot", "1.2", "A12347"); + DBRefEntry[] dbrefs = new DBRefEntry[] + { ref1, ref2, ref3 }; + String[] sources = new String[] + { "EMBL", "UNIPROT" }; + + DBRefEntry[] selected = DBRefUtils.selectRefs(dbrefs, sources); + assertEquals(3, selected.length); + assertSame(ref1, selected[0]); + assertSame(ref2, selected[1]); + assertSame(ref3, selected[2]); + + sources = new String[] + { "EMBL" }; + selected = DBRefUtils.selectRefs(dbrefs, sources); + assertEquals(1, selected.length); + assertSame(ref1, selected[0]); + + sources = new String[] + { "UNIPROT" }; + selected = DBRefUtils.selectRefs(dbrefs, sources); + assertEquals(2, selected.length); + assertSame(ref2, selected[0]); + assertSame(ref3, selected[1]); + + sources = new String[] + { "Uniprot", "EMBLCDS" }; + selected = DBRefUtils.selectRefs(dbrefs, sources); + assertNull(selected); + } + + /** + * Test the method that converts (currently three) database names to a + * canonical name (not case-sensitive) + */ + @Test + public void testGetCanonicalName() + { + assertNull(DBRefUtils.getCanonicalName(null)); + assertEquals("", DBRefUtils.getCanonicalName("")); + assertEquals("PDB", DBRefUtils.getCanonicalName("pdb")); + assertEquals("PDB", DBRefUtils.getCanonicalName("Pdb")); + assertEquals("UNIPROT", + DBRefUtils.getCanonicalName("uniprotkb/swiss-prot")); + assertEquals("UNIPROT", DBRefUtils.getCanonicalName("uniprotkb/trembl")); + assertEquals("UNIPROT", + DBRefUtils.getCanonicalName("UNIPROTKB/SWISS-PROT")); + assertEquals("UNIPROT", DBRefUtils.getCanonicalName("UNIPROTKB/TREMBL")); + assertEquals("UNIPROTKB/SWISS-CHEESE", + DBRefUtils.getCanonicalName("UNIPROTKB/SWISS-CHEESE")); + } + + @Test + public void testIsDasCoordinateSystem() + { + assertFalse(DBRefUtils.isDasCoordinateSystem(null, null)); + assertFalse(DBRefUtils.isDasCoordinateSystem("pdbresnum", null)); + assertFalse(DBRefUtils.isDasCoordinateSystem(null, new DBRefEntry( + "PDB", "v1", "a1"))); + + assertTrue(DBRefUtils.isDasCoordinateSystem("pdbresnum", + new DBRefEntry("PDB", "v1", "a1"))); + assertTrue(DBRefUtils.isDasCoordinateSystem("PDBRESNUM", + new DBRefEntry("PDB", "v1", "a1"))); + // "pdb" is converted to upper-case in DBRefEntry constructor + assertTrue(DBRefUtils.isDasCoordinateSystem("pdbresnum", + new DBRefEntry("pdb", "v1", "a1"))); + assertFalse(DBRefUtils.isDasCoordinateSystem("pdb", new DBRefEntry( + "pdb", "v1", "a1"))); + + assertTrue(DBRefUtils.isDasCoordinateSystem("UNIPROT", new DBRefEntry( + "Uniprot", "v1", "a1"))); + assertTrue(DBRefUtils.isDasCoordinateSystem("Uniprot", new DBRefEntry( + "UNIPROT", "v1", "a1"))); + assertFalse(DBRefUtils.isDasCoordinateSystem("UNIPROTKB", + new DBRefEntry( + "pdb", "v1", "a1"))); + + assertTrue(DBRefUtils.isDasCoordinateSystem("EMBL", new DBRefEntry( + "EMBL", "v1", "a1"))); + assertTrue(DBRefUtils.isDasCoordinateSystem("embl", new DBRefEntry( + "embl", "v1", "a1"))); + } + + /** + * Test 'parsing' a DBRef - non PDB case + */ + @Test + public void testParseToDbRef() + { + SequenceI seq = new Sequence("Seq1", "ABCD"); + DBRefEntry ref = DBRefUtils.parseToDbRef(seq, "EMBL", "1.2", "a7890"); + DBRefEntry[] refs = seq.getDBRef(); + assertEquals(1, refs.length); + assertSame(ref, refs[0]); + assertEquals("EMBL", ref.getSource()); + assertEquals("1.2", ref.getVersion()); + assertEquals("a7890", ref.getAccessionId()); + assertNull(seq.getPDBId()); + } + + /** + * Test 'parsing' a DBRef - Stockholm PDB format + */ + @Test + public void testParseToDbRef_PDB() + { + SequenceI seq = new Sequence("Seq1", "ABCD"); + DBRefEntry ref = DBRefUtils.parseToDbRef(seq, "pdb", "1.2", + "1WRI A; 7-80;"); + DBRefEntry[] refs = seq.getDBRef(); + assertEquals(1, refs.length); + assertSame(ref, refs[0]); + assertEquals("PDB", ref.getSource()); + assertEquals("1.2", ref.getVersion()); + // DBRef id is pdbId + chain code + assertEquals("1WRIA", ref.getAccessionId()); + assertEquals(1, seq.getPDBId().size()); + PDBEntry pdbRef = seq.getPDBId().get(0); + assertEquals("1WRI", pdbRef.getId()); + assertNull(pdbRef.getFile()); + assertEquals("A", pdbRef.getProperty().get("CHAIN")); + assertNull(pdbRef.getType()); + } + + /** + * Test the method that searches for matches references - case when we are + * matching a reference with no mappings + */ + @Test + public void testSearchRefs_noMapping() + { + DBRefEntry target = new DBRefEntry("EMBL", "2", "A1234"); + + DBRefEntry ref1 = new DBRefEntry("EMBL", "1", "A1234"); // matches + // constructor changes embl to EMBL + DBRefEntry ref2 = new DBRefEntry("embl", "1", "A1234"); // matches + // constructor does not upper-case accession id + DBRefEntry ref3 = new DBRefEntry("EMBL", "1", "a1234"); // no match + DBRefEntry ref4 = new DBRefEntry("EMBLCDS", "1", "A1234"); // no match + // ref5 matches although it has a mapping - ignored + DBRefEntry ref5 = new DBRefEntry("EMBL", "1", "A1234"); + ref5.setMap(new Mapping(new MapList(new int[] + { 1, 1 }, new int[] + { 1, 1 }, 1, 1))); + + DBRefEntry[] matches = DBRefUtils.searchRefs(new DBRefEntry[] + { ref1, ref2, ref3, ref4, ref5 }, target); + assertEquals(3, matches.length); + assertSame(ref1, matches[0]); + assertSame(ref2, matches[1]); + assertSame(ref5, matches[2]); + } + + /** + * Test the method that searches for matches references - case when we are + * matching a reference with a mapping + */ + @Test + public void testSearchRefs_withMapping() + { + DBRefEntry target = new DBRefEntry("EMBL", "2", "A1234"); + final Mapping map1 = new Mapping(new MapList(new int[] + { 1, 1 }, new int[] + { 1, 1 }, 1, 1)); + target.setMap(map1); + + // these all match target iff mappings match + DBRefEntry ref1 = new DBRefEntry("EMBL", "1", "A1234"); // no map: matches + DBRefEntry ref2 = new DBRefEntry("EMBL", "1", "A1234"); // =map: matches + final Mapping map2 = new Mapping(new MapList(new int[] + { 1, 1 }, new int[] + { 1, 1 }, 1, 1)); + ref2.setMap(map2); + + // different map: no match + DBRefEntry ref3 = new DBRefEntry("EMBL", "1", "A1234"); + final Mapping map3 = new Mapping(new MapList(new int[] + { 1, 1 }, new int[] + { 1, 1 }, 2, 2)); + ref3.setMap(map3); + + DBRefEntry[] matches = DBRefUtils.searchRefs(new DBRefEntry[] + { ref1, ref2, ref3 }, target); + assertEquals(2, matches.length); + assertSame(ref1, matches[0]); + assertSame(ref2, matches[1]); + } +} diff --git a/test/jalview/util/MappingUtilsTest.java b/test/jalview/util/MappingUtilsTest.java index 4e144fc..c86259b 100644 --- a/test/jalview/util/MappingUtilsTest.java +++ b/test/jalview/util/MappingUtilsTest.java @@ -90,7 +90,7 @@ public class MappingUtilsTest * Simple test of mapping with introns involved. */ @Test - public void testBuildSearchResults_withIntro() + public void testBuildSearchResults_withIntron() { final Sequence seq1 = new Sequence("Seq1", "C-G-TAGA-GCAGCTT"); seq1.createDatasetSequence(); -- 1.7.10.2