From 7c7ff05445948740df781678899b1be99d709a1f Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 19 Oct 2017 15:02:45 +0100 Subject: [PATCH] JAL-2755 find gene loci for fetched xrefs where possible --- src/jalview/gui/AlignFrame.java | 2 +- src/jalview/gui/CrossRefAction.java | 377 +++++++++++++++++++++++++------- test/jalview/io/CrossRef2xmlTests.java | 29 ++- 3 files changed, 315 insertions(+), 93 deletions(-) diff --git a/src/jalview/gui/AlignFrame.java b/src/jalview/gui/AlignFrame.java index 95cabcd..f6b8392 100644 --- a/src/jalview/gui/AlignFrame.java +++ b/src/jalview/gui/AlignFrame.java @@ -4261,7 +4261,7 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener, protected void showProductsFor(final SequenceI[] sel, final boolean _odna, final String source) { - new Thread(CrossRefAction.showProductsFor(sel, _odna, source, this)) + new Thread(CrossRefAction.getHandlerFor(sel, _odna, source, this)) .start(); } diff --git a/src/jalview/gui/CrossRefAction.java b/src/jalview/gui/CrossRefAction.java index 2d1dfd4..21a0a84 100644 --- a/src/jalview/gui/CrossRefAction.java +++ b/src/jalview/gui/CrossRefAction.java @@ -27,17 +27,25 @@ import jalview.api.FeatureSettingsModelI; import jalview.bin.Cache; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; +import jalview.datamodel.GeneLociI; import jalview.datamodel.SequenceI; +import jalview.ext.ensembl.EnsemblInfo; +import jalview.ext.ensembl.EnsemblMap; import jalview.io.gff.SequenceOntologyI; import jalview.structure.StructureSelectionManager; +import jalview.util.DBRefUtils; +import jalview.util.MapList; +import jalview.util.MappingUtils; import jalview.util.MessageManager; import jalview.ws.SequenceFetcher; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; - -import javax.swing.JOptionPane; +import java.util.Map; +import java.util.Set; /** * Factory constructor and runnable for discovering and displaying @@ -52,13 +60,13 @@ public class CrossRefAction implements Runnable private SequenceI[] sel; - private boolean _odna; + private final boolean _odna; private String source; - List xrefViews = new ArrayList(); + List xrefViews = new ArrayList<>(); - public List getXrefViews() + List getXrefViews() { return xrefViews; } @@ -90,6 +98,13 @@ public class CrossRefAction implements Runnable { return; } + + /* + * try to look up chromosomal coordinates for nucleotide + * sequences (if not already retrieved) + */ + findGeneLoci(xrefs.getSequences()); + /* * get display scheme (if any) to apply to features */ @@ -113,75 +128,14 @@ public class CrossRefAction implements Runnable if (Cache.getDefault(Preferences.ENABLE_SPLIT_FRAME, true)) { - boolean copyAlignmentIsAligned = false; - if (dna) - { - copyAlignment = AlignmentUtils.makeCdsAlignment(sel, dataset, - xrefsAlignment.getSequencesArray()); - if (copyAlignment.getHeight() == 0) - { - JvOptionPane.showMessageDialog(alignFrame, - MessageManager.getString("label.cant_map_cds"), - MessageManager.getString("label.operation_failed"), - JvOptionPane.OK_OPTION); - System.err.println("Failed to make CDS alignment"); - } - - /* - * pending getting Embl transcripts to 'align', - * we are only doing this for Ensembl - */ - // TODO proper criteria for 'can align as cdna' - if (DBRefSource.ENSEMBL.equalsIgnoreCase(source) - || AlignmentUtils.looksLikeEnsembl(alignment)) - { - copyAlignment.alignAs(alignment); - copyAlignmentIsAligned = true; - } - } - else + copyAlignment = copyAlignmentForSplitFrame(alignment, dataset, dna, + xrefs, xrefsAlignment); + if (copyAlignment == null) { - copyAlignment = AlignmentUtils.makeCopyAlignment(sel, - xrefs.getSequencesArray(), dataset); - } - copyAlignment - .setGapCharacter(alignFrame.viewport.getGapCharacter()); - - StructureSelectionManager ssm = StructureSelectionManager - .getStructureSelectionManager(Desktop.instance); - - /* - * register any new mappings for sequence mouseover etc - * (will not duplicate any previously registered mappings) - */ - ssm.registerMappings(dataset.getCodonFrames()); - - if (copyAlignment.getHeight() <= 0) - { - System.err.println( - "No Sequences generated for xRef type " + source); - return; - } - /* - * align protein to dna - */ - if (dna && copyAlignmentIsAligned) - { - xrefsAlignment.alignAs(copyAlignment); - } - else - { - /* - * align cdna to protein - currently only if - * fetching and aligning Ensembl transcripts! - */ - // TODO: generalise for other sources of locus/transcript/cds data - if (dna && DBRefSource.ENSEMBL.equalsIgnoreCase(source)) - { - copyAlignment.alignAs(xrefsAlignment); - } + return; // failed } } + /* * build AlignFrame(s) according to available alignment data */ @@ -207,6 +161,7 @@ public class CrossRefAction implements Runnable xrefViews.add(newFrame.alignPanel); return; // via finally clause } + AlignFrame copyThis = new AlignFrame(copyAlignment, AlignFrame.DEFAULT_WIDTH, AlignFrame.DEFAULT_HEIGHT); copyThis.setTitle(alignFrame.getTitle()); @@ -263,6 +218,260 @@ public class CrossRefAction implements Runnable } /** + * Tries to add chromosomal coordinates to any nucleotide sequence which does + * not already have them. Coordinates are retrieved from Ensembl given an + * Ensembl identifier, either on the sequence itself or on a peptide sequence + * it has a reference to. + * + *
+   * Example (human):
+   * - fetch EMBLCDS cross-references for Uniprot entry P30419
+   * - the EMBL sequences do not have xrefs to Ensembl
+   * - the Uniprot entry has xrefs to 
+   *    ENSP00000258960, ENSP00000468424, ENST00000258960, ENST00000592782
+   * - either of the transcript ids can be used to retrieve gene loci e.g.
+   *    http://rest.ensembl.org/map/cds/ENST00000592782/1..100000
+   * Example (invertebrate):
+   * - fetch EMBLCDS cross-references for Uniprot entry Q43517 (FER1_SOLLC)
+   * - the Uniprot entry has an xref to ENSEMBLPLANTS Solyc10g044520.1.1
+   * - can retrieve gene loci with
+   *    http://rest.ensemblgenomes.org/map/cds/Solyc10g044520.1.1/1..100000
+   * 
+ * + * @param sequences + */ + public static void findGeneLoci(List sequences) + { + Map retrievedLoci = new HashMap<>(); + for (SequenceI seq : sequences) + { + findGeneLoci(seq, retrievedLoci); + } + } + + /** + * Tres to find chromosomal coordinates for the sequence, by searching its + * direct and indirect cross-references for Ensembl. If the loci have already + * been retrieved, just reads them out of the map of retrievedLoci; this is + * the case of an alternative transcript for the same protein. Otherwise calls + * a REST service to retrieve the loci, and if successful, adds them to the + * sequence and to the retrievedLoci. + * + * @param seq + * @param retrievedLoci + */ + static void findGeneLoci(SequenceI seq, + Map retrievedLoci) + { + /* + * don't replace any existing chromosomal coordinates + */ + if (seq == null || seq.isProtein() || seq.getGeneLoci() != null + || seq.getDBRefs() == null) + { + return; + } + + Set ensemblDivisions = new EnsemblInfo().getDivisions(); + + /* + * first look for direct dbrefs from sequence to Ensembl + */ + String[] divisionsArray = ensemblDivisions + .toArray(new String[ensemblDivisions.size()]); + DBRefEntry[] seqRefs = seq.getDBRefs(); + DBRefEntry[] directEnsemblRefs = DBRefUtils.selectRefs(seqRefs, + divisionsArray); + if (directEnsemblRefs != null) + { + for (DBRefEntry ensemblRef : directEnsemblRefs) + { + if (fetchGeneLoci(seq, ensemblRef, retrievedLoci)) + { + return; + } + } + } + + /* + * else look for indirect dbrefs from sequence to Ensembl + */ + for (DBRefEntry dbref : seq.getDBRefs()) + { + if (dbref.getMap() != null && dbref.getMap().getTo() != null) + { + DBRefEntry[] dbrefs = dbref.getMap().getTo().getDBRefs(); + DBRefEntry[] indirectEnsemblRefs = DBRefUtils.selectRefs(dbrefs, + divisionsArray); + if (indirectEnsemblRefs != null) + { + for (DBRefEntry ensemblRef : indirectEnsemblRefs) + { + if (fetchGeneLoci(seq, ensemblRef, retrievedLoci)) + { + return; + } + } + } + } + } + } + + /** + * Retrieves chromosomal coordinates for the Ensembl (or EnsemblGenomes) + * identifier in dbref. If successful, and the sequence length matches gene + * loci length, then add it to the sequence, and to the retrievedLoci map. + * Answers true if successful, else false. + * + * @param seq + * @param dbref + * @param retrievedLoci + * @return + */ + static boolean fetchGeneLoci(SequenceI seq, DBRefEntry dbref, + Map retrievedLoci) + { + String accession = dbref.getAccessionId(); + String division = dbref.getSource(); + + /* + * hack: ignore cross-references to Ensembl protein ids + * (can't fetch chromosomal mapping for these) + * todo: is there an equivalent in EnsemblGenomes? + */ + if (accession.startsWith("ENSP")) + { + return false; + } + EnsemblMap mapper = new EnsemblMap(); + + /* + * try CDS mapping first + */ + GeneLociI geneLoci = mapper.getCdsMapping(division, accession, 1, + seq.getLength()); + if (geneLoci != null) + { + MapList map = geneLoci.getMap(); + int mappedFromLength = MappingUtils.getLength(map.getFromRanges()); + if (mappedFromLength == seq.getLength()) + { + seq.setGeneLoci(geneLoci.getSpeciesId(), geneLoci.getAssemblyId(), + geneLoci.getChromosomeId(), geneLoci.getMap()); + retrievedLoci.put(dbref, geneLoci); + return true; + } + } + + /* + * else try CDNA mapping + */ + geneLoci = mapper.getCdnaMapping(division, accession, 1, + seq.getLength()); + if (geneLoci != null) + { + MapList map = geneLoci.getMap(); + int mappedFromLength = MappingUtils.getLength(map.getFromRanges()); + if (mappedFromLength == seq.getLength()) + { + seq.setGeneLoci(geneLoci.getSpeciesId(), geneLoci.getAssemblyId(), + geneLoci.getChromosomeId(), geneLoci.getMap()); + retrievedLoci.put(dbref, geneLoci); + return true; + } + } + + return false; + } + + /** + * @param alignment + * @param dataset + * @param dna + * @param xrefs + * @param xrefsAlignment + * @return + */ + protected AlignmentI copyAlignmentForSplitFrame(AlignmentI alignment, + AlignmentI dataset, boolean dna, AlignmentI xrefs, + AlignmentI xrefsAlignment) + { + AlignmentI copyAlignment; + boolean copyAlignmentIsAligned = false; + if (dna) + { + copyAlignment = AlignmentUtils.makeCdsAlignment(sel, dataset, + xrefsAlignment.getSequencesArray()); + if (copyAlignment.getHeight() == 0) + { + JvOptionPane.showMessageDialog(alignFrame, + MessageManager.getString("label.cant_map_cds"), + MessageManager.getString("label.operation_failed"), + JvOptionPane.OK_OPTION); + System.err.println("Failed to make CDS alignment"); + return null; + } + + /* + * pending getting Embl transcripts to 'align', + * we are only doing this for Ensembl + */ + // TODO proper criteria for 'can align as cdna' + if (DBRefSource.ENSEMBL.equalsIgnoreCase(source) + || AlignmentUtils.looksLikeEnsembl(alignment)) + { + copyAlignment.alignAs(alignment); + copyAlignmentIsAligned = true; + } + } + else + { + copyAlignment = AlignmentUtils.makeCopyAlignment(sel, + xrefs.getSequencesArray(), dataset); + } + copyAlignment + .setGapCharacter(alignFrame.viewport.getGapCharacter()); + + StructureSelectionManager ssm = StructureSelectionManager + .getStructureSelectionManager(Desktop.instance); + + /* + * register any new mappings for sequence mouseover etc + * (will not duplicate any previously registered mappings) + */ + ssm.registerMappings(dataset.getCodonFrames()); + + if (copyAlignment.getHeight() <= 0) + { + System.err.println( + "No Sequences generated for xRef type " + source); + return null; + } + + /* + * align protein to dna + */ + if (dna && copyAlignmentIsAligned) + { + xrefsAlignment.alignAs(copyAlignment); + } + else + { + /* + * align cdna to protein - currently only if + * fetching and aligning Ensembl transcripts! + */ + // TODO: generalise for other sources of locus/transcript/cds data + if (dna && DBRefSource.ENSEMBL.equalsIgnoreCase(source)) + { + copyAlignment.alignAs(xrefsAlignment); + } + } + + return copyAlignment; + } + + /** * Makes an alignment containing the given sequences, and adds them to the * given dataset, which is also set as the dataset for the new alignment * @@ -291,20 +500,28 @@ public class CrossRefAction implements Runnable return al; } - public CrossRefAction(AlignFrame alignFrame, SequenceI[] sel, - boolean _odna, String source) + /** + * Constructor + * + * @param af + * @param seqs + * @param fromDna + * @param dbSource + */ + CrossRefAction(AlignFrame af, SequenceI[] seqs, boolean fromDna, + String dbSource) { - this.alignFrame = alignFrame; - this.sel = sel; - this._odna = _odna; - this.source = source; + this.alignFrame = af; + this.sel = seqs; + this._odna = fromDna; + this.source = dbSource; } - public static CrossRefAction showProductsFor(final SequenceI[] sel, - final boolean _odna, final String source, + public static CrossRefAction getHandlerFor(final SequenceI[] sel, + final boolean fromDna, final String source, final AlignFrame alignFrame) { - return new CrossRefAction(alignFrame, sel, _odna, source); + return new CrossRefAction(alignFrame, sel, fromDna, source); } } diff --git a/test/jalview/io/CrossRef2xmlTests.java b/test/jalview/io/CrossRef2xmlTests.java index 0715857..b3db4de 100644 --- a/test/jalview/io/CrossRef2xmlTests.java +++ b/test/jalview/io/CrossRef2xmlTests.java @@ -39,6 +39,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Map; + +import junit.extensions.PA; import org.testng.Assert; import org.testng.annotations.BeforeClass; @@ -90,9 +93,9 @@ public class CrossRef2xmlTests extends Jalview2xmlBase // . codonframes // // - HashMap dbtoviewBit = new HashMap<>(); + Map dbtoviewBit = new HashMap<>(); List keyseq = new ArrayList<>(); - HashMap savedProjects = new HashMap<>(); + Map savedProjects = new HashMap<>(); for (String[] did : new String[][] { { "UNIPROT", "P00338" } }) { @@ -186,15 +189,16 @@ public class CrossRef2xmlTests extends Jalview2xmlBase if (pass2 == 0) { // retrieve and show cross-refs in this thread - cra = new CrossRefAction(af, seqs, dna, db); + cra = CrossRefAction.getHandlerFor(seqs, dna, db, af); cra.run(); - if (cra.getXrefViews().size() == 0) + cra_views = (List) PA.getValue(cra, + "xrefViews"); + if (cra_views.size() == 0) { failedXrefMenuItems.add("No crossrefs retrieved for " + first + " -> " + db); continue; } - cra_views = cra.getXrefViews(); assertNucleotide(cra_views.get(0), "Nucleotide panel included proteins for " + first + " -> " + db); @@ -286,16 +290,18 @@ public class CrossRef2xmlTests extends Jalview2xmlBase if (pass3 == 0) { - SequenceI[] xrseqs = avp.getAlignment() .getSequencesArray(); AlignFrame nextaf = Desktop.getAlignFrameFor(avp .getAlignViewport()); - cra = new CrossRefAction(nextaf, xrseqs, avp - .getAlignViewport().isNucleotide(), xrefdb); + cra = CrossRefAction.getHandlerFor(xrseqs, avp + .getAlignViewport().isNucleotide(), xrefdb, + nextaf); cra.run(); - if (cra.getXrefViews().size() == 0) + cra_views2 = (List) PA.getValue( + cra, "xrefViews"); + if (cra_views2.size() == 0) { failedXrefMenuItems .add("No crossrefs retrieved for '" @@ -303,7 +309,6 @@ public class CrossRef2xmlTests extends Jalview2xmlBase + " via '" + nextaf.getTitle() + "'"); continue; } - cra_views2 = cra.getXrefViews(); assertNucleotide(cra_views2.get(0), "Nucleotide panel included proteins for '" + nextxref + "' to " + xrefdb @@ -541,8 +546,8 @@ public class CrossRef2xmlTests extends Jalview2xmlBase * viewpanel needs to be called with a distinct xrefpath to ensure * each one's strings are compared) */ - private void stringify(HashMap dbtoviewBit, - HashMap savedProjects, String xrefpath, + private void stringify(Map dbtoviewBit, + Map savedProjects, String xrefpath, AlignmentViewPanel avp) { if (savedProjects != null) -- 1.7.10.2