X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblSeqProxy.java;h=82ebc4ba47251212cb2aac9b8df33e13301ca635;hb=a1984b1c8c273ed33c7ce9283039f4027dcae2de;hp=24d1b95304412e428b731ca6b78454cc7b97e04c;hpb=17ff1f476e009b3a3c7e892e416edc2a4af8a2bc;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 24d1b95..82ebc4b 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -20,6 +20,17 @@ */ package jalview.ext.ensembl; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.json.simple.parser.ParseException; + import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.bin.Cache; @@ -28,12 +39,12 @@ import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.Mapping; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.datamodel.features.SequenceFeatures; import jalview.exceptions.JalviewException; -import jalview.io.FastaFile; -import jalview.io.FileParse; +import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; import jalview.util.Comparison; @@ -41,14 +52,6 @@ import jalview.util.DBRefUtils; import jalview.util.IntRangeComparator; import jalview.util.MapList; -import java.io.IOException; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - /** * Base class for Ensembl sequence fetchers * @@ -57,10 +60,6 @@ import java.util.List; */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { - private static final String ALLELES = "alleles"; - - protected static final String NAME = "Name"; - protected static final String DESCRIPTION = "description"; /* @@ -171,14 +170,15 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * fetch and transfer genomic sequence features, * fetch protein product and add as cross-reference */ - for (String accId : allIds) + for (int i = 0, n = allIds.size(); i < n; i++) { - addFeaturesAndProduct(accId, alignment); + addFeaturesAndProduct(allIds.get(i), alignment); } - for (SequenceI seq : alignment.getSequences()) + List seqs = alignment.getSequences(); + for (int i = 0, n = seqs.size(); i < n; i++) { - getCrossReferences(seq); + getCrossReferences(seqs.get(i)); } return alignment; @@ -203,23 +203,30 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient try { /* - * get 'dummy' genomic sequence with exon, cds and variation features + * get 'dummy' genomic sequence with gene, transcript, + * exon, cds and variation features */ SequenceI genomicSequence = null; EnsemblFeatures gffFetcher = new EnsemblFeatures(getDomain()); - EnsemblFeatureType[] features = getFeaturesToFetch(); + EnsemblFeatureType[] features = getFeaturesToFetch(); + + // Platform.timeCheck("ESP.getsequencerec1", Platform.TIME_MARK); + AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); if (geneFeatures != null && geneFeatures.getHeight() > 0) { genomicSequence = geneFeatures.getSequenceAt(0); } + + // Platform.timeCheck("ESP.getsequencerec2", Platform.TIME_MARK); + if (genomicSequence != null) { /* * transfer features to the query sequence */ - SequenceI querySeq = alignment.findName(accId); + SequenceI querySeq = alignment.findName(accId, true); if (transferFeatures(accId, genomicSequence, querySeq)) { @@ -227,6 +234,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * fetch and map protein product, and add it as a cross-reference * of the retrieved sequence */ + // Platform.timeCheck("ESP.transferFeatures", Platform.TIME_MARK); addProteinProduct(querySeq); } } @@ -235,6 +243,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient System.err.println( "Error transferring Ensembl features: " + e.getMessage()); } + // Platform.timeCheck("ESP.addfeat done", Platform.TIME_MARK); } /** @@ -256,6 +265,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String accId = querySeq.getName(); try { + System.out.println("Adding protein product for " + accId); AlignmentI protein = new EnsemblProtein(getDomain()) .getSequenceRecords(accId); if (protein == null || protein.getHeight() == 0) @@ -282,10 +292,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient DBRefEntry dbr = new DBRefEntry(getDbSource(), getEnsemblDataVersion(), proteinSeq.getName(), map); querySeq.getDatasetSequence().addDBRef(dbr); - DBRefEntry[] uprots = DBRefUtils.selectRefs(ds.getDBRefs(), + List uprots = DBRefUtils.selectRefs(ds.getDBRefs(), new String[] { DBRefSource.UNIPROT }); - DBRefEntry[] upxrefs = DBRefUtils.selectRefs(querySeq.getDBRefs(), + List upxrefs = DBRefUtils.selectRefs(querySeq.getDBRefs(), new String[] { DBRefSource.UNIPROT }); if (uprots != null) @@ -302,7 +312,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient if (upx.size() > 1) { - Cache.log.warn( + Cache.warn( "Implementation issue - multiple uniprot acc on product sequence."); } } @@ -328,8 +338,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * copy exon features to protein, compute peptide variants from dna * variants and add as features on the protein sequence ta-da */ - AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, - mapList); + // JAL-3187 render on the fly instead + // AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList); } } catch (Exception e) { @@ -346,25 +356,46 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ protected void getCrossReferences(SequenceI seq) { + + // Platform.timeCheck("ESP. getdataseq ", Platform.TIME_MARK); + + while (seq.getDatasetSequence() != null) { seq = seq.getDatasetSequence(); } + // Platform.timeCheck("ESP. getxref ", Platform.TIME_MARK); + EnsemblXref xrefFetcher = new EnsemblXref(getDomain(), getDbSource(), getEnsemblDataVersion()); List xrefs = xrefFetcher.getCrossReferences(seq.getName()); - for (DBRefEntry xref : xrefs) + + for (int i = 0, n = xrefs.size(); i < n; i++) { - seq.addDBRef(xref); +// Platform.timeCheck("ESP. getxref + " + (i) + "/" + n, Platform.TIME_MARK); + // BH 2019.01.25 this next method was taking 174 ms PER addition for a 266-reference example. + // DBRefUtils.ensurePrimaries(seq) + // was at the end of seq.addDBRef, so executed after ever addition! + // This method was moved to seq.getPrimaryDBRefs() + seq.addDBRef(xrefs.get(i)); } +// System.out.println("primaries are " + seq.getPrimaryDBRefs().toString()); /* * and add a reference to itself */ + +// Platform.timeCheck("ESP. getxref self ", Platform.TIME_MARK); + DBRefEntry self = new DBRefEntry(getDbSource(), getEnsemblDataVersion(), - seq.getName()); + seq.getName()); + +// Platform.timeCheck("ESP. getxref self add ", Platform.TIME_MARK); + seq.addDBRef(self); + + // Platform.timeCheck("ESP. seqprox done ", Platform.TIME_MARK); } /** @@ -385,50 +416,42 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient inProgress = false; throw new JalviewException("ENSEMBL Rest API not available."); } - FileParse fp = getSequenceReader(ids); - if (fp == null) - { - return alignment; - } + // Platform.timeCheck("EnsemblSeqProx.fetchSeq ", Platform.TIME_MARK); - FastaFile fr = new FastaFile(fp); - if (fr.hasWarningMessage()) + List seqs = parseSequenceJson(ids); + if (seqs == null) + return alignment; + + if (seqs.isEmpty()) { - System.out.println( - String.format("Warning when retrieving %d ids %s\n%s", - ids.size(), ids.toString(), fr.getWarningMessage())); + throw new IOException("No data returned for " + ids); } - else if (fr.getSeqs().size() != ids.size()) + + if (seqs.size() != ids.size()) { System.out.println(String.format( "Only retrieved %d sequences for %d query strings", - fr.getSeqs().size(), ids.size())); + seqs.size(), ids.size())); } - if (fr.getSeqs().size() == 1 && fr.getSeqs().get(0).getLength() == 0) + if (!seqs.isEmpty()) { - /* - * POST request has returned an empty FASTA file e.g. for invalid id - */ - throw new IOException("No data returned for " + ids); - } - - if (fr.getSeqs().size() > 0) - { - AlignmentI seqal = new Alignment(fr.getSeqsAsArray()); - for (SequenceI sq : seqal.getSequences()) + AlignmentI seqal = new Alignment( + seqs.toArray(new SequenceI[seqs.size()])); + for (SequenceI seq : seqs) { - if (sq.getDescription() == null) + if (seq.getDescription() == null) { - sq.setDescription(getDbName()); + seq.setDescription(getDbName()); } - String name = sq.getName(); + String name = seq.getName(); if (ids.contains(name) || ids.contains(name.replace("ENSP", "ENST"))) { - DBRefEntry dbref = DBRefUtils.parseToDbRef(sq, getDbSource(), + // TODO JAL-3077 use true accession version in dbref + DBRefEntry dbref = DBRefUtils.parseToDbRef(seq, getDbSource(), getEnsemblDataVersion(), name); - sq.addDBRef(dbref); + seq.addDBRef(dbref); } } if (alignment == null) @@ -444,6 +467,53 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** + * Parses a JSON response for a single sequence ID query + * + * @param br + * @return a single jalview.datamodel.Sequence + * @see http://rest.ensembl.org/documentation/info/sequence_id + */ + @SuppressWarnings("unchecked") + protected List parseSequenceJson(List ids) + { + List result = new ArrayList<>(); + try + { + /* + * for now, assumes only one sequence returned; refactor if needed + * in future to handle a JSONArray with more than one + */ + // Platform.timeCheck("ENS seqproxy", Platform.TIME_MARK); + Map val = (Map) getJSON(null, ids, -1, MODE_MAP, null); + if (val == null) + return null; + Object s = val.get("desc"); + String desc = s == null ? null : s.toString(); + s = val.get("id"); + String id = s == null ? null : s.toString(); + s = val.get("seq"); + String seq = s == null ? null : s.toString(); + Sequence sequence = new Sequence(id, seq); + if (desc != null) + { + sequence.setDescription(desc); + } + // todo JAL-3077 make a DBRefEntry with true accession version + // s = val.get("version"); + // String version = s == null ? "0" : s.toString(); + // DBRefEntry dbref = new DBRefEntry(getDbSource(), version, id); + // sequence.addDBRef(dbref); + result.add(sequence); + } catch (ParseException | IOException e) + { + System.err.println("Error processing JSON response: " + e.toString()); + // ignore + } + // Platform.timeCheck("ENS seqproxy2", Platform.TIME_MARK); + return result; + } + + /** * Returns the URL for the REST call * * @return @@ -464,7 +534,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats urlstring.append("?type=").append(getSourceEnsemblType().getType()); - urlstring.append(("&Accept=text/x-fasta")); + urlstring.append(("&Accept=application/json")); + urlstring.append(("&content-type=application/json")); String objectType = getObjectType(); if (objectType != null) @@ -504,18 +575,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return false; } - @Override - protected String getRequestMimeType(boolean multipleIds) - { - return multipleIds ? "application/json" : "text/x-fasta"; - } - - @Override - protected String getResponseMimeType() - { - return "text/x-fasta"; - } - /** * * @return the configured sequence return type for this source @@ -551,8 +610,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence, String accId, int start) { - List sfs = sourceSequence.getFeatures() - .getPositionalFeatures(); + List sfs = getIdentifyingFeatures(sourceSequence, + accId); if (sfs.isEmpty()) { return null; @@ -569,47 +628,31 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient for (SequenceFeature sf : sfs) { + int strand = sf.getStrand(); + strand = strand == 0 ? 1 : strand; // treat unknown as forward + + if (directionSet && strand != direction) + { + // abort - mix of forward and backward + System.err + .println("Error: forward and backward strand for " + accId); + return null; + } + direction = strand; + directionSet = true; + /* - * accept the target feature type or a specialisation of it - * (e.g. coding_exon for exon) + * add to CDS ranges, semi-sorted forwards/backwards */ - if (identifiesSequence(sf, accId)) + if (strand < 0) { - int strand = sf.getStrand(); - strand = strand == 0 ? 1 : strand; // treat unknown as forward - - if (directionSet && strand != direction) - { - // abort - mix of forward and backward - System.err.println( - "Error: forward and backward strand for " + accId); - return null; - } - direction = strand; - directionSet = true; - - /* - * add to CDS ranges, semi-sorted forwards/backwards - */ - if (strand < 0) - { - regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); - } - else - { - regions.add(new int[] { sf.getBegin(), sf.getEnd() }); - } - mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); - - if (!isSpliceable()) - { - /* - * 'gene' sequence is contiguous so we can stop as soon as its - * identifying feature has been found - */ - break; - } + regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); } + else + { + regions.add(new int[] { sf.getBegin(), sf.getEnd() }); + } + mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); } if (regions.isEmpty()) @@ -634,27 +677,21 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Answers true if the sequence being retrieved may occupy discontiguous - * regions on the genomic sequence. - */ - protected boolean isSpliceable() - { - return true; - } - - /** - * Returns true if the sequence feature marks positions of the genomic + * Answers a list of sequence features that mark positions of the genomic * sequence feature which are within the sequence being retrieved. For * example, an 'exon' feature whose parent is the target transcript marks the - * cdna positions of the transcript. + * cdna positions of the transcript. For a gene sequence, this is trivially + * just the 'gene' feature with matching gene id. * - * @param sf + * @param seq * @param accId * @return */ - protected abstract boolean identifiesSequence(SequenceFeature sf, - String accId); - + protected abstract List getIdentifyingFeatures( + SequenceI seq, String accId); + + int bhtest = 0; + /** * Transfers the sequence feature to the target sequence, locating its start * and end range based on the mapping. Features which do not overlap the @@ -676,6 +713,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient if (mappedRange != null) { +// Platform.timeCheck(null, Platform.TIME_SET); String group = sf.getFeatureGroup(); if (".".equals(group)) { @@ -683,8 +721,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } int newBegin = Math.min(mappedRange[0], mappedRange[1]); int newEnd = Math.max(mappedRange[0], mappedRange[1]); - SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd, - group, sf.getScore()); +// Platform.timeCheck(null, Platform.TIME_MARK); + bhtest++; + // 280 ms/1000 here: + SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd, group, sf.getScore()); + // 0.175 ms here: targetSequence.addSequenceFeature(copy); /* @@ -707,7 +748,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ static void reverseComplementAlleles(SequenceFeature sf) { - final String alleles = (String) sf.getValue(ALLELES); + final String alleles = (String) sf.getValue(Gff3Helper.ALLELES); if (alleles == null) { return; @@ -718,19 +759,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient reverseComplementAllele(complement, allele); } String comp = complement.toString(); - sf.setValue(ALLELES, comp); + sf.setValue(Gff3Helper.ALLELES, comp); sf.setDescription(comp); - - /* - * replace value of "alleles=" in sf.ATTRIBUTES as well - * so 'output as GFF' shows reverse complement alleles - */ - String atts = sf.getAttributes(); - if (atts != null) - { - atts = atts.replace(ALLELES + "=" + alleles, ALLELES + "=" + comp); - sf.setAttributes(atts); - } } /** @@ -788,10 +818,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId, targetSequence.getStart()); if (mapping == null) - { + { return false; } + // Platform.timeCheck("ESP. xfer " + sfs.size(), Platform.TIME_MARK); + boolean result = transferFeatures(sfs, targetSequence, mapping, accessionId); // System.out.println("transferFeatures (" + (sfs.size()) + " --> " @@ -825,14 +857,23 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient SequenceFeatures.sortFeatures(sfs, forwardStrand); boolean transferred = false; - for (SequenceFeature sf : sfs) + + for (int i = 0, n = sfs.size(); i < n; i++) { + +// if ((i%1000) == 0) { +//// Platform.timeCheck("Feature " + bhtest, Platform.TIME_GET); +// Platform.timeCheck("ESP. xferFeature + " + (i) + "/" + n, Platform.TIME_MARK); +// } + + SequenceFeature sf = sfs.get(i); if (retainFeature(sf, parentId)) { transferFeature(sf, targetSequence, mapping, forwardStrand); transferred = true; } } + return transferred; } @@ -860,8 +901,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected boolean featureMayBelong(SequenceFeature sf, String identifier) { String parent = (String) sf.getValue(PARENT); - // using contains to allow for prefix "gene:", "transcript:" etc - if (parent != null && !parent.contains(identifier)) + if (parent != null + && !parent.equalsIgnoreCase(identifier)) { // this genomic feature belongs to a different transcript return false; @@ -869,6 +910,9 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return true; } + /** + * Answers a short description of the sequence fetcher + */ @Override public String getDescription() { @@ -896,7 +940,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient for (SequenceFeature sf : sfs) { String parent = (String) sf.getValue(PARENT); - if (parent != null && parent.equals(parentId)) + if (parent != null && parent.equalsIgnoreCase(parentId)) { result.add(sf); } @@ -907,10 +951,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /** * Answers true if the feature type is either 'NMD_transcript_variant' or - * 'transcript' or one of its sub-types in the Sequence Ontology. This is - * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl + * 'transcript' (or one of its sub-types in the Sequence Ontology). This is + * because NMD_transcript_variant behaves like 'transcript' in Ensembl * although strictly speaking it is not (it is a sub-type of * sequence_variant). + *

+ * (This test was needed when fetching transcript features as GFF. As we are + * now fetching as JSON, all features have type 'transcript' so the check for + * NMD_transcript_variant is redundant. Left in for any future case arising.) * * @param featureType * @return