X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblSeqProxy.java;h=7b448fd8a6389becba113ec2e2b40d6bb997b959;hb=ef84f77ebe6c73e67e8ec789b02f41891715ebdd;hp=dcf2eb43297455623dc471803e55b026841eaf32;hpb=5950febf61a94e527963a1eb68b0c59dd3387163;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index dcf2eb4..7b448fd 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -28,12 +28,12 @@ import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.Mapping; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.datamodel.features.SequenceFeatures; import jalview.exceptions.JalviewException; -import jalview.io.FastaFile; -import jalview.io.FileParse; +import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; import jalview.util.Comparison; @@ -41,6 +41,7 @@ import jalview.util.DBRefUtils; import jalview.util.IntRangeComparator; import jalview.util.MapList; +import java.io.BufferedReader; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; @@ -48,8 +49,10 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.Map; -import java.util.Map.Entry; + +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; /** * Base class for Ensembl sequence fetchers @@ -59,12 +62,6 @@ import java.util.Map.Entry; */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { - private static final String ALLELES = "alleles"; - - protected static final String PARENT = "Parent"; - - protected static final String ID = "ID"; - protected static final String NAME = "Name"; protected static final String DESCRIPTION = "description"; @@ -139,8 +136,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient // danger: accession separator used as a regex here, a string elsewhere // in this case it is ok (it is just a space), but (e.g.) '\' would not be - List allIds = Arrays.asList(query - .split(getAccessionSeparator())); + List allIds = Arrays + .asList(query.split(getAccessionSeparator())); AlignmentI alignment = null; inProgress = true; @@ -209,7 +206,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient try { /* - * get 'dummy' genomic sequence with exon, cds and variation features + * get 'dummy' genomic sequence with gene, transcript, + * exon, cds and variation features */ SequenceI genomicSequence = null; EnsemblFeatures gffFetcher = new EnsemblFeatures(getDomain()); @@ -225,7 +223,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /* * transfer features to the query sequence */ - SequenceI querySeq = alignment.findName(accId); + SequenceI querySeq = alignment.findName(accId, true); if (transferFeatures(accId, genomicSequence, querySeq)) { @@ -238,8 +236,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } } catch (IOException e) { - System.err.println("Error transferring Ensembl features: " - + e.getMessage()); + System.err.println( + "Error transferring Ensembl features: " + e.getMessage()); } } @@ -277,8 +275,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient proteinSeq.createDatasetSequence(); querySeq.createDatasetSequence(); - MapList mapList = AlignmentUtils - .mapCdsToProtein(querySeq, proteinSeq); + MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, + proteinSeq); if (mapList != null) { // clunky: ensure Uniprot xref if we have one is on mapped sequence @@ -289,9 +287,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient getEnsemblDataVersion(), proteinSeq.getName(), map); querySeq.getDatasetSequence().addDBRef(dbr); DBRefEntry[] uprots = DBRefUtils.selectRefs(ds.getDBRefs(), - new String[] { DBRefSource.UNIPROT }); + new String[] + { DBRefSource.UNIPROT }); DBRefEntry[] upxrefs = DBRefUtils.selectRefs(querySeq.getDBRefs(), - new String[] { DBRefSource.UNIPROT }); + new String[] + { DBRefSource.UNIPROT }); if (uprots != null) { for (DBRefEntry up : uprots) @@ -306,8 +306,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient if (upx.size() > 1) { - Cache.log - .warn("Implementation issue - multiple uniprot acc on product sequence."); + Cache.log.warn( + "Implementation issue - multiple uniprot acc on product sequence."); } } else @@ -332,8 +332,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * copy exon features to protein, compute peptide variants from dna * variants and add as features on the protein sequence ta-da */ - AlignmentUtils - .computeProteinFeatures(querySeq, proteinSeq, mapList); + AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, + mapList); } } catch (Exception e) { @@ -366,8 +366,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /* * and add a reference to itself */ - DBRefEntry self = new DBRefEntry(getDbSource(), - getEnsemblDataVersion(), seq.getName()); + DBRefEntry self = new DBRefEntry(getDbSource(), getEnsemblDataVersion(), + seq.getName()); seq.addDBRef(self); } @@ -381,58 +381,52 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * @throws JalviewException * @throws IOException */ - protected AlignmentI fetchSequences(List ids, AlignmentI alignment) - throws JalviewException, IOException + protected AlignmentI fetchSequences(List ids, + AlignmentI alignment) throws JalviewException, IOException { if (!isEnsemblAvailable()) { inProgress = false; throw new JalviewException("ENSEMBL Rest API not available."); } - FileParse fp = getSequenceReader(ids); - if (fp == null) + BufferedReader br = getSequenceReader(ids); + if (br == null) { return alignment; } - FastaFile fr = new FastaFile(fp); - if (fr.hasWarningMessage()) - { - System.out.println(String.format( - "Warning when retrieving %d ids %s\n%s", ids.size(), - ids.toString(), fr.getWarningMessage())); - } - else if (fr.getSeqs().size() != ids.size()) + List seqs = parseSequenceJson(br); + + if (seqs.isEmpty()) { - System.out.println(String.format( - "Only retrieved %d sequences for %d query strings", fr - .getSeqs().size(), ids.size())); + throw new IOException("No data returned for " + ids); } - if (fr.getSeqs().size() == 1 && fr.getSeqs().get(0).getLength() == 0) + if (seqs.size() != ids.size()) { - /* - * POST request has returned an empty FASTA file e.g. for invalid id - */ - throw new IOException("No data returned for " + ids); + System.out.println(String.format( + "Only retrieved %d sequences for %d query strings", + seqs.size(), ids.size())); } - if (fr.getSeqs().size() > 0) + if (!seqs.isEmpty()) { - AlignmentI seqal = new Alignment(fr.getSeqsAsArray()); - for (SequenceI sq : seqal.getSequences()) + AlignmentI seqal = new Alignment( + seqs.toArray(new SequenceI[seqs.size()])); + for (SequenceI seq : seqs) { - if (sq.getDescription() == null) + if (seq.getDescription() == null) { - sq.setDescription(getDbName()); + seq.setDescription(getDbName()); } - String name = sq.getName(); + String name = seq.getName(); if (ids.contains(name) || ids.contains(name.replace("ENSP", "ENST"))) { - DBRefEntry dbref = DBRefUtils.parseToDbRef(sq, getDbSource(), + // TODO JAL-3077 use true accession version in dbref + DBRefEntry dbref = DBRefUtils.parseToDbRef(seq, getDbSource(), getEnsemblDataVersion(), name); - sq.addDBRef(dbref); + seq.addDBRef(dbref); } } if (alignment == null) @@ -448,6 +442,49 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** + * Parses a JSON response for a single sequence ID query + * + * @param br + * @return a single jalview.datamodel.Sequence + * @see http://rest.ensembl.org/documentation/info/sequence_id + */ + protected List parseSequenceJson(BufferedReader br) + { + JSONParser jp = new JSONParser(); + List result = new ArrayList<>(); + try + { + /* + * for now, assumes only one sequence returned; refactor if needed + * in future to handle a JSONArray with more than one + */ + final JSONObject val = (JSONObject) jp.parse(br); + Object s = val.get("desc"); + String desc = s == null ? null : s.toString(); + s = val.get("id"); + String id = s == null ? null : s.toString(); + s = val.get("seq"); + String seq = s == null ? null : s.toString(); + Sequence sequence = new Sequence(id, seq); + if (desc != null) + { + sequence.setDescription(desc); + } + // todo JAL-3077 make a DBRefEntry with true accession version + // s = val.get("version"); + // String version = s == null ? "0" : s.toString(); + // DBRefEntry dbref = new DBRefEntry(getDbSource(), version, id); + // sequence.addDBRef(dbref); + result.add(sequence); + } catch (ParseException | IOException e) + { + System.err.println("Error processing JSON response: " + e.toString()); + // ignore + } + return result; + } + + /** * Returns the URL for the REST call * * @return @@ -468,16 +505,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats urlstring.append("?type=").append(getSourceEnsemblType().getType()); - urlstring.append(("&Accept=text/x-fasta")); + urlstring.append(("&Accept=application/json")); + urlstring.append(("&Content-Type=application/json")); - Map params = getAdditionalParameters(); - if (params != null) + String objectType = getObjectType(); + if (objectType != null) { - for (Entry entry : params.entrySet()) - { - urlstring.append("&").append(entry.getKey()).append("=") - .append(entry.getValue()); - } + urlstring.append("&").append(OBJECT_TYPE).append("=") + .append(objectType); } URL url = new URL(urlstring.toString()); @@ -485,11 +520,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Override this method to add any additional x=y URL parameters needed + * Override this method to specify object_type request parameter * * @return */ - protected Map getAdditionalParameters() + protected String getObjectType() { return null; } @@ -511,18 +546,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return false; } - @Override - protected String getRequestMimeType(boolean multipleIds) - { - return multipleIds ? "application/json" : "text/x-fasta"; - } - - @Override - protected String getResponseMimeType() - { - return "text/x-fasta"; - } - /** * * @return the configured sequence return type for this source @@ -558,9 +581,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence, String accId, int start) { - // SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - List sfs = sourceSequence.getFeatures() - .getPositionalFeatures(); + List sfs = getIdentifyingFeatures(sourceSequence, + accId); if (sfs.isEmpty()) { return null; @@ -570,54 +592,38 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * generously initial size for number of cds regions * (worst case titin Q8WZ42 has c. 313 exons) */ - List regions = new ArrayList(100); + List regions = new ArrayList<>(100); int mappedLength = 0; int direction = 1; // forward boolean directionSet = false; for (SequenceFeature sf : sfs) { + int strand = sf.getStrand(); + strand = strand == 0 ? 1 : strand; // treat unknown as forward + + if (directionSet && strand != direction) + { + // abort - mix of forward and backward + System.err + .println("Error: forward and backward strand for " + accId); + return null; + } + direction = strand; + directionSet = true; + /* - * accept the target feature type or a specialisation of it - * (e.g. coding_exon for exon) + * add to CDS ranges, semi-sorted forwards/backwards */ - if (identifiesSequence(sf, accId)) + if (strand < 0) { - int strand = sf.getStrand(); - strand = strand == 0 ? 1 : strand; // treat unknown as forward - - if (directionSet && strand != direction) - { - // abort - mix of forward and backward - System.err.println("Error: forward and backward strand for " - + accId); - return null; - } - direction = strand; - directionSet = true; - - /* - * add to CDS ranges, semi-sorted forwards/backwards - */ - if (strand < 0) - { - regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); - } - else - { - regions.add(new int[] { sf.getBegin(), sf.getEnd() }); - } - mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); - - if (!isSpliceable()) - { - /* - * 'gene' sequence is contiguous so we can stop as soon as its - * identifying feature has been found - */ - break; - } + regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); + } + else + { + regions.add(new int[] { sf.getBegin(), sf.getEnd() }); } + mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); } if (regions.isEmpty()) @@ -634,33 +640,26 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient Collections.sort(regions, direction == 1 ? IntRangeComparator.ASCENDING : IntRangeComparator.DESCENDING); - List to = Arrays.asList(new int[] { start, - start + mappedLength - 1 }); + List to = Arrays + .asList(new int[] + { start, start + mappedLength - 1 }); return new MapList(regions, to, 1, 1); } /** - * Answers true if the sequence being retrieved may occupy discontiguous - * regions on the genomic sequence. - */ - protected boolean isSpliceable() - { - return true; - } - - /** - * Returns true if the sequence feature marks positions of the genomic + * Answers a list of sequence features that mark positions of the genomic * sequence feature which are within the sequence being retrieved. For * example, an 'exon' feature whose parent is the target transcript marks the - * cdna positions of the transcript. + * cdna positions of the transcript. For a gene sequence, this is trivially + * just the 'gene' feature with matching gene id. * - * @param sf + * @param seq * @param accId * @return */ - protected abstract boolean identifiesSequence(SequenceFeature sf, - String accId); + protected abstract List getIdentifyingFeatures( + SequenceI seq, String accId); /** * Transfers the sequence feature to the target sequence, locating its start @@ -691,16 +690,15 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient int newBegin = Math.min(mappedRange[0], mappedRange[1]); int newEnd = Math.max(mappedRange[0], mappedRange[1]); SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd, - group); + group, sf.getScore()); targetSequence.addSequenceFeature(copy); /* * for sequence_variant on reverse strand, have to convert the allele * values to their complements */ - if (!forwardStrand - && SequenceOntologyFactory.getInstance().isA(sf.getType(), - SequenceOntologyI.SEQUENCE_VARIANT)) + if (!forwardStrand && SequenceOntologyFactory.getInstance() + .isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT)) { reverseComplementAlleles(copy); } @@ -715,7 +713,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ static void reverseComplementAlleles(SequenceFeature sf) { - final String alleles = (String) sf.getValue(ALLELES); + final String alleles = (String) sf.getValue(Gff3Helper.ALLELES); if (alleles == null) { return; @@ -726,7 +724,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient reverseComplementAllele(complement, allele); } String comp = complement.toString(); - sf.setValue(ALLELES, comp); + sf.setValue(Gff3Helper.ALLELES, comp); sf.setDescription(comp); /* @@ -736,7 +734,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String atts = sf.getAttributes(); if (atts != null) { - atts = atts.replace(ALLELES + "=" + alleles, ALLELES + "=" + comp); + atts = atts.replace(Gff3Helper.ALLELES + "=" + alleles, + Gff3Helper.ALLELES + "=" + comp); sf.setAttributes(atts); } } @@ -790,8 +789,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return false; } - long start = System.currentTimeMillis(); - // SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); +// long start = System.currentTimeMillis(); List sfs = sourceSequence.getFeatures() .getPositionalFeatures(); MapList mapping = getGenomicRangesFromFeatures(sourceSequence, @@ -803,10 +801,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient boolean result = transferFeatures(sfs, targetSequence, mapping, accessionId); - System.out.println("transferFeatures (" + (sfs.size()) + " --> " - + targetSequence.getFeatures().getFeatureCount(true) + ") to " - + targetSequence.getName() + " took " - + (System.currentTimeMillis() - start) + "ms"); +// System.out.println("transferFeatures (" + (sfs.size()) + " --> " +// + targetSequence.getFeatures().getFeatureCount(true) + ") to " +// + targetSequence.getName() + " took " +// + (System.currentTimeMillis() - start) + "ms"); return result; } @@ -870,7 +868,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient { String parent = (String) sf.getValue(PARENT); // using contains to allow for prefix "gene:", "transcript:" etc - if (parent != null && !parent.contains(identifier)) + if (parent != null + && !parent.toUpperCase().contains(identifier.toUpperCase())) { // this genomic feature belongs to a different transcript return false; @@ -898,14 +897,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected List findFeatures(SequenceI sequence, String term, String parentId) { - List result = new ArrayList(); + List result = new ArrayList<>(); List sfs = sequence.getFeatures() .getFeaturesByOntology(term); for (SequenceFeature sf : sfs) { String parent = (String) sf.getValue(PARENT); - if (parent != null && parent.equals(parentId)) + if (parent != null && parent.equalsIgnoreCase(parentId)) { result.add(sf); }