X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblSeqProxy.java;h=b2ebb1ac5dc4db1c833c16def46b67431cc3de16;hb=2779b461347e684414f9e98e607e138b1e43db84;hp=c749b946307b749aae5be41a028e3f2f0ec70fd5;hpb=96153c4037c295b5a23ebb3196a0df7465732d92;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index c749b94..b2ebb1a 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -1,3 +1,23 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see <http://www.gnu.org/licenses/>. + * The Jalview Authors are detailed in the 'AUTHORS' file. 
+ */ package jalview.ext.ensembl; import jalview.analysis.AlignmentUtils; @@ -10,6 +30,7 @@ import jalview.datamodel.DBRefSource; import jalview.datamodel.Mapping; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.datamodel.features.SequenceFeatures; import jalview.exceptions.JalviewException; import jalview.io.FastaFile; import jalview.io.FileParse; @@ -17,6 +38,7 @@ import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; import jalview.util.Comparison; import jalview.util.DBRefUtils; +import jalview.util.IntRangeComparator; import jalview.util.MapList; import java.io.IOException; @@ -25,7 +47,6 @@ import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; -import java.util.Comparator; import java.util.List; /** @@ -38,10 +59,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient { private static final String ALLELES = "alleles"; - protected static final String PARENT = "Parent"; - - protected static final String ID = "ID"; - protected static final String NAME = "Name"; protected static final String DESCRIPTION = "description"; @@ -90,26 +107,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * A comparator to sort ranges into ascending start position order - */ - private class RangeSorter implements Comparator - { - boolean forwards; - - RangeSorter(boolean forward) - { - forwards = forward; - } - - @Override - public int compare(int[] o1, int[] o2) - { - return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]); - } - - } - - /** * Default constructor (to use rest.ensembl.org) */ public EnsemblSeqProxy() @@ -136,8 +133,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient // danger: accession separator used as a regex here, a string elsewhere // in this case it is ok (it is just a space), but (e.g.) 
'\' would not be - List allIds = Arrays.asList(query - .split(getAccessionSeparator())); + List allIds = Arrays + .asList(query.split(getAccessionSeparator())); AlignmentI alignment = null; inProgress = true; @@ -206,14 +203,15 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient try { /* - * get 'dummy' genomic sequence with exon, cds and variation features + * get 'dummy' genomic sequence with gene, transcript, + * exon, cds and variation features */ SequenceI genomicSequence = null; EnsemblFeatures gffFetcher = new EnsemblFeatures(getDomain()); EnsemblFeatureType[] features = getFeaturesToFetch(); AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); - if (geneFeatures.getHeight() > 0) + if (geneFeatures != null && geneFeatures.getHeight() > 0) { genomicSequence = geneFeatures.getSequenceAt(0); } @@ -222,7 +220,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /* * transfer features to the query sequence */ - SequenceI querySeq = alignment.findName(accId); + SequenceI querySeq = alignment.findName(accId, true); if (transferFeatures(accId, genomicSequence, querySeq)) { @@ -235,8 +233,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } } catch (IOException e) { - System.err.println("Error transferring Ensembl features: " - + e.getMessage()); + System.err.println( + "Error transferring Ensembl features: " + e.getMessage()); } } @@ -274,7 +272,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient proteinSeq.createDatasetSequence(); querySeq.createDatasetSequence(); - MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, proteinSeq); + MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, + proteinSeq); if (mapList != null) { // clunky: ensure Uniprot xref if we have one is on mapped sequence @@ -285,15 +284,18 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient getEnsemblDataVersion(), proteinSeq.getName(), map); 
querySeq.getDatasetSequence().addDBRef(dbr); DBRefEntry[] uprots = DBRefUtils.selectRefs(ds.getDBRefs(), - new String[] { DBRefSource.UNIPROT }); + new String[] + { DBRefSource.UNIPROT }); DBRefEntry[] upxrefs = DBRefUtils.selectRefs(querySeq.getDBRefs(), - new String[] { DBRefSource.UNIPROT }); + new String[] + { DBRefSource.UNIPROT }); if (uprots != null) { for (DBRefEntry up : uprots) { // locate local uniprot ref and map - List upx = DBRefUtils.searchRefs(upxrefs, up.getAccessionId()); + List upx = DBRefUtils.searchRefs(upxrefs, + up.getAccessionId()); DBRefEntry upxref; if (upx.size() != 0) { @@ -301,14 +303,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient if (upx.size() > 1) { - Cache.log - .warn("Implementation issue - multiple uniprot acc on product sequence."); + Cache.log.warn( + "Implementation issue - multiple uniprot acc on product sequence."); } } else { upxref = new DBRefEntry(DBRefSource.UNIPROT, - getEnsemblDataVersion(), up.getAccessionId()); + getEnsemblDataVersion(), up.getAccessionId()); } Mapping newMap = new Mapping(ds, mapList); @@ -319,15 +321,16 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient // add the new uniprot ref querySeq.getDatasetSequence().addDBRef(upxref); } - + } } - + /* * copy exon features to protein, compute peptide variants from dna * variants and add as features on the protein sequence ta-da */ - AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList); + AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, + mapList); } } catch (Exception e) { @@ -360,8 +363,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /* * and add a reference to itself */ - DBRefEntry self = new DBRefEntry(getDbSource(), - getEnsemblDataVersion(), seq.getName()); + DBRefEntry self = new DBRefEntry(getDbSource(), getEnsemblDataVersion(), + seq.getName()); seq.addDBRef(self); } @@ -375,8 +378,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * @throws 
JalviewException * @throws IOException */ - protected AlignmentI fetchSequences(List ids, AlignmentI alignment) - throws JalviewException, IOException + protected AlignmentI fetchSequences(List ids, + AlignmentI alignment) throws JalviewException, IOException { if (!isEnsemblAvailable()) { @@ -384,18 +387,23 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient throw new JalviewException("ENSEMBL Rest API not available."); } FileParse fp = getSequenceReader(ids); + if (fp == null) + { + return alignment; + } + FastaFile fr = new FastaFile(fp); if (fr.hasWarningMessage()) { - System.out.println(String.format( - "Warning when retrieving %d ids %s\n%s", ids.size(), - ids.toString(), fr.getWarningMessage())); + System.out.println( + String.format("Warning when retrieving %d ids %s\n%s", + ids.size(), ids.toString(), fr.getWarningMessage())); } else if (fr.getSeqs().size() != ids.size()) { System.out.println(String.format( - "Only retrieved %d sequences for %d query strings", fr - .getSeqs().size(), ids.size())); + "Only retrieved %d sequences for %d query strings", + fr.getSeqs().size(), ids.size())); } if (fr.getSeqs().size() == 1 && fr.getSeqs().get(0).getLength() == 0) @@ -408,9 +416,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient if (fr.getSeqs().size() > 0) { - AlignmentI seqal = new Alignment( - fr.getSeqsAsArray()); - for (SequenceI sq:seqal.getSequences()) + AlignmentI seqal = new Alignment(fr.getSeqsAsArray()); + for (SequenceI sq : seqal.getSequences()) { if (sq.getDescription() == null) { @@ -460,11 +467,28 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient urlstring.append("?type=").append(getSourceEnsemblType().getType()); urlstring.append(("&Accept=text/x-fasta")); + String objectType = getObjectType(); + if (objectType != null) + { + urlstring.append("&").append(OBJECT_TYPE).append("=") + .append(objectType); + } + URL url = new URL(urlstring.toString()); return url; } /** + * Override this method to 
specify object_type request parameter + * + * @return + */ + protected String getObjectType() + { + return null; + } + + /** * A sequence/id POST request currently allows up to 50 queries * * @see http://rest.ensembl.org/documentation/info/sequence_id_post @@ -528,8 +552,9 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence, String accId, int start) { - SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - if (sfs == null) + List sfs = sourceSequence.getFeatures() + .getPositionalFeatures(); + if (sfs.isEmpty()) { return null; } @@ -538,11 +563,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * generously initial size for number of cds regions * (worst case titin Q8WZ42 has c. 313 exons) */ - List regions = new ArrayList(100); + List regions = new ArrayList<>(100); int mappedLength = 0; int direction = 1; // forward boolean directionSet = false; - + for (SequenceFeature sf : sfs) { /* @@ -557,22 +582,22 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient if (directionSet && strand != direction) { // abort - mix of forward and backward - System.err.println("Error: forward and backward strand for " - + accId); - return null; - } - direction = strand; - directionSet = true; - - /* - * add to CDS ranges, semi-sorted forwards/backwards - */ - if (strand < 0) - { - regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); - } - else - { + System.err.println( + "Error: forward and backward strand for " + accId); + return null; + } + direction = strand; + directionSet = true; + + /* + * add to CDS ranges, semi-sorted forwards/backwards + */ + if (strand < 0) + { + regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); + } + else + { regions.add(new int[] { sf.getBegin(), sf.getEnd() }); } mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); @@ -587,7 +612,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } } } - + 
if (regions.isEmpty()) { System.out.println("Failed to identify target sequence for " + accId @@ -599,11 +624,13 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * a final sort is needed since Ensembl returns CDS sorted within source * (havana / ensembl_havana) */ - Collections.sort(regions, new RangeSorter(direction == 1)); - - List to = Arrays.asList(new int[] { start, - start + mappedLength - 1 }); - + Collections.sort(regions, direction == 1 ? IntRangeComparator.ASCENDING + : IntRangeComparator.DESCENDING); + + List to = Arrays + .asList(new int[] + { start, start + mappedLength - 1 }); + return new MapList(regions, to, 1, 1); } @@ -647,25 +674,26 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient int start = sf.getBegin(); int end = sf.getEnd(); int[] mappedRange = mapping.locateInTo(start, end); - + if (mappedRange != null) { - SequenceFeature copy = new SequenceFeature(sf); - copy.setBegin(Math.min(mappedRange[0], mappedRange[1])); - copy.setEnd(Math.max(mappedRange[0], mappedRange[1])); - if (".".equals(copy.getFeatureGroup())) + String group = sf.getFeatureGroup(); + if (".".equals(group)) { - copy.setFeatureGroup(getDbSource()); + group = getDbSource(); } + int newBegin = Math.min(mappedRange[0], mappedRange[1]); + int newEnd = Math.max(mappedRange[0], mappedRange[1]); + SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd, + group, sf.getScore()); targetSequence.addSequenceFeature(copy); /* * for sequence_variant on reverse strand, have to convert the allele * values to their complements */ - if (!forwardStrand - && SequenceOntologyFactory.getInstance().isA(sf.getType(), - SequenceOntologyI.SEQUENCE_VARIANT)) + if (!forwardStrand && SequenceOntologyFactory.getInstance() + .isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT)) { reverseComplementAlleles(copy); } @@ -755,10 +783,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return false; } - // long start = System.currentTimeMillis(); - 
SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId, - targetSequence.getStart()); +// long start = System.currentTimeMillis(); + List sfs = sourceSequence.getFeatures() + .getPositionalFeatures(); + MapList mapping = getGenomicRangesFromFeatures(sourceSequence, + accessionId, targetSequence.getStart()); if (mapping == null) { return false; @@ -766,10 +795,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient boolean result = transferFeatures(sfs, targetSequence, mapping, accessionId); - // System.out.println("transferFeatures (" + (sfs.length) + " --> " - // + targetSequence.getSequenceFeatures().length + ") to " - // + targetSequence.getName() - // + " took " + (System.currentTimeMillis() - start) + "ms"); +// System.out.println("transferFeatures (" + (sfs.size()) + " --> " +// + targetSequence.getFeatures().getFeatureCount(true) + ") to " +// + targetSequence.getName() + " took " +// + (System.currentTimeMillis() - start) + "ms"); return result; } @@ -778,13 +807,13 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * converted using the mapping. Features which do not overlap are ignored. * Features whose parent is not the specified identifier are also ignored. 
* - * @param features + * @param sfs * @param targetSequence * @param mapping * @param parentId * @return */ - protected boolean transferFeatures(SequenceFeature[] features, + protected boolean transferFeatures(List sfs, SequenceI targetSequence, MapList mapping, String parentId) { final boolean forwardStrand = mapping.isFromForwardStrand(); @@ -794,10 +823,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * position descending if reverse strand) so as to add them in * 'forwards' order to the target sequence */ - sortFeatures(features, forwardStrand); + SequenceFeatures.sortFeatures(sfs, forwardStrand); boolean transferred = false; - for (SequenceFeature sf : features) + for (SequenceFeature sf : sfs) { if (retainFeature(sf, parentId)) { @@ -809,33 +838,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Sort features by start position ascending (if on forward strand), or end - * position descending (if on reverse strand) - * - * @param features - * @param forwardStrand - */ - protected static void sortFeatures(SequenceFeature[] features, - final boolean forwardStrand) - { - Arrays.sort(features, new Comparator() - { - @Override - public int compare(SequenceFeature o1, SequenceFeature o2) - { - if (forwardStrand) - { - return Integer.compare(o1.getBegin(), o2.getBegin()); - } - else - { - return Integer.compare(o2.getEnd(), o1.getEnd()); - } - } - }); - } - - /** * Answers true if the feature type is one we want to keep for the sequence. * Some features are only retrieved in order to identify the sequence range, * and may then be discarded as redundant information (e.g. 
"CDS" feature for @@ -860,7 +862,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient { String parent = (String) sf.getValue(PARENT); // using contains to allow for prefix "gene:", "transcript:" etc - if (parent != null && !parent.contains(identifier)) + if (parent != null + && !parent.toUpperCase().contains(identifier.toUpperCase())) { // this genomic feature belongs to a different transcript return false; @@ -877,33 +880,30 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /** * Returns a (possibly empty) list of features on the sequence which have the - * specified sequence ontology type (or a sub-type of it), and the given + * specified sequence ontology term (or a sub-type of it), and the given * identifier as parent * * @param sequence - * @param type + * @param term * @param parentId * @return */ protected List findFeatures(SequenceI sequence, - String type, String parentId) + String term, String parentId) { - List result = new ArrayList(); - - SequenceFeature[] sfs = sequence.getSequenceFeatures(); - if (sfs != null) { - SequenceOntologyI so = SequenceOntologyFactory.getInstance(); - for (SequenceFeature sf :sfs) { - if (so.isA(sf.getType(), type)) - { - String parent = (String) sf.getValue(PARENT); - if (parent.equals(parentId)) - { - result.add(sf); - } - } + List result = new ArrayList<>(); + + List sfs = sequence.getFeatures() + .getFeaturesByOntology(term); + for (SequenceFeature sf : sfs) + { + String parent = (String) sf.getValue(PARENT); + if (parent != null && parent.equalsIgnoreCase(parentId)) + { + result.add(sf); } } + return result; }