X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblSeqProxy.java;h=233707b5de906247c5c8305bedcaab492c87e5f2;hb=defcce0ef74ffc0923b4387bcb5f0d867bf64831;hp=5f3f1c8801a5ede1418f6a2e82d73fb5d1fa0713;hpb=b40bbf24867e2f44c40e9af6cf43bdfee29c337c;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 5f3f1c8..233707b 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -1,7 +1,28 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.ext.ensembl; import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; +import jalview.bin.Cache; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; @@ -14,8 +35,10 @@ import jalview.io.FastaFile; import jalview.io.FileParse; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; +import jalview.util.Comparison; import jalview.util.DBRefUtils; import jalview.util.MapList; +import jalview.util.RangeComparator; import java.io.IOException; import java.net.MalformedURLException; @@ -36,12 +59,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient { private static final String ALLELES = "alleles"; - private static final List CROSS_REFERENCES = Arrays - .asList(new String[] { "CCDS", "Uniprot/SWISSPROT", - "Uniprot/SPTREMBL" }); - - protected static final String CONSEQUENCE_TYPE = "consequence_type"; - protected static final String PARENT = "Parent"; protected static final String ID = "ID"; @@ -61,7 +78,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient GENOMIC("genomic"), /** - * type=cdna to fetch dna including UTRs + * type=cdna to fetch coding dna including UTRs */ CDNA("cdna"), @@ -94,26 +111,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * A comparator to sort ranges into ascending start position order - */ - private class RangeSorter implements Comparator - { - boolean forwards; - - RangeSorter(boolean forward) - { - forwards = forward; - } - - @Override - public int compare(int[] o1, int[] o2) - { - return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]); - } - - } - - /** * Default constructor (to use rest.ensembl.org) */ public EnsemblSeqProxy() @@ -164,6 +161,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient + " chunks. Unexpected problem (" + r.getLocalizedMessage() + ")"; System.err.println(msg); + r.printStackTrace(); break; } } @@ -216,7 +214,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient EnsemblFeatureType[] features = getFeaturesToFetch(); AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); - if (geneFeatures.getHeight() > 0) + if (geneFeatures != null && geneFeatures.getHeight() > 0) { genomicSequence = geneFeatures.getSequenceAt(0); } @@ -277,23 +275,63 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient proteinSeq.createDatasetSequence(); querySeq.createDatasetSequence(); - MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, proteinSeq); + MapList mapList = AlignmentUtils + .mapCdsToProtein(querySeq, proteinSeq); if (mapList != null) { // clunky: ensure Uniprot xref if we have one is on mapped sequence SequenceI ds = proteinSeq.getDatasetSequence(); - ds.setSourceDBRef(proteinSeq.getSourceDBRef()); - + // TODO: Verify ensp primary ref is on proteinSeq.getDatasetSequence() Mapping map = new Mapping(ds, mapList); - DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), - proteinSeq.getName(), map); + DBRefEntry dbr = new DBRefEntry(getDbSource(), + getEnsemblDataVersion(), proteinSeq.getName(), map); querySeq.getDatasetSequence().addDBRef(dbr); - + DBRefEntry[] uprots = DBRefUtils.selectRefs(ds.getDBRefs(), + new String[] { DBRefSource.UNIPROT }); + DBRefEntry[] upxrefs = DBRefUtils.selectRefs(querySeq.getDBRefs(), + new String[] { DBRefSource.UNIPROT }); + if (uprots != null) + { + for (DBRefEntry up : uprots) + { + // locate local uniprot ref and map + List upx = DBRefUtils.searchRefs(upxrefs, + up.getAccessionId()); + DBRefEntry upxref; + if (upx.size() != 0) + { + upxref = upx.get(0); + + if (upx.size() > 1) + { + Cache.log + .warn("Implementation issue - multiple uniprot acc on product sequence."); + } + } + else + { + upxref = new DBRefEntry(DBRefSource.UNIPROT, + getEnsemblDataVersion(), up.getAccessionId()); + } + + Mapping newMap = new Mapping(ds, mapList); + upxref.setVersion(getEnsemblDataVersion()); + upxref.setMap(newMap); + if (upx.size() == 0) + { + // add the new uniprot ref + querySeq.getDatasetSequence().addDBRef(upxref); + } + + } + } + /* * copy exon features to protein, compute peptide variants from dna * variants and add as features on the protein sequence ta-da */ - AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList); + AlignmentUtils + .computeProteinFeatures(querySeq, proteinSeq, mapList); } } catch (Exception e) { @@ -315,41 +353,23 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient seq = seq.getDatasetSequence(); } - EnsemblXref xrefFetcher = new EnsemblXref(getDomain()); - List xrefs = xrefFetcher.getCrossReferences(seq.getName(), - getCrossReferenceDatabases()); + EnsemblXref xrefFetcher = new EnsemblXref(getDomain(), getDbSource(), + getEnsemblDataVersion()); + List xrefs = xrefFetcher.getCrossReferences(seq.getName()); for (DBRefEntry xref : xrefs) { seq.addDBRef(xref); - /* - * Save any Uniprot xref to be the reference for SIFTS mapping - */ - if (DBRefSource.UNIPROT.equals(xref.getSource())) - { - seq.setSourceDBRef(xref); - } } /* * and add a reference to itself */ - DBRefEntry self = new DBRefEntry(getDbSource(), "0", seq.getName()); + DBRefEntry self = new DBRefEntry(getDbSource(), + getEnsemblDataVersion(), seq.getName()); seq.addDBRef(self); } /** - * Returns a list of database names to be used when fetching cross-references. - * Specifically, the names are used to filter data returned by the Ensembl - * xrefs REST service on the value in field 'dbname'. - * - * @return - */ - protected List getCrossReferenceDatabases() - { - return CROSS_REFERENCES; - } - - /** * Fetches sequences for the list of accession ids and adds them to the * alignment. Returns the extended (or created) alignment. * @@ -368,6 +388,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient throw new JalviewException("ENSEMBL Rest API not available."); } FileParse fp = getSequenceReader(ids); + if (fp == null) + { + return alignment; + } + FastaFile fr = new FastaFile(fp); if (fr.hasWarningMessage()) { @@ -392,9 +417,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient if (fr.getSeqs().size() > 0) { - AlignmentI seqal = new Alignment( - fr.getSeqsAsArray()); - for (SequenceI sq:seqal.getSequences()) + AlignmentI seqal = new Alignment(fr.getSeqsAsArray()); + for (SequenceI sq : seqal.getSequences()) { if (sq.getDescription() == null) { @@ -404,7 +428,9 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient if (ids.contains(name) || ids.contains(name.replace("ENSP", "ENST"))) { - DBRefUtils.parseToDbRef(sq, DBRefSource.ENSEMBL, "0", name); + DBRefEntry dbref = DBRefUtils.parseToDbRef(sq, getDbSource(), + getEnsemblDataVersion(), name); + sq.addDBRef(dbref); } } if (alignment == null) @@ -524,7 +550,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient int mappedLength = 0; int direction = 1; // forward boolean directionSet = false; - + for (SequenceFeature sf : sfs) { /* @@ -541,20 +567,20 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient // abort - mix of forward and backward System.err.println("Error: forward and backward strand for " + accId); - return null; - } - direction = strand; - directionSet = true; - - /* - * add to CDS ranges, semi-sorted forwards/backwards - */ - if (strand < 0) - { - regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); - } - else - { + return null; + } + direction = strand; + directionSet = true; + + /* + * add to CDS ranges, semi-sorted forwards/backwards + */ + if (strand < 0) + { + regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); + } + else + { regions.add(new int[] { sf.getBegin(), sf.getEnd() }); } mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); @@ -569,7 +595,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } } } - + if (regions.isEmpty()) { System.out.println("Failed to identify target sequence for " + accId @@ -581,11 +607,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * a final sort is needed since Ensembl returns CDS sorted within source * (havana / ensembl_havana) */ - Collections.sort(regions, new RangeSorter(direction == 1)); - + Collections.sort(regions, new RangeComparator(direction == 1)); + List to = Arrays.asList(new int[] { start, start + mappedLength - 1 }); - + return new MapList(regions, to, 1, 1); } @@ -629,12 +655,16 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient int start = sf.getBegin(); int end = sf.getEnd(); int[] mappedRange = mapping.locateInTo(start, end); - + if (mappedRange != null) { SequenceFeature copy = new SequenceFeature(sf); copy.setBegin(Math.min(mappedRange[0], mappedRange[1])); copy.setEnd(Math.max(mappedRange[0], mappedRange[1])); + if (".".equals(copy.getFeatureGroup())) + { + copy.setFeatureGroup(getDbSource()); + } targetSequence.addSequenceFeature(copy); /* @@ -698,16 +728,21 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient { complement.append(","); } - if ("HGMD_MUTATION".equalsIgnoreCase(allele)) + + /* + * some 'alleles' are actually descriptive terms + * e.g. HGMD_MUTATION, PhenCode_variation + * - we don't want to 'reverse complement' these + */ + if (!Comparison.isNucleotideSequence(allele, true)) { complement.append(allele); } else { - char[] alleles = allele.toCharArray(); - for (int i = alleles.length - 1; i >= 0; i--) + for (int i = allele.length() - 1; i >= 0; i--) { - complement.append(Dna.getComplement(alleles[i])); + complement.append(Dna.getComplement(allele.charAt(i))); } } } @@ -730,8 +765,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient // long start = System.currentTimeMillis(); SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId, - targetSequence.getStart()); + MapList mapping = getGenomicRangesFromFeatures(sourceSequence, + accessionId, targetSequence.getStart()); if (mapping == null) { return false; @@ -862,11 +897,13 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String type, String parentId) { List result = new ArrayList(); - + SequenceFeature[] sfs = sequence.getSequenceFeatures(); - if (sfs != null) { + if (sfs != null) + { SequenceOntologyI so = SequenceOntologyFactory.getInstance(); - for (SequenceFeature sf :sfs) { + for (SequenceFeature sf : sfs) + { if (so.isA(sf.getType(), type)) { String parent = (String) sf.getValue(PARENT);