X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblSeqProxy.java;h=869a7028f1d2c8f5da9462a36877ad202359d1de;hb=1e8c7a9ab9f5da589d0aa2482fd2e3361c320d57;hp=8c1e972e18bee012eefae8c63bbb23687de01c04;hpb=3e3c9f8a78a2f8378b01b68900d9efb4ab95c7e6;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 8c1e972..869a702 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -38,7 +38,7 @@ import java.util.Map.Entry; public abstract class EnsemblSeqProxy extends EnsemblRestClient { private static final List CROSS_REFERENCES = Arrays - .asList(new String[] { "CCDS" }); + .asList(new String[] { "CCDS", "Uniprot/SWISSPROT" }); protected static final String CONSEQUENCE_TYPE = "consequence_type"; @@ -46,33 +46,30 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected static final String ID = "ID"; - /* - * this needs special handling, as it isA sequence_variant in the - * Sequence Ontology, but behaves in Ensembl as if it isA transcript - */ - protected static final String NMD_VARIANT = "NMD_transcript_variant"; - protected static final String NAME = "Name"; + /* + * enum for 'type' parameter to the /sequence REST service + */ public enum EnsemblSeqType { /** - * type=genomic for the full dna including introns + * type=genomic to fetch full dna including introns */ GENOMIC("genomic"), /** - * type=cdna for transcribed dna including UTRs + * type=cdna to fetch dna including UTRs */ CDNA("cdna"), /** - * type=cds for coding dna excluding UTRs + * type=cds to fetch coding dna excluding UTRs */ CDS("cds"), /** - * type=protein for the peptide product sequence + * type=protein to fetch peptide product sequence */ PROTEIN("protein"); @@ -157,17 +154,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient + ")"; System.err.println(msg); break; - // if (alignment != null) - // { - // break; // return what we got - // } - // else - // { - // throw new JalviewException(msg, r); - // } } } + if (alignment == null) + { + return null; + } + /* * fetch and transfer genomic sequence features, * fetch protein product and add as cross-reference @@ -207,7 +201,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * get 'dummy' genomic sequence with exon, cds and variation features */ SequenceI genomicSequence = null; - EnsemblOverlap gffFetcher = new EnsemblOverlap(); + EnsemblFeatures gffFetcher = new EnsemblFeatures(); EnsemblFeatureType[] features = getFeaturesToFetch(); AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, features); @@ -274,7 +268,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient MapList mapList = mapCdsToProtein(querySeq, proteinSeq); if (mapList != null) { - Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList); + // clunky: ensure Uniprot xref if we have one is on mapped sequence + SequenceI ds = proteinSeq.getDatasetSequence(); + ds.setSourceDBRef(proteinSeq.getSourceDBRef()); + Mapping map = new Mapping(ds, mapList); DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(), accId, map); querySeq.getDatasetSequence().addDBRef(dbr); @@ -294,8 +291,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Get Uniprot and PDB xrefs from Ensembl, and attach them to the protein - * sequence + * Get database xrefs from Ensembl, and attach them to the sequence * * @param seq */ @@ -347,7 +343,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient int mappedDnaLength = getCdsRanges(dnaSeq, ranges); int proteinLength = proteinSeq.getLength(); - List proteinRange = new ArrayList(); + int proteinEnd = proteinLength; int proteinStart = 1; /* @@ -359,15 +355,20 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient proteinStart = 2; proteinLength--; } - proteinRange.add(new int[] { proteinStart, proteinLength }); + List proteinRange = new ArrayList(); /* * dna length should map to protein (or protein plus stop codon) */ int codesForResidues = mappedDnaLength / 3; - if (codesForResidues == proteinLength - || codesForResidues == (proteinLength + 1)) + if (codesForResidues == (proteinLength + 1)) { + MappingUtils.unmapStopCodon(ranges, mappedDnaLength); + codesForResidues--; + } + if (codesForResidues == proteinLength) + { + proteinRange.add(new int[] { proteinStart, proteinEnd }); return new MapList(ranges, proteinRange, 3, 1); } return null; @@ -392,14 +393,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient { return 0; } + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); int mappedDnaLength = 0; for (SequenceFeature sf : sfs) { /* * process a CDS feature (or a sub-type of CDS) */ - if (SequenceOntologyFactory.getInstance().isA(sf.getType(), - SequenceOntologyI.CDS)) + if (so.isA(sf.getType(), SequenceOntologyI.CDS)) { int phase = 0; try { @@ -414,7 +415,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ int begin = sf.getBegin(); int end = sf.getEnd(); - if (ranges.isEmpty() && phase > 0) + if (ranges.isEmpty()) { begin += phase; if (begin > end) @@ -719,18 +720,18 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /* * for sequence_variant, make an additional feature with consequence */ - if (SequenceOntologyFactory.getInstance().isA(sf.getType(), - SequenceOntologyI.SEQUENCE_VARIANT)) - { - String consequence = (String) sf.getValue(CONSEQUENCE_TYPE); - if (consequence != null) - { - SequenceFeature sf2 = new SequenceFeature("consequence", - consequence, copy.getBegin(), copy.getEnd(), 0f, - null); - targetSequence.addSequenceFeature(sf2); - } - } + // if (SequenceOntologyFactory.getInstance().isA(sf.getType(), + // SequenceOntologyI.SEQUENCE_VARIANT)) + // { + // String consequence = (String) sf.getValue(CONSEQUENCE_TYPE); + // if (consequence != null) + // { + // SequenceFeature sf2 = new SequenceFeature("consequence", + // consequence, copy.getBegin(), copy.getEnd(), 0f, + // null); + // targetSequence.addSequenceFeature(sf2); + // } + // } } } @@ -750,6 +751,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return false; } + // long start = System.currentTimeMillis(); SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId, targetSequence.getStart()); @@ -758,7 +760,13 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return false; } - return transferFeatures(sfs, targetSequence, mapping, accessionId); + boolean result = transferFeatures(sfs, targetSequence, mapping, + accessionId); + // System.out.println("transferFeatures (" + (sfs.length) + " --> " + // + targetSequence.getSequenceFeatures().length + ") to " + // + targetSequence.getName() + // + " took " + (System.currentTimeMillis() - start) + "ms"); + return result; } /** @@ -925,6 +933,22 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient count++; } } + + /* + * ugly sort to get sequence features in start position order + * - would be better to store in Sequence as a TreeSet instead? + */ + Arrays.sort(peptide.getSequenceFeatures(), + new Comparator() + { + @Override + public int compare(SequenceFeature o1, SequenceFeature o2) + { + int c = Integer.compare(o1.getBegin(), o2.getBegin()); + return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) + : c; + } + }); return count; } @@ -1105,7 +1129,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ public static boolean isTranscript(String featureType) { - return NMD_VARIANT.equals(featureType) + return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType) || SequenceOntologyFactory.getInstance().isA(featureType, SequenceOntologyI.TRANSCRIPT); }