X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fws%2Fdbsources%2FEmblXmlSource.java;h=c2d661baa77e06c6110c523f5b65dd7a79e44548;hb=41b0e9331ac71787c1280aa1d809f54c575fbf97;hp=8f55080532694eb96e52b2b7c8bd9e178807698c;hpb=7e82e1fed011077e5cd4cc40ac8ad3519d7c47a8;p=jalview.git diff --git a/src/jalview/ws/dbsources/EmblXmlSource.java b/src/jalview/ws/dbsources/EmblXmlSource.java index 8f55080..c2d661b 100644 --- a/src/jalview/ws/dbsources/EmblXmlSource.java +++ b/src/jalview/ws/dbsources/EmblXmlSource.java @@ -20,6 +20,29 @@ */ package jalview.ws.dbsources; +import java.util.Locale; + +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Hashtable; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import javax.xml.bind.JAXBContext; +import javax.xml.bind.JAXBElement; +import javax.xml.bind.JAXBException; +import javax.xml.stream.FactoryConfigurationError; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; + +import com.stevesoft.pat.Regex; + import jalview.analysis.SequenceIdMatcher; import jalview.bin.Cache; import jalview.datamodel.Alignment; @@ -35,41 +58,28 @@ import jalview.util.DBRefUtils; import jalview.util.DnaUtils; import jalview.util.MapList; import jalview.util.MappingUtils; -import jalview.util.MessageManager; import jalview.ws.ebi.EBIFetchClient; import jalview.xml.binding.embl.EntryType; import jalview.xml.binding.embl.EntryType.Feature; import jalview.xml.binding.embl.EntryType.Feature.Qualifier; +import jalview.xml.binding.embl.ROOT; import jalview.xml.binding.embl.XrefType; -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Hashtable; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.regex.Pattern; - -import javax.xml.bind.JAXBContext; -import javax.xml.bind.JAXBException; -import javax.xml.stream.FactoryConfigurationError; -import javax.xml.stream.XMLInputFactory; -import javax.xml.stream.XMLStreamException; -import javax.xml.stream.XMLStreamReader; - +/** + * Provides XML binding and parsing of EMBL or EMBLCDS records retrieved from + * (e.g.) {@code https://www.ebi.ac.uk/ena/data/view/x53828&display=xml}. + * + * @deprecated endpoint withdrawn August 2020 (JAL-3692), use EmblFlatfileSource + */ public abstract class EmblXmlSource extends EbiFileRetrievedProxy { + private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+"); + /* * JAL-1856 Embl returns this text for query not found */ private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found."; - private static final Pattern SPACE_PATTERN = Pattern.compile(" "); - public EmblXmlSource() { super(); @@ -85,7 +95,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy * @return * @throws Exception */ - public AlignmentI getEmblSequenceRecords(String emprefx, String query) + protected AlignmentI getEmblSequenceRecords(String emprefx, String query) throws Exception { startQuery(); @@ -94,14 +104,15 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy try { reply = dbFetch.fetchDataAsFile( - emprefx.toLowerCase() + ":" + query.trim(), "display=xml", + emprefx.toLowerCase(Locale.ROOT) + ":" + query.trim(), "display=xml", "xml"); } catch (Exception e) { stopQuery(); - throw new Exception(MessageManager.formatMessage( - "exception.ebiembl_retrieval_failed_on", new String[] - { emprefx.toLowerCase(), query.trim() }), e); + throw new Exception( + String.format("EBI EMBL XML retrieval failed for %s:%s", + emprefx.toLowerCase(Locale.ROOT), query.trim()), + e); } return getEmblSequenceRecords(emprefx, query, reply); } @@ -118,7 +129,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy * @return * @throws Exception */ - public AlignmentI getEmblSequenceRecords(String emprefx, String query, + protected AlignmentI getEmblSequenceRecords(String emprefx, String query, File reply) throws Exception { List entries = null; @@ -183,8 +194,9 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy XMLStreamReader streamReader = XMLInputFactory.newInstance() .createXMLStreamReader(is); javax.xml.bind.Unmarshaller um = jc.createUnmarshaller(); - jalview.xml.binding.embl.ROOT root = (jalview.xml.binding.embl.ROOT) um - .unmarshal(streamReader); + JAXBElement rootElement = um.unmarshal(streamReader, + ROOT.class); + ROOT root = rootElement.getValue(); /* * document root contains either "entry" or "entrySet" @@ -244,14 +256,18 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy /* * add db references */ - List dbRefs = entry.getXref(); - if (dbRefs != null) + List xrefs = entry.getXref(); + if (xrefs != null) { - for (XrefType dbref : dbRefs) + for (XrefType xref : xrefs) { - String acc = dbref.getId(); - String source = DBRefUtils.getCanonicalName(dbref.getDb()); - String version = dbref.getSecondaryId(); + String acc = xref.getId(); + String source = DBRefUtils.getCanonicalName(xref.getDb()); + String version = xref.getSecondaryId(); + if (version == null || "".equals(version)) + { + version = "0"; + } dna.addDBRef(new DBRefEntry(source, version, acc)); } } @@ -524,10 +540,14 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy * ensure UniProtKB/Swiss-Prot converted to UNIPROT */ String source = DBRefUtils.getCanonicalName(xref.getDb()); - DBRefEntry dbref = new DBRefEntry(source, xref.getSecondaryId(), - xref.getId()); - DBRefEntry proteinDbRef = new DBRefEntry(dbref.getSource(), - dbref.getVersion(), dbref.getAccessionId()); + String version = xref.getSecondaryId(); + if (version == null || "".equals(version)) + { + version = "0"; + } + DBRefEntry dbref = new DBRefEntry(source, version, xref.getId()); + DBRefEntry proteinDbRef = new DBRefEntry(source, version, + dbref.getAccessionId()); if (source.equals(DBRefSource.UNIPROT)) { String proteinSeqName = DBRefSource.UNIPROT + "|" @@ -556,6 +576,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy proteinSeq = new Sequence(proteinSeqName, product.getSequenceAsString()); matcher.add(proteinSeq); + proteinSeq.setDescription(product.getDescription()); peptides.add(proteinSeq); } dnaToProteinMapping.setTo(proteinSeq); @@ -609,8 +630,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy && dnaToProteinMapping.getTo() != null) { DBRefEntry dnaToEmblProteinRef = new DBRefEntry( - DBRefSource.EMBLCDSProduct, sequenceVersion, - proteinId); + DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId); dnaToEmblProteinRef.setMap(dnaToProteinMapping); dnaToProteinMapping.setMappedFromId(proteinId); dna.addDBRef(dnaToEmblProteinRef); @@ -639,7 +659,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy { return new int[] {}; } - + try { List ranges = DnaUtils.parseLocation(location); @@ -695,23 +715,48 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group); if (!vals.isEmpty()) { - StringBuilder sb = new StringBuilder(); - boolean first = true; for (Entry val : vals.entrySet()) { - if (!first) - { - sb.append(";"); - } - sb.append(val.getKey()).append("=").append(val.getValue()); - first = false; sf.setValue(val.getKey(), val.getValue()); } - sf.setAttributes(sb.toString()); } return sf; } + @Override + public String getAccessionSeparator() + { + return null; + } + + @Override + public Regex getAccessionValidator() + { + return ACCESSION_REGEX; + } + + @Override + public String getDbVersion() + { + return "0"; + } + + @Override + public int getTier() + { + return 0; + } + + @Override + public boolean isValidReference(String accession) + { + if (accession == null || accession.length() < 2) + { + return false; + } + return getAccessionValidator().search(accession); + } + /** * Truncates (if necessary) the exon intervals to match 3 times the length of * the protein; also accepts 3 bases longer (for stop codon not included in @@ -730,7 +775,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy } int expectedCdsLength = proteinLength * 3; int exonLength = MappingUtils.getLength(Arrays.asList(exon)); - + /* * if exon length matches protein, or is shorter, or longer by the * length of a stop codon (3 bases), then leave it unchanged @@ -740,7 +785,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy { return exon; } - + int origxon[]; int sxpos = -1; int endxon = 0; @@ -760,7 +805,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy // .println("Truncating final exon interval on region by " // + (cdspos - cdslength)); } - + /* * shrink the final exon - reduce end position if forward * strand, increase it if reverse @@ -776,7 +821,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy break; } } - + if (sxpos != -1) { // and trim the exon interval set if necessary