From 0a37e3b824b46b026916e124b42400590242d145 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Tue, 21 Jul 2020 12:22:18 +0100 Subject: [PATCH] JAL-3692 EMBL flatfile fetcher / parser (todo: CDS dbrefs and mappings) Conflicts: src/jalview/ws/dbsources/EmblXmlSource.java src/jalview/ws/ebi/EBIFetchClient.java --- resources/lang/Messages.properties | 1 - resources/lang/Messages_es.properties | 1 - src/jalview/io/EmblFlatFile.java | 455 ++++++++++++++++++++ src/jalview/util/MappingUtils.java | 19 + src/jalview/ws/dbsources/EmblCdsSource.java | 40 +- src/jalview/ws/dbsources/EmblFlatfileSource.java | 115 +++++ src/jalview/ws/dbsources/EmblSource.java | 60 +-- src/jalview/ws/dbsources/EmblXmlSource.java | 108 +++-- src/jalview/ws/ebi/EBIFetchClient.java | 33 +- test/jalview/util/MappingUtilsTest.java | 30 ++ ...{EmblSourceTest.java => EmblXmlSourceTest.java} | 87 ++-- 11 files changed, 788 insertions(+), 161 deletions(-) create mode 100644 src/jalview/io/EmblFlatFile.java create mode 100644 src/jalview/ws/dbsources/EmblFlatfileSource.java rename test/jalview/ws/dbsources/{EmblSourceTest.java => EmblXmlSourceTest.java} (93%) diff --git a/resources/lang/Messages.properties b/resources/lang/Messages.properties index a4b24ed..1c8cc2a 100644 --- a/resources/lang/Messages.properties +++ b/resources/lang/Messages.properties @@ -1071,7 +1071,6 @@ exception.unable_to_create_internet_config = Unable to create an Internet Config exception.invocation_target_calling_url = InvocationTargetException while calling openURL: {0} exception.illegal_access_calling_url = IllegalAccessException while calling openURL: {0} exception.interrupted_launching_browser = InterruptedException while launching browser: {0} -exception.ebiembl_retrieval_failed_on = EBI EMBL XML retrieval failed on {0}:{1} exception.no_pdb_records_for_chain = No PDB Records for {0} chain {1} exception.unexpected_handling_rnaml_translation_for_pdb = Unexpected exception when handling RNAML translation of PDB data 
exception.couldnt_recover_sequence_properties_for_alignment = Couldn't recover sequence properties for alignment diff --git a/resources/lang/Messages_es.properties b/resources/lang/Messages_es.properties index 3d7065b..1f1afdc 100644 --- a/resources/lang/Messages_es.properties +++ b/resources/lang/Messages_es.properties @@ -995,7 +995,6 @@ exception.unable_to_create_internet_config = Imposible crear una instancia de co exception.invocation_target_calling_url = InvocationTargetException mientras se invocaba openURL: {0} exception.illegal_access_calling_url = IllegalAccessException mientras se invocaba openURL: {0} exception.interrupted_launching_browser = InterruptedException mientras se lanzaba el navegador: {0} -exception.ebiembl_retrieval_failed_on = La recuperación de datos EBI EMBL XML ha fallado en {0}:{1} exception.no_pdb_records_for_chain = No se han encontrado registros {0} para la cadena {1} exception.unexpected_handling_rnaml_translation_for_pdb = Excepcion inesperada cuando se traducían a RNAML los datos PDB exception.couldnt_recover_sequence_properties_for_alignment = No es posible recuperar las propiedades de la secuencia para el alineamiento diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java new file mode 100644 index 0000000..759fa28 --- /dev/null +++ b/src/jalview/io/EmblFlatFile.java @@ -0,0 +1,455 @@ +package jalview.io; + +import java.io.IOException; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Hashtable; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import jalview.bin.Cache; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.FeatureProperties; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.util.DnaUtils; +import jalview.util.MappingUtils; + +/** + * A class that provides selective parsing of the EMBL flatfile 
format.
+ * <p>
+ * The initial implementation is limited to extracting fields used by Jalview
+ * after fetching an EMBL or EMBLCDS entry:
+ *
+ * <pre>
+ * accession, version, sequence, xref
+ * and (for CDS feature) location, protein_id, product, codon_start, translation
+ * </pre>
+ *
+ * For a complete parser, it may be best to adopt that provided in
+ * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
+ * (but note this has a dependency on the Apache Commons library)
+ *
+ * @author gmcarstairs
+ * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
+ * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
+ */
+public class EmblFlatFile extends AlignFile // FileParse
+{
+  private static final String WHITESPACE = "\\s+";
+
+  // database name supplied to the constructor; used to prefix the sequence name
+  private String sourceDb;
+
+  /*
+   * values parsed from the EMBL flatfile record
+   */
+  private String accession; // from ID (first token)
+
+  private String version; // from ID (second token)
+
+  private int length = 128; // from ID (7th token), with usable default
+
+  private List<DBRefEntry> dbrefs; // from DR and also CDS /db_xref qualifiers
+
+  private String sequenceString; // from SQ lines
+
+  private String translation; // from CDS feature /translation
+
+  private String cdsLocation; // CDS /location raw value
+
+  private int codonStart = 1; // from CDS /codon_start
+
+  private String proteinName; // from CDS /product
+
+  private String proteinId; // from CDS /protein_id
+
+  private Map<String, String> cdsProps; // CDS other qualifiers e.g. 'note'
+
+  /**
+   * Constructor. The input is not parsed here; call {@link #parse()} to read
+   * the record.
+   *
+   * @param fp
+   *          the data source to read the flatfile from
+   * @param sourceId
+   *          the source database name, saved for use when the sequence is
+   *          assembled
+   * @throws IOException
+   */
+  public EmblFlatFile(FileParse fp, String sourceId) throws IOException
+  {
+    super(false, fp); // don't parse immediately
+    this.sourceDb = sourceId;
+    dbrefs = new ArrayList<>();
+    cdsProps = new Hashtable<>();
+  }
+
+  /**
+   * Parses the flatfile line by line, dispatching on the two-letter line code
+   * (ID, DR, SQ, FT; all other lines are skipped), and then assembles one
+   * annotated sequence from the values read. The sequence is added to this
+   * file's sequences (retrievable, for example, with
+   * {@code getSeqsAsArray()}).
+   *
+   * @throws IOException
+   */
+  public void parse() throws IOException
+  {
+    String line = nextLine();
+    while (line != null)
+    {
+      /*
+       * each handler consumes its own line(s) and returns the first
+       * line it did not process, so nextLine() is only called here for
+       * line types that are not of interest
+       */
+      if (line.startsWith("ID"))
+      {
+        line = processID(line);
+      }
+      else if (line.startsWith("DR"))
+      {
+        line = processDR(line);
+      }
+      else if (line.startsWith("SQ"))
+      {
+        line = processSQ();
+      }
+      else if (line.startsWith("FT"))
+      {
+        line = processFT(line);
+      }
+      else
+      {
+        line = nextLine();
+      }
+    }
+    assembleSequence();
+  }
+
+  /**
+   * Extracts and saves the primary accession and version (SV value) from an ID
+   * line, or null if not found. Returns the next line after the one processed.
+   * The length (seventh token) is also saved if it can be parsed; otherwise
+   * the previous value is kept and an error is logged.
+   *
+   * @param line
+   *          an ID line; fields are separated by ';'
+   * @return the next line of input, or null at end of input
+   * @throws IOException
+   */
+  String processID(String line) throws IOException
+  {
+    String[] tokens = line.substring(2).split(";");
+
+    /*
+     * first is primary accession; String.split returns an empty array
+     * if the text consists only of separators (e.g. "ID;"), so the
+     * length has to be checked before reading tokens[0]
+     */
+    if (tokens.length > 0)
+    {
+      String token = tokens[0].trim();
+      if (!token.isEmpty())
+      {
+        this.accession = token;
+      }
+    }
+
+    /*
+     * second token is 'SV versionNo'
+     */
+    if (tokens.length > 1)
+    {
+      String token = tokens[1].trim();
+      if (token.startsWith("SV"))
+      {
+        String[] bits = token.split(WHITESPACE);
+        this.version = bits[bits.length - 1];
+      }
+    }
+
+    /*
+     * seventh token is 'length BP'
+     */
+    if (tokens.length > 6)
+    {
+      String token = tokens[6].trim();
+      String[] bits = token.split(WHITESPACE);
+      try
+      {
+        this.length = Integer.parseInt(bits[0]);
+      } catch (NumberFormatException e)
+      {
+        Cache.log.error("bad length read in flatfile, line: " + line);
+      }
+    }
+
+    return nextLine();
+  }
+
+  /**
+   * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
+   * the line following the line processed.
+   * <p>
+   * The first ';'-separated field is taken as the database name and the second
+   * as the accession (with any trailing '.' removed); the entry is saved with
+   * version "0". A line with fewer than two fields is skipped.
+   *
+   * @param line
+   *          a DR line
+   * @return the next line of input, or null at end of input
+   * @throws IOException
+   */
+  String processDR(String line) throws IOException
+  {
+    String[] tokens = line.substring(2).split(";");
+    if (tokens.length > 1)
+    {
+      String db = tokens[0].trim();
+      String acc = tokens[1].trim();
+      if (acc.endsWith("."))
+      {
+        acc = acc.substring(0, acc.length() - 1);
+      }
+      this.dbrefs.add(new DBRefEntry(db, "0", acc));
+    }
+
+    return nextLine();
+  }
+
+  /**
+   * Reads and saves the sequence, read from the lines following the SQ line.
+   * Whitespace and position counters are discarded. Returns the next line
+   * following the sequence data (the next line that doesn't start with
+   * whitespace).
+   *
+   * @return the first line after the sequence data, or null at end of input
+   * @throws IOException
+   */
+  String processSQ() throws IOException
+  {
+    // guard against a negative length parsed from the ID line
+    StringBuilder sb = new StringBuilder(Math.max(0, this.length));
+    String line = nextLine();
+    while (line != null && line.startsWith(" "))
+    {
+      line = line.trim();
+      String[] blocks = line.split(WHITESPACE);
+
+      /*
+       * omit the last block (position counter) on each line
+       */
+      for (int i = 0; i < blocks.length - 1; i++)
+      {
+        sb.append(blocks[i]);
+      }
+      line = nextLine();
+    }
+    this.sequenceString = sb.toString();
+
+    return line;
+  }
+
+  /**
+   * Processes an FT line. If it declares a feature type of interest (currently,
+   * only CDS is processed), processes all of the associated lines (feature
+   * qualifiers), and returns the next line after that, otherwise simply returns
+   * the next line.
+   * <p>
+   * Only the location text on the CDS line itself is saved. Following lines
+   * that hold no {@code /qualifier=value} (for example a wrapped location or
+   * the continuation of a multi-line quoted value) are logged and skipped.
+   *
+   * @param line
+   *          an FT line
+   * @return the first line not consumed by this method, or null at end of
+   *         input
+   * @throws IOException
+   */
+  String processFT(String line) throws IOException
+  {
+    String[] tokens = line.split(WHITESPACE);
+    if (tokens.length < 3 || !"CDS".equals(tokens[1]))
+    {
+      return nextLine();
+    }
+
+    this.cdsLocation = tokens[2];
+
+    /*
+     * qualifier lines start with "FT" and at least 4 spaces; anything else
+     * (e.g. start of next feature "FT   source...") ends this feature
+     */
+    line = nextLine();
+    while (line != null && line.startsWith("FT    ")) // 4 spaces
+    {
+      /*
+       * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
+       */
+      int slashPos = line.indexOf('/');
+      int eqPos = slashPos == -1 ? -1 : line.indexOf('=', slashPos + 1);
+      if (eqPos == -1)
+      {
+        Cache.log.error("Unexpected EMBL line ignored: " + line);
+        line = nextLine();
+        continue;
+      }
+      String qualifier = line.substring(slashPos + 1, eqPos);
+      String value = line.substring(eqPos + 1);
+      // length check: a lone quote both starts and ends with '"'
+      if (value.length() > 1 && value.startsWith("\"")
+              && value.endsWith("\""))
+      {
+        value = value.substring(1, value.length() - 1);
+      }
+
+      if ("translation".equals(qualifier))
+      {
+        /*
+         * readTranslation has already read ahead to the first line that
+         * is not part of the translation, so do not call nextLine() again
+         * here or that line would be lost
+         */
+        line = readTranslation(value);
+        continue;
+      }
+
+      if ("protein_id".equals(qualifier))
+      {
+        proteinId = value;
+      }
+      else if ("codon_start".equals(qualifier))
+      {
+        try
+        {
+          codonStart = Integer.parseInt(value.trim());
+        } catch (NumberFormatException e)
+        {
+          Cache.log.error("Invalid codon_start in flatfile for "
+                  + this.accession + ": " + e.getMessage());
+        }
+      }
+      else if ("product".equals(qualifier))
+      {
+        // sometimes name is returned e.g. for V00488
+        proteinName = value;
+      }
+      else if (!"".equals(value))
+      {
+        // throw anything else into the additional properties hash
+        cdsProps.put(qualifier, value);
+      }
+      line = nextLine();
+    }
+
+    return line;
+  }
+
+  /**
+   * Reads and saves the CDS translation from one or more lines of the file, and
+   * returns the next line after that
+   *
+   * @param value
+   *          the first line of the translation (likely quoted)
+   * @return the first line that is not part of the translation, or null at end
+   *         of input
+   * @throws IOException
+   */
+  String readTranslation(String value) throws IOException
+  {
+    StringBuilder sb = new StringBuilder(Math.max(0, this.length) / 3 + 1);
+    sb.append(value.replace("\"", ""));
+
+    String line = nextLine();
+    while (line != null && line.startsWith("FT    ")) // 4 spaces
+    {
+      String[] tokens = line.split(WHITESPACE);
+      if (tokens.length < 2)
+      {
+        Cache.log.error("Ignoring bad EMBL line: " + line);
+        line = nextLine(); // skip the bad line
+        break;
+      }
+      if (tokens[1].startsWith("/"))
+      {
+        break; // next feature qualifier
+      }
+      sb.append(tokens[1].replace("\"", ""));
+      line = nextLine();
+    }
+    this.translation = sb.toString();
+
+    return line;
+  }
+
+  /**
+   * Processes the parsed CDS feature data to
+   *
    + *
  • add a CDS feature to the sequence for each CDS start-end range
  • + *
  • create a protein product sequence for the translation
  • + *
  • create a cross-reference to protein with mapping from dna
  • + *
  • add any CDS dbrefs to the sequence and to the protein product
  • + *
+ * @param SequenceI dna + */ + void processCDS(SequenceI dna) + { + /* + * parse location into a list of [start, end, start, end] positions + */ + int[] exons = getCdsRanges(this.accession, this.cdsLocation); + int exonNumber = 0; + + for (int xint = 0; exons != null + && xint < exons.length - 1; xint += 2) + { + int exonStart = exons[xint]; + int exonEnd = exons[xint + 1]; + int begin = Math.min(exonStart, exonEnd); + int end = Math.max(exonStart, exonEnd); + exonNumber++; + String desc = String.format("Exon %d for protein EMBLCDS:%s", + exonNumber, proteinId); + + SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb); + if (!cdsProps.isEmpty()) + { + for (Entry val : cdsProps.entrySet()) + { + sf.setValue(val.getKey(), val.getValue()); + } + } + + sf.setEnaLocation(this.cdsLocation); + boolean forwardStrand = exonStart <= exonEnd; + sf.setStrand(forwardStrand ? "+" : "-"); + sf.setPhase(String.valueOf(codonStart - 1)); + sf.setValue(FeatureProperties.EXONPOS, exonNumber); + sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + + dna.addSequenceFeature(sf); + } + } + + /** + * Constructs and saves the sequence from parsed components + */ + void assembleSequence() + { + String name = this.accession; + if (this.sourceDb != null) + { + name = this.sourceDb + "|" + name; + } + SequenceI seq = new Sequence(name, this.sequenceString); + for (DBRefEntry dbref : this.dbrefs) + { + seq.addDBRef(dbref); + } + + processCDS(seq); + seq.deriveSequence(); + + addSequence(seq); + } + + /** + * Output (print) is not implemented for EMBL flat file format + */ + @Override + public String print(SequenceI[] seqs, boolean jvsuffix) + { + return null; + } + + /** + * Returns the CDS location as a single array of [start, end, start, end...] + * positions. If on the reverse strand, these will be in descending order. 
+   * An empty array is returned if the location is null or cannot be parsed
+   * (a parse failure is logged as a warning).
+   *
+   * @param accession
+   *          used only in the warning message
+   * @param location
+   *          the raw CDS location text, may be null
+   * @return the flattened ranges, never null
+   */
+  protected int[] getCdsRanges(String accession, String location)
+  {
+    if (location == null)
+    {
+      return new int[] {};
+    }
+
+    try
+    {
+      List<int[]> ranges = DnaUtils.parseLocation(location);
+      return MappingUtils.listToArray(ranges);
+    } catch (ParseException e)
+    {
+      Cache.log.warn(
+              String.format("Not parsing inexact CDS location %s in ENA %s",
+                      location, accession));
+      return new int[] {};
+    }
+  }
+}
diff --git a/src/jalview/util/MappingUtils.java b/src/jalview/util/MappingUtils.java index b552c21..915293e 100644 --- a/src/jalview/util/MappingUtils.java +++ b/src/jalview/util/MappingUtils.java @@ -1020,4 +1020,23 @@ public final class MappingUtils } } }
+
+  /**
+   * Converts a list of [start, end] ranges to a single array of [start, end,
+   * start, end ...], preserving the order of the list
+   *
+   * @param ranges list of two-element ranges; must not be null
+   * @return an array of length {@code 2 * ranges.size()}
+   */
+  public static int[] listToArray(List<int[]> ranges)
+  {
+    int[] result = new int[ranges.size() * 2];
+    int i = 0;
+    for (int[] range : ranges)
+    {
+      result[i++] = range[0];
+      result[i++] = range[1];
+    }
+    return result;
+  }
 }
diff --git a/src/jalview/ws/dbsources/EmblCdsSource.java b/src/jalview/ws/dbsources/EmblCdsSource.java index a73af61..7455e4f 100644 --- a/src/jalview/ws/dbsources/EmblCdsSource.java +++ b/src/jalview/ws/dbsources/EmblCdsSource.java @@ -20,12 +20,12 @@ */ package jalview.ws.dbsources; +import com.stevesoft.pat.Regex; + import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefSource; -import com.stevesoft.pat.Regex; - -public class EmblCdsSource extends EmblXmlSource +public class EmblCdsSource extends /*EmblXmlSource */ EmblFlatfileSource { public EmblCdsSource() @@ -34,31 +34,12 @@ public class EmblCdsSource extends EmblXmlSource } @Override - public String getAccessionSeparator() - { - return null; - } - - @Override - public Regex getAccessionValidator() - { - return new Regex("^[A-Z]+[0-9]+"); - } - - @Override public String getDbSource() { return DBRefSource.EMBLCDS; } @Override -
public String getDbVersion() - { - return "0"; // TODO : this is dynamically set for a returned record - not - // tied to proxy - } - - @Override public AlignmentI getSequenceRecords(String queries) throws Exception { if (queries.indexOf(".") > -1) @@ -68,15 +49,6 @@ public class EmblCdsSource extends EmblXmlSource return getEmblSequenceRecords(DBRefSource.EMBLCDS, queries); } - @Override - public boolean isValidReference(String accession) - { - // most embl CDS refs look like .. - // TODO: improve EMBLCDS regex - return (accession == null || accession.length() < 2) ? false - : getAccessionValidator().search(accession); - } - /** * cDNA for LDHA_CHICK swissprot sequence */ @@ -92,10 +64,4 @@ public class EmblCdsSource extends EmblXmlSource return "EMBL (CDS)"; } - @Override - public int getTier() - { - return 0; - } - } diff --git a/src/jalview/ws/dbsources/EmblFlatfileSource.java b/src/jalview/ws/dbsources/EmblFlatfileSource.java new file mode 100644 index 0000000..2353f22 --- /dev/null +++ b/src/jalview/ws/dbsources/EmblFlatfileSource.java @@ -0,0 +1,115 @@ +package jalview.ws.dbsources; + +import java.io.File; +import java.io.IOException; + +import com.stevesoft.pat.Regex; + +import jalview.bin.Cache; +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.SequenceI; +import jalview.io.DataSourceType; +import jalview.io.EmblFlatFile; +import jalview.io.FileParse; +import jalview.ws.ebi.EBIFetchClient; + +/** + * A class that does partial parsing of an EMBL flatfile. 
+ *
+ * @author gmcarstairs
+ *
+ */
+public abstract class EmblFlatfileSource extends EbiFileRetrievedProxy
+{
+  // NOTE(review): one Regex instance is shared by all callers of
+  // getAccessionValidator(); confirm that Regex.search() is safe to call
+  // concurrently on a shared instance
+  private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+");
+
+  @Override
+  public String getDbVersion()
+  {
+    return "0";
+  }
+
+  @Override
+  public String getAccessionSeparator()
+  {
+    return null;
+  }
+
+  @Override
+  public Regex getAccessionValidator()
+  {
+    return ACCESSION_REGEX;
+  }
+
+  /**
+   * Answers true if the accession is at least two characters long and is
+   * matched by the accession validator (one or more upper case letters
+   * followed by one or more digits, at the start of the text).
+   */
+  @Override
+  public boolean isValidReference(String accession)
+  {
+    if (accession == null || accession.length() < 2)
+    {
+      return false;
+    }
+    return getAccessionValidator().search(accession);
+  }
+
+  /**
+   * This base implementation fetches nothing and returns null; the EMBL and
+   * EMBLCDS sources in this patch override it and call
+   * {@code getEmblSequenceRecords} with their own database name.
+   */
+  @Override
+  public AlignmentI getSequenceRecords(String queries) throws Exception
+  {
+    return null;
+  }
+
+  @Override
+  public int getTier()
+  {
+    return 0;
+  }
+
+  /**
+   * Fetches the flatfile record for the query from the EBI fetch client and
+   * parses it into an alignment.
+   *
+   * @param dbName
+   *          the database name (lower-cased to build the fetch id)
+   * @param query
+   *          the accession to fetch (surrounding whitespace is trimmed)
+   * @return an alignment holding the parsed sequence(s), or null if no reply
+   *         file was produced or no sequence was parsed from it
+   * @throws Exception
+   *           if the fetch fails (wrapping the cause), or if the reply cannot
+   *           be read
+   */
+  protected AlignmentI getEmblSequenceRecords(String dbName, String query)
+          throws Exception
+  {
+    startQuery();
+    EBIFetchClient dbFetch = new EBIFetchClient();
+    File reply;
+    try
+    {
+      reply = dbFetch.fetchDataAsFile(
+              dbName.toLowerCase() + ":" + query.trim(), null, "txt");
+    } catch (Exception e)
+    {
+      stopQuery();
+      throw new Exception(
+              String.format("EBI EMBL flatfile retrieval failed for %s:%s",
+                      dbName.toLowerCase(), query.trim()),
+              e);
+    }
+    return getEmblSequenceRecords(dbName, query, reply);
+  }
+
+  /**
+   * Parses the reply file (if it exists) as an EMBL flatfile and wraps any
+   * sequences found in an alignment. The query is always marked as stopped
+   * before returning, including when parsing throws an exception.
+   *
+   * @param dbName
+   * @param query
+   * @param reply
+   *          the file holding the fetched record, may be null
+   * @return the alignment, or null if there is no reply or no sequence
+   * @throws IOException
+   */
+  private AlignmentI getEmblSequenceRecords(String dbName, String query,
+          File reply) throws IOException
+  {
+    AlignmentI al = null;
+
+    try
+    {
+      if (reply != null && reply.exists())
+      {
+        file = reply.getAbsolutePath();
+        FileParse fp = new FileParse(file, DataSourceType.FILE);
+        EmblFlatFile emblParser = new EmblFlatFile(fp, getDbSource());
+        emblParser.parse();
+        SequenceI[] seqs = emblParser.getSeqsAsArray();
+        if (seqs.length > 0)
+        {
+          al = new Alignment(seqs);
+        }
+
+        if (al == null)
+        {
+          Cache.log.error(
+                  "No record found for '" + dbName + ":" + query + "'");
+        }
+      }
+    } finally
+    {
+      // previously skipped if the parser threw an IOException
+      stopQuery();
+    }
+    return al;
+  }
+}
diff --git a/src/jalview/ws/dbsources/EmblSource.java
b/src/jalview/ws/dbsources/EmblSource.java index 6bbe2e1..4cff4a0 100644 --- a/src/jalview/ws/dbsources/EmblSource.java +++ b/src/jalview/ws/dbsources/EmblSource.java @@ -23,13 +23,11 @@ package jalview.ws.dbsources; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefSource; -import com.stevesoft.pat.Regex; - /** * @author JimP * */ -public class EmblSource extends EmblXmlSource +public class EmblSource extends /* EmblXmlSource */ EmblFlatfileSource { public EmblSource() @@ -40,29 +38,6 @@ public class EmblSource extends EmblXmlSource /* * (non-Javadoc) * - * @see jalview.ws.DbSourceProxy#getAccessionSeparator() - */ - @Override - public String getAccessionSeparator() - { - // TODO Auto-generated method stub - return null; - } - - /* - * (non-Javadoc) - * - * @see jalview.ws.DbSourceProxy#getAccessionValidator() - */ - @Override - public Regex getAccessionValidator() - { - return new Regex("^[A-Z]+[0-9]+"); - } - - /* - * (non-Javadoc) - * * @see jalview.ws.DbSourceProxy#getDbSource() */ @Override @@ -74,18 +49,6 @@ public class EmblSource extends EmblXmlSource /* * (non-Javadoc) * - * @see jalview.ws.DbSourceProxy#getDbVersion() - */ - @Override - public String getDbVersion() - { - // TODO Auto-generated method stub - return "0"; - } - - /* - * (non-Javadoc) - * * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[]) */ @Override @@ -94,21 +57,6 @@ public class EmblSource extends EmblXmlSource return getEmblSequenceRecords(DBRefSource.EMBL, queries); } - /* - * (non-Javadoc) - * - * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String) - */ - @Override - public boolean isValidReference(String accession) - { - // most embl refs look like .. - - return (accession == null || accession.length() < 2) ? 
false - : getAccessionValidator().search(accession); - - } - /** * return LHD_CHICK coding gene */ @@ -123,10 +71,4 @@ public class EmblSource extends EmblXmlSource { return "EMBL"; // getDbSource(); } - - @Override - public int getTier() - { - return 0; - } } diff --git a/src/jalview/ws/dbsources/EmblXmlSource.java b/src/jalview/ws/dbsources/EmblXmlSource.java index a420d9f..6b6f2ec 100644 --- a/src/jalview/ws/dbsources/EmblXmlSource.java +++ b/src/jalview/ws/dbsources/EmblXmlSource.java @@ -20,6 +20,27 @@ */ package jalview.ws.dbsources; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Hashtable; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import javax.xml.bind.JAXBContext; +import javax.xml.bind.JAXBElement; +import javax.xml.bind.JAXBException; +import javax.xml.stream.FactoryConfigurationError; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; + +import com.stevesoft.pat.Regex; + import jalview.analysis.SequenceIdMatcher; import jalview.bin.Cache; import jalview.datamodel.Alignment; @@ -35,33 +56,23 @@ import jalview.util.DBRefUtils; import jalview.util.DnaUtils; import jalview.util.MapList; import jalview.util.MappingUtils; -import jalview.util.MessageManager; import jalview.ws.ebi.EBIFetchClient; import jalview.xml.binding.embl.EntryType; import jalview.xml.binding.embl.EntryType.Feature; import jalview.xml.binding.embl.EntryType.Feature.Qualifier; +import jalview.xml.binding.embl.ROOT; import jalview.xml.binding.embl.XrefType; -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Hashtable; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; - 
-import javax.xml.bind.JAXBContext; -import javax.xml.bind.JAXBException; -import javax.xml.stream.FactoryConfigurationError; -import javax.xml.stream.XMLInputFactory; -import javax.xml.stream.XMLStreamException; -import javax.xml.stream.XMLStreamReader; - +/** + * Provides XML binding and parsing of EMBL or EMBLCDS records retrieved from + * (e.g.) {@code https://www.ebi.ac.uk/ena/data/view/x53828&display=xml}. + * + * @deprecated endpoint withdrawn August 2020 (JAL-3692), use EmblFlatfileSource + */ public abstract class EmblXmlSource extends EbiFileRetrievedProxy { + private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+"); + /* * JAL-1856 Embl returns this text for query not found */ @@ -96,9 +107,10 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy } catch (Exception e) { stopQuery(); - throw new Exception(MessageManager.formatMessage( - "exception.ebiembl_retrieval_failed_on", new String[] - { emprefx.toLowerCase(), query.trim() }), e); + throw new Exception( + String.format("EBI EMBL XML retrieval failed for %s:%s", + emprefx.toLowerCase(), query.trim()), + e); } return getEmblSequenceRecords(emprefx, query, reply); } @@ -180,8 +192,9 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy XMLStreamReader streamReader = XMLInputFactory.newInstance() .createXMLStreamReader(is); javax.xml.bind.Unmarshaller um = jc.createUnmarshaller(); - jalview.xml.binding.embl.ROOT root = (jalview.xml.binding.embl.ROOT) um - .unmarshal(streamReader); + JAXBElement rootElement = um.unmarshal(streamReader, + ROOT.class); + ROOT root = rootElement.getValue(); /* * document root contains either "entry" or "entrySet" @@ -614,8 +627,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy && dnaToProteinMapping.getTo() != null) { DBRefEntry dnaToEmblProteinRef = new DBRefEntry( - DBRefSource.EMBLCDSProduct, sequenceVersion, - proteinId); + DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId); 
dnaToEmblProteinRef.setMap(dnaToProteinMapping); dnaToProteinMapping.setMappedFromId(proteinId); dna.addDBRef(dnaToEmblProteinRef); @@ -644,7 +656,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy { return new int[] {}; } - + try { List ranges = DnaUtils.parseLocation(location); @@ -708,6 +720,40 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy return sf; } + @Override + public String getAccessionSeparator() + { + return null; + } + + @Override + public Regex getAccessionValidator() + { + return ACCESSION_REGEX; + } + + @Override + public String getDbVersion() + { + return "0"; + } + + @Override + public int getTier() + { + return 0; + } + + @Override + public boolean isValidReference(String accession) + { + if (accession == null || accession.length() < 2) + { + return false; + } + return getAccessionValidator().search(accession); + } + /** * Truncates (if necessary) the exon intervals to match 3 times the length of * the protein; also accepts 3 bases longer (for stop codon not included in @@ -726,7 +772,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy } int expectedCdsLength = proteinLength * 3; int exonLength = MappingUtils.getLength(Arrays.asList(exon)); - + /* * if exon length matches protein, or is shorter, or longer by the * length of a stop codon (3 bases), then leave it unchanged @@ -736,7 +782,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy { return exon; } - + int origxon[]; int sxpos = -1; int endxon = 0; @@ -756,7 +802,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy // .println("Truncating final exon interval on region by " // + (cdspos - cdslength)); } - + /* * shrink the final exon - reduce end position if forward * strand, increase it if reverse @@ -772,7 +818,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy break; } } - + if (sxpos != -1) { // and trim the exon interval set if necessary diff --git 
a/src/jalview/ws/ebi/EBIFetchClient.java b/src/jalview/ws/ebi/EBIFetchClient.java index 07a9df4..9a77087 100644 --- a/src/jalview/ws/ebi/EBIFetchClient.java +++ b/src/jalview/ws/ebi/EBIFetchClient.java @@ -27,6 +27,7 @@ import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; +import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; @@ -91,7 +92,7 @@ public class EBIFetchClient * the query formatted as db:query1;query2;query3 * @param format * the format wanted - * @param extension + * @param ext * for the temporary file to hold response (without separator) * @return the file holding the response * @throws OutOfMemoryError @@ -201,7 +202,8 @@ public class EBIFetchClient { // long time = System.currentTimeMillis(); String url = buildUrl(ids, database, format); - + InputStream is = null; + BufferedReader br = null; try { URL rcall = new URL(url); @@ -213,7 +215,7 @@ public class EBIFetchClient System.err.println("Warning: response code " + responseCode + " for " + url); } - InputStream is = new BufferedInputStream(conn.getInputStream()); + is = new BufferedInputStream(conn.getInputStream()); if (outFile != null) { FileOutputStream fio = new FileOutputStream(outFile); @@ -228,7 +230,7 @@ public class EBIFetchClient } else { - BufferedReader br = new BufferedReader(new InputStreamReader(is)); + br = new BufferedReader(new InputStreamReader(is)); String rtn; List arl = new ArrayList(); while ((rtn = br.readLine()) != null) @@ -257,6 +259,24 @@ public class EBIFetchClient { // System.err.println("EBIFetch took " + (System.currentTimeMillis() - // time) + " ms"); + if (is != null) + { + try + { + is.close(); + } catch (IOException e) + { + } + } + if (br != null) + { + try + { + br.close(); + } catch (IOException e) + { + } + } } return null; } @@ -275,8 +295,9 @@ public class EBIFetchClient if (database.equalsIgnoreCase(DBRefSource.EMBL) || 
database.equalsIgnoreCase(DBRefSource.EMBLCDS)) { - url = "https://www.ebi.ac.uk/ena/data/view/" + ids.toLowerCase() - + (format != null ? "&" + format : ""); +// url = "https://www.ebi.ac.uk/ena/data/view/" + ids.toLowerCase() +// + (format != null ? "&" + format : ""); + url = "https://www.ebi.ac.uk/ena/browser/api/embl/" + ids.toLowerCase(); } else { diff --git a/test/jalview/util/MappingUtilsTest.java b/test/jalview/util/MappingUtilsTest.java index 097ccd4..dd789d6 100644 --- a/test/jalview/util/MappingUtilsTest.java +++ b/test/jalview/util/MappingUtilsTest.java @@ -24,6 +24,7 @@ import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertFalse; import static org.testng.AssertJUnit.assertSame; import static org.testng.AssertJUnit.assertTrue; +import static org.testng.AssertJUnit.fail; import jalview.api.AlignViewportI; import jalview.commands.EditCommand; @@ -1284,4 +1285,33 @@ public class MappingUtilsTest assertEquals(1, ranges.size()); assertEquals(9, ranges.get(0)[1]); } + + @Test(groups = "Functional") + public void testListToArray() + { + List ranges = new ArrayList<>(); + + int[] result = MappingUtils.listToArray(ranges); + assertEquals(result.length, 0); + ranges.add(new int[] {24, 12}); + result = MappingUtils.listToArray(ranges); + assertEquals(result.length, 2); + assertEquals(result[0], 24); + assertEquals(result[1], 12); + ranges.add(new int[] {-7, 30}); + result = MappingUtils.listToArray(ranges); + assertEquals(result.length, 4); + assertEquals(result[0], 24); + assertEquals(result[1], 12); + assertEquals(result[2], -7); + assertEquals(result[3], 30); + try + { + MappingUtils.listToArray(null); + fail("Expected exception"); + } catch (NullPointerException e) + { + // expected + } + } } diff --git a/test/jalview/ws/dbsources/EmblSourceTest.java b/test/jalview/ws/dbsources/EmblXmlSourceTest.java similarity index 93% rename from test/jalview/ws/dbsources/EmblSourceTest.java rename to 
test/jalview/ws/dbsources/EmblXmlSourceTest.java index d450495..5f288a8 100644 --- a/test/jalview/ws/dbsources/EmblSourceTest.java +++ b/test/jalview/ws/dbsources/EmblXmlSourceTest.java @@ -26,6 +26,7 @@ import static org.testng.AssertJUnit.assertNull; import static org.testng.AssertJUnit.assertSame; import static org.testng.AssertJUnit.assertTrue; +import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.SequenceI; @@ -40,9 +41,10 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -public class EmblSourceTest +public class EmblXmlSourceTest { // adapted from http://www.ebi.ac.uk/ena/data/view/X07547&display=xml @@ -95,16 +97,49 @@ public class EmblSourceTest + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT" + ""; + private EmblXmlSource testee; + + @BeforeClass(alwaysRun = true) + public void setUp() + { + testee = new EmblXmlSource() + { + + @Override + public String getDbSource() + { + return null; + } + + @Override + public String getDbName() + { + return null; + } + + @Override + public String getTestQuery() + { + return null; + } + + @Override + public AlignmentI getSequenceRecords(String queries) throws Exception + { + return null; + } + }; + } + @Test(groups = "Functional") public void testGetCdsRanges() { - EmblSource testee = new EmblSource(); - /* * Make a (CDS) Feature with 5 locations */ Feature cds = new Feature(); - cds.setLocation("join(10..20,complement(30..40),50..60,70..80,complement(110..120))"); + cds.setLocation( + "join(10..20,complement(30..40),50..60,70..80,complement(110..120))"); int[] exons = testee.getCdsRanges("EMBL", cds); assertEquals("[10, 20, 40, 30, 50, 60, 70, 80, 120, 110]", @@ -116,10 +151,9 @@ public class EmblSourceTest { // not the whole sequence but enough for this test... 
List peptides = new ArrayList<>(); - List entries = EmblSourceTest.getEmblEntries(); + List entries = getEmblEntries(); assertEquals(1, entries.size()); EntryType entry = entries.get(0); - EmblSource testee = new EmblSource(); String sourceDb = "EMBL"; SequenceI dna = testee.getSequence(sourceDb, entry, peptides); @@ -165,8 +199,9 @@ public class EmblSourceTest 3, 1); MapList cds2Map = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 }, 3, 1); - MapList cds3Map = new MapList(new int[] { 4, 6, 10, 15 }, new int[] { - 1, 3 }, 3, 1); + MapList cds3Map = new MapList(new int[] { 4, 6, 10, 15 }, + new int[] + { 1, 3 }, 3, 1); DBRefEntry[] dbrefs = dna.getDBRefs(); assertEquals(7, dbrefs.length); @@ -222,10 +257,12 @@ public class EmblSourceTest * - to EMBLCDS (with 1:3 mapping) * - direct (no mapping) to other protein accessions */ - MapList proteinToCdsMap1 = new MapList(new int[] { 1, 4 }, new int[] { - 1, 12 }, 1, 3); - MapList proteinToCdsMap2 = new MapList(new int[] { 1, 3 }, new int[] { - 1, 9 }, 1, 3); + MapList proteinToCdsMap1 = new MapList(new int[] { 1, 4 }, + new int[] + { 1, 12 }, 1, 3); + MapList proteinToCdsMap2 = new MapList(new int[] { 1, 3 }, + new int[] + { 1, 9 }, 1, 3); // dbrefs for first CDS EMBL product CAA30420.1 dbrefs = peptides.get(0).getDBRefs(); @@ -339,10 +376,10 @@ public class EmblSourceTest @Test(groups = { "Functional" }) public void testGetEmblEntries() { - List entries = EmblSourceTest.getEmblEntries(); + List entries = getEmblEntries(); assertEquals(1, entries.size()); EntryType entry = entries.get(0); - + assertEquals("X07547", entry.getAccession()); assertEquals("C. 
trachomatis plasmid", entry.getDescription()); assertEquals("STD", entry.getDataClass()); @@ -359,7 +396,7 @@ public class EmblSourceTest assertEquals(2, entry.getKeyword().size()); assertEquals("plasmid", entry.getKeyword().get(0)); assertEquals("unidentified reading frame", entry.getKeyword().get(1)); - + /* * dbrefs */ @@ -372,7 +409,7 @@ public class EmblSourceTest assertEquals("MD5", dbref.getDb()); assertEquals("ac73317", dbref.getId()); assertNull(dbref.getSecondaryId()); - + /* * three sequence features for CDS */ @@ -403,7 +440,7 @@ public class EmblSourceTest q = ef.getQualifier().get(2); assertEquals("translation", q.getName()); assertEquals("MLCF", q.getValue()); - + /* * second CDS */ @@ -422,7 +459,7 @@ public class EmblSourceTest q = ef.getQualifier().get(1); assertEquals("translation", q.getName()); assertEquals("MSSS", q.getValue()); - + /* * third CDS */ @@ -438,16 +475,14 @@ public class EmblSourceTest q = ef.getQualifier().get(1); assertEquals("translation", q.getName()); assertEquals("MSS", q.getValue()); - + /* * Sequence - raw data before removal of newlines */ String seq = entry.getSequence(); - assertEquals( - "GGTATGTCCTCTAGTACAAAC\n" - + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT", - seq); - + assertEquals("GGTATGTCCTCTAGTACAAAC\n" + + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT", seq); + /* * getSequence() converts empty DBRefEntry.version to "0" */ @@ -455,9 +490,9 @@ public class EmblSourceTest assertNull(entry.getFeature().get(0).getXref().get(1).getSecondaryId()); } - static List getEmblEntries() + List getEmblEntries() { - return new EmblSource() + return testee .getEmblEntries(new ByteArrayInputStream(TESTDATA.getBytes())); } } -- 1.7.10.2