exception.invocation_target_calling_url = InvocationTargetException while calling openURL: {0}
exception.illegal_access_calling_url = IllegalAccessException while calling openURL: {0}
exception.interrupted_launching_browser = InterruptedException while launching browser: {0}
-exception.ebiembl_retrieval_failed_on = EBI EMBL XML retrieval failed on {0}:{1}
exception.no_pdb_records_for_chain = No PDB Records for {0} chain {1}
exception.unexpected_handling_rnaml_translation_for_pdb = Unexpected exception when handling RNAML translation of PDB data
exception.couldnt_recover_sequence_properties_for_alignment = Couldn't recover sequence properties for alignment
exception.invocation_target_calling_url = InvocationTargetException mientras se invocaba openURL: {0}
exception.illegal_access_calling_url = IllegalAccessException mientras se invocaba openURL: {0}
exception.interrupted_launching_browser = InterruptedException mientras se lanzaba el navegador: {0}
-exception.ebiembl_retrieval_failed_on = La recuperación de datos EBI EMBL XML ha fallado en {0}:{1}
exception.no_pdb_records_for_chain = No se han encontrado registros {0} para la cadena {1}
exception.unexpected_handling_rnaml_translation_for_pdb = Excepcion inesperada cuando se traducían a RNAML los datos PDB
exception.couldnt_recover_sequence_properties_for_alignment = No es posible recuperar las propiedades de la secuencia para el alineamiento
--- /dev/null
+package jalview.io;
+
+import java.io.IOException;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
+
+import jalview.bin.Cache;
+import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
+import jalview.datamodel.FeatureProperties;
+import jalview.datamodel.Mapping;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.DBRefUtils;
+import jalview.util.DnaUtils;
+import jalview.util.MapList;
+import jalview.util.MappingUtils;
+
+/**
+ * A class that provides selective parsing of the EMBL flatfile format.
+ * <p>
+ * The initial implementation is limited to extracting fields used by Jalview
+ * after fetching an EMBL or EMBLCDS entry:
+ *
+ * <pre>
+ * accession, version, sequence, xref
+ * and (for CDS feature) location, protein_id, product, codon_start, translation
+ * </pre>
+ *
+ * For a complete parser, it may be best to adopt that provided in
+ * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
+ * (but note this has a dependency on the Apache Commons library)
+ *
+ * @author gmcarstairs
+ * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
+ * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
+ */
+public class EmblFlatFile extends AlignFile // FileParse
+{
+ private static final String QUOTE = "\"";
+
+ private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
+
+ /**
+ * when true, interpret the mol_type 'source' feature attribute
+ * and generate an RNA sequence from the DNA record
+ */
+ private boolean produceRna=true;
+ /**
+  * Simple holder for the values extracted from a single CDS feature of the
+  * feature table (FT lines). Fields are populated while the feature's
+  * qualifiers are being read.
+  */
+ class CdsData
+ {
+   /** raw location string given on the CDS feature line */
+   String cdsLocation;
+
+   /** value of /codon_start; 1 unless the qualifier says otherwise */
+   int codonStart = 1;
+
+   /** value of /protein_id */
+   String proteinId;
+
+   /** value of /product; used as the protein description */
+   String proteinName;
+
+   /** value of /translation */
+   String translation;
+
+   /** cross-references built from /db_xref qualifiers */
+   List<DBRefEntry> xrefs = new ArrayList<>();
+
+   /** any other qualifiers of the CDS, keyed by qualifier name */
+   Map<String, String> cdsProps = new Hashtable<>();
+ }
+
+ private static final String WHITESPACE = "\\s+";
+
+ private String sourceDb;
+
+ /*
+ * values parsed from the EMBL flatfile record
+ */
+ private String accession; // from ID (first token)
+
+ private String version; // from ID (second token)
+
+ private String description; // from (first) DE line
+
+ private int length = 128; // from ID (7th token), with usable default
+
+ private List<DBRefEntry> dbrefs; // from DR
+
+ private boolean sequenceStringIsRNA=false;
+ private String sequenceString; // from SQ lines
+
+ /*
+ * parsed CDS data fields, keyed by protein_id
+ */
+ private Map<String, CdsData> cds;
+
+ /**
+  * Creates a parser over the given input without parsing it; call
+  * {@code parse()} to read the record.
+  *
+  * @param fp
+  *          the input to read the flatfile from
+  * @param sourceId
+  *          the source database name, saved for use in sequence names and
+  *          database references
+  * @throws IOException
+  */
+ public EmblFlatFile(FileParse fp, String sourceId) throws IOException
+ {
+   super(false, fp); // don't parse immediately
+
+   /*
+    * a TreeMap keeps the CDS entries in alphabetical (case-insensitive)
+    * order of protein_id, which is more readable
+    */
+   this.cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
+   this.dbrefs = new ArrayList<>();
+   this.sourceDb = sourceId;
+ }
+
+ /**
+ * Parses the flatfile, and if successful, saves as an annotated sequence
+ * which may be retrieved by calling {@code getSequence()}
+ *
+ * @throws IOException
+ */
+ public void parse() throws IOException
+ {
+ String line = nextLine();
+ while (line != null)
+ {
+ if (line.startsWith("ID"))
+ {
+ line = parseID(line);
+ }
+ else if (line.startsWith("DE"))
+ {
+ line = parseDE(line);
+ }
+ else if (line.startsWith("DR"))
+ {
+ line = parseDR(line);
+ }
+ else if (line.startsWith("SQ"))
+ {
+ line = parseSQ();
+ }
+ else if (line.startsWith("FT"))
+ {
+ line = parseFT(line);
+ }
+ else
+ {
+ line = nextLine();
+ }
+ }
+ buildSequence();
+ }
+
+ /**
+  * Reads the semicolon-separated fields of an ID line, saving
+  * <ul>
+  * <li>the primary accession (first field, if not empty)</li>
+  * <li>the version (last word of the second field, if it starts with
+  * "SV")</li>
+  * <li>the sequence length (first word of the seventh field, if it is a
+  * valid integer)</li>
+  * </ul>
+  * Values not found are left unchanged.
+  *
+  * @param line
+  *          the ID line
+  * @return the next line after the one processed
+  * @throws IOException
+  */
+ String parseID(String line) throws IOException
+ {
+   String[] fields = line.substring(2).split(";");
+
+   String primaryAccession = fields[0].trim();
+   if (!primaryAccession.isEmpty())
+   {
+     this.accession = primaryAccession;
+   }
+
+   if (fields.length > 1)
+   {
+     String versionField = fields[1].trim();
+     if (versionField.startsWith("SV"))
+     {
+       // 'SV versionNo' - take the last word
+       String[] words = versionField.split(WHITESPACE);
+       this.version = words[words.length - 1];
+     }
+   }
+
+   if (fields.length > 6)
+   {
+     // 'length BP' - take the first word
+     String[] words = fields[6].trim().split(WHITESPACE);
+     try
+     {
+       this.length = Integer.parseInt(words[0]);
+     } catch (NumberFormatException e)
+     {
+       Cache.log.error("bad length read in flatfile, line: " + line);
+     }
+   }
+
+   return nextLine();
+ }
+
+ /**
+  * Saves the text of the given DE line (minus any trailing period) as the
+  * sequence description, then skips over any DE lines that immediately
+  * follow it.
+  *
+  * @param line
+  *          the first DE line
+  * @return the first following line that is not a DE line, or null at end
+  *         of input
+  * @throws IOException
+  */
+ String parseDE(String line) throws IOException
+ {
+   String text = line.substring(2).trim();
+   this.description = text.endsWith(".")
+           ? text.substring(0, text.length() - 1)
+           : text;
+
+   // ignore any continuation DE lines
+   String next = nextLine();
+   while (next != null && next.startsWith("DE"))
+   {
+     next = nextLine();
+   }
+   return next;
+ }
+
+ /**
+  * Reads one DR line of the form {@code database; accession; ...} and saves
+  * it as a DBRefEntry with version "0". The database name is converted to
+  * its canonical form (e.g. UniProtKB/Swiss-Prot to UNIPROT) and any
+  * trailing period is removed from the accession. Lines with fewer than two
+  * fields are ignored.
+  *
+  * @param line
+  *          the DR line
+  * @return the next line after the one processed
+  * @throws IOException
+  */
+ String parseDR(String line) throws IOException
+ {
+   String[] fields = line.substring(2).split(";");
+   if (fields.length > 1)
+   {
+     String source = DBRefUtils.getCanonicalName(fields[0].trim());
+
+     String accessionId = fields[1].trim();
+     if (accessionId.endsWith("."))
+     {
+       accessionId = accessionId.substring(0, accessionId.length() - 1);
+     }
+
+     /*
+      * todo: a third field (secondary id), if present, is not used;
+      * it is not a version number
+      */
+     this.dbrefs.add(new DBRefEntry(source, "0", accessionId));
+   }
+
+   return nextLine();
+ }
+
+ /**
+  * Reads the sequence data lines that follow an SQ line, i.e. all
+  * consecutive lines that begin with a space. On each line the
+  * whitespace-separated blocks are concatenated, except the last block
+  * (the position counter), which is dropped. The result is saved as the
+  * sequence string.
+  *
+  * @return the first line after the sequence data, or null at end of input
+  * @throws IOException
+  */
+ String parseSQ() throws IOException
+ {
+   StringBuilder residues = new StringBuilder(this.length);
+   String line;
+   while ((line = nextLine()) != null && line.startsWith(" "))
+   {
+     String[] blocks = line.trim().split(WHITESPACE);
+     int residueBlocks = blocks.length - 1; // last block is the counter
+     for (int b = 0; b < residueBlocks; b++)
+     {
+       residues.append(blocks[b]);
+     }
+   }
+   this.sequenceString = residues.toString();
+
+   return line;
+ }
+
+ /**
+  * Processes an FT (feature table) line. Only 'CDS' and 'source' features
+  * are of interest; for any other feature key, or a line with fewer than
+  * three tokens, the next line is simply returned.
+  * <p>
+  * For a 'source' feature, processing is delegated to
+  * {@code parseSourceQualifiers}. For a 'CDS' feature, the location (which
+  * may be wrapped over several lines) and all following qualifier lines are
+  * read, and the data saved keyed by protein_id. A CDS with no protein_id is
+  * logged and discarded.
+  *
+  * @param line
+  *          the FT line declaring the feature key and location
+  * @return the next line after the feature processed (or null at end of
+  *         input)
+  * @throws IOException
+  */
+ String parseFT(String line) throws IOException
+ {
+   String[] tokens = line.split(WHITESPACE);
+   if (tokens.length < 3
+           || (!"CDS".equals(tokens[1]) && !"source".equals(tokens[1])))
+   {
+     return nextLine();
+   }
+
+   if ("source".equals(tokens[1]))
+   {
+     return parseSourceQualifiers(tokens);
+   }
+
+   CdsData data = new CdsData();
+
+   /*
+    * parse location - which may be over more than one line e.g. EAW51554;
+    * continuation lines are 'FT' + at least 4 spaces and hold no qualifier
+    * (no '/'), and are appended to the location without any separator
+    */
+   StringBuilder location = new StringBuilder(tokens[2]);
+   line = nextLine();
+   while (line != null && line.startsWith("FT    ")
+           && line.indexOf('/') == -1)
+   {
+     location.append(line.substring(2).trim());
+     line = nextLine();
+   }
+   data.cdsLocation = location.toString();
+
+   while (line != null)
+   {
+     /*
+      * qualifier lines start with 'FT' and at least 4 spaces, whereas a
+      * feature key line has only 3, e.g. "FT   source ..."
+      */
+     if (!line.startsWith("FT    "))
+     {
+       break;
+     }
+
+     /*
+      * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
+      * - the value may extend over more than one line
+      * - if the value has enclosing quotes, these are removed
+      * - escaped double quotes ("") are reduced to a single character
+      */
+     int slashPos = line.indexOf('/');
+     if (slashPos == -1)
+     {
+       Cache.log.error("Unexpected EMBL line ignored: " + line);
+       line = nextLine();
+       continue;
+     }
+     int eqPos = line.indexOf('=', slashPos + 1);
+     if (eqPos == -1)
+     {
+       // a qualifier with no value, e.g. /ribosomal_slippage
+       line = nextLine();
+       continue;
+     }
+     String qualifier = line.substring(slashPos + 1, eqPos);
+     String value = removeQuotes(line.substring(eqPos + 1));
+     StringBuilder sb = new StringBuilder().append(value);
+
+     // reads any continuation lines, and advances to the next qualifier
+     line = parseFeatureQualifier(sb, qualifier);
+     String featureValue = sb.toString();
+
+     if ("protein_id".equals(qualifier))
+     {
+       data.proteinId = featureValue;
+     }
+     else if ("codon_start".equals(qualifier))
+     {
+       try
+       {
+         data.codonStart = Integer.parseInt(featureValue.trim());
+       } catch (NumberFormatException e)
+       {
+         Cache.log.error("Invalid codon_start in flatfile for "
+                 + this.accession + ": " + e.getMessage());
+       }
+     }
+     else if ("db_xref".equals(qualifier))
+     {
+       // expect database:accession
+       String[] parts = featureValue.split(":");
+       if (parts.length == 2)
+       {
+         String db = DBRefUtils.getCanonicalName(parts[0].trim());
+         data.xrefs.add(new DBRefEntry(db, "0", parts[1].trim()));
+       }
+     }
+     else if ("product".equals(qualifier))
+     {
+       data.proteinName = featureValue;
+     }
+     else if ("translation".equals(qualifier))
+     {
+       data.translation = featureValue;
+     }
+     else if (!"".equals(featureValue))
+     {
+       // throw anything else into the additional properties hash
+       data.cdsProps.put(qualifier, featureValue);
+     }
+   }
+
+   if (data.proteinId != null)
+   {
+     this.cds.put(data.proteinId, data);
+   }
+   else
+   {
+     Cache.log.error("Ignoring CDS feature with no protein_id for "
+             + sourceDb + ":" + accession);
+   }
+
+   return line;
+ }
+
+ /**
+ * process attributes for 'source' until the next FT feature entry
+ * only interested in 'mol_type'
+ * @param tokens
+ * @return
+ * @throws IOException
+ */
+ private String parseSourceQualifiers(String[] tokens) throws IOException
+ {
+ if (!"source".equals(tokens[1]))
+ {
+ throw (new RuntimeException("Not given a source qualifier"));
+ }
+ // search for mol_type attribute
+
+ StringBuilder sb = new StringBuilder().append(tokens[2]); // extent of
+ // sequence
+
+ String line = parseFeatureQualifier(sb, "source");
+ while (line != null)
+ {
+ if (!line.startsWith("FT ")) // four spaces, end of this feature table
+ // entry
+ {
+ return line;
+ }
+
+ int p = line.indexOf("\\mol_type");
+ int qs = line.indexOf("\"", p);
+ int qe = line.indexOf("\"", qs + 1);
+ String qualifier=line.substring(qs,qe).toLowerCase();
+ if (qualifier.indexOf("rna") > -1)
+ {
+ sequenceStringIsRNA = true;
+ }
+ if (qualifier.indexOf("dna") > -1)
+ {
+ sequenceStringIsRNA = false;
+ }
+ line=parseFeatureQualifier(sb, "source");
+ }
+ return line;
+ }
+
+ /**
+  * Strips a single leading and/or trailing double quote (") from the value,
+  * leaving doubled quotes at either end in place, and then replaces every
+  * doubled quote ("") with a single one. This follows the Feature Table
+  * specification for Qualifiers, Free Text.
+  *
+  * @param value
+  *          the text to process (may be null)
+  * @return the processed text, or null if the value is null
+  */
+ static String removeQuotes(String value)
+ {
+   if (value == null)
+   {
+     return null;
+   }
+
+   String text = value;
+
+   boolean hasOpeningQuote = text.startsWith(QUOTE)
+           && !text.startsWith(DOUBLED_QUOTE);
+   if (hasOpeningQuote)
+   {
+     text = text.substring(1);
+   }
+
+   boolean hasClosingQuote = text.endsWith(QUOTE)
+           && !text.endsWith(DOUBLED_QUOTE);
+   if (hasClosingQuote)
+   {
+     text = text.substring(0, text.length() - 1);
+   }
+
+   return text.replace(DOUBLED_QUOTE, QUOTE);
+ }
+
+ /**
+  * Reads any continuation lines of the value of a feature (FT) qualifier,
+  * and returns the next line after that. Values are appended to the string
+  * buffer, which should be already primed with the value read from the
+  * first line for the qualifier (with any leading double quote removed).
+  * The whole text of each continuation line is appended (not just its first
+  * word), with enclosing double quotes removed and escaped (repeated) double
+  * quotes reduced to one only. For example for
+  *
+  * <pre>
+  * FT                   /note="gene_id=hCG28070.3
+  * FT                   ""foobar"" isoform=CRA_b"
+  * the returned value is
+  * gene_id=hCG28070.3 "foobar" isoform=CRA_b
+  * </pre>
+  *
+  * Reading stops at the first line that is not a qualifier line ('FT' and
+  * at least 4 spaces), that has no content, or that starts a new qualifier
+  * ('/').
+  * <p>
+  * Note the side-effect of this method, to advance data reading to the next
+  * line after the feature qualifier.
+  *
+  * @param sb
+  *          a string buffer primed with the first line of the value
+  * @param qualifierName
+  *          the qualifier name; a space is inserted at each line join
+  *          unless this is "translation"
+  * @return the line at which reading stopped, or null at end of input
+  * @throws IOException
+  */
+ String parseFeatureQualifier(StringBuilder sb, String qualifierName)
+         throws IOException
+ {
+   String line;
+   while ((line = nextLine()) != null)
+   {
+     if (!line.startsWith("FT    "))
+     {
+       break; // reached next feature or other input line
+     }
+
+     // the line content after the 'FT' code and its padding
+     String content = line.substring(2).trim();
+     if (content.isEmpty())
+     {
+       Cache.log.error("Ignoring bad EMBL line for " + this.accession
+               + ": " + line);
+       break;
+     }
+     if (content.startsWith("/"))
+     {
+       break; // next feature qualifier
+     }
+
+     /*
+      * heuristic rule: most multi-line value (e.g. /product) are text,
+      * so add a space for word boundary at a new line; not for translation
+      */
+     if (!"translation".equals(qualifierName))
+     {
+       sb.append(" ");
+     }
+
+     /*
+      * remove trailing " and unescape doubled ""
+      */
+     sb.append(removeQuotes(content));
+   }
+
+   return line;
+ }
+
+ /**
+  * Builds the sequence from the values parsed from the flatfile and adds it
+  * to this file's sequences. Nothing is built (and an error is logged) if
+  * either the accession or the sequence data was not found.
+  * <p>
+  * The sequence is named as sourceDb|accession (or just accession if there
+  * is no source database), carries a database reference to itself and one
+  * for each DR line, and has CDS features and protein cross-references
+  * added for each CDS parsed.
+  */
+ void buildSequence()
+ {
+   if (this.accession == null || this.sequenceString == null)
+   {
+     Cache.log.error("Failed to parse data from EMBL");
+     return;
+   }
+
+   String name = this.sourceDb == null ? this.accession
+           : this.sourceDb + "|" + this.accession;
+
+   // optionally convert the DNA record to RNA
+   if (produceRna && sequenceStringIsRNA)
+   {
+     sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
+   }
+
+   SequenceI sequence = new Sequence(name, this.sequenceString);
+   sequence.setDescription(this.description);
+
+   // a DBRef to itself, mapped over the whole sequence
+   int[] wholeSequence = new int[] { 1, sequence.getLength() };
+   DBRefEntry self = new DBRefEntry(sourceDb, version, accession);
+   self.setMap(new Mapping(null, wholeSequence, wholeSequence, 1, 1));
+   sequence.addDBRef(self);
+
+   // cross-references read from DR lines
+   for (DBRefEntry xref : this.dbrefs)
+   {
+     sequence.addDBRef(xref);
+   }
+
+   processCDSFeatures(sequence);
+
+   sequence.deriveSequence();
+
+   addSequence(sequence);
+ }
+
+ /**
+ * Process the CDS features, including generation of cross-references and
+ * mappings to the protein products (translation)
+ *
+ * @param seq
+ */
+ protected void processCDSFeatures(SequenceI seq)
+ {
+ /*
+ * record protein products found to avoid duplication i.e. >1 CDS with
+ * the same /protein_id [though not sure I can find an example of this]
+ */
+ Map<String, SequenceI> proteins = new HashMap<>();
+ for (CdsData data : cds.values())
+ {
+ processCDSFeature(seq, data, proteins);
+ }
+ }
+
+ /**
+ * Processes data for one parsed CDS feature to
+ * <ul>
+ * <li>create a protein product sequence for the translation</li>
+ * <li>create a cross-reference to protein with mapping from dna</li>
+ * <li>add a CDS feature to the sequence for each CDS start-end range</li>
+ * <li>add any CDS dbrefs to the sequence and to the protein product</li>
+ * </ul>
+ * If none of the CDS cross-references is to UNIPROT, a protein named by the
+ * protein_id is used (created if not already in the map), and the dna and
+ * protein are cross-referenced using the EMBLCDSProduct source instead.
+ *
+ * @param dna
+ * the nucleotide sequence to add features and dbrefs to
+ * @param data
+ * the values parsed from one CDS feature
+ * @param proteins
+ * map of protein products so far derived from CDS data
+ */
+ void processCDSFeature(SequenceI dna, CdsData data,
+ Map<String, SequenceI> proteins)
+ {
+ /*
+ * parse location into a list of [start, end, start, end] positions
+ */
+ int[] exons = getCdsRanges(this.accession, data.cdsLocation);
+
+ // mapping from the CDS ranges to the protein; shared by all dbrefs below
+ MapList maplist = buildMappingToProtein(dna, exons, data);
+
+ int exonNumber = 0;
+
+ // add one "CDS" sequence feature for each [start, end] pair
+ for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
+ {
+ int exonStart = exons[xint];
+ int exonEnd = exons[xint + 1];
+ // feature positions are ascending, whichever the strand
+ int begin = Math.min(exonStart, exonEnd);
+ int end = Math.max(exonStart, exonEnd);
+ exonNumber++;
+ String desc = String.format("Exon %d for protein EMBLCDS:%s",
+ exonNumber, data.proteinId);
+
+ SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
+ this.sourceDb);
+ // copy over the other CDS qualifiers as feature attributes
+ for (Entry<String, String> val : data.cdsProps.entrySet())
+ {
+ sf.setValue(val.getKey(), val.getValue());
+ }
+
+ sf.setEnaLocation(data.cdsLocation);
+ // a descending range denotes the reverse strand
+ boolean forwardStrand = exonStart <= exonEnd;
+ sf.setStrand(forwardStrand ? "+" : "-");
+ // phase is zero-based, whereas codon_start is one-based
+ sf.setPhase(String.valueOf(data.codonStart - 1));
+ sf.setValue(FeatureProperties.EXONPOS, exonNumber);
+ sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
+
+ dna.addSequenceFeature(sf);
+ }
+
+ boolean hasUniprotDbref = false;
+ for (DBRefEntry xref : data.xrefs)
+ {
+ dna.addDBRef(xref);
+ if (xref.getSource().equals(DBRefSource.UNIPROT))
+ {
+ /*
+ * construct (or find) the sequence for (data.protein_id, data.translation)
+ */
+ // NOTE(review): buildProteinProduct returns null if protein_id or
+ // translation is missing, which is not checked for here
+ SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
+ Mapping map = new Mapping(protein, maplist);
+ map.setMappedFromId(data.proteinId);
+ xref.setMap(map);
+
+ /*
+ * add DBRefs with mappings from dna to protein and the inverse
+ */
+ DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
+ db1.setMap(new Mapping(dna, maplist.getInverse()));
+ protein.addDBRef(db1);
+
+ hasUniprotDbref = true;
+ }
+ }
+
+ /*
+ * if we have a product (translation) but no explicit Uniprot dbref
+ * (example: EMBL M19487 protein_id AAB02592.1)
+ * then construct mappings to an assumed EMBLCDSPROTEIN accession
+ */
+ if (!hasUniprotDbref)
+ {
+ // reuse the protein for this protein_id if already created
+ SequenceI protein = proteins.get(data.proteinId);
+ if (protein == null)
+ {
+ protein = new Sequence(data.proteinId, data.translation);
+ protein.setDescription(data.proteinName);
+ proteins.put(data.proteinId, protein);
+ }
+ // assuming CDSPROTEIN sequence version = dna version (?!)
+ DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
+ this.version, data.proteinId);
+ protein.addDBRef(db1);
+
+ // the same reference on the dna, but with a mapping to the protein
+ DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
+ DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
+ Mapping map = new Mapping(protein, maplist);
+ map.setMappedFromId(data.proteinId);
+ dnaToEmblProteinRef.setMap(map);
+ dna.addDBRef(dnaToEmblProteinRef);
+ }
+
+ /*
+ * comment brought forward from EmblXmlSource, lines 447-451:
+ * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
+ * sequence with the exon map; if given a dataset reference, search
+ * dataset for parent EMBL sequence if it exists and set its map;
+ * make a new feature annotating the coding contig
+ */
+ }
+
+ /**
+ * Computes a mapping from CDS positions in DNA sequence to protein product
+ * positions, with allowance for stop codon or incomplete start codon
+ *
+ * @param dna
+ * @param exons
+ * @param data
+ * @return
+ */
+ MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
+ final CdsData data)
+ {
+ MapList dnaToProteinMapping = null;
+ int peptideLength = data.translation.length();
+
+ int[] proteinRange = new int[] { 1, peptideLength };
+ if (exons != null && exons.length > 0)
+ {
+ /*
+ * We were able to parse 'location'; do a final
+ * product length truncation check
+ */
+ int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
+ dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
+ }
+ else
+ {
+ /*
+ * workaround until we handle all 'location' formats fully
+ * e.g. X53828.1:60..1058 or <123..>289
+ */
+ Cache.log.error(String.format(
+ "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
+ + " - Making up the CDNA region of (%s:%s)... may be incorrect",
+ data.cdsLocation, sourceDb, this.accession));
+
+ int completeCodonsLength = 1 - data.codonStart + dna.getLength();
+ int mappedDnaEnd = dna.getEnd();
+ if (peptideLength * 3 == completeCodonsLength)
+ {
+ // this might occur for CDS sequences where no features are marked
+ Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
+ mappedDnaEnd = dna.getEnd();
+ }
+ else if ((peptideLength + 1) * 3 == completeCodonsLength)
+ {
+ Cache.log.warn("Assuming stop codon at end of cDNA fragment");
+ mappedDnaEnd = dna.getEnd() - 3;
+ }
+
+ if (mappedDnaEnd != -1)
+ {
+ int[] cdsRanges = new int[] {
+ dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
+ dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
+ }
+ }
+
+ return dnaToProteinMapping;
+ }
+
+ /**
+ * Constructs a sequence for the protein product for the CDS data (if there is
+ * one), and dbrefs with mappings from CDS to protein and the reverse
+ *
+ * @param dna
+ * @param xref
+ * @param data
+ * @param proteins
+ * @return
+ */
+ SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
+ CdsData data, Map<String, SequenceI> proteins)
+ {
+ /*
+ * check we have some data to work with
+ */
+ if (data.proteinId == null || data.translation == null)
+ {
+ return null;
+ }
+
+ /*
+ * Construct the protein sequence (if not already seen)
+ */
+ String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
+ SequenceI protein = proteins.get(proteinSeqName);
+ if (protein == null)
+ {
+ protein = new Sequence(proteinSeqName, data.translation, 1,
+ data.translation.length());
+ protein.setDescription(data.proteinName != null ? data.proteinName
+ : "Protein Product from " + sourceDb);
+ proteins.put(proteinSeqName, protein);
+ }
+
+ return protein;
+ }
+
+ /**
+  * Parses the CDS location into a single array of [start, end, start,
+  * end...] positions. If on the reverse strand, these will be in descending
+  * order.
+  *
+  * @param accession
+  *          the accession, used only in the warning logged if parsing fails
+  * @param location
+  *          the CDS location string (may be null)
+  * @return the positions, or an empty array if the location is null or
+  *         cannot be parsed
+  */
+ protected int[] getCdsRanges(String accession, String location)
+ {
+   if (location != null)
+   {
+     try
+     {
+       return MappingUtils.listToArray(DnaUtils.parseLocation(location));
+     } catch (ParseException e)
+     {
+       Cache.log.warn(
+               String.format("Not parsing inexact CDS location %s in ENA %s",
+                       location, accession));
+     }
+   }
+   return new int[] {};
+ }
+
+ /**
+ * Output (print) is not implemented for EMBL flat file format; the
+ * arguments are ignored.
+ *
+ * @param seqs
+ * the sequences to output (unused)
+ * @param jvsuffix
+ * unused
+ * @return null always
+ */
+ @Override
+ public String print(SequenceI[] seqs, boolean jvsuffix)
+ {
+ return null;
+ }
+
+ /**
+ * Truncates (if necessary) the exon intervals to match 3 times the length of
+ * the protein; also accepts 3 bases longer (for stop codon not included in
+ * protein)
+ *
+ * @param proteinLength
+ * @param exon
+ * an array of [start, end, start, end...] intervals
+ * @return the same array (if unchanged) or a truncated copy
+ */
+ static int[] adjustForProteinLength(int proteinLength, int[] exon)
+ {
+ if (proteinLength <= 0 || exon == null)
+ {
+ return exon;
+ }
+ int expectedCdsLength = proteinLength * 3;
+ int exonLength = MappingUtils.getLength(Arrays.asList(exon));
+
+ /*
+ * if exon length matches protein, or is shorter, or longer by the
+ * length of a stop codon (3 bases), then leave it unchanged
+ */
+ if (expectedCdsLength >= exonLength
+ || expectedCdsLength == exonLength - 3)
+ {
+ return exon;
+ }
+
+ int origxon[];
+ int sxpos = -1;
+ int endxon = 0;
+ origxon = new int[exon.length];
+ System.arraycopy(exon, 0, origxon, 0, exon.length);
+ int cdspos = 0;
+ for (int x = 0; x < exon.length; x += 2)
+ {
+ cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
+ if (expectedCdsLength <= cdspos)
+ {
+ // advanced beyond last codon.
+ sxpos = x;
+ if (expectedCdsLength != cdspos)
+ {
+ // System.err
+ // .println("Truncating final exon interval on region by "
+ // + (cdspos - cdslength));
+ }
+
+ /*
+ * shrink the final exon - reduce end position if forward
+ * strand, increase it if reverse
+ */
+ if (exon[x + 1] >= exon[x])
+ {
+ endxon = exon[x + 1] - cdspos + expectedCdsLength;
+ }
+ else
+ {
+ endxon = exon[x + 1] + cdspos - expectedCdsLength;
+ }
+ break;
+ }
+ }
+
+ if (sxpos != -1)
+ {
+ // and trim the exon interval set if necessary
+ int[] nxon = new int[sxpos + 2];
+ System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
+ nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
+ // set
+ exon = nxon;
+ }
+ return exon;
+ }
+}
}
}
}
+
+ /**
+  * Flattens a list of [start, end] ranges into one array holding [start,
+  * end, start, end ...] in list order. Only the first two values of each
+  * range are used.
+  *
+  * @param ranges
+  *          the ranges to flatten
+  * @return an array of twice the size of the list
+  */
+ public static int[] listToArray(List<int[]> ranges)
+ {
+   final int[] flattened = new int[ranges.size() * 2];
+   int rangeIndex = 0;
+   for (int[] startEnd : ranges)
+   {
+     final int offset = 2 * rangeIndex;
+     flattened[offset] = startEnd[0];
+     flattened[offset + 1] = startEnd[1];
+     rangeIndex++;
+   }
+   return flattened;
+ }
}
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefSource;
-import com.stevesoft.pat.Regex;
-
-public class EmblCdsSource extends EmblXmlSource
+public class EmblCdsSource extends EmblFlatfileSource // was EmblXmlSource
{
public EmblCdsSource()
}
@Override
- public String getAccessionSeparator()
- {
- return null;
- }
-
- @Override
- public Regex getAccessionValidator()
- {
- return new Regex("^[A-Z]+[0-9]+");
- }
-
- @Override
public String getDbSource()
{
return DBRefSource.EMBLCDS;
}
@Override
- public String getDbVersion()
- {
- return "0"; // TODO : this is dynamically set for a returned record - not
- // tied to proxy
- }
-
- @Override
public AlignmentI getSequenceRecords(String queries) throws Exception
{
if (queries.indexOf(".") > -1)
return getEmblSequenceRecords(DBRefSource.EMBLCDS, queries);
}
- @Override
- public boolean isValidReference(String accession)
- {
- // most embl CDS refs look like ..
- // TODO: improve EMBLCDS regex
- return (accession == null || accession.length() < 2) ? false
- : getAccessionValidator().search(accession);
- }
-
/**
* cDNA for LDHA_CHICK swissprot sequence
*/
return "EMBL (CDS)";
}
- @Override
- public int getTier()
- {
- return 0;
- }
-
}
--- /dev/null
+package jalview.ws.dbsources;
+
+import java.io.File;
+import java.io.IOException;
+
+import com.stevesoft.pat.Regex;
+
+import jalview.bin.Cache;
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceI;
+import jalview.io.DataSourceType;
+import jalview.io.EmblFlatFile;
+import jalview.io.FileParse;
+import jalview.ws.ebi.EBIFetchClient;
+
+/**
+ * Base class for database sources that fetch an EMBL or EMBLCDS record as a
+ * flatfile using {@code EBIFetchClient}, and parse it (partially) with
+ * {@code EmblFlatFile} to produce an alignment.
+ *
+ * @author gmcarstairs
+ *
+ */
+public abstract class EmblFlatfileSource extends EbiFileRetrievedProxy
+{
+ // NOTE(review): a single shared Regex instance is returned to all callers;
+ // confirm that Regex.search() is safe for concurrent use
+ private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+");
+
+ @Override
+ public String getDbVersion()
+ {
+   return "0";
+ }
+
+ @Override
+ public String getAccessionSeparator()
+ {
+   return null;
+ }
+
+ @Override
+ public Regex getAccessionValidator()
+ {
+   return ACCESSION_REGEX;
+ }
+
+ /**
+  * Answers true if the accession has at least two characters and is matched
+  * by the accession validator, i.e. starts with one or more upper case
+  * letters followed by one or more digits.
+  */
+ @Override
+ public boolean isValidReference(String accession)
+ {
+   if (accession == null || accession.length() < 2)
+   {
+     return false;
+   }
+   return getAccessionValidator().search(accession);
+ }
+
+ /**
+  * Default implementation which returns null; subclasses override this to
+  * fetch from their own database.
+  */
+ @Override
+ public AlignmentI getSequenceRecords(String queries) throws Exception
+ {
+   return null;
+ }
+
+ @Override
+ public int getTier()
+ {
+   return 0;
+ }
+
+ /**
+  * Fetches the record for the query from the given database as a flatfile,
+  * and parses it into an alignment. The query is marked as started on
+  * entry, and as stopped on exit whether or not it succeeds.
+  *
+  * @param dbName
+  *          the database name (converted to lower case for the fetch)
+  * @param query
+  *          the accession to fetch (trimmed for the fetch)
+  * @return the alignment, or null if no sequence was retrieved
+  * @throws Exception
+  *           if the fetch fails (wrapping the cause), or the reply cannot
+  *           be read
+  */
+ protected AlignmentI getEmblSequenceRecords(String dbName, String query)
+         throws Exception
+ {
+   startQuery();
+   EBIFetchClient dbFetch = new EBIFetchClient();
+   File reply;
+   try
+   {
+     reply = dbFetch.fetchDataAsFile(
+             dbName.toLowerCase() + ":" + query.trim(), null, "gz");
+   } catch (Exception e)
+   {
+     stopQuery();
+     throw new Exception(
+             String.format("EBI EMBL retrieval failed for %s:%s",
+                     dbName.toLowerCase(), query.trim()),
+             e);
+   }
+   return getEmblSequenceRecords(dbName, query, reply);
+ }
+
+ /**
+  * Parses the fetched file (if there is one) as an EMBL flatfile, and
+  * returns the sequences found as an alignment. The query is always marked
+  * as stopped on exit, including when parsing throws an exception.
+  *
+  * @param dbName
+  *          the database name (used for logging only)
+  * @param query
+  *          the accession queried (used for logging only)
+  * @param reply
+  *          the file holding the fetched data, may be null
+  * @return the alignment, or null if there is no file or no sequence in it
+  * @throws IOException
+  */
+ private AlignmentI getEmblSequenceRecords(String dbName, String query,
+         File reply) throws IOException
+ {
+   AlignmentI al = null;
+
+   try
+   {
+     if (reply != null && reply.exists())
+     {
+       file = reply.getAbsolutePath();
+       FileParse fp = new FileParse(file, DataSourceType.FILE);
+       EmblFlatFile emblParser = new EmblFlatFile(fp, getDbSource());
+       emblParser.parse();
+       SequenceI[] seqs = emblParser.getSeqsAsArray();
+       if (seqs.length > 0)
+       {
+         al = new Alignment(seqs);
+       }
+
+       if (al == null)
+       {
+         Cache.log.error(
+                 "No record found for '" + dbName + ":" + query + "'");
+       }
+     }
+   } finally
+   {
+     // ensure the query is not left marked as running if parsing fails
+     stopQuery();
+   }
+
+   return al;
+ }
+
+ @Override
+ public boolean isDnaCoding()
+ {
+   return true;
+ }
+}
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefSource;
-import com.stevesoft.pat.Regex;
-
/**
* @author JimP
*
*/
-public class EmblSource extends EmblXmlSource
+public class EmblSource extends EmblFlatfileSource // was EmblXmlSource
{
public EmblSource()
/*
* (non-Javadoc)
*
- * @see jalview.ws.DbSourceProxy#getAccessionSeparator()
- */
- @Override
- public String getAccessionSeparator()
- {
- // TODO Auto-generated method stub
- return null;
- }
-
- /*
- * (non-Javadoc)
- *
- * @see jalview.ws.DbSourceProxy#getAccessionValidator()
- */
- @Override
- public Regex getAccessionValidator()
- {
- return new Regex("^[A-Z]+[0-9]+");
- }
-
- /*
- * (non-Javadoc)
- *
* @see jalview.ws.DbSourceProxy#getDbSource()
*/
@Override
/*
* (non-Javadoc)
*
- * @see jalview.ws.DbSourceProxy#getDbVersion()
- */
- @Override
- public String getDbVersion()
- {
- // TODO Auto-generated method stub
- return "0";
- }
-
- /*
- * (non-Javadoc)
- *
* @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[])
*/
@Override
return getEmblSequenceRecords(DBRefSource.EMBL, queries);
}
- /*
- * (non-Javadoc)
- *
- * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String)
- */
- @Override
- public boolean isValidReference(String accession)
- {
- // most embl refs look like ..
-
- return (accession == null || accession.length() < 2) ? false
- : getAccessionValidator().search(accession);
-
- }
-
/**
* return LHD_CHICK coding gene
*/
{
return "EMBL"; // getDbSource();
}
-
- @Override
- public int getTier()
- {
- return 0;
- }
}
*/
package jalview.ws.dbsources;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import javax.xml.bind.JAXBContext;
+import javax.xml.bind.JAXBElement;
+import javax.xml.bind.JAXBException;
+import javax.xml.stream.FactoryConfigurationError;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+
+import com.stevesoft.pat.Regex;
+
import jalview.analysis.SequenceIdMatcher;
import jalview.bin.Cache;
import jalview.datamodel.Alignment;
import jalview.util.DnaUtils;
import jalview.util.MapList;
import jalview.util.MappingUtils;
-import jalview.util.MessageManager;
import jalview.ws.ebi.EBIFetchClient;
import jalview.xml.binding.embl.EntryType;
import jalview.xml.binding.embl.EntryType.Feature;
import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
-import jalview.xml.binding.jalview.JalviewModel;
import jalview.xml.binding.embl.ROOT;
import jalview.xml.binding.embl.XrefType;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.text.ParseException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Hashtable;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-
-import javax.xml.bind.JAXBContext;
-import javax.xml.bind.JAXBElement;
-import javax.xml.bind.JAXBException;
-import javax.xml.stream.FactoryConfigurationError;
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLStreamException;
-import javax.xml.stream.XMLStreamReader;
-
+/**
+ * Provides XML binding and parsing of EMBL or EMBLCDS records retrieved from
+ * (e.g.) {@code https://www.ebi.ac.uk/ena/data/view/x53828&display=xml}.
+ *
+ * @deprecated endpoint withdrawn August 2020 (JAL-3692), use EmblFlatfileSource
+ */
public abstract class EmblXmlSource extends EbiFileRetrievedProxy
{
+ private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+");
+
/*
* JAL-1856 Embl returns this text for query not found
*/
} catch (Exception e)
{
stopQuery();
- throw new Exception(MessageManager.formatMessage(
- "exception.ebiembl_retrieval_failed_on", new String[]
- { emprefx.toLowerCase(), query.trim() }), e);
+ throw new Exception(
+ String.format("EBI EMBL XML retrieval failed for %s:%s",
+ emprefx.toLowerCase(), query.trim()),
+ e);
}
return getEmblSequenceRecords(emprefx, query, reply);
}
XMLStreamReader streamReader = XMLInputFactory.newInstance()
.createXMLStreamReader(is);
javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
- JAXBElement<ROOT> rootElement = um.unmarshal(streamReader, ROOT.class);
+ JAXBElement<ROOT> rootElement = um.unmarshal(streamReader,
+ ROOT.class);
ROOT root = rootElement.getValue();
/*
proteinSeq = new Sequence(proteinSeqName,
product.getSequenceAsString());
matcher.add(proteinSeq);
+ proteinSeq.setDescription(product.getDescription());
peptides.add(proteinSeq);
}
dnaToProteinMapping.setTo(proteinSeq);
&& dnaToProteinMapping.getTo() != null)
{
DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
- DBRefSource.EMBLCDSProduct, sequenceVersion,
- proteinId);
+ DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId);
dnaToEmblProteinRef.setMap(dnaToProteinMapping);
dnaToProteinMapping.setMappedFromId(proteinId);
dna.addDBRef(dnaToEmblProteinRef);
{
return new int[] {};
}
-
+
try
{
List<int[]> ranges = DnaUtils.parseLocation(location);
return sf;
}
+ /**
+ * Always returns null for this source.
+ */
+ @Override
+ public String getAccessionSeparator()
+ {
+ return null;
+ }
+
+ /**
+ * Returns the shared ACCESSION_REGEX pattern, which matches one or more
+ * upper-case letters followed by one or more digits at the start of the input.
+ * <p>
+ * NOTE(review): the same static Regex instance is handed out on every call;
+ * if com.stevesoft.pat.Regex keeps per-search state, concurrent callers could
+ * interfere with each other - confirm.
+ */
+ @Override
+ public Regex getAccessionValidator()
+ {
+ return ACCESSION_REGEX;
+ }
+
+ /**
+ * Returns the fixed version string "0".
+ */
+ @Override
+ public String getDbVersion()
+ {
+ return "0";
+ }
+
+ /**
+ * Returns tier 0 for this source.
+ */
+ @Override
+ public int getTier()
+ {
+ return 0;
+ }
+
+  /**
+   * Answers true if the accession is not null, is at least two characters
+   * long, and is matched by the pattern from getAccessionValidator(); answers
+   * false otherwise.
+   *
+   * @param accession
+   *          the candidate accession id (may be null)
+   * @return true if the accession looks valid for this source
+   */
+  @Override
+  public boolean isValidReference(String accession)
+  {
+    boolean longEnough = accession != null && accession.length() >= 2;
+    return longEnough && getAccessionValidator().search(accession);
+  }
+
/**
* Truncates (if necessary) the exon intervals to match 3 times the length of
* the protein; also accepts 3 bases longer (for stop codon not included in
}
int expectedCdsLength = proteinLength * 3;
int exonLength = MappingUtils.getLength(Arrays.asList(exon));
-
+
/*
* if exon length matches protein, or is shorter, or longer by the
* length of a stop codon (3 bases), then leave it unchanged
{
return exon;
}
-
+
int origxon[];
int sxpos = -1;
int endxon = 0;
// .println("Truncating final exon interval on region by "
// + (cdspos - cdslength));
}
-
+
/*
* shrink the final exon - reduce end position if forward
* strand, increase it if reverse
break;
}
}
-
+
if (sxpos != -1)
{
// and trim the exon interval set if necessary
* the query formatted as db:query1;query2;query3
* @param format
* the format wanted
- * @param extension
+ * @param ext
* for the temporary file to hold response (without separator)
* @return the file holding the response
* @throws OutOfMemoryError
{
String url = buildUrl(ids, database, format);
InputStream is = null;
+ BufferedReader br = null;
try
{
URL rcall = new URL(url);
Platform.streamToFile(is, outFile);
return null;
}
- BufferedReader br = new BufferedReader(new InputStreamReader(is));
+ br = new BufferedReader(new InputStreamReader(is));
String rtn;
List<String> arl = new ArrayList<>();
while ((rtn = br.readLine()) != null)
{
}
}
+ if (br != null)
+ {
+ try
+ {
+ br.close();
+ } catch (IOException e)
+ {
+ }
+ }
}
return null;
}
if (database.equalsIgnoreCase(DBRefSource.EMBL)
|| database.equalsIgnoreCase(DBRefSource.EMBLCDS))
{
- url = "https://www.ebi.ac.uk/ena/data/view/" + ids.toLowerCase()
- + (format != null ? "&" + format : "");
+ url = "https://www.ebi.ac.uk/ena/browser/api/embl/"
+ + ids.toLowerCase() + "?download=true&gzip=true";
}
else
{
--- /dev/null
+package jalview.io;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
+import static org.testng.AssertJUnit.assertNotNull;
+import static org.testng.AssertJUnit.assertSame;
+import static org.testng.AssertJUnit.fail;
+import static org.testng.AssertJUnit.assertNull;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;
+
+import org.testng.annotations.Test;
+
+import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.Mapping;
+import jalview.datamodel.Sequence.DBModList;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.datamodel.features.SequenceFeatures;
+import jalview.util.MapList;
+
+public class EmblFlatFileTest
+{
+ /**
+ * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features,
+ * one of them reverse strand. Parses the flat file held in the test data
+ * directory and verifies the parsed sequence, a sample of its CDS (exon)
+ * features and dbrefs, and the mappings to UNIPROT protein products.
+ *
+ * @throws MalformedURLException
+ * @throws IOException
+ */
+ @Test(groups = "Functional")
+ public void testParse() throws MalformedURLException, IOException
+ {
+ File dataFile = new File("test/jalview/io/J03321.embl.txt");
+ FileParse fp = new FileParse(dataFile, DataSourceType.FILE);
+ EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest");
+ parser.parse();
+ List<SequenceI> seqs = parser.getSeqs();
+
+ assertEquals(seqs.size(), 1);
+ SequenceI seq = seqs.get(0);
+ assertEquals(seq.getName(), "EmblTest|J03321");
+ assertEquals(seq.getLength(), 7502);
+ assertEquals(seq.getDescription(),
+ "Chlamydia trachomatis plasmid pCHL1, complete sequence");
+
+ /*
+ * the only feature type should be CDS; the 8 CDS in the file are expected
+ * to give 9 CDS features in all (one CDS is a 'join' of two exons, and the
+ * assertions below expect one feature per exon)
+ */
+ Set<String> featureTypes = seq.getFeatures().getFeatureTypes();
+ assertEquals(featureTypes.size(), 1);
+ assertTrue(featureTypes.contains("CDS"));
+
+ /*
+ * inspect some features (sorted just for convenience of test assertions)
+ */
+ List<SequenceFeature> features = seq.getFeatures()
+ .getAllFeatures("CDS");
+ SequenceFeatures.sortFeatures(features, true);
+ assertEquals(features.size(), 9);
+
+ SequenceFeature sf = features.get(0);
+ assertEquals(sf.getBegin(), 1);
+ assertEquals(sf.getEnd(), 437);
+ assertEquals(sf.getDescription(),
+ "Exon 2 for protein EMBLCDS:AAA91567.1");
+ assertEquals(sf.getFeatureGroup(), "EmblTest");
+ assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
+ assertEquals(sf.getPhase(), "0");
+ assertEquals(sf.getStrand(), 1);
+ assertEquals(sf.getValue("note"), "pGP7-D");
+ // this is the second exon of circular CDS!
+ assertEquals(sf.getValue("exon number"), 2);
+ assertEquals(sf.getValue("product"), "hypothetical protein");
+ assertEquals(sf.getValue("transl_table"), "11");
+
+ sf = features.get(1);
+ assertEquals(sf.getBegin(), 488);
+ assertEquals(sf.getEnd(), 1480);
+ assertEquals(sf.getDescription(),
+ "Exon 1 for protein EMBLCDS:AAA91568.1");
+ assertEquals(sf.getFeatureGroup(), "EmblTest");
+ assertEquals(sf.getEnaLocation(), "complement(488..1480)");
+ assertEquals(sf.getPhase(), "0");
+ assertEquals(sf.getStrand(), -1); // reverse strand!
+ assertEquals(sf.getValue("note"), "pGP8-D");
+ assertEquals(sf.getValue("exon number"), 1);
+ assertEquals(sf.getValue("product"), "hypothetical protein");
+
+ sf = features.get(7);
+ assertEquals(sf.getBegin(), 6045);
+ assertEquals(sf.getEnd(), 6788);
+ assertEquals(sf.getDescription(),
+ "Exon 1 for protein EMBLCDS:AAA91574.1");
+ assertEquals(sf.getFeatureGroup(), "EmblTest");
+ assertEquals(sf.getEnaLocation(), "6045..6788");
+ assertEquals(sf.getPhase(), "0");
+ assertEquals(sf.getStrand(), 1);
+ assertEquals(sf.getValue("note"), "pGP6-D (gtg start codon)");
+ assertEquals(sf.getValue("exon number"), 1);
+ assertEquals(sf.getValue("product"), "hypothetical protein");
+
+ /*
+ * CDS at 7022-7502 is the first exon of the circular CDS
+ */
+ sf = features.get(8);
+ assertEquals(sf.getBegin(), 7022);
+ assertEquals(sf.getEnd(), 7502);
+ assertEquals(sf.getDescription(),
+ "Exon 1 for protein EMBLCDS:AAA91567.1");
+ assertEquals(sf.getFeatureGroup(), "EmblTest");
+ assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
+ assertEquals(sf.getPhase(), "0");
+ assertEquals(sf.getStrand(), 1);
+ assertEquals(sf.getValue("note"), "pGP7-D");
+ assertEquals(sf.getValue("exon number"), 1);
+ assertEquals(sf.getValue("product"), "hypothetical protein");
+
+ /*
+ * Verify DBRefs, whether declared in the file or added by Jalview.
+ * There are 4 'direct' (DR) dbrefs, and numerous CDS /db_xref entries
+ * (some e.g. INTERPRO are duplicates). Jalview adds a dbref to 'self'.
+ * Sample a few here. Note DBRefEntry constructor capitalises source.
+ */
+ List<DBRefEntry> dbrefs = seq.getDBRefs();
+ assertEquals(dbrefs.size(), 32);
+ // xref to 'self':
+ DBRefEntry selfRef = new DBRefEntry("EMBLTEST", "1", "J03321");
+ int[] range = new int[] { 1, seq.getLength() };
+ selfRef.setMap(new Mapping(null, range, range, 1, 1));
+ assertTrue(dbrefs.contains(selfRef));
+
+ // 1st DR line; note trailing period is removed
+ assertTrue(dbrefs.contains(new DBRefEntry("MD5", "0",
+ "d4c4942a634e3df4995fd5ac75c26a61")));
+ // the 4th DR line:
+ assertTrue(
+ dbrefs.contains(new DBRefEntry("EUROPEPMC", "0", "PMC87941")));
+ // from the first CDS feature
+ assertTrue(dbrefs.contains(new DBRefEntry("GOA", "0", "P0CE19")));
+ // from the last CDS feature
+ assertTrue(
+ dbrefs.contains(new DBRefEntry("INTERPRO", "0", "IPR005350")));
+
+ /*
+ * verify mappings to, and sequences for, UNIPROT proteins
+ */
+ int uniprotCount = 0;
+ List<int[]> ranges;
+ for (DBRefEntry dbref : dbrefs)
+ {
+ if ("UNIPROT".equals(dbref.getSource()))
+ {
+ uniprotCount++;
+ Mapping mapping = dbref.getMap();
+ assertNotNull(mapping);
+ MapList map = mapping.getMap();
+ String mappedToName = mapping.getTo().getName();
+ if ("UNIPROT|P0CE16".equals(mappedToName))
+ {
+ assertEquals((ranges = map.getFromRanges()).size(), 1);
+ assertEquals(ranges.get(0)[0], 1579);
+ assertEquals(ranges.get(0)[1], 2934);
+ assertEquals((ranges = map.getToRanges()).size(), 1);
+ assertEquals(ranges.get(0)[0], 1);
+ assertEquals(ranges.get(0)[1], 451);
+ // CDS /product carries over as protein product description
+ assertEquals(mapping.getTo().getDescription(),
+ "hypothetical protein");
+ }
+ else if ("UNIPROT|P0CE17".equals(mappedToName))
+ {
+ assertEquals((ranges = map.getFromRanges()).size(), 1);
+ assertEquals(ranges.get(0)[0], 2928);
+ assertEquals(ranges.get(0)[1], 3992);
+ assertEquals((ranges = map.getToRanges()).size(), 1);
+ assertEquals(ranges.get(0)[0], 1);
+ assertEquals(ranges.get(0)[1], 354);
+ }
+ else if ("UNIPROT|P0CE18".equals(mappedToName))
+ {
+ assertEquals((ranges = map.getFromRanges()).size(), 1);
+ assertEquals(ranges.get(0)[0], 4054);
+ assertEquals(ranges.get(0)[1], 4848);
+ assertEquals((ranges = map.getToRanges()).size(), 1);
+ assertEquals(ranges.get(0)[0], 1);
+ assertEquals(ranges.get(0)[1], 264);
+ }
+ else if ("UNIPROT|P0CE19".equals(mappedToName))
+ {
+ // join(7022..7502,1..437)
+ assertEquals((ranges = map.getFromRanges()).size(), 2);
+ assertEquals(ranges.get(0)[0], 7022);
+ assertEquals(ranges.get(0)[1], 7502);
+ assertEquals(ranges.get(1)[0], 1);
+ assertEquals(ranges.get(1)[1], 437);
+ assertEquals((ranges = map.getToRanges()).size(), 1);
+ assertEquals(ranges.get(0)[0], 1);
+ assertEquals(ranges.get(0)[1], 305);
+ }
+ else if ("UNIPROT|P0CE20".equals(mappedToName))
+ {
+ // complement(488..1480)
+ assertEquals((ranges = map.getFromRanges()).size(), 1);
+ assertEquals(ranges.get(0)[0], 1480);
+ assertEquals(ranges.get(0)[1], 488);
+ assertEquals((ranges = map.getToRanges()).size(), 1);
+ assertEquals(ranges.get(0)[0], 1);
+ assertEquals(ranges.get(0)[1], 330);
+ }
+ // the remaining three UNIPROT products are expected but not inspected
+ else if (!"UNIPROT|P0CE23".equals(mappedToName)
+ && !"UNIPROT|P10559".equals(mappedToName)
+ && !"UNIPROT|P10560".equals(mappedToName))
+ {
+ fail("Unexpected UNIPROT dbref to " + mappedToName);
+ }
+ }
+ }
+ assertEquals(uniprotCount, 8);
+ }
+
+ /**
+ * Placeholder for a test of a CDS whose /codon_start is not 1. The body
+ * contains no assertions yet, so this test currently passes without
+ * verifying anything.
+ */
+ @Test(groups = "Functional")
+ public void testParse_codonStartNot1()
+ {
+ // TODO verify CDS-to-protein mapping for CDS with /codon_start=2
+ // example: https://www.ebi.ac.uk/ena/browser/api/embl/EU498516
+ }
+
+ /**
+ * Test for the case that the EMBL CDS has no UNIPROT xref. In this case
+ * Jalview should synthesize an xref to EMBLCDSPROTEIN in the hope this will
+ * allow Get Cross-References. Also checks that a multi-line /product
+ * qualifier containing escaped quotes is transferred to the description of
+ * the protein product.
+ *
+ * @throws IOException
+ */
+ @Test(groups = "Functional")
+ public void testParse_noUniprotXref() throws IOException
+ {
+ // MN908947 cut down to 40BP, one CDS, length 5 peptide for test purposes
+ // plus an additional (invented) test case:
+ // - multi-line /product qualifier including escaped quotes
+ // NOTE(review): the ID line declares 20 BP and the SQ line 7496 BP, neither
+ // of which matches the 40 bases supplied; the assertions below expect a
+ // length of 40, so presumably the parser takes the length from the sequence
+ // data itself - confirm
+ String data = "ID MN908947; SV 3; linear; genomic RNA; STD; VRL; 20 BP.\n"
+ + "DE Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1,\n"
+ + "FT CDS 3..17\n"
+ + "FT /protein_id=\"QHD43415.1\"\n"
+ + "FT /product=\"orf1ab polyprotein\n"
+ + "FT \"\"foobar\"\" \"\n"
+ + "FT /translation=\"MRKLD\n"
+ + "SQ Sequence 7496 BP; 2450 A; 1290 C; 1434 G; 2322 T; 0 other;\n"
+ + " ggatGcgtaa gttagacgaa attttgtctt tgcgcacaga 40\n";
+ FileParse fp = new FileParse(data, DataSourceType.PASTE);
+ EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest");
+ parser.parse();
+ List<SequenceI> seqs = parser.getSeqs();
+ assertEquals(seqs.size(), 1);
+ SequenceI seq = seqs.get(0);
+ DBModList<DBRefEntry> dbrefs = seq.getDBRefs();
+
+ /*
+ * dna should have dbref to itself, and to inferred EMBLCDSPROTEIN:QHD43415.1
+ */
+ assertEquals(dbrefs.size(), 2);
+
+ // dbref to self
+ DBRefEntry dbref = dbrefs.get(0);
+ assertEquals(dbref.getSource(), "EMBLTEST");
+ assertEquals(dbref.getAccessionId(), "MN908947");
+ Mapping mapping = dbref.getMap();
+ assertNull(mapping.getTo());
+ MapList map = mapping.getMap();
+ assertEquals(map.getFromLowest(), 1);
+ assertEquals(map.getFromHighest(), 40);
+ assertEquals(map.getToLowest(), 1);
+ assertEquals(map.getToHighest(), 40);
+ assertEquals(map.getFromRatio(), 1);
+ assertEquals(map.getToRatio(), 1);
+
+ // dbref to inferred EMBLCDSPROTEIN:
+ dbref = dbrefs.get(1);
+ assertEquals(dbref.getSource(), "EMBLCDSPROTEIN");
+ assertEquals(dbref.getAccessionId(), "QHD43415.1");
+ mapping = dbref.getMap();
+ SequenceI mapTo = mapping.getTo();
+ assertEquals(mapTo.getName(), "QHD43415.1");
+ // the /product qualifier transfers to protein product description
+ assertEquals(mapTo.getDescription(), "orf1ab polyprotein \"foobar\"");
+ assertEquals(mapTo.getSequenceAsString(), "MRKLD");
+ map = mapping.getMap();
+ // 15 bases (3..17) map to 5 residues, i.e. a 3:1 ratio
+ assertEquals(map.getFromLowest(), 3);
+ assertEquals(map.getFromHighest(), 17);
+ assertEquals(map.getToLowest(), 1);
+ assertEquals(map.getToHighest(), 5);
+ assertEquals(map.getFromRatio(), 3);
+ assertEquals(map.getToRatio(), 1);
+ }
+
+  /**
+   * Verifies adjustForProteinLength: the exon array is returned unchanged
+   * (same instance) when its length matches the protein, matches it plus a
+   * stop codon, or is too short; otherwise the exons are truncated to three
+   * times the protein length.
+   * <p>
+   * Note the two assertion conventions in this class: assertSame is the
+   * AssertJUnit form (expected, actual), whereas assertEquals is the
+   * org.testng.Assert form (actual, expected).
+   */
+  @Test(groups = "Functional")
+  public void testAdjustForProteinLength()
+  {
+    int[] exons = new int[] { 11, 15, 21, 25, 31, 38 }; // 18 bp
+
+    // exact length match:
+    assertSame(exons, EmblFlatFile.adjustForProteinLength(6, exons));
+
+    // match if we assume exons include stop codon not in protein:
+    assertSame(exons, EmblFlatFile.adjustForProteinLength(5, exons));
+
+    // truncate last exon by 6bp
+    int[] truncated = EmblFlatFile.adjustForProteinLength(4, exons);
+    assertEquals(Arrays.toString(truncated), "[11, 15, 21, 25, 31, 32]");
+
+    // remove last exon and truncate preceding by 1bp (so 3bp in total)
+    truncated = EmblFlatFile.adjustForProteinLength(3, exons);
+    assertEquals(Arrays.toString(truncated), "[11, 15, 21, 24]");
+
+    // exact removal of exon case:
+    exons = new int[] { 11, 15, 21, 27, 33, 38 }; // 18 bp
+    truncated = EmblFlatFile.adjustForProteinLength(4, exons);
+    assertEquals(Arrays.toString(truncated), "[11, 15, 21, 27]");
+
+    // what if exons are too short for protein?
+    truncated = EmblFlatFile.adjustForProteinLength(7, exons);
+    assertSame(exons, truncated);
+  }
+
+  /**
+   * Verifies removeQuotes: null is passed through, text without quotes is
+   * unchanged, enclosing quotes are stripped, and doubled (escaped) quotes
+   * inside the value are reduced to single quotes.
+   */
+  @Test(groups = "Functional")
+  public void testRemoveQuotes()
+  {
+    assertNull(EmblFlatFile.removeQuotes(null));
+
+    String unquoted = "No quotes here";
+    assertEquals(EmblFlatFile.removeQuotes(unquoted), "No quotes here");
+
+    String enclosed = "\"Enclosing quotes\"";
+    assertEquals(EmblFlatFile.removeQuotes(enclosed), "Enclosing quotes");
+
+    String escaped = "\"Escaped \"\"quotes\"\" example\"";
+    assertEquals(EmblFlatFile.removeQuotes(escaped),
+            "Escaped \"quotes\" example");
+  }
+}
--- /dev/null
+ID J03321; SV 1; circular; genomic DNA; STD; PRO; 7502 BP.
+XX
+AC J03321;
+XX
+DT 27-JUL-1990 (Rel. 24, Created)
+DT 10-APR-2020 (Rel. 144, Last updated, Version 9)
+XX
+DE Chlamydia trachomatis plasmid pCHL1, complete sequence.
+XX
+KW .
+XX
+OS Chlamydia trachomatis
+OC Bacteria; Chlamydiae; Chlamydiales; Chlamydiaceae;
+OC Chlamydia/Chlamydophila group; Chlamydia.
+OG Plasmid pCHL1
+XX
+RN [1]
+RP 1-7502
+RX DOI; 10.1016/0147-619X(90)90034-A.
+RX PUBMED; 2194229.
+RA Comanducci M., Ricci S., Cevenini R., Ratti G.;
+RT "Diversity of the Chlamydia trachomatis common plasmid in biovars with
+RT different pathogenicity";
+RL Plasmid 23(2):149-154(1990).
+XX
+RN [2]
+RP 1-7502
+RA Comanducci M., Ricci S., Cevenini R., Ratti G.;
+RT ;
+RL Submitted (23-JUN-2010) to the INSDC.
+RL Sclavo Research Centre, Siena, Italy
+XX
+DR MD5; d4c4942a634e3df4995fd5ac75c26a61.
+DR BioSample; SAMN14225621.
+DR EuropePMC; PMC4450983; 26031715.
+DR EuropePMC; PMC87941; 11283058.
+XX
+CC Draft entry and computer-readable sequence kindly submitted by
+CC G.Ratti, 28-MAR-1990.
+XX
+FH Key Location/Qualifiers
+FH
+FT source 1..7502
+FT /organism="Chlamydia trachomatis"
+FT /plasmid="pCHL1"
+FT /isolate="G0/86"
+FT /serotype="D"
+FT /mol_type="genomic DNA"
+FT /isolation_source="trachoma"
+FT /db_xref="taxon:813"
+FT CDS join(7022..7502,1..437)
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP7-D"
+FT /db_xref="GOA:P0CE19"
+FT /db_xref="InterPro:IPR002104"
+FT /db_xref="InterPro:IPR011010"
+FT /db_xref="InterPro:IPR013762"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE19"
+FT /protein_id="AAA91567.1"
+FT /translation="MGSMAFHKSRLFLTFGDASEIWLSTLSYLTRKNYASGINFLVSLE
+FT ILDLSETLIKAISLDHSESLFKIKSLDVFNGKVVSEASKQARAACYISFTKFLYRLTKG
+FT YIKPAIPLKDFGNTTFFKIRDKIKTESISKQEWTVFFEALRIVNYRDYLIGKLIVQGIR
+FT KLDEILSLRTDDLFFASNQISFRIKKRQNKETKILITFPISLMEELQKYTCGRNGRVFV
+FT SKIGIPVTTSQVAHNFRLAEFHSAMKIKITPRVLRASALIHLKQIGLKDEEIMRISCLS
+FT SRQSVCSYCSGEEVIPLVQTPTIL"
+FT CDS complement(488..1480)
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP8-D"
+FT /db_xref="GOA:P0CE20"
+FT /db_xref="InterPro:IPR002104"
+FT /db_xref="InterPro:IPR011010"
+FT /db_xref="InterPro:IPR013762"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE20"
+FT /protein_id="AAA91568.1"
+FT /translation="MGKGILSLQQEMSLEYSEKSYQEVLKIRQESYWKRMKSFSLFEVI
+FT MHWTASLNKHTCRSYRGSFLSLEKIGLLSLDMNLQEFSLLNHNLILDAIKKVSSAKTSW
+FT TEGTKQVRAASYISLTRFLNRMTQGIVAIAQPSKQENSRTFFKTREIVKTDAMNSLQTA
+FT SFLKELKKINARDWLIAQTMLQGGKRSSEVLSLEISQICFQQATISFSQLKNRQTEKRI
+FT IITYPQKFMHFLQEYIGQRRGFVFVTRSGKMVGLRQIARTFSQAGLQAAIPFKITPHVL
+FT RATAVTEYKRLGCSDSDIMKVTGHATAKMIFAYDKSSREDNASKKMALI"
+FT CDS 1579..2934
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP1-D"
+FT /db_xref="GOA:P0CE16"
+FT /db_xref="InterPro:IPR003593"
+FT /db_xref="InterPro:IPR007693"
+FT /db_xref="InterPro:IPR007694"
+FT /db_xref="InterPro:IPR027417"
+FT /db_xref="InterPro:IPR036185"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE16"
+FT /protein_id="AAA91569.1"
+FT /translation="MKTRSEIENRMQDIEYALLGKALIFEDSTEYILRQLANYEFKCSH
+FT HKNIFIVFKHLKDNGLPITVDSAWEELLRRRIKDMDKSYLGLMLHDALSNDKLRSVSHT
+FT VFLDDLSVCSAEENLSNFIFRSFNEYNENPLRRSPFLLLERIKGRLDSAIAKTFSIRSA
+FT RGRSIYDIFSQSEIGVLARIKKRRVAFSENQNSFFDGFPTGYKDIDDKGVILAKGNFVI
+FT IAARPSIGKTALAIDMAINLAVTQQRRVGFLSLEMSAGQIVERIIANLTGISGEKLQRG
+FT DLSKEELFRVEEAGETVRESHFYICSDSQYKLNLIANQIRLLRKEDRVDVIFIDYLQLI
+FT NSSVGENRQNEIADISRTLRGLASELNIPIVCLSQLSRKVEDRANKVPMLSDLRDSGQI
+FT EQDADVILFINRKESSSNCEITVGKNRHGSVFSSVLHFDPKISKFSAIKKVW"
+FT CDS 2928..3992
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP2-D"
+FT /db_xref="InterPro:IPR040719"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE17"
+FT /protein_id="AAA91570.1"
+FT /translation="MVNYSNCHFIKSPIHLENQKFGRRPGQSIKISPKLAQNGMVEVIG
+FT LDFLSSHYHALAAIQRLLTATNYKGNTKGVVLSRESNSFQFEGWIPRIRFTKTEFLEAY
+FT GVKRYKTSRNKYEFSGKEAETALEALYHLGHQPFLIVATRTRWTNGTQIVDRYQTLSPI
+FT IRIYEGWEGLTDEENIDIDLTPFNSPPTRKHKGFVVEPCPILVDQIESYFVIKPANVYQ
+FT EIKMRFPNASKYAYTFIDWVITAAAKKRRKLTKDNSWPENLLLNVNVKSLAYILRMNRY
+FT ICTRNWKKIELAIDKCIEIAIQLGWLSRRKRIEFLDSSKLSKKEILYLNKERFEEITKK
+FT SKEQMEQLEQESIN"
+FT CDS 4054..4848
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP3-D"
+FT /db_xref="InterPro:IPR008444"
+FT /db_xref="InterPro:IPR033758"
+FT /db_xref="InterPro:IPR038264"
+FT /db_xref="PDB:6GJT"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE18"
+FT /protein_id="AAA91571.1"
+FT /translation="MGNSGFYLYNTENCVFADNIKVGQMTEPLKDQQIILGTTSTPVAA
+FT KMTASDGISLTVSNNSSTNASITIGLDAEKAYQLILEKLGDQILDGIADTIVDSTVQDI
+FT LDKIKTDPSLGLLKAFNNFPITNKIQCNGLFTPSNIETLLGGTEIGKFTVTPKSSGSMF
+FT LVSADIIASRMEGGVVLALVREGDSKPCAISYGYSSGIPNLCSLRTSITNTGLTPTTYS
+FT LRVGGLESGVVWVNALSNGNDILGITNTSNVSFLEVIPQTNA"
+FT CDS 4918..5226
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP4-D"
+FT /db_xref="UniProtKB/Swiss-Prot:P0CE23"
+FT /protein_id="AAA91572.1"
+FT /translation="MQNKRKVRDDFIKIVKDVKKDFPELDLKIRVNKEKVTFLNSPLEL
+FT YHKSVSLILGLLQQIENSLGLFPDSPVLEKLEDNSLKLKKALIMLILSRKDMFSKAE"
+FT CDS 5317..6048
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP5-D (gtg start codon)"
+FT /db_xref="GOA:P10559"
+FT /db_xref="InterPro:IPR025669"
+FT /db_xref="InterPro:IPR027417"
+FT /db_xref="UniProtKB/Swiss-Prot:P10559"
+FT /protein_id="AAA91573.1"
+FT /translation="MGCNLAQFLGKKVLLADLDPQSNLSSGLGASVRSDQKGLHDIVYT
+FT SNDLKSIICETKKDSVDLIPASFSSEQFRELDIHRGPSNNLKLFLNEYCAPFYDICIID
+FT TPPSLGGLTKEAFVAGDKLIACLTPEPFSILGLQKIREFLSSVGKPEEEHILGIALSFW
+FT DDRNSTNQMYIDIIESIYKNKLFSTKIRRDISLSRSLLKEDSVANVYPNSRAAEDILKL
+FT THEIANILHIEYERDYSQRTT"
+FT CDS 6045..6788
+FT /codon_start=1
+FT /transl_table=11
+FT /product="hypothetical protein"
+FT /note="pGP6-D (gtg start codon)"
+FT /db_xref="InterPro:IPR005350"
+FT /db_xref="UniProtKB/Swiss-Prot:P10560"
+FT /protein_id="AAA91574.1"
+FT /translation="MNKLKKEADVFFKKNQTAASLDFKKTLPSIELFSATLNSEESQSL
+FT DRLFLSESQNYSDEEFYQEDILAVKLLTGQIKSIQKQHVLLLGEKIYNARKILSKDHFS
+FT STTFSSWIELVFRTKSSAYNALAYYELFINLPNQTLQKEFQSIPYKSAYILAARKGDLK
+FT TKVDVIGKVCGMSNSSAIRVLDQFLPSSRNKDVRETIDKSDSEKNRQLSDFLIEILRIM
+FT CSGVSLSSYNENLLQQLFELFKQKS"
+FT repeat_region 6857..6945
+FT /note="four tandem 22bp repeats"
+XX
+SQ Sequence 7502 BP; 2460 A; 1285 C; 1433 G; 2324 T; 0 other;
+ ggatccgtaa gttagacgaa attttgtctt tgcgcacaga cgatctattt tttgcatcca 60
+ atcagatttc ctttcgcatt aaaaaaagac agaataaaga aaccaaaatt ctaatcacat 120
+ ttcctatcag cttaatggaa gagttgcaaa aatacacttg tgggagaaat gggagagtat 180
+ ttgtttctaa aatagggatt cctgtaacaa caagtcaggt tgcgcataat tttaggcttg 240
+ cagagttcca tagtgctatg aaaataaaaa ttactcccag agtacttcgt gcaagcgctt 300
+ tgattcattt aaagcaaata ggattaaaag atgaggaaat catgcgtatt tcctgtcttt 360
+ catcgagaca aagtgtgtgt tcttattgtt ctggggaaga ggtaattcct ctagtacaaa 420
+ cacccacaat attgtgatat aattaaaatt atattcatat tctgttgcca gaaaaaacac 480
+ ctttaggcta tattagagcc atcttctttg aagcgttgtc ttctcgagaa gatttatcgt 540
+ acgcaaatat catctttgcg gttgcgtgtc ctgtgacctt cattatgtcg gagtctgagc 600
+ accctaggcg tttgtactcc gtcacagcgg ttgctcgaag cacgtgcggg gttattttaa 660
+ aagggattgc agcttgtagt cctgcttgag agaacgtgcg ggcgatttgc cttaacccca 720
+ ccatttttcc ggagcgagtt acgaagacaa aacctcttcg ttgaccgatg tactcttgta 780
+ gaaagtgcat aaacttctga ggataagtta taataatcct cttttctgtc tgacggttct 840
+ taagctggga gaaagaaatg gtagcttgtt ggaaacaaat ctgactaatc tccaagctta 900
+ agacttcaga ggagcgttta cctccttgga gcattgtctg ggcgatcaac caatcccggg 960
+ cattgatttt ttttagctct tttaggaagg atgctgtttg caaactgttc atcgcatccg 1020
+ tttttactat ttccctggtt ttaaaaaatg ttcgactatt ttcttgttta gaaggttgcg 1080
+ ctatagcgac tattccttga gtcatcctgt ttaggaatct tgttaaggaa atatagcttg 1140
+ ctgctcgaac ttgtttagta ccttcggtcc aagaagtctt ggcagaggaa acttttttaa 1200
+ tcgcatctag gattagatta tgatttaaaa gggaaaactc ttgcagattc atatccaagg 1260
+ acaatagacc aatcttttct aaagacaaaa aagatcctcg atatgatcta caagtatgtt 1320
+ tgttgagtga tgcggtccaa tgcataataa cttcgaataa ggagaagctt ttcatgcgtt 1380
+ tccaatagga ttcttggcga atttttaaaa cttcctgata agacttttca ctatattcta 1440
+ acgacatttc ttgctgcaaa gataaaatcc ctttacccat gaaatccctc gtgatataac 1500
+ ctatccgtaa aatgtcctga ttagtgaaat aatcaggttg ttaacaggat agcacgctcg 1560
+ gtattttttt atataaacat gaaaactcgt tccgaaatag aaaatcgcat gcaagatatc 1620
+ gagtatgcgt tgttaggtaa agctctgata tttgaagact ctactgagta tattctgagg 1680
+ cagcttgcta attatgagtt taagtgttct catcataaaa acatattcat agtatttaaa 1740
+ cacttaaaag acaatggatt acctataact gtagactcgg cttgggaaga gcttttgcgg 1800
+ cgtcgtatca aagatatgga caaatcgtat ctcgggttaa tgttgcatga tgctttatca 1860
+ aatgacaagc ttagatccgt ttctcatacg gttttcctcg atgatttgag cgtgtgtagc 1920
+ gctgaagaaa atttgagtaa tttcattttc cgctcgttta atgagtacaa tgaaaatcca 1980
+ ttgcgtagat ctccgtttct attgcttgag cgtataaagg gaaggcttga tagtgctata 2040
+ gcaaagactt tttctattcg cagcgctaga ggccggtcta tttatgatat attctcacag 2100
+ tcagaaattg gagtgctggc tcgtataaaa aaaagacgag tagcgttctc tgagaatcaa 2160
+ aattctttct ttgatggctt cccaacagga tacaaggata ttgatgataa aggagttatc 2220
+ ttagctaaag gtaatttcgt gattatagca gctagaccat ctatagggaa aacagcttta 2280
+ gctatagaca tggcgataaa tcttgcggtt actcaacagc gtagagttgg tttcctatct 2340
+ ctagaaatga gcgcaggtca aattgttgag cggattattg ctaatttaac aggaatatct 2400
+ ggtgaaaaat tacaaagagg ggatctctct aaagaagaat tattccgagt agaagaagct 2460
+ ggagaaacgg ttagagaatc acatttttat atctgcagtg atagtcagta taagcttaac 2520
+ ttaatcgcga atcagatccg gttgctgaga aaagaagatc gagtagacgt aatatttatc 2580
+ gattacttgc agttgatcaa ctcatcggtt ggagaaaatc gtcaaaatga aatagcagat 2640
+ atatctagaa ccttaagagg tttagcctca gagctaaaca ttcctatagt ttgtttatcc 2700
+ caactatcta gaaaagttga ggatagagca aataaagttc ccatgctttc agatttgcga 2760
+ gacagcggtc aaatagagca agacgcagat gtgattttgt ttatcaatag gaaggaatcg 2820
+ tcttctaatt gtgagataac tgttgggaaa aatagacatg gatcggtttt ctcttcggta 2880
+ ttacatttcg atccaaaaat tagtaaattc tccgctatta aaaaagtatg gtaaattata 2940
+ gtaactgcca cttcatcaaa agtcctatcc accttgaaaa tcagaagttt ggaagaagac 3000
+ ctggtcaatc tattaagata tctcccaaat tggctcaaaa tgggatggta gaagttatag 3060
+ gtcttgattt tctttcatct cattaccatg cattagcagc tatccaaaga ttactgaccg 3120
+ caacgaatta caaggggaac acaaaagggg ttgttttatc cagagaatca aatagttttc 3180
+ aatttgaagg atggatacca agaatccgtt ttacaaaaac tgaattctta gaggcttatg 3240
+ gagttaagcg gtataaaaca tccagaaata agtatgagtt tagtggaaaa gaagctgaaa 3300
+ ctgctttaga agccttatac catttaggac atcaaccgtt tttaatagtg gcaactagaa 3360
+ ctcgatggac taatggaaca caaatagtag accgttacca aactctttct ccgatcatta 3420
+ ggatttacga aggatgggaa ggtttaactg acgaagaaaa tatagatata gacttaacac 3480
+ cttttaattc accacctaca cggaaacata aagggttcgt tgtagagcca tgtcctatct 3540
+ tggtagatca aatagaatcc tactttgtaa tcaagcctgc aaatgtatac caagaaataa 3600
+ aaatgcgttt cccaaatgca tcaaagtatg cttacacatt tatcgactgg gtgattacag 3660
+ cagctgcgaa aaagagacga aaattaacta aggataattc ttggccagaa aacttgttat 3720
+ taaacgttaa cgttaaaagt cttgcatata ttttaaggat gaatcggtac atctgtacaa 3780
+ ggaactggaa aaaaatcgag ttagctatcg ataaatgtat agaaatcgcc attcagcttg 3840
+ gctggttatc tagaagaaaa cgcattgaat ttctggattc ttctaaactc tctaaaaaag 3900
+ aaattctata tctaaataaa gagcgctttg aagaaataac taagaaatct aaagaacaaa 3960
+ tggaacaatt agaacaagaa tctattaatt aatagcaagc ttgaaactaa aaacctaatt 4020
+ tatttaaagc tcaaaataaa aaagagtttt aaaatgggaa attctggttt ttatttgtat 4080
+ aacactgaaa actgcgtctt tgctgataat atcaaagttg ggcaaatgac agagccgctc 4140
+ aaggaccagc aaataatcct tgggacaaca tcaacacctg tcgcagccaa aatgacagct 4200
+ tctgatggaa tatctttaac agtctccaat aattcatcaa ccaatgcttc tattacaatt 4260
+ ggtttggatg cggaaaaagc ttaccagctt attctagaaa agttgggaga tcaaattctt 4320
+ gatggaattg ctgatactat tgttgatagt acagtccaag atattttaga caaaatcaaa 4380
+ acagaccctt ctctaggttt gttgaaagct tttaacaact ttccaatcac taataaaatt 4440
+ caatgcaacg ggttattcac tcccagtaac attgaaactt tattaggagg aactgaaata 4500
+ ggaaaattca cagtcacacc caaaagctct gggagcatgt tcttagtctc agcagatatt 4560
+ attgcatcaa gaatggaagg cggcgttgtt ctagctttgg tacgagaagg tgattctaag 4620
+ ccctgcgcga ttagttatgg atactcatca ggcattccta atttatgtag tctaagaacc 4680
+ agtattacta atacaggatt gactccgaca acgtattcat tacgtgtagg cggtttagaa 4740
+ agcggtgtgg tatgggttaa tgccctttct aatggcaatg atattttagg aataacaaat 4800
+ acttctaatg tatctttttt agaggtaata cctcaaacaa acgcttaaac aatttttatt 4860
+ ggatttttct tataggtttt atatttagag aaaacagttc gaattacggg gtttgttatg 4920
+ caaaataaaa gaaaagtgag ggacgatttt attaaaattg ttaaagatgt gaaaaaagat 4980
+ ttccccgaat tagacctaaa aatacgagta aacaaggaaa aagtaacttt cttaaattct 5040
+ cccttagaac tctaccataa aagtgtctca ctaattctag gactgcttca acaaatagaa 5100
+ aactctttag gattattccc agactctcct gttcttgaaa aattagagga taacagttta 5160
+ aagctaaaaa aggctttgat tatgcttatc ttgtctagaa aagacatgtt ttccaaggct 5220
+ gaatagacaa cttactctaa cgttggagtt gatttgcaca ccttagtttt ttgctctttt 5280
+ aagggaggaa ctggaaaaac aacactttct ctaaacgtgg gatgcaactt ggcccaattt 5340
+ ttagggaaaa aagtgttact tgctgaccta gacccgcaat ccaatttatc ttctggattg 5400
+ ggggctagtg tcagaagtga ccaaaaaggc ttgcacgaca tagtatacac atcaaacgat 5460
+ ttaaaatcaa tcatttgcga aacaaaaaaa gatagtgtgg acctaattcc tgcatcattt 5520
+ tcatccgaac agtttagaga attggatatt catagaggac ctagtaacaa cttaaagtta 5580
+ tttctgaatg agtactgcgc tcctttttat gacatctgca taatagacac tccacctagc 5640
+ ctaggagggt taacgaaaga agcttttgtt gcaggagaca aattaattgc ttgtttaact 5700
+ ccagaacctt tttctattct agggttacaa aagatacgtg aattcttaag ttcggtcgga 5760
+ aaacctgaag aagaacacat tcttggaata gctttgtctt tttgggatga tcgtaactcg 5820
+ actaaccaaa tgtatataga cattatcgag tctatttaca aaaacaagct tttttcaaca 5880
+ aaaattcgtc gagatatttc tctcagccgt tctcttctta aagaagattc tgtagctaat 5940
+ gtctatccaa attctagggc cgcagaagat attctgaagt taacgcatga aatagcaaat 6000
+ attttgcata tcgaatatga acgagattac tctcagagga caacgtgaac aaactaaaaa 6060
+ aagaagcgga tgtctttttt aaaaaaaatc aaactgccgc ttctctagat tttaagaaga 6120
+ cgcttccctc cattgaacta ttctcagcaa ctttgaattc tgaggaaagt cagagtttgg 6180
+ atcgattatt tttatcagag tcccaaaact attcggatga agaattttat caagaagaca 6240
+ tcctagcggt aaaactgctt actggtcaga taaaatccat acagaagcaa cacgtacttc 6300
+ ttttaggaga aaaaatctat aatgctagaa aaatcctgag taaggatcac ttctcctcaa 6360
+ caactttttc atcttggata gagttagttt ttagaactaa gtcttctgct tacaatgctc 6420
+ ttgcatatta cgagcttttt ataaacctcc ccaaccaaac tctacaaaaa gagtttcaat 6480
+ cgatccccta taaatccgca tatattttgg ccgctagaaa aggcgattta aaaaccaagg 6540
+ tcgatgtgat agggaaagta tgtggaatgt cgaactcatc ggcgataagg gtgttggatc 6600
+ aatttcttcc ttcatctaga aacaaagacg ttagagaaac gatagataag tctgattcag 6660
+ agaagaatcg ccaattatct gatttcttaa tagagatact tcgcatcatg tgttccggag 6720
+ tttctttgtc ctcctataac gaaaatcttc tacaacagct ttttgaactt tttaagcaaa 6780
+ agagctgatc ctccgtcagc tcatatatat atatctatta tatatatata tttagggatt 6840
+ tgatttcacg agagagattt gcaactcttg gtggtagact ttgcaactct tggtggtaga 6900
+ ctttgcaact cttggtggta gactttgcaa ctcttggtgg tagacttggt cataatggac 6960
+ ttttgttaaa aaatttatta aaatcttaga gctccgattt tgaatagctt tggttaagaa 7020
+ aatgggctcg atggctttcc ataaaagtag attgttttta acttttgggg acgcgtcgga 7080
+ aatttggtta tctactttat cttatctaac tagaaaaaat tatgcgtctg ggattaactt 7140
+ tcttgtttct ttagagattc tggatttatc ggaaaccttg ataaaggcta tttctcttga 7200
+ ccacagcgaa tctttgttta aaatcaagtc tctagatgtt tttaatggaa aagttgtttc 7260
+ agaggcatct aaacaggcta gagcggcatg ctacatatct ttcacaaagt ttttgtatag 7320
+ attgaccaag ggatatatta aacccgctat tccattgaaa gattttggaa acactacatt 7380
+ ttttaaaatc cgagacaaaa tcaaaacaga atcgatttct aagcaggaat ggacagtttt 7440
+ ttttgaagcg ctccggatag tgaattatag agactattta atcggtaaat tgattgtaca 7500
+ ag 7502
+//
import static org.testng.AssertJUnit.assertFalse;
import static org.testng.AssertJUnit.assertSame;
import static org.testng.AssertJUnit.assertTrue;
+import static org.testng.AssertJUnit.fail;
import jalview.api.AlignViewportI;
import jalview.commands.EditCommand;
assertEquals(1, ranges.size());
assertEquals(9, ranges.get(0)[1]);
}
+
+ /**
+  * Checks MappingUtils.listToArray: an empty list is expected to give an
+  * empty array, the int[] entries are expected to be concatenated in list
+  * order, and a null list is expected to cause a NullPointerException.
+  */
+ @Test(groups = "Functional")
+ public void testListToArray()
+ {
+   List<int[]> ranges = new ArrayList<>();
+
+   // AssertJUnit.assertEquals takes (expected, actual) - keep that order so
+   // that any failure message reports the right way round
+   int[] result = MappingUtils.listToArray(ranges);
+   assertEquals(0, result.length);
+
+   // one range
+   ranges.add(new int[] {24, 12});
+   result = MappingUtils.listToArray(ranges);
+   assertEquals(2, result.length);
+   assertEquals(24, result[0]);
+   assertEquals(12, result[1]);
+
+   // two ranges: the second is appended after the first
+   ranges.add(new int[] {-7, 30});
+   result = MappingUtils.listToArray(ranges);
+   assertEquals(4, result.length);
+   assertEquals(24, result[0]);
+   assertEquals(12, result[1]);
+   assertEquals(-7, result[2]);
+   assertEquals(30, result[3]);
+
+   // null input
+   try
+   {
+     MappingUtils.listToArray(null);
+     fail("Expected exception");
+   } catch (NullPointerException e)
+   {
+     // expected
+   }
+ }
}
import static org.testng.AssertJUnit.assertSame;
import static org.testng.AssertJUnit.assertTrue;
+import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.DBRefSource;
import jalview.datamodel.SequenceI;
import java.util.Arrays;
import java.util.List;
+import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
-public class EmblSourceTest
+public class EmblXmlSourceTest
{
// adapted from http://www.ebi.ac.uk/ena/data/view/X07547&display=xml
+ "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT"
+ "</sequence></entry></ROOT>";
+ private EmblXmlSource testee;
+
+ /**
+  * Builds the EmblXmlSource instance shared by the tests in this class. It
+  * is an anonymous subclass whose four overridden methods all return null;
+  * the tests call other methods of the instance (getCdsRanges, getSequence,
+  * getEmblEntries).
+  */
+ @BeforeClass(alwaysRun = true)
+ public void setUp()
+ {
+   testee = new EmblXmlSource()
+   {
+     // stub overrides only - none of them returns a value
+
+     @Override
+     public AlignmentI getSequenceRecords(String queries) throws Exception
+     {
+       return null;
+     }
+
+     @Override
+     public String getTestQuery()
+     {
+       return null;
+     }
+
+     @Override
+     public String getDbName()
+     {
+       return null;
+     }
+
+     @Override
+     public String getDbSource()
+     {
+       return null;
+     }
+   };
+ }
+
@Test(groups = "Functional")
public void testGetCdsRanges()
{
- EmblSource testee = new EmblSource();
-
/*
* Make a (CDS) Feature with 5 locations
*/
Feature cds = new Feature();
- cds.setLocation("join(10..20,complement(30..40),50..60,70..80,complement(110..120))");
+ cds.setLocation(
+ "join(10..20,complement(30..40),50..60,70..80,complement(110..120))");
int[] exons = testee.getCdsRanges("EMBL", cds);
assertEquals("[10, 20, 40, 30, 50, 60, 70, 80, 120, 110]",
{
// not the whole sequence but enough for this test...
List<SequenceI> peptides = new ArrayList<>();
- List<EntryType> entries = EmblSourceTest.getEmblEntries();
+ List<EntryType> entries = getEmblEntries();
assertEquals(1, entries.size());
EntryType entry = entries.get(0);
- EmblSource testee = new EmblSource();
String sourceDb = "EMBL";
SequenceI dna = testee.getSequence(sourceDb, entry, peptides);
3, 1);
MapList cds2Map = new MapList(new int[] { 4, 15 }, new int[] { 1, 4 },
3, 1);
- MapList cds3Map = new MapList(new int[] { 4, 6, 10, 15 }, new int[] {
- 1, 3 }, 3, 1);
+ MapList cds3Map = new MapList(new int[] { 4, 6, 10, 15 },
+ new int[]
+ { 1, 3 }, 3, 1);
List<DBRefEntry> dbrefs = dna.getDBRefs();
assertEquals(7, dbrefs.size());
* - to EMBLCDS (with 1:3 mapping)
* - direct (no mapping) to other protein accessions
*/
- MapList proteinToCdsMap1 = new MapList(new int[] { 1, 4 }, new int[] {
- 1, 12 }, 1, 3);
- MapList proteinToCdsMap2 = new MapList(new int[] { 1, 3 }, new int[] {
- 1, 9 }, 1, 3);
+ MapList proteinToCdsMap1 = new MapList(new int[] { 1, 4 },
+ new int[]
+ { 1, 12 }, 1, 3);
+ MapList proteinToCdsMap2 = new MapList(new int[] { 1, 3 },
+ new int[]
+ { 1, 9 }, 1, 3);
// dbrefs for first CDS EMBL product CAA30420.1
dbrefs = peptides.get(0).getDBRefs();
@Test(groups = { "Functional" })
public void testGetEmblEntries()
{
- List<EntryType> entries = EmblSourceTest.getEmblEntries();
+ List<EntryType> entries = getEmblEntries();
assertEquals(1, entries.size());
EntryType entry = entries.get(0);
-
+
assertEquals("X07547", entry.getAccession());
assertEquals("C. trachomatis plasmid", entry.getDescription());
assertEquals("STD", entry.getDataClass());
assertEquals(2, entry.getKeyword().size());
assertEquals("plasmid", entry.getKeyword().get(0));
assertEquals("unidentified reading frame", entry.getKeyword().get(1));
-
+
/*
* dbrefs
*/
assertEquals("MD5", dbref.getDb());
assertEquals("ac73317", dbref.getId());
assertNull(dbref.getSecondaryId());
-
+
/*
* three sequence features for CDS
*/
q = ef.getQualifier().get(2);
assertEquals("translation", q.getName());
assertEquals("MLCF", q.getValue());
-
+
/*
* second CDS
*/
q = ef.getQualifier().get(1);
assertEquals("translation", q.getName());
assertEquals("MSSS", q.getValue());
-
+
/*
* third CDS
*/
q = ef.getQualifier().get(1);
assertEquals("translation", q.getName());
assertEquals("MSS", q.getValue());
-
+
/*
* Sequence - raw data before removal of newlines
*/
String seq = entry.getSequence();
- assertEquals(
- "GGTATGTCCTCTAGTACAAAC\n"
- + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT",
- seq);
-
+ assertEquals("GGTATGTCCTCTAGTACAAAC\n"
+ + "ACCCCCAATATTGTGATATAATTAAAAACATAGCAT", seq);
+
/*
* getSequence() converts empty DBRefEntry.version to "0"
*/
assertNull(entry.getFeature().get(0).getXref().get(1).getSecondaryId());
}
- static List<EntryType> getEmblEntries()
+ /**
+  * Helper that passes the bytes of the TESTDATA string, wrapped in a
+  * ByteArrayInputStream, to the shared testee's getEmblEntries method and
+  * returns the result.
+  */
+ List<EntryType> getEmblEntries()
{
- return new EmblSource()
+ return testee
.getEmblEntries(new ByteArrayInputStream(TESTDATA.getBytes()));
}
}
/*
* EMBL
*/
- assertEquals("https://www.ebi.ac.uk/ena/data/view/x53838&display=xml",
+ assertEquals("https://www.ebi.ac.uk/ena/browser/api/embl/x53838?download=true&gzip=true",
EBIFetchClient.buildUrl("X53838", "EMBL", "display=xml"));
/*
* EMBLCDS
*/
- assertEquals("https://www.ebi.ac.uk/ena/data/view/caa37824&display=xml",
+ assertEquals("https://www.ebi.ac.uk/ena/browser/api/embl/caa37824?download=true&gzip=true",
EBIFetchClient.buildUrl("CAA37824", "EMBL", "display=xml"));
/*