From: gmungoc Date: Mon, 27 Jul 2020 15:26:08 +0000 (+0100) Subject: JAL-3692 refactor/complete parsing, more unit test coverage X-Git-Tag: Release_2_11_1_1~13^2~24^2~4 X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=commitdiff_plain;h=582096635d0502a9bc8415c8f1ef3bcc9c545a44 JAL-3692 refactor/complete parsing, more unit test coverage --- diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java index 5be4364..f7a5161 100644 --- a/src/jalview/io/EmblFlatFile.java +++ b/src/jalview/io/EmblFlatFile.java @@ -3,14 +3,17 @@ package jalview.io; import java.io.IOException; import java.text.ParseException; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.Hashtable; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.TreeMap; import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; import jalview.datamodel.FeatureProperties; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; @@ -18,6 +21,7 @@ import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.util.DBRefUtils; import jalview.util.DnaUtils; +import jalview.util.MapList; import jalview.util.MappingUtils; /** @@ -41,6 +45,8 @@ import jalview.util.MappingUtils; */ public class EmblFlatFile extends AlignFile // FileParse { + private static final String QUOTE = "\""; + /** * A data bean class to hold values parsed from one CDS Feature (FT) */ @@ -56,6 +62,8 @@ public class EmblFlatFile extends AlignFile // FileParse String proteinId; // from CDS /protein_id + List xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers + Map cdsProps = new Hashtable<>(); // CDS other qualifiers } @@ -74,11 +82,14 @@ public class EmblFlatFile extends AlignFile // FileParse private int length = 128; // from ID (7th token), with usable default - private List dbrefs; // from DR and also CDS /db_xref qualifiers + private List dbrefs; // from DR private String sequenceString; // from SQ lines - private List cds; + /* + * parsed CDS data fields, keyed by protein_id + */ + private Map cds; /** * Constructor @@ -92,7 +103,11 @@ public class EmblFlatFile extends AlignFile // FileParse super(false, fp); // don't parse immediately this.sourceDb = sourceId; dbrefs = new ArrayList<>(); - cds = new ArrayList<>(); + + /* + * using TreeMap gives CDS sequences in alphabetical, so readable, order + */ + cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); } /** @@ -131,7 +146,7 @@ public class EmblFlatFile extends AlignFile // FileParse line = nextLine(); } } - assembleSequence(); + buildSequence(); } /** @@ -329,12 +344,14 @@ public class EmblFlatFile extends AlignFile // FileParse int eqPos = line.indexOf('=', slashPos + 1); if (eqPos == -1) { - Cache.log.error("Unexpected EMBL line ignored: " + line); + // can happen, e.g. /ribosomal_slippage +// Cache.log.error("Unexpected EMBL line ignored: " + line); + line = nextLine(); continue; } String qualifier = line.substring(slashPos + 1, eqPos); String value = line.substring(eqPos + 1); - if (value.startsWith("\"") && value.endsWith("\"")) + if (value.startsWith(QUOTE) && value.endsWith(QUOTE)) { value = value.substring(1, value.length() - 1); } @@ -364,7 +381,7 @@ public class EmblFlatFile extends AlignFile // FileParse String db = parts[0].trim(); db = DBRefUtils.getCanonicalName(db); DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim()); - this.dbrefs.add(dbref); + data.xrefs.add(dbref); } line = nextLine(); } @@ -376,7 +393,7 @@ public class EmblFlatFile extends AlignFile // FileParse } else if ("translation".equals(qualifier)) { - line = readTranslation(value, data); + line = parseTranslation(value, data); } else if (!"".equals(value)) { @@ -386,7 +403,15 @@ public class EmblFlatFile extends AlignFile // FileParse } } - this.cds.add(data); + if (data.proteinId != null) + { + this.cds.put(data.proteinId, data); + } + else + { + Cache.log.error("Ignoring CDS feature with no protein_id for " + + sourceDb + ":" + accession); + } return line; } @@ -401,10 +426,10 @@ public class EmblFlatFile extends AlignFile // FileParse * @return * @throws IOException */ - String readTranslation(String value, CdsData data) throws IOException + String parseTranslation(String value, CdsData data) throws IOException { StringBuilder sb = new StringBuilder(this.length / 3 + 1); - sb.append(value.replace("\"", "")); + sb.append(value.replace(QUOTE, "")); String line; while ((line = nextLine()) != null) @@ -423,7 +448,7 @@ public class EmblFlatFile extends AlignFile // FileParse { break; // next feature qualifier } - sb.append(tokens[1].replace("\"", "")); + sb.append(tokens[1].replace(QUOTE, "")); } data.translation = sb.toString(); @@ -434,7 +459,7 @@ public class EmblFlatFile extends AlignFile // FileParse /** * Constructs and saves the sequence from parsed components */ - void assembleSequence() + void buildSequence() { String name = this.accession; if (this.sourceDb != null) @@ -457,7 +482,7 @@ public class EmblFlatFile extends AlignFile // FileParse seq.addDBRef(dbref); } - processAllCDS(seq); + processCDSFeatures(seq); seq.deriveSequence(); @@ -470,25 +495,25 @@ public class EmblFlatFile extends AlignFile // FileParse * * @param seq */ - protected void processAllCDS(SequenceI seq) + protected void processCDSFeatures(SequenceI seq) { /* * record protein products found to avoid duplication i.e. >1 CDS with * the same /protein_id [though not sure I can find an example of this] */ Map proteins = new HashMap<>(); - for (CdsData data : cds) + for (CdsData data : cds.values()) { - processOneCDS(seq, data, proteins); + processCDSFeature(seq, data, proteins); } } /** - * Processes the parsed CDS feature data to + * Processes data for one parsed CDS feature to *
    - *
  • add a CDS feature to the sequence for each CDS start-end range
  • *
  • create a protein product sequence for the translation
  • *
  • create a cross-reference to protein with mapping from dna
  • + *
  • add a CDS feature to the sequence for each CDS start-end range
  • *
  • add any CDS dbrefs to the sequence and to the protein product
  • *
* @@ -497,13 +522,16 @@ public class EmblFlatFile extends AlignFile // FileParse * @param proteins * map of protein products so far derived from CDS data */ - void processOneCDS(SequenceI dna, CdsData data, + void processCDSFeature(SequenceI dna, CdsData data, Map proteins) { /* * parse location into a list of [start, end, start, end] positions */ int[] exons = getCdsRanges(this.accession, data.cdsLocation); + + MapList maplist = buildMappingToProtein(dna, exons, data); + int exonNumber = 0; for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) @@ -531,9 +559,127 @@ public class EmblFlatFile extends AlignFile // FileParse sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName); dna.addSequenceFeature(sf); + } - linkProteinProduct(dna, data, proteins); + boolean hasUniprotDbref = false; + for (DBRefEntry xref : data.xrefs) + { + dna.addDBRef(xref); + if (xref.getSource().equals(DBRefSource.UNIPROT)) + { + /* + * construct (or find) the sequence for (data.protein_id, data.translation) + */ + SequenceI protein = buildProteinProduct(dna, xref, data, proteins); + Mapping map = new Mapping(protein, maplist); + map.setMappedFromId(data.proteinId); + xref.setMap(map); + + /* + * add DBRefs with mappings from dna to protein and the inverse + */ + DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession); + db1.setMap(new Mapping(dna, maplist.getInverse())); + protein.addDBRef(db1); + + hasUniprotDbref = true; + } } + + /* + * if we have a product (translation) but no explicit Uniprot dbref + * (example: EMBL M19487 protein_id AAB02592.1) + * then construct mappings to an assumed EMBLCDSPROTEIN accession + */ + if (!hasUniprotDbref) + { + SequenceI protein = proteins.get(data.proteinId); + if (protein == null) + { + protein = new Sequence(data.proteinId, data.translation); + proteins.put(data.proteinId, protein); + } + // assuming CDSPROTEIN sequence version = dna version (?!) + DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct, + this.version, data.proteinId); + protein.addDBRef(db1); + + DBRefEntry dnaToEmblProteinRef = new DBRefEntry( + DBRefSource.EMBLCDSProduct, this.version, data.proteinId); + Mapping map = new Mapping(protein, maplist); + map.setMappedFromId(data.proteinId); + dnaToEmblProteinRef.setMap(map); + dna.addDBRef(dnaToEmblProteinRef); + } + + /* + * comment brought forward from EmblXmlSource, lines 447-451: + * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL + * sequence with the exon map; if given a dataset reference, search + * dataset for parent EMBL sequence if it exists and set its map; + * make a new feature annotating the coding contig + */ + } + + /** + * Computes a mapping from CDS positions in DNA sequence to protein product + * positions, with allowance for stop codon or incomplete start codon + * + * @param dna + * @param exons + * @param data + * @return + */ + MapList buildMappingToProtein(final SequenceI dna, final int[] exons, + final CdsData data) + { + MapList dnaToProteinMapping = null; + int peptideLength = data.translation.length(); + + int[] proteinRange = new int[] { 1, peptideLength }; + if (exons != null && exons.length > 0) + { + /* + * We were able to parse 'location'; do a final + * product length truncation check + */ + int[] cdsRanges = adjustForProteinLength(peptideLength, exons); + dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1); + } + else + { + /* + * workaround until we handle all 'location' formats fully + * e.g. X53828.1:60..1058 or <123..>289 + */ + Cache.log.error(String.format( + "Implementation Notice: EMBLCDS location '%s'not properly supported yet" + + " - Making up the CDNA region of (%s:%s)... may be incorrect", + data.cdsLocation, sourceDb, this.accession)); + + int completeCodonsLength = 1 - data.codonStart + dna.getLength(); + int mappedDnaEnd = dna.getEnd(); + if (peptideLength * 3 == completeCodonsLength) + { + // this might occur for CDS sequences where no features are marked + Cache.log.warn("Assuming no stop codon at end of cDNA fragment"); + mappedDnaEnd = dna.getEnd(); + } + else if ((peptideLength + 1) * 3 == completeCodonsLength) + { + Cache.log.warn("Assuming stop codon at end of cDNA fragment"); + mappedDnaEnd = dna.getEnd() - 3; + } + + if (mappedDnaEnd != -1) + { + int[] cdsRanges = new int[] { + dna.getStart() + (data.codonStart - 1), mappedDnaEnd }; + dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1); + } + } + + return dnaToProteinMapping; } /** @@ -541,31 +687,37 @@ public class EmblFlatFile extends AlignFile // FileParse * one), and dbrefs with mappings from CDS to protein and the reverse * * @param dna + * @param xref * @param data * @param proteins + * @return */ - void linkProteinProduct(SequenceI dna, CdsData data, Map proteins) + SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref, + CdsData data, Map proteins) { /* * check we have some data to work with */ if (data.proteinId == null || data.translation == null) { - return; + return null; } - + /* * Construct the protein sequence (if not already seen) */ - SequenceI protein = proteins.get(data.proteinId); + String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId(); + SequenceI protein = proteins.get(proteinSeqName); if (protein == null) { - protein = new Sequence(data.proteinId, data.translation, 1, + protein = new Sequence(proteinSeqName, data.translation, 1, data.translation.length()); protein.setDescription(data.proteinName != null ? data.proteinName : "Protein Product from " + sourceDb); - proteins.put(data.proteinId, protein); + proteins.put(proteinSeqName, protein); } + + return protein; } /** @@ -604,4 +756,81 @@ public class EmblFlatFile extends AlignFile // FileParse { return null; } + + /** + * Truncates (if necessary) the exon intervals to match 3 times the length of + * the protein; also accepts 3 bases longer (for stop codon not included in + * protein) + * + * @param proteinLength + * @param exon + * an array of [start, end, start, end...] intervals + * @return the same array (if unchanged) or a truncated copy + */ + static int[] adjustForProteinLength(int proteinLength, int[] exon) + { + if (proteinLength <= 0 || exon == null) + { + return exon; + } + int expectedCdsLength = proteinLength * 3; + int exonLength = MappingUtils.getLength(Arrays.asList(exon)); + + /* + * if exon length matches protein, or is shorter, or longer by the + * length of a stop codon (3 bases), then leave it unchanged + */ + if (expectedCdsLength >= exonLength + || expectedCdsLength == exonLength - 3) + { + return exon; + } + + int origxon[]; + int sxpos = -1; + int endxon = 0; + origxon = new int[exon.length]; + System.arraycopy(exon, 0, origxon, 0, exon.length); + int cdspos = 0; + for (int x = 0; x < exon.length; x += 2) + { + cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; + if (expectedCdsLength <= cdspos) + { + // advanced beyond last codon. + sxpos = x; + if (expectedCdsLength != cdspos) + { + // System.err + // .println("Truncating final exon interval on region by " + // + (cdspos - cdslength)); + } + + /* + * shrink the final exon - reduce end position if forward + * strand, increase it if reverse + */ + if (exon[x + 1] >= exon[x]) + { + endxon = exon[x + 1] - cdspos + expectedCdsLength; + } + else + { + endxon = exon[x + 1] + cdspos - expectedCdsLength; + } + break; + } + } + + if (sxpos != -1) + { + // and trim the exon interval set if necessary + int[] nxon = new int[sxpos + 2]; + System.arraycopy(exon, 0, nxon, 0, sxpos + 2); + nxon[sxpos + 1] = endxon; // update the end boundary for the new exon + // set + exon = nxon; + } + return exon; + } } diff --git a/src/jalview/ws/dbsources/EmblCdsSource.java b/src/jalview/ws/dbsources/EmblCdsSource.java index 7455e4f..d02910c 100644 --- a/src/jalview/ws/dbsources/EmblCdsSource.java +++ b/src/jalview/ws/dbsources/EmblCdsSource.java @@ -20,12 +20,10 @@ */ package jalview.ws.dbsources; -import com.stevesoft.pat.Regex; - import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefSource; -public class EmblCdsSource extends /*EmblXmlSource */ EmblFlatfileSource +public class EmblCdsSource extends EmblFlatfileSource // was EmblXmlSource { public EmblCdsSource() diff --git a/src/jalview/ws/dbsources/EmblSource.java b/src/jalview/ws/dbsources/EmblSource.java index 4cff4a0..df43bc3 100644 --- a/src/jalview/ws/dbsources/EmblSource.java +++ b/src/jalview/ws/dbsources/EmblSource.java @@ -27,7 +27,7 @@ import jalview.datamodel.DBRefSource; * @author JimP * */ -public class EmblSource extends /* EmblXmlSource */ EmblFlatfileSource +public class EmblSource extends EmblFlatfileSource // was EmblXmlSource { public EmblSource() diff --git a/src/jalview/ws/dbsources/EmblXmlSource.java b/src/jalview/ws/dbsources/EmblXmlSource.java index 6b6f2ec..97d7c9f 100644 --- a/src/jalview/ws/dbsources/EmblXmlSource.java +++ b/src/jalview/ws/dbsources/EmblXmlSource.java @@ -574,6 +574,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy proteinSeq = new Sequence(proteinSeqName, product.getSequenceAsString()); matcher.add(proteinSeq); + proteinSeq.setDescription(product.getDescription()); peptides.add(proteinSeq); } dnaToProteinMapping.setTo(proteinSeq); diff --git a/test/jalview/io/EmblFlatFileTest.java b/test/jalview/io/EmblFlatFileTest.java index b1023d1..949e0a2 100644 --- a/test/jalview/io/EmblFlatFileTest.java +++ b/test/jalview/io/EmblFlatFileTest.java @@ -2,10 +2,15 @@ package jalview.io; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; +import static org.testng.AssertJUnit.assertNotNull; +import static org.testng.AssertJUnit.assertSame; +import static org.testng.AssertJUnit.fail; +import static org.testng.AssertJUnit.assertNull; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; +import java.util.Arrays; import java.util.List; import java.util.Set; @@ -13,9 +18,11 @@ import org.testng.annotations.Test; import jalview.datamodel.DBRefEntry; import jalview.datamodel.Mapping; +import jalview.datamodel.Sequence.DBModList; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.datamodel.features.SequenceFeatures; +import jalview.util.MapList; public class EmblFlatFileTest { @@ -39,7 +46,8 @@ public class EmblFlatFileTest SequenceI seq = seqs.get(0); assertEquals(seq.getName(), "EmblTest|J03321"); assertEquals(seq.getLength(), 7502); - assertEquals(seq.getDescription(), "Chlamydia trachomatis plasmid pCHL1, complete sequence"); + assertEquals(seq.getDescription(), + "Chlamydia trachomatis plasmid pCHL1, complete sequence"); /* * should be 9 CDS features (one is a 'join' of two exons) @@ -47,15 +55,15 @@ public class EmblFlatFileTest Set featureTypes = seq.getFeatures().getFeatureTypes(); assertEquals(featureTypes.size(), 1); assertTrue(featureTypes.contains("CDS")); - + /* * inspect some features (sorted just for convenience of test assertions) */ List features = seq.getFeatures() .getAllFeatures("CDS"); - SequenceFeatures.sortFeatures(features, true); + SequenceFeatures.sortFeatures(features, true); assertEquals(features.size(), 9); - + SequenceFeature sf = features.get(0); assertEquals(sf.getBegin(), 1); assertEquals(sf.getEnd(), 437); @@ -70,7 +78,7 @@ public class EmblFlatFileTest assertEquals(sf.getValue("exon number"), 2); assertEquals(sf.getValue("product"), "hypothetical protein"); assertEquals(sf.getValue("transl_table"), "11"); - + sf = features.get(1); assertEquals(sf.getBegin(), 488); assertEquals(sf.getEnd(), 1480); @@ -83,7 +91,7 @@ public class EmblFlatFileTest assertEquals(sf.getValue("note"), "pGP8-D"); assertEquals(sf.getValue("exon number"), 1); assertEquals(sf.getValue("product"), "hypothetical protein"); - + sf = features.get(7); assertEquals(sf.getBegin(), 6045); assertEquals(sf.getEnd(), 6788); @@ -96,7 +104,7 @@ public class EmblFlatFileTest assertEquals(sf.getValue("note"), "pGP6-D (gtg start codon)"); assertEquals(sf.getValue("exon number"), 1); assertEquals(sf.getValue("product"), "hypothetical protein"); - + /* * CDS at 7022-7502 is the first exon of the circular CDS */ @@ -114,31 +122,203 @@ public class EmblFlatFileTest assertEquals(sf.getValue("product"), "hypothetical protein"); /* - * Jalview adds a dbref to 'self', and there are 4 'direct' (DR) dbrefs, - * and numerous CDS /db_xref entries (some e.g. INTERPRO are duplicates) - * sample a few here - * Note DBRefEntry constructor capitalises source + * Verify DBRefs, whether declared in the file or added by Jalview. + * There are 4 'direct' (DR) dbrefs, and numerous CDS /db_xref entries + * (some e.g. INTERPRO are duplicates). Jalview adds a dbref to 'self'. + * Sample a few here. Note DBRefEntry constructor capitalises source. */ List dbrefs = seq.getDBRefs(); assertEquals(dbrefs.size(), 32); // xref to 'self': DBRefEntry selfRef = new DBRefEntry("EMBLTEST", "1", "J03321"); - int[] range = new int[] {1, seq.getLength()}; + int[] range = new int[] { 1, seq.getLength() }; selfRef.setMap(new Mapping(null, range, range, 1, 1)); assertTrue(dbrefs.contains(selfRef)); - + // 1st DR line; note trailing period is removed assertTrue(dbrefs.contains(new DBRefEntry("MD5", "0", "d4c4942a634e3df4995fd5ac75c26a61"))); // the 4th DR line: assertTrue( dbrefs.contains(new DBRefEntry("EUROPEPMC", "0", "PMC87941"))); - // from the first CDS feature; note canonicalisation to "UNIPROT" + // from the first CDS feature assertTrue(dbrefs.contains(new DBRefEntry("GOA", "0", "P0CE19"))); - assertTrue(dbrefs.contains(new DBRefEntry("UNIPROT", "0", "P0CE19"))); // from the last CDS feature - assertTrue(dbrefs.contains(new DBRefEntry("INTERPRO", "0", "IPR005350"))); + assertTrue( + dbrefs.contains(new DBRefEntry("INTERPRO", "0", "IPR005350"))); + + /* + * verify mappings to, and sequences for, UNIPROT proteins + */ + int uniprotCount = 0; + List ranges; + for (DBRefEntry dbref : dbrefs) + { + if ("UNIPROT".equals(dbref.getSource())) + { + uniprotCount++; + Mapping mapping = dbref.getMap(); + assertNotNull(mapping); + MapList map = mapping.getMap(); + String mappedToName = mapping.getTo().getName(); + if ("UNIPROT|P0CE16".equals(mappedToName)) + { + assertEquals((ranges = map.getFromRanges()).size(), 1); + assertEquals(ranges.get(0)[0], 1579); + assertEquals(ranges.get(0)[1], 2934); + assertEquals((ranges = map.getToRanges()).size(), 1); + assertEquals(ranges.get(0)[0], 1); + assertEquals(ranges.get(0)[1], 451); + // CDS /product carries over as protein product description + assertEquals(mapping.getTo().getDescription(), + "hypothetical protein"); + } + else if ("UNIPROT|P0CE17".equals(mappedToName)) + { + assertEquals((ranges = map.getFromRanges()).size(), 1); + assertEquals(ranges.get(0)[0], 2928); + assertEquals(ranges.get(0)[1], 3992); + assertEquals((ranges = map.getToRanges()).size(), 1); + assertEquals(ranges.get(0)[0], 1); + assertEquals(ranges.get(0)[1], 354); + } + else if ("UNIPROT|P0CE18".equals(mappedToName)) + { + assertEquals((ranges = map.getFromRanges()).size(), 1); + assertEquals(ranges.get(0)[0], 4054); + assertEquals(ranges.get(0)[1], 4848); + assertEquals((ranges = map.getToRanges()).size(), 1); + assertEquals(ranges.get(0)[0], 1); + assertEquals(ranges.get(0)[1], 264); + } + else if ("UNIPROT|P0CE19".equals(mappedToName)) + { + // join(7022..7502,1..437) + assertEquals((ranges = map.getFromRanges()).size(), 2); + assertEquals(ranges.get(0)[0], 7022); + assertEquals(ranges.get(0)[1], 7502); + assertEquals(ranges.get(1)[0], 1); + assertEquals(ranges.get(1)[1], 437); + assertEquals((ranges = map.getToRanges()).size(), 1); + assertEquals(ranges.get(0)[0], 1); + assertEquals(ranges.get(0)[1], 305); + } + else if ("UNIPROT|P0CE20".equals(mappedToName)) + { + // complement(488..1480) + assertEquals((ranges = map.getFromRanges()).size(), 1); + assertEquals(ranges.get(0)[0], 1480); + assertEquals(ranges.get(0)[1], 488); + assertEquals((ranges = map.getToRanges()).size(), 1); + assertEquals(ranges.get(0)[0], 1); + assertEquals(ranges.get(0)[1], 330); + } + else if (!"UNIPROT|P0CE23".equals(mappedToName) + && !"UNIPROT|P10559".equals(mappedToName) + && !"UNIPROT|P10560".equals(mappedToName)) + { + fail("Unexpected UNIPROT dbref to " + mappedToName); + } + } + } + assertEquals(uniprotCount, 8); + } + + @Test(groups = "Functional") + public void testParse_codonStartNot1() + { + // TODO verify CDS-to-protein mapping for CDS with /codon_start=2 + // example: https://www.ebi.ac.uk/ena/browser/api/embl/EU498516 + } + + /** + * Test for the case that the EMBL CDS has no UNIPROT xref. In this case + * Jalview should synthesize an xref to EMBLCDSPROTEIN in the hope this will + * allow Get Cross-References. + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testParse_noUniprotXref() throws IOException + { + // MN908947 cut down to 40BP, one CDS, length 5 peptide for test purposes + String data = "ID MN908947; SV 3; linear; genomic RNA; STD; VRL; 20 BP.\n" + + "DE Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1,\n" + + "FT CDS 3..17\n" + + "FT /protein_id=\"QHD43415.1\"\n" + + "FT /translation=\"MRKLD\n" + + "SQ Sequence 7496 BP; 2450 A; 1290 C; 1434 G; 2322 T; 0 other;\n" + + " ggatGcgtaa gttagacgaa attttgtctt tgcgcacaga 40\n"; + FileParse fp = new FileParse(data, DataSourceType.PASTE); + EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest"); + parser.parse(); + List seqs = parser.getSeqs(); + assertEquals(seqs.size(), 1); + SequenceI seq = seqs.get(0); + DBModList dbrefs = seq.getDBRefs(); + + /* + * dna should have dbref to itself, and to inferred EMBLCDSPROTEIN:QHD43415.1 + */ + assertEquals(dbrefs.size(), 2); + + // dbref to self + DBRefEntry dbref = dbrefs.get(0); + assertEquals(dbref.getSource(), "EMBLTEST"); + assertEquals(dbref.getAccessionId(), "MN908947"); + Mapping mapping = dbref.getMap(); + assertNull(mapping.getTo()); + MapList map = mapping.getMap(); + assertEquals(map.getFromLowest(), 1); + assertEquals(map.getFromHighest(), 40); + assertEquals(map.getToLowest(), 1); + assertEquals(map.getToHighest(), 40); + assertEquals(map.getFromRatio(), 1); + assertEquals(map.getToRatio(), 1); + + // dbref to inferred EMBLCDSPROTEIN: + dbref = dbrefs.get(1); + assertEquals(dbref.getSource(), "EMBLCDSPROTEIN"); + assertEquals(dbref.getAccessionId(), "QHD43415.1"); + mapping = dbref.getMap(); + SequenceI mapTo = mapping.getTo(); + assertEquals(mapTo.getName(), "QHD43415.1"); + assertEquals(mapTo.getSequenceAsString(), "MRKLD"); + map = mapping.getMap(); + assertEquals(map.getFromLowest(), 3); + assertEquals(map.getFromHighest(), 17); + assertEquals(map.getToLowest(), 1); + assertEquals(map.getToHighest(), 5); + assertEquals(map.getFromRatio(), 3); + assertEquals(map.getToRatio(), 1); + } + + @Test(groups = "Functional") + public void testAdjustForProteinLength() + { + int[] exons = new int[] { 11, 15, 21, 25, 31, 38 }; // 18 bp + + // exact length match: + assertSame(exons, EmblFlatFile.adjustForProteinLength(6, exons)); + + // match if we assume exons include stop codon not in protein: + assertSame(exons, EmblFlatFile.adjustForProteinLength(5, exons)); + + // truncate last exon by 6bp + int[] truncated = EmblFlatFile.adjustForProteinLength(4, exons); + assertEquals("[11, 15, 21, 25, 31, 32]", Arrays.toString(truncated)); + + // remove last exon and truncate preceding by 1bp (so 3bp in total) + truncated = EmblFlatFile.adjustForProteinLength(3, exons); + assertEquals("[11, 15, 21, 24]", Arrays.toString(truncated)); + + // exact removal of exon case: + exons = new int[] { 11, 15, 21, 27, 33, 38 }; // 18 bp + truncated = EmblFlatFile.adjustForProteinLength(4, exons); + assertEquals("[11, 15, 21, 27]", Arrays.toString(truncated)); - // todo: mappings to, and sequences for, UNIPROT proteins + // what if exons are too short for protein? + truncated = EmblFlatFile.adjustForProteinLength(7, exons); + assertSame(exons, truncated); } }