From: gmungoc Date: Thu, 27 Aug 2020 14:40:22 +0000 (+0100) Subject: JAL-3725 exclude stop codon from CDS-to-protein mapping X-Git-Tag: Release_2_11_2_0~13^2~4^2~10 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=fe8a6f4ccc5044d3e7653fef1633f50369121368;p=jalview.git JAL-3725 exclude stop codon from CDS-to-protein mapping --- diff --git a/src/jalview/ws/dbsources/EmblXmlSource.java b/src/jalview/ws/dbsources/EmblXmlSource.java index c2d661b..06cbb13 100644 --- a/src/jalview/ws/dbsources/EmblXmlSource.java +++ b/src/jalview/ws/dbsources/EmblXmlSource.java @@ -20,8 +20,6 @@ */ package jalview.ws.dbsources; -import java.util.Locale; - import java.io.File; import java.io.FileInputStream; import java.io.InputStream; @@ -30,6 +28,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Hashtable; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Map.Entry; @@ -65,12 +64,6 @@ import jalview.xml.binding.embl.EntryType.Feature.Qualifier; import jalview.xml.binding.embl.ROOT; import jalview.xml.binding.embl.XrefType; -/** - * Provides XML binding and parsing of EMBL or EMBLCDS records retrieved from - * (e.g.) {@code https://www.ebi.ac.uk/ena/data/view/x53828&display=xml}. - * - * @deprecated endpoint withdrawn August 2020 (JAL-3692), use EmblFlatfileSource - */ public abstract class EmblXmlSource extends EbiFileRetrievedProxy { private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+"); @@ -104,8 +97,8 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy try { reply = dbFetch.fetchDataAsFile( - emprefx.toLowerCase(Locale.ROOT) + ":" + query.trim(), "display=xml", - "xml"); + emprefx.toLowerCase(Locale.ROOT) + ":" + query.trim(), + "display=xml", "xml"); } catch (Exception e) { stopQuery(); @@ -455,9 +448,8 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy else { // final product length truncation check - int[] cdsRanges = adjustForProteinLength(translationLength, - exons); - dnaToProteinMapping = new Mapping(product, cdsRanges, + int[] exons2 = adjustForProteinLength(translationLength, exons); + dnaToProteinMapping = new Mapping(product, exons2, new int[] { 1, translationLength }, 3, 1); if (product != null) @@ -759,8 +751,7 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy /** * Truncates (if necessary) the exon intervals to match 3 times the length of - * the protein; also accepts 3 bases longer (for stop codon not included in - * protein) + * the protein (including truncation for stop codon included in exon) * * @param proteinLength * @param exon @@ -777,11 +768,9 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy int exonLength = MappingUtils.getLength(Arrays.asList(exon)); /* - * if exon length matches protein, or is shorter, or longer by the - * length of a stop codon (3 bases), then leave it unchanged + * if exon length matches protein, or is shorter, then leave it unchanged */ - if (expectedCdsLength >= exonLength - || expectedCdsLength == exonLength - 3) + if (expectedCdsLength >= exonLength) { return exon; } diff --git a/test/jalview/ws/dbsources/EmblXmlSourceTest.java b/test/jalview/ws/dbsources/EmblXmlSourceTest.java index a0991e5..236e0a5 100644 --- a/test/jalview/ws/dbsources/EmblXmlSourceTest.java +++ b/test/jalview/ws/dbsources/EmblXmlSourceTest.java @@ -26,6 +26,14 @@ import static org.testng.AssertJUnit.assertNull; import static org.testng.AssertJUnit.assertSame; import static org.testng.AssertJUnit.assertTrue; +import java.io.ByteArrayInputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; @@ -36,14 +44,6 @@ import jalview.xml.binding.embl.EntryType.Feature; import jalview.xml.binding.embl.EntryType.Feature.Qualifier; import jalview.xml.binding.embl.XrefType; -import java.io.ByteArrayInputStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - public class EmblXmlSourceTest { @@ -352,11 +352,12 @@ public class EmblXmlSourceTest // exact length match: assertSame(exons, EmblXmlSource.adjustForProteinLength(6, exons)); - // match if we assume exons include stop codon not in protein: - assertSame(exons, EmblXmlSource.adjustForProteinLength(5, exons)); + // truncate last exon by 3bp (e.g. stop codon) + int[] truncated = EmblXmlSource.adjustForProteinLength(5, exons); + assertEquals("[11, 15, 21, 25, 31, 35]", Arrays.toString(truncated)); // truncate last exon by 6bp - int[] truncated = EmblXmlSource.adjustForProteinLength(4, exons); + truncated = EmblXmlSource.adjustForProteinLength(4, exons); assertEquals("[11, 15, 21, 25, 31, 32]", Arrays.toString(truncated)); // remove last exon and truncate preceding by 1bp