/*
 * Provenance (jalview.git):
 *   From: gmungoc
 *   Date: Thu, 19 May 2016 16:01:14 +0000 (+0100)
 *   Subject: JAL-2114 parser + tests for GenBank location descriptors
 *   X-Git-Tag: Release_2_10_0~218^2~6
 *   X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=a4507b66add69be7e5097dcc8fbd9bd08b0626cb;p=jalview.git
 *
 * Target path in the patch: src/jalview/util/DnaUtils.java
 * (declared there as "package jalview.util;" - the package statement is
 * left out here so that this unit compiles standalone).
 */
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Utility methods for parsing ENA/GenBank location descriptors.
 */
public class DnaUtils
{
  private static final String JOIN_PREFIX = "join(";

  private static final String COMPLEMENT_PREFIX = "complement(";

  private static final String ORDER_PREFIX = "order(";

  /*
   * the ".." between the start and end of a simple range; compiled once
   * rather than on every call
   */
  private static final Pattern RANGE_SEPARATOR = Pattern.compile("\\.\\.");

  /**
   * Parses an ENA/GenBank format location specifier and returns a list of
   * [start, end] ranges. Returns null if not able to parse.
   * <p>
   * Supported forms are a simple range ({@code 12..78}),
   * {@code join(loc1,loc2,...)} and {@code complement(loc)}, where join
   * members and the complement argument may themselves be any supported form
   * that contains no comma nested inside parentheses. For a complement,
   * ranges are returned in reverse order with start and end swapped (so start
   * is greater than end for a reverse strand range).
   * <p>
   * Not handled (null is returned and a message is written to System.err):
   * {@code order(...)}, partial locations such as {@code <1..888} or
   * {@code 1..>888}, and single-site forms such as {@code 102.110} or
   * {@code 123^124}.
   *
   * @param location
   *          the location descriptor; must not be null
   * @return the parsed ranges, or null if the descriptor could not be parsed
   * @throws NullPointerException
   *           if location is null
   * @see <a href="http://www.insdc.org/files/feature_table.html#3.4">INSDC
   *      feature table, section 3.4 (Location)</a>
   */
  public static List<int[]> parseLocation(String location)
  {
    if (location.startsWith(JOIN_PREFIX))
    {
      return parseJoin(location);
    }
    if (location.startsWith(COMPLEMENT_PREFIX))
    {
      return parseComplement(location);
    }
    if (location.startsWith(ORDER_PREFIX))
    {
      return unparseable(location);
    }

    /*
     * limit -1 keeps trailing empty tokens, so that a malformed "1..2.."
     * yields three tokens and is rejected rather than read as "1..2"
     */
    String[] range = RANGE_SEPARATOR.split(location, -1);
    if (range.length != 2)
    {
      /*
       * could be a location like 102.110 or 123^124
       */
      return unparseable(location);
    }
    try
    {
      int start = Integer.parseInt(range[0]);
      int end = Integer.parseInt(range[1]);
      return Collections.singletonList(new int[] { start, end });
    } catch (NumberFormatException e)
    {
      /*
       * could be a location like <1..888 or 1..>888
       */
      return unparseable(location);
    }
  }

  /**
   * Parses a complement(locationSpec) into a list of start-end ranges, with
   * both the order of the ranges and the direction of each range reversed.
   *
   * @param location
   *          a descriptor of the form {@code complement(...)}
   * @return the complemented ranges (a new list holding new arrays), or null
   *         if the descriptor is not a well-formed, parseable complement
   */
  static List<int[]> parseComplement(String location)
  {
    /*
     * take what is inside complement()
     */
    String toComplement = unwrap(location, COMPLEMENT_PREFIX);
    if (toComplement == null)
    {
      return unparseable(location);
    }
    List<int[]> ranges = parseLocation(toComplement);
    if (ranges == null)
    {
      /*
       * something bad in there (already reported)
       */
      return null;
    }

    /*
     * reverse the order and direction of ranges; build a fresh list rather
     * than modifying the parsed one, which may be an immutable singleton
     */
    List<int[]> complemented = new ArrayList<int[]>(ranges.size());
    for (int i = ranges.size() - 1; i >= 0; i--)
    {
      int[] range = ranges.get(i);
      complemented.add(new int[] { range[1], range[0] });
    }
    return complemented;
  }

  /**
   * Parses a join(loc1,loc2,...,locn) into a list of start-end ranges, in
   * the order in which the member locations appear.
   *
   * @param location
   *          a descriptor of the form {@code join(...)}
   * @return the concatenated ranges of all members, or null if the
   *         descriptor is not well-formed or any member cannot be parsed
   */
  static List<int[]> parseJoin(String location)
  {
    /*
     * take what is inside join()
     */
    String joinedLocs = unwrap(location, JOIN_PREFIX);
    if (joinedLocs == null)
    {
      return unparseable(location);
    }

    List<int[]> ranges = new ArrayList<int[]>();

    /*
     * limit -1 keeps a trailing empty member, so "join(1..2,)" is rejected
     * in the same way as "join(,1..2)"
     */
    for (String loc : joinedLocs.split(",", -1))
    {
      List<int[]> range = parseLocation(loc);
      if (range == null)
      {
        /*
         * something bad in there (already reported)
         */
        return null;
      }
      ranges.addAll(range);
    }
    return ranges;
  }

  /**
   * Returns the text between the given operator prefix, e.g. "join(", and
   * the closing bracket, or null if the location does not both start with
   * the prefix and end with ')'.
   *
   * @param location
   * @param prefix
   *          operator name including its opening bracket
   * @return
   */
  private static String unwrap(String location, String prefix)
  {
    if (!location.startsWith(prefix) || !location.endsWith(")"))
    {
      return null;
    }
    return location.substring(prefix.length(), location.length() - 1);
  }

  /**
   * Reports a location that cannot be processed on System.err, and returns
   * null for the caller to pass back as its 'unable to parse' result.
   *
   * @param location
   * @return null, always
   */
  private static List<int[]> unparseable(String location)
  {
    System.err.println("Unable to process location specifier: " + location);
    return null;
  }

}
DnaUtils.parseLocation("join(12..78,134..202,322..345)"); + assertEquals(3, ranges.size()); + assertEquals(12, ranges.get(0)[0]); + assertEquals(78, ranges.get(0)[1]); + assertEquals(134, ranges.get(1)[0]); + assertEquals(202, ranges.get(1)[1]); + assertEquals(322, ranges.get(2)[0]); + assertEquals(345, ranges.get(2)[1]); + + /* + * complement of a simple range + */ + ranges = DnaUtils.parseLocation("complement(34..126)"); + assertEquals(1, ranges.size()); + assertEquals(126, ranges.get(0)[0]); + assertEquals(34, ranges.get(0)[1]); + + /* + * complement of a join + */ + ranges = DnaUtils + .parseLocation("complement(join(2691..4571,4918..5163))"); + assertEquals(2, ranges.size()); + assertEquals(5163, ranges.get(0)[0]); + assertEquals(4918, ranges.get(0)[1]); + assertEquals(4571, ranges.get(1)[0]); + assertEquals(2691, ranges.get(1)[1]); + + /* + * join of two complements + */ + ranges = DnaUtils + .parseLocation("join(complement(4918..5163),complement(2691..4571))"); + assertEquals(2, ranges.size()); + assertEquals(5163, ranges.get(0)[0]); + assertEquals(4918, ranges.get(0)[1]); + assertEquals(4571, ranges.get(1)[0]); + assertEquals(2691, ranges.get(1)[1]); + + /* + * join complement to non-complement + * @see http://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation/ Transpliced Genes + */ + ranges = DnaUtils + .parseLocation("join(complement(36618..36700),86988..87064)"); + assertEquals(2, ranges.size()); + assertEquals(36700, ranges.get(0)[0]); + assertEquals(36618, ranges.get(0)[1]); + assertEquals(86988, ranges.get(1)[0]); + assertEquals(87064, ranges.get(1)[1]); + + /* + * valid things we don't yet handle + */ + assertNull(DnaUtils.parseLocation("<34..126")); + assertNull(DnaUtils.parseLocation("34..>126")); + assertNull(DnaUtils.parseLocation("34.126")); + assertNull(DnaUtils.parseLocation("34^126")); + + /* + * invalid things + */ + assertNull(DnaUtils.parseLocation("")); + assertNull(DnaUtils.parseLocation("JOIN(1..2)")); + 
assertNull(DnaUtils.parseLocation("join(1..2")); + try + { + assertNull(DnaUtils.parseLocation(null)); + fail("Expected exception"); + } catch (NullPointerException e) + { + // expected + } + } + +}