From 8df2831b1c0c41bb8f9d3d26b25392efbd908885 Mon Sep 17 00:00:00 2001
From: gmungoc
Date: Thu, 1 Oct 2015 10:42:46 +0100
Subject: [PATCH] JAL-1499 replace identity symbols when parsing

---
 Review note: reformatSequenceData() and the identityCharacter assignment are
 hardened relative to the original commit (no exception when an identity
 symbol has no reference residue, or when the declared symbol is empty), so
 the hunk sizes and diffstat differ from that commit and the blob ids on the
 index lines describe the original post-image.

 src/jalview/io/MegaFile.java      |   64 +++++++++++++++++++++++++++----
 test/jalview/io/MegaFileTest.java |   76 +++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+), 7 deletions(-)

diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java
index 2f48bf5..2f5e35a 100644
--- a/src/jalview/io/MegaFile.java
+++ b/src/jalview/io/MegaFile.java
@@ -134,6 +134,9 @@ public class MegaFile extends AlignFile
   // gap character may be explicitly declared, if not we infer it
   private Character gapCharacter;
 
+  // identity character if declared
+  private char identityCharacter = 0;
+
   // this can be True, False or null (meaning not asserted in file)
   private Boolean nucleotide;
 
@@ -465,9 +468,7 @@ public class MegaFile extends AlignFile
 
     StringBuilder sb = getSequenceDataBuffer(seqData, currentId);
 
-    /*
-     * Add the current line of data to the sequence.
-     */
+    dataLine = reformatSequenceData(dataLine, sb.length(), seqData);
     sb.append(dataLine);
 
     setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
@@ -529,10 +530,7 @@ public class MegaFile extends AlignFile
      */
     if (data != null && data.length() > 0)
     {
-      if (data.indexOf(SPACE) != -1)
-      {
-        data = data.replace(SPACE, "");
-      }
+      data = reformatSequenceData(data, sb.length(), seqData);
       sb.append(data);
       setPositionsPerLine(Math.max(positionsPerLine, data.length()));
       assertInterleaved(true, dataLine);
@@ -540,6 +538,53 @@ public class MegaFile extends AlignFile
   }
 
   /**
+   * Reformat input sequence data by removing any internal formatting spaces,
+   * and converting any 'identity' characters to the corresponding position in
+   * the first sequence. An identity character with no corresponding position
+   * in the first sequence (for example one found while the first sequence is
+   * itself still being read) is left unchanged.
+   * 
+   * @param data
+   *          the sequence data to reformat
+   * @param startPos
+   *          the sequence position (base 0) of the start of the data
+   * @param seqData
+   *          sequence data read so far; its first value is the reference
+   *          sequence
+   * @return the data with spaces removed and identity characters replaced
+   */
+  protected String reformatSequenceData(String data, int startPos,
+          Map<String, StringBuilder> seqData)
+  {
+    String formatted = data.replace(SPACE, "");
+    if (formatted.indexOf(identityCharacter) > -1 && !seqData.isEmpty())
+    {
+      /*
+       * sequence contains '.' or other identity symbol; replace these with the
+       * same position from the first (reference) sequence
+       */
+      StringBuilder referenceSequence = seqData.values().iterator().next();
+      StringBuilder sb = new StringBuilder(formatted.length());
+      for (int i = 0; i < formatted.length(); i++)
+      {
+        char nextChar = formatted.charAt(i);
+        int refPos = startPos + i;
+        if (nextChar == identityCharacter
+                && refPos < referenceSequence.length())
+        {
+          sb.append(referenceSequence.charAt(refPos));
+        }
+        else
+        {
+          sb.append(nextChar);
+        }
+      }
+      formatted = sb.toString();
+    }
+    return formatted;
+  }
+
+  /**
    * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
    * identifier. Else returns null.
    * 
@@ -738,6 +783,11 @@ public class MegaFile extends AlignFile
             || keyword.equalsIgnoreCase("MatchChar"))
     {
       setAlignmentProperty(PROP_IDENTITY, value);
+      if (value != null && value.length() > 0)
+      {
+        // an empty declaration leaves the identity character unset
+        this.identityCharacter = value.charAt(0);
+      }
       if (!".".equals(value))
       {
         System.err.println("Warning: " + token
diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java
index 948efa8..309ffee 100644
--- a/test/jalview/io/MegaFileTest.java
+++ b/test/jalview/io/MegaFileTest.java
@@ -491,4 +491,80 @@ public class MegaFileTest
 
     assertEquals("Roundtrip didn't match", expected, formatted);
   }
+
+  //@formatter:on
+
+  /**
+   * Test paste of interleaved mega format data where the identity character is
+   * used in sequences after the first
+   * 
+   * @throws IOException
+   */
+  @Test(groups = { "Functional" })
+  public void testParse_interleavedWithIdentity() throws IOException
+  {
+    //@formatter:off
+    MegaFile testee = new MegaFile("#MEGA\n"+ 
+    "!TITLE Interleaved sequence data;\n" +
+    "!Format Identical=.;\n\n" +
+    "#U455 ABCDEF\n" + 
+    "#CPZANT M..P.R\n\n" +
+    "#U455 KLMNOP\n" + 
+    "#CPZANT ..YZ..", AppletFormatAdapter.PASTE);
+    //@formatter:on
+    assertEquals("Title not as expected", "Interleaved sequence data",
+            testee.getAlignmentProperty(MegaFile.PROP_TITLE));
+    Vector<SequenceI> seqs = testee.getSeqs();
+    // should be 2 sequences
+    assertEquals("Expected two sequences", 2, seqs.size());
+    // check sequence names correct and order preserved
+    assertEquals("First sequence id wrong", "U455", seqs.get(0).getName());
+    assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1)
+            .getName());
+    // check sequence data
+    assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0)
+            .getSequenceAsString());
+    assertEquals("Second sequence data wrong", "MBCPERKLYZOP", seqs.get(1)
+            .getSequenceAsString());
+    assertTrue("File format is not flagged as interleaved",
+            testee.isInterleaved());
+  }
+
+  /**
+   * Test paste of noninterleaved format data including identity symbol
+   * 
+   * @throws IOException
+   */
+  @Test(groups = { "Functional" })
+  public void testParse_nonInterleavedWithIdentity() throws IOException
+  {
+    //@formatter:off
+    MegaFile testee = new MegaFile("#MEGA\n" + 
+    "!TITLE Noninterleaved sequence data;\n" + 
+    "!Format MatchChar=.;\n" +
+    "#U455 \n" + 
+    "ABCFEDHIJ\n" + 
+    "MNOPQR\n\n" + 
+    "#CPZANT \n" + 
+    "KL..O..XYZ\n" + 
+    "CG..C\n",
+    AppletFormatAdapter.PASTE);
+    //@formatter:on
+    assertEquals("Title not as expected", "Noninterleaved sequence data",
+            testee.getAlignmentProperty(MegaFile.PROP_TITLE));
+    Vector<SequenceI> seqs = testee.getSeqs();
+    // should be 2 sequences
+    assertEquals("Expected two sequences", 2, seqs.size());
+    // check sequence names correct and order preserved
+    assertEquals("First sequence id wrong", "U455", seqs.get(0).getName());
+    assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1)
+            .getName());
+    // check sequence data
+    assertEquals("First sequence data wrong", "ABCFEDHIJMNOPQR", seqs
+            .get(0).getSequenceAsString());
+    assertEquals("Second sequence data wrong", "KLCFODHXYZCGPQC",
+            seqs.get(1).getSequenceAsString());
+    assertFalse("File format is not flagged as noninterleaved",
+            testee.isInterleaved());
+  }
 }
-- 
1.7.10.2