// gap character may be explicitly declared, if not we infer it
private Character gapCharacter;
+ // identity (match) character, taken from the first character of the value
+ // declared in the file's format statement; stays 0 if none was declared
+ private char identityCharacter = 0;
+
// this can be True, False or null (meaning not asserted in file)
private Boolean nucleotide;
StringBuilder sb = getSequenceDataBuffer(seqData, currentId);
- /*
- * Add the current line of data to the sequence.
- */
+ dataLine = reformatSequenceData(dataLine, sb.length(), seqData);
sb.append(dataLine);
setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
*/
if (data != null && data.length() > 0)
{
- if (data.indexOf(SPACE) != -1)
- {
- data = data.replace(SPACE, "");
- }
+ data = reformatSequenceData(data, sb.length(), seqData);
sb.append(data);
setPositionsPerLine(Math.max(positionsPerLine, data.length()));
assertInterleaved(true, dataLine);
}
/**
+ * Reformats a line of input sequence data by removing any internal formatting
+ * spaces, and converting any 'identity' characters to the residue at the
+ * corresponding position in the first (reference) sequence.
+ * <p>
+ * An identity character is left unchanged when there is no reference residue
+ * at its position - for example when it occurs in the first sequence itself,
+ * or beyond the end of the reference data read so far - rather than failing
+ * with an index out of bounds error.
+ *
+ * @param data
+ *          the sequence data to reformat
+ * @param startPos
+ *          the sequence position (base 0) of the start of the data
+ * @param seqData
+ *          the sequence data buffers read so far; the first value returned
+ *          by iteration is used as the reference sequence (this presumes the
+ *          map preserves insertion order - TODO confirm)
+ * @return the data with spaces removed and identity characters substituted
+ */
+ protected String reformatSequenceData(String data, int startPos, Map<String, StringBuilder> seqData)
+ {
+   String formatted = data.replace(SPACE, "");
+   if (formatted.indexOf(identityCharacter) == -1 || seqData.isEmpty())
+   {
+     // nothing to substitute, or no reference sequence to substitute from
+     return formatted;
+   }
+
+   /*
+    * sequence contains '.' or other identity symbol; replace these with the
+    * same position from the first (reference) sequence
+    */
+   StringBuilder referenceSequence = seqData.values().iterator().next();
+   int referenceLength = referenceSequence.length();
+   StringBuilder sb = new StringBuilder(formatted.length());
+   for (int i = 0; i < formatted.length(); i++)
+   {
+     char nextChar = formatted.charAt(i);
+     int referencePos = startPos + i;
+     if (nextChar == identityCharacter && referencePos < referenceLength)
+     {
+       sb.append(referenceSequence.charAt(referencePos));
+     }
+     else
+     {
+       // ordinary residue, or an identity symbol with no reference to copy
+       sb.append(nextChar);
+     }
+   }
+   return sb.toString();
+ }
+
+ /**
* If the line begins with (e.g.) "#abcde " then returns "abcde" as the
* identifier. Else returns null.
*
|| keyword.equalsIgnoreCase("MatchChar"))
{
setAlignmentProperty(PROP_IDENTITY, value);
+ this.identityCharacter = value.charAt(0);
if (!".".equals(value))
{
System.err.println("Warning: " + token
assertEquals("Roundtrip didn't match", expected,
formatted);
}
+
+ //@formatter:on
+
+ /**
+  * Checks parsing of pasted interleaved mega format data in which sequences
+  * after the first use the declared identity character in place of residues
+  *
+  * @throws IOException
+  */
+ @Test(groups = { "Functional" })
+ public void testParse_interleavedWithIdentity() throws IOException
+ {
+   //@formatter:off
+   String megaData = "#MEGA\n"+
+           "!TITLE Interleaved sequence data;\n" +
+           "!Format Identical=.;\n\n" +
+           "#U455 ABCDEF\n" +
+           "#CPZANT M..P.R\n\n" +
+           "#U455 KLMNOP\n" +
+           "#CPZANT ..YZ..";
+   //@formatter:on
+   MegaFile parsed = new MegaFile(megaData, AppletFormatAdapter.PASTE);
+
+   assertEquals("Title not as expected", "Interleaved sequence data",
+           parsed.getAlignmentProperty(MegaFile.PROP_TITLE));
+
+   // both sequences should have been read
+   Vector<SequenceI> sequences = parsed.getSeqs();
+   assertEquals("Expected two sequences", 2, sequences.size());
+
+   SequenceI first = sequences.get(0);
+   SequenceI second = sequences.get(1);
+
+   // names are correct and in file order
+   assertEquals("First sequence id wrong", "U455", first.getName());
+   assertEquals("Second sequence id wrong", "CPZANT", second.getName());
+
+   // identity symbols in the second sequence are filled in from the first
+   assertEquals("First sequence data wrong", "ABCDEFKLMNOP",
+           first.getSequenceAsString());
+   assertEquals("Second sequence data wrong", "MBCPERKLYZOP",
+           second.getSequenceAsString());
+
+   assertTrue("File format is not flagged as interleaved",
+           parsed.isInterleaved());
+ }
+
+ /**
+  * Checks parsing of pasted noninterleaved mega format data in which the
+  * second sequence uses the declared identity symbol
+  *
+  * @throws IOException
+  */
+ @Test(groups = { "Functional" })
+ public void testParse_nonInterleavedWithIdentity() throws IOException
+ {
+   //@formatter:off
+   String megaData = "#MEGA\n"
+           + "!TITLE Noninterleaved sequence data;\n"
+           + "!Format MatchChar=.;\n"
+           + "#U455 \n"
+           + "ABCFEDHIJ\n"
+           + "MNOPQR\n\n"
+           + "#CPZANT \n"
+           + "KL..O..XYZ\n"
+           + "CG..C\n";
+   //@formatter:on
+   MegaFile parsed = new MegaFile(megaData, AppletFormatAdapter.PASTE);
+
+   assertEquals("Title not as expected", "Noninterleaved sequence data",
+           parsed.getAlignmentProperty(MegaFile.PROP_TITLE));
+
+   // both sequences should have been read
+   Vector<SequenceI> sequences = parsed.getSeqs();
+   assertEquals("Expected two sequences", 2, sequences.size());
+
+   SequenceI first = sequences.get(0);
+   SequenceI second = sequences.get(1);
+
+   // names are correct and in file order
+   assertEquals("First sequence id wrong", "U455", first.getName());
+   assertEquals("Second sequence id wrong", "CPZANT", second.getName());
+
+   // identity symbols in the second sequence are filled in from the first
+   assertEquals("First sequence data wrong", "ABCFEDHIJMNOPQR",
+           first.getSequenceAsString());
+   assertEquals("Second sequence data wrong", "KLCFODHXYZCGPQC",
+           second.getSequenceAsString());
+
+   assertFalse("File format is not flagged as noninterleaved",
+           parsed.isInterleaved());
+ }
}