From a04258ae9f3cbf96355d9638d3dbfacf5d97d2ac Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 9 Oct 2015 10:15:14 +0100 Subject: [PATCH] JAL-1499 output !Label lines for "MEGA Label" alignment annotation --- src/jalview/io/MegaFile.java | 189 ++++++++++++++++++++++++++++--------- test/jalview/io/MegaFileTest.java | 80 +++++++++++++--- 2 files changed, 211 insertions(+), 58 deletions(-) diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java index fcb7a93..69e7435 100644 --- a/src/jalview/io/MegaFile.java +++ b/src/jalview/io/MegaFile.java @@ -35,6 +35,7 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.Vector; /** * A parser for input or output of MEGA format files.
@@ -56,6 +57,8 @@ import java.util.Set; */ public class MegaFile extends AlignFile { + private static final String MEGA_ANNOTATION_LABEL = "MEGA Label"; + private static final char UNDERSCORE = '_'; private static final String WHITESPACE = "\\s+"; @@ -246,6 +249,7 @@ public class MegaFile extends AlignFile */ currentSequenceId = ""; + boolean annotationAdded = false; while (dataLine != null) { dataLine = dataLine.trim(); @@ -260,7 +264,7 @@ public class MegaFile extends AlignFile } else if (upperCased.startsWith(BANG + LABEL.toUpperCase())) { - parseLabel(dataLine); + annotationAdded |= parseLabel(dataLine); } else { @@ -288,7 +292,10 @@ public class MegaFile extends AlignFile deriveSequencesAndFeatures(); - deriveAnnotations(); + if (annotationAdded) + { + deriveAnnotations(); + } } /** @@ -301,7 +308,7 @@ public class MegaFile extends AlignFile { Annotation[] anns = labelAnnotations .toArray(new Annotation[labelAnnotations.size()]); - AlignmentAnnotation aa = new AlignmentAnnotation("MEGA", "Label", + AlignmentAnnotation aa = new AlignmentAnnotation(MEGA_ANNOTATION_LABEL, "", anns); this.annotations.add(aa); } @@ -313,9 +320,10 @@ public class MegaFile extends AlignFile * Labels are assembled into an AlignmentAnnotation object. 
* * @param dataLine + * @return true if any non-null annotation was created * @throws FileFormatException */ - protected void parseLabel(String dataLine) throws FileFormatException + protected boolean parseLabel(String dataLine) throws FileFormatException { // strip off leading !Label and following spaces dataLine = dataLine.substring(LABEL.length() + 1).trim(); @@ -331,6 +339,7 @@ public class MegaFile extends AlignFile System.err.println("Warning: '" + dataLine + "' should end with semi-colon"); } + boolean added = false; for (char c : labels.toCharArray()) { if (c == UNDERSCORE) @@ -341,21 +350,10 @@ public class MegaFile extends AlignFile { this.labelAnnotations.add(new Annotation(String.valueOf(c), "", ' ', 0f)); + added = true; } } - - /* - * sanity check - the number of labels added should exactly match the - * sequence length so far - */ - int sequenceLength = seqData.isEmpty() ? 0 : seqData.values() - .iterator().next().length(); - if (labelAnnotations.size() != sequenceLength) - { - System.err.println("Warning: file inconsistent - " - + labelAnnotations.size() + " labels for " + sequenceLength - + " positions after " + dataLine); - } + return added; } /** @@ -364,11 +362,19 @@ public class MegaFile extends AlignFile protected void endOfDataBlock() { this.firstDataBlockRead = true; - // TODO: - // (initialise and) populate arrays of sequence length so far (excluding - // gaps) - // On change or end of a denoted Gene or Domain, add sequence features for - // it + + /* + * append null annotations to keep the annotations the same length as the + * sequences (in case some blocks have !Label lines and some don't) + */ + + int sequenceLength = seqData.isEmpty() ? 
0 : seqData.values() + .iterator().next().length(); + int annotationsToAdd = sequenceLength - labelAnnotations.size(); + for (int i = 0; i < annotationsToAdd; i++) + { + labelAnnotations.add(null); + } } /** @@ -1419,6 +1425,7 @@ public class MegaFile extends AlignFile sb.append(newline); first = false; } + sb.append(printLabel(from, advancedBy, maxIdLength)); from += advancedBy; } @@ -1553,40 +1560,112 @@ public class MegaFile extends AlignFile */ StringBuilder sb = new StringBuilder(numLines * positionsPerLine); + for (SequenceI seq : s) + { + printSequence(sb, seq); + } + + return new String(sb); + } + + /** + * Append a formatted complete sequence to the string buffer + * + * @param sb + * @param seq + */ + protected void printSequence(StringBuilder sb, SequenceI seq) + { int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10; + // round down to output a whole number of spaced blocks int chunksPerLine = positionsPerLine / spaceEvery; - for (SequenceI seq : s) + + sb.append(newline); + sb.append(HASHSIGN + seq.getName()).append(newline); + int startPos = 0; + while (startPos < seq.getLength()) { - sb.append(newline); - sb.append(HASHSIGN + seq.getName()).append(newline); - int startPos = 0; - while (startPos < seq.getLength()) + /* + * print next line for this sequence + */ + boolean firstChunk = true; + int lastPos = startPos + positionsPerLine; // exclusive + for (int j = 0; j < chunksPerLine; j++) { - boolean firstChunk = true; - /* - * print next line for this sequence - */ - int lastPos = startPos + positionsPerLine; // exclusive - for (int j = 0; j < chunksPerLine; j++) + char[] subSequence = seq.getSequence(startPos, + Math.min(lastPos, startPos + spaceEvery)); + if (subSequence.length > 0) { - char[] subSequence = seq.getSequence(startPos, - Math.min(lastPos, startPos + positionsPerLine)); - if (subSequence.length > 0) + if (!firstChunk) { - if (!firstChunk) - { - sb.append(SPACE); - } - sb.append(subSequence); - firstChunk = false; + 
sb.append(SPACE); } - startPos += subSequence.length; + sb.append(subSequence); + firstChunk = false; } - sb.append(newline); + startPos += subSequence.length; } + // line end position (base 1) as a comment + sb.append(SPACE).append(COMMENT_START).append(startPos) + .append(COMMENT_END); + sb.append(newline); } + } - return new String(sb); + /** + * Returns a formatted string like
+ * !Label aa_b_ ab_b_;
+ * where an underscore represents no annotation and any other character is a MEGA + * label character
+ * Returns an empty string if there is no non-null annotation in the given + * alignment range + * + * @param fromPos + * start column of the alignment (base 0) + * @param positions + * number of positions to output + * @param labelWidth + * padded width of !Label statement to output + * @return + */ + protected String printLabel(int fromPos, int positions, int labelWidth) + { + int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10; + String none = ""; + if (annotations == null || annotations.isEmpty() + || !MEGA_ANNOTATION_LABEL.equals(annotations.get(0).label)) + { + return none; + } + + StringBuilder sb = new StringBuilder(positions + 20); + sb.append(String.format("%-" + labelWidth + "s ", BANG + LABEL)); + Annotation[] anns = annotations.get(0).annotations; + int blockCharCount = 0; + boolean annotationFound = false; + + for (int i = fromPos; i < fromPos + positions; i++) + { + String label = String.valueOf(UNDERSCORE); + if (i < anns.length && anns[i] != null) + { + label = anns[i].displayCharacter; + } + sb.append(label); + if (label.charAt(0) != UNDERSCORE) + { + annotationFound = true; + } + // add a space after each block except the last + if (++blockCharCount % spaceEvery == 0 + && (i < fromPos + positions - 1)) + { + sb.append(SPACE); + } + } + sb.append(SEMICOLON).append(newline); + + return annotationFound ? sb.toString() : none; } /** @@ -1643,13 +1722,31 @@ public class MegaFile extends AlignFile /** * Print the given alignment in MEGA format. If the alignment was created by * parsing a MEGA file, it should have properties set (e.g. Title) which can - * influence the output. + * surface in the output. 
*/ @Override public String print(AlignmentI al) { this.nucleotide = al.isNucleotide(); + /* + * if the alignment has a "MEGA" annotation, we'll output its values as + * !Label statements; MEGA only supports one of these + */ + AlignmentAnnotation[] anns = al.getAlignmentAnnotation(); + if (anns != null) + { + for (AlignmentAnnotation ann : anns) + { + if (MEGA_ANNOTATION_LABEL.equals(ann.label)) + { + this.annotations = new Vector(); + annotations.add(ann); + break; + } + } + } + String lineLength = (String) al.getProperty(PROP_LINELENGTH); this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer .parseInt(lineLength); diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java index f7d83c0..bdae11a 100644 --- a/test/jalview/io/MegaFileTest.java +++ b/test/jalview/io/MegaFileTest.java @@ -279,9 +279,9 @@ public class MegaFileTest System.out.println(printed); // normally output should match input // we cheated here with a number of short input lines - String expected = "#MEGA\n\n" - + "#U455\n" + "ABCFEDHIJM\nNOPQR\n\n" - + "#CPZANT\n" + "KLMNOPWXYZ\nCGATC\n"; + String expected = "#MEGA\n\n" + "#U455\n" + + "ABCFEDHIJM [10]\nNOPQR [15]\n\n" + "#CPZANT\n" + + "KLMNOPWXYZ [10]\nCGATC [15]\n"; assertEquals("Print format wrong", expected, printed); } @@ -304,7 +304,6 @@ public class MegaFileTest String printed = testee.print(); System.out.println(printed); //@formatter:off - //0123456789klmnopqrstABCDEFGHIJ9876543210abcdefghij String expected = "#MEGA\n\n" + "#U455 0123456789 klmnopqrst [20]\n" + // first 20 @@ -333,10 +332,14 @@ public class MegaFileTest assertEquals(30, testee.getPositionsPerLine()); testee.setPositionsPerLine(25); String printed = testee.print(); - // 60 character sequence should be output as 50 on first line then 10 more + + /* + * 25 positions per line is rounded down to 20 (two blocks of 10) + */ String expected = "#MEGA\n\n" + "#SIXTY\n" - + "0123456789klmnopqrstABCDE\n" + "FGHIJ9876543210abcdefghij\n" - 
+ "9993332221\n"; + + "0123456789 klmnopqrst [20]\n" + + "ABCDEFGHIJ 9876543210 [40]\n" + + "abcdefghij 9993332221 [60]\n"; assertEquals("Print format wrong", expected, printed); } @@ -718,24 +721,31 @@ public class MegaFileTest "TITLE: Interleaved sequence data\n\n" + "#U455 ABC DEF\n" + "#CPZANT MNO PQR\n" + - "!Label +-_ 23_\n" + + "!Label +-_ 23_\n\n" + + // a row with no labels = null annotation + "#U455 abc def\n" + + "#CPZANT mno pqr\n\n" + "#U455 KLM NOP\n" + "#CPZANT WXY ZGC\n" + "!label __3 +X_\n", AppletFormatAdapter.PASTE); //@formatter:on Vector seqs = testee.getSeqs(); assertEquals("Expected two sequences", 2, seqs.size()); - assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0) + assertEquals("First sequence data wrong", "ABCDEFabcdefKLMNOP", seqs + .get(0) .getSequenceAsString()); - assertEquals("Second sequence data wrong", "MNOPQRWXYZGC", seqs.get(1) + assertEquals("Second sequence data wrong", "MNOPQRmnopqrWXYZGC", seqs + .get(1) .getSequenceAsString()); // check AlignmentAnnotation added with expected values assertEquals(1, testee.annotations.size()); AlignmentAnnotation aa = testee.annotations.get(0); assertNull(aa.sequenceRef); - assertEquals(12, aa.annotations.length); - assertEquals("+, -, , 2, 3, , , , 3, +, X, , ", aa.toString()); + assertEquals("MEGA Label", aa.label); + assertEquals(18, aa.annotations.length); + assertEquals("+, -, , 2, 3, , , , , , , , , , 3, +, X, , ", + aa.toString()); } //@formatter:on @@ -809,4 +819,50 @@ public class MegaFileTest .getSequenceAsString()); assertEquals('-', al.getGapCharacter()); } + + /** + * Test reading a MEGA file to an alignment then writing it out in MEGA + * format. Includes !Label statements which should be converted to + * AlignmentAnnotation and back again. 
+ * + * @throws IOException + */ + @Test(groups = "Functional") + public void testRoundTrip_withLabels() throws IOException + { + AppletFormatAdapter fa = new AppletFormatAdapter(); + + //@formatter:off + String data = "#MEGA\n" + + "#U455 C-- GTA\n" + + "#CPZANT ATC -G-\n" + + "!Label F__E_H\n\n" + + "#U455 CGA --T\n" + + "#CPZANT CA- -GC\n" + + "!Label FFH__E\n"; + AlignmentI al = fa.readFile(data, + AppletFormatAdapter.PASTE, "MEGA"); + AlignmentAnnotation aa = al.getAlignmentAnnotation()[0]; + assertEquals("MEGA Label", aa.label); + assertEquals("F, , , E, , H, F, F, H, , , E, ", + aa.toString()); + + MegaFile output = new MegaFile(); + String formatted = output.print(al); + String expected = + "#MEGA\n" + + "!Format\n" + + " DataType=Nucleotide CodeTable=Standard\n" + + " NSeqs=2 NSites=12\n" + + " Indel=-;\n\n" + + "#U455 C-- GTA [6]\n" + + "#CPZANT ATC -G- [6]\n" + + "!Label F__ E_H;\n\n" + + "#U455 CGA --T [12]\n" + + "#CPZANT CA- -GC [12]\n" + + "!Label FFH __E;\n"; + //@formatter:on + assertEquals("Roundtrip didn't match", expected, + formatted); + } } -- 1.7.10.2