import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
+import java.util.Vector;
/**
* A parser for input or output of MEGA format files. <br>
*/
public class MegaFile extends AlignFile
{
+ private static final String MEGA_ANNOTATION_LABEL = "MEGA Label";
+
private static final char UNDERSCORE = '_';
private static final String WHITESPACE = "\\s+";
*/
currentSequenceId = "";
+ boolean annotationAdded = false;
while (dataLine != null)
{
dataLine = dataLine.trim();
}
else if (upperCased.startsWith(BANG + LABEL.toUpperCase()))
{
- parseLabel(dataLine);
+ annotationAdded |= parseLabel(dataLine);
}
else
{
deriveSequencesAndFeatures();
- deriveAnnotations();
+ if (annotationAdded)
+ {
+ deriveAnnotations();
+ }
}
/**
{
Annotation[] anns = labelAnnotations
.toArray(new Annotation[labelAnnotations.size()]);
- AlignmentAnnotation aa = new AlignmentAnnotation("MEGA", "Label",
+ AlignmentAnnotation aa = new AlignmentAnnotation(MEGA_ANNOTATION_LABEL, "",
anns);
this.annotations.add(aa);
}
* Labels are assembled into an AlignmentAnnotation object.
*
* @param dataLine
+ * @return true if any non-null annotation was created
* @throws FileFormatException
*/
- protected void parseLabel(String dataLine) throws FileFormatException
+ protected boolean parseLabel(String dataLine) throws FileFormatException
{
// strip off leading !Label and following spaces
dataLine = dataLine.substring(LABEL.length() + 1).trim();
System.err.println("Warning: '" + dataLine
+ "' should end with semi-colon");
}
+ boolean added = false;
for (char c : labels.toCharArray())
{
if (c == UNDERSCORE)
{
this.labelAnnotations.add(new Annotation(String.valueOf(c), "",
' ', 0f));
+ added = true;
}
}
-
- /*
- * sanity check - the number of labels added should exactly match the
- * sequence length so far
- */
- int sequenceLength = seqData.isEmpty() ? 0 : seqData.values()
- .iterator().next().length();
- if (labelAnnotations.size() != sequenceLength)
- {
- System.err.println("Warning: file inconsistent - "
- + labelAnnotations.size() + " labels for " + sequenceLength
- + " positions after " + dataLine);
- }
+ return added;
}
/**
protected void endOfDataBlock()
{
this.firstDataBlockRead = true;
- // TODO:
- // (initialise and) populate arrays of sequence length so far (excluding
- // gaps)
- // On change or end of a denoted Gene or Domain, add sequence features for
- // it
+
+ /*
+ * A block without a !Label line contributes no annotations, so pad the
+ * list with nulls until it holds one entry per sequence position read so
+ * far. The length of the first stored sequence is taken as the alignment
+ * width.
+ */
+ int sequenceLength = 0;
+ if (!seqData.isEmpty())
+ {
+ sequenceLength = seqData.values().iterator().next().length();
+ }
+ while (labelAnnotations.size() < sequenceLength)
+ {
+ labelAnnotations.add(null);
+ }
}
/**
sb.append(newline);
first = false;
}
+ sb.append(printLabel(from, advancedBy, maxIdLength));
from += advancedBy;
}
*/
StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
+ for (SequenceI seq : s)
+ {
+ printSequence(sb, seq);
+ }
+
+ return new String(sb);
+ }
+
+ /**
+ * Appends one complete sequence to the string buffer: a blank line, a line
+ * holding the hash sign and the sequence name, then the residues on as many
+ * lines as are needed. Each line is made of blocks of residues (3 wide for
+ * nucleotide, otherwise 10 wide) separated by a space, and ends with the
+ * position reached so far, written as a comment.
+ * <p>
+ * The line length is rounded down to a whole number of blocks, but at least
+ * one block is always written per line. Without that floor, a line length
+ * smaller than the block width gives zero blocks per line, so no residues
+ * are consumed and the output loop never terminates.
+ *
+ * @param sb
+ * the buffer to append to
+ * @param seq
+ * the sequence to output
+ */
+ protected void printSequence(StringBuilder sb, SequenceI seq)
+ {
int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
- int chunksPerLine = positionsPerLine / spaceEvery;
- for (SequenceI seq : s)
+ // a line length below 1 could never consume any residues
+ int lineWidth = Math.max(1, positionsPerLine);
+ // round down to output a whole number of spaced blocks, but never fewer
+ // than one, so that each pass of the loop below can advance startPos
+ int chunksPerLine = Math.max(1, lineWidth / spaceEvery);
+
+ sb.append(newline);
+ sb.append(HASHSIGN + seq.getName()).append(newline);
+ int startPos = 0;
+ while (startPos < seq.getLength())
{
- sb.append(newline);
- sb.append(HASHSIGN + seq.getName()).append(newline);
- int startPos = 0;
- while (startPos < seq.getLength())
+ /*
+ * print next line for this sequence
+ */
+ boolean firstChunk = true;
+ int lastPos = startPos + lineWidth; // exclusive
+ for (int j = 0; j < chunksPerLine; j++)
{
- boolean firstChunk = true;
- /*
- * print next line for this sequence
- */
- int lastPos = startPos + positionsPerLine; // exclusive
- for (int j = 0; j < chunksPerLine; j++)
+ // a chunk is one block wide, cut short at the end of the line
+ char[] subSequence = seq.getSequence(startPos,
+ Math.min(lastPos, startPos + spaceEvery));
+ if (subSequence.length > 0)
{
- char[] subSequence = seq.getSequence(startPos,
- Math.min(lastPos, startPos + positionsPerLine));
- if (subSequence.length > 0)
+ if (!firstChunk)
{
- if (!firstChunk)
- {
- sb.append(SPACE);
- }
- sb.append(subSequence);
- firstChunk = false;
+ sb.append(SPACE);
}
- startPos += subSequence.length;
+ sb.append(subSequence);
+ firstChunk = false;
}
- sb.append(newline);
+ startPos += subSequence.length;
}
+ // line end position (base 1) as a comment
+ sb.append(SPACE).append(COMMENT_START).append(startPos)
+ .append(COMMENT_END);
+ sb.append(newline);
}
+ }
- return new String(sb);
+ /**
+ * Returns a formatted string like <br>
+ * !Label aa_b_ ab_b_; <br>
+ * where underscore represents no annotation, any other character a MEGA label
+ * character. The statement is terminated by a semi-colon and a newline, and
+ * the label characters are spaced in blocks of 3 (nucleotide) or 10
+ * (otherwise). <br>
+ * Returns an empty string if the first held annotation is not the MEGA label
+ * annotation, or if there is no label character other than underscore in the
+ * given alignment range. An annotation whose display character is null or
+ * empty is output as underscore.
+ *
+ * @param fromPos
+ * start column of the alignment (base 0)
+ * @param positions
+ * number of positions to output
+ * @param labelWidth
+ * padded width of !Label statement to output
+ * @return the formatted !Label statement, or an empty string if there is
+ * nothing to output for the range
+ */
+ protected String printLabel(int fromPos, int positions, int labelWidth)
+ {
+ int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
+ String none = "";
+ if (annotations == null || annotations.isEmpty()
+ || !MEGA_ANNOTATION_LABEL.equals(annotations.get(0).label))
+ {
+ return none;
+ }
+
+ Annotation[] anns = annotations.get(0).annotations;
+ if (anns == null)
+ {
+ // nothing to output; avoids a NullPointerException in the loop below
+ return none;
+ }
+
+ StringBuilder sb = new StringBuilder(positions + 20);
+ sb.append(String.format("%-" + labelWidth + "s ", BANG + LABEL));
+ int blockCharCount = 0;
+ boolean annotationFound = false;
+
+ for (int i = fromPos; i < fromPos + positions; i++)
+ {
+ String label = String.valueOf(UNDERSCORE);
+ if (i < anns.length && anns[i] != null)
+ {
+ String displayed = anns[i].displayCharacter;
+ // a null value would print as "null" and an empty one would make
+ // charAt(0) below throw, so both are output as 'no label'
+ if (displayed != null && displayed.length() > 0)
+ {
+ label = displayed;
+ }
+ }
+ sb.append(label);
+ if (label.charAt(0) != UNDERSCORE)
+ {
+ annotationFound = true;
+ }
+ // add a space after each block except the last
+ if (++blockCharCount % spaceEvery == 0
+ && (i < fromPos + positions - 1))
+ {
+ sb.append(SPACE);
+ }
+ }
+ sb.append(SEMICOLON).append(newline);
+
+ // a row consisting only of underscores is not worth a !Label statement
+ return annotationFound ? sb.toString() : none;
}
/**
/**
* Print the given alignment in MEGA format. If the alignment was created by
* parsing a MEGA file, it should have properties set (e.g. Title) which can
- * influence the output.
+ * surface in the output.
*/
@Override
public String print(AlignmentI al)
{
this.nucleotide = al.isNucleotide();
+ /*
+ * if the alignment has an annotation labelled "MEGA Label", we'll output its
+ * values as !Label statements; only the first such annotation is used, as
+ * MEGA only supports one of these
+ */
+ AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
+ if (anns != null)
+ {
+ for (AlignmentAnnotation ann : anns)
+ {
+ if (MEGA_ANNOTATION_LABEL.equals(ann.label))
+ {
+ this.annotations = new Vector<AlignmentAnnotation>();
+ annotations.add(ann);
+ break;
+ }
+ }
+ }
+
String lineLength = (String) al.getProperty(PROP_LINELENGTH);
this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
.parseInt(lineLength);
System.out.println(printed);
// normally output should match input
// we cheated here with a number of short input lines
- String expected = "#MEGA\n\n"
- + "#U455\n" + "ABCFEDHIJM\nNOPQR\n\n"
- + "#CPZANT\n" + "KLMNOPWXYZ\nCGATC\n";
+ String expected = "#MEGA\n\n" + "#U455\n"
+ + "ABCFEDHIJM [10]\nNOPQR [15]\n\n" + "#CPZANT\n"
+ + "KLMNOPWXYZ [10]\nCGATC [15]\n";
assertEquals("Print format wrong", expected, printed);
}
String printed = testee.print();
System.out.println(printed);
//@formatter:off
- //0123456789klmnopqrstABCDEFGHIJ9876543210abcdefghij
String expected =
"#MEGA\n\n" +
"#U455 0123456789 klmnopqrst [20]\n" + // first 20
assertEquals(30, testee.getPositionsPerLine());
testee.setPositionsPerLine(25);
String printed = testee.print();
- // 60 character sequence should be output as 50 on first line then 10 more
+
+ /*
+ * 25 positions per line is rounded down to 20 (two blocks of 10)
+ */
String expected = "#MEGA\n\n" + "#SIXTY\n"
- + "0123456789klmnopqrstABCDE\n" + "FGHIJ9876543210abcdefghij\n"
- + "9993332221\n";
+ + "0123456789 klmnopqrst [20]\n"
+ + "ABCDEFGHIJ 9876543210 [40]\n"
+ + "abcdefghij 9993332221 [60]\n";
assertEquals("Print format wrong", expected, printed);
}
"TITLE: Interleaved sequence data\n\n" +
"#U455 ABC DEF\n" +
"#CPZANT MNO PQR\n" +
- "!Label +-_ 23_\n" +
+ "!Label +-_ 23_\n\n" +
+ // a row with no labels = null annotation
+ "#U455 abc def\n" +
+ "#CPZANT mno pqr\n\n" +
"#U455 KLM NOP\n" +
"#CPZANT WXY ZGC\n" +
"!label __3 +X_\n", AppletFormatAdapter.PASTE);
//@formatter:on
Vector<SequenceI> seqs = testee.getSeqs();
assertEquals("Expected two sequences", 2, seqs.size());
- assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0)
+ assertEquals("First sequence data wrong", "ABCDEFabcdefKLMNOP", seqs
+ .get(0)
.getSequenceAsString());
- assertEquals("Second sequence data wrong", "MNOPQRWXYZGC", seqs.get(1)
+ assertEquals("Second sequence data wrong", "MNOPQRmnopqrWXYZGC", seqs
+ .get(1)
.getSequenceAsString());
// check AlignmentAnnotation added with expected values
assertEquals(1, testee.annotations.size());
AlignmentAnnotation aa = testee.annotations.get(0);
assertNull(aa.sequenceRef);
- assertEquals(12, aa.annotations.length);
- assertEquals("+, -, , 2, 3, , , , 3, +, X, , ", aa.toString());
+ assertEquals("MEGA Label", aa.label);
+ assertEquals(18, aa.annotations.length);
+ assertEquals("+, -, , 2, 3, , , , , , , , , , 3, +, X, , ",
+ aa.toString());
}
//@formatter:on
.getSequenceAsString());
assertEquals('-', al.getGapCharacter());
}
+
+ /**
+ * Round trip check: parse pasted MEGA data containing !Label statements into
+ * an alignment, confirm the labels became a "MEGA Label"
+ * AlignmentAnnotation, then print the alignment in MEGA format and compare
+ * the text with the expected output.
+ *
+ * @throws IOException
+ */
+ @Test(groups = "Functional")
+ public void testRoundTrip_withLabels() throws IOException
+ {
+ //@formatter:off
+ String megaInput = "#MEGA\n"
+ + "#U455 C-- GTA\n"
+ + "#CPZANT ATC -G-\n"
+ + "!Label F__E_H\n\n"
+ + "#U455 CGA --T\n"
+ + "#CPZANT CA- -GC\n"
+ + "!Label FFH__E\n";
+ String expected =
+ "#MEGA\n" +
+ "!Format\n" +
+ " DataType=Nucleotide CodeTable=Standard\n" +
+ " NSeqs=2 NSites=12\n" +
+ " Indel=-;\n\n" +
+ "#U455 C-- GTA [6]\n" +
+ "#CPZANT ATC -G- [6]\n" +
+ "!Label F__ E_H;\n\n" +
+ "#U455 CGA --T [12]\n" +
+ "#CPZANT CA- -GC [12]\n" +
+ "!Label FFH __E;\n";
+ //@formatter:on
+
+ // read: the two !Label rows should become a single annotation row
+ AlignmentI al = new AppletFormatAdapter().readFile(megaInput,
+ AppletFormatAdapter.PASTE, "MEGA");
+ AlignmentAnnotation labelAnnotation = al.getAlignmentAnnotation()[0];
+ assertEquals("MEGA Label", labelAnnotation.label);
+ assertEquals("F, , , E, , H, F, F, H, , , E, ",
+ labelAnnotation.toString());
+
+ // write: a fresh MegaFile should reproduce the labels from the annotation
+ String roundTripped = new MegaFile().print(al);
+ assertEquals("Roundtrip didn't match", expected, roundTripped);
+ }
}