*/
package jalview.io;
+import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Annotation;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
*/
public class MegaFile extends AlignFile
{
+ private static final char UNDERSCORE = '_';
+
private static final String WHITESPACE = "\\s+";
private static final int DEFAULT_LINE_LENGTH = 60;
private static final String SPACE = " ";
+ private static final String TAB = "\t";
+
/*
* number of sequence positions output per line
*/
// map of SequenceFeature's by sequence id
Map<String, List<SequenceFeature>> sequenceFeatures;
+ // each !Label line character becomes an Annotation (except underscores)
+ List<Annotation> labelAnnotations;
+
public MegaFile()
{
}
geneStart = new HashMap<String, Integer>();
domainStart = new HashMap<String, Integer>();
residuesRead = new HashMap<String, Integer>();
+ labelAnnotations = new ArrayList<Annotation>();
/*
* Read and process MEGA and Title/Format/Description headers if present.
dataLine = dataLine.trim();
if (dataLine.length() > 0)
{
- if (dataLine.startsWith(BANG + GENE)
- || dataLine.startsWith(BANG + DOMAIN))
+ dataLine = dataLine.replace(TAB, SPACE);
+ String upperCased = dataLine.toUpperCase();
+ if (upperCased.startsWith(BANG + GENE.toUpperCase())
+ || upperCased.startsWith(BANG + DOMAIN.toUpperCase()))
{
parseGeneOrDomain(dataLine);
}
- else if (dataLine.startsWith(BANG + LABEL))
+ else if (upperCased.startsWith(BANG + LABEL.toUpperCase()))
{
parseLabel(dataLine);
}
// remember the (longest) line length read in, so we can output the same
setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
- deriveSequences();
+ deriveSequencesAndFeatures();
+
+ deriveAnnotations();
}
/**
- * Parse a !Label
+ * If we parsed !Label statements into a list of Annotation objects, create an
+ * AlignmentAnnotation
+ */
+ protected void deriveAnnotations()
+ {
+ if (this.labelAnnotations.size() > 0)
+ {
+ Annotation[] anns = labelAnnotations
+ .toArray(new Annotation[labelAnnotations.size()]);
+ AlignmentAnnotation aa = new AlignmentAnnotation("MEGA", "Label",
+ anns);
+ this.annotations.add(aa);
+ }
+ }
+
+ /**
+ * Parse a !Label line. This contains a single character per position (column)
+ * of the alignment block above. An underscore character represents no label.
+ * Each label is appended to the list of label annotations (null for an
+ * underscore), from which an AlignmentAnnotation object is later assembled.
*
* @param dataLine
+ * @throws FileFormatException
*/
- protected void parseLabel(String dataLine)
+ protected void parseLabel(String dataLine) throws FileFormatException
{
- // TODO Auto-generated method stub
+ // strip off leading !Label (the + 1 presumably skips a one-character '!' prefix) and surrounding spaces
+ dataLine = dataLine.substring(LABEL.length() + 1).trim();
+
+ // remove internal spacing (leading and trailing whitespace was trimmed above)
+ String labels = dataLine.replace(SPACE, "");
+ if (labels.endsWith(SEMICOLON))
+ {
+ labels = labels.substring(0, labels.length() - 1);
+ }
+ else
+ {
+ System.err.println("Warning: '" + dataLine
+ + "' should end with semi-colon");
+ }
+ for (char c : labels.toCharArray())
+ {
+ if (c == UNDERSCORE)
+ {
+ this.labelAnnotations.add(null);
+ }
+ else
+ {
+ this.labelAnnotations.add(new Annotation(String.valueOf(c), "",
+ ' ', 0f));
+ }
+ }
+ /*
+ * sanity check - the total number of labels read so far should match the
+ * current length of the first sequence in seqData; a mismatch is only warned
+ */
+ int sequenceLength = seqData.isEmpty() ? 0 : seqData.values()
+ .iterator().next().length();
+ if (labelAnnotations.size() != sequenceLength)
+ {
+ System.err.println("Warning: file inconsistent - "
+ + labelAnnotations.size() + " labels for " + sequenceLength
+ + " positions after " + dataLine);
+ }
}
/**
* the order of processing below ensures that we correctly capture where a
* domain is in the context of an enclosing gene
*/
- processDomainEnd(domain, property);
+ processDomainEnd(domain, gene, property);
processGeneEnd(gene);
/**
* If we have been processing a domain, and it is not being continued, then
- * make a sequence feature for the domain just ended
+ * make a sequence feature for the domain just ended. Criteria for the domain
+ * not being continued are either an explicit new domain or gene name, or a
+ * 'Property=domainend' statement
*
* @param domain
+ * @param gene
* @param property
* @return true if a feature is created, else false
*/
- protected boolean processDomainEnd(String domain, String property)
+ protected boolean processDomainEnd(String domain, String gene,
+ String property)
{
+ boolean newGene = (gene != null && !gene.equals(currentGene));
+
String verboseDomain = makeVerboseDomainName(domain, property);
+
if (this.currentDomain != null)
{
+ boolean newDomain = !this.currentDomain.equals(verboseDomain);
boolean domainEnded = "domainend".equalsIgnoreCase(property);
- if (!this.currentDomain.equals(verboseDomain) || domainEnded)
+ if (newDomain || newGene || domainEnded)
{
createFeature(DOMAIN, currentDomain, domainStart);
+ currentDomain = null;
return true;
}
}
if (this.currentGene != null && !this.currentGene.equals(gene))
{
createFeature(GENE, currentGene, geneStart);
+ currentGene = null;
created = true;
}
/**
* Convert the parsed sequence strings to objects and store them in the model.
*/
- protected void deriveSequences()
+ protected void deriveSequencesAndFeatures()
{
Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
* @throws IOException
*/
protected void parseNoninterleavedDataLine(String dataLine)
- throws IOException
+ throws FileFormatException
{
if (currentSequenceId == null)
{
/*
* Oops. Data but no sequence id context.
*/
- throw new IOException("No sequence id context at: " + dataLine);
+ throw new FileFormatException("No sequence id context at: "
+ + dataLine);
}
assertInterleaved(false, dataLine);
import static org.testng.AssertJUnit.assertTrue;
import static org.testng.AssertJUnit.fail;
+import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
* @throws IOException
*/
@Test(groups = { "Functional" })
- public void testParse_interleavedWithIdentity() throws IOException
+ public void testParse_interleavedWithIdentityAndTabs() throws IOException
{
//@formatter:off
+ // uses tab instead of space separators to check robustness
MegaFile testee = new MegaFile("#MEGA\n"+
- "!TITLE Interleaved sequence data;\n" +
- "!Format Identical=.;\n\n" +
- "#U455 ABCDEF\n" +
- "#CPZANT M..P.R\n\n" +
- "#U455 KLMNOP\n" +
- "#CPZANT ..YZ..", AppletFormatAdapter.PASTE);
+ "!TITLE\tInterleaved sequence data;\n" +
+ "!Format\tIdentical=.;\n\n" +
+ "#U455\tABCDEF\n" +
+ "#CPZANT\tM..P.R\n\n" +
+ "#U455\t\tKLMNOP\n" +
+ "#CPZANT\t..YZ..", AppletFormatAdapter.PASTE);
//@formatter:on
assertEquals("Title not as expected", "Interleaved sequence data",
testee.getAlignmentProperty(MegaFile.PROP_TITLE));
"!Domain=Exon1 Gene=Adh Property=Coding CodonStart=1;\n" +
"#U455 GGGGGG\n" +
"#CPZANT AAAAAA\n\n" +
- "!Domain=Intron1 Property=Intron Gene=Adh;\n" +
+ "!domain=Intron1 Property=Intron Gene=Adh;\n" +
"#U455 tttttt\n" +
"#CPZANT cccccc\n\n" +
"!Domain=Exon2 Gene=Adh Property=Exon CodonStart=1;\n" +
assertEquals(begin, sf.begin);
assertEquals(end, sf.end);
}
+
+ //@formatter:on
+
+ /**
+ * Test parse of data including !Label statements. An underscore means no
+ * label, other characters are treated as alignment annotation.
+ *
+ * @throws IOException
+ */
+ @Test(groups = { "Functional" })
+ public void testParse_withLabels() throws IOException
+ {
+ //@formatter:off
+ MegaFile testee = new MegaFile("#MEGA\n"+
+ "TITLE: Interleaved sequence data\n\n" +
+ "#U455 ABC DEF\n" +
+ "#CPZANT MNO PQR\n" +
+ "!Label +-_ 23_\n" +
+ "#U455 KLM NOP\n" +
+ "#CPZANT WXY ZGC\n" +
+ "!label __3 +X_\n", AppletFormatAdapter.PASTE);
+ //@formatter:on
+ Vector<SequenceI> seqs = testee.getSeqs();
+ assertEquals("Expected two sequences", 2, seqs.size());
+ assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0)
+ .getSequenceAsString());
+ assertEquals("Second sequence data wrong", "MNOPQRWXYZGC", seqs.get(1)
+ .getSequenceAsString());
+
+ // check one AlignmentAnnotation (no sequence reference) was added, with an entry per column and underscores left empty
+ assertEquals(1, testee.annotations.size());
+ AlignmentAnnotation aa = testee.annotations.get(0);
+ assertNull(aa.sequenceRef);
+ assertEquals(12, aa.annotations.length);
+ assertEquals("+, -, , 2, 3, , , , 3, +, X, , ", aa.toString());
+ }
+
+ //@formatter:on
+
+ /**
+ * Test case where a domain is implicitly terminated by starting a new gene
+ *
+ * @throws IOException
+ */
+ @Test(groups = { "Functional" })
+ public void testParse_changeOfGeneEndsDomain() throws IOException
+ {
+ //@formatter:off
+ // the second block starts a new gene without naming a domain, which should end domain Exon1
+ MegaFile testee = new MegaFile("#MEGA\n"+
+ "!TITLE Interleaved sequence data;\n" +
+ "!Format Identical=.;\n\n" +
+ "!Gene=gene1 Domain=Exon1 Property=Coding;\n" +
+ "#U455 ABCDEF\n" +
+ "#CPZANT M..P.R\n\n" +
+ "!Gene=gene2;\n" +
+ "#U455 KLMNOP\n" +
+ "#CPZANT ..YZ..", AppletFormatAdapter.PASTE);
+ //@formatter:on
+ Vector<SequenceI> seqs = testee.getSeqs();
+ assertEquals("Expected two sequences", 2, seqs.size());
+ assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0)
+ .getSequenceAsString());
+ assertEquals("Second sequence data wrong", "MBCPERKLYZOP", seqs.get(1)
+ .getSequenceAsString());
+ assertTrue("File format is not flagged as interleaved",
+ testee.isInterleaved());
+
+ for (SequenceI seq : seqs)
+ {
+ SequenceFeature[] sfs = seq.getSequenceFeatures();
+ assertEquals(3, sfs.length);
+ verifySequenceFeature(sfs[0], "Exon1 (gene1 Coding)", "Domain", 1, 6);
+ verifySequenceFeature(sfs[1], "gene1", "Gene", 1, 6);
+ verifySequenceFeature(sfs[2], "gene2", "Gene", 7, 12);
+ }
+ }
}