From: gmungoc Date: Wed, 7 Oct 2015 15:47:13 +0000 (+0100) Subject: JAL-1499 parsing !Label statements to AlignmentAnnotation X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;ds=sidebyside;h=9443c2fb3dc8c7e9262510d5269b8c5b59721478;p=jalview.git JAL-1499 parsing !Label statements to AlignmentAnnotation --- diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java index eb9868b..3096b60 100644 --- a/src/jalview/io/MegaFile.java +++ b/src/jalview/io/MegaFile.java @@ -18,7 +18,9 @@ */ package jalview.io; +import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; +import jalview.datamodel.Annotation; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -53,6 +55,8 @@ import java.util.Set; */ public class MegaFile extends AlignFile { + private static final char UNDERSCORE = '_'; + private static final String WHITESPACE = "\\s+"; private static final int DEFAULT_LINE_LENGTH = 60; @@ -137,6 +141,8 @@ public class MegaFile extends AlignFile private static final String SPACE = " "; + private static final String TAB = "\t"; + /* * number of sequence positions output per line */ @@ -190,6 +196,9 @@ public class MegaFile extends AlignFile // map of SequenceFeature's by sequence id Map> sequenceFeatures; + // each !Label line character becomes an Annotation (except underscores) + List labelAnnotations; + public MegaFile() { } @@ -215,6 +224,7 @@ public class MegaFile extends AlignFile geneStart = new HashMap(); domainStart = new HashMap(); residuesRead = new HashMap(); + labelAnnotations = new ArrayList(); /* * Read and process MEGA and Title/Format/Description headers if present. @@ -238,12 +248,14 @@ public class MegaFile extends AlignFile dataLine = dataLine.trim(); if (dataLine.length() > 0) { - if (dataLine.startsWith(BANG + GENE) - || dataLine.startsWith(BANG + DOMAIN)) + dataLine = dataLine.replace(TAB, SPACE); + String upperCased = dataLine.toUpperCase(); + if (upperCased.startsWith(BANG + GENE.toUpperCase()) + || upperCased.startsWith(BANG + DOMAIN.toUpperCase())) { parseGeneOrDomain(dataLine); } - else if (dataLine.startsWith(BANG + LABEL)) + else if (upperCased.startsWith(BANG + LABEL.toUpperCase())) { parseLabel(dataLine); } @@ -271,18 +283,76 @@ public class MegaFile extends AlignFile // remember the (longest) line length read in, so we can output the same setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine)); - deriveSequences(); + deriveSequencesAndFeatures(); + + deriveAnnotations(); } /** - * Parse a !Label + * If we parsed !Label statements into a list of Annotation objects, create an + * AlignmentAnnotation + */ + protected void deriveAnnotations() + { + if (this.labelAnnotations.size() > 0) + { + Annotation[] anns = labelAnnotations + .toArray(new Annotation[labelAnnotations.size()]); + AlignmentAnnotation aa = new AlignmentAnnotation("MEGA", "Label", + anns); + this.annotations.add(aa); + } + } + + /** + * Parse a !Label line. This contains a single character per position (column) + * of the alignment block above. An underscore character represents no label. + * Labels are assembled into an AlignmentAnnotation object. * * @param dataLine + * @throws FileFormatException */ - protected void parseLabel(String dataLine) + protected void parseLabel(String dataLine) throws FileFormatException { - // TODO Auto-generated method stub + // strip off leading !Label and following spaces + dataLine = dataLine.substring(LABEL.length() + 1).trim(); + + // remove internal spacing and any leading tab + String labels = dataLine.replace(SPACE, ""); + if (labels.endsWith(SEMICOLON)) + { + labels = labels.substring(0, labels.length() - 1); + } + else + { + System.err.println("Warning: '" + dataLine + + "' should end with semi-colon"); + } + for (char c : labels.toCharArray()) + { + if (c == UNDERSCORE) + { + this.labelAnnotations.add(null); + } + else + { + this.labelAnnotations.add(new Annotation(String.valueOf(c), "", + ' ', 0f)); + } + } + /* + * sanity check - the number of labels added should exactly match the + * sequence length so far + */ + int sequenceLength = seqData.isEmpty() ? 0 : seqData.values() + .iterator().next().length(); + if (labelAnnotations.size() != sequenceLength) + { + System.err.println("Warning: file inconsistent - " + + labelAnnotations.size() + " labels for " + sequenceLength + + " positions after " + dataLine); + } } /** @@ -383,7 +453,7 @@ public class MegaFile extends AlignFile * the order of processing below ensures that we correctly capture where a * domain is in the context of an enclosing gene */ - processDomainEnd(domain, property); + processDomainEnd(domain, gene, property); processGeneEnd(gene); @@ -435,21 +505,30 @@ public class MegaFile extends AlignFile /** * If we have been processing a domain, and it is not being continued, then - * make a sequence feature for the domain just ended + * make a sequence feature for the domain just ended. Criteria for the domain + * not being continued are either an explicit new domain or gene name, or a + * 'Property=domainend' statement * * @param domain + * @param gene * @param property * @return true if a feature is created, else false */ - protected boolean processDomainEnd(String domain, String property) + protected boolean processDomainEnd(String domain, String gene, + String property) { + boolean newGene = (gene != null && !gene.equals(currentGene)); + String verboseDomain = makeVerboseDomainName(domain, property); + if (this.currentDomain != null) { + boolean newDomain = !this.currentDomain.equals(verboseDomain); boolean domainEnded = "domainend".equalsIgnoreCase(property); - if (!this.currentDomain.equals(verboseDomain) || domainEnded) + if (newDomain || newGene || domainEnded) { createFeature(DOMAIN, currentDomain, domainStart); + currentDomain = null; return true; } } @@ -473,6 +552,7 @@ public class MegaFile extends AlignFile if (this.currentGene != null && !this.currentGene.equals(gene)) { createFeature(GENE, currentGene, geneStart); + currentGene = null; created = true; } @@ -705,7 +785,7 @@ public class MegaFile extends AlignFile /** * Convert the parsed sequence strings to objects and store them in the model. */ - protected void deriveSequences() + protected void deriveSequencesAndFeatures() { Set> datasets = seqData.entrySet(); @@ -773,14 +853,15 @@ public class MegaFile extends AlignFile * @throws IOException */ protected void parseNoninterleavedDataLine(String dataLine) - throws IOException + throws FileFormatException { if (currentSequenceId == null) { /* * Oops. Data but no sequence id context. */ - throw new IOException("No sequence id context at: " + dataLine); + throw new FileFormatException("No sequence id context at: " + + dataLine); } assertInterleaved(false, dataLine); diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java index 881c47b..2b2422f 100644 --- a/test/jalview/io/MegaFileTest.java +++ b/test/jalview/io/MegaFileTest.java @@ -6,6 +6,7 @@ import static org.testng.AssertJUnit.assertNull; import static org.testng.AssertJUnit.assertTrue; import static org.testng.AssertJUnit.fail; +import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; @@ -502,16 +503,17 @@ public class MegaFileTest * @throws IOException */ @Test(groups = { "Functional" }) - public void testParse_interleavedWithIdentity() throws IOException + public void testParse_interleavedWithIdentityAndTabs() throws IOException { //@formatter:off + // uses tab instead of space separators to check robustness MegaFile testee = new MegaFile("#MEGA\n"+ - "!TITLE Interleaved sequence data;\n" + - "!Format Identical=.;\n\n" + - "#U455 ABCDEF\n" + - "#CPZANT M..P.R\n\n" + - "#U455 KLMNOP\n" + - "#CPZANT ..YZ..", AppletFormatAdapter.PASTE); + "!TITLE\tInterleaved sequence data;\n" + + "!Format\tIdentical=.;\n\n" + + "#U455\tABCDEF\n" + + "#CPZANT\tM..P.R\n\n" + + "#U455\t\tKLMNOP\n" + + "#CPZANT\t..YZ..", AppletFormatAdapter.PASTE); //@formatter:on assertEquals("Title not as expected", "Interleaved sequence data", testee.getAlignmentProperty(MegaFile.PROP_TITLE)); @@ -623,7 +625,7 @@ public class MegaFileTest "!Domain=Exon1 Gene=Adh Property=Coding CodonStart=1;\n" + "#U455 GGGGGG\n" + "#CPZANT AAAAAA\n\n" + - "!Domain=Intron1 Property=Intron Gene=Adh;\n" + + "!domain=Intron1 Property=Intron Gene=Adh;\n" + "#U455 tttttt\n" + "#CPZANT cccccc\n\n" + "!Domain=Exon2 Gene=Adh Property=Exon CodonStart=1;\n" + @@ -698,4 +700,81 @@ public class MegaFileTest assertEquals(begin, sf.begin); assertEquals(end, sf.end); } + + //@formatter:on + + /** + * Test parse of data including !Label statements. An underscore means no + * label, other characters are treated as alignment annotation. + * + * @throws IOException + */ + @Test(groups = { "Functional" }) + public void testParse_withLabels() throws IOException + { + //@formatter:off + MegaFile testee = new MegaFile("#MEGA\n"+ + "TITLE: Interleaved sequence data\n\n" + + "#U455 ABC DEF\n" + + "#CPZANT MNO PQR\n" + + "!Label +-_ 23_\n" + + "#U455 KLM NOP\n" + + "#CPZANT WXY ZGC\n" + + "!label __3 +X_\n", AppletFormatAdapter.PASTE); + //@formatter:on + Vector seqs = testee.getSeqs(); + assertEquals("Expected two sequences", 2, seqs.size()); + assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0) + .getSequenceAsString()); + assertEquals("Second sequence data wrong", "MNOPQRWXYZGC", seqs.get(1) + .getSequenceAsString()); + + // check AlignmentAnnotation added with expected values + assertEquals(1, testee.annotations.size()); + AlignmentAnnotation aa = testee.annotations.get(0); + assertNull(aa.sequenceRef); + assertEquals(12, aa.annotations.length); + assertEquals("+, -, , 2, 3, , , , 3, +, X, , ", aa.toString()); + } + + //@formatter:on + + /** + * Test case where a domain is implicitly terminated by starting a new gene + * + * @throws IOException + */ + @Test(groups = { "Functional" }) + public void testParse_changeOfGeneEndsDomain() throws IOException + { + //@formatter:off + // uses tab instead of space separators to check robustness + MegaFile testee = new MegaFile("#MEGA\n"+ + "!TITLE Interleaved sequence data;\n" + + "!Format Identical=.;\n\n" + + "!Gene=gene1 Domain=Exon1 Property=Coding;\n" + + "#U455 ABCDEF\n" + + "#CPZANT M..P.R\n\n" + + "!Gene=gene2;\n" + + "#U455 KLMNOP\n" + + "#CPZANT ..YZ..", AppletFormatAdapter.PASTE); + //@formatter:on + Vector seqs = testee.getSeqs(); + assertEquals("Expected two sequences", 2, seqs.size()); + assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0) + .getSequenceAsString()); + assertEquals("Second sequence data wrong", "MBCPERKLYZOP", seqs.get(1) + .getSequenceAsString()); + assertTrue("File format is not flagged as interleaved", + testee.isInterleaved()); + + for (SequenceI seq : seqs) + { + SequenceFeature[] sfs = seq.getSequenceFeatures(); + assertEquals(3, sfs.length); + verifySequenceFeature(sfs[0], "Exon1 (gene1 Coding)", "Domain", 1, 6); + verifySequenceFeature(sfs[1], "gene1", "Gene", 1, 6); + verifySequenceFeature(sfs[2], "gene2", "Gene", 7, 12); + } + } }