From e710810610e029b1519d31c6d8a9de26833f59b9 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Tue, 6 Oct 2015 16:41:18 +0100 Subject: [PATCH] JAL-1499 improved parsing of !Domain and !Gene including Property=domainend --- src/jalview/io/MegaFile.java | 213 ++++++++++++++++++++++++++++--------- test/jalview/io/MegaFileTest.java | 76 ++++++++++++- 2 files changed, 237 insertions(+), 52 deletions(-) diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java index e4079c0..58961f8 100644 --- a/src/jalview/io/MegaFile.java +++ b/src/jalview/io/MegaFile.java @@ -53,6 +53,8 @@ import java.util.Set; */ public class MegaFile extends AlignFile { + private static final String WHITESPACE = "\\s+"; + private static final int DEFAULT_LINE_LENGTH = 60; private static final String INDENT = " "; @@ -99,6 +101,10 @@ public class MegaFile extends AlignFile private static final String DOMAIN = "Domain"; + private static final String PROPERTY = "Property"; + + private static final String CODONSTART = "CodonStart"; + /* * names of properties to save to the alignment (may affect eventual output * format) @@ -167,14 +173,17 @@ public class MegaFile extends AlignFile // number of residues read (so far) per sequence Map residuesRead; - // start residue (base 1) per sequence of current feature - Map featureStart; - - // feature (Gene/Domain) if any we are parsing - private String currentFeature; + // current Gene if any we are parsing + private String currentGene; + + // start residue (base 1) per sequence of current gene + Map geneStart; - // feature type (Gene/Domain) if any we are parsing - private String currentFeatureType; + // current Domain if any we are parsing + private String currentDomain; + + // start residue (base 1) per sequence of current domain + Map domainStart; // map of SequenceFeature's by sequence id Map> sequenceFeatures; @@ -201,7 +210,8 @@ public class MegaFile extends AlignFile { gapCharacter = '-'; sequenceFeatures = new HashMap>(); - featureStart = new 
HashMap(); + geneStart = new HashMap(); + domainStart = new HashMap(); residuesRead = new HashMap(); /* @@ -226,13 +236,10 @@ public class MegaFile extends AlignFile dataLine = dataLine.trim(); if (dataLine.length() > 0) { - if (dataLine.startsWith(BANG + GENE)) - { - parseFeature(GENE, dataLine); - } - else if (dataLine.startsWith(BANG + DOMAIN)) + if (dataLine.startsWith(BANG + GENE) + || dataLine.startsWith(BANG + DOMAIN)) { - parseFeature(DOMAIN, dataLine); + parseGeneOrDomain(dataLine); } else { @@ -249,6 +256,12 @@ public class MegaFile extends AlignFile dataLine = nextNonCommentLine(); } + /* + * close off any features currently being parsed + */ + createFeature(GENE, currentGene, geneStart); + createFeature(DOMAIN, currentDomain, domainStart); + // remember the (longest) line length read in, so we can output the same setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine)); @@ -269,36 +282,131 @@ public class MegaFile extends AlignFile } /** - * Parse a !Gene or !Domain command line + * Parse a !Gene or !Domain command line. MEGA accepts + *
<ul>
+   * <li>!Gene=name;</li>
+   * <li>!Gene=name Property=Coding/Noncoding CodonStart=1/2/3;</li>
+   * <li>!Gene=genename Domain=domainname Property= etc</li>
+   * <li>!Domain=domainname Gene=genename Property= etc</li>
+   * <li>!Domain=domainname Property= etc</li>
+   * <li>!domain=domainname property=domainend</li>
+   * </ul>
+   * Properly, a Gene should be composed of Domain segments, but MEGA accepts
+   * without. Note that keywords don't seem to be case sensitive.
    *
-   * @param featureType
    * @param dataLine
+   * @throws FileFormatException if not delimited by BANG ... SEMICOLON, or a token is not key=value
    */
-  protected void parseFeature(String featureType, String dataLine)
+  protected void parseGeneOrDomain(String dataLine)
+          throws FileFormatException
   {
-    String featureName = getValue(dataLine);
-    // TODO parse !Gene=xyx Property=end; ???
-    if (this.currentFeature != null)
+    String domain = null;
+    String gene = null;
+    String property = null;
+    String codonStart = null;
+    String errorMsg = "Unrecognized format: " + dataLine;
+    // the statement must be delimited by BANG ... SEMICOLON; both are then stripped
+    if (!dataLine.startsWith(BANG) || !dataLine.endsWith(SEMICOLON))
+    {
+      throw new FileFormatException(errorMsg);
+    }
+    String trimmed = dataLine.substring(1, dataLine.length() - 1).trim();
+    String[] tokens = trimmed.split(WHITESPACE);
+    for (String token : tokens)
     {
-      endSequenceFeature();
+      String[] keyValue = token.split("=");
+      if (keyValue.length != 2)
+      {
+        throw new FileFormatException(errorMsg);
+      }
+      String key = keyValue[0];
+      if (GENE.equalsIgnoreCase(key))
+      {
+        gene = keyValue[1];
+      }
+      else if (DOMAIN.equalsIgnoreCase(key))
+      {
+        domain = keyValue[1];
+      }
+      else if (PROPERTY.equalsIgnoreCase(key))
+      {
+        property = keyValue[1];
+      }
+      else if (CODONSTART.equalsIgnoreCase(key))
+      {
+        codonStart = keyValue[1];
+      }
+      else
+      {
+        System.err.println("Unrecognised token: '" + key + "' in "
+                + dataLine);
+      }
     }
-    startSequenceFeature(featureName, featureType);
+
+    processGeneOrDomain(gene, domain, property, codonStart);
   }
 
   /**
-   * Start processing a new feature
+   * Process a statement containing one or both of Gene and Domain, and
+   * optionally Property or CodonStart commands.
*
-   * @param featureName
+   * @param gene
+   *          the Gene name if specified, else null
+   * @param domain
+   *          the Domain name if specified, else null
+   * @param property
+   *          the Property value if specified, else null ("domainend" ends the Domain)
+   * @param codonStart
+   *          the CodonStart value if specified, else null (not currently used)
    */
-  protected void startSequenceFeature(String featureName, String featureType)
+  protected void processGeneOrDomain(String gene, String domain,
+          String property, String codonStart)
   {
-    currentFeature = featureName;
-    currentFeatureType = featureType;
+    /*
+     * If we have been processing a Gene or Domain, and this does not continue
+     * it, then close it off (generate sequence features for it)
+     */
+    if (this.currentGene != null && !this.currentGene.equals(gene))
+    {
+      createFeature(GENE, currentGene, geneStart);
+    }
+    if (this.currentDomain != null)
+    {
+      if (!this.currentDomain.equals(domain)
+              || "domainend".equalsIgnoreCase(property))
+      {
+        createFeature(DOMAIN, currentDomain, domainStart);
+      }
+    }
 
     /*
-     * If the feature name precedes all sequences, we will know in
-     * endSequenceFeature that it starts with residue 1; otherwise note now
-     * where it starts in each sequence
+     * and if we have declared a Gene or Domain which does not continue the
+     * current one, then record its start positions per sequence
+     */
+    if (gene != null && !gene.equals(currentGene))
+    {
+      startSequenceFeature(geneStart);
+    }
+    if (domain != null && !domain.equals(currentDomain))
+    {
+      startSequenceFeature(domainStart);
+    }
+    currentGene = gene;
+    // a Domain ended by Property=domainend must not stay current (else it is closed twice)
+    currentDomain = "domainend".equalsIgnoreCase(property) ? null : domain;
+  }
+
+  /**
+   * Start processing a new feature
+   * 
+   * @param startPositions
+   */
+  protected void startSequenceFeature(Map startPositions)
+  {
+    /*
+     * If the feature declaration precedes all sequences, we will know in
+     * createFeature that it started with residue 1; otherwise note now where it
+     * starts in each sequence
      */
     if (!residuesRead.isEmpty())
     {
@@ -306,22 +414,32 @@ public class MegaFile extends AlignFile
       {
         String seqId = 
entry.getKey();
         Integer nextResidue = entry.getValue() + 1;
-        featureStart.put(seqId, nextResidue);
+        startPositions.put(seqId, nextResidue);
       }
     }
   }
 
   /**
-   * Add a SequenceFeature for the current feature to each sequence, using the
-   * current feature start/end values per sequence
+   * Add a SequenceFeature to each sequence, using the given start/end values
+   * per sequence (end = residues read so far; skipped where end < start)
+   * 
+   * @param featureType the kind of feature, e.g. Gene or Domain
+   * @param featureValue the feature's name; if null, no features are added
+   * @param featureStartResidues start residue by sequence id (1 is used if absent)
    */
-  protected void endSequenceFeature()
+  protected void createFeature(String featureType, String featureValue,
+          Map featureStartResidues)
   {
+    if (featureValue == null)
+    {
+      return;
+    }
+
     Iterator seqids = this.seqData.keySet().iterator();
     while (seqids.hasNext())
     {
       String seqid = seqids.next();
-      Integer startAt = featureStart.get(seqid);
+      Integer startAt = featureStartResidues.get(seqid);
       int sfstart = startAt == null ? 1 : startAt.intValue();
       int sfend = residuesRead.get(seqid);
       if (sfend >= sfstart)
@@ -329,23 +447,16 @@ public class MegaFile extends AlignFile
         /*
          * don't add feature if entirely gapped in the sequence
          */
-        SequenceFeature sf = new SequenceFeature(currentFeature,
-                currentFeatureType, sfstart, sfend, 0f, null);
+        // TODO: type="Gene" (but then all coloured the same) or
+        // type="GeneName"?
+        SequenceFeature sf = new SequenceFeature(featureValue, featureType,
+                sfstart, sfend, 0f, null);
         sequenceFeatures.get(seqid).add(sf);
       }
     }
   }
 
   /**
-   * Parse a !Domain command line
-   * 
-   * @param dataLine
-   */
-  private void parseDomain(String dataLine)
-  {
-  }
-
-  /**
    * Returns the next line that is not a comment, or null at end of file.
    * Comments in MEGA are within [ ] brackets, and may be nested.
* @@ -798,7 +909,7 @@ public class MegaFile extends AlignFile { return; } - String[] tokens = inputLine.trim().split("\\s"); // any whitespace + String[] tokens = inputLine.trim().split(WHITESPACE); for (String token : tokens) { parseFormatKeyword(token); @@ -892,14 +1003,14 @@ public class MegaFile extends AlignFile System.err.println("Warning: " + token + " not supported"); } - else if (keyword.equalsIgnoreCase("Property")) + else if (keyword.equalsIgnoreCase(PROPERTY)) { - // TODO: figure out what to do with this - // can it appear more than once in a file? - setAlignmentProperty(PROP_MISSING, value); + // TODO: can Property appear in a Format command? + // suspect this is a mistake in the manual } else if (!keyword.equalsIgnoreCase(N_SEQS) + && !keyword.equalsIgnoreCase("NTaxa") && !keyword.equalsIgnoreCase(N_SITES)) { System.err.println("Warning: " + msg); diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java index d868dcb..14cb27e 100644 --- a/test/jalview/io/MegaFileTest.java +++ b/test/jalview/io/MegaFileTest.java @@ -8,6 +8,7 @@ import static org.testng.AssertJUnit.fail; import jalview.datamodel.AlignmentI; import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import java.io.IOException; @@ -76,7 +77,7 @@ public class MegaFileTest private static final String INTERLEAVED_WITH_DESCRIPTION = "#MEGA\n" + "!Title Data with description;\n" - + "!Format DataType=DNA indel=- CodeTable=Standard Missing=? MatchChar=.;\n\n" + + "!Format DataType=DNA indel=-\tCodeTable=Standard Missing=? MatchChar=.;\n\n" + "!Description\n" + " Line one of description\n" + " Line two of description;\n\n" @@ -605,5 +606,78 @@ public class MegaFileTest } //@formatter:on + + /** + * Test parse of data with !Gene and !Domain statements. 
+   * 
+   * @throws IOException not expected here; presumably declared for the MegaFile constructor
+   */
+  @Test(groups = { "Functional" })
+  public void testParse_geneDomains() throws IOException
+  {
+    //@formatter:off
+    String data = "#MEGA\n"+
+        "TITLE: Interleaved sequence data\n\n" +
+        "#U455 CCCCCC\n" +
+        "#CPZANT TTTTTT\n\n" +
+        "!Domain=Exon1 Gene=Adh Property=Coding CodonStart=1;\n" +
+        "#U455 GGGGGG\n" +
+        "#CPZANT AAAAAA\n\n" +
+        "!Domain=Intron1 Gene=Adh;\n" +
+        "#U455 tttttt\n" +
+        "#CPZANT cccccc\n\n" +
+        "!Domain=Exon2 Gene=Adh Property=Coding CodonStart=1;\n" +
+        "#U455 aaaaaa\n" +
+        "#CPZANT gggggg\n\n" +
+        // explicit end of Exon2, implicit end of Adh:
+        "!Domain=Exon2 Property=domainend;\n" +
+        "!Domain=BindingSite;\n" +
+        "#U455 CCCCCC\n" +
+        "#CPZANT TTTTTT\n\n";
+    //@formatter:on
+    MegaFile testee = new MegaFile(data, AppletFormatAdapter.PASTE);
+    Vector seqs = testee.getSeqs();
+    // should be 2 sequences
+    assertEquals("Expected two sequences", 2, seqs.size());
+    // check sequence data: the interleaved blocks are concatenated per sequence
+    assertEquals("First sequence data wrong",
+            "CCCCCCGGGGGGttttttaaaaaaCCCCCC", seqs.get(0)
+                    .getSequenceAsString());
+    assertEquals("Second sequence data wrong",
+            "TTTTTTAAAAAAccccccggggggTTTTTT", seqs.get(1)
+                    .getSequenceAsString());
+
+    /*
+     * sequences should have features for Gene=Adh 7-24, Exon1 7-12, Intron1
+     * 13-18, Exon2 19-24, BindingSite 25-30
+     */
+    for (SequenceI seq : seqs) {
+      SequenceFeature[] sfs = seq.getSequenceFeatures();
+      // features are added in the order in which their end is found
+      // (Gene before Domain when they end together)
+      assertEquals(5, sfs.length);
+      // TODO settle which way round type/description go! (here type = name, description = Gene/Domain)
+      assertEquals("Exon1", sfs[0].type);
+      assertEquals("Domain", sfs[0].description);
+      assertEquals(7, sfs[0].begin);
+      assertEquals(12, sfs[0].end);
+      assertEquals("Intron1", sfs[1].type);
+      assertEquals("Domain", sfs[1].description);
+      assertEquals(13, sfs[1].begin);
+      assertEquals(18, sfs[1].end);
+      assertEquals("Adh", sfs[2].type);
+      assertEquals("Gene", sfs[2].description);
+      assertEquals(7, sfs[2].begin);
+      assertEquals(24, sfs[2].end);
+      assertEquals("Exon2", sfs[3].type);
+      assertEquals("Domain", sfs[3].description);
+      assertEquals(19, sfs[3].begin);
+      assertEquals(24, sfs[3].end);
+      assertEquals("BindingSite", sfs[4].type);
+      assertEquals("Domain", sfs[4].description);
+      assertEquals(25, sfs[4].begin);
+      assertEquals(30, sfs[4].end);
+    }
+  }
 }
-- 
1.7.10.2