*/
public class MegaFile extends AlignFile
{
+ private static final String WHITESPACE = "\\s+";
+
private static final int DEFAULT_LINE_LENGTH = 60;
private static final String INDENT = " ";
private static final String DOMAIN = "Domain";
+ private static final String PROPERTY = "Property";
+
+ private static final String CODONSTART = "CodonStart";
+
/*
* names of properties to save to the alignment (may affect eventual output
* format)
// number of residues read (so far) per sequence
Map<String, Integer> residuesRead;
- // start residue (base 1) per sequence of current feature
- Map<String, Integer> featureStart;
-
- // feature (Gene/Domain) if any we are parsing
- private String currentFeature;
+ // current Gene if any we are parsing
+ private String currentGene;
+
+ // start residue (base 1) per sequence of current gene
+ Map<String, Integer> geneStart;
- // feature type (Gene/Domain) if any we are parsing
- private String currentFeatureType;
+ // current Domain if any we are parsing
+ private String currentDomain;
+
+ // start residue (base 1) per sequence of current domain
+ Map<String, Integer> domainStart;
// map of SequenceFeature's by sequence id
Map<String, List<SequenceFeature>> sequenceFeatures;
{
gapCharacter = '-';
sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
- featureStart = new HashMap<String, Integer>();
+ geneStart = new HashMap<String, Integer>();
+ domainStart = new HashMap<String, Integer>();
residuesRead = new HashMap<String, Integer>();
/*
dataLine = dataLine.trim();
if (dataLine.length() > 0)
{
- if (dataLine.startsWith(BANG + GENE))
- {
- parseFeature(GENE, dataLine);
- }
- else if (dataLine.startsWith(BANG + DOMAIN))
+ if (dataLine.startsWith(BANG + GENE)
+ || dataLine.startsWith(BANG + DOMAIN))
{
- parseFeature(DOMAIN, dataLine);
+ parseGeneOrDomain(dataLine);
}
else
{
dataLine = nextNonCommentLine();
}
+ /*
+ * close off any features currently being parsed
+ */
+ createFeature(GENE, currentGene, geneStart);
+ createFeature(DOMAIN, currentDomain, domainStart);
+
// remember the (longest) line length read in, so we can output the same
setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
}
/**
- * Parse a !Gene or !Domain command line
+ * Parse a !Gene or !Domain command line. MEGA accepts
+ * <ul>
+ * <li>!Gene=name;</li>
+ * <li>!Gene=name Property=Coding/Noncoding CodonStart=1/2/3;</li>
+ * <li>!Gene=genename Domain=domainname Property= etc</li>
+ * <li>!Domain=domainname Gene=genename Property= etc</li>
+ * <li>!Domain=domainname Property= etc</li>
+ * <li>!domain=domainname property=domainend</li>
+ * </ul>
+ * Properly, a Gene should be composed of Domain segments, but MEGA accepts
+ * without. Note that keywords don't seem to be case sensitive.
+ * <p>
+ * The line is split on whitespace into key=value tokens. Gene, Domain,
+ * Property and CodonStart keys (matched ignoring case) are collected and
+ * passed to processGeneOrDomain; any other key is reported on stderr and
+ * otherwise ignored.
*
- * @param featureType
* @param dataLine
+ * the command line to parse; must start with BANG and end with
+ * SEMICOLON
+ * @throws FileFormatException
+ * if the line does not start with BANG and end with SEMICOLON, or
+ * if any token does not split into exactly one key and one value
+ * around '='
*/
- protected void parseFeature(String featureType, String dataLine)
+ protected void parseGeneOrDomain(String dataLine)
+ throws FileFormatException
{
- String featureName = getValue(dataLine);
- // TODO parse !Gene=xyx Property=end; ???
- if (this.currentFeature != null)
+ String domain = null;
+ String gene = null;
+ String property = null;
+ String codonStart = null;
+ String errorMsg = "Unrecognized format: " + dataLine;
+
+ if (!dataLine.startsWith(BANG) || !dataLine.endsWith(SEMICOLON))
+ {
+ throw new FileFormatException(errorMsg);
+ }
+ // drop the first and last characters (the command delimiters) before
+ // tokenising on whitespace
+ String trimmed = dataLine.substring(1, dataLine.length() - 1).trim();
+ String[] tokens = trimmed.split(WHITESPACE);
+ for (String token : tokens)
{
- endSequenceFeature();
+ // "key=" (no value), "key" (no '=') and "a=b=c" are all rejected here
+ String[] keyValue = token.split("=");
+ if (keyValue.length != 2)
+ {
+ throw new FileFormatException(errorMsg);
+ }
+ String key = keyValue[0];
+ if (GENE.equalsIgnoreCase(key))
+ {
+ gene = keyValue[1];
+ }
+ else if (DOMAIN.equalsIgnoreCase(key))
+ {
+ domain = keyValue[1];
+ }
+ else if (PROPERTY.equalsIgnoreCase(key))
+ {
+ property = keyValue[1];
+ }
+ else if (CODONSTART.equalsIgnoreCase(key))
+ {
+ codonStart = keyValue[1];
+ }
+ else
+ {
+ // unknown keys are tolerated: warn and carry on
+ System.err.println("Unrecognised token: '" + key + "' in "
+ + dataLine);
+ }
}
- startSequenceFeature(featureName, featureType);
+
+ processGeneOrDomain(gene, domain, property, codonStart);
}
/**
- * Start processing a new feature
+ * Process a statement containing one or both of Gene and Domain, and
+ * optionally Property or CodonStart commands.
+ * <p>
+ * Any Gene or Domain currently being parsed that this statement does not
+ * continue is closed off by creating sequence features for it. A Property
+ * value of "domainend" (ignoring case) closes the current Domain even when
+ * the statement names that same Domain; in that case no Domain is left
+ * open afterwards and no new Domain is started. A Gene that is not named
+ * in the statement is treated as ended.
*
- * @param featureName
+ * @param gene
+ * the Gene name if specified, else null
+ * @param domain
+ * the Domain name if specified, else null
+ * @param property
+ * the Property value if specified, else null; only "domainend" is
+ * acted on here
+ * @param codonStart
+ * the CodonStart value if specified, else null (not used by this
+ * method)
*/
- protected void startSequenceFeature(String featureName, String featureType)
+ protected void processGeneOrDomain(String gene, String domain,
+ String property, String codonStart)
{
- currentFeature = featureName;
- currentFeatureType = featureType;
+ boolean domainEnded = "domainend".equalsIgnoreCase(property);
+
+ /*
+ * If we have been processing a Gene or Domain, and this does not continue
+ * it, then close it off (generate sequence features for it)
+ */
+ if (this.currentGene != null && !this.currentGene.equals(gene))
+ {
+ createFeature(GENE, currentGene, geneStart);
+ }
+ if (this.currentDomain != null
+ && (domainEnded || !this.currentDomain.equals(domain)))
+ {
+ createFeature(DOMAIN, currentDomain, domainStart);
+ }
/*
- * If the feature name precedes all sequences, we will know in
- * endSequenceFeature that it starts with residue 1; otherwise note now
- * where it starts in each sequence
+ * and if we have declared a Gene or Domain which does not continue the
+ * current one, then record its start positions per sequence (a statement
+ * that only ends a domain does not start one)
+ */
+ if (gene != null && !gene.equals(currentGene))
+ {
+ startSequenceFeature(geneStart);
+ }
+ if (!domainEnded && domain != null && !domain.equals(currentDomain))
+ {
+ startSequenceFeature(domainStart);
+ }
+
+ currentGene = gene;
+ /*
+ * after an explicit domainend nothing is open; leaving the ended domain
+ * as current would make the next !Domain statement (or end of file)
+ * create a second, duplicate feature for it
+ */
+ currentDomain = domainEnded ? null : domain;
+ }
+
+ /**
+ * Start processing a new feature
+ *
+ * @param startPositions
+ */
+ protected void startSequenceFeature(Map<String, Integer> startPositions)
+ {
+ /*
+ * If the feature declaration precedes all sequences, we will know in
+ * createFeature that it started with residue 1; otherwise note now where it
+ * starts in each sequence
*/
if (!residuesRead.isEmpty())
{
{
String seqId = entry.getKey();
Integer nextResidue = entry.getValue() + 1;
- featureStart.put(seqId, nextResidue);
+ startPositions.put(seqId, nextResidue);
}
}
}
/**
- * Add a SequenceFeature for the current feature to each sequence, using the
- * current feature start/end values per sequence
+ * Add a SequenceFeature to each sequence, using the given start/end values
+ * per sequence
+ *
+ * @param featureType
+ * @param featureValue
+ * @param featureStartResidues
*/
- protected void endSequenceFeature()
+ protected void createFeature(String featureType, String featureValue,
+ Map<String, Integer> featureStartResidues)
{
+ if (featureValue == null)
+ {
+ return;
+ }
+
Iterator<String> seqids = this.seqData.keySet().iterator();
while (seqids.hasNext())
{
String seqid = seqids.next();
- Integer startAt = featureStart.get(seqid);
+ Integer startAt = featureStartResidues.get(seqid);
int sfstart = startAt == null ? 1 : startAt.intValue();
int sfend = residuesRead.get(seqid);
if (sfend >= sfstart)
/*
* don't add feature if entirely gapped in the sequence
*/
- SequenceFeature sf = new SequenceFeature(currentFeature,
- currentFeatureType, sfstart, sfend, 0f, null);
+ // TODO: type="Gene" (but then all coloured the same) or
+ // type="GeneName"?
+ SequenceFeature sf = new SequenceFeature(featureValue, featureType,
+ sfstart, sfend, 0f, null);
sequenceFeatures.get(seqid).add(sf);
}
}
}
/**
- * Parse a !Domain command line
- *
- * @param dataLine
- */
- private void parseDomain(String dataLine)
- {
- }
-
- /**
* Returns the next line that is not a comment, or null at end of file.
* Comments in MEGA are within [ ] brackets, and may be nested.
*
{
return;
}
- String[] tokens = inputLine.trim().split("\\s"); // any whitespace
+ String[] tokens = inputLine.trim().split(WHITESPACE);
for (String token : tokens)
{
parseFormatKeyword(token);
System.err.println("Warning: " + token + " not supported");
}
- else if (keyword.equalsIgnoreCase("Property"))
+ else if (keyword.equalsIgnoreCase(PROPERTY))
{
- // TODO: figure out what to do with this
- // can it appear more than once in a file?
- setAlignmentProperty(PROP_MISSING, value);
+ // TODO: can Property appear in a Format command?
+ // suspect this is a mistake in the manual
}
else if (!keyword.equalsIgnoreCase(N_SEQS)
+ && !keyword.equalsIgnoreCase("NTaxa")
&& !keyword.equalsIgnoreCase(N_SITES))
{
System.err.println("Warning: " + msg);
import jalview.datamodel.AlignmentI;
import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import java.io.IOException;
private static final String INTERLEAVED_WITH_DESCRIPTION =
"#MEGA\n"
+ "!Title Data with description;\n"
- + "!Format DataType=DNA indel=- CodeTable=Standard Missing=? MatchChar=.;\n\n"
+ + "!Format DataType=DNA indel=-\tCodeTable=Standard Missing=? MatchChar=.;\n\n"
+ "!Description\n"
+ " Line one of description\n"
+ " Line two of description;\n\n"
}
//@formatter:on
+
+ /**
+ * Parses interleaved data containing !Gene and !Domain statements and
+ * checks the sequences read and the features added to each sequence.
+ *
+ * @throws IOException
+ */
+ @Test(groups = { "Functional" })
+ public void testParse_geneDomains() throws IOException
+ {
+ //@formatter:off
+ final String megaText = "#MEGA\n"+
+ "TITLE: Interleaved sequence data\n\n" +
+ "#U455 CCCCCC\n" +
+ "#CPZANT TTTTTT\n\n" +
+ "!Domain=Exon1 Gene=Adh Property=Coding CodonStart=1;\n" +
+ "#U455 GGGGGG\n" +
+ "#CPZANT AAAAAA\n\n" +
+ "!Domain=Intron1 Gene=Adh;\n" +
+ "#U455 tttttt\n" +
+ "#CPZANT cccccc\n\n" +
+ "!Domain=Exon2 Gene=Adh Property=Coding CodonStart=1;\n" +
+ "#U455 aaaaaa\n" +
+ "#CPZANT gggggg\n\n" +
+ // explicit end of Exon2, implicit end of Adh:
+ "!Domain=Exon2 Property=domainend;\n" +
+ "!Domain=BindingSite;\n" +
+ "#U455 CCCCCC\n" +
+ "#CPZANT TTTTTT\n\n";
+ //@formatter:on
+ MegaFile parser = new MegaFile(megaText, AppletFormatAdapter.PASTE);
+ Vector<SequenceI> parsed = parser.getSeqs();
+
+ // two sequences, each the concatenation of its five interleaved rows
+ assertEquals("Expected two sequences", 2, parsed.size());
+ String firstResidues = parsed.get(0).getSequenceAsString();
+ assertEquals("First sequence data wrong",
+ "CCCCCCGGGGGGttttttaaaaaaCCCCCC", firstResidues);
+ String secondResidues = parsed.get(1).getSequenceAsString();
+ assertEquals("Second sequence data wrong",
+ "TTTTTTAAAAAAccccccggggggTTTTTT", secondResidues);
+
+ /*
+ * expected features, in the order in which their end is found (Gene
+ * before Domain when they end together): Exon1 7-12, Intron1 13-18,
+ * Gene Adh 7-24, Exon2 19-24, BindingSite 25-30
+ */
+ // TODO settle which way round type/description go!
+ String[] expectedTypes = { "Exon1", "Intron1", "Adh", "Exon2",
+ "BindingSite" };
+ String[] expectedDescriptions = { "Domain", "Domain", "Gene", "Domain",
+ "Domain" };
+ int[] expectedBegins = { 7, 13, 7, 19, 25 };
+ int[] expectedEnds = { 12, 18, 24, 24, 30 };
+
+ for (SequenceI seq : parsed)
+ {
+ SequenceFeature[] features = seq.getSequenceFeatures();
+ assertEquals(5, features.length);
+ for (int i = 0; i < expectedTypes.length; i++)
+ {
+ assertEquals(expectedTypes[i], features[i].type);
+ assertEquals(expectedDescriptions[i], features[i].description);
+ assertEquals(expectedBegins[i], features[i].begin);
+ assertEquals(expectedEnds[i], features[i].end);
+ }
+ }
+ }
}