*/
public class MegaFile extends AlignFile
{
+ private static final String WHITESPACE = "\\s+"; // regex: one or more whitespace characters, used to split statements into tokens
+
private static final int DEFAULT_LINE_LENGTH = 60;
private static final String INDENT = " ";
private static final String DOMAIN = "Domain";
+ private static final String PROPERTY = "Property"; // statement keyword, matched ignoring case
+
+ private static final String CODONSTART = "CodonStart"; // statement keyword, matched ignoring case
+
/*
* names of properties to save to the alignment (may affect eventual output
* format)
// number of residues read (so far) per sequence
Map<String, Integer> residuesRead;
- // start residue (base 1) per sequence of current feature
- Map<String, Integer> featureStart;
-
- // feature (Gene/Domain) if any we are parsing
- private String currentFeature;
+ // current Gene if any we are parsing
+ private String currentGene;
+
+ // start residue (base 1) per sequence of current gene
+ Map<String, Integer> geneStart;
- // feature type (Gene/Domain) if any we are parsing
- private String currentFeatureType;
+ // current Domain if any we are parsing
+ private String currentDomain;
+
+ // start residue (base 1) per sequence of current domain
+ Map<String, Integer> domainStart;
// map of SequenceFeature's by sequence id
Map<String, List<SequenceFeature>> sequenceFeatures;
{
gapCharacter = '-';
sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
- featureStart = new HashMap<String, Integer>();
+ geneStart = new HashMap<String, Integer>();
+ domainStart = new HashMap<String, Integer>();
residuesRead = new HashMap<String, Integer>();
/*
dataLine = dataLine.trim();
if (dataLine.length() > 0)
{
- if (dataLine.startsWith(BANG + GENE))
- {
- parseFeature(GENE, dataLine);
- }
- else if (dataLine.startsWith(BANG + DOMAIN))
+ if (dataLine.startsWith(BANG + GENE)
+ || dataLine.startsWith(BANG + DOMAIN))
{
- parseFeature(DOMAIN, dataLine);
+ parseGeneOrDomain(dataLine);
}
else
{
dataLine = nextNonCommentLine();
}
+ /*
+ * close off any features currently being parsed
+ */
+ createFeature(GENE, currentGene, geneStart);
+ createFeature(DOMAIN, currentDomain, domainStart);
+
// remember the (longest) line length read in, so we can output the same
setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
}
/**
- * Parse a !Gene or !Domain command line
+ * Parse a !Gene or !Domain command line. MEGA accepts
+ * <ul>
+ * <li>!Gene=name;</li>
+ * <li>!Gene=name Property=Coding/Noncoding CodonStart=1/2/3;</li>
+ * <li>!Gene=genename Domain=domainname Property= etc</li>
+ * <li>!Domain=domainname Gene=genename Property= etc</li>
+ * <li>!Domain=domainname Property= etc</li>
+ * <li>!domain=domainname property=domainend</li>
+ * </ul>
+ * Properly, a Gene should be composed of Domain segments, but MEGA accepts
+ * without. Note that keywords don't seem to be case sensitive.
+ * <p>
+ * The statement (less its first and last characters) is split on whitespace
+ * into key=value tokens. The Gene, Domain, Property and CodonStart values
+ * found (keys matched ignoring case) are passed on to processGeneOrDomain.
+ * A token with any other key is reported on stderr and otherwise ignored.
*
- * @param featureType
* @param dataLine
+ * the command line; must start with BANG and end with SEMICOLON
+ * @throws FileFormatException
+ * if dataLine does not start with BANG and end with SEMICOLON, or
+ * if any token does not split on '=' into exactly a key and a value
*/
- protected void parseFeature(String featureType, String dataLine)
+ protected void parseGeneOrDomain(String dataLine)
+ throws FileFormatException
{
- String featureName = getValue(dataLine);
- // TODO parse !Gene=xyx Property=end; ???
- if (this.currentFeature != null)
+ String domain = null;
+ String gene = null;
+ String property = null;
+ String codonStart = null;
+ String errorMsg = "Unrecognized format: " + dataLine;
+
+ if (!dataLine.startsWith(BANG) || !dataLine.endsWith(SEMICOLON))
+ {
+ throw new FileFormatException(errorMsg);
+ }
+ // drop the leading and trailing delimiter characters before tokenising
+ String trimmed = dataLine.substring(1, dataLine.length() - 1).trim();
+ String[] tokens = trimmed.split(WHITESPACE);
+ for (String token : tokens)
{
- endSequenceFeature();
+ String[] keyValue = token.split("=");
+ if (keyValue.length != 2)
+ {
+ // also rejects an empty statement, or a key with no value
+ throw new FileFormatException(errorMsg);
+ }
+ String key = keyValue[0];
+ if (GENE.equalsIgnoreCase(key))
+ {
+ gene = keyValue[1];
+ }
+ else if (DOMAIN.equalsIgnoreCase(key))
+ {
+ domain = keyValue[1];
+ }
+ else if (PROPERTY.equalsIgnoreCase(key))
+ {
+ property = keyValue[1];
+ }
+ else if (CODONSTART.equalsIgnoreCase(key))
+ {
+ codonStart = keyValue[1];
+ }
+ else
+ {
+ // unknown keys are tolerated: warn and carry on with the other tokens
+ System.err.println("Unrecognised token: '" + key + "' in "
+ + dataLine);
+ }
}
- startSequenceFeature(featureName, featureType);
+
+ processGeneOrDomain(gene, domain, property, codonStart);
}
/**
- * Start processing a new feature
+ * Process a statement containing one or both of Gene and Domain, and
+ * optionally Property or CodonStart commands.
+ * <p>
+ * A Property value of "domainend" (matched ignoring case) closes the current
+ * Domain, after which no Domain is treated as being in progress. This stops
+ * the ended Domain being closed off (and a feature created for it) a second
+ * time by a later statement or at the end of parsing.
*
- * @param featureName
+ * @param gene
+ * the Gene name if specified, else null
+ * @param domain
+ * the Domain name if specified, else null
+ * @param property
+ * the Property value if specified, else null
+ * @param codonStart
+ * the CodonStart value if specified, else null (not currently used
+ * by this method)
*/
- protected void startSequenceFeature(String featureName, String featureType)
+ protected void processGeneOrDomain(String gene, String domain,
+ String property, String codonStart)
{
- currentFeature = featureName;
- currentFeatureType = featureType;
+ boolean domainEnded = "domainend".equalsIgnoreCase(property);
+
+ /*
+ * If we have been processing a Gene or Domain, and this does not continue
+ * it, then close it off (generate sequence features for it)
+ */
+ if (this.currentGene != null && !this.currentGene.equals(gene))
+ {
+ createFeature(GENE, currentGene, geneStart);
+ }
+ if (this.currentDomain != null
+ && (domainEnded || !this.currentDomain.equals(domain)))
+ {
+ createFeature(DOMAIN, currentDomain, domainStart);
+ }
/*
- * If the feature name precedes all sequences, we will know in
- * endSequenceFeature that it starts with residue 1; otherwise note now
- * where it starts in each sequence
+ * and if we have declared a Gene or Domain which does not continue the
+ * current one, then record its start positions per sequence
+ * (a statement that ends a domain does not start one)
+ */
+ if (gene != null && !gene.equals(currentGene))
+ {
+ startSequenceFeature(geneStart);
+ }
+ if (domain != null && !domainEnded && !domain.equals(currentDomain))
+ {
+ startSequenceFeature(domainStart);
+ }
+
+ currentGene = gene;
+ /*
+ * after domainend there is no current Domain; retaining its name here
+ * would cause a duplicate feature when features are next closed off
+ */
+ currentDomain = domainEnded ? null : domain;
+ }
+
+ /**
+ * Start processing a new feature, by recording in the given map the residue
+ * position at which it starts in each sequence. Nothing is recorded if no
+ * residues have been read yet.
+ *
+ * @param startPositions
+ * map of start residue by sequence id, updated by this method; each
+ * value stored is one more than the value of the entry visited.
+ * NOTE(review): the loop header is not visible in this view -
+ * presumably the entries visited are those of residuesRead; confirm
+ */
+ protected void startSequenceFeature(Map<String, Integer> startPositions)
+ {
+ /*
+ * If the feature declaration precedes all sequences, we will know in
+ * createFeature that it started with residue 1; otherwise note now where it
+ * starts in each sequence
*/
if (!residuesRead.isEmpty())
{
{
String seqId = entry.getKey();
Integer nextResidue = entry.getValue() + 1;
- featureStart.put(seqId, nextResidue);
+ startPositions.put(seqId, nextResidue);
}
}
}
/**
- * Add a SequenceFeature for the current feature to each sequence, using the
- * current feature start/end values per sequence
+ * Add a SequenceFeature to each sequence, using the given start/end values
+ * per sequence
+ * <p>
+ * Does nothing if featureValue is null. Otherwise, for each sequence id in
+ * seqData, the feature starts at the residue held in featureStartResidues
+ * (or at residue 1 if there is no entry for that sequence) and ends at the
+ * count of residues read so far for that sequence. No feature is added for
+ * a sequence whose end position is less than its start position.
+ * <p>
+ * NOTE(review): residuesRead.get(seqid) is unboxed to int, so this assumes
+ * that every sequence id in seqData also has an entry in residuesRead (and
+ * a list in sequenceFeatures) - confirm against the code that populates
+ * them.
+ *
+ * @param featureType
+ * the type of feature (callers in this file pass GENE or DOMAIN)
+ * @param featureValue
+ * the name of the Gene or Domain, or null if there is none to add
+ * @param featureStartResidues
+ * start residue (base 1) by sequence id for the feature
*/
- protected void endSequenceFeature()
+ protected void createFeature(String featureType, String featureValue,
+ Map<String, Integer> featureStartResidues)
{
+ if (featureValue == null)
+ {
+ return;
+ }
+
Iterator<String> seqids = this.seqData.keySet().iterator();
while (seqids.hasNext())
{
String seqid = seqids.next();
- Integer startAt = featureStart.get(seqid);
+ Integer startAt = featureStartResidues.get(seqid);
int sfstart = startAt == null ? 1 : startAt.intValue();
int sfend = residuesRead.get(seqid);
if (sfend >= sfstart)
/*
* don't add feature if entirely gapped in the sequence
*/
- SequenceFeature sf = new SequenceFeature(currentFeature,
- currentFeatureType, sfstart, sfend, 0f, null);
+ // TODO: type="Gene" (but then all coloured the same) or
+ // type="GeneName"?
+ SequenceFeature sf = new SequenceFeature(featureValue, featureType,
+ sfstart, sfend, 0f, null);
sequenceFeatures.get(seqid).add(sf);
}
}
}
/**
- * Parse a !Domain command line
- *
- * @param dataLine
- */
- private void parseDomain(String dataLine)
- {
- }
-
- /**
* Returns the next line that is not a comment, or null at end of file.
* Comments in MEGA are within [ ] brackets, and may be nested.
*
{
return;
}
- String[] tokens = inputLine.trim().split("\\s"); // any whitespace
+ String[] tokens = inputLine.trim().split(WHITESPACE);
for (String token : tokens)
{
parseFormatKeyword(token);
System.err.println("Warning: " + token + " not supported");
}
- else if (keyword.equalsIgnoreCase("Property"))
+ else if (keyword.equalsIgnoreCase(PROPERTY))
{
- // TODO: figure out what to do with this
- // can it appear more than once in a file?
- setAlignmentProperty(PROP_MISSING, value);
+ // TODO: can Property appear in a Format command?
+ // suspect this is a mistake in the manual
}
else if (!keyword.equalsIgnoreCase(N_SEQS)
+ && !keyword.equalsIgnoreCase("NTaxa")
&& !keyword.equalsIgnoreCase(N_SITES))
{
System.err.println("Warning: " + msg);