*/
public class MegaFile extends AlignFile
{
+ /*
+ * number of sequence positions per output line, used by print(AlignmentI)
+ * when the alignment has no line length property
+ */
+ private static final int DEFAULT_LINE_LENGTH = 60;
+
+ // written at the start of each line of !Format settings on output
+ private static final String INDENT = " ";
+
+ /*
+ * keywords of the !Format statement; matched ignoring case when parsing,
+ * and written as declared here on output
+ */
+ private static final String N_SITES = "NSites";
+
+ private static final String N_SEQS = "NSeqs";
+
+ private static final String MISSING = "Missing";
+
+ private static final String IDENTICAL = "Identical";
+
+ private static final String INDEL = "Indel";
+
+ private static final String CODETABLE = "CodeTable";
+
+ // values written for the DataType keyword when the alignment has none saved
+ private static final String PROTEIN = "Protein";
+
+ private static final String NUCLEOTIDE = "Nucleotide";
+
+ // !Format keyword whose value says whether the data is nucleotide or protein
+ private static final String DATATYPE = "DataType";
+
private static final char COMMENT_START = '[';
private static final char COMMENT_END = ']';
private static final String MEGA_ID = HASHSIGN + "MEGA";
- private static final String TITLE = "TITLE";
+ private static final String TITLE = "Title";
private static final String FORMAT = "Format";
private static final String DOMAIN = "Domain";
- private static final String INTERLEAVED = "Interleaved";
-
/*
* names of properties to save to the alignment (may affect eventual output
* format)
static final String PROP_MISSING = "MEGA_MISSING";
+ static final String PROP_DATATYPE = "MEGA_DATATYPE";
+
+ // number of bases per line of file (value is inferred)
+ static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
+
// TODO: need a controlled name for Gene as a feature if we want to be able to
// output the MEGA file with !Gene headers
// WTF do we do if the sequences get realigned?
private static final String SPACE = " ";
- private static final int POSITIONS_PER_LINE = 50;
+ /*
+ * number of sequence positions output per line; declared without an
+ * initialiser, so it is zero until set through setPositionsPerLine (called
+ * as data lines are read) or assigned in print(AlignmentI)
+ */
+ private int positionsPerLine;
private String title;
dataLine = nextNonCommentLine();
}
+ // remember the (longest) line length read in, so we can output the same
+ setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
+
setSequences(seqData);
}
* Add the current line of data to the sequence.
*/
sb.append(dataLine);
+
+ setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
}
/**
/*
* Do nothing if this line is _only_ a sequence id with no data following.
*
- * Remove any internal spaces (present in the 'fancy' file format)
+ * Remove any internal spaces
*/
if (data != null && data.length() > 0)
{
data = data.replace(SPACE, "");
}
sb.append(data);
+ setPositionsPerLine(Math.max(positionsPerLine, data.length()));
assertInterleaved(true, dataLine);
}
}
if (isTitle(inputLine))
{
- setAlignmentProperty(PROP_TITLE, getValue(inputLine));
+ this.title = getValue(inputLine);
+ setAlignmentProperty(PROP_TITLE, title);
}
else if (inputLine.startsWith(BANG + DESCRIPTION))
{
{
inputLine = inputLine.substring(0, inputLine.length() - 1);
}
+ if (inputLine.length() == 0)
+ {
+ return;
+ }
String[] tokens = inputLine.trim().split("\\s"); // any whitespace
for (String token : tokens)
{
/*
* Jalview will work out whether nucleotide or not anyway
*/
- if (keyword.equalsIgnoreCase("DataType"))
+ if (keyword.equalsIgnoreCase(DATATYPE))
{
if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
|| value.equalsIgnoreCase("Nucleotide"))
this.nucleotide = true;
// alignment computes whether or not it is nucleotide when created
}
- else if (value.equalsIgnoreCase("Protein"))
+ else if (value.equalsIgnoreCase(PROTEIN))
{
this.nucleotide = false;
}
{
throw new FileFormatException(msg);
}
+ setAlignmentProperty(PROP_DATATYPE, value);
}
/*
* accept non-Standard code table but save in case we want to disable
* 'translate as cDNA'
*/
- else if (keyword.equalsIgnoreCase("CodeTable"))
+ else if (keyword.equalsIgnoreCase(CODETABLE))
{
setAlignmentProperty(PROP_CODETABLE, value);
}
/*
* save gap char to set later on alignment once created
*/
- else if (keyword.equalsIgnoreCase("Indel"))
+ else if (keyword.equalsIgnoreCase(INDEL))
{
this.gapCharacter = value.charAt(0);
}
- else if (keyword.equalsIgnoreCase("Identical")
+ else if (keyword.equalsIgnoreCase(IDENTICAL)
|| keyword.equalsIgnoreCase("MatchChar"))
{
+ setAlignmentProperty(PROP_IDENTITY, value);
if (!".".equals(value))
{
- setAlignmentProperty(PROP_IDENTITY, value);
System.err.println("Warning: " + token
+ " not supported, Jalview uses '.' for identity");
}
}
- else if (keyword.equalsIgnoreCase("Missing"))
+ else if (keyword.equalsIgnoreCase(MISSING))
{
setAlignmentProperty(PROP_MISSING, value);
System.err.println("Warning: " + token + " not supported");
setAlignmentProperty(PROP_MISSING, value);
}
- else if (!keyword.equalsIgnoreCase("NSeqs")
- && !keyword.equalsIgnoreCase("NSites"))
+ else if (!keyword.equalsIgnoreCase(N_SEQS)
+ && !keyword.equalsIgnoreCase(N_SITES))
{
System.err.println("Warning: " + msg);
}
return false;
}
String upper = inputLine.toUpperCase();
- return (upper.startsWith(TITLE) || upper.startsWith(BANG + TITLE));
+ return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
+ + TITLE.toUpperCase()));
}
/**
{
if (line.endsWith(SEMICOLON))
{
- desc.append(line.substring(0, line.length() - 1)).append(newline);
+ desc.append(line.substring(0, line.length() - 1));
break;
}
else if (line.length() > 0)
}
/**
- * Write out the alignment sequences in Mega format.
+ * Returns the alignment sequences in Mega format, preceded by the MEGA
+ * identifier line. If no positive line length has been set (for example
+ * nothing was parsed, or no data lines were read), the default line length
+ * is used, because the sequence formatters divide by the line length.
+ *
+ * @return the formatted file content
*/
@Override
public String print()
{
- return print(getSeqsAsArray());
+ if (positionsPerLine <= 0)
+ {
+ // avoid a divide by zero when laying out the sequence lines
+ positionsPerLine = DEFAULT_LINE_LENGTH;
+ }
+ return MEGA_ID + newline + print(getSeqsAsArray());
}
/**
* Write out the alignment sequences in Mega format - interleaved unless
* explicitly noninterleaved.
*/
- public String print(SequenceI[] s)
+ protected String print(SequenceI[] s)
{
- // TODO: is there a way to preserve the 'interleaved' property so it can
- // affect output?
-
- String result = null;
+ String result;
if (this.interleaved != null && !this.interleaved)
{
result = printNonInterleaved(s);
}
/**
- * Print the sequences in interleaved format, each row 15 space-separated
- * triplets.
- *
- * @param s
- * @return
- */
- protected String printInterleavedCodons(SequenceI[] s)
- {
- // TODO not coded yet - defaulting to the 'simple' format output
- return printInterleaved(s);
- }
-
- /**
- * Print to string in Interleaved format - blocks of next 50 characters of
- * each sequence in turn.
+ * Print to string in Interleaved format - blocks of next N characters of each
+ * sequence in turn.
*
* @param s
*/
{
int maxIdLength = getMaxIdLength(s);
int maxSequenceLength = getMaxSequenceLength(s);
- int numLines = maxSequenceLength / POSITIONS_PER_LINE + 3; // approx
+ int numLines = maxSequenceLength / positionsPerLine + 3; // approx
/*
* Size a buffer to hold the whole output
*/
StringBuilder sb = new StringBuilder(numLines
- * (maxIdLength + 2 + POSITIONS_PER_LINE));
- printHeaders(sb);
+ * (maxIdLength + 2 + positionsPerLine));
+
+ int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
+ int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
+ int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
- int numDataBlocks = (maxSequenceLength - 1) / POSITIONS_PER_LINE + 1;
+ /*
+ * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
+ */
+ int from = 0;
for (int i = 0; i < numDataBlocks; i++)
{
sb.append(newline);
+ boolean first = true;
+ int advancedBy = 0;
for (SequenceI seq : s)
{
-
- String seqId = String.format("#%-" + maxIdLength + "s ",
+ int seqFrom = from;
+ String seqId = String.format("#%-" + maxIdLength + "s",
seq.getName());
- char[] subSequence = seq.getSequence(i * POSITIONS_PER_LINE,
- (i + 1) * POSITIONS_PER_LINE);
+
+ /*
+ * output next line for this sequence
+ */
sb.append(seqId);
- sb.append(subSequence);
+ int lastPos = seqFrom + positionsPerLine; // exclusive
+ for (int j = 0; j < chunksPerLine; j++)
+ {
+ char[] subSequence = seq.getSequence(seqFrom,
+ Math.min(lastPos, seqFrom + spaceEvery));
+ if (subSequence.length > 0)
+ {
+ sb.append(SPACE).append(subSequence);
+ }
+ seqFrom += subSequence.length;
+ if (first)
+ {
+ // all sequences should be the same length in MEGA
+ advancedBy += subSequence.length;
+ }
+ }
sb.append(newline);
+ first = false;
}
+ from += advancedBy;
}
return new String(sb);
}
/**
- * Append the MEGA header and any other known properties
+ * Outputs to string the MEGA header and any other known and relevant
+ * alignment properties
+ * <p>
+ * The output is the MEGA identifier line, then the title and description
+ * statements (each only if the property is set on the alignment), then a
+ * !Format statement listing DataType, CodeTable, NSeqs, NSites, Indel and,
+ * where set on the alignment, Identical and Missing.
*
- * @param sb
+ * @param al
+ * the alignment whose properties and dimensions are reported
+ * @return the header text, ending with a newline
*/
- private void printHeaders(StringBuilder sb)
+ protected String printHeaders(AlignmentI al)
{
- sb.append(MEGA_ID);
- sb.append(newline);
+ StringBuilder sb = new StringBuilder(128);
+ sb.append(MEGA_ID).append(newline);
+ printProperty(al, sb, PROP_TITLE, TITLE);
+ printProperty(al, sb, PROP_DESCRIPTION, DESCRIPTION);
- String ttle = getAlignmentProperty(PROP_TITLE);
- if (ttle != null)
+ /*
+ * !Format DataType CodeTable
+ * DataType is the value saved on the alignment if any, else derived from
+ * al.isNucleotide(); CodeTable is written as "Standard" if none was saved
+ */
+ sb.append(BANG).append(FORMAT).append(newline);
+ String dataType = (String) al.getProperty(PROP_DATATYPE);
+ if (dataType == null)
{
- sb.append(BANG).append(TITLE).append(SPACE).append(ttle)
- .append(SEMICOLON).append(newline);
+ dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
}
+ sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
+ String codeTable = (String) al.getProperty(PROP_CODETABLE);
+ sb.append(SPACE).append(CODETABLE).append(EQUALS)
+ .append(codeTable == null ? "Standard" : codeTable)
+ .append(newline);
+
+ /*
+ * !Format NSeqs NSites
+ * NSeqs is the alignment height. NSites is computed from the first
+ * sequence only, as its end position minus its start position plus one.
+ * NOTE(review): presumably that value excludes gaps - confirm whether MEGA
+ * expects the gapped (aligned) width here instead.
+ * NOTE(review): assumes the alignment holds at least one sequence.
+ */
+ sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
+ SequenceI seq = al.getSequenceAt(0);
+ sb.append(SPACE).append(N_SITES).append(EQUALS)
+ .append(seq.getEnd() - seq.getStart() + 1);
+ sb.append(newline);
- String desc = getAlignmentProperty(PROP_DESCRIPTION);
- if (desc != null)
+ /*
+ * !Format Indel Identical Missing
+ * Indel is the alignment's gap character; Identical and Missing are only
+ * written if saved as properties on the alignment
+ */
+ sb.append(INDENT);
+ sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
+ String identity = (String) al.getProperty(PROP_IDENTITY);
+ if (identity != null)
+ {
+ sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
+ }
+ String missing = (String) al.getProperty(PROP_MISSING);
+ if (missing != null)
{
- sb.append(BANG).append(DESCRIPTION).append(SPACE).append(desc)
- .append(SEMICOLON).append(newline);
+ sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
}
+ // the semicolon terminates the whole !Format statement
+ sb.append(SEMICOLON).append(newline);
+
+ return sb.toString();
}
/**
{
int maxSequenceLength = getMaxSequenceLength(s);
// approx
- int numLines = maxSequenceLength / POSITIONS_PER_LINE + 2 + s.length;
+ int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
/*
* Roughly size a buffer to hold the whole output
*/
- StringBuilder sb = new StringBuilder(numLines * POSITIONS_PER_LINE);
- printHeaders(sb);
+ StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
+ int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
+ int chunksPerLine = positionsPerLine / spaceEvery;
for (SequenceI seq : s)
{
sb.append(newline);
sb.append(HASHSIGN + seq.getName()).append(newline);
int startPos = 0;
- while (startPos <= seq.getLength())
+ while (startPos < seq.getLength())
{
- char[] subSequence = seq.getSequence(startPos, startPos
- + POSITIONS_PER_LINE);
- sb.append(subSequence);
+ boolean firstChunk = true;
+ /*
+ * print next line for this sequence
+ */
+ int lastPos = startPos + positionsPerLine; // exclusive
+ for (int j = 0; j < chunksPerLine; j++)
+ {
+ char[] subSequence = seq.getSequence(startPos,
+ Math.min(lastPos, startPos + positionsPerLine));
+ if (subSequence.length > 0)
+ {
+ if (!firstChunk)
+ {
+ sb.append(SPACE);
+ }
+ sb.append(subSequence);
+ firstChunk = false;
+ }
+ startPos += subSequence.length;
+ }
sb.append(newline);
- startPos += POSITIONS_PER_LINE;
}
}
* @throws IOException
*/
protected void assertInterleaved(boolean isIt, String dataLine)
- throws IOException
+ throws FileFormatException
{
+ // once a layout has been recorded, a data line of the other kind is an error
if (this.interleaved != null && isIt != this.interleaved.booleanValue())
{
- throw new IOException(
+ throw new FileFormatException(
"Parse error: mix of interleaved and noninterleaved detected, at line: "
+ dataLine);
}
- this.interleaved = new Boolean(isIt);
+ // Boolean.valueOf replaces the deprecated Boolean(boolean) constructor
+ this.interleaved = Boolean.valueOf(isIt);
+ // save the layout as an alignment property
+ setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
}
public boolean isInterleaved()
+ (nucleotide ? " not" : ""));
}
}
+
+ /**
+ * Print the given alignment in MEGA format. If the alignment was created by
+ * parsing a MEGA file, it should have properties set (e.g. Title) which can
+ * influence the output.
+ * <p>
+ * The line length is taken from the alignment's line length property if
+ * that holds a positive integer. Otherwise (property absent, not numeric, or
+ * not positive) the default line length is used, since the sequence
+ * formatters divide by the line length.
+ *
+ * @param al
+ * the alignment to output
+ * @return the headers followed by the formatted sequence data
+ */
+ @Override
+ public String print(AlignmentI al)
+ {
+ this.nucleotide = al.isNucleotide();
+ this.positionsPerLine = DEFAULT_LINE_LENGTH;
+ Object lineLength = al.getProperty(PROP_LINELENGTH);
+ if (lineLength != null)
+ {
+ try
+ {
+ int parsed = Integer.parseInt(lineLength.toString().trim());
+ if (parsed > 0)
+ {
+ // zero is what gets saved when no data lines were read; it cannot
+ // be used to lay out lines
+ this.positionsPerLine = parsed;
+ }
+ } catch (NumberFormatException ignored)
+ {
+ // property is not a number: keep the default line length
+ }
+ }
+ return printHeaders(al) + print(al.getSequencesArray());
+ }
+
+ /**
+ * Appends one header statement for an alignment property to the buffer: the
+ * BANG marker, the keyword, a space, the property value and the SEMICOLON
+ * terminator, followed by a newline. Appends nothing if the alignment has no
+ * value for the property.
+ *
+ * @param al
+ * the alignment to read the property from
+ * @param headers
+ * the buffer to append to
+ * @param propertyName
+ * the key under which the property is held on the alignment
+ * @param propertyKeyword
+ * the keyword to write for the property
+ */
+ protected void printProperty(AlignmentI al, StringBuilder headers,
+ String propertyName, String propertyKeyword)
+ {
+ final String text = (String) al.getProperty(propertyName);
+ if (text == null)
+ {
+ // property not set: no statement is written
+ return;
+ }
+ headers.append(BANG);
+ headers.append(propertyKeyword);
+ headers.append(SPACE);
+ headers.append(text);
+ headers.append(SEMICOLON);
+ headers.append(newline);
+ }
+
+ /**
+ * Answers how many sequence positions are output on each line.
+ *
+ * @return the current positions-per-line setting
+ */
+ public int getPositionsPerLine()
+ {
+ return this.positionsPerLine;
+ }
+
+ /**
+ * Records how many sequence positions to output on each line. On output the
+ * positions are grouped in blocks of 3 (nucleotide) or 10 (peptide). The
+ * value is stored as given, without validation.
+ *
+ * @param p
+ * the number of positions per line
+ */
+ public void setPositionsPerLine(int p)
+ {
+ positionsPerLine = p;
+ }
}