/*
* Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
* Copyright (C) 2014 The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.io;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
public class MegaFile extends AlignFile
{
/*
* Simple file format as at
* http://www.hiv.lanl.gov/content/sequence/HelpDocs/SEQsamples.html
*
* Fancy file format as at
* http://primerdigital.com/fastpcr/images/Drosophila_Adh.txt
*/
/**
 * The two layouts of MEGA file that this class tells apart.
 */
public enum FileFormat
{
  /** assumed by parse() when no header line starting with '!' was seen */
  SIMPLE,

  /** selected by parseHeaderLines() as soon as a line starts with '!' */
  FANCY
}
// Single-character tokens of the file syntax.
// '#' starts the #MEGA marker and every sequence id (see getSequenceId)
private static final String HASHSIGN = "#"; // TODO: public constants file
// optional suffix accepted after a property name, e.g. "TITLE:"
private static final String COLON = ":";
// prefix of header lines; seeing one makes parseHeaderLines select FANCY
private static final String BANG = "!";
// one of the two name/value separators handled by parsePropertyValue
private static final String EQUALS = "=";
// "#MEGA": skipped (case-insensitively) while reading headers and written
// as the first line of output by printHeaders
private static final String MEGA_ID = HASHSIGN + "MEGA";
// Property names. TITLE, Format, Description and Gene are the ones tested
// by isPropertyLine.
public static final String PROP_TITLE = "TITLE";
public static final String PROP_FORMAT = "Format";
public static final String PROP_DESCRIPTION = "Description";
public static final String PROP_GENE = "Gene";
// NOTE(review): no use of this constant is visible in the methods shown
// alongside it - confirm where it is read
public static final String PROP_INTERLEAVED = "Interleaved";
// initial size for sequence data buffer (see getSequenceDataBuffer)
private static final int SEQBUFFERSIZE = 256;
// the other name/value separator; also removed from interleaved data lines
private static final String SPACE = " ";
// number of sequence positions written per row by printInterleaved
private static final int POSITIONS_PER_LINE = 50;
// this can be True, False or null (meaning we don't know yet);
// print(SequenceI[]) writes non-interleaved output only when it is False
private Boolean interleaved;
// set once we have seen one block of interleaved data: parse() sets it on
// the first blank line met after some data has been stored, and
// parseInterleavedDataLine then rejects sequence ids not seen before
private boolean firstDataBlockRead = false;
// null until a format has been chosen; parse() falls back to SIMPLE
private FileFormat fileFormat;
/**
 * No-argument constructor. It does nothing beyond invoking the superclass
 * no-argument constructor, so all fields keep their declared defaults.
 */
public MegaFile()
{
  super();
}
/**
 * Constructor that hands both arguments straight to the superclass
 * constructor and does nothing else itself.
 *
 * @param inFile
 *          passed unchanged to the superclass (presumably the name or
 *          content of the input - confirm against AlignFile)
 * @param type
 *          passed unchanged to the superclass (presumably says how
 *          {@code inFile} is to be accessed - confirm against AlignFile)
 * @throws IOException
 *           if the superclass constructor throws it
 */
public MegaFile(String inFile, String type) throws IOException
{
  super(inFile, type);
}
/**
 * Constructor that hands the given source straight to the superclass
 * constructor and does nothing else itself.
 *
 * @param source
 *          passed unchanged to the superclass (presumably an already opened
 *          input - confirm against AlignFile)
 * @throws IOException
 *           if the superclass constructor throws it
 */
public MegaFile(FileParse source) throws IOException
{
  super(source);
}
/**
 * Parse the input stream: read the header lines, accumulate every remaining
 * line of sequence data keyed by sequence id, then convert the accumulated
 * data to sequence objects.
 *
 * @throws IOException
 *           if a line cannot be read or a data line is rejected by the
 *           data-line parsers
 */
@Override
public void parse() throws IOException
{
  /*
   * Read MEGA and Title/Format/Description/Gene headers if present. These are
   * saved as alignment properties. Returns the first sequence data line.
   */
  String dataLine = parseHeaderLines();

  /*
   * If we didn't positively identify as 'fancy format', assume 'simple
   * format'
   */
  if (this.fileFormat == null)
  {
    setFileFormat(FileFormat.SIMPLE);
  }

  /*
   * Temporary store of {sequenceId, positionData} while parsing. A
   * LinkedHashMap, so sequences are later stored in the order in which
   * their ids were first met.
   */
  Map<String, StringBuilder> seqData = new LinkedHashMap<String, StringBuilder>();

  /*
   * The id of the sequence being read (for non-interleaved)
   */
  String currentId = "";

  while (dataLine != null)
  {
    dataLine = dataLine.trim();
    if (dataLine.length() > 0)
    {
      currentId = parseDataLine(dataLine, seqData, currentId);
    }
    else if (!seqData.isEmpty())
    {
      /*
       * Blank line after processing some data: the first block is complete
       */
      this.firstDataBlockRead = true;
    }
    dataLine = nextLine();
  }

  setSequences(seqData);
}
/**
 * Convert the parsed sequence strings to objects and store them in the model.
 * One {@code Sequence} is created per map entry, in the map's iteration
 * order, and appended to {@code seqs}.
 *
 * @param seqData
 *          sequence characters keyed by sequence id; a null map is treated as
 *          holding no sequences
 */
protected void setSequences(Map<String, StringBuilder> seqData)
{
  if (seqData == null)
  {
    return;
  }
  for (Entry<String, StringBuilder> dataset : seqData.entrySet())
  {
    String sequenceId = dataset.getKey();
    String characters = dataset.getValue().toString();
    SequenceI s = new Sequence(sequenceId, characters);
    this.seqs.addElement(s);
  }
}
/**
 * Process one line of sequence data. If it has no sequence identifier, append
 * to the current id's sequence. Else parse out the sequence id and append the
 * data (if any) to that id's sequence. Returns the sequence id (implicit or
 * explicit) for this line.
 *
 * @param dataLine
 *          the line to process
 * @param seqData
 *          sequence characters accumulated so far, keyed by sequence id
 * @param currentId
 *          the id that applies to lines carrying no id of their own
 * @return the id found on this line, or {@code currentId} if it has none
 * @throws IOException
 *           if either data-line parser rejects the line
 */
protected String parseDataLine(String dataLine,
        Map<String, StringBuilder> seqData, String currentId)
        throws IOException
{
  String seqId = getSequenceId(dataLine);
  if (seqId == null)
  {
    /*
     * Just character data
     */
    parseNoninterleavedDataLine(dataLine, seqData, currentId);
    return currentId;
  }

  /*
   * A line that is nothing but '#' + id is the header line for
   * noninterleaved data; there is nothing to append for it.
   */
  boolean idOnly = (HASHSIGN + seqId).trim().equals(dataLine.trim());
  if (!idOnly)
  {
    /*
     * Sequence id followed by data
     */
    parseInterleavedDataLine(dataLine, seqData, seqId);
  }
  return seqId;
}
/**
 * Add a line of sequence data to the buffer for the given sequence id. Start
 * a new one if we haven't seen it before.
 *
 * @param dataLine
 *          a line holding sequence characters only
 * @param seqData
 *          sequence characters accumulated so far, keyed by sequence id
 * @param currentId
 *          the id of the sequence the data belongs to
 * @throws IOException
 *           if no sequence id is in force (null or empty
 *           {@code currentId}), or if the interleaved check rejects the line
 */
protected void parseNoninterleavedDataLine(String dataLine,
        Map<String, StringBuilder> seqData, String currentId)
        throws IOException
{
  /*
   * Oops. Data but no sequence id context. The empty string counts as
   * 'no id' as well as null: parse() starts with an empty current id, so
   * testing for null alone would never detect data preceding the first id.
   */
  if (currentId == null || currentId.length() == 0)
  {
    throw new IOException("No sequence id context at: " + dataLine);
  }

  assertInterleaved(false, dataLine);

  /*
   * Add the current line of data to the sequence.
   */
  getSequenceDataBuffer(seqData, currentId).append(dataLine);
}
/**
 * Get the sequence data for this sequence id, starting a new one if
 * necessary. A newly created buffer is stored in the map before it is
 * returned, so later calls with the same id return the same buffer.
 *
 * @param seqData
 *          sequence characters accumulated so far, keyed by sequence id
 * @param currentId
 *          the sequence id whose buffer is wanted
 * @return the (possibly new and empty) buffer for {@code currentId}
 */
protected StringBuilder getSequenceDataBuffer(
        Map<String, StringBuilder> seqData, String currentId)
{
  StringBuilder sb = seqData.get(currentId);
  if (sb == null)
  {
    // first data met for this sequence id, start a new buffer
    sb = new StringBuilder(SEQBUFFERSIZE);
    seqData.put(currentId, sb);
  }
  return sb;
}
/**
 * Parse one line of interleaved data e.g.
 *
 * <pre>
 * #TheSeqId CGATCGCATGCA
 * </pre>
 *
 * The characters following the id are appended to that id's buffer, with any
 * internal spaces removed.
 *
 * @param dataLine
 *          the whole line, starting with '#' and the sequence id
 * @param seqData
 *          sequence characters accumulated so far, keyed by sequence id
 * @param seqId
 *          the sequence id already extracted from {@code dataLine}
 * @throws IOException
 *           if a sequence id appears for the first time after the first data
 *           block has been read, or if the interleaved check rejects the line
 */
protected void parseInterleavedDataLine(String dataLine,
        Map<String, StringBuilder> seqData, String seqId)
        throws IOException
{
  /*
   * New sequence found in second or later data block - error.
   */
  if (this.firstDataBlockRead && !seqData.containsKey(seqId))
  {
    throw new IOException(
            "Parse error: misplaced new sequence starting at " + dataLine);
  }

  // fetched before looking at the data, so the id is registered even if
  // nothing follows it on this line
  StringBuilder sb = getSequenceDataBuffer(seqData, seqId);

  // skip the '#' and the id; trim() never returns null
  String data = dataLine.substring(seqId.length() + 1).trim();

  /*
   * Do nothing if this line is _only_ a sequence id with no data following.
   */
  if (data.length() > 0)
  {
    /*
     * Remove any internal spaces (present in the 'fancy' file format)
     */
    sb.append(data.replace(SPACE, ""));
    assertInterleaved(true, dataLine);
  }
}
/**
 * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
 * identifier. Else returns null.
 *
 * The identifier runs from just after the '#' up to the first whitespace
 * character (space, tab, ...) or the end of the line, so tab-separated lines
 * are handled in the same way as space-separated ones. A '#' followed
 * directly by whitespace, or by nothing, gives an empty identifier.
 *
 * @param dataLine
 *          the line to inspect; may be null
 * @return the identifier, or null if the line is null or does not start with
 *         '#'
 */
public static String getSequenceId(String dataLine)
{
  // TODO refactor to a StringUtils type class
  if (dataLine == null || !dataLine.startsWith(HASHSIGN))
  {
    return null;
  }
  int idStart = HASHSIGN.length();
  int idEnd = idStart;
  while (idEnd < dataLine.length()
          && !Character.isWhitespace(dataLine.charAt(idEnd)))
  {
    idEnd++;
  }
  return dataLine.substring(idStart, idEnd);
}
/**
 * Consume the leading #MEGA and Title/Format/Description/Gene lines (if
 * present), skipping blank lines, and store each property found as an
 * alignment property. Any line starting with '!' marks the file as 'fancy'
 * format.
 *
 * @return the first line (trimmed) that is not blank, not a recognised
 *         property and not the #MEGA marker; null if the input ends first
 * @throws IOException
 *           if a line cannot be read
 */
protected String parseHeaderLines() throws IOException
{
  for (String line = nextLine(); line != null; line = nextLine())
  {
    line = line.trim();
    if (line.length() == 0)
    {
      // blank lines are skipped
      continue;
    }

    if (line.startsWith(BANG))
    {
      setFileFormat(FileFormat.FANCY);
    }

    if (line.startsWith(BANG + PROP_DESCRIPTION))
    {
      parseDescriptionLines();
      continue;
    }

    if (isPropertyLine(line))
    {
      // a known property: split it into name and value and save it
      String[] nameAndValue = parsePropertyValue(line);
      setAlignmentProperty(nameAndValue[0], nameAndValue[1]);
      continue;
    }

    if (!line.toUpperCase().startsWith(MEGA_ID))
    {
      // not blank, not #MEGA, not a property: the first 'data line'
      return line;
    }
  }
  return null;
}
/**
 * Collect the lines that follow a !Description line, up to (and consuming)
 * the next blank line or the end of input, and save them - each followed by
 * a newline - as the Description property.
 *
 * Text on the !Description line itself is not looked at here. If the
 * description is not terminated by a blank line, the lines after it are
 * swallowed into the description too.
 *
 * @throws IOException
 *           if a line cannot be read
 */
protected void parseDescriptionLines() throws IOException
{
  StringBuilder text = new StringBuilder(256);
  String next = nextLine();
  while (next != null && next.trim().length() > 0)
  {
    text.append(next).append(newline);
    next = nextLine();
  }
  setAlignmentProperty(PROP_DESCRIPTION, text.toString());
}
/**
 * Answers whether the line declares one of the Title, Format, Description or
 * Gene properties, allowing an optional '!' before and ':' after the name.
 *
 * @param inputLine
 *          the line to test
 * @return true if the line matches one of those property names
 */
protected boolean isPropertyLine(String inputLine)
{
  String[] knownProperties = { PROP_TITLE, PROP_FORMAT, PROP_DESCRIPTION,
      PROP_GENE };
  for (String property : knownProperties)
  {
    if (lineMatchesFlag(inputLine, property, BANG, COLON))
    {
      return true;
    }
  }
  return false;
}
/**
 * Helper method that extracts the name and value of a property, assuming the
 * first space or equals sign is the separator.
 *
 * Thus "Description: Melanogaster" or "!Description=Melanogaster" both return
 * {"Description", "Melanogaster"}. So does "Description = Melanogaster": when
 * the separator is a space and the text after it starts with an equals sign,
 * that sign and the blanks around it are dropped from the value.
 *
 * A leading '!' and a trailing ':' are removed from the property name.
 *
 * Returns an empty value string if no space or equals sign is present, and
 * two empty strings if the input is null.
 *
 * @param s
 *          the property line; may be null
 * @return a two-element array {name, value}; neither element is null
 */
public static String[] parsePropertyValue(String s)
{
  // TODO refactor to a string utils helper class (or find equivalent)
  if (s == null)
  {
    return new String[]
    { "", "" };
  }

  /*
   * the separator is whichever of space / equals occurs first
   */
  int spacePos = s.indexOf(SPACE);
  int eqPos = s.indexOf(EQUALS);
  int separatorPos;
  if (spacePos == -1)
  {
    separatorPos = eqPos;
  }
  else if (eqPos == -1)
  {
    separatorPos = spacePos;
  }
  else
  {
    separatorPos = Math.min(spacePos, eqPos);
  }

  String propertyName = s;
  String value = "";
  if (separatorPos > -1)
  {
    propertyName = s.substring(0, separatorPos);
    value = s.substring(separatorPos + 1);
    if (separatorPos == spacePos)
    {
      /*
       * "name = value": the equals sign is still at the front of the value
       */
      String rest = value.trim();
      if (rest.startsWith(EQUALS))
      {
        value = rest.substring(EQUALS.length()).trim();
      }
    }
  }

  /*
   * finally strip any leading / trailing chars from property name
   */
  if (propertyName.startsWith(BANG))
  {
    propertyName = propertyName.substring(1);
  }
  if (propertyName.endsWith(COLON))
  {
    propertyName = propertyName.substring(0, propertyName.length() - 1);
  }
  return new String[]
  { propertyName, value };
}
/**
 * Test whether a line starts with the specified flag field followed by a
 * space, an equals sign, the suffix, or nothing.
 *
 * Here we accept an optional prefix and suffix on the flag, and the check is
 * not case-sensitive. So with prefix "!" and suffix ":" these would all match
 * for "Title":
 *
 * <pre>
 * Title
 * !Title
 * title: My alignment
 * !TITLE=My alignment
 * Title:
 * </pre>
 *
 * Leading and trailing whitespace on the line is ignored. Anything may follow
 * the suffix, so "Title:x" matches as well. Case is compared character by
 * character, so the result does not depend on the default locale.
 *
 * @param line
 *          the line to test; null never matches
 * @param flag
 *          the flag (property name) to look for; null never matches
 * @param prefix
 *          text that may precede the flag; null or empty means no prefix
 * @param suffix
 *          text that may directly follow the flag; null means no suffix
 * @return true if the line starts with the flag as described
 */
public static boolean lineMatchesFlag(String line, String flag, String prefix, String suffix)
{
  // TODO refactor to a string utils helper class
  if (line == null || flag == null)
  {
    return false;
  }
  String text = line.trim();

  // skip the whole prefix e.g. ! (whatever its length) before matching
  int flagStart = 0;
  if (prefix != null && prefix.length() > 0
          && text.regionMatches(true, 0, prefix, 0, prefix.length()))
  {
    flagStart = prefix.length();
  }

  if (!text.regionMatches(true, flagStart, flag, 0, flag.length()))
  {
    return false;
  }

  // what follows the flag decides: nothing, SPACE, EQUALS or the suffix
  String rest = text.substring(flagStart + flag.length());
  if (rest.length() == 0 || rest.startsWith(SPACE)
          || rest.startsWith(EQUALS))
  {
    return true;
  }
  return suffix != null
          && rest.regionMatches(true, 0, suffix, 0, suffix.length());
}
/**
 * Render the sequences returned by {@code getSeqsAsArray()} as Mega format
 * text, by delegating to {@link #print(SequenceI[])}.
 *
 * @return the formatted text
 */
@Override
public String print()
{
  SequenceI[] sequences = getSeqsAsArray();
  return print(sequences);
}
/**
 * Render the given sequences as Mega format text. The layout is chosen as
 * follows: the codon layout if the file format is 'fancy'; otherwise the
 * non-interleaved layout if the interleaved flag is known to be false;
 * otherwise (flag true or unknown) the interleaved layout.
 *
 * @param s
 *          the sequences to write
 * @return the formatted text
 */
public String print(SequenceI[] s)
{
  // TODO: is there a way to preserve the 'interleaved' property so it can
  // affect output?
  if (this.fileFormat == FileFormat.FANCY)
  {
    return printInterleavedCodons(s);
  }
  if (Boolean.FALSE.equals(this.interleaved))
  {
    return printNonInterleaved(s);
  }
  return printInterleaved(s);
}
/**
 * Intended to print the sequences in interleaved format with each row holding
 * 15 space-separated triplets. That layout is not implemented yet, so the
 * output is currently exactly that of {@link #printInterleaved(SequenceI[])}.
 *
 * @param s
 *          the sequences to write
 * @return the formatted text
 */
protected String printInterleavedCodons(SequenceI[] s)
{
  // TODO not coded yet - defaulting to the 'simple' format output
  String simpleOutput = printInterleaved(s);
  return simpleOutput;
}
/**
 * Print to string in Interleaved format - blocks of next 50 characters of
 * each sequence in turn.
 *
 * The headers come first. Each block is then preceded by a blank line and
 * holds one row per sequence: '#', the sequence name left-justified and
 * padded to the longest name length, a space, and the next
 * POSITIONS_PER_LINE positions of that sequence.
 *
 * @param s
 *          the sequences to write
 * @return the formatted text
 */
protected String printInterleaved(SequenceI[] s)
{
  int maxIdLength = getMaxIdLength(s);
  int maxSequenceLength = getMaxSequenceLength(s);
  int numLines = maxSequenceLength / POSITIONS_PER_LINE + 3; // approx

  /*
   * Size a buffer to hold the output (an estimate; the buffer grows as
   * needed)
   */
  StringBuilder sb = new StringBuilder(numLines
          * (maxIdLength + 2 + POSITIONS_PER_LINE));
  printHeaders(sb, FileFormat.SIMPLE);

  /*
   * Row prefix format, built once for all rows. A width of 0 is not a legal
   * format width ("%-0s" makes String.format throw), so no padding is
   * requested when the longest name is empty.
   */
  String idFormat = maxIdLength > 0 ? "#%-" + maxIdLength + "s " : "#%s ";

  int numDataBlocks = (maxSequenceLength - 1) / POSITIONS_PER_LINE + 1;
  for (int i = 0; i < numDataBlocks; i++)
  {
    int blockStart = i * POSITIONS_PER_LINE;
    int blockEnd = blockStart + POSITIONS_PER_LINE;
    sb.append(newline);
    for (SequenceI seq : s)
    {
      sb.append(String.format(idFormat, seq.getName()));
      sb.append(seq.getSequence(blockStart, blockEnd));
      sb.append(newline);
    }
  }
  return sb.toString();
}
/**
* Append the MEGA header and any other known properties
*
* @param sb
*/
private void printHeaders(StringBuilder sb, FileFormat format)
{
sb.append(MEGA_ID);
sb.append(newline);
/*
*
*/
Set> props = getAlignmentProperties();
if (props != null)
{
for (Entry