/**
*
*/
package jalview.io;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
import java.io.IOException;
/**
 * Parser and exporter for PHYLIP file format, as defined in the PHYLIP
 * documentation. The parser imports PHYLIP files in both sequential and
 * interleaved format, and (currently) exports in interleaved format (using 60
 * sequence characters per matrix).
 * <p>
 * The following assumptions have been made for input:
 * <ul>
 * <li>Sequences are expressed as letters, not as real numbers with decimal
 * points separated by blanks (which is a valid option according to the
 * specification).</li>
 * </ul>
 * <p>
 * The following assumptions have been made for output:
 * <ul>
 * <li>Interleaved format is used, with each matrix consisting of 60
 * characters;</li>
 * <li>a blank line is added between each matrix;</li>
 * <li>no spacing is added between the sequence characters.</li>
 * </ul>
 *
 * @author David Corsar
 */
public class PhylipFile extends AlignFile
{
  // Define file extension and description to save repeating it elsewhere
  public static final String FILE_EXT = "phy";

  public static final String FILE_DESC = "PHYLIP";

  /** Width of the name field at the start of each row of the first matrix. */
  private static final int NAME_LENGTH = 10;

  /** Number of sequence characters written per row of interleaved output. */
  private static final int INTERLEAVED_COLUMNS = 60;

  /** Characters that are rejected when they appear in a species name. */
  private static final char[] INVALID_NAME_CHARACTERS = { '(', ')', '[',
      ']', ':', ';', ',' };

  /**
   * Creates an instance by delegating to the no-argument superclass
   * constructor.
   *
   * @see AlignFile
   */
  public PhylipFile()
  {
    super();
  }

  /**
   * Creates an instance by delegating to the superclass constructor that
   * accepts a {@link FileParse} source.
   *
   * @param source
   *          the source handed to the superclass
   * @throws IOException
   *           if the superclass constructor throws it
   * @see AlignFile
   */
  public PhylipFile(FileParse source) throws IOException
  {
    super(source);
  }

  /**
   * Creates an instance by delegating to the superclass constructor that
   * accepts an input file and a type, both given as strings.
   *
   * @param inFile
   *          passed unchanged to the superclass
   * @param type
   *          passed unchanged to the superclass
   * @throws IOException
   *           if the superclass constructor throws it
   * @see AlignFile
   */
  public PhylipFile(String inFile, String type) throws IOException
  {
    super(inFile, type);
  }

  /**
   * Parses the input source.
   * <p>
   * The first line must hold the number of species and the number of
   * characters, separated by whitespace. The next <i>number of species</i>
   * lines form the first data matrix: each starts with a name that is either
   * 10 characters wide or terminated early by a tab, followed by sequence
   * characters. If the first sequence is not yet complete after that matrix,
   * the file is treated as interleaved and every remaining non-blank line is
   * appended to the species in rotation. Each completed sequence is added to
   * {@code seqs}.
   * <p>
   * Note: a sequential file is only recognised when each sequence sits on a
   * single line, because the first matrix is read as exactly one line per
   * species.
   * <p>
   * Any {@link IOException} is reported on {@code System.err} and then
   * rethrown.
   *
   * @throws IOException
   *           if the header is missing or not numeric, the file ends before
   *           the first matrix is complete, a name contains an illegal
   *           character, or a sequence does not have the declared length
   * @see AlignFile
   */
  @Override
  public void parse() throws IOException
  {
    try
    {
      // First line should contain number of species and number of
      // characters, separated by blanks. A null line (empty input) is
      // reported through the same error as a malformed header.
      String line = nextLine();
      String[] lineElements = (line == null) ? new String[0] : line.trim()
              .split("\\s+");
      if (lineElements.length < 2)
      {
        throw new IOException(
                "First line must contain the number of species and number of characters");
      }

      int numberSpecies;
      int numberCharacters;
      try
      {
        numberSpecies = Integer.parseInt(lineElements[0]);
        numberCharacters = Integer.parseInt(lineElements[1]);
      } catch (NumberFormatException e)
      {
        // keep the failure inside the declared IOException contract
        throw new IOException(
                "First line must contain the number of species and number of characters as integers, but was: "
                        + line.trim(), e);
      }

      if (numberSpecies <= 0)
      {
        // there are no sequences in this file so exit as there is nothing
        // to parse
        return;
      }

      SequenceI[] sequenceElements = new Sequence[numberSpecies];
      StringBuilder[] sequences = new StringBuilder[numberSpecies];

      // if file is in sequential format there is only one data matrix,
      // else there are multiple

      // read the first data matrix
      for (int i = 0; i < numberSpecies; i++)
      {
        line = nextLine();
        if (line == null)
        {
          throw new IOException("Unexpected end of file - expected "
                  + numberSpecies + " sequences but found only " + i);
        }

        // lines start with the name - a maximum of 10 characters;
        // if less, then padded out or terminated with a tab.
        // Clamp to the line length so short lines cannot overrun.
        int nameEnd = Math.min(NAME_LENGTH, line.length());
        int tabIndex = line.substring(0, nameEnd).indexOf('\t');
        if (tabIndex != -1)
        {
          nameEnd = tabIndex;
        }

        sequenceElements[i] = parseId(validateName(line.substring(0,
                nameEnd)));
        // everything after the name is sequence data; a terminating tab is
        // dropped together with any other whitespace
        sequences[i] = new StringBuilder(removeWhitespace(line
                .substring(nameEnd)));
      }

      // determine if interleaved
      if (sequences[0].length() != numberCharacters)
      {
        // interleaved file, so have to read the remainder
        int i = 0;
        for (line = nextLine(); line != null; line = nextLine())
        {
          String residues = removeWhitespace(line);
          // ignore blank lines, as defined by the specification; a line
          // holding only whitespace is blank too and must not use up a
          // species slot
          if (residues.length() > 0)
          {
            sequences[i++].append(residues);
            // reached end of matrix, so get ready for the next one
            if (i == sequences.length)
            {
              i = 0;
            }
          }
        }
      }

      // file parsed completely, now store sequences
      for (int i = 0; i < numberSpecies; i++)
      {
        // first check sequence is the expected length
        if (sequences[i].length() != numberCharacters)
        {
          throw new IOException(sequenceElements[i].getName()
                  + " sequence is incorrect length - should be "
                  + numberCharacters + " but is " + sequences[i].length());
        }
        sequenceElements[i].setSequence(sequences[i].toString());
        seqs.add(sequenceElements[i]);
      }
    } catch (IOException e)
    {
      System.err.println("Exception parsing PHYLIP file " + e);
      e.printStackTrace(System.err);
      throw e;
    }
  }

  /**
   * Removes all whitespace from txt; used to strip any spaces added to
   * sequences to improve human readability.
   *
   * @param txt
   *          the text to strip
   * @return txt with every whitespace character removed
   */
  private String removeWhitespace(String txt)
  {
    // "\\s+" rather than "\\s*" so the regex never matches the empty string
    return txt.replaceAll("\\s+", "");
  }

  /**
   * Checks that a species name is free of the characters the specification
   * forbids: parentheses, square brackets, colon, semicolon and comma.
   *
   * @param name
   *          the candidate species name
   * @return name, unchanged, when it contains none of the forbidden
   *         characters
   * @throws IOException
   *           if name contains one of the forbidden characters
   */
  private String validateName(String name) throws IOException
  {
    for (char c : INVALID_NAME_CHARACTERS)
    {
      if (name.indexOf(c) > -1)
      {
        throw new IOException("Species name contains illegal character "
                + c);
      }
    }
    return name;
  }

  /**
   * Prints the seqs in interleaved format, with each matrix consisting of 60
   * characters; a blank line is added between each matrix; no spacing is
   * added between the sequence characters.
   * <p>
   * The header line holds the number of sequences and the length of the
   * first sequence (0 when there are no sequences). In the first matrix each
   * row starts with the sequence name, truncated to 10 characters or padded
   * with trailing spaces to 10 characters; rows of later matrices carry
   * sequence characters only.
   *
   * @return the alignment rendered as PHYLIP text
   * @see AlignFile
   */
  @Override
  public String print()
  {
    StringBuilder sb = new StringBuilder(Integer.toString(seqs.size()));
    sb.append(" ");
    // if there are no sequences, then define the number of characters as 0
    sb.append(
            (seqs.size() > 0) ? Integer
                    .toString(seqs.get(0).getSequence().length) : "0")
            .append(newline);

    // Due to how IO is handled, there doesn't appear to be a way to store
    // if the original file was sequential or interleaved; if there is, then
    // use that to set the value of the following variable
    boolean sequential = false;

    int sequenceLength = 0;
    for (SequenceI s : seqs)
    {
      // ensure name is exactly 10 characters: truncate or pad on the right
      String name = s.getName();
      if (name.length() > NAME_LENGTH)
      {
        name = name.substring(0, NAME_LENGTH);
      }
      else
      {
        name = String.format("%-" + NAME_LENGTH + "s", name);
      }
      sb.append(name);

      // sequential has the entire sequence following the name
      if (sequential)
      {
        sb.append(s.getSequence());
      }
      else
      {
        // Jalview ensures all sequences are of same length so no need
        // to keep track of min/max length
        sequenceLength = s.getSequence().length;
        // interleaved breaks the sequence into chunks of
        // INTERLEAVED_COLUMNS characters
        sb.append(s.getSequence(0,
                Math.min(INTERLEAVED_COLUMNS, sequenceLength)));
      }
      sb.append(newline);
    }

    // add the remaining matrices if interleaved and there is something to
    // add
    if (!sequential && sequenceLength > INTERLEAVED_COLUMNS)
    {
      // total number of matrices, rounding up for a partial final matrix
      int numMatrices = sequenceLength / INTERLEAVED_COLUMNS;
      if ((sequenceLength % INTERLEAVED_COLUMNS) > 0)
      {
        numMatrices++;
      }

      // start i = 1 as first matrix has already been printed
      for (int i = 1; i < numMatrices; i++)
      {
        // add blank line to separate this matrix from previous
        sb.append(newline);
        int start = i * INTERLEAVED_COLUMNS;
        int end = Math.min(start + INTERLEAVED_COLUMNS, sequenceLength);
        for (SequenceI s : seqs)
        {
          sb.append(s.getSequence(start, end)).append(newline);
        }
      }
    }

    return sb.toString();
  }
}