6 import jalview.datamodel.Sequence;
7 import jalview.datamodel.SequenceI;
9 import java.io.IOException;
13 * Parser and exporter for PHYLIP file format, as defined <a
14 * href="http://evolution.genetics.washington.edu/phylip/doc/main.html">in the
15 * documentation</a>. The parser imports PHYLIP files in both sequential and
16 * interleaved format, and (currently) exports in interleaved format (using 60
17 * characters per matrix for the sequence).
21 * The following assumptions have been made for input
23 * <li>Sequences are expressed as letters, not real numbers with decimal points
24 * separated by blanks (which is a valid option according to the specification)</li>
27 * The following assumptions have been made for output
29 * <li>Interleaved format is used, with each matrix consisting of 60 characters;
31 * <li>a blank line is added between each matrix;</li>
32 * <li>no spacing is added between the sequence characters.</li>
38 * @author David Corsar
42 public class PhylipFile extends AlignFile
45 // Define file extension and description to save repeating it elsewhere
46 public static final String FILE_EXT = "phy";
48 public static final String FILE_DESC = "PHYLIP";
52 * @see {@link AlignFile#AlignFile()}
64 public PhylipFile(FileParse source) throws IOException
73 * @see {@link AlignFile#AlignFile(FileParse)}
75 public PhylipFile(String inFile, String type) throws IOException
81 * Parses the input source
83 * @see {@link AlignFile#parse()}
86 public void parse() throws IOException
90 // First line should contain number of species and number of
91 // characters, separated by blanks
92 String line = nextLine();
93 String[] lineElements = line.trim().split("\\s+");
94 if (lineElements.length < 2)
96 throw new IOException(
97 "First line must contain the number of specifies and number of characters");
100 int numberSpecies = Integer.parseInt(lineElements[0]), numberCharacters = Integer
101 .parseInt(lineElements[1]);
103 if (numberSpecies <= 0)
105 // there are no sequences in this file so exit a nothing to
110 SequenceI[] sequenceElements = new Sequence[numberSpecies];
111 StringBuffer[] sequences = new StringBuffer[numberSpecies];
113 // if file is in sequential format there is only one data matrix,
114 // else there are multiple
116 // read the first data matrix
117 for (int i = 0; i < numberSpecies; i++)
120 // lines start with the name - a maximum of 10 characters
121 // if less, then padded out or terminated with a tab
122 String potentialName = line.substring(0, 10);
123 int tabIndex = potentialName.indexOf('\t');
126 sequenceElements[i] = parseId(validateName(potentialName));
127 sequences[i] = new StringBuffer(
128 removeWhitespace(line.substring(10)));
132 sequenceElements[i] = parseId(validateName(potentialName
133 .substring(0, tabIndex)));
134 sequences[i] = new StringBuffer(
135 removeWhitespace(line.substring(tabIndex)));
139 // determine if interleaved
140 if ((sequences[0]).length() != numberCharacters)
142 // interleaved file, so have to read the remainder
144 for (line = nextLine(); line != null; line = nextLine())
146 // ignore blank lines, as defined by the specification
147 if (line.length() > 0)
149 sequences[i++].append(removeWhitespace(line));
151 // reached end of matrix, so get ready for the next one
152 if (i == sequences.length)
159 // file parsed completely, now store sequences
160 for (int i = 0; i < numberSpecies; i++)
162 // first check sequence is the expected length
163 if (sequences[i].length() != numberCharacters)
165 throw new IOException(sequenceElements[i].getName()
166 + " sequence is incorrect length - should be "
167 + numberCharacters + " but is " + sequences[i].length());
169 sequenceElements[i].setSequence(sequences[i].toString());
170 seqs.add(sequenceElements[i]);
173 } catch (IOException e)
175 System.err.println("Exception parsing PHYLIP file " + e);
176 e.printStackTrace(System.err);
183 * Removes any whitespace from txt, used to strip and spaces added to
184 * sequences to improve human readability
189 private String removeWhitespace(String txt)
191 return txt.replaceAll("\\s*", "");
195 * According to the specification, the name cannot have parentheses, square
196 * brackets, colon, semicolon, comma
200 * @throws IOException
202 private String validateName(String name) throws IOException
204 char[] invalidCharacters = new char[]
205 { '(', ')', '[', ']', ':', ';', ',' };
206 for (char c : invalidCharacters)
208 if (name.indexOf(c) > -1)
210 throw new IOException("Species name contains illegal character "
219 * Prints the seqs in interleaved format, with each matrix consisting of 60
220 * characters; a blank line is added between each matrix; no spacing is added
221 * between the sequence characters.
225 * @see {@link AlignFile#print()}
228 public String print()
231 StringBuffer sb = new StringBuffer(Integer.toString(seqs.size()));
233 // if there are no sequences, then define the number of characters as 0
235 (seqs.size() > 0) ? Integer
236 .toString(seqs.get(0).getSequence().length) : "0")
239 // Due to how IO is handled, there doesn't appear to be a way to store
240 // if the original file was sequential or interleaved; if there is, then
241 // use that to set the value of the following variable
242 boolean sequential = false;
244 // maximum number of columns for each row of interleaved format
245 int numInterleavedColumns = 60;
247 int sequenceLength = 0;
248 for (SequenceI s : seqs)
251 // ensure name is only 10 characters
252 String name = s.getName();
253 if (name.length() > 10)
255 name = name.substring(0, 10);
259 // add padding 10 characters
260 name = String.format("%1$-" + 10 + "s", s.getName());
264 // sequential has the entire sequence following the name
267 sb.append(s.getSequence());
271 // Jalview ensures all sequences are of same length so no need
272 // to keep track of min/max length
273 sequenceLength = s.getSequence().length;
274 // interleaved breaks the sequence into chunks for
275 // interleavedColumns characters
276 sb.append(s.getSequence(0,
277 Math.min(numInterleavedColumns, sequenceLength)));
282 // add the remaining matrixes if interleaved and there is something to
284 if (!sequential && sequenceLength > numInterleavedColumns)
286 // determine number of remaining matrixes
287 int numMatrics = sequenceLength / numInterleavedColumns;
288 if ((sequenceLength % numInterleavedColumns) > 0)
293 // start i = 1 as first matrix has already been printed
294 for (int i = 1; i < numMatrics; i++)
296 // add blank line to separate this matrix from previous
298 int start = i * numInterleavedColumns;
299 for (SequenceI s : seqs)
302 s.getSequence(start, Math.min(start
303 + numInterleavedColumns, sequenceLength)))
310 return sb.toString();