6 import jalview.datamodel.Alignment;
7 import jalview.datamodel.Sequence;
8 import jalview.datamodel.SequenceI;
10 import java.io.IOException;
14 * Parser and exporter for PHYLIP file format, as defined <a
15 * href="http://evolution.genetics.washington.edu/phylip/doc/main.html">in the
16 * documentation</a>. The parser imports PHYLIP files in both sequential and
17 * interleaved format, and (currently) exports in interleaved format (using 60
18 * characters per matrix for the sequence).
22 * The following assumptions have been made for input
24 * <li>Sequences are expressed as letters, not real numbers with decimal points
25 * separated by blanks (which is a valid option according to the specification)</li>
28 * The following assumptions have been made for output
30 * <li>Interleaved format is used, with each matrix consisting of 60 characters;
32 * <li>a blank line is added between each matrix;</li>
33 * <li>no spacing is added between the sequence characters.</li>
39 * @author David Corsar
43 public class PhylipFile extends AlignFile
46 // Define file extension and description to save repeating it elsewhere
// File extension used for this format
47 public static final String FILE_EXT = "phy";
// Description (display name) of this format
49 public static final String FILE_DESC = "PHYLIP";
53 * @see AlignFile#AlignFile()
// Constructor taking a FileParse source; declared to throw IOException.
// NOTE(review): the constructor body is not shown in this listing -
// presumably it hands the source to the AlignFile superclass; confirm.
65 public PhylipFile(FileParse source) throws IOException
74 * @see AlignFile#AlignFile(FileParse)
// Constructor taking an input file and a type, both as strings; declared to
// throw IOException.
// NOTE(review): the constructor body is not shown in this listing -
// presumably it forwards both arguments to the AlignFile superclass; confirm.
76 public PhylipFile(String inFile, String type) throws IOException
82 * Parses the input source
84 * @see {@link AlignFile#parse()}
87 public void parse() throws IOException
91 // First line should contain number of species and number of
92 // characters, separated by blanks
93 String line = nextLine();
94 String[] lineElements = line.trim().split("\\s+");
95 if (lineElements.length < 2)
97 throw new IOException(
98 "First line must contain the number of specifies and number of characters");
101 int numberSpecies = Integer.parseInt(lineElements[0]), numberCharacters = Integer
102 .parseInt(lineElements[1]);
104 if (numberSpecies <= 0)
106 // there are no sequences in this file so exit a nothing to
111 SequenceI[] sequenceElements = new Sequence[numberSpecies];
112 StringBuffer[] sequences = new StringBuffer[numberSpecies];
114 // if file is in sequential format there is only one data matrix,
115 // else there are multiple
117 // read the first data matrix
118 for (int i = 0; i < numberSpecies; i++)
121 // lines start with the name - a maximum of 10 characters
122 // if less, then padded out or terminated with a tab
123 String potentialName = line.substring(0, 10);
124 int tabIndex = potentialName.indexOf('\t');
127 sequenceElements[i] = parseId(validateName(potentialName));
128 sequences[i] = new StringBuffer(
129 removeWhitespace(line.substring(10)));
133 sequenceElements[i] = parseId(validateName(potentialName
134 .substring(0, tabIndex)));
135 sequences[i] = new StringBuffer(
136 removeWhitespace(line.substring(tabIndex)));
140 // determine if interleaved
141 if ((sequences[0]).length() != numberCharacters)
143 // interleaved file, so have to read the remainder
145 for (line = nextLine(); line != null; line = nextLine())
147 // ignore blank lines, as defined by the specification
148 if (line.length() > 0)
150 sequences[i++].append(removeWhitespace(line));
152 // reached end of matrix, so get ready for the next one
153 if (i == sequences.length)
160 // file parsed completely, now store sequences
161 for (int i = 0; i < numberSpecies; i++)
163 // first check sequence is the expected length
164 if (sequences[i].length() != numberCharacters)
166 throw new IOException(sequenceElements[i].getName()
167 + " sequence is incorrect length - should be "
168 + numberCharacters + " but is " + sequences[i].length());
170 sequenceElements[i].setSequence(sequences[i].toString());
171 seqs.add(sequenceElements[i]);
174 // create an alignment based on the sequences
175 Alignment a = new Alignment(sequenceElements);
176 // add annotations - although comments say addAnnotations
177 // is used by AppletFormatAdapter, it doesn't say other
178 // classes should/can not use it
181 } catch (IOException e)
183 System.err.println("Exception parsing PHYLIP file " + e);
184 e.printStackTrace(System.err);
191 * Removes any whitespace from txt, used to strip and spaces added to
192 * sequences to improve human readability
197 private String removeWhitespace(String txt)
199 return txt.replaceAll("\\s*", "");
203 * According to the specification, the name cannot have parentheses, square
204 * brackets, colon, semicolon, comma
208 * @throws IOException
// Checks a species name against the characters listed below and raises an
// IOException as soon as one of them is found in the name.
// NOTE(review): the end of the throw statement and the method's return are
// not shown in this listing; the caller passes the result on to parseId, so
// presumably the (unchanged) name is returned - confirm.
210 private String validateName(String name) throws IOException
// characters that are not allowed in a name
212 char[] invalidCharacters = new char[]
213 { '(', ')', '[', ']', ':', ';', ',' };
214 for (char c : invalidCharacters)
// indexOf gives -1 when c is absent, so anything greater means c occurs
216 if (name.indexOf(c) > -1)
218 throw new IOException("Species name contains illegal character "
227 * Prints the seqs in interleaved format, with each matrix consisting of 60
228 * characters; a blank line is added between each matrix; no spacing is added
229 * between the sequence characters.
233 * @see AlignFile#print()
// Builds the textual output for the sequences held in seqs and returns it
// as a single String.
// NOTE(review): this listing omits some lines of the method (the embedded
// line numbers have gaps), so the comments below describe only the
// statements that are shown.
236 public String print()
// the output starts with the number of sequences
239 StringBuffer sb = new StringBuffer(Integer.toString(seqs.size()));
241 // if there are no sequences, then define the number of characters as 0;
// otherwise use the length of the first sequence
243 (seqs.size() > 0) ? Integer
244 .toString(seqs.get(0).getSequence().length) : "0")
247 // Due to how IO is handled, there doesn't appear to be a way to store
248 // if the original file was sequential or interleaved; if there is, then
249 // use that to set the value of the following variable
// (as written it is always false)
250 boolean sequential = false;
252 // maximum number of columns for each row of interleaved format
253 int numInterleavedColumns = 60;
// set inside the loop below from the length of each sequence's array
255 int sequenceLength = 0;
256 for (SequenceI s : seqs)
259 // ensure name is only 10 characters
260 String name = s.getName();
261 if (name.length() > 10)
// keep only the first 10 characters of a longer name
263 name = name.substring(0, 10);
267 // pad the name out to 10 characters ("%1$-10s" left-justifies the
// first argument in a field of width 10)
268 name = String.format("%1$-" + 10 + "s", s.getName());
272 // sequential has the entire sequence following the name
275 sb.append(s.getSequence());
279 // Jalview ensures all sequences are of same length so no need
280 // to keep track of min/max length
281 sequenceLength = s.getSequence().length;
282 // interleaved breaks the sequence into chunks of
283 // numInterleavedColumns characters; this is the first chunk
284 sb.append(s.getSequence(0,
285 Math.min(numInterleavedColumns, sequenceLength)));
290 // add the remaining matrices if interleaved and there is something left to add
292 if (!sequential && sequenceLength > numInterleavedColumns)
294 // number of whole matrices; this count includes the first matrix that
// was written above, which is why the loop below starts at 1
295 int numMatrics = sequenceLength / numInterleavedColumns;
// a non-zero remainder means there is a final, partly filled matrix
// NOTE(review): the statement guarded by this test is not shown in this
// listing - presumably it increments numMatrics; confirm.
296 if ((sequenceLength % numInterleavedColumns) > 0)
301 // start i = 1 as first matrix has already been printed
302 for (int i = 1; i < numMatrics; i++)
304 // add blank line to separate this matrix from previous
// first column of matrix i
306 int start = i * numInterleavedColumns;
307 for (SequenceI s : seqs)
// columns start (inclusive) up to start + numInterleavedColumns, capped at
// the sequence length for the last matrix
310 s.getSequence(start, Math.min(start
311 + numInterleavedColumns, sequenceLength)))
318 return sb.toString();