2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.datamodel.Sequence;
24 import jalview.datamodel.SequenceI;
26 import java.io.IOException;
30 * Parser and exporter for PHYLIP file format, as defined <a
31 * href="http://evolution.genetics.washington.edu/phylip/doc/main.html">in the
32 * documentation</a>. The parser imports PHYLIP files in both sequential and
33 * interleaved format, and (currently) exports in interleaved format (using 60
34 * characters per matrix for the sequence).
38 * The following assumptions have been made for input
40 * <li>Sequences are expressed as letters, not real numbers with decimal points
41 * separated by blanks (which is a valid option according to the specification)</li>
44 * The following assumptions have been made for output
46 * <li>Interleaved format is used, with each matrix consisting of 60 characters;
48 * <li>a blank line is added between each matrix;</li>
49 * <li>no spacing is added between the sequence characters.</li>
55 * @author David Corsar
59 public class PhylipFile extends AlignFile
62 // Define file extension and description to save repeating it elsewhere
63 public static final String FILE_EXT = "phy";
65 public static final String FILE_DESC = "PHYLIP";
69 * @see {@link AlignFile#AlignFile()}
81 public PhylipFile(FileParse source) throws IOException
90 * @see {@link AlignFile#AlignFile(FileParse)}
92 public PhylipFile(String inFile, String type) throws IOException
98 * Parses the input source
100 * @see {@link AlignFile#parse()}
103 public void parse() throws IOException
107 // First line should contain number of species and number of
108 // characters, separated by blanks
109 String line = nextLine();
110 String[] lineElements = line.trim().split("\\s+");
111 if (lineElements.length < 2)
113 throw new IOException(
114 "First line must contain the number of specifies and number of characters");
117 int numberSpecies = Integer.parseInt(lineElements[0]), numberCharacters = Integer
118 .parseInt(lineElements[1]);
120 if (numberSpecies <= 0)
122 // there are no sequences in this file so exit a nothing to
127 SequenceI[] sequenceElements = new Sequence[numberSpecies];
128 StringBuffer[] sequences = new StringBuffer[numberSpecies];
130 // if file is in sequential format there is only one data matrix,
131 // else there are multiple
133 // read the first data matrix
134 for (int i = 0; i < numberSpecies; i++)
137 // lines start with the name - a maximum of 10 characters
138 // if less, then padded out or terminated with a tab
139 String potentialName = line.substring(0, 10);
140 int tabIndex = potentialName.indexOf('\t');
143 sequenceElements[i] = parseId(validateName(potentialName));
144 sequences[i] = new StringBuffer(
145 removeWhitespace(line.substring(10)));
149 sequenceElements[i] = parseId(validateName(potentialName
150 .substring(0, tabIndex)));
151 sequences[i] = new StringBuffer(
152 removeWhitespace(line.substring(tabIndex)));
156 // determine if interleaved
157 if ((sequences[0]).length() != numberCharacters)
159 // interleaved file, so have to read the remainder
161 for (line = nextLine(); line != null; line = nextLine())
163 // ignore blank lines, as defined by the specification
164 if (line.length() > 0)
166 sequences[i++].append(removeWhitespace(line));
168 // reached end of matrix, so get ready for the next one
169 if (i == sequences.length)
176 // file parsed completely, now store sequences
177 for (int i = 0; i < numberSpecies; i++)
179 // first check sequence is the expected length
180 if (sequences[i].length() != numberCharacters)
182 throw new IOException(sequenceElements[i].getName()
183 + " sequence is incorrect length - should be "
184 + numberCharacters + " but is " + sequences[i].length());
186 sequenceElements[i].setSequence(sequences[i].toString());
187 seqs.add(sequenceElements[i]);
190 } catch (IOException e)
192 System.err.println("Exception parsing PHYLIP file " + e);
193 e.printStackTrace(System.err);
200 * Removes any whitespace from txt, used to strip and spaces added to
201 * sequences to improve human readability
206 private String removeWhitespace(String txt)
208 return txt.replaceAll("\\s*", "");
212 * According to the specification, the name cannot have parentheses, square
213 * brackets, colon, semicolon, comma
217 * @throws IOException
219 private String validateName(String name) throws IOException
221 char[] invalidCharacters = new char[] { '(', ')', '[', ']', ':', ';',
223 for (char c : invalidCharacters)
225 if (name.indexOf(c) > -1)
227 throw new IOException("Species name contains illegal character "
236 * Prints the seqs in interleaved format, with each matrix consisting of 60
237 * characters; a blank line is added between each matrix; no spacing is added
238 * between the sequence characters.
242 * @see {@link AlignFile#print()}
245 public String print()
248 StringBuffer sb = new StringBuffer(Integer.toString(seqs.size()));
250 // if there are no sequences, then define the number of characters as 0
252 (seqs.size() > 0) ? Integer
253 .toString(seqs.get(0).getSequence().length) : "0")
256 // Due to how IO is handled, there doesn't appear to be a way to store
257 // if the original file was sequential or interleaved; if there is, then
258 // use that to set the value of the following variable
259 boolean sequential = false;
261 // maximum number of columns for each row of interleaved format
262 int numInterleavedColumns = 60;
264 int sequenceLength = 0;
265 for (SequenceI s : seqs)
268 // ensure name is only 10 characters
269 String name = s.getName();
270 if (name.length() > 10)
272 name = name.substring(0, 10);
276 // add padding 10 characters
277 name = String.format("%1$-" + 10 + "s", s.getName());
281 // sequential has the entire sequence following the name
284 sb.append(s.getSequence());
288 // Jalview ensures all sequences are of same length so no need
289 // to keep track of min/max length
290 sequenceLength = s.getSequence().length;
291 // interleaved breaks the sequence into chunks for
292 // interleavedColumns characters
293 sb.append(s.getSequence(0,
294 Math.min(numInterleavedColumns, sequenceLength)));
299 // add the remaining matrixes if interleaved and there is something to
301 if (!sequential && sequenceLength > numInterleavedColumns)
303 // determine number of remaining matrixes
304 int numMatrics = sequenceLength / numInterleavedColumns;
305 if ((sequenceLength % numInterleavedColumns) > 0)
310 // start i = 1 as first matrix has already been printed
311 for (int i = 1; i < numMatrics; i++)
313 // add blank line to separate this matrix from previous
315 int start = i * numInterleavedColumns;
316 for (SequenceI s : seqs)
319 s.getSequence(start, Math.min(start
320 + numInterleavedColumns, sequenceLength)))
327 return sb.toString();