2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.datamodel.Sequence;
24 import jalview.datamodel.SequenceI;
26 import java.io.IOException;
30 * Parser and exporter for PHYLIP file format, as defined <a
31 * href="http://evolution.genetics.washington.edu/phylip/doc/main.html">in the
32 * documentation</a>. The parser imports PHYLIP files in both sequential and
33 * interleaved format, and (currently) exports in interleaved format (using 60
34 * characters per matrix for the sequence).
38 * The following assumptions have been made for input
40 * <li>Sequences are expressed as letters, not real numbers with decimal points
41 * separated by blanks (which is a valid option according to the specification)</li>
44 * The following assumptions have been made for output
46 * <li>Interleaved format is used, with each matrix consisting of 60 characters;
48 * <li>a blank line is added between each matrix;</li>
49 * <li>no spacing is added between the sequence characters.</li>
55 * @author David Corsar
59 public class PhylipFile extends AlignFile
62 public static final String FILE_DESC = "PHYLIP";
66 * @see {@link AlignFile#AlignFile()}
78 public PhylipFile(FileParse source) throws IOException
/**
 * Creates a PHYLIP file handler for the given input by delegating to the
 * superclass constructor.
 *
 * @param inFile
 *          the input, interpreted according to {@code sourceType}
 * @param sourceType
 *          how {@code inFile} is to be interpreted
 * @throws IOException
 *           if the superclass constructor reports an I/O failure
 * @see AlignFile
 */
public PhylipFile(String inFile, DataSourceType sourceType)
        throws IOException
{
  super(inFile, sourceType);
}
96 * Parses the input source
98 * @see {@link AlignFile#parse()}
101 public void parse() throws IOException
105 // First line should contain number of species and number of
106 // characters, separated by blanks
107 String line = nextLine();
108 String[] lineElements = line.trim().split("\\s+");
109 if (lineElements.length < 2)
111 throw new IOException(
112 "First line must contain the number of specifies and number of characters");
115 int numberSpecies = Integer.parseInt(lineElements[0]), numberCharacters = Integer
116 .parseInt(lineElements[1]);
118 if (numberSpecies <= 0)
120 // there are no sequences in this file so exit a nothing to
125 SequenceI[] sequenceElements = new Sequence[numberSpecies];
126 StringBuffer[] sequences = new StringBuffer[numberSpecies];
128 // if file is in sequential format there is only one data matrix,
129 // else there are multiple
131 // read the first data matrix
132 for (int i = 0; i < numberSpecies; i++)
135 // lines start with the name - a maximum of 10 characters
136 // if less, then padded out or terminated with a tab
137 String potentialName = line.substring(0, 10);
138 int tabIndex = potentialName.indexOf('\t');
141 sequenceElements[i] = parseId(validateName(potentialName));
142 sequences[i] = new StringBuffer(
143 removeWhitespace(line.substring(10)));
147 sequenceElements[i] = parseId(validateName(potentialName
148 .substring(0, tabIndex)));
149 sequences[i] = new StringBuffer(
150 removeWhitespace(line.substring(tabIndex)));
154 // determine if interleaved
155 if ((sequences[0]).length() != numberCharacters)
157 // interleaved file, so have to read the remainder
159 for (line = nextLine(); line != null; line = nextLine())
161 // ignore blank lines, as defined by the specification
162 if (line.length() > 0)
164 sequences[i++].append(removeWhitespace(line));
166 // reached end of matrix, so get ready for the next one
167 if (i == sequences.length)
174 // file parsed completely, now store sequences
175 for (int i = 0; i < numberSpecies; i++)
177 // first check sequence is the expected length
178 if (sequences[i].length() != numberCharacters)
180 throw new IOException(sequenceElements[i].getName()
181 + " sequence is incorrect length - should be "
182 + numberCharacters + " but is " + sequences[i].length());
184 sequenceElements[i].setSequence(sequences[i].toString());
185 seqs.add(sequenceElements[i]);
188 } catch (IOException e)
190 System.err.println("Exception parsing PHYLIP file " + e);
191 e.printStackTrace(System.err);
198 * Removes any whitespace from txt, used to strip and spaces added to
199 * sequences to improve human readability
204 private String removeWhitespace(String txt)
206 return txt.replaceAll("\\s*", "");
210 * According to the specification, the name cannot have parentheses, square
211 * brackets, colon, semicolon, comma
215 * @throws IOException
217 private String validateName(String name) throws IOException
219 char[] invalidCharacters = new char[] { '(', ')', '[', ']', ':', ';',
221 for (char c : invalidCharacters)
223 if (name.indexOf(c) > -1)
225 throw new IOException("Species name contains illegal character "
234 * Prints the seqs in interleaved format, with each matrix consisting of 60
235 * characters; a blank line is added between each matrix; no spacing is added
236 * between the sequence characters.
240 * @see {@link AlignFile#print()}
243 public String print(SequenceI[] sqs, boolean jvsuffix)
246 StringBuffer sb = new StringBuffer(Integer.toString(sqs.length));
248 // if there are no sequences, then define the number of characters as 0
250 (sqs.length > 0) ? Integer.toString(sqs[0].getSequence().length)
254 // Due to how IO is handled, there doesn't appear to be a way to store
255 // if the original file was sequential or interleaved; if there is, then
256 // use that to set the value of the following variable
257 boolean sequential = false;
259 // maximum number of columns for each row of interleaved format
260 int numInterleavedColumns = 60;
262 int sequenceLength = 0;
263 for (SequenceI s : sqs)
266 // ensure name is only 10 characters
267 String name = s.getName();
268 if (name.length() > 10)
270 name = name.substring(0, 10);
274 // add padding 10 characters
275 name = String.format("%1$-" + 10 + "s", s.getName());
279 // sequential has the entire sequence following the name
282 sb.append(s.getSequence());
286 // Jalview ensures all sequences are of same length so no need
287 // to keep track of min/max length
288 sequenceLength = s.getSequence().length;
289 // interleaved breaks the sequence into chunks for
290 // interleavedColumns characters
291 sb.append(s.getSequence(0,
292 Math.min(numInterleavedColumns, sequenceLength)));
297 // add the remaining matrixes if interleaved and there is something to
299 if (!sequential && sequenceLength > numInterleavedColumns)
301 // determine number of remaining matrixes
302 int numMatrics = sequenceLength / numInterleavedColumns;
303 if ((sequenceLength % numInterleavedColumns) > 0)
308 // start i = 1 as first matrix has already been printed
309 for (int i = 1; i < numMatrics; i++)
311 // add blank line to separate this matrix from previous
313 int start = i * numInterleavedColumns;
314 for (SequenceI s : sqs)
317 s.getSequence(start, Math.min(start
318 + numInterleavedColumns, sequenceLength)))
325 return sb.toString();