2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.datamodel.Sequence;
24 import jalview.datamodel.SequenceI;
26 import java.io.IOException;
30 * Parser and exporter for PHYLIP file format, as defined
31 * <a href="http://evolution.genetics.washington.edu/phylip/doc/main.html">in
32 * the documentation</a>. The parser imports PHYLIP files in both sequential and
33 * interleaved format, and (currently) exports in interleaved format (using 60
34 * characters per matrix for the sequence).
38 * The following assumptions have been made for input
40 * <li>Sequences are expressed as letters, not real numbers with decimal points
41 * separated by blanks (which is a valid option according to the
45 * The following assumptions have been made for output
47 * <li>Interleaved format is used, with each matrix consisting of 60 characters;
49 * <li>a blank line is added between each matrix;</li>
50 * <li>no spacing is added between the sequence characters.</li>
56 * @author David Corsar
60 public class PhylipFile extends AlignFile
63 public static final String FILE_DESC = "PHYLIP";
   * @see AlignFile#AlignFile()
79 public PhylipFile(FileParse source) throws IOException
/**
 * Creates a PHYLIP file handler for the given input by delegating to the
 * superclass constructor that takes the same two arguments.
 *
 * @param inFile
 *          the input to read; passed unchanged to the superclass
 * @param sourceType
 *          how {@code inFile} is to be interpreted; passed unchanged to the
 *          superclass
 * @throws IOException
 *           if the superclass constructor reports an I/O failure
 * @see AlignFile#AlignFile(String, DataSourceType)
 */
public PhylipFile(String inFile, DataSourceType sourceType)
        throws IOException
{
  super(inFile, sourceType);
}
97 * Parses the input source
99 * @see {@link AlignFile#parse()}
102 public void parse() throws IOException
106 // First line should contain number of species and number of
107 // characters, separated by blanks
108 String line = nextLine();
109 String[] lineElements = line.trim().split("\\s+");
110 if (lineElements.length < 2)
112 throw new IOException(
113 "First line must contain the number of specifies and number of characters");
116 int numberSpecies = Integer.parseInt(lineElements[0]),
117 numberCharacters = Integer.parseInt(lineElements[1]);
119 if (numberSpecies <= 0)
121 // there are no sequences in this file so exit a nothing to
126 SequenceI[] sequenceElements = new Sequence[numberSpecies];
127 StringBuffer[] sequences = new StringBuffer[numberSpecies];
129 // if file is in sequential format there is only one data matrix,
130 // else there are multiple
132 // read the first data matrix
133 for (int i = 0; i < numberSpecies; i++)
136 // lines start with the name - a maximum of 10 characters
137 // if less, then padded out or terminated with a tab
138 String potentialName = line.substring(0, 10);
139 int tabIndex = potentialName.indexOf('\t');
142 sequenceElements[i] = parseId(validateName(potentialName));
143 sequences[i] = new StringBuffer(
144 removeWhitespace(line.substring(10)));
148 sequenceElements[i] = parseId(
149 validateName(potentialName.substring(0, tabIndex)));
150 sequences[i] = new StringBuffer(
151 removeWhitespace(line.substring(tabIndex)));
155 // determine if interleaved
156 if ((sequences[0]).length() != numberCharacters)
158 // interleaved file, so have to read the remainder
160 for (line = nextLine(); line != null; line = nextLine())
162 // ignore blank lines, as defined by the specification
163 if (line.length() > 0)
165 sequences[i++].append(removeWhitespace(line));
167 // reached end of matrix, so get ready for the next one
168 if (i == sequences.length)
175 // file parsed completely, now store sequences
176 for (int i = 0; i < numberSpecies; i++)
178 // first check sequence is the expected length
179 if (sequences[i].length() != numberCharacters)
181 throw new IOException(sequenceElements[i].getName()
182 + " sequence is incorrect length - should be "
183 + numberCharacters + " but is " + sequences[i].length());
185 sequenceElements[i].setSequence(sequences[i].toString());
186 seqs.add(sequenceElements[i]);
189 } catch (IOException e)
191 System.err.println("Exception parsing PHYLIP file " + e);
192 e.printStackTrace(System.err);
199 * Removes any whitespace from txt, used to strip and spaces added to
200 * sequences to improve human readability
205 private String removeWhitespace(String txt)
207 return txt.replaceAll("\\s*", "");
211 * According to the specification, the name cannot have parentheses, square
212 * brackets, colon, semicolon, comma
216 * @throws IOException
218 private String validateName(String name) throws IOException
220 char[] invalidCharacters = new char[] { '(', ')', '[', ']', ':', ';',
222 for (char c : invalidCharacters)
224 if (name.indexOf(c) > -1)
226 throw new IOException(
227 "Species name contains illegal character " + c);
235 * Prints the seqs in interleaved format, with each matrix consisting of 60
236 * characters; a blank line is added between each matrix; no spacing is added
237 * between the sequence characters.
241 * @see {@link AlignFile#print()}
244 public String print(SequenceI[] sqs, boolean jvsuffix)
247 StringBuffer sb = new StringBuffer(Integer.toString(sqs.length));
249 // if there are no sequences, then define the number of characters as 0
251 (sqs.length > 0) ? Integer.toString(sqs[0].getLength())
255 // Due to how IO is handled, there doesn't appear to be a way to store
256 // if the original file was sequential or interleaved; if there is, then
257 // use that to set the value of the following variable
258 boolean sequential = false;
260 // maximum number of columns for each row of interleaved format
261 int numInterleavedColumns = 60;
263 int sequenceLength = 0;
264 for (SequenceI s : sqs)
267 // ensure name is only 10 characters
268 String name = s.getName();
269 if (name.length() > 10)
271 name = name.substring(0, 10);
275 // add padding 10 characters
276 name = String.format("%1$-" + 10 + "s", s.getName());
280 // sequential has the entire sequence following the name
283 sb.append(s.getSequenceAsString());
287 // Jalview ensures all sequences are of same length so no need
288 // to keep track of min/max length
289 sequenceLength = s.getLength();
290 // interleaved breaks the sequence into chunks for
291 // interleavedColumns characters
292 sb.append(s.getSequence(0,
293 Math.min(numInterleavedColumns, sequenceLength)));
298 // add the remaining matrixes if interleaved and there is something to
300 if (!sequential && sequenceLength > numInterleavedColumns)
302 // determine number of remaining matrixes
303 int numMatrics = sequenceLength / numInterleavedColumns;
304 if ((sequenceLength % numInterleavedColumns) > 0)
309 // start i = 1 as first matrix has already been printed
310 for (int i = 1; i < numMatrics; i++)
312 // add blank line to separate this matrix from previous
314 int start = i * numInterleavedColumns;
315 for (SequenceI s : sqs)
317 sb.append(s.getSequence(start,
318 Math.min(start + numInterleavedColumns, sequenceLength)))
325 return sb.toString();