X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FPhylipFile.java;fp=src%2Fjalview%2Fio%2FPhylipFile.java;h=ce65eea1486deb4d83e5f4924c1866896583859e;hb=77abb3fac2965a8966410cd77cd749c7c1dc6453;hp=0000000000000000000000000000000000000000;hpb=c7b57deb6556cd548e32b10035463e4ea1900db9;p=jalview.git diff --git a/src/jalview/io/PhylipFile.java b/src/jalview/io/PhylipFile.java new file mode 100644 index 0000000..ce65eea --- /dev/null +++ b/src/jalview/io/PhylipFile.java @@ -0,0 +1,320 @@ +/** + * + */ +package jalview.io; + +import jalview.datamodel.Alignment; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceI; + +import java.io.IOException; + +/** + *

+ * Parser and exporter for PHYLIP file format, as defined in the + * documentation. The parser imports PHYLIP files in both sequential and + * interleaved format, and (currently) exports in interleaved format (using 60 + * characters per matrix for the sequence). + *

+ * + *

+ * The following assumptions have been made for input + *

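+ * + * The following assumptions have been made for output: interleaved format is used, with each matrix consisting of 60 characters; a blank line is added between each matrix; no spacing is added between the sequence characters. + * + * + * + *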

+ * + * @author David Corsar + * + * + */ +public class PhylipFile extends AlignFile +{ + + // Define file extension and description to save repeating it elsewhere + public static final String FILE_EXT = "phy"; + + public static final String FILE_DESC = "PHYLIP"; + + /** + * + * @see {@link AlignFile#AlignFile()} + */ + public PhylipFile() + { + super(); + } + + /** + * + * @param source + * @throws IOException + */ + public PhylipFile(FileParse source) throws IOException + { + super(source); + } + + /** + * @param inFile + * @param type + * @throws IOException + * @see {@link AlignFile#AlignFile(FileParse)} + */ + public PhylipFile(String inFile, String type) throws IOException + { + super(inFile, type); + } + + /** + * Parses the input source + * + * @see {@link AlignFile#parse()} + */ + @Override + public void parse() throws IOException + { + try + { + // First line should contain number of species and number of + // characters, separated by blanks + String line = nextLine(); + String[] lineElements = line.trim().split("\\s+"); + if (lineElements.length < 2) + { + throw new IOException( + "First line must contain the number of specifies and number of characters"); + } + + int numberSpecies = Integer.parseInt(lineElements[0]), numberCharacters = Integer + .parseInt(lineElements[1]); + + if (numberSpecies <= 0) + { + // there are no sequences in this file so exit a nothing to + // parse + return; + } + + SequenceI[] sequenceElements = new Sequence[numberSpecies]; + StringBuffer[] sequences = new StringBuffer[numberSpecies]; + + // if file is in sequential format there is only one data matrix, + // else there are multiple + + // read the first data matrix + for (int i = 0; i < numberSpecies; i++) + { + line = nextLine(); + // lines start with the name - a maximum of 10 characters + // if less, then padded out or terminated with a tab + String potentialName = line.substring(0, 10); + int tabIndex = potentialName.indexOf('\t'); + if (tabIndex == -1) + { + 
sequenceElements[i] = parseId(validateName(potentialName)); + sequences[i] = new StringBuffer( + removeWhitespace(line.substring(10))); + } + else + { + sequenceElements[i] = parseId(validateName(potentialName + .substring(0, tabIndex))); + sequences[i] = new StringBuffer( + removeWhitespace(line.substring(tabIndex))); + } + } + + // determine if interleaved + if ((sequences[0]).length() != numberCharacters) + { + // interleaved file, so have to read the remainder + int i = 0; + for (line = nextLine(); line != null; line = nextLine()) + { + // ignore blank lines, as defined by the specification + if (line.length() > 0) + { + sequences[i++].append(removeWhitespace(line)); + } + // reached end of matrix, so get ready for the next one + if (i == sequences.length) + { + i = 0; + } + } + } + + // file parsed completely, now store sequences + for (int i = 0; i < numberSpecies; i++) + { + // first check sequence is the expected length + if (sequences[i].length() != numberCharacters) + { + throw new IOException(sequenceElements[i].getName() + + " sequence is incorrect length - should be " + + numberCharacters + " but is " + sequences[i].length()); + } + sequenceElements[i].setSequence(sequences[i].toString()); + seqs.add(sequenceElements[i]); + } + + // create an alignment based on the sequences + Alignment a = new Alignment(sequenceElements); + // add annotations - although comments say addAnnotations + // is used by AppletFormatAdapter, it doesn't say other + // classes should/can not use it + addAnnotations(a); + + } catch (IOException e) + { + System.err.println("Exception parsing PHYLIP file " + e); + e.printStackTrace(System.err); + throw e; + } + + } + + /** + * Removes any whitespace from txt, used to strip and spaces added to + * sequences to improve human readability + * + * @param txt + * @return + */ + private String removeWhitespace(String txt) + { + return txt.replaceAll("\\s*", ""); + } + + /** + * According to the specification, the name cannot have 
parentheses, square + * brackets, colon, semicolon, comma + * + * @param name + * @return + * @throws IOException + */ + private String validateName(String name) throws IOException + { + char[] invalidCharacters = new char[] + { '(', ')', '[', ']', ':', ';', ',' }; + for (char c : invalidCharacters) + { + if (name.indexOf(c) > -1) + { + throw new IOException("Species name contains illegal character " + + c); + } + } + return name; + } + + /** + *

+ * Prints the seqs in interleaved format, with each matrix consisting of 60 + * characters; a blank line is added between each matrix; no spacing is added + * between the sequence characters. + *

+ * + * + * @see {@link AlignFile#print()} + */ + @Override + public String print() + { + + StringBuffer sb = new StringBuffer(Integer.toString(seqs.size())); + sb.append(" "); + // if there are no sequences, then define the number of characters as 0 + sb.append( + (seqs.size() > 0) ? Integer + .toString(seqs.get(0).getSequence().length) : "0") + .append(newline); + + // Due to how IO is handled, there doesn't appear to be a way to store + // if the original file was sequential or interleaved; if there is, then + // use that to set the value of the following variable + boolean sequential = false; + + // maximum number of columns for each row of interleaved format + int numInterleavedColumns = 60; + + int sequenceLength = 0; + for (SequenceI s : seqs) + { + + // ensure name is only 10 characters + String name = s.getName(); + if (name.length() > 10) + { + name = name.substring(0, 10); + } + else + { + // add padding 10 characters + name = String.format("%1$-" + 10 + "s", s.getName()); + } + sb.append(name); + + // sequential has the entire sequence following the name + if (sequential) + { + sb.append(s.getSequence()); + } + else + { + // Jalview ensures all sequences are of same length so no need + // to keep track of min/max length + sequenceLength = s.getSequence().length; + // interleaved breaks the sequence into chunks for + // interleavedColumns characters + sb.append(s.getSequence(0, + Math.min(numInterleavedColumns, sequenceLength))); + } + sb.append(newline); + } + + // add the remaining matrixes if interleaved and there is something to + // add + if (!sequential && sequenceLength > numInterleavedColumns) + { + // determine number of remaining matrixes + int numMatrics = sequenceLength / numInterleavedColumns; + if ((sequenceLength % numInterleavedColumns) > 0) + { + numMatrics++; + } + + // start i = 1 as first matrix has already been printed + for (int i = 1; i < numMatrics; i++) + { + // add blank line to separate this matrix from previous + 
sb.append(newline); + int start = i * numInterleavedColumns; + for (SequenceI s : seqs) + { + sb.append( + s.getSequence(start, Math.min(start + + numInterleavedColumns, sequenceLength))) + .append(newline); + } + } + + } + + return sb.toString(); + } +} \ No newline at end of file