X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FMegaFile.java;fp=src%2Fjalview%2Fio%2FMegaFile.java;h=f6ff645ceb9b70e8143cfa0de546fa4416c187b9;hb=6a495a81e764c82f9fdfb6b3f990b962a5b85286;hp=0000000000000000000000000000000000000000;hpb=5776f8fbf1b6c23caec6cd23d76902d228119332;p=jalview.git diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java new file mode 100644 index 0000000..f6ff645 --- /dev/null +++ b/src/jalview/io/MegaFile.java @@ -0,0 +1,784 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1) + * Copyright (C) 2014 The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>. + * The Jalview Authors are detailed in the 'AUTHORS' file. 
+ */ +package jalview.io; + +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceI; + +import java.io.IOException; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +public class MegaFile extends AlignFile +{ + /* + * Simple file format as at + * http://www.hiv.lanl.gov/content/sequence/HelpDocs/SEQsamples.html + * + * Fancy file format as at + * http://primerdigital.com/fastpcr/images/Drosophila_Adh.txt + */ + public enum FileFormat + { + SIMPLE, FANCY + } + + private static final String HASHSIGN = "#"; // TODO: public constants file + + private static final String COLON = ":"; + + private static final String BANG = "!"; + + private static final String EQUALS = "="; + + private static final String MEGA_ID = HASHSIGN + "MEGA"; + + public static final String PROP_TITLE = "TITLE"; + + public static final String PROP_FORMAT = "Format"; + + public static final String PROP_DESCRIPTION = "Description"; + + public static final String PROP_GENE = "Gene"; + + public static final String PROP_INTERLEAVED = "Interleaved"; + + // initial size for sequence data buffer + private static final int SEQBUFFERSIZE = 256; + + private static final String SPACE = " "; + + private static final int POSITIONS_PER_LINE = 50; + + // this can be True, False or null (meaning we don't know yet) + private Boolean interleaved; + + // set once we have seen one block of interleaved data + private boolean firstDataBlockRead = false; + + private FileFormat fileFormat; + + public MegaFile() + { + } + + public MegaFile(String inFile, String type) throws IOException + { + super(inFile, type); + } + + public MegaFile(FileParse source) throws IOException + { + super(source); + } + + /** + * Parse the input stream. + */ + @Override + public void parse() throws IOException + { + /* + * Read MEGA and Title/Format/Description/Gene headers if present. These are + * saved as alignment properties. 
Returns the first sequence data line + */ + String dataLine = parseHeaderLines(); + + /* + * If we didn't positively identify as 'fancy format', assume 'simple + * format' + */ + if (this.fileFormat == null) + { + setFileFormat(FileFormat.SIMPLE); + } + + /* + * Temporary store of {sequenceId, positionData} while parsing appending + */ + Map seqData = new LinkedHashMap(); + + /* + * The id of the sequence being read (for non-interleaved) + */ + String currentId = ""; + + while (dataLine != null) + { + dataLine = dataLine.trim(); + if (dataLine.length() > 0) + { + currentId = parseDataLine(dataLine, seqData, currentId); + } + else if (!seqData.isEmpty()) + { + /* + * Blank line after processing some data... + */ + this.firstDataBlockRead = true; + } + dataLine = nextLine(); + } + + setSequences(seqData); + } + + /** + * Convert the parsed sequence strings to objects and store them in the model. + * + * @param seqData + */ + protected void setSequences(Map seqData) + { + Set> datasets = seqData.entrySet(); + + for (Entry dataset : datasets) + { + String sequenceId = dataset.getKey(); + StringBuilder characters = dataset.getValue(); + SequenceI s = new Sequence(sequenceId, new String(characters)); + this.seqs.addElement(s); + } + } + + /** + * Process one line of sequence data. If it has no sequence identifier, append + * to the current id's sequence. Else parse out the sequence id and append the + * data (if any) to that id's sequence. Returns the sequence id (implicit or + * explicit) for this line. 
+ * + * @param dataLine + * @param seqData + * @param currentid + * @return + * @throws IOException + */ + protected String parseDataLine(String dataLine, + Map seqData, String currentId) + throws IOException + { + String seqId = getSequenceId(dataLine); + if (seqId == null) + { + /* + * Just character data + */ + parseNoninterleavedDataLine(dataLine, seqData, currentId); + return currentId; + } + else if ((HASHSIGN + seqId).trim().equals(dataLine.trim())) + { + /* + * Sequence id only - header line for noninterleaved data + */ + return seqId; + } + else + { + /* + * Sequence id followed by data + */ + parseInterleavedDataLine(dataLine, seqData, seqId); + return seqId; + } + } + + /** + * Add a line of sequence data to the buffer for the given sequence id. Start + * a new one if we haven't seen it before. + * + * @param dataLine + * @param seqData + * @param currentId + * @throws IOException + */ + protected void parseNoninterleavedDataLine(String dataLine, + Map seqData, String currentId) + throws IOException + { + if (currentId == null) + { + /* + * Oops. Data but no sequence id context. + */ + throw new IOException("No sequence id context at: " + dataLine); + } + + assertInterleaved(false, dataLine); + + StringBuilder sb = getSequenceDataBuffer(seqData, currentId); + + /* + * Add the current line of data to the sequence. + */ + sb.append(dataLine); + } + + /** + * Get the sequence data for this sequence id, starting a new one if + * necessary. + * + * @param seqData + * @param currentId + * @return + */ + protected StringBuilder getSequenceDataBuffer( + Map seqData, String currentId) + { + StringBuilder sb = seqData.get(currentId); + if (sb == null) + { + // first data met for this sequence id, start a new buffer + sb = new StringBuilder(SEQBUFFERSIZE); + seqData.put(currentId, sb); + } + return sb; + } + + /** + * Parse one line of interleaved data e.g. + * + *
+   * #TheSeqId CGATCGCATGCA
+   * 
+ * + * @param dataLine + * @param seqData + * @param seqId + * @throws IOException + */ + protected void parseInterleavedDataLine(String dataLine, + Map seqData, String seqId) + throws IOException + { + /* + * New sequence found in second or later data block - error. + */ + if (this.firstDataBlockRead && !seqData.containsKey(seqId)) + { + throw new IOException( + "Parse error: misplaced new sequence starting at " + dataLine); + } + + StringBuilder sb = getSequenceDataBuffer(seqData, seqId); + String data = dataLine.substring(seqId.length() + 1).trim(); + + /* + * Do nothing if this line is _only_ a sequence id with no data following. + * + * Remove any internal spaces (present in the 'fancy' file format) + */ + if (data != null && data.length() > 0) + { + if (data.indexOf(SPACE) != -1) + { + data = data.replace(SPACE, ""); + } + sb.append(data); + assertInterleaved(true, dataLine); + } + } + + /** + * If the line begins with (e.g.) "#abcde " then returns "abcde" as the + * identifier. Else returns null. + * + * @param dataLine + * @return + */ + public static String getSequenceId(String dataLine) + { + // TODO refactor to a StringUtils type class + if (dataLine != null) + { + if (dataLine.startsWith(HASHSIGN)) + { + int spacePos = dataLine.indexOf(" "); + return (spacePos == -1 ? dataLine.substring(1) : dataLine + .substring(1, spacePos)); + } + } + return null; + } + + /** + * Read the #MEGA and Title/Format/Description/Gene header lines (if present). + * + * Save as annotation properties in case useful. + * + * @return the next non-blank line following the header lines. 
+ * @throws IOException + */ + protected String parseHeaderLines() throws IOException + { + String inputLine = null; + while ((inputLine = nextLine()) != null) + { + inputLine = inputLine.trim(); + + /* + * skip blank lines + */ + if (inputLine.length() == 0) + { + continue; + } + + if (inputLine.startsWith(BANG)) + { + setFileFormat(FileFormat.FANCY); + } + + if (inputLine.startsWith(BANG + PROP_DESCRIPTION)) + { + parseDescriptionLines(); + } + + else if (isPropertyLine(inputLine)) + { + /* + * If a property is matched, parse and save it. + */ + String[] property_value = parsePropertyValue(inputLine); + setAlignmentProperty(property_value[0], property_value[1]); + } + else if (!inputLine.toUpperCase().startsWith(MEGA_ID)) + { + + /* + * Return the first 'data line' i.e. one that is not blank, #MEGA or + * TITLE: + */ + break; + } + } + return inputLine; + } + + /** + * Read following lines until blank, appending each to the Description + * property value. + * + * Assumes the !Description line itself does not include description text. + * + * Assumes the description is followed by a blank line (else we will consume + * one too many). + * + * @throws IOException + */ + protected void parseDescriptionLines() throws IOException + { + StringBuilder desc = new StringBuilder(256); + String line = null; + while ((line = nextLine()) != null) { + if ("".equals(line.trim())) + { + break; + } + desc.append(line).append(newline); + } + setAlignmentProperty(PROP_DESCRIPTION, desc.toString()); + } + + /** + * Test whether the line holds an expected property declaration. 
+ * + * @param inputLine + * @return + */ + protected boolean isPropertyLine(String inputLine) + { + if (lineMatchesFlag(inputLine, PROP_TITLE, BANG, COLON) + || lineMatchesFlag(inputLine, PROP_FORMAT, BANG, COLON) + || lineMatchesFlag(inputLine, PROP_DESCRIPTION, BANG, COLON) + || lineMatchesFlag(inputLine, PROP_GENE, BANG, COLON)) + { + return true; + } + return false; + } + + /** + * Helper method that extract the name and value of a property, assuming the + * first space or equals sign is the separator. + * + * Thus "Description: Melanogaster" or "!Description=Melanogaster" both return + * {"Description", "Melanogaster"}. + * + * Returns an empty value string if no space or equals sign is present. + * + * @param s + * @return + */ + public static String[] parsePropertyValue(String s) + { + // TODO refactor to a string utils helper class (or find equivalent) + // TODO handle other cases e.g. "Description = Melanogaster" + String propertyName = s; + String value = ""; + + int separatorPos = -1; + + if (s != null) + { + int spacePos = s.indexOf(SPACE); + int eqPos = s.indexOf(EQUALS); + if (spacePos == -1 && eqPos > -1) + { + separatorPos = eqPos; + } + else if (spacePos > -1 && eqPos == -1) + { + separatorPos = spacePos; + } + else if (spacePos > -1 && eqPos > -1) + { + separatorPos = Math.min(spacePos, eqPos); + } + } + if (separatorPos > -1) + { + value = s.substring(separatorPos + 1); + propertyName = s.substring(0, separatorPos); + } + + /* + * finally strip any leading / trailing chars from property name + */ + if (propertyName.startsWith(BANG)) + { + propertyName = propertyName.substring(1); + } + if (propertyName.endsWith(COLON)) + { + propertyName = propertyName.substring(0, propertyName.length() - 1); + } + + return new String[] + { propertyName, value }; + } + + /** + * Test whether a line starts with the specified flag field followed by a + * space (or nothing). 
+ * + * Here we accept an optional prefix and suffix on the flag, and the check is + * not case-sensitive. So these would match for "Title" + * + *
+   * Title Melanogaster
+   * Title=Melanogaster
+   * TITLE Melanogaster
+   * TITLE=Melanogaster
+   * !Title Melanogaster
+   * !Title=Melanogaster
+   * !TITLE Melanogaster
+   * !TITLE=Melanogaster
+   * Title: Melanogaster
+   * Title:=Melanogaster
+   * TITLE: Melanogaster
+   * TITLE:=Melanogaster
+   * !Title: Melanogaster
+   * !Title:=Melanogaster
+   * !TITLE: Melanogaster
+   * !TITLE:=Melanogaster
+   * Title
+   * TITLE
+   * !Title
+   * !TITLE
+   * 
+ * + * @param line + * @param flag + * @param prefix + * @param suffix + * @return + */ + public static boolean lineMatchesFlag(String line, String flag, String prefix, String suffix) + { + // TODO refactor to a string utils helper class + boolean result = false; + if (line != null && flag != null) { + String lineUpper = line.toUpperCase().trim(); + String flagUpper = flag.toUpperCase(); + + // skip prefix character e.g. ! before attempting match + if (lineUpper.startsWith(prefix)) { + lineUpper = lineUpper.substring(1); + } + + // test for flag + SPACE or flag + EQUALS, with or without suffix + if (lineUpper.startsWith(flagUpper + SPACE) + || lineUpper.startsWith(flagUpper + EQUALS) + || lineUpper.startsWith(flagUpper + suffix + SPACE) + || lineUpper.startsWith(flagUpper + suffix + EQUALS)) + { + result = true; + } + else + { + // test for exact match i.e. flag only on this line + if (lineUpper.equals(flagUpper) + || lineUpper.startsWith(flagUpper + suffix)) + { + result = true; + } + } + } + return result; + } + + /** + * Write out the alignment sequences in Mega format. + */ + @Override + public String print() + { + return print(getSeqsAsArray()); + } + + /** + * Write out the alignment sequences in Mega format - interleaved unless + * explicitly noninterleaved. + */ + public String print(SequenceI[] s) + { + // TODO: is there a way to preserve the 'interleaved' property so it can + // affect output? + + String result = null; + if (this.fileFormat == FileFormat.FANCY) + { + result = printInterleavedCodons(s); + } + else if (this.interleaved != null && !this.interleaved) + { + result = printNonInterleaved(s); + } + else + { + result = printInterleaved(s); + } + return result; + } + + /** + * Print the sequences in interleaved format, each row 15 space-separated + * triplets. 
+ * + * @param s + * @return + */ + protected String printInterleavedCodons(SequenceI[] s) + { + // TODO not coded yet - defaulting to the 'simple' format output + return printInterleaved(s); + } + + /** + * Print to string in Interleaved format - blocks of next 50 characters of + * each sequence in turn. + * + * @param s + */ + protected String printInterleaved(SequenceI[] s) + { + int maxIdLength = getMaxIdLength(s); + int maxSequenceLength = getMaxSequenceLength(s); + int numLines = maxSequenceLength / POSITIONS_PER_LINE + 3; // approx + + /* + * Size a buffer to hold the whole output + */ + StringBuilder sb = new StringBuilder(numLines + * (maxIdLength + 2 + POSITIONS_PER_LINE)); + printHeaders(sb, FileFormat.SIMPLE); + + int numDataBlocks = (maxSequenceLength - 1) / POSITIONS_PER_LINE + 1; + for (int i = 0; i < numDataBlocks; i++) + { + sb.append(newline); + for (SequenceI seq : s) + { + + String seqId = String.format("#%-" + maxIdLength + "s ", + seq.getName()); + char[] subSequence = seq.getSequence(i * POSITIONS_PER_LINE, + (i + 1) * POSITIONS_PER_LINE); + sb.append(seqId); + sb.append(subSequence); + sb.append(newline); + } + } + + return new String(sb); + } + + /** + * Append the MEGA header and any other known properties + * + * @param sb + */ + private void printHeaders(StringBuilder sb, FileFormat format) + { + sb.append(MEGA_ID); + sb.append(newline); + /* + * + */ + Set> props = getAlignmentProperties(); + if (props != null) + { + for (Entry prop : props) + { + Object key = prop.getKey(); + Object value = prop.getValue(); + if (key instanceof String && value instanceof String) + { + if (format == FileFormat.FANCY) + { + sb.append(BANG).append(key).append(SPACE).append(value); + } + else + { + sb.append(key).append(COLON).append(SPACE).append(value); + } + sb.append(newline); + } + } + } + } + + /** + * Get the longest sequence id (to allow aligned printout). 
+ * + * @param s + * @return + */ + protected static int getMaxIdLength(SequenceI[] s) + { + // TODO pull up for reuse + int maxLength = 0; + for (SequenceI seq : s) + { + int len = seq.getName().length(); + if (len > maxLength) + { + maxLength = len; + } + } + return maxLength; + } + + /** + * Get the longest sequence length + * + * @param s + * @return + */ + protected static int getMaxSequenceLength(SequenceI[] s) + { + // TODO pull up for reuse + int maxLength = 0; + for (SequenceI seq : s) + { + int len = seq.getLength(); + if (len > maxLength) + { + maxLength = len; + } + } + return maxLength; + } + + /** + * Print to string in noninterleaved format - all of each sequence in turn, in + * blocks of 50 characters. + * + * @param s + * @return + */ + protected String printNonInterleaved(SequenceI[] s) + { + int maxSequenceLength = getMaxSequenceLength(s); + // approx + int numLines = maxSequenceLength / POSITIONS_PER_LINE + 2 + s.length; + + /* + * Roughly size a buffer to hold the whole output + */ + StringBuilder sb = new StringBuilder(numLines * POSITIONS_PER_LINE); + printHeaders(sb, FileFormat.SIMPLE); + + for (SequenceI seq : s) + { + sb.append(newline); + sb.append(HASHSIGN + seq.getName()).append(newline); + int startPos = 0; + while (startPos <= seq.getLength()) + { + char[] subSequence = seq.getSequence(startPos, startPos + + POSITIONS_PER_LINE); + sb.append(subSequence); + sb.append(newline); + startPos += POSITIONS_PER_LINE; + } + } + + return new String(sb); + } + + /** + * Flag this file as interleaved or not, based on data format. Throws an + * exception if has previously been determined to be otherwise. 
+ * + * @param isIt + * @param dataLine + * @throws IOException + */ + protected void assertInterleaved(boolean isIt, String dataLine) + throws IOException + { + if (this.interleaved != null && isIt != this.interleaved.booleanValue()) + { + throw new IOException( + "Parse error: mix of interleaved and noninterleaved detected, at line: " + + dataLine); + } + this.interleaved = new Boolean(isIt); + } + + public boolean isInterleaved() + { + return this.interleaved == null ? false : this.interleaved + .booleanValue(); + } + + public FileFormat getFileFormat() + { + return this.fileFormat; + } + + public void setFileFormat(FileFormat fileFormat) + { + this.fileFormat = fileFormat; + } +}