/*
 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
 * Copyright (C) 2014 The Jalview Authors
 *
 * This file is part of Jalview.
 *
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 *
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE.  See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
package jalview.io;

import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

/**
 * Parser and writer for sequence alignments in MEGA format.
 *
 * Parsing accepts both interleaved data (each line is "#id data") and
 * noninterleaved data (a "#id" line followed by lines of data), but not a mix
 * of the two in one file. Header lines (Title, Format, Description, Gene) are
 * saved as alignment properties.
 */
public class MegaFile extends AlignFile {
  /*
   * Simple file format as at
   * http://www.hiv.lanl.gov/content/sequence/HelpDocs/SEQsamples.html
   *
   * Fancy file format as at
   * http://primerdigital.com/fastpcr/images/Drosophila_Adh.txt
   */
  public enum FileFormat {
    SIMPLE, FANCY
  }

  private static final String HASHSIGN = "#";

  // TODO: public constants file
  private static final String COLON = ":";

  private static final String BANG = "!";

  private static final String EQUALS = "=";

  private static final String MEGA_ID = HASHSIGN + "MEGA";

  public static final String PROP_TITLE = "TITLE";

  public static final String PROP_FORMAT = "Format";

  public static final String PROP_DESCRIPTION = "Description";

  public static final String PROP_GENE = "Gene";

  public static final String PROP_INTERLEAVED = "Interleaved";

  // initial size for sequence data buffer
  private static final int SEQBUFFERSIZE = 256;

  private static final String SPACE = " ";

  private static final int POSITIONS_PER_LINE = 50;

  // this can be True, False or null (meaning we don't know yet)
  private Boolean interleaved;

  // set once we have seen one block of interleaved data
  private boolean firstDataBlockRead = false;

  private FileFormat fileFormat;

  public MegaFile() {
  }

  public MegaFile(String inFile, String type) throws IOException {
    super(inFile, type);
  }

  public MegaFile(FileParse source) throws IOException {
    super(source);
  }

  /**
   * Parse the input stream: header lines first, then sequence data lines until
   * end of input, then convert the accumulated data to sequences.
   *
   * @throws IOException
   *           if the data is malformed (see the parseXxx methods)
   */
  @Override
  public void parse() throws IOException {
    /*
     * Read MEGA and Title/Format/Description/Gene headers if present. These are
     * saved as alignment properties. Returns the first sequence data line
     */
    String dataLine = parseHeaderLines();

    /*
     * If we didn't positively identify as 'fancy format', assume 'simple
     * format'
     */
    if (this.fileFormat == null) {
      setFileFormat(FileFormat.SIMPLE);
    }

    /*
     * Temporary store of {sequenceId, positionData} while parsing; a
     * LinkedHashMap so that sequences keep their order of first appearance
     */
    Map<String, StringBuilder> seqData = new LinkedHashMap<String, StringBuilder>();

    /*
     * The id of the sequence being read (for non-interleaved); null until a
     * sequence id has been seen, so that orphan data lines can be detected
     */
    String currentId = null;

    while (dataLine != null) {
      dataLine = dataLine.trim();
      if (dataLine.length() > 0) {
        currentId = parseDataLine(dataLine, seqData, currentId);
      } else if (!seqData.isEmpty()) {
        /*
         * Blank line after processing some data...
         */
        this.firstDataBlockRead = true;
      }
      dataLine = nextLine();
    }

    setSequences(seqData);
  }

  /**
   * Convert the parsed sequence strings to objects and store them in the model.
   *
   * @param seqData
   *          map of sequence id to sequence characters, in output order
   */
  protected void setSequences(Map<String, StringBuilder> seqData) {
    for (Entry<String, StringBuilder> dataset : seqData.entrySet()) {
      SequenceI s = new Sequence(dataset.getKey(), dataset.getValue().toString());
      this.seqs.addElement(s);
    }
  }

  /**
   * Process one line of sequence data. If it has no sequence identifier, append
   * to the current id's sequence. Else parse out the sequence id and append the
   * data (if any) to that id's sequence. Returns the sequence id (implicit or
   * explicit) for this line.
   *
   * @param dataLine
   *          a trimmed, non-empty line
   * @param seqData
   *          the sequence data read so far, keyed by sequence id
   * @param currentId
   *          the id of the sequence that data-only lines belong to
   * @return the sequence id this line applies to
   * @throws IOException
   *           if the line is inconsistent with the data read so far
   */
  protected String parseDataLine(String dataLine, Map<String, StringBuilder> seqData,
      String currentId) throws IOException {
    String seqId = getSequenceId(dataLine);
    if (seqId == null) {
      /*
       * Just character data
       */
      parseNoninterleavedDataLine(dataLine, seqData, currentId);
      return currentId;
    } else if ((HASHSIGN + seqId).trim().equals(dataLine.trim())) {
      /*
       * Sequence id only - header line for noninterleaved data
       */
      return seqId;
    } else {
      /*
       * Sequence id followed by data
       */
      parseInterleavedDataLine(dataLine, seqData, seqId);
      return seqId;
    }
  }

  /**
   * Add a line of sequence data to the buffer for the given sequence id. Start
   * a new one if we haven't seen it before.
   *
   * @param dataLine
   *          a line holding sequence characters only
   * @param seqData
   *          the sequence data read so far, keyed by sequence id
   * @param currentId
   *          the id of the sequence the data belongs to
   * @throws IOException
   *           if there is no (or an empty) current sequence id, or if
   *           interleaved data has already been read
   */
  protected void parseNoninterleavedDataLine(String dataLine,
      Map<String, StringBuilder> seqData, String currentId) throws IOException {
    if (currentId == null || currentId.length() == 0) {
      /*
       * Oops. Data but no sequence id context.
       */
      throw new IOException("No sequence id context at: " + dataLine);
    }

    assertInterleaved(false, dataLine);

    /*
     * Add the current line of data to the sequence.
     */
    getSequenceDataBuffer(seqData, currentId).append(dataLine);
  }

  /**
   * Get the sequence data for this sequence id, starting a new one if
   * necessary.
   *
   * @param seqData
   *          the sequence data read so far, keyed by sequence id
   * @param currentId
   *          the sequence id to look up
   * @return the (possibly newly created and registered) buffer for the id
   */
  protected StringBuilder getSequenceDataBuffer(Map<String, StringBuilder> seqData,
      String currentId) {
    StringBuilder sb = seqData.get(currentId);
    if (sb == null) {
      // first data met for this sequence id, start a new buffer
      sb = new StringBuilder(SEQBUFFERSIZE);
      seqData.put(currentId, sb);
    }
    return sb;
  }

  /**
   * Parse one line of interleaved data e.g.
   *
   * <pre>
   * #TheSeqId CGATCGCATGCA
   * </pre>
   *
   * @param dataLine
   *          the whole line, starting with '#' and the sequence id
   * @param seqData
   *          the sequence data read so far, keyed by sequence id
   * @param seqId
   *          the sequence id already extracted from the line
   * @throws IOException
   *           if a new sequence id appears after the first data block, or if
   *           noninterleaved data has already been read
   */
  protected void parseInterleavedDataLine(String dataLine,
      Map<String, StringBuilder> seqData, String seqId) throws IOException {
    /*
     * New sequence found in second or later data block - error.
     */
    if (this.firstDataBlockRead && !seqData.containsKey(seqId)) {
      throw new IOException("Parse error: misplaced new sequence starting at " + dataLine);
    }

    StringBuilder sb = getSequenceDataBuffer(seqData, seqId);

    // skip the '#' and the id to get at the data
    String data = dataLine.substring(seqId.length() + 1).trim();

    /*
     * Do nothing if this line is _only_ a sequence id with no data following.
     *
     * Remove any internal spaces (present in the 'fancy' file format)
     */
    if (data.length() > 0) {
      if (data.indexOf(SPACE) != -1) {
        data = data.replace(SPACE, "");
      }
      sb.append(data);
      assertInterleaved(true, dataLine);
    }
  }

  /**
   * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
   * identifier. Else returns null.
   *
   * @param dataLine
   *          the line to inspect (may be null)
   * @return the text between the leading '#' and the first space (or end of
   *         line), or null if the line is null or does not start with '#'
   */
  public static String getSequenceId(String dataLine) {
    // TODO refactor to a StringUtils type class
    if (dataLine == null || !dataLine.startsWith(HASHSIGN)) {
      return null;
    }
    int spacePos = dataLine.indexOf(SPACE);
    return spacePos == -1 ? dataLine.substring(1) : dataLine.substring(1, spacePos);
  }

  /**
   * Read the #MEGA and Title/Format/Description/Gene header lines (if present).
   *
   * Save as annotation properties in case useful. Any line starting with '!'
   * flags the file as 'fancy' format.
   *
   * @return the next non-blank line following the header lines, or null if the
   *         input is exhausted
   * @throws IOException
   */
  protected String parseHeaderLines() throws IOException {
    String inputLine = null;
    while ((inputLine = nextLine()) != null) {
      inputLine = inputLine.trim();

      /*
       * skip blank lines
       */
      if (inputLine.length() == 0) {
        continue;
      }

      if (inputLine.startsWith(BANG)) {
        setFileFormat(FileFormat.FANCY);
      }

      if (inputLine.startsWith(BANG + PROP_DESCRIPTION)) {
        parseDescriptionLines();
      } else if (isPropertyLine(inputLine)) {
        /*
         * If a property is matched, parse and save it.
         */
        String[] propertyValue = parsePropertyValue(inputLine);
        setAlignmentProperty(propertyValue[0], propertyValue[1]);
      } else if (!inputLine.toUpperCase(Locale.ROOT).startsWith(MEGA_ID)) {
        /*
         * Return the first 'data line' i.e. one that is not blank, #MEGA or
         * TITLE:
         */
        break;
      }
    }
    return inputLine;
  }

  /**
   * Read following lines until blank, appending each to the Description
   * property value.
   *
   * Assumes the !Description line itself does not include description text.
   *
   * Assumes the description is followed by a blank line (else we will consume
   * one too many).
   *
   * @throws IOException
   */
  protected void parseDescriptionLines() throws IOException {
    StringBuilder desc = new StringBuilder(256);
    String line = null;
    while ((line = nextLine()) != null) {
      if ("".equals(line.trim())) {
        break;
      }
      desc.append(line).append(newline);
    }
    setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
  }

  /**
   * Test whether the line holds an expected property declaration (Title,
   * Format, Description or Gene, with optional '!' prefix and ':' suffix).
   *
   * @param inputLine
   *          the line to test
   * @return true if the line matches one of the known properties
   */
  protected boolean isPropertyLine(String inputLine) {
    return lineMatchesFlag(inputLine, PROP_TITLE, BANG, COLON)
        || lineMatchesFlag(inputLine, PROP_FORMAT, BANG, COLON)
        || lineMatchesFlag(inputLine, PROP_DESCRIPTION, BANG, COLON)
        || lineMatchesFlag(inputLine, PROP_GENE, BANG, COLON);
  }

  /**
   * Helper method that extract the name and value of a property, assuming the
   * first space or equals sign is the separator.
   *
   * Thus "Description: Melanogaster" or "!Description=Melanogaster" both return
   * {"Description", "Melanogaster"}.
   *
   * Returns an empty value string if no space or equals sign is present. A
   * leading '!' and a trailing ':' are stripped from the property name.
   *
   * @param s
   *          the property line (may be null)
   * @return a two element array {name, value}; the name is null if s is null
   */
  public static String[] parsePropertyValue(String s) {
    // TODO refactor to a string utils helper class (or find equivalent)
    // TODO handle other cases e.g. "Description = Melanogaster"
    if (s == null) {
      // nothing to parse; avoid dereferencing a null property name below
      return new String[] { null, "" };
    }

    /*
     * the separator is whichever of space and equals sign comes first
     */
    int spacePos = s.indexOf(SPACE);
    int eqPos = s.indexOf(EQUALS);
    int separatorPos;
    if (spacePos == -1) {
      separatorPos = eqPos;
    } else if (eqPos == -1) {
      separatorPos = spacePos;
    } else {
      separatorPos = Math.min(spacePos, eqPos);
    }

    String propertyName = s;
    String value = "";
    if (separatorPos > -1) {
      value = s.substring(separatorPos + 1);
      propertyName = s.substring(0, separatorPos);
    }

    /*
     * finally strip any leading / trailing chars from property name
     */
    if (propertyName.startsWith(BANG)) {
      propertyName = propertyName.substring(1);
    }
    if (propertyName.endsWith(COLON)) {
      propertyName = propertyName.substring(0, propertyName.length() - 1);
    }

    return new String[] { propertyName, value };
  }

  /**
   * Test whether a line starts with the specified flag field followed by a
   * space (or nothing).
   *
   * Here we accept an optional prefix and suffix on the flag, and the check is
   * not case-sensitive. So these would match for "Title"
   *
   * <pre>
   * Title Melanogaster
   * Title=Melanogaster
   * TITLE Melanogaster
   * TITLE=Melanogaster
   * !Title Melanogaster
   * !Title=Melanogaster
   * !TITLE Melanogaster
   * !TITLE=Melanogaster
   * Title: Melanogaster
   * Title:=Melanogaster
   * TITLE: Melanogaster
   * TITLE:=Melanogaster
   * !Title: Melanogaster
   * !Title:=Melanogaster
   * !TITLE: Melanogaster
   * !TITLE:=Melanogaster
   * Title
   * TITLE
   * !Title
   * !TITLE
   * </pre>
   *
   * A line where the flag and a non-empty suffix are followed directly by
   * other text (e.g. "Title:Melanogaster") also matches.
   *
   * @param line
   *          the line to test; surrounding whitespace is ignored
   * @param flag
   *          the flag (property name) to look for
   * @param prefix
   *          optional text that may precede the flag (null or empty for none)
   * @param suffix
   *          optional text that may follow the flag (null or empty for none)
   * @return true if the line matches, false if not or if line or flag is null
   */
  public static boolean lineMatchesFlag(String line, String flag, String prefix,
      String suffix) {
    // TODO refactor to a string utils helper class
    if (line == null || flag == null) {
      return false;
    }

    // Locale.ROOT so that matching does not depend on the default locale
    String lineUpper = line.toUpperCase(Locale.ROOT).trim();
    String flagUpper = flag.toUpperCase(Locale.ROOT);

    // skip prefix e.g. ! before attempting match
    if (prefix != null && prefix.length() > 0) {
      String prefixUpper = prefix.toUpperCase(Locale.ROOT);
      if (lineUpper.startsWith(prefixUpper)) {
        // strip the whole prefix, however long it is
        lineUpper = lineUpper.substring(prefixUpper.length());
      }
    }

    // flag only, or flag + SPACE, or flag + EQUALS
    if (lineUpper.equals(flagUpper) || lineUpper.startsWith(flagUpper + SPACE)
        || lineUpper.startsWith(flagUpper + EQUALS)) {
      return true;
    }

    // flag + suffix, whatever follows
    if (suffix != null && suffix.length() > 0) {
      return lineUpper.startsWith(flagUpper + suffix.toUpperCase(Locale.ROOT));
    }
    return false;
  }

  /**
   * Write out the alignment sequences in Mega format.
   */
  @Override
  public String print() {
    return print(getSeqsAsArray());
  }

  /**
   * Write out the alignment sequences in Mega format - interleaved unless
   * explicitly noninterleaved.
   *
   * @param s
   *          the sequences to write
   * @return the formatted output
   */
  public String print(SequenceI[] s) {
    // TODO: is there a way to preserve the 'interleaved' property so it can
    // affect output?
    if (this.fileFormat == FileFormat.FANCY) {
      return printInterleavedCodons(s);
    }
    if (this.interleaved != null && !this.interleaved.booleanValue()) {
      return printNonInterleaved(s);
    }
    return printInterleaved(s);
  }

  /**
   * Print the sequences in interleaved format, each row 15 space-separated
   * triplets.
   *
   * @param s
   *          the sequences to write
   * @return the formatted output
   */
  protected String printInterleavedCodons(SequenceI[] s) {
    // TODO not coded yet - defaulting to the 'simple' format output
    return printInterleaved(s);
  }

  /**
   * Print to string in Interleaved format - blocks of next 50 characters of
   * each sequence in turn.
   *
   * @param s
   *          the sequences to write
   * @return the formatted output
   */
  protected String printInterleaved(SequenceI[] s) {
    int maxIdLength = getMaxIdLength(s);
    int maxSequenceLength = getMaxSequenceLength(s);
    int numLines = maxSequenceLength / POSITIONS_PER_LINE + 3; // approx

    /*
     * Size a buffer to hold the whole output
     */
    StringBuilder sb = new StringBuilder(numLines * (maxIdLength + 2 + POSITIONS_PER_LINE));
    printHeaders(sb, FileFormat.SIMPLE);

    /*
     * Format the (padded) sequence ids once only. A field width of zero is not
     * a valid format, so use at least 1 in case all ids are empty.
     */
    String idFormat = "#%-" + Math.max(1, maxIdLength) + "s ";
    String[] seqIds = new String[s.length];
    for (int j = 0; j < s.length; j++) {
      seqIds[j] = String.format(idFormat, s[j].getName());
    }

    int numDataBlocks = (maxSequenceLength - 1) / POSITIONS_PER_LINE + 1;
    for (int i = 0; i < numDataBlocks; i++) {
      sb.append(newline);
      int blockStart = i * POSITIONS_PER_LINE;
      for (int j = 0; j < s.length; j++) {
        sb.append(seqIds[j]);
        sb.append(s[j].getSequence(blockStart, blockStart + POSITIONS_PER_LINE));
        sb.append(newline);
      }
    }

    return sb.toString();
  }

  /**
   * Append the MEGA header and any other known properties. Only properties
   * whose key and value are both Strings are written.
   *
   * @param sb
   *          the buffer to append to
   * @param format
   *          FANCY to write "!key value", otherwise "key: value" is written
   */
  private void printHeaders(StringBuilder sb, FileFormat format) {
    sb.append(MEGA_ID);
    sb.append(newline);

    Set<? extends Entry<?, ?>> props = getAlignmentProperties();
    if (props == null) {
      return;
    }
    for (Entry<?, ?> prop : props) {
      Object key = prop.getKey();
      Object value = prop.getValue();
      if (key instanceof String && value instanceof String) {
        if (format == FileFormat.FANCY) {
          sb.append(BANG).append(key).append(SPACE).append(value);
        } else {
          sb.append(key).append(COLON).append(SPACE).append(value);
        }
        sb.append(newline);
      }
    }
  }

  /**
   * Get the longest sequence id (to allow aligned printout).
   *
   * @param s
   *          the sequences to inspect
   * @return the length of the longest sequence name, or 0 if there are none
   */
  protected static int getMaxIdLength(SequenceI[] s) {
    // TODO pull up for reuse
    int maxLength = 0;
    for (SequenceI seq : s) {
      maxLength = Math.max(maxLength, seq.getName().length());
    }
    return maxLength;
  }

  /**
   * Get the longest sequence length
   *
   * @param s
   *          the sequences to inspect
   * @return the length of the longest sequence, or 0 if there are none
   */
  protected static int getMaxSequenceLength(SequenceI[] s) {
    // TODO pull up for reuse
    int maxLength = 0;
    for (SequenceI seq : s) {
      maxLength = Math.max(maxLength, seq.getLength());
    }
    return maxLength;
  }

  /**
   * Print to string in noninterleaved format - all of each sequence in turn, in
   * blocks of 50 characters.
   *
   * @param s
   *          the sequences to write
   * @return the formatted output
   */
  protected String printNonInterleaved(SequenceI[] s) {
    int maxSequenceLength = getMaxSequenceLength(s);
    // approx
    int numLines = maxSequenceLength / POSITIONS_PER_LINE + 2 + s.length;

    /*
     * Roughly size a buffer to hold the whole output
     */
    StringBuilder sb = new StringBuilder(numLines * POSITIONS_PER_LINE);
    printHeaders(sb, FileFormat.SIMPLE);

    for (SequenceI seq : s) {
      sb.append(newline);
      sb.append(HASHSIGN).append(seq.getName()).append(newline);

      /*
       * strictly less than: there is no data left to write once startPos
       * reaches the sequence length, so don't output an empty line
       */
      int seqLength = seq.getLength();
      for (int startPos = 0; startPos < seqLength; startPos += POSITIONS_PER_LINE) {
        sb.append(seq.getSequence(startPos, startPos + POSITIONS_PER_LINE));
        sb.append(newline);
      }
    }

    return sb.toString();
  }

  /**
   * Flag this file as interleaved or not, based on data format. Throws an
   * exception if has previously been determined to be otherwise.
   *
   * @param isIt
   *          true if the current line is interleaved data
   * @param dataLine
   *          the current line (reported in any error message)
   * @throws IOException
   *           if the file was previously flagged the other way
   */
  protected void assertInterleaved(boolean isIt, String dataLine) throws IOException {
    if (this.interleaved != null && isIt != this.interleaved.booleanValue()) {
      throw new IOException(
          "Parse error: mix of interleaved and noninterleaved detected, at line: " + dataLine);
    }
    this.interleaved = Boolean.valueOf(isIt);
  }

  /**
   * Answers true if data read so far was interleaved, false if it was not or
   * if this is not (yet) known.
   */
  public boolean isInterleaved() {
    return this.interleaved != null && this.interleaved.booleanValue();
  }

  public FileFormat getFileFormat() {
    return this.fileFormat;
  }

  public void setFileFormat(FileFormat fileFormat) {
    this.fileFormat = fileFormat;
  }
}