/*
 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
 * Copyright (C) 2014 The Jalview Authors
 *
 * This file is part of Jalview.
 *
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 *
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
package jalview.io;

import jalview.datamodel.AlignmentI;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

/**
 * A parser for input or output of MEGA format files.
 * <p>
 * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
 * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
 * Evolution 30: 2725-2729.
 * <p>
 * MEGA file format is supported as described in
 * http://www.megasoftware.net/manual.pdf
 * <p>
 * Limitations (as implemented in this class):
 * <ul>
 * <li>the Missing= format keyword is saved as a property but otherwise not
 * supported (a warning is written to stderr)</li>
 * <li>an Identical= (MatchChar=) symbol other than '.' is accepted with a
 * warning</li>
 * <li>a Property= keyword inside a !Format statement is ignored</li>
 * <li>NSeqs / NTaxa / NSites values are accepted but not checked against the
 * data</li>
 * </ul>
 *
 * @see <a href="http://www.megasoftware.net/">http://www.megasoftware.net/</a>
 */
public class MegaFile extends AlignFile {
  private static final String WHITESPACE = "\\s+";

  private static final int DEFAULT_LINE_LENGTH = 60;

  private static final String INDENT = " ";

  private static final String N_SITES = "NSites";

  private static final String N_SEQS = "NSeqs";

  private static final String MISSING = "Missing";

  private static final String IDENTICAL = "Identical";

  private static final String INDEL = "Indel";

  private static final String CODETABLE = "CodeTable";

  private static final String PROTEIN = "Protein";

  private static final String NUCLEOTIDE = "Nucleotide";

  private static final String DATATYPE = "DataType";

  private static final char COMMENT_START = '[';

  private static final char COMMENT_END = ']';

  private static final String HASHSIGN = "#";

  private static final String SEMICOLON = ";";

  private static final String BANG = "!";

  private static final String EQUALS = "=";

  private static final String MEGA_ID = HASHSIGN + "MEGA";

  private static final String TITLE = "Title";

  private static final String FORMAT = "Format";

  private static final String DESCRIPTION = "Description";

  private static final String GENE = "Gene";

  private static final String DOMAIN = "Domain";

  private static final String PROPERTY = "Property";

  private static final String CODONSTART = "CodonStart";

  /*
   * names of properties to save to the alignment (may affect eventual output
   * format)
   */
  static final String PROP_TITLE = "MEGA_TITLE";

  static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";

  static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";

  static final String PROP_CODETABLE = "MEGA_CODETABLE";

  static final String PROP_IDENTITY = "MEGA_IDENTITY";

  static final String PROP_MISSING = "MEGA_MISSING";

  static final String PROP_DATATYPE = "MEGA_DATATYPE";

  // number of bases per line of file (value is inferred)
  static final String PROP_LINELENGTH = "MEGA_LINELENGTH";

  // TODO: need a controlled name for Gene as a feature if we want to be able
  // to output the MEGA file with !Gene headers
  // TODO: decide what should happen if the sequences get realigned

  // initial size for sequence data buffer
  private static final int SEQBUFFERSIZE = 256;

  private static final String SPACE = " ";

  /*
   * number of sequence positions output per line
   */
  private int positionsPerLine;

  private String title;

  // gap character may be explicitly declared, default is -
  private char gapCharacter = '-';

  // identity character if declared
  private char identityCharacter = 0;

  // this can be True, False or null (meaning not asserted in file)
  private Boolean nucleotide;

  // set once we have seen one block of interleaved data
  private boolean firstDataBlockRead = false;

  // this can be True, False or null (meaning we don't know yet)
  private Boolean interleaved;

  // write end of line positions as a comment
  private boolean writePositionNumbers = true;

  // id of sequence being processed
  private String currentSequenceId;

  /*
   * Temporary store of {sequenceId, positionData} while parsing interleaved
   * sequences; sequences are maintained in the order in which they are added
   * i.e. read in the file
   */
  Map<String, StringBuilder> seqData;

  // number of residues read (so far) per sequence
  Map<String, Integer> residuesRead;

  // current Gene if any we are parsing
  private String currentGene;

  // start residue (base 1) per sequence of current gene
  Map<String, Integer> geneStart;

  // current Domain if any we are parsing
  private String currentDomain;

  // start residue (base 1) per sequence of current domain
  Map<String, Integer> domainStart;

  // map of SequenceFeature's by sequence id
  Map<String, List<SequenceFeature>> sequenceFeatures;

  public MegaFile() {
  }

  public MegaFile(String inFile, String type) throws IOException {
    super(inFile, type);
  }

  public MegaFile(FileParse source) throws IOException {
    super(source);
  }

  /**
   * Returns true if the string starts with the given prefix, ignoring case.
   * MEGA keywords are treated as case-insensitive throughout this parser.
   *
   * @param s
   *          the string to test (not null)
   * @param prefix
   *          the prefix to look for (not null)
   * @return true if s starts with prefix in any mix of upper/lower case
   */
  private static boolean startsWithIgnoreCase(String s, String prefix) {
    return s.regionMatches(true, 0, prefix, 0, prefix.length());
  }

  /**
   * Returns the string with any trailing whitespace removed (leading
   * whitespace is preserved).
   *
   * @param s
   *          the input string (not null)
   * @return s without trailing whitespace
   */
  private static String trimTrailing(String s) {
    int end = s.length();
    while (end > 0 && Character.isWhitespace(s.charAt(end - 1))) {
      end--;
    }
    return s.substring(0, end);
  }

  /**
   * Parse the input stream: header lines first, then data lines (sequence
   * data and !Gene / !Domain commands) until end of file, and finally convert
   * the accumulated data to sequences with their features.
   *
   * @throws IOException
   *           on a read error or unrecognised file content
   */
  @Override
  public void parse() throws IOException {
    gapCharacter = '-';
    sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
    geneStart = new HashMap<String, Integer>();
    domainStart = new HashMap<String, Integer>();
    residuesRead = new HashMap<String, Integer>();

    /*
     * Read and process MEGA and Title/Format/Description headers if present.
     * Returns the first data line following the headers.
     */
    String dataLine = parseHeaderLines();

    /*
     * order-preserving map to hold sequences by id as they are built up during
     * parsing
     */
    seqData = new LinkedHashMap<String, StringBuilder>();

    /*
     * The id of the sequence being read (for non-interleaved)
     */
    currentSequenceId = "";

    while (dataLine != null) {
      dataLine = dataLine.trim();
      if (dataLine.length() > 0) {
        /*
         * keywords are matched ignoring case, consistent with the key
         * matching done in parseGeneOrDomain
         */
        if (startsWithIgnoreCase(dataLine, BANG + GENE)
                || startsWithIgnoreCase(dataLine, BANG + DOMAIN)) {
          parseGeneOrDomain(dataLine);
        } else {
          currentSequenceId = parseDataLine(dataLine);
        }
      } else if (!seqData.isEmpty()) {
        /*
         * Blank line after processing some data...
         */
        endOfDataBlock();
      }
      dataLine = nextNonCommentLine();
    }

    /*
     * close off any features currently being parsed
     */
    createFeature(GENE, currentGene, geneStart);
    createFeature(DOMAIN, currentDomain, domainStart);

    // remember the (longest) line length read in, so we can output the same
    setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));

    deriveSequences();
  }

  /**
   * Post-processing after reading one block of interleaved data
   */
  protected void endOfDataBlock() {
    this.firstDataBlockRead = true;
    // TODO:
    // (initialise and) populate arrays of sequence length so far (excluding
    // gaps)
    // On change or end of a denoted Gene or Domain, add sequence features for
    // it
  }

  /**
   * Parse a !Gene or !Domain command line. MEGA accepts
   * <ul>
   * <li>!Gene=name;</li>
   * <li>!Gene=name Property=Coding/Noncoding CodonStart=1/2/3;</li>
   * <li>!Gene=genename Domain=domainname Property= etc</li>
   * <li>!Domain=domainname Gene=genename Property= etc</li>
   * <li>!Domain=domainname Property= etc</li>
   * <li>!domain=domainname property=domainend</li>
   * </ul>
   * Properly, a Gene should be composed of Domain segments, but MEGA accepts
   * without. Note that keywords don't seem to be case sensitive.
   *
   * @param dataLine
   *          the (trimmed) command line, starting with '!' and ending with ';'
   * @throws FileFormatException
   *           if the line is not delimited by '!' and ';', or contains a token
   *           that is not of the form key=value
   */
  protected void parseGeneOrDomain(String dataLine)
          throws FileFormatException {
    String domain = null;
    String gene = null;
    String property = null;
    String codonStart = null;
    String errorMsg = "Unrecognized format: " + dataLine;

    if (!dataLine.startsWith(BANG) || !dataLine.endsWith(SEMICOLON)) {
      throw new FileFormatException(errorMsg);
    }
    // strip the leading '!' and trailing ';'
    String trimmed = dataLine.substring(1, dataLine.length() - 1).trim();
    String[] tokens = trimmed.split(WHITESPACE);
    for (String token : tokens) {
      String[] keyValue = token.split(EQUALS);
      if (keyValue.length != 2) {
        throw new FileFormatException(errorMsg);
      }
      String key = keyValue[0];
      if (GENE.equalsIgnoreCase(key)) {
        gene = keyValue[1];
      } else if (DOMAIN.equalsIgnoreCase(key)) {
        domain = keyValue[1];
      } else if (PROPERTY.equalsIgnoreCase(key)) {
        property = keyValue[1];
      } else if (CODONSTART.equalsIgnoreCase(key)) {
        codonStart = keyValue[1];
      } else {
        System.err.println("Unrecognised token: '" + key + "' in "
                + dataLine);
      }
    }

    processGeneOrDomain(gene, domain, property, codonStart);
  }

  /**
   * Process a statement containing one or both of Gene and Domain, and
   * optionally Property or CodonStart commands.
   *
   * @param gene
   *          the Gene name if specified, else null
   * @param domain
   *          the Domain name if specified, else null
   * @param property
   *          the Property value if specified, else null
   * @param codonStart
   *          the CodonStart value if specified, else null (currently unused)
   */
  protected void processGeneOrDomain(String gene, String domain,
          String property, String codonStart) {
    boolean domainEnd = "domainend".equalsIgnoreCase(property);

    /*
     * If we have been processing a Domain or Gene, and this does not continue
     * it, then close it off (generate sequence features for it). Do Domain
     * first as it is in the context of the enclosing gene if any.
     */
    if (this.currentDomain != null) {
      if (!this.currentDomain.equals(domain) || domainEnd) {
        String description = currentDomain
                + (currentGene == null ? "" : " (" + currentGene + ")");
        createFeature(DOMAIN, description, domainStart);
      }
    }
    if (this.currentGene != null && !this.currentGene.equals(gene)) {
      createFeature(GENE, currentGene, geneStart);
    }

    /*
     * and if we have declared a Gene or Domain which does not continue the
     * current one, then record its start positions per sequence
     */
    if (gene != null && !gene.equals(currentGene)) {
      startSequenceFeature(geneStart);
    }
    if (domain != null && !domain.equals(currentDomain)) {
      startSequenceFeature(domainStart);
    }

    currentGene = gene;
    currentDomain = domainEnd ? null : domain;
  }

  /**
   * Start processing a new feature: records, for each sequence with residues
   * read so far, the position of its next residue as the feature start.
   *
   * @param startPositions
   *          map of start residue (base 1) by sequence id, updated in place
   */
  protected void startSequenceFeature(Map<String, Integer> startPositions) {
    /*
     * If the feature declaration precedes all sequences, we will know in
     * createFeature that it started with residue 1; otherwise note now where it
     * starts in each sequence
     */
    for (Entry<String, Integer> entry : residuesRead.entrySet()) {
      Integer nextResidue = entry.getValue() + 1;
      startPositions.put(entry.getKey(), nextResidue);
    }
  }

  /**
   * Add a SequenceFeature to each sequence, using the given start/end values
   * per sequence. Does nothing if featureValue is null. The feature end is the
   * number of residues read so far for the sequence.
   *
   * @param featureType
   *          "Gene" or "Domain"
   * @param featureValue
   *          the name/description of the gene or domain; may be null
   * @param featureStartResidues
   *          start residue (base 1) by sequence id; a sequence with no entry
   *          is taken to start at residue 1
   */
  protected void createFeature(String featureType, String featureValue,
          Map<String, Integer> featureStartResidues) {
    if (featureValue == null) {
      return;
    }

    Iterator<String> seqids = this.seqData.keySet().iterator();
    while (seqids.hasNext()) {
      String seqid = seqids.next();
      Integer startAt = featureStartResidues.get(seqid);
      int sfstart = startAt == null ? 1 : startAt.intValue();

      /*
       * a sequence whose data so far is entirely gaps has no entry in
       * residuesRead - treat as zero residues (so no feature is added)
       */
      Integer endAt = residuesRead.get(seqid);
      int sfend = endAt == null ? 0 : endAt.intValue();

      if (sfend >= sfstart) {
        /*
         * don't add feature if entirely gapped in the sequence
         */
        // TODO: type="Gene" (but then all coloured the same) or
        // type="GeneName"?
        SequenceFeature sf = new SequenceFeature(featureValue, featureType,
                sfstart, sfend, 0f, null);
        sequenceFeatures.get(seqid).add(sf);
      }
    }
  }

  /**
   * Returns the next line that is not a comment, or null at end of file.
   * Comments in MEGA are within [ ] brackets, and may be nested.
   *
   * @return the next line with comments removed, or null at end of file
   * @throws IOException
   */
  protected String nextNonCommentLine() throws IOException {
    return nextNonCommentLine(0);
  }

  /**
   * Returns the next non-comment line (or part line), or null at end of file.
   * Comments in MEGA are within [ ] brackets, and may be nested. They may occur
   * anywhere within a line (for example at the end with position numbers); this
   * method returns the line with any comments removed.
   * <p>
   * Lines which end inside a (possibly nested) comment are skipped entirely;
   * reading continues until a line is found on which the comment has closed.
   * Implemented as a loop (not recursion) so that a long or unterminated
   * comment cannot exhaust the stack.
   *
   * @param depth
   *          current depth of nesting of comments while parsing
   * @return the next line with comments removed, or null at end of file
   * @throws IOException
   */
  protected String nextNonCommentLine(final int depth) throws IOException {
    int currentDepth = depth;
    while (true) {
      String data = nextLine();
      if (data == null) {
        if (currentDepth > 0) {
          System.err.println("Warning: unterminated comment in data file");
        }
        return null;
      }

      int newDepth = commentDepth(data, currentDepth);
      if (newDepth <= 0) {
        /*
         * not in a comment by end of this line; return what is left
         */
        return getNonCommentContent(data, currentDepth);
      }

      /*
       * still inside a comment after this line - keep reading
       */
      currentDepth = newDepth;
    }
  }

  /**
   * Returns what is left of the input data after removing any comments, whether
   * 'in progress' from preceding lines, or embedded in the current line. A
   * closing bracket with no matching opening bracket is kept as data.
   *
   * @param data
   *          input data
   * @param depth
   *          nested depth of comments pending termination
   * @return the data with comment text (and its brackets) removed
   * @throws FileFormatException
   */
  protected static String getNonCommentContent(String data, int depth)
          throws FileFormatException {
    int len = data.length();
    StringBuilder result = new StringBuilder(len);
    int level = depth;
    for (int i = 0; i < len; i++) {
      char c = data.charAt(i);
      if (c == COMMENT_START) {
        level++;
      } else if (c == COMMENT_END && level > 0) {
        level--;
      } else if (level == 0) {
        // includes an unmatched ']' outside any comment
        result.append(c);
      }
    }
    return result.toString();
  }

  /**
   * Calculates new depth of comment after parsing an input line i.e. the excess
   * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
   * treated as comment delimiters).
   *
   * @param data
   *          input line
   * @param depth
   *          current comment nested depth before parsing the line
   * @return new depth after parsing the line
   */
  protected static int commentDepth(CharSequence data, int depth) {
    int newDepth = depth;
    int len = data.length();
    for (int i = 0; i < len; i++) {
      char c = data.charAt(i);
      if (c == COMMENT_START) {
        newDepth++;
      } else if (c == COMMENT_END && newDepth > 0) {
        newDepth--;
      }
    }
    return newDepth;
  }

  /**
   * Convert the parsed sequence strings to objects and store them in the model.
   */
  protected void deriveSequences() {
    Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();

    for (Entry<String, StringBuilder> dataset : datasets) {
      String sequenceId = dataset.getKey();
      StringBuilder characters = dataset.getValue();
      SequenceI s = new Sequence(sequenceId, characters.toString());
      this.seqs.addElement(s);

      /*
       * and add any derived sequence features to the sequence
       */
      for (SequenceFeature sf : sequenceFeatures.get(sequenceId)) {
        s.addSequenceFeature(sf);
      }
    }
  }

  /**
   * Process one line of sequence data. If it has no sequence identifier, append
   * to the current id's sequence. Else parse out the sequence id and append the
   * data (if any) to that id's sequence. Returns the sequence id (implicit or
   * explicit) for this line.
   *
   * @param dataLine
   *          a non-blank line with comments removed
   * @return the id of the sequence this line belongs to
   * @throws IOException
   */
  protected String parseDataLine(String dataLine) throws IOException {
    String seqId = getSequenceId(dataLine);
    if (seqId == null) {
      /*
       * Just character data
       */
      parseNoninterleavedDataLine(dataLine);
      return currentSequenceId;
    } else if ((HASHSIGN + seqId).trim().equals(dataLine.trim())) {
      /*
       * Sequence id only - header line for noninterleaved data
       */
      return seqId;
    } else {
      /*
       * Sequence id followed by data
       */
      parseInterleavedDataLine(dataLine, seqId);
      return seqId;
    }
  }

  /**
   * Add a line of sequence data to the buffer for the current sequence id.
   * Start a new one if we haven't seen it before.
   *
   * @param dataLine
   * @throws IOException
   *           if there is no current sequence id, or the file has already
   *           been determined to be interleaved
   */
  protected void parseNoninterleavedDataLine(String dataLine)
          throws IOException {
    // NOTE(review): parse() initialises currentSequenceId to "" (not null), so
    // during a normal parse this check does not fire and data preceding any
    // "#id" line is stored under an empty id - confirm whether that is
    // intended
    if (currentSequenceId == null) {
      /*
       * Oops. Data but no sequence id context.
       */
      throw new IOException("No sequence id context at: " + dataLine);
    }

    assertInterleaved(false, dataLine);

    dataLine = addSequenceData(currentSequenceId, dataLine);

    setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
  }

  /**
   * Get the sequence data for this sequence id, starting a new one if
   * necessary.
   *
   * @param currentId
   * @return the (possibly newly created) data buffer for the sequence id
   */
  protected StringBuilder getSequenceDataBuffer(String currentId) {
    StringBuilder sb = seqData.get(currentId);
    if (sb == null) {
      // first data met for this sequence id, start a new buffer
      sb = new StringBuilder(SEQBUFFERSIZE);
      seqData.put(currentId, sb);

      // and a placeholder for any SequenceFeature found
      sequenceFeatures.put(currentId, new ArrayList<SequenceFeature>());
    }
    return sb;
  }

  /**
   * Parse one line of interleaved data e.g.
   *
   * <pre>
   * #TheSeqId CGATCGCATGCA
   * </pre>
   *
   * @param dataLine
   * @param seqId
   * @throws FileFormatException
   *           if a new sequence id is met after the first data block, or the
   *           file has already been determined to be non-interleaved
   */
  protected void parseInterleavedDataLine(String dataLine, String seqId)
          throws FileFormatException {
    /*
     * New sequence found in second or later data block - error.
     */
    if (this.firstDataBlockRead && !seqData.containsKey(seqId)) {
      throw new FileFormatException(
              "Parse error: misplaced new sequence starting at " + dataLine);
    }

    // skip the leading '#' and the sequence id
    String data = dataLine.substring(seqId.length() + 1).trim();

    /*
     * Do nothing if this line is _only_ a sequence id with no data following.
     */
    if (data.length() > 0) {
      data = addSequenceData(seqId, data);
      setPositionsPerLine(Math.max(positionsPerLine, data.length()));
      assertInterleaved(true, dataLine);
    }
  }

  /**
   * Remove spaces, and replace identity symbol, before appending the sequence
   * data to the buffer for the sequence id. Returns the reformatted added data.
   * Also updates a count of residues read for the sequence.
   *
   * @param seqId
   * @param data
   * @return the data as appended to the sequence buffer
   */
  protected String addSequenceData(String seqId, String data) {
    StringBuilder sb = getSequenceDataBuffer(seqId);
    int len = sb.length();
    String formatted = data.replace(SPACE, "");

    /*
     * If sequence contains '.' or other identity symbol; replace these with the
     * same position from the first (reference) sequence
     */
    int nonGapped = 0;
    StringBuilder referenceSequence = seqData.values().iterator().next();
    StringBuilder sb1 = new StringBuilder(formatted.length());
    for (int i = 0; i < formatted.length(); i++) {
      char nextChar = formatted.charAt(i);
      if (nextChar != gapCharacter) {
        nonGapped++;
      }
      if (nextChar == identityCharacter
              && len + i < referenceSequence.length()) {
        sb1.append(referenceSequence.charAt(len + i));
      } else {
        sb1.append(nextChar);
      }
    }
    formatted = sb1.toString();

    sb.append(formatted);

    /*
     * increment residue count for the sequence
     */
    if (nonGapped > 0) {
      Integer residueCount = residuesRead.get(seqId);
      residuesRead.put(seqId, nonGapped
              + (residueCount == null ? 0 : residueCount));
    }

    return formatted;
  }

  /**
   * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
   * identifier. Else returns null.
   *
   * @param dataLine
   * @return the sequence id, or null if the line does not start with '#'
   */
  public static String getSequenceId(String dataLine) {
    // TODO refactor to a StringUtils type class
    if (dataLine != null) {
      if (dataLine.startsWith(HASHSIGN)) {
        int spacePos = dataLine.indexOf(" ");
        return (spacePos == -1 ? dataLine.substring(1) : dataLine
                .substring(1, spacePos));
      }
    }
    return null;
  }

  /**
   * Read the #MEGA and Title/Format/Description header lines (if present).
   * Keywords are matched ignoring case.
   *
   * Save as alignment properties in case useful.
   *
   * @return the next non-blank line following the header lines, or null if end
   *         of file was reached
   * @throws IOException
   */
  protected String parseHeaderLines() throws IOException {
    String inputLine = null;
    while ((inputLine = nextNonCommentLine()) != null) {
      inputLine = inputLine.trim();

      /*
       * skip blank lines
       */
      if (inputLine.length() == 0) {
        continue;
      }

      if (startsWithIgnoreCase(inputLine, MEGA_ID)) {
        continue;
      }

      if (isTitle(inputLine)) {
        this.title = getValue(inputLine);
        setAlignmentProperty(PROP_TITLE, title);
      } else if (startsWithIgnoreCase(inputLine, BANG + DESCRIPTION)) {
        parseDescription(inputLine);
      } else if (startsWithIgnoreCase(inputLine, BANG + FORMAT)) {
        parseFormat(inputLine);
      } else {
        /*
         * Return the first 'data line' i.e. one that is not blank, #MEGA or
         * TITLE:
         */
        break;
      }
    }
    return inputLine;
  }

  /**
   * Parse a !Format statement. This may be multiline, and is ended by a
   * semicolon. Each line is trimmed before processing, so that trailing
   * whitespace does not hide the terminating semicolon.
   *
   * @param inputLine
   *          the first line of the statement
   * @throws IOException
   */
  protected void parseFormat(String inputLine) throws IOException {
    while (inputLine != null) {
      inputLine = inputLine.trim();
      parseFormatLine(inputLine);
      if (inputLine.endsWith(SEMICOLON)) {
        break;
      }
      inputLine = nextNonCommentLine();
    }
  }

  /**
   * Parse one line of a !Format statement. This may contain one or more
   * keyword=value pairs. A leading "!Format" (any case) and a trailing
   * semicolon are removed first; a line with nothing else on it is ignored.
   *
   * @param inputLine
   * @throws FileFormatException
   */
  protected void parseFormatLine(String inputLine)
          throws FileFormatException {
    if (startsWithIgnoreCase(inputLine, BANG + FORMAT)) {
      inputLine = inputLine.substring((BANG + FORMAT).length());
    }
    if (inputLine.endsWith(SEMICOLON)) {
      inputLine = inputLine.substring(0, inputLine.length() - 1);
    }
    inputLine = inputLine.trim();
    if (inputLine.length() == 0) {
      return;
    }
    String[] tokens = inputLine.split(WHITESPACE);
    for (String token : tokens) {
      parseFormatKeyword(token);
    }
  }

  /**
   * Parse a Keyword=Value token. Possible keywords are
   * <ul>
   * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
   * <li>DataFormat= Interleaved, ?</li>
   * <li>NSeqs= number of sequences (synonym NTaxa)</li>
   * <li>NSites= number of bases / residues</li>
   * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
   * <li>Indel= gap character</li>
   * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
   * <li>Missing= missing data character</li>
   * <li>CodeTable= Standard, other (MEGA supports various)</li>
   * </ul>
   *
   * @param token
   * @throws FileFormatException
   *           if the token is not of the form keyword=value, or the DataType
   *           value is not recognised
   */
  protected void parseFormatKeyword(String token)
          throws FileFormatException {
    String msg = "Unrecognised Format command: " + token;
    String[] bits = token.split(EQUALS);
    if (bits.length != 2) {
      throw new FileFormatException(msg);
    }
    String keyword = bits[0];
    String value = bits[1];

    /*
     * Jalview will work out whether nucleotide or not anyway
     */
    if (keyword.equalsIgnoreCase(DATATYPE)) {
      if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
              || value.equalsIgnoreCase("Nucleotide")) {
        this.nucleotide = true;
        // alignment computes whether or not it is nucleotide when created
      } else if (value.equalsIgnoreCase(PROTEIN)) {
        this.nucleotide = false;
      } else {
        throw new FileFormatException(msg);
      }
      setAlignmentProperty(PROP_DATATYPE, value);
    }

    /*
     * accept non-Standard code table but save in case we want to disable
     * 'translate as cDNA'
     */
    else if (keyword.equalsIgnoreCase(CODETABLE)) {
      setAlignmentProperty(PROP_CODETABLE, value);
    }

    /*
     * save gap char to set later on alignment once created
     */
    else if (keyword.equalsIgnoreCase(INDEL)) {
      this.gapCharacter = value.charAt(0);
    }

    else if (keyword.equalsIgnoreCase(IDENTICAL)
            || keyword.equalsIgnoreCase("MatchChar")) {
      setAlignmentProperty(PROP_IDENTITY, value);
      this.identityCharacter = value.charAt(0);
      if (!".".equals(value)) {
        System.err.println("Warning: " + token
                + " not supported, Jalview uses '.' for identity");
      }
    }

    else if (keyword.equalsIgnoreCase(MISSING)) {
      setAlignmentProperty(PROP_MISSING, value);
      System.err.println("Warning: " + token + " not supported");
    }

    else if (keyword.equalsIgnoreCase(PROPERTY)) {
      // TODO: can Property appear in a Format command?
      // suspect this is a mistake in the manual
    }

    else if (!keyword.equalsIgnoreCase(N_SEQS)
            && !keyword.equalsIgnoreCase("NTaxa")
            && !keyword.equalsIgnoreCase(N_SITES)) {
      System.err.println("Warning: " + msg);
    }
  }

  /**
   * Returns the trimmed data on the line following either whitespace or '=',
   * with any trailing semi-colon removed<br>
   * So
   * <ul>
   * <li>Hello World</li>
   * <li>!Hello: \tWorld;</li>
   * <li>!Hello=World</li>
   * </ul>
   * should all return "World"
   *
   * @param inputLine
   * @return the value, an empty string if there is none, or null if the input
   *         is null
   */
  protected static String getValue(String inputLine) {
    if (inputLine == null) {
      return null;
    }
    String value = null;
    String s = inputLine.replaceAll("\t", " ").trim();

    /*
     * KEYWORD = VALUE should return VALUE
     */
    int equalsPos = s.indexOf("=");
    if (equalsPos >= 0) {
      value = s.substring(equalsPos + 1);
    } else {
      int spacePos = s.indexOf(' ');
      value = spacePos == -1 ? "" : s.substring(spacePos + 1);
    }
    value = value.trim();
    if (value.endsWith(SEMICOLON)) {
      value = value.substring(0, value.length() - 1).trim();
    }
    return value;
  }

  /**
   * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
   * sensitive). The latter is the official format, some older data file
   * examples have it without the !.
   *
   * @param inputLine
   * @return true if the line is a title line, false if not (or if null)
   */
  protected static boolean isTitle(String inputLine) {
    if (inputLine == null) {
      return false;
    }
    String upper = inputLine.toUpperCase();
    return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
            + TITLE.toUpperCase()));
  }

  /**
   * Reads lines until terminated by semicolon, appending each to the
   * Description property value. Trailing whitespace is ignored when looking
   * for the terminating semicolon.
   *
   * @param firstDescriptionLine
   *          the line starting with !Description
   * @throws IOException
   */
  protected void parseDescription(String firstDescriptionLine)
          throws IOException {
    StringBuilder desc = new StringBuilder(256);
    desc.append(getValue(firstDescriptionLine));
    if (!trimTrailing(firstDescriptionLine).endsWith(SEMICOLON)) {
      String line = nextNonCommentLine();
      while (line != null) {
        String rightTrimmed = trimTrailing(line);
        if (rightTrimmed.endsWith(SEMICOLON)) {
          desc.append(rightTrimmed.substring(0, rightTrimmed.length() - 1));
          break;
        } else if (line.length() > 0) {
          desc.append(line).append(newline);
        }
        line = nextNonCommentLine();
      }
    }
    setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
  }

  /**
   * Returns the alignment sequences in Mega format.
   */
  @Override
  public String print() {
    return MEGA_ID + newline + print(getSeqsAsArray());
  }

  /**
   * Write out the alignment sequences in Mega format - interleaved unless
   * explicitly noninterleaved.
   *
   * @param s
   *          the sequences to output
   * @return the formatted sequence data (without headers)
   */
  protected String print(SequenceI[] s) {
    String result;
    if (this.interleaved != null && !this.interleaved) {
      result = printNonInterleaved(s);
    } else {
      result = printInterleaved(s);
    }
    return result;
  }

  /**
   * Print to string in Interleaved format - blocks of next N characters of each
   * sequence in turn. If the number of positions per line has not been set (is
   * not positive), the default line length is used.
   *
   * @param s
   *          the sequences to output
   * @return the formatted sequence data
   */
  protected String printInterleaved(SequenceI[] s) {
    // guard against division by zero if no line length has been set
    final int perLine = positionsPerLine > 0 ? positionsPerLine
            : DEFAULT_LINE_LENGTH;

    // a zero width is not valid in the format string used below
    int maxIdLength = Math.max(1, getMaxIdLength(s));
    int maxSequenceLength = getMaxSequenceLength(s);
    int numLines = maxSequenceLength / perLine + 3; // approx
    int numDataBlocks = (maxSequenceLength - 1) / perLine + 1;
    int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
    int chunksPerLine = (perLine + spaceEvery - 1) / spaceEvery;

    /*
     * Roughly size a buffer to hold the whole output
     */
    StringBuilder sb = new StringBuilder(numLines
            * (maxIdLength + perLine + chunksPerLine + 10));

    /*
     * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
     */
    int from = 0;
    for (int i = 0; i < numDataBlocks; i++) {
      sb.append(newline);
      boolean first = true;
      int advancedBy = 0;
      for (SequenceI seq : s) {
        int seqFrom = from;
        String seqId = String.format("#%-" + maxIdLength + "s",
                seq.getName());

        /*
         * output next line for this sequence
         */
        sb.append(seqId);
        int lastPos = seqFrom + perLine; // exclusive
        for (int j = 0; j < chunksPerLine; j++) {
          char[] subSequence = seq.getSequence(seqFrom,
                  Math.min(lastPos, seqFrom + spaceEvery));
          if (subSequence.length > 0) {
            sb.append(SPACE).append(subSequence);
          }
          seqFrom += subSequence.length;
          if (first) {
            // all sequences should be the same length in MEGA
            advancedBy += subSequence.length;
          }
        }
        // write last position as a comment
        if (writePositionNumbers) {
          sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
                  .append(COMMENT_END);
        }
        sb.append(newline);
        first = false;
      }
      from += advancedBy;
    }

    return sb.toString();
  }

  /**
   * Outputs to string the MEGA header and any other known and relevant
   * alignment properties
   *
   * @param al
   *          the alignment whose properties are output
   * @return the header lines (#MEGA, Title, Description and Format)
   */
  protected String printHeaders(AlignmentI al) {
    StringBuilder sb = new StringBuilder(128);
    sb.append(MEGA_ID).append(newline);
    String propertyValue = (String) al.getProperty(PROP_TITLE);
    if (propertyValue != null) {
      sb.append(BANG).append(TITLE).append(SPACE).append(propertyValue)
              .append(SEMICOLON).append(newline);
    }
    propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
    if (propertyValue != null) {
      sb.append(BANG).append(DESCRIPTION).append(newline)
              .append(propertyValue).append(SEMICOLON).append(newline);
    }

    /*
     * !Format DataType CodeTable
     */
    sb.append(BANG).append(FORMAT).append(newline);
    String dataType = (String) al.getProperty(PROP_DATATYPE);
    if (dataType == null) {
      dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
    }
    sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
    String codeTable = (String) al.getProperty(PROP_CODETABLE);
    sb.append(SPACE).append(CODETABLE).append(EQUALS)
            .append(codeTable == null ? "Standard" : codeTable)
            .append(newline);

    /*
     * !Format NSeqs NSites (the length of sequences - they should all be the
     * same - including gaps)
     */
    sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
    sb.append(SPACE).append(N_SITES).append(EQUALS)
            .append(String.valueOf(al.getWidth()));
    sb.append(newline);

    /*
     * !Format Indel Identical Missing
     */
    sb.append(INDENT);
    sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
    String identity = (String) al.getProperty(PROP_IDENTITY);
    if (identity != null) {
      sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
    }
    String missing = (String) al.getProperty(PROP_MISSING);
    if (missing != null) {
      sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
    }
    sb.append(SEMICOLON).append(newline);

    return sb.toString();
  }

  /**
   * Get the longest sequence id (to allow aligned printout).
   *
   * @param s
   * @return the length of the longest sequence name, or 0 if none
   */
  protected static int getMaxIdLength(SequenceI[] s) {
    // TODO pull up for reuse
    int maxLength = 0;
    for (SequenceI seq : s) {
      int len = seq.getName().length();
      if (len > maxLength) {
        maxLength = len;
      }
    }
    return maxLength;
  }

  /**
   * Get the longest sequence length
   *
   * @param s
   * @return the length of the longest sequence, or 0 if none
   */
  protected static int getMaxSequenceLength(SequenceI[] s) {
    // TODO pull up for reuse
    int maxLength = 0;
    for (SequenceI seq : s) {
      int len = seq.getLength();
      if (len > maxLength) {
        maxLength = len;
      }
    }
    return maxLength;
  }

  /**
   * Print to string in noninterleaved format - all of each sequence in turn,
   * one line per 'positions per line' characters (the default line length is
   * used if this has not been set). Lines are written without spaces between
   * chunks.
   *
   * @param s
   *          the sequences to output
   * @return the formatted sequence data
   */
  protected String printNonInterleaved(SequenceI[] s) {
    // guard against a zero line length, which would never advance
    final int perLine = positionsPerLine > 0 ? positionsPerLine
            : DEFAULT_LINE_LENGTH;

    int maxSequenceLength = getMaxSequenceLength(s);
    // approx
    int numLines = maxSequenceLength / perLine + 2 + s.length;

    /*
     * Roughly size a buffer to hold the whole output
     */
    StringBuilder sb = new StringBuilder(numLines * perLine);

    for (SequenceI seq : s) {
      sb.append(newline);
      sb.append(HASHSIGN + seq.getName()).append(newline);
      int startPos = 0;
      while (startPos < seq.getLength()) {
        /*
         * print next line for this sequence
         */
        char[] line = seq.getSequence(startPos, startPos + perLine);
        if (line.length == 0) {
          // no progress possible - avoid looping forever
          break;
        }
        sb.append(line);
        startPos += line.length;
        sb.append(newline);
      }
    }

    return sb.toString();
  }

  /**
   * Flag this file as interleaved or not, based on data format. Throws an
   * exception if has previously been determined to be otherwise.
   *
   * @param isIt
   *          true if the data line is in interleaved format
   * @param dataLine
   *          the line being parsed (used in the error message)
   * @throws FileFormatException
   *           if the flag contradicts what was previously determined
   */
  protected void assertInterleaved(boolean isIt, String dataLine)
          throws FileFormatException {
    if (this.interleaved != null && isIt != this.interleaved.booleanValue()) {
      throw new FileFormatException(
              "Parse error: mix of interleaved and noninterleaved detected, at line: "
                      + dataLine);
    }
    this.interleaved = Boolean.valueOf(isIt);
    setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
  }

  public boolean isInterleaved() {
    return this.interleaved == null ? false : this.interleaved
            .booleanValue();
  }

  /**
   * Adds saved parsed values either as alignment properties, or (in some cases)
   * as specific member fields of the alignment
   */
  @Override
  public void addProperties(AlignmentI al) {
    super.addProperties(al);
    al.setGapCharacter(gapCharacter);

    /*
     * warn if e.g. DataType=DNA but data is protein (or vice versa)
     */
    if (this.nucleotide != null
            && this.nucleotide.booleanValue() != al.isNucleotide()) {
      System.err.println("Warning: " + this.title + " declared "
              + (nucleotide ? "" : "not ") + "nucleotide but it is"
              + (nucleotide ? " not" : ""));
    }
  }

  /**
   * Print the given alignment in MEGA format. If the alignment was created by
   * parsing a MEGA file, it should have properties set (e.g. Title) which can
   * influence the output.
   */
  @Override
  public String print(AlignmentI al) {
    this.nucleotide = al.isNucleotide();

    /*
     * use the saved line length if it is a valid positive number, else the
     * default
     */
    int lineLength = DEFAULT_LINE_LENGTH;
    String lineLengthProperty = (String) al.getProperty(PROP_LINELENGTH);
    if (lineLengthProperty != null) {
      try {
        lineLength = Integer.parseInt(lineLengthProperty.trim());
      } catch (NumberFormatException e) {
        System.err.println("Warning: invalid " + PROP_LINELENGTH + " '"
                + lineLengthProperty + "', using default");
      }
    }
    if (lineLength <= 0) {
      lineLength = DEFAULT_LINE_LENGTH;
    }

    /*
     * round down to a multiple of 3 positions per line for nucleotide (but
     * never to zero)
     */
    if (nucleotide) {
      lineLength = Math.max(3, lineLength - (lineLength % 3));
    }
    this.positionsPerLine = lineLength;

    String interleave = (String) al.getProperty(PROP_INTERLEAVED);
    if (interleave != null) {
      this.interleaved = Boolean.valueOf(interleave);
    }

    String headers = printHeaders(al);
    return headers + print(al.getSequencesArray());
  }

  /**
   * Returns the number of sequence positions output per line
   *
   * @return
   */
  public int getPositionsPerLine() {
    return positionsPerLine;
  }

  /**
   * Sets the number of sequence positions output per line. Note these will be
   * formatted in blocks of 3 (nucleotide) or 10 (peptide).
   *
   * @param p
   */
  public void setPositionsPerLine(int p) {
    this.positionsPerLine = p;
  }
}