2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
3 * Copyright (C) 2014 The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 * Jalview is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
17 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 import jalview.datamodel.AlignmentI;
22 import jalview.datamodel.Sequence;
23 import jalview.datamodel.SequenceI;
25 import java.io.IOException;
26 import java.util.LinkedHashMap;
28 import java.util.Map.Entry;
32 * A parser for input or output of MEGA format files. <br>
34 * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
35 * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
36 * Evolution 30: 2725-2729. <br>
39 * MEGA file format is supported as described in
40 * http://www.megasoftware.net/manual.pdf <br>
43 * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
44 * <li>to be completed</li>
47 * @see http://www.megasoftware.net/
49 public class MegaFile extends AlignFile
51 private static final int DEFAULT_LINE_LENGTH = 60;
53 private static final String INDENT = " ";
55 private static final String N_SITES = "NSites";
57 private static final String N_SEQS = "NSeqs";
59 private static final String MISSING = "Missing";
61 private static final String IDENTICAL = "Identical";
63 private static final String INDEL = "Indel";
65 private static final String CODETABLE = "CodeTable";
67 private static final String PROTEIN = "Protein";
69 private static final String NUCLEOTIDE = "Nucleotide";
71 private static final String DATATYPE = "DataType";
73 private static final char COMMENT_START = '[';
75 private static final char COMMENT_END = ']';
77 private static final String HASHSIGN = "#";
79 private static final String SEMICOLON = ";";
81 private static final String BANG = "!";
83 private static final String EQUALS = "=";
85 private static final String MEGA_ID = HASHSIGN + "MEGA";
87 private static final String TITLE = "Title";
89 private static final String FORMAT = "Format";
91 private static final String DESCRIPTION = "Description";
93 private static final String GENE = "Gene";
95 private static final String DOMAIN = "Domain";
/* names of properties to save to the alignment (may affect eventual output format) */
101 static final String PROP_TITLE = "MEGA_TITLE";
103 static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";
105 static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";
107 static final String PROP_CODETABLE = "MEGA_CODETABLE";
109 static final String PROP_IDENTITY = "MEGA_IDENTITY";
111 static final String PROP_MISSING = "MEGA_MISSING";
113 static final String PROP_DATATYPE = "MEGA_DATATYPE";
115 // number of bases per line of file (value is inferred)
116 static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
// TODO: need a controlled name for Gene as a feature if we want to be able to
// output the MEGA file with !Gene headers.
// Open question: how should such features be handled if the sequences get realigned?
122 // initial size for sequence data buffer
123 private static final int SEQBUFFERSIZE = 256;
125 private static final String SPACE = " ";
128 * number of sequence positions output per line
130 private int positionsPerLine;
132 private String title;
134 // gap character may be explicitly declared, if not we infer it
135 private Character gapCharacter;
137 // identity character if declared
138 private char identityCharacter = 0;
140 // this can be True, False or null (meaning not asserted in file)
141 private Boolean nucleotide;
143 // set once we have seen one block of interleaved data
144 private boolean firstDataBlockRead = false;
146 // this can be True, False or null (meaning we don't know yet)
147 private Boolean interleaved;
149 // write end of line positions as a comment
150 private boolean writePositionNumbers = true;
// Constructor taking an input file reference and a type string; declared to
// throw IOException.
// NOTE(review): the constructor body is not visible in this view - presumably
// it delegates to the AlignFile superclass; confirm against the full file.
156 public MegaFile(String inFile, String type) throws IOException
// Constructor taking an already-prepared FileParse source; declared to throw
// IOException.
// NOTE(review): the constructor body is not visible in this view - presumably
// it delegates to the AlignFile superclass; confirm against the full file.
161 public MegaFile(FileParse source) throws IOException
167 * Parse the input stream.
170 public void parse() throws IOException
173 * Read and process MEGA and Title/Format/Description headers if present.
174 * Returns the first data line following the headers.
176 String dataLine = parseHeaderLines();
179 * Temporary store of {sequenceId, positionData} while parsing interleaved
180 * sequences; sequences are maintained in the order in which they are added
181 * i.e. read in the file
183 Map<String, StringBuilder> seqData = new LinkedHashMap<String, StringBuilder>();
186 * The id of the sequence being read (for non-interleaved)
188 String currentId = "";
190 while (dataLine != null)
192 dataLine = dataLine.trim();
193 if (dataLine.length() > 0)
195 if (dataLine.startsWith(BANG + GENE))
199 else if (dataLine.startsWith(BANG + DOMAIN))
201 parseDomain(dataLine);
205 currentId = parseDataLine(dataLine, seqData, currentId);
208 else if (!seqData.isEmpty())
211 * Blank line after processing some data...
213 this.firstDataBlockRead = true;
215 dataLine = nextNonCommentLine();
218 // remember the (longest) line length read in, so we can output the same
219 setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
221 setSequences(seqData);
225 * Parse a !Gene command line
229 protected void parseGene(String dataLine)
234 * Parse a !Domain command line
238 private void parseDomain(String dataLine)
243 * Returns the next line that is not a comment, or null at end of file.
244 * Comments in MEGA are within [ ] brackets, and may be nested.
247 * @throws IOException
249 protected String nextNonCommentLine() throws IOException
251 return nextNonCommentLine(0);
// Reads forward through the input, skipping comment text, and returns the
// first line content found once no comment is open. 'depth' is the number of
// '[' comments still open when this call starts.
// NOTE(review): the statement that reads 'data' from the input, and the
// null / depth guards around the warning and the recursive call, are not
// visible in this view.
255 * Returns the next line that is not a comment, or null at end of file.
256 * Comments in MEGA are within [ ] brackets, and may be nested.
259 * current depth of nesting of comments while parsing
261 * @throws IOException
263 protected String nextNonCommentLine(final int depth) throws IOException
// warning only (no exception): the message reports a comment left open
271 System.err.println("Warning: unterminated comment in data file");
275 int leftBracket = data.indexOf(COMMENT_START);
278 * reject unnested comment following data on the same line
// the test is leftBracket > 0, so a '[' in the very first column is accepted;
// only a '[' preceded by other characters is rejected, and only when no
// comment is already open (depth == 0)
280 if (depth == 0 && leftBracket > 0)
282 throw new FileFormatException(
283 "Can't parse comment following data at " + data);
287 * If we are in a (possibly nested) comment after parsing this line, keep
288 * reading recursively until the comment has unwound
290 int newDepth = commentDepth(data, depth);
// recursion: each further line consumed inside an open comment adds one
// stack frame
293 return nextNonCommentLine(newDepth);
298 * not in a comment by end of this line; return what is left (or the next
299 * line if that is empty)
301 String nonCommentPart = getNonCommentContent(data, depth);
// NOTE(review): with the alternative below commented out, whatever
// getNonCommentContent returns is passed back as-is; if that can be an empty
// string, the "(or the next line if that is empty)" wording above no longer
// holds - confirm which behaviour is wanted
302 // if (nonCommentPart.length() > 0)
304 return nonCommentPart;
306 // return nextNonCommentLine(0);
// Builds and returns a string derived from 'data' by visiting it one
// character at a time; 'depth' is the comment nesting level carried over from
// preceding lines.
// NOTE(review): the per-character handling inside the loop is not visible in
// this view, so exactly how '[' and ']' are treated, and where (if anywhere)
// FileFormatException is thrown, cannot be confirmed here.
311 * Returns what is left of the input data after removing any comments, whether
312 * 'in progress' from preceding lines, or embedded in the current line
317 * nested depth of comments pending termination
319 * @throws FileFormatException
321 protected static String getNonCommentContent(String data, int depth)
322 throws FileFormatException
324 int len = data.length();
// output buffer is pre-sized to the length of the input line
325 StringBuilder result = new StringBuilder(len);
326 for (int i = 0; i < len; i++)
328 char c = data.charAt(i);
353 return result.toString();
357 * Calculates new depth of comment after parsing an input line i.e. the excess
358 * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
359 * treated as comment delimiters).
364 * current comment nested depth before parsing the line
365 * @return new depth after parsing the line
367 protected static int commentDepth(CharSequence data, int depth)
369 int newDepth = depth;
370 int len = data.length();
371 for (int i = 0; i < len; i++)
373 char c = data.charAt(i);
374 if (c == COMMENT_START)
378 else if (c == COMMENT_END && newDepth > 0)
387 * Convert the parsed sequence strings to objects and store them in the model.
391 protected void setSequences(Map<String, StringBuilder> seqData)
393 Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
395 for (Entry<String, StringBuilder> dataset : datasets)
397 String sequenceId = dataset.getKey();
398 StringBuilder characters = dataset.getValue();
399 SequenceI s = new Sequence(sequenceId, new String(characters));
400 this.seqs.addElement(s);
405 * Process one line of sequence data. If it has no sequence identifier, append
406 * to the current id's sequence. Else parse out the sequence id and append the
407 * data (if any) to that id's sequence. Returns the sequence id (implicit or
408 * explicit) for this line.
414 * @throws IOException
416 protected String parseDataLine(String dataLine,
417 Map<String, StringBuilder> seqData, String currentId)
420 String seqId = getSequenceId(dataLine);
424 * Just character data
426 parseNoninterleavedDataLine(dataLine, seqData, currentId);
429 else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
432 * Sequence id only - header line for noninterleaved data
439 * Sequence id followed by data
441 parseInterleavedDataLine(dataLine, seqData, seqId);
447 * Add a line of sequence data to the buffer for the given sequence id. Start
448 * a new one if we haven't seen it before.
453 * @throws IOException
455 protected void parseNoninterleavedDataLine(String dataLine,
456 Map<String, StringBuilder> seqData, String currentId)
459 if (currentId == null)
462 * Oops. Data but no sequence id context.
464 throw new IOException("No sequence id context at: " + dataLine);
467 assertInterleaved(false, dataLine);
469 StringBuilder sb = getSequenceDataBuffer(seqData, currentId);
471 dataLine = reformatSequenceData(dataLine, sb.length(), seqData);
474 setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
478 * Get the sequence data for this sequence id, starting a new one if
485 protected StringBuilder getSequenceDataBuffer(
486 Map<String, StringBuilder> seqData, String currentId)
488 StringBuilder sb = seqData.get(currentId);
491 // first data met for this sequence id, start a new buffer
492 sb = new StringBuilder(SEQBUFFERSIZE);
493 seqData.put(currentId, sb);
499 * Parse one line of interleaved data e.g.
502 * #TheSeqId CGATCGCATGCA
508 * @throws IOException
510 protected void parseInterleavedDataLine(String dataLine,
511 Map<String, StringBuilder> seqData, String seqId)
515 * New sequence found in second or later data block - error.
517 if (this.firstDataBlockRead && !seqData.containsKey(seqId))
519 throw new IOException(
520 "Parse error: misplaced new sequence starting at " + dataLine);
523 StringBuilder sb = getSequenceDataBuffer(seqData, seqId);
524 String data = dataLine.substring(seqId.length() + 1).trim();
527 * Do nothing if this line is _only_ a sequence id with no data following.
529 * Remove any internal spaces
531 if (data != null && data.length() > 0)
533 data = reformatSequenceData(data, sb.length(), seqData);
535 setPositionsPerLine(Math.max(positionsPerLine, data.length()));
536 assertInterleaved(true, dataLine);
541 * Reformat input sequence data by removing any internal formatting spaces,
542 * and converting any 'identity' characters to the corresponding position in
543 * the first sequence.
547 * the sequence position (base 0) of the start of the data
551 protected String reformatSequenceData(String data, int startPos, Map<String, StringBuilder> seqData)
553 String formatted = data.replace(SPACE, "");
554 if (formatted.indexOf(identityCharacter) > -1)
557 * sequence contains '.' or other identity symbol; replace these with the
558 * same position from the first (reference) sequence
560 StringBuilder referenceSequence = seqData.values().iterator().next();
561 StringBuilder sb = new StringBuilder(formatted.length());
562 for (int i = 0 ; i < formatted.length() ; i++) {
563 char nextChar = formatted.charAt(i);
564 if (nextChar != identityCharacter) {
567 sb.append(referenceSequence.charAt(startPos + i));
570 formatted = sb.toString();
576 * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
577 * identifier. Else returns null.
582 public static String getSequenceId(String dataLine)
584 // TODO refactor to a StringUtils type class
585 if (dataLine != null)
587 if (dataLine.startsWith(HASHSIGN))
589 int spacePos = dataLine.indexOf(" ");
590 return (spacePos == -1 ? dataLine.substring(1) : dataLine
591 .substring(1, spacePos));
// Reads lines via nextNonCommentLine() until one is met that is not a header,
// dispatching Title, !Description and !Format lines to their handlers as it
// goes.
// NOTE(review): the statements inside several of the branches below (what
// happens for a blank line, for a #MEGA line and in the final else-if) and the
// method's return statement are not visible in this view.
598 * Read the #MEGA and Title/Format/Description header lines (if present).
600 * Save as alignment properties in case useful.
602 * @return the next non-blank line following the header lines.
603 * @throws IOException
605 protected String parseHeaderLines() throws IOException
607 String inputLine = null;
608 while ((inputLine = nextNonCommentLine()) != null)
610 inputLine = inputLine.trim();
// blank (after trimming) line
615 if (inputLine.length() == 0)
// the #MEGA marker is matched case-insensitively, and as a prefix only
620 if (inputLine.toUpperCase().startsWith(MEGA_ID))
// title lines: the value is kept in this.title and saved as a property
625 if (isTitle(inputLine))
627 this.title = getValue(inputLine);
628 setAlignmentProperty(PROP_TITLE, title);
// unlike the title test, !Description and !Format are matched case-sensitively
630 else if (inputLine.startsWith(BANG + DESCRIPTION))
632 parseDescription(inputLine);
635 else if (inputLine.startsWith(BANG + FORMAT))
637 parseFormat(inputLine);
// NOTE(review): this repeats the #MEGA test made a few lines above
639 else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
643 * Return the first 'data line' i.e. one that is not blank, #MEGA or
653 * Parse a !Format statement. This may be multiline, and is ended by a
657 * @throws IOException
659 protected void parseFormat(String inputLine) throws IOException
661 while (inputLine != null)
663 parseFormatLine(inputLine);
664 if (inputLine.endsWith(SEMICOLON))
668 inputLine = nextNonCommentLine();
673 * Parse one line of a !Format statement. This may contain one or more
674 * keyword=value pairs.
677 * @throws FileFormatException
679 protected void parseFormatLine(String inputLine)
680 throws FileFormatException
682 if (inputLine.startsWith(BANG + FORMAT))
684 inputLine = inputLine.substring((BANG + FORMAT).length());
686 if (inputLine.endsWith(SEMICOLON))
688 inputLine = inputLine.substring(0, inputLine.length() - 1);
690 if (inputLine.length() == 0)
694 String[] tokens = inputLine.trim().split("\\s"); // any whitespace
695 for (String token : tokens)
697 parseFormatKeyword(token);
702 * Parse a Keyword=Value token. Possible keywords are
704 * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
705 * <li>DataFormat= Interleaved, ?</li>
706 * <li>NSeqs= number of sequences (synonym NTaxa)</li>
707 * <li>NSites= number of bases / residues</li>
708 * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
709 * <li>Indel= gap character</li>
710 * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
711 * <li>Missing= missing data character</li>
712 * <li>CodeTable= Standard, other (MEGA supports various)</li>
716 * @throws FileFormatException
717 * if an unrecognised keyword or value is encountered
719 protected void parseFormatKeyword(String token)
720 throws FileFormatException
722 String msg = "Unrecognised Format command: " + token;
723 String[] bits = token.split(EQUALS);
724 if (bits.length != 2)
726 throw new FileFormatException(msg);
728 String keyword = bits[0];
729 String value = bits[1];
732 * Jalview will work out whether nucleotide or not anyway
734 if (keyword.equalsIgnoreCase(DATATYPE))
736 if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
737 || value.equalsIgnoreCase("Nucleotide"))
739 this.nucleotide = true;
740 // alignment computes whether or not it is nucleotide when created
742 else if (value.equalsIgnoreCase(PROTEIN))
744 this.nucleotide = false;
748 throw new FileFormatException(msg);
750 setAlignmentProperty(PROP_DATATYPE, value);
754 * accept non-Standard code table but save in case we want to disable
755 * 'translate as cDNA'
757 else if (keyword.equalsIgnoreCase(CODETABLE))
759 setAlignmentProperty(PROP_CODETABLE, value);
763 * save gap char to set later on alignment once created
765 else if (keyword.equalsIgnoreCase(INDEL))
767 this.gapCharacter = value.charAt(0);
770 else if (keyword.equalsIgnoreCase(IDENTICAL)
771 || keyword.equalsIgnoreCase("MatchChar"))
773 setAlignmentProperty(PROP_IDENTITY, value);
774 this.identityCharacter = value.charAt(0);
775 if (!".".equals(value))
777 System.err.println("Warning: " + token
778 + " not supported, Jalview uses '.' for identity");
782 else if (keyword.equalsIgnoreCase(MISSING))
784 setAlignmentProperty(PROP_MISSING, value);
785 System.err.println("Warning: " + token + " not supported");
788 else if (keyword.equalsIgnoreCase("Property"))
790 // TODO: figure out what to do with this
791 // can it appear more than once in a file?
792 setAlignmentProperty(PROP_MISSING, value);
795 else if (!keyword.equalsIgnoreCase(N_SEQS)
796 && !keyword.equalsIgnoreCase(N_SITES))
798 System.err.println("Warning: " + msg);
803 * Returns the trimmed data on the line following either whitespace or '=',
804 * with any trailing semi-colon removed<br>
807 * <li>Hello World</li>
808 * <li>!Hello: \tWorld;</li>
809 * <li>!Hello=World</li>
811 * should all return "World"
816 protected static String getValue(String inputLine)
818 if (inputLine == null)
823 String s = inputLine.replaceAll("\t", " ").trim();
826 * KEYWORD = VALUE should return VALUE
828 int equalsPos = s.indexOf("=");
831 value = s.substring(equalsPos + 1);
835 int spacePos = s.indexOf(' ');
836 value = spacePos == -1 ? "" : s.substring(spacePos + 1);
838 value = value.trim();
839 if (value.endsWith(SEMICOLON))
841 value = value.substring(0, value.length() - 1).trim();
847 * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
848 * sensitive). The latter is the official format, some older data file
849 * examples have it without the !.
854 protected static boolean isTitle(String inputLine)
856 if (inputLine == null)
860 String upper = inputLine.toUpperCase();
861 return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
862 + TITLE.toUpperCase()));
866 * Reads lines until terminated by semicolon, appending each to the
867 * Description property value.
869 * @throws IOException
871 protected void parseDescription(String firstDescriptionLine)
874 StringBuilder desc = new StringBuilder(256);
875 String line = getValue(firstDescriptionLine);
878 if (line.endsWith(SEMICOLON))
880 desc.append(line.substring(0, line.length() - 1));
883 else if (line.length() > 0)
885 desc.append(line).append(newline);
887 line = nextNonCommentLine();
889 setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
893 * Returns the alignment sequences in Mega format.
896 public String print()
898 return MEGA_ID + newline + print(getSeqsAsArray());
902 * Write out the alignment sequences in Mega format - interleaved unless
903 * explicitly noninterleaved.
905 protected String print(SequenceI[] s)
908 if (this.interleaved != null && !this.interleaved)
910 result = printNonInterleaved(s);
914 result = printInterleaved(s);
// Formats the sequences as interleaved MEGA data: for each data block, one
// line per sequence holding the '#'-prefixed id followed by space-separated
// chunks of residues.
// NOTE(review): the declarations/updates of 'from', 'seqFrom' and
// 'advancedBy', and the statements that append the id and the line ends, are
// not visible in this view.
920 * Print to string in Interleaved format - blocks of next N characters of each
925 protected String printInterleaved(SequenceI[] s)
927 int maxIdLength = getMaxIdLength(s);
928 int maxSequenceLength = getMaxSequenceLength(s);
// NOTE(review): both divisions below are integer divisions by
// positionsPerLine; no guard against a zero value is visible here
929 int numLines = maxSequenceLength / positionsPerLine + 3; // approx
931 int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
// chunk size: 3 when flagged as nucleotide, otherwise (false or unknown) 10
932 int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
// rounded up, so a final partial chunk on a line is still output
933 int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
936 * Roughly size a buffer to hold the whole output
938 StringBuilder sb = new StringBuilder(numLines
939 * (maxIdLength + positionsPerLine + chunksPerLine + 10));
942 * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
945 for (int i = 0; i < numDataBlocks; i++)
948 boolean first = true;
950 for (SequenceI seq : s)
// id is left-justified and padded to the longest id so the data lines up
953 String seqId = String.format("#%-" + maxIdLength + "s",
957 * output next line for this sequence
960 int lastPos = seqFrom + positionsPerLine; // exclusive
961 for (int j = 0; j < chunksPerLine; j++)
// each chunk is at most spaceEvery long and never runs past lastPos
963 char[] subSequence = seq.getSequence(seqFrom,
964 Math.min(lastPos, seqFrom + spaceEvery));
965 if (subSequence.length > 0)
967 sb.append(SPACE).append(subSequence);
969 seqFrom += subSequence.length;
972 // all sequences should be the same length in MEGA
973 advancedBy += subSequence.length;
976 // write last position as a comment
977 if (writePositionNumbers)
// written in [ ] i.e. as a MEGA comment, so it is skipped when read back
979 sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
980 .append(COMMENT_END);
988 return new String(sb);
// Builds the header section of a MEGA file from the alignment: the #MEGA
// line, optional !Title and !Description, then a !Format statement made up of
// DataType/CodeTable, NSeqs/NSites and Indel/Identical/Missing.
// NOTE(review): the ends of several append chains (and therefore the exact
// line terminators written) are not visible in this view.
992 * Outputs to string the MEGA header and any other known and relevant
993 * alignment properties
997 protected String printHeaders(AlignmentI al)
999 StringBuilder sb = new StringBuilder(128);
1000 sb.append(MEGA_ID).append(newline);
// title is only written if the alignment carries the property
1001 String propertyValue = (String) al.getProperty(PROP_TITLE);
1002 if (propertyValue != null)
1004 sb.append(BANG).append(TITLE).append(SPACE)
1005 .append(propertyValue)
// description: keyword on its own line, then the text, then a semicolon
1009 propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
1010 if (propertyValue != null)
1012 sb.append(BANG).append(DESCRIPTION).append(newline)
1013 .append(propertyValue).append(SEMICOLON)
1018 * !Format DataType CodeTable
1020 sb.append(BANG).append(FORMAT).append(newline);
// a stored DataType wins; otherwise it is derived from the alignment itself
1021 String dataType = (String) al.getProperty(PROP_DATATYPE);
1022 if (dataType == null)
1024 dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
1026 sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
1027 String codeTable = (String) al.getProperty(PROP_CODETABLE);
// "Standard" is written when no code table property is stored
1028 sb.append(SPACE).append(CODETABLE).append(EQUALS)
1029 .append(codeTable == null ? "Standard" : codeTable)
1033 * !Format NSeqs NSites
1034 * NSites the length of any sequence (they should all be the same), excluding
1037 sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
// NSites is taken from the first sequence only, as end - start + 1
// NOTE(review): no guard is visible for an alignment with no sequences
1038 SequenceI seq = al.getSequenceAt(0);
1039 sb.append(SPACE).append(N_SITES).append(EQUALS)
1040 .append(seq.getEnd() - seq.getStart() + 1);
1044 * !Format Indel Identical Missing
// Indel is always written (the alignment's gap character); Identical and
// Missing only when the corresponding property is stored
1047 sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
1048 String identity = (String) al.getProperty(PROP_IDENTITY);
1049 if (identity != null)
1051 sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
1053 String missing = (String) al.getProperty(PROP_MISSING);
1054 if (missing != null)
1056 sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
// the semicolon closes the !Format statement
1058 sb.append(SEMICOLON).append(newline);
1060 return sb.toString();
1064 * Get the longest sequence id (to allow aligned printout).
1069 protected static int getMaxIdLength(SequenceI[] s)
1071 // TODO pull up for reuse
1073 for (SequenceI seq : s)
1075 int len = seq.getName().length();
1076 if (len > maxLength)
1085 * Get the longest sequence length
1090 protected static int getMaxSequenceLength(SequenceI[] s)
1092 // TODO pull up for reuse
1094 for (SequenceI seq : s)
1096 int len = seq.getLength();
1097 if (len > maxLength)
// Formats the sequences as noninterleaved MEGA data: for each sequence a
// '#'-prefixed id line followed by that sequence's residues, one output line
// at a time.
// NOTE(review): the declaration of 'startPos', the handling of 'firstChunk'
// and the statements that end each line are not visible in this view.
// NOTE(review): the "blocks of 50 characters" wording below is not reflected
// by any constant in the visible code; line length comes from positionsPerLine.
1106 * Print to string in noninterleaved format - all of each sequence in turn, in
1107 * blocks of 50 characters.
1112 protected String printNonInterleaved(SequenceI[] s)
1114 int maxSequenceLength = getMaxSequenceLength(s);
// NOTE(review): integer division by positionsPerLine; no guard against a zero
// value is visible here
1116 int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1119 * Roughly size a buffer to hold the whole output
1121 StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
// chunk size: 3 when flagged as nucleotide, otherwise (false or unknown) 10
1123 int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
// NOTE(review): rounds down (printInterleaved rounds up); this is 0 when
// positionsPerLine < spaceEvery, in which case the chunk loop below does not
// execute - confirm that startPos still advances in that case
1124 int chunksPerLine = positionsPerLine / spaceEvery;
1125 for (SequenceI seq : s)
1128 sb.append(HASHSIGN + seq.getName()).append(newline);
1130 while (startPos < seq.getLength())
1132 boolean firstChunk = true;
1134 * print next line for this sequence
1136 int lastPos = startPos + positionsPerLine; // exclusive
1137 for (int j = 0; j < chunksPerLine; j++)
// NOTE(review): the chunk end is startPos + positionsPerLine, not
// startPos + spaceEvery as in printInterleaved, so the first chunk can take
// the whole line and spaceEvery does not size the chunks - confirm intent
1139 char[] subSequence = seq.getSequence(startPos,
1140 Math.min(lastPos, startPos + positionsPerLine));
1141 if (subSequence.length > 0)
1147 sb.append(subSequence);
1150 startPos += subSequence.length;
1156 return new String(sb);
1160 * Flag this file as interleaved or not, based on data format. Throws an
1161 * exception if has previously been determined to be otherwise.
1165 * @throws IOException
1167 protected void assertInterleaved(boolean isIt, String dataLine)
1168 throws FileFormatException
1170 if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1172 throw new FileFormatException(
1173 "Parse error: mix of interleaved and noninterleaved detected, at line: "
1176 this.interleaved = new Boolean(isIt);
1177 setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
1180 public boolean isInterleaved()
1182 return this.interleaved == null ? false : this.interleaved
1187 * Adds saved parsed values either as alignment properties, or (in some cases)
1188 * as specific member fields of the alignment
1191 public void addProperties(AlignmentI al)
1193 super.addProperties(al);
1194 if (this.gapCharacter != null)
1196 al.setGapCharacter(gapCharacter);
1200 * warn if e.g. DataType=DNA but data is protein (or vice versa)
1202 if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
1203 System.err.println("Warning: " + this.title + " declared "
1204 + (nucleotide ? "" : " not ") + "nucleotide but it is"
1205 + (nucleotide ? " not" : ""));
1210 * Print the given alignment in MEGA format. If the alignment was created by
1211 * parsing a MEGA file, it should have properties set (e.g. Title) which can
1212 * influence the output.
1215 public String print(AlignmentI al)
1217 this.nucleotide = al.isNucleotide();
1218 String lineLength = (String) al.getProperty(PROP_LINELENGTH);
1219 this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
1220 .parseInt(lineLength);
1221 return printHeaders(al) + print(al.getSequencesArray());
1225 * Returns the number of sequence positions output per line
1229 public int getPositionsPerLine()
1231 return positionsPerLine;
1235 * Sets the number of sequence positions output per line. Note these will be
1236 * formatted in blocks of 3 (nucleotide) or 10 (peptide).
1240 public void setPositionsPerLine(int p)
1242 this.positionsPerLine = p;