2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
3 * Copyright (C) 2014 The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 * Jalview is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
17 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 import jalview.datamodel.AlignmentI;
22 import jalview.datamodel.Sequence;
23 import jalview.datamodel.SequenceFeature;
24 import jalview.datamodel.SequenceI;
26 import java.io.IOException;
27 import java.util.ArrayList;
28 import java.util.HashMap;
29 import java.util.Iterator;
30 import java.util.LinkedHashMap;
31 import java.util.List;
33 import java.util.Map.Entry;
37 * A parser for input or output of MEGA format files. <br>
39 * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
40 * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
41 * Evolution 30: 2725-2729. <br>
44 * MEGA file format is supported as described in
45 * http://www.megasoftware.net/manual.pdf <br>
48 * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
49 * <li>to be completed</li>
52 * @see http://www.megasoftware.net/
54 public class MegaFile extends AlignFile
56 private static final int DEFAULT_LINE_LENGTH = 60;
58 private static final String INDENT = " ";
60 private static final String N_SITES = "NSites";
62 private static final String N_SEQS = "NSeqs";
64 private static final String MISSING = "Missing";
66 private static final String IDENTICAL = "Identical";
68 private static final String INDEL = "Indel";
70 private static final String CODETABLE = "CodeTable";
72 private static final String PROTEIN = "Protein";
74 private static final String NUCLEOTIDE = "Nucleotide";
76 private static final String DATATYPE = "DataType";
78 private static final char COMMENT_START = '[';
80 private static final char COMMENT_END = ']';
82 private static final String HASHSIGN = "#";
84 private static final String SEMICOLON = ";";
86 private static final String BANG = "!";
88 private static final String EQUALS = "=";
90 private static final String MEGA_ID = HASHSIGN + "MEGA";
92 private static final String TITLE = "Title";
94 private static final String FORMAT = "Format";
96 private static final String DESCRIPTION = "Description";
98 private static final String GENE = "Gene";
100 private static final String DOMAIN = "Domain";
103 * names of properties to save to the alignment (may affect eventual output
106 static final String PROP_TITLE = "MEGA_TITLE";
108 static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";
110 static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";
112 static final String PROP_CODETABLE = "MEGA_CODETABLE";
114 static final String PROP_IDENTITY = "MEGA_IDENTITY";
116 static final String PROP_MISSING = "MEGA_MISSING";
118 static final String PROP_DATATYPE = "MEGA_DATATYPE";
120 // number of bases per line of file (value is inferred)
121 static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
// TODO: need a controlled name for Gene as a feature if we want to be able to
// output the MEGA file with !Gene headers
// TODO: decide how Gene/Domain features should be handled if the sequences
// get realigned
127 // initial size for sequence data buffer
128 private static final int SEQBUFFERSIZE = 256;
130 private static final String SPACE = " ";
133 * number of sequence positions output per line
135 private int positionsPerLine;
137 private String title;
139 // gap character may be explicitly declared, default is -
140 private char gapCharacter = '-';
142 // identity character if declared
143 private char identityCharacter = 0;
145 // this can be True, False or null (meaning not asserted in file)
146 private Boolean nucleotide;
148 // set once we have seen one block of interleaved data
149 private boolean firstDataBlockRead = false;
151 // this can be True, False or null (meaning we don't know yet)
152 private Boolean interleaved;
154 // write end of line positions as a comment
155 private boolean writePositionNumbers = true;
157 // id of sequence being processed
158 private String currentSequenceId;
161 * Temporary store of {sequenceId, positionData} while parsing interleaved
162 * sequences; sequences are maintained in the order in which they are added
163 * i.e. read in the file
165 Map<String, StringBuilder> seqData;
167 // number of residues read (so far) per sequence
168 Map<String, Integer> residuesRead;
170 // start residue (base 1) per sequence of current feature
171 Map<String, Integer> featureStart;
173 // feature (Gene/Domain) if any we are parsing
174 private String currentFeature;
176 // feature type (Gene/Domain) if any we are parsing
177 private String currentFeatureType;
179 // map of SequenceFeature's by sequence id
180 Map<String, List<SequenceFeature>> sequenceFeatures;
186 public MegaFile(String inFile, String type) throws IOException
191 public MegaFile(FileParse source) throws IOException
197 * Parse the input stream.
200 public void parse() throws IOException
203 sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
204 featureStart = new HashMap<String, Integer>();
205 residuesRead = new HashMap<String, Integer>();
208 * Read and process MEGA and Title/Format/Description headers if present.
209 * Returns the first data line following the headers.
211 String dataLine = parseHeaderLines();
214 * order-preserving map to hold sequences by id as they are built up during
217 seqData = new LinkedHashMap<String, StringBuilder>();
220 * The id of the sequence being read (for non-interleaved)
222 currentSequenceId = "";
224 while (dataLine != null)
226 dataLine = dataLine.trim();
227 if (dataLine.length() > 0)
229 if (dataLine.startsWith(BANG + GENE))
231 parseFeature(GENE, dataLine);
233 else if (dataLine.startsWith(BANG + DOMAIN))
235 parseFeature(DOMAIN, dataLine);
239 currentSequenceId = parseDataLine(dataLine);
242 else if (!seqData.isEmpty())
245 * Blank line after processing some data...
249 dataLine = nextNonCommentLine();
252 // remember the (longest) line length read in, so we can output the same
253 setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
259 * Post-processing after reading one block of interleaved data
261 protected void endOfDataBlock()
263 this.firstDataBlockRead = true;
265 // (initialise and) populate arrays of sequence length so far (excluding
267 // On change or end of a denoted Gene or Domain, add sequence features for
272 * Parse a !Gene or !Domain command line
277 protected void parseFeature(String featureType, String dataLine)
279 String featureName = getValue(dataLine);
280 // TODO parse !Gene=xyx Property=end; ???
281 if (this.currentFeature != null)
283 endSequenceFeature();
285 startSequenceFeature(featureName, featureType);
289 * Start processing a new feature
293 protected void startSequenceFeature(String featureName, String featureType)
295 currentFeature = featureName;
296 currentFeatureType = featureType;
299 * If the feature name precedes all sequences, we will know in
300 * endSequenceFeature that it starts with residue 1; otherwise note now
301 * where it starts in each sequence
303 if (!residuesRead.isEmpty())
305 for (Entry<String, Integer> entry : residuesRead.entrySet())
307 String seqId = entry.getKey();
308 Integer nextResidue = entry.getValue() + 1;
309 featureStart.put(seqId, nextResidue);
315 * Add a SequenceFeature for the current feature to each sequence, using the
316 * current feature start/end values per sequence
318 protected void endSequenceFeature()
320 Iterator<String> seqids = this.seqData.keySet().iterator();
321 while (seqids.hasNext())
323 String seqid = seqids.next();
324 Integer startAt = featureStart.get(seqid);
325 int sfstart = startAt == null ? 1 : startAt.intValue();
326 int sfend = residuesRead.get(seqid);
327 if (sfend >= sfstart)
330 * don't add feature if entirely gapped in the sequence
332 SequenceFeature sf = new SequenceFeature(currentFeature,
333 currentFeatureType, sfstart, sfend, 0f, null);
334 sequenceFeatures.get(seqid).add(sf);
340 * Parse a !Domain command line
344 private void parseDomain(String dataLine)
349 * Returns the next line that is not a comment, or null at end of file.
350 * Comments in MEGA are within [ ] brackets, and may be nested.
353 * @throws IOException
355 protected String nextNonCommentLine() throws IOException
357 return nextNonCommentLine(0);
361 * Returns the next non-comment line (or part line), or null at end of file.
362 * Comments in MEGA are within [ ] brackets, and may be nested. They may occur
363 * anywhere within a line (for example at the end with position numbers); this
364 * method returns the line with any comments removed.
367 * current depth of nesting of comments while parsing
369 * @throws IOException
371 protected String nextNonCommentLine(final int depth) throws IOException
379 System.err.println("Warning: unterminated comment in data file");
385 * If we are in a (possibly nested) comment after parsing this line, keep
386 * reading recursively until the comment has unwound
388 int newDepth = commentDepth(data, depth);
391 return nextNonCommentLine(newDepth);
396 * not in a comment by end of this line; return what is left
398 String nonCommentPart = getNonCommentContent(data, depth);
399 return nonCommentPart;
404 * Returns what is left of the input data after removing any comments, whether
405 * 'in progress' from preceding lines, or embedded in the current line
410 * nested depth of comments pending termination
412 * @throws FileFormatException
414 protected static String getNonCommentContent(String data, int depth)
415 throws FileFormatException
417 int len = data.length();
418 StringBuilder result = new StringBuilder(len);
419 for (int i = 0; i < len; i++)
421 char c = data.charAt(i);
446 return result.toString();
450 * Calculates new depth of comment after parsing an input line i.e. the excess
451 * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
452 * treated as comment delimiters).
457 * current comment nested depth before parsing the line
458 * @return new depth after parsing the line
460 protected static int commentDepth(CharSequence data, int depth)
462 int newDepth = depth;
463 int len = data.length();
464 for (int i = 0; i < len; i++)
466 char c = data.charAt(i);
467 if (c == COMMENT_START)
471 else if (c == COMMENT_END && newDepth > 0)
480 * Convert the parsed sequence strings to objects and store them in the model.
482 protected void deriveSequences()
484 Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
486 for (Entry<String, StringBuilder> dataset : datasets)
488 String sequenceId = dataset.getKey();
489 StringBuilder characters = dataset.getValue();
490 SequenceI s = new Sequence(sequenceId, new String(characters));
491 this.seqs.addElement(s);
494 * and add any derived sequence features to the sequence
496 for (SequenceFeature sf : sequenceFeatures.get(sequenceId))
498 s.addSequenceFeature(sf);
504 * Process one line of sequence data. If it has no sequence identifier, append
505 * to the current id's sequence. Else parse out the sequence id and append the
506 * data (if any) to that id's sequence. Returns the sequence id (implicit or
507 * explicit) for this line.
511 * @throws IOException
513 protected String parseDataLine(String dataLine)
516 String seqId = getSequenceId(dataLine);
520 * Just character data
522 parseNoninterleavedDataLine(dataLine);
523 return currentSequenceId;
525 else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
528 * Sequence id only - header line for noninterleaved data
535 * Sequence id followed by data
537 parseInterleavedDataLine(dataLine, seqId);
543 * Add a line of sequence data to the buffer for the given sequence id. Start
544 * a new one if we haven't seen it before.
547 * @throws IOException
549 protected void parseNoninterleavedDataLine(String dataLine)
552 if (currentSequenceId == null)
555 * Oops. Data but no sequence id context.
557 throw new IOException("No sequence id context at: " + dataLine);
560 assertInterleaved(false, dataLine);
562 dataLine = addSequenceData(currentSequenceId, dataLine);
564 setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
568 * Get the sequence data for this sequence id, starting a new one if
574 protected StringBuilder getSequenceDataBuffer(String currentId)
576 StringBuilder sb = seqData.get(currentId);
579 // first data met for this sequence id, start a new buffer
580 sb = new StringBuilder(SEQBUFFERSIZE);
581 seqData.put(currentId, sb);
583 // and a placeholder for any SequenceFeature found
584 sequenceFeatures.put(currentId, new ArrayList<SequenceFeature>());
590 * Parse one line of interleaved data e.g.
593 * #TheSeqId CGATCGCATGCA
598 * @throws FileFormatException
600 protected void parseInterleavedDataLine(String dataLine, String seqId)
601 throws FileFormatException
604 * New sequence found in second or later data block - error.
606 if (this.firstDataBlockRead && !seqData.containsKey(seqId))
608 throw new FileFormatException(
609 "Parse error: misplaced new sequence starting at " + dataLine);
612 String data = dataLine.substring(seqId.length() + 1).trim();
615 * Do nothing if this line is _only_ a sequence id with no data following.
617 if (data != null && data.length() > 0)
619 data = addSequenceData(seqId, data);
620 setPositionsPerLine(Math.max(positionsPerLine, data.length()));
621 assertInterleaved(true, dataLine);
626 * Remove spaces, and replace identity symbol, before appending the sequence
627 * data to the buffer for the sequence id. Returns the reformatted added data.
628 * Also updates a count of residues read for the sequence.
634 protected String addSequenceData(String seqId, String data)
636 StringBuilder sb = getSequenceDataBuffer(seqId);
637 int len = sb.length();
638 String formatted = data.replace(SPACE, "");
641 * If sequence contains '.' or other identity symbol; replace these with the
642 * same position from the first (reference) sequence
645 StringBuilder referenceSequence = seqData.values().iterator().next();
646 StringBuilder sb1 = new StringBuilder(formatted.length());
647 for (int i = 0; i < formatted.length(); i++)
649 char nextChar = formatted.charAt(i);
650 if (nextChar != gapCharacter)
654 if (nextChar == identityCharacter
655 && len + i < referenceSequence.length())
657 sb1.append(referenceSequence.charAt(len + i));
661 sb1.append(nextChar);
664 formatted = sb1.toString();
670 * increment residue count for the sequence
674 Integer residueCount = residuesRead.get(seqId);
675 residuesRead.put(seqId, nonGapped
676 + (residueCount == null ? 0 : residueCount));
683 * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
684 * identifier. Else returns null.
689 public static String getSequenceId(String dataLine)
691 // TODO refactor to a StringUtils type class
692 if (dataLine != null)
694 if (dataLine.startsWith(HASHSIGN))
696 int spacePos = dataLine.indexOf(" ");
697 return (spacePos == -1 ? dataLine.substring(1) : dataLine
698 .substring(1, spacePos));
705 * Read the #MEGA and Title/Format/Description header lines (if present).
707 * Save as alignment properties in case useful.
709 * @return the next non-blank line following the header lines.
710 * @throws IOException
712 protected String parseHeaderLines() throws IOException
714 String inputLine = null;
715 while ((inputLine = nextNonCommentLine()) != null)
717 inputLine = inputLine.trim();
722 if (inputLine.length() == 0)
727 if (inputLine.toUpperCase().startsWith(MEGA_ID))
732 if (isTitle(inputLine))
734 this.title = getValue(inputLine);
735 setAlignmentProperty(PROP_TITLE, title);
737 else if (inputLine.startsWith(BANG + DESCRIPTION))
739 parseDescription(inputLine);
742 else if (inputLine.startsWith(BANG + FORMAT))
744 parseFormat(inputLine);
746 else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
750 * Return the first 'data line' i.e. one that is not blank, #MEGA or
760 * Parse a !Format statement. This may be multiline, and is ended by a
764 * @throws IOException
766 protected void parseFormat(String inputLine) throws IOException
768 while (inputLine != null)
770 parseFormatLine(inputLine);
771 if (inputLine.endsWith(SEMICOLON))
775 inputLine = nextNonCommentLine();
780 * Parse one line of a !Format statement. This may contain one or more
781 * keyword=value pairs.
784 * @throws FileFormatException
786 protected void parseFormatLine(String inputLine)
787 throws FileFormatException
789 if (inputLine.startsWith(BANG + FORMAT))
791 inputLine = inputLine.substring((BANG + FORMAT).length());
793 if (inputLine.endsWith(SEMICOLON))
795 inputLine = inputLine.substring(0, inputLine.length() - 1);
797 if (inputLine.length() == 0)
801 String[] tokens = inputLine.trim().split("\\s"); // any whitespace
802 for (String token : tokens)
804 parseFormatKeyword(token);
809 * Parse a Keyword=Value token. Possible keywords are
811 * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
812 * <li>DataFormat= Interleaved, ?</li>
813 * <li>NSeqs= number of sequences (synonym NTaxa)</li>
814 * <li>NSites= number of bases / residues</li>
815 * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
816 * <li>Indel= gap character</li>
817 * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
818 * <li>Missing= missing data character</li>
819 * <li>CodeTable= Standard, other (MEGA supports various)</li>
823 * @throws FileFormatException
824 * if an unrecognised keyword or value is encountered
826 protected void parseFormatKeyword(String token)
827 throws FileFormatException
829 String msg = "Unrecognised Format command: " + token;
830 String[] bits = token.split(EQUALS);
831 if (bits.length != 2)
833 throw new FileFormatException(msg);
835 String keyword = bits[0];
836 String value = bits[1];
839 * Jalview will work out whether nucleotide or not anyway
841 if (keyword.equalsIgnoreCase(DATATYPE))
843 if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
844 || value.equalsIgnoreCase("Nucleotide"))
846 this.nucleotide = true;
847 // alignment computes whether or not it is nucleotide when created
849 else if (value.equalsIgnoreCase(PROTEIN))
851 this.nucleotide = false;
855 throw new FileFormatException(msg);
857 setAlignmentProperty(PROP_DATATYPE, value);
861 * accept non-Standard code table but save in case we want to disable
862 * 'translate as cDNA'
864 else if (keyword.equalsIgnoreCase(CODETABLE))
866 setAlignmentProperty(PROP_CODETABLE, value);
870 * save gap char to set later on alignment once created
872 else if (keyword.equalsIgnoreCase(INDEL))
874 this.gapCharacter = value.charAt(0);
877 else if (keyword.equalsIgnoreCase(IDENTICAL)
878 || keyword.equalsIgnoreCase("MatchChar"))
880 setAlignmentProperty(PROP_IDENTITY, value);
881 this.identityCharacter = value.charAt(0);
882 if (!".".equals(value))
884 System.err.println("Warning: " + token
885 + " not supported, Jalview uses '.' for identity");
889 else if (keyword.equalsIgnoreCase(MISSING))
891 setAlignmentProperty(PROP_MISSING, value);
892 System.err.println("Warning: " + token + " not supported");
895 else if (keyword.equalsIgnoreCase("Property"))
897 // TODO: figure out what to do with this
898 // can it appear more than once in a file?
899 setAlignmentProperty(PROP_MISSING, value);
902 else if (!keyword.equalsIgnoreCase(N_SEQS)
903 && !keyword.equalsIgnoreCase(N_SITES))
905 System.err.println("Warning: " + msg);
910 * Returns the trimmed data on the line following either whitespace or '=',
911 * with any trailing semi-colon removed<br>
914 * <li>Hello World</li>
915 * <li>!Hello: \tWorld;</li>
916 * <li>!Hello=World</li>
918 * should all return "World"
923 protected static String getValue(String inputLine)
925 if (inputLine == null)
930 String s = inputLine.replaceAll("\t", " ").trim();
933 * KEYWORD = VALUE should return VALUE
935 int equalsPos = s.indexOf("=");
938 value = s.substring(equalsPos + 1);
942 int spacePos = s.indexOf(' ');
943 value = spacePos == -1 ? "" : s.substring(spacePos + 1);
945 value = value.trim();
946 if (value.endsWith(SEMICOLON))
948 value = value.substring(0, value.length() - 1).trim();
954 * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
955 * sensitive). The latter is the official format, some older data file
956 * examples have it without the !.
961 protected static boolean isTitle(String inputLine)
963 if (inputLine == null)
967 String upper = inputLine.toUpperCase();
968 return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
969 + TITLE.toUpperCase()));
973 * Reads lines until terminated by semicolon, appending each to the
974 * Description property value.
976 * @throws IOException
978 protected void parseDescription(String firstDescriptionLine)
981 StringBuilder desc = new StringBuilder(256);
982 desc.append(getValue(firstDescriptionLine));
983 if (!firstDescriptionLine.endsWith(SEMICOLON))
985 String line = nextNonCommentLine();
988 if (line.endsWith(SEMICOLON))
990 desc.append(line.substring(0, line.length() - 1));
993 else if (line.length() > 0)
995 desc.append(line).append(newline);
997 line = nextNonCommentLine();
1000 setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
1004 * Returns the alignment sequences in Mega format.
1007 public String print()
1009 return MEGA_ID + newline + print(getSeqsAsArray());
1013 * Write out the alignment sequences in Mega format - interleaved unless
1014 * explicitly noninterleaved.
1016 protected String print(SequenceI[] s)
1019 if (this.interleaved != null && !this.interleaved)
1021 result = printNonInterleaved(s);
1025 result = printInterleaved(s);
1031 * Print to string in Interleaved format - blocks of next N characters of each
1036 protected String printInterleaved(SequenceI[] s)
1038 int maxIdLength = getMaxIdLength(s);
1039 int maxSequenceLength = getMaxSequenceLength(s);
1040 int numLines = maxSequenceLength / positionsPerLine + 3; // approx
1042 int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
1043 int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1044 int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
1047 * Roughly size a buffer to hold the whole output
1049 StringBuilder sb = new StringBuilder(numLines
1050 * (maxIdLength + positionsPerLine + chunksPerLine + 10));
1053 * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
1056 for (int i = 0; i < numDataBlocks; i++)
1059 boolean first = true;
1061 for (SequenceI seq : s)
1064 String seqId = String.format("#%-" + maxIdLength + "s",
1068 * output next line for this sequence
1071 int lastPos = seqFrom + positionsPerLine; // exclusive
1072 for (int j = 0; j < chunksPerLine; j++)
1074 char[] subSequence = seq.getSequence(seqFrom,
1075 Math.min(lastPos, seqFrom + spaceEvery));
1076 if (subSequence.length > 0)
1078 sb.append(SPACE).append(subSequence);
1080 seqFrom += subSequence.length;
1083 // all sequences should be the same length in MEGA
1084 advancedBy += subSequence.length;
1087 // write last position as a comment
1088 if (writePositionNumbers)
1090 sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
1091 .append(COMMENT_END);
1099 return new String(sb);
1103 * Outputs to string the MEGA header and any other known and relevant
1104 * alignment properties
1108 protected String printHeaders(AlignmentI al)
1110 StringBuilder sb = new StringBuilder(128);
1111 sb.append(MEGA_ID).append(newline);
1112 String propertyValue = (String) al.getProperty(PROP_TITLE);
1113 if (propertyValue != null)
1115 sb.append(BANG).append(TITLE).append(SPACE).append(propertyValue)
1116 .append(SEMICOLON).append(newline);
1118 propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
1119 if (propertyValue != null)
1121 sb.append(BANG).append(DESCRIPTION).append(newline)
1122 .append(propertyValue).append(SEMICOLON)
1127 * !Format DataType CodeTable
1129 sb.append(BANG).append(FORMAT).append(newline);
1130 String dataType = (String) al.getProperty(PROP_DATATYPE);
1131 if (dataType == null)
1133 dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
1135 sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
1136 String codeTable = (String) al.getProperty(PROP_CODETABLE);
1137 sb.append(SPACE).append(CODETABLE).append(EQUALS)
1138 .append(codeTable == null ? "Standard" : codeTable)
1142 * !Format NSeqs NSites (the length of sequences - they should all be the
1143 * same - including gaps)
1145 sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
1146 sb.append(SPACE).append(N_SITES).append(EQUALS)
1147 .append(String.valueOf(al.getWidth()));
1151 * !Format Indel Identical Missing
1154 sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
1155 String identity = (String) al.getProperty(PROP_IDENTITY);
1156 if (identity != null)
1158 sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
1160 String missing = (String) al.getProperty(PROP_MISSING);
1161 if (missing != null)
1163 sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
1165 sb.append(SEMICOLON).append(newline);
1167 return sb.toString();
1171 * Get the longest sequence id (to allow aligned printout).
1176 protected static int getMaxIdLength(SequenceI[] s)
1178 // TODO pull up for reuse
1180 for (SequenceI seq : s)
1182 int len = seq.getName().length();
1183 if (len > maxLength)
1192 * Get the longest sequence length
1197 protected static int getMaxSequenceLength(SequenceI[] s)
1199 // TODO pull up for reuse
1201 for (SequenceI seq : s)
1203 int len = seq.getLength();
1204 if (len > maxLength)
1213 * Print to string in noninterleaved format - all of each sequence in turn, in
1214 * blocks of 50 characters.
1219 protected String printNonInterleaved(SequenceI[] s)
1221 int maxSequenceLength = getMaxSequenceLength(s);
1223 int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1226 * Roughly size a buffer to hold the whole output
1228 StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
1230 int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1231 int chunksPerLine = positionsPerLine / spaceEvery;
1232 for (SequenceI seq : s)
1235 sb.append(HASHSIGN + seq.getName()).append(newline);
1237 while (startPos < seq.getLength())
1239 boolean firstChunk = true;
1241 * print next line for this sequence
1243 int lastPos = startPos + positionsPerLine; // exclusive
1244 for (int j = 0; j < chunksPerLine; j++)
1246 char[] subSequence = seq.getSequence(startPos,
1247 Math.min(lastPos, startPos + positionsPerLine));
1248 if (subSequence.length > 0)
1254 sb.append(subSequence);
1257 startPos += subSequence.length;
1263 return new String(sb);
1267 * Flag this file as interleaved or not, based on data format. Throws an
1268 * exception if has previously been determined to be otherwise.
1272 * @throws IOException
1274 protected void assertInterleaved(boolean isIt, String dataLine)
1275 throws FileFormatException
1277 if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1279 throw new FileFormatException(
1280 "Parse error: mix of interleaved and noninterleaved detected, at line: "
1283 this.interleaved = new Boolean(isIt);
1284 setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
1287 public boolean isInterleaved()
1289 return this.interleaved == null ? false : this.interleaved
1294 * Adds saved parsed values either as alignment properties, or (in some cases)
1295 * as specific member fields of the alignment
1298 public void addProperties(AlignmentI al)
1300 super.addProperties(al);
1301 al.setGapCharacter(gapCharacter);
1304 * warn if e.g. DataType=DNA but data is protein (or vice versa)
1306 if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
1307 System.err.println("Warning: " + this.title + " declared "
1308 + (nucleotide ? "" : " not ") + "nucleotide but it is"
1309 + (nucleotide ? " not" : ""));
1314 * Print the given alignment in MEGA format. If the alignment was created by
1315 * parsing a MEGA file, it should have properties set (e.g. Title) which can
1316 * influence the output.
1319 public String print(AlignmentI al)
1321 this.nucleotide = al.isNucleotide();
1323 String lineLength = (String) al.getProperty(PROP_LINELENGTH);
1324 this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
1325 .parseInt(lineLength);
1328 * round down to a multiple of 3 positions per line for nucleotide
1332 positionsPerLine = positionsPerLine - (positionsPerLine % 3);
1335 String interleave = (String) al.getProperty(PROP_INTERLEAVED);
1336 if (interleave != null)
1338 this.interleaved = Boolean.valueOf(interleave);
1341 String headers = printHeaders(al);
1342 return headers + print(al.getSequencesArray());
1346 * Returns the number of sequence positions output per line
1350 public int getPositionsPerLine()
1352 return positionsPerLine;
1356 * Sets the number of sequence positions output per line. Note these will be
1357 * formatted in blocks of 3 (nucleotide) or 10 (peptide).
1361 public void setPositionsPerLine(int p)
1363 this.positionsPerLine = p;