2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
3 * Copyright (C) 2014 The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 * Jalview is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
17 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 import jalview.datamodel.AlignmentI;
22 import jalview.datamodel.Sequence;
23 import jalview.datamodel.SequenceFeature;
24 import jalview.datamodel.SequenceI;
26 import java.io.IOException;
27 import java.util.ArrayList;
28 import java.util.HashMap;
29 import java.util.Iterator;
30 import java.util.LinkedHashMap;
31 import java.util.List;
33 import java.util.Map.Entry;
37 * A parser for input or output of MEGA format files. <br>
39 * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
40 * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
41 * Evolution 30: 2725-2729. <br>
44 * MEGA file format is supported as described in
45 * http://www.megasoftware.net/manual.pdf <br>
48 * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
49 * <li>to be completed</li>
52 * @see http://www.megasoftware.net/
54 public class MegaFile extends AlignFile
56 private static final String WHITESPACE = "\\s+";
58 private static final int DEFAULT_LINE_LENGTH = 60;
60 private static final String INDENT = " ";
62 private static final String N_SITES = "NSites";
64 private static final String N_SEQS = "NSeqs";
66 private static final String MISSING = "Missing";
68 private static final String IDENTICAL = "Identical";
70 private static final String INDEL = "Indel";
72 private static final String CODETABLE = "CodeTable";
74 private static final String PROTEIN = "Protein";
76 private static final String NUCLEOTIDE = "Nucleotide";
78 private static final String DATATYPE = "DataType";
80 private static final char COMMENT_START = '[';
82 private static final char COMMENT_END = ']';
84 private static final String HASHSIGN = "#";
86 private static final String SEMICOLON = ";";
88 private static final String BANG = "!";
90 private static final String EQUALS = "=";
92 private static final String MEGA_ID = HASHSIGN + "MEGA";
94 private static final String TITLE = "Title";
96 private static final String FORMAT = "Format";
98 private static final String DESCRIPTION = "Description";
100 private static final String GENE = "Gene";
102 private static final String DOMAIN = "Domain";
104 private static final String PROPERTY = "Property";
106 private static final String CODONSTART = "CodonStart";
109 * names of properties to save to the alignment (may affect eventual output
112 static final String PROP_TITLE = "MEGA_TITLE";
114 static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";
116 static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";
118 static final String PROP_CODETABLE = "MEGA_CODETABLE";
120 static final String PROP_IDENTITY = "MEGA_IDENTITY";
122 static final String PROP_MISSING = "MEGA_MISSING";
124 static final String PROP_DATATYPE = "MEGA_DATATYPE";
126 // number of bases per line of file (value is inferred)
127 static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
129 // TODO: need a controlled name for Gene as a feature if we want to be able to
130 // output the MEGA file with !Gene headers
// TODO: decide how Gene/Domain features should be handled if the sequences get realigned
133 // initial size for sequence data buffer
134 private static final int SEQBUFFERSIZE = 256;
136 private static final String SPACE = " ";
139 * number of sequence positions output per line
141 private int positionsPerLine;
143 private String title;
145 // gap character may be explicitly declared, default is -
146 private char gapCharacter = '-';
148 // identity character if declared
149 private char identityCharacter = 0;
151 // this can be True, False or null (meaning not asserted in file)
152 private Boolean nucleotide;
154 // set once we have seen one block of interleaved data
155 private boolean firstDataBlockRead = false;
157 // this can be True, False or null (meaning we don't know yet)
158 private Boolean interleaved;
160 // write end of line positions as a comment
161 private boolean writePositionNumbers = true;
163 // id of sequence being processed
164 private String currentSequenceId;
167 * Temporary store of {sequenceId, positionData} while parsing interleaved
168 * sequences; sequences are maintained in the order in which they are added
169 * i.e. read in the file
171 Map<String, StringBuilder> seqData;
173 // number of residues read (so far) per sequence
174 Map<String, Integer> residuesRead;
176 // current Gene if any we are parsing
177 private String currentGene;
179 // start residue (base 1) per sequence of current gene
180 Map<String, Integer> geneStart;
182 // current Domain if any we are parsing
183 private String currentDomain;
185 // start residue (base 1) per sequence of current domain
186 Map<String, Integer> domainStart;
188 // map of SequenceFeature's by sequence id
189 Map<String, List<SequenceFeature>> sequenceFeatures;
195 public MegaFile(String inFile, String type) throws IOException
200 public MegaFile(FileParse source) throws IOException
206 * Parse the input stream.
209 public void parse() throws IOException
212 sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
213 geneStart = new HashMap<String, Integer>();
214 domainStart = new HashMap<String, Integer>();
215 residuesRead = new HashMap<String, Integer>();
218 * Read and process MEGA and Title/Format/Description headers if present.
219 * Returns the first data line following the headers.
221 String dataLine = parseHeaderLines();
224 * order-preserving map to hold sequences by id as they are built up during
227 seqData = new LinkedHashMap<String, StringBuilder>();
230 * The id of the sequence being read (for non-interleaved)
232 currentSequenceId = "";
234 while (dataLine != null)
236 dataLine = dataLine.trim();
237 if (dataLine.length() > 0)
239 if (dataLine.startsWith(BANG + GENE)
240 || dataLine.startsWith(BANG + DOMAIN))
242 parseGeneOrDomain(dataLine);
246 currentSequenceId = parseDataLine(dataLine);
249 else if (!seqData.isEmpty())
252 * Blank line after processing some data...
256 dataLine = nextNonCommentLine();
260 * close off any features currently being parsed
262 createFeature(GENE, currentGene, geneStart);
263 createFeature(DOMAIN, currentDomain, domainStart);
265 // remember the (longest) line length read in, so we can output the same
266 setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
272 * Post-processing after reading one block of interleaved data
274 protected void endOfDataBlock()
276 this.firstDataBlockRead = true;
278 // (initialise and) populate arrays of sequence length so far (excluding
280 // On change or end of a denoted Gene or Domain, add sequence features for
285 * Parse a !Gene or !Domain command line. MEGA accepts
287 * <li>!Gene=name;</li>
288 * <li>!Gene=name Property=Coding/Noncoding CodonStart=1/2/3;</li>
289 * <li>!Gene=genename Domain=domainname Property= etc</li>
290 * <li>!Domain=domainname Gene=genename Property= etc</li>
291 * <li>!Domain=domainname Property= etc</li>
292 * <li>!domain=domainname property=domainend</li>
294 * Properly, a Gene should be composed of Domain segments, but MEGA accepts
295 * without. Note that keywords don't seem to be case sensitive.
298 * @throws FileFormatException
300 protected void parseGeneOrDomain(String dataLine)
301 throws FileFormatException
303 String domain = null;
305 String property = null;
306 String codonStart = null;
307 String errorMsg = "Unrecognized format: " + dataLine;
309 if (!dataLine.startsWith(BANG) || !dataLine.endsWith(SEMICOLON))
311 throw new FileFormatException(errorMsg);
313 String trimmed = dataLine.substring(1, dataLine.length() - 1).trim();
314 String[] tokens = trimmed.split(WHITESPACE);
315 for (String token : tokens)
317 String[] keyValue = token.split("=");
318 if (keyValue.length != 2)
320 throw new FileFormatException(errorMsg);
322 String key = keyValue[0];
323 if (GENE.equalsIgnoreCase(key))
327 else if (DOMAIN.equalsIgnoreCase(key))
329 domain = keyValue[1];
331 else if (PROPERTY.equalsIgnoreCase(key))
333 property = keyValue[1];
335 else if (CODONSTART.equalsIgnoreCase(key))
337 codonStart = keyValue[1];
341 System.err.println("Unrecognised token: '" + key + "; in "
346 processGeneOrDomain(gene, domain, property, codonStart);
350 * Process a statement containing one or both of Gene and Domain, and
351 * optionally Property or CodonStart commands.
354 * the Gene name if specified, else null
356 * the Domain name if specified, else null
358 * the Property value if specified, else null
360 * the CodonStart value if specified, else null
362 protected void processGeneOrDomain(String gene, String domain,
363 String property, String codonStart)
365 boolean domainEnd = "domainend".equalsIgnoreCase(property);
368 * If we have been processing a Domain or Gene, and this does not continue
369 * it, then close it off (generate sequence features for it). Do Domain
370 * first as it is in the context of the enclosing gene if any.
372 if (this.currentDomain != null)
374 if (!this.currentDomain.equals(domain) || domainEnd)
376 String description = currentDomain
377 + (currentGene == null ? "" : " (" + currentGene + ")");
378 createFeature(DOMAIN, description, domainStart);
381 if (this.currentGene != null && !this.currentGene.equals(gene))
383 createFeature(GENE, currentGene, geneStart);
387 * and if we have declared a Gene or Domain which does not continue the
388 * current one, then record its start positions per sequence
390 if (gene != null && !gene.equals(currentGene))
392 startSequenceFeature(geneStart);
394 if (domain != null && !domain.equals(currentDomain))
396 startSequenceFeature(domainStart);
400 currentDomain = domainEnd ? null : domain;
404 * Start processing a new feature
406 * @param startPositions
408 protected void startSequenceFeature(Map<String, Integer> startPositions)
411 * If the feature declaration precedes all sequences, we will know in
412 * createFeature that it started with residue 1; otherwise note now where it
413 * starts in each sequence
415 if (!residuesRead.isEmpty())
417 for (Entry<String, Integer> entry : residuesRead.entrySet())
419 String seqId = entry.getKey();
420 Integer nextResidue = entry.getValue() + 1;
421 startPositions.put(seqId, nextResidue);
427 * Add a SequenceFeature to each sequence, using the given start/end values
431 * @param featureValue
432 * @param featureStartResidues
434 protected void createFeature(String featureType, String featureValue,
435 Map<String, Integer> featureStartResidues)
437 if (featureValue == null)
442 Iterator<String> seqids = this.seqData.keySet().iterator();
443 while (seqids.hasNext())
445 String seqid = seqids.next();
446 Integer startAt = featureStartResidues.get(seqid);
447 int sfstart = startAt == null ? 1 : startAt.intValue();
448 int sfend = residuesRead.get(seqid);
449 if (sfend >= sfstart)
452 * don't add feature if entirely gapped in the sequence
454 // TODO: type="Gene" (but then all coloured the same) or
456 SequenceFeature sf = new SequenceFeature(featureValue, featureType,
457 sfstart, sfend, 0f, null);
458 sequenceFeatures.get(seqid).add(sf);
464 * Returns the next line that is not a comment, or null at end of file.
465 * Comments in MEGA are within [ ] brackets, and may be nested.
468 * @throws IOException
470 protected String nextNonCommentLine() throws IOException
472 return nextNonCommentLine(0);
476 * Returns the next non-comment line (or part line), or null at end of file.
477 * Comments in MEGA are within [ ] brackets, and may be nested. They may occur
478 * anywhere within a line (for example at the end with position numbers); this
479 * method returns the line with any comments removed.
482 * current depth of nesting of comments while parsing
484 * @throws IOException
486 protected String nextNonCommentLine(final int depth) throws IOException
494 System.err.println("Warning: unterminated comment in data file");
500 * If we are in a (possibly nested) comment after parsing this line, keep
501 * reading recursively until the comment has unwound
503 int newDepth = commentDepth(data, depth);
506 return nextNonCommentLine(newDepth);
511 * not in a comment by end of this line; return what is left
513 String nonCommentPart = getNonCommentContent(data, depth);
514 return nonCommentPart;
519 * Returns what is left of the input data after removing any comments, whether
520 * 'in progress' from preceding lines, or embedded in the current line
525 * nested depth of comments pending termination
527 * @throws FileFormatException
529 protected static String getNonCommentContent(String data, int depth)
530 throws FileFormatException
532 int len = data.length();
533 StringBuilder result = new StringBuilder(len);
534 for (int i = 0; i < len; i++)
536 char c = data.charAt(i);
561 return result.toString();
565 * Calculates new depth of comment after parsing an input line i.e. the excess
566 * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
567 * treated as comment delimiters).
572 * current comment nested depth before parsing the line
573 * @return new depth after parsing the line
575 protected static int commentDepth(CharSequence data, int depth)
577 int newDepth = depth;
578 int len = data.length();
579 for (int i = 0; i < len; i++)
581 char c = data.charAt(i);
582 if (c == COMMENT_START)
586 else if (c == COMMENT_END && newDepth > 0)
595 * Convert the parsed sequence strings to objects and store them in the model.
597 protected void deriveSequences()
599 Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
601 for (Entry<String, StringBuilder> dataset : datasets)
603 String sequenceId = dataset.getKey();
604 StringBuilder characters = dataset.getValue();
605 SequenceI s = new Sequence(sequenceId, new String(characters));
606 this.seqs.addElement(s);
609 * and add any derived sequence features to the sequence
611 for (SequenceFeature sf : sequenceFeatures.get(sequenceId))
613 s.addSequenceFeature(sf);
619 * Process one line of sequence data. If it has no sequence identifier, append
620 * to the current id's sequence. Else parse out the sequence id and append the
621 * data (if any) to that id's sequence. Returns the sequence id (implicit or
622 * explicit) for this line.
626 * @throws IOException
628 protected String parseDataLine(String dataLine)
631 String seqId = getSequenceId(dataLine);
635 * Just character data
637 parseNoninterleavedDataLine(dataLine);
638 return currentSequenceId;
640 else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
643 * Sequence id only - header line for noninterleaved data
650 * Sequence id followed by data
652 parseInterleavedDataLine(dataLine, seqId);
658 * Add a line of sequence data to the buffer for the given sequence id. Start
659 * a new one if we haven't seen it before.
662 * @throws IOException
664 protected void parseNoninterleavedDataLine(String dataLine)
667 if (currentSequenceId == null)
670 * Oops. Data but no sequence id context.
672 throw new IOException("No sequence id context at: " + dataLine);
675 assertInterleaved(false, dataLine);
677 dataLine = addSequenceData(currentSequenceId, dataLine);
679 setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
683 * Get the sequence data for this sequence id, starting a new one if
689 protected StringBuilder getSequenceDataBuffer(String currentId)
691 StringBuilder sb = seqData.get(currentId);
694 // first data met for this sequence id, start a new buffer
695 sb = new StringBuilder(SEQBUFFERSIZE);
696 seqData.put(currentId, sb);
698 // and a placeholder for any SequenceFeature found
699 sequenceFeatures.put(currentId, new ArrayList<SequenceFeature>());
705 * Parse one line of interleaved data e.g.
708 * #TheSeqId CGATCGCATGCA
713 * @throws FileFormatException
715 protected void parseInterleavedDataLine(String dataLine, String seqId)
716 throws FileFormatException
719 * New sequence found in second or later data block - error.
721 if (this.firstDataBlockRead && !seqData.containsKey(seqId))
723 throw new FileFormatException(
724 "Parse error: misplaced new sequence starting at " + dataLine);
727 String data = dataLine.substring(seqId.length() + 1).trim();
730 * Do nothing if this line is _only_ a sequence id with no data following.
732 if (data != null && data.length() > 0)
734 data = addSequenceData(seqId, data);
735 setPositionsPerLine(Math.max(positionsPerLine, data.length()));
736 assertInterleaved(true, dataLine);
741 * Remove spaces, and replace identity symbol, before appending the sequence
742 * data to the buffer for the sequence id. Returns the reformatted added data.
743 * Also updates a count of residues read for the sequence.
749 protected String addSequenceData(String seqId, String data)
751 StringBuilder sb = getSequenceDataBuffer(seqId);
752 int len = sb.length();
753 String formatted = data.replace(SPACE, "");
756 * If sequence contains '.' or other identity symbol; replace these with the
757 * same position from the first (reference) sequence
760 StringBuilder referenceSequence = seqData.values().iterator().next();
761 StringBuilder sb1 = new StringBuilder(formatted.length());
762 for (int i = 0; i < formatted.length(); i++)
764 char nextChar = formatted.charAt(i);
765 if (nextChar != gapCharacter)
769 if (nextChar == identityCharacter
770 && len + i < referenceSequence.length())
772 sb1.append(referenceSequence.charAt(len + i));
776 sb1.append(nextChar);
779 formatted = sb1.toString();
785 * increment residue count for the sequence
789 Integer residueCount = residuesRead.get(seqId);
790 residuesRead.put(seqId, nonGapped
791 + (residueCount == null ? 0 : residueCount));
798 * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
799 * identifier. Else returns null.
804 public static String getSequenceId(String dataLine)
806 // TODO refactor to a StringUtils type class
807 if (dataLine != null)
809 if (dataLine.startsWith(HASHSIGN))
811 int spacePos = dataLine.indexOf(" ");
812 return (spacePos == -1 ? dataLine.substring(1) : dataLine
813 .substring(1, spacePos));
820 * Read the #MEGA and Title/Format/Description header lines (if present).
822 * Save as alignment properties in case useful.
824 * @return the next non-blank line following the header lines.
825 * @throws IOException
827 protected String parseHeaderLines() throws IOException
829 String inputLine = null;
830 while ((inputLine = nextNonCommentLine()) != null)
832 inputLine = inputLine.trim();
837 if (inputLine.length() == 0)
842 if (inputLine.toUpperCase().startsWith(MEGA_ID))
847 if (isTitle(inputLine))
849 this.title = getValue(inputLine);
850 setAlignmentProperty(PROP_TITLE, title);
852 else if (inputLine.startsWith(BANG + DESCRIPTION))
854 parseDescription(inputLine);
857 else if (inputLine.startsWith(BANG + FORMAT))
859 parseFormat(inputLine);
861 else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
865 * Return the first 'data line' i.e. one that is not blank, #MEGA or
875 * Parse a !Format statement. This may be multiline, and is ended by a
879 * @throws IOException
881 protected void parseFormat(String inputLine) throws IOException
883 while (inputLine != null)
885 parseFormatLine(inputLine);
886 if (inputLine.endsWith(SEMICOLON))
890 inputLine = nextNonCommentLine();
895 * Parse one line of a !Format statement. This may contain one or more
896 * keyword=value pairs.
899 * @throws FileFormatException
901 protected void parseFormatLine(String inputLine)
902 throws FileFormatException
904 if (inputLine.startsWith(BANG + FORMAT))
906 inputLine = inputLine.substring((BANG + FORMAT).length());
908 if (inputLine.endsWith(SEMICOLON))
910 inputLine = inputLine.substring(0, inputLine.length() - 1);
912 if (inputLine.length() == 0)
916 String[] tokens = inputLine.trim().split(WHITESPACE);
917 for (String token : tokens)
919 parseFormatKeyword(token);
924 * Parse a Keyword=Value token. Possible keywords are
926 * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
927 * <li>DataFormat= Interleaved, ?</li>
928 * <li>NSeqs= number of sequences (synonym NTaxa)</li>
929 * <li>NSites= number of bases / residues</li>
930 * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
931 * <li>Indel= gap character</li>
932 * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
933 * <li>Missing= missing data character</li>
934 * <li>CodeTable= Standard, other (MEGA supports various)</li>
938 * @throws FileFormatException
939 * if an unrecognised keyword or value is encountered
941 protected void parseFormatKeyword(String token)
942 throws FileFormatException
944 String msg = "Unrecognised Format command: " + token;
945 String[] bits = token.split(EQUALS);
946 if (bits.length != 2)
948 throw new FileFormatException(msg);
950 String keyword = bits[0];
951 String value = bits[1];
954 * Jalview will work out whether nucleotide or not anyway
956 if (keyword.equalsIgnoreCase(DATATYPE))
958 if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
959 || value.equalsIgnoreCase("Nucleotide"))
961 this.nucleotide = true;
962 // alignment computes whether or not it is nucleotide when created
964 else if (value.equalsIgnoreCase(PROTEIN))
966 this.nucleotide = false;
970 throw new FileFormatException(msg);
972 setAlignmentProperty(PROP_DATATYPE, value);
976 * accept non-Standard code table but save in case we want to disable
977 * 'translate as cDNA'
979 else if (keyword.equalsIgnoreCase(CODETABLE))
981 setAlignmentProperty(PROP_CODETABLE, value);
985 * save gap char to set later on alignment once created
987 else if (keyword.equalsIgnoreCase(INDEL))
989 this.gapCharacter = value.charAt(0);
992 else if (keyword.equalsIgnoreCase(IDENTICAL)
993 || keyword.equalsIgnoreCase("MatchChar"))
995 setAlignmentProperty(PROP_IDENTITY, value);
996 this.identityCharacter = value.charAt(0);
997 if (!".".equals(value))
999 System.err.println("Warning: " + token
1000 + " not supported, Jalview uses '.' for identity");
1004 else if (keyword.equalsIgnoreCase(MISSING))
1006 setAlignmentProperty(PROP_MISSING, value);
1007 System.err.println("Warning: " + token + " not supported");
1010 else if (keyword.equalsIgnoreCase(PROPERTY))
1012 // TODO: can Property appear in a Format command?
1013 // suspect this is a mistake in the manual
1016 else if (!keyword.equalsIgnoreCase(N_SEQS)
1017 && !keyword.equalsIgnoreCase("NTaxa")
1018 && !keyword.equalsIgnoreCase(N_SITES))
1020 System.err.println("Warning: " + msg);
1025 * Returns the trimmed data on the line following either whitespace or '=',
1026 * with any trailing semi-colon removed<br>
1029 * <li>Hello World</li>
1030 * <li>!Hello: \tWorld;</li>
1031 * <li>!Hello=World</li>
1033 * should all return "World"
1038 protected static String getValue(String inputLine)
1040 if (inputLine == null)
1044 String value = null;
1045 String s = inputLine.replaceAll("\t", " ").trim();
1048 * KEYWORD = VALUE should return VALUE
1050 int equalsPos = s.indexOf("=");
1053 value = s.substring(equalsPos + 1);
1057 int spacePos = s.indexOf(' ');
1058 value = spacePos == -1 ? "" : s.substring(spacePos + 1);
1060 value = value.trim();
1061 if (value.endsWith(SEMICOLON))
1063 value = value.substring(0, value.length() - 1).trim();
1069 * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
1070 * sensitive). The latter is the official format, some older data file
1071 * examples have it without the !.
1076 protected static boolean isTitle(String inputLine)
1078 if (inputLine == null)
1082 String upper = inputLine.toUpperCase();
1083 return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
1084 + TITLE.toUpperCase()));
1088 * Reads lines until terminated by semicolon, appending each to the
1089 * Description property value.
1091 * @throws IOException
1093 protected void parseDescription(String firstDescriptionLine)
1096 StringBuilder desc = new StringBuilder(256);
1097 desc.append(getValue(firstDescriptionLine));
1098 if (!firstDescriptionLine.endsWith(SEMICOLON))
1100 String line = nextNonCommentLine();
1101 while (line != null)
1103 if (line.endsWith(SEMICOLON))
1105 desc.append(line.substring(0, line.length() - 1));
1108 else if (line.length() > 0)
1110 desc.append(line).append(newline);
1112 line = nextNonCommentLine();
1115 setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
1119 * Returns the alignment sequences in Mega format.
1122 public String print()
1124 return MEGA_ID + newline + print(getSeqsAsArray());
1128 * Write out the alignment sequences in Mega format - interleaved unless
1129 * explicitly noninterleaved.
1131 protected String print(SequenceI[] s)
1134 if (this.interleaved != null && !this.interleaved)
1136 result = printNonInterleaved(s);
1140 result = printInterleaved(s);
1146 * Print to string in Interleaved format - blocks of next N characters of each
1151 protected String printInterleaved(SequenceI[] s)
1153 int maxIdLength = getMaxIdLength(s);
1154 int maxSequenceLength = getMaxSequenceLength(s);
1155 int numLines = maxSequenceLength / positionsPerLine + 3; // approx
1157 int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
1158 int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1159 int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
1162 * Roughly size a buffer to hold the whole output
1164 StringBuilder sb = new StringBuilder(numLines
1165 * (maxIdLength + positionsPerLine + chunksPerLine + 10));
1168 * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
1171 for (int i = 0; i < numDataBlocks; i++)
1174 boolean first = true;
1176 for (SequenceI seq : s)
1179 String seqId = String.format("#%-" + maxIdLength + "s",
1183 * output next line for this sequence
1186 int lastPos = seqFrom + positionsPerLine; // exclusive
1187 for (int j = 0; j < chunksPerLine; j++)
1189 char[] subSequence = seq.getSequence(seqFrom,
1190 Math.min(lastPos, seqFrom + spaceEvery));
1191 if (subSequence.length > 0)
1193 sb.append(SPACE).append(subSequence);
1195 seqFrom += subSequence.length;
1198 // all sequences should be the same length in MEGA
1199 advancedBy += subSequence.length;
1202 // write last position as a comment
1203 if (writePositionNumbers)
1205 sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
1206 .append(COMMENT_END);
1214 return new String(sb);
1218 * Outputs to string the MEGA header and any other known and relevant
1219 * alignment properties
1223 protected String printHeaders(AlignmentI al)
1225 StringBuilder sb = new StringBuilder(128);
1226 sb.append(MEGA_ID).append(newline);
1227 String propertyValue = (String) al.getProperty(PROP_TITLE);
1228 if (propertyValue != null)
1230 sb.append(BANG).append(TITLE).append(SPACE).append(propertyValue)
1231 .append(SEMICOLON).append(newline);
1233 propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
1234 if (propertyValue != null)
1236 sb.append(BANG).append(DESCRIPTION).append(newline)
1237 .append(propertyValue).append(SEMICOLON)
1242 * !Format DataType CodeTable
1244 sb.append(BANG).append(FORMAT).append(newline);
1245 String dataType = (String) al.getProperty(PROP_DATATYPE);
1246 if (dataType == null)
1248 dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
1250 sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
1251 String codeTable = (String) al.getProperty(PROP_CODETABLE);
1252 sb.append(SPACE).append(CODETABLE).append(EQUALS)
1253 .append(codeTable == null ? "Standard" : codeTable)
1257 * !Format NSeqs NSites (the length of sequences - they should all be the
1258 * same - including gaps)
1260 sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
1261 sb.append(SPACE).append(N_SITES).append(EQUALS)
1262 .append(String.valueOf(al.getWidth()));
1266 * !Format Indel Identical Missing
1269 sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
1270 String identity = (String) al.getProperty(PROP_IDENTITY);
1271 if (identity != null)
1273 sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
1275 String missing = (String) al.getProperty(PROP_MISSING);
1276 if (missing != null)
1278 sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
1280 sb.append(SEMICOLON).append(newline);
1282 return sb.toString();
1286 * Get the longest sequence id (to allow aligned printout).
1291 protected static int getMaxIdLength(SequenceI[] s)
1293 // TODO pull up for reuse
1295 for (SequenceI seq : s)
1297 int len = seq.getName().length();
1298 if (len > maxLength)
1307 * Get the longest sequence length
1312 protected static int getMaxSequenceLength(SequenceI[] s)
1314 // TODO pull up for reuse
1316 for (SequenceI seq : s)
1318 int len = seq.getLength();
1319 if (len > maxLength)
1328 * Print to string in noninterleaved format - all of each sequence in turn, in
1329 * blocks of 50 characters.
1334 protected String printNonInterleaved(SequenceI[] s)
1336 int maxSequenceLength = getMaxSequenceLength(s);
1338 int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1341 * Roughly size a buffer to hold the whole output
1343 StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
1345 int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1346 int chunksPerLine = positionsPerLine / spaceEvery;
1347 for (SequenceI seq : s)
1350 sb.append(HASHSIGN + seq.getName()).append(newline);
1352 while (startPos < seq.getLength())
1354 boolean firstChunk = true;
1356 * print next line for this sequence
1358 int lastPos = startPos + positionsPerLine; // exclusive
1359 for (int j = 0; j < chunksPerLine; j++)
1361 char[] subSequence = seq.getSequence(startPos,
1362 Math.min(lastPos, startPos + positionsPerLine));
1363 if (subSequence.length > 0)
1369 sb.append(subSequence);
1372 startPos += subSequence.length;
1378 return new String(sb);
1382 * Flag this file as interleaved or not, based on data format. Throws an
1383 * exception if has previously been determined to be otherwise.
1387 * @throws IOException
1389 protected void assertInterleaved(boolean isIt, String dataLine)
1390 throws FileFormatException
1392 if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1394 throw new FileFormatException(
1395 "Parse error: mix of interleaved and noninterleaved detected, at line: "
1398 this.interleaved = new Boolean(isIt);
1399 setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
1402 public boolean isInterleaved()
1404 return this.interleaved == null ? false : this.interleaved
1409 * Adds saved parsed values either as alignment properties, or (in some cases)
1410 * as specific member fields of the alignment
1413 public void addProperties(AlignmentI al)
1415 super.addProperties(al);
1416 al.setGapCharacter(gapCharacter);
1419 * warn if e.g. DataType=DNA but data is protein (or vice versa)
1421 if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
1422 System.err.println("Warning: " + this.title + " declared "
1423 + (nucleotide ? "" : " not ") + "nucleotide but it is"
1424 + (nucleotide ? " not" : ""));
1429 * Print the given alignment in MEGA format. If the alignment was created by
1430 * parsing a MEGA file, it should have properties set (e.g. Title) which can
1431 * influence the output.
1434 public String print(AlignmentI al)
1436 this.nucleotide = al.isNucleotide();
1438 String lineLength = (String) al.getProperty(PROP_LINELENGTH);
1439 this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
1440 .parseInt(lineLength);
1443 * round down to a multiple of 3 positions per line for nucleotide
1447 positionsPerLine = positionsPerLine - (positionsPerLine % 3);
1450 String interleave = (String) al.getProperty(PROP_INTERLEAVED);
1451 if (interleave != null)
1453 this.interleaved = Boolean.valueOf(interleave);
1456 String headers = printHeaders(al);
1457 return headers + print(al.getSequencesArray());
1461 * Returns the number of sequence positions output per line
1465 public int getPositionsPerLine()
1467 return positionsPerLine;
1471 * Sets the number of sequence positions output per line. Note these will be
1472 * formatted in blocks of 3 (nucleotide) or 10 (peptide).
1476 public void setPositionsPerLine(int p)
1478 this.positionsPerLine = p;