2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
3 * Copyright (C) 2014 The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 * Jalview is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
17 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 import jalview.datamodel.AlignmentAnnotation;
22 import jalview.datamodel.AlignmentI;
23 import jalview.datamodel.Annotation;
24 import jalview.datamodel.Sequence;
25 import jalview.datamodel.SequenceFeature;
26 import jalview.datamodel.SequenceI;
28 import java.io.IOException;
29 import java.util.ArrayList;
30 import java.util.HashMap;
31 import java.util.Iterator;
32 import java.util.LinkedHashMap;
33 import java.util.List;
35 import java.util.Map.Entry;
39 * A parser for input or output of MEGA format files. <br>
41 * Tamura K, Stecher G, Peterson D, Filipski A, and Kumar S (2013) MEGA6:
42 * Molecular Evolutionary Genetics Analysis Version 6.0. Molecular Biology and
43 * Evolution 30: 2725-2729. <br>
46 * MEGA file format is supported as described in
47 * http://www.megasoftware.net/manual.pdf <br>
50 * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
51 * <li>to be completed</li>
54 * @see http://www.megasoftware.net/
56 public class MegaFile extends AlignFile
58 private static final char UNDERSCORE = '_';
60 private static final String WHITESPACE = "\\s+";
62 private static final int DEFAULT_LINE_LENGTH = 60;
64 private static final String INDENT = " ";
66 private static final String N_SITES = "NSites";
68 private static final String N_SEQS = "NSeqs";
70 private static final String MISSING = "Missing";
72 private static final String IDENTICAL = "Identical";
74 private static final String INDEL = "Indel";
76 private static final String CODETABLE = "CodeTable";
78 private static final String PROTEIN = "Protein";
80 private static final String NUCLEOTIDE = "Nucleotide";
82 private static final String DATATYPE = "DataType";
84 private static final char COMMENT_START = '[';
86 private static final char COMMENT_END = ']';
88 private static final String HASHSIGN = "#";
90 private static final String SEMICOLON = ";";
92 private static final String BANG = "!";
94 private static final String EQUALS = "=";
96 private static final String MEGA_ID = HASHSIGN + "MEGA";
98 private static final String TITLE = "Title";
100 private static final String FORMAT = "Format";
102 private static final String DESCRIPTION = "Description";
104 private static final String GENE = "Gene";
106 private static final String DOMAIN = "Domain";
108 private static final String PROPERTY = "Property";
110 private static final String CODONSTART = "CodonStart";
112 private static final String LABEL = "Label";
115 * names of properties to save to the alignment (may affect eventual output
118 static final String PROP_TITLE = "MEGA_TITLE";
120 static final String PROP_INTERLEAVED = "MEGA_INTERLEAVED";
122 static final String PROP_DESCRIPTION = "MEGA_DESCRIPTION";
124 static final String PROP_CODETABLE = "MEGA_CODETABLE";
126 static final String PROP_IDENTITY = "MEGA_IDENTITY";
128 static final String PROP_MISSING = "MEGA_MISSING";
130 static final String PROP_DATATYPE = "MEGA_DATATYPE";
132 // number of bases per line of file (value is inferred)
133 static final String PROP_LINELENGTH = "MEGA_LINELENGTH";
135 // TODO: need a controlled name for Gene as a feature if we want to be able to
136 // output the MEGA file with !Gene headers
137 // TODO: decide how Gene/Domain features should be handled if the sequences get realigned
139 // initial size for sequence data buffer
140 private static final int SEQBUFFERSIZE = 256;
142 private static final String SPACE = " ";
144 private static final String TAB = "\t";
147 * number of sequence positions output per line
149 private int positionsPerLine;
151 private String title;
153 // gap character may be explicitly declared, default is -
154 private char gapCharacter = '-';
156 // identity character if declared
157 private char identityCharacter = 0;
159 // this can be True, False or null (meaning not asserted in file)
160 private Boolean nucleotide;
162 // set once we have seen one block of interleaved data
163 private boolean firstDataBlockRead = false;
165 // this can be True, False or null (meaning we don't know yet)
166 private Boolean interleaved;
168 // write end of line positions as a comment
169 private boolean writePositionNumbers = true;
171 // id of sequence being processed
172 private String currentSequenceId;
175 * Temporary store of {sequenceId, positionData} while parsing interleaved
176 * sequences; sequences are maintained in the order in which they are added
177 * i.e. read in the file
179 Map<String, StringBuilder> seqData;
181 // number of residues read (so far) per sequence
182 Map<String, Integer> residuesRead;
184 // current Gene if any we are parsing
185 private String currentGene;
187 // start residue (base 1) per sequence of current gene
188 Map<String, Integer> geneStart;
190 // current Domain if any we are parsing
191 private String currentDomain;
193 // start residue (base 1) per sequence of current domain
194 Map<String, Integer> domainStart;
196 // map of SequenceFeature's by sequence id
197 Map<String, List<SequenceFeature>> sequenceFeatures;
199 // each !Label line character becomes an Annotation (except underscores)
200 List<Annotation> labelAnnotations;
206 public MegaFile(String inFile, String type) throws IOException
211 public MegaFile(FileParse source) throws IOException
217 * Parse the input stream.
220 public void parse() throws IOException
223 sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
224 geneStart = new HashMap<String, Integer>();
225 domainStart = new HashMap<String, Integer>();
226 residuesRead = new HashMap<String, Integer>();
227 labelAnnotations = new ArrayList<Annotation>();
230 * Read and process MEGA and Title/Format/Description headers if present.
231 * Returns the first data line following the headers.
233 String dataLine = parseHeaderLines();
236 * order-preserving map to hold sequences by id as they are built up during
239 seqData = new LinkedHashMap<String, StringBuilder>();
242 * The id of the sequence being read (for non-interleaved)
244 currentSequenceId = "";
246 while (dataLine != null)
248 dataLine = dataLine.trim();
249 if (dataLine.length() > 0)
251 dataLine = dataLine.replace(TAB, SPACE);
252 String upperCased = dataLine.toUpperCase();
253 if (upperCased.startsWith(BANG + GENE.toUpperCase())
254 || upperCased.startsWith(BANG + DOMAIN.toUpperCase()))
256 parseGeneOrDomain(dataLine);
258 else if (upperCased.startsWith(BANG + LABEL.toUpperCase()))
260 parseLabel(dataLine);
264 currentSequenceId = parseDataLine(dataLine);
267 else if (!seqData.isEmpty())
270 * Blank line after processing some data...
274 dataLine = nextNonCommentLine();
278 * close off any features currently being parsed
280 createFeature(GENE, currentGene, geneStart);
281 createFeature(DOMAIN, currentDomain, domainStart);
283 // remember the (longest) line length read in, so we can output the same
284 setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
286 deriveSequencesAndFeatures();
292 * If we parsed !Label statements into a list of Annotation objects, create an
293 * AlignmentAnnotation
295 protected void deriveAnnotations()
297 if (this.labelAnnotations.size() > 0)
299 Annotation[] anns = labelAnnotations
300 .toArray(new Annotation[labelAnnotations.size()]);
301 AlignmentAnnotation aa = new AlignmentAnnotation("MEGA", "Label",
303 this.annotations.add(aa);
308 * Parse a !Label line. This contains a single character per position (column)
309 * of the alignment block above. An underscore character represents no label.
310 * Labels are assembled into an AlignmentAnnotation object.
313 * @throws FileFormatException
315 protected void parseLabel(String dataLine) throws FileFormatException
317 // strip off leading !Label and following spaces
318 dataLine = dataLine.substring(LABEL.length() + 1).trim();
320 // remove internal spacing and any leading tab
321 String labels = dataLine.replace(SPACE, "");
322 if (labels.endsWith(SEMICOLON))
324 labels = labels.substring(0, labels.length() - 1);
328 System.err.println("Warning: '" + dataLine
329 + "' should end with semi-colon");
331 for (char c : labels.toCharArray())
335 this.labelAnnotations.add(null);
339 this.labelAnnotations.add(new Annotation(String.valueOf(c), "",
345 * sanity check - the number of labels added should exactly match the
346 * sequence length so far
348 int sequenceLength = seqData.isEmpty() ? 0 : seqData.values()
349 .iterator().next().length();
350 if (labelAnnotations.size() != sequenceLength)
352 System.err.println("Warning: file inconsistent - "
353 + labelAnnotations.size() + " labels for " + sequenceLength
354 + " positions after " + dataLine);
359 * Post-processing after reading one block of interleaved data
361 protected void endOfDataBlock()
363 this.firstDataBlockRead = true;
365 // (initialise and) populate arrays of sequence length so far (excluding
367 // On change or end of a denoted Gene or Domain, add sequence features for
372 * Parse a !Gene or !Domain command line. MEGA accepts
374 * <li>!Gene=name;</li>
375 * <li>!Gene=name Property=Coding/Noncoding CodonStart=1/2/3;</li>
376 * <li>!Gene=genename Domain=domainname Property= etc</li>
377 * <li>!Domain=domainname Gene=genename Property= etc</li>
378 * <li>!Domain=domainname Property= etc</li>
379 * <li>!domain=domainname property=domainend</li>
381 * Properly, a Gene should be composed of Domain segments, but MEGA accepts
382 * without. Note that keywords don't seem to be case sensitive.
385 * @throws FileFormatException
387 protected void parseGeneOrDomain(String dataLine)
388 throws FileFormatException
390 String domain = null;
392 String property = null;
393 String codonStart = null;
394 String errorMsg = "Unrecognized format: " + dataLine;
396 if (!dataLine.startsWith(BANG) || !dataLine.endsWith(SEMICOLON))
398 throw new FileFormatException(errorMsg);
400 String trimmed = dataLine.substring(1, dataLine.length() - 1).trim();
401 String[] tokens = trimmed.split(WHITESPACE);
402 for (String token : tokens)
404 String[] keyValue = token.split("=");
405 if (keyValue.length != 2)
407 throw new FileFormatException(errorMsg);
409 String key = keyValue[0];
410 if (GENE.equalsIgnoreCase(key))
414 else if (DOMAIN.equalsIgnoreCase(key))
416 domain = keyValue[1];
418 else if (PROPERTY.equalsIgnoreCase(key))
420 property = keyValue[1];
422 else if (CODONSTART.equalsIgnoreCase(key))
424 codonStart = keyValue[1];
428 System.err.println("Unrecognised token: '" + key + "; in "
433 processGeneOrDomain(gene, domain, property, codonStart);
437 * Process a statement containing one or both of Gene and Domain, and
438 * optionally Property or CodonStart commands.
441 * the Gene name if specified, else null
443 * the Domain name if specified, else null
445 * the Property value if specified, else null
447 * the CodonStart value if specified, else null
449 protected void processGeneOrDomain(String gene, String domain,
450 String property, String codonStart)
453 * the order of processing below ensures that we correctly capture where a
454 * domain is in the context of an enclosing gene
456 processDomainEnd(domain, gene, property);
458 processGeneEnd(gene);
460 processGeneStart(gene);
462 processDomainStart(domain, property);
464 // TODO save codonStart if we plan to involve it in 'translate as cDNA'
468 * If we have declared a domain, and it is not continuing, start a sequence
474 protected void processDomainStart(String domain, String property)
476 if ("domainend".equalsIgnoreCase(property))
478 currentDomain = null;
482 if (domain != null && !domain.equals(currentDomain))
484 String verboseDomain = makeVerboseDomainName(domain, property);
485 startSequenceFeature(domainStart);
487 currentDomain = verboseDomain;
492 * If we have declared a gene, and it is not continuing, start a sequence
497 protected void processGeneStart(String gene)
499 if (gene != null && !gene.equals(currentGene))
501 startSequenceFeature(geneStart);
507 * If we have been processing a domain, and it is not being continued, then
508 * make a sequence feature for the domain just ended. Criteria for the domain
509 * not being continued are either an explicit new domain or gene name, or a
510 * 'Property=domainend' statement
515 * @return true if a feature is created, else false
517 protected boolean processDomainEnd(String domain, String gene,
520 boolean newGene = (gene != null && !gene.equals(currentGene));
522 String verboseDomain = makeVerboseDomainName(domain, property);
524 if (this.currentDomain != null)
526 boolean newDomain = !this.currentDomain.equals(verboseDomain);
527 boolean domainEnded = "domainend".equalsIgnoreCase(property);
528 if (newDomain || newGene || domainEnded)
530 createFeature(DOMAIN, currentDomain, domainStart);
531 currentDomain = null;
539 * If we have been processing a gene, and it is not being continued, then make
540 * a sequence feature for the gene just ended
543 * @return true if a feature is created, else false
545 protected boolean processGeneEnd(String gene)
547 boolean created = false;
549 * If we were processing a gene and now have either another, or none, create
550 * a sequence feature for that gene
552 if (this.currentGene != null && !this.currentGene.equals(gene))
554 createFeature(GENE, currentGene, geneStart);
563 * Makes an expanded descriptive name for Domain if possible e.g.
564 * "Intron1 (Adh Coding)". Currently incorporates the current gene name (if
565 * any) and the Coding/Noncoding property value (if given).
571 protected String makeVerboseDomainName(String domain, String property)
573 String verboseDomain = domain;
577 if ("Exon".equalsIgnoreCase(property)
578 || "Coding".equalsIgnoreCase(property))
582 else if ("Intron".equalsIgnoreCase(property)
583 || "Noncoding".equalsIgnoreCase(property))
585 coding = " Noncoding";
587 verboseDomain = domain
588 + (currentGene == null ? "" : " (" + currentGene + coding
591 return verboseDomain;
595 * Start processing a new feature
597 * @param startPositions
599 protected void startSequenceFeature(Map<String, Integer> startPositions)
602 * If the feature declaration precedes all sequences, we will know in
603 * createFeature that it started with residue 1; otherwise note now where it
604 * starts in each sequence
606 if (!residuesRead.isEmpty())
608 for (Entry<String, Integer> entry : residuesRead.entrySet())
610 String seqId = entry.getKey();
611 Integer nextResidue = entry.getValue() + 1;
612 startPositions.put(seqId, nextResidue);
618 * Add a SequenceFeature to each sequence, using the given start/end values
622 * @param featureValue
623 * @param featureStartResidues
625 protected void createFeature(String featureType, String featureValue,
626 Map<String, Integer> featureStartResidues)
628 if (featureValue == null)
633 Iterator<String> seqids = this.seqData.keySet().iterator();
634 while (seqids.hasNext())
636 String seqid = seqids.next();
637 Integer startAt = featureStartResidues.get(seqid);
638 int sfstart = startAt == null ? 1 : startAt.intValue();
639 int sfend = residuesRead.get(seqid);
640 if (sfend >= sfstart)
643 * don't add feature if entirely gapped in the sequence
645 // TODO: type="Gene" (but then all coloured the same) or
647 SequenceFeature sf = new SequenceFeature(featureValue, featureType,
648 sfstart, sfend, 0f, null);
649 sequenceFeatures.get(seqid).add(sf);
655 * Returns the next line that is not a comment, or null at end of file.
656 * Comments in MEGA are within [ ] brackets, and may be nested.
659 * @throws IOException
661 protected String nextNonCommentLine() throws IOException
663 return nextNonCommentLine(0);
667 * Returns the next non-comment line (or part line), or null at end of file.
668 * Comments in MEGA are within [ ] brackets, and may be nested. They may occur
669 * anywhere within a line (for example at the end with position numbers); this
670 * method returns the line with any comments removed.
673 * current depth of nesting of comments while parsing
675 * @throws IOException
677 protected String nextNonCommentLine(final int depth) throws IOException
685 System.err.println("Warning: unterminated comment in data file");
691 * If we are in a (possibly nested) comment after parsing this line, keep
692 * reading recursively until the comment has unwound
694 int newDepth = commentDepth(data, depth);
697 return nextNonCommentLine(newDepth);
702 * not in a comment by end of this line; return what is left
704 String nonCommentPart = getNonCommentContent(data, depth);
705 return nonCommentPart;
710 * Returns what is left of the input data after removing any comments, whether
711 * 'in progress' from preceding lines, or embedded in the current line
716 * nested depth of comments pending termination
718 * @throws FileFormatException
720 protected static String getNonCommentContent(String data, int depth)
721 throws FileFormatException
723 int len = data.length();
724 StringBuilder result = new StringBuilder(len);
725 for (int i = 0; i < len; i++)
727 char c = data.charAt(i);
752 return result.toString();
756 * Calculates new depth of comment after parsing an input line i.e. the excess
757 * of opening '[' over closing ']' characters. Any excess ']' are ignored (not
758 * treated as comment delimiters).
763 * current comment nested depth before parsing the line
764 * @return new depth after parsing the line
766 protected static int commentDepth(CharSequence data, int depth)
768 int newDepth = depth;
769 int len = data.length();
770 for (int i = 0; i < len; i++)
772 char c = data.charAt(i);
773 if (c == COMMENT_START)
777 else if (c == COMMENT_END && newDepth > 0)
786 * Convert the parsed sequence strings to objects and store them in the model.
788 protected void deriveSequencesAndFeatures()
790 Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
792 for (Entry<String, StringBuilder> dataset : datasets)
794 String sequenceId = dataset.getKey();
795 StringBuilder characters = dataset.getValue();
796 SequenceI s = new Sequence(sequenceId, new String(characters));
797 this.seqs.addElement(s);
800 * and add any derived sequence features to the sequence
802 for (SequenceFeature sf : sequenceFeatures.get(sequenceId))
804 s.addSequenceFeature(sf);
810 * Process one line of sequence data. If it has no sequence identifier, append
811 * to the current id's sequence. Else parse out the sequence id and append the
812 * data (if any) to that id's sequence. Returns the sequence id (implicit or
813 * explicit) for this line.
817 * @throws IOException
819 protected String parseDataLine(String dataLine)
822 String seqId = getSequenceId(dataLine);
826 * Just character data
828 parseNoninterleavedDataLine(dataLine);
829 return currentSequenceId;
831 else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
834 * Sequence id only - header line for noninterleaved data
841 * Sequence id followed by data
843 parseInterleavedDataLine(dataLine, seqId);
849 * Add a line of sequence data to the buffer for the given sequence id. Start
850 * a new one if we haven't seen it before.
853 * @throws IOException
855 protected void parseNoninterleavedDataLine(String dataLine)
856 throws FileFormatException
858 if (currentSequenceId == null)
861 * Oops. Data but no sequence id context.
863 throw new FileFormatException("No sequence id context at: "
867 assertInterleaved(false, dataLine);
869 dataLine = addSequenceData(currentSequenceId, dataLine);
871 setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
875 * Get the sequence data for this sequence id, starting a new one if
881 protected StringBuilder getSequenceDataBuffer(String currentId)
883 StringBuilder sb = seqData.get(currentId);
886 // first data met for this sequence id, start a new buffer
887 sb = new StringBuilder(SEQBUFFERSIZE);
888 seqData.put(currentId, sb);
890 // and a placeholder for any SequenceFeature found
891 sequenceFeatures.put(currentId, new ArrayList<SequenceFeature>());
897 * Parse one line of interleaved data e.g.
900 * #TheSeqId CGATCGCATGCA
905 * @throws FileFormatException
907 protected void parseInterleavedDataLine(String dataLine, String seqId)
908 throws FileFormatException
911 * New sequence found in second or later data block - error.
913 if (this.firstDataBlockRead && !seqData.containsKey(seqId))
915 throw new FileFormatException(
916 "Parse error: misplaced new sequence starting at " + dataLine);
919 String data = dataLine.substring(seqId.length() + 1).trim();
922 * Do nothing if this line is _only_ a sequence id with no data following.
924 if (data != null && data.length() > 0)
926 data = addSequenceData(seqId, data);
927 setPositionsPerLine(Math.max(positionsPerLine, data.length()));
928 assertInterleaved(true, dataLine);
933 * Remove spaces, and replace identity symbol, before appending the sequence
934 * data to the buffer for the sequence id. Returns the reformatted added data.
935 * Also updates a count of residues read for the sequence.
941 protected String addSequenceData(String seqId, String data)
943 StringBuilder sb = getSequenceDataBuffer(seqId);
944 int len = sb.length();
945 String formatted = data.replace(SPACE, "");
948 * If sequence contains '.' or other identity symbol; replace these with the
949 * same position from the first (reference) sequence
952 StringBuilder referenceSequence = seqData.values().iterator().next();
953 StringBuilder sb1 = new StringBuilder(formatted.length());
954 for (int i = 0; i < formatted.length(); i++)
956 char nextChar = formatted.charAt(i);
957 if (nextChar != gapCharacter)
961 if (nextChar == identityCharacter
962 && len + i < referenceSequence.length())
964 sb1.append(referenceSequence.charAt(len + i));
968 sb1.append(nextChar);
971 formatted = sb1.toString();
977 * increment residue count for the sequence
981 Integer residueCount = residuesRead.get(seqId);
982 residuesRead.put(seqId, nonGapped
983 + (residueCount == null ? 0 : residueCount));
990 * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
991 * identifier. Else returns null.
996 public static String getSequenceId(String dataLine)
998 // TODO refactor to a StringUtils type class
999 if (dataLine != null)
1001 if (dataLine.startsWith(HASHSIGN))
1003 int spacePos = dataLine.indexOf(" ");
1004 return (spacePos == -1 ? dataLine.substring(1) : dataLine
1005 .substring(1, spacePos));
1012 * Read the #MEGA and Title/Format/Description header lines (if present).
1014 * Save as alignment properties in case useful.
1016 * @return the next non-blank line following the header lines.
1017 * @throws IOException
1019 protected String parseHeaderLines() throws IOException
1021 String inputLine = null;
1022 while ((inputLine = nextNonCommentLine()) != null)
1024 inputLine = inputLine.trim();
1029 if (inputLine.length() == 0)
1034 if (inputLine.toUpperCase().startsWith(MEGA_ID))
1039 if (isTitle(inputLine))
1041 this.title = getValue(inputLine);
1042 setAlignmentProperty(PROP_TITLE, title);
1044 else if (inputLine.startsWith(BANG + DESCRIPTION))
1046 parseDescription(inputLine);
1049 else if (inputLine.startsWith(BANG + FORMAT))
1051 parseFormat(inputLine);
1053 else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
1057 * Return the first 'data line' i.e. one that is not blank, #MEGA or
1067 * Parse a !Format statement. This may be multiline, and is ended by a
1071 * @throws IOException
1073 protected void parseFormat(String inputLine) throws IOException
1075 while (inputLine != null)
1077 parseFormatLine(inputLine);
1078 if (inputLine.endsWith(SEMICOLON))
1082 inputLine = nextNonCommentLine();
1087 * Parse one line of a !Format statement. This may contain one or more
1088 * keyword=value pairs.
1091 * @throws FileFormatException
1093 protected void parseFormatLine(String inputLine)
1094 throws FileFormatException
1096 if (inputLine.startsWith(BANG + FORMAT))
1098 inputLine = inputLine.substring((BANG + FORMAT).length());
1100 if (inputLine.endsWith(SEMICOLON))
1102 inputLine = inputLine.substring(0, inputLine.length() - 1);
1104 if (inputLine.length() == 0)
1108 String[] tokens = inputLine.trim().split(WHITESPACE);
1109 for (String token : tokens)
1111 parseFormatKeyword(token);
1116 * Parse a Keyword=Value token. Possible keywords are
1118 * <li>DataType= DNA, RNA, Nucleotide, Protein</li>
1119 * <li>DataFormat= Interleaved, ?</li>
1120 * <li>NSeqs= number of sequences (synonym NTaxa)</li>
1121 * <li>NSites= number of bases / residues</li>
1122 * <li>Property= Exon (or Coding), Intron (or Noncoding), End (of domain)</li>
1123 * <li>Indel= gap character</li>
1124 * <li>Identical= identity character (to first sequence) (synonym MatchChar)</li>
1125 * <li>Missing= missing data character</li>
1126 * <li>CodeTable= Standard, other (MEGA supports various)</li>
1130 * @throws FileFormatException
1131 * if an unrecognised keyword or value is encountered
1133 protected void parseFormatKeyword(String token)
1134 throws FileFormatException
1136 String msg = "Unrecognised Format command: " + token;
1137 String[] bits = token.split(EQUALS);
1138 if (bits.length != 2)
1140 throw new FileFormatException(msg);
1142 String keyword = bits[0];
1143 String value = bits[1];
1146 * Jalview will work out whether nucleotide or not anyway
1148 if (keyword.equalsIgnoreCase(DATATYPE))
1150 if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA")
1151 || value.equalsIgnoreCase("Nucleotide"))
1153 this.nucleotide = true;
1154 // alignment computes whether or not it is nucleotide when created
1156 else if (value.equalsIgnoreCase(PROTEIN))
1158 this.nucleotide = false;
1162 throw new FileFormatException(msg);
1164 setAlignmentProperty(PROP_DATATYPE, value);
1168 * accept non-Standard code table but save in case we want to disable
1169 * 'translate as cDNA'
1171 else if (keyword.equalsIgnoreCase(CODETABLE))
1173 setAlignmentProperty(PROP_CODETABLE, value);
1177 * save gap char to set later on alignment once created
1179 else if (keyword.equalsIgnoreCase(INDEL))
1181 this.gapCharacter = value.charAt(0);
1184 else if (keyword.equalsIgnoreCase(IDENTICAL)
1185 || keyword.equalsIgnoreCase("MatchChar"))
1187 setAlignmentProperty(PROP_IDENTITY, value);
1188 this.identityCharacter = value.charAt(0);
1189 if (!".".equals(value))
1191 System.err.println("Warning: " + token
1192 + " not supported, Jalview uses '.' for identity");
1196 else if (keyword.equalsIgnoreCase(MISSING))
1198 setAlignmentProperty(PROP_MISSING, value);
1199 System.err.println("Warning: " + token + " not supported");
1202 else if (keyword.equalsIgnoreCase(PROPERTY))
1204 // TODO: can Property appear in a Format command?
1205 // suspect this is a mistake in the manual
1208 else if (!keyword.equalsIgnoreCase(N_SEQS)
1209 && !keyword.equalsIgnoreCase("NTaxa")
1210 && !keyword.equalsIgnoreCase(N_SITES))
1212 System.err.println("Warning: " + msg);
1217 * Returns the trimmed data on the line following either whitespace or '=',
1218 * with any trailing semi-colon removed<br>
1221 * <li>Hello World</li>
1222 * <li>!Hello: \tWorld;</li>
1223 * <li>!Hello=World</li>
1225 * should all return "World"
1230 protected static String getValue(String inputLine)
1232 if (inputLine == null)
1236 String value = null;
1237 String s = inputLine.replaceAll("\t", " ").trim();
1240 * KEYWORD = VALUE should return VALUE
1242 int equalsPos = s.indexOf("=");
1245 value = s.substring(equalsPos + 1);
1249 int spacePos = s.indexOf(' ');
1250 value = spacePos == -1 ? "" : s.substring(spacePos + 1);
1252 value = value.trim();
1253 if (value.endsWith(SEMICOLON))
1255 value = value.substring(0, value.length() - 1).trim();
1261 * Returns true if the input line starts with "TITLE" or "!TITLE" (not case
1262 * sensitive). The latter is the official format, some older data file
1263 * examples have it without the !.
1268 protected static boolean isTitle(String inputLine)
1270 if (inputLine == null)
1274 String upper = inputLine.toUpperCase();
1275 return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG
1276 + TITLE.toUpperCase()));
1280 * Reads lines until terminated by semicolon, appending each to the
1281 * Description property value.
1283 * @throws IOException
1285 protected void parseDescription(String firstDescriptionLine)
1288 StringBuilder desc = new StringBuilder(256);
1289 desc.append(getValue(firstDescriptionLine));
1290 if (!firstDescriptionLine.endsWith(SEMICOLON))
1292 String line = nextNonCommentLine();
1293 while (line != null)
1295 if (line.endsWith(SEMICOLON))
1297 desc.append(line.substring(0, line.length() - 1));
1300 else if (line.length() > 0)
1302 desc.append(line).append(newline);
1304 line = nextNonCommentLine();
1307 setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
1311 * Returns the alignment sequences in Mega format.
1314 public String print()
1316 return MEGA_ID + newline + print(getSeqsAsArray());
1320 * Write out the alignment sequences in Mega format - interleaved unless
1321 * explicitly noninterleaved.
1323 protected String print(SequenceI[] s)
1326 if (this.interleaved != null && !this.interleaved)
1328 result = printNonInterleaved(s);
1332 result = printInterleaved(s);
1338 * Print to string in Interleaved format - blocks of next N characters of each
1343 protected String printInterleaved(SequenceI[] s)
1345 int maxIdLength = getMaxIdLength(s);
1346 int maxSequenceLength = getMaxSequenceLength(s);
1347 int numLines = maxSequenceLength / positionsPerLine + 3; // approx
1349 int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1;
1350 int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1351 int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery;
1354 * Roughly size a buffer to hold the whole output
1356 StringBuilder sb = new StringBuilder(numLines
1357 * (maxIdLength + positionsPerLine + chunksPerLine + 10));
1360 * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide
1363 for (int i = 0; i < numDataBlocks; i++)
1366 boolean first = true;
1368 for (SequenceI seq : s)
1371 String seqId = String.format("#%-" + maxIdLength + "s",
1375 * output next line for this sequence
1378 int lastPos = seqFrom + positionsPerLine; // exclusive
1379 for (int j = 0; j < chunksPerLine; j++)
1381 char[] subSequence = seq.getSequence(seqFrom,
1382 Math.min(lastPos, seqFrom + spaceEvery));
1383 if (subSequence.length > 0)
1385 sb.append(SPACE).append(subSequence);
1387 seqFrom += subSequence.length;
1390 // all sequences should be the same length in MEGA
1391 advancedBy += subSequence.length;
1394 // write last position as a comment
1395 if (writePositionNumbers)
1397 sb.append(SPACE).append(COMMENT_START).append(from + advancedBy)
1398 .append(COMMENT_END);
1406 return new String(sb);
1410 * Outputs to string the MEGA header and any other known and relevant
1411 * alignment properties
1415 protected String printHeaders(AlignmentI al)
1417 StringBuilder sb = new StringBuilder(128);
1418 sb.append(MEGA_ID).append(newline);
1419 String propertyValue = (String) al.getProperty(PROP_TITLE);
1420 if (propertyValue != null)
1422 sb.append(BANG).append(TITLE).append(SPACE).append(propertyValue)
1423 .append(SEMICOLON).append(newline);
1425 propertyValue = (String) al.getProperty(PROP_DESCRIPTION);
1426 if (propertyValue != null)
1428 sb.append(BANG).append(DESCRIPTION).append(newline)
1429 .append(propertyValue).append(SEMICOLON)
1434 * !Format DataType CodeTable
1436 sb.append(BANG).append(FORMAT).append(newline);
1437 String dataType = (String) al.getProperty(PROP_DATATYPE);
1438 if (dataType == null)
1440 dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN;
1442 sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType);
1443 String codeTable = (String) al.getProperty(PROP_CODETABLE);
1444 sb.append(SPACE).append(CODETABLE).append(EQUALS)
1445 .append(codeTable == null ? "Standard" : codeTable)
1449 * !Format NSeqs NSites (the length of sequences - they should all be the
1450 * same - including gaps)
1452 sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight());
1453 sb.append(SPACE).append(N_SITES).append(EQUALS)
1454 .append(String.valueOf(al.getWidth()));
1458 * !Format Indel Identical Missing
1461 sb.append(INDEL).append(EQUALS).append(al.getGapCharacter());
1462 String identity = (String) al.getProperty(PROP_IDENTITY);
1463 if (identity != null)
1465 sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity);
1467 String missing = (String) al.getProperty(PROP_MISSING);
1468 if (missing != null)
1470 sb.append(SPACE).append(MISSING).append(EQUALS).append(missing);
1472 sb.append(SEMICOLON).append(newline);
1474 return sb.toString();
1478 * Get the longest sequence id (to allow aligned printout).
1483 protected static int getMaxIdLength(SequenceI[] s)
1485 // TODO pull up for reuse
1487 for (SequenceI seq : s)
1489 int len = seq.getName().length();
1490 if (len > maxLength)
1499 * Get the longest sequence length
1504 protected static int getMaxSequenceLength(SequenceI[] s)
1506 // TODO pull up for reuse
1508 for (SequenceI seq : s)
1510 int len = seq.getLength();
1511 if (len > maxLength)
1520 * Print to string in noninterleaved format - all of each sequence in turn, in
1521 * blocks of 50 characters.
1526 protected String printNonInterleaved(SequenceI[] s)
1528 int maxSequenceLength = getMaxSequenceLength(s);
1530 int numLines = maxSequenceLength / positionsPerLine + 2 + s.length;
1533 * Roughly size a buffer to hold the whole output
1535 StringBuilder sb = new StringBuilder(numLines * positionsPerLine);
1537 int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
1538 int chunksPerLine = positionsPerLine / spaceEvery;
1539 for (SequenceI seq : s)
1542 sb.append(HASHSIGN + seq.getName()).append(newline);
1544 while (startPos < seq.getLength())
1546 boolean firstChunk = true;
1548 * print next line for this sequence
1550 int lastPos = startPos + positionsPerLine; // exclusive
1551 for (int j = 0; j < chunksPerLine; j++)
1553 char[] subSequence = seq.getSequence(startPos,
1554 Math.min(lastPos, startPos + positionsPerLine));
1555 if (subSequence.length > 0)
1561 sb.append(subSequence);
1564 startPos += subSequence.length;
1570 return new String(sb);
1574 * Flag this file as interleaved or not, based on data format. Throws an
1575 * exception if has previously been determined to be otherwise.
1579 * @throws IOException
1581 protected void assertInterleaved(boolean isIt, String dataLine)
1582 throws FileFormatException
1584 if (this.interleaved != null && isIt != this.interleaved.booleanValue())
1586 throw new FileFormatException("Parse error: interleaved was " + !isIt
1587 + " but now seems to be " + isIt + ", at line: " + dataLine);
1589 this.interleaved = new Boolean(isIt);
1590 setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString());
1593 public boolean isInterleaved()
1595 return this.interleaved == null ? false : this.interleaved
1600 * Adds saved parsed values either as alignment properties, or (in some cases)
1601 * as specific member fields of the alignment
1604 public void addProperties(AlignmentI al)
1606 super.addProperties(al);
1607 al.setGapCharacter(gapCharacter);
1610 * warn if e.g. DataType=DNA but data is protein (or vice versa)
1612 if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) {
1613 System.err.println("Warning: " + this.title + " declared "
1614 + (nucleotide ? "" : " not ") + "nucleotide but it is"
1615 + (nucleotide ? " not" : ""));
1620 * Print the given alignment in MEGA format. If the alignment was created by
1621 * parsing a MEGA file, it should have properties set (e.g. Title) which can
1622 * influence the output.
1625 public String print(AlignmentI al)
1627 this.nucleotide = al.isNucleotide();
1629 String lineLength = (String) al.getProperty(PROP_LINELENGTH);
1630 this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
1631 .parseInt(lineLength);
1634 * round down to a multiple of 3 positions per line for nucleotide
1638 positionsPerLine = positionsPerLine - (positionsPerLine % 3);
1641 String interleave = (String) al.getProperty(PROP_INTERLEAVED);
1642 if (interleave != null)
1644 this.interleaved = Boolean.valueOf(interleave);
1647 String headers = printHeaders(al);
1648 return headers + print(al.getSequencesArray());
1652 * Returns the number of sequence positions output per line
1656 public int getPositionsPerLine()
1658 return positionsPerLine;
1662 * Sets the number of sequence positions output per line. Note these will be
1663 * formatted in blocks of 3 (nucleotide) or 10 (peptide).
1667 public void setPositionsPerLine(int p)
1669 this.positionsPerLine = p;