2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
3 * Copyright (C) 2014 The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 * Jalview is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
17 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 import jalview.datamodel.Sequence;
22 import jalview.datamodel.SequenceI;
24 import java.io.IOException;
25 import java.util.LinkedHashMap;
27 import java.util.Map.Entry;
// MEGA-format alignment file support built on AlignFile: this class parses input
// (see parse()) and writes output (see print()).
30 public class MegaFile extends AlignFile
33 * Simple file format as at
34 * http://www.hiv.lanl.gov/content/sequence/HelpDocs/SEQsamples.html
36 * Fancy file format as at
37 * http://primerdigital.com/fastpcr/images/Drosophila_Adh.txt
// The two layout variants described above. Other code in this file refers to
// FileFormat.SIMPLE and FileFormat.FANCY; the constant declarations themselves
// are not visible here.
39 public enum FileFormat
// Single-character tokens used when recognising and writing MEGA lines.
44 private static final String HASHSIGN = "#"; // TODO: public constants file
46 private static final String COLON = ":";
48 private static final String BANG = "!";
50 private static final String EQUALS = "=";
// File identifier "#MEGA"; header parsing upper-cases the input line before
// comparing against it.
52 private static final String MEGA_ID = HASHSIGN + "MEGA";
// Property names. Title, Format, Description and Gene are the ones tested by
// isPropertyLine(); PROP_INTERLEAVED is declared but not referenced in the
// visible code.
54 public static final String PROP_TITLE = "TITLE";
56 public static final String PROP_FORMAT = "Format";
58 public static final String PROP_DESCRIPTION = "Description";
60 public static final String PROP_GENE = "Gene";
62 public static final String PROP_INTERLEAVED = "Interleaved";
64 // initial size for sequence data buffer
65 private static final int SEQBUFFERSIZE = 256;
67 private static final String SPACE = " ";
// Number of sequence positions written on each output line.
69 private static final int POSITIONS_PER_LINE = 50;
71 // this can be True, False or null (meaning we don't know yet)
72 private Boolean interleaved;
74 // set once we have seen one block of interleaved data
75 private boolean firstDataBlockRead = false;
// Layout variant of the file; parse() sets it to SIMPLE if header parsing left it null.
77 private FileFormat fileFormat;
// Constructor taking a file reference and a type string; may throw IOException.
// NOTE(review): the body is not visible here - presumably it forwards both
// arguments to the AlignFile constructor; confirm against the complete file.
83 public MegaFile(String inFile, String type) throws IOException
// Constructor taking an existing FileParse source; may throw IOException.
// NOTE(review): the body is not visible here - presumably it forwards the
// source to the AlignFile constructor; confirm against the complete file.
88 public MegaFile(FileParse source) throws IOException
94 * Parse the input stream.
97 public void parse() throws IOException
100 * Read MEGA and Title/Format/Description/Gene headers if present. These are
101 * saved as alignment properties. Returns the first sequence data line
103 String dataLine = parseHeaderLines();
106 * If we didn't positively identify as 'fancy format', assume 'simple
109 if (this.fileFormat == null)
111 setFileFormat(FileFormat.SIMPLE);
115 * Temporary store of {sequenceId, positionData} while parsing appending
117 Map<String, StringBuilder> seqData = new LinkedHashMap<String, StringBuilder>();
120 * The id of the sequence being read (for non-interleaved)
122 String currentId = "";
124 while (dataLine != null)
126 dataLine = dataLine.trim();
127 if (dataLine.length() > 0)
129 currentId = parseDataLine(dataLine, seqData, currentId);
131 else if (!seqData.isEmpty())
134 * Blank line after processing some data...
136 this.firstDataBlockRead = true;
138 dataLine = nextLine();
141 setSequences(seqData);
145 * Convert the parsed sequence strings to objects and store them in the model.
149 protected void setSequences(Map<String, StringBuilder> seqData)
151 Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
153 for (Entry<String, StringBuilder> dataset : datasets)
155 String sequenceId = dataset.getKey();
156 StringBuilder characters = dataset.getValue();
157 SequenceI s = new Sequence(sequenceId, new String(characters));
158 this.seqs.addElement(s);
// Dispatches one non-blank data line according to whether it carries a
// sequence id. getSequenceId() yields null unless the line starts with '#'.
// NOTE(review): the test on seqId that opens the first branch, the braces and
// the return statements of each branch are not visible here; confirm which id
// each branch returns against the complete file.
163 * Process one line of sequence data. If it has no sequence identifier, append
164 * to the current id's sequence. Else parse out the sequence id and append the
165 * data (if any) to that id's sequence. Returns the sequence id (implicit or
166 * explicit) for this line.
172 * @throws IOException
174 protected String parseDataLine(String dataLine,
175 Map<String, StringBuilder> seqData, String currentId)
178 String seqId = getSequenceId(dataLine);
// Case 1: residues only - they are appended to the sequence named by currentId.
182 * Just character data
184 parseNoninterleavedDataLine(dataLine, seqData, currentId);
// Case 2: the line is exactly "#" + id with nothing after it.
187 else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
190 * Sequence id only - header line for noninterleaved data
// Case 3: id and residues share the line (interleaved layout).
197 * Sequence id followed by data
199 parseInterleavedDataLine(dataLine, seqData, seqId);
205 * Add a line of sequence data to the buffer for the given sequence id. Start
206 * a new one if we haven't seen it before.
211 * @throws IOException
213 protected void parseNoninterleavedDataLine(String dataLine,
214 Map<String, StringBuilder> seqData, String currentId)
217 if (currentId == null)
220 * Oops. Data but no sequence id context.
222 throw new IOException("No sequence id context at: " + dataLine);
225 assertInterleaved(false, dataLine);
227 StringBuilder sb = getSequenceDataBuffer(seqData, currentId);
230 * Add the current line of data to the sequence.
236 * Get the sequence data for this sequence id, starting a new one if
243 protected StringBuilder getSequenceDataBuffer(
244 Map<String, StringBuilder> seqData, String currentId)
246 StringBuilder sb = seqData.get(currentId);
249 // first data met for this sequence id, start a new buffer
250 sb = new StringBuilder(SEQBUFFERSIZE);
251 seqData.put(currentId, sb);
257 * Parse one line of interleaved data e.g.
260 * #TheSeqId CGATCGCATGCA
266 * @throws IOException
268 protected void parseInterleavedDataLine(String dataLine,
269 Map<String, StringBuilder> seqData, String seqId)
273 * New sequence found in second or later data block - error.
275 if (this.firstDataBlockRead && !seqData.containsKey(seqId))
277 throw new IOException(
278 "Parse error: misplaced new sequence starting at " + dataLine);
281 StringBuilder sb = getSequenceDataBuffer(seqData, seqId);
282 String data = dataLine.substring(seqId.length() + 1).trim();
285 * Do nothing if this line is _only_ a sequence id with no data following.
287 * Remove any internal spaces (present in the 'fancy' file format)
289 if (data != null && data.length() > 0)
291 if (data.indexOf(SPACE) != -1)
293 data = data.replace(SPACE, "");
296 assertInterleaved(true, dataLine);
301 * If the line begins with (e.g.) "#abcde " then returns "abcde" as the
302 * identifier. Else returns null.
307 public static String getSequenceId(String dataLine)
309 // TODO refactor to a StringUtils type class
310 if (dataLine != null)
312 if (dataLine.startsWith(HASHSIGN))
314 int spacePos = dataLine.indexOf(" ");
315 return (spacePos == -1 ? dataLine.substring(1) : dataLine
316 .substring(1, spacePos));
// Consumes lines from nextLine(), trimming each, until one is found that is
// not recognised as header material.
// NOTE(review): what happens after the blank-line test and after the final
// "not #MEGA" test is not visible here - presumably blank lines are skipped
// and the first non-header line ends the loop and is returned; confirm.
323 * Read the #MEGA and Title/Format/Description/Gene header lines (if present).
325 * Save as annotation properties in case useful.
327 * @return the next non-blank line following the header lines.
328 * @throws IOException
330 protected String parseHeaderLines() throws IOException
332 String inputLine = null;
333 while ((inputLine = nextLine()) != null)
335 inputLine = inputLine.trim();
340 if (inputLine.length() == 0)
// Any line starting with '!' marks the file as FANCY format.
345 if (inputLine.startsWith(BANG))
347 setFileFormat(FileFormat.FANCY);
// This test is case-sensitive ("!Description"), unlike isPropertyLine() below,
// which matches through lineMatchesFlag() after upper-casing.
350 if (inputLine.startsWith(BANG + PROP_DESCRIPTION))
352 parseDescriptionLines();
355 else if (isPropertyLine(inputLine))
358 * If a property is matched, parse and save it.
360 String[] property_value = parsePropertyValue(inputLine);
361 setAlignmentProperty(property_value[0], property_value[1]);
// The #MEGA identifier line is recognised regardless of letter case.
363 else if (!inputLine.toUpperCase().startsWith(MEGA_ID))
367 * Return the first 'data line' i.e. one that is not blank, #MEGA or
377 * Read following lines until blank, appending each to the Description
380 * Assumes the !Description line itself does not include description text.
382 * Assumes the description is followed by a blank line (else we will consume
385 * @throws IOException
387 protected void parseDescriptionLines() throws IOException
389 StringBuilder desc = new StringBuilder(256);
391 while ((line = nextLine()) != null) {
392 if ("".equals(line.trim()))
396 desc.append(line).append(newline);
398 setAlignmentProperty(PROP_DESCRIPTION, desc.toString());
402 * Test whether the line holds an expected property declaration.
407 protected boolean isPropertyLine(String inputLine)
409 if (lineMatchesFlag(inputLine, PROP_TITLE, BANG, COLON)
410 || lineMatchesFlag(inputLine, PROP_FORMAT, BANG, COLON)
411 || lineMatchesFlag(inputLine, PROP_DESCRIPTION, BANG, COLON)
412 || lineMatchesFlag(inputLine, PROP_GENE, BANG, COLON))
420 * Helper method that extract the name and value of a property, assuming the
421 * first space or equals sign is the separator.
423 * Thus "Description: Melanogaster" or "!Description=Melanogaster" both return
424 * {"Description", "Melanogaster"}.
426 * Returns an empty value string if no space or equals sign is present.
431 public static String[] parsePropertyValue(String s)
433 // TODO refactor to a string utils helper class (or find equivalent)
434 // TODO handle other cases e.g. "Description = Melanogaster"
435 String propertyName = s;
438 int separatorPos = -1;
442 int spacePos = s.indexOf(SPACE);
443 int eqPos = s.indexOf(EQUALS);
444 if (spacePos == -1 && eqPos > -1)
446 separatorPos = eqPos;
448 else if (spacePos > -1 && eqPos == -1)
450 separatorPos = spacePos;
452 else if (spacePos > -1 && eqPos > -1)
454 separatorPos = Math.min(spacePos, eqPos);
457 if (separatorPos > -1)
459 value = s.substring(separatorPos + 1);
460 propertyName = s.substring(0, separatorPos);
464 * finally strip any leading / trailing chars from property name
466 if (propertyName.startsWith(BANG))
468 propertyName = propertyName.substring(1);
470 if (propertyName.endsWith(COLON))
472 propertyName = propertyName.substring(0, propertyName.length() - 1);
476 { propertyName, value };
480 * Test whether a line starts with the specified flag field followed by a
481 * space (or nothing).
483 * Here we accept an optional prefix and suffix on the flag, and the check is
484 * not case-sensitive. So these would match for "Title"
491 * !Title Melanogaster
492 * !Title=Melanogaster
493 * !TITLE Melanogaster
494 * !TITLE=Melanogaster
495 * Title: Melanogaster
496 * Title:=Melanogaster
497 * TITLE: Melanogaster
498 * TITLE:=Melanogaster
499 * !Title: Melanogaster
500 * !Title:=Melanogaster
501 * !TITLE: Melanogaster
502 * !TITLE:=Melanogaster
515 public static boolean lineMatchesFlag(String line, String flag, String prefix, String suffix)
517 // TODO refactor to a string utils helper class
518 boolean result = false;
519 if (line != null && flag != null) {
520 String lineUpper = line.toUpperCase().trim();
521 String flagUpper = flag.toUpperCase();
523 // skip prefix character e.g. ! before attempting match
524 if (lineUpper.startsWith(prefix)) {
525 lineUpper = lineUpper.substring(1);
528 // test for flag + SPACE or flag + EQUALS, with or without suffix
529 if (lineUpper.startsWith(flagUpper + SPACE)
530 || lineUpper.startsWith(flagUpper + EQUALS)
531 || lineUpper.startsWith(flagUpper + suffix + SPACE)
532 || lineUpper.startsWith(flagUpper + suffix + EQUALS))
538 // test for exact match i.e. flag only on this line
539 if (lineUpper.equals(flagUpper)
540 || lineUpper.startsWith(flagUpper + suffix))
550 * Write out the alignment sequences in Mega format.
553 public String print()
555 return print(getSeqsAsArray());
559 * Write out the alignment sequences in Mega format - interleaved unless
560 * explicitly noninterleaved.
562 public String print(SequenceI[] s)
564 // TODO: is there a way to preserve the 'interleaved' property so it can
567 String result = null;
568 if (this.fileFormat == FileFormat.FANCY)
570 result = printInterleavedCodons(s);
572 else if (this.interleaved != null && !this.interleaved)
574 result = printNonInterleaved(s);
578 result = printInterleaved(s);
584 * Print the sequences in interleaved format, each row 15 space-separated
590 protected String printInterleavedCodons(SequenceI[] s)
592 // TODO not coded yet - defaulting to the 'simple' format output
593 return printInterleaved(s);
// Writes the headers, then one data block per POSITIONS_PER_LINE-wide slice of
// the alignment, visiting every sequence within each block.
// NOTE(review): some statements inside the loops (what is appended besides
// subSequence, and the final argument of String.format) are not visible here.
597 * Print to string in Interleaved format - blocks of next 50 characters of
598 * each sequence in turn.
602 protected String printInterleaved(SequenceI[] s)
604 int maxIdLength = getMaxIdLength(s);
605 int maxSequenceLength = getMaxSequenceLength(s);
606 int numLines = maxSequenceLength / POSITIONS_PER_LINE + 3; // approx
609 * Size a buffer to hold the whole output
// NOTE(review): this estimate does not multiply by the number of sequences, so
// it looks like it covers one sequence's worth of lines only; the builder
// still grows on demand.
611 StringBuilder sb = new StringBuilder(numLines
612 * (maxIdLength + 2 + POSITIONS_PER_LINE));
613 printHeaders(sb, FileFormat.SIMPLE);
// Integer ceiling of maxSequenceLength / POSITIONS_PER_LINE.
615 int numDataBlocks = (maxSequenceLength - 1) / POSITIONS_PER_LINE + 1;
616 for (int i = 0; i < numDataBlocks; i++)
619 for (SequenceI seq : s)
// '#' followed by a value left-justified and padded to maxIdLength, then a space.
622 String seqId = String.format("#%-" + maxIdLength + "s ",
// Slice bounds for block i: from i * POSITIONS_PER_LINE to (i + 1) * POSITIONS_PER_LINE.
624 char[] subSequence = seq.getSequence(i * POSITIONS_PER_LINE,
625 (i + 1) * POSITIONS_PER_LINE);
627 sb.append(subSequence);
632 return new String(sb);
// Appends header lines to sb. For each alignment property whose key and value
// are both Strings, it writes "!key value" for FANCY format and "key: value"
// otherwise.
// NOTE(review): the statements before the property lookup and any line
// terminators appended after each property are not visible here.
636 * Append the MEGA header and any other known properties
640 private void printHeaders(StringBuilder sb, FileFormat format)
647 Set<Entry<Object, Object>> props = getAlignmentProperties();
650 for (Entry<Object, Object> prop : props)
652 Object key = prop.getKey();
653 Object value = prop.getValue();
// Entries that are not String-to-String are skipped.
654 if (key instanceof String && value instanceof String)
656 if (format == FileFormat.FANCY)
658 sb.append(BANG).append(key).append(SPACE).append(value);
662 sb.append(key).append(COLON).append(SPACE).append(value);
671 * Get the longest sequence id (to allow aligned printout).
676 protected static int getMaxIdLength(SequenceI[] s)
678 // TODO pull up for reuse
680 for (SequenceI seq : s)
682 int len = seq.getName().length();
692 * Get the longest sequence length
697 protected static int getMaxSequenceLength(SequenceI[] s)
699 // TODO pull up for reuse
701 for (SequenceI seq : s)
703 int len = seq.getLength();
// Writes the headers, then for each sequence an id line ("#" + name) followed
// by its residues in slices of POSITIONS_PER_LINE.
// NOTE(review): the declaration of startPos and whatever is appended after
// each slice are not visible here.
713 * Print to string in noninterleaved format - all of each sequence in turn, in
714 * blocks of 50 characters.
719 protected String printNonInterleaved(SequenceI[] s)
721 int maxSequenceLength = getMaxSequenceLength(s);
723 int numLines = maxSequenceLength / POSITIONS_PER_LINE + 2 + s.length;
726 * Roughly size a buffer to hold the whole output
728 StringBuilder sb = new StringBuilder(numLines * POSITIONS_PER_LINE);
729 printHeaders(sb, FileFormat.SIMPLE);
731 for (SequenceI seq : s)
734 sb.append(HASHSIGN + seq.getName()).append(newline);
// NOTE(review): if startPos starts at 0, the "<=" bound runs one extra pass
// whenever the sequence length is an exact multiple of POSITIONS_PER_LINE
// (or zero), producing an empty slice - confirm whether "<" was intended.
736 while (startPos <= seq.getLength())
738 char[] subSequence = seq.getSequence(startPos, startPos
739 + POSITIONS_PER_LINE);
740 sb.append(subSequence);
742 startPos += POSITIONS_PER_LINE;
746 return new String(sb);
750 * Flag this file as interleaved or not, based on data format. Throws an
751 * exception if has previously been determined to be otherwise.
755 * @throws IOException
757 protected void assertInterleaved(boolean isIt, String dataLine)
760 if (this.interleaved != null && isIt != this.interleaved.booleanValue())
762 throw new IOException(
763 "Parse error: mix of interleaved and noninterleaved detected, at line: "
766 this.interleaved = new Boolean(isIt);
769 public boolean isInterleaved()
771 return this.interleaved == null ? false : this.interleaved
775 public FileFormat getFileFormat()
777 return this.fileFormat;
780 public void setFileFormat(FileFormat fileFormat)
782 this.fileFormat = fileFormat;