import jalview.datamodel.AlignmentI;
import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
import java.util.LinkedHashMap;
+import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
private String title;
- // gap character may be explicitly declared, if not we infer it
- private Character gapCharacter;
+ // gap character may be explicitly declared, default is -
+ private char gapCharacter = '-';
// identity character if declared
private char identityCharacter = 0;
// write end of line positions as a comment
private boolean writePositionNumbers = true;
+ // id of sequence being processed
+ private String currentSequenceId;
+
+ /*
+ * Temporary store of {sequenceId, positionData} while parsing interleaved
+ * sequences; sequences are maintained in the order in which they are added
+ * i.e. read in the file
+ */
+ Map<String, StringBuilder> seqData;
+
+ // number of residues read (so far) per sequence
+ Map<String, Integer> residuesRead;
+
+ // start residue (base 1) per sequence of current feature
+ Map<String, Integer> featureStart;
+
+ // feature (Gene/Domain) if any we are parsing
+ private String currentFeature;
+
+ // feature type (Gene/Domain) if any we are parsing
+ private String currentFeatureType;
+
+ // map of SequenceFeature's by sequence id
+ Map<String, List<SequenceFeature>> sequenceFeatures;
+
public MegaFile()
{
}
@Override
public void parse() throws IOException
{
+ gapCharacter = '-';
+ sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
+ featureStart = new HashMap<String, Integer>();
+ residuesRead = new HashMap<String, Integer>();
+
/*
* Read and process MEGA and Title/Format/Description headers if present.
* Returns the first data line following the headers.
String dataLine = parseHeaderLines();
/*
- * Temporary store of {sequenceId, positionData} while parsing interleaved
- * sequences; sequences are maintained in the order in which they are added
- * i.e. read in the file
+ * order-preserving map to hold sequences by id as they are built up during
+ * parsing
*/
- Map<String, StringBuilder> seqData = new LinkedHashMap<String, StringBuilder>();
+ seqData = new LinkedHashMap<String, StringBuilder>();
/*
* The id of the sequence being read (for non-interleaved)
*/
- String currentId = "";
+ currentSequenceId = "";
while (dataLine != null)
{
{
if (dataLine.startsWith(BANG + GENE))
{
- parseGene(dataLine);
+ parseFeature(GENE, dataLine);
}
else if (dataLine.startsWith(BANG + DOMAIN))
{
- parseDomain(dataLine);
+ parseFeature(DOMAIN, dataLine);
}
else
{
- currentId = parseDataLine(dataLine, seqData, currentId);
+ currentSequenceId = parseDataLine(dataLine);
}
}
else if (!seqData.isEmpty())
/*
* Blank line after processing some data...
*/
- this.firstDataBlockRead = true;
+ endOfDataBlock();
}
dataLine = nextNonCommentLine();
}
// remember the (longest) line length read in, so we can output the same
setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
- setSequences(seqData);
+ deriveSequences();
+ }
+
+ /**
+ * Post-processing after reading one block of interleaved data
+ */
+ protected void endOfDataBlock()
+ {
+ this.firstDataBlockRead = true;
+ // TODO:
+ // (initialise and) populate arrays of sequence length so far (excluding
+ // gaps)
+ // On change or end of a denoted Gene or Domain, add sequence features for
+ // it
}
/**
- * Parse a !Gene command line
+ * Parse a !Gene or !Domain command line
*
+ * @param featureType
* @param dataLine
*/
- protected void parseGene(String dataLine)
+ protected void parseFeature(String featureType, String dataLine)
{
+ String featureName = getValue(dataLine);
+ // TODO parse !Gene=xyx Property=end; ???
+ if (this.currentFeature != null)
+ {
+ endSequenceFeature();
+ }
+ startSequenceFeature(featureName, featureType);
+ }
+
+ /**
+ * Start processing a new feature
+ *
+ * @param featureName
+ */
+ protected void startSequenceFeature(String featureName, String featureType)
+ {
+ currentFeature = featureName;
+ currentFeatureType = featureType;
+
+ /*
+ * If the feature name precedes all sequences, we will know in
+ * endSequenceFeature that it starts with residue 1; otherwise note now
+ * where it starts in each sequence
+ */
+ if (!residuesRead.isEmpty())
+ {
+ for (Entry<String, Integer> entry : residuesRead.entrySet())
+ {
+ String seqId = entry.getKey();
+ Integer nextResidue = entry.getValue() + 1;
+ featureStart.put(seqId, nextResidue);
+ }
+ }
+ }
+
+ /**
+ * Add a SequenceFeature for the current feature to each sequence, using the
+ * current feature start/end values per sequence
+ */
+ protected void endSequenceFeature()
+ {
+ Iterator<String> seqids = this.seqData.keySet().iterator();
+ while (seqids.hasNext())
+ {
+ String seqid = seqids.next();
+ Integer startAt = featureStart.get(seqid);
+ int sfstart = startAt == null ? 1 : startAt.intValue();
+ int sfend = residuesRead.get(seqid);
+ if (sfend >= sfstart)
+ {
+ /*
+ * don't add feature if entirely gapped in the sequence
+ */
+ SequenceFeature sf = new SequenceFeature(currentFeature,
+ currentFeatureType, sfstart, sfend, 0f, null);
+ sequenceFeatures.get(seqid).add(sf);
+ }
+ }
}
/**
/**
* Convert the parsed sequence strings to objects and store them in the model.
- *
- * @param seqData
*/
- protected void setSequences(Map<String, StringBuilder> seqData)
+ protected void deriveSequences()
{
Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
StringBuilder characters = dataset.getValue();
SequenceI s = new Sequence(sequenceId, new String(characters));
this.seqs.addElement(s);
+
+ /*
+ * and add any derived sequence features to the sequence
+ */
+ for (SequenceFeature sf : sequenceFeatures.get(sequenceId))
+ {
+ s.addSequenceFeature(sf);
+ }
}
}
* explicit) for this line.
*
* @param dataLine
- * @param seqData
- * @param currentid
* @return
* @throws IOException
*/
- protected String parseDataLine(String dataLine,
- Map<String, StringBuilder> seqData, String currentId)
+ protected String parseDataLine(String dataLine)
throws IOException
{
String seqId = getSequenceId(dataLine);
/*
* Just character data
*/
- parseNoninterleavedDataLine(dataLine, seqData, currentId);
- return currentId;
+ parseNoninterleavedDataLine(dataLine);
+ return currentSequenceId;
}
else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
{
/*
* Sequence id followed by data
*/
- parseInterleavedDataLine(dataLine, seqData, seqId);
+ parseInterleavedDataLine(dataLine, seqId);
return seqId;
}
}
* a new one if we haven't seen it before.
*
* @param dataLine
- * @param seqData
- * @param currentId
* @throws IOException
*/
- protected void parseNoninterleavedDataLine(String dataLine,
- Map<String, StringBuilder> seqData, String currentId)
+ protected void parseNoninterleavedDataLine(String dataLine)
throws IOException
{
- if (currentId == null)
+ if (currentSequenceId == null)
{
/*
* Oops. Data but no sequence id context.
assertInterleaved(false, dataLine);
- StringBuilder sb = getSequenceDataBuffer(seqData, currentId);
-
- dataLine = reformatSequenceData(dataLine, sb.length(), seqData);
- sb.append(dataLine);
+ dataLine = addSequenceData(currentSequenceId, dataLine);
setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
}
* Get the sequence data for this sequence id, starting a new one if
* necessary.
*
- * @param seqData
* @param currentId
* @return
*/
- protected StringBuilder getSequenceDataBuffer(
- Map<String, StringBuilder> seqData, String currentId)
+ protected StringBuilder getSequenceDataBuffer(String currentId)
{
StringBuilder sb = seqData.get(currentId);
if (sb == null)
// first data met for this sequence id, start a new buffer
sb = new StringBuilder(SEQBUFFERSIZE);
seqData.put(currentId, sb);
+
+ // and a placeholder for any SequenceFeature found
+ sequenceFeatures.put(currentId, new ArrayList<SequenceFeature>());
}
return sb;
}
* </pre>
*
* @param dataLine
- * @param seqData
* @param seqId
- * @throws IOException
+ * @throws FileFormatException
*/
- protected void parseInterleavedDataLine(String dataLine,
- Map<String, StringBuilder> seqData, String seqId)
- throws IOException
+ protected void parseInterleavedDataLine(String dataLine, String seqId)
+ throws FileFormatException
{
/*
* New sequence found in second or later data block - error.
*/
if (this.firstDataBlockRead && !seqData.containsKey(seqId))
{
- throw new IOException(
+ throw new FileFormatException(
"Parse error: misplaced new sequence starting at " + dataLine);
}
- StringBuilder sb = getSequenceDataBuffer(seqData, seqId);
String data = dataLine.substring(seqId.length() + 1).trim();
/*
* Do nothing if this line is _only_ a sequence id with no data following.
- *
- * Remove any internal spaces
*/
if (data != null && data.length() > 0)
{
- data = reformatSequenceData(data, sb.length(), seqData);
- sb.append(data);
+ data = addSequenceData(seqId, data);
setPositionsPerLine(Math.max(positionsPerLine, data.length()));
assertInterleaved(true, dataLine);
}
}
/**
- * Reformat input sequence data by removing any internal formatting spaces,
- * and converting any 'identity' characters to the corresponding position in
- * the first sequence.
+ * Remove spaces, and replace identity symbol, before appending the sequence
+ * data to the buffer for the sequence id. Returns the reformatted added data.
+ * Also updates a count of residues read for the sequence.
*
+ * @param seqId
* @param data
- * @param startPos
- * the sequence position (base 0) of the start of the data
- * @param seqData
* @return
*/
- protected String reformatSequenceData(String data, int startPos, Map<String, StringBuilder> seqData)
+ protected String addSequenceData(String seqId, String data)
{
+ StringBuilder sb = getSequenceDataBuffer(seqId);
+ int len = sb.length();
String formatted = data.replace(SPACE, "");
- if (formatted.indexOf(identityCharacter) > -1)
+
+ /*
+ * If sequence contains '.' or other identity symbol; replace these with the
+ * same position from the first (reference) sequence
+ */
+ int nonGapped = 0;
+ StringBuilder referenceSequence = seqData.values().iterator().next();
+ StringBuilder sb1 = new StringBuilder(formatted.length());
+ for (int i = 0; i < formatted.length(); i++)
{
- /*
- * sequence contains '.' or other identity symbol; replace these with the
- * same position from the first (reference) sequence
- */
- StringBuilder referenceSequence = seqData.values().iterator().next();
- StringBuilder sb = new StringBuilder(formatted.length());
- for (int i = 0 ; i < formatted.length() ; i++) {
- char nextChar = formatted.charAt(i);
- if (nextChar != identityCharacter) {
- sb.append(nextChar);
- } else {
- sb.append(referenceSequence.charAt(startPos + i));
- }
+ char nextChar = formatted.charAt(i);
+ if (nextChar != gapCharacter)
+ {
+ nonGapped++;
+ }
+ if (nextChar == identityCharacter
+ && len + i < referenceSequence.length())
+ {
+ sb1.append(referenceSequence.charAt(len + i));
+ }
+ else
+ {
+ sb1.append(nextChar);
}
- formatted = sb.toString();
}
- return formatted;
+ formatted = sb1.toString();
+
+ data = formatted;
+ sb.append(data);
+
+ /*
+ * increment residue count for the sequence
+ */
+ if (nonGapped > 0)
+ {
+ Integer residueCount = residuesRead.get(seqId);
+ residuesRead.put(seqId, nonGapped
+ + (residueCount == null ? 0 : residueCount));
+ }
+
+ return data;
}
/**
public void addProperties(AlignmentI al)
{
super.addProperties(al);
- if (this.gapCharacter != null)
- {
- al.setGapCharacter(gapCharacter);
- }
+ al.setGapCharacter(gapCharacter);
/*
* warn if e.g. DataType=DNA but data is protein (or vice versa)
this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
.parseInt(lineLength);
+ /*
+ * round down to a multiple of 3 positions per line for nucleotide
+ */
+ if (nucleotide)
+ {
+ positionsPerLine = positionsPerLine - (positionsPerLine % 3);
+ }
+
String interleave = (String) al.getProperty(PROP_INTERLEAVED);
if (interleave != null)
{