From 8aae5cc23f5f027d345c2f92122ed92571c0fff2 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 1 Oct 2015 16:50:20 +0100 Subject: [PATCH] JAL-1499 parse !Gene statements to SequenceFeature --- src/jalview/io/MegaFile.java | 273 ++++++++++++++++++++++++++++++------------ 1 file changed, 199 insertions(+), 74 deletions(-) diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java index 88200f5..e4079c0 100644 --- a/src/jalview/io/MegaFile.java +++ b/src/jalview/io/MegaFile.java @@ -20,10 +20,15 @@ package jalview.io; import jalview.datamodel.AlignmentI; import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; @@ -131,8 +136,8 @@ public class MegaFile extends AlignFile private String title; - // gap character may be explicitly declared, if not we infer it - private Character gapCharacter; + // gap character may be explicitly declared, default is - + private char gapCharacter = '-'; // identity character if declared private char identityCharacter = 0; @@ -149,6 +154,31 @@ public class MegaFile extends AlignFile // write end of line positions as a comment private boolean writePositionNumbers = true; + // id of sequence being processed + private String currentSequenceId; + + /* + * Temporary store of {sequenceId, positionData} while parsing interleaved + * sequences; sequences are maintained in the order in which they are added + * i.e. read in the file + */ + Map seqData; + + // number of residues read (so far) per sequence + Map residuesRead; + + // start residue (base 1) per sequence of current feature + Map featureStart; + + // feature (Gene/Domain) if any we are parsing + private String currentFeature; + + // feature type (Gene/Domain) if any we are parsing + private String currentFeatureType; + + // map of SequenceFeature's by sequence id + Map> sequenceFeatures; + public MegaFile() { } @@ -169,6 +199,11 @@ public class MegaFile extends AlignFile @Override public void parse() throws IOException { + gapCharacter = '-'; + sequenceFeatures = new HashMap>(); + featureStart = new HashMap(); + residuesRead = new HashMap(); + /* * Read and process MEGA and Title/Format/Description headers if present. * Returns the first data line following the headers. @@ -176,16 +211,15 @@ public class MegaFile extends AlignFile String dataLine = parseHeaderLines(); /* - * Temporary store of {sequenceId, positionData} while parsing interleaved - * sequences; sequences are maintained in the order in which they are added - * i.e. read in the file + * order-preserving map to hold sequences by id as they are built up during + * parsing */ - Map seqData = new LinkedHashMap(); + seqData = new LinkedHashMap(); /* * The id of the sequence being read (for non-interleaved) */ - String currentId = ""; + currentSequenceId = ""; while (dataLine != null) { @@ -194,15 +228,15 @@ public class MegaFile extends AlignFile { if (dataLine.startsWith(BANG + GENE)) { - parseGene(dataLine); + parseFeature(GENE, dataLine); } else if (dataLine.startsWith(BANG + DOMAIN)) { - parseDomain(dataLine); + parseFeature(DOMAIN, dataLine); } else { - currentId = parseDataLine(dataLine, seqData, currentId); + currentSequenceId = parseDataLine(dataLine); } } else if (!seqData.isEmpty()) @@ -210,7 +244,7 @@ public class MegaFile extends AlignFile /* * Blank line after processing some data... */ - this.firstDataBlockRead = true; + endOfDataBlock(); } dataLine = nextNonCommentLine(); } @@ -218,16 +252,88 @@ public class MegaFile extends AlignFile // remember the (longest) line length read in, so we can output the same setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine)); - setSequences(seqData); + deriveSequences(); + } + + /** + * Post-processing after reading one block of interleaved data + */ + protected void endOfDataBlock() + { + this.firstDataBlockRead = true; + // TODO: + // (initialise and) populate arrays of sequence length so far (excluding + // gaps) + // On change or end of a denoted Gene or Domain, add sequence features for + // it } /** - * Parse a !Gene command line + * Parse a !Gene or !Domain command line * + * @param featureType * @param dataLine */ - protected void parseGene(String dataLine) + protected void parseFeature(String featureType, String dataLine) { + String featureName = getValue(dataLine); + // TODO parse !Gene=xyx Property=end; ??? + if (this.currentFeature != null) + { + endSequenceFeature(); + } + startSequenceFeature(featureName, featureType); + } + + /** + * Start processing a new feature + * + * @param featureName + */ + protected void startSequenceFeature(String featureName, String featureType) + { + currentFeature = featureName; + currentFeatureType = featureType; + + /* + * If the feature name precedes all sequences, we will know in + * endSequenceFeature that it starts with residue 1; otherwise note now + * where it starts in each sequence + */ + if (!residuesRead.isEmpty()) + { + for (Entry entry : residuesRead.entrySet()) + { + String seqId = entry.getKey(); + Integer nextResidue = entry.getValue() + 1; + featureStart.put(seqId, nextResidue); + } + } + } + + /** + * Add a SequenceFeature for the current feature to each sequence, using the + * current feature start/end values per sequence + */ + protected void endSequenceFeature() + { + Iterator seqids = this.seqData.keySet().iterator(); + while (seqids.hasNext()) + { + String seqid = seqids.next(); + Integer startAt = featureStart.get(seqid); + int sfstart = startAt == null ? 1 : startAt.intValue(); + int sfend = residuesRead.get(seqid); + if (sfend >= sfstart) + { + /* + * don't add feature if entirely gapped in the sequence + */ + SequenceFeature sf = new SequenceFeature(currentFeature, + currentFeatureType, sfstart, sfend, 0f, null); + sequenceFeatures.get(seqid).add(sf); + } + } } /** @@ -372,10 +478,8 @@ public class MegaFile extends AlignFile /** * Convert the parsed sequence strings to objects and store them in the model. - * - * @param seqData */ - protected void setSequences(Map seqData) + protected void deriveSequences() { Set> datasets = seqData.entrySet(); @@ -385,6 +489,14 @@ public class MegaFile extends AlignFile StringBuilder characters = dataset.getValue(); SequenceI s = new Sequence(sequenceId, new String(characters)); this.seqs.addElement(s); + + /* + * and add any derived sequence features to the sequence + */ + for (SequenceFeature sf : sequenceFeatures.get(sequenceId)) + { + s.addSequenceFeature(sf); + } } } @@ -395,13 +507,10 @@ public class MegaFile extends AlignFile * explicit) for this line. * * @param dataLine - * @param seqData - * @param currentid * @return * @throws IOException */ - protected String parseDataLine(String dataLine, - Map seqData, String currentId) + protected String parseDataLine(String dataLine) throws IOException { String seqId = getSequenceId(dataLine); @@ -410,8 +519,8 @@ public class MegaFile extends AlignFile /* * Just character data */ - parseNoninterleavedDataLine(dataLine, seqData, currentId); - return currentId; + parseNoninterleavedDataLine(dataLine); + return currentSequenceId; } else if ((HASHSIGN + seqId).trim().equals(dataLine.trim())) { @@ -425,7 +534,7 @@ public class MegaFile extends AlignFile /* * Sequence id followed by data */ - parseInterleavedDataLine(dataLine, seqData, seqId); + parseInterleavedDataLine(dataLine, seqId); return seqId; } } @@ -435,15 +544,12 @@ public class MegaFile extends AlignFile * a new one if we haven't seen it before. * * @param dataLine - * @param seqData - * @param currentId * @throws IOException */ - protected void parseNoninterleavedDataLine(String dataLine, - Map seqData, String currentId) + protected void parseNoninterleavedDataLine(String dataLine) throws IOException { - if (currentId == null) + if (currentSequenceId == null) { /* * Oops. Data but no sequence id context. @@ -453,10 +559,7 @@ public class MegaFile extends AlignFile assertInterleaved(false, dataLine); - StringBuilder sb = getSequenceDataBuffer(seqData, currentId); - - dataLine = reformatSequenceData(dataLine, sb.length(), seqData); - sb.append(dataLine); + dataLine = addSequenceData(currentSequenceId, dataLine); setPositionsPerLine(Math.max(positionsPerLine, dataLine.length())); } @@ -465,12 +568,10 @@ public class MegaFile extends AlignFile * Get the sequence data for this sequence id, starting a new one if * necessary. * - * @param seqData * @param currentId * @return */ - protected StringBuilder getSequenceDataBuffer( - Map seqData, String currentId) + protected StringBuilder getSequenceDataBuffer(String currentId) { StringBuilder sb = seqData.get(currentId); if (sb == null) @@ -478,6 +579,9 @@ public class MegaFile extends AlignFile // first data met for this sequence id, start a new buffer sb = new StringBuilder(SEQBUFFERSIZE); seqData.put(currentId, sb); + + // and a placeholder for any SequenceFeature found + sequenceFeatures.put(currentId, new ArrayList()); } return sb; } @@ -490,73 +594,89 @@ public class MegaFile extends AlignFile * * * @param dataLine - * @param seqData * @param seqId - * @throws IOException + * @throws FileFormatException */ - protected void parseInterleavedDataLine(String dataLine, - Map seqData, String seqId) - throws IOException + protected void parseInterleavedDataLine(String dataLine, String seqId) + throws FileFormatException { /* * New sequence found in second or later data block - error. */ if (this.firstDataBlockRead && !seqData.containsKey(seqId)) { - throw new IOException( + throw new FileFormatException( "Parse error: misplaced new sequence starting at " + dataLine); } - StringBuilder sb = getSequenceDataBuffer(seqData, seqId); String data = dataLine.substring(seqId.length() + 1).trim(); /* * Do nothing if this line is _only_ a sequence id with no data following. - * - * Remove any internal spaces */ if (data != null && data.length() > 0) { - data = reformatSequenceData(data, sb.length(), seqData); - sb.append(data); + data = addSequenceData(seqId, data); setPositionsPerLine(Math.max(positionsPerLine, data.length())); assertInterleaved(true, dataLine); } } /** - * Reformat input sequence data by removing any internal formatting spaces, - * and converting any 'identity' characters to the corresponding position in - * the first sequence. + * Remove spaces, and replace identity symbol, before appending the sequence + * data to the buffer for the sequence id. Returns the reformatted added data. + * Also updates a count of residues read for the sequence. * + * @param seqId * @param data - * @param startPos - * the sequence position (base 0) of the start of the data - * @param seqData * @return */ - protected String reformatSequenceData(String data, int startPos, Map seqData) + protected String addSequenceData(String seqId, String data) { + StringBuilder sb = getSequenceDataBuffer(seqId); + int len = sb.length(); String formatted = data.replace(SPACE, ""); - if (formatted.indexOf(identityCharacter) > -1) + + /* + * If sequence contains '.' or other identity symbol; replace these with the + * same position from the first (reference) sequence + */ + int nonGapped = 0; + StringBuilder referenceSequence = seqData.values().iterator().next(); + StringBuilder sb1 = new StringBuilder(formatted.length()); + for (int i = 0; i < formatted.length(); i++) { - /* - * sequence contains '.' or other identity symbol; replace these with the - * same position from the first (reference) sequence - */ - StringBuilder referenceSequence = seqData.values().iterator().next(); - StringBuilder sb = new StringBuilder(formatted.length()); - for (int i = 0 ; i < formatted.length() ; i++) { - char nextChar = formatted.charAt(i); - if (nextChar != identityCharacter) { - sb.append(nextChar); - } else { - sb.append(referenceSequence.charAt(startPos + i)); - } + char nextChar = formatted.charAt(i); + if (nextChar != gapCharacter) + { + nonGapped++; + } + if (nextChar == identityCharacter + && len + i < referenceSequence.length()) + { + sb1.append(referenceSequence.charAt(len + i)); + } + else + { + sb1.append(nextChar); } - formatted = sb.toString(); } - return formatted; + formatted = sb1.toString(); + + data = formatted; + sb.append(data); + + /* + * increment residue count for the sequence + */ + if (nonGapped > 0) + { + Integer residueCount = residuesRead.get(seqId); + residuesRead.put(seqId, nonGapped + + (residueCount == null ? 0 : residueCount)); + } + + return data; } /** @@ -1178,10 +1298,7 @@ public class MegaFile extends AlignFile public void addProperties(AlignmentI al) { super.addProperties(al); - if (this.gapCharacter != null) - { - al.setGapCharacter(gapCharacter); - } + al.setGapCharacter(gapCharacter); /* * warn if e.g. DataType=DNA but data is protein (or vice versa) @@ -1207,6 +1324,14 @@ public class MegaFile extends AlignFile this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer .parseInt(lineLength); + /* + * round down to a multiple of 3 positions per line for nucleotide + */ + if (nucleotide) + { + positionsPerLine = positionsPerLine - (positionsPerLine % 3); + } + String interleave = (String) al.getProperty(PROP_INTERLEAVED); if (interleave != null) { -- 1.7.10.2