JAL-1499 parse !Gene statements to SequenceFeature

author gmungoc <g.m.carstairs@dundee.ac.uk>

Thu, 1 Oct 2015 15:50:20 +0000 (16:50 +0100)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Thu, 1 Oct 2015 15:50:20 +0000 (16:50 +0100)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Thu, 1 Oct 2015 15:50:20 +0000 (16:50 +0100)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Thu, 1 Oct 2015 15:50:20 +0000 (16:50 +0100)
diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java

index 88200f5..e4079c0 100644 (file)
--- a/src/jalview/io/MegaFile.java
+++ b/src/jalview/io/MegaFile.java
@@ -20,10 +20,15 @@ package jalview.io;
  
  import jalview.datamodel.AlignmentI;
  import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceI;
  
  import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
  import java.util.LinkedHashMap;
+import java.util.List;
  import java.util.Map;
  import java.util.Map.Entry;
  import java.util.Set;
@@ -131,8 +136,8 @@ public class MegaFile extends AlignFile
  
    private String title;
  
-  // gap character may be explicitly declared, if not we infer it
-  private Character gapCharacter;
+  // gap character may be explicitly declared, default is -
+  private char gapCharacter = '-';
  
    // identity character if declared
    private char identityCharacter = 0;
@@ -149,6 +154,31 @@ public class MegaFile extends AlignFile
    // write end of line positions as a comment
    private boolean writePositionNumbers = true;
  
+  // id of sequence being processed
+  private String currentSequenceId;
+
+  /*
+   * Temporary store of {sequenceId, positionData} while parsing interleaved
+   * sequences; sequences are maintained in the order in which they are added
+   * i.e. read in the file
+   */
+  Map<String, StringBuilder> seqData;
+  
+  // number of residues read (so far) per sequence
+  Map<String, Integer> residuesRead;
+  
+  // start residue (base 1) per sequence of current feature
+  Map<String, Integer> featureStart;
+  
+  // feature (Gene/Domain) if any we are parsing
+  private String currentFeature;
+
+  // feature type (Gene/Domain) if any we are parsing
+  private String currentFeatureType;
+
+  // map of SequenceFeature's by sequence id
+  Map<String, List<SequenceFeature>> sequenceFeatures;
+
    public MegaFile()
    {
    }
@@ -169,6 +199,11 @@ public class MegaFile extends AlignFile
    @Override
    public void parse() throws IOException
    {
+    gapCharacter = '-';
+    sequenceFeatures = new HashMap<String, List<SequenceFeature>>();
+    featureStart = new HashMap<String, Integer>();
+    residuesRead = new HashMap<String, Integer>();
+
      /*
       * Read and process MEGA and Title/Format/Description headers if present.
       * Returns the first data line following the headers.
@@ -176,16 +211,15 @@ public class MegaFile extends AlignFile
      String dataLine = parseHeaderLines();
  
      /*
-     * Temporary store of {sequenceId, positionData} while parsing interleaved
-     * sequences; sequences are maintained in the order in which they are added
-     * i.e. read in the file
+     * order-preserving map to hold sequences by id as they are built up during
+     * parsing
       */
-    Map<String, StringBuilder> seqData = new LinkedHashMap<String, StringBuilder>();
+    seqData = new LinkedHashMap<String, StringBuilder>();
  
      /*
       * The id of the sequence being read (for non-interleaved)
       */
-    String currentId = "";
+    currentSequenceId = "";
  
      while (dataLine != null)
      {
@@ -194,15 +228,15 @@ public class MegaFile extends AlignFile
        {
          if (dataLine.startsWith(BANG + GENE))
          {
-          parseGene(dataLine);
+          parseFeature(GENE, dataLine);
          }
          else if (dataLine.startsWith(BANG + DOMAIN))
          {
-          parseDomain(dataLine);
+          parseFeature(DOMAIN, dataLine);
          }
          else
          {
-          currentId = parseDataLine(dataLine, seqData, currentId);
+          currentSequenceId = parseDataLine(dataLine);
          }
        }
        else if (!seqData.isEmpty())
@@ -210,7 +244,7 @@ public class MegaFile extends AlignFile
          /*
           * Blank line after processing some data...
           */
-        this.firstDataBlockRead = true;
+        endOfDataBlock();
        }
        dataLine = nextNonCommentLine();
      }
@@ -218,16 +252,88 @@ public class MegaFile extends AlignFile
      // remember the (longest) line length read in, so we can output the same
      setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
  
-    setSequences(seqData);
+    deriveSequences();
+  }
+
+  /**
+   * Post-processing after reading one block of interleaved data
+   */
+  protected void endOfDataBlock()
+  {
+    this.firstDataBlockRead = true;
+    // TODO:
+    // (initialise and) populate arrays of sequence length so far (excluding
+    // gaps)
+    // On change or end of a denoted Gene or Domain, add sequence features for
+    // it
    }
  
    /**
-   * Parse a !Gene command line
+   * Parse a !Gene or !Domain command line
     * 
+   * @param featureType
     * @param dataLine
     */
-  protected void parseGene(String dataLine)
+  protected void parseFeature(String featureType, String dataLine)
    {
+    String featureName = getValue(dataLine);
+    // TODO parse !Gene=xyx Property=end; ???
+    if (this.currentFeature != null)
+    {
+      endSequenceFeature();
+    }
+    startSequenceFeature(featureName, featureType);
+  }
+
+  /**
+   * Start processing a new feature
+   * 
+   * @param featureName
+   */
+  protected void startSequenceFeature(String featureName, String featureType)
+  {
+    currentFeature = featureName;
+    currentFeatureType = featureType;
+
+    /*
+     * If the feature name precedes all sequences, we will know in
+     * endSequenceFeature that it starts with residue 1; otherwise note now
+     * where it starts in each sequence
+     */
+    if (!residuesRead.isEmpty())
+    {
+      for (Entry<String, Integer> entry : residuesRead.entrySet())
+      {
+        String seqId = entry.getKey();
+        Integer nextResidue = entry.getValue() + 1;
+        featureStart.put(seqId, nextResidue);
+      }
+    }
+  }
+
+  /**
+   * Add a SequenceFeature for the current feature to each sequence, using the
+   * current feature start/end values per sequence
+   */
+  protected void endSequenceFeature()
+  {
+    Iterator<String> seqids = this.seqData.keySet().iterator();
+    while (seqids.hasNext())
+    {
+      String seqid = seqids.next();
+      Integer startAt = featureStart.get(seqid);
+      int sfstart = startAt == null ? 1 : startAt.intValue();
+      int sfend = residuesRead.get(seqid);
+      if (sfend >= sfstart)
+      {
+        /*
+         * don't add feature if entirely gapped in the sequence
+         */
+        SequenceFeature sf = new SequenceFeature(currentFeature,
+                currentFeatureType, sfstart, sfend, 0f, null);
+        sequenceFeatures.get(seqid).add(sf);
+      }
+    }
    }
  
    /**
@@ -372,10 +478,8 @@ public class MegaFile extends AlignFile
  
    /**
     * Convert the parsed sequence strings to objects and store them in the model.
-   * 
-   * @param seqData
     */
-  protected void setSequences(Map<String, StringBuilder> seqData)
+  protected void deriveSequences()
    {
      Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
  
@@ -385,6 +489,14 @@ public class MegaFile extends AlignFile
        StringBuilder characters = dataset.getValue();
        SequenceI s = new Sequence(sequenceId, new String(characters));
        this.seqs.addElement(s);
+
+      /*
+       * and add any derived sequence features to the sequence
+       */
+      for (SequenceFeature sf : sequenceFeatures.get(sequenceId))
+      {
+        s.addSequenceFeature(sf);
+      }
      }
    }
  
@@ -395,13 +507,10 @@ public class MegaFile extends AlignFile
     * explicit) for this line.
     * 
     * @param dataLine
-   * @param seqData
-   * @param currentid
     * @return
     * @throws IOException
     */
-  protected String parseDataLine(String dataLine,
-          Map<String, StringBuilder> seqData, String currentId)
+  protected String parseDataLine(String dataLine)
            throws IOException
    {
      String seqId = getSequenceId(dataLine);
@@ -410,8 +519,8 @@ public class MegaFile extends AlignFile
        /*
         * Just character data
         */
-      parseNoninterleavedDataLine(dataLine, seqData, currentId);
-      return currentId;
+      parseNoninterleavedDataLine(dataLine);
+      return currentSequenceId;
      }
      else if ((HASHSIGN + seqId).trim().equals(dataLine.trim()))
      {
@@ -425,7 +534,7 @@ public class MegaFile extends AlignFile
        /*
         * Sequence id followed by data
         */
-      parseInterleavedDataLine(dataLine, seqData, seqId);
+      parseInterleavedDataLine(dataLine, seqId);
        return seqId;
      }
    }
@@ -435,15 +544,12 @@ public class MegaFile extends AlignFile
     * a new one if we haven't seen it before.
     * 
     * @param dataLine
-   * @param seqData
-   * @param currentId
     * @throws IOException
     */
-  protected void parseNoninterleavedDataLine(String dataLine,
-          Map<String, StringBuilder> seqData, String currentId)
+  protected void parseNoninterleavedDataLine(String dataLine)
            throws IOException
    {
-    if (currentId == null)
+    if (currentSequenceId == null)
      {
        /*
         * Oops. Data but no sequence id context.
@@ -453,10 +559,7 @@ public class MegaFile extends AlignFile
  
      assertInterleaved(false, dataLine);
  
-    StringBuilder sb = getSequenceDataBuffer(seqData, currentId);
-
-    dataLine = reformatSequenceData(dataLine, sb.length(), seqData);
-    sb.append(dataLine);
+    dataLine = addSequenceData(currentSequenceId, dataLine);
  
      setPositionsPerLine(Math.max(positionsPerLine, dataLine.length()));
    }
@@ -465,12 +568,10 @@ public class MegaFile extends AlignFile
     * Get the sequence data for this sequence id, starting a new one if
     * necessary.
     * 
-   * @param seqData
     * @param currentId
     * @return
     */
-  protected StringBuilder getSequenceDataBuffer(
-          Map<String, StringBuilder> seqData, String currentId)
+  protected StringBuilder getSequenceDataBuffer(String currentId)
    {
      StringBuilder sb = seqData.get(currentId);
      if (sb == null)
@@ -478,6 +579,9 @@ public class MegaFile extends AlignFile
        // first data met for this sequence id, start a new buffer
        sb = new StringBuilder(SEQBUFFERSIZE);
        seqData.put(currentId, sb);
+
+      // and a placeholder for any SequenceFeature found
+      sequenceFeatures.put(currentId, new ArrayList<SequenceFeature>());
      }
      return sb;
    }
@@ -490,73 +594,89 @@ public class MegaFile extends AlignFile
     * </pre>
     * 
     * @param dataLine
-   * @param seqData
     * @param seqId
-   * @throws IOException
+   * @throws FileFormatException
     */
-  protected void parseInterleavedDataLine(String dataLine,
-          Map<String, StringBuilder> seqData, String seqId)
-          throws IOException
+  protected void parseInterleavedDataLine(String dataLine, String seqId)
+          throws FileFormatException
    {
      /*
       * New sequence found in second or later data block - error.
       */
      if (this.firstDataBlockRead && !seqData.containsKey(seqId))
      {
-      throw new IOException(
+      throw new FileFormatException(
                "Parse error: misplaced new sequence starting at " + dataLine);
      }
  
-    StringBuilder sb = getSequenceDataBuffer(seqData, seqId);
      String data = dataLine.substring(seqId.length() + 1).trim();
  
      /*
       * Do nothing if this line is _only_ a sequence id with no data following.
-     * 
-     * Remove any internal spaces
       */
      if (data != null && data.length() > 0)
      {
-      data = reformatSequenceData(data, sb.length(), seqData);
-      sb.append(data);
+      data = addSequenceData(seqId, data);
        setPositionsPerLine(Math.max(positionsPerLine, data.length()));
        assertInterleaved(true, dataLine);
      }
    }
  
    /**
-   * Reformat input sequence data by removing any internal formatting spaces,
-   * and converting any 'identity' characters to the corresponding position in
-   * the first sequence.
+   * Remove spaces, and replace identity symbol, before appending the sequence
+   * data to the buffer for the sequence id. Returns the reformatted added data.
+   * Also updates a count of residues read for the sequence.
     * 
+   * @param seqId
     * @param data
-   * @param startPos
-   *          the sequence position (base 0) of the start of the data
-   * @param seqData
     * @return
     */
-  protected String reformatSequenceData(String data, int startPos, Map<String, StringBuilder> seqData)
+  protected String addSequenceData(String seqId, String data)
    {
+    StringBuilder sb = getSequenceDataBuffer(seqId);
+    int len = sb.length();
      String formatted = data.replace(SPACE, "");
-    if (formatted.indexOf(identityCharacter) > -1)
+
+    /*
+     * If sequence contains '.' or other identity symbol; replace these with the
+     * same position from the first (reference) sequence
+     */
+    int nonGapped = 0;
+    StringBuilder referenceSequence = seqData.values().iterator().next();
+    StringBuilder sb1 = new StringBuilder(formatted.length());
+    for (int i = 0; i < formatted.length(); i++)
      {
-      /*
-       * sequence contains '.' or other identity symbol; replace these with the
-       * same position from the first (reference) sequence
-       */
-      StringBuilder referenceSequence = seqData.values().iterator().next();
-      StringBuilder sb = new StringBuilder(formatted.length());
-      for (int i = 0 ; i < formatted.length() ; i++) {
-        char nextChar = formatted.charAt(i);
-        if (nextChar != identityCharacter) {
-          sb.append(nextChar);
-        } else {
-          sb.append(referenceSequence.charAt(startPos + i));
-        }
+      char nextChar = formatted.charAt(i);
+      if (nextChar != gapCharacter)
+      {
+        nonGapped++;
+      }
+      if (nextChar == identityCharacter
+              && len + i < referenceSequence.length())
+      {
+        sb1.append(referenceSequence.charAt(len + i));
+      }
+      else
+      {
+        sb1.append(nextChar);
        }
-      formatted = sb.toString();
      }
-    return formatted;
+    formatted = sb1.toString();
+
+    data = formatted;
+    sb.append(data);
+
+    /*
+     * increment residue count for the sequence
+     */
+    if (nonGapped > 0)
+    {
+      Integer residueCount = residuesRead.get(seqId);
+      residuesRead.put(seqId, nonGapped
+              + (residueCount == null ? 0 : residueCount));
+    }
+
+    return data;
    }
  
    /**
@@ -1178,10 +1298,7 @@ public class MegaFile extends AlignFile
    public void addProperties(AlignmentI al)
    {
      super.addProperties(al);
-    if (this.gapCharacter != null)
-    {
-      al.setGapCharacter(gapCharacter);
-    }
+    al.setGapCharacter(gapCharacter);
      
      /*
       * warn if e.g. DataType=DNA but data is protein (or vice versa)
@@ -1207,6 +1324,14 @@ public class MegaFile extends AlignFile
      this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer
              .parseInt(lineLength);
  
+    /*
+     * round down to a multiple of 3 positions per line for nucleotide
+     */
+    if (nucleotide)
+    {
+      positionsPerLine = positionsPerLine - (positionsPerLine % 3);
+    }
+
      String interleave = (String) al.getProperty(PROP_INTERLEAVED);
      if (interleave != null)
      {
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Thu, 1 Oct 2015 15:50:20 +0000 (16:50 +0100)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Thu, 1 Oct 2015 15:50:20 +0000 (16:50 +0100)