add HMMFile class to read and write HMM files
[jalview.git] / src / jalview / io / HMMFile.java
diff --git a/src/jalview/io/HMMFile.java b/src/jalview/io/HMMFile.java
new file mode 100644 (file)
index 0000000..764db7f
--- /dev/null
@@ -0,0 +1,428 @@
+package jalview.io;
+
+import jalview.datamodel.EValueStatistic;
+import jalview.datamodel.HiddenMarkovModel;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+
+/**
+ * Reads and writes hidden Markov model (HMM) files in the HMMER text format.
+ * 
+ * 
+ * @author TZVanaalten
+ *
+ */
+public class HMMFile extends FileParse
+{
+  // model that parse() fills in and exportFile() writes out
+  HiddenMarkovModel hmm = new HiddenMarkovModel();
+
+  // location of the file to read, exactly as passed to the constructor
+  String dataObject;
+
+  // number of symbols in the model's alphabet; set when the HMM line is read
+  int numberOfSymbols;
+
+  // number of state transitions stored for each node; a true constant, so it
+  // is shared by all instances rather than copied into each one
+  static final int NUMBER_OF_TRANSITIONS = 7;
+
+  // first line of the parsed file; written back unchanged on export
+  String fileHeader;
+
+  /**
+   * Creates a reader/writer bound to the given data source. Nothing is read
+   * until {@link #parse()} is called.
+   * 
+   * @param dataSource
+   *          location of the HMM file; parse() opens it as a file path
+   */
+  public HMMFile(String dataSource)
+  {
+    this.dataObject = dataSource;
+  }
+
+  /**
+   * Reads the HMM file named by the data source: first the header and
+   * property lines, then the model data. The file is closed before this
+   * method returns, whether or not parsing succeeded.
+   * 
+   * @throws IOException
+   *           if the file cannot be opened or read
+   */
+  public void parse() throws IOException
+  {
+    // try-with-resources closes the reader (and the FileReader it wraps) even
+    // when one of the parse steps throws
+    try (BufferedReader br = new BufferedReader(
+            new FileReader(new File(dataObject))))
+    {
+      parseFileProperties(br);
+      parseModel(br);
+    }
+  }
+
+  /**
+   * Reads the header line and the property lines that precede the model data.
+   * <p>
+   * The first line is kept as the file header. Each following line is
+   * dispatched on its first token:
+   * <ul>
+   * <li>"HMM" - ends the property section; the line is passed to the model to
+   * fill in its symbols</li>
+   * <li>"STATS" - handed to {@link #readStats(Scanner)}</li>
+   * <li>"GA", "TC", "NC" - two numeric values stored via setPFAMData</li>
+   * <li>anything else - stored as a key/value property, the value being the
+   * remaining tokens joined by single spaces (empty if there are none)</li>
+   * </ul>
+   * Blank lines are ignored. Reading stops at the end of the input or after
+   * the "HMM" line. In the latter case the line that follows the "HMM" line is
+   * also consumed and discarded (exportFile writes the transition-name header
+   * in that position), so the reader is left at the first line of model data.
+   * 
+   * @param input
+   *          buffered reader positioned at the start of the file
+   * @throws IOException
+   *           if the input cannot be read
+   */
+  public void parseFileProperties(BufferedReader input) throws IOException
+  {
+    fileHeader = input.readLine();
+    String line = input.readLine();
+    while (line != null)
+    {
+      boolean reachedModel = false;
+      try (Scanner parser = new Scanner(line))
+      {
+        if (parser.hasNext()) // a blank line carries no property
+        {
+          String next = parser.next();
+          if ("HMM".equals(next))
+          {
+            // start of HMM data, i.e. end of the file properties
+            reachedModel = true;
+            hmm.fillSymbols(line);
+            numberOfSymbols = hmm.getSymbols().size();
+          }
+          else if ("STATS".equals(next))
+          {
+            readStats(parser);
+          }
+          else if ("GA".equals(next) || "TC".equals(next)
+                  || "NC".equals(next))
+          {
+            // Double.valueOf always expects '.' as the decimal separator,
+            // whereas Scanner.nextDouble() follows the default locale
+            Double[] data = new Double[2];
+            data[0] = Double.valueOf(parser.next());
+            data[1] = Double.valueOf(parser.next());
+            hmm.setPFAMData(next, data);
+          }
+          else
+          {
+            StringBuilder value = new StringBuilder();
+            while (parser.hasNext())
+            {
+              if (value.length() > 0)
+              {
+                value.append(' ');
+              }
+              value.append(parser.next());
+            }
+            hmm.put(next, value.toString());
+          }
+        }
+      }
+      // always advance, including after the HMM line: this skips the line
+      // that sits between the HMM line and the model data
+      line = input.readLine();
+      if (reachedModel)
+      {
+        break;
+      }
+    }
+  }
+
+  /**
+   * Reads the remainder of a STATS line and stores it on the model as an
+   * EValueStatistic keyed by the statistic's name. The tokens are expected in
+   * the order: configuration, name, then two numbers. Does nothing if the
+   * scanner has no tokens left.
+   * <p>
+   * NOTE(review): the first number is passed on as the slope and the second
+   * as the location. HMMER's format description appears to list the location
+   * parameter first - confirm against the EValueStatistic constructor.
+   * 
+   * @param parser
+   *          scanner over a STATS line, positioned after the "STATS" token
+   */
+  public void readStats(Scanner parser)
+  {
+    if (!parser.hasNext())
+    {
+      return;
+    }
+    String configuration = parser.next();
+    String name = parser.next();
+    // parseDouble always expects '.' as the decimal separator, whereas
+    // Scanner.nextDouble() follows the default locale
+    double slope = Double.parseDouble(parser.next());
+    double location = Double.parseDouble(parser.next());
+    hmm.addStatistic(name,
+            new EValueStatistic(configuration, slope, location));
+  }
+
+  /**
+   * Parses the model data that follows the file properties: an optional
+   * COMPO line, the begin node data, and then three lines per node (match
+   * emissions with annotations, insert emissions, state transitions).
+   * 
+   * @param input
+   *          buffered reader positioned at the first line of model data
+   * @throws IOException
+   *           if the input cannot be read or ends before the model is
+   *           complete
+   */
+  public void parseModel(BufferedReader input) throws IOException
+  {
+    // The COMPO line is optional. Mark the position first so that, when the
+    // line turns out to be something else, it can be handed back to
+    // parseBeginNodeData instead of being lost.
+    final int readAheadLimit = 16384;
+    input.mark(readAheadLimit);
+    boolean hasCompo = false;
+    try (Scanner scanner = new Scanner(readModelLine(input)))
+    {
+      if (scanner.hasNext() && "COMPO".equals(scanner.next()))
+      {
+        hasCompo = true;
+        for (int i = 0; i < numberOfSymbols; i++)
+        {
+          // Double.valueOf is locale independent, unlike nextDouble()
+          hmm.getAverageMatchStateEmissionProbabilities()
+                  .add(Double.valueOf(scanner.next()));
+        }
+      }
+    }
+    if (!hasCompo)
+    {
+      input.reset();
+    }
+
+    parseBeginNodeData(input);
+
+    for (int i = 0; i < hmm.getLength(); i++)
+    {
+      try (Scanner matchReader = new Scanner(readModelLine(input)))
+      {
+        matchReader.nextInt(); // skips number indicating position in HMM
+        hmm.getMatchEmissions()
+                .add(fillList(matchReader, numberOfSymbols));
+        parseAnnotations(matchReader, i);
+      }
+      try (Scanner insertReader = new Scanner(readModelLine(input)))
+      {
+        hmm.getInsertEmissions()
+                .add(fillList(insertReader, numberOfSymbols));
+      }
+      try (Scanner transitionReader = new Scanner(readModelLine(input)))
+      {
+        hmm.getStateTransitions()
+                .add(fillList(transitionReader, NUMBER_OF_TRANSITIONS));
+      }
+    }
+  }
+
+  /**
+   * Reads one line of model data, failing with an IOException rather than
+   * returning null when the input is exhausted.
+   */
+  private static String readModelLine(BufferedReader input)
+          throws IOException
+  {
+    String line = input.readLine();
+    if (line == null)
+    {
+      throw new IOException(
+              "Unexpected end of HMM file while reading model data");
+    }
+    return line;
+  }
+
+  /**
+   * Parses the two lines describing the begin node: the insert 0 emissions
+   * (one value per symbol) followed by the begin state transitions.
+   * 
+   * @param input
+   *          buffered reader positioned at the insert 0 emissions line
+   * @throws IOException
+   *           if the input cannot be read or ends before both lines have
+   *           been read
+   */
+  public void parseBeginNodeData(BufferedReader input)
+          throws IOException
+  {
+    String emissionLine = input.readLine();
+    if (emissionLine == null)
+    {
+      throw new IOException(
+              "Unexpected end of HMM file: insert 0 emissions missing");
+    }
+    try (Scanner scanner = new Scanner(emissionLine))
+    {
+      hmm.setInsertZeroEmissions(
+              fillList(scanner, hmm.getSymbols().size()));
+    }
+
+    String transitionLine = input.readLine();
+    if (transitionLine == null)
+    {
+      throw new IOException(
+              "Unexpected end of HMM file: begin state transitions missing");
+    }
+    try (Scanner scannerTransitions = new Scanner(transitionLine))
+    {
+      hmm.setBeginStateTransitions(
+              fillList(scannerTransitions, NUMBER_OF_TRANSITIONS));
+    }
+  }
+
+  /**
+   * Reads the annotation fields that follow the emissions on a match emission
+   * line. The first field is stored as an alignment column index when the
+   * model's map annotation flag is set, and skipped otherwise. The first
+   * character of each of the next four fields is stored under the keys CONS,
+   * RF, MM and CS, in that order.
+   * 
+   * @param scanner
+   *          scanner over the match emission line, positioned after the
+   *          emission values
+   * @param index
+   *          index of the node which is being scanned
+   */
+  public void parseAnnotations(Scanner scanner, int index)
+  {
+    if (!hmm.getMapAnnotationFlag())
+    {
+      scanner.next();
+    }
+    else
+    {
+      hmm.getAlignmentColumnIndexes().add(scanner.nextInt());
+    }
+    hmm.getAnnotations().add(new HashMap<String, Character>());
+    for (String label : new String[] { "CONS", "RF", "MM", "CS" })
+    {
+      hmm.getAnnotations().get(index).put(label,
+              scanner.next().charAt(0));
+    }
+  }
+  /**
+   * Maps a two-letter transition code to its position in a state transition
+   * list. The recognised codes are mm, mi, md, im, ii, dm and dd, mapping to
+   * 0 through 6 in that order.
+   * 
+   * @param transition
+   *          type of transition occurring
+   * @return index of the transition, or null if the code is not recognised
+   */
+  public Integer getTransitionType(String transition)
+  {
+    switch (transition)
+    {
+    case "mm":
+      return 0;
+    case "mi":
+      return 1;
+    case "md":
+      return 2;
+    case "im":
+      return 3;
+    case "ii":
+      return 4;
+    case "dm":
+      return 5;
+    case "dd":
+      return 6;
+    default:
+      return null;
+    }
+  }
+
+  /**
+   * Reads a fixed number of tokens from a scanner into a list of doubles. A
+   * token containing '*' is stored as null: the .hmm file uses '*' for
+   * transitions to or from delete states that have a value of -infinity, and
+   * the HiddenMarkovModel class represents those by a null value.
+   * 
+   * @param input
+   *          scanner for line containing data to be transferred to list
+   * @param numberOfElements
+   *          number of tokens to read
+   * @return list holding one entry per token read, in reading order
+   */
+  public static List<Double> fillList(Scanner input,
+          int numberOfElements)
+  {
+    List<Double> values = new ArrayList<>();
+    for (int remaining = numberOfElements; remaining > 0; remaining--)
+    {
+      String token = input.next();
+      values.add(token.contains("*") ? null : Double.valueOf(token));
+    }
+    return values;
+  }
+
+  /**
+   * Writes the model to a file in UTF-8. Values are separated by single
+   * spaces; columns are not aligned.
+   * <p>
+   * The output consists of the stored file header, one line per file
+   * property, the HMM symbol line, the transition-name header, the COMPO line
+   * (only if average match emissions are present), the insert 0 emissions,
+   * the begin state transitions, three lines per node, and a closing "//".
+   * The writer is closed before returning, whether or not writing succeeded.
+   * 
+   * @param exportLocation
+   *          name of the file to write to
+   * @throws FileNotFoundException
+   *           if the file cannot be created or opened for writing
+   * @throws UnsupportedEncodingException
+   *           if the UTF-8 charset is not supported
+   */
+  public void exportFile(String exportLocation)
+          throws FileNotFoundException, UnsupportedEncodingException
+  {
+    try (PrintWriter writer = new PrintWriter(exportLocation, "UTF-8"))
+    {
+      writer.println(fileHeader);
+      for (Map.Entry<String, String> entry : hmm.getFileProperties()
+              .entrySet())
+      {
+        writer.println(entry.getKey() + " " + entry.getValue());
+      }
+      writer.println(
+              "HMM" + " " + convertCharListToString(hmm.getSymbols()));
+      writer.println("m->m m->i m->d i->m i->i d->m d->d");
+      if (!hmm.getAverageMatchStateEmissionProbabilities().isEmpty())
+      {
+        writer.println("COMPO" + " " + convertDoubleListToString(
+                hmm.getAverageMatchStateEmissionProbabilities()));
+      }
+      writer.println(
+              convertDoubleListToString(hmm.getInsertZeroEmissions()));
+      writer.println(
+              convertDoubleListToString(hmm.getBeginStateTransitions()));
+
+      for (int i = 0; i < hmm.getLength(); i++)
+      {
+        StringBuilder matchEmissionLine = new StringBuilder();
+        matchEmissionLine.append(i).append(' '); // node index
+        matchEmissionLine.append(
+                convertDoubleListToString(hmm.getMatchEmissions().get(i)));
+
+        // MAP annotation: parseAnnotations only records a column index when
+        // the map annotation flag is set, so the list may have no entry for
+        // this node. Write a placeholder token then; on reading, the field
+        // is skipped when the flag is off.
+        matchEmissionLine.append(' ');
+        if (i < hmm.getAlignmentColumnIndexes().size())
+        {
+          matchEmissionLine.append(
+                  hmm.getAlignmentColumnIndexes().get(i).toString());
+        }
+        else
+        {
+          matchEmissionLine.append('-');
+        }
+
+        // remaining annotations, in the order parseAnnotations reads them
+        for (String label : new String[] { "CONS", "RF", "MM", "CS" })
+        {
+          matchEmissionLine.append(' ').append(
+                  hmm.getAnnotations().get(i).get(label).toString());
+        }
+        writer.println(matchEmissionLine.toString());
+
+        writer.println(
+                convertDoubleListToString(hmm.getInsertEmissions().get(i)));
+        writer.println(convertDoubleListToString(
+                hmm.getStateTransitions().get(i)));
+      }
+      writer.println("//");
+    }
+  }
+
+  /**
+   * Converts a list of characters to a string in which every item is
+   * followed by a single space (so a non-empty result ends with a space).
+   * 
+   * @param list
+   *          character list to be converted
+   * @return the space-separated characters, or an empty string for an empty
+   *         list
+   */
+  public String convertCharListToString(List<Character> list)
+  {
+    StringBuilder joined = new StringBuilder();
+    for (Character item : list)
+    {
+      joined.append(item.toString()).append(" ");
+    }
+    return joined.toString();
+  }
+  
+  /**
+   * Converts a list of doubles to a string in which every item is followed by
+   * a single space (so a non-empty result ends with a space). A null item is
+   * written as "*".
+   * 
+   * @param list
+   *          double list to be converted
+   * @return the space-separated values, or an empty string for an empty list
+   */
+  public String convertDoubleListToString(List<Double> list)
+  {
+    StringBuilder joined = new StringBuilder();
+    for (Double item : list)
+    {
+      joined.append(item == null ? "*" : item.toString()).append(" ");
+    }
+    return joined.toString();
+  }
+}
+