JAL-2616 add probability distribution analyser class
author TZVanaalten <TZVanaalten@LS30916.ad.lifesci.dundee.ac.uk>
Thu, 20 Jul 2017 10:45:41 +0000 (11:45 +0100)
committer TZVanaalten <TZVanaalten@LS30916.ad.lifesci.dundee.ac.uk>
Thu, 20 Jul 2017 10:45:41 +0000 (11:45 +0100)
src/jalview/util/HMMProbabilityDistributionAnalyser.java [new file with mode: 0644]
src/jalview/util/ProbabilityAnalyserKickstarter.java [new file with mode: 0644]

diff --git a/src/jalview/util/HMMProbabilityDistributionAnalyser.java b/src/jalview/util/HMMProbabilityDistributionAnalyser.java
new file mode 100644 (file)
index 0000000..9d6066d
--- /dev/null
@@ -0,0 +1,445 @@
+package jalview.util;
+
+import jalview.datamodel.HiddenMarkovModel;
+import jalview.datamodel.SequenceI;
+import jalview.io.DataSourceType;
+import jalview.io.FileParse;
+import jalview.io.HMMFile;
+import jalview.io.StockholmFile;
+import jalview.schemes.ResidueProperties;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.Vector;
+
+public class HMMProbabilityDistributionAnalyser
+{
+
+  /**
+   * Path of the file the Stockholm alignments are read from; records in it are
+   * terminated by a "//" line
+   */
+  static final String FAMILIES = "C:/Users/TZVanaalten/Pfam-A.full";
+
+  /**
+   * Path of the file the HMMs are read from; records in it are terminated by a
+   * "//" line
+   */
+  static final String HMMS = "H:/Desktop/PFAM/HMMs/Pfam-A.hmm";
+
+  /** Name, appended to the working folder, of the file holding the raw values */
+  static final String RAW = "/Raw.csv";
+
+  /**
+   * Name, appended to the working folder, of the file holding the binned
+   * values
+   */
+  static final String BINNED = "/Binned.csv";
+
+  /** Each processed residue adds SCALE / count to its bin */
+  static final double SCALE = 100000;
+
+  /** Line feed; not referenced by the code visible in this class */
+  static final String NL = "\n";
+
+  /** Sequences of the most recently parsed Stockholm alignment */
+  Vector<SequenceI> sequences;
+
+  /** The most recently parsed hidden Markov model */
+  HiddenMarkovModel hmm;
+
+  /**
+   * One inner list per processed family, holding the value
+   * log(emission probability / background frequency) of each valid residue
+   */
+  List<ArrayList<Double>> raw = new ArrayList<>();
+
+  /**
+   * Accumulated bin totals, keyed by the residue value formatted to one decimal
+   * place
+   */
+  Map<String, Double> binned = new HashMap<>();
+
+  /** Number of records to skip in both input files before reading */
+  int currentFilePosition = 0;
+
+  /** Working folder holding CurrentPosition.txt, Raw.csv and Binned.csv */
+  String currentFolder;
+
+  /**
+   * Selects the working folder used for the position, raw and binned files and
+   * for exported families and HMMs.
+   * 
+   * @param path
+   *          path of the working folder
+   */
+  public void setFolder(String path)
+  {
+    this.currentFolder = path;
+  }
+
+  /**
+   * Advances the reader past the given number of records, where a record ends
+   * with a line consisting solely of "//".
+   * 
+   * @param index
+   *          number of records to skip
+   * @param br
+   *          reader positioned at the start of a record
+   * @throws IOException
+   *           if reading fails, or if the input ends before the requested
+   *           number of records has been skipped
+   */
+  public void moveToFile(int index, BufferedReader br) throws IOException
+  {
+    for (int skipped = 0; skipped < index; skipped++)
+    {
+      String line = br.readLine();
+      while (line != null && !"//".equals(line))
+      {
+        line = br.readLine();
+      }
+      if (line == null)
+      {
+        // readLine() returns null indefinitely at end of input, so without
+        // this check the search for the terminator would never finish
+        throw new IOException("Reached end of input after skipping "
+                + skipped + " of " + index + " records");
+      }
+    }
+  }
+  /**
+   * Processes the next batch of families and persists the results. Previously
+   * exported data is loaded from the current folder, the Stockholm and HMM
+   * files are advanced to the record index stored in CurrentPosition.txt, and
+   * records are then read in pairs until the requested number of families whose
+   * HMM alphabet type is "amino" has been processed. Finally the new position
+   * and the accumulated data are written back to the current folder and the
+   * in-memory data is cleared.
+   * 
+   * @param increments
+   *          number of amino acid families to process in this call
+   * @throws IOException
+   *           if a file cannot be read or written, or if CurrentPosition.txt
+   *           does not contain an integer
+   */
+  public void run(int increments) throws IOException
+  {
+
+    readPreviousData(currentFolder);
+
+    String positionFile = currentFolder + "/CurrentPosition.txt";
+    String line;
+    try (BufferedReader posReader = new BufferedReader(
+            new FileReader(positionFile)))
+    {
+      line = posReader.readLine();
+    }
+    if (line == null)
+    {
+      throw new IOException("No position stored in " + positionFile);
+    }
+    try
+    {
+      currentFilePosition = Integer.parseInt(line.trim());
+    } catch (NumberFormatException e)
+    {
+      throw new IOException(
+              "Invalid position '" + line + "' in " + positionFile, e);
+    }
+
+    try (BufferedReader inputSTO = new BufferedReader(
+            new FileReader(FAMILIES));
+            BufferedReader inputHMM = new BufferedReader(
+                    new FileReader(HMMS)))
+    {
+      moveToFile(currentFilePosition, inputHMM);
+      moveToFile(currentFilePosition, inputSTO);
+
+      int filesRead = 0;
+      while (filesRead < increments)
+      {
+        FileParse parserSTO = new FileParse(inputSTO, "",
+                DataSourceType.FILE);
+        readStockholm(parserSTO);
+
+        FileParse parserHMM = new FileParse(inputHMM, "",
+                DataSourceType.FILE);
+        readHMM(parserHMM);
+
+        // only amino acid families count towards the requested total, but
+        // every record read advances the stored position
+        if ("amino".equals(hmm.getAlphabetType()))
+        {
+          int count = countValidResidues();
+          processData(count);
+          filesRead++;
+        }
+        currentFilePosition++;
+      }
+    }
+
+    // write to the same file the position is read from, so that the next call
+    // resumes where this one stopped
+    try (PrintWriter p = new PrintWriter(new File(positionFile)))
+    {
+      p.print(currentFilePosition);
+    }
+    exportData(currentFolder);
+    raw.clear();
+    binned.clear();
+
+  }
+
+  /**
+   * Loads the binned values and then the raw values stored in the given
+   * folder.
+   * 
+   * @param source
+   *          folder containing Binned.csv and Raw.csv
+   * @throws IOException
+   *           if either file cannot be read
+   */
+  public void readPreviousData(String source) throws IOException
+  {
+    this.readBinned(source);
+    this.readRaw(source);
+  }
+
+  /**
+   * Replaces the binned data with the contents of Binned.csv in the given
+   * folder. Each line has the form "key,value"; reading stops at the first
+   * empty line or at the end of the file.
+   * 
+   * @param source
+   *          folder containing Binned.csv
+   * @throws IOException
+   *           if the file cannot be read or contains a malformed line
+   */
+  public void readBinned(String source) throws IOException
+  {
+    try (BufferedReader input = new BufferedReader(
+            new FileReader(source + BINNED)))
+    {
+      // start from an empty map once, so that every line read is retained
+      binned = new HashMap<>();
+
+      String line = input.readLine();
+      while (line != null && !line.isEmpty())
+      {
+        // the value is written with Double.toString() and so never contains
+        // a comma; splitting at the last comma keeps the whole key intact
+        int comma = line.lastIndexOf(',');
+        if (comma < 0)
+        {
+          throw new IOException(
+                  "Malformed line in " + source + BINNED + ": " + line);
+        }
+        try
+        {
+          // Double.parseDouble is locale independent, matching the format
+          // used when the file is exported
+          binned.put(line.substring(0, comma),
+                  Double.parseDouble(line.substring(comma + 1)));
+        } catch (NumberFormatException e)
+        {
+          throw new IOException(
+                  "Malformed line in " + source + BINNED + ": " + line, e);
+        }
+        line = input.readLine();
+      }
+    }
+  }
+
+  /**
+   * Replaces the raw data with the contents of Raw.csv in the given folder. The
+   * first line is a header with one comma separated entry per column; each
+   * following line holds one entry per column, where "EMPTY" marks a column
+   * with no value in that row. Reading stops at the first empty line or at the
+   * end of the file. If the file is completely empty the raw data is left
+   * unchanged.
+   * 
+   * @param source
+   *          folder containing Raw.csv
+   * @throws IOException
+   *           if the file cannot be read, or a row has more values than the
+   *           header has columns
+   */
+  public void readRaw(String source) throws IOException
+  {
+    try (BufferedReader input = new BufferedReader(
+            new FileReader(source + RAW)))
+    {
+      String line = input.readLine();
+      if (line == null)
+      {
+        return;
+      }
+
+      // create one (initially empty) column per header entry
+      raw = new ArrayList<>();
+      try (Scanner header = new Scanner(line))
+      {
+        header.useDelimiter(",");
+        while (header.hasNext())
+        {
+          header.next();
+          raw.add(new ArrayList<Double>());
+        }
+      }
+
+      line = input.readLine();
+      while (line != null && !line.isEmpty())
+      {
+        try (Scanner values = new Scanner(line))
+        {
+          values.useDelimiter(",");
+          for (int column = 0; values.hasNext(); column++)
+          {
+            String value = values.next();
+            if ("EMPTY".equals(value))
+            {
+              continue;
+            }
+            if (column >= raw.size())
+            {
+              throw new IOException("Row in " + source + RAW
+                      + " has more values than the header has columns: "
+                      + line);
+            }
+            raw.get(column).add(Double.parseDouble(value));
+          }
+        }
+        line = input.readLine();
+      }
+    }
+  }
+
+  /**
+   * Counts, over every sequence and every HMM node from 1 up to (but not
+   * including) hmm.getLength(), the characters found at the node's alignment
+   * column that are keys of ResidueProperties.aminoBackgroundFrequencies.
+   * 
+   * NOTE(review): the node loop stops before getLength(); confirm whether the
+   * final node is meant to be excluded.
+   * 
+   * @return the number of such characters
+   */
+  public int countValidResidues()
+  {
+    int valid = 0;
+
+    for (SequenceI seq : sequences)
+    {
+      int node = 1;
+      while (node < hmm.getLength())
+      {
+        int column = hmm.getNodeAlignmentColumn(node);
+        char residue = seq.getCharAt(column);
+        if (ResidueProperties.aminoBackgroundFrequencies
+                .containsKey(residue))
+        {
+          valid++;
+        }
+        node++;
+      }
+    }
+
+    return valid;
+  }
+
+  /**
+   * Adds the current family to the raw and binned data. For every character
+   * that is a key of ResidueProperties.aminoBackgroundFrequencies, the value
+   * log(match emission probability / background frequency) is appended to a
+   * new raw column, and SCALE / count is added to the bin named after that
+   * value formatted to one decimal place.
+   * 
+   * @param count
+   *          divisor of the per-residue bin increment; run() passes the result
+   *          of countValidResidues()
+   */
+  public void processData(int count)
+  {
+
+    ArrayList<Double> family = new ArrayList<>();
+    raw.add(family);
+    double increment = SCALE / count;
+
+    for (int width = 0; width < sequences.size(); width++)
+    {
+      for (int length = 1; length < hmm.getLength(); length++)
+      {
+        int alignPos = hmm.getNodeAlignmentColumn(length);
+        char symbol = sequences.get(width).getCharAt(alignPos);
+        if (!ResidueProperties.aminoBackgroundFrequencies
+                .containsKey(symbol))
+        {
+          continue;
+        }
+
+        double prob = hmm.getMatchEmissionProbability(alignPos, symbol);
+        float bfreq = ResidueProperties.aminoBackgroundFrequencies
+                .get(symbol);
+        double llr = Math.log(prob / bfreq);
+        family.add(llr);
+
+        // Locale.ROOT guarantees a '.' decimal separator; a locale specific
+        // ',' would corrupt the comma separated Binned.csv file
+        String output = String.format(Locale.ROOT, "%.1f", llr);
+        if ("-0.0".equals(output))
+        {
+          // small negative values share the bin of small positive values
+          output = "0.0";
+        }
+
+        Double previous = binned.get(output);
+        binned.put(output,
+                previous == null ? increment : previous + increment);
+      }
+    }
+  }
+
+
+  /**
+   * Parses a Stockholm alignment from the given source and keeps its
+   * sequences.
+   * 
+   * @param source
+   *          parser input for the Stockholm data
+   * @throws IOException
+   *           if the data cannot be read
+   */
+  public void readStockholm(FileParse source) throws IOException
+  {
+    StockholmFile stockholm = new StockholmFile(source);
+    stockholm.parse();
+    this.sequences = stockholm.getSeqs();
+  }
+
+  /**
+   * Parses a hidden Markov model from the given source and keeps it.
+   * 
+   * @param source
+   *          parser input for the HMM data
+   * @throws IOException
+   *           if the data cannot be read
+   */
+  public void readHMM(FileParse source) throws IOException
+  {
+    HMMFile hmmFile = new HMMFile(source);
+    hmmFile.parse();
+    this.hmm = hmmFile.getHMM();
+  }
+
+  /**
+   * Writes the binned and raw data to Binned.csv and Raw.csv in the given
+   * folder. Binned.csv holds one "key,value" line per bin. Raw.csv holds a
+   * header line ("Fam 1,Fam 2,...") followed by one line per row, with one
+   * entry per column and "EMPTY" where a column has no value in that row.
+   * 
+   * @param location
+   *          folder to write the files to
+   * @throws FileNotFoundException
+   *           if either file cannot be opened for writing
+   */
+  public void exportData(String location) throws FileNotFoundException
+  {
+    // try-with-resources closes each writer even if writing fails part way
+    try (PrintWriter writerBin = new PrintWriter(
+            new File(location + BINNED)))
+    {
+      for (Map.Entry<String, Double> entry : binned.entrySet())
+      {
+        writerBin.println(entry.getKey() + "," + entry.getValue());
+      }
+    }
+
+    try (PrintWriter writerRaw = new PrintWriter(new File(location + RAW)))
+    {
+      StringBuilder identifier = new StringBuilder();
+      for (int i = 1; i <= raw.size(); i++)
+      {
+        identifier.append("Fam ").append(i).append(',');
+      }
+      writerRaw.println(identifier);
+
+      // rows are written until one is found in which every column is
+      // exhausted; that final all-EMPTY row is written as well
+      boolean rowIsEmpty = false;
+      for (int row = 0; !rowIsEmpty; row++)
+      {
+        rowIsEmpty = true;
+        StringBuilder cells = new StringBuilder();
+        for (ArrayList<Double> column : raw)
+        {
+          if (row < column.size())
+          {
+            cells.append(column.get(row)).append(',');
+            rowIsEmpty = false;
+          }
+          else
+          {
+            cells.append("EMPTY,");
+          }
+        }
+        writerRaw.println(cells);
+      }
+    }
+
+  }
+
+  /**
+   * Prints to standard output the record at the given index of the families
+   * file, up to and including its terminating "//" line. If the file ends
+   * before a terminator is found, printing stops at the end of the file.
+   * 
+   * @param index
+   *          number of records to skip before the one to print
+   * @throws IOException
+   *           if the file cannot be read
+   */
+  public void printFam(int index) throws IOException
+  {
+    try (BufferedReader br = new BufferedReader(new FileReader(FAMILIES)))
+    {
+      moveToFile(index, br);
+
+      String line = br.readLine();
+      // a null line means end of input; stop rather than print "null"
+      while (line != null)
+      {
+        System.out.println(line);
+        if ("//".equals(line))
+        {
+          break;
+        }
+        line = br.readLine();
+      }
+    }
+
+  }
+
+  /**
+   * Prints to standard output the record at the given index of the HMM file,
+   * up to and including its terminating "//" line. If the file ends before a
+   * terminator is found, printing stops at the end of the file.
+   * 
+   * @param index
+   *          number of records to skip before the one to print
+   * @throws IOException
+   *           if the file cannot be read
+   */
+  public void printHMM(int index) throws IOException
+  {
+    try (BufferedReader br = new BufferedReader(new FileReader(HMMS)))
+    {
+      moveToFile(index, br);
+
+      String line = br.readLine();
+      // a null line means end of input; stop rather than print "null"
+      while (line != null)
+      {
+        System.out.println(line);
+        if ("//".equals(line))
+        {
+          break;
+        }
+        line = br.readLine();
+      }
+    }
+
+  }
+
+  /**
+   * Copies the record at the given index of the families file, up to and
+   * including its terminating "//" line, to the file "name.sto" in the current
+   * folder, where name is the third whitespace separated token on the second
+   * line of the record.
+   * 
+   * @param index
+   *          number of records to skip before the one to export
+   * @throws IOException
+   *           if the families file cannot be read, the record has fewer than
+   *           two lines, or the output file cannot be written
+   */
+  public void printFamToFile(int index) throws IOException
+  {
+    String name;
+
+    // first pass: find the name used for the output file
+    try (BufferedReader nameFinder = new BufferedReader(
+            new FileReader(FAMILIES)))
+    {
+      moveToFile(index, nameFinder);
+
+      nameFinder.readLine();
+      String nameLine = nameFinder.readLine();
+      if (nameLine == null)
+      {
+        throw new IOException(
+                "No family at index " + index + " in " + FAMILIES);
+      }
+
+      try (Scanner scanner = new Scanner(nameLine))
+      {
+        scanner.next();
+        scanner.next();
+        name = scanner.next();
+      }
+    }
+
+    // second pass: copy the record
+    try (BufferedReader br = new BufferedReader(new FileReader(FAMILIES)))
+    {
+      moveToFile(index, br);
+
+      try (PrintWriter writer = new PrintWriter(
+              currentFolder + "/" + name + ".sto"))
+      {
+        String line = br.readLine();
+        // a null line means end of input; stop rather than write "null"
+        while (line != null)
+        {
+          writer.println(line);
+          if ("//".equals(line))
+          {
+            break;
+          }
+          line = br.readLine();
+        }
+      }
+    }
+
+  }
+
+  /**
+   * Copies the record at the given index of the HMM file, up to and including
+   * its terminating "//" line, to the file "name.hmm" in the current folder,
+   * where name is the second whitespace separated token on the second line of
+   * the record.
+   * 
+   * @param index
+   *          number of records to skip before the one to export
+   * @throws IOException
+   *           if the HMM file cannot be read, the record has fewer than two
+   *           lines, or the output file cannot be written
+   */
+  public void printHMMToFile(int index) throws IOException
+  {
+
+    String name;
+
+    // first pass: find the name used for the output file
+    try (BufferedReader nameFinder = new BufferedReader(
+            new FileReader(HMMS)))
+    {
+      moveToFile(index, nameFinder);
+
+      nameFinder.readLine();
+      String nameLine = nameFinder.readLine();
+      if (nameLine == null)
+      {
+        throw new IOException("No HMM at index " + index + " in " + HMMS);
+      }
+
+      try (Scanner scanner = new Scanner(nameLine))
+      {
+        scanner.next();
+        name = scanner.next();
+      }
+    }
+
+    // second pass: copy the record
+    try (BufferedReader br = new BufferedReader(new FileReader(HMMS)))
+    {
+      moveToFile(index, br);
+
+      try (PrintWriter writer = new PrintWriter(
+              currentFolder + "/" + name + ".hmm"))
+      {
+        String line = br.readLine();
+        // a null line means end of input; stop rather than write "null"
+        while (line != null)
+        {
+          writer.println(line);
+          if ("//".equals(line))
+          {
+            break;
+          }
+          line = br.readLine();
+        }
+      }
+    }
+
+  }
+  
+  /**
+   * Resets the stored state in the current folder: CurrentPosition.txt is
+   * rewritten to contain "0", and Raw.csv and Binned.csv are emptied.
+   * 
+   * @throws FileNotFoundException
+   *           if any of the files cannot be opened for writing
+   */
+  public void clear() throws FileNotFoundException
+  {
+    try (PrintWriter pos = new PrintWriter(
+            currentFolder + "/CurrentPosition.txt"))
+    {
+      pos.println("0");
+    }
+
+    // opening a PrintWriter on a file name truncates an existing file, so
+    // opening and immediately closing leaves each file empty
+    new PrintWriter(currentFolder + RAW).close();
+    new PrintWriter(currentFolder + BINNED).close();
+  }
+
+}
diff --git a/src/jalview/util/ProbabilityAnalyserKickstarter.java b/src/jalview/util/ProbabilityAnalyserKickstarter.java
new file mode 100644 (file)
index 0000000..fd83e3a
--- /dev/null
@@ -0,0 +1,100 @@
+package jalview.util;
+
+import java.io.IOException;
+import java.util.Scanner;
+
+/**
+ * Command line front end for HMMProbabilityDistributionAnalyser. Commands are
+ * read from standard input, one per line.
+ */
+public class ProbabilityAnalyserKickstarter
+{
+
+  /**
+   * Reads and executes commands until "terminate" is entered or standard input
+   * ends. The first whitespace separated token of a line selects the command:
+   * <ul>
+   * <li>printFam index</li>
+   * <li>printHMM index</li>
+   * <li>exportFam index</li>
+   * <li>exportHMM index</li>
+   * <li>run loops increments</li>
+   * <li>clear</li>
+   * <li>cd folder</li>
+   * <li>terminate</li>
+   * </ul>
+   * Tokens after the listed arguments are ignored, as are blank lines and
+   * unrecognised commands. A missing or non-numeric argument propagates the
+   * exception thrown by Scanner.
+   * 
+   * @param args
+   *          not used
+   * @throws IOException
+   *           if the analyser fails to read or write a file
+   * @throws InterruptedException
+   *           declared for compatibility; not thrown by the visible code
+   */
+  public static void main(String[] args)
+          throws IOException, InterruptedException
+  {
+
+    HMMProbabilityDistributionAnalyser analyser = new HMMProbabilityDistributionAnalyser();
+
+    System.out.println("ACTIVATED");
+
+    // A single Scanner is used for the whole session: a Scanner buffers its
+    // input, so creating a new one per command can lose lines that an
+    // earlier instance has already read from System.in
+    try (Scanner keyboard = new Scanner(System.in))
+    {
+      boolean running = true;
+      // hasNextLine() is false once standard input is exhausted
+      while (running && keyboard.hasNextLine())
+      {
+        try (Scanner inputScanner = new Scanner(keyboard.nextLine()))
+        {
+          if (inputScanner.hasNext())
+          {
+            running = execute(analyser, inputScanner);
+          }
+        }
+      }
+    }
+
+  }
+
+  /**
+   * Executes a single command. The command is selected by its first token
+   * only, so that an argument which happens to contain a command name (for
+   * example a folder path containing "run") cannot select a different command.
+   * 
+   * @param analyser
+   *          the analyser to operate on
+   * @param arguments
+   *          scanner over the command line, positioned before the command name
+   * @return false if the session should end, otherwise true
+   * @throws IOException
+   *           if the analyser fails to read or write a file
+   */
+  private static boolean execute(
+          HMMProbabilityDistributionAnalyser analyser, Scanner arguments)
+          throws IOException
+  {
+    String keyword = arguments.next();
+    switch (keyword)
+    {
+    case "printFam":
+      analyser.printFam(arguments.nextInt());
+      break;
+    case "printHMM":
+      analyser.printHMM(arguments.nextInt());
+      break;
+    case "exportFam":
+      // output goes to the analyser's current folder
+      analyser.printFamToFile(arguments.nextInt());
+      break;
+    case "exportHMM":
+      // output goes to the analyser's current folder
+      analyser.printHMMToFile(arguments.nextInt());
+      break;
+    case "run":
+      int loops = arguments.nextInt();
+      int increments = arguments.nextInt();
+      for (int i = 0; i < loops; i++)
+      {
+        analyser.run(increments);
+      }
+      break;
+    case "terminate":
+      return false;
+    case "clear":
+      analyser.clear();
+      break;
+    case "cd":
+      analyser.setFolder(arguments.next());
+      break;
+    default:
+      // unrecognised commands are ignored
+      break;
+    }
+    return true;
+  }
+
+}