Merge branch 'develop' into features/mchmmer
[jalview.git] / src / jalview / util / HMMProbabilityDistributionAnalyser.java
index b30487d..66ae552 100644 (file)
@@ -1,5 +1,6 @@
 package jalview.util;
 
+import jalview.datamodel.AlignmentAnnotation;
 import jalview.datamodel.HiddenMarkovModel;
 import jalview.datamodel.SequenceI;
 import jalview.io.DataSourceType;
@@ -14,11 +15,13 @@ import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.FileReader;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.io.PrintWriter;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Random;
 import java.util.Scanner;
 import java.util.Vector;
 
@@ -32,6 +35,7 @@ import java.util.Vector;
  */
 public class HMMProbabilityDistributionAnalyser
 {
+  AlignmentAnnotation reference = null;
 
   Vector<SequenceI> sequences;
 
@@ -44,13 +48,13 @@ public class HMMProbabilityDistributionAnalyser
   Map<String, Double> binned = new HashMap<>();
 
   // location of the family file
-  final static String FAMILIES = "H:/Desktop/PFAM/Family/SeedFamilies.seed";
+  String families = "/media/sf_Shared_Folder/PFAM/Family/SeedFamilies.seed";
 
   // location of the file containing the family-clan links
-  final static String FAMILIESTOCLAN = "H:/Desktop/PFAM/Family/Clanlinks.dat";
+  final static String FAMILIESTOCLAN = "/media/sf_Shared_Folder/PFAM/Family/Clanlinks.dat";
 
   // location of the HMM file
-  final static String HMMS = "H:/Desktop/PFAM/HMMs/Pfam-A.hmm";
+  String hmms = "/media/sf_Shared_Folder/PFAM/HMMs/Pfam-A.hmm";
 
   // suffix for raw file
   final static String RAW = "/Raw.csv";
@@ -59,13 +63,15 @@ public class HMMProbabilityDistributionAnalyser
   final static String BINNED = "/Binned.csv";
 
   // normalisation scale
-  final static double SCALE = 100000;
+  final static double SCALE = 1;
 
   // current position in file
   int currentFilePosition = 0;
 
   final static String NL = "\n";
 
+  Random generator = new Random();
+
   // current directory
   String currentFolder;
 
@@ -99,6 +105,7 @@ public class HMMProbabilityDistributionAnalyser
       while (!"//".equals(line))
       {
         line = br.readLine();
+
       }
     }
 
@@ -107,7 +114,8 @@ public class HMMProbabilityDistributionAnalyser
   /**
    * Analyses a specified number of families and then saves the data. Before
    * analysing the data, the previous saved data will be imported and after
-   * analysing this data is exported back into the file.
+   * analysing this, the data is exported back into the file. The file must be
+   * in flat file format.
    * 
    * @param increments
    *          The number of families to read before saving.
@@ -116,17 +124,26 @@ public class HMMProbabilityDistributionAnalyser
   public void run(int increments, boolean keepRawData) throws IOException
   {
     keepRaw = keepRawData;
-    readPreviousData(currentFolder);
+    try
+    {
+      readPreviousData(currentFolder);
+      BufferedReader posReader = new BufferedReader(
+              new FileReader(currentFolder + "/CurrentPosition.txt"));
+
+      String line = posReader.readLine();
+      posReader.close();
+      currentFilePosition = Integer.parseInt(line);
+    } catch (Exception e)
+    {
+      System.out.println("No previous data found");
+    }
+
 
-    BufferedReader posReader = new BufferedReader(
-            new FileReader(currentFolder + "/CurrentPosition.txt"));
-    String line = posReader.readLine();
-    posReader.close();
 
-    BufferedReader inputSTO = new BufferedReader(new FileReader(FAMILIES));
-    BufferedReader inputHMM = new BufferedReader(new FileReader(HMMS));
+    BufferedReader inputSTO = new BufferedReader(new FileReader(families));
+    BufferedReader inputHMM = new BufferedReader(new FileReader(hmms));
+
 
-    currentFilePosition = Integer.parseInt(line);
 
     moveLocationBy(currentFilePosition, inputHMM);
     moveLocationBy(currentFilePosition, inputSTO);
@@ -136,13 +153,9 @@ public class HMMProbabilityDistributionAnalyser
     while (filesRead < increments)
     {
 
-      FileParse parserSTO = new FileParse(inputSTO, "",
-              DataSourceType.FILE);
-      readStockholm(parserSTO);
+      readStockholm(inputSTO);
 
-      FileParse parserHMM = new FileParse(inputHMM, "",
-              DataSourceType.FILE);
-      readHMM(parserHMM);
+      readHMM(inputHMM);
 
         int count = countValidResidues();
         processData(count);
@@ -164,6 +177,93 @@ public class HMMProbabilityDistributionAnalyser
   }
 
   /**
+   * Analyses all families and then saves the data. Before analysing the data,
+   * the previous saved data will be imported and after analysing this, the data
+   * is exported back into the file. The file must be in flat file format.
+   * 
+   * @param increments
+   *          The number of families to read before saving.
+   * @throws IOException
+   */
+  public void runToEnd(int minCount, int maxCount, boolean keepRawData,
+          boolean forClans)
+          throws IOException
+  {
+    keepRaw = keepRawData;
+    BufferedReader inputSTO = null;
+    BufferedReader inputHMM = null;
+    int size = 0;
+    int files = 1;
+    try
+    {
+    if (forClans)
+    {
+        files = 603;
+    }
+    int filesRead = 0;
+    for (int clan = 0; clan < files; clan++)
+    {
+      System.out.println(clan);
+      String clanPath = "";
+      int numberOfFamilies = 0;
+      if (forClans)
+      {
+        clanPath = currentFolder + "/Clan" + clan;
+          if (!new File(clanPath).exists())
+          {
+            continue;
+          }
+        BufferedReader famCountReader = new BufferedReader(
+                new FileReader(clanPath + "/NumberOfFamilies.txt"));
+        numberOfFamilies = Integer.parseInt(famCountReader.readLine());
+      }
+      else
+      {
+        numberOfFamilies = 1;
+      }
+      
+      for (int fam = 0; fam < numberOfFamilies; fam++)
+      {
+        if (forClans)
+        {
+          families = clanPath + "/Families/Fam" + fam + ".sto";
+          hmms = clanPath + "/HMMs/HMM" + fam + ".hmm";
+        }
+
+        inputSTO = new BufferedReader(new FileReader(families));
+        inputHMM = new BufferedReader(new FileReader(hmms));
+
+
+        int i = 0;
+        boolean endReached = atEnd(inputSTO);
+        while (!endReached)
+        {
+          readStockholm(inputSTO);
+          readHMM(inputHMM);
+
+        int count = countValidResidues();
+            if (count >= minCount && count < maxCount)
+            {
+              processData(count);
+            }
+        filesRead++;
+          System.out.println(filesRead);
+      endReached = atEnd(inputSTO);
+      }
+      }
+    }
+    } catch (Exception e)
+    {
+      e.printStackTrace();
+    } finally
+    {
+      exportData(currentFolder);
+      raw.clear();
+      binned.clear();
+    }
+  }
+
+  /**
    * Reads the previous data from both files
    * 
    * @param source
@@ -194,7 +294,9 @@ public class HMMProbabilityDistributionAnalyser
     {
       Scanner scanner = new Scanner(line);
       scanner.useDelimiter(",");
-      binned.put(scanner.next(), scanner.nextDouble());
+      String key = scanner.next();
+      String value = scanner.next();
+      binned.put(key, Double.valueOf(value));
       scanner.close();
       line = input.readLine();
     }
@@ -242,6 +344,10 @@ public class HMMProbabilityDistributionAnalyser
         {
           raw.get(i).add(Double.parseDouble(value));
         }
+        else
+        {
+          raw.get(i).add(null);
+        }
 
         i++;
       }
@@ -263,14 +369,14 @@ public class HMMProbabilityDistributionAnalyser
 
     for (int width = 0; width < sequences.size(); width++)
     {
-      for (int length = 1; length < hmm.getLength(); length++)
+      for (int length = 1; length < hmm.getLength() + 1; length++)
       {
         char symbol;
         int alignPos;
-        alignPos = hmm.getNodeAlignmentColumn(length);
+        alignPos = hmm.getNodeMapPosition(length);
 
         symbol = sequences.get(width).getCharAt(alignPos);
-        if (ResidueProperties.aminoBackgroundFrequencies
+        if (ResidueProperties.backgroundFrequencies.get("amino")
                 .containsKey(symbol))
         {
           count++;
@@ -294,24 +400,29 @@ public class HMMProbabilityDistributionAnalyser
       raw.add(new ArrayList<Double>());
       rawPos = raw.size() - 1;
     }
-
+    Double total = 0d;
     for (int width = 0; width < sequences.size(); width++)
     {
-      for (int length = 1; length < hmm.getLength(); length++)
+      for (int length = 1; length < hmm.getLength() + 1; length++)
       {
         char symbol;
         int alignPos;
-        alignPos = hmm.getNodeAlignmentColumn(length);
+        alignPos = hmm.getNodeMapPosition(length);
         
         symbol = sequences.get(width).getCharAt(alignPos);
-        if (ResidueProperties.aminoBackgroundFrequencies
+        if (ResidueProperties.backgroundFrequencies.get("amino")
                 .containsKey(symbol))
         {
           Double prob;
           Float bfreq;
           Double llr;
           prob = hmm.getMatchEmissionProbability(alignPos, symbol);
-          bfreq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
+          bfreq = ResidueProperties.backgroundFrequencies.get("amino")
+                  .get(symbol);
+          if (prob == 0 || bfreq == 0)
+          {
+            System.out.println("error");
+          }
           llr = Math.log(prob / bfreq);
           if (keepRaw)
           {
@@ -320,6 +431,7 @@ public class HMMProbabilityDistributionAnalyser
 
           String output;
           output = String.format("%.1f", llr);
+          total += Double.parseDouble(output);
           if ("-0.0".equals(output))
           {
             output = "0.0";
@@ -338,6 +450,7 @@ public class HMMProbabilityDistributionAnalyser
         }
       }
     }
+    System.out.println(total / count);
   }
 
 
@@ -347,9 +460,19 @@ public class HMMProbabilityDistributionAnalyser
    * @param source
    * @throws IOException
    */
-  public void readStockholm(FileParse source) throws IOException
+  public void readStockholm(BufferedReader inputSTO) throws IOException
   {
-    StockholmFile file = new StockholmFile(source);
+    FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE);
+    StockholmFile file = new StockholmFile(parserSTO);
+    Vector<AlignmentAnnotation> annots = file.getAnnotations();
+
+    for (AlignmentAnnotation annot : annots)
+    {
+      if (annot.label.contains("Reference"))
+      {
+        reference = annot;
+      }
+    }
     sequences = file.getSeqs();
   }
 
@@ -359,13 +482,13 @@ public class HMMProbabilityDistributionAnalyser
    * @param source
    * @throws IOException
    */
-  public void readHMM(FileParse source) throws IOException
+  public void readHMM(BufferedReader inputHMM) throws IOException
   {
-
-    HMMFile file = new HMMFile(source);
-    file.parse();
+    FileParse parserHMM = new FileParse(inputHMM, "", DataSourceType.FILE);
+    HMMFile file = new HMMFile(parserHMM);
     hmm = file.getHMM();
 
+
   }
 
   /**
@@ -431,7 +554,7 @@ public class HMMProbabilityDistributionAnalyser
    */
   public void printFam(int index) throws IOException
   {
-    BufferedReader br = new BufferedReader(new FileReader(FAMILIES));
+    BufferedReader br = new BufferedReader(new FileReader(families));
 
     moveLocationBy(index, br);
 
@@ -455,7 +578,7 @@ public class HMMProbabilityDistributionAnalyser
    */
   public void printHMM(int index) throws IOException
   {
-    BufferedReader br = new BufferedReader(new FileReader(HMMS));
+    BufferedReader br = new BufferedReader(new FileReader(hmms));
 
     moveLocationBy(index, br);
 
@@ -479,7 +602,7 @@ public class HMMProbabilityDistributionAnalyser
    */
   public void exportFam(int index, String location) throws IOException
   {
-    BufferedReader br = new BufferedReader(new FileReader(FAMILIES));
+    BufferedReader br = new BufferedReader(new FileReader(families));
 
     moveLocationBy(index, br);
 
@@ -497,12 +620,12 @@ public class HMMProbabilityDistributionAnalyser
 
   }
 
-  public void exportFile(BufferedReader br, String location)
+  public void exportFile(BufferedReader br, String location, boolean append)
           throws IOException
   {
     String line = br.readLine();
     PrintWriter writer = new PrintWriter(
-            new FileOutputStream(new File(location), true));
+            new FileOutputStream(location, append));
     while (!"//".equals(line))
     {
       writer.println(line);
@@ -518,7 +641,7 @@ public class HMMProbabilityDistributionAnalyser
   {
     String name;
 
-    BufferedReader nameFinder = new BufferedReader(new FileReader(HMMS));
+    BufferedReader nameFinder = new BufferedReader(new FileReader(hmms));
 
     moveLocationBy(index, nameFinder);
 
@@ -536,7 +659,7 @@ public class HMMProbabilityDistributionAnalyser
     String name;
 
     BufferedReader nameFinder = new BufferedReader(
-            new FileReader(FAMILIES));
+            new FileReader(families));
 
     moveLocationBy(index, nameFinder);
 
@@ -551,7 +674,7 @@ public class HMMProbabilityDistributionAnalyser
   }
 
   /**
-   * Prints the specified family to a .hmm file in the current directory.
+   * Prints the specified family to a .hmm file.
    * 
    * @param index
    * @throws IOException
@@ -560,7 +683,7 @@ public class HMMProbabilityDistributionAnalyser
   {
 
 
-    BufferedReader br = new BufferedReader(new FileReader(HMMS));
+    BufferedReader br = new BufferedReader(new FileReader(hmms));
 
     moveLocationBy(index, br);
 
@@ -603,9 +726,14 @@ public class HMMProbabilityDistributionAnalyser
   {
     BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN));
     BufferedReader familyReader = new BufferedReader(
-            new FileReader(FAMILIES));
-    BufferedReader hmmReader = new BufferedReader(new FileReader(HMMS));
+            new FileReader(families));
+    BufferedReader hmmReader = new BufferedReader(new FileReader(hmms));
+    int families = 0;
+    // moveLocationBy(7000, familyReader);
+    // moveLocationBy(7000, clanFinder);
+    // moveLocationBy(7000, hmmReader);
     HashMap<String, Integer> clanIndexes = new HashMap<>();
+    ArrayList<Integer> familyCounts = new ArrayList<>();
     int filePos = 0; 
     int clanCount = 0;
     String line;
@@ -620,6 +748,8 @@ public class HMMProbabilityDistributionAnalyser
        
       if (line.indexOf("#=GF CL") > -1)
       {
+          families++;
+          System.out.println(families);
           inClan = true;
         Scanner scanner = new Scanner(line);
         scanner.next();
@@ -631,22 +761,29 @@ public class HMMProbabilityDistributionAnalyser
         {
           clanIndexes.put(clanName, clanCount);
             clanCount++;
+            familyCounts.add(0);
         }
 
+
           Integer clanI = clanIndexes.get(clanName);
           String clanPath = directory + "/Clan" + clanI.toString();
-          File clanFolder = new File(clanPath);
-          String famPath = clanPath + "/Families.sto";
-          String hmmPath = clanPath + "/HMMs.hmm";
-          if (!clanFolder.exists())
-        {
-            clanFolder.mkdir();
-        }
-          exportFile(familyReader, famPath);
-          exportFile(hmmReader, hmmPath);
-
+          createFolders(clanPath);
+
+          int index = clanIndexes.get(clanName);
+          exportFile(familyReader,
+                  clanPath + "/Families/Fam" + familyCounts.get(index)
+                          + ".sto",
+                  false);
+          exportFile(hmmReader,
+                  clanPath + "/HMMs/HMM" + familyCounts.get(index) + ".hmm",
+                  false);
+
+          int count = familyCounts.get(index);
+          count++;
+          familyCounts.set(index, count);
       }
         line = clanFinder.readLine();
+
       }
       if (!inClan)
       {
@@ -654,14 +791,188 @@ public class HMMProbabilityDistributionAnalyser
         moveLocationBy(1, hmmReader);
       }
       filePos++;
-      System.out.println(filePos + " files read.");
+      // System.out.println(filePos + " files read.");
       line = clanFinder.readLine();
 
      }
     clanFinder.close();
+
+    for (int clan = 0; clan < clanCount; clan++)
+    {
+      PrintWriter writer = new PrintWriter(
+              directory + "/Clan" + clan + "/NumberOfFamilies.txt");
+      int count = familyCounts.get(clan);
+      writer.print(count);
+      writer.close();
+    }
       
     }
+
+  public String getFamilies()
+  {
+    return families;
+  }
+
+  public void setFamilies(String families)
+  {
+    this.families = currentFolder + families;
+  }
+
+  public String getHmms()
+  {
+    return hmms;
+  }
+
+  public void setHmms(String hmms)
+  {
+    this.hmms = currentFolder + hmms;
+  }
     
+  public void alignWithinClan(String exportLocation, String clansLocation)
+          throws IOException, InterruptedException
+  {
+    int alignmentsExported = 0;
+    for (int clan = 0; clan < 604; clan++)
+    {
+      System.out.println(clan);
+      int famCount = 0;
+      String clanPath = clansLocation + "/Clan" + clan;
+      int numberOfFamilies;
+      BufferedReader br = new BufferedReader(
+              new FileReader(clanPath + "/NumberOfFamilies.txt"));
+      String line = br.readLine();
+      numberOfFamilies = Integer.parseInt(line);
+      br.close();
+      if (numberOfFamilies == 1)
+      {
+        continue;
+      }
+      final String commandExportLocation = exportLocation + "/Clan" + clan;
+      createFolders(commandExportLocation);
+      for (int family = 0; family < numberOfFamilies; family++)
+      {
+        famCount++;
+        ArrayList<Integer> indexes = new ArrayList<>();
+        for (int i = 0; i < numberOfFamilies; i++)
+        {
+          if (i != family)
+          {
+            indexes.add(i);
+          }
+        }
+
+        int hmmIndex = getRandom(indexes);
+        String famPath = clanPath + "/Families/Fam" + family + ".sto";
+        String hmmPath = clanPath + "/HMMs/HMM" + hmmIndex + ".hmm";
+        String command = "/media/sf_Shared_Folder/hmmer/binaries/hmmalign --mapali "
+                + clanPath + "/Families/Fam" + hmmIndex + ".sto"
+                + " --trim ";
+        command += hmmPath + " ";
+        command += famPath;
+        final int familyIndex = family;
+        final Process p = Runtime.getRuntime().exec(command);
+
+        new Thread(new Runnable()
+        {
+          @Override
+          public void run()
+          {
+            BufferedReader input = new BufferedReader(
+                    new InputStreamReader(p.getInputStream()));
+            String line = null;
+
+            try
+            {
+              PrintWriter writer = new PrintWriter(commandExportLocation
+                      + "/Families/Fam" + familyIndex + ".sto");
+              String lastLine = "";
+              boolean dataFound = false;
+              while ((line = input.readLine()) != null)
+              {
+                if (line.contains("#=GR") && !dataFound)
+                {
+                  writer.println(lastLine);
+                  dataFound = true;
+                }
+                if (line.contains("#") || dataFound || " ".equals(line)
+                        || "".equals(line) || "//".equals(line))
+                {
+                  writer.println(line);
+                }
+                lastLine = line;
+              }
+              writer.close();
+            } catch (IOException e)
+            {
+              e.printStackTrace();
+            }
+          }
+        }).start();
+
+        p.waitFor();
+
+        BufferedReader hmmExporter = new BufferedReader(
+                new FileReader(hmmPath));
+
+        exportFile(hmmExporter,
+                commandExportLocation + "/HMMs/HMM" + family + ".hmm",
+                false);
+
+        alignmentsExported++;
+
+
+      }
+      PrintWriter writer = new PrintWriter(
+              commandExportLocation + "/NumberOfFamilies.txt");
+      writer.print(famCount);
+      writer.close();
+    }
+
+  }
+
+  public boolean atEnd(BufferedReader br) throws IOException
+  {
+    boolean end = false;
+    br.mark(80);
+    String line = br.readLine();
+    if ("".equals(line) || line == null)
+    {
+      end = true;
+    }
+    br.reset();
+    return end;
   }
 
+  public int getRandom(ArrayList<Integer> list)
+  {
+    if (!(list.size() > 0))
+    {
+      System.out.println("Error - size = " + list.size());
+    }
+    int index = generator.nextInt(list.size());
+    int value = list.get(index);
+    list.remove(index);
+    return value;
+  }
+
+  public void createFolders(String clanPath)
+  {
+    File clanFolder = new File(clanPath);
+    if (!clanFolder.exists())
+    {
+      clanFolder.mkdir();
+    }
+
+    File famFolder = new File(clanPath + "/Families");
+    File hmmFolder = new File(clanPath + "/HMMs");
+    if (!famFolder.exists())
+    {
+      famFolder.mkdir();
+      hmmFolder.mkdir();
+    }
+  }
+}
+
+
+