JAL-2599 HMMs can now be dropped onto the desktop
[jalview.git] / src / jalview / util / HMMProbabilityDistributionAnalyser.java
index 520c874..24b3bc3 100644 (file)
@@ -1,5 +1,6 @@
 package jalview.util;
 
+import jalview.datamodel.AlignmentAnnotation;
 import jalview.datamodel.HiddenMarkovModel;
 import jalview.datamodel.SequenceI;
 import jalview.io.DataSourceType;
@@ -34,6 +35,7 @@ import java.util.Vector;
  */
 public class HMMProbabilityDistributionAnalyser
 {
+  AlignmentAnnotation reference = null;
 
   Vector<SequenceI> sequences;
 
@@ -46,13 +48,13 @@ public class HMMProbabilityDistributionAnalyser
   Map<String, Double> binned = new HashMap<>();
 
   // location of the family file
-  String families = "H:/Desktop/PFAM/Family/SeedFamilies.seed";
+  String families = "/media/sf_Shared_Folder/PFAM/Family/SeedFamilies.seed";
 
   // location of the file containing the family-clan links
-  final static String FAMILIESTOCLAN = "H:/Desktop/PFAM/Family/Clanlinks.dat";
+  final static String FAMILIESTOCLAN = "/media/sf_Shared_Folder/PFAM/Family/Clanlinks.dat";
 
   // location of the HMM file
-  String hmms = "H:/Desktop/PFAM/HMMs/Pfam-A.hmm";
+  String hmms = "/media/sf_Shared_Folder/PFAM/HMMs/Pfam-A.hmm";
 
   // suffix for raw file
   final static String RAW = "/Raw.csv";
@@ -103,6 +105,7 @@ public class HMMProbabilityDistributionAnalyser
       while (!"//".equals(line))
       {
         line = br.readLine();
+
       }
     }
 
@@ -111,7 +114,8 @@ public class HMMProbabilityDistributionAnalyser
   /**
    * Analyses a specified number of families and then saves the data. Before
    * analysing the data, the previous saved data will be imported and after
-   * analysing this, the data is exported back into the file.
+   * analysing this, the data is exported back into the file. The file must be
+   * in flat file format.
    * 
    * @param increments
    *          The number of families to read before saving.
@@ -175,68 +179,84 @@ public class HMMProbabilityDistributionAnalyser
   /**
    * Analyses all families and then saves the data. Before analysing the data,
    * the previous saved data will be imported and after analysing this, the data
-   * is exported back into the file.
+   * is exported back into the file. The file must be in flat file format.
    * 
    * @param increments
    *          The number of families to read before saving.
    * @throws IOException
    */
-  public void runToEnd(boolean keepRawData) throws IOException
+  public void runToEnd(boolean keepRawData, boolean forClans)
+          throws IOException
   {
     keepRaw = keepRawData;
     BufferedReader inputSTO = null;
     BufferedReader inputHMM = null;
     int size = 0;
+    int files = 1;
     try
     {
-      readPreviousData(currentFolder);
-      BufferedReader posReader = new BufferedReader(
-              new FileReader(currentFolder + "/CurrentPosition.txt"));
-
-      String line = posReader.readLine();
-      posReader.close();
-      currentFilePosition = Integer.parseInt(line);
-      readPreviousData(currentFolder);
-      
-      inputSTO = new BufferedReader(new FileReader(families));
-      inputHMM = new BufferedReader(new FileReader(hmms));
-    } catch (Exception e)
+    if (forClans)
     {
-      System.out.println("No or incomplete previous data found");
+        files = 603;
     }
+    int filesRead = 0;
+    for (int clan = 0; clan < files; clan++)
+    {
+      System.out.println(clan);
+      String clanPath = "";
+      int numberOfFamilies = 0;
+      if (forClans)
+      {
+        clanPath = currentFolder + "/Clan" + clan;
+          if (!new File(clanPath).exists())
+          {
+            continue;
+          }
+        BufferedReader famCountReader = new BufferedReader(
+                new FileReader(clanPath + "/NumberOfFamilies.txt"));
+        numberOfFamilies = Integer.parseInt(famCountReader.readLine());
+      }
+      else
+      {
+        numberOfFamilies = 1;
+      }
+      
+      for (int fam = 0; fam < numberOfFamilies; fam++)
+      {
+        if (forClans)
+        {
+          families = clanPath + "/Families/Fam" + fam + ".sto";
+          hmms = clanPath + "/HMMs/HMM" + fam + ".hmm";
+        }
 
-    
+        inputSTO = new BufferedReader(new FileReader(families));
+        inputHMM = new BufferedReader(new FileReader(hmms));
 
-    moveLocationBy(currentFilePosition, inputHMM);
-    moveLocationBy(currentFilePosition, inputSTO);
 
-    int filesRead = 0;
-    int i = 0;
-    boolean endReached = atEnd(inputSTO);
-    while (!endReached)
-      {
-      readStockholm(inputSTO);
-      readHMM(inputHMM);
+        int i = 0;
+        boolean endReached = atEnd(inputSTO);
+        while (!endReached)
+        {
+          readStockholm(inputSTO);
+          readHMM(inputHMM);
 
         int count = countValidResidues();
         processData(count);
         filesRead++;
-
-        currentFilePosition++;
-        System.out.println(i);
-        i++;
+          System.out.println(filesRead);
       endReached = atEnd(inputSTO);
       }
-
-
-    PrintWriter p = new PrintWriter(
-            new File(currentFolder + "/CurrentPosition.txt"));
-    p.print(currentFilePosition);
-    p.close();
-    exportData(currentFolder);
-    raw.clear();
-    binned.clear();
-
+      }
+    }
+    } catch (Exception e)
+    {
+      e.printStackTrace();
+    } finally
+    {
+      exportData(currentFolder);
+      raw.clear();
+      binned.clear();
+    }
   }
 
   /**
@@ -376,7 +396,7 @@ public class HMMProbabilityDistributionAnalyser
       raw.add(new ArrayList<Double>());
       rawPos = raw.size() - 1;
     }
-
+    Double total = 0d;
     for (int width = 0; width < sequences.size(); width++)
     {
       for (int length = 1; length < hmm.getLength() + 1; length++)
@@ -394,6 +414,10 @@ public class HMMProbabilityDistributionAnalyser
           Double llr;
           prob = hmm.getMatchEmissionProbability(alignPos, symbol);
           bfreq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
+          if (prob == 0 || bfreq == 0)
+          {
+            System.out.println("error");
+          }
           llr = Math.log(prob / bfreq);
           if (keepRaw)
           {
@@ -402,6 +426,7 @@ public class HMMProbabilityDistributionAnalyser
 
           String output;
           output = String.format("%.1f", llr);
+          total += Double.parseDouble(output);
           if ("-0.0".equals(output))
           {
             output = "0.0";
@@ -420,6 +445,7 @@ public class HMMProbabilityDistributionAnalyser
         }
       }
     }
+    System.out.println(total / count);
   }
 
 
@@ -433,6 +459,15 @@ public class HMMProbabilityDistributionAnalyser
   {
     FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE);
     StockholmFile file = new StockholmFile(parserSTO);
+    Vector<AlignmentAnnotation> annots = file.getAnnotations();
+
+    for (AlignmentAnnotation annot : annots)
+    {
+      if (annot.label.contains("Reference"))
+      {
+        reference = annot;
+      }
+    }
     sequences = file.getSeqs();
   }
 
@@ -446,9 +481,9 @@ public class HMMProbabilityDistributionAnalyser
   {
     FileParse parserHMM = new FileParse(inputHMM, "", DataSourceType.FILE);
     HMMFile file = new HMMFile(parserHMM);
-    file.parse();
     hmm = file.getHMM();
 
+
   }
 
   /**
@@ -634,7 +669,7 @@ public class HMMProbabilityDistributionAnalyser
   }
 
   /**
-   * Prints the specified family to a .hmm file in the current directory.
+   * Prints the specified family to a .hmm file.
    * 
    * @param index
    * @throws IOException
@@ -688,6 +723,10 @@ public class HMMProbabilityDistributionAnalyser
     BufferedReader familyReader = new BufferedReader(
             new FileReader(families));
     BufferedReader hmmReader = new BufferedReader(new FileReader(hmms));
+    int families = 0;
+    // moveLocationBy(7000, familyReader);
+    // moveLocationBy(7000, clanFinder);
+    // moveLocationBy(7000, hmmReader);
     HashMap<String, Integer> clanIndexes = new HashMap<>();
     ArrayList<Integer> familyCounts = new ArrayList<>();
     int filePos = 0; 
@@ -704,6 +743,8 @@ public class HMMProbabilityDistributionAnalyser
        
       if (line.indexOf("#=GF CL") > -1)
       {
+          families++;
+          System.out.println(families);
           inClan = true;
         Scanner scanner = new Scanner(line);
         scanner.next();
@@ -745,7 +786,7 @@ public class HMMProbabilityDistributionAnalyser
         moveLocationBy(1, hmmReader);
       }
       filePos++;
-      System.out.println(filePos + " files read.");
+      // System.out.println(filePos + " files read.");
       line = clanFinder.readLine();
 
      }
@@ -788,6 +829,7 @@ public class HMMProbabilityDistributionAnalyser
     int alignmentsExported = 0;
     for (int clan = 0; clan < 604; clan++)
     {
+      System.out.println(clan);
       int famCount = 0;
       String clanPath = clansLocation + "/Clan" + clan;
       int numberOfFamilies;
@@ -796,7 +838,11 @@ public class HMMProbabilityDistributionAnalyser
       String line = br.readLine();
       numberOfFamilies = Integer.parseInt(line);
       br.close();
-      String commandExportLocation = exportLocation + "/Clan" + clan;
+      if (numberOfFamilies == 1)
+      {
+        continue;
+      }
+      final String commandExportLocation = exportLocation + "/Clan" + clan;
       createFolders(commandExportLocation);
       for (int family = 0; family < numberOfFamilies; family++)
       {
@@ -809,14 +855,16 @@ public class HMMProbabilityDistributionAnalyser
             indexes.add(i);
           }
         }
+
         int hmmIndex = getRandom(indexes);
         String famPath = clanPath + "/Families/Fam" + family + ".sto";
         String hmmPath = clanPath + "/HMMs/HMM" + hmmIndex + ".hmm";
-        String command = "H:/Documents/hmmalign -o " + commandExportLocation
-                + "/Fam" + family + ".sto ";
+        String command = "/media/sf_Shared_Folder/hmmer/binaries/hmmalign --mapali "
+                + clanPath + "/Families/Fam" + hmmIndex + ".sto"
+                + " --trim ";
         command += hmmPath + " ";
         command += famPath;
-
+        final int familyIndex = family;
         final Process p = Runtime.getRuntime().exec(command);
 
         new Thread(new Runnable()
@@ -830,10 +878,25 @@ public class HMMProbabilityDistributionAnalyser
 
             try
             {
+              PrintWriter writer = new PrintWriter(commandExportLocation
+                      + "/Families/Fam" + familyIndex + ".sto");
+              String lastLine = "";
+              boolean dataFound = false;
               while ((line = input.readLine()) != null)
               {
-                System.out.println(line);
+                if (line.contains("#=GR") && !dataFound)
+                {
+                  writer.println(lastLine);
+                  dataFound = true;
+                }
+                if (line.contains("#") || dataFound || " ".equals(line)
+                        || "".equals(line) || "//".equals(line))
+                {
+                  writer.println(line);
+                }
+                lastLine = line;
               }
+              writer.close();
             } catch (IOException e)
             {
               e.printStackTrace();
@@ -843,12 +906,15 @@ public class HMMProbabilityDistributionAnalyser
 
         p.waitFor();
 
-        exportHMM(hmmIndex,
-                commandExportLocation + "/HMMs/HMM" + family + ".hmm");
+        BufferedReader hmmExporter = new BufferedReader(
+                new FileReader(hmmPath));
+
+        exportFile(hmmExporter,
+                commandExportLocation + "/HMMs/HMM" + family + ".hmm",
+                false);
 
         alignmentsExported++;
 
-        System.out.println(alignmentsExported + " alignments exported");
 
       }
       PrintWriter writer = new PrintWriter(
@@ -874,6 +940,10 @@ public class HMMProbabilityDistributionAnalyser
 
   public int getRandom(ArrayList<Integer> list)
   {
+    if (!(list.size() > 0))
+    {
+      System.out.println("Error - size = " + list.size());
+    }
     int index = generator.nextInt(list.size());
     int value = list.get(index);
     list.remove(index);