JAL-2616 add ability to sort Pfam families and HMMs into clans
author TZVanaalten <TZVanaalten@LS30916.ad.lifesci.dundee.ac.uk>
Fri, 21 Jul 2017 13:27:41 +0000 (14:27 +0100)
committer TZVanaalten <TZVanaalten@LS30916.ad.lifesci.dundee.ac.uk>
Fri, 21 Jul 2017 13:27:41 +0000 (14:27 +0100)
src/jalview/util/HMMProbabilityDistributionAnalyser.java
src/jalview/util/ProbabilityAnalyserKickstarter.java
test/jalview/util/HMMProbabilityDistributionAnalyserTest.java

index 1fc178a..b30487d 100644 (file)
@@ -11,6 +11,7 @@ import jalview.schemes.ResidueProperties;
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.PrintWriter;
@@ -43,7 +44,10 @@ public class HMMProbabilityDistributionAnalyser
   Map<String, Double> binned = new HashMap<>();
 
   // location of the family file
-  final static String FAMILIES = "C:/Users/TZVanaalten/Pfam-A.full";
+  final static String FAMILIES = "H:/Desktop/PFAM/Family/SeedFamilies.seed";
+
+  // location of the file containing the family-clan links
+  final static String FAMILIESTOCLAN = "H:/Desktop/PFAM/Family/Clanlinks.dat";
 
   // location of the HMM file
   final static String HMMS = "H:/Desktop/PFAM/HMMs/Pfam-A.hmm";
@@ -65,6 +69,8 @@ public class HMMProbabilityDistributionAnalyser
   // current directory
   String currentFolder;
 
+  boolean keepRaw = false;
+
   /**
    * Sets the working directory.
    * 
@@ -76,15 +82,16 @@ public class HMMProbabilityDistributionAnalyser
   }
 
   /**
-   * Moves a buffered reader to a specific location in the file, delimited by
-   * '//'.
+   * Moves a buffered reader forward in the file by a given number of entries.
+   * Each entry in the file is delimited by '//'.
    * 
    * @param index
    *          The index of the location in the file.
    * @param br
    * @throws IOException
    */
-  public void moveToFile(int index, BufferedReader br) throws IOException
+  public void moveLocationBy(int index, BufferedReader br)
+          throws IOException
   {
     for (int i = 0; i < index; i++)
     {
@@ -106,28 +113,29 @@ public class HMMProbabilityDistributionAnalyser
    *          The number of families to read before saving.
    * @throws IOException
    */
-  public void run(int increments) throws IOException
+  public void run(int increments, boolean keepRawData) throws IOException
   {
-
+    keepRaw = keepRawData;
     readPreviousData(currentFolder);
 
     BufferedReader posReader = new BufferedReader(
             new FileReader(currentFolder + "/CurrentPosition.txt"));
     String line = posReader.readLine();
     posReader.close();
-    currentFilePosition = Integer.parseInt(line);
 
-    BufferedReader inputSTO = new BufferedReader(
-            new FileReader(FAMILIES));
-    BufferedReader inputHMM = new BufferedReader(
-            new FileReader(HMMS));
+    BufferedReader inputSTO = new BufferedReader(new FileReader(FAMILIES));
+    BufferedReader inputHMM = new BufferedReader(new FileReader(HMMS));
+
+    currentFilePosition = Integer.parseInt(line);
 
-    moveToFile(currentFilePosition, inputHMM);
-    moveToFile(currentFilePosition, inputSTO);
+    moveLocationBy(currentFilePosition, inputHMM);
+    moveLocationBy(currentFilePosition, inputSTO);
 
     int filesRead = 0;
+    int i = 0;
     while (filesRead < increments)
     {
+
       FileParse parserSTO = new FileParse(inputSTO, "",
               DataSourceType.FILE);
       readStockholm(parserSTO);
@@ -136,17 +144,17 @@ public class HMMProbabilityDistributionAnalyser
               DataSourceType.FILE);
       readHMM(parserHMM);
 
-      if (hmm.getAlphabetType().equals("amino"))
-      {
         int count = countValidResidues();
         processData(count);
         filesRead++;
-      }
+
       currentFilePosition++;
+      System.out.println(i);
+      i++;
     }
 
     PrintWriter p = new PrintWriter(
-            new File(currentFolder + "/CurrentPosition"));
+            new File(currentFolder + "/CurrentPosition.txt"));
     p.print(currentFilePosition);
     p.close();
     exportData(currentFolder);
@@ -164,7 +172,10 @@ public class HMMProbabilityDistributionAnalyser
   public void readPreviousData(String source) throws IOException
   {
     readBinned(source);
-    readRaw(source);
+    if (keepRaw)
+    {
+      readRaw(source);
+    }
   }
 
   /**
@@ -178,9 +189,9 @@ public class HMMProbabilityDistributionAnalyser
     BufferedReader input = new BufferedReader(
             new FileReader(source + BINNED));
     String line = input.readLine();
+    binned = new HashMap<>();
     while (!("".equals(line) || line == null))
     {
-      binned = new HashMap<>();
       Scanner scanner = new Scanner(line);
       scanner.useDelimiter(",");
       binned.put(scanner.next(), scanner.nextDouble());
@@ -277,9 +288,13 @@ public class HMMProbabilityDistributionAnalyser
    */
   public void processData(int count)
   {
+    int rawPos = 0;
+    if (keepRaw)
+    {
+      raw.add(new ArrayList<Double>());
+      rawPos = raw.size() - 1;
+    }
 
-    raw.add(new ArrayList<Double>());
-    int rawPos = raw.size() - 1;
     for (int width = 0; width < sequences.size(); width++)
     {
       for (int length = 1; length < hmm.getLength(); length++)
@@ -287,19 +302,22 @@ public class HMMProbabilityDistributionAnalyser
         char symbol;
         int alignPos;
         alignPos = hmm.getNodeAlignmentColumn(length);
-
+        
         symbol = sequences.get(width).getCharAt(alignPos);
         if (ResidueProperties.aminoBackgroundFrequencies
                 .containsKey(symbol))
         {
-
           Double prob;
           Float bfreq;
           Double llr;
           prob = hmm.getMatchEmissionProbability(alignPos, symbol);
           bfreq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
           llr = Math.log(prob / bfreq);
-          raw.get(rawPos).add(llr);
+          if (keepRaw)
+          {
+            raw.get(rawPos).add(llr);
+          }
+
           String output;
           output = String.format("%.1f", llr);
           if ("-0.0".equals(output))
@@ -332,7 +350,6 @@ public class HMMProbabilityDistributionAnalyser
   public void readStockholm(FileParse source) throws IOException
   {
     StockholmFile file = new StockholmFile(source);
-    file.parse();
     sequences = file.getSeqs();
   }
 
@@ -365,18 +382,20 @@ public class HMMProbabilityDistributionAnalyser
       writerBin.println(entry.getKey() + "," + entry.getValue());
     }
     writerBin.close();
+    if (keepRaw)
+    {
 
     PrintWriter writerRaw = new PrintWriter(new File(location + RAW));
-
+    
     StringBuilder identifier = new StringBuilder();
-
+    
     for (int i = 1; i < raw.size() + 1; i++)
     {
       identifier.append("Fam " + i + ",");
     }
-
+    
     writerRaw.println(identifier);
-
+    
     boolean rowIsEmpty = false;
     int row = 0;
     while (!rowIsEmpty)
@@ -400,6 +419,8 @@ public class HMMProbabilityDistributionAnalyser
     }
     writerRaw.close();
 
+    }
+
   }
 
   /**
@@ -412,7 +433,7 @@ public class HMMProbabilityDistributionAnalyser
   {
     BufferedReader br = new BufferedReader(new FileReader(FAMILIES));
 
-    moveToFile(index, br);
+    moveLocationBy(index, br);
 
     String line = br.readLine();
 
@@ -436,7 +457,7 @@ public class HMMProbabilityDistributionAnalyser
   {
     BufferedReader br = new BufferedReader(new FileReader(HMMS));
 
-    moveToFile(index, br);
+    moveLocationBy(index, br);
 
     String line = br.readLine();
 
@@ -451,35 +472,20 @@ public class HMMProbabilityDistributionAnalyser
   }
 
   /**
-   * Prints the specified family to a .sto file in the current directory.
+   * Prints the specified family to a .sto file.
    * 
    * @param index
    * @throws IOException
    */
-  public void printFamToFile(int index) throws IOException
+  public void exportFam(int index, String location) throws IOException
   {
-    String name;
-
-    BufferedReader nameFinder = new BufferedReader(
-            new FileReader(FAMILIES));
-
-    moveToFile(index, nameFinder);
-
-    nameFinder.readLine();
-
-    Scanner scanner = new Scanner(nameFinder.readLine());
-    scanner.next();
-    scanner.next();
-    name = scanner.next();
-    scanner.close();
-
     BufferedReader br = new BufferedReader(new FileReader(FAMILIES));
 
-    moveToFile(index, br);
+    moveLocationBy(index, br);
 
     String line = br.readLine();
     PrintWriter writer = new PrintWriter(
-            currentFolder + "/" + name + ".sto");
+            new FileOutputStream(new File(location), true));
     while (!"//".equals(line))
     {
       writer.println(line);
@@ -491,36 +497,77 @@ public class HMMProbabilityDistributionAnalyser
 
   }
 
-  /**
-   * Prints the specified family to a .hmm file in the current directory.
-   * 
-   * @param index
-   * @throws IOException
-   */
-  public void printHMMToFile(int index) throws IOException
+  /**
+   * Appends the next entry read from the reader to the file at the given
+   * location. Lines are copied up to and including the '//' delimiter line. If
+   * the input ends before a delimiter is found, copying stops at the end of the
+   * input.
+   * 
+   * @param br
+   *          The reader to copy the entry from.
+   * @param location
+   *          The path of the file to append to.
+   * @throws IOException
+   */
+  public void exportFile(BufferedReader br, String location)
+          throws IOException
   {
+    String line = br.readLine();
+    // try-with-resources closes the writer even if reading fails part-way
+    try (PrintWriter writer = new PrintWriter(
+            new FileOutputStream(new File(location), true)))
+    {
+      // readLine() returns null at the end of the input; without this check a
+      // missing '//' would make the loop print "null" forever
+      while (line != null)
+      {
+        writer.println(line);
+        if ("//".equals(line))
+        {
+          break;
+        }
+        line = br.readLine();
+      }
+    }
+  }
 
+  public String getHMMName(int index) throws IOException
+  {
     String name;
 
     BufferedReader nameFinder = new BufferedReader(new FileReader(HMMS));
 
-    moveToFile(index, nameFinder);
+    moveLocationBy(index, nameFinder);
+
+    nameFinder.readLine();
+
+    Scanner scanner = new Scanner(nameFinder.readLine());
+    name = scanner.next();
+    name = scanner.next();
+    scanner.close();
+    return name;
+  }
+
+  public String getFamilyName(int index) throws IOException
+  {
+    String name;
+
+    BufferedReader nameFinder = new BufferedReader(
+            new FileReader(FAMILIES));
+
+    moveLocationBy(index, nameFinder);
 
     nameFinder.readLine();
 
     Scanner scanner = new Scanner(nameFinder.readLine());
     name = scanner.next();
     name = scanner.next();
+    name = scanner.next();
     scanner.close();
+    return name;
+  }
+
+  /**
+   * Appends the HMM at the specified index to the file at the given location.
+   * 
+   * @param index
+   * @throws IOException
+   */
+  public void exportHMM(int index, String location) throws IOException
+  {
+
 
     BufferedReader br = new BufferedReader(new FileReader(HMMS));
 
-    moveToFile(index, br);
+    moveLocationBy(index, br);
 
     String line = br.readLine();
 
     PrintWriter writer = new PrintWriter(
-            currentFolder + "/" + name + ".hmm");
+            new FileOutputStream(new File(location), true));
     while (!"//".equals(line))
     {
       writer.println(line);
@@ -552,4 +599,69 @@ public class HMMProbabilityDistributionAnalyser
     raw.close();
   }
 
-}
+  /**
+   * Sorts the families and HMMs into clans. The family-clan links file is read
+   * entry by entry (entries are delimited by '//'), in step with the family and
+   * HMM files. If an entry contains a '#=GF CL' line, the corresponding family
+   * and HMM are appended to Families.sto and HMMs.hmm in that clan's folder
+   * (Clan0, Clan1, ... inside the given directory, numbered in order of first
+   * appearance). Otherwise the family and HMM are skipped.
+   * 
+   * @param directory
+   *          The directory in which the clan folders are created.
+   * @throws IOException
+   */
+  public void sortIntoClans(String directory) throws IOException
+  {
+    Map<String, Integer> clanIndexes = new HashMap<>();
+    int filePos = 0;
+
+    // try-with-resources closes all three readers, even if an export fails
+    try (BufferedReader clanFinder = new BufferedReader(
+            new FileReader(FAMILIESTOCLAN));
+            BufferedReader familyReader = new BufferedReader(
+                    new FileReader(FAMILIES));
+            BufferedReader hmmReader = new BufferedReader(
+                    new FileReader(HMMS)))
+    {
+      String line = clanFinder.readLine();
+      while (line != null && !"".equals(line) && !" ".equals(line))
+      {
+        boolean inClan = false;
+
+        // a null line here means the links file ended before the closing '//'
+        while (line != null && !line.contains("//"))
+        {
+          if (line.contains("#=GF CL"))
+          {
+            inClan = true;
+
+            // the clan name is the third token of the '#=GF CL' line
+            Scanner scanner = new Scanner(line);
+            scanner.next();
+            scanner.next();
+            String clanName = scanner.next();
+            scanner.close();
+
+            // clans are numbered in the order they are first encountered
+            if (!clanIndexes.containsKey(clanName))
+            {
+              clanIndexes.put(clanName, clanIndexes.size());
+            }
+
+            String clanPath = directory + "/Clan"
+                    + clanIndexes.get(clanName);
+            File clanFolder = new File(clanPath);
+            if (!clanFolder.exists())
+            {
+              // mkdirs also creates any missing parent directories
+              clanFolder.mkdirs();
+            }
+            exportFile(familyReader, clanPath + "/Families.sto");
+            exportFile(hmmReader, clanPath + "/HMMs.hmm");
+          }
+          line = clanFinder.readLine();
+        }
+
+        if (!inClan)
+        {
+          // keep the family and HMM readers in step with the links file
+          moveLocationBy(1, familyReader);
+          moveLocationBy(1, hmmReader);
+        }
+        filePos++;
+        System.out.println(filePos + " files read.");
+        line = clanFinder.readLine();
+      }
+    }
+  }
+    
+  }
+
+
index 9eea470..86d9177 100644 (file)
@@ -32,39 +32,61 @@ public class ProbabilityAnalyserKickstarter
       // prints family to console. Syntax is printFam <index>
       if (command.indexOf("printFam") > -1)
       {
-        inputScanner.next();
-        int index = inputScanner.nextInt();
-        analyser.printFam(index);
-        continue;
+        try
+        {
+          inputScanner.next();
+          int index = inputScanner.nextInt();
+          analyser.printFam(index);
+          continue;
+        } catch (Exception e)
+        {
+          System.out.println("Command failed");
+        }
+
       }
       // prints HMM to console. Syntax is printHMM <index>
       if (command.indexOf("printHMM") > -1)
       {
-
+        try
+        {
         inputScanner.next();
         int index = inputScanner.nextInt();
         analyser.printHMM(index);
         continue;
+        } catch (Exception e)
+        {
+          System.out.println("Command failed");
+        }
       }
       // prints family to file in current folder. Syntax is exportFam <index>.
       if (command.indexOf("exportFam") > -1)
       {
-
+        try
+        {
         inputScanner.next();
         int index = inputScanner.nextInt();
-        String location = inputScanner.next();
-        analyser.printFamToFile(index);
+          String location = inputScanner.next();
+          analyser.exportFam(index, location);
         continue;
+        } catch (Exception e)
+        {
+          System.out.println("Command failed");
+        }
       }
       // prints HMM to file in current folder. Syntax is exportHMM <index>.
       if (command.indexOf("exportHMM") > -1)
       {
-
+        try
+        {
         inputScanner.next();
         int index = inputScanner.nextInt();
-        String location = inputScanner.next();
-        analyser.printHMMToFile(index);
+          String location = inputScanner.next();
+          analyser.exportHMM(index, location);
         continue;
+        } catch (Exception e)
+        {
+          System.out.println("Command failed");
+        }
       }
       // Processes data. Syntax is run <number of loops> <increments>. The
       // number loops specifies the number of increments the program will run.
@@ -74,16 +96,26 @@ public class ProbabilityAnalyserKickstarter
       // increment is the number of families read per 'save'.
       if (command.indexOf("run") > -1)
       {
+        try
+        {
+
         inputScanner.next();
 
         int loops = inputScanner.nextInt();
         int increments = inputScanner.nextInt();
+        boolean keepRaw = inputScanner.nextBoolean();
 
         for (int i = 0; i < loops; i++)
         {
-          analyser.run(increments);
+          analyser.run(increments, keepRaw);
+          System.out.println("Saved");
         }
+        System.out.println("Task completed");
         continue;
+        } catch (Exception e)
+        {
+          System.out.println("Command failed");
+        }
       }
       // terminates program. Syntax is terminate.
       if (command.indexOf("terminate") > -1)
@@ -101,11 +133,36 @@ public class ProbabilityAnalyserKickstarter
       // changes current directory. Syntax is cd <directory>
       if (command.indexOf("cd") > -1)
       {
+        try
+        {
         inputScanner.next();
         analyser.setFolder(inputScanner.next());
+        } catch (Exception e)
+        {
+          System.out.println("Command failed");
+        }
+      }
+
+      if (command.indexOf("getFamName") > -1)
+      {
+        try
+        {
+        inputScanner.next();
+        System.out.println(analyser.getFamilyName(inputScanner.nextInt()));
+          inputScanner.close();
+          continue;
+        } catch (Exception e)
+        {
+          System.out.println("Command failed");
+        }
+      }
+      if (command.indexOf("sortIntoClans") > -1)
+      {
+        inputScanner.next();
+        analyser.sortIntoClans(inputScanner.next());
+          continue;
+
       }
-      inputScanner.close();
-      continue;
     }
 
 
index 04c7890..60f6c89 100644 (file)
@@ -1,9 +1,48 @@
 package jalview.util;
 
+import static org.testng.Assert.assertEquals;
+
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceI;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+
 import org.testng.annotations.Test;
 
 public class HMMProbabilityDistributionAnalyserTest {
+
+  HMMProbabilityDistributionAnalyser analyser = new HMMProbabilityDistributionAnalyser();
+
+  /**
+   * Checks that moveLocationBy skips the requested number of '//'-delimited
+   * entries, leaving the reader at the first line of the following entry.
+   * 
+   * @throws IOException
+   */
+  @Test
+  public void testMoveToFile() throws IOException
+  {
+    // try-with-resources so the test file is closed even if an assertion fails
+    try (BufferedReader br = new BufferedReader(new FileReader(
+            "test/utils/test_Fams_for_probability_analysis.txt")))
+    {
+      analyser.moveLocationBy(2, br);
+
+      assertEquals(br.readLine(), "# STOCKHOLM 1.0");
+      assertEquals(br.readLine(), "seq1 ATW");
+      assertEquals(br.readLine(), "seq2 ATI");
+    }
+  }
+
   @Test
-  public void f() {
+  public void testCountValidResidues()
+  {
+    SequenceI[] sequence = new Sequence[] {
+        new Sequence("seq1", "ATGWWSCF"), new Sequence("seq1", "GGWMMKI"),
+        new Sequence("seq1", "--.ATccc") };
+    analyser.sequences.add(sequence[0]);
+    analyser.sequences.add(sequence[1]);
+    analyser.sequences.add(sequence[2]);
+
+    int count = analyser.countValidResidues();
+    assertEquals(count, 17);
   }
 }