From: TZVanaalten Date: Fri, 21 Jul 2017 13:27:41 +0000 (+0100) Subject: JAL-2616 add ability to sort Pfam families and HMMs into clans X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=066fc9951453a55eea27900f41e1b0a5b59dc57c;p=jalview.git JAL-2616 add ability to sort Pfam families and HMMs into clans --- diff --git a/src/jalview/util/HMMProbabilityDistributionAnalyser.java b/src/jalview/util/HMMProbabilityDistributionAnalyser.java index 1fc178a..b30487d 100644 --- a/src/jalview/util/HMMProbabilityDistributionAnalyser.java +++ b/src/jalview/util/HMMProbabilityDistributionAnalyser.java @@ -11,6 +11,7 @@ import jalview.schemes.ResidueProperties; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; @@ -43,7 +44,10 @@ public class HMMProbabilityDistributionAnalyser Map binned = new HashMap<>(); // location of the family file - final static String FAMILIES = "C:/Users/TZVanaalten/Pfam-A.full"; + final static String FAMILIES = "H:/Desktop/PFAM/Family/SeedFamilies.seed"; + + // location of the file containing the family-clan links + final static String FAMILIESTOCLAN = "H:/Desktop/PFAM/Family/Clanlinks.dat"; // location of the HMM file final static String HMMS = "H:/Desktop/PFAM/HMMs/Pfam-A.hmm"; @@ -65,6 +69,8 @@ public class HMMProbabilityDistributionAnalyser // current directory String currentFolder; + boolean keepRaw = false; + /** * Sets the working directory. * @@ -76,15 +82,16 @@ public class HMMProbabilityDistributionAnalyser } /** - * Moves a buffered reader to a specific location in the file, delimited by - * '//'. + * Moves a buffered reader forward in the file by a certain amount of entries. + * Each entry in the file is delimited by '//'. * * @param index * The index of the location in the file. * @param br * @throws IOException */ - public void moveToFile(int index, BufferedReader br) throws IOException + public void moveLocationBy(int index, BufferedReader br) + throws IOException { for (int i = 0; i < index; i++) { @@ -106,28 +113,29 @@ public class HMMProbabilityDistributionAnalyser * The number of families to read before saving. * @throws IOException */ - public void run(int increments) throws IOException + public void run(int increments, boolean keepRawData) throws IOException { - + keepRaw = keepRawData; readPreviousData(currentFolder); BufferedReader posReader = new BufferedReader( new FileReader(currentFolder + "/CurrentPosition.txt")); String line = posReader.readLine(); posReader.close(); - currentFilePosition = Integer.parseInt(line); - BufferedReader inputSTO = new BufferedReader( - new FileReader(FAMILIES)); - BufferedReader inputHMM = new BufferedReader( - new FileReader(HMMS)); + BufferedReader inputSTO = new BufferedReader(new FileReader(FAMILIES)); + BufferedReader inputHMM = new BufferedReader(new FileReader(HMMS)); + + currentFilePosition = Integer.parseInt(line); - moveToFile(currentFilePosition, inputHMM); - moveToFile(currentFilePosition, inputSTO); + moveLocationBy(currentFilePosition, inputHMM); + moveLocationBy(currentFilePosition, inputSTO); int filesRead = 0; + int i = 0; while (filesRead < increments) { + FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE); readStockholm(parserSTO); @@ -136,17 +144,17 @@ public class HMMProbabilityDistributionAnalyser DataSourceType.FILE); readHMM(parserHMM); - if (hmm.getAlphabetType().equals("amino")) - { int count = countValidResidues(); processData(count); filesRead++; - } + currentFilePosition++; + System.out.println(i); + i++; } PrintWriter p = new PrintWriter( - new File(currentFolder + "/CurrentPosition")); + new File(currentFolder + "/CurrentPosition.txt")); p.print(currentFilePosition); p.close(); exportData(currentFolder); @@ -164,7 +172,10 @@ public class HMMProbabilityDistributionAnalyser public void readPreviousData(String source) throws IOException { readBinned(source); - readRaw(source); + if (keepRaw) + { + readRaw(source); + } } /** @@ -178,9 +189,9 @@ public class HMMProbabilityDistributionAnalyser BufferedReader input = new BufferedReader( new FileReader(source + BINNED)); String line = input.readLine(); + binned = new HashMap<>(); while (!("".equals(line) || line == null)) { - binned = new HashMap<>(); Scanner scanner = new Scanner(line); scanner.useDelimiter(","); binned.put(scanner.next(), scanner.nextDouble()); @@ -277,9 +288,13 @@ public class HMMProbabilityDistributionAnalyser */ public void processData(int count) { + int rawPos = 0; + if (keepRaw) + { + raw.add(new ArrayList()); + rawPos = raw.size() - 1; + } - raw.add(new ArrayList()); - int rawPos = raw.size() - 1; for (int width = 0; width < sequences.size(); width++) { for (int length = 1; length < hmm.getLength(); length++) @@ -287,19 +302,22 @@ public class HMMProbabilityDistributionAnalyser char symbol; int alignPos; alignPos = hmm.getNodeAlignmentColumn(length); - + symbol = sequences.get(width).getCharAt(alignPos); if (ResidueProperties.aminoBackgroundFrequencies .containsKey(symbol)) { - Double prob; Float bfreq; Double llr; prob = hmm.getMatchEmissionProbability(alignPos, symbol); bfreq = ResidueProperties.aminoBackgroundFrequencies.get(symbol); llr = Math.log(prob / bfreq); - raw.get(rawPos).add(llr); + if (keepRaw) + { + raw.get(rawPos).add(llr); + } + String output; output = String.format("%.1f", llr); if ("-0.0".equals(output)) @@ -332,7 +350,6 @@ public class HMMProbabilityDistributionAnalyser public void readStockholm(FileParse source) throws IOException { StockholmFile file = new StockholmFile(source); - file.parse(); sequences = file.getSeqs(); } @@ -365,18 +382,20 @@ public class HMMProbabilityDistributionAnalyser writerBin.println(entry.getKey() + "," + entry.getValue()); } writerBin.close(); + if (keepRaw) + { PrintWriter writerRaw = new PrintWriter(new File(location + RAW)); - + StringBuilder identifier = new StringBuilder(); - + for (int i = 1; i < raw.size() + 1; i++) { identifier.append("Fam " + i + ","); } - + writerRaw.println(identifier); - + boolean rowIsEmpty = false; int row = 0; while (!rowIsEmpty) @@ -400,6 +419,8 @@ public class HMMProbabilityDistributionAnalyser } writerRaw.close(); + } + } /** @@ -412,7 +433,7 @@ public class HMMProbabilityDistributionAnalyser { BufferedReader br = new BufferedReader(new FileReader(FAMILIES)); - moveToFile(index, br); + moveLocationBy(index, br); String line = br.readLine(); @@ -436,7 +457,7 @@ public class HMMProbabilityDistributionAnalyser { BufferedReader br = new BufferedReader(new FileReader(HMMS)); - moveToFile(index, br); + moveLocationBy(index, br); String line = br.readLine(); @@ -451,35 +472,20 @@ public class HMMProbabilityDistributionAnalyser } /** - * Prints the specified family to a .sto file in the current directory. + * Prints the specified family to a .sto file. * * @param index * @throws IOException */ - public void printFamToFile(int index) throws IOException + public void exportFam(int index, String location) throws IOException { - String name; - - BufferedReader nameFinder = new BufferedReader( - new FileReader(FAMILIES)); - - moveToFile(index, nameFinder); - - nameFinder.readLine(); - - Scanner scanner = new Scanner(nameFinder.readLine()); - scanner.next(); - scanner.next(); - name = scanner.next(); - scanner.close(); - BufferedReader br = new BufferedReader(new FileReader(FAMILIES)); - moveToFile(index, br); + moveLocationBy(index, br); String line = br.readLine(); PrintWriter writer = new PrintWriter( - currentFolder + "/" + name + ".sto"); + new FileOutputStream(new File(location), true)); while (!"//".equals(line)) { writer.println(line); @@ -491,36 +497,77 @@ public class HMMProbabilityDistributionAnalyser } - /** - * Prints the specified family to a .hmm file in the current directory. - * - * @param index - * @throws IOException - */ - public void printHMMToFile(int index) throws IOException + public void exportFile(BufferedReader br, String location) + throws IOException { + String line = br.readLine(); + PrintWriter writer = new PrintWriter( + new FileOutputStream(new File(location), true)); + while (!"//".equals(line)) + { + writer.println(line); + line = br.readLine(); + } + writer.println(line); + writer.close(); + + + } + public String getHMMName(int index) throws IOException + { String name; BufferedReader nameFinder = new BufferedReader(new FileReader(HMMS)); - moveToFile(index, nameFinder); + moveLocationBy(index, nameFinder); + + nameFinder.readLine(); + + Scanner scanner = new Scanner(nameFinder.readLine()); + name = scanner.next(); + name = scanner.next(); + scanner.close(); + return name; + } + + public String getFamilyName(int index) throws IOException + { + String name; + + BufferedReader nameFinder = new BufferedReader( + new FileReader(FAMILIES)); + + moveLocationBy(index, nameFinder); nameFinder.readLine(); Scanner scanner = new Scanner(nameFinder.readLine()); name = scanner.next(); name = scanner.next(); + name = scanner.next(); scanner.close(); + return name; + } + + /** + * Prints the specified family to a .hmm file in the current directory. + * + * @param index + * @throws IOException + */ + public void exportHMM(int index, String location) throws IOException + { + BufferedReader br = new BufferedReader(new FileReader(HMMS)); - moveToFile(index, br); + moveLocationBy(index, br); String line = br.readLine(); PrintWriter writer = new PrintWriter( - currentFolder + "/" + name + ".hmm"); + new FileOutputStream(new File(location), true)); while (!"//".equals(line)) { writer.println(line); @@ -552,4 +599,69 @@ public class HMMProbabilityDistributionAnalyser raw.close(); } -} + public void sortIntoClans(String directory) throws IOException + { + BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN)); + BufferedReader familyReader = new BufferedReader( + new FileReader(FAMILIES)); + BufferedReader hmmReader = new BufferedReader(new FileReader(HMMS)); + HashMap clanIndexes = new HashMap<>(); + int filePos = 0; + int clanCount = 0; + String line; + line = clanFinder.readLine(); + + while (!"".equals(line) && !" ".equals(line) && line != null) + { + String clanName; + boolean inClan = false; + while (!(line.indexOf("//") > -1)) + { + + if (line.indexOf("#=GF CL") > -1) + { + inClan = true; + Scanner scanner = new Scanner(line); + scanner.next(); + scanner.next(); + clanName = scanner.next(); + scanner.close(); + + if (!clanIndexes.containsKey(clanName)) + { + clanIndexes.put(clanName, clanCount); + clanCount++; + } + + Integer clanI = clanIndexes.get(clanName); + String clanPath = directory + "/Clan" + clanI.toString(); + File clanFolder = new File(clanPath); + String famPath = clanPath + "/Families.sto"; + String hmmPath = clanPath + "/HMMs.hmm"; + if (!clanFolder.exists()) + { + clanFolder.mkdir(); + } + exportFile(familyReader, famPath); + exportFile(hmmReader, hmmPath); + + } + line = clanFinder.readLine(); + } + if (!inClan) + { + moveLocationBy(1, familyReader); + moveLocationBy(1, hmmReader); + } + filePos++; + System.out.println(filePos + " files read."); + line = clanFinder.readLine(); + + } + clanFinder.close(); + + } + + } + + diff --git a/src/jalview/util/ProbabilityAnalyserKickstarter.java b/src/jalview/util/ProbabilityAnalyserKickstarter.java index 9eea470..86d9177 100644 --- a/src/jalview/util/ProbabilityAnalyserKickstarter.java +++ b/src/jalview/util/ProbabilityAnalyserKickstarter.java @@ -32,39 +32,61 @@ public class ProbabilityAnalyserKickstarter // prints family to console. Syntax is printFam if (command.indexOf("printFam") > -1) { - inputScanner.next(); - int index = inputScanner.nextInt(); - analyser.printFam(index); - continue; + try + { + inputScanner.next(); + int index = inputScanner.nextInt(); + analyser.printFam(index); + continue; + } catch (Exception e) + { + System.out.println("Command failed"); + } + } // prints HMM to console. Syntax is printHMM if (command.indexOf("printHMM") > -1) { - + try + { inputScanner.next(); int index = inputScanner.nextInt(); analyser.printHMM(index); continue; + } catch (Exception e) + { + System.out.println("Command failed"); + } } // prints family to file in current folder. Syntax is exportFam . if (command.indexOf("exportFam") > -1) { - + try + { inputScanner.next(); int index = inputScanner.nextInt(); - String location = inputScanner.next(); - analyser.printFamToFile(index); + String location = inputScanner.next(); + analyser.exportFam(index, location); continue; + } catch (Exception e) + { + System.out.println("Command failed"); + } } // prints HMM to file in current folder. Syntax is exportHMM . if (command.indexOf("exportHMM") > -1) { - + try + { inputScanner.next(); int index = inputScanner.nextInt(); - String location = inputScanner.next(); - analyser.printHMMToFile(index); + String location = inputScanner.next(); + analyser.exportHMM(index, location); continue; + } catch (Exception e) + { + System.out.println("Command failed"); + } } // Processes data. Syntax is run . The // number loops specifies the number of increments the program will run. @@ -74,16 +96,26 @@ public class ProbabilityAnalyserKickstarter // increment is the number of families read per 'save'. if (command.indexOf("run") > -1) { + try + { + inputScanner.next(); int loops = inputScanner.nextInt(); int increments = inputScanner.nextInt(); + boolean keepRaw = inputScanner.nextBoolean(); for (int i = 0; i < loops; i++) { - analyser.run(increments); + analyser.run(increments, keepRaw); + System.out.println("Saved"); } + System.out.println("Task completed"); continue; + } catch (Exception e) + { + System.out.println("Command failed"); + } } // terminates program. Syntax is terminate. if (command.indexOf("terminate") > -1) @@ -101,11 +133,36 @@ public class ProbabilityAnalyserKickstarter // changes current directory. Syntax is cd if (command.indexOf("cd") > -1) { + try + { inputScanner.next(); analyser.setFolder(inputScanner.next()); + } catch (Exception e) + { + System.out.println("Command failed"); + } + } + + if (command.indexOf("getFamName") > -1) + { + try + { + inputScanner.next(); + System.out.println(analyser.getFamilyName(inputScanner.nextInt())); + inputScanner.close(); + continue; + } catch (Exception e) + { + System.out.println("Command failed"); + } + } + if (command.indexOf("sortIntoClans") > -1) + { + inputScanner.next(); + analyser.sortIntoClans(inputScanner.next()); + continue; + } - inputScanner.close(); - continue; } diff --git a/test/jalview/util/HMMProbabilityDistributionAnalyserTest.java b/test/jalview/util/HMMProbabilityDistributionAnalyserTest.java index 04c7890..60f6c89 100644 --- a/test/jalview/util/HMMProbabilityDistributionAnalyserTest.java +++ b/test/jalview/util/HMMProbabilityDistributionAnalyserTest.java @@ -1,9 +1,48 @@ package jalview.util; +import static org.testng.Assert.assertEquals; + +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceI; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; + import org.testng.annotations.Test; public class HMMProbabilityDistributionAnalyserTest { + + HMMProbabilityDistributionAnalyser analyser = new HMMProbabilityDistributionAnalyser(); + + @Test + public void testMoveToFile() throws IOException + { + + BufferedReader br = new BufferedReader(new FileReader( + "test/utils/test_Fams_for_probability_analysis.txt")); + analyser.moveLocationBy(2, br); + + String line = br.readLine(); + assertEquals(line, "# STOCKHOLM 1.0"); + line = br.readLine(); + assertEquals(line, "seq1 ATW"); + line = br.readLine(); + assertEquals(line, "seq2 ATI"); + + } + @Test - public void f() { + public void testCountValidResidues() + { + SequenceI[] sequence = new Sequence[] { + new Sequence("seq1", "ATGWWSCF"), new Sequence("seq1", "GGWMMKI"), + new Sequence("seq1", "--.ATccc") }; + analyser.sequences.add(sequence[0]); + analyser.sequences.add(sequence[1]); + analyser.sequences.add(sequence[2]); + + int count = analyser.countValidResidues(); + assertEquals(count, 17); } }