package jalview.util; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.HiddenMarkovModel; import jalview.datamodel.SequenceI; import jalview.io.DataSourceType; import jalview.io.FileParse; import jalview.io.HMMFile; import jalview.io.StockholmFile; import jalview.schemes.ResidueProperties; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Random; import java.util.Scanner; import java.util.Vector; /** * Processes probability data. The file indexes used in this program represent * the index of the location of a family or hmm in their respective files, * starting from 0. * * @author TZVanaalten * */ public class HMMProbabilityDistributionAnalyser { AlignmentAnnotation reference = null; Vector sequences; HiddenMarkovModel hmm; // contains the raw data produced List> raw = new ArrayList<>(); // contains binned data Map binned = new HashMap<>(); // location of the family file String families = "/media/sf_Shared_Folder/PFAM/Family/SeedFamilies.seed"; // location of the file containing the family-clan links final static String FAMILIESTOCLAN = "/media/sf_Shared_Folder/PFAM/Family/Clanlinks.dat"; // location of the HMM file String hmms = "/media/sf_Shared_Folder/PFAM/HMMs/Pfam-A.hmm"; // suffix for raw file final static String RAW = "/Raw.csv"; // suffix for binned file final static String BINNED = "/Binned.csv"; // normalisation scale final static double SCALE = 1; // current position in file int currentFilePosition = 0; final static String NL = "\n"; Random generator = new Random(); // current directory String currentFolder; boolean keepRaw = false; /** * Sets the working directory. * * @param path */ public void setFolder(String path) { currentFolder = path; } /** * Moves a buffered reader forward in the file by a certain amount of entries. * Each entry in the file is delimited by '//'. * * @param index * The index of the location in the file. * @param br * @throws IOException */ public void moveLocationBy(int index, BufferedReader br) throws IOException { for (int i = 0; i < index; i++) { String line = br.readLine(); while (!"//".equals(line)) { line = br.readLine(); } } } /** * Analyses a specified number of families and then saves the data. Before * analysing the data, the previous saved data will be imported and after * analysing this, the data is exported back into the file. The file must be * in flat file format. * * @param increments * The number of families to read before saving. * @throws IOException */ public void run(int increments, boolean keepRawData) throws IOException { keepRaw = keepRawData; try { readPreviousData(currentFolder); BufferedReader posReader = new BufferedReader( new FileReader(currentFolder + "/CurrentPosition.txt")); String line = posReader.readLine(); posReader.close(); currentFilePosition = Integer.parseInt(line); } catch (Exception e) { System.out.println("No previous data found"); } BufferedReader inputSTO = new BufferedReader(new FileReader(families)); BufferedReader inputHMM = new BufferedReader(new FileReader(hmms)); moveLocationBy(currentFilePosition, inputHMM); moveLocationBy(currentFilePosition, inputSTO); int filesRead = 0; int i = 0; while (filesRead < increments) { readStockholm(inputSTO); readHMM(inputHMM); int count = countValidResidues(); processData(count); filesRead++; currentFilePosition++; System.out.println(i); i++; } PrintWriter p = new PrintWriter( new File(currentFolder + "/CurrentPosition.txt")); p.print(currentFilePosition); p.close(); exportData(currentFolder); raw.clear(); binned.clear(); } /** * Analyses all families and then saves the data. Before analysing the data, * the previous saved data will be imported and after analysing this, the data * is exported back into the file. The file must be in flat file format. * * @param increments * The number of families to read before saving. * @throws IOException */ public void runToEnd(int minCount, int maxCount, boolean keepRawData, boolean forClans) throws IOException { keepRaw = keepRawData; BufferedReader inputSTO = null; BufferedReader inputHMM = null; int size = 0; int files = 1; try { if (forClans) { files = 603; } int filesRead = 0; for (int clan = 0; clan < files; clan++) { System.out.println(clan); String clanPath = ""; int numberOfFamilies = 0; if (forClans) { clanPath = currentFolder + "/Clan" + clan; if (!new File(clanPath).exists()) { continue; } BufferedReader famCountReader = new BufferedReader( new FileReader(clanPath + "/NumberOfFamilies.txt")); numberOfFamilies = Integer.parseInt(famCountReader.readLine()); } else { numberOfFamilies = 1; } for (int fam = 0; fam < numberOfFamilies; fam++) { if (forClans) { families = clanPath + "/Families/Fam" + fam + ".sto"; hmms = clanPath + "/HMMs/HMM" + fam + ".hmm"; } inputSTO = new BufferedReader(new FileReader(families)); inputHMM = new BufferedReader(new FileReader(hmms)); int i = 0; boolean endReached = atEnd(inputSTO); while (!endReached) { readStockholm(inputSTO); readHMM(inputHMM); int count = countValidResidues(); if (count >= minCount && count < maxCount) { processData(count); } filesRead++; System.out.println(filesRead); endReached = atEnd(inputSTO); } } } } catch (Exception e) { e.printStackTrace(); } finally { exportData(currentFolder); raw.clear(); binned.clear(); } } /** * Reads the previous data from both files * * @param source * @throws IOException */ public void readPreviousData(String source) throws IOException { readBinned(source); if (keepRaw) { readRaw(source); } } /** * Reads the previous data from the binned file. * * @param source * @throws IOException */ public void readBinned(String source) throws IOException { BufferedReader input = new BufferedReader( new FileReader(source + BINNED)); String line = input.readLine(); binned = new HashMap<>(); while (!("".equals(line) || line == null)) { Scanner scanner = new Scanner(line); scanner.useDelimiter(","); String key = scanner.next(); String value = scanner.next(); binned.put(key, Double.valueOf(value)); scanner.close(); line = input.readLine(); } input.close(); } /** * Reads the previous data from the raw file. * * @param source * @throws IOException */ public void readRaw(String source) throws IOException { BufferedReader input = new BufferedReader(new FileReader(source + RAW)); String line = input.readLine(); if (line == null) { input.close(); return; } Scanner numberScanner = new Scanner(line); numberScanner.useDelimiter(","); raw = new ArrayList<>(); while (numberScanner.hasNext()) { numberScanner.next(); raw.add(new ArrayList()); } numberScanner.close(); line = input.readLine(); while (!("".equals(line) || line == null)) { Scanner scanner = new Scanner(line); scanner.useDelimiter(","); int i = 0; while (scanner.hasNext()) { String value; value = scanner.next(); if (!value.equals("EMPTY")) { raw.get(i).add(Double.parseDouble(value)); } else { raw.get(i).add(null); } i++; } scanner.close(); line = input.readLine(); } input.close(); } /** * Counts the number of valid residues in the sequence. * * @return */ public int countValidResidues() { int count = 0; for (int width = 0; width < sequences.size(); width++) { for (int length = 1; length < hmm.getLength() + 1; length++) { char symbol; int alignPos; alignPos = hmm.getNodeMapPosition(length); symbol = sequences.get(width).getCharAt(alignPos); if (ResidueProperties.backgroundFrequencies.get("amino") .containsKey(symbol)) { count++; } } } return count; } /** * Processes data, and stores it in both a raw and binned format. * * @param count */ public void processData(int count) { int rawPos = 0; if (keepRaw) { raw.add(new ArrayList()); rawPos = raw.size() - 1; } Double total = 0d; for (int width = 0; width < sequences.size(); width++) { for (int length = 1; length < hmm.getLength() + 1; length++) { char symbol; int alignPos; alignPos = hmm.getNodeMapPosition(length); symbol = sequences.get(width).getCharAt(alignPos); if (ResidueProperties.backgroundFrequencies.get("amino") .containsKey(symbol)) { Double prob; Float bfreq; Double llr; prob = hmm.getMatchEmissionProbability(alignPos, symbol); bfreq = ResidueProperties.backgroundFrequencies.get("amino") .get(symbol); if (prob == 0 || bfreq == 0) { System.out.println("error"); } llr = Math.log(prob / bfreq); if (keepRaw) { raw.get(rawPos).add(llr); } String output; output = String.format("%.1f", llr); total += Double.parseDouble(output); if ("-0.0".equals(output)) { output = "0.0"; } if (binned.containsKey(output)) { double prev = binned.get(output); prev += (SCALE / count); binned.put(output, prev); } else { binned.put(output, SCALE / count); } } } } System.out.println(total / count); } /** * Reads in the sequence data from a Stockholm file. * * @param source * @throws IOException */ public void readStockholm(BufferedReader inputSTO) throws IOException { FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE); StockholmFile file = new StockholmFile(parserSTO); Vector annots = file.getAnnotations(); for (AlignmentAnnotation annot : annots) { if (annot.label.contains("Reference")) { reference = annot; } } sequences = file.getSeqs(); } /** * Reads in the HMM data from a HMMer file. * * @param source * @throws IOException */ public void readHMM(BufferedReader inputHMM) throws IOException { FileParse parserHMM = new FileParse(inputHMM, "", DataSourceType.FILE); HMMFile file = new HMMFile(parserHMM); hmm = file.getHMM(); } /** * Exports both the binned and raw data into separate files. * * @param location * @throws FileNotFoundException */ public void exportData(String location) throws FileNotFoundException { PrintWriter writerBin = new PrintWriter(new File(location + BINNED)); for (Map.Entry entry : binned.entrySet()) { writerBin.println(entry.getKey() + "," + entry.getValue()); } writerBin.close(); if (keepRaw) { PrintWriter writerRaw = new PrintWriter(new File(location + RAW)); StringBuilder identifier = new StringBuilder(); for (int i = 1; i < raw.size() + 1; i++) { identifier.append("Fam " + i + ","); } writerRaw.println(identifier); boolean rowIsEmpty = false; int row = 0; while (!rowIsEmpty) { rowIsEmpty = true; StringBuilder string = new StringBuilder(); for (int column = 0; column < raw.size(); column++) { if (raw.get(column).size() <= row) { string.append("EMPTY,"); } else { string.append(raw.get(column).get(row) + ","); rowIsEmpty = false; } } row++; writerRaw.println(string); } writerRaw.close(); } } /** * Prints the specified family on the console. * * @param index * @throws IOException */ public void printFam(int index) throws IOException { BufferedReader br = new BufferedReader(new FileReader(families)); moveLocationBy(index, br); String line = br.readLine(); while (!"//".equals(line)) { System.out.println(line); line = br.readLine(); } System.out.println(line); br.close(); } /** * Prints the specified HMM on the console. * * @param index * @throws IOException */ public void printHMM(int index) throws IOException { BufferedReader br = new BufferedReader(new FileReader(hmms)); moveLocationBy(index, br); String line = br.readLine(); while (!"//".equals(line)) { System.out.println(line); line = br.readLine(); } System.out.println(line); br.close(); } /** * Prints the specified family to a .sto file. * * @param index * @throws IOException */ public void exportFam(int index, String location) throws IOException { BufferedReader br = new BufferedReader(new FileReader(families)); moveLocationBy(index, br); String line = br.readLine(); PrintWriter writer = new PrintWriter( new FileOutputStream(new File(location), true)); while (!"//".equals(line)) { writer.println(line); line = br.readLine(); } writer.println(line); writer.close(); br.close(); } public void exportFile(BufferedReader br, String location, boolean append) throws IOException { String line = br.readLine(); PrintWriter writer = new PrintWriter( new FileOutputStream(location, append)); while (!"//".equals(line)) { writer.println(line); line = br.readLine(); } writer.println(line); writer.close(); } public String getHMMName(int index) throws IOException { String name; BufferedReader nameFinder = new BufferedReader(new FileReader(hmms)); moveLocationBy(index, nameFinder); nameFinder.readLine(); Scanner scanner = new Scanner(nameFinder.readLine()); name = scanner.next(); name = scanner.next(); scanner.close(); return name; } public String getFamilyName(int index) throws IOException { String name; BufferedReader nameFinder = new BufferedReader( new FileReader(families)); moveLocationBy(index, nameFinder); nameFinder.readLine(); Scanner scanner = new Scanner(nameFinder.readLine()); name = scanner.next(); name = scanner.next(); name = scanner.next(); scanner.close(); return name; } /** * Prints the specified family to a .hmm file. * * @param index * @throws IOException */ public void exportHMM(int index, String location) throws IOException { BufferedReader br = new BufferedReader(new FileReader(hmms)); moveLocationBy(index, br); String line = br.readLine(); PrintWriter writer = new PrintWriter( new FileOutputStream(new File(location), true)); while (!"//".equals(line)) { writer.println(line); line = br.readLine(); } writer.println(line); writer.close(); br.close(); } /** * Clears all raw, binned and current position data in the current directory. * * @throws FileNotFoundException */ public void clear() throws FileNotFoundException { PrintWriter pos = new PrintWriter( currentFolder + "/CurrentPosition.txt"); pos.println("0"); PrintWriter raw = new PrintWriter(currentFolder + RAW); PrintWriter bin = new PrintWriter(currentFolder + BINNED); pos.close(); bin.close(); raw.close(); } public void sortIntoClans(String directory) throws IOException { BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN)); BufferedReader familyReader = new BufferedReader( new FileReader(families)); BufferedReader hmmReader = new BufferedReader(new FileReader(hmms)); int families = 0; // moveLocationBy(7000, familyReader); // moveLocationBy(7000, clanFinder); // moveLocationBy(7000, hmmReader); HashMap clanIndexes = new HashMap<>(); ArrayList familyCounts = new ArrayList<>(); int filePos = 0; int clanCount = 0; String line; line = clanFinder.readLine(); while (!"".equals(line) && !" ".equals(line) && line != null) { String clanName; boolean inClan = false; while (!(line.indexOf("//") > -1)) { if (line.indexOf("#=GF CL") > -1) { families++; System.out.println(families); inClan = true; Scanner scanner = new Scanner(line); scanner.next(); scanner.next(); clanName = scanner.next(); scanner.close(); if (!clanIndexes.containsKey(clanName)) { clanIndexes.put(clanName, clanCount); clanCount++; familyCounts.add(0); } Integer clanI = clanIndexes.get(clanName); String clanPath = directory + "/Clan" + clanI.toString(); createFolders(clanPath); int index = clanIndexes.get(clanName); exportFile(familyReader, clanPath + "/Families/Fam" + familyCounts.get(index) + ".sto", false); exportFile(hmmReader, clanPath + "/HMMs/HMM" + familyCounts.get(index) + ".hmm", false); int count = familyCounts.get(index); count++; familyCounts.set(index, count); } line = clanFinder.readLine(); } if (!inClan) { moveLocationBy(1, familyReader); moveLocationBy(1, hmmReader); } filePos++; // System.out.println(filePos + " files read."); line = clanFinder.readLine(); } clanFinder.close(); for (int clan = 0; clan < clanCount; clan++) { PrintWriter writer = new PrintWriter( directory + "/Clan" + clan + "/NumberOfFamilies.txt"); int count = familyCounts.get(clan); writer.print(count); writer.close(); } } public String getFamilies() { return families; } public void setFamilies(String families) { this.families = currentFolder + families; } public String getHmms() { return hmms; } public void setHmms(String hmms) { this.hmms = currentFolder + hmms; } public void alignWithinClan(String exportLocation, String clansLocation) throws IOException, InterruptedException { int alignmentsExported = 0; for (int clan = 0; clan < 604; clan++) { System.out.println(clan); int famCount = 0; String clanPath = clansLocation + "/Clan" + clan; int numberOfFamilies; BufferedReader br = new BufferedReader( new FileReader(clanPath + "/NumberOfFamilies.txt")); String line = br.readLine(); numberOfFamilies = Integer.parseInt(line); br.close(); if (numberOfFamilies == 1) { continue; } final String commandExportLocation = exportLocation + "/Clan" + clan; createFolders(commandExportLocation); for (int family = 0; family < numberOfFamilies; family++) { famCount++; ArrayList indexes = new ArrayList<>(); for (int i = 0; i < numberOfFamilies; i++) { if (i != family) { indexes.add(i); } } int hmmIndex = getRandom(indexes); String famPath = clanPath + "/Families/Fam" + family + ".sto"; String hmmPath = clanPath + "/HMMs/HMM" + hmmIndex + ".hmm"; String command = "/media/sf_Shared_Folder/hmmer/binaries/hmmalign --mapali " + clanPath + "/Families/Fam" + hmmIndex + ".sto" + " --trim "; command += hmmPath + " "; command += famPath; final int familyIndex = family; final Process p = Runtime.getRuntime().exec(command); new Thread(new Runnable() { @Override public void run() { BufferedReader input = new BufferedReader( new InputStreamReader(p.getInputStream())); String line = null; try { PrintWriter writer = new PrintWriter(commandExportLocation + "/Families/Fam" + familyIndex + ".sto"); String lastLine = ""; boolean dataFound = false; while ((line = input.readLine()) != null) { if (line.contains("#=GR") && !dataFound) { writer.println(lastLine); dataFound = true; } if (line.contains("#") || dataFound || " ".equals(line) || "".equals(line) || "//".equals(line)) { writer.println(line); } lastLine = line; } writer.close(); } catch (IOException e) { e.printStackTrace(); } } }).start(); p.waitFor(); BufferedReader hmmExporter = new BufferedReader( new FileReader(hmmPath)); exportFile(hmmExporter, commandExportLocation + "/HMMs/HMM" + family + ".hmm", false); alignmentsExported++; } PrintWriter writer = new PrintWriter( commandExportLocation + "/NumberOfFamilies.txt"); writer.print(famCount); writer.close(); } } public boolean atEnd(BufferedReader br) throws IOException { boolean end = false; br.mark(80); String line = br.readLine(); if ("".equals(line) || line == null) { end = true; } br.reset(); return end; } public int getRandom(ArrayList list) { if (!(list.size() > 0)) { System.out.println("Error - size = " + list.size()); } int index = generator.nextInt(list.size()); int value = list.get(index); list.remove(index); return value; } public void createFolders(String clanPath) { File clanFolder = new File(clanPath); if (!clanFolder.exists()) { clanFolder.mkdir(); } File famFolder = new File(clanPath + "/Families"); File hmmFolder = new File(clanPath + "/HMMs"); if (!famFolder.exists()) { famFolder.mkdir(); hmmFolder.mkdir(); } } }