X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Futil%2FHMMProbabilityDistributionAnalyser.java;h=66ae5526e7c5cce7a2e49eefc6a8e00872f77f2b;hb=b3e349b56ecd2c487a616c6c52288ab7c8f84654;hp=1fc178ad80222496cc648654579fbf89b0f1b12f;hpb=ad48d68e5790a0fa55e947b9ab9bb4eb5ca716ca;p=jalview.git diff --git a/src/jalview/util/HMMProbabilityDistributionAnalyser.java b/src/jalview/util/HMMProbabilityDistributionAnalyser.java index 1fc178a..66ae552 100644 --- a/src/jalview/util/HMMProbabilityDistributionAnalyser.java +++ b/src/jalview/util/HMMProbabilityDistributionAnalyser.java @@ -1,5 +1,6 @@ package jalview.util; +import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.HiddenMarkovModel; import jalview.datamodel.SequenceI; import jalview.io.DataSourceType; @@ -11,13 +12,16 @@ import jalview.schemes.ResidueProperties; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Random; import java.util.Scanner; import java.util.Vector; @@ -31,6 +35,7 @@ import java.util.Vector; */ public class HMMProbabilityDistributionAnalyser { + AlignmentAnnotation reference = null; Vector sequences; @@ -43,10 +48,13 @@ public class HMMProbabilityDistributionAnalyser Map binned = new HashMap<>(); // location of the family file - final static String FAMILIES = "C:/Users/TZVanaalten/Pfam-A.full"; + String families = "/media/sf_Shared_Folder/PFAM/Family/SeedFamilies.seed"; + + // location of the file containing the family-clan links + final static String FAMILIESTOCLAN = "/media/sf_Shared_Folder/PFAM/Family/Clanlinks.dat"; // location of the HMM file - final static String HMMS = "H:/Desktop/PFAM/HMMs/Pfam-A.hmm"; + String hmms = "/media/sf_Shared_Folder/PFAM/HMMs/Pfam-A.hmm"; // suffix for raw file final static String RAW = "/Raw.csv"; @@ -55,16 +63,20 @@ public class HMMProbabilityDistributionAnalyser final static String BINNED = "/Binned.csv"; // normalisation scale - final static double SCALE = 100000; + final static double SCALE = 1; // current position in file int currentFilePosition = 0; final static String NL = "\n"; + Random generator = new Random(); + // current directory String currentFolder; + boolean keepRaw = false; + /** * Sets the working directory. * @@ -76,15 +88,16 @@ public class HMMProbabilityDistributionAnalyser } /** - * Moves a buffered reader to a specific location in the file, delimited by - * '//'. + * Moves a buffered reader forward in the file by a certain amount of entries. + * Each entry in the file is delimited by '//'. * * @param index * The index of the location in the file. * @param br * @throws IOException */ - public void moveToFile(int index, BufferedReader br) throws IOException + public void moveLocationBy(int index, BufferedReader br) + throws IOException { for (int i = 0; i < index; i++) { @@ -92,6 +105,7 @@ public class HMMProbabilityDistributionAnalyser while (!"//".equals(line)) { line = br.readLine(); + } } @@ -100,53 +114,60 @@ public class HMMProbabilityDistributionAnalyser /** * Analyses a specified number of families and then saves the data. Before * analysing the data, the previous saved data will be imported and after - * analysing this data is exported back into the file. + * analysing this, the data is exported back into the file. The file must be + * in flat file format. * * @param increments * The number of families to read before saving. * @throws IOException */ - public void run(int increments) throws IOException + public void run(int increments, boolean keepRawData) throws IOException { + keepRaw = keepRawData; + try + { + readPreviousData(currentFolder); + BufferedReader posReader = new BufferedReader( + new FileReader(currentFolder + "/CurrentPosition.txt")); + + String line = posReader.readLine(); + posReader.close(); + currentFilePosition = Integer.parseInt(line); + } catch (Exception e) + { + System.out.println("No previous data found"); + } - readPreviousData(currentFolder); - BufferedReader posReader = new BufferedReader( - new FileReader(currentFolder + "/CurrentPosition.txt")); - String line = posReader.readLine(); - posReader.close(); - currentFilePosition = Integer.parseInt(line); - BufferedReader inputSTO = new BufferedReader( - new FileReader(FAMILIES)); - BufferedReader inputHMM = new BufferedReader( - new FileReader(HMMS)); + BufferedReader inputSTO = new BufferedReader(new FileReader(families)); + BufferedReader inputHMM = new BufferedReader(new FileReader(hmms)); - moveToFile(currentFilePosition, inputHMM); - moveToFile(currentFilePosition, inputSTO); + + + moveLocationBy(currentFilePosition, inputHMM); + moveLocationBy(currentFilePosition, inputSTO); int filesRead = 0; + int i = 0; while (filesRead < increments) { - FileParse parserSTO = new FileParse(inputSTO, "", - DataSourceType.FILE); - readStockholm(parserSTO); - FileParse parserHMM = new FileParse(inputHMM, "", - DataSourceType.FILE); - readHMM(parserHMM); + readStockholm(inputSTO); + + readHMM(inputHMM); - if (hmm.getAlphabetType().equals("amino")) - { int count = countValidResidues(); processData(count); filesRead++; - } + currentFilePosition++; + System.out.println(i); + i++; } PrintWriter p = new PrintWriter( - new File(currentFolder + "/CurrentPosition")); + new File(currentFolder + "/CurrentPosition.txt")); p.print(currentFilePosition); p.close(); exportData(currentFolder); @@ -156,6 +177,93 @@ public class HMMProbabilityDistributionAnalyser } /** + * Analyses all families and then saves the data. Before analysing the data, + * the previous saved data will be imported and after analysing this, the data + * is exported back into the file. The file must be in flat file format. + * + * @param increments + * The number of families to read before saving. + * @throws IOException + */ + public void runToEnd(int minCount, int maxCount, boolean keepRawData, + boolean forClans) + throws IOException + { + keepRaw = keepRawData; + BufferedReader inputSTO = null; + BufferedReader inputHMM = null; + int size = 0; + int files = 1; + try + { + if (forClans) + { + files = 603; + } + int filesRead = 0; + for (int clan = 0; clan < files; clan++) + { + System.out.println(clan); + String clanPath = ""; + int numberOfFamilies = 0; + if (forClans) + { + clanPath = currentFolder + "/Clan" + clan; + if (!new File(clanPath).exists()) + { + continue; + } + BufferedReader famCountReader = new BufferedReader( + new FileReader(clanPath + "/NumberOfFamilies.txt")); + numberOfFamilies = Integer.parseInt(famCountReader.readLine()); + } + else + { + numberOfFamilies = 1; + } + + for (int fam = 0; fam < numberOfFamilies; fam++) + { + if (forClans) + { + families = clanPath + "/Families/Fam" + fam + ".sto"; + hmms = clanPath + "/HMMs/HMM" + fam + ".hmm"; + } + + inputSTO = new BufferedReader(new FileReader(families)); + inputHMM = new BufferedReader(new FileReader(hmms)); + + + int i = 0; + boolean endReached = atEnd(inputSTO); + while (!endReached) + { + readStockholm(inputSTO); + readHMM(inputHMM); + + int count = countValidResidues(); + if (count >= minCount && count < maxCount) + { + processData(count); + } + filesRead++; + System.out.println(filesRead); + endReached = atEnd(inputSTO); + } + } + } + } catch (Exception e) + { + e.printStackTrace(); + } finally + { + exportData(currentFolder); + raw.clear(); + binned.clear(); + } + } + + /** * Reads the previous data from both files * * @param source @@ -164,7 +272,10 @@ public class HMMProbabilityDistributionAnalyser public void readPreviousData(String source) throws IOException { readBinned(source); - readRaw(source); + if (keepRaw) + { + readRaw(source); + } } /** @@ -178,12 +289,14 @@ public class HMMProbabilityDistributionAnalyser BufferedReader input = new BufferedReader( new FileReader(source + BINNED)); String line = input.readLine(); + binned = new HashMap<>(); while (!("".equals(line) || line == null)) { - binned = new HashMap<>(); Scanner scanner = new Scanner(line); scanner.useDelimiter(","); - binned.put(scanner.next(), scanner.nextDouble()); + String key = scanner.next(); + String value = scanner.next(); + binned.put(key, Double.valueOf(value)); scanner.close(); line = input.readLine(); } @@ -231,6 +344,10 @@ public class HMMProbabilityDistributionAnalyser { raw.get(i).add(Double.parseDouble(value)); } + else + { + raw.get(i).add(null); + } i++; } @@ -252,14 +369,14 @@ public class HMMProbabilityDistributionAnalyser for (int width = 0; width < sequences.size(); width++) { - for (int length = 1; length < hmm.getLength(); length++) + for (int length = 1; length < hmm.getLength() + 1; length++) { char symbol; int alignPos; - alignPos = hmm.getNodeAlignmentColumn(length); + alignPos = hmm.getNodeMapPosition(length); symbol = sequences.get(width).getCharAt(alignPos); - if (ResidueProperties.aminoBackgroundFrequencies + if (ResidueProperties.backgroundFrequencies.get("amino") .containsKey(symbol)) { count++; @@ -277,31 +394,44 @@ public class HMMProbabilityDistributionAnalyser */ public void processData(int count) { - - raw.add(new ArrayList()); - int rawPos = raw.size() - 1; + int rawPos = 0; + if (keepRaw) + { + raw.add(new ArrayList()); + rawPos = raw.size() - 1; + } + Double total = 0d; for (int width = 0; width < sequences.size(); width++) { - for (int length = 1; length < hmm.getLength(); length++) + for (int length = 1; length < hmm.getLength() + 1; length++) { char symbol; int alignPos; - alignPos = hmm.getNodeAlignmentColumn(length); - + alignPos = hmm.getNodeMapPosition(length); + symbol = sequences.get(width).getCharAt(alignPos); - if (ResidueProperties.aminoBackgroundFrequencies + if (ResidueProperties.backgroundFrequencies.get("amino") .containsKey(symbol)) { - Double prob; Float bfreq; Double llr; prob = hmm.getMatchEmissionProbability(alignPos, symbol); - bfreq = ResidueProperties.aminoBackgroundFrequencies.get(symbol); + bfreq = ResidueProperties.backgroundFrequencies.get("amino") + .get(symbol); + if (prob == 0 || bfreq == 0) + { + System.out.println("error"); + } llr = Math.log(prob / bfreq); - raw.get(rawPos).add(llr); + if (keepRaw) + { + raw.get(rawPos).add(llr); + } + String output; output = String.format("%.1f", llr); + total += Double.parseDouble(output); if ("-0.0".equals(output)) { output = "0.0"; @@ -320,6 +450,7 @@ public class HMMProbabilityDistributionAnalyser } } } + System.out.println(total / count); } @@ -329,10 +460,19 @@ public class HMMProbabilityDistributionAnalyser * @param source * @throws IOException */ - public void readStockholm(FileParse source) throws IOException + public void readStockholm(BufferedReader inputSTO) throws IOException { - StockholmFile file = new StockholmFile(source); - file.parse(); + FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE); + StockholmFile file = new StockholmFile(parserSTO); + Vector annots = file.getAnnotations(); + + for (AlignmentAnnotation annot : annots) + { + if (annot.label.contains("Reference")) + { + reference = annot; + } + } sequences = file.getSeqs(); } @@ -342,13 +482,13 @@ public class HMMProbabilityDistributionAnalyser * @param source * @throws IOException */ - public void readHMM(FileParse source) throws IOException + public void readHMM(BufferedReader inputHMM) throws IOException { - - HMMFile file = new HMMFile(source); - file.parse(); + FileParse parserHMM = new FileParse(inputHMM, "", DataSourceType.FILE); + HMMFile file = new HMMFile(parserHMM); hmm = file.getHMM(); + } /** @@ -365,18 +505,20 @@ public class HMMProbabilityDistributionAnalyser writerBin.println(entry.getKey() + "," + entry.getValue()); } writerBin.close(); + if (keepRaw) + { PrintWriter writerRaw = new PrintWriter(new File(location + RAW)); - + StringBuilder identifier = new StringBuilder(); - + for (int i = 1; i < raw.size() + 1; i++) { identifier.append("Fam " + i + ","); } - + writerRaw.println(identifier); - + boolean rowIsEmpty = false; int row = 0; while (!rowIsEmpty) @@ -400,6 +542,8 @@ public class HMMProbabilityDistributionAnalyser } writerRaw.close(); + } + } /** @@ -410,9 +554,9 @@ public class HMMProbabilityDistributionAnalyser */ public void printFam(int index) throws IOException { - BufferedReader br = new BufferedReader(new FileReader(FAMILIES)); + BufferedReader br = new BufferedReader(new FileReader(families)); - moveToFile(index, br); + moveLocationBy(index, br); String line = br.readLine(); @@ -434,9 +578,9 @@ public class HMMProbabilityDistributionAnalyser */ public void printHMM(int index) throws IOException { - BufferedReader br = new BufferedReader(new FileReader(HMMS)); + BufferedReader br = new BufferedReader(new FileReader(hmms)); - moveToFile(index, br); + moveLocationBy(index, br); String line = br.readLine(); @@ -451,35 +595,37 @@ public class HMMProbabilityDistributionAnalyser } /** - * Prints the specified family to a .sto file in the current directory. + * Prints the specified family to a .sto file. * * @param index * @throws IOException */ - public void printFamToFile(int index) throws IOException + public void exportFam(int index, String location) throws IOException { - String name; - - BufferedReader nameFinder = new BufferedReader( - new FileReader(FAMILIES)); - - moveToFile(index, nameFinder); - - nameFinder.readLine(); + BufferedReader br = new BufferedReader(new FileReader(families)); - Scanner scanner = new Scanner(nameFinder.readLine()); - scanner.next(); - scanner.next(); - name = scanner.next(); - scanner.close(); + moveLocationBy(index, br); - BufferedReader br = new BufferedReader(new FileReader(FAMILIES)); + String line = br.readLine(); + PrintWriter writer = new PrintWriter( + new FileOutputStream(new File(location), true)); + while (!"//".equals(line)) + { + writer.println(line); + line = br.readLine(); + } + writer.println(line); + writer.close(); + br.close(); - moveToFile(index, br); + } + public void exportFile(BufferedReader br, String location, boolean append) + throws IOException + { String line = br.readLine(); PrintWriter writer = new PrintWriter( - currentFolder + "/" + name + ".sto"); + new FileOutputStream(location, append)); while (!"//".equals(line)) { writer.println(line); @@ -487,40 +633,64 @@ public class HMMProbabilityDistributionAnalyser } writer.println(line); writer.close(); - br.close(); + } - /** - * Prints the specified family to a .hmm file in the current directory. - * - * @param index - * @throws IOException - */ - public void printHMMToFile(int index) throws IOException + public String getHMMName(int index) throws IOException { + String name; + + BufferedReader nameFinder = new BufferedReader(new FileReader(hmms)); + + moveLocationBy(index, nameFinder); + + nameFinder.readLine(); + + Scanner scanner = new Scanner(nameFinder.readLine()); + name = scanner.next(); + name = scanner.next(); + scanner.close(); + return name; + } + public String getFamilyName(int index) throws IOException + { String name; - BufferedReader nameFinder = new BufferedReader(new FileReader(HMMS)); + BufferedReader nameFinder = new BufferedReader( + new FileReader(families)); - moveToFile(index, nameFinder); + moveLocationBy(index, nameFinder); nameFinder.readLine(); Scanner scanner = new Scanner(nameFinder.readLine()); name = scanner.next(); name = scanner.next(); + name = scanner.next(); scanner.close(); + return name; + } - BufferedReader br = new BufferedReader(new FileReader(HMMS)); + /** + * Prints the specified family to a .hmm file. + * + * @param index + * @throws IOException + */ + public void exportHMM(int index, String location) throws IOException + { - moveToFile(index, br); + + BufferedReader br = new BufferedReader(new FileReader(hmms)); + + moveLocationBy(index, br); String line = br.readLine(); PrintWriter writer = new PrintWriter( - currentFolder + "/" + name + ".hmm"); + new FileOutputStream(new File(location), true)); while (!"//".equals(line)) { writer.println(line); @@ -552,4 +722,257 @@ public class HMMProbabilityDistributionAnalyser raw.close(); } + public void sortIntoClans(String directory) throws IOException + { + BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN)); + BufferedReader familyReader = new BufferedReader( + new FileReader(families)); + BufferedReader hmmReader = new BufferedReader(new FileReader(hmms)); + int families = 0; + // moveLocationBy(7000, familyReader); + // moveLocationBy(7000, clanFinder); + // moveLocationBy(7000, hmmReader); + HashMap clanIndexes = new HashMap<>(); + ArrayList familyCounts = new ArrayList<>(); + int filePos = 0; + int clanCount = 0; + String line; + line = clanFinder.readLine(); + + while (!"".equals(line) && !" ".equals(line) && line != null) + { + String clanName; + boolean inClan = false; + while (!(line.indexOf("//") > -1)) + { + + if (line.indexOf("#=GF CL") > -1) + { + families++; + System.out.println(families); + inClan = true; + Scanner scanner = new Scanner(line); + scanner.next(); + scanner.next(); + clanName = scanner.next(); + scanner.close(); + + if (!clanIndexes.containsKey(clanName)) + { + clanIndexes.put(clanName, clanCount); + clanCount++; + familyCounts.add(0); + } + + + Integer clanI = clanIndexes.get(clanName); + String clanPath = directory + "/Clan" + clanI.toString(); + createFolders(clanPath); + + int index = clanIndexes.get(clanName); + exportFile(familyReader, + clanPath + "/Families/Fam" + familyCounts.get(index) + + ".sto", + false); + exportFile(hmmReader, + clanPath + "/HMMs/HMM" + familyCounts.get(index) + ".hmm", + false); + + int count = familyCounts.get(index); + count++; + familyCounts.set(index, count); + } + line = clanFinder.readLine(); + + } + if (!inClan) + { + moveLocationBy(1, familyReader); + moveLocationBy(1, hmmReader); + } + filePos++; + // System.out.println(filePos + " files read."); + line = clanFinder.readLine(); + + } + clanFinder.close(); + + for (int clan = 0; clan < clanCount; clan++) + { + PrintWriter writer = new PrintWriter( + directory + "/Clan" + clan + "/NumberOfFamilies.txt"); + int count = familyCounts.get(clan); + writer.print(count); + writer.close(); + } + + } + + public String getFamilies() + { + return families; + } + + public void setFamilies(String families) + { + this.families = currentFolder + families; + } + + public String getHmms() + { + return hmms; + } + + public void setHmms(String hmms) + { + this.hmms = currentFolder + hmms; + } + + public void alignWithinClan(String exportLocation, String clansLocation) + throws IOException, InterruptedException + { + int alignmentsExported = 0; + for (int clan = 0; clan < 604; clan++) + { + System.out.println(clan); + int famCount = 0; + String clanPath = clansLocation + "/Clan" + clan; + int numberOfFamilies; + BufferedReader br = new BufferedReader( + new FileReader(clanPath + "/NumberOfFamilies.txt")); + String line = br.readLine(); + numberOfFamilies = Integer.parseInt(line); + br.close(); + if (numberOfFamilies == 1) + { + continue; + } + final String commandExportLocation = exportLocation + "/Clan" + clan; + createFolders(commandExportLocation); + for (int family = 0; family < numberOfFamilies; family++) + { + famCount++; + ArrayList indexes = new ArrayList<>(); + for (int i = 0; i < numberOfFamilies; i++) + { + if (i != family) + { + indexes.add(i); + } + } + + int hmmIndex = getRandom(indexes); + String famPath = clanPath + "/Families/Fam" + family + ".sto"; + String hmmPath = clanPath + "/HMMs/HMM" + hmmIndex + ".hmm"; + String command = "/media/sf_Shared_Folder/hmmer/binaries/hmmalign --mapali " + + clanPath + "/Families/Fam" + hmmIndex + ".sto" + + " --trim "; + command += hmmPath + " "; + command += famPath; + final int familyIndex = family; + final Process p = Runtime.getRuntime().exec(command); + + new Thread(new Runnable() + { + @Override + public void run() + { + BufferedReader input = new BufferedReader( + new InputStreamReader(p.getInputStream())); + String line = null; + + try + { + PrintWriter writer = new PrintWriter(commandExportLocation + + "/Families/Fam" + familyIndex + ".sto"); + String lastLine = ""; + boolean dataFound = false; + while ((line = input.readLine()) != null) + { + if (line.contains("#=GR") && !dataFound) + { + writer.println(lastLine); + dataFound = true; + } + if (line.contains("#") || dataFound || " ".equals(line) + || "".equals(line) || "//".equals(line)) + { + writer.println(line); + } + lastLine = line; + } + writer.close(); + } catch (IOException e) + { + e.printStackTrace(); + } + } + }).start(); + + p.waitFor(); + + BufferedReader hmmExporter = new BufferedReader( + new FileReader(hmmPath)); + + exportFile(hmmExporter, + commandExportLocation + "/HMMs/HMM" + family + ".hmm", + false); + + alignmentsExported++; + + + } + PrintWriter writer = new PrintWriter( + commandExportLocation + "/NumberOfFamilies.txt"); + writer.print(famCount); + writer.close(); + } + + } + + public boolean atEnd(BufferedReader br) throws IOException + { + boolean end = false; + br.mark(80); + String line = br.readLine(); + if ("".equals(line) || line == null) + { + end = true; + } + br.reset(); + return end; + } + + public int getRandom(ArrayList list) + { + if (!(list.size() > 0)) + { + System.out.println("Error - size = " + list.size()); + } + int index = generator.nextInt(list.size()); + int value = list.get(index); + list.remove(index); + return value; + } + + public void createFolders(String clanPath) + { + File clanFolder = new File(clanPath); + if (!clanFolder.exists()) + { + clanFolder.mkdir(); + } + + File famFolder = new File(clanPath + "/Families"); + File hmmFolder = new File(clanPath + "/HMMs"); + if (!famFolder.exists()) + { + famFolder.mkdir(); + hmmFolder.mkdir(); + } + } } + + + +