X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Futil%2FHMMProbabilityDistributionAnalyser.java;h=66ae5526e7c5cce7a2e49eefc6a8e00872f77f2b;hb=467426ae84382a85861a38d91e4f69f86c53e4c8;hp=b30487de2074e26319fd6bc67d69329f4553f2f5;hpb=066fc9951453a55eea27900f41e1b0a5b59dc57c;p=jalview.git diff --git a/src/jalview/util/HMMProbabilityDistributionAnalyser.java b/src/jalview/util/HMMProbabilityDistributionAnalyser.java index b30487d..66ae552 100644 --- a/src/jalview/util/HMMProbabilityDistributionAnalyser.java +++ b/src/jalview/util/HMMProbabilityDistributionAnalyser.java @@ -1,5 +1,6 @@ package jalview.util; +import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.HiddenMarkovModel; import jalview.datamodel.SequenceI; import jalview.io.DataSourceType; @@ -14,11 +15,13 @@ import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Random; import java.util.Scanner; import java.util.Vector; @@ -32,6 +35,7 @@ import java.util.Vector; */ public class HMMProbabilityDistributionAnalyser { + AlignmentAnnotation reference = null; Vector sequences; @@ -44,13 +48,13 @@ public class HMMProbabilityDistributionAnalyser Map binned = new HashMap<>(); // location of the family file - final static String FAMILIES = "H:/Desktop/PFAM/Family/SeedFamilies.seed"; + String families = "/media/sf_Shared_Folder/PFAM/Family/SeedFamilies.seed"; // location of the file containing the family-clan links - final static String FAMILIESTOCLAN = "H:/Desktop/PFAM/Family/Clanlinks.dat"; + final static String FAMILIESTOCLAN = "/media/sf_Shared_Folder/PFAM/Family/Clanlinks.dat"; // location of the HMM file - final static String HMMS = "H:/Desktop/PFAM/HMMs/Pfam-A.hmm"; + String hmms = "/media/sf_Shared_Folder/PFAM/HMMs/Pfam-A.hmm"; // suffix for raw file final static String RAW = "/Raw.csv"; @@ -59,13 +63,15 @@ public class HMMProbabilityDistributionAnalyser final static String BINNED = "/Binned.csv"; // normalisation scale - final static double SCALE = 100000; + final static double SCALE = 1; // current position in file int currentFilePosition = 0; final static String NL = "\n"; + Random generator = new Random(); + // current directory String currentFolder; @@ -99,6 +105,7 @@ public class HMMProbabilityDistributionAnalyser while (!"//".equals(line)) { line = br.readLine(); + } } @@ -107,7 +114,8 @@ public class HMMProbabilityDistributionAnalyser /** * Analyses a specified number of families and then saves the data. Before * analysing the data, the previous saved data will be imported and after - * analysing this data is exported back into the file. + * analysing this, the data is exported back into the file. The file must be + * in flat file format. * * @param increments * The number of families to read before saving. @@ -116,17 +124,26 @@ public class HMMProbabilityDistributionAnalyser public void run(int increments, boolean keepRawData) throws IOException { keepRaw = keepRawData; - readPreviousData(currentFolder); + try + { + readPreviousData(currentFolder); + BufferedReader posReader = new BufferedReader( + new FileReader(currentFolder + "/CurrentPosition.txt")); + + String line = posReader.readLine(); + posReader.close(); + currentFilePosition = Integer.parseInt(line); + } catch (Exception e) + { + System.out.println("No previous data found"); + } + - BufferedReader posReader = new BufferedReader( - new FileReader(currentFolder + "/CurrentPosition.txt")); - String line = posReader.readLine(); - posReader.close(); - BufferedReader inputSTO = new BufferedReader(new FileReader(FAMILIES)); - BufferedReader inputHMM = new BufferedReader(new FileReader(HMMS)); + BufferedReader inputSTO = new BufferedReader(new FileReader(families)); + BufferedReader inputHMM = new BufferedReader(new FileReader(hmms)); + - currentFilePosition = Integer.parseInt(line); moveLocationBy(currentFilePosition, inputHMM); moveLocationBy(currentFilePosition, inputSTO); @@ -136,13 +153,9 @@ public class HMMProbabilityDistributionAnalyser while (filesRead < increments) { - FileParse parserSTO = new FileParse(inputSTO, "", - DataSourceType.FILE); - readStockholm(parserSTO); + readStockholm(inputSTO); - FileParse parserHMM = new FileParse(inputHMM, "", - DataSourceType.FILE); - readHMM(parserHMM); + readHMM(inputHMM); int count = countValidResidues(); processData(count); @@ -164,6 +177,93 @@ public class HMMProbabilityDistributionAnalyser } /** + * Analyses all families and then saves the data. Before analysing the data, + * the previous saved data will be imported and after analysing this, the data + * is exported back into the file. The file must be in flat file format. + * + * @param increments + * The number of families to read before saving. + * @throws IOException + */ + public void runToEnd(int minCount, int maxCount, boolean keepRawData, + boolean forClans) + throws IOException + { + keepRaw = keepRawData; + BufferedReader inputSTO = null; + BufferedReader inputHMM = null; + int size = 0; + int files = 1; + try + { + if (forClans) + { + files = 603; + } + int filesRead = 0; + for (int clan = 0; clan < files; clan++) + { + System.out.println(clan); + String clanPath = ""; + int numberOfFamilies = 0; + if (forClans) + { + clanPath = currentFolder + "/Clan" + clan; + if (!new File(clanPath).exists()) + { + continue; + } + BufferedReader famCountReader = new BufferedReader( + new FileReader(clanPath + "/NumberOfFamilies.txt")); + numberOfFamilies = Integer.parseInt(famCountReader.readLine()); + } + else + { + numberOfFamilies = 1; + } + + for (int fam = 0; fam < numberOfFamilies; fam++) + { + if (forClans) + { + families = clanPath + "/Families/Fam" + fam + ".sto"; + hmms = clanPath + "/HMMs/HMM" + fam + ".hmm"; + } + + inputSTO = new BufferedReader(new FileReader(families)); + inputHMM = new BufferedReader(new FileReader(hmms)); + + + int i = 0; + boolean endReached = atEnd(inputSTO); + while (!endReached) + { + readStockholm(inputSTO); + readHMM(inputHMM); + + int count = countValidResidues(); + if (count >= minCount && count < maxCount) + { + processData(count); + } + filesRead++; + System.out.println(filesRead); + endReached = atEnd(inputSTO); + } + } + } + } catch (Exception e) + { + e.printStackTrace(); + } finally + { + exportData(currentFolder); + raw.clear(); + binned.clear(); + } + } + + /** * Reads the previous data from both files * * @param source @@ -194,7 +294,9 @@ public class HMMProbabilityDistributionAnalyser { Scanner scanner = new Scanner(line); scanner.useDelimiter(","); - binned.put(scanner.next(), scanner.nextDouble()); + String key = scanner.next(); + String value = scanner.next(); + binned.put(key, Double.valueOf(value)); scanner.close(); line = input.readLine(); } @@ -242,6 +344,10 @@ public class HMMProbabilityDistributionAnalyser { raw.get(i).add(Double.parseDouble(value)); } + else + { + raw.get(i).add(null); + } i++; } @@ -263,14 +369,14 @@ public class HMMProbabilityDistributionAnalyser for (int width = 0; width < sequences.size(); width++) { - for (int length = 1; length < hmm.getLength(); length++) + for (int length = 1; length < hmm.getLength() + 1; length++) { char symbol; int alignPos; - alignPos = hmm.getNodeAlignmentColumn(length); + alignPos = hmm.getNodeMapPosition(length); symbol = sequences.get(width).getCharAt(alignPos); - if (ResidueProperties.aminoBackgroundFrequencies + if (ResidueProperties.backgroundFrequencies.get("amino") .containsKey(symbol)) { count++; @@ -294,24 +400,29 @@ public class HMMProbabilityDistributionAnalyser raw.add(new ArrayList()); rawPos = raw.size() - 1; } - + Double total = 0d; for (int width = 0; width < sequences.size(); width++) { - for (int length = 1; length < hmm.getLength(); length++) + for (int length = 1; length < hmm.getLength() + 1; length++) { char symbol; int alignPos; - alignPos = hmm.getNodeAlignmentColumn(length); + alignPos = hmm.getNodeMapPosition(length); symbol = sequences.get(width).getCharAt(alignPos); - if (ResidueProperties.aminoBackgroundFrequencies + if (ResidueProperties.backgroundFrequencies.get("amino") .containsKey(symbol)) { Double prob; Float bfreq; Double llr; prob = hmm.getMatchEmissionProbability(alignPos, symbol); - bfreq = ResidueProperties.aminoBackgroundFrequencies.get(symbol); + bfreq = ResidueProperties.backgroundFrequencies.get("amino") + .get(symbol); + if (prob == 0 || bfreq == 0) + { + System.out.println("error"); + } llr = Math.log(prob / bfreq); if (keepRaw) { @@ -320,6 +431,7 @@ public class HMMProbabilityDistributionAnalyser String output; output = String.format("%.1f", llr); + total += Double.parseDouble(output); if ("-0.0".equals(output)) { output = "0.0"; @@ -338,6 +450,7 @@ public class HMMProbabilityDistributionAnalyser } } } + System.out.println(total / count); } @@ -347,9 +460,19 @@ public class HMMProbabilityDistributionAnalyser * @param source * @throws IOException */ - public void readStockholm(FileParse source) throws IOException + public void readStockholm(BufferedReader inputSTO) throws IOException { - StockholmFile file = new StockholmFile(source); + FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE); + StockholmFile file = new StockholmFile(parserSTO); + Vector annots = file.getAnnotations(); + + for (AlignmentAnnotation annot : annots) + { + if (annot.label.contains("Reference")) + { + reference = annot; + } + } sequences = file.getSeqs(); } @@ -359,13 +482,13 @@ public class HMMProbabilityDistributionAnalyser * @param source * @throws IOException */ - public void readHMM(FileParse source) throws IOException + public void readHMM(BufferedReader inputHMM) throws IOException { - - HMMFile file = new HMMFile(source); - file.parse(); + FileParse parserHMM = new FileParse(inputHMM, "", DataSourceType.FILE); + HMMFile file = new HMMFile(parserHMM); hmm = file.getHMM(); + } /** @@ -431,7 +554,7 @@ public class HMMProbabilityDistributionAnalyser */ public void printFam(int index) throws IOException { - BufferedReader br = new BufferedReader(new FileReader(FAMILIES)); + BufferedReader br = new BufferedReader(new FileReader(families)); moveLocationBy(index, br); @@ -455,7 +578,7 @@ public class HMMProbabilityDistributionAnalyser */ public void printHMM(int index) throws IOException { - BufferedReader br = new BufferedReader(new FileReader(HMMS)); + BufferedReader br = new BufferedReader(new FileReader(hmms)); moveLocationBy(index, br); @@ -479,7 +602,7 @@ public class HMMProbabilityDistributionAnalyser */ public void exportFam(int index, String location) throws IOException { - BufferedReader br = new BufferedReader(new FileReader(FAMILIES)); + BufferedReader br = new BufferedReader(new FileReader(families)); moveLocationBy(index, br); @@ -497,12 +620,12 @@ public class HMMProbabilityDistributionAnalyser } - public void exportFile(BufferedReader br, String location) + public void exportFile(BufferedReader br, String location, boolean append) throws IOException { String line = br.readLine(); PrintWriter writer = new PrintWriter( - new FileOutputStream(new File(location), true)); + new FileOutputStream(location, append)); while (!"//".equals(line)) { writer.println(line); @@ -518,7 +641,7 @@ public class HMMProbabilityDistributionAnalyser { String name; - BufferedReader nameFinder = new BufferedReader(new FileReader(HMMS)); + BufferedReader nameFinder = new BufferedReader(new FileReader(hmms)); moveLocationBy(index, nameFinder); @@ -536,7 +659,7 @@ public class HMMProbabilityDistributionAnalyser String name; BufferedReader nameFinder = new BufferedReader( - new FileReader(FAMILIES)); + new FileReader(families)); moveLocationBy(index, nameFinder); @@ -551,7 +674,7 @@ public class HMMProbabilityDistributionAnalyser } /** - * Prints the specified family to a .hmm file in the current directory. + * Prints the specified family to a .hmm file. * * @param index * @throws IOException @@ -560,7 +683,7 @@ public class HMMProbabilityDistributionAnalyser { - BufferedReader br = new BufferedReader(new FileReader(HMMS)); + BufferedReader br = new BufferedReader(new FileReader(hmms)); moveLocationBy(index, br); @@ -603,9 +726,14 @@ public class HMMProbabilityDistributionAnalyser { BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN)); BufferedReader familyReader = new BufferedReader( - new FileReader(FAMILIES)); - BufferedReader hmmReader = new BufferedReader(new FileReader(HMMS)); + new FileReader(families)); + BufferedReader hmmReader = new BufferedReader(new FileReader(hmms)); + int families = 0; + // moveLocationBy(7000, familyReader); + // moveLocationBy(7000, clanFinder); + // moveLocationBy(7000, hmmReader); HashMap clanIndexes = new HashMap<>(); + ArrayList familyCounts = new ArrayList<>(); int filePos = 0; int clanCount = 0; String line; @@ -620,6 +748,8 @@ public class HMMProbabilityDistributionAnalyser if (line.indexOf("#=GF CL") > -1) { + families++; + System.out.println(families); inClan = true; Scanner scanner = new Scanner(line); scanner.next(); @@ -631,22 +761,29 @@ public class HMMProbabilityDistributionAnalyser { clanIndexes.put(clanName, clanCount); clanCount++; + familyCounts.add(0); } + Integer clanI = clanIndexes.get(clanName); String clanPath = directory + "/Clan" + clanI.toString(); - File clanFolder = new File(clanPath); - String famPath = clanPath + "/Families.sto"; - String hmmPath = clanPath + "/HMMs.hmm"; - if (!clanFolder.exists()) - { - clanFolder.mkdir(); - } - exportFile(familyReader, famPath); - exportFile(hmmReader, hmmPath); - + createFolders(clanPath); + + int index = clanIndexes.get(clanName); + exportFile(familyReader, + clanPath + "/Families/Fam" + familyCounts.get(index) + + ".sto", + false); + exportFile(hmmReader, + clanPath + "/HMMs/HMM" + familyCounts.get(index) + ".hmm", + false); + + int count = familyCounts.get(index); + count++; + familyCounts.set(index, count); } line = clanFinder.readLine(); + } if (!inClan) { @@ -654,14 +791,188 @@ public class HMMProbabilityDistributionAnalyser moveLocationBy(1, hmmReader); } filePos++; - System.out.println(filePos + " files read."); + // System.out.println(filePos + " files read."); line = clanFinder.readLine(); } clanFinder.close(); + + for (int clan = 0; clan < clanCount; clan++) + { + PrintWriter writer = new PrintWriter( + directory + "/Clan" + clan + "/NumberOfFamilies.txt"); + int count = familyCounts.get(clan); + writer.print(count); + writer.close(); + } } + + public String getFamilies() + { + return families; + } + + public void setFamilies(String families) + { + this.families = currentFolder + families; + } + + public String getHmms() + { + return hmms; + } + + public void setHmms(String hmms) + { + this.hmms = currentFolder + hmms; + } + public void alignWithinClan(String exportLocation, String clansLocation) + throws IOException, InterruptedException + { + int alignmentsExported = 0; + for (int clan = 0; clan < 604; clan++) + { + System.out.println(clan); + int famCount = 0; + String clanPath = clansLocation + "/Clan" + clan; + int numberOfFamilies; + BufferedReader br = new BufferedReader( + new FileReader(clanPath + "/NumberOfFamilies.txt")); + String line = br.readLine(); + numberOfFamilies = Integer.parseInt(line); + br.close(); + if (numberOfFamilies == 1) + { + continue; + } + final String commandExportLocation = exportLocation + "/Clan" + clan; + createFolders(commandExportLocation); + for (int family = 0; family < numberOfFamilies; family++) + { + famCount++; + ArrayList indexes = new ArrayList<>(); + for (int i = 0; i < numberOfFamilies; i++) + { + if (i != family) + { + indexes.add(i); + } + } + + int hmmIndex = getRandom(indexes); + String famPath = clanPath + "/Families/Fam" + family + ".sto"; + String hmmPath = clanPath + "/HMMs/HMM" + hmmIndex + ".hmm"; + String command = "/media/sf_Shared_Folder/hmmer/binaries/hmmalign --mapali " + + clanPath + "/Families/Fam" + hmmIndex + ".sto" + + " --trim "; + command += hmmPath + " "; + command += famPath; + final int familyIndex = family; + final Process p = Runtime.getRuntime().exec(command); + + new Thread(new Runnable() + { + @Override + public void run() + { + BufferedReader input = new BufferedReader( + new InputStreamReader(p.getInputStream())); + String line = null; + + try + { + PrintWriter writer = new PrintWriter(commandExportLocation + + "/Families/Fam" + familyIndex + ".sto"); + String lastLine = ""; + boolean dataFound = false; + while ((line = input.readLine()) != null) + { + if (line.contains("#=GR") && !dataFound) + { + writer.println(lastLine); + dataFound = true; + } + if (line.contains("#") || dataFound || " ".equals(line) + || "".equals(line) || "//".equals(line)) + { + writer.println(line); + } + lastLine = line; + } + writer.close(); + } catch (IOException e) + { + e.printStackTrace(); + } + } + }).start(); + + p.waitFor(); + + BufferedReader hmmExporter = new BufferedReader( + new FileReader(hmmPath)); + + exportFile(hmmExporter, + commandExportLocation + "/HMMs/HMM" + family + ".hmm", + false); + + alignmentsExported++; + + + } + PrintWriter writer = new PrintWriter( + commandExportLocation + "/NumberOfFamilies.txt"); + writer.print(famCount); + writer.close(); + } + + } + + public boolean atEnd(BufferedReader br) throws IOException + { + boolean end = false; + br.mark(80); + String line = br.readLine(); + if ("".equals(line) || line == null) + { + end = true; + } + br.reset(); + return end; } + public int getRandom(ArrayList list) + { + if (!(list.size() > 0)) + { + System.out.println("Error - size = " + list.size()); + } + int index = generator.nextInt(list.size()); + int value = list.get(index); + list.remove(index); + return value; + } + + public void createFolders(String clanPath) + { + File clanFolder = new File(clanPath); + if (!clanFolder.exists()) + { + clanFolder.mkdir(); + } + + File famFolder = new File(clanPath + "/Families"); + File hmmFolder = new File(clanPath + "/HMMs"); + if (!famFolder.exists()) + { + famFolder.mkdir(); + hmmFolder.mkdir(); + } + } +} + + +