+++ /dev/null
-package jalview.util;
-
-import jalview.datamodel.AlignmentAnnotation;
-import jalview.datamodel.HiddenMarkovModel;
-import jalview.datamodel.SequenceI;
-import jalview.io.DataSourceType;
-import jalview.io.FileParse;
-import jalview.io.HMMFile;
-import jalview.io.StockholmFile;
-import jalview.schemes.ResidueProperties;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.PrintWriter;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Scanner;
-import java.util.Vector;
-
-/**
- * Processes probability data. The file indexes used in this program represent
- * the index of the location of a family or hmm in their respective files,
- * starting from 0.
- *
- * @author TZVanaalten
- *
- */
-public class HMMProbabilityDistributionAnalyser
-{
- AlignmentAnnotation reference = null;
-
- Vector<SequenceI> sequences;
-
- HiddenMarkovModel hmm;
-
- // contains the raw data produced
- List<ArrayList<Double>> raw = new ArrayList<>();
-
- // contains binned data
- Map<String, Double> binned = new HashMap<>();
-
- // location of the family file
- String families = "/media/sf_Shared_Folder/PFAM/Family/SeedFamilies.seed";
-
- // location of the file containing the family-clan links
- final static String FAMILIESTOCLAN = "/media/sf_Shared_Folder/PFAM/Family/Clanlinks.dat";
-
- // location of the HMM file
- String hmms = "/media/sf_Shared_Folder/PFAM/HMMs/Pfam-A.hmm";
-
- // suffix for raw file
- final static String RAW = "/Raw.csv";
-
- // suffix for binned file
- final static String BINNED = "/Binned.csv";
-
- // normalisation scale
- final static double SCALE = 1;
-
- // current position in file
- int currentFilePosition = 0;
-
- final static String NL = "\n";
-
- Random generator = new Random();
-
- // current directory
- String currentFolder;
-
- boolean keepRaw = false;
-
- /**
- * Sets the working directory.
- *
- * @param path
- */
- public void setFolder(String path)
- {
- currentFolder = path;
- }
-
- /**
- * Moves a buffered reader forward in the file by a certain amount of entries.
- * Each entry in the file is delimited by '//'.
- *
- * @param index
- * The index of the location in the file.
- * @param br
- * @throws IOException
- */
- public void moveLocationBy(int index, BufferedReader br)
- throws IOException
- {
- for (int i = 0; i < index; i++)
- {
- String line = br.readLine();
- while (!"//".equals(line))
- {
- line = br.readLine();
-
- }
- }
-
- }
-
- /**
- * Analyses a specified number of families and then saves the data. Before
- * analysing the data, the previous saved data will be imported and after
- * analysing this, the data is exported back into the file. The file must be
- * in flat file format.
- *
- * @param increments
- * The number of families to read before saving.
- * @throws IOException
- */
- public void run(int increments, boolean keepRawData) throws IOException
- {
- keepRaw = keepRawData;
- try
- {
- readPreviousData(currentFolder);
- BufferedReader posReader = new BufferedReader(
- new FileReader(currentFolder + "/CurrentPosition.txt"));
-
- String line = posReader.readLine();
- posReader.close();
- currentFilePosition = Integer.parseInt(line);
- } catch (Exception e)
- {
- System.out.println("No previous data found");
- }
-
-
-
- BufferedReader inputSTO = new BufferedReader(new FileReader(families));
- BufferedReader inputHMM = new BufferedReader(new FileReader(hmms));
-
-
-
- moveLocationBy(currentFilePosition, inputHMM);
- moveLocationBy(currentFilePosition, inputSTO);
-
- int filesRead = 0;
- int i = 0;
- while (filesRead < increments)
- {
-
- readStockholm(inputSTO);
-
- readHMM(inputHMM);
-
- int count = countValidResidues();
- processData(count);
- filesRead++;
-
- currentFilePosition++;
- System.out.println(i);
- i++;
- }
-
- PrintWriter p = new PrintWriter(
- new File(currentFolder + "/CurrentPosition.txt"));
- p.print(currentFilePosition);
- p.close();
- exportData(currentFolder);
- raw.clear();
- binned.clear();
-
- }
-
- /**
- * Analyses all families and then saves the data. Before analysing the data,
- * the previous saved data will be imported and after analysing this, the data
- * is exported back into the file. The file must be in flat file format.
- *
- * @param increments
- * The number of families to read before saving.
- * @throws IOException
- */
- public void runToEnd(int minCount, int maxCount, boolean keepRawData,
- boolean forClans)
- throws IOException
- {
- keepRaw = keepRawData;
- BufferedReader inputSTO = null;
- BufferedReader inputHMM = null;
- int size = 0;
- int files = 1;
- try
- {
- if (forClans)
- {
- files = 603;
- }
- int filesRead = 0;
- for (int clan = 0; clan < files; clan++)
- {
- System.out.println(clan);
- String clanPath = "";
- int numberOfFamilies = 0;
- if (forClans)
- {
- clanPath = currentFolder + "/Clan" + clan;
- if (!new File(clanPath).exists())
- {
- continue;
- }
- BufferedReader famCountReader = new BufferedReader(
- new FileReader(clanPath + "/NumberOfFamilies.txt"));
- numberOfFamilies = Integer.parseInt(famCountReader.readLine());
- }
- else
- {
- numberOfFamilies = 1;
- }
-
- for (int fam = 0; fam < numberOfFamilies; fam++)
- {
- if (forClans)
- {
- families = clanPath + "/Families/Fam" + fam + ".sto";
- hmms = clanPath + "/HMMs/HMM" + fam + ".hmm";
- }
-
- inputSTO = new BufferedReader(new FileReader(families));
- inputHMM = new BufferedReader(new FileReader(hmms));
-
-
- int i = 0;
- boolean endReached = atEnd(inputSTO);
- while (!endReached)
- {
- readStockholm(inputSTO);
- readHMM(inputHMM);
-
- int count = countValidResidues();
- if (count >= minCount && count < maxCount)
- {
- processData(count);
- }
- filesRead++;
- System.out.println(filesRead);
- endReached = atEnd(inputSTO);
- }
- }
- }
- } catch (Exception e)
- {
- e.printStackTrace();
- } finally
- {
- exportData(currentFolder);
- raw.clear();
- binned.clear();
- }
- }
-
- /**
- * Reads the previous data from both files
- *
- * @param source
- * @throws IOException
- */
- public void readPreviousData(String source) throws IOException
- {
- readBinned(source);
- if (keepRaw)
- {
- readRaw(source);
- }
- }
-
- /**
- * Reads the previous data from the binned file.
- *
- * @param source
- * @throws IOException
- */
- public void readBinned(String source) throws IOException
- {
- BufferedReader input = new BufferedReader(
- new FileReader(source + BINNED));
- String line = input.readLine();
- binned = new HashMap<>();
- while (!("".equals(line) || line == null))
- {
- Scanner scanner = new Scanner(line);
- scanner.useDelimiter(",");
- String key = scanner.next();
- String value = scanner.next();
- binned.put(key, Double.valueOf(value));
- scanner.close();
- line = input.readLine();
- }
-
- input.close();
- }
-
- /**
- * Reads the previous data from the raw file.
- *
- * @param source
- * @throws IOException
- */
- public void readRaw(String source) throws IOException
- {
- BufferedReader input = new BufferedReader(new FileReader(source + RAW));
- String line = input.readLine();
- if (line == null)
- {
- input.close();
- return;
- }
- Scanner numberScanner = new Scanner(line);
- numberScanner.useDelimiter(",");
- raw = new ArrayList<>();
- while (numberScanner.hasNext())
- {
- numberScanner.next();
- raw.add(new ArrayList<Double>());
- }
- numberScanner.close();
-
- line = input.readLine();
- while (!("".equals(line) || line == null))
- {
- Scanner scanner = new Scanner(line);
- scanner.useDelimiter(",");
-
- int i = 0;
- while (scanner.hasNext())
- {
- String value;
- value = scanner.next();
- if (!value.equals("EMPTY"))
- {
- raw.get(i).add(Double.parseDouble(value));
- }
- else
- {
- raw.get(i).add(null);
- }
-
- i++;
- }
- scanner.close();
- line = input.readLine();
- }
-
- input.close();
- }
-
- /**
- * Counts the number of valid residues in the sequence.
- *
- * @return
- */
- public int countValidResidues()
- {
- int count = 0;
-
- for (int width = 0; width < sequences.size(); width++)
- {
- for (int length = 1; length < hmm.getLength() + 1; length++)
- {
- char symbol;
- int alignPos;
- alignPos = hmm.getNodeMapPosition(length);
-
- symbol = sequences.get(width).getCharAt(alignPos);
- if (ResidueProperties.backgroundFrequencies.get("amino")
- .containsKey(symbol))
- {
- count++;
- }
- }
- }
-
- return count;
- }
-
- /**
- * Processes data, and stores it in both a raw and binned format.
- *
- * @param count
- */
- public void processData(int count)
- {
- int rawPos = 0;
- if (keepRaw)
- {
- raw.add(new ArrayList<Double>());
- rawPos = raw.size() - 1;
- }
- Double total = 0d;
- for (int width = 0; width < sequences.size(); width++)
- {
- for (int length = 1; length < hmm.getLength() + 1; length++)
- {
- char symbol;
- int alignPos;
- alignPos = hmm.getNodeMapPosition(length);
-
- symbol = sequences.get(width).getCharAt(alignPos);
- if (ResidueProperties.backgroundFrequencies.get("amino")
- .containsKey(symbol))
- {
- Double prob;
- Float bfreq;
- Double llr;
- prob = hmm.getMatchEmissionProbability(alignPos, symbol);
- bfreq = ResidueProperties.backgroundFrequencies.get("amino")
- .get(symbol);
- if (prob == 0 || bfreq == 0)
- {
- System.out.println("error");
- }
- llr = Math.log(prob / bfreq);
- if (keepRaw)
- {
- raw.get(rawPos).add(llr);
- }
-
- String output;
- output = String.format("%.1f", llr);
- total += Double.parseDouble(output);
- if ("-0.0".equals(output))
- {
- output = "0.0";
- }
- if (binned.containsKey(output))
- {
- double prev = binned.get(output);
- prev += (SCALE / count);
- binned.put(output, prev);
-
- }
- else
- {
- binned.put(output, SCALE / count);
- }
- }
- }
- }
- System.out.println(total / count);
- }
-
-
- /**
- * Reads in the sequence data from a Stockholm file.
- *
- * @param source
- * @throws IOException
- */
- public void readStockholm(BufferedReader inputSTO) throws IOException
- {
- FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE);
- StockholmFile file = new StockholmFile(parserSTO);
- Vector<AlignmentAnnotation> annots = file.getAnnotations();
-
- for (AlignmentAnnotation annot : annots)
- {
- if (annot.label.contains("Reference"))
- {
- reference = annot;
- }
- }
- sequences = file.getSeqs();
- }
-
- /**
- * Reads in the HMM data from a HMMer file.
- *
- * @param source
- * @throws IOException
- */
- public void readHMM(BufferedReader inputHMM) throws IOException
- {
- FileParse parserHMM = new FileParse(inputHMM, "", DataSourceType.FILE);
- HMMFile file = new HMMFile(parserHMM);
- hmm = file.getHMM();
-
-
- }
-
- /**
- * Exports both the binned and raw data into separate files.
- *
- * @param location
- * @throws FileNotFoundException
- */
- public void exportData(String location) throws FileNotFoundException
- {
- PrintWriter writerBin = new PrintWriter(new File(location + BINNED));
- for (Map.Entry<String, Double> entry : binned.entrySet())
- {
- writerBin.println(entry.getKey() + "," + entry.getValue());
- }
- writerBin.close();
- if (keepRaw)
- {
-
- PrintWriter writerRaw = new PrintWriter(new File(location + RAW));
-
- StringBuilder identifier = new StringBuilder();
-
- for (int i = 1; i < raw.size() + 1; i++)
- {
- identifier.append("Fam " + i + ",");
- }
-
- writerRaw.println(identifier);
-
- boolean rowIsEmpty = false;
- int row = 0;
- while (!rowIsEmpty)
- {
- rowIsEmpty = true;
- StringBuilder string = new StringBuilder();
- for (int column = 0; column < raw.size(); column++)
- {
- if (raw.get(column).size() <= row)
- {
- string.append("EMPTY,");
- }
- else
- {
- string.append(raw.get(column).get(row) + ",");
- rowIsEmpty = false;
- }
- }
- row++;
- writerRaw.println(string);
- }
- writerRaw.close();
-
- }
-
- }
-
- /**
- * Prints the specified family on the console.
- *
- * @param index
- * @throws IOException
- */
- public void printFam(int index) throws IOException
- {
- BufferedReader br = new BufferedReader(new FileReader(families));
-
- moveLocationBy(index, br);
-
- String line = br.readLine();
-
- while (!"//".equals(line))
- {
- System.out.println(line);
- line = br.readLine();
- }
- System.out.println(line);
- br.close();
-
- }
-
- /**
- * Prints the specified HMM on the console.
- *
- * @param index
- * @throws IOException
- */
- public void printHMM(int index) throws IOException
- {
- BufferedReader br = new BufferedReader(new FileReader(hmms));
-
- moveLocationBy(index, br);
-
- String line = br.readLine();
-
- while (!"//".equals(line))
- {
- System.out.println(line);
- line = br.readLine();
- }
- System.out.println(line);
- br.close();
-
- }
-
- /**
- * Prints the specified family to a .sto file.
- *
- * @param index
- * @throws IOException
- */
- public void exportFam(int index, String location) throws IOException
- {
- BufferedReader br = new BufferedReader(new FileReader(families));
-
- moveLocationBy(index, br);
-
- String line = br.readLine();
- PrintWriter writer = new PrintWriter(
- new FileOutputStream(new File(location), true));
- while (!"//".equals(line))
- {
- writer.println(line);
- line = br.readLine();
- }
- writer.println(line);
- writer.close();
- br.close();
-
- }
-
- public void exportFile(BufferedReader br, String location, boolean append)
- throws IOException
- {
- String line = br.readLine();
- PrintWriter writer = new PrintWriter(
- new FileOutputStream(location, append));
- while (!"//".equals(line))
- {
- writer.println(line);
- line = br.readLine();
- }
- writer.println(line);
- writer.close();
-
-
- }
-
- public String getHMMName(int index) throws IOException
- {
- String name;
-
- BufferedReader nameFinder = new BufferedReader(new FileReader(hmms));
-
- moveLocationBy(index, nameFinder);
-
- nameFinder.readLine();
-
- Scanner scanner = new Scanner(nameFinder.readLine());
- name = scanner.next();
- name = scanner.next();
- scanner.close();
- return name;
- }
-
- public String getFamilyName(int index) throws IOException
- {
- String name;
-
- BufferedReader nameFinder = new BufferedReader(
- new FileReader(families));
-
- moveLocationBy(index, nameFinder);
-
- nameFinder.readLine();
-
- Scanner scanner = new Scanner(nameFinder.readLine());
- name = scanner.next();
- name = scanner.next();
- name = scanner.next();
- scanner.close();
- return name;
- }
-
- /**
- * Prints the specified family to a .hmm file.
- *
- * @param index
- * @throws IOException
- */
- public void exportHMM(int index, String location) throws IOException
- {
-
-
- BufferedReader br = new BufferedReader(new FileReader(hmms));
-
- moveLocationBy(index, br);
-
- String line = br.readLine();
-
- PrintWriter writer = new PrintWriter(
- new FileOutputStream(new File(location), true));
- while (!"//".equals(line))
- {
- writer.println(line);
- line = br.readLine();
- }
- writer.println(line);
- writer.close();
- br.close();
-
- }
-
- /**
- * Clears all raw, binned and current position data in the current directory.
- *
- * @throws FileNotFoundException
- */
- public void clear() throws FileNotFoundException
- {
- PrintWriter pos = new PrintWriter(
- currentFolder + "/CurrentPosition.txt");
- pos.println("0");
-
- PrintWriter raw = new PrintWriter(currentFolder + RAW);
-
- PrintWriter bin = new PrintWriter(currentFolder + BINNED);
-
- pos.close();
- bin.close();
- raw.close();
- }
-
- public void sortIntoClans(String directory) throws IOException
- {
- BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN));
- BufferedReader familyReader = new BufferedReader(
- new FileReader(families));
- BufferedReader hmmReader = new BufferedReader(new FileReader(hmms));
- int families = 0;
- // moveLocationBy(7000, familyReader);
- // moveLocationBy(7000, clanFinder);
- // moveLocationBy(7000, hmmReader);
- HashMap<String, Integer> clanIndexes = new HashMap<>();
- ArrayList<Integer> familyCounts = new ArrayList<>();
- int filePos = 0;
- int clanCount = 0;
- String line;
- line = clanFinder.readLine();
-
- while (!"".equals(line) && !" ".equals(line) && line != null)
- {
- String clanName;
- boolean inClan = false;
- while (!(line.indexOf("//") > -1))
- {
-
- if (line.indexOf("#=GF CL") > -1)
- {
- families++;
- System.out.println(families);
- inClan = true;
- Scanner scanner = new Scanner(line);
- scanner.next();
- scanner.next();
- clanName = scanner.next();
- scanner.close();
-
- if (!clanIndexes.containsKey(clanName))
- {
- clanIndexes.put(clanName, clanCount);
- clanCount++;
- familyCounts.add(0);
- }
-
-
- Integer clanI = clanIndexes.get(clanName);
- String clanPath = directory + "/Clan" + clanI.toString();
- createFolders(clanPath);
-
- int index = clanIndexes.get(clanName);
- exportFile(familyReader,
- clanPath + "/Families/Fam" + familyCounts.get(index)
- + ".sto",
- false);
- exportFile(hmmReader,
- clanPath + "/HMMs/HMM" + familyCounts.get(index) + ".hmm",
- false);
-
- int count = familyCounts.get(index);
- count++;
- familyCounts.set(index, count);
- }
- line = clanFinder.readLine();
-
- }
- if (!inClan)
- {
- moveLocationBy(1, familyReader);
- moveLocationBy(1, hmmReader);
- }
- filePos++;
- // System.out.println(filePos + " files read.");
- line = clanFinder.readLine();
-
- }
- clanFinder.close();
-
- for (int clan = 0; clan < clanCount; clan++)
- {
- PrintWriter writer = new PrintWriter(
- directory + "/Clan" + clan + "/NumberOfFamilies.txt");
- int count = familyCounts.get(clan);
- writer.print(count);
- writer.close();
- }
-
- }
-
- public String getFamilies()
- {
- return families;
- }
-
- public void setFamilies(String families)
- {
- this.families = currentFolder + families;
- }
-
- public String getHmms()
- {
- return hmms;
- }
-
- public void setHmms(String hmms)
- {
- this.hmms = currentFolder + hmms;
- }
-
- public void alignWithinClan(String exportLocation, String clansLocation)
- throws IOException, InterruptedException
- {
- int alignmentsExported = 0;
- for (int clan = 0; clan < 604; clan++)
- {
- System.out.println(clan);
- int famCount = 0;
- String clanPath = clansLocation + "/Clan" + clan;
- int numberOfFamilies;
- BufferedReader br = new BufferedReader(
- new FileReader(clanPath + "/NumberOfFamilies.txt"));
- String line = br.readLine();
- numberOfFamilies = Integer.parseInt(line);
- br.close();
- if (numberOfFamilies == 1)
- {
- continue;
- }
- final String commandExportLocation = exportLocation + "/Clan" + clan;
- createFolders(commandExportLocation);
- for (int family = 0; family < numberOfFamilies; family++)
- {
- famCount++;
- ArrayList<Integer> indexes = new ArrayList<>();
- for (int i = 0; i < numberOfFamilies; i++)
- {
- if (i != family)
- {
- indexes.add(i);
- }
- }
-
- int hmmIndex = getRandom(indexes);
- String famPath = clanPath + "/Families/Fam" + family + ".sto";
- String hmmPath = clanPath + "/HMMs/HMM" + hmmIndex + ".hmm";
- String command = "/media/sf_Shared_Folder/hmmer/binaries/hmmalign --mapali "
- + clanPath + "/Families/Fam" + hmmIndex + ".sto"
- + " --trim ";
- command += hmmPath + " ";
- command += famPath;
- final int familyIndex = family;
- final Process p = Runtime.getRuntime().exec(command);
-
- new Thread(new Runnable()
- {
- @Override
- public void run()
- {
- BufferedReader input = new BufferedReader(
- new InputStreamReader(p.getInputStream()));
- String line = null;
-
- try
- {
- PrintWriter writer = new PrintWriter(commandExportLocation
- + "/Families/Fam" + familyIndex + ".sto");
- String lastLine = "";
- boolean dataFound = false;
- while ((line = input.readLine()) != null)
- {
- if (line.contains("#=GR") && !dataFound)
- {
- writer.println(lastLine);
- dataFound = true;
- }
- if (line.contains("#") || dataFound || " ".equals(line)
- || "".equals(line) || "//".equals(line))
- {
- writer.println(line);
- }
- lastLine = line;
- }
- writer.close();
- } catch (IOException e)
- {
- e.printStackTrace();
- }
- }
- }).start();
-
- p.waitFor();
-
- BufferedReader hmmExporter = new BufferedReader(
- new FileReader(hmmPath));
-
- exportFile(hmmExporter,
- commandExportLocation + "/HMMs/HMM" + family + ".hmm",
- false);
-
- alignmentsExported++;
-
-
- }
- PrintWriter writer = new PrintWriter(
- commandExportLocation + "/NumberOfFamilies.txt");
- writer.print(famCount);
- writer.close();
- }
-
- }
-
- public boolean atEnd(BufferedReader br) throws IOException
- {
- boolean end = false;
- br.mark(80);
- String line = br.readLine();
- if ("".equals(line) || line == null)
- {
- end = true;
- }
- br.reset();
- return end;
- }
-
- public int getRandom(ArrayList<Integer> list)
- {
- if (!(list.size() > 0))
- {
- System.out.println("Error - size = " + list.size());
- }
- int index = generator.nextInt(list.size());
- int value = list.get(index);
- list.remove(index);
- return value;
- }
-
- public void createFolders(String clanPath)
- {
- File clanFolder = new File(clanPath);
- if (!clanFolder.exists())
- {
- clanFolder.mkdir();
- }
-
- File famFolder = new File(clanPath + "/Families");
- File hmmFolder = new File(clanPath + "/HMMs");
- if (!famFolder.exists())
- {
- famFolder.mkdir();
- hmmFolder.mkdir();
- }
- }
-}
-
-
-
-