package jalview.util;
+import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.HiddenMarkovModel;
import jalview.datamodel.SequenceI;
import jalview.io.DataSourceType;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Random;
import java.util.Scanner;
import java.util.Vector;
+/**
+ * Processes probability data. The file indexes used in this program represent
+ * the index of the location of a family or hmm in their respective files,
+ * starting from 0.
+ *
+ * @author TZVanaalten
+ *
+ */
public class HMMProbabilityDistributionAnalyser
{
+ AlignmentAnnotation reference = null;
Vector<SequenceI> sequences;
HiddenMarkovModel hmm;
+ // contains the raw data produced
List<ArrayList<Double>> raw = new ArrayList<>();
+ // contains binned data
Map<String, Double> binned = new HashMap<>();
- final static String FAMILIES = "C:/Users/TZVanaalten/Pfam-A.full";
+ // location of the family file
+ String families = "/media/sf_Shared_Folder/PFAM/Family/SeedFamilies.seed";
- final static String HMMS = "H:/Desktop/PFAM/HMMs/Pfam-A.hmm";
+ // location of the file containing the family-clan links
+ final static String FAMILIESTOCLAN = "/media/sf_Shared_Folder/PFAM/Family/Clanlinks.dat";
+ // location of the HMM file
+ String hmms = "/media/sf_Shared_Folder/PFAM/HMMs/Pfam-A.hmm";
+
+ // suffix for raw file
final static String RAW = "/Raw.csv";
+ // suffix for binned file
final static String BINNED = "/Binned.csv";
- final static double SCALE = 100000;
+ // normalisation scale
+ final static double SCALE = 1;
+ // current position in file
int currentFilePosition = 0;
final static String NL = "\n";
+ Random generator = new Random();
+
+ // current directory
String currentFolder;
+ boolean keepRaw = false;
+
+ /**
+ * Sets the working directory.
+ *
+ * @param path
+ */
public void setFolder(String path)
{
currentFolder = path;
}
- public void moveToFile(int index, BufferedReader br) throws IOException
+ /**
+ * Moves a buffered reader forward in the file by a certain amount of entries.
+ * Each entry in the file is delimited by '//'.
+ *
+ * @param index
+ * The index of the location in the file.
+ * @param br
+ * @throws IOException
+ */
+ public void moveLocationBy(int index, BufferedReader br)
+ throws IOException
{
for (int i = 0; i < index; i++)
{
while (!"//".equals(line))
{
line = br.readLine();
+
}
}
}
+
/**
- * Analyses probability data
+ * Analyses a specified number of families and then saves the data. Before
+ * analysing the data, the previous saved data will be imported and after
+ * analysing this, the data is exported back into the file. The file must be
+ * in flat file format.
*
- * @param args
+ * @param increments
+ * The number of families to read before saving.
* @throws IOException
*/
- public void run(int increments) throws IOException
+ public void run(int increments, boolean keepRawData) throws IOException
{
+ keepRaw = keepRawData;
+ try
+ {
+ readPreviousData(currentFolder);
+ BufferedReader posReader = new BufferedReader(
+ new FileReader(currentFolder + "/CurrentPosition.txt"));
+
+ String line = posReader.readLine();
+ posReader.close();
+ currentFilePosition = Integer.parseInt(line);
+ } catch (Exception e)
+ {
+ System.out.println("No previous data found");
+ }
+
+
- readPreviousData(currentFolder);
+ BufferedReader inputSTO = new BufferedReader(new FileReader(families));
+ BufferedReader inputHMM = new BufferedReader(new FileReader(hmms));
- BufferedReader posReader = new BufferedReader(
- new FileReader(currentFolder + "/CurrentPosition.txt"));
- String line = posReader.readLine();
- posReader.close();
- currentFilePosition = Integer.parseInt(line);
- BufferedReader inputSTO = new BufferedReader(
- new FileReader(FAMILIES));
- BufferedReader inputHMM = new BufferedReader(
- new FileReader(HMMS));
- moveToFile(currentFilePosition, inputHMM);
- moveToFile(currentFilePosition, inputSTO);
+ moveLocationBy(currentFilePosition, inputHMM);
+ moveLocationBy(currentFilePosition, inputSTO);
int filesRead = 0;
+ int i = 0;
while (filesRead < increments)
{
- FileParse parserSTO = new FileParse(inputSTO, "",
- DataSourceType.FILE);
- readStockholm(parserSTO);
- FileParse parserHMM = new FileParse(inputHMM, "",
- DataSourceType.FILE);
- readHMM(parserHMM);
+ readStockholm(inputSTO);
+
+ readHMM(inputHMM);
- if (hmm.getAlphabetType().equals("amino"))
- {
int count = countValidResidues();
processData(count);
filesRead++;
- }
+
currentFilePosition++;
+ System.out.println(i);
+ i++;
}
PrintWriter p = new PrintWriter(
- new File(currentFolder + "/CurrentPosition"));
+ new File(currentFolder + "/CurrentPosition.txt"));
p.print(currentFilePosition);
p.close();
exportData(currentFolder);
}
+ /**
+ * Analyses all families and then saves the data. Before analysing the data,
+ * the previous saved data will be imported and after analysing this, the data
+ * is exported back into the file. The file must be in flat file format.
+ *
+ * @param increments
+ * The number of families to read before saving.
+ * @throws IOException
+ */
+ public void runToEnd(int minCount, int maxCount, boolean keepRawData,
+ boolean forClans)
+ throws IOException
+ {
+ keepRaw = keepRawData;
+ BufferedReader inputSTO = null;
+ BufferedReader inputHMM = null;
+ int size = 0;
+ int files = 1;
+ try
+ {
+ if (forClans)
+ {
+ files = 603;
+ }
+ int filesRead = 0;
+ for (int clan = 0; clan < files; clan++)
+ {
+ System.out.println(clan);
+ String clanPath = "";
+ int numberOfFamilies = 0;
+ if (forClans)
+ {
+ clanPath = currentFolder + "/Clan" + clan;
+ if (!new File(clanPath).exists())
+ {
+ continue;
+ }
+ BufferedReader famCountReader = new BufferedReader(
+ new FileReader(clanPath + "/NumberOfFamilies.txt"));
+ numberOfFamilies = Integer.parseInt(famCountReader.readLine());
+ }
+ else
+ {
+ numberOfFamilies = 1;
+ }
+
+ for (int fam = 0; fam < numberOfFamilies; fam++)
+ {
+ if (forClans)
+ {
+ families = clanPath + "/Families/Fam" + fam + ".sto";
+ hmms = clanPath + "/HMMs/HMM" + fam + ".hmm";
+ }
+
+ inputSTO = new BufferedReader(new FileReader(families));
+ inputHMM = new BufferedReader(new FileReader(hmms));
+
+
+ int i = 0;
+ boolean endReached = atEnd(inputSTO);
+ while (!endReached)
+ {
+ readStockholm(inputSTO);
+ readHMM(inputHMM);
+
+ int count = countValidResidues();
+ if (count >= minCount && count < maxCount)
+ {
+ processData(count);
+ }
+ filesRead++;
+ System.out.println(filesRead);
+ endReached = atEnd(inputSTO);
+ }
+ }
+ }
+ } catch (Exception e)
+ {
+ e.printStackTrace();
+ } finally
+ {
+ exportData(currentFolder);
+ raw.clear();
+ binned.clear();
+ }
+ }
+
+ /**
+ * Reads the previous data from both files
+ *
+ * @param source
+ * @throws IOException
+ */
public void readPreviousData(String source) throws IOException
{
readBinned(source);
- readRaw(source);
+ if (keepRaw)
+ {
+ readRaw(source);
+ }
}
+ /**
+ * Reads the previous data from the binned file.
+ *
+ * @param source
+ * @throws IOException
+ */
public void readBinned(String source) throws IOException
{
BufferedReader input = new BufferedReader(
new FileReader(source + BINNED));
String line = input.readLine();
+ binned = new HashMap<>();
while (!("".equals(line) || line == null))
{
- binned = new HashMap<>();
Scanner scanner = new Scanner(line);
scanner.useDelimiter(",");
- binned.put(scanner.next(), scanner.nextDouble());
+ String key = scanner.next();
+ String value = scanner.next();
+ binned.put(key, Double.valueOf(value));
scanner.close();
line = input.readLine();
}
input.close();
}
+ /**
+ * Reads the previous data from the raw file.
+ *
+ * @param source
+ * @throws IOException
+ */
public void readRaw(String source) throws IOException
{
BufferedReader input = new BufferedReader(new FileReader(source + RAW));
{
raw.get(i).add(Double.parseDouble(value));
}
+ else
+ {
+ raw.get(i).add(null);
+ }
i++;
}
input.close();
}
+ /**
+ * Counts the number of valid residues in the sequence.
+ *
+ * @return
+ */
public int countValidResidues()
{
int count = 0;
for (int width = 0; width < sequences.size(); width++)
{
- for (int length = 1; length < hmm.getLength(); length++)
+ for (int length = 1; length < hmm.getLength() + 1; length++)
{
char symbol;
int alignPos;
- alignPos = hmm.getNodeAlignmentColumn(length);
+ alignPos = hmm.getNodeMapPosition(length);
symbol = sequences.get(width).getCharAt(alignPos);
- if (ResidueProperties.aminoBackgroundFrequencies
+ if (ResidueProperties.backgroundFrequencies.get("amino")
.containsKey(symbol))
{
count++;
return count;
}
+ /**
+ * Processes data, and stores it in both a raw and binned format.
+ *
+ * @param count
+ */
public void processData(int count)
{
-
- raw.add(new ArrayList<Double>());
- int rawPos = raw.size() - 1;
+ int rawPos = 0;
+ if (keepRaw)
+ {
+ raw.add(new ArrayList<Double>());
+ rawPos = raw.size() - 1;
+ }
+ Double total = 0d;
for (int width = 0; width < sequences.size(); width++)
{
- for (int length = 1; length < hmm.getLength(); length++)
+ for (int length = 1; length < hmm.getLength() + 1; length++)
{
char symbol;
int alignPos;
- alignPos = hmm.getNodeAlignmentColumn(length);
-
+ alignPos = hmm.getNodeMapPosition(length);
+
symbol = sequences.get(width).getCharAt(alignPos);
- if (ResidueProperties.aminoBackgroundFrequencies
+ if (ResidueProperties.backgroundFrequencies.get("amino")
.containsKey(symbol))
{
-
Double prob;
Float bfreq;
Double llr;
prob = hmm.getMatchEmissionProbability(alignPos, symbol);
- bfreq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
+ bfreq = ResidueProperties.backgroundFrequencies.get("amino")
+ .get(symbol);
+ if (prob == 0 || bfreq == 0)
+ {
+ System.out.println("error");
+ }
llr = Math.log(prob / bfreq);
- raw.get(rawPos).add(llr);
+ if (keepRaw)
+ {
+ raw.get(rawPos).add(llr);
+ }
+
String output;
output = String.format("%.1f", llr);
+ total += Double.parseDouble(output);
if ("-0.0".equals(output))
{
output = "0.0";
}
}
}
+ System.out.println(total / count);
}
- public void readStockholm(FileParse source) throws IOException
+ /**
+ * Reads in the sequence data from a Stockholm file.
+ *
+ * @param source
+ * @throws IOException
+ */
+ public void readStockholm(BufferedReader inputSTO) throws IOException
{
- StockholmFile file = new StockholmFile(source);
- file.parse();
+ FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE);
+ StockholmFile file = new StockholmFile(parserSTO);
+ Vector<AlignmentAnnotation> annots = file.getAnnotations();
+
+ for (AlignmentAnnotation annot : annots)
+ {
+ if (annot.label.contains("Reference"))
+ {
+ reference = annot;
+ }
+ }
sequences = file.getSeqs();
}
- public void readHMM(FileParse source) throws IOException
+ /**
+ * Reads in the HMM data from a HMMer file.
+ *
+ * @param source
+ * @throws IOException
+ */
+ public void readHMM(BufferedReader inputHMM) throws IOException
{
-
- HMMFile file = new HMMFile(source);
- file.parse();
+ FileParse parserHMM = new FileParse(inputHMM, "", DataSourceType.FILE);
+ HMMFile file = new HMMFile(parserHMM);
hmm = file.getHMM();
+
}
+ /**
+ * Exports both the binned and raw data into separate files.
+ *
+ * @param location
+ * @throws FileNotFoundException
+ */
public void exportData(String location) throws FileNotFoundException
{
PrintWriter writerBin = new PrintWriter(new File(location + BINNED));
writerBin.println(entry.getKey() + "," + entry.getValue());
}
writerBin.close();
+ if (keepRaw)
+ {
PrintWriter writerRaw = new PrintWriter(new File(location + RAW));
-
+
StringBuilder identifier = new StringBuilder();
-
+
for (int i = 1; i < raw.size() + 1; i++)
{
identifier.append("Fam " + i + ",");
}
-
+
writerRaw.println(identifier);
-
+
boolean rowIsEmpty = false;
int row = 0;
while (!rowIsEmpty)
}
writerRaw.close();
+ }
+
}
+ /**
+ * Prints the specified family on the console.
+ *
+ * @param index
+ * @throws IOException
+ */
public void printFam(int index) throws IOException
{
- BufferedReader br = new BufferedReader(new FileReader(FAMILIES));
+ BufferedReader br = new BufferedReader(new FileReader(families));
- moveToFile(index, br);
+ moveLocationBy(index, br);
String line = br.readLine();
}
+ /**
+ * Prints the specified HMM on the console.
+ *
+ * @param index
+ * @throws IOException
+ */
public void printHMM(int index) throws IOException
{
- BufferedReader br = new BufferedReader(new FileReader(HMMS));
+ BufferedReader br = new BufferedReader(new FileReader(hmms));
- moveToFile(index, br);
+ moveLocationBy(index, br);
String line = br.readLine();
}
- public void printFamToFile(int index) throws IOException
+ /**
+ * Prints the specified family to a .sto file.
+ *
+ * @param index
+ * @throws IOException
+ */
+ public void exportFam(int index, String location) throws IOException
{
- String name;
-
- BufferedReader nameFinder = new BufferedReader(
- new FileReader(FAMILIES));
+ BufferedReader br = new BufferedReader(new FileReader(families));
- moveToFile(index, nameFinder);
-
- nameFinder.readLine();
-
- Scanner scanner = new Scanner(nameFinder.readLine());
- scanner.next();
- scanner.next();
- name = scanner.next();
- scanner.close();
+ moveLocationBy(index, br);
- BufferedReader br = new BufferedReader(new FileReader(FAMILIES));
+ String line = br.readLine();
+ PrintWriter writer = new PrintWriter(
+ new FileOutputStream(new File(location), true));
+ while (!"//".equals(line))
+ {
+ writer.println(line);
+ line = br.readLine();
+ }
+ writer.println(line);
+ writer.close();
+ br.close();
- moveToFile(index, br);
+ }
+ public void exportFile(BufferedReader br, String location, boolean append)
+ throws IOException
+ {
String line = br.readLine();
PrintWriter writer = new PrintWriter(
- currentFolder + "/" + name + ".sto");
+ new FileOutputStream(location, append));
while (!"//".equals(line))
{
writer.println(line);
}
writer.println(line);
writer.close();
- br.close();
+
}
- public void printHMMToFile(int index) throws IOException
+ public String getHMMName(int index) throws IOException
{
+ String name;
+
+ BufferedReader nameFinder = new BufferedReader(new FileReader(hmms));
+ moveLocationBy(index, nameFinder);
+
+ nameFinder.readLine();
+
+ Scanner scanner = new Scanner(nameFinder.readLine());
+ name = scanner.next();
+ name = scanner.next();
+ scanner.close();
+ return name;
+ }
+
+ public String getFamilyName(int index) throws IOException
+ {
String name;
- BufferedReader nameFinder = new BufferedReader(new FileReader(HMMS));
+ BufferedReader nameFinder = new BufferedReader(
+ new FileReader(families));
- moveToFile(index, nameFinder);
+ moveLocationBy(index, nameFinder);
nameFinder.readLine();
Scanner scanner = new Scanner(nameFinder.readLine());
name = scanner.next();
name = scanner.next();
+ name = scanner.next();
scanner.close();
+ return name;
+ }
+
+ /**
+ * Prints the specified family to a .hmm file.
+ *
+ * @param index
+ * @throws IOException
+ */
+ public void exportHMM(int index, String location) throws IOException
+ {
+
- BufferedReader br = new BufferedReader(new FileReader(HMMS));
+ BufferedReader br = new BufferedReader(new FileReader(hmms));
- moveToFile(index, br);
+ moveLocationBy(index, br);
String line = br.readLine();
PrintWriter writer = new PrintWriter(
- currentFolder + "/" + name + ".hmm");
+ new FileOutputStream(new File(location), true));
while (!"//".equals(line))
{
writer.println(line);
}
+ /**
+ * Clears all raw, binned and current position data in the current directory.
+ *
+ * @throws FileNotFoundException
+ */
public void clear() throws FileNotFoundException
{
PrintWriter pos = new PrintWriter(
raw.close();
}
+ public void sortIntoClans(String directory) throws IOException
+ {
+ BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN));
+ BufferedReader familyReader = new BufferedReader(
+ new FileReader(families));
+ BufferedReader hmmReader = new BufferedReader(new FileReader(hmms));
+ int families = 0;
+ // moveLocationBy(7000, familyReader);
+ // moveLocationBy(7000, clanFinder);
+ // moveLocationBy(7000, hmmReader);
+ HashMap<String, Integer> clanIndexes = new HashMap<>();
+ ArrayList<Integer> familyCounts = new ArrayList<>();
+ int filePos = 0;
+ int clanCount = 0;
+ String line;
+ line = clanFinder.readLine();
+
+ while (!"".equals(line) && !" ".equals(line) && line != null)
+ {
+ String clanName;
+ boolean inClan = false;
+ while (!(line.indexOf("//") > -1))
+ {
+
+ if (line.indexOf("#=GF CL") > -1)
+ {
+ families++;
+ System.out.println(families);
+ inClan = true;
+ Scanner scanner = new Scanner(line);
+ scanner.next();
+ scanner.next();
+ clanName = scanner.next();
+ scanner.close();
+
+ if (!clanIndexes.containsKey(clanName))
+ {
+ clanIndexes.put(clanName, clanCount);
+ clanCount++;
+ familyCounts.add(0);
+ }
+
+
+ Integer clanI = clanIndexes.get(clanName);
+ String clanPath = directory + "/Clan" + clanI.toString();
+ createFolders(clanPath);
+
+ int index = clanIndexes.get(clanName);
+ exportFile(familyReader,
+ clanPath + "/Families/Fam" + familyCounts.get(index)
+ + ".sto",
+ false);
+ exportFile(hmmReader,
+ clanPath + "/HMMs/HMM" + familyCounts.get(index) + ".hmm",
+ false);
+
+ int count = familyCounts.get(index);
+ count++;
+ familyCounts.set(index, count);
+ }
+ line = clanFinder.readLine();
+
+ }
+ if (!inClan)
+ {
+ moveLocationBy(1, familyReader);
+ moveLocationBy(1, hmmReader);
+ }
+ filePos++;
+ // System.out.println(filePos + " files read.");
+ line = clanFinder.readLine();
+
+ }
+ clanFinder.close();
+
+ for (int clan = 0; clan < clanCount; clan++)
+ {
+ PrintWriter writer = new PrintWriter(
+ directory + "/Clan" + clan + "/NumberOfFamilies.txt");
+ int count = familyCounts.get(clan);
+ writer.print(count);
+ writer.close();
+ }
+
+ }
+
+ public String getFamilies()
+ {
+ return families;
+ }
+
+ public void setFamilies(String families)
+ {
+ this.families = currentFolder + families;
+ }
+
+ public String getHmms()
+ {
+ return hmms;
+ }
+
+ public void setHmms(String hmms)
+ {
+ this.hmms = currentFolder + hmms;
+ }
+
+ public void alignWithinClan(String exportLocation, String clansLocation)
+ throws IOException, InterruptedException
+ {
+ int alignmentsExported = 0;
+ for (int clan = 0; clan < 604; clan++)
+ {
+ System.out.println(clan);
+ int famCount = 0;
+ String clanPath = clansLocation + "/Clan" + clan;
+ int numberOfFamilies;
+ BufferedReader br = new BufferedReader(
+ new FileReader(clanPath + "/NumberOfFamilies.txt"));
+ String line = br.readLine();
+ numberOfFamilies = Integer.parseInt(line);
+ br.close();
+ if (numberOfFamilies == 1)
+ {
+ continue;
+ }
+ final String commandExportLocation = exportLocation + "/Clan" + clan;
+ createFolders(commandExportLocation);
+ for (int family = 0; family < numberOfFamilies; family++)
+ {
+ famCount++;
+ ArrayList<Integer> indexes = new ArrayList<>();
+ for (int i = 0; i < numberOfFamilies; i++)
+ {
+ if (i != family)
+ {
+ indexes.add(i);
+ }
+ }
+
+ int hmmIndex = getRandom(indexes);
+ String famPath = clanPath + "/Families/Fam" + family + ".sto";
+ String hmmPath = clanPath + "/HMMs/HMM" + hmmIndex + ".hmm";
+ String command = "/media/sf_Shared_Folder/hmmer/binaries/hmmalign --mapali "
+ + clanPath + "/Families/Fam" + hmmIndex + ".sto"
+ + " --trim ";
+ command += hmmPath + " ";
+ command += famPath;
+ final int familyIndex = family;
+ final Process p = Runtime.getRuntime().exec(command);
+
+ new Thread(new Runnable()
+ {
+ @Override
+ public void run()
+ {
+ BufferedReader input = new BufferedReader(
+ new InputStreamReader(p.getInputStream()));
+ String line = null;
+
+ try
+ {
+ PrintWriter writer = new PrintWriter(commandExportLocation
+ + "/Families/Fam" + familyIndex + ".sto");
+ String lastLine = "";
+ boolean dataFound = false;
+ while ((line = input.readLine()) != null)
+ {
+ if (line.contains("#=GR") && !dataFound)
+ {
+ writer.println(lastLine);
+ dataFound = true;
+ }
+ if (line.contains("#") || dataFound || " ".equals(line)
+ || "".equals(line) || "//".equals(line))
+ {
+ writer.println(line);
+ }
+ lastLine = line;
+ }
+ writer.close();
+ } catch (IOException e)
+ {
+ e.printStackTrace();
+ }
+ }
+ }).start();
+
+ p.waitFor();
+
+ BufferedReader hmmExporter = new BufferedReader(
+ new FileReader(hmmPath));
+
+ exportFile(hmmExporter,
+ commandExportLocation + "/HMMs/HMM" + family + ".hmm",
+ false);
+
+ alignmentsExported++;
+
+
+ }
+ PrintWriter writer = new PrintWriter(
+ commandExportLocation + "/NumberOfFamilies.txt");
+ writer.print(famCount);
+ writer.close();
+ }
+
+ }
+
+ public boolean atEnd(BufferedReader br) throws IOException
+ {
+ boolean end = false;
+ br.mark(80);
+ String line = br.readLine();
+ if ("".equals(line) || line == null)
+ {
+ end = true;
+ }
+ br.reset();
+ return end;
+ }
+
+ public int getRandom(ArrayList<Integer> list)
+ {
+ if (!(list.size() > 0))
+ {
+ System.out.println("Error - size = " + list.size());
+ }
+ int index = generator.nextInt(list.size());
+ int value = list.get(index);
+ list.remove(index);
+ return value;
+ }
+
+ public void createFolders(String clanPath)
+ {
+ File clanFolder = new File(clanPath);
+ if (!clanFolder.exists())
+ {
+ clanFolder.mkdir();
+ }
+
+ File famFolder = new File(clanPath + "/Families");
+ File hmmFolder = new File(clanPath + "/HMMs");
+ if (!famFolder.exists())
+ {
+ famFolder.mkdir();
+ hmmFolder.mkdir();
+ }
+ }
}
+
+
+
+