import jalview.datamodel.HiddenMarkovModel;
import jalview.datamodel.SequenceI;
import jalview.io.DataSourceType;
import jalview.io.FileParse;
import jalview.io.HMMFile;
import jalview.io.StockholmFile;
import jalview.schemes.ResidueProperties;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Vector;
/**
 * Processes probability data. The file indexes used in this program represent
 * the zero-based position of a family or HMM within its respective file
 * (index 0 is the first record).
 */
32 public class HMMProbabilityDistributionAnalyser
35 Vector<SequenceI> sequences;
37 HiddenMarkovModel hmm;
39 // contains the raw data produced
40 List<ArrayList<Double>> raw = new ArrayList<>();
42 // contains binned data
43 Map<String, Double> binned = new HashMap<>();
45 // location of the family file
46 final static String FAMILIES = "C:/Users/TZVanaalten/Pfam-A.full";
48 // location of the HMM file
49 final static String HMMS = "H:/Desktop/PFAM/HMMs/Pfam-A.hmm";
51 // suffix for raw file
52 final static String RAW = "/Raw.csv";
54 // suffix for binned file
55 final static String BINNED = "/Binned.csv";
57 // normalisation scale
58 final static double SCALE = 100000;
60 // current position in file
61 int currentFilePosition = 0;
63 final static String NL = "\n";
69 * Sets the working directory.
73 public void setFolder(String path)
79 * Moves a buffered reader to a specific location in the file, delimited by
83 * The index of the location in the file.
87 public void moveToFile(int index, BufferedReader br) throws IOException
89 for (int i = 0; i < index; i++)
91 String line = br.readLine();
92 while (!"//".equals(line))
101 * Analyses a specified number of families and then saves the data. Before
102 * analysing the data, the previous saved data will be imported and after
103 * analysing this data is exported back into the file.
106 * The number of families to read before saving.
107 * @throws IOException
109 public void run(int increments) throws IOException
112 readPreviousData(currentFolder);
114 BufferedReader posReader = new BufferedReader(
115 new FileReader(currentFolder + "/CurrentPosition.txt"));
116 String line = posReader.readLine();
118 currentFilePosition = Integer.parseInt(line);
120 BufferedReader inputSTO = new BufferedReader(
121 new FileReader(FAMILIES));
122 BufferedReader inputHMM = new BufferedReader(
123 new FileReader(HMMS));
125 moveToFile(currentFilePosition, inputHMM);
126 moveToFile(currentFilePosition, inputSTO);
129 while (filesRead < increments)
131 FileParse parserSTO = new FileParse(inputSTO, "",
132 DataSourceType.FILE);
133 readStockholm(parserSTO);
135 FileParse parserHMM = new FileParse(inputHMM, "",
136 DataSourceType.FILE);
139 if (hmm.getAlphabetType().equals("amino"))
141 int count = countValidResidues();
145 currentFilePosition++;
148 PrintWriter p = new PrintWriter(
149 new File(currentFolder + "/CurrentPosition"));
150 p.print(currentFilePosition);
152 exportData(currentFolder);
159 * Reads the previous data from both files
162 * @throws IOException
164 public void readPreviousData(String source) throws IOException
171 * Reads the previous data from the binned file.
174 * @throws IOException
176 public void readBinned(String source) throws IOException
178 BufferedReader input = new BufferedReader(
179 new FileReader(source + BINNED));
180 String line = input.readLine();
181 while (!("".equals(line) || line == null))
183 binned = new HashMap<>();
184 Scanner scanner = new Scanner(line);
185 scanner.useDelimiter(",");
186 binned.put(scanner.next(), scanner.nextDouble());
188 line = input.readLine();
195 * Reads the previous data from the raw file.
198 * @throws IOException
200 public void readRaw(String source) throws IOException
202 BufferedReader input = new BufferedReader(new FileReader(source + RAW));
203 String line = input.readLine();
209 Scanner numberScanner = new Scanner(line);
210 numberScanner.useDelimiter(",");
211 raw = new ArrayList<>();
212 while (numberScanner.hasNext())
214 numberScanner.next();
215 raw.add(new ArrayList<Double>());
217 numberScanner.close();
219 line = input.readLine();
220 while (!("".equals(line) || line == null))
222 Scanner scanner = new Scanner(line);
223 scanner.useDelimiter(",");
226 while (scanner.hasNext())
229 value = scanner.next();
230 if (!value.equals("EMPTY"))
232 raw.get(i).add(Double.parseDouble(value));
238 line = input.readLine();
245 * Counts the number of valid residues in the sequence.
249 public int countValidResidues()
253 for (int width = 0; width < sequences.size(); width++)
255 for (int length = 1; length < hmm.getLength(); length++)
259 alignPos = hmm.getNodeAlignmentColumn(length);
261 symbol = sequences.get(width).getCharAt(alignPos);
262 if (ResidueProperties.aminoBackgroundFrequencies
263 .containsKey(symbol))
274 * Processes data, and stores it in both a raw and binned format.
278 public void processData(int count)
281 raw.add(new ArrayList<Double>());
282 int rawPos = raw.size() - 1;
283 for (int width = 0; width < sequences.size(); width++)
285 for (int length = 1; length < hmm.getLength(); length++)
289 alignPos = hmm.getNodeAlignmentColumn(length);
291 symbol = sequences.get(width).getCharAt(alignPos);
292 if (ResidueProperties.aminoBackgroundFrequencies
293 .containsKey(symbol))
299 prob = hmm.getMatchEmissionProbability(alignPos, symbol);
300 bfreq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
301 llr = Math.log(prob / bfreq);
302 raw.get(rawPos).add(llr);
304 output = String.format("%.1f", llr);
305 if ("-0.0".equals(output))
309 if (binned.containsKey(output))
311 double prev = binned.get(output);
312 prev += (SCALE / count);
313 binned.put(output, prev);
318 binned.put(output, SCALE / count);
327 * Reads in the sequence data from a Stockholm file.
330 * @throws IOException
332 public void readStockholm(FileParse source) throws IOException
334 StockholmFile file = new StockholmFile(source);
336 sequences = file.getSeqs();
340 * Reads in the HMM data from a HMMer file.
343 * @throws IOException
345 public void readHMM(FileParse source) throws IOException
348 HMMFile file = new HMMFile(source);
355 * Exports both the binned and raw data into separate files.
358 * @throws FileNotFoundException
360 public void exportData(String location) throws FileNotFoundException
362 PrintWriter writerBin = new PrintWriter(new File(location + BINNED));
363 for (Map.Entry<String, Double> entry : binned.entrySet())
365 writerBin.println(entry.getKey() + "," + entry.getValue());
369 PrintWriter writerRaw = new PrintWriter(new File(location + RAW));
371 StringBuilder identifier = new StringBuilder();
373 for (int i = 1; i < raw.size() + 1; i++)
375 identifier.append("Fam " + i + ",");
378 writerRaw.println(identifier);
380 boolean rowIsEmpty = false;
385 StringBuilder string = new StringBuilder();
386 for (int column = 0; column < raw.size(); column++)
388 if (raw.get(column).size() <= row)
390 string.append("EMPTY,");
394 string.append(raw.get(column).get(row) + ",");
399 writerRaw.println(string);
406 * Prints the specified family on the console.
409 * @throws IOException
411 public void printFam(int index) throws IOException
413 BufferedReader br = new BufferedReader(new FileReader(FAMILIES));
415 moveToFile(index, br);
417 String line = br.readLine();
419 while (!"//".equals(line))
421 System.out.println(line);
422 line = br.readLine();
424 System.out.println(line);
430 * Prints the specified HMM on the console.
433 * @throws IOException
435 public void printHMM(int index) throws IOException
437 BufferedReader br = new BufferedReader(new FileReader(HMMS));
439 moveToFile(index, br);
441 String line = br.readLine();
443 while (!"//".equals(line))
445 System.out.println(line);
446 line = br.readLine();
448 System.out.println(line);
454 * Prints the specified family to a .sto file in the current directory.
457 * @throws IOException
459 public void printFamToFile(int index) throws IOException
463 BufferedReader nameFinder = new BufferedReader(
464 new FileReader(FAMILIES));
466 moveToFile(index, nameFinder);
468 nameFinder.readLine();
470 Scanner scanner = new Scanner(nameFinder.readLine());
473 name = scanner.next();
476 BufferedReader br = new BufferedReader(new FileReader(FAMILIES));
478 moveToFile(index, br);
480 String line = br.readLine();
481 PrintWriter writer = new PrintWriter(
482 currentFolder + "/" + name + ".sto");
483 while (!"//".equals(line))
485 writer.println(line);
486 line = br.readLine();
488 writer.println(line);
495 * Prints the specified family to a .hmm file in the current directory.
498 * @throws IOException
500 public void printHMMToFile(int index) throws IOException
505 BufferedReader nameFinder = new BufferedReader(new FileReader(HMMS));
507 moveToFile(index, nameFinder);
509 nameFinder.readLine();
511 Scanner scanner = new Scanner(nameFinder.readLine());
512 name = scanner.next();
513 name = scanner.next();
516 BufferedReader br = new BufferedReader(new FileReader(HMMS));
518 moveToFile(index, br);
520 String line = br.readLine();
522 PrintWriter writer = new PrintWriter(
523 currentFolder + "/" + name + ".hmm");
524 while (!"//".equals(line))
526 writer.println(line);
527 line = br.readLine();
529 writer.println(line);
536 * Clears all raw, binned and current position data in the current directory.
538 * @throws FileNotFoundException
540 public void clear() throws FileNotFoundException
542 PrintWriter pos = new PrintWriter(
543 currentFolder + "/CurrentPosition.txt");
546 PrintWriter raw = new PrintWriter(currentFolder + RAW);
548 PrintWriter bin = new PrintWriter(currentFolder + BINNED);