3 import jalview.datamodel.HiddenMarkovModel;
4 import jalview.datamodel.SequenceI;
5 import jalview.io.DataSourceType;
6 import jalview.io.FileParse;
7 import jalview.io.HMMFile;
8 import jalview.io.StockholmFile;
9 import jalview.schemes.ResidueProperties;
11 import java.io.BufferedReader;
13 import java.io.FileNotFoundException;
14 import java.io.FileOutputStream;
15 import java.io.FileReader;
16 import java.io.IOException;
17 import java.io.PrintWriter;
18 import java.util.ArrayList;
19 import java.util.HashMap;
20 import java.util.List;
22 import java.util.Scanner;
23 import java.util.Vector;
26 * Processes probability data. The file indexes used in this program represent
27 * the index of the location of a family or hmm in their respective files,
33 public class HMMProbabilityDistributionAnalyser
// sequences of the family currently being analysed (assigned in readStockholm)
36 Vector<SequenceI> sequences;
// HMM queried for its length, node alignment columns and match emission
// probabilities when counting residues and processing data
38 HiddenMarkovModel hmm;
40 // contains the raw data produced
// each inner list is one family column; exportData labels the columns
// "Fam 1", "Fam 2", ...
41 List<ArrayList<Double>> raw = new ArrayList<>();
43 // contains binned data
// key: log-likelihood ratio formatted to one decimal place (see processData);
// value: accumulated SCALE / count contributions for that bin
44 Map<String, Double> binned = new HashMap<>();
46 // location of the family file
47 final static String FAMILIES = "H:/Desktop/PFAM/Family/SeedFamilies.seed";
49 // location of the file containing the family-clan links
50 final static String FAMILIESTOCLAN = "H:/Desktop/PFAM/Family/Clanlinks.dat";
52 // location of the HMM file
53 final static String HMMS = "H:/Desktop/PFAM/HMMs/Pfam-A.hmm";
55 // suffix for raw file
56 final static String RAW = "/Raw.csv";
58 // suffix for binned file
59 final static String BINNED = "/Binned.csv";
61 // normalisation scale
// every counted residue adds SCALE / count to its bin (see processData)
62 final static double SCALE = 100000;
64 // current position in file
// entry index read from CurrentPosition.txt, advanced and written back by run
65 int currentFilePosition = 0;
// newline constant; no use of it is visible in this extract
67 final static String NL = "\n";
// set from the keepRawData argument of run
72 boolean keepRaw = false;
75 * Sets the working directory.
// NOTE(review): only the signature is visible in this extract; presumably the
// body stores path as the working folder (currentFolder) used by run and
// clear — confirm against the full file.
79 public void setFolder(String path)
85 * Moves a buffered reader forward in the file by a certain amount of entries.
86 * Each entry in the file is delimited by '//'.
89 * The index of the location in the file.
93 public void moveLocationBy(int index, BufferedReader br)
96 for (int i = 0; i < index; i++)
98 String line = br.readLine();
99 while (!"//".equals(line))
101 line = br.readLine();
108 * Analyses a specified number of families and then saves the data. Before
109 * analysing, the previously saved data is imported; after analysing, the
110 * combined data is exported back into the files.
113 * The number of families to read before saving.
114 * @throws IOException
116 public void run(int increments, boolean keepRawData) throws IOException
// remember the caller's choice about raw data
118 keepRaw = keepRawData;
// load the data saved by earlier runs from the working folder
119 readPreviousData(currentFolder);
// the first line of CurrentPosition.txt holds the entry index to resume from
121 BufferedReader posReader = new BufferedReader(
122 new FileReader(currentFolder + "/CurrentPosition.txt"));
123 String line = posReader.readLine();
// open the family file and the HMM file
126 BufferedReader inputSTO = new BufferedReader(new FileReader(FAMILIES));
127 BufferedReader inputHMM = new BufferedReader(new FileReader(HMMS));
// throws NumberFormatException if the position file is empty or does not
// start with an integer
129 currentFilePosition = Integer.parseInt(line);
// skip the same number of '//'-terminated entries in both files so that the
// HMM reader and the family reader point at the same index
131 moveLocationBy(currentFilePosition, inputHMM);
132 moveLocationBy(currentFilePosition, inputSTO);
// NOTE(review): the declaration and update of filesRead are not visible in
// this extract
136 while (filesRead < increments)
// hand the already-positioned family reader to a FileParse and load the
// sequences of the next family
139 FileParse parserSTO = new FileParse(inputSTO, "",
140 DataSourceType.FILE);
141 readStockholm(parserSTO);
143 FileParse parserHMM = new FileParse(inputHMM, "",
144 DataSourceType.FILE);
// NOTE(review): no visible statement consumes parserHMM or uses count;
// presumably readHMM(parserHMM) and processData(count) are called in the
// missing lines — confirm
147 int count = countValidResidues();
151 currentFilePosition++;
// NOTE(review): i is not declared in any visible line
152 System.out.println(i);
// persist the new position so that the next run resumes after the families
// handled here
156 PrintWriter p = new PrintWriter(
157 new File(currentFolder + "/CurrentPosition.txt"));
158 p.print(currentFilePosition);
// write the accumulated binned and raw data back to the working folder
// NOTE(review): no close() of posReader, inputSTO, inputHMM or p is visible
// in this extract — confirm that they are closed
160 exportData(currentFolder);
167 * Reads the previous data from both files
170 * @throws IOException
// NOTE(review): only the signature is visible in this extract; going by the
// Javadoc fragment above ("both files") the body presumably delegates to
// readBinned(source) and readRaw(source) — confirm against the full file.
172 public void readPreviousData(String source) throws IOException
182 * Reads the previous data from the binned file.
185 * @throws IOException
187 public void readBinned(String source) throws IOException
189 BufferedReader input = new BufferedReader(
190 new FileReader(source + BINNED));
191 String line = input.readLine();
192 binned = new HashMap<>();
193 while (!("".equals(line) || line == null))
195 Scanner scanner = new Scanner(line);
196 scanner.useDelimiter(",");
197 binned.put(scanner.next(), scanner.nextDouble());
199 line = input.readLine();
206 * Reads the previous data from the raw file.
209 * @throws IOException
211 public void readRaw(String source) throws IOException
// the raw file is source + RAW; its first line is the header row that
// exportData writes ("Fam 1,Fam 2,...")
213 BufferedReader input = new BufferedReader(new FileReader(source + RAW));
214 String line = input.readLine();
// NOTE(review): several lines between the header read and this point are
// missing from the extract; unless they guard against it, new Scanner(line)
// throws NullPointerException when the file is empty
// count the header cells: one empty column list is created per cell
220 Scanner numberScanner = new Scanner(line);
221 numberScanner.useDelimiter(",");
222 raw = new ArrayList<>();
223 while (numberScanner.hasNext())
225 numberScanner.next();
226 raw.add(new ArrayList<Double>());
228 numberScanner.close();
// the remaining lines are data rows of comma-separated cells; reading stops
// at the first empty line or at the end of the file
230 line = input.readLine();
231 while (!("".equals(line) || line == null))
233 Scanner scanner = new Scanner(line);
234 scanner.useDelimiter(",");
237 while (scanner.hasNext())
240 value = scanner.next();
// "EMPTY" is the padding marker exportData writes for columns that have no
// value in this row
241 if (!value.equals("EMPTY"))
// NOTE(review): i is presumably the column index of the current cell; its
// declaration and increment are not visible in this extract
243 raw.get(i).add(Double.parseDouble(value));
// NOTE(review): no close() of scanner or input is visible in this extract
249 line = input.readLine();
256 * Counts the number of valid residues in the sequence.
260 public int countValidResidues()
264 for (int width = 0; width < sequences.size(); width++)
266 for (int length = 1; length < hmm.getLength(); length++)
270 alignPos = hmm.getNodeAlignmentColumn(length);
272 symbol = sequences.get(width).getCharAt(alignPos);
273 if (ResidueProperties.aminoBackgroundFrequencies
274 .containsKey(symbol))
285 * Processes data, and stores it in both a raw and binned format.
// NOTE(review): the declarations of rawPos, alignPos, symbol, prob, bfreq,
// llr and output, and any guards around the raw-data statements, are not
// visible in this extract.
289 public void processData(int count)
// start a new raw column and remember its index
294 raw.add(new ArrayList<Double>());
295 rawPos = raw.size() - 1;
// visit every sequence and every HMM node from 1 up to (not including)
// hmm.getLength()
298 for (int width = 0; width < sequences.size(); width++)
300 for (int length = 1; length < hmm.getLength(); length++)
// alignment column of the node, then this sequence's residue in that column
304 alignPos = hmm.getNodeAlignmentColumn(length);
306 symbol = sequences.get(width).getCharAt(alignPos);
// symbols without a background frequency are skipped
307 if (ResidueProperties.aminoBackgroundFrequencies
308 .containsKey(symbol))
// natural log of the match emission probability over the background frequency
// NOTE(review): the emission lookup is given the alignment column, not the
// node index — confirm that this is what getMatchEmissionProbability expects
313 prob = hmm.getMatchEmissionProbability(alignPos, symbol);
314 bfreq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
315 llr = Math.log(prob / bfreq);
318 raw.get(rawPos).add(llr);
// the bin label is the ratio rounded to one decimal place
// NOTE(review): String.format without an explicit Locale uses the default
// locale; a decimal comma would clash with the comma-separated binned file
322 output = String.format("%.1f", llr);
// NOTE(review): the body of this branch is not visible; presumably negative
// zero is folded into the "0.0" bin — confirm
323 if ("-0.0".equals(output))
// add SCALE / count to the bin, creating the bin on first use
327 if (binned.containsKey(output))
329 double prev = binned.get(output);
330 prev += (SCALE / count);
331 binned.put(output, prev);
336 binned.put(output, SCALE / count);
345 * Reads in the sequence data from a Stockholm file.
348 * @throws IOException
350 public void readStockholm(FileParse source) throws IOException
352 StockholmFile file = new StockholmFile(source);
353 sequences = file.getSeqs();
357 * Reads in the HMM data from a HMMer file.
360 * @throws IOException
362 public void readHMM(FileParse source) throws IOException
// build an HMMFile on the supplied parse source
// NOTE(review): the rest of the body is not visible in this extract;
// presumably it obtains the model from the file and assigns the hmm field —
// confirm against the full file
365 HMMFile file = new HMMFile(source);
372 * Exports both the binned and raw data into separate files.
375 * @throws FileNotFoundException
377 public void exportData(String location) throws FileNotFoundException
// binned file (location + BINNED): one "label,value" line per bin
379 PrintWriter writerBin = new PrintWriter(new File(location + BINNED));
380 for (Map.Entry<String, Double> entry : binned.entrySet())
382 writerBin.println(entry.getKey() + "," + entry.getValue());
// raw file (location + RAW)
388 PrintWriter writerRaw = new PrintWriter(new File(location + RAW));
// header row "Fam 1,Fam 2,..." with one cell per raw column; readRaw counts
// these cells to recreate the columns
390 StringBuilder identifier = new StringBuilder();
392 for (int i = 1; i < raw.size() + 1; i++)
394 identifier.append("Fam " + i + ",");
397 writerRaw.println(identifier);
// NOTE(review): the row loop header and the statements that update
// rowIsEmpty are not visible in this extract; presumably rows are written
// until every column is exhausted — confirm
399 boolean rowIsEmpty = false;
// build one output row with one cell per column
404 StringBuilder string = new StringBuilder();
405 for (int column = 0; column < raw.size(); column++)
// columns that have no value at this row index are padded with the EMPTY
// marker, which readRaw skips
407 if (raw.get(column).size() <= row)
409 string.append("EMPTY,");
413 string.append(raw.get(column).get(row) + ",");
// NOTE(review): no close() of writerBin or writerRaw is visible in this
// extract; without it buffered output may not reach the files
418 writerRaw.println(string);
427 * Prints the specified family on the console.
430 * @throws IOException
432 public void printFam(int index) throws IOException
434 BufferedReader br = new BufferedReader(new FileReader(FAMILIES));
436 moveLocationBy(index, br);
438 String line = br.readLine();
440 while (!"//".equals(line))
442 System.out.println(line);
443 line = br.readLine();
445 System.out.println(line);
451 * Prints the specified HMM on the console.
454 * @throws IOException
456 public void printHMM(int index) throws IOException
458 BufferedReader br = new BufferedReader(new FileReader(HMMS));
460 moveLocationBy(index, br);
462 String line = br.readLine();
464 while (!"//".equals(line))
466 System.out.println(line);
467 line = br.readLine();
469 System.out.println(line);
475 * Prints the specified family to a .sto file.
478 * @throws IOException
480 public void exportFam(int index, String location) throws IOException
482 BufferedReader br = new BufferedReader(new FileReader(FAMILIES));
484 moveLocationBy(index, br);
486 String line = br.readLine();
487 PrintWriter writer = new PrintWriter(
488 new FileOutputStream(new File(location), true));
489 while (!"//".equals(line))
491 writer.println(line);
492 line = br.readLine();
494 writer.println(line);
500 public void exportFile(BufferedReader br, String location)
503 String line = br.readLine();
504 PrintWriter writer = new PrintWriter(
505 new FileOutputStream(new File(location), true));
506 while (!"//".equals(line))
508 writer.println(line);
509 line = br.readLine();
511 writer.println(line);
517 public String getHMMName(int index) throws IOException
521 BufferedReader nameFinder = new BufferedReader(new FileReader(HMMS));
523 moveLocationBy(index, nameFinder);
525 nameFinder.readLine();
527 Scanner scanner = new Scanner(nameFinder.readLine());
528 name = scanner.next();
529 name = scanner.next();
534 public String getFamilyName(int index) throws IOException
538 BufferedReader nameFinder = new BufferedReader(
539 new FileReader(FAMILIES));
541 moveLocationBy(index, nameFinder);
543 nameFinder.readLine();
545 Scanner scanner = new Scanner(nameFinder.readLine());
546 name = scanner.next();
547 name = scanner.next();
548 name = scanner.next();
554 * Prints the specified family to a .hmm file in the current directory.
557 * @throws IOException
559 public void exportHMM(int index, String location) throws IOException
563 BufferedReader br = new BufferedReader(new FileReader(HMMS));
565 moveLocationBy(index, br);
567 String line = br.readLine();
569 PrintWriter writer = new PrintWriter(
570 new FileOutputStream(new File(location), true));
571 while (!"//".equals(line))
573 writer.println(line);
574 line = br.readLine();
576 writer.println(line);
583 * Clears all raw, binned and current position data in the current directory.
585 * @throws FileNotFoundException
587 public void clear() throws FileNotFoundException
// opening a PrintWriter on an existing file truncates it to zero size
// NOTE(review): what (if anything) is written to the position file is not
// visible in this extract; run() parses its first line as an integer, so an
// empty file would make the next run fail — confirm
589 PrintWriter pos = new PrintWriter(
590 currentFolder + "/CurrentPosition.txt");
// the locals raw and bin are writers; raw shadows the field of the same name
// inside this method
593 PrintWriter raw = new PrintWriter(currentFolder + RAW);
// NOTE(review): no close() of pos, raw or bin is visible in this extract
595 PrintWriter bin = new PrintWriter(currentFolder + BINNED);
602 public void sortIntoClans(String directory) throws IOException
604 BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN));
605 BufferedReader familyReader = new BufferedReader(
606 new FileReader(FAMILIES));
607 BufferedReader hmmReader = new BufferedReader(new FileReader(HMMS));
608 HashMap<String, Integer> clanIndexes = new HashMap<>();
612 line = clanFinder.readLine();
614 while (!"".equals(line) && !" ".equals(line) && line != null)
617 boolean inClan = false;
618 while (!(line.indexOf("//") > -1))
621 if (line.indexOf("#=GF CL") > -1)
624 Scanner scanner = new Scanner(line);
627 clanName = scanner.next();
630 if (!clanIndexes.containsKey(clanName))
632 clanIndexes.put(clanName, clanCount);
636 Integer clanI = clanIndexes.get(clanName);
637 String clanPath = directory + "/Clan" + clanI.toString();
638 File clanFolder = new File(clanPath);
639 String famPath = clanPath + "/Families.sto";
640 String hmmPath = clanPath + "/HMMs.hmm";
641 if (!clanFolder.exists())
645 exportFile(familyReader, famPath);
646 exportFile(hmmReader, hmmPath);
649 line = clanFinder.readLine();
653 moveLocationBy(1, familyReader);
654 moveLocationBy(1, hmmReader);
657 System.out.println(filePos + " files read.");
658 line = clanFinder.readLine();