3 import jalview.datamodel.AlignmentAnnotation;
4 import jalview.datamodel.HiddenMarkovModel;
5 import jalview.datamodel.SequenceI;
6 import jalview.io.DataSourceType;
7 import jalview.io.FileParse;
8 import jalview.io.HMMFile;
9 import jalview.io.StockholmFile;
10 import jalview.schemes.ResidueProperties;
12 import java.io.BufferedReader;
14 import java.io.FileNotFoundException;
15 import java.io.FileOutputStream;
16 import java.io.FileReader;
17 import java.io.IOException;
18 import java.io.InputStreamReader;
19 import java.io.PrintWriter;
20 import java.util.ArrayList;
21 import java.util.HashMap;
22 import java.util.List;
24 import java.util.Random;
25 import java.util.Scanner;
26 import java.util.Vector;
29 * Processes probability data. The file indexes used in this program represent
30 * the index of the location of a family or hmm in their respective files,
36 public class HMMProbabilityDistributionAnalyser
// Instance state and configuration constants of the analyser.
// NOTE(review): this view of the file is missing lines, so further fields
// (for example the currentFolder that the methods refer to) may be declared
// in between the ones shown here.
// Reference annotation; readHMM maps the model onto it when it is non-null.
38 AlignmentAnnotation reference = null;
// Sequences taken from the Stockholm file most recently parsed by readStockholm.
40 Vector<SequenceI> sequences;
// Model queried for its length, node alignment columns and match emission
// probabilities while counting and processing residues.
42 HiddenMarkovModel hmm;
44 // contains the raw data produced
45 List<ArrayList<Double>> raw = new ArrayList<>();
47 // contains binned data
// keys are the values formatted to one decimal place by processData
48 Map<String, Double> binned = new HashMap<>();
50 // location of the family file
51 String families = "H:/Desktop//PFAM/Family/SeedFamilies.seed";
53 // location of the file containing the family-clan links
54 final static String FAMILIESTOCLAN = "H:/Desktop//PFAM/Family/Clanlinks.dat";
56 // location of the HMM file
57 String hmms = "H:/Desktop//PFAM/HMMs/Pfam-A.hmm";
59 // suffix for raw file
60 final static String RAW = "/Raw.csv";
62 // suffix for binned file
63 final static String BINNED = "/Binned.csv";
65 // normalisation scale
// processData adds SCALE / count to a bin for every residue it bins
66 final static double SCALE = 1;
68 // current position in file
69 int currentFilePosition = 0;
// Newline constant; not referenced by any line visible in this view.
71 final static String NL = "\n";
// Source of the random indexes drawn by getRandom.
73 Random generator = new Random();
// Copied from the keepRawData argument of run / runToEnd.
78 boolean keepRaw = false;
81 * Sets the working directory.
// Sets the working directory from the given path.
// NOTE(review): the method body is not visible in this view; presumably it
// assigns the currentFolder used by the other methods — confirm.
85 public void setFolder(String path)
91 * Moves a buffered reader forward in the file by a certain amount of entries.
92 * Each entry in the file is delimited by '//'.
95 * The index of the location in the file.
99 public void moveLocationBy(int index, BufferedReader br)
102 for (int i = 0; i < index; i++)
104 String line = br.readLine();
105 while (!"//".equals(line))
107 line = br.readLine();
115 * Analyses a specified number of families and then saves the data. Before
116 * analysing the data, the previous saved data will be imported and after
117 * analysing this, the data is exported back into the file. The file must be
118 * in flat file format.
121 * The number of families to read before saving.
122 * @throws IOException
// Analyses further families starting from the saved file position, then
// stores the new position and exports the data.
// NOTE(review): several lines of this method (try blocks, the filesRead
// counter, the calls that read the HMM and process the data, reader/writer
// closes) are not visible in this view; the comments below cover only the
// statements that are shown.
124 public void run(int increments, boolean keepRawData) throws IOException
126 keepRaw = keepRawData;
// load the results saved earlier in the working directory
129 readPreviousData(currentFolder);
130 BufferedReader posReader = new BufferedReader(
131 new FileReader(currentFolder + "/CurrentPosition.txt"));
// the first line of CurrentPosition.txt is parsed as the saved file position
133 String line = posReader.readLine();
135 currentFilePosition = Integer.parseInt(line);
136 } catch (Exception e)
138 System.out.println("No previous data found");
143 BufferedReader inputSTO = new BufferedReader(new FileReader(families));
144 BufferedReader inputHMM = new BufferedReader(new FileReader(hmms));
// skip the same number of '//'-delimited entries in both files
148 moveLocationBy(currentFilePosition, inputHMM);
149 moveLocationBy(currentFilePosition, inputSTO);
// loop until the requested number of families has been read
153 while (filesRead < increments)
156 readStockholm(inputSTO);
160 int count = countValidResidues();
164 currentFilePosition++;
165 System.out.println(i);
// write the updated position back for the next run
169 PrintWriter p = new PrintWriter(
170 new File(currentFolder + "/CurrentPosition.txt"));
171 p.print(currentFilePosition);
173 exportData(currentFolder);
180 * Analyses all families and then saves the data. Before analysing the data,
181 * the previous saved data will be imported and after analysing this, the data
182 * is exported back into the file. The file must be in flat file format.
185 * The number of families to read before saving.
186 * @throws IOException
// Analyses every family available and exports the data to the working
// directory; unlike run, it takes no count of families to read.
// NOTE(review): the conditions choosing between the per-clan branch and the
// single-family branch, the definition of `files` and `filesRead`, and the
// loop around atEnd are not visible in this view — presumably the branches
// are selected by forClans; confirm against the full source.
188 public void runToEnd(boolean keepRawData, boolean forClans)
191 keepRaw = keepRawData;
192 BufferedReader inputSTO = null;
193 BufferedReader inputHMM = null;
201 for (int clan = 0; clan < files; clan++)
203 String clanPath = "";
204 int numberOfFamilies = 0;
// per-clan layout: <currentFolder>/Clan<n>/NumberOfFamilies.txt holds the
// family count as its first line
207 clanPath = currentFolder + "/Clan" + clan;
208 BufferedReader famCountReader = new BufferedReader(
209 new FileReader(clanPath + "/NumberOfFamilies.txt"));
210 numberOfFamilies = Integer.parseInt(famCountReader.readLine());
214 numberOfFamilies = 1;
217 for (int fam = 0; fam < numberOfFamilies; fam++)
// note: this overwrites the families / hmms fields of the instance
221 families = clanPath + "/Families/Fam" + fam + ".sto";
222 hmms = clanPath + "/HMMs/HMM" + fam + ".hmm";
225 inputSTO = new BufferedReader(new FileReader(families));
226 inputHMM = new BufferedReader(new FileReader(hmms));
// atEnd reports whether the next line of the Stockholm input is empty or absent
230 boolean endReached = atEnd(inputSTO);
233 readStockholm(inputSTO);
236 int count = countValidResidues();
239 System.out.println(filesRead);
240 endReached = atEnd(inputSTO);
244 exportData(currentFolder);
250 * Reads the previous data from both files
253 * @throws IOException
// Reads the previously saved data found under the given source location.
// NOTE(review): the method body is not visible in this view; presumably it
// delegates to readBinned and readRaw — confirm.
255 public void readPreviousData(String source) throws IOException
265 * Reads the previous data from the binned file.
268 * @throws IOException
270 public void readBinned(String source) throws IOException
272 BufferedReader input = new BufferedReader(
273 new FileReader(source + BINNED));
274 String line = input.readLine();
275 binned = new HashMap<>();
276 while (!("".equals(line) || line == null))
278 Scanner scanner = new Scanner(line);
279 scanner.useDelimiter(",");
280 String key = scanner.next();
281 String value = scanner.next();
282 binned.put(key, Double.valueOf(value));
284 line = input.readLine();
291 * Reads the previous data from the raw file.
294 * @throws IOException
// Reads the previously saved raw data from <source>/Raw.csv into the raw list.
// NOTE(review): the lines following the first readLine() and the
// declaration/increment of the column counter `i` and of `value` are not
// visible in this view; presumably `i` advances once per token — confirm.
296 public void readRaw(String source) throws IOException
298 BufferedReader input = new BufferedReader(new FileReader(source + RAW));
299 String line = input.readLine();
// the first line is the heading row: one (initially empty) column list is
// created for every comma-separated token it contains
305 Scanner numberScanner = new Scanner(line);
306 numberScanner.useDelimiter(",");
307 raw = new ArrayList<>();
308 while (numberScanner.hasNext())
310 numberScanner.next();
311 raw.add(new ArrayList<Double>());
313 numberScanner.close();
// remaining lines are data rows, read until an empty line or end of file
315 line = input.readLine();
316 while (!("".equals(line) || line == null))
318 Scanner scanner = new Scanner(line);
319 scanner.useDelimiter(",");
322 while (scanner.hasNext())
325 value = scanner.next();
// tokens other than the "EMPTY" placeholder are parsed as doubles
326 if (!value.equals("EMPTY"))
328 raw.get(i).add(Double.parseDouble(value));
// a null entry is stored in column i (presumably the "EMPTY" case — the
// else line itself is not visible)
332 raw.get(i).add(null);
338 line = input.readLine();
345 * Counts the number of valid residues in the sequence.
// Visits every sequence at every HMM node (nodes 1..hmm.getLength()) and
// tests whether the residue in the node's alignment column has an entry in
// ResidueProperties.aminoBackgroundFrequencies.
// NOTE(review): the counter declaration, its increment and the return
// statement are not visible in this view; presumably the method returns the
// number of residues passing the test — confirm.
349 public int countValidResidues()
353 for (int width = 0; width < sequences.size(); width++)
355 for (int length = 1; length < hmm.getLength() + 1; length++)
// alignment column that HMM node `length` is mapped to
359 alignPos = hmm.getNodeAlignmentColumn(length);
361 symbol = sequences.get(width).getCharAt(alignPos);
362 if (ResidueProperties.aminoBackgroundFrequencies
363 .containsKey(symbol))
374 * Processes data, and stores it in both a raw and binned format.
// For every residue with a known background frequency, computes
// ln(match emission probability / background frequency), appends it to the
// raw column for this family and adds SCALE / count to the matching bin.
// NOTE(review): local declarations, any guard on keepRaw, the body of the
// "-0.0" check and the else keyword are not visible in this view.
378 public void processData(int count)
// a new raw column is appended and rawPos is set to its index
383 raw.add(new ArrayList<Double>());
384 rawPos = raw.size() - 1;
387 for (int width = 0; width < sequences.size(); width++)
389 for (int length = 1; length < hmm.getLength() + 1; length++)
393 alignPos = hmm.getNodeAlignmentColumn(length);
395 symbol = sequences.get(width).getCharAt(alignPos);
396 if (ResidueProperties.aminoBackgroundFrequencies
397 .containsKey(symbol))
402 prob = hmm.getMatchEmissionProbability(alignPos, symbol);
403 bfreq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
// natural logarithm of the probability ratio
404 llr = Math.log(prob / bfreq);
407 raw.get(rawPos).add(llr);
// the bin key is the value formatted to one decimal place
// NOTE(review): String.format uses the default locale, so the decimal
// separator in the key may not be '.' on every machine — confirm intended
411 output = String.format("%.1f", llr);
412 if ("-0.0".equals(output))
// an existing bin is incremented by SCALE / count
416 if (binned.containsKey(output))
418 double prev = binned.get(output);
419 prev += (SCALE / count);
420 binned.put(output, prev);
// the bin is set to SCALE / count (presumably when it does not exist yet)
425 binned.put(output, SCALE / count);
434 * Reads in the sequence data from a Stockholm file.
437 * @throws IOException
// Parses a Stockholm entry from the reader, looks through its annotations
// for one whose label contains "Reference", and stores the parsed sequences
// in the sequences field.
// NOTE(review): the body of the label check is not visible in this view;
// presumably it assigns the matching annotation to `reference` — confirm.
439 public void readStockholm(BufferedReader inputSTO) throws IOException
441 FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE);
442 StockholmFile file = new StockholmFile(parserSTO);
443 Vector<AlignmentAnnotation> annots = file.getAnnotations();
445 for (AlignmentAnnotation annot : annots)
447 if (annot.label.contains("Reference"))
452 sequences = file.getSeqs();
456 * Reads in the HMM data from a HMMer file.
459 * @throws IOException
// Parses an HMMER entry from the reader and, when a reference annotation is
// set, maps the model onto it.
// NOTE(review): the lines between constructing the HMMFile and the null
// check are not visible in this view; presumably they assign the parsed
// model to the hmm field — confirm.
461 public void readHMM(BufferedReader inputHMM) throws IOException
463 FileParse parserHMM = new FileParse(inputHMM, "", DataSourceType.FILE);
464 HMMFile file = new HMMFile(parserHMM);
468 if (reference != null)
470 hmm.mapToReferenceAnnotation(reference);
476 * Exports both the binned and raw data into separate files.
479 * @throws FileNotFoundException
// Writes the binned map to <location>/Binned.csv as "key,value" lines and
// the raw columns to <location>/Raw.csv as a comma-separated table.
// NOTE(review): the loop over rows (including the declaration of `row` and
// the use of rowIsEmpty), any guard on keepRaw and the writer close calls
// are not visible in this view.
481 public void exportData(String location) throws FileNotFoundException
483 PrintWriter writerBin = new PrintWriter(new File(location + BINNED));
484 for (Map.Entry<String, Double> entry : binned.entrySet())
486 writerBin.println(entry.getKey() + "," + entry.getValue());
492 PrintWriter writerRaw = new PrintWriter(new File(location + RAW));
// heading row: "Fam 1,Fam 2,..." with one heading per raw column
494 StringBuilder identifier = new StringBuilder();
496 for (int i = 1; i < raw.size() + 1; i++)
498 identifier.append("Fam " + i + ",");
501 writerRaw.println(identifier);
503 boolean rowIsEmpty = false;
// one output line per row; columns shorter than the current row are padded
// with the "EMPTY" placeholder that readRaw recognises
508 StringBuilder string = new StringBuilder();
509 for (int column = 0; column < raw.size(); column++)
511 if (raw.get(column).size() <= row)
513 string.append("EMPTY,");
517 string.append(raw.get(column).get(row) + ",");
522 writerRaw.println(string);
531 * Prints the specified family on the console.
534 * @throws IOException
536 public void printFam(int index) throws IOException
538 BufferedReader br = new BufferedReader(new FileReader(families));
540 moveLocationBy(index, br);
542 String line = br.readLine();
544 while (!"//".equals(line))
546 System.out.println(line);
547 line = br.readLine();
549 System.out.println(line);
555 * Prints the specified HMM on the console.
558 * @throws IOException
560 public void printHMM(int index) throws IOException
562 BufferedReader br = new BufferedReader(new FileReader(hmms));
564 moveLocationBy(index, br);
566 String line = br.readLine();
568 while (!"//".equals(line))
570 System.out.println(line);
571 line = br.readLine();
573 System.out.println(line);
579 * Prints the specified family to a .sto file.
582 * @throws IOException
584 public void exportFam(int index, String location) throws IOException
586 BufferedReader br = new BufferedReader(new FileReader(families));
588 moveLocationBy(index, br);
590 String line = br.readLine();
591 PrintWriter writer = new PrintWriter(
592 new FileOutputStream(new File(location), true));
593 while (!"//".equals(line))
595 writer.println(line);
596 line = br.readLine();
598 writer.println(line);
604 public void exportFile(BufferedReader br, String location, boolean append)
607 String line = br.readLine();
608 PrintWriter writer = new PrintWriter(
609 new FileOutputStream(location, append));
610 while (!"//".equals(line))
612 writer.println(line);
613 line = br.readLine();
615 writer.println(line);
621 public String getHMMName(int index) throws IOException
625 BufferedReader nameFinder = new BufferedReader(new FileReader(hmms));
627 moveLocationBy(index, nameFinder);
629 nameFinder.readLine();
631 Scanner scanner = new Scanner(nameFinder.readLine());
632 name = scanner.next();
633 name = scanner.next();
638 public String getFamilyName(int index) throws IOException
642 BufferedReader nameFinder = new BufferedReader(
643 new FileReader(families));
645 moveLocationBy(index, nameFinder);
647 nameFinder.readLine();
649 Scanner scanner = new Scanner(nameFinder.readLine());
650 name = scanner.next();
651 name = scanner.next();
652 name = scanner.next();
658 * Prints the specified family to a .hmm file.
661 * @throws IOException
663 public void exportHMM(int index, String location) throws IOException
667 BufferedReader br = new BufferedReader(new FileReader(hmms));
669 moveLocationBy(index, br);
671 String line = br.readLine();
673 PrintWriter writer = new PrintWriter(
674 new FileOutputStream(new File(location), true));
675 while (!"//".equals(line))
677 writer.println(line);
678 line = br.readLine();
680 writer.println(line);
687 * Clears all raw, binned and current position data in the current directory.
689 * @throws FileNotFoundException
// Opens writers on the position, raw and binned files of the working
// directory; opening a PrintWriter by file name truncates an existing file.
// NOTE(review): the statements between and after these declarations are not
// visible in this view (presumably writes/closes) — confirm the writers are
// closed. The locals `raw` and `bin` are writers; `raw` shadows the field of
// the same name inside this method.
691 public void clear() throws FileNotFoundException
693 PrintWriter pos = new PrintWriter(
694 currentFolder + "/CurrentPosition.txt");
697 PrintWriter raw = new PrintWriter(currentFolder + RAW);
699 PrintWriter bin = new PrintWriter(currentFolder + BINNED);
// Walks the clan-link file entry by entry in step with the family and HMM
// files; entries carrying a "#=GF CL" line are exported into
// <directory>/Clan<n>/Families and <directory>/Clan<n>/HMMs, and the number
// of families per clan is written to NumberOfFamilies.txt at the end.
// NOTE(review): declarations of line, filePos, clanCount and clanName, the
// use of inClan, the increments and several braces/else branches are not
// visible in this view; comments below describe only the visible statements.
706 public void sortIntoClans(String directory) throws IOException
708 BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN));
709 BufferedReader familyReader = new BufferedReader(
710 new FileReader(families));
711 BufferedReader hmmReader = new BufferedReader(new FileReader(hmms));
712 // moveLocationBy(7000, familyReader);
713 // moveLocationBy(7000, clanFinder);
714 // moveLocationBy(7000, hmmReader);
// clan name -> clan index, and per-clan-index count of exported families
715 HashMap<String, Integer> clanIndexes = new HashMap<>();
716 ArrayList<Integer> familyCounts = new ArrayList<>();
720 line = clanFinder.readLine();
// outer loop: stops at an empty line, a single-space line or end of file
722 while (!"".equals(line) && !" ".equals(line) && line != null)
// diagnostic output for lines mentioning "HATP" or "CL0025"
724 if (line.contains("HATP") || line.contains("CL0025"))
726 System.out.println(filePos);
729 boolean inClan = false;
// inner loop: scans the lines of one entry until a line containing "//"
// NOTE(review): line is dereferenced here without a null check, so a file
// ending inside an entry would throw a NullPointerException — confirm
730 while (!(line.indexOf("//") > -1))
733 if (line.indexOf("#=GF CL") > -1)
736 Scanner scanner = new Scanner(line);
739 clanName = scanner.next();
// first time this clan name is seen it is registered under clanCount
742 if (!clanIndexes.containsKey(clanName))
744 clanIndexes.put(clanName, clanCount);
750 Integer clanI = clanIndexes.get(clanName);
751 String clanPath = directory + "/Clan" + clanI.toString();
752 createFolders(clanPath);
// the current per-clan family count is used as the file number
754 int index = clanIndexes.get(clanName);
755 exportFile(familyReader,
756 clanPath + "/Families/Fam" + familyCounts.get(index)
759 exportFile(hmmReader,
760 clanPath + "/HMMs/HMM" + familyCounts.get(index) + ".hmm",
763 int count = familyCounts.get(index);
765 familyCounts.set(index, count);
767 line = clanFinder.readLine();
// skip one entry in the family and HMM files (presumably when the entry was
// not exported, to keep the three files in step — confirm)
772 moveLocationBy(1, familyReader);
773 moveLocationBy(1, hmmReader);
776 // System.out.println(filePos + " files read.");
777 line = clanFinder.readLine();
// finally record each clan's family count in its folder
782 for (int clan = 0; clan < clanCount; clan++)
784 PrintWriter writer = new PrintWriter(
785 directory + "/Clan" + clan + "/NumberOfFamilies.txt");
786 int count = familyCounts.get(clan);
// Accessor for the family file location.
// NOTE(review): the method body is not visible in this view; presumably it
// returns the families field — confirm.
793 public String getFamilies()
798 public void setFamilies(String families)
800 this.families = currentFolder + families;
// Accessor for the HMM file location.
// NOTE(review): the method body is not visible in this view; presumably it
// returns the hmms field — confirm.
803 public String getHmms()
808 public void setHmms(String hmms)
810 this.hmms = currentFolder + hmms;
// For each clan folder under clansLocation, runs the external hmmalign
// program for every family against an HMM picked through getRandom, writes
// the filtered output to <exportLocation>/Clan<n>/Families, copies the chosen
// HMM to <exportLocation>/Clan<n>/HMMs and records a family count.
// NOTE(review): many lines are not visible in this view (the body of the
// single-family check, how `indexes` is filled, the rest of the command
// string, the Runnable's method header and Thread start, updates of
// lastLine/dataFound, catch bodies, famCount, closes).
813 public void alignWithinClan(String exportLocation, String clansLocation)
814 throws IOException, InterruptedException
816 int alignmentsExported = 0;
// NOTE(review): the number of clans (604) is hard-coded
817 for (int clan = 0; clan < 604; clan++)
821 String clanPath = clansLocation + "/Clan" + clan;
822 int numberOfFamilies;
// the first line of NumberOfFamilies.txt is parsed as the family count
823 BufferedReader br = new BufferedReader(
824 new FileReader(clanPath + "/NumberOfFamilies.txt"));
825 String line = br.readLine();
826 numberOfFamilies = Integer.parseInt(line);
// clans with a single family get special handling (body not visible)
828 if (numberOfFamilies == 1)
832 final String commandExportLocation = exportLocation + "/Clan" + clan;
833 createFolders(commandExportLocation);
834 for (int family = 0; family < numberOfFamilies; family++)
837 ArrayList<Integer> indexes = new ArrayList<>();
838 for (int i = 0; i < numberOfFamilies; i++)
// index of the HMM to align this family with, drawn from `indexes`
846 int hmmIndex = getRandom(indexes);
847 String famPath = clanPath + "/Families/Fam" + family + ".sto";
848 String hmmPath = clanPath + "/HMMs/HMM" + hmmIndex + ".hmm";
// hmmalign is invoked from a hard-coded path; --mapali is given the .sto
// file that carries the same index as the chosen HMM
849 String command = "H:/Desktop//hmmer/binaries/hmmalign --mapali "
850 + clanPath + "/Families/Fam" + hmmIndex + ".sto"
852 command += hmmPath + " ";
854 final int familyIndex = family;
// NOTE(review): Runtime.exec(String) splits the command on whitespace, so
// a path containing spaces would be broken apart — consider ProcessBuilder
855 final Process p = Runtime.getRuntime().exec(command);
// the process output is consumed by a Runnable wrapped in a new Thread
857 new Thread(new Runnable()
862 BufferedReader input = new BufferedReader(
863 new InputStreamReader(p.getInputStream()));
868 PrintWriter writer = new PrintWriter(commandExportLocation
869 + "/Families/Fam" + familyIndex + ".sto");
870 String lastLine = "";
871 boolean dataFound = false;
872 while ((line = input.readLine()) != null)
// on the first "#=GR" line seen before dataFound is set, the remembered
// lastLine is written out
874 if (line.contains("#=GR") && !dataFound)
876 writer.println(lastLine);
// lines containing '#', blank lines, the '//' terminator, and every line
// once dataFound is set are copied to the output
879 if (line.contains("#") || dataFound || " ".equals(line)
880 || "".equals(line) || "//".equals(line))
882 writer.println(line);
887 } catch (IOException e)
// copy the HMM that was used next to the exported alignment
896 BufferedReader hmmExporter = new BufferedReader(
897 new FileReader(hmmPath));
899 exportFile(hmmExporter,
900 commandExportLocation + "/HMMs/HMM" + family + ".hmm",
903 alignmentsExported++;
905 System.out.println(alignmentsExported + " alignments exported");
906 System.out.println("At clan " + clan);
909 PrintWriter writer = new PrintWriter(
910 commandExportLocation + "/NumberOfFamilies.txt");
911 writer.print(famCount);
// Reads one line from the reader and tests whether it is empty or absent
// (end of input).
// NOTE(review): the lines before the read, the body of the check and the
// return are not visible in this view. runToEnd parses from the same reader
// right after calling this, so confirm that the reader position is restored
// (e.g. via mark/reset) and that the mark limit covers the longest line.
917 public boolean atEnd(BufferedReader br) throws IOException
921 String line = br.readLine();
922 if ("".equals(line) || line == null)
// Picks a random position in the list using the shared generator and reads
// the value stored there.
// NOTE(review): the lines after reading the value are not visible in this
// view (presumably the return, possibly also a removal from the list) —
// confirm. Of the lines shown, an empty list only prints the error message;
// generator.nextInt(0) below would then throw IllegalArgumentException.
930 public int getRandom(ArrayList<Integer> list)
932 if (!(list.size() > 0))
934 System.out.println("Error - size = " + list.size());
936 int index = generator.nextInt(list.size());
937 int value = list.get(index);
942 public void createFolders(String clanPath)
944 File clanFolder = new File(clanPath);
945 if (!clanFolder.exists())
950 File famFolder = new File(clanPath + "/Families");
951 File hmmFolder = new File(clanPath + "/HMMs");
952 if (!famFolder.exists())