3 import jalview.datamodel.AlignmentAnnotation;
4 import jalview.datamodel.HiddenMarkovModel;
5 import jalview.datamodel.SequenceI;
6 import jalview.io.DataSourceType;
7 import jalview.io.FileParse;
8 import jalview.io.HMMFile;
9 import jalview.io.StockholmFile;
10 import jalview.schemes.ResidueProperties;
12 import java.io.BufferedReader;
14 import java.io.FileNotFoundException;
15 import java.io.FileOutputStream;
16 import java.io.FileReader;
17 import java.io.IOException;
18 import java.io.InputStreamReader;
19 import java.io.PrintWriter;
20 import java.util.ArrayList;
21 import java.util.HashMap;
22 import java.util.List;
24 import java.util.Random;
25 import java.util.Scanner;
26 import java.util.Vector;
29 * Processes probability data. The file indexes used in this program represent
30 * the index of the location of a family or hmm in their respective files,
36 public class HMMProbabilityDistributionAnalyser
// NOTE(review): no visible line assigns `reference`; readStockholm scans for
// annotations whose label contains "Reference", so it is presumably set
// there - confirm.
38 AlignmentAnnotation reference = null;
// Sequences of the family under analysis; assigned from
// StockholmFile.getSeqs() in readStockholm.
40 Vector<SequenceI> sequences;
// Model queried by countValidResidues/processData via getLength,
// getNodeAlignmentColumn and getMatchEmissionProbability. Its assignment is
// not visible in this listing.
42 HiddenMarkovModel hmm;
44 // contains the raw data produced
// processData appends a new inner list and adds log-likelihood ratios to it;
// exportData writes each inner list as a "Fam i" column.
45 List<ArrayList<Double>> raw = new ArrayList<>();
47 // contains binned data
// Keys are log-likelihood ratios formatted with "%.1f"; values accumulate
// SCALE / count per observation (see processData).
48 Map<String, Double> binned = new HashMap<>();
50 // location of the family file
// Overwritten by runToEnd and setFamilies.
51 String families = "/media/sf_Shared_Folder/PFAM/Family/SeedFamilies.seed";
53 // location of the file containing the family-clan links
54 final static String FAMILIESTOCLAN = "/media/sf_Shared_Folder/PFAM/Family/Clanlinks.dat";
56 // location of the HMM file
// Overwritten by runToEnd and setHmms.
57 String hmms = "/media/sf_Shared_Folder/PFAM/HMMs/Pfam-A.hmm";
59 // suffix for raw file
60 final static String RAW = "/Raw.csv";
62 // suffix for binned file
63 final static String BINNED = "/Binned.csv";
65 // normalisation scale
// Each binned observation contributes SCALE / count to its bin.
66 final static double SCALE = 1;
68 // current position in file
// Parsed from CurrentPosition.txt in run(), used as the number of entries to
// skip in both input files, and written back after processing.
69 int currentFilePosition = 0;
// NOTE(review): no use of NL is visible in this listing.
71 final static String NL = "\n";
// Source of randomness for getRandom().
73 Random generator = new Random();
// Set from the keepRawData argument of run()/runToEnd(); the line that reads
// it is not visible in this listing.
78 boolean keepRaw = false;
81 * Sets the working directory.
// NOTE(review): the body is not visible in this listing. Presumably it stores
// `path` in currentFolder (the field that run(), clear(), setFamilies() and
// setHmms() read) - confirm.
85 public void setFolder(String path)
91 * Moves a buffered reader forward in the file by a certain amount of entries.
92 * Each entry in the file is delimited by '//'.
95 * The index of the location in the file.
// Skips `index` entries on `br`. One entry is consumed by reading lines up to
// and including the first line that is exactly "//".
99 public void moveLocationBy(int index, BufferedReader br)
102 for (int i = 0; i < index; i++)
104 String line = br.readLine();
// NOTE(review): readLine() returns null at end of stream and the visible
// condition stays true for null, so asking for more entries than the file
// holds would spin here unless a line not shown in this listing guards it.
105 while (!"//".equals(line))
107 line = br.readLine();
115 * Analyses a specified number of families and then saves the data. Before
116 * analysing the data, the previously saved data will be imported and after
117 * analysing this, the data is exported back into the file. The file must be
118 * in flat file format.
121 * The number of families to read before saving.
122 * @throws IOException
124 public void run(int increments, boolean keepRawData) throws IOException
// Record the caller's choice; the line that reads keepRaw is not visible in
// this listing.
126 keepRaw = keepRawData;
// Restore state from an earlier run: the saved data first, then the saved
// position, which is the first line of CurrentPosition.txt parsed as an int.
129 readPreviousData(currentFolder);
130 BufferedReader posReader = new BufferedReader(
131 new FileReader(currentFolder + "/CurrentPosition.txt"));
133 String line = posReader.readLine();
135 currentFilePosition = Integer.parseInt(line);
// Every exception type is caught here and reported as missing previous data.
// NOTE(review): the start of the try block is not visible, so the exact set
// of guarded statements cannot be confirmed from this listing.
136 } catch (Exception e)
138 System.out.println("No previous data found");
// Open both inputs and skip the same number of entries in each, so family N
// and HMM N are read together.
143 BufferedReader inputSTO = new BufferedReader(new FileReader(families));
144 BufferedReader inputHMM = new BufferedReader(new FileReader(hmms));
148 moveLocationBy(currentFilePosition, inputHMM);
149 moveLocationBy(currentFilePosition, inputSTO);
// NOTE(review): the declaration and increment of filesRead, the HMM read and
// the processData call are not visible in this listing.
153 while (filesRead < increments)
156 readStockholm(inputSTO);
160 int count = countValidResidues();
164 currentFilePosition++;
165 System.out.println(i);
// Persist the new position and the accumulated data so a later call resumes
// from here.
169 PrintWriter p = new PrintWriter(
170 new File(currentFolder + "/CurrentPosition.txt"));
171 p.print(currentFilePosition);
173 exportData(currentFolder);
180 * Analyses all families and then saves the data. Before analysing the data,
181 * the previously saved data will be imported and after analysing this, the data
182 * is exported back into the file. The file must be in flat file format.
185 * The number of families to read before saving.
186 * @throws IOException
188 public void runToEnd(boolean keepRawData, boolean forClans)
191 keepRaw = keepRawData;
192 BufferedReader inputSTO = null;
193 BufferedReader inputHMM = null;
// NOTE(review): `files` and `filesRead` are declared on lines not shown in
// this listing.
203 for (int clan = 0; clan < files; clan++)
205 System.out.println(clan);
206 String clanPath = "";
207 int numberOfFamilies = 0;
// Per-clan layout: <currentFolder>/Clan<N>/NumberOfFamilies.txt holds the
// family count as its first line.
// NOTE(review): the conditions selecting between this branch and the
// `numberOfFamilies = 1` branch below are not visible; presumably they test
// forClans - confirm. The action taken when the clan folder is missing is
// not visible either.
210 clanPath = currentFolder + "/Clan" + clan;
211 if (!new File(clanPath).exists())
215 BufferedReader famCountReader = new BufferedReader(
216 new FileReader(clanPath + "/NumberOfFamilies.txt"));
217 numberOfFamilies = Integer.parseInt(famCountReader.readLine());
221 numberOfFamilies = 1;
224 for (int fam = 0; fam < numberOfFamilies; fam++)
// These assignments overwrite the instance fields `families` and `hmms`, so
// later calls that read those fields (printFam, getHMMName, ...) see the last
// per-clan paths written here.
228 families = clanPath + "/Families/Fam" + fam + ".sto";
229 hmms = clanPath + "/HMMs/HMM" + fam + ".hmm";
232 inputSTO = new BufferedReader(new FileReader(families));
233 inputHMM = new BufferedReader(new FileReader(hmms));
// atEnd() is checked before the first read and again after each family.
// NOTE(review): the loop statement using endReached is not visible.
237 boolean endReached = atEnd(inputSTO);
240 readStockholm(inputSTO);
243 int count = countValidResidues();
246 System.out.println(filesRead);
247 endReached = atEnd(inputSTO);
// NOTE(review): the handler body for this broad catch is not visible in this
// listing.
251 } catch (Exception e)
256 exportData(currentFolder);
263 * Reads the previous data from both files
266 * @throws IOException
// NOTE(review): the body is not visible in this listing. Presumably it calls
// readBinned(source) and readRaw(source) - confirm.
268 public void readPreviousData(String source) throws IOException
278 * Reads the previous data from the binned file.
281 * @throws IOException
283 public void readBinned(String source) throws IOException
// Input file is <source>/Binned.csv.
285 BufferedReader input = new BufferedReader(
286 new FileReader(source + BINNED));
287 String line = input.readLine();
// Any bins already in memory are discarded, not merged.
288 binned = new HashMap<>();
// Reading stops at the first empty line or at end of file.
289 while (!("".equals(line) || line == null))
// Each line is "key,value", the format exportData writes.
291 Scanner scanner = new Scanner(line);
292 scanner.useDelimiter(",");
293 String key = scanner.next();
294 String value = scanner.next();
295 binned.put(key, Double.valueOf(value));
297 line = input.readLine();
304 * Reads the previous data from the raw file.
307 * @throws IOException
309 public void readRaw(String source) throws IOException
// Input file is <source>/Raw.csv.
311 BufferedReader input = new BufferedReader(new FileReader(source + RAW));
312 String line = input.readLine();
// The first line is a header. Its fields are only counted: one empty column
// list is created per comma-separated field and the field text is discarded.
318 Scanner numberScanner = new Scanner(line);
319 numberScanner.useDelimiter(",");
320 raw = new ArrayList<>();
321 while (numberScanner.hasNext())
323 numberScanner.next();
324 raw.add(new ArrayList<Double>());
326 numberScanner.close();
// Remaining lines are data rows, read until the first empty line or end of
// file.
328 line = input.readLine();
329 while (!("".equals(line) || line == null))
331 Scanner scanner = new Scanner(line);
332 scanner.useDelimiter(",");
// NOTE(review): the declarations of `i` and `value` and the increment of `i`
// are not visible; `i` is presumably the column index within the row.
335 while (scanner.hasNext())
338 value = scanner.next();
// "EMPTY" is the placeholder exportData writes for a column shorter than the
// current row.
339 if (!value.equals("EMPTY"))
341 raw.get(i).add(Double.parseDouble(value));
// NOTE(review): presumably the else branch. Storing null lengthens the
// column, so exportData's size check no longer yields "EMPTY" for this cell;
// it would write the text "null", which parseDouble above cannot read back.
// Confirm against the full source.
345 raw.get(i).add(null);
351 line = input.readLine();
358 * Counts the number of valid residues in the sequence.
362 public int countValidResidues()
// Visits every (sequence, HMM node) pair. Nodes are numbered 1..getLength()
// inclusive.
366 for (int width = 0; width < sequences.size(); width++)
368 for (int length = 1; length < hmm.getLength() + 1; length++)
// Map the node to its alignment column and read that column's symbol from
// the current sequence.
372 alignPos = hmm.getNodeAlignmentColumn(length);
374 symbol = sequences.get(width).getCharAt(alignPos);
// A symbol is tested for membership in the "amino" background frequency
// table.
// NOTE(review): the counter update and the return statement are not visible
// in this listing.
375 if (ResidueProperties.backgroundFrequencies.get("amino")
376 .containsKey(symbol))
387 * Processes data, and stores it in both a raw and binned format.
// `count` is the divisor used to normalise each bin increment and the final
// printed mean; callers in this file pass the result of countValidResidues().
391 public void processData(int count)
// Start a new raw column for this family.
// NOTE(review): whether this is conditional on keepRaw is not visible.
396 raw.add(new ArrayList<Double>());
397 rawPos = raw.size() - 1;
// Same traversal as countValidResidues: every sequence against every node
// 1..getLength().
400 for (int width = 0; width < sequences.size(); width++)
402 for (int length = 1; length < hmm.getLength() + 1; length++)
406 alignPos = hmm.getNodeAlignmentColumn(length);
408 symbol = sequences.get(width).getCharAt(alignPos);
409 if (ResidueProperties.backgroundFrequencies.get("amino")
410 .containsKey(symbol))
// NOTE(review): the emission probability is looked up with alignPos (the
// alignment column), not with the node index `length` - confirm this matches
// what getMatchEmissionProbability expects.
415 prob = hmm.getMatchEmissionProbability(alignPos, symbol);
416 bfreq = ResidueProperties.backgroundFrequencies.get("amino")
// NOTE(review): only a message is visibly emitted for a zero probability or
// frequency; unless a line not shown skips the entry, Math.log below would
// produce an infinite or NaN value.
418 if (prob == 0 || bfreq == 0)
420 System.out.println("error");
// Natural-log ratio of emission probability to background frequency.
422 llr = Math.log(prob / bfreq);
425 raw.get(rawPos).add(llr);
// The bin key is the ratio rounded to one decimal place.
// NOTE(review): String.format without a Locale uses the default locale. With
// a comma decimal separator the parseDouble below would fail and the key
// would clash with the CSV delimiter.
429 output = String.format("%.1f", llr);
430 total += Double.parseDouble(output);
// NOTE(review): the action taken for "-0.0" is not visible; presumably it is
// normalised to "0.0" so both share one bin - confirm.
431 if ("-0.0".equals(output))
// Add SCALE / count to the bin, creating it on first use.
435 if (binned.containsKey(output))
437 double prev = binned.get(output);
438 prev += (SCALE / count);
439 binned.put(output, prev);
444 binned.put(output, SCALE / count);
// Prints the sum of the rounded ratios divided by count.
449 System.out.println(total / count);
454 * Reads in the sequence data from a Stockholm file.
457 * @throws IOException
459 public void readStockholm(BufferedReader inputSTO) throws IOException
// Wrap the reader for the parser; the empty string is passed as the second
// FileParse argument.
461 FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE);
462 StockholmFile file = new StockholmFile(parserSTO);
463 Vector<AlignmentAnnotation> annots = file.getAnnotations();
// Look for annotations whose label contains "Reference".
// NOTE(review): the action taken on a match is not visible; presumably it
// assigns the `reference` field - confirm.
465 for (AlignmentAnnotation annot : annots)
467 if (annot.label.contains("Reference"))
// Replace the current sequence set with the one just parsed.
472 sequences = file.getSeqs();
476 * Reads in the HMM data from a HMMer file.
479 * @throws IOException
481 public void readHMM(BufferedReader inputHMM) throws IOException
// Same wrapping as readStockholm, with an HMMFile parser.
// NOTE(review): the line storing the parsed model in the `hmm` field is not
// visible in this listing.
483 FileParse parserHMM = new FileParse(inputHMM, "", DataSourceType.FILE);
484 HMMFile file = new HMMFile(parserHMM);
491 * Exports both the binned and raw data into separate files.
494 * @throws FileNotFoundException
496 public void exportData(String location) throws FileNotFoundException
// Binned data: one "key,value" line per bin, in the map's iteration order
// (binned is created as a HashMap).
498 PrintWriter writerBin = new PrintWriter(new File(location + BINNED));
499 for (Map.Entry<String, Double> entry : binned.entrySet())
501 writerBin.println(entry.getKey() + "," + entry.getValue());
// Raw data: a header row "Fam 1,Fam 2,...", then one row per index.
// NOTE(review): whether this section is conditional on keepRaw, and where the
// writers are closed, is not visible in this listing.
507 PrintWriter writerRaw = new PrintWriter(new File(location + RAW));
509 StringBuilder identifier = new StringBuilder();
511 for (int i = 1; i < raw.size() + 1; i++)
513 identifier.append("Fam " + i + ",");
516 writerRaw.println(identifier);
// NOTE(review): the row loop, the declaration of `row` and the use of
// rowIsEmpty are not visible.
518 boolean rowIsEmpty = false;
523 StringBuilder string = new StringBuilder();
524 for (int column = 0; column < raw.size(); column++)
// Columns differ in length; a column with no value at this row gets the
// placeholder "EMPTY", which readRaw recognises.
526 if (raw.get(column).size() <= row)
528 string.append("EMPTY,");
532 string.append(raw.get(column).get(row) + ",");
537 writerRaw.println(string);
546 * Prints the specified family on the console.
549 * @throws IOException
551 public void printFam(int index) throws IOException
// A fresh reader is opened on the families file and `index` entries are
// skipped.
553 BufferedReader br = new BufferedReader(new FileReader(families));
555 moveLocationBy(index, br);
557 String line = br.readLine();
// Echo lines up to the "//" terminator.
559 while (!"//".equals(line))
561 System.out.println(line);
562 line = br.readLine();
// Presumably after the loop: prints the "//" terminator itself.
564 System.out.println(line);
570 * Prints the specified HMM on the console.
573 * @throws IOException
575 public void printHMM(int index) throws IOException
// Same procedure as printFam, reading from the HMM file.
577 BufferedReader br = new BufferedReader(new FileReader(hmms));
579 moveLocationBy(index, br);
581 String line = br.readLine();
583 while (!"//".equals(line))
585 System.out.println(line);
586 line = br.readLine();
// Presumably after the loop: prints the "//" terminator itself.
588 System.out.println(line);
594 * Prints the specified family to a .sto file.
597 * @throws IOException
599 public void exportFam(int index, String location) throws IOException
601 BufferedReader br = new BufferedReader(new FileReader(families));
603 moveLocationBy(index, br);
605 String line = br.readLine();
// The output stream is opened in append mode (second argument true), so an
// existing file at `location` is extended, not replaced.
606 PrintWriter writer = new PrintWriter(
607 new FileOutputStream(new File(location), true));
608 while (!"//".equals(line))
610 writer.println(line);
611 line = br.readLine();
// Presumably after the loop: writes the "//" terminator.
// NOTE(review): the writer's close/flush is not visible in this listing.
613 writer.println(line);
// Copies one entry from `br` (from its current position up to and including
// the "//" line) into the file at `location`; `append` is passed straight to
// FileOutputStream. The reader is left positioned after the entry, which
// sortIntoClans relies on to stay in step.
619 public void exportFile(BufferedReader br, String location, boolean append)
622 String line = br.readLine();
623 PrintWriter writer = new PrintWriter(
624 new FileOutputStream(location, append));
625 while (!"//".equals(line))
627 writer.println(line);
628 line = br.readLine();
// Presumably after the loop: writes the "//" terminator.
630 writer.println(line);
// Looks up the name of the HMM entry at `index` in the HMM file.
636 public String getHMMName(int index) throws IOException
640 BufferedReader nameFinder = new BufferedReader(new FileReader(hmms));
642 moveLocationBy(index, nameFinder);
// The entry's first line is discarded; the name is taken from the second.
644 nameFinder.readLine();
// The second token on that line is kept (the first next() result is
// overwritten).
// NOTE(review): presumably the line has the form "NAME <name>" - confirm
// against the HMM file format.
646 Scanner scanner = new Scanner(nameFinder.readLine());
647 name = scanner.next();
648 name = scanner.next();
// Looks up the name of the family entry at `index` in the families file.
653 public String getFamilyName(int index) throws IOException
657 BufferedReader nameFinder = new BufferedReader(
658 new FileReader(families));
660 moveLocationBy(index, nameFinder);
// The entry's first line is discarded; the name is taken from the second.
662 nameFinder.readLine();
// The third token on that line is kept (the first two next() results are
// overwritten).
// NOTE(review): presumably the line has the form "#=GF ID <name>" - confirm
// against the Stockholm file.
664 Scanner scanner = new Scanner(nameFinder.readLine());
665 name = scanner.next();
666 name = scanner.next();
667 name = scanner.next();
673 * Prints the specified HMM to a .hmm file.
676 * @throws IOException
678 public void exportHMM(int index, String location) throws IOException
// Same procedure as exportFam, reading from the HMM file.
682 BufferedReader br = new BufferedReader(new FileReader(hmms));
684 moveLocationBy(index, br);
686 String line = br.readLine();
// Opened in append mode: an existing file at `location` is extended.
688 PrintWriter writer = new PrintWriter(
689 new FileOutputStream(new File(location), true));
690 while (!"//".equals(line))
692 writer.println(line);
693 line = br.readLine();
// Presumably after the loop: writes the "//" terminator.
695 writer.println(line);
702 * Clears all raw, binned and current position data in the current directory.
704 * @throws FileNotFoundException
706 public void clear() throws FileNotFoundException
// Constructing a PrintWriter on a file name truncates an existing file (or
// creates it), which is how each of the three files is emptied.
// NOTE(review): any print/close calls on these writers are not visible in
// this listing.
708 PrintWriter pos = new PrintWriter(
709 currentFolder + "/CurrentPosition.txt");
// This local `raw` shadows the `raw` field for the rest of the method; the
// in-memory data is not visibly touched here.
712 PrintWriter raw = new PrintWriter(currentFolder + RAW);
714 PrintWriter bin = new PrintWriter(currentFolder + BINNED);
// Splits the families file and the HMM file into per-clan folders under
// `directory`, driven by the clan-links file. The three readers advance one
// entry at a time.
721 public void sortIntoClans(String directory) throws IOException
723 BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN));
724 BufferedReader familyReader = new BufferedReader(
725 new FileReader(families));
726 BufferedReader hmmReader = new BufferedReader(new FileReader(hmms));
728 // moveLocationBy(7000, familyReader);
729 // moveLocationBy(7000, clanFinder);
730 // moveLocationBy(7000, hmmReader);
// clanIndexes maps a clan name to its numeric folder index; familyCounts is
// indexed by that number and gives the next free Fam/HMM file number.
731 HashMap<String, Integer> clanIndexes = new HashMap<>();
732 ArrayList<Integer> familyCounts = new ArrayList<>();
// NOTE(review): the declarations of `line`, `clanName` and `clanCount` are
// not visible in this listing.
736 line = clanFinder.readLine();
// Outer loop: one pass per entry of the clan-links file, ending at an empty
// line, a single-space line or end of file.
738 while (!"".equals(line) && !" ".equals(line) && line != null)
741 boolean inClan = false;
// Inner loop: scan the entry's lines until one containing "//".
742 while (!(line.indexOf("//") > -1))
// A line containing "#=GF CL" marks the entry as belonging to a clan.
745 if (line.indexOf("#=GF CL") > -1)
748 System.out.println(families);
// NOTE(review): token-skipping calls before this next() are not visible, so
// which token of the line becomes clanName cannot be confirmed here.
750 Scanner scanner = new Scanner(line);
753 clanName = scanner.next();
// A clan seen for the first time gets the next index.
// NOTE(review): the clanCount increment and the matching familyCounts.add
// are not visible.
756 if (!clanIndexes.containsKey(clanName))
758 clanIndexes.put(clanName, clanCount);
764 Integer clanI = clanIndexes.get(clanName);
765 String clanPath = directory + "/Clan" + clanI.toString();
766 createFolders(clanPath);
// Copy the current family entry and HMM entry into the clan's folders, both
// numbered with the clan's running family count.
768 int index = clanIndexes.get(clanName);
769 exportFile(familyReader,
770 clanPath + "/Families/Fam" + familyCounts.get(index)
773 exportFile(hmmReader,
774 clanPath + "/HMMs/HMM" + familyCounts.get(index) + ".hmm",
// NOTE(review): the increment between these two lines is not visible;
// presumably count++.
777 int count = familyCounts.get(index);
779 familyCounts.set(index, count);
781 line = clanFinder.readLine();
// NOTE(review): presumably guarded by !inClan on a line not shown. It skips
// the family and HMM entries that were not exported, keeping the readers
// aligned with the clan-links file.
786 moveLocationBy(1, familyReader);
787 moveLocationBy(1, hmmReader);
790 // System.out.println(filePos + " files read.");
791 line = clanFinder.readLine();
// Record each clan's family count in <directory>/Clan<N>/NumberOfFamilies.txt,
// the file runToEnd and alignWithinClan read.
// NOTE(review): the write and close calls are not visible in this listing.
796 for (int clan = 0; clan < clanCount; clan++)
798 PrintWriter writer = new PrintWriter(
799 directory + "/Clan" + clan + "/NumberOfFamilies.txt");
800 int count = familyCounts.get(clan);
// NOTE(review): the body is not visible; presumably returns the `families`
// field. If so, after setFamilies(x) it yields currentFolder + x, not x.
807 public String getFamilies()
// The argument is treated as a path relative to currentFolder: the stored
// value is currentFolder with `families` appended, with no separator added.
812 public void setFamilies(String families)
814 this.families = currentFolder + families;
// NOTE(review): the body is not visible; presumably returns the `hmms` field.
// If so, after setHmms(x) it yields currentFolder + x, not x.
817 public String getHmms()
// The argument is treated as a path relative to currentFolder: the stored
// value is currentFolder with `hmms` appended, with no separator added.
822 public void setHmms(String hmms)
824 this.hmms = currentFolder + hmms;
// For each clan folder under `clansLocation`, runs the external hmmalign
// binary per family with an HMM index chosen by getRandom, and writes the
// filtered output plus a copy of the HMM under `exportLocation`/Clan<N>.
827 public void alignWithinClan(String exportLocation, String clansLocation)
828 throws IOException, InterruptedException
830 int alignmentsExported = 0;
// The clan count is hard-coded: folders Clan0..Clan603 must exist, because
// each one's NumberOfFamilies.txt is opened unconditionally below.
831 for (int clan = 0; clan < 604; clan++)
833 System.out.println(clan);
835 String clanPath = clansLocation + "/Clan" + clan;
836 int numberOfFamilies;
837 BufferedReader br = new BufferedReader(
838 new FileReader(clanPath + "/NumberOfFamilies.txt"));
839 String line = br.readLine();
840 numberOfFamilies = Integer.parseInt(line);
// NOTE(review): the action for single-family clans is not visible;
// presumably they are skipped - confirm.
842 if (numberOfFamilies == 1)
846 final String commandExportLocation = exportLocation + "/Clan" + clan;
847 createFolders(commandExportLocation);
848 for (int family = 0; family < numberOfFamilies; family++)
// Candidate HMM indexes for this family.
// NOTE(review): the loop body that fills `indexes` is not visible;
// presumably it adds every i except `family` - confirm.
851 ArrayList<Integer> indexes = new ArrayList<>();
852 for (int i = 0; i < numberOfFamilies; i++)
860 int hmmIndex = getRandom(indexes);
861 String famPath = clanPath + "/Families/Fam" + family + ".sto";
862 String hmmPath = clanPath + "/HMMs/HMM" + hmmIndex + ".hmm";
// The --mapali argument is the family file that shares the chosen HMM's
// index.
// NOTE(review): the command is a single string passed to Runtime.exec, which
// splits on whitespace, so paths containing spaces would be broken into
// several arguments. The tail of the command is on lines not shown.
863 String command = "/media/sf_Shared_Folder/hmmer/binaries/hmmalign --mapali "
864 + clanPath + "/Families/Fam" + hmmIndex + ".sto"
866 command += hmmPath + " ";
// Final copies so the anonymous Runnable below can capture them.
868 final int familyIndex = family;
869 final Process p = Runtime.getRuntime().exec(command);
// The process output is consumed on a separate thread.
// NOTE(review): the start/join/waitFor calls are not visible in this listing.
871 new Thread(new Runnable()
876 BufferedReader input = new BufferedReader(
877 new InputStreamReader(p.getInputStream()));
882 PrintWriter writer = new PrintWriter(commandExportLocation
883 + "/Families/Fam" + familyIndex + ".sto");
884 String lastLine = "";
885 boolean dataFound = false;
// NOTE(review): `line` here is not the non-final local declared in the outer
// method (an anonymous class could not assign it), so it must be declared on
// a line not shown.
886 while ((line = input.readLine()) != null)
// The first line containing "#=GR" triggers writing the line remembered in
// lastLine.
// NOTE(review): the updates to dataFound and lastLine are not visible.
888 if (line.contains("#=GR") && !dataFound)
890 writer.println(lastLine);
// A line is written if it contains "#", is blank or a single space, is the
// "//" terminator, or once dataFound is set.
893 if (line.contains("#") || dataFound || " ".equals(line)
894 || "".equals(line) || "//".equals(line))
896 writer.println(line);
901 } catch (IOException e)
// Copy the chosen HMM next to the alignment, renumbered to the family's
// index.
910 BufferedReader hmmExporter = new BufferedReader(
911 new FileReader(hmmPath));
913 exportFile(hmmExporter,
914 commandExportLocation + "/HMMs/HMM" + family + ".hmm",
917 alignmentsExported++;
// NOTE(review): the declaration and updates of famCount are not visible.
921 PrintWriter writer = new PrintWriter(
922 commandExportLocation + "/NumberOfFamilies.txt");
923 writer.print(famCount);
// Used by runToEnd to decide whether another entry follows on `br`.
929 public boolean atEnd(BufferedReader br) throws IOException
// A line is read to probe the stream.
// NOTE(review): readLine() consumes that line. Unless lines not shown
// mark/reset the reader, the first line of the next entry is lost to the
// parser - confirm.
933 String line = br.readLine();
// An empty line or end of stream is treated as the end.
934 if ("".equals(line) || line == null)
// Picks an element of `list` at a uniformly random index, using the shared
// `generator`.
942 public int getRandom(ArrayList<Integer> list)
// NOTE(review): only a message is visibly printed for an empty list. Unless
// a line not shown returns early, nextInt(0) below throws
// IllegalArgumentException.
944 if (!(list.size() > 0))
946 System.out.println("Error - size = " + list.size());
948 int index = generator.nextInt(list.size());
949 int value = list.get(index);
954 public void createFolders(String clanPath)
956 File clanFolder = new File(clanPath);
957 if (!clanFolder.exists())
962 File famFolder = new File(clanPath + "/Families");
963 File hmmFolder = new File(clanPath + "/HMMs");
964 if (!famFolder.exists())