3 import jalview.datamodel.AlignmentAnnotation;
4 import jalview.datamodel.HiddenMarkovModel;
5 import jalview.datamodel.SequenceI;
6 import jalview.io.DataSourceType;
7 import jalview.io.FileParse;
8 import jalview.io.HMMFile;
9 import jalview.io.StockholmFile;
10 import jalview.schemes.ResidueProperties;
12 import java.io.BufferedReader;
14 import java.io.FileNotFoundException;
15 import java.io.FileOutputStream;
16 import java.io.FileReader;
17 import java.io.IOException;
18 import java.io.InputStreamReader;
19 import java.io.PrintWriter;
20 import java.util.ArrayList;
21 import java.util.HashMap;
22 import java.util.List;
24 import java.util.Random;
25 import java.util.Scanner;
26 import java.util.Vector;
29 * Processes probability data. The file indexes used in this program represent
30 * the index of the location of a family or hmm in their respective files,
36 public class HMMProbabilityDistributionAnalyser
38 AlignmentAnnotation reference = null;
40 Vector<SequenceI> sequences;
42 HiddenMarkovModel hmm;
44 // contains the raw data produced
45 List<ArrayList<Double>> raw = new ArrayList<>();
47 // contains binned data
48 Map<String, Double> binned = new HashMap<>();
50 // location of the family file
51 String families = "/media/sf_Shared_Folder/PFAM/Family/SeedFamilies.seed";
53 // location of the file containing the family-clan links
54 final static String FAMILIESTOCLAN = "/media/sf_Shared_Folder/PFAM/Family/Clanlinks.dat";
56 // location of the HMM file
57 String hmms = "/media/sf_Shared_Folder/PFAM/HMMs/Pfam-A.hmm";
59 // suffix for raw file
60 final static String RAW = "/Raw.csv";
62 // suffix for binned file
63 final static String BINNED = "/Binned.csv";
65 // normalisation scale
66 final static double SCALE = 1;
68 // current position in file
69 int currentFilePosition = 0;
71 final static String NL = "\n";
73 Random generator = new Random();
78 boolean keepRaw = false;
81 * Sets the working directory.
85 public void setFolder(String path)
91 * Moves a buffered reader forward in the file by a certain amount of entries.
92 * Each entry in the file is delimited by '//'.
95 * The index of the location in the file.
99 public void moveLocationBy(int index, BufferedReader br)
102 for (int i = 0; i < index; i++)
104 String line = br.readLine();
105 while (!"//".equals(line))
107 line = br.readLine();
115 * Analyses a specified number of families and then saves the data. Before
116 * analysing the data, the previous saved data will be imported and after
117 * analysing this, the data is exported back into the file. The file must be
118 * in flat file format.
121 * The number of families to read before saving.
122 * @throws IOException
124 public void run(int increments, boolean keepRawData) throws IOException
126 keepRaw = keepRawData;
129 readPreviousData(currentFolder);
130 BufferedReader posReader = new BufferedReader(
131 new FileReader(currentFolder + "/CurrentPosition.txt"));
133 String line = posReader.readLine();
135 currentFilePosition = Integer.parseInt(line);
136 } catch (Exception e)
138 System.out.println("No previous data found");
143 BufferedReader inputSTO = new BufferedReader(new FileReader(families));
144 BufferedReader inputHMM = new BufferedReader(new FileReader(hmms));
148 moveLocationBy(currentFilePosition, inputHMM);
149 moveLocationBy(currentFilePosition, inputSTO);
153 while (filesRead < increments)
156 readStockholm(inputSTO);
160 int count = countValidResidues();
164 currentFilePosition++;
165 System.out.println(i);
169 PrintWriter p = new PrintWriter(
170 new File(currentFolder + "/CurrentPosition.txt"));
171 p.print(currentFilePosition);
173 exportData(currentFolder);
180 * Analyses all families and then saves the data. Before analysing the data,
181 * the previous saved data will be imported and after analysing this, the data
182 * is exported back into the file. The file must be in flat file format.
185 * The number of families to read before saving.
186 * @throws IOException
188 public void runToEnd(int minCount, int maxCount, boolean keepRawData,
192 keepRaw = keepRawData;
193 BufferedReader inputSTO = null;
194 BufferedReader inputHMM = null;
204 for (int clan = 0; clan < files; clan++)
206 System.out.println(clan);
207 String clanPath = "";
208 int numberOfFamilies = 0;
211 clanPath = currentFolder + "/Clan" + clan;
212 if (!new File(clanPath).exists())
216 BufferedReader famCountReader = new BufferedReader(
217 new FileReader(clanPath + "/NumberOfFamilies.txt"));
218 numberOfFamilies = Integer.parseInt(famCountReader.readLine());
222 numberOfFamilies = 1;
225 for (int fam = 0; fam < numberOfFamilies; fam++)
229 families = clanPath + "/Families/Fam" + fam + ".sto";
230 hmms = clanPath + "/HMMs/HMM" + fam + ".hmm";
233 inputSTO = new BufferedReader(new FileReader(families));
234 inputHMM = new BufferedReader(new FileReader(hmms));
238 boolean endReached = atEnd(inputSTO);
241 readStockholm(inputSTO);
244 int count = countValidResidues();
245 if (count >= minCount && count < maxCount)
250 System.out.println(filesRead);
251 endReached = atEnd(inputSTO);
255 } catch (Exception e)
260 exportData(currentFolder);
267 * Reads the previous data from both files
270 * @throws IOException
272 public void readPreviousData(String source) throws IOException
282 * Reads the previous data from the binned file.
285 * @throws IOException
287 public void readBinned(String source) throws IOException
289 BufferedReader input = new BufferedReader(
290 new FileReader(source + BINNED));
291 String line = input.readLine();
292 binned = new HashMap<>();
293 while (!("".equals(line) || line == null))
295 Scanner scanner = new Scanner(line);
296 scanner.useDelimiter(",");
297 String key = scanner.next();
298 String value = scanner.next();
299 binned.put(key, Double.valueOf(value));
301 line = input.readLine();
308 * Reads the previous data from the raw file.
311 * @throws IOException
313 public void readRaw(String source) throws IOException
315 BufferedReader input = new BufferedReader(new FileReader(source + RAW));
316 String line = input.readLine();
322 Scanner numberScanner = new Scanner(line);
323 numberScanner.useDelimiter(",");
324 raw = new ArrayList<>();
325 while (numberScanner.hasNext())
327 numberScanner.next();
328 raw.add(new ArrayList<Double>());
330 numberScanner.close();
332 line = input.readLine();
333 while (!("".equals(line) || line == null))
335 Scanner scanner = new Scanner(line);
336 scanner.useDelimiter(",");
339 while (scanner.hasNext())
342 value = scanner.next();
343 if (!value.equals("EMPTY"))
345 raw.get(i).add(Double.parseDouble(value));
349 raw.get(i).add(null);
355 line = input.readLine();
362 * Counts the number of valid residues in the sequence.
366 public int countValidResidues()
370 for (int width = 0; width < sequences.size(); width++)
372 for (int length = 1; length < hmm.getLength() + 1; length++)
376 alignPos = hmm.getNodeMapPosition(length);
378 symbol = sequences.get(width).getCharAt(alignPos);
379 if (ResidueProperties.backgroundFrequencies.get("amino")
380 .containsKey(symbol))
391 * Processes data, and stores it in both a raw and binned format.
395 public void processData(int count)
400 raw.add(new ArrayList<Double>());
401 rawPos = raw.size() - 1;
404 for (int width = 0; width < sequences.size(); width++)
406 for (int length = 1; length < hmm.getLength() + 1; length++)
410 alignPos = hmm.getNodeMapPosition(length);
412 symbol = sequences.get(width).getCharAt(alignPos);
413 if (ResidueProperties.backgroundFrequencies.get("amino")
414 .containsKey(symbol))
419 prob = hmm.getMatchEmissionProbability(alignPos, symbol);
420 bfreq = ResidueProperties.backgroundFrequencies.get("amino")
422 if (prob == 0 || bfreq == 0)
424 System.out.println("error");
426 llr = Math.log(prob / bfreq);
429 raw.get(rawPos).add(llr);
433 output = String.format("%.1f", llr);
434 total += Double.parseDouble(output);
435 if ("-0.0".equals(output))
439 if (binned.containsKey(output))
441 double prev = binned.get(output);
442 prev += (SCALE / count);
443 binned.put(output, prev);
448 binned.put(output, SCALE / count);
453 System.out.println(total / count);
458 * Reads in the sequence data from a Stockholm file.
461 * @throws IOException
463 public void readStockholm(BufferedReader inputSTO) throws IOException
465 FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE);
466 StockholmFile file = new StockholmFile(parserSTO);
467 Vector<AlignmentAnnotation> annots = file.getAnnotations();
469 for (AlignmentAnnotation annot : annots)
471 if (annot.label.contains("Reference"))
476 sequences = file.getSeqs();
480 * Reads in the HMM data from a HMMer file.
483 * @throws IOException
485 public void readHMM(BufferedReader inputHMM) throws IOException
487 FileParse parserHMM = new FileParse(inputHMM, "", DataSourceType.FILE);
488 HMMFile file = new HMMFile(parserHMM);
495 * Exports both the binned and raw data into separate files.
498 * @throws FileNotFoundException
500 public void exportData(String location) throws FileNotFoundException
502 PrintWriter writerBin = new PrintWriter(new File(location + BINNED));
503 for (Map.Entry<String, Double> entry : binned.entrySet())
505 writerBin.println(entry.getKey() + "," + entry.getValue());
511 PrintWriter writerRaw = new PrintWriter(new File(location + RAW));
513 StringBuilder identifier = new StringBuilder();
515 for (int i = 1; i < raw.size() + 1; i++)
517 identifier.append("Fam " + i + ",");
520 writerRaw.println(identifier);
522 boolean rowIsEmpty = false;
527 StringBuilder string = new StringBuilder();
528 for (int column = 0; column < raw.size(); column++)
530 if (raw.get(column).size() <= row)
532 string.append("EMPTY,");
536 string.append(raw.get(column).get(row) + ",");
541 writerRaw.println(string);
550 * Prints the specified family on the console.
553 * @throws IOException
555 public void printFam(int index) throws IOException
557 BufferedReader br = new BufferedReader(new FileReader(families));
559 moveLocationBy(index, br);
561 String line = br.readLine();
563 while (!"//".equals(line))
565 System.out.println(line);
566 line = br.readLine();
568 System.out.println(line);
574 * Prints the specified HMM on the console.
577 * @throws IOException
579 public void printHMM(int index) throws IOException
581 BufferedReader br = new BufferedReader(new FileReader(hmms));
583 moveLocationBy(index, br);
585 String line = br.readLine();
587 while (!"//".equals(line))
589 System.out.println(line);
590 line = br.readLine();
592 System.out.println(line);
598 * Prints the specified family to a .sto file.
601 * @throws IOException
603 public void exportFam(int index, String location) throws IOException
605 BufferedReader br = new BufferedReader(new FileReader(families));
607 moveLocationBy(index, br);
609 String line = br.readLine();
610 PrintWriter writer = new PrintWriter(
611 new FileOutputStream(new File(location), true));
612 while (!"//".equals(line))
614 writer.println(line);
615 line = br.readLine();
617 writer.println(line);
623 public void exportFile(BufferedReader br, String location, boolean append)
626 String line = br.readLine();
627 PrintWriter writer = new PrintWriter(
628 new FileOutputStream(location, append));
629 while (!"//".equals(line))
631 writer.println(line);
632 line = br.readLine();
634 writer.println(line);
640 public String getHMMName(int index) throws IOException
644 BufferedReader nameFinder = new BufferedReader(new FileReader(hmms));
646 moveLocationBy(index, nameFinder);
648 nameFinder.readLine();
650 Scanner scanner = new Scanner(nameFinder.readLine());
651 name = scanner.next();
652 name = scanner.next();
657 public String getFamilyName(int index) throws IOException
661 BufferedReader nameFinder = new BufferedReader(
662 new FileReader(families));
664 moveLocationBy(index, nameFinder);
666 nameFinder.readLine();
668 Scanner scanner = new Scanner(nameFinder.readLine());
669 name = scanner.next();
670 name = scanner.next();
671 name = scanner.next();
677 * Prints the specified family to a .hmm file.
680 * @throws IOException
682 public void exportHMM(int index, String location) throws IOException
686 BufferedReader br = new BufferedReader(new FileReader(hmms));
688 moveLocationBy(index, br);
690 String line = br.readLine();
692 PrintWriter writer = new PrintWriter(
693 new FileOutputStream(new File(location), true));
694 while (!"//".equals(line))
696 writer.println(line);
697 line = br.readLine();
699 writer.println(line);
706 * Clears all raw, binned and current position data in the current directory.
708 * @throws FileNotFoundException
710 public void clear() throws FileNotFoundException
712 PrintWriter pos = new PrintWriter(
713 currentFolder + "/CurrentPosition.txt");
716 PrintWriter raw = new PrintWriter(currentFolder + RAW);
718 PrintWriter bin = new PrintWriter(currentFolder + BINNED);
725 public void sortIntoClans(String directory) throws IOException
727 BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN));
728 BufferedReader familyReader = new BufferedReader(
729 new FileReader(families));
730 BufferedReader hmmReader = new BufferedReader(new FileReader(hmms));
732 // moveLocationBy(7000, familyReader);
733 // moveLocationBy(7000, clanFinder);
734 // moveLocationBy(7000, hmmReader);
735 HashMap<String, Integer> clanIndexes = new HashMap<>();
736 ArrayList<Integer> familyCounts = new ArrayList<>();
740 line = clanFinder.readLine();
742 while (!"".equals(line) && !" ".equals(line) && line != null)
745 boolean inClan = false;
746 while (!(line.indexOf("//") > -1))
749 if (line.indexOf("#=GF CL") > -1)
752 System.out.println(families);
754 Scanner scanner = new Scanner(line);
757 clanName = scanner.next();
760 if (!clanIndexes.containsKey(clanName))
762 clanIndexes.put(clanName, clanCount);
768 Integer clanI = clanIndexes.get(clanName);
769 String clanPath = directory + "/Clan" + clanI.toString();
770 createFolders(clanPath);
772 int index = clanIndexes.get(clanName);
773 exportFile(familyReader,
774 clanPath + "/Families/Fam" + familyCounts.get(index)
777 exportFile(hmmReader,
778 clanPath + "/HMMs/HMM" + familyCounts.get(index) + ".hmm",
781 int count = familyCounts.get(index);
783 familyCounts.set(index, count);
785 line = clanFinder.readLine();
790 moveLocationBy(1, familyReader);
791 moveLocationBy(1, hmmReader);
794 // System.out.println(filePos + " files read.");
795 line = clanFinder.readLine();
800 for (int clan = 0; clan < clanCount; clan++)
802 PrintWriter writer = new PrintWriter(
803 directory + "/Clan" + clan + "/NumberOfFamilies.txt");
804 int count = familyCounts.get(clan);
811 public String getFamilies()
816 public void setFamilies(String families)
818 this.families = currentFolder + families;
821 public String getHmms()
826 public void setHmms(String hmms)
828 this.hmms = currentFolder + hmms;
831 public void alignWithinClan(String exportLocation, String clansLocation)
832 throws IOException, InterruptedException
834 int alignmentsExported = 0;
835 for (int clan = 0; clan < 604; clan++)
837 System.out.println(clan);
839 String clanPath = clansLocation + "/Clan" + clan;
840 int numberOfFamilies;
841 BufferedReader br = new BufferedReader(
842 new FileReader(clanPath + "/NumberOfFamilies.txt"));
843 String line = br.readLine();
844 numberOfFamilies = Integer.parseInt(line);
846 if (numberOfFamilies == 1)
850 final String commandExportLocation = exportLocation + "/Clan" + clan;
851 createFolders(commandExportLocation);
852 for (int family = 0; family < numberOfFamilies; family++)
855 ArrayList<Integer> indexes = new ArrayList<>();
856 for (int i = 0; i < numberOfFamilies; i++)
864 int hmmIndex = getRandom(indexes);
865 String famPath = clanPath + "/Families/Fam" + family + ".sto";
866 String hmmPath = clanPath + "/HMMs/HMM" + hmmIndex + ".hmm";
867 String command = "/media/sf_Shared_Folder/hmmer/binaries/hmmalign --mapali "
868 + clanPath + "/Families/Fam" + hmmIndex + ".sto"
870 command += hmmPath + " ";
872 final int familyIndex = family;
873 final Process p = Runtime.getRuntime().exec(command);
875 new Thread(new Runnable()
880 BufferedReader input = new BufferedReader(
881 new InputStreamReader(p.getInputStream()));
886 PrintWriter writer = new PrintWriter(commandExportLocation
887 + "/Families/Fam" + familyIndex + ".sto");
888 String lastLine = "";
889 boolean dataFound = false;
890 while ((line = input.readLine()) != null)
892 if (line.contains("#=GR") && !dataFound)
894 writer.println(lastLine);
897 if (line.contains("#") || dataFound || " ".equals(line)
898 || "".equals(line) || "//".equals(line))
900 writer.println(line);
905 } catch (IOException e)
914 BufferedReader hmmExporter = new BufferedReader(
915 new FileReader(hmmPath));
917 exportFile(hmmExporter,
918 commandExportLocation + "/HMMs/HMM" + family + ".hmm",
921 alignmentsExported++;
925 PrintWriter writer = new PrintWriter(
926 commandExportLocation + "/NumberOfFamilies.txt");
927 writer.print(famCount);
933 public boolean atEnd(BufferedReader br) throws IOException
937 String line = br.readLine();
938 if ("".equals(line) || line == null)
946 public int getRandom(ArrayList<Integer> list)
948 if (!(list.size() > 0))
950 System.out.println("Error - size = " + list.size());
952 int index = generator.nextInt(list.size());
953 int value = list.get(index);
958 public void createFolders(String clanPath)
960 File clanFolder = new File(clanPath);
961 if (!clanFolder.exists())
966 File famFolder = new File(clanPath + "/Families");
967 File hmmFolder = new File(clanPath + "/HMMs");
968 if (!famFolder.exists())