3 import jalview.datamodel.AlignmentAnnotation;
4 import jalview.datamodel.HiddenMarkovModel;
5 import jalview.datamodel.SequenceI;
6 import jalview.io.DataSourceType;
7 import jalview.io.FileParse;
8 import jalview.io.HMMFile;
9 import jalview.io.StockholmFile;
10 import jalview.schemes.ResidueProperties;
12 import java.io.BufferedReader;
14 import java.io.FileNotFoundException;
15 import java.io.FileOutputStream;
16 import java.io.FileReader;
17 import java.io.IOException;
18 import java.io.InputStreamReader;
19 import java.io.PrintWriter;
20 import java.util.ArrayList;
21 import java.util.HashMap;
22 import java.util.List;
24 import java.util.Random;
25 import java.util.Scanner;
26 import java.util.Vector;
29 * Processes probability data. The file indexes used in this program represent
30 * the index of the location of a family or hmm in their respective files,
36 public class HMMProbabilityDistributionAnalyser
// NOTE(review): no assignment to 'reference' is visible in this extract;
// readStockholm tests annotation labels for "Reference", so it is presumably
// set there - TODO confirm.
38 AlignmentAnnotation reference = null;
// sequences of the alignment most recently loaded by readStockholm
40 Vector<SequenceI> sequences;
// model queried by countValidResidues and processData for its length,
// node-to-column mapping and match emission probabilities
42 HiddenMarkovModel hmm;
44 // contains the raw data produced
// (processData appends one inner list per call; exportData writes one
// column per inner list)
45 List<ArrayList<Double>> raw = new ArrayList<>();
47 // contains binned data
// (keys are values formatted with "%.1f" in processData; each hit adds
// SCALE / count to the stored total)
48 Map<String, Double> binned = new HashMap<>();
50 // location of the family file
51 String families = "/media/sf_Shared_Folder/PFAM/Family/SeedFamilies.seed";
53 // location of the file containing the family-clan links
54 final static String FAMILIESTOCLAN = "/media/sf_Shared_Folder/PFAM/Family/Clanlinks.dat";
56 // location of the HMM file
57 String hmms = "/media/sf_Shared_Folder/PFAM/HMMs/Pfam-A.hmm";
59 // suffix for raw file
60 final static String RAW = "/Raw.csv";
62 // suffix for binned file
63 final static String BINNED = "/Binned.csv";
65 // normalisation scale
66 final static double SCALE = 1;
68 // current position in file
// (number of entries skipped in both input files by run before reading)
69 int currentFilePosition = 0;
71 final static String NL = "\n";
// source of the random indexes handed out by getRandom
73 Random generator = new Random();
// copied from the keepRawData argument of run and runToEnd
78 boolean keepRaw = false;
81 * Sets the working directory.
// NOTE(review): the method body is not visible in this extract; presumably it
// stores 'path' in the currentFolder field used by the other methods - TODO
// confirm.
85 public void setFolder(String path)
91 * Moves a buffered reader forward in the file by a certain amount of entries.
92 * Each entry in the file is delimited by '//'.
95 * The index of the location in the file.
99 public void moveLocationBy(int index, BufferedReader br)
102 for (int i = 0; i < index; i++)
104 String line = br.readLine();
105 while (!"//".equals(line))
107 line = br.readLine();
115 * Analyses a specified number of families and then saves the data. Before
116 * analysing the data, the previous saved data will be imported and after
117 * analysing this, the data is exported back into the file. The file must be
118 * in flat file format.
121 * The number of families to read before saving.
122 * @throws IOException
// Resumes analysis from the saved position, processes up to 'increments'
// entries, then writes the new position and the accumulated data back to
// currentFolder.
// NOTE(review): several source lines of this method (braces, the 'try'
// opener, the declarations of filesRead and i, and the calls between
// readStockholm and currentFilePosition++) are not visible in this extract.
124 public void run(int increments, boolean keepRawData) throws IOException
126 keepRaw = keepRawData;
// load earlier results and the saved entry index
129 readPreviousData(currentFolder);
130 BufferedReader posReader = new BufferedReader(
131 new FileReader(currentFolder + "/CurrentPosition.txt"));
133 String line = posReader.readLine();
135 currentFilePosition = Integer.parseInt(line);
// any failure while loading earlier results is reported as "no previous data"
136 } catch (Exception e)
138 System.out.println("No previous data found");
143 BufferedReader inputSTO = new BufferedReader(new FileReader(families));
144 BufferedReader inputHMM = new BufferedReader(new FileReader(hmms));
// skip the same number of entries in both files so they stay in step
148 moveLocationBy(currentFilePosition, inputHMM);
149 moveLocationBy(currentFilePosition, inputSTO);
153 while (filesRead < increments)
156 readStockholm(inputSTO);
160 int count = countValidResidues();
164 currentFilePosition++;
165 System.out.println(i);
// persist the position reached so that the next call can resume from it
169 PrintWriter p = new PrintWriter(
170 new File(currentFolder + "/CurrentPosition.txt"));
171 p.print(currentFilePosition);
173 exportData(currentFolder);
// NOTE(review): no close() call for posReader, inputSTO, inputHMM or p is
// visible here - confirm they are closed in the lines missing from this
// extract.
180 * Analyses all families and then saves the data. Before analysing the data,
181 * the previous saved data will be imported and after analysing this, the data
182 * is exported back into the file. The file must be in flat file format.
185 * The number of families to read before saving.
186 * @throws IOException
// Processes every entry of the input files until atEnd reports the end of the
// Stockholm reader, then exports the data to currentFolder.
// NOTE(review): many source lines of this method are not visible in this
// extract (the definition of 'files', the conditions around the clan
// handling - presumably driven by forClans - and the per-entry calls between
// readStockholm and atEnd); the comments below describe only visible lines.
188 public void runToEnd(boolean keepRawData, boolean forClans)
191 keepRaw = keepRawData;
192 BufferedReader inputSTO = null;
193 BufferedReader inputHMM = null;
203 for (int clan = 0; clan < files; clan++)
205 System.out.println(clan);
206 String clanPath = "";
207 int numberOfFamilies = 0;
// clan folders follow the "<currentFolder>/Clan<n>" layout written by
// sortIntoClans
210 clanPath = currentFolder + "/Clan" + clan;
211 if (!new File(clanPath).exists())
215 BufferedReader famCountReader = new BufferedReader(
216 new FileReader(clanPath + "/NumberOfFamilies.txt"));
217 numberOfFamilies = Integer.parseInt(famCountReader.readLine());
221 numberOfFamilies = 1;
224 for (int fam = 0; fam < numberOfFamilies; fam++)
// NOTE(review): these assignments overwrite the 'families' and 'hmms' fields
228 families = clanPath + "/Families/Fam" + fam + ".sto";
229 hmms = clanPath + "/HMMs/HMM" + fam + ".hmm";
232 inputSTO = new BufferedReader(new FileReader(families));
233 inputHMM = new BufferedReader(new FileReader(hmms));
237 boolean endReached = atEnd(inputSTO);
240 readStockholm(inputSTO);
243 int count = countValidResidues();
246 System.out.println(filesRead);
247 endReached = atEnd(inputSTO);
// NOTE(review): the handler body is not visible - confirm the exception is
// not silently discarded
251 } catch (Exception e)
256 exportData(currentFolder);
263 * Reads the previous data from both files
266 * @throws IOException
// Called by run with currentFolder before any new entries are analysed.
// NOTE(review): the method body is not visible in this extract; presumably it
// delegates to readBinned and readRaw - TODO confirm.
268 public void readPreviousData(String source) throws IOException
278 * Reads the previous data from the binned file.
281 * @throws IOException
283 public void readBinned(String source) throws IOException
285 BufferedReader input = new BufferedReader(
286 new FileReader(source + BINNED));
287 String line = input.readLine();
288 binned = new HashMap<>();
289 while (!("".equals(line) || line == null))
291 Scanner scanner = new Scanner(line);
292 scanner.useDelimiter(",");
293 String key = scanner.next();
294 String value = scanner.next();
295 binned.put(key, Double.valueOf(value));
297 line = input.readLine();
304 * Reads the previous data from the raw file.
307 * @throws IOException
// Rebuilds 'raw' from the raw CSV file: the first line fixes the number of
// columns, each following line supplies one value per column.
// NOTE(review): several source lines are not visible in this extract (the
// lines after the first readLine, the declarations of 'i' and 'value', the
// 'else' keyword and the increment of 'i').
309 public void readRaw(String source) throws IOException
311 BufferedReader input = new BufferedReader(new FileReader(source + RAW));
312 String line = input.readLine();
// header line: one comma-separated token per column; only the count is used
318 Scanner numberScanner = new Scanner(line);
319 numberScanner.useDelimiter(",");
320 raw = new ArrayList<>();
321 while (numberScanner.hasNext())
323 numberScanner.next();
324 raw.add(new ArrayList<Double>());
326 numberScanner.close();
// data lines: read until an empty line or end of file
328 line = input.readLine();
329 while (!("".equals(line) || line == null))
331 Scanner scanner = new Scanner(line);
332 scanner.useDelimiter(",");
335 while (scanner.hasNext())
338 value = scanner.next();
// "EMPTY" is the placeholder exportData writes for columns shorter than the
// current row
339 if (!value.equals("EMPTY"))
341 raw.get(i).add(Double.parseDouble(value));
345 raw.get(i).add(null);
351 line = input.readLine();
// NOTE(review): no close() of 'input' or 'scanner' is visible here - confirm.
358 * Counts the number of valid residues in the sequence.
// Visits, for every sequence, the alignment column of each HMM node
// (nodes 1..hmm.getLength()) and tests whether the character there is a key
// of ResidueProperties.aminoBackgroundFrequencies.
// NOTE(review): the counter declaration, its increment and the return
// statement are not visible in this extract; presumably the method returns
// the number of characters that pass the test - TODO confirm.
362 public int countValidResidues()
366 for (int width = 0; width < sequences.size(); width++)
368 for (int length = 1; length < hmm.getLength() + 1; length++)
372 alignPos = hmm.getNodeAlignmentColumn(length);
374 symbol = sequences.get(width).getCharAt(alignPos);
375 if (ResidueProperties.aminoBackgroundFrequencies
376 .containsKey(symbol))
387 * Processes data, and stores it in both a raw and binned format.
// For every character that countValidResidues would accept, computes
// log(match emission probability / background frequency) and accumulates it
// into 'binned' (and into a new 'raw' column).
// NOTE(review): several source lines are not visible in this extract (local
// declarations, the conditions guarding the raw-data lines - presumably
// keepRaw - and the body of the "-0.0" branch).
391 public void processData(int count)
// start a new column for this family
396 raw.add(new ArrayList<Double>());
397 rawPos = raw.size() - 1;
400 for (int width = 0; width < sequences.size(); width++)
402 for (int length = 1; length < hmm.getLength() + 1; length++)
406 alignPos = hmm.getNodeAlignmentColumn(length);
408 symbol = sequences.get(width).getCharAt(alignPos);
409 if (ResidueProperties.aminoBackgroundFrequencies
410 .containsKey(symbol))
// NOTE(review): the probability is looked up with the alignment column, not
// the node index 'length' - confirm this matches the HiddenMarkovModel API.
415 prob = hmm.getMatchEmissionProbability(alignPos, symbol);
416 bfreq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
417 if (prob == 0 || bfreq == 0)
419 System.out.println("error");
421 llr = Math.log(prob / bfreq);
424 raw.get(rawPos).add(llr);
// the bin key is the value rounded to one decimal place
// NOTE(review): String.format uses the default locale; with a decimal-comma
// locale the parseDouble call below would fail - confirm.
428 output = String.format("%.1f", llr);
429 total += Double.parseDouble(output);
430 if ("-0.0".equals(output))
// each hit contributes SCALE / count, so one family's contributions sum to
// SCALE when 'count' is the number of hits
434 if (binned.containsKey(output))
436 double prev = binned.get(output);
437 prev += (SCALE / count);
438 binned.put(output, prev);
443 binned.put(output, SCALE / count);
// prints the mean of the rounded values
448 System.out.println(total / count);
453 * Reads in the sequence data from a Stockholm file.
456 * @throws IOException
// Parses Stockholm data from the given reader and stores the parsed
// sequences in 'sequences'.
// NOTE(review): the body of the "Reference" test is not visible in this
// extract; presumably it stores the matching annotation in 'reference' -
// TODO confirm.
458 public void readStockholm(BufferedReader inputSTO) throws IOException
460 FileParse parserSTO = new FileParse(inputSTO, "", DataSourceType.FILE);
461 StockholmFile file = new StockholmFile(parserSTO);
462 Vector<AlignmentAnnotation> annots = file.getAnnotations();
// look for an annotation whose label contains "Reference"
464 for (AlignmentAnnotation annot : annots)
466 if (annot.label.contains("Reference"))
471 sequences = file.getSeqs();
475 * Reads in the HMM data from a HMMer file.
478 * @throws IOException
// Wraps the given reader in a FileParse and hands it to an HMMFile parser.
// NOTE(review): the rest of the body is not visible in this extract;
// presumably the parsed model is stored in the 'hmm' field - TODO confirm.
480 public void readHMM(BufferedReader inputHMM) throws IOException
482 FileParse parserHMM = new FileParse(inputHMM, "", DataSourceType.FILE);
483 HMMFile file = new HMMFile(parserHMM);
490 * Exports both the binned and raw data into separate files.
493 * @throws FileNotFoundException
// Writes 'binned' as "key,value" lines to <location>/Binned.csv and 'raw' as
// a comma-separated table to <location>/Raw.csv.
// NOTE(review): the row loop header, the handling of rowIsEmpty and any
// close()/flush of the writers are not visible in this extract; the raw part
// may also be conditional (e.g. on keepRaw) - TODO confirm.
495 public void exportData(String location) throws FileNotFoundException
497 PrintWriter writerBin = new PrintWriter(new File(location + BINNED));
498 for (Map.Entry<String, Double> entry : binned.entrySet())
500 writerBin.println(entry.getKey() + "," + entry.getValue());
506 PrintWriter writerRaw = new PrintWriter(new File(location + RAW));
// header row: "Fam 1,Fam 2,..." with one label per column of 'raw'
508 StringBuilder identifier = new StringBuilder();
510 for (int i = 1; i < raw.size() + 1; i++)
512 identifier.append("Fam " + i + ",");
515 writerRaw.println(identifier);
517 boolean rowIsEmpty = false;
// one output line per row; columns shorter than the row are padded with
// "EMPTY", the marker readRaw turns back into null
522 StringBuilder string = new StringBuilder();
523 for (int column = 0; column < raw.size(); column++)
525 if (raw.get(column).size() <= row)
527 string.append("EMPTY,");
531 string.append(raw.get(column).get(row) + ",");
536 writerRaw.println(string);
545 * Prints the specified family on the console.
548 * @throws IOException
550 public void printFam(int index) throws IOException
552 BufferedReader br = new BufferedReader(new FileReader(families));
554 moveLocationBy(index, br);
556 String line = br.readLine();
558 while (!"//".equals(line))
560 System.out.println(line);
561 line = br.readLine();
563 System.out.println(line);
569 * Prints the specified HMM on the console.
572 * @throws IOException
574 public void printHMM(int index) throws IOException
576 BufferedReader br = new BufferedReader(new FileReader(hmms));
578 moveLocationBy(index, br);
580 String line = br.readLine();
582 while (!"//".equals(line))
584 System.out.println(line);
585 line = br.readLine();
587 System.out.println(line);
593 * Prints the specified family to a .sto file.
596 * @throws IOException
598 public void exportFam(int index, String location) throws IOException
600 BufferedReader br = new BufferedReader(new FileReader(families));
602 moveLocationBy(index, br);
604 String line = br.readLine();
605 PrintWriter writer = new PrintWriter(
606 new FileOutputStream(new File(location), true));
607 while (!"//".equals(line))
609 writer.println(line);
610 line = br.readLine();
612 writer.println(line);
618 public void exportFile(BufferedReader br, String location, boolean append)
621 String line = br.readLine();
622 PrintWriter writer = new PrintWriter(
623 new FileOutputStream(location, append));
624 while (!"//".equals(line))
626 writer.println(line);
627 line = br.readLine();
629 writer.println(line);
635 public String getHMMName(int index) throws IOException
639 BufferedReader nameFinder = new BufferedReader(new FileReader(hmms));
641 moveLocationBy(index, nameFinder);
643 nameFinder.readLine();
645 Scanner scanner = new Scanner(nameFinder.readLine());
646 name = scanner.next();
647 name = scanner.next();
652 public String getFamilyName(int index) throws IOException
656 BufferedReader nameFinder = new BufferedReader(
657 new FileReader(families));
659 moveLocationBy(index, nameFinder);
661 nameFinder.readLine();
663 Scanner scanner = new Scanner(nameFinder.readLine());
664 name = scanner.next();
665 name = scanner.next();
666 name = scanner.next();
672 * Prints the specified family to a .hmm file.
675 * @throws IOException
677 public void exportHMM(int index, String location) throws IOException
681 BufferedReader br = new BufferedReader(new FileReader(hmms));
683 moveLocationBy(index, br);
685 String line = br.readLine();
687 PrintWriter writer = new PrintWriter(
688 new FileOutputStream(new File(location), true));
689 while (!"//".equals(line))
691 writer.println(line);
692 line = br.readLine();
694 writer.println(line);
701 * Clears all raw, binned and current position data in the current directory.
703 * @throws FileNotFoundException
// Opens the position, raw and binned files in currentFolder for writing;
// constructing a PrintWriter on a file name truncates an existing file.
// NOTE(review): the lines between these statements are not visible in this
// extract, so whether anything is written (e.g. a starting position) and
// whether the writers are closed cannot be confirmed here.
705 public void clear() throws FileNotFoundException
707 PrintWriter pos = new PrintWriter(
708 currentFolder + "/CurrentPosition.txt");
// the local names 'raw' and 'bin' are writers; 'raw' shadows the field of
// the same name inside this method
711 PrintWriter raw = new PrintWriter(currentFolder + RAW);
713 PrintWriter bin = new PrintWriter(currentFolder + BINNED);
// Walks the clan-link file entry by entry, in step with the family and HMM
// files, and copies each family/HMM pair that carries a "#=GF CL" line into
// "<directory>/Clan<n>/Families" and "<directory>/Clan<n>/HMMs"; finally
// writes a NumberOfFamilies.txt file per clan.
// NOTE(review): many source lines are not visible in this extract
// (declarations of line, clanName and clanCount, the updates of inClan,
// clanCount and familyCounts, the tails of the exportFile calls, and the
// write/close of the final writer); comments describe visible lines only.
720 public void sortIntoClans(String directory) throws IOException
722 BufferedReader clanFinder = new BufferedReader(new FileReader(FAMILIESTOCLAN));
723 BufferedReader familyReader = new BufferedReader(
724 new FileReader(families));
725 BufferedReader hmmReader = new BufferedReader(new FileReader(hmms));
727 // moveLocationBy(7000, familyReader);
728 // moveLocationBy(7000, clanFinder);
729 // moveLocationBy(7000, hmmReader);
// clan name -> clan number, and clan number -> families exported so far
730 HashMap<String, Integer> clanIndexes = new HashMap<>();
731 ArrayList<Integer> familyCounts = new ArrayList<>();
735 line = clanFinder.readLine();
// outer loop: one iteration per entry of the clan-link file
737 while (!"".equals(line) && !" ".equals(line) && line != null)
740 boolean inClan = false;
// inner loop: scan the lines of one entry up to its "//" terminator
// NOTE(review): 'line' is dereferenced here without a null check, so a file
// that ends inside an entry would raise a NullPointerException - confirm.
741 while (!(line.indexOf("//") > -1))
744 if (line.indexOf("#=GF CL") > -1)
747 System.out.println(families);
749 Scanner scanner = new Scanner(line);
752 clanName = scanner.next();
// first sighting of this clan: give it the next free number
755 if (!clanIndexes.containsKey(clanName))
757 clanIndexes.put(clanName, clanCount);
763 Integer clanI = clanIndexes.get(clanName);
764 String clanPath = directory + "/Clan" + clanI.toString();
765 createFolders(clanPath);
767 int index = clanIndexes.get(clanName);
// export the current family and HMM under the clan's running family number
768 exportFile(familyReader,
769 clanPath + "/Families/Fam" + familyCounts.get(index)
772 exportFile(hmmReader,
773 clanPath + "/HMMs/HMM" + familyCounts.get(index) + ".hmm",
776 int count = familyCounts.get(index);
778 familyCounts.set(index, count);
780 line = clanFinder.readLine();
// skip one entry in both data files (presumably when the entry had no clan -
// the guarding condition is not visible)
785 moveLocationBy(1, familyReader);
786 moveLocationBy(1, hmmReader);
789 // System.out.println(filePos + " files read.");
790 line = clanFinder.readLine();
// record how many families each clan folder received
795 for (int clan = 0; clan < clanCount; clan++)
797 PrintWriter writer = new PrintWriter(
798 directory + "/Clan" + clan + "/NumberOfFamilies.txt");
799 int count = familyCounts.get(clan);
// NOTE(review): the method body is not visible in this extract; presumably
// it returns the 'families' field - TODO confirm.
806 public String getFamilies()
811 public void setFamilies(String families)
813 this.families = currentFolder + families;
// NOTE(review): the method body is not visible in this extract; presumably
// it returns the 'hmms' field - TODO confirm.
816 public String getHmms()
821 public void setHmms(String hmms)
823 this.hmms = currentFolder + hmms;
// For each clan folder under clansLocation, runs the external hmmalign
// program for each family against an HMM chosen via getRandom, and writes
// the filtered output plus a copy of the HMM under exportLocation.
// NOTE(review): many source lines are not visible in this extract (how
// 'indexes' is filled, the rest of the command string, the run() method of
// the anonymous Runnable, thread start/join, famCount, and all close()
// calls); comments describe visible lines only.
826 public void alignWithinClan(String exportLocation, String clansLocation)
827 throws IOException, InterruptedException
829 int alignmentsExported = 0;
// NOTE(review): the clan count 604 is hard-coded
830 for (int clan = 0; clan < 604; clan++)
832 System.out.println(clan);
834 String clanPath = clansLocation + "/Clan" + clan;
835 int numberOfFamilies;
836 BufferedReader br = new BufferedReader(
837 new FileReader(clanPath + "/NumberOfFamilies.txt"));
838 String line = br.readLine();
839 numberOfFamilies = Integer.parseInt(line);
// single-family clans get special handling (body not visible; presumably
// skipped)
841 if (numberOfFamilies == 1)
845 final String commandExportLocation = exportLocation + "/Clan" + clan;
846 createFolders(commandExportLocation);
847 for (int family = 0; family < numberOfFamilies; family++)
// candidate HMM indexes for this family (the loop body is not visible)
850 ArrayList<Integer> indexes = new ArrayList<>();
851 for (int i = 0; i < numberOfFamilies; i++)
859 int hmmIndex = getRandom(indexes);
860 String famPath = clanPath + "/Families/Fam" + family + ".sto";
861 String hmmPath = clanPath + "/HMMs/HMM" + hmmIndex + ".hmm";
// NOTE(review): the command is a single concatenated string passed to
// Runtime.exec, so paths containing spaces would be split into separate
// arguments - confirm the folder names never contain whitespace.
862 String command = "/media/sf_Shared_Folder/hmmer/binaries/hmmalign --mapali "
863 + clanPath + "/Families/Fam" + hmmIndex + ".sto"
865 command += hmmPath + " ";
867 final int familyIndex = family;
868 final Process p = Runtime.getRuntime().exec(command);
// the process output is consumed on a separate thread
870 new Thread(new Runnable()
875 BufferedReader input = new BufferedReader(
876 new InputStreamReader(p.getInputStream()));
881 PrintWriter writer = new PrintWriter(commandExportLocation
882 + "/Families/Fam" + familyIndex + ".sto");
883 String lastLine = "";
884 boolean dataFound = false;
885 while ((line = input.readLine()) != null)
// at the first "#=GR" line, emit the remembered previous line first
887 if (line.contains("#=GR") && !dataFound)
889 writer.println(lastLine);
// keep annotation lines, blank lines and the terminator, plus everything
// once dataFound is set
892 if (line.contains("#") || dataFound || " ".equals(line)
893 || "".equals(line) || "//".equals(line))
895 writer.println(line);
900 } catch (IOException e)
// copy the HMM that was used alongside the new alignment
909 BufferedReader hmmExporter = new BufferedReader(
910 new FileReader(hmmPath));
912 exportFile(hmmExporter,
913 commandExportLocation + "/HMMs/HMM" + family + ".hmm",
916 alignmentsExported++;
920 PrintWriter writer = new PrintWriter(
921 commandExportLocation + "/NumberOfFamilies.txt");
922 writer.print(famCount);
// Reads one line from the reader and tests whether it is empty or null,
// i.e. whether no further entry follows.
// NOTE(review): the lines before the readLine() call and after the test are
// not visible in this extract; runToEnd calls readStockholm on the same
// reader afterwards, so the consumed line is presumably restored (e.g. via
// mark/reset) - TODO confirm.
928 public boolean atEnd(BufferedReader br) throws IOException
932 String line = br.readLine();
933 if ("".equals(line) || line == null)
// Picks a pseudo-random position in the list using 'generator' and reads the
// element stored there.
// NOTE(review): the lines after the element is read are not visible in this
// extract (the return statement, and possibly removal of the chosen element)
// - TODO confirm.
941 public int getRandom(ArrayList<Integer> list)
// NOTE(review): an empty list is only reported here; unless an unseen line
// returns early, nextInt(0) below throws IllegalArgumentException.
943 if (!(list.size() > 0))
945 System.out.println("Error - size = " + list.size());
947 int index = generator.nextInt(list.size());
948 int value = list.get(index);
953 public void createFolders(String clanPath)
955 File clanFolder = new File(clanPath);
956 if (!clanFolder.exists())
961 File famFolder = new File(clanPath + "/Families");
962 File hmmFolder = new File(clanPath + "/HMMs");
963 if (!famFolder.exists())