package jalview.io;

import jalview.datamodel.HMMNode;
import jalview.datamodel.HiddenMarkovModel;
import jalview.datamodel.SequenceI;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;

/**
 * Adds capability to read in and write out HMMER3 files. Currently only
 * supports HMMER3/f.
 *
 * Emission and transition values are held in memory as probabilities; the
 * file stores them as negative natural logarithms, so values are converted
 * with e^-x on reading and -ln(p) on writing.
 *
 * @author TZVanaalten
 *
 */
public class HMMFile extends AlignFile
        implements AlignmentFileReaderI, AlignmentFileWriterI
{
  // HMM to store file data
  private HiddenMarkovModel hmm = new HiddenMarkovModel();

  // number of possible transitions
  private static final int NUMBER_OF_TRANSITIONS = 7;

  private static final String NEW_LINE = "\n";

  // number of symbols in the alphabet used in the hidden Markov model
  int numberOfSymbols;

  private static final String SPACE = " ";

  private static final String COMPO = "COMPO";

  private static final String EMPTY = "";

  // This is a line that needs to be added to each HMMER3 file. It is purely
  // for readability.
  private static final String TRANSITIONTYPELINE = "m->m m->i m->d i->m i->i d->m d->d";

  /**
   * Constructor for HMMFile, parses immediately
   *
   * @param source
   *          the file to read
   * @throws IOException
   *           if the source cannot be read or is truncated
   */
  public HMMFile(FileParse source) throws IOException
  {
    super(false, source);
    parse();
  }

  /**
   * Default constructor, do not use!
   */
  public HMMFile()
  {
  }

  /**
   * Constructor for HMMFile used for exporting.
   *
   * @param markov
   *          the model to be written out
   */
  public HMMFile(HiddenMarkovModel markov)
  {
    hmm = markov;
  }

  /**
   * Returns the HMM produced by reading in a HMMER3 file.
   *
   * @return the model held by this object
   */
  public HiddenMarkovModel getHMM()
  {
    return hmm;
  }

  /**
   * Sets the HMM used in this file.
   *
   * @param model
   *          the model to hold
   */
  public void setHMM(HiddenMarkovModel model)
  {
    this.hmm = model;
  }

  /**
   * Gets the name of the hidden Markov model.
   *
   * @return the name reported by the model
   */
  public String getName()
  {
    return hmm.getName();
  }

  /**
   * Reads the data from HMM file into the HMM field on this object, using the
   * reader inherited from the superclass.
   *
   * @throws IOException
   *           if reading fails or the model section is truncated
   */
  @Override
  public void parse() throws IOException
  {
    parseFileProperties(dataIn);
    parseModel(dataIn);
  }

  /**
   * Reads the data from HMM file into the HMM field on this object.
   *
   * @param br
   *          the reader positioned at the first line of the file
   * @throws IOException
   *           if reading fails or the model section is truncated
   */
  public void parse(BufferedReader br) throws IOException
  {
    parseFileProperties(br);
    parseModel(br);
  }

  /**
   * Imports the file properties from a HMMER3 file. The first line is stored
   * as the file header; each following line is stored as a key/value property
   * until the line starting with "HMM" is reached. The line immediately after
   * the "HMM" line (the transition column header written by
   * {@link #appendModel(PrintWriter)}) is read and discarded, leaving the
   * reader positioned at the first line of the model section.
   *
   * @param input
   *          The buffered reader used to read in the file.
   * @throws IOException
   *           if reading fails
   */
  void parseFileProperties(BufferedReader input) throws IOException
  {
    hmm.setFileHeader(input.readLine());

    String line = input.readLine();
    boolean reachedModel = false;
    while (line != null && !reachedModel)
    {
      reachedModel = parsePropertyLine(line);
      // after the "HMM" line this read consumes the transition header line
      line = input.readLine();
    }
  }

  /**
   * Processes one line of the file property section.
   *
   * @param line
   *          the line to process (not null)
   * @return true if the line was the "HMM" line that ends the property
   *         section
   */
  private boolean parsePropertyLine(String line)
  {
    try (Scanner parser = new Scanner(line))
    {
      if (!parser.hasNext())
      {
        // blank line: nothing to store
        return false;
      }

      String next = parser.next();
      if ("HMM".equals(next))
      {
        // indicates start of HMM data (end of file properties); the rest of
        // the line lists the alphabet symbols
        hmm.fillSymbols(parser);
        numberOfSymbols = hmm.getNumberOfSymbols();
        return true;
      }

      if ("STATS".equals(next))
      {
        // skip the second token ("LOCAL" in the lines written by
        // appendFileProperties); the third token is used as the key
        parser.next();
        String key = parser.next();
        String value = parser.next() + SPACE + SPACE + parser.next();
        hmm.addFileProperty(key, value);
        return false;
      }

      // ordinary property: first token is the key, the remaining tokens are
      // re-joined with single spaces to form the value
      StringBuilder value = new StringBuilder(parser.next());
      while (parser.hasNext())
      {
        value.append(SPACE).append(parser.next());
      }
      hmm.addFileProperty(next, value.toString());
      return false;
    }
  }

  /**
   * Parses the model data from the HMMER3 file. Each node is read as a match
   * emission line, an insert emission line and a state transition line, until
   * a line consisting of "//" or the end of the input is reached.
   *
   * If the first line of the section does not start with COMPO, it is taken
   * to be the insert emission line of node 0 and no match emissions are set
   * for that node.
   *
   * @param input
   *          The buffered reader used to read the file.
   * @throws IOException
   *           if reading fails, or the input ends part way through a node
   */
  void parseModel(BufferedReader input) throws IOException
  {
    String line = input.readLine();
    int node = 0;
    while (line != null && !"//".equals(line.trim()))
    {
      hmm.getNodes().add(new HMMNode());

      boolean hasMatchLine;
      try (Scanner matchReader = new Scanner(line))
      {
        String next = matchReader.next();
        hasMatchLine = COMPO.equals(next) || node > 0;
        if (hasMatchLine)
        {
          // stores match emission line in list
          List<Double> matches = fillList(matchReader, numberOfSymbols);
          hmm.getNodes().get(node).setMatchEmissions(matches);
          if (node > 0)
          {
            parseAnnotations(matchReader, node);
          }
        }
      }

      // stores insert emission line in list; when node 0 has no COMPO line
      // the line already read is itself the insert emission line
      String insertLine = hasMatchLine
              ? readRequiredLine(input, "insert emissions")
              : line;
      try (Scanner insertReader = new Scanner(insertLine))
      {
        List<Double> inserts = fillList(insertReader, numberOfSymbols);
        hmm.getNodes().get(node).setInsertEmissions(inserts);
      }

      // stores state transition line in list
      String transitionLine = readRequiredLine(input, "state transitions");
      try (Scanner transitionReader = new Scanner(transitionLine))
      {
        List<Double> transitions = fillList(transitionReader,
                NUMBER_OF_TRANSITIONS);
        hmm.getNodes().get(node).setStateTransitions(transitions);
      }

      line = input.readLine();
      node++;
    }
  }

  /**
   * Reads the next line, failing if the input has ended.
   *
   * @param input
   *          the reader to read from
   * @param expected
   *          description of the expected line, used in the error message
   * @return the line read (never null)
   * @throws IOException
   *           if reading fails or the end of the input has been reached
   */
  private static String readRequiredLine(BufferedReader input,
          String expected) throws IOException
  {
    String line = input.readLine();
    if (line == null)
    {
      throw new IOException(
              "Unexpected end of HMM file while reading " + expected);
    }
    return line;
  }

  /**
   * Parses the annotations on the match emission line. The first annotation
   * is read as the alignment column when the model reports that the map is
   * active and is skipped otherwise; the consensus residue, reference
   * annotation, mask value and consensus structure follow, each read only if
   * a further token is present.
   *
   * @param scanner
   *          The scanner which is processing match emission line.
   * @param index
   *          The index of node which is being scanned.
   */
  void parseAnnotations(Scanner scanner, int index)
  {
    HMMNode current = hmm.getNodes().get(index);

    if (hmm.mapIsActive())
    {
      int column = scanner.nextInt();
      current.setAlignmentColumn(column);
      hmm.getNodeLookup().put(column, index);
    }
    else
    {
      // map annotation is not in use: discard the token
      scanner.next();
    }

    if (scanner.hasNext())
    {
      current.setConsensusResidue(charValue(scanner.next()));
    }
    if (scanner.hasNext())
    {
      current.setReferenceAnnotation(charValue(scanner.next()));
    }
    if (scanner.hasNext())
    {
      current.setMaskValue(charValue(scanner.next()));
    }
    if (scanner.hasNext())
    {
      current.setConsensusStructure(charValue(scanner.next()));
    }
  }

  /**
   * Fills a list of doubles based on an input line. Each token x is stored as
   * e^-x. A token containing "*" is stored as
   * {@link Double#NEGATIVE_INFINITY}, which acts as a marker that is written
   * back out as "*".
   *
   * @param input
   *          The scanner for the line containing the data to be transferred
   *          to the list.
   * @param numberOfElements
   *          The number of elements in the list to be filled.
   * @return filled list Returns the list of doubles.
   */
  static List<Double> fillList(Scanner input, int numberOfElements)
  {
    List<Double> list = new ArrayList<>();
    for (int i = 0; i < numberOfElements; i++)
    {
      String next = input.next();
      if (next.contains("*"))
      {
        // state transitions to or from delete states occasionally have
        // values of -infinity. These values are represented by an * in the
        // .hmm file.
        list.add(Double.NEGATIVE_INFINITY);
      }
      else
      {
        double prob = Double.valueOf(next);
        prob = Math.pow(Math.E, -prob);
        list.add(prob);
      }
    }
    return list;
  }

  /**
   * Writes a HMM to a file. The writer is always closed, and a failure
   * reported by the writer is raised as an exception rather than ignored.
   *
   * @param exportLocation
   *          Name of the file to write to.
   * @throws IOException
   *           if the file cannot be opened or an error occurs while writing
   **/
  public void exportFile(String exportLocation) throws IOException
  {
    try (PrintWriter writer = new PrintWriter(exportLocation))
    {
      appendFileProperties(writer);
      appendModel(writer);
      writer.println("//");
      // PrintWriter swallows IOExceptions, so ask it explicitly
      if (writer.checkError())
      {
        throw new IOException(
                "Failed to write HMM file to " + exportLocation);
      }
    }
  }

  /**
   * Returns a single output line in which every entry is right-aligned in a
   * fixed-width column.
   *
   * @param initialColumnSeparation
   *          The width of the column holding the first entry.
   * @param columnSeparation
   *          The width of the column holding each subsequent entry.
   * @param data
   *          The list of data to be added to the String.
   * @return the formatted line; empty if the list is empty
   */
  String addData(int initialColumnSeparation, int columnSeparation,
          List<String> data)
  {
    StringBuilder line = new StringBuilder();
    boolean first = true;
    for (String value : data)
    {
      int width = first ? initialColumnSeparation : columnSeparation;
      line.append(String.format("%" + width + "s", value));
      first = false;
    }
    return line.toString();
  }

  /**
   * Converts list of characters into a list of Strings.
   *
   * @param list
   *          the characters to convert
   * @return Returns the list of Strings.
   */
  List<String> charListToStringList(List<Character> list)
  {
    List<String> strList = new ArrayList<>();
    for (char value : list)
    {
      strList.add(Character.toString(value));
    }
    return strList;
  }

  /**
   * Converts a list of doubles into a list of Strings, rounded to the nearest
   * 5th decimal place. Positive finite values are formatted with a '.'
   * decimal separator regardless of the default locale, zero is written as
   * "0.00000", and negative, infinite or NaN values are written as "*".
   *
   * @param list
   *          the values to convert
   * @return the formatted values, in the same order
   */
  List<String> doubleListToStringList(List<Double> list)
  {
    List<String> strList = new ArrayList<>();
    for (double value : list)
    {
      String strValue;
      if (Double.isNaN(value) || Double.isInfinite(value) || value < 0)
      {
        strValue = "*";
      }
      else if (value == 0)
      {
        strValue = "0.00000";
      }
      else
      {
        strValue = String.format(Locale.ROOT, "%.5f", value);
      }
      strList.add(strValue);
    }
    return strList;
  }

  /**
   * Converts a primitive array of Strings to a list of Strings.
   *
   * @param array
   *          the values to copy
   * @return a new list holding the array elements in order
   */
  List<String> stringArrayToStringList(String[] array)
  {
    List<String> list = new ArrayList<>();
    for (String value : array)
    {
      list.add(value);
    }
    return list;
  }

  /**
   * Writes the hidden Markov model data to the given writer: the "HMM" symbol
   * line, the transition header line, and then a match emission line, an
   * insert emission line and a state transition line for node 0 (labelled
   * COMPO) and every node up to the model length.
   *
   * @param writer
   *          The writer receiving the output.
   */
  void appendModel(PrintWriter writer)
  {
    List<Character> charSymbols = hmm.getSymbols();
    writer.println("HMM" + addData(11, 9, charListToStringList(charSymbols)));
    writer.println(TRANSITIONTYPELINE);

    // mask value and consensus structure are only written for HMMER3/f files
    String header = hmm.getFileHeader();
    boolean writeMaskAndStructure = header != null
            && header.contains("HMMER3/f");

    int length = hmm.getLength();
    for (int node = 0; node <= length; node++)
    {
      String label = node == 0 ? COMPO : String.valueOf(node);
      String matchLine = String.format("%7s", label);
      matchLine += addData(10, 9,
              toLogSpaceStrings(hmm.getNode(node).getMatchEmissions()));

      if (node != 0)
      {
        // NOTE(review): the column is written as stored value + 1, while
        // parseAnnotations stores the value read unchanged - confirm this
        // against HiddenMarkovModel.getNodeAlignmentColumn
        matchLine += SPACE + (hmm.getNodeAlignmentColumn(node) + 1);
        matchLine += SPACE + hmm.getConsensusResidue(node);
        matchLine += SPACE + hmm.getReferenceAnnotation(node);
        if (writeMaskAndStructure)
        {
          matchLine += SPACE + hmm.getMaskedValue(node);
          matchLine += SPACE + hmm.getConsensusStructure(node);
        }
      }
      writer.println(matchLine);

      writer.println(addData(17, 9,
              toLogSpaceStrings(hmm.getNode(node).getInsertEmissions())));

      writer.println(addData(17, 9,
              toLogSpaceStrings(hmm.getNode(node).getStateTransitions())));
    }
  }

  /**
   * Converts a list of probabilities to the strings written to the file.
   *
   * @param probabilities
   *          the probabilities to convert
   * @return the negative log values formatted for output
   */
  private List<String> toLogSpaceStrings(List<Double> probabilities)
  {
    return doubleListToStringList(convertListToLogSpace(probabilities));
  }

  /**
   * Writes the hidden Markov model file properties to the given writer. The
   * header, NAME, LENG, ALPH and the RF/MM/CONS/CS/MAP flags are always
   * written; the other properties are written only when the model returns a
   * non-null value for them. The three STATS lines are written together when
   * the MSV value is non-null.
   *
   * @param writer
   *          The writer receiving the output.
   */
  void appendFileProperties(PrintWriter writer)
  {
    writer.println(hmm.getFileHeader());

    writeProperty(writer, "NAME", hmm.getName());
    writeOptionalProperty(writer, "ACC", hmm.getAccessionNumber());
    writeOptionalProperty(writer, "DESC", hmm.getDescription());
    writeProperty(writer, "LENG", hmm.getLength());
    writeOptionalProperty(writer, "MAXL", hmm.getMaxInstanceLength());
    writeProperty(writer, "ALPH", hmm.getAlphabetType());

    writeFlag(writer, "RF", hmm.referenceAnnotationIsActive());
    writeFlag(writer, "MM", hmm.maskValueIsActive());
    writeFlag(writer, "CONS", hmm.consensusResidueIsActive());
    writeFlag(writer, "CS", hmm.consensusStructureIsActive());
    writeFlag(writer, "MAP", hmm.mapIsActive());

    writeOptionalProperty(writer, "DATE", hmm.getDate());
    writeOptionalProperty(writer, "NSEQ", hmm.getNumberOfSequences());
    writeOptionalProperty(writer, "EFFN",
            hmm.getEffectiveNumberOfSequences());
    writeOptionalProperty(writer, "CKSUM", hmm.getCheckSum());
    writeOptionalProperty(writer, "GA", hmm.getGatheringThreshold());
    writeOptionalProperty(writer, "TC", hmm.getTrustedCutoff());
    writeOptionalProperty(writer, "NC", hmm.getNoiseCutoff());

    if (hmm.getMSV() != null)
    {
      writer.println(String.format("%-19s %18s", "STATS LOCAL MSV",
              hmm.getMSV()));
      writer.println(String.format("%-19s %18s", "STATS LOCAL VITERBI",
              hmm.getViterbi()));
      writer.println(String.format("%-19s %18s", "STATS LOCAL FORWARD",
              hmm.getForward()));
    }
  }

  /**
   * Writes one "KEY value" property line.
   *
   * @param writer
   *          the writer receiving the output
   * @param key
   *          the property key
   * @param value
   *          the property value; null is written as the text "null"
   */
  private static void writeProperty(PrintWriter writer, String key,
          Object value)
  {
    writer.println(String.format("%-5s %1s", key, value));
  }

  /**
   * Writes one "KEY value" property line unless the value is null.
   *
   * @param writer
   *          the writer receiving the output
   * @param key
   *          the property key
   * @param value
   *          the property value, or null to write nothing
   */
  private static void writeOptionalProperty(PrintWriter writer, String key,
          Object value)
  {
    if (value != null)
    {
      writeProperty(writer, key, value);
    }
  }

  /**
   * Writes one annotation flag line, using the model's own text for the
   * boolean.
   *
   * @param writer
   *          the writer receiving the output
   * @param key
   *          the flag key
   * @param active
   *          the flag state
   */
  private static void writeFlag(PrintWriter writer, String key,
          boolean active)
  {
    writeProperty(writer, key,
            HiddenMarkovModel.findStringFromBoolean(active));
  }

  /**
   * Returns the char value of a single lettered String.
   *
   * @param string
   *          a non-empty string
   * @return the first character of the string
   */
  char charValue(String string)
  {
    return string.charAt(0);
  }

  /**
   * Not supported by this class: always returns null.
   *
   * @param seqs
   *          ignored
   * @param jvsuffix
   *          ignored
   * @return null
   */
  @Override
  public String print(SequenceI[] seqs, boolean jvsuffix)
  {
    return null;
  }

  /**
   * Converts the probabilities contained in a list into log space, returning
   * -ln(p) for each entry in a new list. The input list is not modified.
   *
   * @param list
   *          the probabilities to convert
   * @return a new list holding the negative natural log of each entry
   */
  List<Double> convertListToLogSpace(List<Double> list)
  {
    List<Double> convertedList = new ArrayList<>();
    for (double prob : list)
    {
      convertedList.add(-1 * Math.log(prob));
    }
    return convertedList;
  }
}