package jalview.io; import jalview.api.AlignExportSettingI; import jalview.api.AlignmentViewPanel; import jalview.datamodel.HMMNode; import jalview.datamodel.HiddenMarkovModel; import jalview.datamodel.SequenceI; import java.io.BufferedReader; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Scanner; /** * Adds capability to read in and write out HMMER3 files. . * * * @author TZVanaalten * */ public class HMMFile extends AlignFile implements AlignmentFileReaderI, AlignmentFileWriterI { private static final String TERMINATOR = "//"; /* * keys to data in HMM file, used to store as properties of the HiddenMarkovModel */ public static final String HMM = "HMM"; public static final String NAME = "NAME"; public static final String ACCESSION_NUMBER = "ACC"; public static final String DESCRIPTION = "DESC"; public static final String LENGTH = "LENG"; public static final String MAX_LENGTH = "MAXL"; public static final String ALPHABET = "ALPH"; public static final String DATE = "DATE"; public static final String COMMAND_LOG = "COM"; public static final String NUMBER_OF_SEQUENCES = "NSEQ"; public static final String EFF_NUMBER_OF_SEQUENCES = "EFFN"; public static final String CHECK_SUM = "CKSUM"; public static final String STATISTICS = "STATS"; public static final String COMPO = "COMPO"; public static final String GATHERING_THRESHOLD = "GA"; public static final String TRUSTED_CUTOFF = "TC"; public static final String NOISE_CUTOFF = "NC"; public static final String VITERBI = "VITERBI"; public static final String MSV = "MSV"; public static final String FORWARD = "FORWARD"; public static final String MAP = "MAP"; public static final String REFERENCE_ANNOTATION = "RF"; public static final String CONSENSUS_RESIDUE = "CONS"; public static final String CONSENSUS_STRUCTURE = "CS"; public static final String MASKED_VALUE = "MM"; private static final String ALPH_AMINO = "amino"; private static final String ALPH_DNA = "DNA"; private static final String ALPH_RNA = "RNA"; private static final String ALPHABET_AMINO = "ACDEFGHIKLMNPQRSTVWY"; private static final String ALPHABET_DNA = "ACGT"; private static final String ALPHABET_RNA = "ACGU"; private static final int NUMBER_OF_TRANSITIONS = 7; private static final String SPACE = " "; /* * optional guide line added to an output HMMER file, purely for readability */ private static final String TRANSITIONTYPELINE = " m->m m->i m->d i->m i->i d->m d->d"; private static String NL = System.lineSeparator(); private HiddenMarkovModel hmm; // number of symbols in the alphabet used in the hidden Markov model private int numberOfSymbols; /** * Constructor that parses immediately * * @param inFile * @param type * @throws IOException */ public HMMFile(String inFile, DataSourceType type) throws IOException { super(inFile, type); } /** * Constructor that parses immediately * * @param source * @throws IOException */ public HMMFile(FileParse source) throws IOException { super(source); } /** * Default constructor */ public HMMFile() { } /** * Constructor for HMMFile used for exporting * * @param hmm */ public HMMFile(HiddenMarkovModel markov) { hmm = markov; } /** * Returns the HMM produced by parsing a HMMER3 file * * @return */ public HiddenMarkovModel getHMM() { return hmm; } /** * Gets the name of the hidden Markov model * * @return */ public String getName() { return hmm.getName(); } /** * Reads the data from HMM file into the HMM model */ @Override public void parse() { try { hmm = new HiddenMarkovModel(); parseHeaderLines(dataIn); parseModel(dataIn); } catch (Exception e) { e.printStackTrace(); } } /** * Reads the header properties from a HMMER3 file and saves them in the * HiddeMarkovModel. This method exits after reading the next line after the * HMM line. * * @param input * @throws IOException */ void parseHeaderLines(BufferedReader input) throws IOException { boolean readingHeaders = true; hmm.setFileHeader(input.readLine()); String line = input.readLine(); while (readingHeaders && line != null) { Scanner parser = new Scanner(line); String next = parser.next(); if (ALPHABET.equals(next)) { String alphabetType = parser.next(); hmm.setProperty(ALPHABET, alphabetType); String alphabet = ALPH_DNA.equalsIgnoreCase(alphabetType) ? ALPHABET_DNA : (ALPH_RNA.equalsIgnoreCase(alphabetType) ? ALPHABET_RNA : ALPHABET_AMINO); numberOfSymbols = hmm.setAlphabet(alphabet); } else if (HMM.equals(next)) { readingHeaders = false; String symbols = line.substring(line.indexOf(HMM) + HMM.length()); numberOfSymbols = hmm.setAlphabet(symbols); } else if (STATISTICS.equals(next)) { parser.next(); String key; String value; key = parser.next(); value = parser.next() + SPACE + SPACE + parser.next(); hmm.setProperty(key, value); } else { String key = next; String value = parser.next(); while (parser.hasNext()) { value = value + SPACE + parser.next(); } hmm.setProperty(key, value); } parser.close(); line = input.readLine(); } } /** * Parses the model data from the HMMER3 file. The input buffer should be * positioned at the (optional) COMPO line if there is one, else at the insert * emissions line for the BEGIN node of the model. * * @param input * @throws IOException */ void parseModel(BufferedReader input) throws IOException { /* * specification says there must always be an HMM header (already read) * and one more header (guide headings) which is skipped here */ int nodeNo = 0; String line = input.readLine(); List nodes = new ArrayList<>(); while (line != null && !TERMINATOR.equals(line)) { HMMNode node = new HMMNode(); nodes.add(node); Scanner scanner = new Scanner(line); String next = scanner.next(); /* * expect COMPO (optional) for average match emissions * or a node number followed by node's match emissions */ if (COMPO.equals(next) || nodeNo > 0) { /* * parse match emissions */ double[] matches = parseDoubles(scanner, numberOfSymbols); node.setMatchEmissions(matches); if (!COMPO.equals(next)) { int resNo = parseAnnotations(scanner, node); if (resNo == 0) { /* * no MAP annotation provided, just number off from 0 (begin node) */ resNo = nodeNo; } node.setResidueNumber(resNo); } line = input.readLine(); } scanner.close(); /* * parse insert emissions */ scanner = new Scanner(line); double[] inserts = parseDoubles(scanner, numberOfSymbols); node.setInsertEmissions(inserts); scanner.close(); /* * parse state transitions */ line = input.readLine(); scanner = new Scanner(line); double[] transitions = parseDoubles(scanner, NUMBER_OF_TRANSITIONS); node.setStateTransitions(transitions); scanner.close(); line = input.readLine(); nodeNo++; } hmm.setNodes(nodes); } /** * Parses the annotations on the match emission line and add them to the node. * (See p109 of the HMMER User Guide (V3.1b2) for the specification.) Returns * the residue position that the node maps to, if provided, else zero. * * @param scanner * @param node */ int parseAnnotations(Scanner scanner, HMMNode node) { int mapTo = 0; /* * map from hmm node to sequence position, if provided */ if (scanner.hasNext()) { String value = scanner.next(); if (!"-".equals(value)) { try { mapTo = Integer.parseInt(value); node.setResidueNumber(mapTo); } catch (NumberFormatException e) { // ignore } } } /* * hmm consensus residue if provided, else '-' */ if (scanner.hasNext()) { node.setConsensusResidue(scanner.next().charAt(0)); } /* * RF reference annotation, if provided, else '-' */ if (scanner.hasNext()) { node.setReferenceAnnotation(scanner.next().charAt(0)); } /* * 'm' for masked position, if provided, else '-' */ if (scanner.hasNext()) { node.setMaskValue(scanner.next().charAt(0)); } /* * structure consensus symbol, if provided, else '-' */ if (scanner.hasNext()) { node.setConsensusStructure(scanner.next().charAt(0)); } return mapTo; } /** * Fills an array of doubles parsed from an input line * * @param input * @param numberOfElements * @return * @throws IOException */ static double[] parseDoubles(Scanner input, int numberOfElements) throws IOException { double[] values = new double[numberOfElements]; for (int i = 0; i < numberOfElements; i++) { if (!input.hasNext()) { throw new IOException("Incomplete data"); } String next = input.next(); if (next.contains("*")) { values[i] = Double.NEGATIVE_INFINITY; } else { double prob = Double.valueOf(next); prob = Math.pow(Math.E, -prob); values[i] = prob; } } return values; } /** * Returns a string to be added to the StringBuilder containing the entire * output String. * * @param initialColumnSeparation * The initial whitespace separation between the left side of the * file and first character. * @param columnSeparation * The separation between subsequent data entries. * @param data * The list of data to be added to the String. * @return */ String addData(int initialColumnSeparation, int columnSeparation, List data) { String line = ""; boolean first = true; for (String value : data) { int sep = first ? initialColumnSeparation : columnSeparation; line += String.format("%" + sep + "s", value); first = false; } return line; } /** * Converts list of characters into a list of Strings. * * @param list * @return Returns the list of Strings. */ List charListToStringList(List list) { List strList = new ArrayList<>(); for (char value : list) { String strValue = Character.toString(value); strList.add(strValue); } return strList; } /** * Converts an array of doubles into a list of Strings, rounded to the nearest * 5th decimal place * * @param doubles * @param noOfDecimals * @return */ List doublesToStringList(double[] doubles) { List strList = new ArrayList<>(); for (double value : doubles) { String strValue; if (value > 0) { strValue = String.format("%.5f", value); } else if (value == -0.00000d) { strValue = "0.00000"; } else { strValue = "*"; } strList.add(strValue); } return strList; } /** * Appends model data in string format to the string builder * * @param output */ void appendModelAsString(StringBuilder output) { output.append(HMM).append(" "); String charSymbols = hmm.getSymbols(); for (char c : charSymbols.toCharArray()) { output.append(String.format("%9s", c)); } output.append(NL).append(TRANSITIONTYPELINE); int length = hmm.getLength(); for (int nodeNo = 0; nodeNo <= length; nodeNo++) { String matchLine = String.format("%7s", nodeNo == 0 ? COMPO : Integer.toString(nodeNo)); double[] doubleMatches = convertToLogSpace( hmm.getNode(nodeNo).getMatchEmissions()); List strMatches = doublesToStringList(doubleMatches); matchLine += addData(10, 9, strMatches); if (nodeNo != 0) { matchLine += SPACE + (hmm.getNodeMapPosition(nodeNo)); matchLine += SPACE + hmm.getConsensusResidue(nodeNo); matchLine += SPACE + hmm.getReferenceAnnotation(nodeNo); if (hmm.getFileHeader().contains("HMMER3/f")) { matchLine += SPACE + hmm.getMaskedValue(nodeNo); matchLine += SPACE + hmm.getConsensusStructure(nodeNo); } } output.append(NL).append(matchLine); String insertLine = ""; double[] doubleInserts = convertToLogSpace( hmm.getNode(nodeNo).getInsertEmissions()); List strInserts = doublesToStringList(doubleInserts); insertLine += addData(17, 9, strInserts); output.append(NL).append(insertLine); String transitionLine = ""; double[] doubleTransitions = convertToLogSpace( hmm.getNode(nodeNo).getStateTransitions()); List strTransitions = doublesToStringList( doubleTransitions); transitionLine += addData(17, 9, strTransitions); output.append(NL).append(transitionLine); } } /** * Appends formatted HMM file properties to the string builder * * @param output */ void appendProperties(StringBuilder output) { output.append(hmm.getFileHeader()); String format = "%n%-5s %1s"; appendProperty(output, format, NAME); appendProperty(output, format, ACCESSION_NUMBER); appendProperty(output, format, DESCRIPTION); appendProperty(output, format, LENGTH); appendProperty(output, format, MAX_LENGTH); appendProperty(output, format, ALPHABET); appendBooleanProperty(output, format, REFERENCE_ANNOTATION); appendBooleanProperty(output, format, MASKED_VALUE); appendBooleanProperty(output, format, CONSENSUS_RESIDUE); appendBooleanProperty(output, format, CONSENSUS_STRUCTURE); appendBooleanProperty(output, format, MAP); appendProperty(output, format, DATE); appendProperty(output, format, NUMBER_OF_SEQUENCES); appendProperty(output, format, EFF_NUMBER_OF_SEQUENCES); appendProperty(output, format, CHECK_SUM); appendProperty(output, format, GATHERING_THRESHOLD); appendProperty(output, format, TRUSTED_CUTOFF); appendProperty(output, format, NOISE_CUTOFF); if (hmm.getMSV() != null) { format = "%n%-19s %18s"; output.append(String.format(format, "STATS LOCAL MSV", hmm.getMSV())); output.append(String.format(format, "STATS LOCAL VITERBI", hmm.getViterbi())); output.append(String.format(format, "STATS LOCAL FORWARD", hmm.getForward())); } } /** * Appends 'yes' or 'no' for the given property, according to whether or not * it is set in the HMM * * @param output * @param format * @param propertyName */ private void appendBooleanProperty(StringBuilder output, String format, String propertyName) { boolean set = hmm.getBooleanProperty(propertyName); output.append(String.format(format, propertyName, set ? HiddenMarkovModel.YES : HiddenMarkovModel.NO)); } /** * Appends the value of the given property to the output, if not null * * @param output * @param format * @param propertyName */ private void appendProperty(StringBuilder output, String format, String propertyName) { String value = hmm.getProperty(propertyName); if (value != null) { output.append(String.format(format, propertyName, value)); } } @Override public String print(SequenceI[] sequences, boolean jvsuffix) { if (sequences[0].getHMM() != null) { hmm = sequences[0].getHMM(); } return print(); } /** * Prints the .hmm file to a String. * * @return */ public String print() { StringBuilder output = new StringBuilder(); appendProperties(output); output.append(NL); appendModelAsString(output); output.append(NL).append(TERMINATOR).append(NL); return output.toString(); } /** * Converts the probabilities contained in an array into log space * * @param ds */ double[] convertToLogSpace(double[] ds) { double[] converted = new double[ds.length]; for (int i = 0; i < ds.length; i++) { double prob = ds[i]; double logProb = -1 * Math.log(prob); converted[i] = logProb; } return converted; } /** * Returns the HMM sequence produced by reading a .hmm file. */ @Override public SequenceI[] getSeqsAsArray() { SequenceI hmmSeq = hmm.getConsensusSequence(); SequenceI[] seq = new SequenceI[1]; seq[0] = hmmSeq; return seq; } @Override public void setNewlineString(String newLine) { NL = newLine; } @Override public void setExportSettings(AlignExportSettingI exportSettings) { } @Override public void configureForView(AlignmentViewPanel viewpanel) { } @Override public boolean hasWarningMessage() { return false; } @Override public String getWarningMessage() { return "warning message"; } }