3 import jalview.api.AlignExportSettingI;
4 import jalview.api.AlignmentViewPanel;
5 import jalview.datamodel.HMMNode;
6 import jalview.datamodel.HiddenMarkovModel;
7 import jalview.datamodel.SequenceI;
9 import java.io.BufferedReader;
10 import java.io.IOException;
11 import java.util.ArrayList;
12 import java.util.List;
13 import java.util.Scanner;
17 * Adds the capability to read and write HMMER3 files.
23 public class HMMFile extends AlignFile
24 implements AlignmentFileReaderI, AlignmentFileWriterI
26 private static final String TERMINATOR = "//";
29 * keys to data in an HMM file, also used as property names in the HiddenMarkovModel
31 public static final String HMM = "HMM";
33 public static final String NAME = "NAME";
35 public static final String ACCESSION_NUMBER = "ACC";
37 public static final String DESCRIPTION = "DESC";
39 public static final String LENGTH = "LENG";
41 public static final String MAX_LENGTH = "MAXL";
43 public static final String ALPHABET = "ALPH";
45 public static final String DATE = "DATE";
47 public static final String COMMAND_LOG = "COM";
49 public static final String NUMBER_OF_SEQUENCES = "NSEQ";
51 public static final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
53 public static final String CHECK_SUM = "CKSUM";
55 public static final String STATISTICS = "STATS";
57 public static final String COMPO = "COMPO";
59 public static final String GATHERING_THRESHOLD = "GA";
61 public static final String TRUSTED_CUTOFF = "TC";
63 public static final String NOISE_CUTOFF = "NC";
65 public static final String VITERBI = "VITERBI";
67 public static final String MSV = "MSV";
69 public static final String FORWARD = "FORWARD";
71 public static final String MAP = "MAP";
73 public static final String REFERENCE_ANNOTATION = "RF";
75 public static final String CONSENSUS_RESIDUE = "CONS";
77 public static final String CONSENSUS_STRUCTURE = "CS";
79 public static final String MASKED_VALUE = "MM";
81 private static final String ALPH_AMINO = "amino";
83 private static final String ALPH_DNA = "DNA";
85 private static final String ALPH_RNA = "RNA";
87 private static final String ALPHABET_AMINO = "ACDEFGHIKLMNPQRSTVWY";
89 private static final String ALPHABET_DNA = "ACGT";
91 private static final String ALPHABET_RNA = "ACGU";
93 private static final int NUMBER_OF_TRANSITIONS = 7;
95 private static final String SPACE = " ";
98 * optional guide line added to an output HMMER file, purely for readability
100 private static final String TRANSITIONTYPELINE = " m->m m->i m->d i->m i->i d->m d->d";
102 private static String NL = System.lineSeparator();
104 private HiddenMarkovModel hmm;
106 // number of symbols in the alphabet used in the hidden Markov model
107 private int numberOfSymbols;
110 * Constructor that parses immediately
114 * @throws IOException
116 public HMMFile(String inFile, DataSourceType type) throws IOException
122 * Constructor that parses immediately
125 * @throws IOException
127 public HMMFile(FileParse source) throws IOException
133 * Default constructor
140 * Constructor for HMMFile used for exporting
143 * @param markov the hidden Markov model to export
145 public HMMFile(HiddenMarkovModel markov)
151 * Returns the HMM produced by parsing a HMMER3 file
155 public HiddenMarkovModel getHMM()
161 * Gets the name of the hidden Markov model
165 public String getName()
167 return hmm.getName();
171 * Reads the data from HMM file into the HMM model
178 hmm = new HiddenMarkovModel();
179 parseHeaderLines(dataIn);
181 } catch (Exception e)
188 * Reads the header properties from a HMMER3 file and saves them in the
189 * HiddenMarkovModel. This method exits after reading the next line after the
193 * @throws IOException
195 void parseHeaderLines(BufferedReader input) throws IOException
197 boolean readingHeaders = true;
198 hmm.setFileHeader(input.readLine());
199 String line = input.readLine();
200 while (readingHeaders && line != null)
202 Scanner parser = new Scanner(line);
203 String next = parser.next();
204 if (ALPHABET.equals(next))
206 String alphabetType = parser.next();
207 hmm.setProperty(ALPHABET, alphabetType);
208 String alphabet = ALPH_DNA.equalsIgnoreCase(alphabetType)
210 : (ALPH_RNA.equalsIgnoreCase(alphabetType) ? ALPHABET_RNA
212 numberOfSymbols = hmm.setAlphabet(alphabet);
214 else if (HMM.equals(next))
216 readingHeaders = false;
217 String symbols = line.substring(line.indexOf(HMM) + HMM.length());
218 numberOfSymbols = hmm.setAlphabet(symbols);
220 else if (STATISTICS.equals(next))
226 value = parser.next() + SPACE + SPACE + parser.next();
227 hmm.setProperty(key, value);
232 String value = parser.next();
233 while (parser.hasNext())
235 value = value + SPACE + parser.next();
237 hmm.setProperty(key, value);
240 line = input.readLine();
245 * Parses the model data from the HMMER3 file. The input buffer should be
246 * positioned at the (optional) COMPO line if there is one, else at the insert
247 * emissions line for the BEGIN node of the model.
250 * @throws IOException
252 void parseModel(BufferedReader input) throws IOException
255 * specification says there must always be an HMM header (already read)
256 * and one more header (guide headings) which is skipped here
259 String line = input.readLine();
260 List<HMMNode> nodes = new ArrayList<>();
262 while (line != null && !TERMINATOR.equals(line))
264 HMMNode node = new HMMNode();
266 Scanner scanner = new Scanner(line);
267 String next = scanner.next();
270 * expect COMPO (optional) for average match emissions
271 * or a node number followed by node's match emissions
273 if (COMPO.equals(next) || nodeNo > 0)
276 * parse match emissions
278 double[] matches = parseDoubles(scanner, numberOfSymbols);
279 node.setMatchEmissions(matches);
280 if (!COMPO.equals(next))
282 int resNo = parseAnnotations(scanner, node);
286 * no MAP annotation provided, just number off from 0 (begin node)
290 node.setResidueNumber(resNo);
292 line = input.readLine();
297 * parse insert emissions
299 scanner = new Scanner(line);
300 double[] inserts = parseDoubles(scanner, numberOfSymbols);
301 node.setInsertEmissions(inserts);
305 * parse state transitions
307 line = input.readLine();
308 scanner = new Scanner(line);
309 double[] transitions = parseDoubles(scanner,
310 NUMBER_OF_TRANSITIONS);
311 node.setStateTransitions(transitions);
313 line = input.readLine();
322 * Parses the annotations on the match emission line and adds them to the node.
323 * (See p109 of the HMMER User Guide (V3.1b2) for the specification.) Returns
324 * the residue position that the node maps to, if provided, else zero.
329 int parseAnnotations(Scanner scanner, HMMNode node)
334 * map from hmm node to sequence position, if provided
336 if (scanner.hasNext())
338 String value = scanner.next();
339 if (!"-".equals(value))
343 mapTo = Integer.parseInt(value);
344 node.setResidueNumber(mapTo);
345 } catch (NumberFormatException e)
353 * hmm consensus residue if provided, else '-'
355 if (scanner.hasNext())
357 node.setConsensusResidue(scanner.next().charAt(0));
361 * RF reference annotation, if provided, else '-'
363 if (scanner.hasNext())
365 node.setReferenceAnnotation(scanner.next().charAt(0));
369 * 'm' for masked position, if provided, else '-'
371 if (scanner.hasNext())
373 node.setMaskValue(scanner.next().charAt(0));
377 * structure consensus symbol, if provided, else '-'
379 if (scanner.hasNext())
381 node.setConsensusStructure(scanner.next().charAt(0));
388 * Fills an array of doubles parsed from an input line
391 * @param numberOfElements
393 * @throws IOException
395 static double[] parseDoubles(Scanner input,
396 int numberOfElements) throws IOException
398 double[] values = new double[numberOfElements];
399 for (int i = 0; i < numberOfElements; i++)
401 if (!input.hasNext())
403 throw new IOException("Incomplete data");
405 String next = input.next();
406 if (next.contains("*"))
408 values[i] = Double.NEGATIVE_INFINITY;
412 double prob = Double.valueOf(next);
413 prob = Math.pow(Math.E, -prob);
421 * Returns a string to be added to the StringBuilder containing the entire
424 * @param initialColumnSeparation
425 * The initial whitespace separation between the left side of the
426 * file and first character.
427 * @param columnSeparation
428 * The separation between subsequent data entries.
430 * The list of data to be added to the String.
433 String addData(int initialColumnSeparation,
434 int columnSeparation, List<String> data)
437 boolean first = true;
438 for (String value : data)
440 int sep = first ? initialColumnSeparation : columnSeparation;
441 line += String.format("%" + sep + "s", value);
448 * Converts list of characters into a list of Strings.
451 * @return the list of Strings
453 List<String> charListToStringList(List<Character> list)
455 List<String> strList = new ArrayList<>();
456 for (char value : list)
458 String strValue = Character.toString(value);
459 strList.add(strValue);
465 * Converts an array of doubles into a list of Strings, rounded to the nearest
469 * @param noOfDecimals
472 List<String> doublesToStringList(double[] doubles)
474 List<String> strList = new ArrayList<>();
475 for (double value : doubles)
480 strValue = String.format("%.5f", value);
482 else if (value == -0.00000d)
484 strValue = "0.00000";
490 strList.add(strValue);
496 * Appends model data in string format to the string builder
500 void appendModelAsString(StringBuilder output)
502 output.append(HMM).append(" ");
503 String charSymbols = hmm.getSymbols();
504 for (char c : charSymbols.toCharArray())
506 output.append(String.format("%9s", c));
508 output.append(NL).append(TRANSITIONTYPELINE);
510 int length = hmm.getLength();
512 for (int nodeNo = 0; nodeNo <= length; nodeNo++)
514 String matchLine = String.format("%7s",
515 nodeNo == 0 ? COMPO : Integer.toString(nodeNo));
517 double[] doubleMatches = convertToLogSpace(
518 hmm.getNode(nodeNo).getMatchEmissions());
519 List<String> strMatches = doublesToStringList(doubleMatches);
520 matchLine += addData(10, 9, strMatches);
524 matchLine += SPACE + (hmm.getNodeMapPosition(nodeNo));
525 matchLine += SPACE + hmm.getConsensusResidue(nodeNo);
526 matchLine += SPACE + hmm.getReferenceAnnotation(nodeNo);
527 if (hmm.getFileHeader().contains("HMMER3/f"))
529 matchLine += SPACE + hmm.getMaskedValue(nodeNo);
530 matchLine += SPACE + hmm.getConsensusStructure(nodeNo);
534 output.append(NL).append(matchLine);
536 String insertLine = "";
538 double[] doubleInserts = convertToLogSpace(
539 hmm.getNode(nodeNo).getInsertEmissions());
540 List<String> strInserts = doublesToStringList(doubleInserts);
541 insertLine += addData(17, 9, strInserts);
543 output.append(NL).append(insertLine);
545 String transitionLine = "";
546 double[] doubleTransitions = convertToLogSpace(
547 hmm.getNode(nodeNo).getStateTransitions());
548 List<String> strTransitions = doublesToStringList(
550 transitionLine += addData(17, 9, strTransitions);
552 output.append(NL).append(transitionLine);
557 * Appends formatted HMM file properties to the string builder
561 void appendProperties(StringBuilder output)
563 output.append(hmm.getFileHeader());
565 String format = "%n%-5s %1s";
566 appendProperty(output, format, NAME);
567 appendProperty(output, format, ACCESSION_NUMBER);
568 appendProperty(output, format, DESCRIPTION);
569 appendProperty(output, format, LENGTH);
570 appendProperty(output, format, MAX_LENGTH);
571 appendProperty(output, format, ALPHABET);
572 appendBooleanProperty(output, format, REFERENCE_ANNOTATION);
573 appendBooleanProperty(output, format, MASKED_VALUE);
574 appendBooleanProperty(output, format, CONSENSUS_RESIDUE);
575 appendBooleanProperty(output, format, CONSENSUS_STRUCTURE);
576 appendBooleanProperty(output, format, MAP);
577 appendProperty(output, format, DATE);
578 appendProperty(output, format, NUMBER_OF_SEQUENCES);
579 appendProperty(output, format, EFF_NUMBER_OF_SEQUENCES);
580 appendProperty(output, format, CHECK_SUM);
581 appendProperty(output, format, GATHERING_THRESHOLD);
582 appendProperty(output, format, TRUSTED_CUTOFF);
583 appendProperty(output, format, NOISE_CUTOFF);
585 if (hmm.getMSV() != null)
587 format = "%n%-19s %18s";
588 output.append(String.format(format, "STATS LOCAL MSV", hmm.getMSV()));
590 output.append(String.format(format, "STATS LOCAL VITERBI",
593 output.append(String.format(format, "STATS LOCAL FORWARD",
599 * Appends 'yes' or 'no' for the given property, according to whether or not
600 * it is set in the HMM
604 * @param propertyName
606 private void appendBooleanProperty(StringBuilder output, String format,
609 boolean set = hmm.getBooleanProperty(propertyName);
610 output.append(String.format(format, propertyName,
611 set ? HiddenMarkovModel.YES : HiddenMarkovModel.NO));
615 * Appends the value of the given property to the output, if not null
619 * @param propertyName
621 private void appendProperty(StringBuilder output, String format,
624 String value = hmm.getProperty(propertyName);
627 output.append(String.format(format, propertyName, value));
632 public String print(SequenceI[] sequences, boolean jvsuffix)
634 if (sequences[0].getHMM() != null)
636 hmm = sequences[0].getHMM();
642 * Prints the .hmm file to a String.
646 public String print()
648 StringBuilder output = new StringBuilder();
649 appendProperties(output);
651 appendModelAsString(output);
652 output.append(NL).append(TERMINATOR).append(NL);
653 return output.toString();
657 * Converts the probabilities contained in an array into negative natural-log space
661 double[] convertToLogSpace(double[] ds)
663 double[] converted = new double[ds.length];
664 for (int i = 0; i < ds.length; i++)
667 double logProb = -1 * Math.log(prob);
669 converted[i] = logProb;
675 * Returns the HMM sequence produced by reading a .hmm file.
678 public SequenceI[] getSeqsAsArray()
680 SequenceI hmmSeq = hmm.getConsensusSequence();
681 SequenceI[] seq = new SequenceI[1];
687 public void setNewlineString(String newLine)
693 public void setExportSettings(AlignExportSettingI exportSettings)
699 public void configureForView(AlignmentViewPanel viewpanel)
705 public boolean hasWarningMessage()
711 public String getWarningMessage()
713 return "warning message";